shithub: dav1d

Download patch

ref: c2292efcb35f45bced8ea13172ac8fbd6d612a38
parent: 7cb756ea5131285637259d264925c608a24d3443
author: Henrik Gramner <gramner@twoorioles.com>
date: Sun Dec 16 12:57:38 EST 2018

Implement support for PIC in x86-32 asm

Convert all existing 32-bit SSSE3 asm to use PIC.

--- a/meson.build
+++ b/meson.build
@@ -256,13 +256,12 @@
         cdata.set10('ARCH_X86_64', true)
         cdata_asm.set10('ARCH_X86_32', false)
         cdata.set10('ARCH_X86_32', false)
-
-        cdata_asm.set10('PIC', true)
     else
         cdata_asm.set10('ARCH_X86_64', false)
         cdata.set10('ARCH_X86_64', false)
         cdata_asm.set10('ARCH_X86_32', true)
         cdata.set10('ARCH_X86_32', true)
+        cdata_asm.set10('PIC', true)
     endif
 else
     cdata.set10('ARCH_X86', false)
--- a/src/ext/x86/x86inc.asm
+++ b/src/ext/x86/x86inc.asm
@@ -89,16 +89,13 @@
     %endif
 %endmacro
 
-%if WIN64
-    %define PIC
-%elif ARCH_X86_64 == 0
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-    %undef PIC
-%endif
-%ifdef PIC
+%if ARCH_X86_64
+    %define PIC 1 ; always use PIC on x86-64
     default rel
+%elifidn __OUTPUT_FORMAT__,win32
+    %define PIC 0 ; PIC isn't used on 32-bit Windows
+%elifndef PIC
+    %define PIC 0
 %endif
 
 %ifdef __NASM_VER__
@@ -219,6 +216,18 @@
 %else
     %define gprsize 4
 %endif
+
+%macro LEA 2
+%if ARCH_X86_64
+    lea %1, [%2]
+%elif PIC
+    call $+5 ; special-cased to not affect the RSB on most CPU:s
+    pop %1
+    add %1, (%2)-$+1
+%else
+    mov %1, %2
+%endif
+%endmacro
 
 %macro PUSH 1
     push %1
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -93,7 +93,7 @@
 
 INIT_XMM ssse3
 cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
-    lea                          r5, [ipred_h_ssse3_table]
+    LEA                          r5, ipred_h_ssse3_table
     tzcnt                        wd, wm
     movifnidn                    hd, hm
     movsxd                       wq, [r5+wq*4]
--- a/src/x86/itx_ssse3.asm
+++ b/src/x86/itx_ssse3.asm
@@ -55,9 +55,15 @@
 
 %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
 
+%if ARCH_X86_64
+%define o(x) x
+%else
+%define o(x) r5-$$+x ; PIC
+%endif
+
 %macro ITX4_END 4-5 2048 ; row[1-4], rnd
 %if %5
-    mova                 m2, [pw_%5]
+    mova                 m2, [o(pw_%5)]
     pmulhrsw             m0, m2
     pmulhrsw             m1, m2
 %endif
@@ -100,7 +106,6 @@
     ret
 %endmacro
 
-
 ; flags: 1 = swap, 2: coef_regs
 %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
 %if %6 & 2
@@ -107,11 +112,11 @@
     pmaddwd              m%2, m%4, m%1
     pmaddwd              m%1, m%5
 %elif %6 & 1
-    pmaddwd              m%2, m%1, [pw_%5_%4]
-    pmaddwd              m%1, [pw_%4_m%5]
+    pmaddwd              m%2, m%1, [o(pw_%5_%4)]
+    pmaddwd              m%1, [o(pw_%4_m%5)]
 %else
-    pmaddwd              m%2, m%1, [pw_%4_m%5]
-    pmaddwd              m%1, [pw_%5_%4]
+    pmaddwd              m%2, m%1, [o(pw_%4_m%5)]
+    pmaddwd              m%1, [o(pw_%5_%4)]
 %endif
     paddd                m%2, m%3
     paddd                m%1, m%3
@@ -126,13 +131,13 @@
     paddw                m0, m1
     punpcklqdq           m0, m3               ;high: in0-in2 ;low: in0+in2
 
-    mova                 m3, [pd_2048]
+    mova                 m3, [o(pd_2048)]
     ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
 
 %if %0 == 1
     pmulhrsw             m0, m%1
 %else
-    pmulhrsw             m0, [pw_2896x8]     ;high: t1 ;low: t0
+    pmulhrsw             m0, [o(pw_2896x8)]  ;high: t1 ;low: t0
 %endif
 
     psubsw               m1, m0, m2          ;high: out2 ;low: out3
@@ -146,15 +151,14 @@
     punpckhqdq           m1, m1                    ;
     paddw                m1, m0                    ;low: in0 - in2 + in3
 
-    pmaddwd              m0, m2, [pw_1321_3803]    ;1321 * in0 + 3803 * in2
-    pmaddwd              m2, [pw_2482_m1321]       ;2482 * in0 - 1321 * in2
-    pmaddwd              m4, m3, [pw_3344_2482]    ;3344 * in1 + 2482 * in3
-    pmaddwd              m5, m3, [pw_3344_m3803]   ;3344 * in1 - 3803 * in3
+    pmaddwd              m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
+    pmaddwd              m2, [o(pw_2482_m1321)]    ;2482 * in0 - 1321 * in2
+    pmaddwd              m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
+    pmaddwd              m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3
     paddd                m4, m0                    ;t0 + t3
-
-    pmaddwd              m3, [pw_m6688_m3803]      ;-2 * 3344 * in1 - 3803 * in3
-    pmulhrsw             m1, [pw_3344x8]           ;low: out2
-    mova                 m0, [pd_2048]
+    pmaddwd              m3, [o(pw_m6688_m3803)]   ;-2 * 3344 * in1 - 3803 * in3
+    pmulhrsw             m1, [o(pw_3344x8)]        ;low: out2
+    mova                 m0, [o(pd_2048)]
     paddd                m2, m0
     paddd                m0, m4                    ;t0 + t3 + 2048
     paddd                m5, m2                    ;t1 + t3 + 2048
@@ -169,9 +173,11 @@
 %endmacro
 
 %macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
+cglobal inv_txfm_add_%1_%2_%4, 4, 6, 0, dst, stride, coeff, eob, tx2
     %undef cmp
-    lea tx2q, [m(i%2_%4_internal).pass2]
+%if ARCH_X86_32
+    LEA                    r5, $$
+%endif
 %if %3 > 0
     cmp                  eobd, %3
     jle %%end
@@ -179,7 +185,8 @@
     test                 eobd, eobd
     jz %%end
 %endif
-    call i%1_%4_internal
+    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
+    call m(i%1_%4_internal)
     RET
 ALIGN function_align
 %%end:
@@ -188,10 +195,10 @@
 %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
     INV_TXFM_FN          %1, %2, %3, 4x4
 %ifidn %1_%2, dct_identity
-    mova                 m0, [pw_2896x8]
+    mova                 m0, [o(pw_2896x8)]
     pmulhrsw             m0, [coeffq]
     paddw                m0, m0
-    pmulhrsw             m0, [pw_5793x4]
+    pmulhrsw             m0, [o(pw_5793x4)]
     punpcklwd            m0, m0
     punpckhdq            m1, m0, m0
     punpckldq            m0, m0
@@ -205,8 +212,8 @@
     punpcklwd            m0, m1
     punpcklqdq           m0, m0
     paddw                m0, m0
-    pmulhrsw             m0, [pw_5793x4]
-    pmulhrsw             m0, [pw_2896x8]
+    pmulhrsw             m0, [o(pw_5793x4)]
+    pmulhrsw             m0, [o(pw_2896x8)]
     mova                 m1, m0
     call m(iadst_4x4_internal).end
     RET
@@ -214,17 +221,17 @@
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
 %ifidn %1, dct
-    mova                 m1, [pw_2896x8]
+    mova                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
 %elifidn %1, adst
-    pmulhrsw             m0, [iadst4_dconly1a]
+    pmulhrsw             m0, [o(iadst4_dconly1a)]
 %elifidn %1, flipadst
-    pmulhrsw             m0, [iadst4_dconly1b]
+    pmulhrsw             m0, [o(iadst4_dconly1b)]
 %endif
     mov            [coeffq], eobd                ;0
 %ifidn %2, dct
 %ifnidn %1, dct
-    pmulhrsw             m0, [pw_2896x8]
+    pmulhrsw             m0, [o(pw_2896x8)]
 %else
     pmulhrsw             m0, m1
 %endif
@@ -232,8 +239,8 @@
     call m(iadst_4x4_internal).end2
     RET
 %else ; adst / flipadst
-    pmulhrsw             m1, m0, [iadst4_dconly2b]
-    pmulhrsw             m0, [iadst4_dconly2a]
+    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
+    pmulhrsw             m0, [o(iadst4_dconly2a)]
     call m(i%2_4x4_internal).end2
     RET
 %endif
@@ -240,9 +247,13 @@
 %endif
 %endmacro
 
-
 INIT_XMM ssse3
 
+INV_TXFM_4X4_FN dct, dct,      0
+INV_TXFM_4X4_FN dct, adst,     0
+INV_TXFM_4X4_FN dct, flipadst, 0
+INV_TXFM_4X4_FN dct, identity, 3
+
 cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
     mova                 m1, [coeffq+16*1]      ;high: in3 ;low in2
@@ -249,7 +260,7 @@
 
     IDCT4_1D_PACKED
 
-    mova                 m2, [deint_shuf]
+    mova                 m2, [o(deint_shuf)]
     shufps               m3, m0, m1, q1331
     shufps               m0, m1, q0220
     pshufb               m0, m2                 ;high: in1 ;low: in0
@@ -265,7 +276,10 @@
 
     ITX4_END     0, 1, 3, 2
 
-INV_TXFM_4X4_FN dct, dct, 0
+INV_TXFM_4X4_FN adst, dct,      0
+INV_TXFM_4X4_FN adst, adst,     0
+INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, identity
 
 cglobal iadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
@@ -294,9 +308,10 @@
     IADST4_1D_PACKED
     ret
 
-INV_TXFM_4X4_FN adst, adst, 0
-INV_TXFM_4X4_FN dct,  adst, 0
-INV_TXFM_4X4_FN adst, dct,  0
+INV_TXFM_4X4_FN flipadst, dct,      0
+INV_TXFM_4X4_FN flipadst, adst,     0
+INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, identity
 
 cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
@@ -321,16 +336,15 @@
 .end2:
     ITX4_END              3, 2, 1, 0
 
-INV_TXFM_4X4_FN flipadst, flipadst, 0
-INV_TXFM_4X4_FN flipadst, dct,      0
-INV_TXFM_4X4_FN flipadst, adst,     0
-INV_TXFM_4X4_FN dct,      flipadst, 0
-INV_TXFM_4X4_FN adst,     flipadst, 0
+INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, adst
+INV_TXFM_4X4_FN identity, flipadst
+INV_TXFM_4X4_FN identity, identity
 
 cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]
     mova                 m1, [coeffq+16*1]
-    mova                 m2, [pw_5793x4]
+    mova                 m2, [o(pw_5793x4)]
     paddw                m0, m0
     paddw                m1, m1
     pmulhrsw             m0, m2
@@ -343,20 +357,12 @@
     jmp                tx2q
 
 .pass2:
-    mova                 m2, [pw_5793x4]
+    mova                 m2, [o(pw_5793x4)]
     paddw                m0, m0
     paddw                m1, m1
     pmulhrsw             m0, m2
     pmulhrsw             m1, m2
     jmp m(iadst_4x4_internal).end
-
-INV_TXFM_4X4_FN identity, identity
-INV_TXFM_4X4_FN identity, dct,      3
-INV_TXFM_4X4_FN identity, adst
-INV_TXFM_4X4_FN identity, flipadst
-INV_TXFM_4X4_FN dct,      identity, 3
-INV_TXFM_4X4_FN adst,     identity
-INV_TXFM_4X4_FN flipadst, identity
 
 %macro IWHT4_1D_PACKED 0
     punpckhqdq           m3, m0, m1            ;low: in1 high: in3
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -186,7 +186,7 @@
 %endmacro
 
 cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
-    lea                  r6, [avg_ssse3_table]
+    LEA                  r6, avg_ssse3_table
     tzcnt                wd, wm ; leading zeros
     movifnidn            hd, hm ; move h(stack) to h(register) if not already that register
     movsxd               wq, dword [r6+wq*4] ; push table entry matching the tile width (tzcnt) in widen reg
@@ -216,7 +216,7 @@
 %define W_AVG_INC_PTR AVG_INC_PTR
 
 cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
-    lea                  r6, [w_avg_ssse3_table]
+    LEA                  r6, w_avg_ssse3_table
     tzcnt                wd, wm
     movifnidn            hd, hm
     movd                 m0, r6m
@@ -269,11 +269,12 @@
 cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
 %define hd dword r5m
 %endif
-    lea                  r6, [mask_ssse3_table]
+%define base r6-mask_ssse3_table
+    LEA                  r6, mask_ssse3_table
     tzcnt                wd, wm
     movsxd               wq, dword [r6+wq*4]
     pxor                 m4, m4
-    mova                 m5, [pw_2048+r6-mask_ssse3_table]
+    mova                 m5, [base+pw_2048]
     add                  wq, r6
     mov               maskq, r6m
     BIDIR_FN           MASK
@@ -284,9 +285,9 @@
  %define reg_pw_27        m9
  %define reg_pw_2048      m10
 %else
- %define reg_pw_8         [pw_8]
- %define reg_pw_27        [pw_26] ; 64 - 38
- %define reg_pw_2048      [pw_2048]
+ %define reg_pw_8         [base+pw_8]
+ %define reg_pw_27        [base+pw_26] ; 64 - 38
+ %define reg_pw_2048      [base+pw_2048]
 %endif
 
 %macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
@@ -323,63 +324,60 @@
     W_MASK_420_B (%1*16), %2
 %endmacro
 
+%define base r6-w_mask_420_ssse3_table
 %if ARCH_X86_64
 ; args: dst, stride, tmp1, tmp2, w, h, mask, sign
-cglobal w_mask_420, 4, 9, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
-    lea                  r7, [w_mask_420_ssse3_table]
+cglobal w_mask_420, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask
+    lea                  r6, [w_mask_420_ssse3_table]
     mov                  wd, wm
-    tzcnt               r8d, wd
+    tzcnt               r7d, wd
     movifnidn            hd, hm
-    mov               maskq, maskmp
     movd                 m0, r7m
     pshuflw              m0, m0, q0000 ; sign
     punpcklqdq           m0, m0
-    movsxd               r8, dword [r7+r8*4]
-    mova           reg_pw_8, [pw_8]
-    mova          reg_pw_27, [pw_26] ; 64 - 38
-    mova        reg_pw_2048, [pw_2048]
-    mova                 m6, [pw_258] ; 64 * 4 + 2
+    movsxd               r7, [r6+r7*4]
+    mova           reg_pw_8, [base+pw_8]
+    mova          reg_pw_27, [base+pw_26] ; 64 - 38
+    mova        reg_pw_2048, [base+pw_2048]
+    mova                 m6, [base+pw_258] ; 64 * 4 + 2
+    add                  r7, r6
+    mov               maskq, maskmp
     psubw                m6, m0
-    add                  r8, r7
     W_MASK_420            0, 4
-    lea            stride3q, [strideq*3]
-    jmp                  r8
-    %define dst_bak      r8
-    %define loop_w       r7
-    %define orig_w       wq
+    jmp                  r7
+    %define loop_w      r7d
 %else
-cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
-    tzcnt               r6d, r4m
-    mov                  wd, w_mask_420_ssse3_table
-    add                  wd, [wq+r6*4]
+cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
+    tzcnt                wd, wm
+    LEA                  r6, w_mask_420_ssse3_table
+    mov                  wd, [r6+wq*4]
     mov               maskq, r6mp
     movd                 m0, r7m
     pshuflw              m0, m0, q0000 ; sign
     punpcklqdq           m0, m0
-    mova                 m6, [pw_258] ; 64 * 4 + 2
+    mova                 m6, [base+pw_258] ; 64 * 4 + 2
+    add                  wq, r6
     psubw                m6, m0
     W_MASK_420            0, 4
-    lea            stride3q, [strideq*3]
     jmp                  wd
-    %define dst_bak     r0m
-    %define loop_w      r6q
-    %define orig_w      r4m
-    %define hd    dword r5m
+    %define loop_w dword r0m
+    %define hd     dword r5m
 %endif
 .w4_loop:
     add               tmp1q, 2*16
     add               tmp2q, 2*16
     W_MASK_420            0, 4
-    lea                dstq, [dstq+strideq*4]
+    lea                dstq, [dstq+strideq*2]
     add               maskq, 4
 .w4:
     movd   [dstq          ], m0 ; copy m0[0]
     pshuflw              m1, m0, q1032
     movd   [dstq+strideq*1], m1 ; copy m0[1]
+    lea                dstq, [dstq+strideq*2]
     punpckhqdq           m0, m0
-    movd   [dstq+strideq*2], m0 ; copy m0[2]
+    movd   [dstq+strideq*0], m0 ; copy m0[2]
     psrlq                m0, 32
-    movd   [dstq+stride3q ], m0 ; copy m0[3]
+    movd   [dstq+strideq*1], m0 ; copy m0[3]
     pshufd               m5, m4, q3131; DBDB even lines repeated
     pshufd               m4, m4, q2020; CACA odd lines repeated
     psubw                m1, m6, m4   ; m9 == 64 * 4 + 2
@@ -409,20 +407,19 @@
     jg .w8_loop
     RET
 .w16: ; w32/64/128
-    mov             dst_bak, dstq
-    mov              loop_w, orig_w ; use width as counter
 %if ARCH_X86_32
-    mov                  wq, orig_w ; because we altered it in 32bit setup
+    mov                  wd, wm     ; because we altered it in 32bit setup
 %endif
+    mov              loop_w, wd     ; use width as counter
     jmp .w16ge_inner_loop_first
 .w16ge_loop:
     lea               tmp1q, [tmp1q+wq*2] ; skip even line pixels
     lea               tmp2q, [tmp2q+wq*2] ; skip even line pixels
+    sub                dstq, wq
+    mov              loop_w, wd
     lea                dstq, [dstq+strideq*2]
-    mov             dst_bak, dstq
-    mov              loop_w, orig_w
 .w16ge_inner_loop:
-    W_MASK_420_B           0, 4
+    W_MASK_420_B          0, 4
 .w16ge_inner_loop_first:
     mova   [dstq          ], m0
     W_MASK_420_B       wq*2, 5  ; load matching even line (offset = widthpx * (16+16))
@@ -438,7 +435,6 @@
     add                dstq, 16
     sub              loop_w, 16
     jg .w16ge_inner_loop
-    mov                dstq, dst_bak
     sub                  hd, 2
     jg .w16ge_loop
     RET
@@ -470,7 +466,7 @@
 
 cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
 %define base r6-blend_ssse3_table
-    lea                  r6, [blend_ssse3_table]
+    LEA                  r6, blend_ssse3_table
     tzcnt                wd, wm
     movifnidn            hd, hm
     movifnidn         maskq, maskmp
@@ -546,7 +542,7 @@
 
 cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask
 %define base r5-blend_v_ssse3_table
-    lea                  r5, [blend_v_ssse3_table]
+    LEA                  r5, blend_v_ssse3_table
     tzcnt                wd, wm
     movifnidn            hd, hm
     movsxd               wq, dword [r5+wq*4]
@@ -646,15 +642,21 @@
     jg .w32_loop
     RET
 
-cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
-%define base r5-blend_h_ssse3_table
-    lea                  r5, [blend_h_ssse3_table]
+cglobal blend_h, 3, 7, 6, dst, ds, tmp, w, h, mask
+%define base t0-blend_h_ssse3_table
+%if ARCH_X86_32
+    ; We need to keep the PIC pointer for w4, reload wd from stack instead
+    DECLARE_REG_TMP 6
+%else
+    DECLARE_REG_TMP 5
     mov                 r6d, wd
-    tzcnt                wd, wd
+%endif
+    LEA                  t0, blend_h_ssse3_table
+    tzcnt                wd, wm
     mov                  hd, hm
-    movsxd               wq, dword [r5+wq*4]
+    movsxd               wq, dword [t0+wq*4]
     mova                 m5, [base+pw_512]
-    add                  wq, r5
+    add                  wq, t0
     lea               maskq, [base+obmc_masks+hq*4]
     neg                  hq
     jmp                  wq
@@ -678,7 +680,11 @@
     jl .w2
     RET
 .w4:
+%if ARCH_X86_32
+    mova                 m3, [base+blend_shuf]
+%else
     mova                 m3, [blend_shuf]
+%endif
 .w4_loop:
     movd                 m0, [dstq+dsq*0]
     movd                 m2, [dstq+dsq*1]
@@ -716,6 +722,9 @@
     RET
 ; w16/w32/w64/w128
 .w16:
+%if ARCH_X86_32
+    mov                 r6d, wm
+%endif
     sub                 dsq, r6
 .w16_loop0:
     movd                 m3, [maskq+hq*2]
--- a/tests/checkasm/x86/checkasm.asm
+++ b/tests/checkasm/x86/checkasm.asm
@@ -200,7 +200,7 @@
     jz .ok
     mov  r3, eax
     mov  r4, edx
-    lea  r0, [error_message]
+    LEA  r0, error_message
     mov [esp], r0
     call fail_func
     mov  edx, r4