shithub: dav1d

ref: c0865f35c74bdcc71021630f64dca2db35d2bc8c
parent: 1e4e6c7ae3c0b6183d737a81f7cd1e30a50d3b46
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Fri Jul 19 07:53:25 EDT 2019

x86: add 32-bit support to SSSE3 deblock lpf

------------------------------------------
x86_64: lpf_h_sb_uv_w4_8bpc_c: 430.6
x86_32: lpf_h_sb_uv_w4_8bpc_c: 788.6
x86_64: lpf_h_sb_uv_w4_8bpc_ssse3: 322.0
x86_32: lpf_h_sb_uv_w4_8bpc_ssse3: 302.4
---------------------
x86_64: lpf_h_sb_uv_w6_8bpc_c: 981.9
x86_32: lpf_h_sb_uv_w6_8bpc_c: 1579.6
x86_64: lpf_h_sb_uv_w6_8bpc_ssse3: 421.5
x86_32: lpf_h_sb_uv_w6_8bpc_ssse3: 431.6
---------------------
x86_64: lpf_h_sb_y_w4_8bpc_c: 3001.7
x86_32: lpf_h_sb_y_w4_8bpc_c: 7021.3
x86_64: lpf_h_sb_y_w4_8bpc_ssse3: 466.3
x86_32: lpf_h_sb_y_w4_8bpc_ssse3: 564.7
---------------------
x86_64: lpf_h_sb_y_w8_8bpc_c: 4457.7
x86_32: lpf_h_sb_y_w8_8bpc_c: 3657.8
x86_64: lpf_h_sb_y_w8_8bpc_ssse3: 818.9
x86_32: lpf_h_sb_y_w8_8bpc_ssse3: 927.9
---------------------
x86_64: lpf_h_sb_y_w16_8bpc_c: 1967.9
x86_32: lpf_h_sb_y_w16_8bpc_c: 3343.5
x86_64: lpf_h_sb_y_w16_8bpc_ssse3: 1836.7
x86_32: lpf_h_sb_y_w16_8bpc_ssse3: 1975.0
---------------------
x86_64: lpf_v_sb_uv_w4_8bpc_c: 369.4
x86_32: lpf_v_sb_uv_w4_8bpc_c: 793.6
x86_64: lpf_v_sb_uv_w4_8bpc_ssse3: 110.9
x86_32: lpf_v_sb_uv_w4_8bpc_ssse3: 133.0
---------------------
x86_64: lpf_v_sb_uv_w6_8bpc_c: 769.6
x86_32: lpf_v_sb_uv_w6_8bpc_c: 1576.7
x86_64: lpf_v_sb_uv_w6_8bpc_ssse3: 222.2
x86_32: lpf_v_sb_uv_w6_8bpc_ssse3: 232.2
---------------------
x86_64: lpf_v_sb_y_w4_8bpc_c: 772.4
x86_32: lpf_v_sb_y_w4_8bpc_c: 2596.5
x86_64: lpf_v_sb_y_w4_8bpc_ssse3: 179.8
x86_32: lpf_v_sb_y_w4_8bpc_ssse3: 234.7
---------------------
x86_64: lpf_v_sb_y_w8_8bpc_c: 1660.2
x86_32: lpf_v_sb_y_w8_8bpc_c: 3979.9
x86_64: lpf_v_sb_y_w8_8bpc_ssse3: 468.3
x86_32: lpf_v_sb_y_w8_8bpc_ssse3: 580.9
---------------------
x86_64: lpf_v_sb_y_w16_8bpc_c: 1889.6
x86_32: lpf_v_sb_y_w16_8bpc_c: 4728.7
x86_64: lpf_v_sb_y_w16_8bpc_ssse3: 1142.0
x86_32: lpf_v_sb_y_w16_8bpc_ssse3: 1174.8
------------------------------------------
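
Two patterns recur throughout the port, sketched below in dav1d's x86inc dialect (SWAP, mova, the m-register names, PIC_sym() and pb_128 are the real macros and constants used in this patch; the stack offsets and register numbers in the sketch are illustrative, not lifted from it). First, x86-32 exposes only xmm0-xmm7, so values the 64-bit code keeps in m8-m15 are spilled to 16-byte stack slots and reloaded around each use instead of being renamed with SWAP. Second, constant loads go through PIC_sym() so 32-bit PIC builds address them relative to a base register, where x86-64 can use direct addressing.

%if ARCH_X86_64
    SWAP           4, 5, 7                 ; 16 XMM regs: renaming is free
%else
    mova  [esp+1*16], m4                   ; spill: a stack slot stands in for m12
    mova          m4, [esp+0*16]           ; reload the value that was kept in memory
%endif
    pxor          m1, [PIC_sym(pb_128)]    ; direct address on x86-64;
                                           ; base-register-relative under 32-bit PIC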

--- a/src/x86/loopfilter_init_tmpl.c
+++ b/src/x86/loopfilter_init_tmpl.c
@@ -42,7 +42,7 @@
 
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
-#if BITDEPTH == 8 && ARCH_X86_64
+#if BITDEPTH == 8
     c->loop_filter_sb[0][0] = dav1d_lpf_h_sb_y_ssse3;
     c->loop_filter_sb[0][1] = dav1d_lpf_v_sb_y_ssse3;
     c->loop_filter_sb[1][0] = dav1d_lpf_h_sb_uv_ssse3;
--- a/src/x86/loopfilter_ssse3.asm
+++ b/src/x86/loopfilter_ssse3.asm
@@ -91,12 +91,12 @@
     lea         dstq, [dstq+stride3q*4]
 %endmacro
 
-%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
+%macro TRANSPOSE_16X16B 2 ; output_transpose, mem
 %if %1 == 0
-    mova          %3, m15
+    mova          %2, m15 ; m7 in 32-bit
 %endif
 
-    ; input in m0-15
+    ; input in m0-7
     punpcklbw    m15, m0, m1
     punpckhbw     m0, m1
     punpcklbw     m1, m2, m3
@@ -103,8 +103,79 @@
     punpckhbw     m2, m3
     punpcklbw     m3, m4, m5
     punpckhbw     m4, m5
-    punpcklbw     m5, m6, m7
-    punpckhbw     m6, m7
+%if ARCH_X86_64
+    SWAP           4, 5, 7
+%else
+ %if %1 == 0
+    mova          m5, %2
+ %else
+    mova          m5, [esp+1*16]
+ %endif
+    mova          %2, m4
+%endif
+    punpcklbw     m4, m6, m5
+    punpckhbw     m6, m5
+
+    ; interleaved in m15,0,1,2,3,7,4,6
+    punpcklwd     m5, m15, m1
+    punpckhwd    m15, m1
+    punpcklwd     m1, m0, m2
+    punpckhwd     m0, m2
+    punpcklwd     m2, m3, m4
+    punpckhwd     m3, m4
+%if ARCH_X86_64
+    SWAP           3, 4, 7
+%else
+    mova          m4, %2
+    mova          %2, m3
+%endif
+    punpcklwd     m3, m4, m6
+    punpckhwd     m4, m6
+
+    ; interleaved in m5,15,1,0,2,7,3,4
+    punpckldq     m6, m5, m2
+    punpckhdq     m5, m2
+%if ARCH_X86_64
+    SWAP           2, 7, 5
+%else
+    mova          m2, %2
+    mova  [esp+1*16], m5
+%endif
+    punpckldq     m5, m15, m2
+    punpckhdq    m15, m2
+    punpckldq     m2, m1, m3
+    punpckhdq     m1, m3
+    punpckldq     m3, m0, m4
+    punpckhdq     m0, m4
+
+%if ARCH_X86_32
+    mova  [esp+0*16], m6
+    mova  [esp+2*16], m5
+    mova  [esp+3*16], m15
+    mova  [esp+4*16], m2
+    mova  [esp+5*16], m1
+    mova  [esp+6*16], m3
+    mova  [esp+7*16], m0
+    mova          m8, [esp+ 8*16]
+    mova          m9, [esp+ 9*16]
+    mova         m10, [esp+10*16]
+ %if %1 == 0
+    mova         m11, [esp+11*16]
+    mova         m12, [esp+12*16]
+    mova         m13, [esp+13*16]
+    mova         m14, [esp+14*16]
+ %else
+    mova         m11, [esp+20*16]
+    mova         m12, [esp+15*16]
+    mova         m13, [esp+16*16]
+    mova         m14, [esp+17*16]
+ %endif
+%endif
+
+    ; input in m8-m15
+%if ARCH_X86_64
+    SWAP           7, 4
+%endif
     punpcklbw     m7, m8, m9
     punpckhbw     m8, m9
     punpcklbw     m9, m10, m11
@@ -111,81 +182,166 @@
     punpckhbw    m10, m11
     punpcklbw    m11, m12, m13
     punpckhbw    m12, m13
-    mova         m13, %3
-    mova          %3, m12
+%if ARCH_X86_64
+    mova         m13, %2
+%else
+ %if %1 == 0
+    mova         m13, [esp+15*16]
+ %else
+    mova         m13, [esp+18*16]
+ %endif
+%endif
+    mova          %2, m12
     punpcklbw    m12, m14, m13
     punpckhbw    m14, m14, m13
 
-    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,14
-    punpcklwd    m13, m15, m1
-    punpckhwd    m15, m1
-    punpcklwd     m1, m0, m2
-    punpckhwd     m0, m2
-    punpcklwd     m2, m3, m5
-    punpckhwd     m3, m5
-    punpcklwd     m5, m4, m6
-    punpckhwd     m4, m6
-    punpcklwd     m6, m7, m9
+    ; interleaved in m7,8,9,10,11,rsp%2,12,14
+    punpcklwd    m13, m7, m9
     punpckhwd     m7, m9
     punpcklwd     m9, m8, m10
     punpckhwd     m8, m10
     punpcklwd    m10, m11, m12
     punpckhwd    m11, m12
-    mova         m12, %3
-    mova          %3, m11
+    mova         m12, %2
+    mova          %2, m11
     punpcklwd    m11, m12, m14
     punpckhwd    m12, m14
 
-    ; interleaved in m13,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
-    punpckldq    m14, m13, m2
-    punpckhdq    m13, m2
-    punpckldq     m2, m15, m3
-    punpckhdq    m15, m3
-    punpckldq     m3, m1, m5
-    punpckhdq     m1, m5
-    punpckldq     m5, m0, m4
-    punpckhdq     m0, m4
-    punpckldq     m4, m6, m10
-    punpckhdq     m6, m10
+    ; interleaved in m13,7,9,8,10,rsp%2,11,12
+    punpckldq    m14, m13, m10
+    punpckhdq    m13, m10
     punpckldq    m10, m9, m11
     punpckhdq     m9, m11
     punpckldq    m11, m8, m12
     punpckhdq     m8, m12
-    mova         m12, %3
-    mova          %3, m8
+    mova         m12, %2
+    mova          %2, m8
     punpckldq     m8, m7, m12
     punpckhdq     m7, m12
 
-    ; interleaved in m14,13,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
-    punpcklqdq   m12, m14, m4
-    punpckhqdq   m14, m4
-    punpcklqdq    m4, m13, m6
-    punpckhqdq   m13, m6
-    punpcklqdq    m6, m2, m8
-    punpckhqdq    m2, m8
-    punpcklqdq    m8, m15, m7
+%if ARCH_X86_32
+    mova [esp+ 8*16], m10
+    mova [esp+ 9*16], m9
+    mova [esp+10*16], m11
+    SWAP           6, 1
+    SWAP           4, 2
+    SWAP           5, 3
+    mova          m6, [esp+0*16]
+    mova          m4, [esp+1*16]
+    mova          m5, [esp+2*16]
+%endif
+
+    ; interleaved in m6,7,5,15,2,1,3,0,14,13,10,9,11,rsp%2,8,7
+    punpcklqdq   m12, m6, m14
+    punpckhqdq    m6, m14
+    punpcklqdq   m14, m4, m13
+    punpckhqdq    m4, m13
+    punpcklqdq   m13, m5, m8
+    punpckhqdq    m5, m8
+%if ARCH_X86_64
+    SWAP           8, 5
+%else
+    mova          m8, [esp+3*16]
+    mova [esp+27*16], m5
+ %define m15 m8
+%endif
+    punpcklqdq    m5, m15, m7
     punpckhqdq   m15, m7
-    punpcklqdq    m7, m3, m10
-    punpckhqdq    m3, m10
+
+%if ARCH_X86_32
+    mova [esp+11*16], m12
+    mova [esp+12*16], m6
+    mova [esp+13*16], m14
+    mova [esp+14*16], m4
+    mova [esp+26*16], m13
+    mova [esp+ 0*16], m5
+    mova [esp+ 1*16], m15
+    mova          m2, [esp+ 4*16]
+    mova         m10, [esp+ 8*16]
+    mova          m1, [esp+ 5*16]
+    mova          m9, [esp+ 9*16]
+    mova          m3, [esp+ 6*16]
+    mova         m11, [esp+10*16]
+    mova          m0, [esp+ 7*16]
+%endif
+
+    punpcklqdq    m7, m2, m10
+    punpckhqdq    m2, m10
     punpcklqdq   m10, m1, m9
     punpckhqdq    m1, m9
-    punpcklqdq    m9, m5, m11
-    punpckhqdq    m5, m11
-    mova         m11, %3
-    mova          %3, m12
+    punpcklqdq    m9, m3, m11
+    punpckhqdq    m3, m11
+    mova         m11, %2
+%if ARCH_X86_32
+ %define m12 m3
+%endif
+    mova          %2, m12
     punpcklqdq   m12, m0, m11
     punpckhqdq    m0, m11
-%if %2 == 0
-    mova         m11, %3
+%if %1 == 1
+    mova         m11, %2
 %endif
 
-    ; interleaved m11,14,4,13,6,2,8,15,7,3,10,1,9,5,12,0
-    SWAP          0, 11, 1, 14, 12, 9, 3, 13, 5, 2, 4, 6, 8, 7, 15
+%if ARCH_X86_64
+    ; interleaved m11,6,14,4,13,8,5,15,7,2,10,1,9,3,12,0
+    SWAP           0, 11, 1, 6, 5, 8, 7, 15
+    SWAP           2, 14, 12, 9
+    SWAP           3, 4, 13
+%else
+ %if %1 == 0
+    mova [esp+15*16], m9
+    mova [esp+17*16], m12
+    mova [esp+18*16], m0
+    mova [esp+28*16], m10
+    mova [esp+29*16], m1
+    mova          m3, [esp+0*16]
+    mova          m4, [esp+1*16]
+    SWAP          m5, m7
+    SWAP          m6, m2
+ %else
+    SWAP           0, 7
+    SWAP           3, 1, 2, 4, 6
+ %endif
+%endif
 %endmacro
 
 %macro FILTER 2 ; width [4/6/8/16], dir [h/v]
+%if ARCH_X86_64
+ %define %%flat8mem [rsp+0*16]
+ %define %%q2mem    [rsp+1*16]
+ %define %%q3mem    [rsp+2*16]
+%else
+ %if %1 == 4 || %1 == 6
+  %define %%p2mem      [esp+ 8*16]
+  %define %%q2mem      [esp+ 9*16]
+  %define %%flat8mem   [esp+10*16]
+ %else
+  %ifidn %2, v
+   %define %%p2mem      [esp+16*16]
+   %define %%q2mem      [esp+ 1*16]
+   %define %%q3mem      [esp+18*16]
+   %define %%flat8mem   [esp+ 0*16]
+   %define %%flat16mem  [esp+20*16]
+  %else
+   %define %%p2mem     [esp+27*16]
+   %define %%q2mem     [esp+28*16]
+   %define %%q3mem     [esp+29*16]
+   %define %%flat8mem  [esp+21*16]
+   %define %%flat16mem [esp+30*16]
+  %endif
+ %endif
+ %xdefine m12reg m12
+%endif
+
+%if ARCH_X86_32
+    lea     stride3q, [strideq*3]
+%endif
     ; load data
 %ifidn %2, v
+%if ARCH_X86_32
+    mov     mstrideq, strideq
+    neg     mstrideq
+%endif
 %if %1 == 4
     lea         tmpq, [dstq+mstrideq*2]
     mova          m3, [tmpq+strideq*0]          ; p1
@@ -197,6 +353,11 @@
     lea         tmpq, [dstq+mstrideq*4]
     ; we load p3 later
 %define %%p3mem [dstq+mstrideq*4]
+ %if ARCH_X86_32
+  %define m13 m0
+  %define m14 m1
+  %define m15 m2
+ %endif
     mova         m13, [tmpq+strideq*1]
     mova          m3, [tmpq+strideq*2]
     mova          m4, [tmpq+stride3q]
@@ -206,8 +367,18 @@
 %if %1 != 6
     mova         m15, [dstq+stride3q]
 %endif
+ %if ARCH_X86_32
+    mova     %%p2mem, m13
+    mova     %%q2mem, m14
+  %define m13 %%p2mem
+  %define m14 %%q2mem
+  %if %1 != 6
+    mova     %%q3mem, m15
+   %define m15 %%q3mem
+  %endif
+ %endif
 %endif
-%else
+%else ; %2 == h
     ; load lines
 %if %1 == 4
     ; transpose 4x16
@@ -288,139 +459,228 @@
     punpcklbw     m3, m1
     punpcklbw     m5, m6
     movq          m6, [tmpq+strideq*0-%1/2]
-    movq          m1, [tmpq+strideq*1-%1/2]
-    movq         m13, [tmpq+strideq*2-%1/2]
+    movq          m0, [tmpq+strideq*1-%1/2]
+    movq          m1, [tmpq+strideq*2-%1/2]
     movq          m2, [tmpq+stride3q -%1/2]
     lea         tmpq, [tmpq+strideq*8]
-    punpcklbw     m6, m1
-    punpcklbw    m13, m2
-    movq         m11, [tmpq+strideq*0-%1/2]
-    movq          m2, [tmpq+strideq*1-%1/2]
-    movq         m14, [tmpq+strideq*2-%1/2]
+    punpcklbw     m6, m0
+    punpcklbw     m1, m2
+    movq          m2, [tmpq+strideq*2-%1/2]
     movq          m0, [tmpq+stride3q -%1/2]
-    punpcklbw    m11, m2
-    punpcklbw    m14, m0
+    punpcklbw     m2, m0
+%if ARCH_X86_64
+    SWAP         m15, m2
+%else
+ %define m15 [esp+3*16]
+    mova         m15, m2
+%endif
+    movq          m0, [tmpq+strideq*0-%1/2]
+    movq          m2, [tmpq+strideq*1-%1/2]
+    punpcklbw     m0, m2
     ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
     ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
     ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
     ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
     ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
-    ; xm11: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
-    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
-    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
-    punpcklwd   m15, m7, m4
-    punpckhwd    m7, m4
-    punpcklwd    m4, m3, m5
-    punpckhwd    m3, m5
-    punpcklwd    m5, m6, m13
-    punpckhwd    m6, m13
-    punpcklwd   m13, m11, m14
-    punpckhwd   m11, m14
-    ; xm15: A0-3,B0-3,C0-3,D0-3
+    ; xm0: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
+    ; xm1: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
+    ; xm2: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
+    punpcklwd     m2, m7, m4
+    punpckhwd     m7, m4
+    punpcklwd     m4, m3, m5
+    punpckhwd     m3, m5
+    punpcklwd     m5, m6, m1
+    punpckhwd     m6, m1
+    punpcklwd     m1, m0, m15
+    punpckhwd     m0, m15
+%if ARCH_X86_64
+    SWAP         m15, m0
+%else
+    mova         m15, m0
+%endif
+    ; xm2: A0-3,B0-3,C0-3,D0-3
     ; xm7: E0-3,F0-3,G0-3,H0-3
     ; xm4: A8-11,B8-11,C8-11,D8-11
     ; xm3: E8-11,F8-11,G8-11,H8-11
     ; xm5: A4-7,B4-7,C4-7,D4-7
     ; xm6: E4-7,F4-7,G4-7,H4-7
-    ; xm13: A12-15,B12-15,C12-15,D12-15
-    ; xm11: E12-15,F12-15,G12-15,H12-15
-    punpckldq   m14, m15, m5
-    punpckhdq   m15, m5
-    punpckldq    m5, m7, m6
+    ; xm1: A12-15,B12-15,C12-15,D12-15
+    ; xm0: E12-15,F12-15,G12-15,H12-15
+    punpckldq     m0, m2, m5
+    punpckhdq     m2, m5
+    punpckldq     m5, m7, m6
 %if %1 != 6
-    punpckhdq    m7, m6
+    punpckhdq     m7, m6
 %endif
-    punpckldq    m6, m4, m13
-    punpckhdq    m4, m13
-    punpckldq   m13, m3, m11
+    punpckldq     m6, m4, m1
+    punpckhdq     m4, m1
+    punpckldq     m1, m3, m15
 %if %1 != 6
-    punpckhdq    m3, m3, m11
+    punpckhdq     m3, m15
+ %if ARCH_X86_64
+    SWAP         m15, m3
+ %else
+    mova         m15, m3
+ %endif
 %endif
-    ; xm14: A0-7,B0-7
-    ; xm15: C0-7,D0-7
+    ; xm0: A0-7,B0-7
+    ; xm2: C0-7,D0-7
     ; xm5: E0-7,F0-7
     ; xm7: G0-7,H0-7
     ; xm6: A8-15,B8-15
     ; xm4: C8-15,D8-15
-    ; xm13: E8-15,F8-15
+    ; xm1: E8-15,F8-15
     ; xm3: G8-15,H8-15
-    punpcklqdq  m11, m14, m6
-    punpckhqdq  m14, m6
-    punpckhqdq   m6, m15, m4
-    punpcklqdq  m15, m4
-    punpcklqdq   m4, m5, m13
-    punpckhqdq   m5, m5, m13
+    punpcklqdq    m3, m0, m6
+    punpckhqdq    m0, m6
+    punpckhqdq    m6, m2, m4
+    punpcklqdq    m2, m4
+    punpcklqdq    m4, m5, m1
+    punpckhqdq    m5, m1
 %if %1 == 8
-    punpcklqdq  m13, m7, m3
-    punpckhqdq   m7, m7, m3
-    ; xm11: A0-15
-    ; xm14: B0-15
-    ; xm15: C0-15
+    punpcklqdq    m1, m7, m15
+    punpckhqdq    m7, m15
+    ; xm3: A0-15
+    ; xm0: B0-15
+    ; xm2: C0-15
     ; xm6: D0-15
     ; xm4: E0-15
     ; xm5: F0-15
-    ; xm13: G0-15
+    ; xm1: G0-15
     ; xm7: H0-15
-    SWAP         13, 14
-    SWAP          3, 15, 7
-    SWAP          5, 4, 6
-    ; 11,14,15,6,4,5,13,7 -> 11,13,3,4,5,6,14,15
+%if ARCH_X86_64
+    SWAP          11, 3, 2
+    SWAP          13, 0
+    SWAP           6, 5, 4
+    SWAP          14, 1
+    SWAP          15, 7
+    ; 3,0,2,6,4,5,1,7 -> 11,13,3,4,5,6,14,15
     mova [rsp+21*16], m11
-%define %%p3mem [rsp+21*16]
+ %define %%p3mem [rsp+21*16]
 %else
-    SWAP         13, 11
-    SWAP         14, 5, 6, 4, 15, 3
-    ; 11,14,15,6,4,5 -> 13,3,4,5,6,14
+ %define m11 [esp+26*16]
+ %define m13 [esp+27*16]
+ %define m14 [esp+28*16]
+ %define m15 [esp+29*16]
+    mova         m11, m3
+    mova         m13, m0
+    SWAP           3, 2
+    SWAP           6, 5, 4
+    mova         m14, m1
+    mova         m15, m7
+ %define %%p3mem [esp+26*16]
 %endif
 %else
+ %if ARCH_X86_64
+    SWAP          13, 3, 0
+    SWAP          14, 5, 6, 4, 2
+    ; 3,0,2,6,4,5 -> 13,3,4,5,6,14
+ %else
+  %define m13 %%p2mem
+  %define m14 %%q2mem
+    mova         m13, m3
+    mova         m14, m5
+    SWAP           3, 0
+    SWAP           5, 6, 4, 2
+    ; 0,2,6,4 -> 3,4,5,6
+ %endif
+%endif
+%else
+%if ARCH_X86_64
     mova [rsp+20*16], m12
+%endif
     ; load and 16x16 transpose. We only use 14 pixels but we'll need the
     ; remainder at the end for the second transpose
-    movu         xm0, [dstq+strideq*0-8]
-    movu         xm1, [dstq+strideq*1-8]
-    movu         xm2, [dstq+strideq*2-8]
-    movu         xm3, [dstq+stride3q -8]
+%if ARCH_X86_32
+ %xdefine m8  m0
+ %xdefine m9  m1
+ %xdefine m10 m2
+ %xdefine m11 m3
+ %xdefine m12 m4
+ %xdefine m13 m5
+ %xdefine m14 m6
+ %xdefine m15 m7
+    lea         tmpq, [dstq+strideq*8]
+    movu          m8, [tmpq+strideq*0-8]
+    movu          m9, [tmpq+strideq*1-8]
+    movu         m10, [tmpq+strideq*2-8]
+    movu         m11, [tmpq+stride3q -8]
+    lea         tmpq, [tmpq+strideq*4]
+    movu         m12, [tmpq+strideq*0-8]
+    movu         m13, [tmpq+strideq*1-8]
+    movu         m14, [tmpq+strideq*2-8]
+    movu         m15, [tmpq+stride3q -8]
+    mova [esp+ 8*16], m8
+    mova [esp+ 9*16], m9
+    mova [esp+10*16], m10
+    mova [esp+11*16], m11
+    mova [esp+12*16], m12
+    mova [esp+13*16], m13
+    mova [esp+14*16], m14
+    mova [esp+15*16], m15
+%endif
+    movu          m0, [dstq+strideq*0-8]
+    movu          m1, [dstq+strideq*1-8]
+    movu          m2, [dstq+strideq*2-8]
+    movu          m3, [dstq+stride3q -8]
     lea         tmpq, [dstq+strideq*4]
-    movu         xm4, [tmpq+strideq*0-8]
-    movu         xm5, [tmpq+strideq*1-8]
-    movu         xm6, [tmpq+strideq*2-8]
-    movu         xm7, [tmpq+stride3q -8]
+    movu          m4, [tmpq+strideq*0-8]
+    movu          m5, [tmpq+strideq*1-8]
+    movu          m6, [tmpq+strideq*2-8]
+    movu          m7, [tmpq+stride3q -8]
     lea         tmpq, [tmpq+strideq*4]
-    movu         xm8, [tmpq+strideq*0-8]
-    movu         xm9, [tmpq+strideq*1-8]
-    movu        xm10, [tmpq+strideq*2-8]
-    movu        xm11, [tmpq+stride3q -8]
+%if ARCH_X86_64
+    movu          m8, [tmpq+strideq*0-8]
+    movu          m9, [tmpq+strideq*1-8]
+    movu         m10, [tmpq+strideq*2-8]
+    movu         m11, [tmpq+stride3q -8]
     lea         tmpq, [tmpq+strideq*4]
-    movu        xm12, [tmpq+strideq*0-8]
-    movu        xm13, [tmpq+strideq*1-8]
-    movu        xm14, [tmpq+strideq*2-8]
-    movu        xm15, [tmpq+stride3q -8]
+    movu         m12, [tmpq+strideq*0-8]
+    movu         m13, [tmpq+strideq*1-8]
+    movu         m14, [tmpq+strideq*2-8]
+    movu         m15, [tmpq+stride3q -8]
+%endif
 
-    TRANSPOSE_16X16B 0, 1, [rsp+11*16]
-    mova  [rsp+12*16], m1
-    mova  [rsp+13*16], m2
-    mova  [rsp+14*16], m3
-    mova  [rsp+15*16], m12
-    mova  [rsp+16*16], m13
-    mova  [rsp+17*16], m14
-    mova  [rsp+18*16], m15
+%if ARCH_X86_64
+    TRANSPOSE_16X16B 0, [rsp+11*16]
+    mova [rsp+12*16], m1
+    mova [rsp+13*16], m2
+    mova [rsp+14*16], m3
+    mova [rsp+15*16], m12
+    mova [rsp+16*16], m13
+    mova [rsp+17*16], m14
+    mova [rsp+18*16], m15
     ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
-    SWAP           12, 4, 7
-    SWAP           13, 5, 8
-    SWAP            3, 6, 9
-    SWAP           10, 14
-    SWAP           11, 15
-    mova  [rsp+21*16], m12
-%define %%p3mem [rsp+21*16]
-    mova          m12, [rsp+20*16]
+    SWAP          12, 4, 7
+    SWAP          13, 5, 8
+    SWAP           3, 6, 9
+    SWAP          10, 14
+    SWAP          11, 15
+    mova [rsp+21*16], m12
+ %define %%p3mem [rsp+21*16]
+    mova         m12, [rsp+20*16]
+%else
+    TRANSPOSE_16X16B 0, [esp+16*16]
+ %define %%p3mem [esp+26*16]
+ %define m11 %%p3mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
 %endif
-%endif
+%endif ; if 4 elif 6 or 8 else 16
+%endif ; if v else h
 
     ; load L/E/I/H
+%if ARCH_X86_32
+    mov    l_strideq, l_stridem
+%endif
 %ifidn %2, v
     movu          m1, [lq]
     movu          m0, [lq+l_strideq]
 %else
+ %if ARCH_X86_32
+    lea   l_stride3q, [l_strideq*3]
+ %endif
     movq         xm1, [lq]
     movq         xm2, [lq+l_strideq*2]
     movhps       xm1, [lq+l_strideq]
@@ -427,88 +687,148 @@
     movhps       xm2, [lq+l_stride3q]
     shufps        m0, m1, m2, q3131
     shufps        m1, m2, q2020
+ %if ARCH_X86_32
+    lea     stride3q, [strideq*3]
+ %endif
 %endif
+
+%if ARCH_X86_32
+ %ifidn %2, v
+    mov         lutd, lutm
+ %endif
+%endif
     pxor          m2, m2
-    pcmpeqb      m10, m2, m0
-    pand          m1, m10
+    pcmpeqb       m7, m2, m0
+    pand          m1, m7
     por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
-    pshufb        m0, [pb_4x0_4x4_4x8_4x12]     ; l[x][1]
-    pcmpeqb      m10, m2, m0                    ; !L
-    psrlq         m2, m0, [lutq+128]
-    pand          m2, [pb_63]
-    pminub        m2, minlvl
-    pmaxub        m2, [pb_1]                    ; I
-    pand          m1, m0, [pb_240]
+    pshufb        m0, [PIC_sym(pb_4x0_4x4_4x8_4x12)] ; l[x][1]
+    pcmpeqb       m2, m0                        ; !L
+    psrlq         m7, m0, [lutq+128]
+    pand          m7, [PIC_sym(pb_63)]
+    pminub        m7, minlvl
+    pmaxub        m7, [PIC_sym(pb_1)]           ; I
+    pand          m1, m0, [PIC_sym(pb_240)]
     psrlq         m1, 4                         ; H
-    paddb         m0, [pb_2]
+    paddb         m0, [PIC_sym(pb_2)]
     paddb         m0, m0
-    paddb         m0, m2                        ; E
-    pxor          m1, [pb_128]
-    pxor          m2, [pb_128]
-    pxor          m0, [pb_128]
+    paddb         m0, m7                        ; E
+    pxor          m1, [PIC_sym(pb_128)]
+    pxor          m7, [PIC_sym(pb_128)]
+    pxor          m0, [PIC_sym(pb_128)]
+    SWAP           2, 7
 
-    ABSSUB        m8, m3, m4, m9                ; abs(p1-p0)
-    pmaxub        m8, m10
-    ABSSUB        m9, m5, m6, m10               ; abs(q1-q0)
-    pmaxub        m8, m9
+%if ARCH_X86_64
+    SWAP           0, 8
+    SWAP           2, 10
+%else
+ %ifidn %2, v
+    mov     mstrideq, strideq
+    neg     mstrideq
+  %if %1 == 4
+    lea         tmpq, [dstq+mstrideq*2]
+  %elif %1 == 6 || %1 == 8
+    lea         tmpq, [dstq+mstrideq*4]
+  %endif
+ %endif
+    mova  [esp+3*16], m0
+    mova  [esp+4*16], m2
+%endif
+
+    ABSSUB        m0, m3, m4, m2                ; abs(p1-p0)
+    pmaxub        m0, m7
+    ABSSUB        m2, m5, m6, m7                ; abs(q1-q0)
+    pmaxub        m0, m2
 %if %1 == 4
-    pxor          m8, [pb_128]
-    pcmpgtb       m7, m8, m1                    ; hev
+    pxor          m0, [PIC_sym(pb_128)]
+    pcmpgtb       m7, m0, m1                    ; hev
+ %if ARCH_X86_64
+    SWAP           7, 11
+ %else
+    mova  [esp+5*16], m7
+ %endif
 %else
-    pxor          m7, m8, [pb_128]
+    pxor          m7, m0, [PIC_sym(pb_128)]
     pcmpgtb       m7, m1                        ; hev
+%if ARCH_X86_64
+    SWAP           7, 11
+%else
+    mova  [esp+5*16], m7
+%endif
 
 %if %1 == 6
-    ABSSUB        m9, m13, m4, m10              ; abs(p2-p0)
-    pmaxub        m9, m8
+    ABSSUB        m1, m13, m4, m7               ; abs(p2-p0)
+    pmaxub        m1, m0
 %else
-    mova         m11, %%p3mem
-    ABSSUB        m9, m11, m4, m10              ; abs(p3-p0)
-    pmaxub        m9, m8
-    ABSSUB       m10, m13, m4, m11              ; abs(p2-p0)
-    pmaxub        m9, m10
+    mova          m2, %%p3mem
+    ABSSUB        m1, m2, m4, m7                ; abs(p3-p0)
+    pmaxub        m1, m0
+    ABSSUB        m7, m13, m4, m2               ; abs(p2-p0)
+    pmaxub        m1, m7
 %endif
-    ABSSUB       m10, m5,  m14, m11             ; abs(q2-q0)
-    pmaxub        m9, m10
+    ABSSUB        m7, m5, m14, m2               ; abs(q2-q0)
+    pmaxub        m1, m7
 %if %1 != 6
-    ABSSUB       m10, m5,  m15, m11             ; abs(q3-q0)
-    pmaxub        m9, m10
+    ABSSUB        m7, m5, m15, m2               ; abs(q3-q0)
+    pmaxub        m1, m7
 %endif
-    pxor          m9, [pb_128]
-    pcmpgtb       m9, [pb_129]                  ; !flat8in
+    pxor          m1, [PIC_sym(pb_128)]
+    pcmpgtb       m1, [PIC_sym(pb_129)]         ; !flat8in
+%if ARCH_X86_64
+    SWAP           1, 9
+%else
+    mova  [esp+6*16], m1
+%endif
 
 %if %1 == 6
-    ABSSUB       m10, m13, m3,  m1              ; abs(p2-p1)
+    ABSSUB        m7, m13, m3, m1               ; abs(p2-p1)
 %else
-    mova         m11, %%p3mem
-    ABSSUB       m10, m11, m13, m1              ; abs(p3-p2)
-    ABSSUB       m11, m13, m3,  m1              ; abs(p2-p1)
-    pmaxub       m10, m11
-    ABSSUB       m11, m14, m15, m1              ; abs(q3-q2)
-    pmaxub       m10, m11
+    mova          m2, %%p3mem
+    ABSSUB        m7, m2, m13, m1               ; abs(p3-p2)
+    ABSSUB        m2, m13, m3, m1               ; abs(p2-p1)
+    pmaxub        m7, m2
+    ABSSUB        m2, m14, m15, m1              ; abs(q3-q2)
+    pmaxub        m7, m2
 %endif
-    ABSSUB       m11, m14, m6,  m1              ; abs(q2-q1)
-    pmaxub       m10, m11
-    pand         m11, m12, mask1
-    pcmpeqd      m11, m12
-    pand         m10, m11                       ; only apply fm-wide to wd>4 blocks
-    pmaxub        m8, m10
+    ABSSUB        m2, m14, m6,  m1              ; abs(q2-q1)
+    pmaxub        m7, m2
+%if ARCH_X86_32
+ %define m12 m1
+    mova         m12, maskmem
+%endif
+    pand          m2, m12, mask1
+    pcmpeqd       m2, m12
+    pand          m7, m2                        ; only apply fm-wide to wd>4 blocks
+    pmaxub        m0, m7
 
-    pxor          m8, [pb_128]
+    pxor          m0, [PIC_sym(pb_128)]
+%endif ; %if %1 == 4 else
+%if ARCH_X86_64
+    SWAP           2, 10
+    pcmpgtb       m0, m2
+%else
+    pcmpgtb       m0, [esp+4*16]
 %endif
-    pcmpgtb       m8, m2
 
-    ABSSUB       m10, m3, m6, m11               ; abs(p1-q1)
-    ABSSUB       m11, m4, m5, m2                ; abs(p0-q0)
-    paddusb      m11, m11
-    pand         m10, [pb_254]
-    psrlq        m10, 1
-    paddusb      m10, m11                       ; abs(p0-q0)*2+(abs(p1-q1)>>1)
-    pxor         m10, [pb_128]
-    pcmpgtb      m10, m0                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
-    por           m8, m10
+    ABSSUB        m1, m3, m6, m7                ; abs(p1-q1)
+    ABSSUB        m7, m4, m5, m2                ; abs(p0-q0)
+    paddusb       m7, m7
+    pand          m1, [PIC_sym(pb_254)]
+    psrlq         m1, 1
+    paddusb       m1, m7                        ; abs(p0-q0)*2+(abs(p1-q1)>>1)
+    pxor          m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+    pcmpgtb       m1, m8                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
+%else
+    pcmpgtb       m1, [esp+3*16]
+%endif
+    por           m0, m1
 
 %if %1 == 16
+%if ARCH_X86_64
+    SWAP           0, 8
+%else
+    mova  [esp+3*16], m0
+%endif
 %ifidn %2, v
     lea         tmpq, [dstq+mstrideq*8]
     mova          m0, [tmpq+strideq*1]
@@ -521,7 +841,7 @@
 %else
     mova          m0, [rsp+13*16]
 %endif
-    ABSSUB        m2, m0, m4, m10
+    ABSSUB        m2, m0, m4, m7
     pmaxub        m1, m2
 %ifidn %2, v
     mova          m0, [tmpq+stride3q]
@@ -528,7 +848,7 @@
 %else
     mova          m0, [rsp+14*16]
 %endif
-    ABSSUB        m2, m0, m4, m10
+    ABSSUB        m2, m0, m4, m7
     pmaxub        m1, m2
 %ifidn %2, v
     lea         tmpq, [dstq+strideq*4]
@@ -536,7 +856,7 @@
 %else
     mova          m0, [rsp+15*16]
 %endif
-    ABSSUB        m2, m0, m5, m10
+    ABSSUB        m2, m0, m5, m7
     pmaxub        m1, m2
 %ifidn %2, v
     mova          m0, [tmpq+strideq*1]
@@ -543,7 +863,7 @@
 %else
     mova          m0, [rsp+16*16]
 %endif
-    ABSSUB        m2, m0, m5, m10
+    ABSSUB        m2, m0, m5, m7
     pmaxub        m1, m2
 %ifidn %2, v
     mova          m0, [tmpq+strideq*2]
@@ -550,84 +870,133 @@
 %else
     mova          m0, [rsp+17*16]
 %endif
-    ABSSUB        m2, m0, m5, m10
+    ABSSUB        m2, m0, m5, m7
     pmaxub        m1, m2
-    pxor          m1, [pb_128]
-    pcmpgtb       m1, [pb_129]                  ; !flat8out
+    pxor          m1, [PIC_sym(pb_128)]
+    pcmpgtb       m1, [PIC_sym(pb_129)]         ; !flat8out
+%if ARCH_X86_64
     por           m1, m9                        ; !flat8in | !flat8out
-    pand         m10, m12, mask2
-    pcmpeqd      m10, m12
-    pandn         m1, m10                       ; flat16
-    pandn        m10, m8, m1                    ; flat16 & fm
-    SWAP           1, 10
+%else
+    por           m1, [esp+6*16]
+ %define m12 m7
+    mova         m12, maskmem
+%endif
+    pand          m2, m12, mask2
+    pcmpeqd       m2, m12
+    pandn         m1, m2                        ; flat16
+%if ARCH_X86_64
+    pandn         m2, m8, m1                    ; flat16 & fm
+%else
+    pandn         m2, [esp+3*16], m1            ; flat16 & fm
+    mova %%flat16mem, m2
+%endif
+    SWAP           1, 2
 
     pand          m2, m12, mask1
     pcmpeqd       m2, m12
-    pandn         m9, m2                        ; flat8in
+%if ARCH_X86_64
+    pandn         m9, m2                        ; flat8in
     pandn         m2, m8, m9
     SWAP           2, 9
+%else
+    pandn         m0, [esp+6*16], m2
+    pandn         m2, [esp+3*16], m0
+    mova  [esp+6*16], m2
+%endif
     pand          m2, m12, mask0
     pcmpeqd       m2, m12
+%if ARCH_X86_64
     pandn         m8, m2
     pandn         m2, m9, m8                    ; fm & !flat8 & !flat16
     SWAP           2, 8
     pandn         m2, m1, m9                    ; flat8 & !flat16
     SWAP           2, 9
+    SWAP           0, 8
+    SWAP           1, 10
+%else
+    pandn         m0, [esp+3*16], m2
+    pandn         m2, [esp+6*16], m0
+    SWAP           2, 0
+    pandn         m2, m1, [esp+6*16]
+    mova  %%flat8mem, m2
+%endif
 %elif %1 != 4
+ %if ARCH_X86_64
+    SWAP           1, 9
+ %else
+  %define m12 m7
+    mova         m12, maskmem
+    mova          m1, [esp+6*16]
+ %endif
     pand          m2, m12, mask1
     pcmpeqd       m2, m12
-    pandn         m9, m2
-    pandn         m2, m8, m9                    ; flat8 & fm
-    pand          m0, m12, mask0
-    pcmpeqd       m0, m12
-    pandn         m8, m0
-    pandn         m9, m2, m8                    ; fm & !flat8
-    SWAP           9, 2, 8
+    pandn         m1, m2
+    pandn         m2, m0, m1                    ; flat8 & fm
+    pand          m1, m12, mask0
+    pcmpeqd       m1, m12
+    pandn         m0, m1
+    pandn         m1, m2, m0                    ; fm & !flat8
+    SWAP           1, 2, 0
+ %if ARCH_X86_64
+    SWAP           1, 9
+ %else
+    mova  %%flat8mem, m1
+ %endif
 %else
-    pand          m0, m12, mask0
-    pcmpeqd       m0, m12
-    pandn         m8, m0                        ; fm
+%if ARCH_X86_32
+ %define m12 m1
+    mova         m12, maskmem
 %endif
+    pand          m2, m12, mask0
+    pcmpeqd       m2, m12
+    pandn         m0, m2                        ; fm
+%endif
 
     ; short filter
 
-    mova          m0, [pb_128]
-    mova          m2, [pb_16]
-    pxor          m3, m0
-    pxor          m6, m0
-    pxor          m4, m0
-    pxor          m5, m0
-    psubsb       m10, m3, m6                    ; iclip_diff(p1-q1)
-    pand         m10, m7                        ; f=iclip_diff(p1-q1)&hev
-    psubsb       m11, m5, m4
-    paddsb       m10, m11
-    paddsb       m10, m11
-    paddsb       m10, m11                       ; f=iclip_diff(3*(q0-p0)+f)
-    pand          m8, m10                       ; f&=fm
-    paddsb       m10, m8, [pb_3]
-    paddsb        m8, [pb_4]
-    pand         m10, [pb_248]
-    pand          m8, [pb_248]
-    psrlq        m10, 3
-    psrlq         m8, 3
-    pxor         m10, m2
-    pxor          m8, m2
-    psubb        m10, m2                        ; f2
-    psubb         m8, m2                        ; f1
-    paddsb        m4, m10
-    psubsb        m5, m8
-    pxor          m4, m0
-    pxor          m5, m0
+    mova          m1, [PIC_sym(pb_128)]
+%if ARCH_X86_64
+    SWAP           7, 11
+%else
+    mova          m7, [esp+5*16]
+%endif
+    pxor          m3, m1
+    pxor          m6, m1
+    pxor          m4, m1
+    pxor          m5, m1
+    psubsb        m1, m3, m6                    ; iclip_diff(p1-q1)
+    pand          m1, m7                        ; f=iclip_diff(p1-q1)&hev
+    psubsb        m2, m5, m4
+    paddsb        m1, m2
+    paddsb        m1, m2
+    paddsb        m1, m2                        ; f=iclip_diff(3*(q0-p0)+f)
+    mova          m2, [PIC_sym(pb_16)]
+    pand          m0, m1                        ; f&=fm
+    paddsb        m1, m0, [PIC_sym(pb_3)]
+    paddsb        m0, [PIC_sym(pb_4)]
+    pand          m1, [PIC_sym(pb_248)]
+    pand          m0, [PIC_sym(pb_248)]
+    psrlq         m1, 3
+    psrlq         m0, 3
+    pxor          m1, m2
+    pxor          m0, m2
+    psubb         m1, m2                        ; f2
+    psubb         m0, m2                        ; f1
+    mova          m2, [PIC_sym(pb_128)]
+    paddsb        m4, m1
+    psubsb        m5, m0
+    pxor          m4, m2
+    pxor          m5, m2
 
-    pxor          m8, m0
-    pxor         m10, m10
-    pavgb         m8, m10                       ; f=(f1+1)>>1
-    psubb         m8, [pb_64]
-    pandn         m7, m8                        ; f&=!hev
+    pxor          m0, m2
+    pxor          m1, m1
+    pavgb         m0, m1                        ; f=(f1+1)>>1
+    psubb         m0, [PIC_sym(pb_64)]
+    pandn         m7, m0                        ; f&=!hev
     paddsb        m3, m7
     psubsb        m6, m7
-    pxor          m3, m0
-    pxor          m6, m0
+    pxor          m3, m2
+    pxor          m6, m2
 
 %if %1 == 16
     ; flat16 filter
@@ -642,276 +1011,362 @@
     mova          m7, [rsp+14*16]
 %endif
 
-    mova  [rsp+0*16], m9
-    mova  [rsp+1*16], m14
-    mova  [rsp+2*16], m15
+%if ARCH_X86_64
+    SWAP           1, 10
+    mova  %%flat8mem, m9
+    mova     %%q2mem, m14
+    mova     %%q3mem, m15
+    SWAP           0, 8
+    SWAP           1, 9
+%else
+ %ifidn %2, v
+    mova [esp+17*16], m0
+    mova [esp+19*16], m3
+    mova [esp+21*16], m4
+    mova [esp+22*16], m5
+    mova [esp+23*16], m6
+  %xdefine m11 m3
+  %xdefine m14 m4
+  %xdefine m15 m5
+  %xdefine m10 m6
+  %define m13 %%p2mem
+  %define m8  [esp+17*16]
+  %define m9  %%flat16mem
+  %define m3  [esp+19*16]
+  %define m4  [esp+21*16]
+  %define m5  [esp+22*16]
+  %define m6  [esp+23*16]
+ %else
+    mova [esp+31*16], m0
+    mova [esp+32*16], m3
+    mova [esp+33*16], m4
+    mova [esp+34*16], m5
+    mova [esp+35*16], m6
+  %xdefine m11 m3
+  %xdefine m14 m4
+  %xdefine m15 m5
+  %xdefine m10 m6
+  %define m13 %%p2mem
+  %define m8  [esp+31*16]
+  %define m9  %%flat16mem
+  %define m3  [esp+32*16]
+  %define m4  [esp+33*16]
+  %define m5  [esp+34*16]
+  %define m6  [esp+35*16]
+ %endif
+%endif
 
     ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
     ; write -6
     mova         m11, %%p3mem
+%if ARCH_X86_64
+    punpcklbw    m14, m8, m11
+    punpckhbw    m15, m8, m11
+%else
     punpcklbw    m14, m0, m11
     punpckhbw    m15, m0, m11
+%endif
 %ifidn %2, v
     mova  [rsp+5*16], m11
 %endif
-    pmaddubsw    m10, m14, [pb_7_1]
-    pmaddubsw    m11, m15, [pb_7_1]             ; p6*7+p3
-    punpcklbw     m8, m2, m7
-    punpckhbw     m9, m2, m7
-    pmaddubsw     m8, [pb_2]
-    pmaddubsw     m9, [pb_2]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3
-    punpcklbw     m8, m13, m3
-    punpckhbw     m9, m13, m3
-    pmaddubsw     m8, [pb_1]
-    pmaddubsw     m9, [pb_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1
-    punpcklbw     m8, m4, m5
-    punpckhbw     m9, m4, m5
-    pmaddubsw     m8, [pb_1]
-    pmaddubsw     m9, [pb_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
-    pandn         m9, m1, m2
-    por           m8, m9
+    pmaddubsw    m10, m14, [PIC_sym(pb_7_1)]
+    pmaddubsw    m11, m15, [PIC_sym(pb_7_1)]    ; p6*7+p3
+    punpcklbw     m0, m2, m7
+    punpckhbw     m1, m2, m7
+    pmaddubsw     m0, [PIC_sym(pb_2)]
+    pmaddubsw     m1, [PIC_sym(pb_2)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3
+    punpcklbw     m0, m13, m3
+    punpckhbw     m1, m13, m3
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3+p2+p1
+    punpcklbw     m0, m4, m5
+    punpckhbw     m1, m4, m5
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m2
+    por           m0, m1
 %ifidn %2, v
-    mova [tmpq+strideq*2], m8                   ; p5
+    mova [tmpq+strideq*2], m0                   ; p5
 %else
-    mova [rsp+13*16], m8
+    mova [rsp+13*16], m0
 %endif
 
     ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
     ; write -5
-    pmaddubsw    m14, [pb_m1_1]
-    pmaddubsw    m15, [pb_m1_1]
+    pmaddubsw    m14, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m15, [PIC_sym(pb_m1_1)]
     paddw        m10, m14
     paddw        m11, m15                       ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
-    punpcklbw     m8, m0, m6
-    punpckhbw     m9, m0, m6
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    mova  [rsp+3*16], m8
-    mova  [rsp+4*16], m9
-    paddw        m10, m8
-    paddw        m11, m9                        ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
-    pandn         m9, m1, m7
-    por           m8, m9
+    punpcklbw     m0, m8, m6
+    punpckhbw     m1, m8, m6
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova  [rsp+3*16], m0
+    mova  [rsp+4*16], m1
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m7
+    por           m0, m1
 %ifidn %2, v
-    mova [tmpq+stride3q], m8                    ; p4
+    mova [tmpq+stride3q], m0                    ; p4
 %else
-    mova [rsp+14*16], m8
+    mova [rsp+14*16], m0
 %endif
 
     ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
     ; write -4
-    mova         m14, [rsp+1*16]
-    punpcklbw     m8, m0, m13
-    punpckhbw     m9, m0, m13
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
-    punpcklbw     m8, m2, m14
+    mova         m14, %%q2mem
+    punpcklbw     m0, m8, m13
+    punpckhbw     m1, m8, m13
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
+    punpcklbw     m0, m2, m14
     punpckhbw     m2, m14
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m2, [pb_m1_1]
-    mova  [rsp+1*16], m8
-    paddw        m10, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m2, [PIC_sym(pb_m1_1)]
+    mova  [rsp+1*16], m0
+    paddw        m10, m0
     paddw        m11, m2                        ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
-    pandn         m9, m1, %%p3mem
-    por           m8, m9
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, %%p3mem
+    por           m0, m1
 %ifidn %2, v
-    mova [tmpq+strideq*4], m8                   ; p3
+    mova [tmpq+strideq*4], m0                   ; p3
 %else
-    mova [rsp+19*16], m8
+    mova [rsp+19*16], m0
 %endif
 
     ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
     ; write -3
-    mova         m15, [rsp+2*16]
-    punpcklbw     m8, m0, m3
-    punpckhbw     m9, m0, m3
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
-    punpcklbw     m8, m7, m15
+    mova         m15, %%q3mem
+    punpcklbw     m0, m8, m3
+    punpckhbw     m1, m8, m3
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
+    punpcklbw     m0, m7, m15
     punpckhbw     m7, m15
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m7, [pb_m1_1]
-    mova  [rsp+2*16], m8
-    paddw        m10, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
+    mova  [rsp+2*16], m0
+%if ARCH_X86_32
+ %ifidn %2, v
+    mova [esp+24*16], m7
+ %else
+    mova [esp+36*16], m7
+ %endif
+%endif
+    paddw        m10, m0
     paddw        m11, m7                        ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
-    pandn         m9, m1, m13
-    por           m8, m9
-    mova  [rsp+6*16], m8                        ; don't clobber p2/m13 since we need it in F
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m13
+    por           m0, m1
+    mova  [rsp+6*16], m0                        ; don't clobber p2/m13 since we need it in F
 
     ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
     ; write -2
-    punpcklbw     m8, m0, m4
-    punpckhbw     m9, m0, m4
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+    punpcklbw     m0, m8, m4
+    punpckhbw     m1, m8, m4
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
+%if ARCH_X86_64
+    SWAP           7, 8
+%endif
 %ifidn %2, v
-    mova          m9, [dstq+strideq*4]          ; q4
-    mova          m0, [rsp+5*16]                ; (pre-filter) p3
+    mova          m1, [dstq+strideq*4]          ; q4
+    mova          m7, [rsp+5*16]                ; (pre-filter) p3
 %else
-    mova          m9, [rsp+15*16]
-    mova          m0, %%p3mem                   ; (pre-filter) p3
+    mova          m1, [rsp+15*16]
+    mova          m7, %%p3mem                   ; (pre-filter) p3
 %endif
-    punpcklbw     m8, m9, m0
-    punpckhbw     m9, m9, m0
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    mova  [rsp+7*16], m8
-    mova  [rsp+5*16], m9
-    psubw        m10, m8
-    psubw        m11, m9                        ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
-    pandn         m9, m1, m3
-    por           m8, m9
-    mova  [rsp+8*16], m8                        ; don't clobber p1/m3 since we need it in G
+    punpcklbw     m0, m1, m7
+    punpckhbw     m1, m1, m7
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova  [rsp+7*16], m0
+    mova  [rsp+5*16], m1
+    psubw        m10, m0
+    psubw        m11, m1                        ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m3
+    por           m0, m1
+    mova  [rsp+8*16], m0                        ; don't clobber p1/m3 since we need it in G
 
     ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
     ; write -1
 %ifidn %2, v
-    mova          m0, [tmpq+strideq*1]          ; p6
+    mova          m7, [tmpq+strideq*1]          ; p6
     lea         tmpq, [dstq+strideq*4]
-    mova          m9, [tmpq+strideq*1]          ; q5
+    mova          m1, [tmpq+strideq*1]          ; q5
 %else
-    mova          m0, [rsp+12*16]               ; p6
-    mova          m9, [rsp+16*16]
+    mova          m7, [rsp+12*16]               ; p6
+    mova          m1, [rsp+16*16]
 %endif
-    punpcklbw     m8, m0, m5
-    punpckhbw     m0, m5
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m0, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m0                        ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
-    punpcklbw     m0, m13, m9
-    punpckhbw    m13, m13, m9
-    SWAP           9, 13
-    mova         m13, [rsp+6*16]
-    pmaddubsw     m0, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    mova [rsp+ 9*16], m0
-    mova [rsp+10*16], m9
+    punpcklbw     m0, m7, m5
+    punpckhbw     m7, m5
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
     paddw        m10, m0
-    paddw        m11, m9                        ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
-    pmulhrsw      m0, m10, [pw_2048]
-    pmulhrsw      m8, m11, [pw_2048]
-    packuswb      m0, m8
-    pand          m0, m1
-    pandn         m8, m1, m4
-    por           m0, m8
-    mova  [rsp+6*16], m0                        ; don't clobber p0/m4 since we need it in H
+    paddw        m11, m7                        ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
+    punpcklbw     m7, m13, m1
+    pmaddubsw     m7, [PIC_sym(pb_m1_1)]
+    mova  [rsp+9*16], m7
+    paddw        m10, m7
+%if ARCH_X86_64
+    punpckhbw    m13, m1
+    mova          m1, [rsp+6*16]
+    SWAP           1, 13
+%else
+    punpckhbw     m7, m13, m1
+    mova          m1, [esp+6*16]
+    mova         m13, m1
+    SWAP           1, 7
+%endif
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova [rsp+10*16], m1
+    paddw        m11, m1                        ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
+    pmulhrsw      m7, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m0, m11, [PIC_sym(pw_2048)]
+    packuswb      m7, m0
+    pand          m7, m9
+    pandn         m0, m9, m4
+    por           m7, m0
+    mova  [rsp+6*16], m7                        ; don't clobber p0/m4 since we need it in H
 
     ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
     ; write +0
 %ifidn %2, v
-    mova          m0, [tmpq+strideq*2]          ; q6
+    mova          m7, [tmpq+strideq*2]          ; q6
 %else
-    mova          m0, [rsp+17*16]
+    mova          m7, [rsp+17*16]
 %endif
     paddw        m10, [rsp+3*16]
     paddw        m11, [rsp+4*16]                ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
-    punpcklbw     m8, m3, m0
-    punpckhbw     m9, m3, m0
+    punpcklbw     m0, m3, m7
+    punpckhbw     m1, m3, m7
+%if ARCH_X86_64
     mova          m3, [rsp+8*16]
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    mova  [rsp+3*16], m8
-    mova  [rsp+4*16], m9
-    paddw        m10, m8
-    paddw        m11, m9                        ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
-    pandn         m9, m1, m5
-    por           m8, m9
-    mova  [rsp+8*16], m8                        ; don't clobber q0/m5 since we need it in I
+%endif
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    mova  [rsp+3*16], m0
+    mova  [rsp+4*16], m1
+    paddw        m10, m0
+    paddw        m11, m1                        ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m5
+    por           m0, m1
+%if ARCH_X86_32
+    mova          m1, [esp+8*16]
+    mova          m3, m1
+%endif
+    mova  [rsp+8*16], m0                        ; don't clobber q0/m5 since we need it in I
 
     ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
     ; write +1
     paddw        m10, [rsp+1*16]
     paddw        m11, m2                        ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
-    punpcklbw     m8, m4, m0
-    punpckhbw     m2, m4, m0
-    mova          m4, [rsp+6*16]
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m2, [pb_m1_1]
-    paddw        m10, m8
+    punpcklbw     m0, m4, m7
+    punpckhbw     m2, m4, m7
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m2, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
     paddw        m11, m2                        ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
-    pmulhrsw      m2, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m2, m9
-    pand          m2, m1
-    pandn         m9, m1, m6
-    por           m2, m9                        ; don't clobber q1/m6 since we need it in K
+%if ARCH_X86_64
+    mova          m4, [rsp+6*16]
+%else
+ %define m4 [esp+6*16]
+%endif
+    pmulhrsw      m2, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m2, m1
+    pand          m2, m9
+    pandn         m1, m9, m6
+    por           m2, m1                        ; don't clobber q1/m6 since we need it in K
 
     ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
     ; write +2
     paddw        m10, [rsp+2*16]
-    paddw        m11, m7                        ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
-    punpcklbw     m8, m5, m0
-    punpckhbw     m9, m5, m0
+%if ARCH_X86_64
+    SWAP           7, 8
+    paddw        m11, m7
+%else
+    mova          m8, m7
+ %ifidn %2, v
+    paddw        m11, [esp+24*16]               ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %else
+    paddw        m11, [esp+36*16]               ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
+ %endif
+%endif
+    punpcklbw     m0, m5, m8
+    punpckhbw     m1, m5, m8
+%if ARCH_X86_64
     mova          m5, [rsp+8*16]
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
-    pmulhrsw      m7, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m7, m9
-    pand          m7, m1
-    pandn         m9, m1, m14
-    por           m7, m9                        ; don't clobber q2/m14 since we need it in K
+%else
+ %define m5 [esp+8*16]
+%endif
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
+    pmulhrsw      m7, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m7, m1
+    pand          m7, m9
+    pandn         m1, m9, m14
+    por           m7, m1                        ; don't clobber q2/m14 since we need it in K
 
     ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
     ; write +3
     psubw        m10, [rsp+7*16]
     psubw        m11, [rsp+5*16]                ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
-    punpcklbw     m8, m6, m0
-    punpckhbw     m9, m6, m0
-    SWAP           2, 6
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
-    pandn         m9, m1, m15
-    por           m8, m9
+    punpcklbw     m0, m6, m8
+    punpckhbw     m1, m6, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m15
+    por           m0, m1
 %ifidn %2, v
-    mova [tmpq+mstrideq], m8                    ; q3
+    mova [tmpq+mstrideq], m0                    ; q3
 %else
-    mova [rsp+20*16], m8
+    mova [rsp+20*16], m0
 %endif
 
     ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
@@ -918,27 +1373,26 @@
     ; write +4
     paddw        m10, [rsp+ 9*16]
     paddw        m11, [rsp+10*16]               ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
-    punpcklbw     m8, m14, m0
-    punpckhbw     m9, m14, m0
-    SWAP          14, 7
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
-    pmulhrsw      m8, m10, [pw_2048]
-    pmulhrsw      m9, m11, [pw_2048]
-    packuswb      m8, m9
-    pand          m8, m1
+    punpcklbw     m0, m14, m8
+    punpckhbw     m1, m14, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+    pmulhrsw      m0, m10, [PIC_sym(pw_2048)]
+    pmulhrsw      m1, m11, [PIC_sym(pw_2048)]
+    packuswb      m0, m1
+    pand          m0, m9
 %ifidn %2, v
-    pandn         m9, m1, [tmpq+strideq*0]
+    pandn         m1, m9, [tmpq+strideq*0]
 %else
-    pandn         m9, m1, [rsp+15*16]
+    pandn         m1, m9, [rsp+15*16]
 %endif
-    por           m8, m9
+    por           m0, m1
 %ifidn %2, v
-    mova [tmpq+strideq*0], m8                    ; q4
+    mova [tmpq+strideq*0], m0                    ; q4
 %else
-    mova [rsp+15*16], m8
+    mova [rsp+15*16], m0
 %endif
 
     ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
@@ -945,20 +1399,20 @@
     ; write +5
     paddw        m10, [rsp+3*16]
     paddw        m11, [rsp+4*16]                ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
-    punpcklbw     m8, m15, m0
-    punpckhbw     m9, m15, m0
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw     m9, [pb_m1_1]
-    paddw        m10, m8
-    paddw        m11, m9                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
-    pmulhrsw     m10, [pw_2048]
-    pmulhrsw     m11, [pw_2048]
+    punpcklbw     m0, m15, m8
+    punpckhbw     m1, m15, m8
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw        m10, m0
+    paddw        m11, m1                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
+    pmulhrsw     m10, [PIC_sym(pw_2048)]
+    pmulhrsw     m11, [PIC_sym(pw_2048)]
     packuswb     m10, m11
-    pand         m10, m1
+    pand         m10, m9
 %ifidn %2, v
-    pandn        m11, m1, [tmpq+strideq*1]
+    pandn        m11, m9, [tmpq+strideq*1]
 %else
-    pandn        m11, m1, [rsp+16*16]
+    pandn        m11, m9, [rsp+16*16]
 %endif
     por          m10, m11
 %ifidn %2, v
@@ -967,186 +1421,259 @@
     mova [rsp+16*16], m10
 %endif
 
-    mova          m9, [rsp+0*16]
+%if ARCH_X86_64
+    SWAP           0, 8
+    SWAP           1, 9
+    SWAP          14, 7
+%else
+ %xdefine m3 m11
+ %xdefine m4 m14
+ %xdefine m5 m15
+ %xdefine m6 m10
+    mova     %%q2mem, m7
+ %ifidn %2, v
+    mova          m3, [esp+19*16]
+ %else
+    mova          m3, [esp+32*16]
+ %endif
+    mova          m4, [esp+ 6*16]
+    mova          m5, [esp+ 8*16]
+%endif
+    SWAP          m6, m2
+
+%if ARCH_X86_64
+    mova          m9, %%flat8mem
+%endif
 %ifidn %2, v
     lea         tmpq, [dstq+mstrideq*4]
 %endif
-%endif
+%endif ; if %1 == 16
 %if %1 >= 8
     ; flat8 filter
+%if ARCH_X86_32
+ %define m9  %%flat8mem
+ %define m11 m1
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+ %define m15 %%q3mem
+%endif
     mova         m11, %%p3mem
     punpcklbw     m0, m11, m3
+    punpcklbw     m7, m13, m4
+    pmaddubsw     m2, m0, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+    pmaddubsw     m7, [PIC_sym(pb_2_1)]
+    paddw         m2, m7                    ; 3 * p3 + 2 * p2 + p1 + p0
+    punpcklbw     m7, m5, [PIC_sym(pb_4)]
+    pmaddubsw     m7, [PIC_sym(pb_1)]
+    paddw         m2, m7                    ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
     punpckhbw     m1, m11, m3
-    pmaddubsw     m2, m0, [pb_3_1]
-    pmaddubsw     m7, m1, [pb_3_1]              ; 3 * p3 + p1
-    punpcklbw     m8, m13, m4
-    punpckhbw    m11, m13, m4
-    pmaddubsw     m8, [pb_2_1]
-    pmaddubsw    m11, [pb_2_1]
-    paddw         m2, m8
-    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0
-    punpcklbw     m8, m5, [pb_4]
-    punpckhbw    m11, m5, [pb_4]
-    pmaddubsw     m8, [pb_1]
-    pmaddubsw    m11, [pb_1]
-    paddw         m2, m8
-    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
-    psrlw         m8, m2, 3
-    psrlw        m11, m7, 3
-    packuswb      m8, m11
-    pand          m8, m9
-    pandn        m11, m9, m13
-    por          m10, m8, m11                  ; p2
+    pmaddubsw     m7, m1, [PIC_sym(pb_3_1)] ; 3 * p3 + p1
+    punpckhbw     m0, m13, m4
+    pmaddubsw     m0, [PIC_sym(pb_2_1)]
+    paddw         m7, m0                    ; 3 * p3 + 2 * p2 + p1 + p0
+    punpckhbw     m0, m5, [PIC_sym(pb_4)]
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    paddw         m7, m0                    ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m13
+    por           m0, m1                    ; p2
 %ifidn %2, v
-    mova [tmpq+strideq*1], m10                 ; p2
+    mova [tmpq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+    SWAP           0, 10
+ %else
+    mova  [esp+2*16], m0
+ %endif
 %endif
 
-    pmaddubsw     m8, m0, [pb_m1_1]
-    pmaddubsw    m11, m1, [pb_m1_1]
-    paddw         m2, m8
-    paddw         m7, m11
-    punpcklbw     m8, m13, m6
-    punpckhbw    m11, m13, m6
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw    m11, [pb_m1_1]
-    paddw         m2, m8
-    paddw         m7, m11                       ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
-    psrlw         m8, m2, 3
-    psrlw        m11, m7, 3
-    packuswb      m8, m11
-    pand          m8, m9
-    pandn        m11, m9, m3
-    por           m8, m11                       ; p1
+%if ARCH_X86_32
+    mova         m11, %%p3mem
+%endif
+    punpcklbw     m0, m11, m3
+    punpckhbw     m1, m11, m3
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw         m2, m0
+    paddw         m7, m1
+    punpcklbw     m0, m13, m6
+    punpckhbw     m1, m13, m6
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw         m2, m0
+    paddw         m7, m1            ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m3
+    por           m0, m1            ; p1
 %ifidn %2, v
-    mova [tmpq+strideq*2], m8                   ; p1
+    mova [tmpq+strideq*2], m0
 %else
-    mova  [rsp+0*16], m8
+    mova  [rsp+0*16], m0
 %endif
 
-    pmaddubsw     m0, [pb_1]
-    pmaddubsw     m1, [pb_1]
+%if ARCH_X86_32
+    mova         m11, %%p3mem
+%endif
+    punpcklbw     m0, m11, m3
+    punpckhbw     m1, m11, m3
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
     psubw         m2, m0
     psubw         m7, m1
-    punpcklbw     m8, m4, m14
-    punpckhbw    m11, m4, m14
-    pmaddubsw     m8, [pb_1]
-    pmaddubsw    m11, [pb_1]
-    paddw         m2, m8
-    paddw         m7, m11                       ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
-    psrlw         m8, m2, 3
-    psrlw        m11, m7, 3
-    packuswb      m8, m11
-    pand          m8, m9
-    pandn        m11, m9, m4
-    por           m8, m11                       ; p0
+    punpcklbw     m0, m4, m14
+    punpckhbw     m1, m4, m14
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw         m2, m0
+    paddw         m7, m1            ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m4
+    por           m0, m1            ; p0
 %ifidn %2, v
-    mova [tmpq+stride3q ], m8                   ; p0
+    mova [tmpq+stride3q], m0
 %else
-    mova  [rsp+1*16], m8
+    mova  [rsp+1*16], m0
 %endif
 
     punpcklbw     m0, m5, m15
     punpckhbw     m1, m5, m15
-    pmaddubsw     m8, m0, [pb_1]
-    pmaddubsw    m11, m1, [pb_1]
-    paddw         m2, m8
-    paddw         m7, m11
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
+    paddw         m2, m0
+    paddw         m7, m1
+%if ARCH_X86_32
     mova         m11, %%p3mem
-    punpcklbw     m8, m11, m4
+%endif
+    punpcklbw     m0, m11, m4
     punpckhbw    m11, m11, m4
-    pmaddubsw     m8, [pb_1]
-    pmaddubsw    m11, [pb_1]
-    psubw         m2, m8
-    psubw         m7, m11                       ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
-    psrlw         m8, m2, 3
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw    m11, [PIC_sym(pb_1)]
+    psubw         m2, m0
+    psubw         m7, m11           ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
+    psrlw         m0, m2, 3
     psrlw        m11, m7, 3
-    packuswb      m8, m11
-    pand          m8, m9
+    packuswb      m0, m11
+    pand          m0, m9
     pandn        m11, m9, m5
-    por          m11, m8, m11                   ; q0
+    por          m11, m0            ; q0
 %ifidn %2, v
-    mova [dstq+strideq*0], m11                  ; q0
+    mova [dstq+strideq*0], m11
+%elif ARCH_X86_32
+    mova  [esp+8*16], m11
 %endif
 
-    pmaddubsw     m0, [pb_m1_1]
-    pmaddubsw     m1, [pb_m1_1]
+    punpcklbw     m0, m5, m15
+    punpckhbw     m1, m5, m15
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
     paddw         m2, m0
     paddw         m7, m1
-    punpcklbw     m8, m13, m6
-    punpckhbw    m13, m6
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw    m13, [pb_m1_1]
-    paddw         m2, m8
-    paddw         m7, m13                       ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
-    psrlw         m8, m2, 3
-    psrlw        m13, m7, 3
-    packuswb      m8, m13
-    pand          m8, m9
-    pandn        m13, m9, m6
-    por          m13, m8, m13                   ; q1
+    punpcklbw     m0, m13, m6
+    punpckhbw     m1, m13, m6
+    pmaddubsw     m0, [PIC_sym(pb_m1_1)]
+    pmaddubsw     m1, [PIC_sym(pb_m1_1)]
+    paddw         m2, m0
+    paddw         m7, m1            ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
+    psrlw         m0, m2, 3
+    psrlw         m1, m7, 3
+    packuswb      m0, m1
+    pand          m0, m9
+    pandn         m1, m9, m6
+    por           m0, m1            ; q1
 %ifidn %2, v
-    mova [dstq+strideq*1], m13                  ; q1
+    mova [dstq+strideq*1], m0
+%else
+ %if ARCH_X86_64
+    SWAP           0, 13
+ %else
+    mova  [esp+9*16], m0
+ %endif
 %endif
 
     punpcklbw     m0, m3, m6
     punpckhbw     m1, m3, m6
-    pmaddubsw     m0, [pb_1]
-    pmaddubsw     m1, [pb_1]
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
     psubw         m2, m0
     psubw         m7, m1
     punpcklbw     m0, m14, m15
     punpckhbw     m1, m14, m15
-    pmaddubsw     m0, [pb_1]
-    pmaddubsw     m1, [pb_1]
+    pmaddubsw     m0, [PIC_sym(pb_1)]
+    pmaddubsw     m1, [PIC_sym(pb_1)]
     paddw         m2, m0
-    paddw         m7, m1                        ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
+    paddw         m7, m1            ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
     psrlw         m2, 3
     psrlw         m7, 3
     packuswb      m2, m7
     pand          m2, m9
     pandn         m7, m9, m14
-    por           m2, m7                        ; q2
+    por           m2, m7            ; q2
 %ifidn %2, v
-    mova [dstq+strideq*2], m2                   ; q2
+    mova [dstq+strideq*2], m2
 %else
     mova          m0, [rsp+0*16]
-    mova          m1, [rsp+1*16]
 %if %1 == 8
-    mova          m4, [rsp+21*16]
+    mova          m1, [rsp+1*16]
+    mova          m4, %%p3mem
 
+%if ARCH_X86_32
+ %define m10 [esp+2*16]
+ %define m11 [esp+8*16]
+ %define m13 [esp+9*16]
+%endif
+
     ; 16x8 transpose
     punpcklbw     m3, m4, m10
     punpckhbw     m4, m10
-    punpcklbw    m10, m0, m1
+    punpcklbw     m5, m0, m1
     punpckhbw     m0, m1
     punpcklbw     m1, m11, m13
-    punpckhbw    m11, m13
-    punpcklbw    m13, m2, m15
+    punpckhbw     m6, m11, m13
+    punpcklbw     m7, m2, m15
     punpckhbw     m2, m15
+%if ARCH_X86_64
+    SWAP           2, 15
+%else
+    mova         m15, m2
+%endif
 
-    punpcklwd    m15, m3, m10
-    punpckhwd     m3, m10
-    punpcklwd    m10, m4, m0
+    punpcklwd     m2, m3, m5
+    punpckhwd     m3, m5
+    punpcklwd     m5, m4, m0
     punpckhwd     m4, m0
-    punpcklwd     m0, m1, m13
-    punpckhwd     m1, m13
-    punpcklwd    m13, m11, m2
-    punpckhwd    m11, m2
+    punpcklwd     m0, m1, m7
+    punpckhwd     m1, m7
+    punpcklwd     m7, m6, m15
+    punpckhwd     m6, m15
+%if ARCH_X86_64
+    SWAP           6, 15
+%else
+    mova         m15, m6
+%endif
 
-    punpckldq     m2, m15, m0
-    punpckhdq    m15, m0
+    punpckldq     m6, m2, m0
+    punpckhdq     m2, m0
     punpckldq     m0, m3, m1
     punpckhdq     m3, m1
-    punpckldq     m1, m10, m13
-    punpckhdq    m10, m13
-    punpckldq    m13, m4, m11
-    punpckhdq     m4, m11
+    punpckldq     m1, m5, m7
+    punpckhdq     m5, m7
+    punpckldq     m7, m4, m15
+    punpckhdq     m4, m15
 
     ; write 8x16
-    movq   [dstq+strideq*0-4], xm2
-    movhps [dstq+strideq*1-4], xm2
-    movq   [dstq+strideq*2-4], xm15
-    movhps [dstq+stride3q -4], xm15
+    movq   [dstq+strideq*0-4], xm6
+    movhps [dstq+strideq*1-4], xm6
+    movq   [dstq+strideq*2-4], xm2
+    movhps [dstq+stride3q -4], xm2
     lea         dstq, [dstq+strideq*4]
     movq   [dstq+strideq*0-4], xm0
     movhps [dstq+strideq*1-4], xm0
@@ -1155,32 +1682,52 @@
     lea         dstq, [dstq+strideq*4]
     movq   [dstq+strideq*0-4], xm1
     movhps [dstq+strideq*1-4], xm1
-    movq   [dstq+strideq*2-4], xm10
-    movhps [dstq+stride3q -4], xm10
+    movq   [dstq+strideq*2-4], xm5
+    movhps [dstq+stride3q -4], xm5
     lea         dstq, [dstq+strideq*4]
-    movq   [dstq+strideq*0-4], xm13
-    movhps [dstq+strideq*1-4], xm13
+    movq   [dstq+strideq*0-4], xm7
+    movhps [dstq+strideq*1-4], xm7
     movq   [dstq+strideq*2-4], xm4
     movhps [dstq+stride3q -4], xm4
     lea         dstq, [dstq+strideq*4]
 %else
-    mova [rsp+21*16], m12
     ; 16x16 transpose and store
-    SWAP           5, 10, 2
     SWAP           6, 0
     SWAP           7, 1
+ %if ARCH_X86_64
+    SWAP           5, 10, 2
     SWAP           8, 11
     SWAP           9, 13
+    mova [rsp+21*16], m12
+ %else
+    mova [esp+10*16], m2
+  %xdefine m8  m0
+  %xdefine m9  m1
+  %xdefine m10 m2
+  %xdefine m11 m3
+  %xdefine m12 m4
+  %xdefine m13 m5
+  %xdefine m14 m6
+  %xdefine m15 m7
+ %endif
     mova          m0, [rsp+11*16]
     mova          m1, [rsp+12*16]
     mova          m2, [rsp+13*16]
     mova          m3, [rsp+14*16]
     mova          m4, [rsp+19*16]
+%if ARCH_X86_64
+    mova          m7, [rsp+ 1*16]
     mova         m11, [rsp+20*16]
     mova         m12, [rsp+15*16]
     mova         m13, [rsp+16*16]
     mova         m14, [rsp+17*16]
-    TRANSPOSE_16X16B 1, 0, [rsp+18*16]
+    TRANSPOSE_16X16B 1, [rsp+18*16]
+%else
+    mova          m5, [esp+ 2*16]
+    TRANSPOSE_16X16B 1, [esp+32*16]
+    mov         tmpq, dstq
+    lea         dstq, [dstq+strideq*8]
+%endif
     movu [dstq+strideq*0-8], xm0
     movu [dstq+strideq*1-8], xm1
     movu [dstq+strideq*2-8], xm2
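
[Annotation, not part of the patch: the %xdefine block above is the core 32-bit trick. x86-32 has only xmm0-xmm7, so the m8-m15 names used by the shared 64-bit body are remapped, either onto low registers or onto stack slots, and the body assembles unchanged. An illustrative sketch, not lines from the patch:]

  %xdefine m8  m0            ; "m8" now assembles as xmm0 (register alias)
  %define  m13 [esp+9*16]    ; "m13" becomes a memory operand (spill slot)
    mova          m8, m13    ; emitted as: mova xmm0, [esp+9*16]

[%xdefine expands its definition once, at definition time, while %define re-expands at every use; the patch uses the former for register renames and the latter for memory-backed names.]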
@@ -1190,7 +1737,27 @@
     movu [dstq+strideq*1-8], xm5
     movu [dstq+strideq*2-8], xm6
     movu [dstq+stride3q -8], xm7
+%if ARCH_X86_64
     lea         dstq, [dstq+strideq*4]
+%else
+  %xdefine m8  m0
+  %xdefine m9  m1
+  %xdefine m10 m2
+  %xdefine m11 m3
+  %xdefine m12 m4
+  %xdefine m13 m5
+  %xdefine m14 m6
+  %xdefine m15 m7
+    mova          m8, [esp+11*16]
+    mova          m9, [esp+12*16]
+    mova         m10, [esp+13*16]
+    mova         m11, [esp+14*16]
+    mova         m12, [esp+26*16]
+    mova         m13, [esp+27*16]
+    mova         m14, [esp+ 0*16]
+    mova         m15, [esp+ 1*16]
+    mov         dstq, tmpq
+%endif
     movu [dstq+strideq*0-8], xm8
     movu [dstq+strideq*1-8], xm9
     movu [dstq+strideq*2-8], xm10
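
[Annotation, not part of the patch: with only eight vector registers the 32-bit path cannot hold all sixteen transposed rows at once, so the store above runs in two passes: dstq is parked in tmpq and advanced eight rows for the batch held in m0-m7, then m8-m15 (aliased onto m0-m7) are refilled from their stack slots and dstq is restored for the top eight rows. The skeleton:]

    mov         tmpq, dstq              ; remember the top row
    lea         dstq, [dstq+strideq*8]  ; pass 1: the rows held in registers
    ; ... movu stores of xm0-xm7 ...
    mova          m8, [esp+11*16]       ; refill pass 2 from the stack
    ; ... m9-m15 likewise ...
    mov         dstq, tmpq              ; pass 2: the remaining eight rows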
@@ -1201,27 +1768,46 @@
     movu [dstq+strideq*2-8], xm14
     movu [dstq+stride3q -8], xm15
     lea         dstq, [dstq+strideq*4]
-    ; un-swap m12
-    SWAP           8, 12
+%if ARCH_X86_32
+    lea         dstq, [dstq+strideq*8]
+%else
     mova         m12, [rsp+21*16]
-
 %endif
-%endif
+
+%endif ; if %1 == 8
+%endif ; ifidn %2, v
 %elif %1 == 6
     ; flat6 filter
+%if ARCH_X86_32
+    mova  [esp+3*16], m3
+    mova  [esp+4*16], m4
+    mova  [esp+5*16], m5
+    mova  [esp+6*16], m6
+ %xdefine m8  m3
+ %xdefine m10 m4
+ %xdefine m11 m5
+ %xdefine m15 m6
+ %define m3  [esp+3*16]
+ %define m4  [esp+4*16]
+ %define m5  [esp+5*16]
+ %define m6  [esp+6*16]
+ %define m9  %%flat8mem
+ %define m13 %%p2mem
+ %define m14 %%q2mem
+%endif
 
     punpcklbw     m8, m13, m5
     punpckhbw    m11, m13, m5
-    pmaddubsw     m0, m8, [pb_3_1]
-    pmaddubsw     m1, m11, [pb_3_1]
+    pmaddubsw     m0, m8, [PIC_sym(pb_3_1)]
+    pmaddubsw     m1, m11, [PIC_sym(pb_3_1)]
     punpcklbw     m7, m4, m3
     punpckhbw    m10, m4, m3
-    pmaddubsw     m2, m7, [pb_2]
-    pmaddubsw    m15, m10, [pb_2]
+    pmaddubsw     m2, m7, [PIC_sym(pb_2)]
+    pmaddubsw    m15, m10, [PIC_sym(pb_2)]
     paddw         m0, m2
     paddw         m1, m15
-    pmulhrsw      m2, m0, [pw_4096]
-    pmulhrsw     m15, m1, [pw_4096]
+    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
+    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
     packuswb      m2, m15
     pand          m2, m9
     pandn        m15, m9, m3
@@ -1228,26 +1814,33 @@
     por           m2, m15
 %ifidn %2, v
     mova [tmpq+strideq*2], m2                   ; p1
+%elif ARCH_X86_32
+    mova [esp+11*16], m2
 %endif
 
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw    m11, [pb_m1_1]
+    pmaddubsw     m8, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m11, [PIC_sym(pb_m1_1)]
     paddw         m0, m8
     paddw         m1, m11
     punpcklbw     m8, m13, m6
     punpckhbw    m11, m13, m6
-    pmaddubsw     m8, [pb_m1_1]
-    pmaddubsw    m11, [pb_m1_1]
+%if ARCH_X86_64
+    SWAP           2, 13
+%endif
+    pmaddubsw     m8, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m11, [PIC_sym(pb_m1_1)]
     paddw         m0, m8
     paddw         m1, m11
-    pmulhrsw     m15, m0, [pw_4096]
-    pmulhrsw     m13, m1, [pw_4096]
-    packuswb     m15, m13
-    pand         m15, m9
-    pandn        m13, m9, m4
-    por          m15, m13
+    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
+    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
+    packuswb      m2, m15
+    pand          m2, m9
+    pandn        m15, m9, m4
+    por           m2, m15
 %ifidn %2, v
-    mova [tmpq+stride3q], m15                   ; p0
+    mova [tmpq+stride3q], m2                    ; p0
+%elif ARCH_X86_32
+    mova  [esp+8*16], m2
 %endif
 
     paddw         m0, m8
@@ -1254,40 +1847,58 @@
     paddw         m1, m11
     punpcklbw     m8, m3, m14
     punpckhbw    m11, m3, m14
-    pmaddubsw    m14, m8, [pb_m1_1]
-    pmaddubsw    m13, m11, [pb_m1_1]
-    paddw         m0, m14
-    paddw         m1, m13
-    pmulhrsw     m14, m0, [pw_4096]
-    pmulhrsw     m13, m1, [pw_4096]
-    packuswb     m14, m13
-    pand         m14, m9
-    pandn        m13, m9, m5
-    por          m14, m13
+%if ARCH_X86_64
+    SWAP           2, 14
+%endif
+    pmaddubsw     m2, m8, [PIC_sym(pb_m1_1)]
+    pmaddubsw    m15, m11, [PIC_sym(pb_m1_1)]
+    paddw         m0, m2
+    paddw         m1, m15
+    pmulhrsw      m2, m0, [PIC_sym(pw_4096)]
+    pmulhrsw     m15, m1, [PIC_sym(pw_4096)]
+    packuswb      m2, m15
+    pand          m2, m9
+    pandn        m15, m9, m5
+    por           m2, m15
 %ifidn %2, v
-    mova [dstq+strideq*0], m14                  ; q0
+    mova [dstq+strideq*0], m2                   ; q0
 %endif
 
-    pmaddubsw     m8, [pb_m1_2]
-    pmaddubsw    m11, [pb_m1_2]
+    pmaddubsw     m8, [PIC_sym(pb_m1_2)]
+    pmaddubsw    m11, [PIC_sym(pb_m1_2)]
     paddw         m0, m8
     paddw         m1, m11
-    pmaddubsw     m7, [pb_m1_0]
-    pmaddubsw    m10, [pb_m1_0]
+    pmaddubsw     m7, [PIC_sym(pb_m1_0)]
+    pmaddubsw    m10, [PIC_sym(pb_m1_0)]
     paddw         m0, m7
     paddw         m1, m10
-    pmulhrsw      m0, [pw_4096]
-    pmulhrsw      m1, [pw_4096]
+    pmulhrsw      m0, [PIC_sym(pw_4096)]
+    pmulhrsw      m1, [PIC_sym(pw_4096)]
     packuswb      m0, m1
     pand          m0, m9
-    pandn         m9, m6
-    por           m0, m9
+    pandn         m1, m9, m6
+    por           m0, m1
+%if ARCH_X86_32
+ %xdefine m3 m8
+ %xdefine m4 m10
+ %xdefine m5 m11
+ %xdefine m6 m15
+%endif
 %ifidn %2, v
     mova [dstq+strideq*1], m0                   ; q1
 %else
-    TRANSPOSE_16x4_AND_WRITE_4x16 2, 15, 14, 0, 1
+ %if ARCH_X86_64
+    SWAP           3, 13
+    SWAP           4, 14
+ %else
+    mova          m3, [esp+11*16]
+    mova          m4, [esp+ 8*16]
+ %endif
+    SWAP           5, 2
+    SWAP           6, 0
+    TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
 %endif
-%else
+%else ; if %1 == 4
 %ifidn %2, v
     mova [tmpq+strideq*0], m3                   ; p1
     mova [tmpq+strideq*1], m4                   ; p0
@@ -1297,21 +1908,97 @@
     TRANSPOSE_16x4_AND_WRITE_4x16 3, 4, 5, 6, 7
 %endif
 %endif
+%if ARCH_X86_32
+ %define m12 m12reg
+%endif
 %endmacro
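
[Annotation, not part of the patch: for reference, the flat6 branch of FILTER (%1 == 6) accumulates the standard AV1 6-tap flat filter through the pb_3_1/pb_2/pb_m1_1/pb_m1_2/pb_m1_0 constants, with pmulhrsw against pw_4096 supplying the rounded (x + 4) >> 3. Written out, the outputs it computes are:]

    ; p1' = (3*p2 + 2*p1 + 2*p0 +   q0               + 4) >> 3
    ; p0' = (  p2 + 2*p1 + 2*p0 + 2*q0 +   q1        + 4) >> 3
    ; q0' = (         p1 + 2*p0 + 2*q0 + 2*q1 +   q2 + 4) >> 3
    ; q1' = (                p0 + 2*q0 + 2*q1 + 3*q2 + 4) >> 3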
 
-%if ARCH_X86_64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;          32-bit PIC helpers          ;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+%if ARCH_X86_32
+ %define PIC_base_offset $$
+
+ %macro SETUP_PIC 0 ; PIC_reg
+  %define PIC_reg r2
+  %assign PIC_reg_stk_offset stack_size-gprsize*(1+copy_args*4)
+    LEA      PIC_reg, $$
+ %endmacro
+
+ %macro XCHG_PIC_REG 1 ; 0=mask 1=PIC_base
+  %if %1 == 0
+    mov [esp+PIC_reg_stk_offset], PIC_reg
+    mov      PIC_reg, maskm
+  %else
+    mov      PIC_reg, [esp+PIC_reg_stk_offset]
+  %endif
+ %endmacro
+
+ %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
+
+%else
+ %macro XCHG_PIC_REG 1
+ %endmacro
+ %define PIC_sym(sym) (sym)
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
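
[Annotation, not part of the patch: x86-32 has no RIP-relative addressing, and the base register chosen here, r2, is also maskq, so the helpers above make it do double duty. SETUP_PIC points r2 at the section start ($$) and PIC_sym() rebases every constant load against it; XCHG_PIC_REG then swaps r2 between the PIC base (parked in a stack slot) and the mask pointer (reloaded from maskm). A sketch of what this expands to:]

    LEA           r2, $$               ; r2 = runtime address of section start
    mova         m12, [r2+pd_mask-$$]  ; i.e. mova m12, [PIC_sym(pd_mask)]
    mov [esp+PIC_reg_stk_offset], r2   ; XCHG_PIC_REG 0: stash the PIC base,
    mov           r2, maskm            ;   r2 becomes the mask pointer again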
+
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < required_stack_alignment
+  %assign copy_args 1
+ %else
+  %assign copy_args 0
+ %endif
+%endif
+
+%macro RELOC_ARGS 1
+ %if copy_args
+  %define maskm     [esp+stack_size-gprsize*1]
+  %define l_stridem [esp+stack_size-gprsize*2]
+  %define lutm      [esp+stack_size-gprsize*3]
+  %define %1m       [esp+stack_size-gprsize*4]
+    mov          r6d, r6m
+    mov        maskm, maskd
+    mov         lutm, lutd
+    mov          %1m, r6d
+ %else
+  %define %1m       r6m
+ %endif
+%endmacro
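
[Annotation, not part of the patch: RELOC_ARGS covers the case where cglobal had to realign the stack, which leaves the caller's argument slots awkward to reach once every GPR is in use. With copy_args set, the arguments still needed inside the loop are copied to fixed offsets from the realigned esp and their names re-pointed at the copies. Roughly, the effect of RELOC_ARGS w:]

    ; maskm/lutm/wm now resolve to [esp+stack_size-gprsize*N]
    ;   mov r6d, r6m      ; fetch the 7th argument (w) from the caller's stack
    ;   mov maskm, maskd  ; copy it, so later reloads never need the old esp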
+
+%if ARCH_X86_32
+ %define tmpq       r4
+ %define mstrideq   r5
+ %define stride3q   r6
+ %define l_stride3q r6
+%endif
+
 INIT_XMM ssse3
+%if ARCH_X86_64
 cglobal lpf_v_sb_y, 7, 11, 16, 16 * 15, \
                     dst, stride, mask, l, l_stride, lut, \
                     w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_y, 6, 7, 8, -16 * (26 + copy_args), \
+                    dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS w
+    SETUP_PIC
+ %define m12 m5
+%endif
     shl    l_strideq, 2
     sub           lq, l_strideq
+%if ARCH_X86_64
     mov     mstrideq, strideq
     neg     mstrideq
     lea     stride3q, [strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
     mov   mask_bitsd, 0xf
-    mova         m12, [pd_mask]
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
     movu          m0, [maskq]
     pxor          m4, m4
     movd          m3, [lutq+136]
@@ -1326,15 +2013,21 @@
     mova [rsp+13*16], m2
     mova [rsp+14*16], m3
 
-%define mask0  [rsp+11*16]
-%define mask1  [rsp+12*16]
-%define mask2  [rsp+13*16]
-%define minlvl [rsp+14*16]
+%define maskmem [esp+15*16]
+%define mask0   [rsp+11*16]
+%define mask1   [rsp+12*16]
+%define mask2   [rsp+13*16]
+%define minlvl  [rsp+14*16]
 
 .loop:
     test   [maskq+8], mask_bitsd                ; vmask[2]
     je .no_flat16
 
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+25*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER        16, v
     jmp .end
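
[Annotation, not part of the patch: on 32-bit, FILTER clobbers all eight XMM registers and the scratch GPRs, so each taken branch first spills the loop state as above and the shared .end label restores it; the miss path jumps to the new .no_filter label precisely because it has nothing to restore. The per-iteration pattern:]

    mov  [esp+25*16], mask_bitsd   ; spill the loop-counter bits
    mova     maskmem, m12          ; spill the lane mask
    FILTER        16, v            ; clobbers everything
.end:
    mova         m12, maskmem      ; restore before advancing the loop
    mov   mask_bitsd, [esp+25*16]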
 
@@ -1342,34 +2035,67 @@
     test   [maskq+4], mask_bitsd                ; vmask[1]
     je .no_flat
 
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+25*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         8, v
     jmp .end
 
 .no_flat:
     test   [maskq+0], mask_bitsd                ; vmask[0]
-    je .end
+    XCHG_PIC_REG   1
+    je .no_filter
 
+%if ARCH_X86_32
+    mov  [esp+25*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         4, v
 
 .end:
+%if ARCH_X86_32
+    mova         m12, maskmem
+    mov   mask_bitsd, [esp+25*16]
+%endif
+.no_filter:
     pslld        m12, 4
     shl   mask_bitsd, 4
     add           lq, 16
     add         dstq, 16
+%if ARCH_X86_64
     sub           wd, 4
+%else
+    sub     dword wm, 4
+%endif
+    XCHG_PIC_REG   0
     jg .loop
     RET
 
 INIT_XMM ssse3
+%if ARCH_X86_64
 cglobal lpf_h_sb_y, 7, 11, 16, 16 * 26, \
                     dst, stride, mask, l, l_stride, lut, \
                     h, stride3, l_stride3, tmp, mask_bits
-    shl    l_strideq, 2
+%else
+cglobal lpf_h_sb_y, 6, 7, 8, -16 * (39 + copy_args), \
+                    dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS h
+    SETUP_PIC
+ %define m12 m5
+%endif
     sub           lq, 4
+    shl    l_strideq, 2
+%if ARCH_X86_64
     lea     stride3q, [strideq*3]
     lea   l_stride3q, [l_strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
     mov   mask_bitsd, 0xf
-    mova         m12, [pd_mask]
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
     movu          m0, [maskq]
     pxor          m4, m4
     movd          m3, [lutq+136]
@@ -1384,15 +2110,21 @@
     mova [rsp+24*16], m2
     mova [rsp+25*16], m3
 
-%define mask0  [rsp+22*16]
-%define mask1  [rsp+23*16]
-%define mask2  [rsp+24*16]
-%define minlvl [rsp+25*16]
+%define maskmem [esp+37*16]
+%define mask0   [rsp+22*16]
+%define mask1   [rsp+23*16]
+%define mask2   [rsp+24*16]
+%define minlvl  [rsp+25*16]
 
 .loop:
     test   [maskq+8], mask_bitsd                ; vmask[2]
     je .no_flat16
 
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+38*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER        16, h
     jmp .end
 
@@ -1400,13 +2132,23 @@
     test   [maskq+4], mask_bitsd                ; vmask[1]
     je .no_flat
 
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+38*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         8, h
     jmp .end
 
 .no_flat:
     test   [maskq+0], mask_bitsd                ; vmask[0]
+    XCHG_PIC_REG   1
     je .no_filter
 
+%if ARCH_X86_32
+    mov  [esp+38*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         4, h
     jmp .end
 
@@ -1413,25 +2155,52 @@
 .no_filter:
     lea         dstq, [dstq+strideq*8]
     lea         dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+    jmp .end_noload
 .end:
+    mova         m12, maskmem
+    mov    l_strideq, l_stridem
+    mov   mask_bitsd, [esp+38*16]
+.end_noload:
+%else
+.end:
+%endif
     lea           lq, [lq+l_strideq*4]
     pslld        m12, 4
     shl   mask_bitsd, 4
+%if ARCH_X86_64
     sub           hd, 4
+%else
+    sub     dword hm, 4
+%endif
+    XCHG_PIC_REG   0
     jg .loop
     RET
 
 INIT_XMM ssse3
+%if ARCH_X86_64
 cglobal lpf_v_sb_uv, 7, 11, 16, 3 * 16, \
                      dst, stride, mask, l, l_stride, lut, \
                      w, stride3, mstride, tmp, mask_bits
+%else
+cglobal lpf_v_sb_uv, 6, 7, 8, -16 * (12 + copy_args), \
+                     dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS w
+    SETUP_PIC
+ %define m12 m4
+%endif
     shl    l_strideq, 2
     sub           lq, l_strideq
+%if ARCH_X86_64
     mov     mstrideq, strideq
     neg     mstrideq
     lea     stride3q, [strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
     mov   mask_bitsd, 0xf
-    mova         m12, [pd_mask]
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
     movq          m0, [maskq]
     pxor          m3, m3
     movd          m2, [lutq+136]
@@ -1443,42 +2212,76 @@
     mova  [rsp+1*16], m1
     mova  [rsp+2*16], m2
 
-%define mask0  [rsp+0*16]
-%define mask1  [rsp+1*16]
-%define minlvl [rsp+2*16]
+%define maskmem [esp+7*16]
+%define mask0   [rsp+0*16]
+%define mask1   [rsp+1*16]
+%define minlvl  [rsp+2*16]
 
 .loop:
     test   [maskq+4], mask_bitsd                ; vmask[1]
     je .no_flat
 
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+11*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         6, v
     jmp .end
 
 .no_flat:
     test   [maskq+0], mask_bitsd                ; vmask[0]
-    je .end
+    XCHG_PIC_REG   1
+    je .no_filter
 
+%if ARCH_X86_32
+    mov  [esp+11*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         4, v
 
 .end:
+%if ARCH_X86_32
+    mova         m12, maskmem
+    mov   mask_bitsd, [esp+11*16]
+%endif
+.no_filter:
     pslld        m12, 4
     shl   mask_bitsd, 4
     add           lq, 16
     add         dstq, 16
+%if ARCH_X86_64
     sub           wd, 4
+%else
+    sub     dword wm, 4
+%endif
+    XCHG_PIC_REG   0
     jg .loop
     RET
 
 INIT_XMM ssse3
-cglobal lpf_h_sb_uv, 7, 11, 16, 3 * 16, \
+%if ARCH_X86_64
+cglobal lpf_h_sb_uv, 7, 11, 16, 16 * 3, \
                      dst, stride, mask, l, l_stride, lut, \
                      h, stride3, l_stride3, tmp, mask_bits
-    shl    l_strideq, 2
+%else
+cglobal lpf_h_sb_uv, 6, 7, 8, -16 * (13 + copy_args), \
+                     dst, stride, mask, l, l_stride, lut, mask_bits
+    RELOC_ARGS h
+    SETUP_PIC
+ %define m12 m4
+%endif
     sub           lq, 4
+    shl    l_strideq, 2
+%if ARCH_X86_64
     lea     stride3q, [strideq*3]
     lea   l_stride3q, [l_strideq*3]
+%else
+    mov    l_stridem, l_strided
+%endif
     mov   mask_bitsd, 0xf
-    mova         m12, [pd_mask]
+    mova         m12, [PIC_sym(pd_mask)]
+    XCHG_PIC_REG   0
     movq          m0, [maskq]
     pxor          m3, m3
     movd          m2, [lutq+136]
@@ -1490,21 +2293,32 @@
     mova  [rsp+1*16], m1
     mova  [rsp+2*16], m2
 
-%define mask0  [rsp+0*16]
-%define mask1  [rsp+1*16]
-%define minlvl [rsp+2*16]
+%define maskmem [esp+7*16]
+%define mask0   [rsp+0*16]
+%define mask1   [rsp+1*16]
+%define minlvl  [rsp+2*16]
 
 .loop:
     test   [maskq+4], mask_bitsd                ; vmask[1]
     je .no_flat
 
+%if ARCH_X86_32
+    XCHG_PIC_REG   1
+    mov  [esp+12*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         6, h
     jmp .end
 
 .no_flat:
     test   [maskq+0], mask_bitsd                ; vmask[0]
+    XCHG_PIC_REG   1
     je .no_filter
 
+%if ARCH_X86_32
+    mov  [esp+12*16], mask_bitsd
+    mova     maskmem, m12
+%endif
     FILTER         4, h
     jmp .end
 
@@ -1511,12 +2325,24 @@
 .no_filter:
     lea         dstq, [dstq+strideq*8]
     lea         dstq, [dstq+strideq*8]
+%if ARCH_X86_32
+    jmp .end_noload
 .end:
+    mova         m12, maskmem
+    mov    l_strided, l_stridem
+    mov   mask_bitsd, [esp+12*16]
+.end_noload:
+%else
+.end:
+%endif
     lea           lq, [lq+l_strideq*4]
     pslld        m12, 4
     shl   mask_bitsd, 4
+%if ARCH_X86_64
     sub           hd, 4
+%else
+    sub     dword hm, 4
+%endif
+    XCHG_PIC_REG   0
     jg .loop
     RET
-
-%endif