shithub: dav1d

Download patch

ref: 6cf58c8e7deb54e287afeee6710b2a3774eded9c
parent: f55cd4c6f3d57494c4cea5a3b56145981a28b0c5
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Jul 13 21:13:16 EDT 2020

x86: Add cdef_filter SSE optimizations

--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -28,28 +28,31 @@
 
 SECTION_RODATA 16
 
-%if ARCH_X86_32
-pb_0: times 16 db 0
-pb_0xFF: times 16 db 0xFF
-%endif
+%macro DUP8 1-* ; byte values; each argument is emitted as 8 consecutive identical bytes
+    %rep %0
+        times 8 db %1          ; current argument, repeated 8 times
+        %rotate 1
+    %endrep
+%endmacro
+
+div_table_sse4:  dd 840, 420, 280, 210, 168, 140, 120, 105
+                 dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210
+                 dw 168, 168, 140, 140, 120, 120, 105, 105
+                 dw 420, 420, 210, 210, 140, 140, 105, 105
+                 dw 105, 105, 105, 105, 105, 105, 105, 105
+shufw_6543210x:  db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
+shufb_lohi:      db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
 pw_8: times 8 dw 8
 pw_128: times 8 dw 128
 pw_256: times 8 dw 256
 pw_2048: times 8 dw 2048
-%if ARCH_X86_32
 pw_0x7FFF: times 8 dw 0x7FFF
 pw_0x8000: times 8 dw 0x8000
-%endif
-div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
-                dd 420, 210, 140, 105, 105, 105, 105, 105
-div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
-                 dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
-shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
 tap_table: ; masks for 8-bit shift emulation
-           db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+           DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80
            ; weights
-           db 4, 2, 3, 3, 2, 1
+           DUP8 4, 2, 3, 3, 2, 1
            ; taps indices
            db -1 * 16 + 1, -2 * 16 + 2
            db  0 * 16 + 1, -1 * 16 + 2
@@ -75,59 +78,19 @@
  %endif
 %endmacro
 
-%macro SAVE_ARG 2   ; varname, argnum
- %define %1_stkloc  [rsp+%2*gprsize]
- %define %1_argnum  %2
-    mov             r2, r%2m
-    mov      %1_stkloc, r2
-%endmacro
-
-%macro LOAD_ARG 1-2 0 ; varname, load_to_varname_register
- %if %2 == 0
-    mov r %+ %{1}_argnum, %1_stkloc
+%macro PMOVZXBW 2-3 0 ; dst, src, half (%3 = 1: load only 4 source bytes instead of 8)
+ %if cpuflag(sse4) && %3 == 0
+    pmovzxbw        %1, %2     ; zero-extend 8 bytes to 8 words in one instruction
  %else
-    mov            %1q, %1_stkloc
- %endif
-%endmacro
-
-%macro LOAD_ARG32 1-2 ; varname, load_to_varname_register
- %if ARCH_X86_32
-  %if %0 == 1
-    LOAD_ARG %1
+  %if %3 == 1
+    movd            %1, %2     ; half: load 4 bytes
   %else
-    LOAD_ARG %1, %2
+    movq            %1, %2     ; full: load 8 bytes
   %endif
+    punpcklbw       %1, m7     ; widen bytes to words; relies on m7 being all-zero here
  %endif
 %endmacro
 
-%if ARCH_X86_32
- %define PIC_base_offset $$
- %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
-%else
- %define PIC_sym(sym) sym
-%endif
-
-%macro SAVE_PIC_REG 1
- %if ARCH_X86_32
-    mov       [esp+%1], PIC_reg
- %endif
-%endmacro
-
-%macro LOAD_PIC_REG 1
- %if ARCH_X86_32
-    mov        PIC_reg, [esp+%1]
- %endif
-%endmacro
-
-%macro PMOVZXBW 2-3 0 ; %3 = half
- %if %3 == 1
-    movd            %1, %2
- %else
-    movq            %1, %2
- %endif
-    punpcklbw       %1, m15
-%endmacro
-
 %macro PSHUFB_0 2
  %if cpuflag(ssse3)
     pshufb          %1, %2
@@ -138,34 +101,33 @@
  %endif
 %endmacro
 
-%macro LOAD_SEC_TAP 0
- %if ARCH_X86_64
-    movd            m3, [secq+kq]
-    PSHUFB_0        m3, m15
- %else
-    movd            m2, [secq+kq]             ; sec_taps
-    pxor            m3, m3
-    PSHUFB_0        m2, m3
- %endif
+%macro MOVDDUP 2 ; dst, src: load 64 bits and duplicate them into both qwords of dst
+%if cpuflag(ssse3)
+    movddup         %1, %2
+%else
+    movq            %1, %2     ; fallback without movddup: load the low qword ...
+    punpcklqdq      %1, %1     ; ... then copy it into the high qword
+%endif
 %endmacro
 
-%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
     ; load p0/p1
-    movsx         offq, byte [dirq+kq+%1]       ; off1
+    movsx         offq, byte [dirq+kq+%1+14*8]  ; off1
  %if %6 == 4
-    movq            m5, [stkq+offq*2+%7*0]      ; p0
-    movhps          m5, [stkq+offq*2+%7*1]
+    movq            m5, [stkq+offq*2+32*0]      ; p0
+    movhps          m5, [stkq+offq*2+32*1]
  %else
-    movu            m5, [stkq+offq*2+%7*0]      ; p0
+    movu            m5, [stkq+offq*2+32*0]      ; p0
  %endif
     neg           offq                          ; -off1
  %if %6 == 4
-    movq            m6, [stkq+offq*2+%7*0]      ; p1
-    movhps          m6, [stkq+offq*2+%7*1]
+    movq            m6, [stkq+offq*2+32*0]      ; p1
+    movhps          m6, [stkq+offq*2+32*1]
  %else
-    movu            m6, [stkq+offq*2+%7*0]      ; p1
+    movu            m6, [stkq+offq*2+32*0]      ; p1
  %endif
- %if cpuflag(sse4)
+ %if %7
+  %if cpuflag(sse4)
     ; out of bounds values are set to a value that is a both a large unsigned
     ; value and a negative signed value.
     ; use signed max and unsigned min to remove them
@@ -173,40 +135,26 @@
     pminuw          m8, m5
     pmaxsw          m7, m6
     pminuw          m8, m6
- %else
-  %if ARCH_X86_64
-    pcmpeqw         m9, m14, m5
-    pcmpeqw        m10, m14, m6
-    pandn           m9, m5
-    pandn          m10, m6
-    pmaxsw          m7, m9                      ; max after p0
-    pminsw          m8, m5                      ; min after p0
-    pmaxsw          m7, m10                     ; max after p1
-    pminsw          m8, m6                      ; min after p1
   %else
-    pcmpeqw         m9, m5, OUT_OF_BOUNDS_MEM
-    pandn           m9, m5
-    pmaxsw          m7, m9                      ; max after p0
-    pminsw          m8, m5                      ; min after p0
-    pcmpeqw         m9, m6, OUT_OF_BOUNDS_MEM
-    pandn           m9, m6
-    pmaxsw          m7, m9                      ; max after p1
-    pminsw          m8, m6                      ; min after p1
+    pcmpeqw         m3, m14, m5
+    pminsw          m8, m5     ; min after p0
+    pandn           m3, m5
+    pmaxsw          m7, m3     ; max after p0
+    pcmpeqw         m3, m14, m6
+    pminsw          m8, m6     ; min after p1
+    pandn           m3, m6
+    pmaxsw          m7, m3     ; max after p1
   %endif
  %endif
 
     ; accumulate sum[m13] over p0/p1
-    psubw           m5, m4          ; diff_p0(p0 - px)
-    psubw           m6, m4          ; diff_p1(p1 - px)
-    packsswb        m5, m6          ; convert pixel diff to 8-bit
+    psubw           m5, m4     ; diff_p0(p0 - px)
+    psubw           m6, m4     ; diff_p1(p1 - px)
+    packsswb        m5, m6     ; convert pixel diff to 8-bit
  %if cpuflag(ssse3)
-  %if ARCH_X86_64 && cpuflag(sse4)
-    pshufb          m5, m14         ; group diffs p0 and p1 into pairs
-  %else
-    pshufb          m5, [PIC_sym(shufb_lohi)]
-  %endif
+    pshufb          m5, m13    ; group diffs p0 and p1 into pairs
     pabsb           m6, m5
-    psignb          m9, %5, m5
+    psignb          m3, %5, m5
  %else
     movlhps         m6, m5
     punpckhbw       m6, m5
@@ -214,111 +162,113 @@
     pcmpgtb         m5, m6
     paddb           m6, m5
     pxor            m6, m5
-    paddb           m9, %5, m5
-    pxor            m9, m5
+    paddb           m3, %5, m5
+    pxor            m3, m5
  %endif
- %if ARCH_X86_64
-    psrlw          m10, m6, %2      ; emulate 8-bit shift
-    pand           m10, %3
-    psubusb         m5, %4, m10
- %else
-    psrlw           m5, m6, %2      ; emulate 8-bit shift
-    pand            m5, %3
-    paddusb         m5, %4
-    pxor            m5, [PIC_sym(pb_0xFF)]
- %endif
-    pminub          m5, m6          ; constrain(diff_p)
+    pand            m9, %3, m6 ; emulate 8-bit shift
+    psrlw           m9, %2
+    psubusb         m5, %4, m9
+    pminub          m5, m6     ; constrain(diff_p)
  %if cpuflag(ssse3)
-    pmaddubsw       m5, m9          ; constrain(diff_p) * taps
+    pmaddubsw       m5, m3     ; constrain(diff_p) * taps
  %else
-    psrlw           m2, m5, 8
-    psraw           m6, m9, 8
+    psrlw           m9, m5, 8
+    psraw           m6, m3, 8
     psllw           m5, 8
-    psllw           m9, 8
-    pmullw          m2, m6
-    pmulhw          m5, m9
-    paddw           m5, m2
+    psllw           m3, 8
+    pmullw          m9, m6
+    pmulhw          m5, m3
+    paddw           m5, m9
  %endif
-    paddw          m13, m5
+    paddw           m0, m5
 %endmacro
 
-%macro LOAD_BODY 4  ; dst, src, block_width, tmp_stride
+%macro LOAD_BODY 3 ; dst, src, block_width: widen 4 pixel rows to words, 32 bytes of dst per row
  %if %3 == 4
     PMOVZXBW        m0, [%2+strideq*0]
     PMOVZXBW        m1, [%2+strideq*1]
     PMOVZXBW        m2, [%2+strideq*2]
     PMOVZXBW        m3, [%2+stride3q]
+    mova     [%1+32*0], m0     ; row 0
+    mova     [%1+32*1], m1     ; row 1
+    mova     [%1+32*2], m2     ; row 2
+    mova     [%1+32*3], m3     ; row 3
  %else
     movu            m0, [%2+strideq*0]
     movu            m1, [%2+strideq*1]
     movu            m2, [%2+strideq*2]
     movu            m3, [%2+stride3q]
-    punpckhbw       m4, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m5, m1, m15
-    punpcklbw       m1, m15
-    punpckhbw       m6, m2, m15
-    punpcklbw       m2, m15
-    punpckhbw       m7, m3, m15
-    punpcklbw       m3, m15
+    punpcklbw       m4, m0, m7 ; low 8 bytes -> words (m7 is expected to be zero)
+    punpckhbw       m0, m7     ; high 8 bytes -> words
+    mova  [%1+32*0+ 0], m4     ; row 0, words 0-7
+    mova  [%1+32*0+16], m0     ; row 0, words 8-15
+    punpcklbw       m4, m1, m7
+    punpckhbw       m1, m7
+    mova  [%1+32*1+ 0], m4     ; row 1
+    mova  [%1+32*1+16], m1
+    punpcklbw       m4, m2, m7
+    punpckhbw       m2, m7
+    mova  [%1+32*2+ 0], m4     ; row 2
+    mova  [%1+32*2+16], m2
+    punpcklbw       m4, m3, m7
+    punpckhbw       m3, m7
+    mova  [%1+32*3+ 0], m4     ; row 3
+    mova  [%1+32*3+16], m3
  %endif
-    mova     [%1+0*%4], m0
-    mova     [%1+1*%4], m1
-    mova     [%1+2*%4], m2
-    mova     [%1+3*%4], m3
- %if %3 == 8
-    mova [%1+0*%4+2*8], m4
-    mova [%1+1*%4+2*8], m5
-    mova [%1+2*%4+2*8], m6
-    mova [%1+3*%4+2*8], m7
- %endif
 %endmacro
 
-%macro CDEF_FILTER 3 ; w, h, stride
-
- %if cpuflag(sse4)
-  %define OUT_OF_BOUNDS 0x80008000
+%macro CDEF_FILTER_END 2 ; w, minmax (%2 != 0: clamp the result using m7/m8)
+    pxor            m6, m6
+    pcmpgtw         m6, m0     ; m6 = -1 in every lane where sum (m0) < 0
+    paddw           m0, m6     ; sum -= (sum < 0)
+ %if cpuflag(ssse3)
+    pmulhrsw        m0, m15    ; m15 = pw_2048: (sum * 2048 + 0x4000) >> 15 == (sum + 8) >> 4
  %else
-  %define OUT_OF_BOUNDS 0x7FFF7FFF
+    paddw           m0, m15    ; m15 = pw_8
+    psraw           m0, 4      ; (sum + 8) >> 4
  %endif
+    paddw           m4, m0     ; px + rounded sum
+ %if %2
+    pminsw          m4, m7     ; m7 = running max set up by the caller
+    pmaxsw          m4, m8     ; m8 = running min set up by the caller
+ %endif
+    packuswb        m4, m4     ; words -> bytes with unsigned saturation
+ %if %1 == 4
+    movd [dstq+strideq*0], m4  ; w == 4: m4 holds two 4-pixel rows; store the first
+    psrlq           m4, 32     ; bring the second row into the low dword
+    movd [dstq+strideq*1], m4
+    add           stkq, 32*2   ; advance two 32-byte buffer rows
+    lea           dstq, [dstq+strideq*2]
+ %else
+    movq        [dstq], m4     ; w == 8: one 8-pixel row
+    add           stkq, 32     ; advance one 32-byte buffer row
+    add           dstq, strideq
+ %endif
+%endmacro
 
+%macro CDEF_FILTER 2 ; w, h
  %if ARCH_X86_64
-cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \
-                           dst, stride, left, top, pri, sec, stride3, dst4, edge
-    pcmpeqw        m14, m14
-  %if cpuflag(sse4)
-    psllw          m14, 15                  ; 0x8000
-  %else
-    psrlw          m14, 1                   ; 0x7FFF
-  %endif
-    pxor           m15, m15
-
-  %define px rsp+3*16+2*%3
+cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
+                           dst, stride, left, top, pri, sec, edge, stride3, dst4
+  %define px rsp+3*16+2*32
+  %define base 0
  %else
-cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
-                           dst, stride, left, top, stride3, dst4, edge
-    SAVE_ARG      left, 2
-    SAVE_ARG       top, 3
-    SAVE_ARG       pri, 4
-    SAVE_ARG       sec, 5
-    SAVE_ARG       dir, 6
-    SAVE_ARG   damping, 7
-
-  %define PIC_reg r2
-    LEA        PIC_reg, PIC_base_offset
-
-  %if cpuflag(sse4)
-   %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)]
-  %else
-   %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)]
-  %endif
-
-  %define m15 [PIC_sym(pb_0)]
-
-  %define px esp+7*16+2*%3
+cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
+                           dst, stride, left, edge, stride3
+    %define       topq  r2
+    %define      dst4q  r2
+    LEA             r5, tap_table
+  %define px esp+7*16+2*32
+  %define base r5-tap_table
  %endif
-
     mov          edged, r8m
+ %if cpuflag(sse4)
+   %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
+ %else
+   %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
+ %endif
+    mova            m6, OUT_OF_BOUNDS_MEM
+    pxor            m7, m7
 
     ; prepare pixel buffers - body/right
  %if %2 == 8
@@ -325,11 +275,11 @@
     lea          dst4q, [dstq+strideq*4]
  %endif
     lea       stride3q, [strideq*3]
-    test         edged, 2                   ; have_right
+    test         edgeb, 2 ; have_right
     jz .no_right
-    LOAD_BODY       px, dstq, %1, %3
+    LOAD_BODY       px, dstq, %1
  %if %2 == 8
-    LOAD_BODY  px+4*%3, dst4q, %1, %3
+    LOAD_BODY  px+4*32, dst4q, %1
  %endif
     jmp .body_done
 .no_right:
@@ -337,39 +287,37 @@
     PMOVZXBW        m1, [dstq+strideq*1], %1 == 4
     PMOVZXBW        m2, [dstq+strideq*2], %1 == 4
     PMOVZXBW        m3, [dstq+stride3q ], %1 == 4
+    mova     [px+32*0], m0
+    mova     [px+32*1], m1
+    mova     [px+32*2], m2
+    mova     [px+32*3], m3
+    movd [px+32*0+%1*2], m6
+    movd [px+32*1+%1*2], m6
+    movd [px+32*2+%1*2], m6
+    movd [px+32*3+%1*2], m6
  %if %2 == 8
-    PMOVZXBW        m4, [dst4q+strideq*0], %1 == 4
-    PMOVZXBW        m5, [dst4q+strideq*1], %1 == 4
-    PMOVZXBW        m6, [dst4q+strideq*2], %1 == 4
-    PMOVZXBW        m7, [dst4q+stride3q ], %1 == 4
+    PMOVZXBW        m0, [dst4q+strideq*0], %1 == 4
+    PMOVZXBW        m1, [dst4q+strideq*1], %1 == 4
+    PMOVZXBW        m2, [dst4q+strideq*2], %1 == 4
+    PMOVZXBW        m3, [dst4q+stride3q ], %1 == 4
+    mova     [px+32*4], m0
+    mova     [px+32*5], m1
+    mova     [px+32*6], m2
+    mova     [px+32*7], m3
+    movd [px+32*4+%1*2], m6
+    movd [px+32*5+%1*2], m6
+    movd [px+32*6+%1*2], m6
+    movd [px+32*7+%1*2], m6
  %endif
-    mova     [px+0*%3], m0
-    mova     [px+1*%3], m1
-    mova     [px+2*%3], m2
-    mova     [px+3*%3], m3
- %if %2 == 8
-    mova     [px+4*%3], m4
-    mova     [px+5*%3], m5
-    mova     [px+6*%3], m6
-    mova     [px+7*%3], m7
-    mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS
- %endif
-    mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS
 .body_done:
 
     ; top
-    LOAD_ARG32     top
-    test         edged, 4                    ; have_top
+    movifnidn     topq, r3mp
+    test         edgeb, 4 ; have_top
     jz .no_top
-    test         edged, 1                    ; have_left
+    test         edgeb, 1 ; have_left
     jz .top_no_left
-    test         edged, 2                    ; have_right
+    test         edgeb, 2 ; have_right
     jz .top_no_right
  %if %1 == 4
     PMOVZXBW        m0, [topq+strideq*0-2]
@@ -377,39 +325,39 @@
  %else
     movu            m0, [topq+strideq*0-4]
     movu            m1, [topq+strideq*1-4]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    movu  [px-2*%3+8], m2
-    movu  [px-1*%3+8], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    movu  [px-32*2+8], m2
+    movu  [px-32*1+8], m3
  %endif
-    movu  [px-2*%3-%1], m0
-    movu  [px-1*%3-%1], m1
+    movu  [px-32*2-%1], m0
+    movu  [px-32*1-%1], m1
     jmp .top_done
 .top_no_right:
  %if %1 == 4
     PMOVZXBW        m0, [topq+strideq*0-%1]
     PMOVZXBW        m1, [topq+strideq*1-%1]
-    movu [px-2*%3-4*2], m0
-    movu [px-1*%3-4*2], m1
+    movu   [px-32*2-8], m0
+    movu   [px-32*1-8], m1
  %else
     movu            m0, [topq+strideq*0-%1]
     movu            m1, [topq+strideq*1-%2]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    mova [px-2*%3-8*2], m0
-    mova [px-2*%3-0*2], m2
-    mova [px-1*%3-8*2], m1
-    mova [px-1*%3-0*2], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    mova  [px-32*2-16], m0
+    mova  [px-32*2+ 0], m2
+    mova  [px-32*1-16], m1
+    mova  [px-32*1+ 0], m3
  %endif
-    mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
+    movd [px-32*2+%1*2], m6
+    movd [px-32*1+%1*2], m6
     jmp .top_done
 .top_no_left:
-    test         edged, 2                   ; have_right
+    test         edgeb, 2 ; have_right
     jz .top_no_left_right
  %if %1 == 4
     PMOVZXBW        m0, [topq+strideq*0]
@@ -417,102 +365,92 @@
  %else
     movu            m0, [topq+strideq*0]
     movu            m1, [topq+strideq*1]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    movd [px-2*%3+8*2], m2
-    movd [px-1*%3+8*2], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    movd  [px-32*2+16], m2
+    movd  [px-32*1+16], m3
  %endif
-    mova     [px-2*%3], m0
-    mova     [px-1*%3], m1
-    mov dword [px-2*%3-4], OUT_OF_BOUNDS
-    mov dword [px-1*%3-4], OUT_OF_BOUNDS
+    movd  [px-32*2- 4], m6
+    movd  [px-32*1- 4], m6
+    mova  [px-32*2+ 0], m0
+    mova  [px-32*1+ 0], m1
     jmp .top_done
 .top_no_left_right:
     PMOVZXBW        m0, [topq+strideq*0], %1 == 4
     PMOVZXBW        m1, [topq+strideq*1], %1 == 4
-    mova     [px-2*%3], m0
-    mova     [px-1*%3], m1
-    mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px-2*%3-4], OUT_OF_BOUNDS
-    mov dword [px-1*%3-4], OUT_OF_BOUNDS
+    movd   [px-32*2-4], m6
+    movd   [px-32*1-4], m6
+    mova   [px-32*2+0], m0
+    mova   [px-32*1+0], m1
+    movd [px-32*2+%1*2], m6
+    movd [px-32*1+%1*2], m6
     jmp .top_done
 .no_top:
- %if ARCH_X86_64
-    SWAP            m0, m14
- %else
-    mova            m0, OUT_OF_BOUNDS_MEM
- %endif
-    movu   [px-2*%3-4], m0
-    movu   [px-1*%3-4], m0
+    movu  [px-32*2- 4], m6
+    movu  [px-32*1- 4], m6
  %if %1 == 8
-    movq   [px-2*%3+12], m0
-    movq   [px-1*%3+12], m0
+    movq  [px-32*2+12], m6
+    movq  [px-32*1+12], m6
  %endif
- %if ARCH_X86_64
-    SWAP            m0, m14
- %endif
 .top_done:
 
     ; left
-    test         edged, 1                   ; have_left
+    test         edgeb, 1 ; have_left
     jz .no_left
-    SAVE_PIC_REG     0
-    LOAD_ARG32    left
+    movifnidn    leftq, leftmp
  %if %2 == 4
     movq            m0, [leftq]
  %else
     movu            m0, [leftq]
  %endif
-    LOAD_PIC_REG     0
  %if %2 == 4
-    punpcklbw       m0, m15
+    punpcklbw       m0, m7
  %else
-    punpckhbw       m1, m0, m15
-    punpcklbw       m0, m15
+    punpckhbw       m1, m0, m7
+    punpcklbw       m0, m7
     movhlps         m3, m1
-    movd   [px+4*%3-4], m1
-    movd   [px+6*%3-4], m3
+    movd   [px+32*4-4], m1
+    movd   [px+32*6-4], m3
     psrlq           m1, 32
     psrlq           m3, 32
-    movd   [px+5*%3-4], m1
-    movd   [px+7*%3-4], m3
+    movd   [px+32*5-4], m1
+    movd   [px+32*7-4], m3
  %endif
     movhlps         m2, m0
-    movd   [px+0*%3-4], m0
-    movd   [px+2*%3-4], m2
+    movd   [px+32*0-4], m0
+    movd   [px+32*2-4], m2
     psrlq           m0, 32
     psrlq           m2, 32
-    movd   [px+1*%3-4], m0
-    movd   [px+3*%3-4], m2
+    movd   [px+32*1-4], m0
+    movd   [px+32*3-4], m2
     jmp .left_done
 .no_left:
-    mov dword [px+0*%3-4], OUT_OF_BOUNDS
-    mov dword [px+1*%3-4], OUT_OF_BOUNDS
-    mov dword [px+2*%3-4], OUT_OF_BOUNDS
-    mov dword [px+3*%3-4], OUT_OF_BOUNDS
+    movd   [px+32*0-4], m6
+    movd   [px+32*1-4], m6
+    movd   [px+32*2-4], m6
+    movd   [px+32*3-4], m6
  %if %2 == 8
-    mov dword [px+4*%3-4], OUT_OF_BOUNDS
-    mov dword [px+5*%3-4], OUT_OF_BOUNDS
-    mov dword [px+6*%3-4], OUT_OF_BOUNDS
-    mov dword [px+7*%3-4], OUT_OF_BOUNDS
+    movd   [px+32*4-4], m6
+    movd   [px+32*5-4], m6
+    movd   [px+32*6-4], m6
+    movd   [px+32*7-4], m6
  %endif
 .left_done:
 
     ; bottom
  %if ARCH_X86_64
-    DEFINE_ARGS dst, stride, dummy1, dst8, pri, sec, stride3, dummy2, edge
+    DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3
  %else
-    DEFINE_ARGS dst, stride, dummy1, dst8, stride3, dummy2, edge
+    DEFINE_ARGS dst, stride, dst8, edge, stride3
  %endif
-    test         edged, 8                   ; have_bottom
+    test         edgeb, 8 ; have_bottom
     jz .no_bottom
     lea          dst8q, [dstq+%2*strideq]
-    test         edged, 1                   ; have_left
+    test         edgeb, 1 ; have_left
     jz .bottom_no_left
-    test         edged, 2                   ; have_right
+    test         edgeb, 2 ; have_right
     jz .bottom_no_right
  %if %1 == 4
     PMOVZXBW        m0, [dst8q-(%1/2)]
@@ -520,40 +458,40 @@
  %else
     movu            m0, [dst8q-4]
     movu            m1, [dst8q+strideq-4]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    movu [px+(%2+0)*%3+8], m2
-    movu [px+(%2+1)*%3+8], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    movu [px+32*(%2+0)+8], m2
+    movu [px+32*(%2+1)+8], m3
  %endif
-    movu [px+(%2+0)*%3-%1], m0
-    movu [px+(%2+1)*%3-%1], m1
+    movu [px+32*(%2+0)-%1], m0
+    movu [px+32*(%2+1)-%1], m1
     jmp .bottom_done
 .bottom_no_right:
  %if %1 == 4
     PMOVZXBW        m0, [dst8q-4]
     PMOVZXBW        m1, [dst8q+strideq-4]
-    movu [px+(%2+0)*%3-4*2], m0
-    movu [px+(%2+1)*%3-4*2], m1
+    movu [px+32*(%2+0)-8], m0
+    movu [px+32*(%2+1)-8], m1
  %else
     movu            m0, [dst8q-8]
     movu            m1, [dst8q+strideq-8]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    mova [px+(%2+0)*%3-8*2], m0
-    mova [px+(%2+0)*%3-0*2], m2
-    mova [px+(%2+1)*%3-8*2], m1
-    mova [px+(%2+1)*%3-0*2], m3
-    mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS     ; overwritten by first mova
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    mova [px+32*(%2+0)-16], m0
+    mova [px+32*(%2+0)+ 0], m2
+    mova [px+32*(%2+1)-16], m1
+    mova [px+32*(%2+1)+ 0], m3
+    movd [px+32*(%2-1)+16], m6 ; overwritten by first mova
  %endif
-    mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
+    movd [px+32*(%2+0)+%1*2], m6
+    movd [px+32*(%2+1)+%1*2], m6
     jmp .bottom_done
 .bottom_no_left:
-    test          edged, 2                  ; have_right
+    test         edgeb, 2 ; have_right
     jz .bottom_no_left_right
  %if %1 == 4
     PMOVZXBW        m0, [dst8q]
@@ -561,233 +499,245 @@
  %else
     movu            m0, [dst8q]
     movu            m1, [dst8q+strideq]
-    punpckhbw       m2, m0, m15
-    punpcklbw       m0, m15
-    punpckhbw       m3, m1, m15
-    punpcklbw       m1, m15
-    mova [px+(%2+0)*%3+8*2], m2
-    mova [px+(%2+1)*%3+8*2], m3
+    punpckhbw       m2, m0, m7
+    punpcklbw       m0, m7
+    punpckhbw       m3, m1, m7
+    punpcklbw       m1, m7
+    mova [px+32*(%2+0)+16], m2
+    mova [px+32*(%2+1)+16], m3
  %endif
-    mova [px+(%2+0)*%3], m0
-    mova [px+(%2+1)*%3], m1
-    mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+    mova [px+32*(%2+0)+ 0], m0
+    mova [px+32*(%2+1)+ 0], m1
+    movd [px+32*(%2+0)- 4], m6
+    movd [px+32*(%2+1)- 4], m6
     jmp .bottom_done
 .bottom_no_left_right:
     PMOVZXBW        m0, [dst8q+strideq*0], %1 == 4
     PMOVZXBW        m1, [dst8q+strideq*1], %1 == 4
-    mova [px+(%2+0)*%3], m0
-    mova [px+(%2+1)*%3], m1
-    mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
-    mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
-    mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+    mova [px+32*(%2+0)+ 0], m0
+    mova [px+32*(%2+1)+ 0], m1
+    movd [px+32*(%2+0)+%1*2], m6
+    movd [px+32*(%2+1)+%1*2], m6
+    movd [px+32*(%2+0)- 4], m6
+    movd [px+32*(%2+1)- 4], m6
     jmp .bottom_done
 .no_bottom:
- %if ARCH_X86_64
-    SWAP            m0, m14
- %else
-    mova            m0, OUT_OF_BOUNDS_MEM
- %endif
-    movu [px+(%2+0)*%3-4], m0
-    movu [px+(%2+1)*%3-4], m0
+    movu [px+32*(%2+0)- 4], m6
+    movu [px+32*(%2+1)- 4], m6
  %if %1 == 8
-    movq [px+(%2+0)*%3+12], m0
-    movq [px+(%2+1)*%3+12], m0
+    movq [px+32*(%2+0)+12], m6
+    movq [px+32*(%2+1)+12], m6
  %endif
- %if ARCH_X86_64
-    SWAP            m0, m14
- %endif
 .bottom_done:
 
     ; actual filter
-    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp
  %if ARCH_X86_64
-    movifnidn     prid, prim
-    movifnidn     secd, secm
-    mov       dampingd, r7m
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec
+    mova           m13, [shufb_lohi]
+ %if cpuflag(ssse3)
+    mova           m15, [pw_2048]
  %else
-    LOAD_ARG       pri
-    LOAD_ARG       sec
-    LOAD_ARG   damping, 1
+    mova           m15, [pw_8]
  %endif
-
-    SAVE_PIC_REG     8
-    mov        pridmpd, prid
-    mov        secdmpd, secd
-    or         pridmpd, 1
-    or         secdmpd, 1
-    bsr        pridmpd, pridmpd
-    bsr        secdmpd, secdmpd
+    mova           m14, m6
+ %else
+    DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
+    %xdefine        m8  m1
+    %xdefine        m9  m2
+    %xdefine       m10  m0
+    %xdefine       m13  [base+shufb_lohi]
+    %xdefine       m14  OUT_OF_BOUNDS_MEM
+ %if cpuflag(ssse3)
+    %xdefine       m15  [base+pw_2048]
+ %else
+    %xdefine       m15  [base+pw_8]
+ %endif
+ %endif
+    movifnidn     prid, r4m
+    movifnidn     secd, r5m
+    mov       dampingd, r7m
+    movif32 [esp+0x3C], r1d
+    test          prid, prid
+    jz .sec_only
+    movd            m1, prim
+    bsr        pridmpd, prid
+    test          secd, secd
+    jz .pri_only
+    movd           m10, r5m
+    bsr           secd, secd
+    and           prid, 1
     sub        pridmpd, dampingd
-    sub        secdmpd, dampingd
+    sub           secd, dampingd
     xor       dampingd, dampingd
+    add           prid, prid
     neg        pridmpd
     cmovs      pridmpd, dampingd
-    neg        secdmpd
-    cmovs      secdmpd, dampingd
+    neg           secd
+    cmovs         secd, dampingd
+    PSHUFB_0        m1, m7
+    PSHUFB_0       m10, m7
  %if ARCH_X86_64
-    mov       [rsp+ 0], pridmpq                 ; pri_shift
-    mov       [rsp+16], secdmpq                 ; sec_shift
+    DEFINE_ARGS dst, stride, pridmp, tap, pri, sec
+    lea           tapq, [tap_table]
+    MOVDDUP        m11, [tapq+pridmpq*8] ; pri_shift_mask
+    MOVDDUP        m12, [tapq+secq*8]    ; sec_shift_mask
+    mov     [rsp+0x00], pridmpq          ; pri_shift
+    mov     [rsp+0x10], secq             ; sec_shift
+    DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h
  %else
+    MOVDDUP         m2, [tapq+pridmpq*8]
+    MOVDDUP         m3, [tapq+secq*8]
+    mov     [esp+0x04], dampingd         ; zero upper 32 bits of psrlw
+    mov     [esp+0x34], dampingd         ; source operand in ACCUMULATE_TAP
     mov     [esp+0x00], pridmpd
-    mov     [esp+0x30], secdmpd
-    mov dword [esp+0x04], 0                     ; zero upper 32 bits of psrlw
-    mov dword [esp+0x34], 0                     ; source operand in ACCUMULATE_TAP
-  %define PIC_reg r4
-    LOAD_PIC_REG     8
+    mov     [esp+0x30], secd
+    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+  %define         offq  dstq
+  %define           kd  strided
+  %define           kq  strideq
+    mova    [esp+0x10], m2
+    mova    [esp+0x40], m3
+    mova    [esp+0x20], m1
+    mova    [esp+0x50], m10
  %endif
-
-    DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp
-    lea         tableq, [PIC_sym(tap_table)]
- %if ARCH_X86_64
-    SWAP            m2, m11
-    SWAP            m3, m12
+    mov           dird, r6m
+    lea           stkq, [px]
+    lea           priq, [tapq+8*8+priq*8] ; pri_taps
+    mov             hd, %1*%2/8
+    lea           dirq, [tapq+dirq*2]
+.v_loop:
+    movif32 [esp+0x38], dstd
+    mov             kd, 1
+ %if %1 == 4
+    movq            m4, [stkq+32*0]
+    movhps          m4, [stkq+32*1]
+ %else
+    mova            m4, [stkq+32*0]       ; px
  %endif
-    movd            m2, [tableq+pridmpq]
-    movd            m3, [tableq+secdmpq]
-    PSHUFB_0        m2, m15                     ; pri_shift_mask
-    PSHUFB_0        m3, m15                     ; sec_shift_mask
+    pxor            m0, m0                ; sum
+    mova            m7, m4                ; max
+    mova            m8, m4                ; min
+.k_loop:
+    MOVDDUP         m2, [priq+kq*8]
  %if ARCH_X86_64
-    SWAP            m2, m11
-    SWAP            m3, m12
+    ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
+    MOVDDUP         m2, [tapq+12*8+kq*8]
+    ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
+    ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
  %else
-  %define PIC_reg r6
-    mov        PIC_reg, r4
-    DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp
-    LOAD_ARG       pri
-    LOAD_ARG       dir, 1
-    mova    [esp+0x10], m2
-    mova    [esp+0x40], m3
+    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
+    MOVDDUP         m2, [tapq+12*8+kq*8]
+    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+    MOVDDUP         m2, [tapq+12*8+kq*8]
+    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
  %endif
+    dec             kd
+    jge .k_loop
+    movif32       dstq, [esp+0x38]
+    movif32    strideq, [esp+0x3C]
+    CDEF_FILTER_END %1, 1
+    dec             hd
+    jg .v_loop
+    RET
 
-    ; pri/sec_taps[k] [4 total]
-    DEFINE_ARGS dst, stride, dummy, tap, pri, sec
-    movd            m0, prid
-    movd            m1, secd
- %if ARCH_X86_64
-    PSHUFB_0        m0, m15
-    PSHUFB_0        m1, m15
+.pri_only:                                     ; path that accumulates only the 0*2 tap set per k
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero
+    lea           tapq, [tap_table]            ; table base: shift masks, then weights, then tap offsets
  %else
-  %if cpuflag(ssse3)
-    pxor            m2, m2
-  %endif
-    mova            m3, [PIC_sym(pb_0xFF)]
-    PSHUFB_0        m0, m2
-    PSHUFB_0        m1, m2
-    pxor            m0, m3
-    pxor            m1, m3
-    mova    [esp+0x20], m0
-    mova    [esp+0x50], m1
+    DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
  %endif
     and           prid, 1
-    lea           priq, [tapq+8+priq*2]         ; pri_taps
-    lea           secq, [tapq+12]               ; sec_taps
-
- %if ARCH_X86_64 && cpuflag(sse4)
-    mova           m14, [shufb_lohi]
- %endif
-
-    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
-    DEFINE_ARGS dst, stride, dir, tap, pri, sec
+    xor          zerod, zerod                   ; zero = 0, source operand for the cmovs clamp
+    sub       dampingd, pridmpd                 ; damping -= pridmp (sets SF when negative)
+    cmovs     dampingd, zerod                   ; clamp the shift count to >= 0
+    add           prid, prid                    ; pri = (pri & 1) * 2: first entry of the weight pair
+    PSHUFB_0        m1, m7                      ; NOTE(review): presumably broadcasts byte 0 of m1; m7 is reloaded on the next line
+    MOVDDUP         m7, [tapq+dampingq*8]       ; m7 = shift-emulation mask row for this shift count (8 bytes per row)
+    mov     [rsp+0x00], dampingq                ; shift count lives on the stack; handed to ACCUMULATE_TAP as [rsp]
  %if ARCH_X86_64
-    mov           dird, r6m
-    lea           dirq, [tapq+14+dirq*2]
-    DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+    DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h
  %else
-    lea           dird, [tapd+14+dird*2]
-    DEFINE_ARGS dst, stride, dir, stk, pri, sec
-  %define hd    dword [esp+8]
-  %define offq  dstq
-  %define kq    strideq
+    mov     [rsp+0x04], zerod                   ; x86-32: zero bytes 4-7, presumably so [rsp] can be read as a 64-bit count
+    DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
  %endif
-    mov             hd, %1*%2*2/mmsize
+    mov           dird, r6m                     ; dir = argument slot 6
     lea           stkq, [px]
-    movif32 [esp+0x3C], strided
-.v_loop:
+    lea           priq, [tapq+8*8+priq*8]       ; priq -> weight pair: skip the 8 mask rows, 8 bytes per entry
+    mov             hd, %1*%2/8                 ; pass count: each pass loads 16 bytes of px
+    lea           dirq, [tapq+dirq*2]           ; dirq = table base + dir*2; ACCUMULATE_TAP's 1st arg presumably adds the tap-set offset
+.pri_v_loop:
     movif32 [esp+0x38], dstd
-    mov             kq, 1
+    mov             kd, 1                       ; k runs 1, 0 (see dec/jge below)
  %if %1 == 4
-    movq            m4, [stkq+%3*0]
-    movhps          m4, [stkq+%3*1]
+    movq            m4, [stkq+32*0]             ; %1 == 4: low half = 8 bytes of row 0
+    movhps          m4, [stkq+32*1]             ; high half = 8 bytes of row 1; px rows are 32 bytes apart
  %else
-    mova            m4, [stkq+%3*0]             ; px
+    mova            m4, [stkq+32*0]             ; otherwise one 16-byte px row per pass
  %endif
+    pxor            m0, m0                      ; NOTE(review): presumably the tap sum consumed by CDEF_FILTER_END - confirm
+.pri_k_loop:
+    MOVDDUP         m2, [priq+kq*8]             ; m2 = weight entry for tap k
+    ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
+    dec             kd
+    jge .pri_k_loop                             ; repeat while k >= 0
+    movif32       dstq, [esp+0x38]              ; restore dst from the slot written at the top of .pri_v_loop
+    movif32    strideq, [esp+0x3C]              ; restore stride from its stack slot
+    CDEF_FILTER_END %1, 0
+    dec             hd
+    jg .pri_v_loop                              ; next 16 bytes of px while passes remain
+    RET
 
- %if ARCH_X86_32
-  %xdefine m9   m3
-  %xdefine m13  m7
-  %xdefine  m7  m0
-  %xdefine  m8  m1
- %endif
-
-    pxor           m13, m13                     ; sum
-    mova            m7, m4                      ; max
-    mova            m8, m4                      ; min
-.k_loop:
-    movd            m2, [priq+kq]               ; pri_taps
+.sec_only:                                     ; path that accumulates only the 2*2 and 6*2 tap sets per k
+%if ARCH_X86_64
+    DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero
+%else
+    DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
+%endif
+    movd            m1, r5m                     ; m1 = argument slot 5 (presumably the sec strength, cf. secd)
+    bsr           secd, secd                    ; sec = index of its highest set bit (undefined for 0)
+    mov           dird, r6m                     ; dir = argument slot 6
+    xor          zerod, zerod                   ; zero = 0, source operand for the cmovs clamp
+    sub       dampingd, secd                    ; damping -= bsr(sec) (sets SF when negative)
+    cmovs     dampingd, zerod                   ; clamp the shift count to >= 0
+    PSHUFB_0        m1, m7                      ; NOTE(review): presumably broadcasts byte 0 of m1; m7 is reloaded further down
  %if ARCH_X86_64
-    PSHUFB_0        m2, m15
-  %if cpuflag(ssse3)
-    LOAD_SEC_TAP                                ; sec_taps
-  %endif
-    ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
-  %if notcpuflag(ssse3)
-    LOAD_SEC_TAP                                ; sec_taps
-  %endif
-    ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
-    ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
+    lea           tapq, [tap_table]             ; table base: shift masks, then weights, then tap offsets
  %else
-  %if cpuflag(ssse3)
-    pxor            m3, m3
-  %endif
-    PSHUFB_0        m2, m3
-    ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
-    LOAD_SEC_TAP                                ; sec_taps
-    ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
-  %if notcpuflag(ssse3)
-    LOAD_SEC_TAP                                ; sec_taps
-  %endif
-    ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+    mov     [rsp+0x04], zerod                   ; x86-32: zero bytes 4-7, presumably so [rsp] can be read as a 64-bit count
  %endif
-
-    dec             kq
-    jge .k_loop
-
-    pxor            m6, m6
-    pcmpgtw         m6, m13
-    paddw          m13, m6
- %if cpuflag(ssse3)
-    pmulhrsw       m13, [PIC_sym(pw_2048)]
+    mov     [rsp+0x00], dampingq                ; shift count lives on the stack; handed to ACCUMULATE_TAP as [rsp]
+    MOVDDUP         m7, [tapq+dampingq*8]       ; m7 = shift-emulation mask row for this shift count (8 bytes per row)
+    lea           dirq, [tapq+dirq*2]           ; dirq = table base + dir*2; ACCUMULATE_TAP's 1st arg presumably adds the tap-set offset
+ %if ARCH_X86_64
+    DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h
  %else
-    paddw          m13, [PIC_sym(pw_8)]
-    psraw          m13, 4
+    DEFINE_ARGS dst, stride, off, stk, dir, tap, h
  %endif
-    paddw           m4, m13
-    pminsw          m4, m7
-    pmaxsw          m4, m8
-    packuswb        m4, m4
-    movif32       dstd, [esp+0x38]
-    movif32    strided, [esp+0x3C]
+    lea           stkq, [px]                    ; stkq = address of the px buffer
+    mov             hd, %1*%2/8                 ; pass count: each pass loads 16 bytes of px
+.sec_v_loop:
+    mov             kd, 1                       ; k runs 1, 0 (see dec/jge below)
  %if %1 == 4
-    movd [dstq+strideq*0], m4
-    psrlq           m4, 32
-    movd [dstq+strideq*1], m4
+    movq            m4, [stkq+32*0]             ; %1 == 4: low half = 8 bytes of row 0
+    movhps          m4, [stkq+32*1]             ; high half = 8 bytes of row 1; px rows are 32 bytes apart
  %else
-    movq [dstq], m4
+    mova            m4, [stkq+32*0]             ; otherwise one 16-byte px row per pass
  %endif
-
- %if %1 == 4
- %define vloop_lines (mmsize/(%1*2))
-    lea           dstq, [dstq+strideq*vloop_lines]
-    add           stkq, %3*vloop_lines
- %else
-    lea           dstq, [dstq+strideq]
-    add           stkq, %3
+    pxor            m0, m0                      ; NOTE(review): presumably the tap sum consumed by CDEF_FILTER_END - confirm
+.sec_k_loop:
+    MOVDDUP         m2, [tapq+12*8+kq*8]        ; m2 = weight entry 12+k (8 mask rows + 4 earlier weights precede it)
+    ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
+ %if ARCH_X86_32
+    MOVDDUP         m2, [tapq+12*8+kq*8]        ; NOTE(review): same weight reloaded; presumably ACCUMULATE_TAP clobbers m2 on x86-32
  %endif
+    ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
+    dec             kd
+    jge .sec_k_loop                             ; repeat while k >= 0
+    movif32    strideq, [esp+0x3C]              ; restore stride from its stack slot
+    CDEF_FILTER_END %1, 0
     dec             hd
-    jg .v_loop
-
+    jg .sec_v_loop                              ; next 16 bytes of px while passes remain
+    RET
 %endmacro
 
@@ -1079,18 +1029,16 @@
     shr            r1d, 10
     mov         [varq], r1d
  %else
-cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
-  %define PIC_reg r4
-    LEA        PIC_reg, PIC_base_offset
-
+cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
+%define base r2-shufw_6543210x
+    LEA             r2, shufw_6543210x          ; r2 (var's register slot) = &shufw_6543210x, so [base+sym] addresses any constant; var is reloaded from varm later
     pxor            m0, m0
-    mova            m1, [PIC_sym(pw_128)]
-
     lea       stride3q, [strideq*3]
     movq            m5, [srcq+strideq*0]
     movhps          m5, [srcq+strideq*1]
     movq            m7, [srcq+strideq*2]
     movhps          m7, [srcq+stride3q]
+    mova            m1, [base+pw_128]           ; pw_128 via the r2-relative base; load now sits after the src row loads
     psadbw          m2, m5, m0
     psadbw          m3, m7, m0
     packssdw        m2, m3
@@ -1143,7 +1091,7 @@
     pmaddwd         m0, m0
 
     phaddd          m2, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+48]
+    MULLD           m2, [base+div_table%+SUFFIX+48]
     mova    [esp+0x30], m2
 
     mova            m1, [esp+0x10]
@@ -1176,13 +1124,13 @@
     paddw           m0, m2                  ; partial_sum_diag[0][0-7]
     paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
     mova            m3, [esp+0x50]
-    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    pshufb          m1, [base+shufw_6543210x]
     punpckhwd       m2, m0, m1
     punpcklwd       m0, m1
     pmaddwd         m2, m2
     pmaddwd         m0, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+16]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+0]
+    MULLD           m2, [base+div_table%+SUFFIX+16]
+    MULLD           m0, [base+div_table%+SUFFIX+ 0]
     paddd           m0, m2                  ; cost[0a-d]
     mova    [esp+0x40], m0
 
@@ -1217,13 +1165,13 @@
     paddw           m0, m2                  ; partial_sum_diag[1][0-7]
     paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
     mova            m3, [esp+0x50]
-    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    pshufb          m1, [base+shufw_6543210x]
     punpckhwd       m2, m0, m1
     punpcklwd       m0, m1
     pmaddwd         m2, m2
     pmaddwd         m0, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+16]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+0]
+    MULLD           m2, [base+div_table%+SUFFIX+16]
+    MULLD           m0, [base+div_table%+SUFFIX+ 0]
     paddd           m0, m2                  ; cost[4a-d]
     phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
     phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
@@ -1259,8 +1207,8 @@
     punpcklwd       m0, m1
     pmaddwd         m2, m2
     pmaddwd         m0, m0
-    MULLD           m2, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m2, [base+div_table%+SUFFIX+48]
+    MULLD           m0, [base+div_table%+SUFFIX+32]
     paddd           m0, m2                  ; cost[7a-d]
     mova    [esp+0x40], m0
 
@@ -1280,8 +1228,8 @@
     punpcklwd       m0, m2
     pmaddwd         m7, m7
     pmaddwd         m0, m0
-    MULLD           m7, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m7, [base+div_table%+SUFFIX+48]
+    MULLD           m0, [base+div_table%+SUFFIX+32]
     paddd           m0, m7                  ; cost[5a-d]
     mova    [esp+0x50], m0
 
@@ -1303,8 +1251,8 @@
     punpcklwd       m0, m2
     pmaddwd         m7, m7
     pmaddwd         m0, m0
-    MULLD           m7, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m7, [base+div_table%+SUFFIX+48]
+    MULLD           m0, [base+div_table%+SUFFIX+32]
     paddd           m0, m7                  ; cost[1a-d]
     SWAP            m0, m4
 
@@ -1330,8 +1278,8 @@
     punpcklwd       m4, m2
     pmaddwd         m0, m0
     pmaddwd         m4, m4
-    MULLD           m0, [PIC_sym(div_table%+SUFFIX)+48]
-    MULLD           m4, [PIC_sym(div_table%+SUFFIX)+32]
+    MULLD           m0, [base+div_table%+SUFFIX+48]
+    MULLD           m4, [base+div_table%+SUFFIX+32]
     paddd           m4, m0                   ; cost[3a-d]
 
     mova            m1, [esp+0x00]
@@ -1367,6 +1315,7 @@
   %endif
 
     ; get direction and variance
+    mov           vard, varm                    ; fetch var from its argument slot: r2 held the constant-table base until here
     punpckhdq       m3, m2, m1
     punpckldq       m2, m1
     psubd           m1, m0, m3
@@ -1388,18 +1337,18 @@
 %endmacro
 
 INIT_XMM sse4
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8                                ; %1, %2 only: the former 3rd arg (always 32) is now a literal in the macro's px addressing
+CDEF_FILTER 4, 8                                ; %1 == 4 selects the two-rows-per-register loads
+CDEF_FILTER 4, 4                                ; %1*%2/8 = 2 passes of the row loop
 CDEF_DIR
 
 INIT_XMM ssse3
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8                                ; same three sizes, assembled for ssse3
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
 CDEF_DIR
 
 INIT_XMM sse2
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8                                ; same three sizes, assembled for sse2
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4