ref: 6cf58c8e7deb54e287afeee6710b2a3774eded9c
parent: f55cd4c6f3d57494c4cea5a3b56145981a28b0c5
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Jul 13 21:13:16 EDT 2020
x86: Add cdef_filter SSE optimizations
--- a/src/x86/cdef_sse.asm
+++ b/src/x86/cdef_sse.asm
@@ -28,28 +28,31 @@
SECTION_RODATA 16
-%if ARCH_X86_32
-pb_0: times 16 db 0
-pb_0xFF: times 16 db 0xFF
-%endif
+%macro DUP8 1-* ; byte values: emit every argument as 8 identical bytes
+ %rep %0
+ times 8 db %1 ; 8 copies of the current argument
+ %rotate 1 ; make the next argument %1
+ %endrep
+%endmacro
+
+div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
+ dd 420, 210, 140, 105, 105, 105, 105, 105
+div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210 ; same values as div_table_sse4, each stored as a pair of words
+ dw 168, 168, 140, 140, 120, 120, 105, 105
+ dw 420, 420, 210, 210, 140, 140, 105, 105
+ dw 105, 105, 105, 105, 105, 105, 105, 105
+shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 ; pshufb mask: reverse words 0-6, keep word 7
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 ; pshufb mask: interleave low-half and high-half bytes
pw_8: times 8 dw 8
pw_128: times 8 dw 128
pw_256: times 8 dw 256
pw_2048: times 8 dw 2048
-%if ARCH_X86_32
pw_0x7FFF: times 8 dw 0x7FFF
pw_0x8000: times 8 dw 0x8000
-%endif
-div_table_sse4: dd 840, 420, 280, 210, 168, 140, 120, 105
- dd 420, 210, 140, 105, 105, 105, 105, 105
-div_table_ssse3: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105
- dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105
-shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
tap_table: ; masks for 8-bit shift emulation
- db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+ DUP8 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80 ; entry s is 0xFF << s, 8 bytes each (byte offset s*8)
; weights
- db 4, 2, 3, 3, 2, 1
+ DUP8 4, 2, 3, 3, 2, 1 ; 8 bytes per weight, starting at tap_table+8*8
; taps indices
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
@@ -75,59 +78,19 @@
%endif
%endmacro
-%macro SAVE_ARG 2 ; varname, argnum
- %define %1_stkloc [rsp+%2*gprsize]
- %define %1_argnum %2
- mov r2, r%2m
- mov %1_stkloc, r2
-%endmacro
-
-%macro LOAD_ARG 1-2 0 ; varname, load_to_varname_register
- %if %2 == 0
- mov r %+ %{1}_argnum, %1_stkloc
+%macro PMOVZXBW 2-3 0 ; dst, src, half: zero-extend bytes to words (half=1 loads only 4 bytes)
+ %if cpuflag(sse4) && %3 == 0
+ pmovzxbw %1, %2 ; native 8-byte form, only used when half == 0
 %else
- mov %1q, %1_stkloc
- %endif
-%endmacro
-
-%macro LOAD_ARG32 1-2 ; varname, load_to_varname_register
- %if ARCH_X86_32
- %if %0 == 1
- LOAD_ARG %1
+ %if %3 == 1
+ movd %1, %2 ; 4 source bytes
 %else
- LOAD_ARG %1, %2
+ movq %1, %2 ; 8 source bytes
 %endif
+ punpcklbw %1, m7 ; interleave with m7, which the caller must have zeroed
 %endif
 %endmacro
-%if ARCH_X86_32
- %define PIC_base_offset $$
- %define PIC_sym(sym) (PIC_reg+(sym)-PIC_base_offset)
-%else
- %define PIC_sym(sym) sym
-%endif
-
-%macro SAVE_PIC_REG 1
- %if ARCH_X86_32
- mov [esp+%1], PIC_reg
- %endif
-%endmacro
-
-%macro LOAD_PIC_REG 1
- %if ARCH_X86_32
- mov PIC_reg, [esp+%1]
- %endif
-%endmacro
-
-%macro PMOVZXBW 2-3 0 ; %3 = half
- %if %3 == 1
- movd %1, %2
- %else
- movq %1, %2
- %endif
- punpcklbw %1, m15
-%endmacro
-
%macro PSHUFB_0 2
%if cpuflag(ssse3)
pshufb %1, %2
@@ -138,34 +101,33 @@
%endif
%endmacro
-%macro LOAD_SEC_TAP 0
- %if ARCH_X86_64
- movd m3, [secq+kq]
- PSHUFB_0 m3, m15
- %else
- movd m2, [secq+kq] ; sec_taps
- pxor m3, m3
- PSHUFB_0 m2, m3
- %endif
+%macro MOVDDUP 2 ; dst, src: load 64 bits and replicate them into both qwords of dst
+%if cpuflag(ssse3)
+ movddup %1, %2 ; single-instruction form
+%else
+ movq %1, %2 ; load the low qword
+ punpcklqdq %1, %1 ; copy the low qword into the high qword
+%endif
%endmacro
-%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, stride
+%macro ACCUMULATE_TAP 7 ; tap_offset, shift, shift_mask, strength, mul_tap, w, minmax
 ; load p0/p1
- movsx offq, byte [dirq+kq+%1] ; off1
+ movsx offq, byte [dirq+kq+%1+14*8] ; off1 (tap index bytes sit 14*8 bytes past the table base held in dirq)
 %if %6 == 4
- movq m5, [stkq+offq*2+%7*0] ; p0
- movhps m5, [stkq+offq*2+%7*1]
+ movq m5, [stkq+offq*2+32*0] ; p0
+ movhps m5, [stkq+offq*2+32*1] ; next row, 32 bytes further (w == 4 handles two rows at once)
 %else
- movu m5, [stkq+offq*2+%7*0] ; p0
+ movu m5, [stkq+offq*2+32*0] ; p0
 %endif
 neg offq ; -off1
 %if %6 == 4
- movq m6, [stkq+offq*2+%7*0] ; p1
- movhps m6, [stkq+offq*2+%7*1]
+ movq m6, [stkq+offq*2+32*0] ; p1
+ movhps m6, [stkq+offq*2+32*1] ; next row of p1
 %else
- movu m6, [stkq+offq*2+%7*0] ; p1
+ movu m6, [stkq+offq*2+32*0] ; p1
 %endif
- %if cpuflag(sse4)
+ %if %7
+ %if cpuflag(sse4)
 ; out of bounds values are set to a value that is a both a large unsigned
 ; value and a negative signed value.
 ; use signed max and unsigned min to remove them
@@ -173,40 +135,26 @@
 pminuw m8, m5
 pmaxsw m7, m6
 pminuw m8, m6
- %else
- %if ARCH_X86_64
- pcmpeqw m9, m14, m5
- pcmpeqw m10, m14, m6
- pandn m9, m5
- pandn m10, m6
- pmaxsw m7, m9 ; max after p0
- pminsw m8, m5 ; min after p0
- pmaxsw m7, m10 ; max after p1
- pminsw m8, m6 ; min after p1
 %else
- pcmpeqw m9, m5, OUT_OF_BOUNDS_MEM
- pandn m9, m5
- pmaxsw m7, m9 ; max after p0
- pminsw m8, m5 ; min after p0
- pcmpeqw m9, m6, OUT_OF_BOUNDS_MEM
- pandn m9, m6
- pmaxsw m7, m9 ; max after p1
- pminsw m8, m6 ; min after p1
+ pcmpeqw m3, m14, m5 ; m3 = lanes of p0 equal to the out-of-bounds marker in m14
+ pminsw m8, m5 ; min after p0
+ pandn m3, m5 ; zero the out-of-bounds lanes so they cannot raise the max
+ pmaxsw m7, m3 ; max after p0
+ pcmpeqw m3, m14, m6 ; same masking for p1
+ pminsw m8, m6 ; min after p1
+ pandn m3, m6 ; zero the out-of-bounds lanes of p1
+ pmaxsw m7, m3 ; max after p1
 %endif
 %endif
 ; accumulate sum[m13] over p0/p1
- psubw m5, m4 ; diff_p0(p0 - px)
- psubw m6, m4 ; diff_p1(p1 - px)
- packsswb m5, m6 ; convert pixel diff to 8-bit
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+ packsswb m5, m6 ; convert pixel diff to 8-bit
 %if cpuflag(ssse3)
- %if ARCH_X86_64 && cpuflag(sse4)
- pshufb m5, m14 ; group diffs p0 and p1 into pairs
- %else
- pshufb m5, [PIC_sym(shufb_lohi)]
- %endif
+ pshufb m5, m13 ; group diffs p0 and p1 into pairs
 pabsb m6, m5
- psignb m9, %5, m5
+ psignb m3, %5, m5 ; m3 = taps carrying the sign of each diff
 %else
 movlhps m6, m5
 punpckhbw m6, m5
@@ -214,111 +162,113 @@
 pcmpgtb m5, m6
 paddb m6, m5
 pxor m6, m5
- paddb m9, %5, m5
- pxor m9, m5
+ paddb m3, %5, m5 ; (tap + mask) ^ mask negates the tap where the pcmpgtb mask is -1
+ pxor m3, m5 ; second half of the conditional negate
 %endif
- %if ARCH_X86_64
- psrlw m10, m6, %2 ; emulate 8-bit shift
- pand m10, %3
- psubusb m5, %4, m10
- %else
- psrlw m5, m6, %2 ; emulate 8-bit shift
- pand m5, %3
- paddusb m5, %4
- pxor m5, [PIC_sym(pb_0xFF)]
- %endif
- pminub m5, m6 ; constrain(diff_p)
+ pand m9, %3, m6 ; emulate 8-bit shift
+ psrlw m9, %2 ; word shift; the mask already cleared the bits that would leak into the neighbouring byte
+ psubusb m5, %4, m9 ; strength minus the shifted magnitude, saturating at 0
+ pminub m5, m6 ; constrain(diff_p)
 %if cpuflag(ssse3)
- pmaddubsw m5, m9 ; constrain(diff_p) * taps
+ pmaddubsw m5, m3 ; constrain(diff_p) * taps
 %else
- psrlw m2, m5, 8
- psraw m6, m9, 8
+ psrlw m9, m5, 8 ; odd bytes of constrain() as unsigned words
+ psraw m6, m3, 8 ; odd bytes of the signed taps as words
 psllw m5, 8
- psllw m9, 8
- pmullw m2, m6
- pmulhw m5, m9
- paddw m5, m2
+ psllw m3, 8 ; even tap bytes moved to the high byte of each word
+ pmullw m9, m6 ; products of the odd bytes
+ pmulhw m5, m3 ; products of the even bytes: ((a << 8) * (b << 8)) >> 16
+ paddw m5, m9 ; pairwise sum, matching what pmaddubsw produces
 %endif
- paddw m13, m5
+ paddw m0, m5 ; add to the running sum, which is kept in m0
 %endmacro
-%macro LOAD_BODY 4 ; dst, src, block_width, tmp_stride
+%macro LOAD_BODY 3 ; dst, src, block_width
 %if %3 == 4
 PMOVZXBW m0, [%2+strideq*0]
 PMOVZXBW m1, [%2+strideq*1]
 PMOVZXBW m2, [%2+strideq*2]
 PMOVZXBW m3, [%2+stride3q]
+ mova [%1+32*0], m0 ; 8 words per row; destination rows are 32 bytes apart
+ mova [%1+32*1], m1
+ mova [%1+32*2], m2
+ mova [%1+32*3], m3
 %else
 movu m0, [%2+strideq*0]
 movu m1, [%2+strideq*1]
 movu m2, [%2+strideq*2]
 movu m3, [%2+stride3q]
- punpckhbw m4, m0, m15
- punpcklbw m0, m15
- punpckhbw m5, m1, m15
- punpcklbw m1, m15
- punpckhbw m6, m2, m15
- punpcklbw m2, m15
- punpckhbw m7, m3, m15
- punpcklbw m3, m15
+ punpcklbw m4, m0, m7 ; low 8 bytes -> words (m7 is expected to be zero)
+ punpckhbw m0, m7 ; high 8 bytes -> words
+ mova [%1+32*0+ 0], m4 ; row 0, words 0-7
+ mova [%1+32*0+16], m0 ; row 0, words 8-15
+ punpcklbw m4, m1, m7
+ punpckhbw m1, m7
+ mova [%1+32*1+ 0], m4 ; row 1
+ mova [%1+32*1+16], m1
+ punpcklbw m4, m2, m7
+ punpckhbw m2, m7
+ mova [%1+32*2+ 0], m4 ; row 2
+ mova [%1+32*2+16], m2
+ punpcklbw m4, m3, m7
+ punpckhbw m3, m7
+ mova [%1+32*3+ 0], m4 ; row 3
+ mova [%1+32*3+16], m3
 %endif
- mova [%1+0*%4], m0
- mova [%1+1*%4], m1
- mova [%1+2*%4], m2
- mova [%1+3*%4], m3
- %if %3 == 8
- mova [%1+0*%4+2*8], m4
- mova [%1+1*%4+2*8], m5
- mova [%1+2*%4+2*8], m6
- mova [%1+3*%4+2*8], m7
- %endif
 %endmacro
-%macro CDEF_FILTER 3 ; w, h, stride
-
- %if cpuflag(sse4)
- %define OUT_OF_BOUNDS 0x80008000
+%macro CDEF_FILTER_END 2 ; w, minmax: round the sum in m0, add it to px in m4, optionally clamp, store, advance
+ pxor m6, m6
+ pcmpgtw m6, m0 ; m6 = -1 in lanes where sum < 0
+ paddw m0, m6 ; sum -= (sum < 0)
+ %if cpuflag(ssse3)
+ pmulhrsw m0, m15 ; m15 holds pw_2048 in CDEF_FILTER, so this is (sum + 8) >> 4
 %else
- %define OUT_OF_BOUNDS 0x7FFF7FFF
+ paddw m0, m15 ; m15 holds pw_8 in CDEF_FILTER: rounding bias
+ psraw m0, 4 ; (sum + 8) >> 4
 %endif
+ paddw m4, m0 ; px + rounded sum
+ %if %2
+ pminsw m4, m7 ; not above the tracked max (m7)
+ pmaxsw m4, m8 ; not below the tracked min (m8)
+ %endif
+ packuswb m4, m4 ; words -> bytes with unsigned saturation
+ %if %1 == 4
+ movd [dstq+strideq*0], m4 ; first 4-pixel row
+ psrlq m4, 32
+ movd [dstq+strideq*1], m4 ; second 4-pixel row
+ add stkq, 32*2 ; two temp-buffer rows consumed
+ lea dstq, [dstq+strideq*2]
+ %else
+ movq [dstq], m4 ; one 8-pixel row
+ add stkq, 32 ; one temp-buffer row consumed
+ add dstq, strideq
+ %endif
+%endmacro
+%macro CDEF_FILTER 2 ; w, h: emits cdef_filter_<w>x<h>; temp-buffer rows are always 32 bytes apart
 %if ARCH_X86_64
-cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*%3, \
- dst, stride, left, top, pri, sec, stride3, dst4, edge
- pcmpeqw m14, m14
- %if cpuflag(sse4)
- psllw m14, 15 ; 0x8000
- %else
- psrlw m14, 1 ; 0x7FFF
- %endif
- pxor m15, m15
-
- %define px rsp+3*16+2*%3
+cglobal cdef_filter_%1x%2, 4, 9, 16, 3 * 16 + (%2+4)*32, \
+ dst, stride, left, top, pri, sec, edge, stride3, dst4
+ %define px rsp+3*16+2*32
+ %define base 0
 %else
-cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*%3, \
- dst, stride, left, top, stride3, dst4, edge
- SAVE_ARG left, 2
- SAVE_ARG top, 3
- SAVE_ARG pri, 4
- SAVE_ARG sec, 5
- SAVE_ARG dir, 6
- SAVE_ARG damping, 7
-
- %define PIC_reg r2
- LEA PIC_reg, PIC_base_offset
-
- %if cpuflag(sse4)
- %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x8000)]
- %else
- %define OUT_OF_BOUNDS_MEM [PIC_sym(pw_0x7FFF)]
- %endif
-
- %define m15 [PIC_sym(pb_0)]
-
- %define px esp+7*16+2*%3
+cglobal cdef_filter_%1x%2, 2, 7, 8, - 7 * 16 - (%2+4)*32, \
+ dst, stride, left, edge, stride3
+ %define topq r2
+ %define dst4q r2
+ LEA r5, tap_table ; x86-32: r5 anchors every constant access through "base"
+ %define px esp+7*16+2*32
+ %define base r5-tap_table
 %endif
-
 mov edged, r8m
+ %if cpuflag(sse4)
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x8000]
+ %else
+ %define OUT_OF_BOUNDS_MEM [base+pw_0x7FFF]
+ %endif
+ mova m6, OUT_OF_BOUNDS_MEM ; m6 = out-of-bounds marker, used by all edge padding below
+ pxor m7, m7 ; m7 = 0, the unpack partner for byte -> word extension
 ; prepare pixel buffers - body/right
 %if %2 == 8
@@ -325,11 +275,11 @@
 lea dst4q, [dstq+strideq*4]
 %endif
 lea stride3q, [strideq*3]
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
 jz .no_right
- LOAD_BODY px, dstq, %1, %3
+ LOAD_BODY px, dstq, %1 ; rows 0-3
 %if %2 == 8
- LOAD_BODY px+4*%3, dst4q, %1, %3
+ LOAD_BODY px+4*32, dst4q, %1 ; rows 4-7
 %endif
 jmp .body_done
 .no_right:
@@ -337,39 +287,37 @@
 PMOVZXBW m1, [dstq+strideq*1], %1 == 4
 PMOVZXBW m2, [dstq+strideq*2], %1 == 4
 PMOVZXBW m3, [dstq+stride3q ], %1 == 4
+ mova [px+32*0], m0 ; rows 0-3 of the block
+ mova [px+32*1], m1
+ mova [px+32*2], m2
+ mova [px+32*3], m3
+ movd [px+32*0+%1*2], m6 ; the two words right of each row get the out-of-bounds marker
+ movd [px+32*1+%1*2], m6
+ movd [px+32*2+%1*2], m6
+ movd [px+32*3+%1*2], m6
 %if %2 == 8
- PMOVZXBW m4, [dst4q+strideq*0], %1 == 4
- PMOVZXBW m5, [dst4q+strideq*1], %1 == 4
- PMOVZXBW m6, [dst4q+strideq*2], %1 == 4
- PMOVZXBW m7, [dst4q+stride3q ], %1 == 4
+ PMOVZXBW m0, [dst4q+strideq*0], %1 == 4 ; rows 4-7 reuse m0-m3, leaving m6/m7 intact
+ PMOVZXBW m1, [dst4q+strideq*1], %1 == 4
+ PMOVZXBW m2, [dst4q+strideq*2], %1 == 4
+ PMOVZXBW m3, [dst4q+stride3q ], %1 == 4
+ mova [px+32*4], m0
+ mova [px+32*5], m1
+ mova [px+32*6], m2
+ mova [px+32*7], m3
+ movd [px+32*4+%1*2], m6 ; right padding for rows 4-7
+ movd [px+32*5+%1*2], m6
+ movd [px+32*6+%1*2], m6
+ movd [px+32*7+%1*2], m6
 %endif
- mova [px+0*%3], m0
- mova [px+1*%3], m1
- mova [px+2*%3], m2
- mova [px+3*%3], m3
- %if %2 == 8
- mova [px+4*%3], m4
- mova [px+5*%3], m5
- mova [px+6*%3], m6
- mova [px+7*%3], m7
- mov dword [px+4*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+5*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+6*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+7*%3+%1*2], OUT_OF_BOUNDS
- %endif
- mov dword [px+0*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+1*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+2*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+3*%3+%1*2], OUT_OF_BOUNDS
 .body_done:
 ; top
- LOAD_ARG32 top
- test edged, 4 ; have_top
+ movifnidn topq, r3mp ; fetch the top pointer (argument 3) unless topq already is that register
+ test edgeb, 4 ; have_top
 jz .no_top
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
 jz .top_no_left
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
 jz .top_no_right
 %if %1 == 4
 PMOVZXBW m0, [topq+strideq*0-2]
@@ -377,39 +325,39 @@
 %else
 movu m0, [topq+strideq*0-4]
 movu m1, [topq+strideq*1-4]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- movu [px-2*%3+8], m2
- movu [px-1*%3+8], m3
+ punpckhbw m2, m0, m7 ; high 8 bytes -> words (m7 = 0)
+ punpcklbw m0, m7 ; low 8 bytes -> words
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px-32*2+8], m2 ; upper halves of the two rows above the block
+ movu [px-32*1+8], m3
 %endif
- movu [px-2*%3-%1], m0
- movu [px-1*%3-%1], m1
+ movu [px-32*2-%1], m0 ; rows -2 and -1, starting %1 bytes before the row origin
+ movu [px-32*1-%1], m1
 jmp .top_done
 .top_no_right:
 %if %1 == 4
 PMOVZXBW m0, [topq+strideq*0-%1]
 PMOVZXBW m1, [topq+strideq*1-%1]
- movu [px-2*%3-4*2], m0
- movu [px-1*%3-4*2], m1
+ movu [px-32*2-8], m0 ; 8 words ending at column 3
+ movu [px-32*1-8], m1
 %else
 movu m0, [topq+strideq*0-%1]
 movu m1, [topq+strideq*1-%2]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- mova [px-2*%3-8*2], m0
- mova [px-2*%3-0*2], m2
- mova [px-1*%3-8*2], m1
- mova [px-1*%3-0*2], m3
+ punpckhbw m2, m0, m7 ; NOTE(review): the row-1 load above uses -%2 where row 0 uses -%1; equal only because the w == 8 instantiation also has h == 8
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px-32*2-16], m0 ; 8 words left of the row origin
+ mova [px-32*2+ 0], m2 ; 8 words of the block columns
+ mova [px-32*1-16], m1
+ mova [px-32*1+ 0], m3
 %endif
- mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
+ movd [px-32*2+%1*2], m6 ; out-of-bounds marker right of both top rows
+ movd [px-32*1+%1*2], m6
 jmp .top_done
 .top_no_left:
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
 jz .top_no_left_right
 %if %1 == 4
 PMOVZXBW m0, [topq+strideq*0]
@@ -417,102 +365,92 @@
 %else
 movu m0, [topq+strideq*0]
 movu m1, [topq+strideq*1]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- movd [px-2*%3+8*2], m2
- movd [px-1*%3+8*2], m3
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movd [px-32*2+16], m2 ; only the two words right of the block are stored
+ movd [px-32*1+16], m3
 %endif
- mova [px-2*%3], m0
- mova [px-1*%3], m1
- mov dword [px-2*%3-4], OUT_OF_BOUNDS
- mov dword [px-1*%3-4], OUT_OF_BOUNDS
+ movd [px-32*2- 4], m6 ; out-of-bounds marker left of both top rows
+ movd [px-32*1- 4], m6
+ mova [px-32*2+ 0], m0
+ mova [px-32*1+ 0], m1
 jmp .top_done
 .top_no_left_right:
 PMOVZXBW m0, [topq+strideq*0], %1 == 4
 PMOVZXBW m1, [topq+strideq*1], %1 == 4
- mova [px-2*%3], m0
- mova [px-1*%3], m1
- mov dword [px-2*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px-1*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px-2*%3-4], OUT_OF_BOUNDS
- mov dword [px-1*%3-4], OUT_OF_BOUNDS
+ movd [px-32*2-4], m6 ; left padding
+ movd [px-32*1-4], m6
+ mova [px-32*2+0], m0
+ mova [px-32*1+0], m1
+ movd [px-32*2+%1*2], m6 ; right padding, stored after the row so it is not overwritten
+ movd [px-32*1+%1*2], m6
 jmp .top_done
 .no_top:
- %if ARCH_X86_64
- SWAP m0, m14
- %else
- mova m0, OUT_OF_BOUNDS_MEM
- %endif
- movu [px-2*%3-4], m0
- movu [px-1*%3-4], m0
+ movu [px-32*2- 4], m6 ; no top: fill both rows above with the out-of-bounds marker, from 2 words left of the origin
+ movu [px-32*1- 4], m6
 %if %1 == 8
- movq [px-2*%3+12], m0
- movq [px-1*%3+12], m0
+ movq [px-32*2+12], m6 ; the remaining 4 words of the wider rows
+ movq [px-32*1+12], m6
 %endif
- %if ARCH_X86_64
- SWAP m0, m14
- %endif
 .top_done:
 ; left
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
 jz .no_left
- SAVE_PIC_REG 0
- LOAD_ARG32 left
+ movifnidn leftq, leftmp ; fetch the left pointer unless it already lives in leftq
 %if %2 == 4
 movq m0, [leftq]
 %else
 movu m0, [leftq]
 %endif
- LOAD_PIC_REG 0
 %if %2 == 4
- punpcklbw m0, m15
+ punpcklbw m0, m7 ; bytes -> words (m7 = 0)
 %else
- punpckhbw m1, m0, m15
- punpcklbw m0, m15
+ punpckhbw m1, m0, m7 ; left pixels for rows 4-7
+ punpcklbw m0, m7 ; left pixels for rows 0-3
 movhlps m3, m1
- movd [px+4*%3-4], m1
- movd [px+6*%3-4], m3
+ movd [px+32*4-4], m1 ; two words stored just before the start of row 4
+ movd [px+32*6-4], m3
 psrlq m1, 32
 psrlq m3, 32
- movd [px+5*%3-4], m1
- movd [px+7*%3-4], m3
+ movd [px+32*5-4], m1
+ movd [px+32*7-4], m3
 %endif
 movhlps m2, m0
- movd [px+0*%3-4], m0
- movd [px+2*%3-4], m2
+ movd [px+32*0-4], m0 ; two words stored just before the start of row 0
+ movd [px+32*2-4], m2
 psrlq m0, 32
 psrlq m2, 32
- movd [px+1*%3-4], m0
- movd [px+3*%3-4], m2
+ movd [px+32*1-4], m0
+ movd [px+32*3-4], m2
 jmp .left_done
 .no_left:
- mov dword [px+0*%3-4], OUT_OF_BOUNDS
- mov dword [px+1*%3-4], OUT_OF_BOUNDS
- mov dword [px+2*%3-4], OUT_OF_BOUNDS
- mov dword [px+3*%3-4], OUT_OF_BOUNDS
+ movd [px+32*0-4], m6 ; no left: the two words before each row get the out-of-bounds marker
+ movd [px+32*1-4], m6
+ movd [px+32*2-4], m6
+ movd [px+32*3-4], m6
 %if %2 == 8
- mov dword [px+4*%3-4], OUT_OF_BOUNDS
- mov dword [px+5*%3-4], OUT_OF_BOUNDS
- mov dword [px+6*%3-4], OUT_OF_BOUNDS
- mov dword [px+7*%3-4], OUT_OF_BOUNDS
+ movd [px+32*4-4], m6
+ movd [px+32*5-4], m6
+ movd [px+32*6-4], m6
+ movd [px+32*7-4], m6
 %endif
 .left_done:
 ; bottom
 %if ARCH_X86_64
- DEFINE_ARGS dst, stride, dummy1, dst8, pri, sec, stride3, dummy2, edge
+ DEFINE_ARGS dst, stride, dst8, dummy, pri, sec, edge, stride3 ; dst8 takes the register previously named left
 %else
- DEFINE_ARGS dst, stride, dummy1, dst8, stride3, dummy2, edge
+ DEFINE_ARGS dst, stride, dst8, edge, stride3 ; dst8 takes the register previously named left
 %endif
- test edged, 8 ; have_bottom
+ test edgeb, 8 ; have_bottom
 jz .no_bottom
 lea dst8q, [dstq+%2*strideq]
- test edged, 1 ; have_left
+ test edgeb, 1 ; have_left
 jz .bottom_no_left
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
 jz .bottom_no_right
 %if %1 == 4
 PMOVZXBW m0, [dst8q-(%1/2)]
@@ -520,40 +458,40 @@
 %else
 movu m0, [dst8q-4]
 movu m1, [dst8q+strideq-4]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- movu [px+(%2+0)*%3+8], m2
- movu [px+(%2+1)*%3+8], m3
+ punpckhbw m2, m0, m7 ; high 8 bytes -> words (m7 = 0)
+ punpcklbw m0, m7 ; low 8 bytes -> words
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ movu [px+32*(%2+0)+8], m2 ; upper halves of the two rows below the block
+ movu [px+32*(%2+1)+8], m3
 %endif
- movu [px+(%2+0)*%3-%1], m0
- movu [px+(%2+1)*%3-%1], m1
+ movu [px+32*(%2+0)-%1], m0 ; rows h and h+1, starting %1 bytes before the row origin
+ movu [px+32*(%2+1)-%1], m1
 jmp .bottom_done
 .bottom_no_right:
 %if %1 == 4
 PMOVZXBW m0, [dst8q-4]
 PMOVZXBW m1, [dst8q+strideq-4]
- movu [px+(%2+0)*%3-4*2], m0
- movu [px+(%2+1)*%3-4*2], m1
+ movu [px+32*(%2+0)-8], m0 ; 8 words ending at column 3
+ movu [px+32*(%2+1)-8], m1
 %else
 movu m0, [dst8q-8]
 movu m1, [dst8q+strideq-8]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- mova [px+(%2+0)*%3-8*2], m0
- mova [px+(%2+0)*%3-0*2], m2
- mova [px+(%2+1)*%3-8*2], m1
- mova [px+(%2+1)*%3-0*2], m3
- mov dword [px+(%2-1)*%3+8*2], OUT_OF_BOUNDS ; overwritten by first mova
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)-16], m0 ; also covers the last 16 bytes of the previous 32-byte row
+ mova [px+32*(%2+0)+ 0], m2
+ mova [px+32*(%2+1)-16], m1
+ mova [px+32*(%2+1)+ 0], m3
+ movd [px+32*(%2-1)+16], m6 ; restore the last body row's right padding: overwritten by first mova
 %endif
- mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
+ movd [px+32*(%2+0)+%1*2], m6 ; out-of-bounds marker right of both bottom rows
+ movd [px+32*(%2+1)+%1*2], m6
 jmp .bottom_done
 .bottom_no_left:
- test edged, 2 ; have_right
+ test edgeb, 2 ; have_right
 jz .bottom_no_left_right
 %if %1 == 4
 PMOVZXBW m0, [dst8q]
@@ -561,233 +499,245 @@
 %else
 movu m0, [dst8q]
 movu m1, [dst8q+strideq]
- punpckhbw m2, m0, m15
- punpcklbw m0, m15
- punpckhbw m3, m1, m15
- punpcklbw m1, m15
- mova [px+(%2+0)*%3+8*2], m2
- mova [px+(%2+1)*%3+8*2], m3
+ punpckhbw m2, m0, m7
+ punpcklbw m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m1, m7
+ mova [px+32*(%2+0)+16], m2 ; words 8-15 of the bottom rows
+ mova [px+32*(%2+1)+16], m3
 %endif
- mova [px+(%2+0)*%3], m0
- mova [px+(%2+1)*%3], m1
- mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)- 4], m6 ; out-of-bounds marker left of both bottom rows
+ movd [px+32*(%2+1)- 4], m6
 jmp .bottom_done
 .bottom_no_left_right:
 PMOVZXBW m0, [dst8q+strideq*0], %1 == 4
 PMOVZXBW m1, [dst8q+strideq*1], %1 == 4
- mova [px+(%2+0)*%3], m0
- mova [px+(%2+1)*%3], m1
- mov dword [px+(%2+0)*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3+%1*2], OUT_OF_BOUNDS
- mov dword [px+(%2+0)*%3-4], OUT_OF_BOUNDS
- mov dword [px+(%2+1)*%3-4], OUT_OF_BOUNDS
+ mova [px+32*(%2+0)+ 0], m0
+ mova [px+32*(%2+1)+ 0], m1
+ movd [px+32*(%2+0)+%1*2], m6 ; right padding
+ movd [px+32*(%2+1)+%1*2], m6
+ movd [px+32*(%2+0)- 4], m6 ; left padding
+ movd [px+32*(%2+1)- 4], m6
 jmp .bottom_done
 .no_bottom:
- %if ARCH_X86_64
- SWAP m0, m14
- %else
- mova m0, OUT_OF_BOUNDS_MEM
- %endif
- movu [px+(%2+0)*%3-4], m0
- movu [px+(%2+1)*%3-4], m0
+ movu [px+32*(%2+0)- 4], m6 ; no bottom: fill both rows below with the out-of-bounds marker
+ movu [px+32*(%2+1)- 4], m6
 %if %1 == 8
- movq [px+(%2+0)*%3+12], m0
- movq [px+(%2+1)*%3+12], m0
+ movq [px+32*(%2+0)+12], m6 ; the remaining 4 words of the wider rows
+ movq [px+32*(%2+1)+12], m6
 %endif
- %if ARCH_X86_64
- SWAP m0, m14
- %endif
 .bottom_done:
 ; actual filter
- DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp
 %if ARCH_X86_64
- movifnidn prid, prim
- movifnidn secd, secm
- mov dampingd, r7m
+ DEFINE_ARGS dst, stride, pridmp, damping, pri, sec
+ mova m13, [shufb_lohi] ; shuffle ACCUMULATE_TAP uses to pair the p0/p1 diffs
+ %if cpuflag(ssse3)
+ mova m15, [pw_2048] ; pmulhrsw multiplier: x * 2048 rounds to (x + 8) >> 4
 %else
- LOAD_ARG pri
- LOAD_ARG sec
- LOAD_ARG damping, 1
+ mova m15, [pw_8] ; rounding bias for the paddw/psraw fallback
 %endif
-
- SAVE_PIC_REG 8
- mov pridmpd, prid
- mov secdmpd, secd
- or pridmpd, 1
- or secdmpd, 1
- bsr pridmpd, pridmpd
- bsr secdmpd, secdmpd
+ mova m14, m6 ; keep the out-of-bounds marker; m6 is scratch in the filter macros
+ %else
+ DEFINE_ARGS dst, pridmp, sec, damping, pri, tap
+ %xdefine m8 m1
+ %xdefine m9 m2
+ %xdefine m10 m0
+ %xdefine m13 [base+shufb_lohi]
+ %xdefine m14 OUT_OF_BOUNDS_MEM
+ %if cpuflag(ssse3)
+ %xdefine m15 [base+pw_2048]
+ %else
+ %xdefine m15 [base+pw_8]
+ %endif
+ %endif
+ movifnidn prid, r4m
+ movifnidn secd, r5m
+ mov dampingd, r7m
+ movif32 [esp+0x3C], r1d ; x86-32: spill stride (still in r1) before r1 is reused
+ test prid, prid
+ jz .sec_only ; pri strength is 0
+ movd m1, prim
+ bsr pridmpd, prid ; pri != 0 here, so bsr is well defined
+ test secd, secd
+ jz .pri_only ; sec strength is 0
+ movd m10, r5m
+ bsr secd, secd ; sec != 0 here; secd now holds its highest set bit index
+ and prid, 1
 sub pridmpd, dampingd
- sub secdmpd, dampingd
+ sub secd, dampingd
 xor dampingd, dampingd
+ add prid, prid ; (pri & 1) * 2: picks one of the two weight pairs at pri_taps below
 neg pridmpd
 cmovs pridmpd, dampingd
- neg secdmpd
- cmovs secdmpd, dampingd
+ neg secd ; damping - bsr(sec)
+ cmovs secd, dampingd ; negative shift -> 0 (dampingd was zeroed above)
+ PSHUFB_0 m1, m7 ; pri strength in m1 (m7 is still zero here)
+ PSHUFB_0 m10, m7 ; sec strength in m10
 %if ARCH_X86_64
- mov [rsp+ 0], pridmpq ; pri_shift
- mov [rsp+16], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, pridmp, tap, pri, sec
+ lea tapq, [tap_table]
+ MOVDDUP m11, [tapq+pridmpq*8] ; pri_shift_mask
+ MOVDDUP m12, [tapq+secq*8] ; sec_shift_mask
+ mov [rsp+0x00], pridmpq ; pri_shift
+ mov [rsp+0x10], secq ; sec_shift
+ DEFINE_ARGS dst, stride, dir, tap, pri, stk, k, off, h
 %else
+ MOVDDUP m2, [tapq+pridmpq*8] ; pri_shift_mask
+ MOVDDUP m3, [tapq+secq*8] ; sec_shift_mask
+ mov [esp+0x04], dampingd ; zero upper 32 bits of psrlw
+ mov [esp+0x34], dampingd ; source operand in ACCUMULATE_TAP
 mov [esp+0x00], pridmpd
- mov [esp+0x30], secdmpd
- mov dword [esp+0x04], 0 ; zero upper 32 bits of psrlw
- mov dword [esp+0x34], 0 ; source operand in ACCUMULATE_TAP
- %define PIC_reg r4
- LOAD_PIC_REG 8
+ mov [esp+0x30], secd ; sec_shift
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
+ %define offq dstq
+ %define kd strided
+ %define kq strideq
+ mova [esp+0x10], m2 ; pri_shift_mask
+ mova [esp+0x40], m3 ; sec_shift_mask
+ mova [esp+0x20], m1 ; pri strength (m1 doubles as m8 on x86-32)
+ mova [esp+0x50], m10 ; sec strength (m10 doubles as m0 on x86-32)
 %endif
-
- DEFINE_ARGS dst, stride, pridmp, table, pri, sec, secdmp
- lea tableq, [PIC_sym(tap_table)]
- %if ARCH_X86_64
- SWAP m2, m11
- SWAP m3, m12
+ mov dird, r6m
+ lea stkq, [px]
+ lea priq, [tapq+8*8+priq*8] ; pri_taps
+ mov hd, %1*%2/8 ; loop count: each pass filters 8 pixels
+ lea dirq, [tapq+dirq*2] ; per-direction base; ACCUMULATE_TAP adds the 14*8 table offset
+.v_loop:
+ movif32 [esp+0x38], dstd ; x86-32: dst shares its register with offq
+ mov kd, 1 ; k runs 1, 0
+ %if %1 == 4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
+ %else
+ mova m4, [stkq+32*0] ; px
 %endif
- movd m2, [tableq+pridmpq]
- movd m3, [tableq+secdmpq]
- PSHUFB_0 m2, m15 ; pri_shift_mask
- PSHUFB_0 m3, m15 ; sec_shift_mask
+ pxor m0, m0 ; sum
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+.k_loop:
+ MOVDDUP m2, [priq+kq*8] ; pri_taps[k]
 %if ARCH_X86_64
- SWAP m2, m11
- SWAP m3, m12
+ ACCUMULATE_TAP 0*2, [rsp+0x00], m11, m1, m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8] ; sec_taps[k]
+ ACCUMULATE_TAP 2*2, [rsp+0x10], m12, m10, m2, %1, 1
+ ACCUMULATE_TAP 6*2, [rsp+0x10], m12, m10, m2, %1, 1
 %else
- %define PIC_reg r6
- mov PIC_reg, r4
- DEFINE_ARGS dst, stride, dir, table, pri, sec, secdmp
- LOAD_ARG pri
- LOAD_ARG dir, 1
- mova [esp+0x10], m2
- mova [esp+0x40], m3
+ ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8] ; sec_taps[k]
+ ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
+ MOVDDUP m2, [tapq+12*8+kq*8] ; reload: m9 aliases m2 on x86-32, so the previous tap pass clobbered it
+ ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, 1
 %endif
+ dec kd
+ jge .k_loop
+ movif32 dstq, [esp+0x38] ; x86-32: restore dst
+ movif32 strideq, [esp+0x3C] ; x86-32: restore stride (its register served as k)
+ CDEF_FILTER_END %1, 1
+ dec hd
+ jg .v_loop
+ RET
- ; pri/sec_taps[k] [4 total]
- DEFINE_ARGS dst, stride, dummy, tap, pri, sec
- movd m0, prid
- movd m1, secd
- %if ARCH_X86_64
- PSHUFB_0 m0, m15
- PSHUFB_0 m1, m15
+.pri_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, pridmp, damping, pri, tap, zero
+ lea tapq, [tap_table]
 %else
- %if cpuflag(ssse3)
- pxor m2, m2
- %endif
- mova m3, [PIC_sym(pb_0xFF)]
- PSHUFB_0 m0, m2
- PSHUFB_0 m1, m2
- pxor m0, m3
- pxor m1, m3
- mova [esp+0x20], m0
- mova [esp+0x50], m1
+ DEFINE_ARGS dst, pridmp, zero, damping, pri, tap
 %endif
 and prid, 1
- lea priq, [tapq+8+priq*2] ; pri_taps
- lea secq, [tapq+12] ; sec_taps
-
- %if ARCH_X86_64 && cpuflag(sse4)
- mova m14, [shufb_lohi]
- %endif
-
- ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
- DEFINE_ARGS dst, stride, dir, tap, pri, sec
+ xor zerod, zerod
+ sub dampingd, pridmpd ; damping - bsr(pri)
+ cmovs dampingd, zerod ; negative shift -> 0
+ add prid, prid ; (pri & 1) * 2: picks the weight pair
+ PSHUFB_0 m1, m7 ; pri strength in m1 (m7 is still zero here)
+ MOVDDUP m7, [tapq+dampingq*8] ; shift mask; m7 can be reused because min/max tracking is off on this path
+ mov [rsp+0x00], dampingq ; shift count, read from memory by psrlw in ACCUMULATE_TAP
 %if ARCH_X86_64
- mov dird, r6m
- lea dirq, [tapq+14+dirq*2]
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, k, off, h
 %else
- lea dird, [tapd+14+dird*2]
- DEFINE_ARGS dst, stride, dir, stk, pri, sec
- %define hd dword [esp+8]
- %define offq dstq
- %define kq strideq
+ mov [rsp+0x04], zerod ; x86-32: clear the upper half of the 64-bit shift count
+ DEFINE_ARGS dst, stride, dir, stk, pri, tap, h
 %endif
- mov hd, %1*%2*2/mmsize
+ mov dird, r6m
 lea stkq, [px]
- movif32 [esp+0x3C], strided
-.v_loop:
+ lea priq, [tapq+8*8+priq*8] ; pri_taps
+ mov hd, %1*%2/8 ; loop count: each pass filters 8 pixels
+ lea dirq, [tapq+dirq*2] ; per-direction base; ACCUMULATE_TAP adds the 14*8 table offset
+.pri_v_loop:
 movif32 [esp+0x38], dstd
- mov kq, 1
+ mov kd, 1 ; k runs 1, 0
 %if %1 == 4
- movq m4, [stkq+%3*0]
- movhps m4, [stkq+%3*1]
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
 %else
- mova m4, [stkq+%3*0] ; px
+ mova m4, [stkq+32*0] ; px
 %endif
+ pxor m0, m0 ; sum
+.pri_k_loop:
+ MOVDDUP m2, [priq+kq*8] ; pri_taps[k]
+ ACCUMULATE_TAP 0*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .pri_k_loop
+ movif32 dstq, [esp+0x38] ; x86-32: restore dst
+ movif32 strideq, [esp+0x3C] ; x86-32: restore stride (its register served as k)
+ CDEF_FILTER_END %1, 0
+ dec hd
+ jg .pri_v_loop
+ RET
- %if ARCH_X86_32
- %xdefine m9 m3
- %xdefine m13 m7
- %xdefine m7 m0
- %xdefine m8 m1
- %endif
-
- pxor m13, m13 ; sum
- mova m7, m4 ; max
- mova m8, m4 ; min
-.k_loop:
- movd m2, [priq+kq] ; pri_taps
+.sec_only:
+%if ARCH_X86_64
+ DEFINE_ARGS dst, stride, dir, damping, tap, sec, zero
+%else
+ DEFINE_ARGS dst, stride, sec, damping, dir, tap, zero
+%endif
+ movd m1, r5m
+ bsr secd, secd ; NOTE(review): undefined if sec is 0; presumably callers never pass pri == sec == 0 - confirm
+ mov dird, r6m
+ xor zerod, zerod
+ sub dampingd, secd ; damping - bsr(sec)
+ cmovs dampingd, zerod ; negative shift -> 0
+ PSHUFB_0 m1, m7 ; sec strength in m1 (m7 is still zero here)
 %if ARCH_X86_64
- PSHUFB_0 m2, m15
- %if cpuflag(ssse3)
- LOAD_SEC_TAP ; sec_taps
- %endif
- ACCUMULATE_TAP 0*2, [rsp+ 0], m11, m0, m2, %1, %3
- %if notcpuflag(ssse3)
- LOAD_SEC_TAP ; sec_taps
- %endif
- ACCUMULATE_TAP 2*2, [rsp+16], m12, m1, m3, %1, %3
- ACCUMULATE_TAP 6*2, [rsp+16], m12, m1, m3, %1, %3
+ lea tapq, [tap_table]
 %else
- %if cpuflag(ssse3)
- pxor m3, m3
- %endif
- PSHUFB_0 m2, m3
- ACCUMULATE_TAP 0*2, [esp+0x00], [esp+0x10], [esp+0x20], m2, %1, %3
- LOAD_SEC_TAP ; sec_taps
- ACCUMULATE_TAP 2*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
- %if notcpuflag(ssse3)
- LOAD_SEC_TAP ; sec_taps
- %endif
- ACCUMULATE_TAP 6*2, [esp+0x30], [esp+0x40], [esp+0x50], m2, %1, %3
+ mov [rsp+0x04], zerod ; x86-32: clear the upper half of the 64-bit shift count
 %endif
-
- dec kq
- jge .k_loop
-
- pxor m6, m6
- pcmpgtw m6, m13
- paddw m13, m6
- %if cpuflag(ssse3)
- pmulhrsw m13, [PIC_sym(pw_2048)]
+ mov [rsp+0x00], dampingq ; shift count, read from memory by psrlw in ACCUMULATE_TAP
+ MOVDDUP m7, [tapq+dampingq*8] ; shift mask; m7 can be reused because min/max tracking is off on this path
+ lea dirq, [tapq+dirq*2] ; per-direction base; ACCUMULATE_TAP adds the 14*8 table offset
+ %if ARCH_X86_64
+ DEFINE_ARGS dst, stride, dir, stk, tap, off, k, h
 %else
- paddw m13, [PIC_sym(pw_8)]
- psraw m13, 4
+ DEFINE_ARGS dst, stride, off, stk, dir, tap, h
 %endif
- paddw m4, m13
- pminsw m4, m7
- pmaxsw m4, m8
- packuswb m4, m4
- movif32 dstd, [esp+0x38]
- movif32 strided, [esp+0x3C]
+ lea stkq, [px]
+ mov hd, %1*%2/8 ; loop count: each pass filters 8 pixels
+.sec_v_loop:
+ mov kd, 1 ; k runs 1, 0
 %if %1 == 4
- movd [dstq+strideq*0], m4
- psrlq m4, 32
- movd [dstq+strideq*1], m4
+ movq m4, [stkq+32*0]
+ movhps m4, [stkq+32*1]
 %else
- movq [dstq], m4
+ mova m4, [stkq+32*0] ; px
 %endif
-
- %if %1 == 4
- %define vloop_lines (mmsize/(%1*2))
- lea dstq, [dstq+strideq*vloop_lines]
- add stkq, %3*vloop_lines
- %else
- lea dstq, [dstq+strideq]
- add stkq, %3
+ pxor m0, m0 ; sum
+.sec_k_loop:
+ MOVDDUP m2, [tapq+12*8+kq*8] ; sec_taps[k]
+ ACCUMULATE_TAP 2*2, [rsp], m7, m1, m2, %1, 0
+ %if ARCH_X86_32
+ MOVDDUP m2, [tapq+12*8+kq*8] ; reload: m9 aliases m2 on x86-32, so the previous tap pass clobbered it
 %endif
+ ACCUMULATE_TAP 6*2, [rsp], m7, m1, m2, %1, 0
+ dec kd
+ jge .sec_k_loop
+ movif32 strideq, [esp+0x3C] ; x86-32: restore stride (its register served as k)
+ CDEF_FILTER_END %1, 0
 dec hd
- jg .v_loop
-
+ jg .sec_v_loop
 RET
 %endmacro
@@ -1079,18 +1029,16 @@
shr r1d, 10
mov [varq], r1d
%else
-cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
- %define PIC_reg r4
- LEA PIC_reg, PIC_base_offset
-
+cglobal cdef_dir, 2, 4, 8, 96, src, stride, var, stride3
+%define base r2-shufw_6543210x
+ LEA r2, shufw_6543210x
pxor m0, m0
- mova m1, [PIC_sym(pw_128)]
-
lea stride3q, [strideq*3]
movq m5, [srcq+strideq*0]
movhps m5, [srcq+strideq*1]
movq m7, [srcq+strideq*2]
movhps m7, [srcq+stride3q]
+ mova m1, [base+pw_128]
psadbw m2, m5, m0
psadbw m3, m7, m0
packssdw m2, m3
@@ -1143,7 +1091,7 @@
pmaddwd m0, m0
phaddd m2, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
+ MULLD m2, [base+div_table%+SUFFIX+48]
mova [esp+0x30], m2
mova m1, [esp+0x10]
@@ -1176,13 +1124,13 @@
paddw m0, m2 ; partial_sum_diag[0][0-7]
paddw m1, m3 ; partial_sum_diag[0][8-14,zero]
mova m3, [esp+0x50]
- pshufb m1, [PIC_sym(shufw_6543210x)]
+ pshufb m1, [base+shufw_6543210x]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
paddd m0, m2 ; cost[0a-d]
mova [esp+0x40], m0
@@ -1217,13 +1165,13 @@
paddw m0, m2 ; partial_sum_diag[1][0-7]
paddw m1, m3 ; partial_sum_diag[1][8-14,zero]
mova m3, [esp+0x50]
- pshufb m1, [PIC_sym(shufw_6543210x)]
+ pshufb m1, [base+shufw_6543210x]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+16]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+0]
+ MULLD m2, [base+div_table%+SUFFIX+16]
+ MULLD m0, [base+div_table%+SUFFIX+ 0]
paddd m0, m2 ; cost[4a-d]
phaddd m1, [esp+0x40], m0 ; cost[0a/b,4a/b]
phaddd m1, [esp+0x30] ; cost[0,4,2,6]
@@ -1259,8 +1207,8 @@
punpcklwd m0, m1
pmaddwd m2, m2
pmaddwd m0, m0
- MULLD m2, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m2, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
paddd m0, m2 ; cost[7a-d]
mova [esp+0x40], m0
@@ -1280,8 +1228,8 @@
punpcklwd m0, m2
pmaddwd m7, m7
pmaddwd m0, m0
- MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
paddd m0, m7 ; cost[5a-d]
mova [esp+0x50], m0
@@ -1303,8 +1251,8 @@
punpcklwd m0, m2
pmaddwd m7, m7
pmaddwd m0, m0
- MULLD m7, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m7, [base+div_table%+SUFFIX+48]
+ MULLD m0, [base+div_table%+SUFFIX+32]
paddd m0, m7 ; cost[1a-d]
SWAP m0, m4
@@ -1330,8 +1278,8 @@
punpcklwd m4, m2
pmaddwd m0, m0
pmaddwd m4, m4
- MULLD m0, [PIC_sym(div_table%+SUFFIX)+48]
- MULLD m4, [PIC_sym(div_table%+SUFFIX)+32]
+ MULLD m0, [base+div_table%+SUFFIX+48]
+ MULLD m4, [base+div_table%+SUFFIX+32]
paddd m4, m0 ; cost[3a-d]
mova m1, [esp+0x00]
@@ -1367,6 +1315,7 @@
%endif
; get direction and variance
+ mov vard, varm
punpckhdq m3, m2, m1
punpckldq m2, m1
psubd m1, m0, m3
@@ -1388,18 +1337,18 @@
%endmacro
 INIT_XMM sse4
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8 ; w, h (the former third argument, the buffer stride, is now a fixed 32)
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
 CDEF_DIR
 INIT_XMM ssse3
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8 ; w, h
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
 CDEF_DIR
 INIT_XMM sse2
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8 ; w, h
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4