ref: 007fd651e9d76b4f8080b49c74a01275d3dd358f
parent: 3dda2dd62e80516476bbd5575b972e002bba9066
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Feb 11 12:22:30 EST 2019
x86: Optimize MC w_mask
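
Replace the paddw/psrlw/psubusw sequence in the mask calculation with
psubusw/psrlw by folding the +8 rounding offset and the subtraction
from 26 (= 64 - 38) into a single saturating subtract against the
combined constant pw_6903 = ((64 - 38) << 8) + 255 - 8. This drops the
pw_8 and pw_26 constants, saves one instruction per mask computation,
and frees one SIMD register in both the AVX2 (15 -> 14) and x86-64
SSSE3 (11 -> 10) versions, with the remaining registers renumbered
accordingly. Also subtract the sign bias from pw_258 before
broadcasting instead of after, and merge the two height compares in
the .w4 path.

The blend weight is 64 - m with m = min(38 + ((abs(tmp1 - tmp2) + 8) >> 8), 64).
A scalar model of the old and new instruction sequences (a sketch with
illustrative names, not code from the dav1d sources; abs(tmp1 - tmp2)
fits in 15 bits, so the equivalence can be checked exhaustively):

    #include <assert.h>

    /* psubusw on a single lane: unsigned saturating subtract */
    static unsigned sat_sub(unsigned a, unsigned b)
    {
        return a > b ? a - b : 0;
    }

    /* old: paddw pw_8 / psrlw 8 / psubusw from pw_26 */
    static unsigned weight_old(unsigned d)
    {
        return sat_sub(26, (d + 8) >> 8); /* 64 - min(38 + ((d+8)>>8), 64) */
    }

    /* new: psubusw from pw_6903 / psrlw 8 */
    static unsigned weight_new(unsigned d)
    {
        return sat_sub(6903, d) >> 8;
    }

    int main(void)
    {
        for (unsigned d = 0; d < (1u << 15); d++)
            assert(weight_old(d) == weight_new(d));
        return 0;
    }

The saturating subtract clamps to zero exactly where the old sequence
clamped, so the min(m, 64) behavior is preserved.
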
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -62,13 +62,12 @@
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_64: times 4 db 64
-pw_8: times 2 dw 8
-pw_26: times 2 dw 26
pw_34: times 2 dw 34
pw_258: times 2 dw 258
pw_512: times 2 dw 512
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
pw_8192: times 2 dw 8192
pd_32: dd 32
pd_512: dd 512
@@ -3060,9 +3059,8 @@
mova m1, [tmp2q+(%1+0)*mmsize]
psubw m1, m0
pabsw m%2, m1
- paddw m%2, m6
- psrlw m%2, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
- psubusw m%2, m7, m%2 ; 64 - min(m, 64)
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
psllw m2, m%2, 10
pmulhw m1, m2
paddw m0, m1
@@ -3070,32 +3068,32 @@
mova m2, [tmp2q+(%1+1)*mmsize]
psubw m2, m1
pabsw m3, m2
- paddw m3, m6
+ psubusw m3, m6, m3
psrlw m3, 8
- psubusw m3, m7, m3
phaddw m%2, m3
psllw m3, 10
pmulhw m2, m3
paddw m1, m2
- pmulhrsw m0, m8
- pmulhrsw m1, m8
+ pmulhrsw m0, m7
+ pmulhrsw m1, m7
packuswb m0, m1
%endmacro
-cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
+cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
lea r7, [w_mask_420_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
mov maskq, maskmp
- vpbroadcastw m0, r7m ; sign
+ movd xm0, r7m ; sign
movsxd wq, dword [r7+wq*4]
- vpbroadcastd m6, [pw_8 +r7-w_mask_420_avx2_table]
- vpbroadcastd m7, [pw_26 +r7-w_mask_420_avx2_table] ; 64 - 38
- vpbroadcastd m8, [pw_2048 +r7-w_mask_420_avx2_table]
- vpbroadcastd m9, [pw_258 +r7-w_mask_420_avx2_table] ; 64 * 4 + 2
- pmovzxbd m10, [deint_shuf4+r7-w_mask_420_avx2_table]
- psubw m9, m0
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ movd xm8, [base+pw_258] ; 64 * 4 + 2
+ pmovzxbd m9, [base+deint_shuf4]
+ psubw xm8, xm0
add wq, r7
+ vpbroadcastw m8, xm8
W_MASK_420 0, 4
lea stride3q, [strideq*3]
jmp wq
@@ -3105,14 +3103,13 @@
pextrd [dstq+strideq*1], xm0, 1
movd [dstq+strideq*2], xm1
pextrd [dstq+stride3q ], xm1, 1
- cmp hd, 4
- je .w4_end
+ cmp hd, 8
+ jl .w4_end
lea dstq, [dstq+strideq*4]
pextrd [dstq ], xm0, 2
pextrd [dstq+strideq*1], xm0, 3
pextrd [dstq+strideq*2], xm1, 2
pextrd [dstq+stride3q ], xm1, 3
- cmp hd, 8
jg .w4_h16
.w4_end:
vextracti128 xm0, m4, 1
@@ -3119,7 +3116,7 @@
vpblendd xm1, xm4, xm0, 0x05
vpblendd xm4, xm4, xm0, 0x0a
pshufd xm1, xm1, q2301
- psubw xm4, xm9, xm4
+ psubw xm4, xm8, xm4
psubw xm4, xm1
psrlw xm4, 2
packuswb xm4, xm4
@@ -3130,9 +3127,9 @@
lea dstq, [dstq+strideq*4]
phaddd m4, m5
vextracti128 xm1, m0, 1
- psubw m4, m9, m4
+ psubw m4, m8, m4
psrlw m4, 2
- vpermd m4, m10, m4
+ vpermd m4, m9, m4
vextracti128 xm5, m4, 1
packuswb xm4, xm5
movd [dstq ], xm0
@@ -3155,7 +3152,7 @@
.w8:
vextracti128 xm2, m4, 1
vextracti128 xm1, m0, 1
- psubw xm4, xm9, xm4
+ psubw xm4, xm8, xm4
psubw xm4, xm2
psrlw xm4, 2
packuswb xm4, xm4
@@ -3180,12 +3177,12 @@
W_MASK_420 2, 5
punpckhqdq m1, m4, m5
punpcklqdq m4, m5
- psubw m1, m9, m1
+ psubw m1, m8, m1
psubw m1, m4
psrlw m1, 2
vpermq m0, m0, q3120
packuswb m1, m1
- vpermd m1, m10, m1
+ vpermd m1, m9, m1
mova [dstq+strideq*2], xm0
vextracti128 [dstq+stride3q ], m0, 1
mova [maskq], xm1
@@ -3202,12 +3199,12 @@
vpermq m0, m0, q3120
mova [dstq], m0
W_MASK_420 2, 5
- psubw m4, m9, m4
+ psubw m4, m8, m4
psubw m4, m5
psrlw m4, 2
vpermq m0, m0, q3120
packuswb m4, m4
- vpermd m4, m10, m4
+ vpermd m4, m9, m4
mova [dstq+strideq*1], m0
mova [maskq], xm4
sub hd, 2
@@ -3214,8 +3211,8 @@
jg .w32_loop
RET
.w64_loop_even:
- psubw m11, m9, m4
- psubw m12, m9, m5
+ psubw m10, m8, m4
+ psubw m11, m8, m5
dec hd
.w64_loop:
add tmp1q, 4*32
@@ -3230,12 +3227,12 @@
mova [dstq+32], m0
test hd, 1
jz .w64_loop_even
- psubw m4, m11, m4
- psubw m5, m12, m5
+ psubw m4, m10, m4
+ psubw m5, m11, m5
psrlw m4, 2
psrlw m5, 2
packuswb m4, m5
- vpermd m4, m10, m4
+ vpermd m4, m9, m4
mova [maskq], m4
add maskq, 32
dec hd
@@ -3242,8 +3239,8 @@
jg .w64_loop
RET
.w128_loop_even:
- psubw m13, m9, m4
- psubw m14, m9, m5
+ psubw m12, m8, m4
+ psubw m13, m8, m5
dec hd
.w128_loop:
W_MASK_420 0, 4
@@ -3258,17 +3255,17 @@
add tmp2q, 8*32
test hd, 1
jz .w128_even
- psubw m4, m11, m4
- psubw m5, m12, m5
+ psubw m4, m10, m4
+ psubw m5, m11, m5
psrlw m4, 2
psrlw m5, 2
packuswb m4, m5
- vpermd m4, m10, m4
+ vpermd m4, m9, m4
mova [maskq], m4
jmp .w128_odd
.w128_even:
- psubw m11, m9, m4
- psubw m12, m9, m5
+ psubw m10, m8, m4
+ psubw m11, m8, m5
.w128_odd:
W_MASK_420 -4, 4
vpermq m0, m0, q3120
@@ -3278,12 +3275,12 @@
mova [dstq+3*32], m0
test hd, 1
jz .w128_loop_even
- psubw m4, m13, m4
- psubw m5, m14, m5
+ psubw m4, m12, m4
+ psubw m5, m13, m5
psrlw m4, 2
psrlw m5, 2
packuswb m4, m5
- vpermd m4, m10, m4
+ vpermd m4, m9, m4
mova [maskq+32], m4
add maskq, 64
dec hd
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -51,13 +51,13 @@
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_64: times 16 db 64
-pw_8: times 8 dw 8
-pw_26: times 8 dw 26
-pw_258: times 8 dw 258
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
+pw_6903: times 8 dw 6903
+pw_258: times 2 dw 258
+
%macro BIDIR_JMP_TABLE 1-*
; evaluated at definition time (in loop below)
%xdefine %1_table (%%table - 2*%2)
@@ -918,41 +918,34 @@
BIDIR_FN MASK
%undef hd
-%if ARCH_X86_64
- %define reg_pw_8 m8
- %define reg_pw_27 m9
- %define reg_pw_2048 m10
-%else
- %define reg_pw_8 [base+pw_8]
- %define reg_pw_27 [base+pw_26] ; 64 - 38
- %define reg_pw_2048 [base+pw_2048]
-%endif
-
%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
mova m0, [tmp1q+(%1)]
mova m1, [tmp2q+(%1)]
- psubw m1, m0 ; tmp1 - tmp2
- pabsw m3, m1 ; abs(tmp1 - tmp2)
- paddw m3, reg_pw_8 ; abs(tmp1 - tmp2) + 8
- psrlw m3, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
- psubusw m%2, reg_pw_27, m3 ; 64 - min(m, 64)
- psllw m2, m%2, 10
+ mova m2, reg_pw_6903
+ psubw m1, m0
+ pabsw m%2, m1 ; abs(tmp1 - tmp2)
+ mova m3, m2
+ psubusw m2, m%2
+ psrlw m2, 8 ; 64 - m
+ mova m%2, m2
+ psllw m2, 10
pmulhw m1, m2 ; (tmp2 - tmp1) * ((64 - m) << 10) >> 16
paddw m0, m1 ; tmp1 + ((tmp2 - tmp1) * (64 - m) >> 6)
;**** do m1 = u16.dst[7..0], m%2 = u16.m[7..0] ****
mova m1, [tmp1q+(%1)+mmsize]
mova m2, [tmp2q+(%1)+mmsize]
- psubw m2, m1 ; tmp1 - tmp2
+ psubw m2, m1
pabsw m7, m2 ; abs(tmp1 - tmp2)
- paddw m7, reg_pw_8 ; abs(tmp1 - tmp2) + 8
- psrlw m7, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
- psubusw m3, reg_pw_27, m7 ; 64 - min(m, 64)
+ psubusw m3, m7
+ psrlw m3, 8 ; 64 - m
phaddw m%2, m3 ; pack both u16.m[7..0] runs as u8.m[15..0]
psllw m3, 10
pmulhw m2, m3
+%if ARCH_X86_32
+ mova reg_pw_2048, [base+pw_2048]
+%endif
paddw m1, m2
- ;********
pmulhrsw m0, reg_pw_2048 ; round/scale 2048
pmulhrsw m1, reg_pw_2048 ; round/scale 2048
packuswb m0, m1 ; concat m0 = u8.dst[15..0]
@@ -964,38 +957,41 @@
%define base r6-w_mask_420_ssse3_table
%if ARCH_X86_64
+%define reg_pw_6903 m8
+%define reg_pw_2048 m9
; args: dst, stride, tmp1, tmp2, w, h, mask, sign
-cglobal w_mask_420, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask
+cglobal w_mask_420, 4, 8, 10, dst, stride, tmp1, tmp2, w, h, mask
lea r6, [w_mask_420_ssse3_table]
mov wd, wm
tzcnt r7d, wd
+ movd m0, r7m ; sign
movifnidn hd, hm
- movd m0, r7m
- pshuflw m0, m0, q0000 ; sign
- punpcklqdq m0, m0
movsxd r7, [r6+r7*4]
- mova reg_pw_8, [base+pw_8]
- mova reg_pw_27, [base+pw_26] ; 64 - 38
+ mova reg_pw_6903, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
mova reg_pw_2048, [base+pw_2048]
- mova m6, [base+pw_258] ; 64 * 4 + 2
+ movd m6, [base+pw_258] ; 64 * 4 + 2
add r7, r6
mov maskq, maskmp
psubw m6, m0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
W_MASK_420 0, 4
jmp r7
%define loop_w r7d
%else
+%define reg_pw_6903 [base+pw_6903]
+%define reg_pw_2048 m3
cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
tzcnt wd, wm
LEA r6, w_mask_420_ssse3_table
- mov wd, [r6+wq*4]
+ movd m0, r7m ; sign
mov maskq, r6mp
- movd m0, r7m
- pshuflw m0, m0, q0000 ; sign
- punpcklqdq m0, m0
- mova m6, [base+pw_258] ; 64 * 4 + 2
+ mov wd, [r6+wq*4]
+ movd m6, [base+pw_258]
add wq, r6
psubw m6, m0
+ pshuflw m6, m6, q0000
+ punpcklqdq m6, m6
W_MASK_420 0, 4
jmp wd
%define loop_w dword r0m
@@ -1016,12 +1012,12 @@
movd [dstq+strideq*0], m0 ; copy m0[2]
psrlq m0, 32
movd [dstq+strideq*1], m0 ; copy m0[3]
- pshufd m5, m4, q3131; DBDB even lines repeated
- pshufd m4, m4, q2020; CACA odd lines repeated
- psubw m1, m6, m4 ; m9 == 64 * 4 + 2
- psubw m1, m5 ; C-D A-B C-D A-B
- psrlw m1, 2 ; >> 2
+ psubw m1, m6, m4 ; a _ c _
+ psrlq m4, 32 ; b _ d _
+ psubw m1, m4
+ psrlw m1, 2
packuswb m1, m1
+ pshuflw m1, m1, q2020
movd [maskq], m1
sub hd, 4
jg .w4_loop
@@ -1035,9 +1031,9 @@
.w8:
movq [dstq ], m0
movhps [dstq+strideq*1], m0
- pshufd m1, m4, q3232
psubw m0, m6, m4
- psubw m0, m1
+ punpckhqdq m4, m4
+ psubw m0, m4
psrlw m0, 2
packuswb m0, m0
movd [maskq], m0
@@ -1077,8 +1073,7 @@
jg .w16ge_loop
RET
-%undef reg_pw_8
-%undef reg_pw_27
+%undef reg_pw_6903
%undef reg_pw_2048
%undef dst_bak
%undef loop_w