ref: 37093f98aee62eb79d8bc0d31ef29c13d3901066
parent: 007fd651e9d76b4f8080b49c74a01275d3dd358f
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Feb 11 12:23:41 EST 2019
x86: Add w_mask_422 AVX2 asm
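
The 4:2:2 variant reuses the blend computation of the existing 4:2:0
code (hence the W_MASK_420 -> W_MASK rename) and differs only in how
the mask is stored: one byte per horizontal pair of pixels, with no
vertical subsampling. As a rough reference, the stored value should be
equivalent to the C sketch below (illustrative only, assuming 8bpc
intermediate precision; the helper name and loop structure are not
dav1d's):

    #include <stdint.h>
    #include <stdlib.h>

    static inline int imin(const int a, const int b) { return a < b ? a : b; }

    static void w_mask_422_ref(uint8_t *mask, const int16_t *tmp1,
                               const int16_t *tmp2, const int w, int h,
                               const int sign)
    {
        do {
            for (int x = 0; x < w; x += 2) {
                /* per-pixel weight in [38, 64], derived from |tmp1 - tmp2|;
                 * the +8 rounding term matches the pw_6903 constant below */
                const int m0 = imin(38 + ((abs(tmp1[x+0] - tmp2[x+0]) + 8) >> 8), 64);
                const int m1 = imin(38 + ((abs(tmp1[x+1] - tmp2[x+1]) + 8) >> 8), 64);
                /* one mask byte per pair: rounded, sign-adjusted average */
                mask[x >> 1] = (m0 + m1 + 1 - sign) >> 1;
            }
            tmp1 += w;
            tmp2 += w;
            mask += w >> 1;
        } while (--h);
    }

In the AVX2 code W_MASK leaves horizontally summed (64 - m) pairs in
the mask register, so the 4:2:2 store reduces to a packuswb, a byte
subtract from (128 - sign) and a pavgb against zero.
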
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -91,6 +91,7 @@
BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
@@ -3054,7 +3055,7 @@
add wq, r7
BIDIR_FN MASK
-%macro W_MASK_420 2 ; src_offset, mask_out
+%macro W_MASK 2 ; src_offset, mask_out
mova m0, [tmp1q+(%1+0)*mmsize]
mova m1, [tmp2q+(%1+0)*mmsize]
psubw m1, m0
@@ -3094,7 +3095,7 @@
psubw xm8, xm0
add wq, r7
vpbroadcastw m8, xm8
- W_MASK_420 0, 4
+ W_MASK 0, 4
lea stride3q, [strideq*3]
jmp wq
.w4:
@@ -3123,7 +3124,7 @@
movq [maskq], xm4
RET
.w4_h16:
- W_MASK_420 2, 5
+ W_MASK 2, 5
lea dstq, [dstq+strideq*4]
phaddd m4, m5
vextracti128 xm1, m0, 1
@@ -3146,7 +3147,7 @@
.w8_loop:
add tmp1q, 2*32
add tmp2q, 2*32
- W_MASK_420 0, 4
+ W_MASK 0, 4
lea dstq, [dstq+strideq*4]
add maskq, 8
.w8:
@@ -3167,7 +3168,7 @@
.w16_loop:
add tmp1q, 4*32
add tmp2q, 4*32
- W_MASK_420 0, 4
+ W_MASK 0, 4
lea dstq, [dstq+strideq*4]
add maskq, 16
.w16:
@@ -3174,7 +3175,7 @@
vpermq m0, m0, q3120
mova [dstq ], xm0
vextracti128 [dstq+strideq*1], m0, 1
- W_MASK_420 2, 5
+ W_MASK 2, 5
punpckhqdq m1, m4, m5
punpcklqdq m4, m5
psubw m1, m8, m1
@@ -3192,13 +3193,13 @@
.w32_loop:
add tmp1q, 4*32
add tmp2q, 4*32
- W_MASK_420 0, 4
+ W_MASK 0, 4
lea dstq, [dstq+strideq*2]
add maskq, 16
.w32:
vpermq m0, m0, q3120
mova [dstq], m0
- W_MASK_420 2, 5
+ W_MASK 2, 5
psubw m4, m8, m4
psubw m4, m5
psrlw m4, 2
@@ -3217,12 +3218,12 @@
.w64_loop:
add tmp1q, 4*32
add tmp2q, 4*32
- W_MASK_420 0, 4
+ W_MASK 0, 4
add dstq, strideq
.w64:
vpermq m0, m0, q3120
mova [dstq], m0
- W_MASK_420 2, 5
+ W_MASK 2, 5
vpermq m0, m0, q3120
mova [dstq+32], m0
test hd, 1
@@ -3243,12 +3244,12 @@
psubw m13, m8, m5
dec hd
.w128_loop:
- W_MASK_420 0, 4
+ W_MASK 0, 4
add dstq, strideq
.w128:
vpermq m0, m0, q3120
mova [dstq+0*32], m0
- W_MASK_420 2, 5
+ W_MASK 2, 5
vpermq m0, m0, q3120
mova [dstq+1*32], m0
add tmp1q, 8*32
@@ -3267,10 +3268,10 @@
psubw m10, m8, m4
psubw m11, m8, m5
.w128_odd:
- W_MASK_420 -4, 4
+ W_MASK -4, 4
vpermq m0, m0, q3120
mova [dstq+2*32], m0
- W_MASK_420 -2, 5
+ W_MASK -2, 5
vpermq m0, m0, q3120
mova [dstq+3*32], m0
test hd, 1
@@ -3283,6 +3284,181 @@
vpermd m4, m9, m4
mova [maskq+32], m4
add maskq, 64
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movd xm0, r7m ; sign
+ pxor m9, m9 ; zero, for rounded halving via pavgb
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m10, [base+deint_shuf4] ; vpermd order to undo lane interleaving
+ add wq, r7
+ psrlw xm8, xm7, 4 ; pw_128
+ psubb xm8, xm0 ; 128 - sign
+ vpbroadcastb m8, xm8
+ W_MASK 0, 4
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5 ; pack (64-m) pair sums to bytes
+ psubb xm5, xm8, xm4 ; m0 + m1 - sign
+ pavgb xm5, xm9 ; (m0 + m1 + 1 - sign) >> 1
+ pshufd xm5, xm5, q3120
+ mova [maskq], xm5
+ RET
+.w4_h16:
+ W_MASK 2, 5
+ lea dstq, [dstq+strideq*4]
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermd m5, m10, m5
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], m5
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vextracti128 xm5, m4, 1
+ vextracti128 xm1, m0, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], xm5
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 2, 5
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], m5
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 2, 5
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*1], m0
+ mova [maskq], m5
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4
+ add dstq, strideq
+ add maskq, 32
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 2, 5
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq], m5
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4
+ add dstq, strideq
+ add maskq, 32*2
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 2, 5
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq+32*0], m5
+ W_MASK 4, 4
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 6, 5
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*3], m0
+ mova [maskq+32*1], m5
dec hd
jg .w128_loop
RET
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -59,6 +59,7 @@
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_w_mask_fn(dav1d_w_mask_420_ssse3);
+decl_w_mask_fn(dav1d_w_mask_422_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_fn(dav1d_blend_ssse3);
decl_blend_dir_fn(dav1d_blend_v_avx2);
@@ -125,6 +126,7 @@
c->avg = dav1d_avg_avx2;
c->w_avg = dav1d_w_avg_avx2;
c->mask = dav1d_mask_avx2;
+ c->w_mask[1] = dav1d_w_mask_422_avx2;
c->w_mask[2] = dav1d_w_mask_420_avx2;
c->blend = dav1d_blend_avx2;
c->blend_v = dav1d_blend_v_avx2;