ref: 50e9a39a07c038d3c636176ac4fd85a76c0bdc6e
parent: d085424c9225906e375788ded32f77323ae31f03
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu Feb 20 10:25:43 EST 2020
x86: Add mc w_mask 4:2:0 AVX-512 (Ice Lake) asm
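
For reference, the operation being vectorized here (8 bpc) can be modeled in
scalar C roughly as below. This is an illustrative sketch derived from the
constants used in the patch (pw_6903, pw_2048, and the 258 - sign bias), not
dav1d's C reference implementation; the function and helper names are made up,
and packed tmp1/tmp2 buffers of w*h int16_t intermediates are assumed:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h> /* abs() */

    static inline uint8_t clip_pixel(const int v) {
        return v < 0 ? 0 : v > 255 ? 255 : v;
    }

    /* Blend two intermediate (prep) buffers and produce a 4:2:0 mask:
     * one mask byte per 2x2 block of luma pixels. */
    static void w_mask_420_model(uint8_t *dst, const ptrdiff_t dst_stride,
                                 const int16_t *tmp1, const int16_t *tmp2,
                                 const int w, const int h,
                                 uint8_t *mask, const int sign)
    {
        for (int y = 0; y < h; y += 2) {
            for (int x = 0; x < w; x += 2) {
                int msum = 2 - sign; /* rounding bias, cf. wm_420_sign/wm_sign_avx512 */
                for (int yy = 0; yy < 2; yy++) {
                    for (int xx = 0; xx < 2; xx++) {
                        const int i = (y + yy) * w + x + xx;
                        const int diff = abs(tmp1[i] - tmp2[i]);
                        /* 64 - m = satsub(6903, diff) >> 8, so m is in [38, 64];
                         * 6903 = ((64 - 38) << 8) + 255 - 8 (pw_6903) */
                        const int m = 64 - ((6903 - (diff < 6903 ? diff : 6903)) >> 8);
                        /* weighted blend, rounded back to pixel precision */
                        dst[(y + yy) * dst_stride + x + xx] =
                            clip_pixel((tmp1[i] * m + tmp2[i] * (64 - m) + 512) >> 10);
                        msum += m;
                    }
                }
                /* the asm computes this as (258 - sign - sum(64 - m)) >> 2 */
                mask[(y >> 1) * (w >> 1) + (x >> 1)] = (uint8_t)(msum >> 2);
            }
        }
    }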
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -47,6 +47,22 @@
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
+ db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
+ db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
+ db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
+ db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
+ db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
+ db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+ db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
@@ -115,6 +131,11 @@
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+wm_420_perm64: dq 0xfedcba9876543210
+wm_420_sign: dd 0x01020102, 0x01010101
+wm_sign_avx512: dd 0x40804080, 0x40404040
+
+pb_m64: times 4 db -64
pb_64: times 4 db 64
pw_34: times 2 dw 34
pw_258: times 2 dw 258
@@ -213,6 +234,7 @@
BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128
SECTION .text
@@ -4362,23 +4384,27 @@
BIDIR_FN MASK
%endmacro
-%macro W_MASK 2-3 0 ; src_offset, mask_out, 4:4:4
- mova m0, [tmp1q+(%1+0)*mmsize]
- mova m1, [tmp2q+(%1+0)*mmsize]
- psubw m1, m0
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
+ mova m%1, [tmp1q+mmsize*%3]
+ mova m1, [tmp2q+mmsize*%3]
+ psubw m1, m%1
pabsw m%2, m1
psubusw m%2, m6, m%2
psrlw m%2, 8 ; 64 - m
psllw m2, m%2, 10
pmulhw m1, m2
- paddw m0, m1
- mova m1, [tmp1q+(%1+1)*mmsize]
- mova m2, [tmp2q+(%1+1)*mmsize]
+ paddw m%1, m1
+ mova m1, [tmp1q+mmsize*%4]
+ mova m2, [tmp2q+mmsize*%4]
psubw m2, m1
pabsw m3, m2
psubusw m3, m6, m3
+%if cpuflag(avx512icl)
+ vpshldw m%2, m3, 8
+ psllw m3, m%2, 10
+%else
psrlw m3, 8
-%if %3
+%if %5
packuswb m%2, m3
psubb m%2, m5, m%2
vpermq m%2, m%2, q3120
@@ -4386,221 +4412,14 @@
phaddw m%2, m3
%endif
psllw m3, 10
+%endif
pmulhw m2, m3
paddw m1, m2
- pmulhrsw m0, m7
+ pmulhrsw m%1, m7
pmulhrsw m1, m7
- packuswb m0, m1
+ packuswb m%1, m1
%endmacro
-cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_420_avx2_table
- lea r7, [w_mask_420_avx2_table]
- tzcnt wd, wm
- movifnidn hd, hm
- mov maskq, maskmp
- movd xm0, r7m ; sign
- movsxd wq, dword [r7+wq*4]
- vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- vpbroadcastd m7, [base+pw_2048]
- movd xm8, [base+pw_258] ; 64 * 4 + 2
- pmovzxbd m9, [base+deint_shuf4]
- psubw xm8, xm0
- add wq, r7
- vpbroadcastw m8, xm8
- W_MASK 0, 4
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- vextracti128 xm1, m0, 1
- movd [dstq ], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- cmp hd, 8
- jl .w4_end
- lea dstq, [dstq+strideq*4]
- pextrd [dstq ], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- jg .w4_h16
-.w4_end:
- vextracti128 xm0, m4, 1
- vpblendd xm1, xm4, xm0, 0x05
- vpblendd xm4, xm4, xm0, 0x0a
- pshufd xm1, xm1, q2301
- psubw xm4, xm8, xm4
- psubw xm4, xm1
- psrlw xm4, 2
- packuswb xm4, xm4
- movq [maskq], xm4
- RET
-.w4_h16:
- W_MASK 2, 5
- lea dstq, [dstq+strideq*4]
- phaddd m4, m5
- vextracti128 xm1, m0, 1
- psubw m4, m8, m4
- psrlw m4, 2
- vpermd m4, m9, m4
- vextracti128 xm5, m4, 1
- packuswb xm4, xm5
- movd [dstq ], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q], xm1, 1
- lea dstq, [dstq+strideq*4]
- pextrd [dstq ], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- mova [maskq], xm4
- RET
-.w8_loop:
- add tmp1q, 2*32
- add tmp2q, 2*32
- W_MASK 0, 4
- lea dstq, [dstq+strideq*4]
- add maskq, 8
-.w8:
- vextracti128 xm2, m4, 1
- vextracti128 xm1, m0, 1
- psubw xm4, xm8, xm4
- psubw xm4, xm2
- psrlw xm4, 2
- packuswb xm4, xm4
- movq [dstq ], xm0
- movq [dstq+strideq*1], xm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xm1
- movq [maskq], xm4
- sub hd, 4
- jg .w8_loop
- RET
-.w16_loop:
- add tmp1q, 4*32
- add tmp2q, 4*32
- W_MASK 0, 4
- lea dstq, [dstq+strideq*4]
- add maskq, 16
-.w16:
- vpermq m0, m0, q3120
- mova [dstq ], xm0
- vextracti128 [dstq+strideq*1], m0, 1
- W_MASK 2, 5
- punpckhqdq m1, m4, m5
- punpcklqdq m4, m5
- psubw m1, m8, m1
- psubw m1, m4
- psrlw m1, 2
- vpermq m0, m0, q3120
- packuswb m1, m1
- vpermd m1, m9, m1
- mova [dstq+strideq*2], xm0
- vextracti128 [dstq+stride3q ], m0, 1
- mova [maskq], xm1
- sub hd, 4
- jg .w16_loop
- RET
-.w32_loop:
- add tmp1q, 4*32
- add tmp2q, 4*32
- W_MASK 0, 4
- lea dstq, [dstq+strideq*2]
- add maskq, 16
-.w32:
- vpermq m0, m0, q3120
- mova [dstq], m0
- W_MASK 2, 5
- psubw m4, m8, m4
- psubw m4, m5
- psrlw m4, 2
- vpermq m0, m0, q3120
- packuswb m4, m4
- vpermd m4, m9, m4
- mova [dstq+strideq*1], m0
- mova [maskq], xm4
- sub hd, 2
- jg .w32_loop
- RET
-.w64_loop_even:
- psubw m10, m8, m4
- psubw m11, m8, m5
- dec hd
-.w64_loop:
- add tmp1q, 4*32
- add tmp2q, 4*32
- W_MASK 0, 4
- add dstq, strideq
-.w64:
- vpermq m0, m0, q3120
- mova [dstq], m0
- W_MASK 2, 5
- vpermq m0, m0, q3120
- mova [dstq+32], m0
- test hd, 1
- jz .w64_loop_even
- psubw m4, m10, m4
- psubw m5, m11, m5
- psrlw m4, 2
- psrlw m5, 2
- packuswb m4, m5
- vpermd m4, m9, m4
- mova [maskq], m4
- add maskq, 32
- dec hd
- jg .w64_loop
- RET
-.w128_loop_even:
- psubw m12, m8, m4
- psubw m13, m8, m5
- dec hd
-.w128_loop:
- W_MASK 0, 4
- add dstq, strideq
-.w128:
- vpermq m0, m0, q3120
- mova [dstq+0*32], m0
- W_MASK 2, 5
- vpermq m0, m0, q3120
- mova [dstq+1*32], m0
- add tmp1q, 8*32
- add tmp2q, 8*32
- test hd, 1
- jz .w128_even
- psubw m4, m10, m4
- psubw m5, m11, m5
- psrlw m4, 2
- psrlw m5, 2
- packuswb m4, m5
- vpermd m4, m9, m4
- mova [maskq], m4
- jmp .w128_odd
-.w128_even:
- psubw m10, m8, m4
- psubw m11, m8, m5
-.w128_odd:
- W_MASK -4, 4
- vpermq m0, m0, q3120
- mova [dstq+2*32], m0
- W_MASK -2, 5
- vpermq m0, m0, q3120
- mova [dstq+3*32], m0
- test hd, 1
- jz .w128_loop_even
- psubw m4, m12, m4
- psubw m5, m13, m5
- psrlw m4, 2
- psrlw m5, 2
- packuswb m4, m5
- vpermd m4, m9, m4
- mova [maskq+32], m4
- add maskq, 64
- dec hd
- jg .w128_loop
- RET
-
cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx2_table
lea r7, [w_mask_422_avx2_table]
@@ -4617,7 +4436,7 @@
psrlw xm8, xm7, 4 ; pw_128
psubb xm8, xm0
vpbroadcastb m8, xm8
- W_MASK 0, 4
+ W_MASK 0, 4, 0, 1
lea stride3q, [strideq*3]
jmp wq
.w4:
@@ -4643,7 +4462,7 @@
mova [maskq], xm5
RET
.w4_h16:
- W_MASK 2, 5
+ W_MASK 0, 5, 2, 3
lea dstq, [dstq+strideq*4]
packuswb m4, m5
psubb m5, m8, m4
@@ -4664,7 +4483,7 @@
.w8_loop:
add tmp1q, 32*2
add tmp2q, 32*2
- W_MASK 0, 4
+ W_MASK 0, 4, 0, 1
lea dstq, [dstq+strideq*4]
add maskq, 16
.w8:
@@ -4685,7 +4504,7 @@
.w16_loop:
add tmp1q, 32*4
add tmp2q, 32*4
- W_MASK 0, 4
+ W_MASK 0, 4, 0, 1
lea dstq, [dstq+strideq*4]
add maskq, 32
.w16:
@@ -4692,7 +4511,7 @@
vpermq m0, m0, q3120
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
- W_MASK 2, 5
+ W_MASK 0, 5, 2, 3
packuswb m4, m5
psubb m5, m8, m4
pavgb m5, m9
@@ -4707,13 +4526,13 @@
.w32_loop:
add tmp1q, 32*4
add tmp2q, 32*4
- W_MASK 0, 4
+ W_MASK 0, 4, 0, 1
lea dstq, [dstq+strideq*2]
add maskq, 32
.w32:
vpermq m0, m0, q3120
mova [dstq+strideq*0], m0
- W_MASK 2, 5
+ W_MASK 0, 5, 2, 3
packuswb m4, m5
psubb m5, m8, m4
pavgb m5, m9
@@ -4727,13 +4546,13 @@
.w64_loop:
add tmp1q, 32*4
add tmp2q, 32*4
- W_MASK 0, 4
+ W_MASK 0, 4, 0, 1
add dstq, strideq
add maskq, 32
.w64:
vpermq m0, m0, q3120
mova [dstq+32*0], m0
- W_MASK 2, 5
+ W_MASK 0, 5, 2, 3
packuswb m4, m5
psubb m5, m8, m4
pavgb m5, m9
@@ -4747,13 +4566,13 @@
.w128_loop:
add tmp1q, 32*8
add tmp2q, 32*8
- W_MASK 0, 4
+ W_MASK 0, 4, 0, 1
add dstq, strideq
add maskq, 32*2
.w128:
vpermq m0, m0, q3120
mova [dstq+32*0], m0
- W_MASK 2, 5
+ W_MASK 0, 5, 2, 3
packuswb m4, m5
psubb m5, m8, m4
pavgb m5, m9
@@ -4761,10 +4580,10 @@
vpermd m5, m10, m5
mova [dstq+32*1], m0
mova [maskq+32*0], m5
- W_MASK 4, 4
+ W_MASK 0, 4, 4, 5
vpermq m0, m0, q3120
mova [dstq+32*2], m0
- W_MASK 6, 5
+ W_MASK 0, 5, 6, 7
packuswb m4, m5
psubb m5, m8, m4
pavgb m5, m9
@@ -4787,7 +4606,7 @@
vpbroadcastd m7, [base+pw_2048]
vpbroadcastd m5, [base+pb_64]
add wq, r7
- W_MASK 0, 4, 1
+ W_MASK 0, 4, 0, 1, 1
lea stride3q, [strideq*3]
jmp wq
.w4:
@@ -4805,7 +4624,7 @@
pextrd [dstq+strideq*2], xm1, 2
pextrd [dstq+stride3q ], xm1, 3
je .w4_end
- W_MASK 2, 4, 1
+ W_MASK 0, 4, 2, 3, 1
lea dstq, [dstq+strideq*4]
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
@@ -4823,7 +4642,7 @@
.w8_loop:
add tmp1q, 32*2
add tmp2q, 32*2
- W_MASK 0, 4, 1
+ W_MASK 0, 4, 0, 1, 1
lea dstq, [dstq+strideq*4]
add maskq, 32
.w8:
@@ -4839,7 +4658,7 @@
.w16_loop:
add tmp1q, 32*2
add tmp2q, 32*2
- W_MASK 0, 4, 1
+ W_MASK 0, 4, 0, 1, 1
lea dstq, [dstq+strideq*2]
add maskq, 32
.w16:
@@ -4853,7 +4672,7 @@
.w32_loop:
add tmp1q, 32*2
add tmp2q, 32*2
- W_MASK 0, 4, 1
+ W_MASK 0, 4, 0, 1, 1
add dstq, strideq
add maskq, 32
.w32:
@@ -4866,7 +4685,7 @@
.w64_loop:
add tmp1q, 32*4
add tmp2q, 32*4
- W_MASK 0, 4, 1
+ W_MASK 0, 4, 0, 1, 1
add dstq, strideq
add maskq, 32*2
.w64:
@@ -4873,7 +4692,7 @@
vpermq m0, m0, q3120
mova [dstq+32*0], m0
mova [maskq+32*0], m4
- W_MASK 2, 4, 1
+ W_MASK 0, 4, 2, 3, 1
vpermq m0, m0, q3120
mova [dstq+32*1], m0
mova [maskq+32*1], m4
@@ -4883,7 +4702,7 @@
.w128_loop:
add tmp1q, 32*8
add tmp2q, 32*8
- W_MASK 0, 4, 1
+ W_MASK 0, 4, 0, 1, 1
add dstq, strideq
add maskq, 32*4
.w128:
@@ -4890,15 +4709,15 @@
vpermq m0, m0, q3120
mova [dstq+32*0], m0
mova [maskq+32*0], m4
- W_MASK 2, 4, 1
+ W_MASK 0, 4, 2, 3, 1
vpermq m0, m0, q3120
mova [dstq+32*1], m0
mova [maskq+32*1], m4
- W_MASK 4, 4, 1
+ W_MASK 0, 4, 4, 5, 1
vpermq m0, m0, q3120
mova [dstq+32*2], m0
mova [maskq+32*2], m4
- W_MASK 6, 4, 1
+ W_MASK 0, 4, 6, 7, 1
vpermq m0, m0, q3120
mova [dstq+32*3], m0
mova [maskq+32*3], m4
@@ -5461,6 +5280,212 @@
W_AVG_FN
MASK_FN
+cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m9, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign
+ add wq, r7
+ W_MASK 0, 4, 0, 1
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm0, m4, 1
+ vpblendd xm1, xm4, xm0, 0x05
+ vpblendd xm4, xm4, xm0, 0x0a
+ pshufd xm1, xm1, q2301
+ psubw xm4, xm8, xm4
+ psubw xm4, xm1
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [maskq], xm4
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ phaddd m4, m5
+ vextracti128 xm1, m0, 1
+ psubw m4, m8, m4
+ psrlw m4, 2
+ vpermd m4, m9, m4
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], xm4
+ RET
+.w8_loop:
+ add tmp1q, 2*32
+ add tmp2q, 2*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 8
+.w8:
+ vextracti128 xm2, m4, 1
+ vextracti128 xm1, m0, 1
+ psubw xm4, xm8, xm4
+ psubw xm4, xm2
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ movq [maskq], xm4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ punpckhqdq m1, m4, m5
+ punpcklqdq m4, m5
+ psubw m1, m8, m1
+ psubw m1, m4
+ psrlw m1, 2
+ vpermq m0, m0, q3120
+ packuswb m1, m1
+ vpermd m1, m9, m1
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], xm1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ psubw m4, m8, m4
+ psubw m4, m5
+ psrlw m4, 2
+ vpermq m0, m0, q3120
+ packuswb m4, m4
+ vpermd m4, m9, m4
+ mova [dstq+strideq*1], m0
+ mova [maskq], xm4
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+ dec hd
+.w64_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ test hd, 1
+ jz .w64_loop_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq], m4
+ add maskq, 32
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop_even:
+ psubw m12, m8, m4
+ psubw m13, m8, m5
+ dec hd
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ add tmp1q, 8*32
+ add tmp2q, 8*32
+ test hd, 1
+ jz .w128_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*0], m4
+ jmp .w128_odd
+.w128_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+.w128_odd:
+ W_MASK 0, 4, -4, -3
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, -2, -1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ test hd, 1
+ jz .w128_loop_even
+ psubw m4, m12, m4
+ psubw m5, m13, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*1], m4
+ add maskq, 64
+ dec hd
+ jg .w128_loop
+ RET
+
INIT_ZMM avx512icl
PREP_BILIN
PREP_8TAP
@@ -5467,5 +5492,198 @@
AVG_FN
W_AVG_FN
MASK_FN
+
+cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pb_m64] ; -1 << 6
+ mova ym10, [base+wm_420_mask+32]
+ vpbroadcastd m8, [base+wm_sign_avx512+r6*4] ; (258 - sign) << 6
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m5, [wm_420_perm4]
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm4+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9 ; (258-sign)<<6 - 64*sum(64-m) = (sum(m)+2-sign)<<6
+ vextracti128 xmm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xmm1
+ pextrd [dstq+stride3q ], xmm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xmm1, 2
+ pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+ vpermb ym8, ym10, ym8 ; byte 1 of each dword = (sum(m)+2-sign)>>2
+ movq [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m11, strided
+ pmulld m11, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ vpdpbusd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vpscatterdd [dstq+m11]{k1}, m0
+ RET
+.w8:
+ mova m5, [wm_420_perm8]
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm8+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vextracti128 xmm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xmm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 16
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ vpermb m1, m10, m1
+ mova [maskq], xm1
+ vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xmm2, m0, 2
+ vextracti32x4 xmm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+stride3q ], xmm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+stride3q ], xmm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16:
+ mova m5, [wm_420_perm16]
+.w16_loop:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [warp_8x8_shufA]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
+ psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
+.w64_loop:
+ W_MASK 0, 4, 0, 2
+ W_MASK 11, 5, 1, 3
+ mova m2, m8
+ vpdpbusd m2, m4, m9
+ mova m3, m8
+ vpdpbusd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ mova m1, m0
+ vpermt2q m0, m12, m11
+ vpermt2q m1, m13, m11
+ mova [maskq], ym2
+ add maskq, 32
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m14, [wm_420_perm64]
+ mova m10, [wm_420_mask]
+ psrlq m15, m14, 4
+.w128_loop:
+ W_MASK 0, 12, 0, 4
+ W_MASK 11, 13, 1, 5
+ mova m4, m8
+ vpdpbusd m4, m12, m9
+ mova m5, m8
+ vpdpbusd m5, m13, m9
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*1+64*0], m1
+ W_MASK 0, 12, 2, 6
+ W_MASK 11, 13, 3, 7
+ vprold m4, 16
+ vprold m5, 16
+ vpdpbusd m4, m12, m9
+ vpdpbusd m5, m13, m9
+ add tmp1q, 512
+ add tmp2q, 512
+ vpermt2b m4, m10, m5
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0+64*1], m0
+ mova [dstq+strideq*1+64*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w128_loop
+ RET
%endif ; ARCH_X86_64
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -89,6 +89,7 @@
decl_mask_fn(dav1d_mask_avx512icl);
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
+decl_w_mask_fn(dav1d_w_mask_420_avx512icl);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_w_mask_fn(dav1d_w_mask_420_ssse3);
decl_w_mask_fn(dav1d_w_mask_422_avx2);
@@ -236,6 +237,7 @@
c->avg = dav1d_avg_avx512icl;
c->w_avg = dav1d_w_avg_avx512icl;
c->mask = dav1d_mask_avx512icl;
+ c->w_mask[2] = dav1d_w_mask_420_avx512icl;
#endif
#endif
}