ref: 95068df6a6597106973031df62bf52c695561361
parent: daaf44890aa75f878499c34b49509b61da257ecd
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Tue Feb 18 09:41:22 EST 2020
x86: Add cdef_filter_{4,8}x8 AVX-512 (Ice Lake) asm cdef_filter_4x8_8bpc_avx2: 54.0 cdef_filter_4x8_8bpc_avx512icl: 35.5 => +52.1% cdef_filter_8x8_8bpc_avx2: 71.0 cdef_filter_8x8_8bpc_avx512icl: 49.0 => +44.9%
--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -67,6 +67,21 @@
db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13
db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
+lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 ; vpermi2b indices for "lut top" of cdef_filter_4x8: 0-63 pick from the gathered dst rows, 64-127 from the top/bottom/left vector
+ db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 ; each 8-byte group = 2 left pixels + 6 bytes of one dst row
+lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 ; vpermi2b indices for "lut bottom"; presumably the 64-byte load of lut_perm_4x8a also runs into the first two rows here
+ db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
+ db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
+ db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 ; the two rows below the block (second 128-bit lane of the top/bottom/left vector)
+pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 ; row numbers; multiplied by stride to form the gather/scatter offsets of cdef_filter_4x8
+lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 ; vpermi2b indices for the left-hand LUTs ("lut tl" / "lut bl") of cdef_filter_8x8
+ db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 ; -1 = placeholder, overwritten by the left-pixel word blend after the permute
+ db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87
+ db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119
+lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 ; vpermi2b indices for the right-hand LUTs ("lut tr" / "lut br") of cdef_filter_8x8
+ db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 ; bytes 4-11 of each 16-byte source row
+ db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91
+ db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123
edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
@@ -81,7 +96,13 @@
dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
+ times 16 db 0 ; realign (introduced by cdef_dirs)
+end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 ; vpermb indices used by cdef_filter_8x8 .end_clip to put the result bytes into store order
+ db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 ; rows 0-1: even source bytes
+ db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 ; rows 2-3: odd source bytes
+ db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 ; byte 3 of every dword; second row read by the ym-sized end_perm loads in cdef_filter_4x8
pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
sec_tap: db 32, 32, 16, 16
pd_268435568: dd 268435568
@@ -1891,7 +1912,7 @@
psubb m2, m1, m6
lzcnt r6d, prid
vpsubb m2{k1}, m6, m1 ; abs(diff)
- vpbroadcastb m4, prim
+ vpbroadcastb m4, prid
and prid, 1
vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
movifnidn t1d, secm
@@ -2000,6 +2021,613 @@
vpdpbusd m0, m5, m9
ret
-%endif ; HAVE_AVX512ICL
+DECLARE_REG_TMP 2, 7 ; x86inc temporaries: t0 = r2, t1 = r7
+; lut top lut bottom (t/T: the two rows above, Lx: left pixels, rc: row r col c, 8x/9x: the two rows below)
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55
+; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85
+; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95
+
+cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
+ pri, sec, dir, damping, edge ; CDEF filter for one 4x8 block of 8-bit pixels, processed as a top and a bottom 4x4 half
+%define base r8-edge_mask
+ vpbroadcastd ym21, strided ; ym21 = stride in every dword
+ mov r6d, edgem ; r6 = edge availability flags
+ lea r8, [edge_mask] ; r8 anchors all [base+...] constant accesses
+ movq xm1, [topq+strideq*0-2] ; first row above the block, starting 2 bytes to the left
+ pmulld ym21, [base+pd_01234567] ; ym21 = stride * {0..7}: byte offset of each block row
+ kxnorb k1, k1, k1 ; k1 = 0xff: gather all 8 rows
+ movq xm2, [topq+strideq*1-2] ; second row above the block
+ vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
+ mova m14, [base+lut_perm_4x8a] ; permute indices for lut top
+ movu m15, [base+lut_perm_4x8b] ; permute indices for lut bottom
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main ; bottom edge unavailable: do not read the rows below the block
+ lea r7, [dstq+strideq*8-2]
+ vinserti32x4 ym1, [r7+strideq*0], 1 ; row 8 into the upper 128-bit lane
+ vinserti32x4 ym2, [r7+strideq*1], 1 ; row 9 into the upper 128-bit lane
+.main:
+ punpcklqdq ym1, ym2 ; lane 0 = both top rows, lane 1 = both bottom rows
+ vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____
+ movifnidn prid, prim
+ mov t0d, dirm ; t0 aliases r2 (leftq), which is not read again after this point
+ mova m16, [base+px_idx] ; LUT positions of the block's own pixels
+ mov r3d, dampingm
+ vpermi2b m14, m0, m1 ; lut top
+ vpermi2b m15, m0, m1 ; lut bottom
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m20, m20 ; m20 = 0, used to negate taps and to test the sign of the sums
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m2, m16, m14 ; pxt
+ vpermb m3, m16, m15 ; pxb
+ mova m1, m0 ; m0/m1 = sum accumulators for the top/bottom half
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only ; primary strength is zero: secondary taps only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
+ vpermb m5, m6, m15 ; pNb
+%macro CDEF_FILTER_4x8_PRI 0
+ vpcmpub k1, m2, m4, 6 ; pxt > pNt
+ vpcmpub k2, m3, m5, 6 ; pxb > pNb
+ psubb m6, m4, m2
+ psubb m7, m5, m3
+ lzcnt r6d, prid ; index of the shift matrix for this strength
+ vpsubb m6{k1}, m2, m4 ; abs(diff_top)
+ vpsubb m7{k2}, m3, m5 ; abs(diff_bottom)
+ vpbroadcastb m13, prid ; pri_strength in every byte
+ vpbroadcastq m9, [r3+r6*8] ; affine matrix that performs the right shift
+ and prid, 1 ; low strength bit selects the pri_tap group
+ vpbroadcastd m11, [base+pri_tap+priq*4]
+ vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
+ vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
+ mova m10, m11
+ movifnidn t1d, secm ; t1 = sec strength, tested right after the macro
+ vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
+ vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
+ psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))
+ psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))
+ pminub m6, m12 ; imin(abs(dt), limit computed above)
+ pminub m7, m13 ; imin(abs(db), limit computed above)
+ vpdpbusd m0, m6, m10 ; sum top
+ vpdpbusd m1, m7, m11 ; sum bottom
+%endmacro
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip: ; clamp the filtered pixels to the min/max over px and all tap pixels
+ pminub m10, m4, m2
+ pminub m12, m6, m8
+ pminub m11, m5, m3
+ pminub m13, m7, m9
+ pmaxub m4, m2
+ pmaxub m6, m8
+ pmaxub m5, m3
+ pmaxub m7, m9
+ pminub m10, m12 ; m10/m11 = per-byte min for top/bottom
+ pminub m11, m13
+ pmaxub m4, m6 ; m4/m5 = per-byte max for top/bottom
+ pmaxub m5, m7
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d ; k1 selects the high word of every dword
+ kxnorb k2, k2, k2 ; hw lw
+ vpshrdd m12, m0, m1, 16 ; m1lw m0hw
+ vpshrdd m6, m10, m11, 16 ; m11lw m10hw
+ vpshrdd m8, m4, m5, 16 ; m5lw m4hw
+ vpblendmw m7{k1}, m10, m11 ; m11hw m10lw
+ vpblendmw m9{k1}, m4, m5 ; m5hw m4lw
+ vpblendmw m4{k1}, m0, m12 ; m1lw m0lw
+ vpblendmw m5{k1}, m12, m1 ; m1hw m0hw
+ vpshrdd m2, m3, 16
+ pminub m6, m7
+ pmaxub m8, m9
+ mova ym14, [base+end_perm] ; lut top is no longer needed, reuse m14 for the output shuffle
+ vpcmpw k1, m4, m20, 1 ; k1 = words holding a negative sum
+ vpshldw m2, m5, 8
+ pslldq m7, m6, 1
+ pslldq m9, m8, 1
+ psubw m5, m20, m4 ; m5 = -sum
+ paddusw m0, m4, m2 ; clip >0xff
+ pminub m6, m7
+ pmaxub m8, m9
+ psubusw m0{k1}, m2, m5 ; clip <0x00
+ pmaxub m0, m6 ; clamp to the neighbourhood minimum
+ pminub m0, m8 ; clamp to the neighbourhood maximum
+ vpermb m0, m14, m0
+ vpscatterdd [dstq+ym21]{k2}, ym0 ; one dword (4 pixels) to each of the 8 rows
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova ym4, [base+end_perm]
+ kxnorb k1, k1, k1 ; k1 = 0xff: scatter to all 8 rows
+ vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m3, m1, 8
+ paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m3
+ pslld m0, 16
+ vpshrdd m0, m1, 16
+ vpermb m0, m4, m0 ; output in bits 8-15 of each word
+ vpscatterdd [dstq+ym21]{k1}, ym0 ; one dword (4 pixels) to each of the 8 rows
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t1d, r6d
+ or r6d, 8 ; top 4x4 has bottom
+ or t1d, 4 ; bottom 4x4 has top
+ vpbroadcastq m17, [base+edge_mask+r6*8] ; bitmap of valid LUT positions for the top half
+ vpbroadcastq m18, [base+edge_mask+t1*8] ; bitmap of valid LUT positions for the bottom half
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m17, m6 ; index in-range
+ vpshufbitqmb k2, m18, m6
+ mova m4, m2 ; default to px so that out-of-range taps give diff == 0
+ mova m5, m3
+ vpermb m4{k1}, m6, m14 ; fetch only the in-range tap pixels
+ vpermb m5{k2}, m6, m15
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m17, m10
+ vpshufbitqmb k2, m18, m10
+ vpshufbitqmb k3, m17, m11
+ vpshufbitqmb k4, m18, m11
+ mova m6, m2 ; default to px for out-of-range taps
+ mova m7, m3
+ mova m8, m2
+ mova m9, m3
+ vpermb m6{k1}, m10, m14
+ vpermb m7{k2}, m10, m15
+ vpermb m8{k3}, m11, m14
+ vpermb m9{k4}, m11, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
+ vpermb m7, m8, m15 ; pNb
+ vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
+ vpermb m9, m9, m15 ; pNb
+.sec_main: ; expects tap pixels in m6-m9; clobbers m10-m19 (LUTs, px_idx and edge masks included)
+ vpbroadcastb m18, t1d ; sec_strength in every byte
+ lzcnt t1d, t1d ; index of the shift matrix for this strength
+ vpcmpub k1, m2, m6, 6
+ vpcmpub k2, m3, m7, 6
+ vpcmpub k3, m2, m8, 6
+ vpcmpub k4, m3, m9, 6
+ vpbroadcastq m17, [r3+t1*8] ; affine matrix that performs the right shift
+ psubb m10, m6, m2
+ psubb m11, m7, m3
+ psubb m12, m8, m2
+ psubb m13, m9, m3
+ vpsubb m10{k1}, m2, m6 ; abs(dt0)
+ vpsubb m11{k2}, m3, m7 ; abs(db0)
+ vpsubb m12{k3}, m2, m8 ; abs(dt1)
+ vpsubb m13{k4}, m3, m9 ; abs(db1)
+ vpbroadcastd m19, [base+sec_tap]
+ gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
+ gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
+ gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
+ gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
+ psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))
+ psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))
+ psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))
+ psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))
+ pminub m10, m14
+ pminub m11, m15
+ pminub m12, m16
+ pminub m13, m17
+ mova m14, m19
+ mova m15, m19
+ mova m16, m19
+ vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
+ vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
+ vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
+ vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
+ vpdpbusd m0, m10, m14 ; accumulate into sum top
+ vpdpbusd m1, m11, m15 ; accumulate into sum bottom
+ vpdpbusd m0, m12, m16
+ vpdpbusd m1, m13, m19
+ ret
+
+; lut tl lut tr (t/T: the two rows above, Lx: left pixels, rc: row r col c, 8x/9x: the two rows below)
+; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td
+; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD
+; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b
+; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b
+; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b
+; lut bl lut br
+; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b
+; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b
+; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b
+; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b
+; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b
+
+cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
+ pri, sec, dir, damping, edge ; CDEF filter for one 8x8 block of 8-bit pixels, processed as four 4x4 quadrants; 4*64 bytes of stack hold spilled edge masks
+%define base r8-edge_mask
+ mov r6d, edgem ; r6 = edge availability flags
+ lea r10, [dstq+strideq*4-2] ; r10 = row 4, 2 bytes left of the block
+ movu xmm0, [topq+strideq*0-2] ; first row above
+ movu xmm1, [dstq+strideq*2-2] ; row 2
+ movu xmm2, [r10 +strideq*2 ] ; row 6
+ lea r8, [edge_mask] ; r8 anchors all [base+...] constant accesses
+ lea r9, [strideq*3]
+ pmovzxwq m10, [leftq-4] ; left pixel pairs, one word per qword (blended into lut tl)
+ vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 ; second row above
+ vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 ; row 3
+ vinserti32x4 ym2, ymm2, [r10 +r9 ], 1 ; row 7
+ lea r7, [r10 +strideq*4 ] ; r7 = row 8 (first row below the block)
+ pmovzxwq m11, [leftq+4] ; left pixel pairs for lut bl
+ vinserti32x4 m0, [dstq+strideq*0-2], 2 ; row 0
+ vinserti32x4 m1, [r10 +strideq*0 ], 2 ; row 4
+ mova m12, [base+lut_perm_8x8a]
+ movu m13, [base+lut_perm_8x8b]
+ vinserti32x4 m0, [dstq+strideq*1-2], 3 ; row 1: m0 = {t, T, row 0, row 1}
+ vinserti32x4 m1, [r10 +strideq*1 ], 3 ; row 5: m1 = rows 2-5
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main ; bottom edge unavailable: do not read rows 8 and 9
+ vinserti32x4 m2, [r7 +strideq*0], 2 ; row 8
+ vinserti32x4 m2, [r7 +strideq*1], 3 ; row 9: m2 = rows 6-9
+.main:
+ mov t1d, 0x11111100
+ mova m14, m12 ; the same permute indices serve the top and bottom LUTs
+ mova m15, m13
+ kmovd k1, t1d ; word mask of the left-pixel slots in lut tl
+ kshiftrd k2, k1, 8 ; word mask of the left-pixel slots in lut bl
+ movifnidn prid, prim
+ mov t0d, dirm ; t0 aliases r2 (leftq), which is not read again after this point
+ mova m30, [base+px_idx] ; LUT positions of the quadrant's own pixels
+ mov r3d, dampingm
+ vpermi2b m12, m0, m1 ; lut tl
+ vpermi2b m14, m1, m2 ; lut bl
+ vpermi2b m13, m0, m1 ; lut tr
+ vpermi2b m15, m1, m2 ; lut br
+ vpblendmw m12{k1}, m12, m10 ; insert the left pixels into lut tl
+ vpblendmw m14{k2}, m14, m11 ; insert the left pixels into lut bl
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m31, m31 ; m31 = 0, used to negate taps and to test the sign of the sums
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m4, m30, m12 ; pxtl
+ vpermb m5, m30, m13 ; pxtr
+ vpermb m6, m30, m14 ; pxbl
+ vpermb m7, m30, m15 ; pxbr
+ mova m1, m0 ; m0-m3 = sum accumulators for tl/tr/bl/br
+ mova m2, m0
+ mova m3, m0
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only ; primary strength is zero: secondary taps only
+ vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
+ vpermb m9, m11, m13 ; pNtr
+ vpermb m10, m11, m14 ; pNbl
+ vpermb m11, m11, m15 ; pNbr
+%macro CDEF_FILTER_8x8_PRI 0
+ vpcmpub k1, m4, m8, 6 ; pxtl > pNtl
+ vpcmpub k2, m5, m9, 6 ; pxtr > pNtr
+ vpcmpub k3, m6, m10, 6 ; pxbl > pNbl
+ vpcmpub k4, m7, m11, 6 ; pxbr > pNbr
+ psubb m16, m8, m4
+ psubb m17, m9, m5
+ psubb m18, m10, m6
+ psubb m19, m11, m7
+ lzcnt r6d, prid ; index of the shift matrix for this strength
+ vpsubb m16{k1}, m4, m8 ; abs(diff_tl)
+ vpsubb m17{k2}, m5, m9 ; abs(diff_tr)
+ vpsubb m18{k3}, m6, m10 ; abs(diff_bl)
+ vpsubb m19{k4}, m7, m11 ; abs(diff_br)
+ vpbroadcastq m28, [r3+r6*8] ; affine matrix that performs the right shift
+ vpbroadcastb m29, prid ; pri_strength in every byte
+ and prid, 1 ; low strength bit selects the pri_tap group
+ vpbroadcastd m27, [base+pri_tap+priq*4]
+ vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
+ vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
+ vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
+ vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
+ mova m24, m27
+ mova m25, m27
+ mova m26, m27
+ movifnidn t1d, secm ; t1 = sec strength, tested right after the macro
+ vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
+ vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
+ vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_bl)
+ vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_br)
+ psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))
+ psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))
+ psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))
+ psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))
+ pminub m16, m20 ; imin(abs(diff), limit computed above)
+ pminub m17, m21
+ pminub m18, m22
+ pminub m19, m23
+ vpdpbusd m0, m16, m24 ; sum tl
+ vpdpbusd m1, m17, m25 ; sum tr
+ vpdpbusd m2, m18, m26 ; sum bl
+ vpdpbusd m3, m19, m27 ; sum br
+%endmacro
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip: ; clamp the filtered pixels to the min/max over px and all tap pixels
+ pminub m20, m8, m4
+ pminub m24, m12, m16
+ pminub m21, m9, m5
+ pminub m25, m13, m17
+ pminub m22, m10, m6
+ pminub m26, m14, m18
+ pminub m23, m11, m7
+ pminub m27, m15, m19
+ pmaxub m8, m4
+ pmaxub m12, m16
+ pmaxub m9, m5
+ pmaxub m13, m17
+ pmaxub m10, m6
+ pmaxub m14, m18
+ pmaxub m11, m7
+ pmaxub m15, m19
+ pminub m20, m24 ; m20-m23 = per-byte min for tl/tr/bl/br
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ pmaxub m8, m12 ; m8-m11 = per-byte max for tl/tr/bl/br
+ pmaxub m9, m13
+ pmaxub m10, m14
+ pmaxub m11, m15
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d ; k1 selects the high word of every dword
+ vpshrdd m24, m0, m1, 16
+ vpshrdd m25, m2, m3, 16
+ vpshrdd m12, m20, m21, 16
+ vpshrdd m14, m22, m23, 16
+ vpshrdd m16, m8, m9, 16
+ vpshrdd m18, m10, m11, 16
+ vpblendmw m13{k1}, m20, m21
+ vpblendmw m15{k1}, m22, m23
+ vpblendmw m17{k1}, m8, m9
+ vpblendmw m19{k1}, m10, m11
+ vpblendmw m20{k1}, m0, m24
+ vpblendmw m21{k1}, m24, m1
+ vpblendmw m22{k1}, m2, m25
+ vpblendmw m23{k1}, m25, m3
+ vpshrdd m4, m5, 16
+ vpshrdd m6, m7, 16
+ pminub m12, m13
+ pminub m14, m15
+ pmaxub m16, m17
+ pmaxub m18, m19
+ mova m8, [base+end_perm_w8clip] ; output byte shuffle
+ vpcmpw k2, m20, m31, 1 ; k2/k3 = words holding a negative sum
+ vpcmpw k3, m22, m31, 1
+ vpshldw m4, m21, 8
+ vpshldw m6, m23, 8
+ kunpckdq k1, k1, k1 ; widen the 0xAAAAAAAA pattern to 64 bits for the byte blends below
+ kxnorb k4, k4, k4
+ vpshrdw m11, m12, m14, 8
+ vpshrdw m15, m16, m18, 8
+ vpblendmb m13{k1}, m12, m14
+ vpblendmb m17{k1}, m16, m18
+ psubw m21, m31, m20 ; -sum
+ psubw m23, m31, m22
+ paddusw m0, m20, m4 ; clip >0xff
+ paddusw m1, m22, m6
+ pminub m11, m13
+ pmaxub m15, m17
+ psubusw m0{k2}, m4, m21 ; clip <0x00
+ psubusw m1{k3}, m6, m23
+ psrlw m0, 8
+ vmovdqu8 m0{k1}, m1 ; interleave the two result vectors byte-wise
+ pmaxub m0, m11 ; clamp to the neighbourhood minimum
+ pminub m0, m15 ; clamp to the neighbourhood maximum
+ vpermb m0, m8, m0
+ add r10, 2 ; r10 = first pixel of row 4 (undo the -2 bias)
+ vextracti32x4 xm1, m0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0 ; each 128-bit lane holds two 8-pixel rows
+ movq [dstq+strideq*2], xm1
+ movq [r10 +strideq*0], xm2
+ movq [r10 +strideq*2], xm3
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+r9 ], xm1
+ movhps [r10 +strideq*1], xm2
+ movhps [r10 +r9 ], xm3
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova xm8, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m5, m1, 8
+ vpshldd m6, m2, 8
+ vpshldd m7, m3, 8
+ paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ vpermb m0, m8, m0 ; collect the 16 result bytes of each quadrant
+ vpermb m1, m8, m1
+ vpermb m2, m8, m2
+ vpermb m3, m8, m3
+ add r10, 2 ; r10 = first pixel of row 4 (undo the -2 bias)
+ punpckldq m4, m0, m1 ; join left and right quadrant into full 8-pixel rows
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ movq [dstq+strideq*0], xm4 ; only the low 128 bits of each vector are stored
+ movq [dstq+strideq*2], xm0
+ movq [r10 +strideq*0], xm5
+ movq [r10 +strideq*2], xm2
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r9 ], xm0
+ movhps [r10 +strideq*1], xm5
+ movhps [r10 +r9 ], xm2
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t0d, r6d ; t0 is borrowed as scratch here; dir is reloaded below
+ mov t1d, r6d
+ or t0d, 0xA ; top-left 4x4 has bottom and right
+ or t1d, 0x9 ; top-right 4x4 has bottom and left
+ vpbroadcastq m26, [base+edge_mask+t0*8] ; m26-m29 = bitmaps of valid LUT positions for tl/tr/bl/br
+ vpbroadcastq m27, [base+edge_mask+t1*8]
+ mov t1d, r6d
+ or r6d, 0x6 ; bottom-left 4x4 has top and right
+ or t1d, 0x5 ; bottom-right 4x4 has top and left
+ vpbroadcastq m28, [base+edge_mask+r6*8]
+ vpbroadcastq m29, [base+edge_mask+t1*8]
+ mov t0d, dirm ; restore t0 = dir
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m26, m20 ; index in-range
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m8, m4 ; default to px so that out-of-range taps give diff == 0
+ mova m9, m5
+ mova m10, m6
+ mova m11, m7
+ vpermb m8{k1}, m20, m12 ; fetch only the in-range tap pixels
+ vpermb m9{k2}, m20, m13
+ vpermb m10{k3}, m20, m14
+ vpermb m11{k4}, m20, m15
+ mova [rsp+0x00], m26 ; the PRI macro overwrites m26-m29: spill the edge masks
+ mova [rsp+0x40], m27
+ mova [rsp+0x80], m28
+ mova [rsp+0xC0], m29
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ mova m26, [rsp+0x00] ; reload the edge masks for the secondary taps
+ mova m27, [rsp+0x40]
+ mova m28, [rsp+0x80]
+ mova m29, [rsp+0xC0]
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m26, m20
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m16, m4 ; default to px for out-of-range taps
+ mova m17, m5
+ mova m18, m6
+ mova m19, m7
+ vpermb m16{k1}, m20, m12
+ vpermb m17{k2}, m20, m13
+ vpermb m18{k3}, m20, m14
+ vpermb m19{k4}, m20, m15
+ vpshufbitqmb k1, m26, m21
+ vpshufbitqmb k2, m27, m21
+ vpshufbitqmb k3, m28, m21
+ vpshufbitqmb k4, m29, m21
+ vpermb m12, m21, m12 ; the LUTs are overwritten in place by the second tap set
+ vpermb m13, m21, m13
+ vpermb m14, m21, m14
+ vpermb m15, m21, m15
+ vpblendmb m12{k1}, m4, m12 ; keep the tap pixel where in range, px otherwise
+ vpblendmb m13{k2}, m5, m13
+ vpblendmb m14{k3}, m6, m14
+ vpblendmb m15{k4}, m7, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
+ vpermb m17, m20, m13 ; pNtr
+ vpermb m18, m20, m14 ; pNbl
+ vpermb m19, m20, m15 ; pNbr
+ vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
+ vpermb m13, m21, m13 ; pNtr
+ vpermb m14, m21, m14 ; pNbl
+ vpermb m15, m21, m15 ; pNbr
+.sec_main: ; expects the two secondary tap sets in m16-m19 and m12-m15
+%macro CDEF_FILTER_8x8_SEC 4-5 0 ; %1-%4: tap pixels for tl/tr/bl/br, %5: load constants
+ vpcmpub k1, m4, %1, 6
+ vpcmpub k2, m5, %2, 6
+ vpcmpub k3, m6, %3, 6
+ vpcmpub k4, m7, %4, 6
+ psubb m20, %1, m4
+ psubb m21, %2, m5
+ psubb m22, %3, m6
+ psubb m23, %4, m7
+%if %5
+ vpbroadcastb m28, t1d ; sec_strength in every byte
+ lzcnt t1d, t1d ; index of the shift matrix for this strength
+ vpbroadcastq m29, [r3+t1*8] ; affine matrix that performs the right shift
+%endif
+ vpsubb m20{k1}, m4, %1 ; abs(diff) for tl/tr/bl/br
+ vpsubb m21{k2}, m5, %2
+ vpsubb m22{k3}, m6, %3
+ vpsubb m23{k4}, m7, %4
+ gf2p8affineqb m24, m20, m29, 0 ; abs(diff) >> shift
+ gf2p8affineqb m25, m21, m29, 0
+ gf2p8affineqb m26, m22, m29, 0
+ gf2p8affineqb m27, m23, m29, 0
+%if %5
+ vpbroadcastd m30, [base+sec_tap] ; replaces px_idx, which is no longer needed
+%endif
+ psubusb m24, m28, m24 ; imax(0, sec_strength - (abs(diff) >> shift))
+ psubusb m25, m28, m25
+ psubusb m26, m28, m26
+ psubusb m27, m28, m27
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ mova m24, m30
+ mova m25, m30
+ mova m26, m30
+ mova m27, m30
+ vpsubb m24{k1}, m31, m30 ; apply_sign(sec_tap)
+ vpsubb m25{k2}, m31, m30
+ vpsubb m26{k3}, m31, m30
+ vpsubb m27{k4}, m31, m30
+ vpdpbusd m0, m20, m24 ; accumulate into sum tl/tr/bl/br
+ vpdpbusd m1, m21, m25
+ vpdpbusd m2, m22, m26
+ vpdpbusd m3, m23, m27
+%endmacro
+ CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
+ CDEF_FILTER_8x8_SEC m12, m13, m14, m15 ; second tap set reuses the constants loaded above
+ ret
+
+%endif ; HAVE_AVX512ICL
%endif ; ARCH_X86_64
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -85,6 +85,8 @@
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
#if HAVE_AVX512ICL && BITDEPTH == 8
+ c->fb[0] = dav1d_cdef_filter_8x8_avx512icl;
+ c->fb[1] = dav1d_cdef_filter_4x8_avx512icl;
c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
#endif