shithub: dav1d

ref: d4a7c6470b80a50a29cbfd6f791f891277ce0a62
parent: 50e9a39a07c038d3c636176ac4fd85a76c0bdc6e
author: Henrik Gramner <gramner@twoorioles.com>
date: Thu Feb 20 10:25:44 EST 2020

x86: Add mc w_mask 4:2:2 AVX-512 (Ice Lake) asm
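
This commit adds an AVX-512 (Ice Lake) implementation of w_mask for 4:2:2
chroma subsampling. The changes fall into four parts: a new wm_422_mask
permute table plus reworked sign constants in the data section, a
BIDIR_JMP_TABLE entry, a relocated AVX2 w_mask_422 (moved below
w_mask_420, essentially identical apart from how the 128 - sign constant
in m8 is materialized, now a lookup into the new wm_422_sign table), and
the new avx512icl function itself, plus the C-side declaration and init
wiring.

For orientation, here is a scalar model of what w_mask does for 4:2:2 at
8 bpc. It is an illustration distilled from the constants visible in this
diff (pw_6903 encodes the 38..64 clamp on the per-pixel weight, and the
mask output averages horizontal weight pairs with a sign-dependent
rounding bias), not dav1d's actual C reference; the function name and the
clipping are made up for the example.

    #include <stdint.h>
    #include <stdlib.h>

    /* tmp1/tmp2: two 16-bit intermediate predictions (4 fractional bits
     * at 8 bpc); dst: blended 8-bit pixels; mask: one byte per
     * horizontal pixel pair, as 4:2:2 subsampling requires. */
    static void w_mask_422_scalar(uint8_t *dst, const int16_t *tmp1,
                                  const int16_t *tmp2, uint8_t *mask,
                                  int w, int sign)
    {
        for (int x = 0; x < w; x += 2) {
            int m[2];
            for (int i = 0; i < 2; i++) {
                /* per-pixel weight: 38 + ((|diff| + 8) >> 8), capped at
                 * 64; the asm gets 64 - m in one step via
                 * psubusw(pw_6903, |diff|) >> 8 */
                int d = abs(tmp1[x + i] - tmp2[x + i]);
                m[i] = 38 + ((d + 8) >> 8);
                if (m[i] > 64) m[i] = 64;
                /* weighted blend of the two predictions */
                int px = (tmp1[x + i] * m[i] +
                          tmp2[x + i] * (64 - m[i]) + 512) >> 10;
                dst[x + i] = px < 0 ? 0 : px > 255 ? 255 : px;
            }
            /* one mask byte per pair, biased by the sign argument */
            mask[x >> 1] = (m[0] + m[1] + 1 - sign) >> 1;
        }
    }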

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -63,6 +63,10 @@
                 db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
                 db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
                 db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_422_mask:    db  2,  6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
+                db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+                db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
+                db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
 bilin_h_perm16: db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
                 db  9,  8, 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
                 db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
@@ -133,10 +137,10 @@
 
 wm_420_perm64:  dq 0xfedcba9876543210
 wm_420_sign:    dd 0x01020102, 0x01010101
-wm_sign_avx512: dd 0x40804080, 0x40404040
+wm_422_sign:    dd 0x80808080, 0x7f7f7f7f
+wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040
 
-pb_m64:  times 4 db -64
-pb_64:   times 4 db 64
+pw_m128: times 2 dw -128
 pw_34:   times 2 dw 34
 pw_258:  times 2 dw 258
 pw_512:  times 2 dw 512
@@ -149,6 +153,10 @@
 pd_512:   dd 512
 pd_32768: dd 32768
 
+%define pb_m64 (wm_sign_avx512+4)
+%define pb_64  (wm_sign_avx512+8)
+%define pb_127 (wm_422_sign   +4)
+
 cextern mc_subpel_filters
 cextern mc_warp_filter
 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
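
Note the trick behind the three %defines above: pb_m64, pb_64 and pb_127
are no longer standalone constants. vpbroadcastd only reads 4 bytes at
the resolved address, so they can alias dwords that already exist inside
wm_sign_avx512 (0xc0c0c0c0 is four -64 bytes, 0x40404040 four 64 bytes)
and wm_422_sign (0x7f7f7f7f is four 127 bytes), shaving a few bytes of
rodata. A minimal C demonstration of the aliasing (reusing the asm label
as an array name for clarity):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const uint32_t wm_sign_avx512[3] = { 0x40804080, 0xc0c0c0c0,
                                             0x40404040 };
        int8_t b;
        /* pb_m64 = wm_sign_avx512+4: a byte of 0xc0c0c0c0 */
        memcpy(&b, (const uint8_t *)wm_sign_avx512 + 4, 1);
        printf("pb_m64: %d\n", b);  /* -64 */
        /* pb_64 = wm_sign_avx512+8: a byte of 0x40404040 */
        memcpy(&b, (const uint8_t *)wm_sign_avx512 + 8, 1);
        printf("pb_64: %d\n", b);   /* 64 */
        return 0;
    }
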
@@ -235,6 +243,7 @@
 BIDIR_JMP_TABLE w_avg_avx512icl,           4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE mask_avx512icl,            4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_420_avx512icl,      4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422_avx512icl,      4, 8, 16, 32, 64, 128
 
 SECTION .text
 
@@ -4420,181 +4429,6 @@
     packuswb            m%1, m1
 %endmacro
 
-cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_422_avx2_table
-    lea                  r7, [w_mask_422_avx2_table]
-    tzcnt                wd, wm
-    movifnidn            hd, hm
-    mov               maskq, maskmp
-    movd                xm0, r7m ; sign
-    pxor                 m9, m9
-    movsxd               wq, dword [r7+wq*4]
-    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
-    vpbroadcastd         m7, [base+pw_2048]
-    pmovzxbd            m10, [base+deint_shuf4]
-    add                  wq, r7
-    psrlw               xm8, xm7, 4 ; pw_128
-    psubb               xm8, xm0
-    vpbroadcastb         m8, xm8
-    W_MASK                0, 4, 0, 1
-    lea            stride3q, [strideq*3]
-    jmp                  wq
-.w4:
-    vextracti128        xm1, m0, 1
-    movd   [dstq+strideq*0], xm0
-    pextrd [dstq+strideq*1], xm0, 1
-    movd   [dstq+strideq*2], xm1
-    pextrd [dstq+stride3q ], xm1, 1
-    cmp                  hd, 8
-    jl .w4_end
-    lea                dstq, [dstq+strideq*4]
-    pextrd [dstq+strideq*0], xm0, 2
-    pextrd [dstq+strideq*1], xm0, 3
-    pextrd [dstq+strideq*2], xm1, 2
-    pextrd [dstq+stride3q ], xm1, 3
-    jg .w4_h16
-.w4_end:
-    vextracti128        xm5, m4, 1
-    packuswb            xm4, xm5
-    psubb               xm5, xm8, xm4
-    pavgb               xm5, xm9
-    pshufd              xm5, xm5, q3120
-    mova            [maskq], xm5
-    RET
-.w4_h16:
-    W_MASK                0, 5, 2, 3
-    lea                dstq, [dstq+strideq*4]
-    packuswb             m4, m5
-    psubb                m5, m8, m4
-    pavgb                m5, m9
-    vpermd               m5, m10, m5
-    vextracti128        xm1, m0, 1
-    movd   [dstq+strideq*0], xm0
-    pextrd [dstq+strideq*1], xm0, 1
-    movd   [dstq+strideq*2], xm1
-    pextrd [dstq+stride3q ], xm1, 1
-    lea                dstq, [dstq+strideq*4]
-    pextrd [dstq+strideq*0], xm0, 2
-    pextrd [dstq+strideq*1], xm0, 3
-    pextrd [dstq+strideq*2], xm1, 2
-    pextrd [dstq+stride3q ], xm1, 3
-    mova            [maskq], m5
-    RET
-.w8_loop:
-    add               tmp1q, 32*2
-    add               tmp2q, 32*2
-    W_MASK                0, 4, 0, 1
-    lea                dstq, [dstq+strideq*4]
-    add               maskq, 16
-.w8:
-    vextracti128        xm5, m4, 1
-    vextracti128        xm1, m0, 1
-    packuswb            xm4, xm5
-    psubb               xm5, xm8, xm4
-    pavgb               xm5, xm9
-    pshufd              xm5, xm5, q3120
-    movq   [dstq+strideq*0], xm0
-    movq   [dstq+strideq*1], xm1
-    movhps [dstq+strideq*2], xm0
-    movhps [dstq+stride3q ], xm1
-    mova            [maskq], xm5
-    sub                  hd, 4
-    jg .w8_loop
-    RET
-.w16_loop:
-    add               tmp1q, 32*4
-    add               tmp2q, 32*4
-    W_MASK                0, 4, 0, 1
-    lea                dstq, [dstq+strideq*4]
-    add               maskq, 32
-.w16:
-    vpermq               m0, m0, q3120
-    mova         [dstq+strideq*0], xm0
-    vextracti128 [dstq+strideq*1], m0, 1
-    W_MASK                0, 5, 2, 3
-    packuswb             m4, m5
-    psubb                m5, m8, m4
-    pavgb                m5, m9
-    vpermq               m0, m0, q3120
-    vpermd               m5, m10, m5
-    mova         [dstq+strideq*2], xm0
-    vextracti128 [dstq+stride3q ], m0, 1
-    mova            [maskq], m5
-    sub                  hd, 4
-    jg .w16_loop
-    RET
-.w32_loop:
-    add               tmp1q, 32*4
-    add               tmp2q, 32*4
-    W_MASK                0, 4, 0, 1
-    lea                dstq, [dstq+strideq*2]
-    add               maskq, 32
-.w32:
-    vpermq               m0, m0, q3120
-    mova   [dstq+strideq*0], m0
-    W_MASK                0, 5, 2, 3
-    packuswb             m4, m5
-    psubb                m5, m8, m4
-    pavgb                m5, m9
-    vpermq               m0, m0, q3120
-    vpermd               m5, m10, m5
-    mova   [dstq+strideq*1], m0
-    mova            [maskq], m5
-    sub                  hd, 2
-    jg .w32_loop
-    RET
-.w64_loop:
-    add               tmp1q, 32*4
-    add               tmp2q, 32*4
-    W_MASK                0, 4, 0, 1
-    add                dstq, strideq
-    add               maskq, 32
-.w64:
-    vpermq               m0, m0, q3120
-    mova        [dstq+32*0], m0
-    W_MASK                0, 5, 2, 3
-    packuswb             m4, m5
-    psubb                m5, m8, m4
-    pavgb                m5, m9
-    vpermq               m0, m0, q3120
-    vpermd               m5, m10, m5
-    mova        [dstq+32*1], m0
-    mova            [maskq], m5
-    dec                  hd
-    jg .w64_loop
-    RET
-.w128_loop:
-    add               tmp1q, 32*8
-    add               tmp2q, 32*8
-    W_MASK                0, 4, 0, 1
-    add                dstq, strideq
-    add               maskq, 32*2
-.w128:
-    vpermq               m0, m0, q3120
-    mova        [dstq+32*0], m0
-    W_MASK                0, 5, 2, 3
-    packuswb             m4, m5
-    psubb                m5, m8, m4
-    pavgb                m5, m9
-    vpermq               m0, m0, q3120
-    vpermd               m5, m10, m5
-    mova        [dstq+32*1], m0
-    mova       [maskq+32*0], m5
-    W_MASK                0, 4, 4, 5
-    vpermq               m0, m0, q3120
-    mova        [dstq+32*2], m0
-    W_MASK                0, 5, 6, 7
-    packuswb             m4, m5
-    psubb                m5, m8, m4
-    pavgb                m5, m9
-    vpermq               m0, m0, q3120
-    vpermd               m5, m10, m5
-    mova        [dstq+32*3], m0
-    mova       [maskq+32*1], m5
-    dec                  hd
-    jg .w128_loop
-    RET
-
 cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
 %define base r7-w_mask_444_avx2_table
     lea                  r7, [w_mask_444_avx2_table]
@@ -5486,6 +5320,179 @@
     jg .w128_loop
     RET
 
+cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+    lea                  r7, [w_mask_422_avx2_table]
+    tzcnt                wd, wm
+    mov                 r6d, r7m ; sign
+    movifnidn            hd, hm
+    pxor                 m9, m9
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    pmovzxbd            m10, [base+deint_shuf4]
+    vpbroadcastd         m8, [base+wm_422_sign+r6*4] ; 128 - sign
+    add                  wq, r7
+    mov               maskq, maskmp
+    W_MASK                0, 4, 0, 1
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    cmp                  hd, 8
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    jg .w4_h16
+.w4_end:
+    vextracti128        xm5, m4, 1
+    packuswb            xm4, xm5
+    psubb               xm5, xm8, xm4
+    pavgb               xm5, xm9
+    pshufd              xm5, xm5, q3120
+    mova            [maskq], xm5
+    RET
+.w4_h16:
+    W_MASK                0, 5, 2, 3
+    lea                dstq, [dstq+strideq*4]
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermd               m5, m10, m5
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xm1
+    pextrd [dstq+stride3q ], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xm1, 2
+    pextrd [dstq+stride3q ], xm1, 3
+    mova            [maskq], m5
+    RET
+.w8_loop:
+    add               tmp1q, 32*2
+    add               tmp2q, 32*2
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 16
+.w8:
+    vextracti128        xm5, m4, 1
+    vextracti128        xm1, m0, 1
+    packuswb            xm4, xm5
+    psubb               xm5, xm8, xm4
+    pavgb               xm5, xm9
+    pshufd              xm5, xm5, q3120
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xm1
+    mova            [maskq], xm5
+    sub                  hd, 4
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 32
+.w16:
+    vpermq               m0, m0, q3120
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova         [dstq+strideq*2], xm0
+    vextracti128 [dstq+stride3q ], m0, 1
+    mova            [maskq], m5
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 0, 1
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 32
+.w32:
+    vpermq               m0, m0, q3120
+    mova   [dstq+strideq*0], m0
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova   [dstq+strideq*1], m0
+    mova            [maskq], m5
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64_loop:
+    add               tmp1q, 32*4
+    add               tmp2q, 32*4
+    W_MASK                0, 4, 0, 1
+    add                dstq, strideq
+    add               maskq, 32
+.w64:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova        [dstq+32*1], m0
+    mova            [maskq], m5
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128_loop:
+    add               tmp1q, 32*8
+    add               tmp2q, 32*8
+    W_MASK                0, 4, 0, 1
+    add                dstq, strideq
+    add               maskq, 32*2
+.w128:
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*0], m0
+    W_MASK                0, 5, 2, 3
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova        [dstq+32*1], m0
+    mova       [maskq+32*0], m5
+    W_MASK                0, 4, 4, 5
+    vpermq               m0, m0, q3120
+    mova        [dstq+32*2], m0
+    W_MASK                0, 5, 6, 7
+    packuswb             m4, m5
+    psubb                m5, m8, m4
+    pavgb                m5, m9
+    vpermq               m0, m0, q3120
+    vpermd               m5, m10, m5
+    mova        [dstq+32*3], m0
+    mova       [maskq+32*1], m5
+    dec                  hd
+    jg .w128_loop
+    RET
+
 INIT_ZMM avx512icl
 PREP_BILIN
 PREP_8TAP
@@ -5504,7 +5511,7 @@
     vpbroadcastd         m7, [base+pw_2048]
     vpbroadcastd         m9, [base+pb_m64]             ; -1 << 6
     mova               ym10, [base+wm_420_mask+32]
-    vpbroadcastd         m8, [base+wm_sign_avx512+r6*4] ; (258 - sign) << 6
+    vpbroadcastd         m8, [base+wm_sign_avx512+r6*8] ; (258 - sign) << 6
     add                  wq, r7
     mov               maskq, maskmp
     lea            stride3q, [strideq*3]
@@ -5683,6 +5690,179 @@
     mova [dstq+strideq*1+64*1], m1
     lea                dstq, [dstq+strideq*2]
     sub                  hd, 2
+    jg .w128_loop
+    RET
+
+cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+    lea                  r7, [w_mask_422_avx512icl_table]
+    tzcnt                wd, wm
+    mov                 r6d, r7m ; sign
+    movifnidn            hd, hm
+    movsxd               wq, dword [r7+wq*4]
+    vpbroadcastd         m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+    vpbroadcastd         m7, [base+pw_2048]
+    vpbroadcastd         m9, [base+pw_m128]
+    mova                m10, [base+wm_422_mask]
+    vpbroadcastd        m11, [base+pb_127]
+    add                  wq, r7
+    vpbroadcastd         m8, [base+wm_sign_avx512+4+r6*4]
+    mov               maskq, maskmp
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.w4:
+    cmp                  hd, 8
+    jg .w4_h16
+    WRAP_YMM W_MASK       0, 4, 0, 1
+    movhps             xm10, [wm_422_mask+16]
+    vpdpwssd            ym8, ym4, ym9
+    vpermb              ym8, ym10, ym8
+    vextracti128       xmm1, m0, 1
+    movd   [dstq+strideq*0], xm0
+    pextrd [dstq+strideq*1], xm0, 1
+    movd   [dstq+strideq*2], xmm1
+    pextrd [dstq+stride3q ], xmm1, 1
+    jl .w4_end
+    lea                dstq, [dstq+strideq*4]
+    pextrd [dstq+strideq*0], xm0, 2
+    pextrd [dstq+strideq*1], xm0, 3
+    pextrd [dstq+strideq*2], xmm1, 2
+    pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+    pand                xm8, xm11
+    mova            [maskq], xm8
+    RET
+.w4_h16:
+    vpbroadcastd         m5, strided
+    pmulld               m5, [bidir_sctr_w4]
+    W_MASK                0, 4, 0, 1
+    vpdpwssd             m8, m4, m9
+    kxnorw               k1, k1, k1
+    vpermb               m8, m10, m8
+    pand                ym8, ym11
+    mova            [maskq], ym8
+    vpscatterdd [dstq+m5]{k1}, m0
+    RET
+.w8:
+    cmp                  hd, 4
+    jne .w8_h8
+    WRAP_YMM W_MASK       0, 4, 0, 1
+    movhps             xm10, [wm_422_mask+16]
+    vpdpwssd            ym8, ym4, ym9
+    vpermb              ym8, ym10, ym8
+    pand                xm8, xm11
+    mova            [maskq], xm8
+    vextracti128       xmm1, ym0, 1
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xm0
+    movhps [dstq+stride3q ], xmm1
+    RET
+.w8_loop:
+    add               tmp1q, 128
+    add               tmp2q, 128
+    add               maskq, 32
+    lea                dstq, [dstq+strideq*4]
+.w8_h8:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    vpermb               m1, m10, m1
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    vextracti32x4      xmm1, ym0, 1
+    vextracti32x4      xmm2, m0, 2
+    vextracti32x4      xmm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*1], xmm1
+    movq   [dstq+strideq*2], xmm2
+    movq   [dstq+stride3q ], xmm3
+    lea                dstq, [dstq+strideq*4]
+    movhps [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xmm1
+    movhps [dstq+strideq*2], xmm2
+    movhps [dstq+stride3q ], xmm3
+    sub                  hd, 8
+    jg .w8_loop
+    RET
+.w16_loop:
+    add               tmp1q, 128
+    add               tmp2q, 128
+    add               maskq, 32
+    lea                dstq, [dstq+strideq*4]
+.w16:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    vpermb               m1, m10, m1
+    vpermq               m0, m0, q3120
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    mova          [dstq+strideq*0], xm0
+    vextracti32x4 [dstq+strideq*1], m0, 2
+    vextracti32x4 [dstq+strideq*2], ym0, 1
+    vextracti32x4 [dstq+stride3q ], m0, 3
+    sub                  hd, 4
+    jg .w16_loop
+    RET
+.w32:
+    pmovzxbq             m5, [warp_8x8_shufA]
+.w32_loop:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermb               m1, m10, m1
+    vpermq               m0, m5, m0
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    add               maskq, 32
+    mova          [dstq+strideq*0], ym0
+    vextracti32x8 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32_loop
+    RET
+.w64:
+    pmovzxbq             m5, [warp_8x8_shufA]
+.w64_loop:
+    W_MASK                0, 4, 0, 1
+    mova                 m1, m8
+    vpdpwssd             m1, m4, m9
+    add               tmp1q, 128
+    add               tmp2q, 128
+    vpermb               m1, m10, m1
+    vpermq               m0, m5, m0
+    pand                ym1, ym11
+    mova            [maskq], ym1
+    add               maskq, 32
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+.w128:
+    pmovzxbq            m13, [warp_8x8_shufA]
+.w128_loop:
+    W_MASK                0, 4, 0, 1
+    W_MASK               12, 5, 2, 3
+    mova                 m2, m8
+    vpdpwssd             m2, m4, m9
+    mova                 m3, m8
+    vpdpwssd             m3, m5, m9
+    add               tmp1q, 256
+    add               tmp2q, 256
+    vpermt2b             m2, m10, m3
+    vpermq               m0, m13, m0
+    vpermq               m1, m13, m12
+    pand                 m2, m11
+    mova            [maskq], m2
+    add               maskq, 64
+    mova        [dstq+64*0], m0
+    mova        [dstq+64*1], m1
+    add                dstq, strideq
+    dec                  hd
     jg .w128_loop
     RET
 
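Compared to the relocated AVX2 version above, the avx512icl function
finishes the mask differently: it folds the per-pair weight sums into a
sign-dependent dword bias with vpdpwssd against pw_m128, deinterleaves
the mask bytes with vpermb through wm_422_mask, and clears the top bit of
each byte with pb_127. It also uses WRAP_YMM to run the 512-bit W_MASK
macro at 256-bit width for the small-block cases and vpscatterdd for the
tall w4 path. The widened wm_sign_avx512 table is why the 4:2:0
function's broadcast switches from r6*4 to r6*8 indexing in the hunk
above.

The AVX2 finishing sequence can be sanity-checked in scalar code. The
check below assumes only that W_MASK hands the mask code pair sums
s = (64 - m0) + (64 - m1) with each weight clamped to 38..64, which is
what the pw_6903 constant implies; the helper name is made up:

    #include <assert.h>

    /* Model of the AVX2 finishing: psubb from (128 - sign), then pavgb
     * with zero. Bytes wrap modulo 256; pavgb is unsigned (a+b+1)>>1. */
    static unsigned char mask422_finish(int s, int sign) {
        unsigned char d = (unsigned char)(128 - sign - s); /* psubb */
        return (unsigned char)((d + 1) >> 1);              /* pavgb(d, 0) */
    }

    int main(void) {
        for (int sign = 0; sign < 2; sign++)
            for (int m0 = 38; m0 <= 64; m0++)
                for (int m1 = 38; m1 <= 64; m1++) {
                    int s = (64 - m0) + (64 - m1);
                    assert(mask422_finish(s, sign) ==
                           (m0 + m1 + 1 - sign) >> 1);
                }
        return 0;
    }
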
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -92,6 +92,7 @@
 decl_w_mask_fn(dav1d_w_mask_420_avx512icl);
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
 decl_w_mask_fn(dav1d_w_mask_420_ssse3);
+decl_w_mask_fn(dav1d_w_mask_422_avx512icl);
 decl_w_mask_fn(dav1d_w_mask_422_avx2);
 decl_w_mask_fn(dav1d_w_mask_444_avx2);
 decl_blend_fn(dav1d_blend_avx2);
@@ -237,6 +238,7 @@
     c->avg = dav1d_avg_avx512icl;
     c->w_avg = dav1d_w_avg_avx512icl;
     c->mask = dav1d_mask_avx512icl;
+    c->w_mask[1] = dav1d_w_mask_422_avx512icl;
     c->w_mask[2] = dav1d_w_mask_420_avx512icl;
 #endif
 #endif
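
On the C side, the new function only needs a prototype and one init-table
entry. The w_mask array in the MC DSP context is indexed by chroma
layout, 4:4:4 / 4:2:2 / 4:2:0 in slots 0 / 1 / 2, which is why 4:2:2
lands in w_mask[1] beside the existing w_mask[2] assignment for 4:2:0. A
sketch of the slot selection from subsampling flags (the helper is
illustrative, not dav1d API):

    /* 4:4:4 -> 0 (no subsampling), 4:2:2 -> 1 (horizontal only),
     * 4:2:0 -> 2 (horizontal and vertical) */
    static inline int w_mask_idx(int ss_hor, int ss_ver) {
        return ss_hor + ss_ver;
    }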