ref: 1d16d38d4ee4dccb2a3c9dad2f5be55a2236bd27
dir: /src/x86/cdef_avx512.asm
; Copyright © 2020, VideoLAN and dav1d authors
; Copyright © 2020, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "ext/x86/x86inc.asm"

%if HAVE_AVX512ICL && ARCH_X86_64

%macro DUP4 1-*
    %rep %0
        times 4 db %1
        %rotate 1
    %endrep
%endmacro

%macro DIRS 16 ; cdef_directions[]
    %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1
        ; masking away unused bits allows us to use a single vpaddd {1to16}
        ; instruction instead of having to do vpbroadcastd + paddb
        db %13 & 0x3f, -%13 & 0x3f
        %rotate 1
    %endrep
%endmacro

SECTION_RODATA 64

lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
               db 96, 97,  0,  1,  2,  3,  4,  5, 98, 99,  8,  9, 10, 11, 12, 13
lut_perm_4x8b: db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
               db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
               db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
pd_01234567:   dd  0,  1,  2,  3,  4,  5,  6,  7
lut_perm_8x8a: db  0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
               db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55
               db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87
               db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119
lut_perm_8x8b: db  4,  5,  6,  7,  8,  9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27
               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
               db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91
               db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123
edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
px_idx:        DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
cdef_dirs:     DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15
gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
               times 16 db 0 ; realign (introduced by cdef_dirs)
end_perm_w8clip: db 0,  4,  8, 12,  2,  6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
               db  1,  5,  9, 13,  3,  7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
end_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
sec_tap:       db 32, 32, 16, 16
pd_268435568:  dd 268435568
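
; A short note on how the constants above are used (a sketch inferred from
; the comments at their points of use, not an authoritative description):
;  * gf_shr holds one GF(2^8) affine matrix per shift amount; gf2p8affineqb
;    with the qword at [gf_shr + (damping - 30)*8 + lzcnt(strength)*8]
;    right-shifts every byte by the per-block "shift", and the duplicated
;    ">> 0" entry appears to clamp the shift at zero for small dampings.
;  * pri_tap/sec_tap are the CDEF tap weights scaled by 16 ("left-shifted
;    by 4"): (4,2) or (3,3) for the primary taps depending on the low bit
;    of pri_strength, and (2,1) for the secondary taps.
;  * pd_268435568 = (1 << 28) + (7 << 4) seeds every dword accumulator so
;    that the rounding offset and the sign correction can later be folded
;    into a single unsigned extraction (see the note after the body of
;    cdef_filter_4x4 below).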
SECTION .text

%if WIN64
DECLARE_REG_TMP 5, 6
%else
DECLARE_REG_TMP 8, 5
%endif

; lut:
; t0 t1 t2 t3 t4 t5 t6 t7
; T0 T1 T2 T3 T4 T5 T6 T7
; L0 L1 00 01 02 03 04 05
; L2 L3 10 11 12 13 14 15
; L4 L5 20 21 22 23 24 25
; L6 L7 30 31 32 33 34 35
; 4e 4f 40 41 42 43 44 45
; 5e 5f 50 51 52 53 54 55

INIT_ZMM avx512icl
cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
%define base r7-edge_mask
    movq xmm0, [dstq+strideq*0]
    movhps xmm0, [dstq+strideq*1]
    lea r7, [edge_mask]
    movq xmm1, [topq+strideq*0-2]
    movhps xmm1, [topq+strideq*1-2]
    mov r6d, edgem
    vinserti32x4 ym0, ymm0, [leftq], 1
    lea r2, [strideq*3]
    vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1
    mova m5, [base+lut_perm_4x4]
    vinserti32x4 m0, [dstq+r2], 2
    test r6b, 0x08 ; avoid buffer overread
    jz .main
    lea r3, [dstq+strideq*4-4]
    vinserti32x4 m1, [r3+strideq*0], 2
    vinserti32x4 m0, [r3+strideq*1], 3
.main:
    movifnidn prid, prim
    mov t0d, dirm
    mova m3, [base+px_idx]
    mov r3d, dampingm
    vpermi2b m5, m0, m1 ; lut
    vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor m7, m7
    lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb m6, m3, m5 ; px
    cmp r6d, 0x0f
    jne .mask_edges ; mask edges only if required
    test prid, prid
    jz .sec_only
    vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1
%macro CDEF_FILTER_4x4_PRI 0
    vpcmpub k1, m6, m1, 6 ; px > pN
    psubb m2, m1, m6
    lzcnt r6d, prid
    vpsubb m2{k1}, m6, m1 ; abs(diff)
    vpbroadcastb m4, prid
    and prid, 1
    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
    movifnidn t1d, secm
    vpbroadcastd m10, [base+pri_tap+priq*4]
    vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap)
    psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))
    pminub m2, m4
    vpdpbusd m0, m2, m10 ; sum
%endmacro
    CDEF_FILTER_4x4_PRI
    test t1d, t1d ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub m4, m6, m1
    pmaxub m1, m6
    pminub m5, m2, m3
    pmaxub m2, m3
    pminub m4, m5
    pmaxub m2, m1
    psrldq m1, m4, 2
    psrldq m3, m2, 2
    pminub m1, m4
    vpcmpw k1, m0, m7, 1
    vpshldd m6, m0, 8
    pmaxub m2, m3
    pslldq m3, m1, 1
    psubw m7, m0
    paddusw m0, m6 ; clip >0xff
    vpsubusw m0{k1}, m6, m7 ; clip <0x00
    pslldq m4, m2, 1
    pminub m1, m3
    pmaxub m2, m4
    pmaxub m0, m1
    pminub m0, m2
    jmp .end
.sec_only:
    movifnidn t1d, secm
    call .sec
.end_no_clip:
    vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4)
    paddw m0, m6      ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
.end:
    mova xm1, [base+end_perm]
    vpermb m0, m1, m0 ; output in bits 8-15 of each dword
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2
    pextrd [dstq+r2], xm0, 3
    RET
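
; Rounding sketch for the .end_no_clip / .end path above (derived from the
; inline comments; treat it as an annotation rather than a specification):
; the taps are pre-scaled by 16, so each dword holds (1 << 28) + ((sum + 7) << 4)
; after vpdpbusd, and vpshldd folds px in so that bits 8-15 of the result
; are px + ((sum + (sum > -8) + 7) >> 4). That matches the usual CDEF
; rounding px + ((sum - (sum < 0) + 8) >> 4), since both shifts agree for
; sum >= 0, for sum <= -8, and for -7 <= sum <= -1 (where each is 0).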
.mask_edges_sec_only:
    movifnidn t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    vpbroadcastq m8, [base+edge_mask+r6*8]
    test prid, prid
    jz .mask_edges_sec_only
    vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb k1, m8, m2 ; index in-range
    mova m1, m6
    vpermb m1{k1}, m2, m5
    CDEF_FILTER_4x4_PRI
    test t1d, t1d
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb k1, m8, m4
    mova m2, m6
    vpermb m2{k1}, m4, m5
    vpshufbitqmb k1, m8, m9
    mova m3, m6
    vpermb m3{k1}, m9, m5
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16}     ; dir - 2
    vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
    vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
.sec_main:
    vpbroadcastd m8, [base+sec_tap]
    vpcmpub k1, m6, m2, 6
    psubb m4, m2, m6
    vpbroadcastb m12, t1d
    lzcnt t1d, t1d
    vpsubb m4{k1}, m6, m2
    vpcmpub k2, m6, m3, 6
    vpbroadcastq m11, [r3+t1*8]
    gf2p8affineqb m10, m4, m11, 0
    psubb m5, m3, m6
    mova m9, m8
    vpsubb m8{k1}, m7, m8
    psubusb m10, m12, m10
    vpsubb m5{k2}, m6, m3
    pminub m4, m10
    vpdpbusd m0, m4, m8
    gf2p8affineqb m11, m5, m11, 0
    vpsubb m9{k2}, m7, m9
    psubusb m12, m11
    pminub m5, m12
    vpdpbusd m0, m5, m9
    ret
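
; How the .mask_edges path above handles frame borders (a sketch, inferred
; from the code rather than stated by it): the 4-bit edge flags select an
; edge_mask entry whose set bits mark the lut bytes that lie inside the
; visible area. vpshufbitqmb tests each tap's lut index against that mask,
; and the merge-masked vpermb starts from px, so any out-of-range tap keeps
; the value of px itself; its difference is then zero and it contributes
; nothing to the sum, which matches treating missing pixels as unavailable.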
DECLARE_REG_TMP 2, 7

;          lut top                       lut bottom
; t0 t1 t2 t3 t4 t5 t6 t7    L4 L5 20 21 22 23 24 25
; T0 T1 T2 T3 T4 T5 T6 T7    L6 L7 30 31 32 33 34 35
; L0 L1 00 01 02 03 04 05    L8 L9 40 41 42 43 44 45
; L2 L3 10 11 12 13 14 15    La Lb 50 51 52 53 54 55
; L4 L5 20 21 22 23 24 25    Lc Ld 60 61 62 63 64 65
; L6 L7 30 31 32 33 34 35    Le Lf 70 71 72 73 74 75
; L8 L9 40 41 42 43 44 45    8e 8f 80 81 82 83 84 85
; La Lb 50 51 52 53 54 55    9e 9f 90 91 92 93 94 95

cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
                         pri, sec, dir, damping, edge
%define base r8-edge_mask
    vpbroadcastd ym21, strided
    mov r6d, edgem
    lea r8, [edge_mask]
    movq xm1, [topq+strideq*0-2]
    pmulld ym21, [base+pd_01234567]
    kxnorb k1, k1, k1
    movq xm2, [topq+strideq*1-2]
    vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
    mova m14, [base+lut_perm_4x8a]
    movu m15, [base+lut_perm_4x8b]
    test r6b, 0x08 ; avoid buffer overread
    jz .main
    lea r7, [dstq+strideq*8-2]
    vinserti32x4 ym1, [r7+strideq*0], 1
    vinserti32x4 ym2, [r7+strideq*1], 1
.main:
    punpcklqdq ym1, ym2
    vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____
    movifnidn prid, prim
    mov t0d, dirm
    mova m16, [base+px_idx]
    mov r3d, dampingm
    vpermi2b m14, m0, m1 ; lut top
    vpermi2b m15, m0, m1 ; lut bottom
    vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor m20, m20
    lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb m2, m16, m14 ; pxt
    vpermb m3, m16, m15 ; pxb
    mova m1, m0
    cmp r6b, 0x0f
    jne .mask_edges ; mask edges only if required
    test prid, prid
    jz .sec_only
    vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
    vpermb m5, m6, m15 ; pNb
%macro CDEF_FILTER_4x8_PRI 0
    vpcmpub k1, m2, m4, 6 ; pxt > pNt
    vpcmpub k2, m3, m5, 6 ; pxb > pNb
    psubb m6, m4, m2
    psubb m7, m5, m3
    lzcnt r6d, prid
    vpsubb m6{k1}, m2, m4 ; abs(diff_top)
    vpsubb m7{k2}, m3, m5 ; abs(diff_bottom)
    vpbroadcastb m13, prid
    vpbroadcastq m9, [r3+r6*8]
    and prid, 1
    vpbroadcastd m11, [base+pri_tap+priq*4]
    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
    mova m10, m11
    movifnidn t1d, secm
    vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
    vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
    psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))
    psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))
    pminub m6, m12
    pminub m7, m13
    vpdpbusd m0, m6, m10 ; sum top
    vpdpbusd m1, m7, m11 ; sum bottom
%endmacro
    CDEF_FILTER_4x8_PRI
    test t1d, t1d ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub m10, m4, m2
    pminub m12, m6, m8
    pminub m11, m5, m3
    pminub m13, m7, m9
    pmaxub m4, m2
    pmaxub m6, m8
    pmaxub m5, m3
    pmaxub m7, m9
    pminub m10, m12
    pminub m11, m13
    pmaxub m4, m6
    pmaxub m5, m7
    mov r2d, 0xAAAAAAAA
    kmovd k1, r2d
    kxnorb k2, k2, k2          ;   hw      lw
    vpshrdd m12, m0, m1, 16    ; m1lw    m0hw
    vpshrdd m6, m10, m11, 16   ; m11lw   m10hw
    vpshrdd m8, m4, m5, 16     ; m5lw    m4hw
    vpblendmw m7{k1}, m10, m11 ; m11hw   m10lw
    vpblendmw m9{k1}, m4, m5   ; m5hw    m4lw
    vpblendmw m4{k1}, m0, m12  ; m1lw    m0lw
    vpblendmw m5{k1}, m12, m1  ; m1hw    m0hw
    vpshrdd m2, m3, 16
    pminub m6, m7
    pmaxub m8, m9
    mova ym14, [base+end_perm]
    vpcmpw k1, m4, m20, 1
    vpshldw m2, m5, 8
    pslldq m7, m6, 1
    pslldq m9, m8, 1
    psubw m5, m20, m4
    paddusw m0, m4, m2 ; clip >0xff
    pminub m6, m7
    pmaxub m8, m9
    psubusw m0{k1}, m2, m5 ; clip <0x00
    pmaxub m0, m6
    pminub m0, m8
    vpermb m0, m14, m0
    vpscatterdd [dstq+ym21]{k2}, ym0
    RET
.sec_only:
    movifnidn t1d, secm
    call .sec
.end_no_clip:
    mova ym4, [base+end_perm]
    kxnorb k1, k1, k1
    vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
    vpshldd m3, m1, 8
    paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw m1, m3
    pslld m0, 16
    vpshrdd m0, m1, 16
    vpermb m0, m4, m0 ; output in bits 8-15 of each word
    vpscatterdd [dstq+ym21]{k1}, ym0
    RET
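
; A note on the addressing scheme used above (a sketch; register roles are
; taken from the inline comments): ym21 is built as stride * {0..7} via
; pd_01234567, so a single vpgatherdq fetches one qword from each of the
; eight rows and vpscatterdd later writes the eight filtered rows back
; through the same offsets. The top and bottom 4x4 halves are filtered in
; parallel, m0 accumulating the top sums and m1 the bottom sums, and
; .end_clip interleaves the two with vpshrdd/vpblendmw before the store.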
.mask_edges_sec_only:
    movifnidn t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov t1d, r6d
    or r6d, 8 ; top 4x4 has bottom
    or t1d, 4 ; bottom 4x4 has top
    vpbroadcastq m17, [base+edge_mask+r6*8]
    vpbroadcastq m18, [base+edge_mask+t1*8]
    test prid, prid
    jz .mask_edges_sec_only
    vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb k1, m17, m6 ; index in-range
    vpshufbitqmb k2, m18, m6
    mova m4, m2
    mova m5, m3
    vpermb m4{k1}, m6, m14
    vpermb m5{k2}, m6, m15
    CDEF_FILTER_4x8_PRI
    test t1d, t1d
    jz .end_no_clip
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb k1, m17, m10
    vpshufbitqmb k2, m18, m10
    vpshufbitqmb k3, m17, m11
    vpshufbitqmb k4, m18, m11
    mova m6, m2
    mova m7, m3
    mova m8, m2
    mova m9, m3
    vpermb m6{k1}, m10, m14
    vpermb m7{k2}, m10, m15
    vpermb m8{k3}, m11, m14
    vpermb m9{k4}, m11, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
    vpermb m7, m8, m15 ; pNb
    vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
    vpermb m9, m9, m15 ; pNb
.sec_main:
    vpbroadcastb m18, t1d
    lzcnt t1d, t1d
    vpcmpub k1, m2, m6, 6
    vpcmpub k2, m3, m7, 6
    vpcmpub k3, m2, m8, 6
    vpcmpub k4, m3, m9, 6
    vpbroadcastq m17, [r3+t1*8]
    psubb m10, m6, m2
    psubb m11, m7, m3
    psubb m12, m8, m2
    psubb m13, m9, m3
    vpsubb m10{k1}, m2, m6 ; abs(dt0)
    vpsubb m11{k2}, m3, m7 ; abs(db0)
    vpsubb m12{k3}, m2, m8 ; abs(dt1)
    vpsubb m13{k4}, m3, m9 ; abs(db1)
    vpbroadcastd m19, [base+sec_tap]
    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
    psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))
    psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))
    psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))
    psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))
    pminub m10, m14
    pminub m11, m15
    pminub m12, m16
    pminub m13, m17
    mova m14, m19
    mova m15, m19
    mova m16, m19
    vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
    vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
    vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
    vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
    vpdpbusd m0, m10, m14
    vpdpbusd m1, m11, m15
    vpdpbusd m0, m12, m16
    vpdpbusd m1, m13, m19
    ret
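
; The .end_clip path used by these functions (a reading of the pmin/pmax
; chains, taken whenever the secondary filter contributes): alongside the
; sums it tracks, per pixel, the minimum and maximum of px and every tap
; that was read, then clamps the filtered result into that [min, max]
; range; the saturating "clip >0xff" / "clip <0x00" steps keep the
; intermediate 16-bit math from wrapping before that clamp is applied.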
;            lut tl                       lut tr
; t0 t1 t2 t3 t4 t5 t6 t7    t6 t7 t8 t9 ta tb tc td
; T0 T1 T2 T3 T4 T5 T6 T7    T6 T7 T8 T9 TA TB TC TD
; L0 L1 00 01 02 03 04 05    04 05 06 07 08 09 0a 0b
; L2 L3 10 11 12 13 14 15    14 15 16 17 18 19 1a 1b
; L4 L5 20 21 22 23 24 25    24 25 26 27 28 29 2a 2b
; L6 L7 30 31 32 33 34 35    34 35 36 37 38 39 3a 3b
; L8 L9 40 41 42 43 44 45    44 45 46 47 48 49 4a 4b
; La Lb 50 51 52 53 54 55    54 55 56 57 58 59 5a 5b
;            lut bl                       lut br
; L4 L5 20 21 22 23 24 25    24 25 26 27 28 29 2a 2b
; L6 L7 30 31 32 33 34 35    34 35 36 37 38 39 3a 3b
; L8 L9 40 41 42 43 44 45    44 45 46 47 48 49 4a 4b
; La Lb 50 51 52 53 54 55    54 55 56 57 58 59 5a 5b
; Lc Ld 60 61 62 63 64 65    64 65 66 67 68 69 6a 6b
; Le Lf 70 71 72 73 74 75    74 75 76 77 78 79 7a 7b
; 8e 8f 80 81 82 83 84 85    84 85 86 87 88 89 8a 8b
; 9e 9f 90 91 92 93 94 95    94 95 96 97 98 99 9a 9b

cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
                         pri, sec, dir, damping, edge
%define base r8-edge_mask
    mov r6d, edgem
    lea r10, [dstq+strideq*4-2]
    movu xmm0, [topq+strideq*0-2]
    movu xmm1, [dstq+strideq*2-2]
    movu xmm2, [r10+strideq*2]
    lea r8, [edge_mask]
    lea r9, [strideq*3]
    pmovzxwq m10, [leftq-4]
    vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1
    vinserti32x4 ym1, ymm1, [dstq+r9-2], 1
    vinserti32x4 ym2, ymm2, [r10+r9], 1
    lea r7, [r10+strideq*4]
    pmovzxwq m11, [leftq+4]
    vinserti32x4 m0, [dstq+strideq*0-2], 2
    vinserti32x4 m1, [r10+strideq*0], 2
    mova m12, [base+lut_perm_8x8a]
    movu m13, [base+lut_perm_8x8b]
    vinserti32x4 m0, [dstq+strideq*1-2], 3
    vinserti32x4 m1, [r10+strideq*1], 3
    test r6b, 0x08 ; avoid buffer overread
    jz .main
    vinserti32x4 m2, [r7+strideq*0], 2
    vinserti32x4 m2, [r7+strideq*1], 3
.main:
    mov t1d, 0x11111100
    mova m14, m12
    mova m15, m13
    kmovd k1, t1d
    kshiftrd k2, k1, 8
    movifnidn prid, prim
    mov t0d, dirm
    mova m30, [base+px_idx]
    mov r3d, dampingm
    vpermi2b m12, m0, m1 ; lut tl
    vpermi2b m14, m1, m2 ; lut bl
    vpermi2b m13, m0, m1 ; lut tr
    vpermi2b m15, m1, m2 ; lut br
    vpblendmw m12{k1}, m12, m10
    vpblendmw m14{k2}, m14, m11
    vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
    pxor m31, m31
    lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
    vpermb m4, m30, m12 ; pxtl
    vpermb m5, m30, m13 ; pxtr
    vpermb m6, m30, m14 ; pxbl
    vpermb m7, m30, m15 ; pxbr
    mova m1, m0
    mova m2, m0
    mova m3, m0
    cmp r6b, 0x0f
    jne .mask_edges ; mask edges only if required
    test prid, prid
    jz .sec_only
    vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
    vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
    vpermb m9, m11, m13 ; pNtr
    vpermb m10, m11, m14 ; pNbl
    vpermb m11, m11, m15 ; pNbr
%macro CDEF_FILTER_8x8_PRI 0
    vpcmpub k1, m4, m8, 6 ; pxtl > pNtl
    vpcmpub k2, m5, m9, 6 ; pxtr > pNtr
    vpcmpub k3, m6, m10, 6 ; pxbl > pNbl
    vpcmpub k4, m7, m11, 6 ; pxbr > pNbr
    psubb m16, m8, m4
    psubb m17, m9, m5
    psubb m18, m10, m6
    psubb m19, m11, m7
    lzcnt r6d, prid
    vpsubb m16{k1}, m4, m8 ; abs(diff_tl)
    vpsubb m17{k2}, m5, m9 ; abs(diff_tr)
    vpsubb m18{k3}, m6, m10 ; abs(diff_bl)
    vpsubb m19{k4}, m7, m11 ; abs(diff_br)
    vpbroadcastq m28, [r3+r6*8]
    vpbroadcastb m29, prid
    and prid, 1
    vpbroadcastd m27, [base+pri_tap+priq*4]
    vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
    vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
    vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
    vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
    mova m24, m27
    mova m25, m27
    mova m26, m27
    movifnidn t1d, secm
    vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
    vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
    vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_bl)
    vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_br)
    psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))
    psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))
    psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))
    psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))
    pminub m16, m20
    pminub m17, m21
    pminub m18, m22
    pminub m19, m23
    vpdpbusd m0, m16, m24 ; sum tl
    vpdpbusd m1, m17, m25 ; sum tr
    vpdpbusd m2, m18, m26 ; sum bl
    vpdpbusd m3, m19, m27 ; sum br
%endmacro
    CDEF_FILTER_8x8_PRI
    test t1d, t1d ; sec
    jz .end_no_clip
    call .sec
.end_clip:
    pminub m20, m8, m4
    pminub m24, m12, m16
    pminub m21, m9, m5
    pminub m25, m13, m17
    pminub m22, m10, m6
    pminub m26, m14, m18
    pminub m23, m11, m7
    pminub m27, m15, m19
    pmaxub m8, m4
    pmaxub m12, m16
    pmaxub m9, m5
    pmaxub m13, m17
    pmaxub m10, m6
    pmaxub m14, m18
    pmaxub m11, m7
    pmaxub m15, m19
    pminub m20, m24
    pminub m21, m25
    pminub m22, m26
    pminub m23, m27
    pmaxub m8, m12
    pmaxub m9, m13
    pmaxub m10, m14
    pmaxub m11, m15
    mov r2d, 0xAAAAAAAA
    kmovd k1, r2d
    vpshrdd m24, m0, m1, 16
    vpshrdd m25, m2, m3, 16
    vpshrdd m12, m20, m21, 16
    vpshrdd m14, m22, m23, 16
    vpshrdd m16, m8, m9, 16
    vpshrdd m18, m10, m11, 16
    vpblendmw m13{k1}, m20, m21
    vpblendmw m15{k1}, m22, m23
    vpblendmw m17{k1}, m8, m9
    vpblendmw m19{k1}, m10, m11
    vpblendmw m20{k1}, m0, m24
    vpblendmw m21{k1}, m24, m1
    vpblendmw m22{k1}, m2, m25
    vpblendmw m23{k1}, m25, m3
    vpshrdd m4, m5, 16
    vpshrdd m6, m7, 16
    pminub m12, m13
    pminub m14, m15
    pmaxub m16, m17
    pmaxub m18, m19
    mova m8, [base+end_perm_w8clip]
    vpcmpw k2, m20, m31, 1
    vpcmpw k3, m22, m31, 1
    vpshldw m4, m21, 8
    vpshldw m6, m23, 8
    kunpckdq k1, k1, k1
    kxnorb k4, k4, k4
    vpshrdw m11, m12, m14, 8
    vpshrdw m15, m16, m18, 8
    vpblendmb m13{k1}, m12, m14
    vpblendmb m17{k1}, m16, m18
    psubw m21, m31, m20
    psubw m23, m31, m22
    paddusw m0, m20, m4 ; clip >0xff
    paddusw m1, m22, m6
    pminub m11, m13
    pmaxub m15, m17
    psubusw m0{k2}, m4, m21 ; clip <0x00
    psubusw m1{k3}, m6, m23
    psrlw m0, 8
    vmovdqu8 m0{k1}, m1
    pmaxub m0, m11
    pminub m0, m15
    vpermb m0, m8, m0
    add r10, 2
    vextracti32x4 xm1, m0, 1
    vextracti32x4 xm2, m0, 2
    vextracti32x4 xm3, m0, 3
    movq [dstq+strideq*0], xm0
    movq [dstq+strideq*2], xm1
    movq [r10+strideq*0], xm2
    movq [r10+strideq*2], xm3
    movhps [dstq+strideq*1], xm0
    movhps [dstq+r9], xm1
    movhps [r10+strideq*1], xm2
    movhps [r10+r9], xm3
    RET
.sec_only:
    movifnidn t1d, secm
    call .sec
.end_no_clip:
    mova xm8, [base+end_perm]
    kxnorb k1, k1, k1
    vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
    vpshldd m5, m1, 8
    vpshldd m6, m2, 8
    vpshldd m7, m3, 8
    paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
    paddw m1, m5
    paddw m2, m6
    paddw m3, m7
    vpermb m0, m8, m0
    vpermb m1, m8, m1
    vpermb m2, m8, m2
    vpermb m3, m8, m3
    add r10, 2
    punpckldq m4, m0, m1
    punpckhdq m0, m1
    punpckldq m5, m2, m3
    punpckhdq m2, m3
    movq [dstq+strideq*0], xm4
    movq [dstq+strideq*2], xm0
    movq [r10+strideq*0], xm5
    movq [r10+strideq*2], xm2
    movhps [dstq+strideq*1], xm4
    movhps [dstq+r9], xm0
    movhps [r10+strideq*1], xm5
    movhps [r10+r9], xm2
    RET
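
; A sketch of the 8x8 strategy above (terminology taken from the register
; comments): the block is handled as four 4x4 quadrants, tl/tr/bl/br, each
; with its own (overlapping) lut in m12-m15, centre pixels in m4-m7 and
; dword accumulator in m0-m3, so the same 16-pixel-per-zmm dataflow as the
; smaller block sizes is simply run four-wide; end_perm/end_perm_w8clip
; then reorder the four results into row order for the 8-byte stores.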
.mask_edges_sec_only:
    movifnidn t1d, secm
    call .mask_edges_sec
    jmp .end_no_clip
ALIGN function_align
.mask_edges:
    mov t0d, r6d
    mov t1d, r6d
    or t0d, 0xA ; top-left 4x4 has bottom and right
    or t1d, 0x9 ; top-right 4x4 has bottom and left
    vpbroadcastq m26, [base+edge_mask+t0*8]
    vpbroadcastq m27, [base+edge_mask+t1*8]
    mov t1d, r6d
    or r6d, 0x6 ; bottom-left 4x4 has top and right
    or t1d, 0x5 ; bottom-right 4x4 has top and left
    vpbroadcastq m28, [base+edge_mask+r6*8]
    vpbroadcastq m29, [base+edge_mask+t1*8]
    mov t0d, dirm
    test prid, prid
    jz .mask_edges_sec_only
    vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
    vpshufbitqmb k1, m26, m20 ; index in-range
    vpshufbitqmb k2, m27, m20
    vpshufbitqmb k3, m28, m20
    vpshufbitqmb k4, m29, m20
    mova m8, m4
    mova m9, m5
    mova m10, m6
    mova m11, m7
    vpermb m8{k1}, m20, m12
    vpermb m9{k2}, m20, m13
    vpermb m10{k3}, m20, m14
    vpermb m11{k4}, m20, m15
    mova [rsp+0x00], m26
    mova [rsp+0x40], m27
    mova [rsp+0x80], m28
    mova [rsp+0xC0], m29
    CDEF_FILTER_8x8_PRI
    test t1d, t1d
    jz .end_no_clip
    mova m26, [rsp+0x00]
    mova m27, [rsp+0x40]
    mova m28, [rsp+0x80]
    mova m29, [rsp+0xC0]
    call .mask_edges_sec
    jmp .end_clip
.mask_edges_sec:
    vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
    vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
    vpshufbitqmb k1, m26, m20
    vpshufbitqmb k2, m27, m20
    vpshufbitqmb k3, m28, m20
    vpshufbitqmb k4, m29, m20
    mova m16, m4
    mova m17, m5
    mova m18, m6
    mova m19, m7
    vpermb m16{k1}, m20, m12
    vpermb m17{k2}, m20, m13
    vpermb m18{k3}, m20, m14
    vpermb m19{k4}, m20, m15
    vpshufbitqmb k1, m26, m21
    vpshufbitqmb k2, m27, m21
    vpshufbitqmb k3, m28, m21
    vpshufbitqmb k4, m29, m21
    vpermb m12, m21, m12
    vpermb m13, m21, m13
    vpermb m14, m21, m14
    vpermb m15, m21, m15
    vpblendmb m12{k1}, m4, m12
    vpblendmb m13{k2}, m5, m13
    vpblendmb m14{k3}, m6, m14
    vpblendmb m15{k4}, m7, m15
    jmp .sec_main
ALIGN function_align
.sec:
    vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
    vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
    vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
    vpermb m17, m20, m13 ; pNtr
    vpermb m18, m20, m14 ; pNbl
    vpermb m19, m20, m15 ; pNbr
    vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
    vpermb m13, m21, m13 ; pNtr
    vpermb m14, m21, m14 ; pNbl
    vpermb m15, m21, m15 ; pNbr
.sec_main:
%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
    vpcmpub k1, m4, %1, 6
    vpcmpub k2, m5, %2, 6
    vpcmpub k3, m6, %3, 6
    vpcmpub k4, m7, %4, 6
    psubb m20, %1, m4
    psubb m21, %2, m5
    psubb m22, %3, m6
    psubb m23, %4, m7
%if %5
    vpbroadcastb m28, t1d
    lzcnt t1d, t1d
    vpbroadcastq m29, [r3+t1*8]
%endif
    vpsubb m20{k1}, m4, %1
    vpsubb m21{k2}, m5, %2
    vpsubb m22{k3}, m6, %3
    vpsubb m23{k4}, m7, %4
    gf2p8affineqb m24, m20, m29, 0
    gf2p8affineqb m25, m21, m29, 0
    gf2p8affineqb m26, m22, m29, 0
    gf2p8affineqb m27, m23, m29, 0
%if %5
    vpbroadcastd m30, [base+sec_tap]
%endif
    psubusb m24, m28, m24
    psubusb m25, m28, m25
    psubusb m26, m28, m26
    psubusb m27, m28, m27
    pminub m20, m24
    pminub m21, m25
    pminub m22, m26
    pminub m23, m27
    mova m24, m30
    mova m25, m30
    mova m26, m30
    mova m27, m30
    vpsubb m24{k1}, m31, m30
    vpsubb m25{k2}, m31, m30
    vpsubb m26{k3}, m31, m30
    vpsubb m27{k4}, m31, m30
    vpdpbusd m0, m20, m24
    vpdpbusd m1, m21, m25
    vpdpbusd m2, m22, m26
    vpdpbusd m3, m23, m27
%endmacro
    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
    CDEF_FILTER_8x8_SEC m12, m13, m14, m15
    ret

%endif ; HAVE_AVX512ICL && ARCH_X86_64