shithub: dav1d

Download patch

ref: e2c6d0295c58c9f1c9ce6570e993530b6bc94b68
parent: 3b02d3a9e5cdefdb57c8b7ea22e2fd022eb27ea1
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Mon Oct 29 05:26:24 EDT 2018

Add 8x8 cdef_filter AVX2 implementation

cdef_filter_8x8_8bpc_c: 7913.0
cdef_filter_8x8_8bpc_avx2: 309.9

First 1000 frames of Chimera 1080p:
before: 0m23.100s
after: 0m17.863s

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -36,8 +36,320 @@
 shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
 shufw_210xxxxx: db 4, 5, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+tap_table: dw 4, 2, 3, 3, 2, 1, 2, 1       ; 4 word pairs: pri taps {4,2},{3,3}, then sec taps {2,1},{2,1}
+           db -1 * 16 + 1, -2 * 16 + 2     ; [0]  signed pixel offsets for k=0, k=1: row * 16 + column, in 16-bit words
+           db  0 * 16 + 1, -1 * 16 + 2     ; [1]
+           db  0 * 16 + 1,  0 * 16 + 2     ; [2]
+           db  0 * 16 + 1,  1 * 16 + 2     ; [3]
+           db  1 * 16 + 1,  2 * 16 + 2     ; [4]
+           db  1 * 16 + 0,  2 * 16 + 1     ; [5]
+           db  1 * 16 + 0,  2 * 16 + 0     ; [6]
+           db  1 * 16 + 0,  2 * 16 - 1     ; [7]
+           ; the last 6 are repeats of the first 6 so we don't need to & 7 when indexing with dir+2 / dir+6
+           db -1 * 16 + 1, -2 * 16 + 2     ; [8]  = [0]
+           db  0 * 16 + 1, -1 * 16 + 2     ; [9]  = [1]
+           db  0 * 16 + 1,  0 * 16 + 2     ; [10] = [2]
+           db  0 * 16 + 1,  1 * 16 + 2     ; [11] = [3]
+           db  1 * 16 + 1,  2 * 16 + 2     ; [12] = [4]
+           db  1 * 16 + 0,  2 * 16 + 1     ; [13] = [5]
 
 SECTION .text
+; cdef_filter_8x8 (AVX2): filter one 8x8 block of 8-bit pixels in place at dst, using a 16-bit copy of the block and its borders on the stack
+INIT_YMM avx2
+cglobal cdef_filter_8x8, 4, 9, 16, 26 * 16, dst, stride, left, top, \
+                                            pri, sec, stride3, dst4, edge
+%define px rsp+32+2*32
+    pcmpeqw        m14, m14
+    psrlw          m14, 1                   ; 0x7fff in every word: marker for unavailable pixels
+    mov          edged, r8m                 ; edge flags: 1=left, 2=right, 4=top, 8=bottom
+
+    ; prepare pixel buffers - body/right (pixels widened to words, 32 bytes per row, row 0 at px)
+    lea          dst4q, [dstq+strideq*4]
+    lea       stride3q, [strideq*3]
+    test         edged, 2                   ; have_right
+    jz .no_right
+    pmovzxbw        m1, [dstq+strideq*0]    ; 16 bytes -> 16 words: columns 0..15
+    pmovzxbw        m2, [dstq+strideq*1]
+    pmovzxbw        m3, [dstq+strideq*2]
+    pmovzxbw        m4, [dstq+stride3q]
+    movu     [px+0*32], m1
+    movu     [px+1*32], m2
+    movu     [px+2*32], m3
+    movu     [px+3*32], m4
+    pmovzxbw        m1, [dst4q+strideq*0]
+    pmovzxbw        m2, [dst4q+strideq*1]
+    pmovzxbw        m3, [dst4q+strideq*2]
+    pmovzxbw        m4, [dst4q+stride3q]
+    movu     [px+4*32], m1
+    movu     [px+5*32], m2
+    movu     [px+6*32], m3
+    movu     [px+7*32], m4
+    jmp .body_done
+.no_right:
+    pmovzxbw       xm1, [dstq+strideq*0]    ; 8 bytes -> 8 words: columns 0..7 only
+    pmovzxbw       xm2, [dstq+strideq*1]
+    pmovzxbw       xm3, [dstq+strideq*2]
+    pmovzxbw       xm4, [dstq+stride3q]
+    movu     [px+0*32], xm1
+    movu     [px+1*32], xm2
+    movu     [px+2*32], xm3
+    movu     [px+3*32], xm4
+    movd  [px+0*32+16], xm14                ; columns 8..9 = marker
+    movd  [px+1*32+16], xm14
+    movd  [px+2*32+16], xm14
+    movd  [px+3*32+16], xm14
+    pmovzxbw       xm1, [dst4q+strideq*0]
+    pmovzxbw       xm2, [dst4q+strideq*1]
+    pmovzxbw       xm3, [dst4q+strideq*2]
+    pmovzxbw       xm4, [dst4q+stride3q]
+    movu     [px+4*32], xm1
+    movu     [px+5*32], xm2
+    movu     [px+6*32], xm3
+    movu     [px+7*32], xm4
+    movd  [px+4*32+16], xm14
+    movd  [px+5*32+16], xm14
+    movd  [px+6*32+16], xm14
+    movd  [px+7*32+16], xm14
+.body_done:
+
+    ; top (rows -2 and -1 of the pixel buffer)
+    DEFINE_ARGS dst, stride, left, top2, pri, sec, top1, dummy, edge
+    test         edged, 4                    ; have_top
+    jz .no_top
+    mov          top1q, [top2q+0*gprsize]    ; top points at two row pointers
+    mov          top2q, [top2q+1*gprsize]
+    test         edged, 1                    ; have_left
+    jz .top_no_left
+    test         edged, 2                    ; have_right
+    jz .top_no_right
+    pmovzxbw        m1, [top1q-4]            ; columns -4..11
+    pmovzxbw        m2, [top2q-4]
+    movu   [px-2*32-8], m1
+    movu   [px-1*32-8], m2
+    jmp .top_done
+.top_no_right:
+    pmovzxbw        m1, [top1q-8]            ; columns -8..7
+    pmovzxbw        m2, [top2q-8]
+    movu  [px-2*32-16], m1
+    movu  [px-1*32-16], m2
+    movd  [px-2*32+16], xm14                 ; columns 8..9 = marker (row -2 was overwritten by the row -1 store)
+    movd  [px-1*32+16], xm14
+    jmp .top_done
+.top_no_left:
+    test         edged, 2                   ; have_right
+    jz .top_no_left_right
+    pmovzxbw        m1, [top1q]             ; columns 0..15
+    pmovzxbw        m2, [top2q]
+    movu   [px-2*32+0], m1
+    movu   [px-1*32+0], m2
+    movd   [px-2*32-4], xm14                ; columns -2..-1 = marker
+    movd   [px-1*32-4], xm14
+    jmp .top_done
+.top_no_left_right:
+    pmovzxbw       xm1, [top1q]             ; columns 0..7
+    pmovzxbw       xm2, [top2q]
+    movu   [px-2*32+0], xm1
+    movu   [px-1*32+0], xm2
+    movd   [px-2*32-4], xm14                ; columns -2..-1 = marker
+    movd   [px-1*32-4], xm14
+    movd  [px-2*32+16], xm14                ; columns 8..9 = marker
+    movd  [px-1*32+16], xm14
+    jmp .top_done
+.no_top:
+    movu   [px-2*32-8], m14                 ; columns -4..11 of both top rows = marker
+    movu   [px-1*32-8], m14
+.top_done:
+
+    ; left (columns -2..-1 of rows 0..7)
+    test         edged, 1                   ; have_left
+    jz .no_left
+    pmovzxbw       xm1, [leftq+ 0]          ; 8 bytes: 2 pixels for each of rows 0..3
+    pmovzxbw       xm2, [leftq+ 8]          ; 8 bytes: 2 pixels for each of rows 4..7
+    movd   [px+0*32-4], xm1
+    pextrd [px+1*32-4], xm1, 1
+    pextrd [px+2*32-4], xm1, 2
+    pextrd [px+3*32-4], xm1, 3
+    movd   [px+4*32-4], xm2
+    pextrd [px+5*32-4], xm2, 1
+    pextrd [px+6*32-4], xm2, 2
+    pextrd [px+7*32-4], xm2, 3
+    jmp .left_done
+.no_left:
+    movd   [px+0*32-4], xm14
+    movd   [px+1*32-4], xm14
+    movd   [px+2*32-4], xm14
+    movd   [px+3*32-4], xm14
+    movd   [px+4*32-4], xm14
+    movd   [px+5*32-4], xm14
+    movd   [px+6*32-4], xm14
+    movd   [px+7*32-4], xm14
+.left_done:
+
+    ; bottom (rows 8 and 9 of the pixel buffer, read from the two rows after the block in dst)
+    DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, dummy2, dummy3, edge
+    test         edged, 8                   ; have_bottom
+    jz .no_bottom
+    lea          dst8q, [dstq+8*strideq]
+    test         edged, 1                   ; have_left
+    jz .bottom_no_left
+    test         edged, 2                   ; have_right
+    jz .bottom_no_right
+    pmovzxbw        m1, [dst8q-4]           ; columns -4..11
+    pmovzxbw        m2, [dst8q+strideq-4]
+    movu   [px+8*32-8], m1
+    movu   [px+9*32-8], m2
+    jmp .bottom_done
+.bottom_no_right:
+    pmovzxbw        m1, [dst8q-8]           ; columns -8..7
+    pmovzxbw        m2, [dst8q+strideq-8]
+    movu  [px+8*32-16], m1
+    movu  [px+9*32-16], m2
+    movd  [px+7*32+16], xm14                ; overwritten by previous movu
+    movd  [px+8*32+16], xm14
+    movd  [px+9*32+16], xm14
+    jmp .bottom_done
+.bottom_no_left:
+    test          edged, 2                  ; have_right
+    jz .bottom_no_left_right
+    pmovzxbw        m1, [dst8q]             ; columns 0..15
+    pmovzxbw        m2, [dst8q+strideq]
+    movu   [px+8*32+0], m1
+    movu   [px+9*32+0], m2
+    movd   [px+8*32-4], xm14                ; columns -2..-1 = marker
+    movd   [px+9*32-4], xm14
+    jmp .bottom_done
+.bottom_no_left_right:
+    pmovzxbw       xm1, [dst8q]             ; columns 0..7
+    pmovzxbw       xm2, [dst8q+strideq]
+    movu   [px+8*32+0], xm1
+    movu   [px+9*32+0], xm2
+    movd   [px+8*32-4], xm14                ; columns -2..-1 = marker
+    movd   [px+9*32-4], xm14
+    movd  [px+8*32+16], xm14                ; columns 8..9 = marker
+    movd  [px+9*32+16], xm14
+    jmp .bottom_done
+.no_bottom:
+    movu   [px+8*32-8], m14                 ; columns -4..11 of both bottom rows = marker
+    movu   [px+9*32-8], m14
+.bottom_done:
+
+    ; actual filter
+    DEFINE_ARGS dst, stride, pridmp, damping, pri, sec, secdmp
+%undef edged
+    movifnidn     prid, prim
+    movifnidn     secd, secm
+    mov       dampingd, r7m
+
+    mov        pridmpd, prid
+    mov        secdmpd, secd
+    or         pridmpd, 1                       ; set bit 0 so the lzcnt input is never 0
+    or         secdmpd, 1
+    lzcnt      pridmpd, pridmpd
+    lzcnt      secdmpd, secdmpd
+    lea        pridmpd, [pridmpd+dampingd-31]   ; damping - (31 - lzcnt(pri | 1))
+    lea        secdmpd, [secdmpd+dampingd-31]   ; damping - (31 - lzcnt(sec | 1))
+    xor       dampingd, dampingd                ; 0, used as the lower bound below
+    test       pridmpd, pridmpd
+    cmovl      pridmpd, dampingd                ; negative shift -> 0
+    test       secdmpd, secdmpd
+    cmovl      secdmpd, dampingd                ; negative shift -> 0
+    mov        [rsp+0], pridmpq                 ; pri_shift
+    mov        [rsp+8], secdmpq                 ; sec_shift
+
+    ; pri/sec_taps[k] [4 total]
+    DEFINE_ARGS dst, stride, tap, dummy, pri, sec
+    movd           xm0, prid
+    movd           xm1, secd
+    vpbroadcastw    m0, xm0                     ; pri_strength
+    vpbroadcastw    m1, xm1                     ; sec_strength
+    and           prid, 1                       ; bit 0 of the strength selects the tap pair
+    and           secd, 1
+    lea           tapq, [tap_table]
+    lea           priq, [tapq+priq*4]           ; pri_taps
+    lea           secq, [tapq+secq*4+8]         ; sec_taps
+
+    ; off1/2/3[k] [6 total] from [tapq+16+(dir+0/2/6)*2+k]
+    DEFINE_ARGS dst, stride, tap, dir, pri, sec
+    mov           dird, r6m
+    lea           tapq, [tapq+dirq*2+16]        ; skip the 16 bytes of taps, then index by dir
+    DEFINE_ARGS dst, stride, dir, h, pri, sec, stk, off, k
+    mov             hd, 4                       ; 4 iterations of 2 rows each
+    lea           stkq, [px]
+    pxor           m13, m13                     ; zero
+.v_loop:
+    mov             kd, 1                       ; k runs 1, 0
+    mova           xm4, [stkq+32*0]             ; px
+    vinserti128     m4, [stkq+32*1], 1          ; next row in the upper lane
+    pxor           m15, m15                     ; sum
+    mova            m7, m4                      ; max
+    mova            m8, m4                      ; min
+.k_loop:
+    vpbroadcastw    m2, [priq+kq*2]             ; pri_taps
+    vpbroadcastw    m3, [secq+kq*2]             ; sec_taps
+
+%macro ACCUMULATE_TAP 4 ; tap_offset, shift, strength, mul_tap
+    ; load p0/p1 (the pixels at +off and -off from each px, for two rows at once)
+    movsx         offq, byte [dirq+kq+%1]       ; off for this tap_offset and k
+    movu           xm5, [stkq+offq*2+32*0]      ; p0
+    vinserti128     m5, [stkq+offq*2+32*1], 1
+    neg           offq                          ; -off
+    movu           xm6, [stkq+offq*2+32*0]      ; p1
+    vinserti128     m6, [stkq+offq*2+32*1], 1
+    pcmpeqw         m9, m14, m5                 ; mask of 0x7fff markers in p0
+    pcmpeqw        m10, m14, m6                 ; mask of 0x7fff markers in p1
+    pandn           m9, m5                      ; p0 with markers replaced by 0, for max only
+    pandn          m10, m6                      ; p1 with markers replaced by 0, for max only
+    pmaxsw          m7, m9                      ; max after p0
+    pminsw          m8, m5                      ; min after p0
+    pmaxsw          m7, m10                     ; max after p1
+    pminsw          m8, m6                      ; min after p1
+
+    ; accumulate sum[m15] over p0/p1
+    psubw           m5, m4                      ; diff_p0(p0 - px)
+    psubw           m6, m4                      ; diff_p1(p1 - px)
+    pabsw           m9, m5                      ; abs(diff_p0)
+    pabsw          m10, m6                      ; abs(diff_p1)
+    psraw          m11, m9,  %2                 ; abs(diff_p0) >> shift
+    psraw          m12, m10, %2                 ; abs(diff_p1) >> shift
+    psubw          m11, %3, m11                 ; strength - (abs(diff_p0) >> shift)
+    psubw          m12, %3, m12                 ; strength - (abs(diff_p1) >> shift)
+    pmaxsw         m11, m13                     ; ... clamped to >= 0
+    pmaxsw         m12, m13
+    pminsw         m11, m9                      ; ... and to <= abs(diff)
+    pminsw         m12, m10
+    psignw         m11, m5                      ; constrain(diff_p0)
+    psignw         m12, m6                      ; constrain(diff_p1)
+    pmullw         m11, %4                      ; constrain(diff_p0) * mul_tap
+    pmullw         m12, %4                      ; constrain(diff_p1) * mul_tap
+    paddw          m15, m11
+    paddw          m15, m12
+%endmacro
+
+    ACCUMULATE_TAP 0*2, [rsp+0], m0, m2         ; primary: table entry dir, pri_shift/strength/taps
+    ACCUMULATE_TAP 2*2, [rsp+8], m1, m3         ; secondary: table entry dir+2, sec_shift/strength/taps
+    ACCUMULATE_TAP 6*2, [rsp+8], m1, m3         ; secondary: table entry dir+6, sec_shift/strength/taps
+
+    dec             kq
+    jge .k_loop
+
+    vpbroadcastd   m12, [pw_2048]               ; pmulhrsw by 2048 computes (x + 8) >> 4
+    pcmpgtw        m11, m13, m15                ; -1 where sum < 0
+    paddw          m15, m11                     ; sum - (sum < 0)
+    pmulhrsw       m15, m12                     ; (8 + sum - (sum < 0)) >> 4
+    paddw           m4, m15                     ; px + rounded sum
+    pminsw          m4, m7                      ; clip to max
+    pmaxsw          m4, m8                      ; clip to min
+    packuswb        m4, m4                      ; back to 8-bit pixels
+    vextracti128   xm5, m4, 1
+    movq [dstq+strideq*0], xm4
+    movq [dstq+strideq*1], xm5
+    lea           dstq, [dstq+strideq*2]
+    add           stkq, 32*2                    ; advance two rows in the pixel buffer
+    dec             hd
+    jg .v_loop
+
+    RET
 
 INIT_YMM avx2
 cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -28,6 +28,7 @@
 #include "src/cpu.h"
 #include "src/cdef.h"
 
+decl_cdef_fn(dav1d_cdef_filter_8x8_avx2);
 decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
 
 void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
@@ -37,5 +38,6 @@
 
 #if BITDEPTH == 8 && ARCH_X86_64
     c->dir = dav1d_cdef_dir_avx2;
+    c->fb[0] = dav1d_cdef_filter_8x8_avx2;
 #endif
 }