shithub: dav1d

ref: b3da18fefd17c427207ca1dfd2eaaac951f053a9
parent: 00d1f4d50117c48f39d8497682755b63b16389a4
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Thu Nov 29 09:30:08 EST 2018

add SSSE3 w_mask_420
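
For reference, a scalar sketch of the per-pixel math the W_MASK_420_B
macro below vectorizes, reconstructed from the asm comments (helper
names are illustrative, not dav1d API; constants are the 8bpc values
used in this file):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of one blended pixel. t1/t2 are the 16-bit
     * intermediates produced by the two predictions. */
    static inline uint8_t w_mask_pixel(const int16_t t1, const int16_t t2,
                                       int *const w2 /* out: 64 - m */)
    {
        const int diff = t2 - t1;
        /* m = min(38 + ((abs(diff) + 8) >> 8), 64); the asm keeps
         * 64 - m via psubusw(26, x), since 26 == 64 - 38 */
        const int x = (abs(diff) + 8) >> 8;
        *w2 = x > 26 ? 0 : 26 - x;
        /* blend tmp1 toward tmp2: pmulhw by (w2 << 10) is an
         * arithmetic (diff * w2) >> 6 */
        const int blended = t1 + ((diff * *w2) >> 6);
        /* pmulhrsw with 2048 == (blended + 8) >> 4; packuswb then
         * saturates to [0, 255] */
        const int px = (blended + 8) >> 4;
        return px < 0 ? 0 : px > 255 ? 255 : (uint8_t)px;
    }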

--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -57,6 +57,7 @@
 decl_mask_fn(dav1d_mask_avx2);
 decl_mask_fn(dav1d_mask_ssse3);
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
+decl_w_mask_fn(dav1d_w_mask_420_ssse3);
 decl_blend_fn(dav1d_blend_avx2);
 decl_blend_dir_fn(dav1d_blend_v_avx2);
 decl_blend_dir_fn(dav1d_blend_h_avx2);
@@ -81,6 +82,7 @@
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
     c->mask = dav1d_mask_ssse3;
+    c->w_mask[2] = dav1d_w_mask_420_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
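
The init hunk follows the file's existing dispatch pattern: pointers are
installed in order of increasing ISA level, so the SSSE3 entry added here
is overwritten further down when AVX2 is available. Roughly (a sketch of
the surrounding code, not the verbatim file; w_mask[2] is the 4:2:0 slot):

    void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
        const unsigned flags = dav1d_get_cpu_flags();

        if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
        c->w_mask[2] = dav1d_w_mask_420_ssse3; /* new in this patch */

        if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
        c->w_mask[2] = dav1d_w_mask_420_avx2;  /* shadows the SSSE3 one */
    }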
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -29,6 +29,9 @@
 
 SECTION_RODATA 16
 
+pw_8:    times 8 dw 8
+pw_26:   times 8 dw 26
+pw_258:  times 8 dw 258
 pw_1024: times 8 dw 1024
 pw_2048: times 8 dw 2048
 
@@ -48,6 +51,7 @@
 BIDIR_JMP_TABLE avg_ssse3,        4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_avg_ssse3,      4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE mask_ssse3,       4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
 
 SECTION .text
 
@@ -250,4 +254,176 @@
     add                  wq, r6
     mov               maskq, r6m
     BIDIR_FN           MASK
+%undef hd
+
+%if ARCH_X86_64
+ %define reg_pw_8         m8
+ %define reg_pw_27        m9
+ %define reg_pw_2048      m10
+%else
+ %define reg_pw_8         [pw_8]
+ %define reg_pw_27        [pw_26] ; 64 - 38
+ %define reg_pw_2048      [pw_2048]
+%endif
+
+%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
+    ;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
+    mova                 m0, [tmp1q+(%1)]
+    mova                 m1, [tmp2q+(%1)]
+    psubw                m1, m0 ; tmp2 - tmp1
+    pabsw                m3, m1 ; abs(tmp1 - tmp2)
+    paddw                m3, reg_pw_8 ; abs(tmp1 - tmp2) + 8
+    psrlw                m3, 8  ; (abs(tmp1 - tmp2) + 8) >> 8
+    psubusw             m%2, reg_pw_27, m3 ; 64 - min(38 + m3, 64)
+    psllw                m2, m%2, 10
+    pmulhw               m1, m2 ; (tmp2 - tmp1) * ((64 - m) << 10) >> 16
+    paddw                m0, m1 ; tmp1 + ((tmp2 - tmp1) * (64 - m) >> 6)
+    ;**** do m1 = u16.dst[15..8], m3 = u16.m[15..8] ****
+    mova                 m1, [tmp1q+(%1)+mmsize]
+    mova                 m2, [tmp2q+(%1)+mmsize]
+    psubw                m2, m1 ; tmp2 - tmp1
+    pabsw                m7, m2 ; abs(tmp1 - tmp2)
+    paddw                m7, reg_pw_8 ; abs(tmp1 - tmp2) + 8
+    psrlw                m7, 8  ; (abs(tmp1 - tmp2) + 8) >> 8
+    psubusw              m3, reg_pw_27, m7 ; 64 - min(38 + m7, 64)
+    phaddw              m%2, m3 ; sum adjacent pairs of both u16.m runs (2:1 subsample)
+    psllw                m3, 10
+    pmulhw               m2, m3
+    paddw                m1, m2
+    ;********
+    pmulhrsw             m0, reg_pw_2048 ; round/scale 2048
+    pmulhrsw             m1, reg_pw_2048 ; round/scale 2048
+    packuswb             m0, m1 ; concat m0 = u8.dst[15..0]
+%endmacro
+
+%macro W_MASK_420 2
+    W_MASK_420_B (%1*16), %2
+%endmacro
+
+%if ARCH_X86_64
+; args: dst, stride, tmp1, tmp2, w, h, mask, sign
+cglobal w_mask_420, 4, 9, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+    lea                  r7, [w_mask_420_ssse3_table]
+    mov                  wd, wm
+    tzcnt               r8d, wd
+    movifnidn            hd, hm
+    mov               maskq, maskmp
+    movd                 m0, r7m
+    pshuflw              m0, m0, q0000 ; sign
+    punpcklqdq           m0, m0
+    movsxd               r8, dword [r7+r8*4]
+    mova           reg_pw_8, [pw_8]
+    mova          reg_pw_27, [pw_26] ; 64 - 38
+    mova        reg_pw_2048, [pw_2048]
+    mova                 m6, [pw_258] ; 64 * 4 + 2
+    psubw                m6, m0
+    add                  r8, r7
+    W_MASK_420            0, 4
+    lea            stride3q, [strideq*3]
+    jmp                  r8
+    %define dst_bak      r8
+    %define loop_w       r7
+    %define orig_w       wq
+%else
+cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
+    tzcnt               r6d, r4m
+    mov                  wd, w_mask_420_ssse3_table
+    add                  wd, [wq+r6*4]
+    mov               maskq, r6mp
+    movd                 m0, r7m
+    pshuflw              m0, m0, q0000 ; sign
+    punpcklqdq           m0, m0
+    mova                 m6, [pw_258] ; 64 * 4 + 2
+    psubw                m6, m0
+    W_MASK_420            0, 4
+    lea            stride3q, [strideq*3]
+    jmp                  wd
+    %define dst_bak     r0m
+    %define loop_w      r6q
+    %define orig_w      r4m
+    %define hd    dword r5m
+%endif
+.w4_loop:
+    add               tmp1q, 2*16
+    add               tmp2q, 2*16
+    W_MASK_420            0, 4
+    lea                dstq, [dstq+strideq*4]
+    add               maskq, 4
+.w4:
+    movd   [dstq          ], m0 ; copy m0[0]
+    pshuflw              m1, m0, q1032
+    movd   [dstq+strideq*1], m1 ; copy m0[1]
+    punpckhqdq           m0, m0
+    movd   [dstq+strideq*2], m0 ; copy m0[2]
+    psrlq                m0, 32
+    movd   [dstq+stride3q ], m0 ; copy m0[3]
+    pshufd               m5, m4, q3131 ; DBDB even lines repeated
+    pshufd               m4, m4, q2020 ; CACA odd lines repeated
+    psubw                m1, m6, m4   ; m6 == 64 * 4 + 2 - sign
+    psubw                m1, m5       ; C-D A-B C-D A-B
+    psrlw                m1, 2        ; >> 2
+    packuswb             m1, m1
+    movd            [maskq], m1
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+.w8_loop:
+    add               tmp1q, 2*16
+    add               tmp2q, 2*16
+    W_MASK_420            0, 4
+    lea                dstq, [dstq+strideq*2]
+    add               maskq, 4
+.w8:
+    movq   [dstq          ], m0
+    movhps [dstq+strideq*1], m0
+    pshufd               m1, m4, q3232
+    psubw                m0, m6, m4
+    psubw                m0, m1
+    psrlw                m0, 2
+    packuswb             m0, m0
+    movd            [maskq], m0
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+.w16: ; w32/64/128
+    mov             dst_bak, dstq
+    mov              loop_w, orig_w ; use width as counter
+%if ARCH_X86_32
+    mov                  wq, orig_w ; because we altered it in 32bit setup
+%endif
+    jmp .w16ge_inner_loop_first
+.w16ge_loop:
+    lea               tmp1q, [tmp1q+wq*2] ; skip even line pixels
+    lea               tmp2q, [tmp2q+wq*2] ; skip even line pixels
+    lea                dstq, [dstq+strideq*2]
+    mov             dst_bak, dstq
+    mov              loop_w, orig_w
+.w16ge_inner_loop:
+    W_MASK_420_B           0, 4
+.w16ge_inner_loop_first:
+    mova   [dstq          ], m0
+    W_MASK_420_B       wq*2, 5  ; load matching even line (offset = width * 2 bytes)
+    mova   [dstq+strideq*1], m0
+    psubw                m1, m6, m4 ; m6 == 64 * 4 + 2 - sign
+    psubw                m1, m5     ; - even line mask
+    psrlw                m1, 2      ; >> 2
+    packuswb             m1, m1
+    movq            [maskq], m1
+    add               tmp1q, 2*16
+    add               tmp2q, 2*16
+    add               maskq, 8
+    add                dstq, 16
+    sub              loop_w, 16
+    jg .w16ge_inner_loop
+    mov                dstq, dst_bak
+    sub                  hd, 2
+    jg .w16ge_loop
+    RET
+
+%undef reg_pw_8
+%undef reg_pw_27
+%undef reg_pw_2048
+%undef dst_bak
+%undef loop_w
+%undef orig_w
 %undef hd
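
The .w4/.w8/.w16 tails also emit the 4:2:0 mask plane: m4/m5 hold the
per-pixel 64 - m weights and m6 holds 64 * 4 + 2 - sign, so the two
subtracts plus the shift average each 2x2 block. A scalar sketch of that
output (w2 stands for the weights kept in the vector registers; the
fixed-width array is illustrative only):

    #include <stdint.h>

    /* One 4:2:0 mask byte per 2x2 block of pixels:
     * (m00 + m01 + m10 + m11 + 2 - sign) >> 2, expressed via the
     * complements w2 == 64 - m exactly as the asm does it. */
    static void w_mask_420_mask(uint8_t *mask, const int (*const w2)[128],
                                const int w, const int h, const int sign)
    {
        for (int y = 0; y < h; y += 2)
            for (int x = 0; x < w; x += 2) {
                const int s = w2[y][x] + w2[y][x + 1] +
                              w2[y + 1][x] + w2[y + 1][x + 1];
                /* 258 - sign - s == sum of the four m values + 2 - sign */
                *mask++ = (258 - sign - s) >> 2;
            }
    }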