shithub: dav1d

Download patch

ref: 7606bb08fe1b94951a92d04086305215fc7ef044
parent: b3da18fefd17c427207ca1dfd2eaaac951f053a9
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Fri Nov 30 09:07:15 EST 2018

add SSSE3 blend

--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -59,6 +59,7 @@
 decl_w_mask_fn(dav1d_w_mask_420_avx2);
 decl_w_mask_fn(dav1d_w_mask_420_ssse3);
 decl_blend_fn(dav1d_blend_avx2);
+decl_blend_fn(dav1d_blend_ssse3);
 decl_blend_dir_fn(dav1d_blend_v_avx2);
 decl_blend_dir_fn(dav1d_blend_h_avx2);
 
@@ -83,6 +84,7 @@
     c->w_avg = dav1d_w_avg_ssse3;
     c->mask = dav1d_mask_ssse3;
     c->w_mask[2] = dav1d_w_mask_420_ssse3;
+    c->blend = dav1d_blend_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -29,9 +29,11 @@
 
 SECTION_RODATA 16
 
+pb_64:   times 16 db 64
 pw_8:    times 8 dw 8
 pw_26:   times 8 dw 26
 pw_258:  times 8 dw 258
+pw_512:  times 8 dw 512
 pw_1024: times 8 dw 1024
 pw_2048: times 8 dw 2048
 
@@ -52,6 +54,7 @@
 BIDIR_JMP_TABLE w_avg_ssse3,      4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE mask_ssse3,       4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
+BIDIR_JMP_TABLE blend_ssse3,      4, 8, 16, 32
 
 SECTION .text
 
@@ -427,3 +430,92 @@
 %undef loop_w
 %undef orig_w
 %undef hd
+
+%macro BLEND 2; arg1 = a (dst pixels, clobbered), arg2 = b (tmp pixels); in: m0 = mask m, m4 = 64 per byte, m5 = 512 per word; out: m0 = 16 blended u8; scratch: m2, m3
+    psubb                m3, m4, m0 ; m3 = (64 - m), per byte (assumes m <= 64 - confirm against caller)
+    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]  weight pairs, (64-m) in the low byte of each word
+    punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
+    punpcklbw            m0, %1, %2 ; {b;a}[7..0]  pixel pairs, a in the low byte; the mask in m0 is consumed here
+    punpckhbw            %1, %2     ; {b;a}[15..8]
+    pmaddubsw            m0, m2     ; {b*m + (64-m)*a}[7..0] u16 (pixels are the unsigned operand, weights the signed one)
+    pmaddubsw            %1, m3     ; {b*m + (64-m)*a}[15..8] u16
+    pmulhrsw             m0, m5     ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16 (rounded multiply-high by 512)
+    pmulhrsw             %1, m5     ; {(b*m + (64-m)*a + 32) >> 6}[15..8] u16
+    packuswb             m0, %1     ; {blendpx}[15..0] u8, saturating pack of both halves
+%endmacro
+
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask ; dst = (dst*(64-mask) + tmp*mask + 32) >> 6 per pixel; per x86inc cglobal convention: 3 args preloaded, 7 GPRs, 7 XMM regs
+%define base r6-blend_ssse3_table
+    lea                  r6, [blend_ssse3_table] ; r6 = jump table address, also the anchor for the base-relative constant loads
+    tzcnt                wd, wm ; wd = log2(w); table index (BIDIR_JMP_TABLE is defined outside this view - presumably biased so w=4 is the first entry)
+    movifnidn            hd, hm ; fetch h unless it already lives in its register
+    movifnidn         maskq, maskmp ; same for the mask pointer
+    movsxd               wq, dword [r6+wq*4] ; sign-extended 32-bit table entry for this width
+    mova                 m4, [base+pb_64] ; m4 = 64 in every byte
+    mova                 m5, [base+pw_512] ; m5 = 512 in every word (pmulhrsw by 512 == (x + 32) >> 6)
+    add                  wq, r6 ; table entry + table address = branch target
+    lea                  r6, [dsq*3] ; r6 = 3 * dst_stride; NOTE(review): r6 is not read by any width path below - looks like dead code
+    jmp                  wq ; dispatch to .w4/.w8/.w16/.w32
+.w4: ; two rows of 4 pixels per iteration
+    movq                 m0, [maskq]; m, 8 bytes = 2 rows x 4
+    movd                 m1, [dstq+dsq*0] ; a, row 0
+    movd                 m6, [dstq+dsq*1] ; a, row 1
+    punpckldq            m1, m6 ; m1 low 8 bytes = row 0 pixels then row 1 pixels
+    movq                 m6, [tmpq] ; b, 8 bytes = 2 rows x 4 (tmp rows are contiguous)
+    psubb                m3, m4, m0 ; m3 = (64 - m)
+    punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
+    punpcklbw            m1, m6    ; {b;a}[7..0]
+    pmaddubsw            m1, m2    ; {b*m + (64-m)*a}[7..0] u16
+    pmulhrsw             m1, m5    ; {(b*m + (64-m)*a + 32) >> 6}[7..0] u16
+    packuswb             m1, m0    ; {blendpx}[7..0] u8 in the low half; the high half (packed from m0) is never stored
+    movd       [dstq+dsq*0], m1 ; store row 0 (4 pixels)
+    psrlq                m1, 32 ; bring row 1 pixels down to the low dword
+    movd       [dstq+dsq*1], m1 ; store row 1 (4 pixels)
+    add               maskq, 8 ; 2 rows x 4 mask bytes consumed
+    add                tmpq, 8 ; 2 rows x 4 tmp bytes consumed
+    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
+    sub                  hd, 2
+    jg .w4 ; loop while rows remain
+    RET
+.w8: ; two rows of 8 pixels per iteration
+    mova                 m0, [maskq]; m, 16 bytes = 2 rows x 8 (aligned load - assumes mask is 16-byte aligned)
+    movq                 m1, [dstq+dsq*0] ; a, row 0 in the low half
+    movhps               m1, [dstq+dsq*1] ; a, row 1 in the high half
+    mova                 m6, [tmpq] ; b, 16 bytes = 2 rows x 8 (aligned load - assumes tmp is 16-byte aligned)
+    BLEND                m1, m6 ; m0 = 16 blended pixels
+    movq       [dstq+dsq*0], m0 ; store row 0
+    movhps     [dstq+dsq*1], m0 ; store row 1
+    add               maskq, 16 ; 2 rows x 8 mask bytes consumed
+    add                tmpq, 16 ; 2 rows x 8 tmp bytes consumed
+    lea                dstq, [dstq+dsq*2] ; dst_stride * 2
+    sub                  hd, 2
+    jg .w8 ; loop while rows remain
+    RET
+.w16: ; one row of 16 pixels per iteration
+    mova                 m0, [maskq]; m
+    mova                 m1, [dstq] ; a (aligned load - assumes dst rows are 16-byte aligned at this width)
+    mova                 m6, [tmpq] ; b
+    BLEND                m1, m6 ; m0 = 16 blended pixels
+    mova             [dstq], m0 ; store the whole row
+    add               maskq, 16 ; one row of mask consumed
+    add                tmpq, 16 ; one row of tmp consumed
+    add                dstq, dsq ; dst_stride
+    dec                  hd
+    jg .w16 ; loop while rows remain
+    RET
+.w32: ; one row of 32 pixels per iteration, unrolled below as two 16-byte halves
+    %assign i 0
+    %rep 2
+    mova                 m0, [maskq+16*i]; m
+    mova                 m1, [dstq+16*i] ; a
+    mova                 m6, [tmpq+16*i] ; b
+    BLEND                m1, m6 ; m0 = 16 blended pixels for this half
+    mova        [dstq+i*16], m0 ; store this half of the row
+    %assign i i+1
+    %endrep
+    add               maskq, 32 ; one row of mask consumed
+    add                tmpq, 32 ; one row of tmp consumed
+    add                dstq, dsq ; dst_stride
+    dec                  hd
+    jg .w32 ; loop while rows remain
+    RET