shithub: dav1d

ref: fef13fd67cf560c91e0e30eab3c4553a88b8ef63
parent: 7606bb08fe1b94951a92d04086305215fc7ef044
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Fri Nov 30 10:19:15 EST 2018

add SSSE3 blend_v
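
For reference, the scalar operation the new SIMD path computes is dav1d's C
fallback for blend_v: a per-pixel weighted average with 6-bit weights, where
the mask varies along x and columns from (w * 3) >> 2 onward are left
untouched (hence the trailing (64, 0) pairs in the mask table below). A
sketch, assuming the shape of the 8bpc C version in src/mc_tmpl.c; the
signature is approximate and the mask pointer is passed explicitly here for
clarity:

    #include <stddef.h>
    #include <stdint.h>

    /* weighted average of a and b with 6-bit weights, rounded */
    #define blend_px(a, b, m) (((a) * (64 - (m)) + (b) * (m) + 32) >> 6)

    static void blend_v_c(uint8_t *dst, const ptrdiff_t dst_stride,
                          const uint8_t *tmp, const int w, int h,
                          const uint8_t *mask /* per-width obmc masks */)
    {
        do {
            /* only the leading (w * 3) >> 2 columns are blended */
            for (int x = 0; x < (w * 3) >> 2; x++)
                dst[x] = (uint8_t)blend_px(dst[x], tmp[x], mask[x]);
            dst += dst_stride;
            tmp += w;
        } while (--h);
    }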

--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -61,6 +61,7 @@
 decl_blend_fn(dav1d_blend_avx2);
 decl_blend_fn(dav1d_blend_ssse3);
 decl_blend_dir_fn(dav1d_blend_v_avx2);
+decl_blend_dir_fn(dav1d_blend_v_ssse3);
 decl_blend_dir_fn(dav1d_blend_h_avx2);
 
 decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
@@ -85,6 +86,7 @@
     c->mask = dav1d_mask_ssse3;
     c->w_mask[2] = dav1d_w_mask_420_ssse3;
     c->blend = dav1d_blend_ssse3;
+    c->blend_v = dav1d_blend_v_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
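
For context, mc_init_tmpl.c installs function pointers under dav1d's usual
CPU-flag gating: the SSSE3 assignments run first, and the AVX2 check above
returns early on weaker CPUs so the AVX2 pointers never overwrite them.
Roughly (a simplified sketch, not the verbatim file; the #if condition is
assumed):

    COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
        const unsigned flags = dav1d_get_cpu_flags();

        if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
            return;
    #if BITDEPTH == 8 /* assumed guard */
        c->blend   = dav1d_blend_ssse3;
        c->blend_v = dav1d_blend_v_ssse3; /* new in this commit */
    #endif

        if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
            return;
        /* ... AVX2 pointers installed here ... */
    }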
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -29,6 +29,23 @@
 
 SECTION_RODATA 16
 
+; dav1d_obmc_masks[] with 64-m interleaved: each mask value m is stored
+; as the byte pair (64-m, m) for pmaddubsw
+obmc_masks: db  0,  0,  0,  0
+            ; 2 @4
+            db 45, 19, 64,  0
+            ; 4 @8
+            db 39, 25, 50, 14, 59,  5, 64,  0
+            ; 8 @16
+            db 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
+            ; 16 @32
+            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+            db 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
+            ; 32 @64
+            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
+            db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
+            db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
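+            ; trailing (64, 0) pairs leave dst untouched: the asm blends full
+            ; simd registers while the reference C stops at x = (w * 3) >> 2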
+
 pb_64:   times 16 db 64
 pw_8:    times 8 dw 8
 pw_26:   times 8 dw 26
@@ -55,6 +72,7 @@
 BIDIR_JMP_TABLE mask_ssse3,       4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
 BIDIR_JMP_TABLE blend_ssse3,      4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_ssse3,    2, 4, 8, 16, 32
 
 SECTION .text
 
@@ -431,17 +449,21 @@
 %undef orig_w
 %undef hd
 
+%macro BLEND_64M 4; a, b, mask1, mask2
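+    ; blends 16 px of a (dst) with b (tmp); mask1/mask2 hold interleaved
+    ; (64-m, m) byte pairs, so pmaddubsw yields a*(64-m) + b*m per px, and
+    ; pmulhrsw by pw_512 (m5) is the rounding shift back to pixel range:
+    ; ((x << 9) + (1 << 14)) >> 15 == (x + 32) >> 6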
+    punpcklbw            m0, %1, %2; {b;a}[7..0]
+    punpckhbw            %1, %2    ; {b;a}[15..8]
+    pmaddubsw            m0, %3    ; {b*m[0] + (64-m[0])*a}[7..0] u16
+    pmaddubsw            %1, %4    ; {b*m[1] + (64-m[1])*a}[15..8] u16
+    pmulhrsw             m0, %3    ; {(b*m[0] + (64-m[0])*a + 32) / 64}[7..0] u16
+    pmulhrsw             %1, %4    ; {(b*m[1] + (64-m[1])*a + 32) / 64}[15..8] u16
+    packuswb             m0, %1    ; {blendpx}[15..0] u8
+%endmacro
+
 %macro BLEND 2; a, b
     psubb                m3, m4, m0 ; m3 = (64 - m)
     punpcklbw            m2, m3, m0 ; {m;(64-m)}[7..0]
     punpckhbw            m3, m0     ; {m;(64-m)}[15..8]
-    punpcklbw            m0, %1, %2 ; {b;a}[7..0]
-    punpckhbw            %1, %2     ; {b;a}[15..8]
-    pmaddubsw            m0, m2     ; {b*m + (64-m)*a}[7..0] u16
-    pmaddubsw            %1, m3     ; {b*m + (64-m)*a}[15..8] u16
-    pmulhrsw             m0, m5     ; {((b*m + (64-m)*a) + 1) / 32}[7..0] u16
-    pmulhrsw             %1, m5     ; {((b*m + (64-m)*a) + 1) / 32}[15..8] u16
-    packuswb             m0, %1     ; {blendpx}[15..0] u8
+    BLEND_64M            %1, %2, m2, m3
 %endmacro
 
 cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
@@ -518,4 +540,106 @@
     add                dstq, dsq ; dst_stride
     dec                  hd
     jg .w32
+    RET
+
+cglobal blend_v, 3, 6, 8, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_ssse3_table
+    lea                  r5, [blend_v_ssse3_table]
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, dword [r5+wq*4]
+    mova                 m5, [base+pw_512]
+    add                  wq, r5
+    add               maskq, obmc_masks-blend_v_ssse3_table ; maskq (r5) -> obmc_masks
+    jmp                  wq
+.w2:
+    movd                 m3, [maskq+4]
+    punpckldq            m3, m3 ; duplicate both (64-m, m) pairs for row 1
+    ; the 2 mask values occupy 4 interleaved bytes; one iteration blends 2 rows of 2 px
+.w2_loop:
+    movd                 m1, [dstq+dsq*0] ; a: row 0 px in word 0 (rest ignored)
+    pinsrw               m1, [dstq+dsq*1], 1 ; a: row 1 px in word 1
+    movd                 m2, [tmpq] ; b
+    punpcklbw            m0, m1, m2; {b;a}[7..0]
+    pmaddubsw            m0, m3    ; {b*m + (64-m)*a}[7..0] u16
+    pmulhrsw             m0, m5    ; {(b*m + (64-m)*a + 32) / 64}[7..0] u16
+    packuswb             m0, m1    ; {blendpx}[3..0] u8 (only the low dword is used)
+    movd                r3d, m0 ; both rows' blended px
+    mov        [dstq+dsq*0], r3w
+    shr                 r3d, 16
+    mov        [dstq+dsq*1], r3w
+    add                tmpq, 2*2
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w2_loop
+    RET
+.w4:
+    movddup              m3, [maskq+8] ; 4 (64-m, m) pairs, duplicated for row 1
+    ; the 4 mask values occupy 8 interleaved bytes; one iteration blends 2 rows of 4 px
+.w4_loop:
+    movd                 m1, [dstq+dsq*0] ; a
+    movd                 m2, [dstq+dsq*1] ;
+    punpckldq            m1, m2
+    movq                 m2, [tmpq] ; b
+    punpcklbw            m1, m2    ; {b;a}[7..0]
+    pmaddubsw            m1, m3    ; {b*m + (64-m)*a}[7..0] u16
+    pmulhrsw             m1, m5    ; {(b*m + (64-m)*a + 32) / 64}[7..0] u16
+    packuswb             m1, m1    ; {blendpx}[7..0] u8
+    movd       [dstq+dsq*0], m1
+    psrlq                m1, 32 ; shift row 1 px into the low dword
+    movd       [dstq+dsq*1], m1
+    add                tmpq, 2*4
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w4_loop
+    RET
+.w8:
+    mova                 m3, [maskq+16]
+    ; the 8 mask values occupy 16 interleaved bytes; one iteration blends 2 rows of 8 px
+.w8_loop:
+    movq                 m1, [dstq+dsq*0] ; a
+    movhps               m1, [dstq+dsq*1] ; a: row 1 in the high qword
+    mova                 m2, [tmpq] ; b
+    BLEND_64M            m1, m2, m3, m3
+    movq       [dstq+dsq*0], m0
+    punpckhqdq           m0, m0 ; bring row 1 (high qword) down
+    movq       [dstq+dsq*1], m0
+    add                tmpq, 16
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+.w16:
+    ; the 16 mask values occupy 32 interleaved bytes (one 16-px row per iteration)
+    mova                 m3, [maskq+32] ; w16 (64-m, m) pairs 0..7
+    mova                 m4, [maskq+48] ; w16 (64-m, m) pairs 8..15
+.w16_loop:
+    mova                 m1, [dstq] ; a
+    mova                 m2, [tmpq] ; b
+    BLEND_64M            m1, m2, m3, m4
+    mova             [dstq], m0
+    add                tmpq, 16
+    add                dstq, dsq
+    dec                  hd
+    jg .w16_loop
+    RET
+.w32:
+    mova                 m3, [maskq+64 ] ; w32 (64-m, m) pairs 0..7
+    mova                 m4, [maskq+80 ] ; w32 (64-m, m) pairs 8..15
+    mova                 m6, [maskq+96 ] ; w32 (64-m, m) pairs 16..23
+    mova                 m7, [maskq+112] ; w32 (64-m, m) pairs 24..31
+    ; the 32 mask values occupy 64 interleaved bytes (one 32-px row per iteration)
+.w32_loop:
+    mova                 m1, [dstq+16*0] ; a
+    mova                 m2, [tmpq+16*0] ; b
+    BLEND_64M            m1, m2, m3, m4
+    mova        [dstq+16*0], m0
+    mova                 m1, [dstq+16*1] ; a
+    mova                 m2, [tmpq+16*1] ; b
+    BLEND_64M            m1, m2, m6, m7
+    mova        [dstq+16*1], m0
+    add                tmpq, 32
+    add                dstq, dsq
+    dec                  hd
+    jg .w32_loop
     RET
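
The RODATA block is mechanical: each scalar mask value m becomes the byte
pair (64-m, m) so that a single pmaddubsw applies both weights at once. A
throwaway generator along these lines (hypothetical, not part of the patch;
mask values read back from the pairs above) reproduces the pair values,
eight pairs to a line:

    #include <stdio.h>

    /* scalar obmc mask values, as recoverable from the table above */
    static const unsigned char masks[] = {
        19,  0,                             /* w2  @4  */
        25, 14,  5,  0,                     /* w4  @8  */
        28, 22, 16, 11,  7,  3,  0,  0,     /* w8  @16 */
        30, 27, 24, 21, 18, 15, 12, 10,     /* w16 @32 */
         8,  6,  4,  3,  0,  0,  0,  0,
        31, 29, 28, 26, 24, 23, 21, 20,     /* w32 @64 */
        19, 17, 16, 14, 13, 12, 11,  9,
         8,  7,  6,  5,  4,  4,  3,  2,
         0,  0,  0,  0,  0,  0,  0,  0,
    };

    int main(void)
    {
        /* the table opens with 4 bytes of zero padding */
        printf("obmc_masks: db  0,  0,  0,  0");
        for (int i = 0; i < (int)sizeof(masks); i++)
            printf("%s%2d, %2d", i % 8 ? ", " : "\n            db ",
                   64 - masks[i], masks[i]);
        putchar('\n');
        return 0;
    }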