ref: b3da18fefd17c427207ca1dfd2eaaac951f053a9
parent: 00d1f4d50117c48f39d8497682755b63b16389a4
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Thu Nov 29 09:30:08 EST 2018
add SSSE3 w_mask_420
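
The SSSE3 version implements the same operation as the existing AVX2 one
and the C reference (w_mask_c in src/mc_tmpl.c): blend the two intermediate
predictions with a per-pixel weight derived from their difference, and emit
one mask byte per 2x2 luma block for the 4:2:0 chroma planes. A rough 8 bpc
scalar sketch, reconstructed from the asm below (function and helper names
are illustrative, not the project's reference code; rounding of the SIMD
path matches this only up to intermediate truncation):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdlib.h>

    /* illustrative helpers; dav1d itself uses imin()/iclip() */
    static int min_int(const int a, const int b) { return a < b ? a : b; }
    static int clip_u8(const int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static void w_mask_420_ref(uint8_t *dst, const ptrdiff_t dst_stride,
                               const int16_t *tmp1, const int16_t *tmp2,
                               const int w, const int h,
                               uint8_t *mask, const int sign)
    {
        for (int y = 0; y < h; y += 2)
            for (int x = 0; x < w; x += 2) {
                int msum = 0; /* sum of the four weights of one 2x2 block */
                for (int yy = 0; yy < 2; yy++)
                    for (int xx = 0; xx < 2; xx++) {
                        const int i = (y + yy) * w + (x + xx);
                        const int d = abs(tmp1[i] - tmp2[i]);
                        const int m = min_int(38 + ((d + 8) >> 8), 64);
                        dst[(y + yy) * dst_stride + (x + xx)] =
                            clip_u8((tmp1[i] * m + tmp2[i] * (64 - m) + 512) >> 10);
                        msum += m;
                    }
                /* one 4:2:0 mask byte per 2x2 block, biased by the chroma sign */
                mask[(y >> 1) * (w >> 1) + (x >> 1)] = (msum + 2 - sign) >> 2;
            }
    }
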
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -57,6 +57,7 @@
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
+decl_w_mask_fn(dav1d_w_mask_420_ssse3);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_dir_fn(dav1d_blend_v_avx2);
decl_blend_dir_fn(dav1d_blend_h_avx2);
@@ -81,6 +82,7 @@
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
+ c->w_mask[2] = dav1d_w_mask_420_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -29,6 +29,9 @@
SECTION_RODATA 16
+pw_8: times 8 dw 8
+pw_26: times 8 dw 26
+pw_258: times 8 dw 258
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
@@ -48,6 +51,7 @@
BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
SECTION .text
@@ -250,4 +254,176 @@
add wq, r6
mov maskq, r6m
BIDIR_FN MASK
+%undef hd
+
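+; Blend constants: kept in xmm registers on x86-64; x86-32 only has eight
+; xmm registers, so there they are referenced as memory operands instead.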
+%if ARCH_X86_64
+ %define reg_pw_8 m8
+ %define reg_pw_27 m9
+ %define reg_pw_2048 m10
+%else
+ %define reg_pw_8 [pw_8]
+ %define reg_pw_27 [pw_26] ; 64 - 38
+ %define reg_pw_2048 [pw_2048]
+%endif
+
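+; W_MASK_420_B blends 16 pixels (2*mmsize bytes of int16 data per tmp):
+;   m   = min(38 + ((abs(tmp1 - tmp2) + 8) >> 8), 64)
+;   dst = (tmp1 * m + tmp2 * (64 - m) + 512) >> 10      (8 bpc)
+; On return m0 holds the 16 packed dst bytes and m%2 the horizontal pair
+; sums of the (64 - m) values, later combined into the 2x2 420 mask.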
+%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
+ ;**** do m0 = u16.dst[7..0], m%2 = u16 (64 - m)[7..0] ****
+ mova m0, [tmp1q+(%1)]
+ mova m1, [tmp2q+(%1)]
+ psubw m1, m0 ; tmp2 - tmp1
+ pabsw m3, m1 ; abs(tmp1 - tmp2)
+ paddw m3, reg_pw_8 ; abs(tmp1 - tmp2) + 8
+ psrlw m3, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
+ psubusw m%2, reg_pw_27, m3 ; 64 - min(m, 64)
+ psllw m2, m%2, 10
+ pmulhw m1, m2 ; (tmp2 - tmp1) * ((64 - m) << 10) >> 16
+ paddw m0, m1 ; tmp1 + (tmp2 - tmp1) * (64 - m) / 64
+ ;**** do m1 = u16.dst[15..8], m%2 = u16 pair sums of (64 - m)[15..0] ****
+ mova m1, [tmp1q+(%1)+mmsize]
+ mova m2, [tmp2q+(%1)+mmsize]
+ psubw m2, m1 ; tmp2 - tmp1
+ pabsw m7, m2 ; abs(tmp1 - tmp2)
+ paddw m7, reg_pw_8 ; abs(tmp1 - tmp2) + 8
+ psrlw m7, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
+ psubusw m3, reg_pw_27, m7 ; 64 - min(m, 64)
+ phaddw m%2, m3 ; horizontal pair sums of both (64 - m) runs -> 8 words for 16 pixels
+ psllw m3, 10
+ pmulhw m2, m3
+ paddw m1, m2
+ ;********
+ pmulhrsw m0, reg_pw_2048 ; (x + 8) >> 4, rounded
+ pmulhrsw m1, reg_pw_2048 ; (x + 8) >> 4, rounded
+ packuswb m0, m1 ; pack to m0 = u8.dst[15..0]
+%endmacro
+
+%macro W_MASK_420 2
+ W_MASK_420_B (%1*16), %2
+%endmacro
+
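+; Width is dispatched through w_mask_420_ssse3_table; the first 16-pixel
+; chunk is computed before the jump so every width label can store right
+; away. Widths 32/64/128 reuse the .w16 column loop.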
+%if ARCH_X86_64
+; args: dst, stride, tmp1, tmp2, w, h, mask, sign
+cglobal w_mask_420, 4, 9, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ lea r7, [w_mask_420_ssse3_table]
+ mov wd, wm
+ tzcnt r8d, wd
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movd m0, r7m
+ pshuflw m0, m0, q0000 ; sign
+ punpcklqdq m0, m0
+ movsxd r8, dword [r7+r8*4]
+ mova reg_pw_8, [pw_8]
+ mova reg_pw_27, [pw_26] ; 64 - 38
+ mova reg_pw_2048, [pw_2048]
+ mova m6, [pw_258] ; 64 * 4 + 2
+ psubw m6, m0
+ add r8, r7
+ W_MASK_420 0, 4
+ lea stride3q, [strideq*3]
+ jmp r8
+ %define dst_bak r8
+ %define loop_w r7
+ %define orig_w wq
+%else
+cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
+ tzcnt r6d, r4m
+ mov wd, w_mask_420_ssse3_table
+ add wd, [wq+r6*4]
+ mov maskq, r6mp
+ movd m0, r7m
+ pshuflw m0, m0, q0000 ; sign
+ punpcklqdq m0, m0
+ mova m6, [pw_258] ; 64 * 4 + 2
+ psubw m6, m0
+ W_MASK_420 0, 4
+ lea stride3q, [strideq*3]
+ jmp wd
+ %define dst_bak r0m
+ %define loop_w r6q
+ %define orig_w r4m
+ %define hd dword r5m
+%endif
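+; .w4: one W_MASK_420 covers a 4x4 block. The pair sums in m4 are regrouped
+; with pshufd so the 1st+2nd and 3rd+4th rows collapse into the four 2x2
+; mask bytes written per iteration.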
+.w4_loop:
+ add tmp1q, 2*16
+ add tmp2q, 2*16
+ W_MASK_420 0, 4
+ lea dstq, [dstq+strideq*4]
+ add maskq, 4
+.w4:
+ movd [dstq ], m0 ; copy m0[0]
+ pshuflw m1, m0, q1032
+ movd [dstq+strideq*1], m1 ; copy m0[1]
+ punpckhqdq m0, m0
+ movd [dstq+strideq*2], m0 ; copy m0[2]
+ psrlq m0, 32
+ movd [dstq+stride3q ], m0 ; copy m0[3]
+ pshufd m5, m4, q3131 ; DBDB: pair sums of the 2nd and 4th rows
+ pshufd m4, m4, q2020 ; CACA: pair sums of the 1st and 3rd rows
+ psubw m1, m6, m4 ; m6 == 64 * 4 + 2 - sign
+ psubw m1, m5 ; 2x2 sums: rows 1+2 | rows 3+4, repeated
+ psrlw m1, 2 ; >> 2
+ packuswb m1, m1
+ movd [maskq], m1
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ add tmp1q, 2*16
+ add tmp2q, 2*16
+ W_MASK_420 0, 4
+ lea dstq, [dstq+strideq*2]
+ add maskq, 4
+.w8:
+ movq [dstq ], m0
+ movhps [dstq+strideq*1], m0
+ pshufd m1, m4, q3232
+ psubw m0, m6, m4
+ psubw m0, m1
+ psrlw m0, 2
+ packuswb m0, m0
+ movd [maskq], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
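+; Widths >= 16 process two rows at a time, 16 pixels per inner iteration:
+; the first W_MASK_420_B reads the current row, the second the same columns
+; of the next row (tmp offset wq*2), and their pair sums give 8 mask bytes.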
+.w16: ; w32/64/128
+ mov dst_bak, dstq
+ mov loop_w, orig_w ; use width as counter
+%if ARCH_X86_32
+ mov wq, orig_w ; because we altered it in 32bit setup
+%endif
+ jmp .w16ge_inner_loop_first
+.w16ge_loop:
+ lea tmp1q, [tmp1q+wq*2] ; skip the second row (already read at offset wq*2)
+ lea tmp2q, [tmp2q+wq*2] ; skip the second row (already read at offset wq*2)
+ lea dstq, [dstq+strideq*2]
+ mov dst_bak, dstq
+ mov loop_w, orig_w
+.w16ge_inner_loop:
+ W_MASK_420_B 0, 4
+.w16ge_inner_loop_first:
+ mova [dstq ], m0
+ W_MASK_420_B wq*2, 5 ; same columns of the next row (tmp offset = width * 2 bytes)
+ mova [dstq+strideq*1], m0
+ psubw m1, m6, m4 ; m6 == 64 * 4 + 2 - sign
+ psubw m1, m5 ; subtract the second row's pair sums
+ psrlw m1, 2 ; >> 2
+ packuswb m1, m1
+ movq [maskq], m1
+ add tmp1q, 2*16
+ add tmp2q, 2*16
+ add maskq, 8
+ add dstq, 16
+ sub loop_w, 16
+ jg .w16ge_inner_loop
+ mov dstq, dst_bak
+ sub hd, 2
+ jg .w16ge_loop
+ RET
+
+%undef reg_pw_8
+%undef reg_pw_27
+%undef reg_pw_2048
+%undef dst_bak
+%undef loop_w
+%undef orig_w
%undef hd