ref: e308ae49b39502ee871c0df8b64fb79561ce17c5
parent: 9e36b9b00122364cf9391380c336595d7dc6e957
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Mon Mar 30 10:49:16 EDT 2020
x86: add SSSE3 version of mc.resize()

resize_8bpc_c: 1613670.2
resize_8bpc_ssse3: 110469.5
resize_8bpc_avx2: 93580.6
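
For reference, a rough scalar model of the horizontal resize step this routine
vectorizes (an illustrative sketch only, not dav1d's own C reference; the
filter table layout, the 14-bit fixed-point mx/dx step, the negated
coefficients and the (x+64)>>7 rounding are inferred from the asm below):

    #include <stddef.h>
    #include <stdint.h>

    static inline int clip(const int v, const int lo, const int hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    /* Scalar sketch of the 8-tap horizontal resize. filter[64][8] stands in
     * for the shared resize_filter table; its coefficients are assumed to be
     * stored negated (hence the subtraction here and pw_m256 in the asm). */
    static void resize_sketch(uint8_t *dst, const ptrdiff_t dst_stride,
                              const uint8_t *src, const ptrdiff_t src_stride,
                              const int dst_w, int h, const int src_w,
                              const int dx, const int mx0,
                              const int8_t filter[64][8])
    {
        do {
            int mx = mx0 - (4 << 14); /* bias so mx>>14 is the leftmost tap */
            for (int x = 0; x < dst_w; x++, mx += dx) {
                const int8_t *const F = filter[(mx >> 8) & 63];
                const int src_x = mx >> 14; /* arithmetic shift, as psrad */
                int sum = 0;
                for (int k = 0; k < 8; k++) {
                    /* out-of-frame taps replicate the edge pixel; the asm
                     * clamps the load base and fixes up with pshufb */
                    sum -= F[k] * src[clip(src_x + k, 0, src_w - 1)];
                }
                dst[x] = (uint8_t)clip((sum + 64) >> 7, 0, 255);
            }
            dst += dst_stride;
            src += src_stride;
        } while (--h);
    }
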
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -116,6 +116,7 @@
decl_emu_edge_fn(dav1d_emu_edge_ssse3);
decl_resize_fn(dav1d_resize_avx2);
+decl_resize_fn(dav1d_resize_ssse3);
COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
@@ -170,6 +171,7 @@
c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
c->emu_edge = dav1d_emu_edge_ssse3;
+ c->resize = dav1d_resize_ssse3;
#endif
if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -57,7 +57,15 @@
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
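+; resize: pb_8x0_8x8 biases pshufb indices so each qword addresses its own
+; 8-byte load; resize_mul holds per-lane multiples of dx; resize_shuf, read at
+; an offset, yields indices clamped to [0,7] for left/right edge replication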
+pb_8x0_8x8: times 8 db 0
+ times 8 db 8
+resize_mul: dd 0, 1, 2, 3
+resize_shuf: times 5 db 0
+ db 1, 2, 3, 4, 5, 6
+ times 5+16 db 7
+
pb_64: times 16 db 64
+pw_m256: times 8 dw -256 ; pmulhrsw by -256: negate and round, (x+64)>>7
pw_8: times 8 dw 8
pw_26: times 8 dw 26
pw_34: times 8 dw 34
@@ -67,6 +75,7 @@
pw_6903: times 8 dw 6903
pw_8192: times 8 dw 8192
pd_32: times 4 dd 32
+pd_63: times 4 dd 63
pd_512: times 4 dd 512
pd_16384: times 4 dd 16384
pd_32768: times 4 dd 32768
@@ -5081,3 +5090,218 @@
%undef reg_dstride
%undef reg_blkm
%undef reg_tmp
+
+cextern resize_filter
+
+%macro SCRATCH 3
+%if ARCH_X86_32
+ mova [rsp+%3*mmsize], m%1
+%define m%2 [rsp+%3*mmsize]
+%else
+ SWAP %1, %2
+%endif
+%endmacro
+
+INIT_XMM ssse3
+%if ARCH_X86_64
+cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%elif STACK_ALIGNMENT >= 16
+cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%else
+cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+%endif
+ movifnidn dstq, dstmp
+ movifnidn srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+ movifnidn dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+ movifnidn hd, hm
+%endif
+ sub dword mx0m, 4<<14 ; bias mx so that mx>>14 is the leftmost of the 8 taps
+ sub dword src_wm, 8 ; upper bound for clamping the 8-byte load base
+ movd m7, dxm
+ movd m6, mx0m
+ movd m5, src_wm
+ pshufd m7, m7, q0000
+ pshufd m6, m6, q0000
+ pshufd m5, m5, q0000
+
+%if ARCH_X86_64
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+ LEA r7, $$
+%define base r7-$$
+%else
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+%if STACK_ALIGNMENT >= 16
+ LEA r6, $$
+%define base r6-$$
+%else
+ LEA r4, $$
+%define base r4-$$
+%endif
+%endif
+
+%if ARCH_X86_64
+ mova m12, [base+pw_m256]
+ mova m11, [base+pd_63]
+ mova m10, [base+pb_8x0_8x8]
+%else
+%define m12 [base+pw_m256]
+%define m11 [base+pd_63]
+%define m10 [base+pb_8x0_8x8]
+%endif
+ pmaddwd m4, m7, [base+resize_mul] ; dx*[0,1,2,3]
+ pslld m7, 2 ; dx*4
+ pslld m5, 14
+ paddd m6, m4 ; mx+[0..3]*dx
+ SCRATCH 7, 15, 0
+ SCRATCH 6, 14, 1
+ SCRATCH 5, 13, 2
+
+ ; m10 = pb_8x0_8x8, m11 = 0x3f, m12 = pmulhrsw constant for x=(x+64)>>7
+ ; m13 = (src_w-8)<<14, m14 = mx+[0..3]*dx, m15 = dx*4
+
+.loop_y:
+ xor xd, xd
+ mova m0, m14 ; per-line working version of mx
+
+.loop_x:
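+ ; clamp the four source positions to [0, (src_w-8)<<14]; the residual kept
+ ; in m3 selects an edge-replication shuffle further down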
+ pxor m1, m1
+ pcmpgtd m1, m0
+ pandn m1, m0
+ psrad m2, m0, 8 ; filter offset (unmasked)
+ pcmpgtd m3, m13, m1
+ pand m1, m3
+ pandn m3, m13
+ por m1, m3
+ psubd m3, m0, m1 ; pshufb offset
+ psrad m1, 14 ; clipped src_x offset
+ psrad m3, 14 ; pshufb edge_emu offset
+ pand m2, m11 ; filter offset (masked)
+
+ ; load source pixels
+%if ARCH_X86_64
+ movd r8d, xm1
+ pshuflw xm1, xm1, q3232
+ movd r9d, xm1
+ punpckhqdq xm1, xm1
+ movd r10d, xm1
+ psrlq xm1, 32
+ movd r11d, xm1
+ movq xm4, [srcq+r8]
+ movq xm5, [srcq+r10]
+ movhps xm4, [srcq+r9]
+ movhps xm5, [srcq+r11]
+%else
+ movd r3d, xm1
+ pshufd xm1, xm1, q3312
+ movd r1d, xm1
+ pshuflw xm1, xm1, q3232
+ movq xm4, [srcq+r3]
+ movq xm5, [srcq+r1]
+ movd r3d, xm1
+ punpckhqdq xm1, xm1
+ movd r1d, xm1
+ movhps xm4, [srcq+r3]
+ movhps xm5, [srcq+r1]
+%endif
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ ; (ptest is SSE4.1, so test m3 for all-zero with pcmpeqb+pmovmskb instead)
+ pxor m6, m6
+ pcmpeqb m6, m3
+%if ARCH_X86_64
+ pmovmskb r8d, m6
+ cmp r8d, 0xffff
+%else
+ pmovmskb r3d, m6
+ cmp r3d, 0xffff
+%endif
+ je .filter
+
+%if ARCH_X86_64
+ movd r8d, xm3
+ pshuflw xm3, xm3, q3232
+ movd r9d, xm3
+ punpckhqdq xm3, xm3
+ movd r10d, xm3
+ psrlq xm3, 32
+ movd r11d, xm3
+ movsxd r8, r8d
+ movsxd r9, r9d
+ movsxd r10, r10d
+ movsxd r11, r11d
+ movq xm6, [base+resize_shuf+4+r8]
+ movq xm7, [base+resize_shuf+4+r10]
+ movhps xm6, [base+resize_shuf+4+r9]
+ movhps xm7, [base+resize_shuf+4+r11]
+%else
+ movd r3d, xm3
+ pshufd xm3, xm3, q3312
+ movd r1d, xm3
+ pshuflw xm3, xm3, q3232
+ movq xm6, [base+resize_shuf+4+r3]
+ movq xm7, [base+resize_shuf+4+r1]
+ movd r3d, xm3
+ punpckhqdq xm3, xm3
+ movd r1d, xm3
+ movhps xm6, [base+resize_shuf+4+r3]
+ movhps xm7, [base+resize_shuf+4+r1]
+%endif
+
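+ ; m6/m7 hold edge-clamped shuffle masks; bias the high qword by 8 so each
+ ; qword indexes its own gathered 8 bytes, then replicate the edge pixels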
+ paddb m6, m10
+ paddb m7, m10
+ pshufb m4, m6
+ pshufb m5, m7
+
+.filter:
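+ ; gather the four 8-tap filter rows selected by (mx >> 8) & 63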
+%if ARCH_X86_64
+ movd r8d, xm2
+ pshuflw xm2, xm2, q3232
+ movd r9d, xm2
+ punpckhqdq xm2, xm2
+ movd r10d, xm2
+ psrlq xm2, 32
+ movd r11d, xm2
+ movq xm6, [base+resize_filter+r8*8]
+ movq xm7, [base+resize_filter+r10*8]
+ movhps xm6, [base+resize_filter+r9*8]
+ movhps xm7, [base+resize_filter+r11*8]
+%else
+ movd r3d, xm2
+ pshufd xm2, xm2, q3312
+ movd r1d, xm2
+ pshuflw xm2, xm2, q3232
+ movq xm6, [base+resize_filter+r3*8]
+ movq xm7, [base+resize_filter+r1*8]
+ movd r3d, xm2
+ punpckhqdq xm2, xm2
+ movd r1d, xm2
+ movhps xm6, [base+resize_filter+r3*8]
+ movhps xm7, [base+resize_filter+r1*8]
+%endif
+
+ pmaddubsw m4, m6
+ pmaddubsw m5, m7
+ phaddw m4, m5
+ phaddsw m4, m4
+ pmulhrsw m4, m12 ; x=(x+64)>>7
+ packuswb m4, m4
+ movd [dstq+xq], m4
+
+ paddd m0, m15
+ add xd, 4
+%if STACK_ALIGNMENT >= 16
+ cmp xd, dst_wd
+%else
+ cmp xd, dst_wm
+%endif
+ jl .loop_x
+
+%if ARCH_X86_64
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+%else
+ add dstq, dst_stridem
+ add srcq, src_stridem
+ dec dword r5m
+%endif
+ jg .loop_y
+ RET