ref: 6044a1ae4a606ec5940bbc28ada4a5727c9358dc
parent: 08184fd81d9e04f5cf6e8a92dc0461295c863e0a
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Fri Nov 23 06:37:58 EST 2018
add SSSE3 avg/w_avg/mask

Adaptation of the AVX2 code
--- a/src/meson.build
+++ b/src/meson.build
@@ -121,6 +121,7 @@
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/mc.asm',
+ 'x86/mc_ssse3.asm',
)
# Compile the ASM sources with NASM
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -51,8 +51,11 @@
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_avg_fn(dav1d_avg_avx2);
+decl_avg_fn(dav1d_avg_ssse3);
decl_w_avg_fn(dav1d_w_avg_avx2);
+decl_w_avg_fn(dav1d_w_avg_ssse3);
decl_mask_fn(dav1d_mask_avx2);
+decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_dir_fn(dav1d_blend_v_avx2);
@@ -70,7 +73,18 @@
c->mct[type] = dav1d_prep_##name##_##suffix
const unsigned flags = dav1d_get_cpu_flags();
- if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
+ return;
+
+#if BITDEPTH == 8 && ARCH_X86_64
+ c->avg = dav1d_avg_ssse3;
+ c->w_avg = dav1d_w_avg_ssse3;
+ c->mask = dav1d_mask_ssse3;
+#endif
+
+ if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
+ return;
#if BITDEPTH == 8 && ARCH_X86_64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
--- /dev/null
+++ b/src/x86/mc_ssse3.asm
@@ -0,0 +1,251 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; Copyright © 2018, VideoLabs
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "config.asm"
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 16
+
+pw_1024: times 8 dw 1024
+pw_2048: times 8 dw 2048
+
+%macro BIDIR_JMP_TABLE 1-*
+ ; base is biased by 2*%2 so that [%1_table + tzcnt(w)*4] selects the
+ ; entry for width w (worked example after the table definitions below)
+ %xdefine %1_table (%%table - 2*%2)
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ ; macro-local label marking the start of the table
+ %%table:
+ %rep %0 - 1 ; one dd entry per width argument
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
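+
+; Dispatch walkthrough (descriptive only): wq = tzcnt(w), so for w = 8 the
+; load [<op>_ssse3_table + 3*4] hits the second dd entry thanks to the -2*4
+; bias above; that entry holds ".w8 - <op>_ssse3_table", and adding the table
+; base back in each cglobal below yields the absolute label for "jmp wq".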
+
+SECTION .text
+
+INIT_XMM ssse3
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
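+; BIDIR_FN expands the per-width store loops around one of the AVG/W_AVG/MASK
+; macros below: each op invocation consumes 16 int16 coefs from each of
+; tmp1/tmp2 and leaves 16 output pixels in m0, and the matching <op>_INC_PTR
+; macro advances the source pointers once per loop iteration.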
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w4: ; store 4 rows of 4 pixels, one dword of m0 per row
+ movd [dstq ], m0 ; row 0 = dw[0]
+ pshuflw m1, m0, q1032 ; swap dw[0] and dw[1] within the low qword
+ movd [dstq+strideq*1], m1 ; row 1 = dw[1]
+ punpckhqdq m0, m0 ; move dw[2],dw[3] into the low qword
+ movd [dstq+strideq*2], m0 ; row 2 = dw[2]
+ psrlq m0, 32 ; shift dw[3] down into the low dword
+ movd [dstq+stride3q ], m0 ; row 3 = dw[3]
+ sub hd, 4
+ jg .w4_loop
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w8:
+ movq [dstq ], m0
+ movhps [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq]
+.w16:
+ mova [dstq ], m0
+ dec hd
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq]
+.w32:
+ mova [dstq ], m0
+ %1 2
+ mova [dstq + 16 ], m0
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 8
+ %1 0
+ add dstq, strideq
+.w64:
+ %assign i 0
+ %rep 4
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 4
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1_INC_PTR 16
+ %1 0
+ add dstq, strideq
+.w128:
+ %assign i 0
+ %rep 8
+ mova [dstq + i*16 ], m0
+ %assign i i+1
+ %if i < 8
+ %1 2*i
+ %endif
+ %endrep
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ ; average the int16 intermediates from tmp1/tmp2 and pack to uint8 pixels
+ mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
+ paddw m0, [tmp2q+(%1+0)*mmsize] ; add 8 int16 coefs from tmp2
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m2 ; (a + b + 16) >> 5, via rounded multiply by 1024
+ pmulhrsw m1, m2
+ packuswb m0, m1 ; pack words to bytes with unsigned saturation
+%endmacro
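+
+; Roughly equivalent scalar C for the 8bpc case (an illustrative sketch, not
+; part of the patch):
+;
+;   for (int x = 0; x < w; x++) {
+;       const int v = (tmp1[x] + tmp2[x] + 16) >> 5;
+;       dst[x] = v < 0 ? 0 : v > 255 ? 255 : v; // packuswb does the clamp
+;   }
+;
+; pmulhrsw against pw_1024 implements the "+ 16, >> 5" rounding in a single
+; instruction.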
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+ lea r6, [avg_ssse3_table]
+ tzcnt wd, wm ; trailing zero count = log2(width)
+ movifnidn hd, hm ; load h from the stack unless it is already in a register
+ movsxd wq, dword [r6+wq*4] ; sign-extend the jump-table entry for this width into wq
+ mova m2, [pw_1024+r6-avg_ssse3_table] ; m2 = pw_1024, the pmulhrsw rounding constant
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m2, m0, [tmp1q+(%1+0)*mmsize]
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m3, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2 ; compensate for the weight only being half
+ paddw m3, m3 ; of what it should be
+ pmulhw m2, m4 ; (b-a) * (-weight << 12)
+ pmulhw m3, m4 ; (b-a) * (-weight << 12)
+ paddw m0, m2 ; ((b-a) * -weight) + b
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
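+
+; Roughly equivalent scalar C for the 8bpc case (an illustrative sketch, not
+; part of the patch; weight is the blend factor loaded from r6m below):
+;
+;   for (int x = 0; x < w; x++) {
+;       const int v = (tmp1[x] * weight + tmp2[x] * (16 - weight) + 128) >> 8;
+;       dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
+;   }
+;
+; The asm keeps -(weight << 11) in m4 and doubles (b - a) so that pmulhw
+; yields ((a - b) * weight) >> 4; pmulhrsw against pw_2048 then adds 8 and
+; shifts right by 4.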
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+ lea r6, [w_avg_ssse3_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movd m0, r6m ; weight
+ pshuflw m0, m0, q0000 ; broadcast weight to the low 4 words
+ punpcklqdq m0, m0 ; ... and to all 8 words
+ movsxd wq, dword [r6+wq*4]
+ pxor m4, m4
+ psllw m0, 11 ; can't shift by 12, sign bit must be preserved
+ psubw m4, m0 ; m4 = -(weight << 11)
+ mova m5, [pw_2048+r6-w_avg_ssse3_table]
+ add wq, r6
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+ mova m3, [maskq+(%1+0)*(mmsize/2)]
+ mova m0, [tmp2q+(%1+0)*mmsize] ; b
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
+ mova m6, m3 ; m
+ psubb m3, m4, m6 ; -m
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3 ; -m << 1
+ punpcklbw m2, m4, m3 ; -m << 9 (the -2m bytes land in the high byte of each word)
+ pmulhw m1, m2 ; ((b - a) * (-m << 10)) >> 16
+ paddw m0, m1 ; + b
+ mova m1, [tmp2q+(%1+1)*mmsize] ; b
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
+ paddw m2, m2 ; (b - a) << 1
+ mova m6, m3 ; (-m << 1)
+ punpckhbw m3, m4, m6 ; (-m << 9)
+ pmulhw m2, m3 ; ((b - a) * (-m << 10)) >> 16
+ paddw m1, m2 ; + b
+ pmulhrsw m0, m5 ; round
+ pmulhrsw m1, m5 ; round
+ packuswb m0, m1 ; pack words to bytes with unsigned saturation
+%endmacro
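+
+; Roughly equivalent scalar C for the 8bpc case (an illustrative sketch, not
+; part of the patch; msk points at the per-pixel 0..64 blend mask):
+;
+;   for (int x = 0; x < w; x++) {
+;       const int v = (tmp1[x] * msk[x] + tmp2[x] * (64 - msk[x]) + 512) >> 10;
+;       dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
+;   }
+;
+; The mask bytes are negated and doubled, then paired with zero bytes so that
+; each word holds -msk << 9; doubling (b - a) makes the pmulhw result
+; ((a - b) * msk) >> 6, and pmulhrsw against pw_2048 adds 8 and shifts right
+; by 4.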
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*mmsize/2
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
+ lea r7, [mask_ssse3_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ pxor m4, m4 ; zero register used to negate and widen the mask bytes
+ mova m5, [pw_2048+r7-mask_ssse3_table]
+ add wq, r7
+ BIDIR_FN MASK
+
+%endif ; ARCH_X86_64