ref: 71e13008acae0a3cf6cee1790914248ee1140c89
parent: 8c5d34c85613aa8c6ba2f6da4cf8ba6b6e41ef8f
author: Xuefeng Jiang <xuefeng@multicorewareinc.com>
date: Thu Dec 20 06:49:58 EST 2018
Add SSSE3 implementation for dav1d_ipred_v and dav1d_ipred_dc Cycle times: intra_pred_dc_w4_8bpc_c: 1051.4 intra_pred_dc_w4_8bpc_ssse3: 58.8 intra_pred_dc_w8_8bpc_c: 1587.6 intra_pred_dc_w8_8bpc_ssse3: 75.3 intra_pred_dc_w16_8bpc_c: 2526.2 intra_pred_dc_w16_8bpc_ssse3: 103.5 intra_pred_dc_w32_8bpc_c: 2646.6 intra_pred_dc_w32_8bpc_ssse3: 179.5 intra_pred_dc_w64_8bpc_c: 4084.6 intra_pred_dc_w64_8bpc_ssse3: 356.1 intra_pred_v_w4_8bpc_c: 468.5 intra_pred_v_w4_8bpc_ssse3: 46.8 intra_pred_v_w8_8bpc_c: 839.1 intra_pred_v_w8_8bpc_ssse3: 56.7 intra_pred_v_w16_8bpc_c: 1750.5 intra_pred_v_w16_8bpc_ssse3: 73.0 intra_pred_v_w32_8bpc_c: 1552.5 intra_pred_v_w32_8bpc_ssse3: 135.4 intra_pred_v_w64_8bpc_c: 2463.6 intra_pred_v_w64_8bpc_ssse3: 305.6
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -51,7 +51,9 @@
decl_pal_pred_fn(dav1d_pal_pred_avx2);
+decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -59,7 +61,9 @@
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
- c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
+ c->intra_pred[DC_PRED] = dav1d_ipred_dc_ssse3;
+ c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
+ c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -39,11 +39,18 @@
%endrep
%endmacro
-JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
+; Layout of the ipred_dc table: entries 0-4 (h4-h64) and 5-9 (w4-w64) are the two
+; dispatch stages of ipred_dc, entries 10-14 (s4-s64) are its store loops.
+; ipred_dc_splat_ssse3_table names the same table shifted past the first ten 4-byte
+; entries, so code that only needs a store loop can index it with tzcnt(width).
+; NOTE(review): the "-10*4" on the s* entries presumably cancels that shift, because
+; users add the shifted base to the loaded entry - confirm against the JMP_TABLE body.
+%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
+JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
+ s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
+
SECTION .text
-
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;---------------------------------------------------------------------------------------
%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8
pshuflw m1, m0, %3 ; extend 8 byte for 2 pos
punpcklqdq m1, m1
@@ -110,3 +117,262 @@
IPRED_H 32
.w64:
IPRED_H 64
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;
+; Vertical prediction. Loads the 64 bytes starting at topleft+1 into m0-m3, then
+; jumps through ipred_dc_splat_ssse3_table (indexed by tzcnt(width)) so that the
+; matching ipred_dc .sN store loop copies that row into `height` rows of dst.
+; NOTE(review): the prototype says `int`, but nothing here sets up a return value -
+; confirm the real return type against decl_angular_ipred_fn.
+;---------------------------------------------------------------------------------------
+cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
+ LEA r5, ipred_dc_splat_ssse3_table ; r5 = base of the s4-s64 part of the ipred_dc table
+ tzcnt wd, wm ; w = ctz(width): table index
+ movu m0, [tlq+ 1] ; all four loads happen for every width; the
+ movu m1, [tlq+17] ; selected store loop only uses as many
+ movu m2, [tlq+33] ; registers as one row needs
+ movu m3, [tlq+49]
+ movifnidn hd, hm ; h = height (row counter of the store loop)
+ movsxd wq, [r5+wq*4] ; 32-bit table entry, sign-extended
+ add wq, r5 ; entry + base = address of the store loop
+ lea stride3q, [strideq*3] ; stride3 = 3*stride; stride3 is the sixth named register
+ ; (r5 under x86inc naming), so it may only be written after the last use of the table base
+ jmp wq
+
+;---------------------------------------------------------------------------------------
+;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
+; const int width, const int height, const int a);
+;
+; DC prediction. Sums the `height` bytes ending just before topleft and the `width`
+; bytes starting at topleft+1, adds (width + height) >> 1 for rounding, divides by
+; width + height and fills the width x height block at dst with the resulting byte.
+; The division is a shift by ctz(width + height), followed for non-square blocks by a
+; multiply with a 16-bit fixed-point reciprocal of 3 or 5 (0x5556 / 0x3334).
+; Dispatch is two-stage through ipred_dc_ssse3_table: .hN (chosen by height) sums the
+; first run and jumps to .wN (chosen by width), which finishes the sum and falls
+; through into the .sN store loop.
+; Registers set up here for the later stages: m3 = all ones, m4 = rounding term,
+; m5 = ctz(width + height).
+; NOTE(review): the prototype says `int`, but nothing here sets up a return value -
+; confirm the real return type against decl_angular_ipred_fn.
+;---------------------------------------------------------------------------------------
+cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
+ movifnidn hd, hm
+ movifnidn wd, wm
+ tzcnt r6d, hd ; r6 = ctz(height): index of the .hN entry
+ lea r5d, [wq+hq] ; r5 = width + height
+ movd m4, r5d ; m4 = width + height (halved below)
+ tzcnt r5d, r5d ; r5 = ctz(width + height)
+ movd m5, r5d ; m5 = shift count used by .w8-.w64
+ LEA r5, ipred_dc_ssse3_table ; r5 = table base from here on
+ tzcnt wd, wd ; w = ctz(width): index of the .wN entry
+ movsxd r6, [r5+r6*4] ; .hN entry
+ movsxd wq, [r5+wq*4+20] ; +20 skips the five 4-byte .hN entries
+ pcmpeqd m3, m3 ; m3 = all ones, i.e. -1 in every byte and word lane
+ psrlw m4, 1 ; dc = (width + height) >> 1;
+ add r6, r5 ; entries are relative to the table base
+ add wq, r5
+ lea stride3q, [strideq*3] ; stride3 is the sixth named register (r5 under x86inc
+ ; naming), so it may only be written after the two adds above
+ jmp r6 ; .hN stage; it ends with "jmp wq" into the .wN stage
+; Height-4 first stage, width-4 second stage and the 4-byte-wide store loop.
+.h4:
+ movd m0, [tlq-4] ; the 4 bytes ending just before tl
+ pmaddubsw m0, m3 ; bytes * -1 with adjacent pairs added: negated pair sums (words)
+ jmp wq ; continue in the .wN stage chosen from width
+.w4:
+ ; In: m0 = negated pair sums from .hN, m3 = all ones, m4 = rounding term.
+ movd m1, [tlq+1] ; the 4 bytes starting at tl+1
+ pmaddubsw m1, m3 ; negated pair sums of those bytes
+ psubw m0, m4 ; fold in the rounding term (negated, like the sums)
+ paddw m0, m1
+ pmaddwd m0, m3 ; words * -1 with pairs added: positive dword partial sums
+ cmp hd, 4
+ jg .w4_mul ; height > 4: partials still need folding and 4 + height is not a power of two
+ psrlw m0, 3 ; dc >>= ctz(width + height);
+ jmp .w4_end
+.w4_mul:
+ punpckhqdq m1, m0, m0 ; m1 = high qword of m0 in both halves
+ paddw m0, m1 ; fold the high half onto the low half
+ psrlq m1, m0, 32
+ paddw m0, m1 ; low dword = total sum + rounding
+ psrlw m0, 2 ; ctz(4 + height) is 2 for every table height above 4
+ mov r6d, 0x5556 ; ~2^16/3, for height 8 (4 + 8 = 4*3)
+ mov r2d, 0x3334 ; ~2^16/5, for height 16 (4 + 16 = 4*5); r2 is tl, which is not read again
+ test hd, 8
+ cmovz r6d, r2d ; bit 3 of height clear -> take the 1/5 factor
+ movd m5, r6d
+ pmulhuw m0, m5 ; high 16 bits of the product = dc
+.w4_end:
+ pxor m1, m1
+ pshufb m0, m1 ; all-zero shuffle mask: broadcast byte 0 (dc) to every byte
+.s4:
+ ; Store loop, also listed as entry s4 of the jump table: writes the low 4 bytes
+ ; of m0 to four rows per iteration until h rows are done.
+ movd [dstq+strideq*0], m0
+ movd [dstq+strideq*1], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s4
+ RET
+ALIGN function_align
+; Height-8 first stage, width-8 second stage and the 8-byte-wide store loop.
+.h8:
+ movq m0, [tlq-8] ; the 8 bytes ending just before tl
+ pmaddubsw m0, m3 ; negated pair sums (words)
+ jmp wq
+.w8:
+ ; In: m0 = negated pair sums from .hN, m3 = all ones, m4 = rounding term,
+ ; m5 = ctz(width + height).
+ movq m1, [tlq+1] ; the 8 bytes starting at tl+1
+ pmaddubsw m1, m3
+ psubw m4, m0 ; m4 = rounding - m0, lane by lane
+ punpckhqdq m0, m0 ; m0 = its own high qword in both halves
+ psubw m0, m4 ; low qword = high + low sums - rounding (all negated)
+ paddw m0, m1 ; add the negated sums of the tl+1 run
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 (swaps the two dwords of the low qword)
+ paddw m0, m1
+ pmaddwd m0, m3 ; negate and add the last word pair: low dword = sum + rounding
+ psrlw m0, m5 ; >> ctz(width + height)
+ cmp hd, 8
+ je .w8_end ; square block: the shift was the whole division
+ mov r6d, 0x5556 ; ~2^16/3 (8 + 4 = 4*3, 8 + 16 = 8*3)
+ mov r2d, 0x3334 ; ~2^16/5 (8 + 32 = 8*5); r2 is tl, which is not read again
+ cmp hd, 32
+ cmovz r6d, r2d ; height 32 -> take the 1/5 factor
+ movd m1, r6d
+ pmulhuw m0, m1 ; high 16 bits of the product = dc
+.w8_end:
+ pxor m1, m1
+ pshufb m0, m1 ; all-zero shuffle mask: broadcast byte 0 (dc) to every byte
+.s8:
+ ; Store loop, also listed as entry s8 of the jump table: writes the low 8 bytes
+ ; of m0 to four rows per iteration until h rows are done.
+ movq [dstq+strideq*0], m0
+ movq [dstq+strideq*1], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s8
+ RET
+ALIGN function_align
+; Height-16 first stage, width-16 second stage and the 16-byte-wide store loop.
+.h16:
+ mova m0, [tlq-16] ; the 16 bytes ending just before tl (aligned load:
+ ; presumably tl is 16-byte aligned - confirm with callers)
+ pmaddubsw m0, m3 ; negated pair sums (words)
+ jmp wq
+.w16:
+ ; In: m0 = negated pair sums from .hN, m3 = all ones, m4 = rounding term,
+ ; m5 = ctz(width + height).
+ movu m1, [tlq+1] ; the 16 bytes starting at tl+1
+ pmaddubsw m1, m3
+ paddw m0, m1 ; both runs combined, eight negated word sums
+ psubw m4, m0 ; m4 = rounding - m0, lane by lane
+ punpckhqdq m0, m0 ; m0 = its own high qword in both halves
+ psubw m0, m4 ; low qword = high + low sums - rounding (all negated)
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 (swaps the two dwords of the low qword)
+ paddw m0, m1
+ pmaddwd m0, m3 ; negate and add the last word pair: low dword = sum + rounding
+ psrlw m0, m5 ; >> ctz(width + height)
+ cmp hd, 16
+ je .w16_end ; square block: the shift was the whole division
+ mov r6d, 0x5556 ; ~2^16/3 (16 + 8 = 8*3, 16 + 32 = 16*3)
+ mov r2d, 0x3334 ; ~2^16/5 (16 + 4 = 4*5, 16 + 64 = 16*5); r2 is tl, not read again
+ test hd, 8|32
+ cmovz r6d, r2d ; height is neither 8 nor 32 -> take the 1/5 factor
+ movd m1, r6d
+ pmulhuw m0, m1 ; high 16 bits of the product = dc
+.w16_end:
+ pxor m1, m1
+ pshufb m0, m1 ; all-zero shuffle mask: broadcast byte 0 (dc) to every byte
+.s16:
+ ; Store loop, also listed as entry s16 of the jump table: writes m0 to four
+ ; rows per iteration until h rows are done. Aligned stores: presumably every
+ ; dst row is 16-byte aligned - confirm with callers.
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s16
+ RET
+ALIGN function_align
+; Height-32 first stage, width-32 second stage and the 32-byte-wide store loop.
+.h32:
+ mova m0, [tlq-32] ; the 32 bytes ending just before tl, in two aligned
+ pmaddubsw m0, m3 ; loads (presumably tl is 16-byte aligned - confirm
+ mova m2, [tlq-16] ; with callers); each is reduced to negated pair sums
+ pmaddubsw m2, m3
+ paddw m0, m2
+ jmp wq
+.w32:
+ ; In: m0 = negated pair sums from .hN, m3 = all ones, m4 = rounding term,
+ ; m5 = ctz(width + height).
+ movu m1, [tlq+1] ; the 32 bytes starting at tl+1
+ pmaddubsw m1, m3
+ movu m2, [tlq+17]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1 ; both runs combined, eight negated word sums
+ psubw m4, m0 ; m4 = rounding - m0, lane by lane
+ punpckhqdq m0, m0 ; m0 = its own high qword in both halves
+ psubw m0, m4 ; low qword = high + low sums - rounding (all negated)
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 (swaps the two dwords of the low qword)
+ paddw m0, m1
+ pmaddwd m0, m3 ; negate and add the last word pair: low dword = sum + rounding
+ psrlw m0, m5 ; >> ctz(width + height)
+ cmp hd, 32
+ je .w32_end ; square block: the shift was the whole division
+ mov r6d, 0x5556 ; ~2^16/3 (32 + 16 = 16*3, 32 + 64 = 32*3)
+ mov r2d, 0x3334 ; ~2^16/5 (32 + 8 = 8*5); r2 is tl, which is not read again
+ test hd, 64|16
+ cmovz r6d, r2d ; height is neither 16 nor 64 -> take the 1/5 factor
+ movd m1, r6d
+ pmulhuw m0, m1 ; high 16 bits of the product = dc
+.w32_end:
+ pxor m1, m1
+ pshufb m0, m1 ; all-zero shuffle mask: broadcast byte 0 (dc) to every byte
+ mova m1, m0 ; .s32 stores m0 and m1 as the two halves of a row
+.s32:
+ ; Store loop, also listed as entry s32 of the jump table: writes m0|m1 to four
+ ; rows per iteration until h rows are done. Aligned stores: presumably every
+ ; dst row is 16-byte aligned - confirm with callers.
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq*2], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q], m0
+ mova [dstq+stride3q+16], m1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .s32
+ RET
+ALIGN function_align
+; Height-64 first stage, width-64 second stage and the 64-byte-wide store loop.
+.h64:
+ mova m0, [tlq-64] ; the 64 bytes ending just before tl, in four aligned
+ mova m1, [tlq-48] ; loads (presumably tl is 16-byte aligned - confirm
+ pmaddubsw m0, m3 ; with callers); each is reduced to negated pair sums
+ pmaddubsw m1, m3 ; and accumulated in m0
+ paddw m0, m1
+ mova m1, [tlq-32]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ mova m1, [tlq-16]
+ pmaddubsw m1, m3
+ paddw m0, m1
+ jmp wq
+.w64:
+ ; In: m0 = negated pair sums from .hN, m3 = all ones, m4 = rounding term,
+ ; m5 = ctz(width + height).
+ movu m1, [tlq+ 1] ; the 64 bytes starting at tl+1, accumulated in m1
+ movu m2, [tlq+17]
+ pmaddubsw m1, m3
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+33]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ movu m2, [tlq+49]
+ pmaddubsw m2, m3
+ paddw m1, m2
+ paddw m0, m1 ; both runs combined, eight negated word sums
+ psubw m4, m0 ; m4 = rounding - m0, lane by lane
+ punpckhqdq m0, m0 ; m0 = its own high qword in both halves
+ psubw m0, m4 ; low qword = high + low sums - rounding (all negated)
+ pshuflw m1, m0, q1032 ; psrlq m1, m0, 32 (swaps the two dwords of the low qword)
+ paddw m0, m1
+ pmaddwd m0, m3 ; negate and add the last word pair: low dword = sum + rounding
+ psrlw m0, m5 ; >> ctz(width + height)
+ cmp hd, 64
+ je .w64_end ; square block: the shift was the whole division
+ mov r6d, 0x5556 ; ~2^16/3 (64 + 32 = 32*3)
+ mov r2d, 0x3334 ; ~2^16/5 (64 + 16 = 16*5); r2 is tl, which is not read again
+ test hd, 32
+ cmovz r6d, r2d ; height is not 32 -> take the 1/5 factor
+ movd m1, r6d
+ pmulhuw m0, m1 ; high 16 bits of the product = dc
+.w64_end:
+ pxor m1, m1
+ pshufb m0, m1 ; all-zero shuffle mask: broadcast byte 0 (dc) to every byte
+ mova m1, m0 ; .s64 stores m0-m3 as the four quarters of a row
+ mova m2, m0
+ mova m3, m0
+.s64:
+ ; Store loop, also listed as entry s64 of the jump table: writes m0-m3 to two
+ ; rows per iteration until h rows are done. Aligned stores: presumably every
+ ; dst row is 16-byte aligned - confirm with callers.
+ mova [dstq], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ mova [dstq+strideq], m0
+ mova [dstq+strideq+16], m1
+ mova [dstq+strideq+32], m2
+ mova [dstq+strideq+48], m3
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .s64
+ RET