ref: d5cc850312ac712f33cdd35a09f36f338bd1644a
parent: 7acf1360d6355e626a4fe8198892605d96d0c422
author: Xuefeng Jiang <xuefeng@multicorewareinc.com>
date: Tue Feb 12 10:24:22 EST 2019
Add SSSE3 implementation for pal_pred. pal_pred_w4_8bpc_c: 141.0, pal_pred_w4_8bpc_ssse3: 23.4; pal_pred_w8_8bpc_c: 374.5, pal_pred_w8_8bpc_ssse3: 29.0; pal_pred_w16_8bpc_c: 946.3, pal_pred_w16_8bpc_ssse3: 45.6; pal_pred_w32_8bpc_c: 1946.1, pal_pred_w32_8bpc_ssse3: 92.3; pal_pred_w64_8bpc_c: 4925.9, pal_pred_w64_8bpc_ssse3: 180.1
--- a/src/internal.h
+++ b/src/internal.h
@@ -279,7 +279,7 @@
// FIXME types can be changed to pixel (and dynamically allocated)
// which would make copy/assign operations slightly faster?
uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
- uint16_t pal[3 /* plane */][8 /* palette_idx */];
+ ALIGN(uint16_t pal[3 /* plane */][8 /* palette_idx */], 16);
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
uint8_t txtp_map[32 * 32]; // inter-only
Dav1dWarpedMotionParams warpmv;
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -62,6 +62,8 @@
decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
+decl_pal_pred_fn(dav1d_pal_pred_ssse3);
+
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -77,6 +79,8 @@
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_ssse3;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
+
+ c->pal_pred = dav1d_pal_pred_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -89,6 +89,7 @@
JMP_TABLE ipred_smooth, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
@@ -1179,3 +1180,91 @@
sub hd, 1
jg .w64_loop
RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+; const uint8_t *idx, const int w, const int h);
+;---------------------------------------------------------------------------------------
+cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h  ; x86inc: 4 args preloaded, 6 GPRs, 5 XMM regs
+ mova m4, [palq]  ; aligned 16-byte load: 8 palette entries of 16 bits each
+ LEA r2, pal_pred_ssse3_table  ; r2 (= palq) is no longer needed as pal; reuse it as jump-table base
+ tzcnt wd, wm  ; trailing-zero count of w is the table index (w assumed a power of two - TODO confirm)
+ movifnidn hd, hm  ; bring h into its register if it is not already there
+ movsxd wq, [r2+wq*4]  ; sign-extended 32-bit table entry, an offset relative to r2
+ packuswb m4, m4  ; narrow palette words to bytes (unsigned saturation); same 8 bytes in both halves
+ add wq, r2  ; offset + base = address of the width-specific loop
+ lea r2, [strideq*3]  ; r2 = 3*stride, addresses the fourth row of a 4-row group
+ jmp wq  ; dispatch to .w4/.w8/.w16/.w32/.w64
+.w4:  ; 4 rows x 4 pixels per iteration
+ pshufb m0, m4, [idxq]  ; palette lookup of 16 index bytes; memory operand, so idx presumably 16-byte aligned
+ add idxq, 16  ; 16 indices consumed
+ movd [dstq ], m0  ; row 0 <- bytes 0-3
+ pshuflw m1, m0, q1032  ; swap the two low dwords so bytes 4-7 sit in the low dword
+ movd [dstq+strideq ], m1  ; row 1 <- bytes 4-7
+ punpckhqdq m0, m0  ; copy the high qword (bytes 8-15) into the low qword
+ movd [dstq+strideq*2], m0  ; row 2 <- bytes 8-11
+ psrlq m0, 32  ; shift bytes 12-15 down into the low dword
+ movd [dstq+r2 ], m0  ; row 3 <- bytes 12-15
+ lea dstq, [dstq+strideq*4]  ; advance dst by 4 rows
+ sub hd, 4  ; 4 rows done
+ jg .w4  ; repeat while rows remain
+ RET
+ALIGN function_align
+.w8:  ; 4 rows x 8 pixels per iteration
+ pshufb m0, m4, [idxq]  ; pixels for rows 0-1
+ pshufb m1, m4, [idxq+16]  ; pixels for rows 2-3
+ add idxq, 32  ; 32 indices consumed
+ movq [dstq ], m0  ; row 0 <- low 8 bytes of m0
+ movhps [dstq+strideq ], m0  ; row 1 <- high 8 bytes of m0
+ movq [dstq+strideq*2], m1  ; row 2 <- low 8 bytes of m1
+ movhps [dstq+r2 ], m1  ; row 3 <- high 8 bytes of m1
+ lea dstq, [dstq+strideq*4]  ; advance dst by 4 rows
+ sub hd, 4  ; 4 rows done
+ jg .w8  ; repeat while rows remain
+ RET
+ALIGN function_align
+.w16:  ; 4 rows x 16 pixels per iteration, one register per row
+ pshufb m0, m4, [idxq]  ; row 0 pixels
+ pshufb m1, m4, [idxq+16]  ; row 1 pixels
+ pshufb m2, m4, [idxq+32]  ; row 2 pixels
+ pshufb m3, m4, [idxq+48]  ; row 3 pixels
+ add idxq, 64  ; 64 indices consumed
+ mova [dstq ], m0  ; aligned stores: each dst row must be 16-byte aligned
+ mova [dstq+strideq ], m1  ; row 1
+ mova [dstq+strideq*2], m2  ; row 2
+ mova [dstq+r2 ], m3  ; row 3
+ lea dstq, [dstq+strideq*4]  ; advance dst by 4 rows
+ sub hd, 4  ; 4 rows done
+ jg .w16  ; repeat while rows remain
+ RET
+ALIGN function_align
+.w32:  ; 2 rows x 32 pixels per iteration, two registers per row
+ pshufb m0, m4, [idxq]  ; row 0, pixels 0-15
+ pshufb m1, m4, [idxq+16]  ; row 0, pixels 16-31
+ pshufb m2, m4, [idxq+32]  ; row 1, pixels 0-15
+ pshufb m3, m4, [idxq+48]  ; row 1, pixels 16-31
+ add idxq, 64  ; 64 indices consumed
+ mova [dstq ], m0  ; aligned stores, row 0
+ mova [dstq+16 ], m1  ; row 0, second half
+ mova [dstq+strideq ], m2  ; row 1
+ mova [dstq+strideq+16], m3  ; row 1, second half
+ lea dstq, [dstq+strideq*2]  ; advance dst by 2 rows
+ sub hd, 2  ; 2 rows done
+ jg .w32  ; repeat while rows remain
+ RET
+ALIGN function_align
+.w64:  ; 1 row x 64 pixels per iteration, four registers per row
+ pshufb m0, m4, [idxq]  ; pixels 0-15
+ pshufb m1, m4, [idxq+16]  ; pixels 16-31
+ pshufb m2, m4, [idxq+32]  ; pixels 32-47
+ pshufb m3, m4, [idxq+48]  ; pixels 48-63
+ add idxq, 64  ; 64 indices consumed
+ mova [dstq ], m0  ; aligned stores across the row
+ mova [dstq+16], m1  ; pixels 16-31
+ mova [dstq+32], m2  ; pixels 32-47
+ mova [dstq+48], m3  ; pixels 48-63
+ add dstq, strideq  ; advance dst by 1 row
+ sub hd, 1  ; 1 row done
+ jg .w64  ; repeat while rows remain
+ RET
+