shithub: dav1d

ref: d5cc850312ac712f33cdd35a09f36f338bd1644a
parent: 7acf1360d6355e626a4fe8198892605d96d0c422
author: Xuefeng Jiang <xuefeng@multicorewareinc.com>
date: Tue Feb 12 10:24:22 EST 2019

Add SSSE3 implementation for pal_pred

pal_pred_w4_8bpc_c: 141.0
pal_pred_w4_8bpc_ssse3: 23.4
pal_pred_w8_8bpc_c: 374.5
pal_pred_w8_8bpc_ssse3: 29.0
pal_pred_w16_8bpc_c: 946.3
pal_pred_w16_8bpc_ssse3: 45.6
pal_pred_w32_8bpc_c: 1946.1
pal_pred_w32_8bpc_ssse3: 92.3
pal_pred_w64_8bpc_c: 4925.9
pal_pred_w64_8bpc_ssse3: 180.1
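
(Numbers are checkasm timings, lower is better: roughly a 6x speedup at w4,
growing to about 27x at w64.)

For reference, pal_pred expands a block of palette indices into pixels:
dst[x] = pal[idx[x]] for each pixel of a w x h block. A minimal C sketch of
that behavior for 8bpc (close in spirit to the reference implementation in
src/ipred_tmpl.c, not copied from it):

    #include <stddef.h>
    #include <stdint.h>

    static void pal_pred_sketch(uint8_t *dst, const ptrdiff_t stride,
                                const uint16_t *const pal, /* 8 entries */
                                const uint8_t *idx,        /* 1 byte per pixel */
                                const int w, const int h)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = (uint8_t)pal[idx[x]]; /* per-pixel palette lookup */
            idx += w;   /* index plane is w bytes per row */
            dst += stride;
        }
    }

The SSSE3 version below replaces this per-pixel lookup with pshufb, using
the packed palette as the table and 16 index bytes at a time as the shuffle
control.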

--- a/src/internal.h
+++ b/src/internal.h
@@ -279,7 +279,7 @@
     // FIXME types can be changed to pixel (and dynamically allocated)
     // which would make copy/assign operations slightly faster?
     uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
-    uint16_t pal[3 /* plane */][8 /* palette_idx */];
+    ALIGN(uint16_t pal[3 /* plane */][8 /* palette_idx */], 16);
     uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
     uint8_t txtp_map[32 * 32]; // inter-only
     Dav1dWarpedMotionParams warpmv;
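
The added alignment is what allows the assembly below to load the palette
with mova (an aligned 16-byte load): one plane's 8 uint16_t entries are read
as a single 16-byte vector, and mova faults on unaligned addresses. A sketch
of what the ALIGN macro amounts to on GCC/Clang (MY_ALIGN is a hypothetical
stand-in; dav1d's real macro also covers MSVC):

    /* hypothetical equivalent of dav1d's ALIGN(decl, n) */
    #define MY_ALIGN(decl, n) decl __attribute__((aligned(n)))

    MY_ALIGN(uint16_t pal[3][8], 16); /* now valid for aligned SSE loads */
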
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -62,6 +62,8 @@
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
 
+decl_pal_pred_fn(dav1d_pal_pred_ssse3);
+
 void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
@@ -77,6 +79,8 @@
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_ssse3;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
+
+    c->pal_pred = dav1d_pal_pred_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
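
decl_pal_pred_fn is dav1d's prototype helper; roughly (a sketch of the
declaration in src/ipred.h, with the pixel type narrowed to uint8_t for
8bpc):

    #define decl_pal_pred_fn(name) \
        void (name)(uint8_t *dst, ptrdiff_t stride, const uint16_t *pal, \
                    const uint8_t *idx, int w, int h)

Note that c->pal_pred is assigned inside the SSSE3 block, before the early
return for CPUs without AVX2, so an AVX2 initializer further down can still
override the pointer where it applies.
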
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -89,6 +89,7 @@
 JMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
+JMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
 
 
 
@@ -1179,3 +1180,91 @@
     sub                  hd, 1
     jg .w64_loop
     RET
+
+;---------------------------------------------------------------------------------------
+;void dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+;                                         const uint8_t *idx, const int w, const int h);
+;---------------------------------------------------------------------------------------
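+; Strategy: pack the 8 uint16_t palette entries down to 8 bytes in m4, then
+; let pshufb do the lookup, with the index bytes as the shuffle control
+; mapping each idx byte to its palette pixel. w is a power of two, so
+; tzcnt(w) selects the width-specialized loop from the jump table.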
+cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
+    mova                 m4, [palq]
+    LEA                  r2, pal_pred_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    movsxd               wq, [r2+wq*4]
+    packuswb             m4, m4
+    add                  wq, r2
+    lea                  r2, [strideq*3]
+    jmp                  wq
+.w4:
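+    ; 16 idx bytes = 4 rows of 4 pixels, all translated by a single pshufb.
+    ; Rows are then peeled out of the low dword of m0: pshuflw q1032 moves
+    ; row 1 down, punpckhqdq brings rows 2/3 down, psrlq 32 exposes row 3.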
+    pshufb               m0, m4, [idxq]
+    add                idxq, 16
+    movd   [dstq          ], m0
+    pshuflw              m1, m0, q1032
+    movd   [dstq+strideq  ], m1
+    punpckhqdq           m0, m0
+    movd   [dstq+strideq*2], m0
+    psrlq                m0, 32
+    movd   [dstq+r2       ], m0
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4
+    RET
+ALIGN function_align
+.w8:
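+    ; 32 idx bytes = 4 rows of 8; movq/movhps store each register's low
+    ; and high qwords as two rows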
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    add                idxq, 32
+    movq   [dstq          ], m0
+    movhps [dstq+strideq  ], m0
+    movq   [dstq+strideq*2], m1
+    movhps [dstq+r2       ], m1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+.w16:
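+    ; 64 idx bytes = 4 rows of 16, one register per row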
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    pshufb               m2, m4, [idxq+32]
+    pshufb               m3, m4, [idxq+48]
+    add                idxq, 64
+    mova   [dstq          ], m0
+    mova   [dstq+strideq  ], m1
+    mova   [dstq+strideq*2], m2
+    mova   [dstq+r2       ], m3
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w16
+    RET
+ALIGN function_align
+.w32:
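+    ; 64 idx bytes = 2 rows of 32, two registers per row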
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    pshufb               m2, m4, [idxq+32]
+    pshufb               m3, m4, [idxq+48]
+    add                idxq, 64
+    mova  [dstq           ], m0
+    mova  [dstq+16        ], m1
+    mova  [dstq+strideq   ], m2
+    mova  [dstq+strideq+16], m3
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32
+    RET
+ALIGN function_align
+.w64:
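+    ; 64 idx bytes = a single 64-pixel row per iteration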
+    pshufb               m0, m4, [idxq]
+    pshufb               m1, m4, [idxq+16]
+    pshufb               m2, m4, [idxq+32]
+    pshufb               m3, m4, [idxq+48]
+    add                idxq, 64
+    mova          [dstq   ], m0
+    mova          [dstq+16], m1
+    mova          [dstq+32], m2
+    mova          [dstq+48], m3
+    add                dstq, strideq
+    sub                  hd, 1
+    jg .w64
+    RET
+