shithub: dav1d

ref: 44d0de41d478b6b41a1ebbf1de012caa8d75cca0
parent: 8bbcd3f7a7518808032be82260fd1ebb02337d2d
author: Xuefeng Jiang <xuefeng@multicorewareinc.com>
date: Wed Apr 10 08:58:01 EDT 2019

Add SSSE3 implementation for ipred_paeth

intra_pred_paeth_w4_8bpc_c: 561.6
intra_pred_paeth_w4_8bpc_ssse3: 49.2
intra_pred_paeth_w8_8bpc_c: 1475.8
intra_pred_paeth_w8_8bpc_ssse3: 103.0
intra_pred_paeth_w16_8bpc_c: 4697.8
intra_pred_paeth_w16_8bpc_ssse3: 279.0
intra_pred_paeth_w32_8bpc_c: 13245.1
intra_pred_paeth_w32_8bpc_ssse3: 614.7
intra_pred_paeth_w64_8bpc_c: 32638.9
intra_pred_paeth_w64_8bpc_ssse3: 1477.6
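
For reference, the Paeth predictor fills each pixel with whichever of the
left, top, or top-left neighbour is closest to left + top - topleft. The
scalar sketch below is illustrative only (paeth_sketch is a hypothetical
helper, not dav1d's ipred_paeth_c), assuming the usual dav1d edge layout
where topleft[0] is the corner, topleft[1..w] the top row and
topleft[-1..-h] the left column; the SSSE3 code in the diff performs the
same selection branchlessly on 16 bytes at a time.

    #include <stddef.h>
    #include <stdlib.h>

    static void paeth_sketch(unsigned char *dst, ptrdiff_t stride,
                             const unsigned char *topleft, int w, int h)
    {
        const int tl = topleft[0];
        for (int y = 0; y < h; y++) {
            const int left = topleft[-(y + 1)];
            for (int x = 0; x < w; x++) {
                const int top  = topleft[1 + x];
                const int base = left + top - tl;
                const int ldiff  = abs(base - left); /* == abs(top  - tl) */
                const int tdiff  = abs(base - top);  /* == abs(left - tl) */
                const int tldiff = abs(base - tl);   /* == abs(left + top - 2*tl) */
                /* pick the neighbour whose value is closest to base */
                dst[x] = (ldiff <= tdiff && ldiff <= tldiff) ? left :
                         (tdiff <= tldiff)                   ? top  : tl;
            }
            dst += stride;
        }
    }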

--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -58,6 +58,7 @@
 decl_angular_ipred_fn(dav1d_ipred_dc_left_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
+decl_angular_ipred_fn(dav1d_ipred_paeth_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_smooth_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_ssse3);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_ssse3);
@@ -85,6 +86,7 @@
     c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_ssse3;
     c->intra_pred[HOR_PRED]      = dav1d_ipred_h_ssse3;
     c->intra_pred[VERT_PRED]     = dav1d_ipred_v_ssse3;
+    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_ssse3;
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_ssse3;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_ssse3;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_ssse3;
--- a/src/x86/ipred_ssse3.asm
+++ b/src/x86/ipred_ssse3.asm
@@ -56,10 +56,10 @@
      18,  16,  15,  13,  12,  10,   9,   8, \
       7,   6,   6,   5,   5,   4,   4,   4
 
+ipred_v_shuf      : db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
+ipred_h_shuf      : db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
+ipred_paeth_shuf  : db  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0
 
-ipred_v_shuf  : db  0,  1,  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7
-ipred_h_shuf  : db  3,  3,  3,  3,  2,  2,  2,  2,  1,  1,  1,  1,  0,  0,  0,  0
-
 pb_3        : times 16 db 3
 pb_128      : times 8  db 128
 pw_128      : times 4  dw 128
@@ -90,6 +90,7 @@
 JMP_TABLE ipred_smooth,     ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_v,   ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_smooth_h,   ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_paeth,      ssse3, w4, w8, w16, w32, w64
 JMP_TABLE pal_pred,         ssse3, w4, w8, w16, w32, w64
 JMP_TABLE ipred_cfl,        ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
@@ -2725,3 +2726,186 @@
     sub                 szd, 8
     jg .sub_loop
     RET
+
+; %1 simd register that holds the mask and will hold the result
+; %2 simd register that holds the "true" values
+; %3 location of the "false" values (simd register/memory)
+%macro BLEND 3 ; mask, true, false
+    pand  %2, %1
+    pandn %1, %3
+    por   %1, %2
+%endmacro
+
+%macro PAETH 2                                 ; top, ldiff
+    pavgb                m1, m%1, m3
+    pxor                 m0, m%1, m3
+    pand                 m0, m4
+    psubusb              m2, m5, m1
+    psubb                m1, m0
+    psubusb              m1, m5
+    por                  m1, m2
+    paddusb              m1, m1
+    por                  m1, m0               ; min(tldiff, 255)
+    psubusb              m2, m5, m3
+    psubusb              m0, m3, m5
+    por                  m2, m0               ; tdiff
+%ifnum %2
+    pminub               m2, m%2
+    pcmpeqb              m0, m%2, m2          ; ldiff <= tdiff
+%else
+    mova                 m0, %2
+    pminub               m2, m0
+    pcmpeqb              m0, m2
+%endif
+    pminub               m1, m2
+    pcmpeqb              m1, m2               ; min(ldiff, tdiff) <= tldiff
+    mova                 m2, m3
+    BLEND                m0, m2, m%1
+    BLEND                m1, m0, m5
+%endmacro
+
+cglobal ipred_paeth, 3, 6, 8, -7*16, dst, stride, tl, w, h
+%define base r5-ipred_paeth_ssse3_table
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    pxor                 m0, m0
+    movd                 m5, [tlq]
+    pshufb               m5, m0
+    LEA                  r5, ipred_paeth_ssse3_table
+    movsxd               wq, [r5+wq*4]
+    movddup              m4, [base+ipred_paeth_shuf]
+    add                  wq, r5
+    jmp                  wq
+.w4:
+    movd                 m6, [tlq+1]            ; top
+    pshufd               m6, m6, q0000
+    lea                  r3, [strideq*3]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0                 ; ldiff
+.w4_loop:
+    sub                 tlq, 4
+    movd                 m3, [tlq]
+    mova                 m1, [base+ipred_h_shuf]
+    pshufb               m3, m1                 ; left
+    PAETH                 6, 7
+    movd   [dstq          ], m1
+    pshuflw              m0, m1, q1032
+    movd   [dstq+strideq  ], m0
+    punpckhqdq           m1, m1
+    movd   [dstq+strideq*2], m1
+    psrlq                m1, 32
+    movd   [dstq+r3       ], m1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_loop
+    RET
+ALIGN function_align
+.w8:
+    movddup              m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+.w8_loop:
+    sub                 tlq, 2
+    movd                 m3, [tlq]
+    pshufb               m3, [base+ipred_paeth_shuf]
+    PAETH                 6, 7
+    movq     [dstq        ], m1
+    movhps   [dstq+strideq], m1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w8_loop
+    RET
+ALIGN function_align
+.w16:
+    movu                 m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+.w16_loop:
+    sub                 tlq, 1
+    movd                 m3, [tlq]
+    pxor                 m1, m1
+    pshufb               m3, m1
+    PAETH                 6, 7
+    mova             [dstq], m1
+    add                dstq, strideq
+    sub                  hd, 1
+    jg .w16_loop
+    RET
+ALIGN function_align
+.w32:
+    movu                 m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp   ], m6
+    mova           [rsp+16], m7
+    movu                 m6, [tlq+17]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+32], m6
+.w32_loop:
+    dec                 tlq
+    movd                 m3, [tlq]
+    pxor                 m1, m1
+    pshufb               m3, m1
+    mova                 m6, [rsp]
+    PAETH                 6, [rsp+16]
+    mova          [dstq   ], m1
+    mova                 m6, [rsp+32]
+    PAETH                 6, 7
+    mova          [dstq+16], m1
+    add                dstq, strideq
+    dec                  hd
+    jg .w32_loop
+    RET
+ALIGN function_align
+.w64:
+    movu                 m6, [tlq+1]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp   ], m6
+    mova           [rsp+16], m7
+    movu                 m6, [tlq+17]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+32], m6
+    mova           [rsp+48], m7
+    movu                 m6, [tlq+33]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+64], m6
+    mova           [rsp+80], m7
+    movu                 m6, [tlq+49]
+    psubusb              m7, m5, m6
+    psubusb              m0, m6, m5
+    por                  m7, m0
+    mova           [rsp+96], m6
+.w64_loop:
+    dec                 tlq
+    movd                 m3, [tlq]
+    pxor                 m1, m1
+    pshufb               m3, m1
+    mova                 m6, [rsp]
+    PAETH                 6, [rsp+16]
+    mova          [dstq   ], m1
+    mova                 m6, [rsp+32]
+    PAETH                 6, [rsp+48]
+    mova          [dstq+16], m1
+    mova                 m6, [rsp+64]
+    PAETH                 6, [rsp+80]
+    mova          [dstq+32], m1
+    mova                 m6, [rsp+96]
+    PAETH                 6, 7
+    mova          [dstq+48], m1
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_loop
+    RET
+
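
Two details of the PAETH macro are worth spelling out. First, BLEND is the
classic pand/pandn/por select: where a byte of the mask is 0xff the "true"
value is kept, elsewhere the "false" one. Second, tldiff = |left + top -
2*topleft| is computed without widening to 16 bits: pavgb gives
(left + top + 1) >> 1, the rounding bit (left ^ top) & 1 is recovered with
pxor/pand (m4 is filled with the byte 1, loaded by movddup from the first
eight bytes of ipred_paeth_shuf), the halved distance to topleft is taken
with psubusb, doubled back with paddusb, and the parity bit is or-ed in.
A scalar sketch of the same arithmetic (subus8, addus8, tldiff8 and blend8
are hypothetical helpers, named here only for illustration):

    #include <stdint.h>

    /* saturating byte ops, mirroring psubusb / paddusb */
    static uint8_t subus8(uint8_t a, uint8_t b) { return a > b ? a - b : 0; }
    static uint8_t addus8(uint8_t a, uint8_t b) { return a + b > 255 ? 255 : a + b; }

    /* |left + top - 2*tl| saturated to 255, step for step as in PAETH */
    static uint8_t tldiff8(uint8_t left, uint8_t top, uint8_t tl)
    {
        const uint8_t avg   = (left + top + 1) >> 1; /* pavgb                */
        const uint8_t carry = (left ^ top) & 1;      /* pxor + pand (pb_1)   */
        const uint8_t flr   = avg - carry;           /* psubb: (left+top)>>1 */
        const uint8_t hi    = subus8(tl, avg);       /* max(tl - avg, 0)     */
        const uint8_t lo    = subus8(flr, tl);       /* max(flr - tl, 0)     */
        /* one of hi/lo is zero; double the halved distance, restore parity */
        return addus8(hi | lo, hi | lo) | carry;
    }

    /* pand/pandn/por select as in BLEND; mask is 0x00 or 0xff per byte */
    static uint8_t blend8(uint8_t mask, uint8_t t, uint8_t f)
    {
        return (t & mask) | (f & ~mask);
    }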