shithub: dav1d

Download patch

ref: 4f14573cffd640ea54f11dfae8f77a905a48e985
parent: 4318600e75f33a8cb7079e43c72efa99694698c5
author: Martin Storsjö <martin@martin.st>
date: Wed Sep 25 17:50:42 EDT 2019

arm64: ipred: NEON implementation of palette prediction

Relative speedups over the C code:
                    Cortex A53    A72    A73
pal_pred_w4_8bpc_neon:    8.75   6.15   7.60
pal_pred_w8_8bpc_neon:   19.93  11.79  10.98
pal_pred_w16_8bpc_neon:  24.68  13.28  16.06
pal_pred_w32_8bpc_neon:  23.56  11.81  16.74
pal_pred_w64_8bpc_neon:  23.16  12.19  17.60

--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1326,3 +1326,94 @@
         .hword L(ipred_smooth_h_tbl) -  80b
         .hword L(ipred_smooth_h_tbl) -  40b
 endfunc
+
+// Palette prediction, 8 bpc: dst[y][x] = low byte of pal[idx[y * w + x]].
+// void pal_pred_neon(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
+//                    const uint8_t *idx, const int w, const int h);
+function pal_pred_neon, export=1
+        ld1             {v0.8h}, [x2]                   // v0 = 8 x 16-bit palette entries read from pal
+        clz             w9,  w4                         // clz(w): 25 for w == 64 ... 29 for w == 4
+        adr             x6,  L(pal_pred_tbl)            // x6 = address of the jump table below
+        sub             w9,  w9,  #25                   // table index: 0 (w == 64) ... 4 (w == 4)
+        ldrh            w9,  [x6, w9, uxtw #1]          // 16-bit entry = table address - loop label address
+        xtn             v0.8b,  v0.8h                   // narrow palette to bytes; NOTE(review): assumes indices < 8 - confirm
+        sub             x6,  x6,  w9, uxtw              // x6 = address of the loop for this width
+        add             x2,  x0,  x1                    // x2 = dst + stride (odd rows); pal pointer is no longer needed
+        lsl             x1,  x1,  #1                    // x1 = 2 * stride, so x0/x2 each step over two rows
+        br              x6                              // dispatch to 4:/8:/16:/32:/64:
+4:      // w == 4: 4 rows (16 index bytes) per iteration; NOTE(review): assumes h % 4 == 0 - confirm
+        ld1             {v1.16b}, [x3], #16             // load 16 indices, idx += 16
+        subs            w5,  w5,  #4                    // h -= 4; flags are tested by b.gt at the loop end
+        tbl             v1.16b, {v0.16b}, v1.16b        // replace each index byte by its palette byte
+        st1             {v1.s}[0], [x0], x1             // row 0 (4 pixels) -> even-row pointer
+        st1             {v1.s}[1], [x2], x1             // row 1 -> odd-row pointer
+        st1             {v1.s}[2], [x0], x1             // row 2
+        st1             {v1.s}[3], [x2], x1             // row 3
+        b.gt            4b                              // loop while h > 0
+        ret
+8:      // w == 8: 4 rows (32 index bytes) per iteration
+        ld1             {v1.16b, v2.16b}, [x3], #32     // load 32 indices, idx += 32
+        subs            w5,  w5,  #4                    // h -= 4
+        tbl             v1.16b, {v0.16b}, v1.16b        // rows 0-1
+        st1             {v1.d}[0], [x0], x1             // row 0 (8 pixels)
+        tbl             v2.16b, {v0.16b}, v2.16b        // rows 2-3, interleaved with the stores
+        st1             {v1.d}[1], [x2], x1             // row 1
+        st1             {v2.d}[0], [x0], x1             // row 2
+        st1             {v2.d}[1], [x2], x1             // row 3
+        b.gt            8b                              // loop while h > 0
+        ret
+16:     // w == 16: 4 rows (64 index bytes) per iteration, one vector per row
+        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64     // load 64 indices, idx += 64
+        subs            w5,  w5,  #4                    // h -= 4
+        tbl             v1.16b, {v0.16b}, v1.16b        // row 0
+        tbl             v2.16b, {v0.16b}, v2.16b        // row 1
+        st1             {v1.16b}, [x0], x1              // store row 0
+        tbl             v3.16b, {v0.16b}, v3.16b        // row 2
+        st1             {v2.16b}, [x2], x1              // store row 1
+        tbl             v4.16b, {v0.16b}, v4.16b        // row 3
+        st1             {v3.16b}, [x0], x1              // store row 2
+        st1             {v4.16b}, [x2], x1              // store row 3
+        b.gt            16b                             // loop while h > 0
+        ret
+32:     // w == 32: 4 rows (128 index bytes) per iteration, two vectors per row
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // indices for rows 0-1, idx += 64
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 // indices for rows 2-3, idx += 64
+        subs            w5,  w5,  #4                    // h -= 4
+        tbl             v16.16b, {v0.16b}, v16.16b      // row 0, pixels 0-15
+        tbl             v17.16b, {v0.16b}, v17.16b      // row 0, pixels 16-31
+        tbl             v18.16b, {v0.16b}, v18.16b      // row 1, pixels 0-15
+        tbl             v19.16b, {v0.16b}, v19.16b      // row 1, pixels 16-31
+        tbl             v20.16b, {v0.16b}, v20.16b      // row 2, pixels 0-15
+        st1             {v16.16b, v17.16b}, [x0], x1    // store row 0
+        tbl             v21.16b, {v0.16b}, v21.16b      // row 2, pixels 16-31
+        st1             {v18.16b, v19.16b}, [x2], x1    // store row 1
+        tbl             v22.16b, {v0.16b}, v22.16b      // row 3, pixels 0-15
+        st1             {v20.16b, v21.16b}, [x0], x1    // store row 2
+        tbl             v23.16b, {v0.16b}, v23.16b      // row 3, pixels 16-31
+        st1             {v22.16b, v23.16b}, [x2], x1    // store row 3
+        b.gt            32b                             // loop while h > 0
+        ret
+64:     // w == 64: 2 rows (128 index bytes) per iteration, four vectors per row
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // indices for row 0, idx += 64
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 // indices for row 1, idx += 64
+        subs            w5,  w5,  #2                    // h -= 2 (only two rows per pass here)
+        tbl             v16.16b, {v0.16b}, v16.16b      // row 0, pixels 0-15
+        tbl             v17.16b, {v0.16b}, v17.16b      // row 0, pixels 16-31
+        tbl             v18.16b, {v0.16b}, v18.16b      // row 0, pixels 32-47
+        tbl             v19.16b, {v0.16b}, v19.16b      // row 0, pixels 48-63
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1  // store row 0
+        tbl             v20.16b, {v0.16b}, v20.16b      // row 1, pixels 0-15
+        tbl             v21.16b, {v0.16b}, v21.16b      // row 1, pixels 16-31
+        tbl             v22.16b, {v0.16b}, v22.16b      // row 1, pixels 32-47
+        tbl             v23.16b, {v0.16b}, v23.16b      // row 1, pixels 48-63
+        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1  // store row 1
+        b.gt            64b                             // loop while h > 0
+        ret
+
+L(pal_pred_tbl):       // backward offsets to each loop, ordered by clz(w) - 25
+        .hword L(pal_pred_tbl) - 64b    // index 0: w == 64
+        .hword L(pal_pred_tbl) - 32b    // index 1: w == 32
+        .hword L(pal_pred_tbl) - 16b    // index 2: w == 16
+        .hword L(pal_pred_tbl) -  8b    // index 3: w == 8
+        .hword L(pal_pred_tbl) -  4b    // index 4: w == 4
+endfunc
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -38,6 +38,8 @@
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_neon);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_neon);
 
+decl_pal_pred_fn(dav1d_pal_pred_neon);
+
 COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
@@ -54,5 +56,7 @@
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_neon;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_neon;
+
+    c->pal_pred                  = dav1d_pal_pred_neon;
 #endif
 }