ref: 4f14573cffd640ea54f11dfae8f77a905a48e985
parent: 4318600e75f33a8cb7079e43c72efa99694698c5
author: Martin Storsjö <martin@martin.st>
date: Wed Sep 25 17:50:42 EDT 2019
arm64: ipred: NEON implementation of palette prediction Relative speedups over the C code: Cortex A53 A72 A73 pal_pred_w4_8bpc_neon: 8.75 6.15 7.60 pal_pred_w8_8bpc_neon: 19.93 11.79 10.98 pal_pred_w16_8bpc_neon: 24.68 13.28 16.06 pal_pred_w32_8bpc_neon: 23.56 11.81 16.74 pal_pred_w64_8bpc_neon: 23.16 12.19 17.60
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1326,3 +1326,94 @@
.hword L(ipred_smooth_h_tbl) - 80b
.hword L(ipred_smooth_h_tbl) - 40b
endfunc
+
+// void pal_pred_neon(pixel *dst, const ptrdiff_t stride,
+// const uint16_t *const pal, const uint8_t *idx,
+// const int w, const int h);
+function pal_pred_neon, export=1
+ ld1 {v0.8h}, [x2]                    // Palette prediction: load eight 16-bit palette entries from pal
+ clz w9, w4                           // leading zeros of w: 25 for w=64 ... 29 for w=4
+ adr x6, L(pal_pred_tbl)              // x6 = address of the width jump table
+ sub w9, w9, #25                      // table index: 0 (w=64) ... 4 (w=4)
+ ldrh w9, [x6, w9, uxtw #1]           // 16-bit entry = table address minus loop label address
+ xtn v0.8b, v0.8h                     // narrow each palette entry to one byte, giving an 8-byte lookup table in v0
+ sub x6, x6, w9, uxtw                 // x6 = address of the loop for this width
+ add x2, x0, x1                       // x2 = dst + stride (x2 is free again; the palette is already in v0)
+ lsl x1, x1, #1                       // double the stride: x0 and x2 each step two rows per store
+ br x6                                // dispatch to the width-specific loop
+4:                                    // w == 4: four rows per iteration; no tail handling, presumably h is a multiple of 4 - confirm against callers
+ ld1 {v1.16b}, [x3], #16              // 16 index bytes = 4 rows of 4 pixels; idx advances by 16
+ subs w5, w5, #4                      // h -= 4, setting flags for the b.gt below
+ tbl v1.16b, {v0.16b}, v1.16b         // replace each index byte by the palette byte it selects
+ st1 {v1.s}[0], [x0], x1              // row 0 (4 bytes) through x0
+ st1 {v1.s}[1], [x2], x1              // row 1 through x2
+ st1 {v1.s}[2], [x0], x1              // row 2 through x0
+ st1 {v1.s}[3], [x2], x1              // row 3 through x2
+ b.gt 4b                              // loop while rows remain
+ ret
+8:                                    // w == 8: four rows per iteration, 8 pixels per row
+ ld1 {v1.16b, v2.16b}, [x3], #32      // 32 index bytes = 4 rows of 8 pixels
+ subs w5, w5, #4                      // h -= 4
+ tbl v1.16b, {v0.16b}, v1.16b         // rows 0-1: index -> palette byte
+ st1 {v1.d}[0], [x0], x1              // row 0
+ tbl v2.16b, {v0.16b}, v2.16b         // rows 2-3: index -> palette byte
+ st1 {v1.d}[1], [x2], x1              // row 1
+ st1 {v2.d}[0], [x0], x1              // row 2
+ st1 {v2.d}[1], [x2], x1              // row 3
+ b.gt 8b                              // loop while rows remain
+ ret
+16:                                   // w == 16: four rows per iteration, one vector per row
+ ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64 // 64 index bytes = 4 rows of 16 pixels
+ subs w5, w5, #4                      // h -= 4
+ tbl v1.16b, {v0.16b}, v1.16b         // row 0 lookup
+ tbl v2.16b, {v0.16b}, v2.16b         // row 1 lookup
+ st1 {v1.16b}, [x0], x1               // row 0
+ tbl v3.16b, {v0.16b}, v3.16b         // row 2 lookup
+ st1 {v2.16b}, [x2], x1               // row 1
+ tbl v4.16b, {v0.16b}, v4.16b         // row 3 lookup
+ st1 {v3.16b}, [x0], x1               // row 2
+ st1 {v4.16b}, [x2], x1               // row 3
+ b.gt 16b                             // loop while rows remain
+ ret
+32:                                   // w == 32: four rows per iteration, two vectors per row
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // index bytes for rows 0-1
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 // index bytes for rows 2-3
+ subs w5, w5, #4                      // h -= 4
+ tbl v16.16b, {v0.16b}, v16.16b       // row 0, pixels 0-15
+ tbl v17.16b, {v0.16b}, v17.16b       // row 0, pixels 16-31
+ tbl v18.16b, {v0.16b}, v18.16b       // row 1, pixels 0-15
+ tbl v19.16b, {v0.16b}, v19.16b       // row 1, pixels 16-31
+ tbl v20.16b, {v0.16b}, v20.16b       // row 2, pixels 0-15
+ st1 {v16.16b, v17.16b}, [x0], x1     // row 0
+ tbl v21.16b, {v0.16b}, v21.16b       // row 2, pixels 16-31
+ st1 {v18.16b, v19.16b}, [x2], x1     // row 1
+ tbl v22.16b, {v0.16b}, v22.16b       // row 3, pixels 0-15
+ st1 {v20.16b, v21.16b}, [x0], x1     // row 2
+ tbl v23.16b, {v0.16b}, v23.16b       // row 3, pixels 16-31
+ st1 {v22.16b, v23.16b}, [x2], x1     // row 3
+ b.gt 32b                             // loop while rows remain
+ ret
+64:                                   // w == 64: two rows per iteration, four vectors per row
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // index bytes for row 0
+ ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64 // index bytes for row 1
+ subs w5, w5, #2                      // h -= 2 (only two rows handled per pass here)
+ tbl v16.16b, {v0.16b}, v16.16b       // row 0, pixels 0-15
+ tbl v17.16b, {v0.16b}, v17.16b       // row 0, pixels 16-31
+ tbl v18.16b, {v0.16b}, v18.16b       // row 0, pixels 32-47
+ tbl v19.16b, {v0.16b}, v19.16b       // row 0, pixels 48-63
+ st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 // row 0
+ tbl v20.16b, {v0.16b}, v20.16b       // row 1, pixels 0-15
+ tbl v21.16b, {v0.16b}, v21.16b       // row 1, pixels 16-31
+ tbl v22.16b, {v0.16b}, v22.16b       // row 1, pixels 32-47
+ tbl v23.16b, {v0.16b}, v23.16b       // row 1, pixels 48-63
+ st1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 // row 1
+ b.gt 64b                             // loop while rows remain
+ ret
+
+L(pal_pred_tbl):                      // jump table indexed by clz(w) - 25; entries are backward offsets from the table to each loop
+ .hword L(pal_pred_tbl) - 64b         // index 0: w == 64
+ .hword L(pal_pred_tbl) - 32b         // index 1: w == 32
+ .hword L(pal_pred_tbl) - 16b         // index 2: w == 16
+ .hword L(pal_pred_tbl) - 8b          // index 3: w == 8
+ .hword L(pal_pred_tbl) - 4b          // index 4: w == 4
+endfunc
--- a/src/arm/ipred_init_tmpl.c
+++ b/src/arm/ipred_init_tmpl.c
@@ -38,6 +38,8 @@
decl_angular_ipred_fn(dav1d_ipred_smooth_v_neon);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_neon);
+decl_pal_pred_fn(dav1d_pal_pred_neon);
+
COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
@@ -54,5 +56,7 @@
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_neon;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_neon;
+
+ c->pal_pred = dav1d_pal_pred_neon;
#endif
}