ref: ea54dbe2a89d3eb4edbdbbf1810180984467c6aa
parent: ad392d71ebed07fbcecb49e4544f1dfdc6ae85f8
author: Martin Storsjö <martin@martin.st>
date: Wed Apr 1 19:56:34 EDT 2020
arm64: mc: NEON implementation of emu_edge for 8bpc

Relative speedups over C code:

                           Cortex A53    A72    A73
emu_edge_w4_8bpc_neon:           3.82   2.93   2.41
emu_edge_w8_8bpc_neon:           3.28   2.86   2.51
emu_edge_w16_8bpc_neon:          3.58   3.27   2.63
emu_edge_w32_8bpc_neon:          3.04   1.68   2.12
emu_edge_w64_8bpc_neon:          2.58   1.45   1.48
emu_edge_w128_8bpc_neon:         1.79   1.02   1.57

The benchmark numbers for the larger sizes on A72 fluctuate a whole lot
and thus seem very unreliable.
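
For reference, a rough C sketch of the logic being implemented (not the
exact dav1d fallback; the helper names are illustrative, and, like the
NEON code below, it assumes center_w/center_h >= 1):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static intptr_t iclip(const intptr_t v, const intptr_t lo, const intptr_t hi) {
        return v < lo ? lo : v > hi ? hi : v;
    }

    static void emu_edge_sketch(const intptr_t bw, const intptr_t bh,
                                const intptr_t iw, const intptr_t ih,
                                const intptr_t x, const intptr_t y,
                                uint8_t *dst, const ptrdiff_t dst_stride,
                                const uint8_t *ref, const ptrdiff_t ref_stride)
    {
        // Start at the nearest valid pixel of the reference frame.
        ref += iclip(y, 0, ih - 1) * ref_stride + iclip(x, 0, iw - 1);

        const intptr_t top_ext    = iclip(-y,          0, bh - 1);
        const intptr_t bottom_ext = iclip(y + bh - ih, 0, bh - 1);
        const intptr_t left_ext   = iclip(-x,          0, bw - 1);
        const intptr_t right_ext  = iclip(x + bw - iw, 0, bw - 1);
        const intptr_t center_h   = bh - top_ext - bottom_ext;
        const intptr_t center_w   = bw - left_ext - right_ext;

        // Copy the valid center rows, replicating the left-/rightmost
        // valid pixel sideways into the horizontal extensions.
        uint8_t *blk = dst + top_ext * dst_stride;
        for (intptr_t i = 0; i < center_h; i++) {
            memset(blk, ref[0], left_ext);
            memcpy(blk + left_ext, ref, center_w);
            memset(blk + left_ext + center_w, ref[center_w - 1], right_ext);
            blk += dst_stride;
            ref += ref_stride;
        }
        // Replicate the first/last written row into the vertical extensions.
        for (intptr_t i = 1; i <= top_ext; i++)
            memcpy(dst + (top_ext - i) * dst_stride, dst + top_ext * dst_stride, bw);
        for (intptr_t i = 0; i < bottom_ext; i++)
            memcpy(blk + i * dst_stride, blk - dst_stride, bw);
    }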
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -3089,3 +3089,161 @@
warp , 11
warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+ ldp x8, x9, [sp]
+
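+ // All the iclip() clamps below are branchless: csel selects the upper
+ // bound, and "bic x, x, x, asr #63" clears x when it is negative (the
+ // arithmetic shift produces an all-ones mask for negative values),
+ // i.e. a branch-free max(x, 0).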
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6 // dst += top_ext * stride
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
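+ // Fill one row at a time: splat the leftmost valid pixel across the left
+ // extension (if any), copy the valid center pixels, then splat the
+ // rightmost valid pixel across the right extension (if any). The loops
+ // round the widths up to multiples of 16/32 bytes; overshoot into the
+ // next region of the row is overwritten with the correct pixels
+ // afterwards, and overshoot past the end of the row is assumed to be
+ // absorbed by the padding of the destination buffer.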
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ ld1r {v0.16b}, [x8] // splat the leftmost valid pixel
+ mov x12, x6 // out = dst
+ mov x3, x4 // n = left_ext
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16
+ b.gt 1b
+.endif
+ mov x13, x8 // in = ref
+ add x12, x6, x4 // out = dst + left_ext
+ mov x3, x2 // n = center_w
+1:
+ ld1 {v0.16b, v1.16b}, [x13], #32
+ subs x3, x3, #32
+ st1 {v0.16b, v1.16b}, [x12], #32
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2 // in + center_w
+ sub x3, x3, #1 // in + center_w - 1
+ add x12, x6, x4 // dst + left_ext
+ ld1r {v0.16b}, [x3] // splat the rightmost valid pixel
+ add x12, x12, x2 // out = dst + left_ext + center_w
+ mov x3, x11 // n = right_ext
+1:
+ subs x3, x3, #16
+ st1 {v0.16b}, [x12], #16
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7 // dst += dst_stride
+ add x8, x8, x9 // ref += ref_stride
+ b.gt 0b
+.endm
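+
+ // Instantiate the row loop four times, so that the need_left/need_right
+ // decisions are made once per call instead of once per row.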
+
+ cbz x4, 2f
+ // need_left
+ cbz x11, 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
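+ // The vertical extensions replicate whole rows: copy the last written
+ // row downwards bottom_ext times, then the first written row upwards
+ // top_ext times, working in strips of 32 columns.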
+ cbz x10, 3f
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0 // n = bw
+1:
+ ld1 {v0.16b, v1.16b}, [x8], #32
+ mov x3, x10 // n = bottom_ext
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f
+ // need_top
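+ // x14 still points at the first row written by the center loop; it is
+ // replicated upwards into the top_ext rows above.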
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.16b, v1.16b}, [x14], #32
+ mov x3, x5 // n = top_ext
+2:
+ subs x3, x3, #1
+ st1 {v0.16b, v1.16b}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #32 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -66,6 +66,8 @@
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = BF(dav1d_put_##name, suffix)
@@ -109,5 +111,8 @@
c->w_mask[2] = BF(dav1d_w_mask_420, neon);
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+#endif
+#if BITDEPTH == 8 && ARCH_AARCH64
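+ // NEON emu_edge is currently only available for 8 bpc on AArch64,
+ // hence the separate guard.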
+ c->emu_edge = BF(dav1d_emu_edge, neon);
#endif
}