ref: 88798ebf44d5ab6c5c92d28b9190cbe619fcbc29
parent: 6cdfd4c53a6e6c8a9a903b6e5922112b8e1deaa3
author: Martin Storsjö <martin@martin.st>
date: Fri Jun 5 12:48:25 EDT 2020
arm64: itx16: Add a missed eob check in the 16x8 transform

This allows skipping half of the first transforms if the input
coefficients lie within the upper 4x4 (but checkasm only tests in
increments of 8x8 at the moment).

With checkasm modified to test in smaller increments, the speedup
is like this:

Before:                                  Cortex A53     A72     A73
inv_txfm_add_16x8_dct_dct_1_10bpc_neon:       874.4   709.0   707.3
After:
inv_txfm_add_16x8_dct_dct_1_10bpc_neon:       618.0   479.5   472.9
--- a/src/arm/64/itx16.S
+++ b/src/arm/64/itx16.S
@@ -1635,12 +1635,15 @@
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
+
+ cmp w3, w13
+ mov x11, #32
+ b.lt 1f
+
movi v4.4s, #0
movz w16, #2896*8, lsl #16
dup v0.2s, w16
- mov x11, #32
-
add x6, x2, #16
.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
ld1 {\i}, [x6]
@@ -1671,6 +1674,12 @@
transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5
+ b 2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+ movi \i, #0
+.endr
+2:
movz w16, #2896*8, lsl #16
dup v0.2s, w16
@@ -1897,7 +1906,6 @@
.endif
adr x4, inv_\txfm1\()_4s_x\w\()_neon
movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
-.if \w == 8
.ifc \txfm1, identity
.ifc \txfm2, identity
movrel x13, eob_8x16
@@ -1911,6 +1919,8 @@
movrel x13, eob_8x16
.endif
.endif
+.if \h == 8
+ ldrh w13, [x13]
.endif
b inv_txfm_add_\w\()x\h\()_neon
endfunc