shithub: dav1d

Download patch

ref: 88798ebf44d5ab6c5c92d28b9190cbe619fcbc29
parent: 6cdfd4c53a6e6c8a9a903b6e5922112b8e1deaa3
author: Martin Storsjö <martin@martin.st>
date: Fri Jun 5 12:48:25 EDT 2020

arm64: itx16: Add a missed eob check in the 16x8 transform

This allows skipping half of the first transforms if the input
coefficients lie within the upper 4x4 (but checkasm only tests in
increments of 8x8 at the moment).

With checkasm modified to test in smaller increments, the speedup
is as follows:

Before:                             Cortex A53     A72     A73
inv_txfm_add_16x8_dct_dct_1_10bpc_neon:  874.4   709.0   707.3
After:
inv_txfm_add_16x8_dct_dct_1_10bpc_neon:  618.0   479.5   472.9

--- a/src/arm/64/itx16.S
+++ b/src/arm/64/itx16.S
@@ -1635,12 +1635,15 @@
         stp             d10, d11, [sp, #0x10]
         stp             d12, d13, [sp, #0x20]
         stp             d14, d15, [sp, #0x30]
+
+        cmp             w3,  w13
+        mov             x11, #32
+        b.lt            1f
+
         movi            v4.4s,  #0
         movz            w16, #2896*8, lsl #16
         dup             v0.2s,   w16
 
-        mov             x11, #32
-
         add             x6,  x2,  #16
 .irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
         ld1             {\i},    [x6]
@@ -1671,6 +1674,12 @@
         transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
         transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5
 
+        b               2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+        movi            \i,  #0
+.endr
+2:
         movz            w16, #2896*8, lsl #16
         dup             v0.2s,   w16
 
@@ -1897,7 +1906,6 @@
 .endif
         adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
         movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
-.if \w == 8
 .ifc \txfm1, identity
 .ifc \txfm2, identity
         movrel          x13, eob_8x16
@@ -1911,6 +1919,8 @@
         movrel          x13, eob_8x16
 .endif
 .endif
+.if \h == 8
+        ldrh            w13, [x13]
 .endif
         b               inv_txfm_add_\w\()x\h\()_neon
 endfunc