shithub: dav1d

Download patch

ref: b1167ce169f004f90bcc4a9e8841ffb90fe4abf1
parent: 0bad117eb0f97594a938f17ba05d3ca89ba81a9f
author: Martin Storsjö <martin@martin.st>
date: Sat Feb 1 09:33:58 EST 2020

arm64: mc: Use two regs for alternating output rows for w4/8 in avg/w_avg/mask

It was already done this way for w32/64. It is not done for w16, as it
didn't help there (and instead gave a small slowdown due to the two
setup instructions).

This gives a small speedup on in-order cores like A53.

Before:         Cortex A53     A72     A73
avg_w4_8bpc_neon:     60.9    25.6    29.0
avg_w8_8bpc_neon:    143.6    52.8    64.0
After:
avg_w4_8bpc_neon:     56.7    26.7    28.5
avg_w8_8bpc_neon:    137.2    54.5    64.4

--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -85,38 +85,44 @@
         \type           v4,  v0,  v1,  v2,  v3
         sub             x7,  x7,  w4, uxtw
         br              x7
+40:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
 4:
         cmp             w5,  #4
         st1             {v4.s}[0],  [x0], x1
-        st1             {v4.s}[1],  [x0], x1
+        st1             {v4.s}[1],  [x7], x1
         st1             {v4.s}[2],  [x0], x1
-        st1             {v4.s}[3],  [x0], x1
+        st1             {v4.s}[3],  [x7], x1
         b.eq            0f
         \type           v5,  v0,  v1,  v2,  v3
         cmp             w5,  #8
         st1             {v5.s}[0],  [x0], x1
-        st1             {v5.s}[1],  [x0], x1
+        st1             {v5.s}[1],  [x7], x1
         st1             {v5.s}[2],  [x0], x1
-        st1             {v5.s}[3],  [x0], x1
+        st1             {v5.s}[3],  [x7], x1
         b.eq            0f
         \type           v4,  v0,  v1,  v2,  v3
         st1             {v4.s}[0],  [x0], x1
-        st1             {v4.s}[1],  [x0], x1
+        st1             {v4.s}[1],  [x7], x1
         \type           v5,  v0,  v1,  v2,  v3
         st1             {v4.s}[2],  [x0], x1
-        st1             {v4.s}[3],  [x0], x1
+        st1             {v4.s}[3],  [x7], x1
         st1             {v5.s}[0],  [x0], x1
-        st1             {v5.s}[1],  [x0], x1
+        st1             {v5.s}[1],  [x7], x1
         st1             {v5.s}[2],  [x0], x1
-        st1             {v5.s}[3],  [x0], x1
+        st1             {v5.s}[3],  [x7], x1
         ret
+80:
+        add             x7,  x0,  x1
+        lsl             x1,  x1,  #1
 8:
         st1             {v4.d}[0],  [x0], x1
         \type           v5,  v0,  v1,  v2,  v3
-        st1             {v4.d}[1],  [x0], x1
+        st1             {v4.d}[1],  [x7], x1
         st1             {v5.d}[0],  [x0], x1
         subs            w5,  w5,  #4
-        st1             {v5.d}[1],  [x0], x1
+        st1             {v5.d}[1],  [x7], x1
         b.le            0f
         \type           v4,  v0,  v1,  v2,  v3
         b               8b
@@ -185,8 +191,8 @@
         .hword L(\type\()_tbl) -  640b
         .hword L(\type\()_tbl) -  320b
         .hword L(\type\()_tbl) -   16b
-        .hword L(\type\()_tbl) -    8b
-        .hword L(\type\()_tbl) -    4b
+        .hword L(\type\()_tbl) -   80b
+        .hword L(\type\()_tbl) -   40b
 endfunc
 .endm