shithub: dav1d

Download patch

ref: 5647a57eabc454e2e2360429aba494452af00cb3
parent: 3489a9c116ae2b2e258d41509fe35c9acf7cf5f5
author: Martin Storsjö <martin@martin.st>
date: Mon Oct 7 08:24:04 EDT 2019

arm64: mc: Use addp instead of addv+trn1 in warp

Before:           Cortex A53     A72     A73
warp_8x8_8bpc_neon:   1952.8  1161.3  1151.1
warp_8x8t_8bpc_neon:  1937.1  1147.5  1139.0
After:
warp_8x8_8bpc_neon:   1860.8  1068.6  1105.8
warp_8x8t_8bpc_neon:  1846.9  1056.4  1099.8

--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -3007,28 +3007,20 @@
         saddlp          v19.4s,  v19.8h
         mul             v22.8h,  v22.8h,  v5.8h
         saddlp          v20.4s,  v20.8h
-        addv            s23,     v23.4s
         saddlp          v21.4s,  v21.8h
-        addv            s18,     v18.4s
         saddlp          v22.4s,  v22.8h
-        addv            s19,     v19.4s
-        trn1            v18.2s,  v23.2s,  v18.2s
-        addv            s20,     v20.4s
+        addp            v18.4s,  v23.4s,  v18.4s
         ext             v23.16b, v16.16b, v17.16b, #2*6
-        trn1            v19.2s,  v19.2s,  v20.2s
-        addv            s21,     v21.4s
+        addp            v19.4s,  v19.4s,  v20.4s
         mul             v23.8h,  v23.8h,  v6.8h
         ext             v20.16b, v16.16b, v17.16b, #2*7
-        addv            s22,     v22.4s
         mul             v20.8h,  v20.8h,  v7.8h
         saddlp          v23.4s,  v23.8h
-        trn1            v21.2s,  v21.2s,  v22.2s
+        addp            v21.4s,  v21.4s,  v22.4s
         saddlp          v20.4s,  v20.8h
-        addv            s23,     v23.4s
-        addv            s20,     v20.4s
-        trn1            v20.2s,  v23.2s,  v20.2s
-        trn1            v18.2d,  v18.2d,  v19.2d
-        trn1            v20.2d,  v21.2d,  v20.2d
+        addp            v20.4s,  v23.4s,  v20.4s
+        addp            v18.4s,  v18.4s,  v19.4s
+        addp            v20.4s,  v21.4s,  v20.4s
 
         add             w5,  w5,  w8