shithub: dav1d

Download patch

ref: 72af9329c0c003f68639301be33d4632147245b6
parent: fc5a3728144c62b634bb6fb036a6da47ee9bdf8f
author: Martin Storsjö <martin@martin.st>
date: Wed Jan 9 18:27:00 EST 2019

arm64: mc: Simplify the 8tap_2w_hv code slightly

Before:                       Cortex A53   Snapdragon 835
mc_8tap_regular_w2_hv_8bpc_neon:   415.0   286.9
After:
mc_8tap_regular_w2_hv_8bpc_neon:   399.1   269.9

--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1307,21 +1307,19 @@
         ext             v29.16b, v28.16b, v28.16b, #2
         mul             v28.4h,  v28.4h,  v0.4h
         mul             v29.4h,  v29.4h,  v0.4h
-        addv            h28, v28.4h
-        addv            h29, v29.4h
-        trn1            v16.4h, v28.4h, v29.4h
-        srshr           v16.4h, v16.4h, #2
+        addp            v28.4h,  v28.4h,  v29.4h
+        addp            v16.4h,  v28.4h,  v28.4h
+        srshr           v16.4h,  v16.4h,  #2
         bl              L(\type\()_8tap_filter_2)
 
         trn1            v16.2s, v16.2s, v28.2s
-        trn1            v17.2s, v28.2s, v30.2s
-        mov             v18.8b, v30.8b
+        mov             v17.8b, v28.8b
 
 2:
         bl              L(\type\()_8tap_filter_2)
 
-        trn1            v18.2s, v18.2s, v28.2s
-        trn1            v19.2s, v28.2s, v30.2s
+        ext             v18.8b, v17.8b, v28.8b, #4
+        mov             v19.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
@@ -1335,7 +1333,6 @@
         b.le            0f
         mov             v16.8b, v18.8b
         mov             v17.8b, v19.8b
-        mov             v18.8b, v30.8b
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
@@ -1355,28 +1352,24 @@
         ext             v29.16b, v28.16b, v28.16b, #2
         mul             v28.4h,  v28.4h,  v0.4h
         mul             v29.4h,  v29.4h,  v0.4h
-        addv            h28, v28.4h
-        addv            h29, v29.4h
-        trn1            v16.4h, v28.4h, v29.4h
-        srshr           v16.4h, v16.4h, #2
+        addp            v28.4h,  v28.4h,  v29.4h
+        addp            v16.4h,  v28.4h,  v28.4h
+        srshr           v16.4h,  v16.4h,  #2
 
         bl              L(\type\()_8tap_filter_2)
         trn1            v16.2s, v16.2s, v28.2s
-        trn1            v17.2s, v28.2s, v30.2s
-        mov             v18.8b, v30.8b
+        mov             v17.8b, v28.8b
         bl              L(\type\()_8tap_filter_2)
-        trn1            v18.2s, v18.2s, v28.2s
-        trn1            v19.2s, v28.2s, v30.2s
-        mov             v20.8b, v30.8b
+        ext             v18.8b, v17.8b, v28.8b, #4
+        mov             v19.8b, v28.8b
         bl              L(\type\()_8tap_filter_2)
-        trn1            v20.2s, v20.2s, v28.2s
-        trn1            v21.2s, v28.2s, v30.2s
-        mov             v22.8b, v30.8b
+        ext             v20.8b, v19.8b, v28.8b, #4
+        mov             v21.8b, v28.8b
 
 28:
         bl              L(\type\()_8tap_filter_2)
-        trn1            v22.2s, v22.2s, v28.2s
-        trn1            v23.2s, v28.2s, v30.2s
+        ext             v22.8b, v21.8b, v28.8b, #4
+        mov             v23.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
@@ -1398,7 +1391,6 @@
         mov             v19.8b, v21.8b
         mov             v20.8b, v22.8b
         mov             v21.8b, v23.8b
-        mov             v22.8b, v30.8b
         b               28b
 
 0:
@@ -1420,7 +1412,6 @@
         mla             v27.4h,  v30.4h,  v0.h[2]
         mla             v27.4h,  v31.4h,  v0.h[3]
         srshr           v28.4h,  v27.4h,  #2
-        trn2            v30.2s,  v28.2s,  v28.2s
         ret
 .endif