shithub: dav1d

Download patch

ref: e80955cc94e78e1de28d8ef9462cd2df026f6fad
parent: 72af9329c0c003f68639301be33d4632147245b6
author: Martin Storsjö <martin@martin.st>
date: Thu Jan 10 05:48:50 EST 2019

arm64: mc: Optimize mc_8tap_regular_w4_hv_8bpc for A53

Before:                       Cortex A53   Snapdragon 835
mc_8tap_regular_w4_hv_8bpc_neon:   543.6   359.1
After:
mc_8tap_regular_w4_hv_8bpc_neon:   466.7   355.5

The same kind of change doesn't seem to give any benefit for the
8-pixel-wide hv filtering though, possibly because that code path
uses not only smull/smlal but also smull2/smlal2.

--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1447,14 +1447,17 @@
         mov             v18.8b, v29.8b
 
 4:
-        smull           v2.4s,  v16.4h, v1.h[0]
         bl              L(\type\()_8tap_filter_4)
-        smull           v3.4s,  v17.4h, v1.h[0]
+        // Interleaving the mul/mla chains actually hurts performance
+        // significantly on Cortex A53, thus keeping mul/mla tightly
+        // chained like this.
+        smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal           v3.4s,  v18.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v3.4s,  v28.4h, v1.h[2]
         smlal           v2.4s,  v28.4h, v1.h[3]
+        smull           v3.4s,  v17.4h, v1.h[0]
+        smlal           v3.4s,  v18.4h, v1.h[1]
+        smlal           v3.4s,  v28.4h, v1.h[2]
         smlal           v3.4s,  v29.4h, v1.h[3]
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqrshrn         v3.4h,  v3.4s,  #\shift_hv
@@ -1508,22 +1511,22 @@
         mov             v22.8b, v29.8b
 
 48:
-        smull           v2.4s,  v16.4h, v1.h[0]
         bl              L(\type\()_8tap_filter_4)
-        smull           v3.4s,  v17.4h, v1.h[0]
+        smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
-        smlal           v3.4s,  v18.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v3.4s,  v19.4h, v1.h[2]
         smlal           v2.4s,  v19.4h, v1.h[3]
-        smlal           v3.4s,  v20.4h, v1.h[3]
         smlal           v2.4s,  v20.4h, v1.h[4]
-        smlal           v3.4s,  v21.4h, v1.h[4]
         smlal           v2.4s,  v21.4h, v1.h[5]
-        smlal           v3.4s,  v22.4h, v1.h[5]
         smlal           v2.4s,  v22.4h, v1.h[6]
-        smlal           v3.4s,  v28.4h, v1.h[6]
         smlal           v2.4s,  v28.4h, v1.h[7]
+        smull           v3.4s,  v17.4h, v1.h[0]
+        smlal           v3.4s,  v18.4h, v1.h[1]
+        smlal           v3.4s,  v19.4h, v1.h[2]
+        smlal           v3.4s,  v20.4h, v1.h[3]
+        smlal           v3.4s,  v21.4h, v1.h[4]
+        smlal           v3.4s,  v22.4h, v1.h[5]
+        smlal           v3.4s,  v28.4h, v1.h[6]
         smlal           v3.4s,  v29.4h, v1.h[7]
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqrshrn         v3.4h,  v3.4s,  #\shift_hv