shithub: dav1d

Download patch

ref: 65a1aafda9fc9eda432408477cc7a3d0d7bd0d28
parent: 458273ed9e407253c434bc131916305902c19a1e
author: Martin Storsjö <martin@martin.st>
date: Thu Sep 3 05:35:29 EDT 2020

arm: mc: Avoid an unnecessary mov in 8tap_hv w2

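This matches how the same logic is written for w4 and above. The source register (d26 in the 32-bit code, v28/v24 in the 64-bit code) is now used directly as the last multiply-accumulate operand and when shifting the row history at the end of the loop, instead of first being copied into a temporary register.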

--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -1951,11 +1951,10 @@
         bl              L(\type\()_8tap_filter_2)
 
         vext.8          d18, d17, d26, #4
-        vmov            d19, d26
         vmull.s16       q2,  d16, d2[0]
         vmlal.s16       q2,  d17, d2[1]
         vmlal.s16       q2,  d18, d2[2]
-        vmlal.s16       q2,  d19, d2[3]
+        vmlal.s16       q2,  d26, d2[3]
 
         vqrshrn.s32     d4,  q2,  #\shift_hv
         vqmovun.s16     d4,  q2
@@ -1964,7 +1963,7 @@
         vst1.16         {d4[1]}, [\ds2, :16], \d_strd
         ble             0f
         vmov            d16, d18
-        vmov            d17, d19
+        vmov            d17, d26
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
@@ -2001,7 +2000,6 @@
 28:
         bl              L(\type\()_8tap_filter_2)
         vext.8          d22, d21, d26, #4
-        vmov            d23, d26
         vmull.s16       q2,  d16, d2[0]
         vmlal.s16       q2,  d17, d2[1]
         vmlal.s16       q2,  d18, d2[2]
@@ -2009,7 +2007,7 @@
         vmlal.s16       q2,  d20, d3[0]
         vmlal.s16       q2,  d21, d3[1]
         vmlal.s16       q2,  d22, d3[2]
-        vmlal.s16       q2,  d23, d3[3]
+        vmlal.s16       q2,  d26, d3[3]
 
         vqrshrn.s32     d4,  q2,  #\shift_hv
         vqmovun.s16     d4,  q2
@@ -2022,7 +2020,7 @@
         vmov            d18, d20
         vmov            d19, d21
         vmov            d20, d22
-        vmov            d21, d23
+        vmov            d21, d26
         b               28b
 
 0:
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1906,11 +1906,10 @@
         bl              L(\type\()_8tap_filter_2)
 
         ext             v18.8b, v17.8b, v28.8b, #4
-        mov             v19.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
-        smlal           v2.4s,  v19.4h, v1.h[3]
+        smlal           v2.4s,  v28.4h, v1.h[3]
 
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqxtun          v2.8b,  v2.8h
@@ -1919,7 +1918,7 @@
         st1             {v2.h}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.8b, v18.8b
-        mov             v17.8b, v19.8b
+        mov             v17.8b, v28.8b
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
@@ -1956,7 +1955,6 @@
 28:
         bl              L(\type\()_8tap_filter_2)
         ext             v22.8b, v21.8b, v28.8b, #4
-        mov             v23.8b, v28.8b
         smull           v2.4s,  v16.4h, v1.h[0]
         smlal           v2.4s,  v17.4h, v1.h[1]
         smlal           v2.4s,  v18.4h, v1.h[2]
@@ -1964,7 +1962,7 @@
         smlal           v2.4s,  v20.4h, v1.h[4]
         smlal           v2.4s,  v21.4h, v1.h[5]
         smlal           v2.4s,  v22.4h, v1.h[6]
-        smlal           v2.4s,  v23.4h, v1.h[7]
+        smlal           v2.4s,  v28.4h, v1.h[7]
 
         sqrshrn         v2.4h,  v2.4s,  #\shift_hv
         sqxtun          v2.8b,  v2.8h
@@ -1977,7 +1975,7 @@
         mov             v18.8b, v20.8b
         mov             v19.8b, v21.8b
         mov             v20.8b, v22.8b
-        mov             v21.8b, v23.8b
+        mov             v21.8b, v28.8b
         b               28b
 
 0:
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -2057,11 +2057,10 @@
         bl              L(\type\()_8tap_filter_2)
 
         ext             v18.16b, v17.16b, v24.16b, #8
-        mov             v19.16b, v24.16b
         mul             v2.4s,   v16.4s,  v1.s[0]
         mla             v2.4s,   v17.4s,  v1.s[1]
         mla             v2.4s,   v18.4s,  v1.s[2]
-        mla             v2.4s,   v19.4s,  v1.s[3]
+        mla             v2.4s,   v24.4s,  v1.s[3]
 
         srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
         sqxtun          v2.4h,   v2.4s
@@ -2071,7 +2070,7 @@
         st1             {v2.s}[1], [\ds2], \d_strd
         b.le            0f
         mov             v16.16b, v18.16b
-        mov             v17.16b, v19.16b
+        mov             v17.16b, v24.16b
         b               2b
 
 280:    // 2x8, 2x16, 2x32 hv
@@ -2109,7 +2108,6 @@
 28:
         bl              L(\type\()_8tap_filter_2)
         ext             v22.16b, v21.16b, v24.16b, #8
-        mov             v23.16b, v24.16b
         mul             v3.4s,   v16.4s,  v1.s[0]
         mla             v3.4s,   v17.4s,  v1.s[1]
         mla             v3.4s,   v18.4s,  v1.s[2]
@@ -2117,7 +2115,7 @@
         mla             v3.4s,   v20.4s,  v2.s[0]
         mla             v3.4s,   v21.4s,  v2.s[1]
         mla             v3.4s,   v22.4s,  v2.s[2]
-        mla             v3.4s,   v23.4s,  v2.s[3]
+        mla             v3.4s,   v24.4s,  v2.s[3]
 
         srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
         sqxtun          v3.4h,   v3.4s
@@ -2131,7 +2129,7 @@
         mov             v18.16b, v20.16b
         mov             v19.16b, v21.16b
         mov             v20.16b, v22.16b
-        mov             v21.16b, v23.16b
+        mov             v21.16b, v24.16b
         b               28b
 
 0: