ref: 65a1aafda9fc9eda432408477cc7a3d0d7bd0d28
parent: 458273ed9e407253c434bc131916305902c19a1e
author: Martin Storsjö <martin@martin.st>
date: Thu Sep 3 05:35:29 EDT 2020
arm: mc: Avoid an unnecessary mov in 8tap_hv w2. This matches how the same logic is written for w4 and above.
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -1951,11 +1951,10 @@
bl L(\type\()_8tap_filter_2)
vext.8 d18, d17, d26, #4
- vmov d19, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
- vmlal.s16 q2, d19, d2[3]
+ vmlal.s16 q2, d26, d2[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
@@ -1964,7 +1963,7 @@
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
ble 0f
vmov d16, d18
- vmov d17, d19
+ vmov d17, d26
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -2001,7 +2000,6 @@
28:
bl L(\type\()_8tap_filter_2)
vext.8 d22, d21, d26, #4
- vmov d23, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
@@ -2009,7 +2007,7 @@
vmlal.s16 q2, d20, d3[0]
vmlal.s16 q2, d21, d3[1]
vmlal.s16 q2, d22, d3[2]
- vmlal.s16 q2, d23, d3[3]
+ vmlal.s16 q2, d26, d3[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
@@ -2022,7 +2020,7 @@
vmov d18, d20
vmov d19, d21
vmov d20, d22
- vmov d21, d23
+ vmov d21, d26
b 28b
0:
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -1906,11 +1906,10 @@
bl L(\type\()_8tap_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
- mov v19.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
- smlal v2.4s, v19.4h, v1.h[3]
+ smlal v2.4s, v28.4h, v1.h[3]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -1919,7 +1918,7 @@
st1 {v2.h}[1], [\ds2], \d_strd
b.le 0f
mov v16.8b, v18.8b
- mov v17.8b, v19.8b
+ mov v17.8b, v28.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -1956,7 +1955,6 @@
28:
bl L(\type\()_8tap_filter_2)
ext v22.8b, v21.8b, v28.8b, #4
- mov v23.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -1964,7 +1962,7 @@
smlal v2.4s, v20.4h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
- smlal v2.4s, v23.4h, v1.h[7]
+ smlal v2.4s, v28.4h, v1.h[7]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -1977,7 +1975,7 @@
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
- mov v21.8b, v23.8b
+ mov v21.8b, v28.8b
b 28b
0:
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -2057,11 +2057,10 @@
bl L(\type\()_8tap_filter_2)
ext v18.16b, v17.16b, v24.16b, #8
- mov v19.16b, v24.16b
mul v2.4s, v16.4s, v1.s[0]
mla v2.4s, v17.4s, v1.s[1]
mla v2.4s, v18.4s, v1.s[2]
- mla v2.4s, v19.4s, v1.s[3]
+ mla v2.4s, v24.4s, v1.s[3]
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
@@ -2071,7 +2070,7 @@
st1 {v2.s}[1], [\ds2], \d_strd
b.le 0f
mov v16.16b, v18.16b
- mov v17.16b, v19.16b
+ mov v17.16b, v24.16b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -2109,7 +2108,6 @@
28:
bl L(\type\()_8tap_filter_2)
ext v22.16b, v21.16b, v24.16b, #8
- mov v23.16b, v24.16b
mul v3.4s, v16.4s, v1.s[0]
mla v3.4s, v17.4s, v1.s[1]
mla v3.4s, v18.4s, v1.s[2]
@@ -2117,7 +2115,7 @@
mla v3.4s, v20.4s, v2.s[0]
mla v3.4s, v21.4s, v2.s[1]
mla v3.4s, v22.4s, v2.s[2]
- mla v3.4s, v23.4s, v2.s[3]
+ mla v3.4s, v24.4s, v2.s[3]
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2131,7 +2129,7 @@
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
- mov v21.16b, v23.16b
+ mov v21.16b, v24.16b
b 28b
0: