ref: 4ae3f5f7f330fa8b5d3ae0794eaac3c25dc4ae48
parent: 65a1aafda9fc9eda432408477cc7a3d0d7bd0d28
author: Martin Storsjö <martin@martin.st>
date: Thu Sep 3 07:34:14 EDT 2020
arm64: mc: Apply tuning from w4/w8 case to w2 case in 16 bpc 8tap_hv

Narrowing the intermediates from the horizontal pass is beneficial here
as well (on most cores; it is a small slowdown on A53). This increases
consistency in the code between the cases.

(The corresponding change in the upcoming arm32 version is beneficial on
all tested cores except for A53 - it helps, on some cores a lot, on A7,
A8, A9, A72 and A73, and only makes it marginally slower on A53.)

Before:                            Cortex A53    A72    A73
mc_8tap_regular_w2_hv_16bpc_neon:       457.7  301.0  317.1
After:
mc_8tap_regular_w2_hv_16bpc_neon:       472.0  276.0  284.3
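As a rough illustration of the idea (not part of the patch itself), the same
transformation can be sketched with NEON intrinsics; the function and
parameter names below are hypothetical and only mirror the pattern used in
the assembly. The 32-bit horizontal intermediates are narrowed once with
vmovn_s32 (xtn), and the vertical 4-tap filter then uses the widening
16x16 -> 32 bit multiply-accumulates (smull/smlal) instead of full 32-bit
mul/mla:

#include <arm_neon.h>

/* Illustrative sketch only (names are hypothetical, not from the patch):
 * the horizontal-pass intermediates fit in 16 bits without any bias, so
 * they can be narrowed once and the vertical filter can accumulate with
 * widening multiplies. */
static inline int16x4_t narrow_mid(int32x4_t wide)
{
    return vmovn_s32(wide);                      /* xtn   v16.4h, v16.4s         */
}

static inline int32x4_t vert_tap4(int16x4_t m0, int16x4_t m1,
                                  int16x4_t m2, int16x4_t m3,
                                  int16x4_t filt)
{
    int32x4_t sum = vmull_lane_s16(m0, filt, 0); /* smull v2.4s, v16.4h, v1.h[0] */
    sum = vmlal_lane_s16(sum, m1, filt, 1);      /* smlal v2.4s, v17.4h, v1.h[1] */
    sum = vmlal_lane_s16(sum, m2, filt, 2);      /* smlal v2.4s, v18.4h, v1.h[2] */
    sum = vmlal_lane_s16(sum, m3, filt, 3);      /* smlal v2.4s, v24.4h, v1.h[3] */
    return sum;
}

The half-width registers also make the sliding-window moves between rows
cheaper (mov/ext on .8b instead of .16b), which is where the out-of-order
cores gain most of the speedup.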
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -2039,7 +2039,6 @@
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
- sxtl v1.4s, v1.4h
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
@@ -2049,18 +2048,23 @@
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
bl L(\type\()_8tap_filter_2)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
+ xtn v16.4h, v16.4s
- trn1 v16.2d, v16.2d, v24.2d
- mov v17.16b, v24.16b
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
2:
bl L(\type\()_8tap_filter_2)
- ext v18.16b, v17.16b, v24.16b, #8
- mul v2.4s, v16.4s, v1.s[0]
- mla v2.4s, v17.4s, v1.s[1]
- mla v2.4s, v18.4s, v1.s[2]
- mla v2.4s, v24.4s, v1.s[3]
+ ext v18.8b, v17.8b, v24.8b, #4
+ smull v2.4s, v16.4h, v1.h[0]
+ smlal v2.4s, v17.4h, v1.h[1]
+ smlal v2.4s, v18.4h, v1.h[2]
+ smlal v2.4s, v24.4h, v1.h[3]
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
@@ -2069,8 +2073,8 @@
st1 {v2.s}[0], [\dst], \d_strd
st1 {v2.s}[1], [\ds2], \d_strd
b.le 0f
- mov v16.16b, v18.16b
- mov v17.16b, v24.16b
+ mov v16.8b, v18.8b
+ mov v17.8b, v24.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -2084,8 +2088,6 @@
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
- sxtl2 v2.4s, v1.8h
- sxtl v1.4s, v1.4h
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
@@ -2094,28 +2096,33 @@
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+ // The intermediates from the horizontal pass fit in 16 bit without
+ // any bias; we could just as well keep them as .4s, but narrowing
+ // them to .4h gives a significant speedup on out of order cores
+ // (at the cost of a smaller slowdown on in-order cores such as A53).
bl L(\type\()_8tap_filter_2)
- trn1 v16.2d, v16.2d, v24.2d
- mov v17.16b, v24.16b
+ xtn v16.4h, v16.4s
+ trn1 v16.2s, v16.2s, v24.2s
+ mov v17.8b, v24.8b
bl L(\type\()_8tap_filter_2)
- ext v18.16b, v17.16b, v24.16b, #8
- mov v19.16b, v24.16b
+ ext v18.8b, v17.8b, v24.8b, #4
+ mov v19.8b, v24.8b
bl L(\type\()_8tap_filter_2)
- ext v20.16b, v19.16b, v24.16b, #8
- mov v21.16b, v24.16b
+ ext v20.8b, v19.8b, v24.8b, #4
+ mov v21.8b, v24.8b
28:
bl L(\type\()_8tap_filter_2)
- ext v22.16b, v21.16b, v24.16b, #8
- mul v3.4s, v16.4s, v1.s[0]
- mla v3.4s, v17.4s, v1.s[1]
- mla v3.4s, v18.4s, v1.s[2]
- mla v3.4s, v19.4s, v1.s[3]
- mla v3.4s, v20.4s, v2.s[0]
- mla v3.4s, v21.4s, v2.s[1]
- mla v3.4s, v22.4s, v2.s[2]
- mla v3.4s, v24.4s, v2.s[3]
+ ext v22.8b, v21.8b, v24.8b, #4
+ smull v3.4s, v16.4h, v1.h[0]
+ smlal v3.4s, v17.4h, v1.h[1]
+ smlal v3.4s, v18.4h, v1.h[2]
+ smlal v3.4s, v19.4h, v1.h[3]
+ smlal v3.4s, v20.4h, v1.h[4]
+ smlal v3.4s, v21.4h, v1.h[5]
+ smlal v3.4s, v22.4h, v1.h[6]
+ smlal v3.4s, v24.4h, v1.h[7]
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2124,12 +2131,12 @@
st1 {v3.s}[0], [\dst], \d_strd
st1 {v3.s}[1], [\ds2], \d_strd
b.le 0f
- mov v16.16b, v18.16b
- mov v17.16b, v19.16b
- mov v18.16b, v20.16b
- mov v19.16b, v21.16b
- mov v20.16b, v22.16b
- mov v21.16b, v24.16b
+ mov v16.8b, v18.8b
+ mov v17.8b, v19.8b
+ mov v18.8b, v20.8b
+ mov v19.8b, v21.8b
+ mov v20.8b, v22.8b
+ mov v21.8b, v24.8b
b 28b
0:
@@ -2149,6 +2156,7 @@
smlal v24.4s, v27.4h, v0.h[2]
smlal v24.4s, v28.4h, v0.h[3]
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+ xtn v24.4h, v24.4s
ret
.endif