ref: 8d574f70278873c74444e65f762902e20811bc27
parent: 9f084b0d267029817599e6a6a9692350f823c1ae
author: Martin Storsjö <martin@martin.st>
date: Sat Jan 4 19:41:59 EST 2020
arm64: msac: Avoid 32 bit intermediates in symbol_adapt

This gives small gains on A72 and A73, and on A53 on symbol_adapt16.

Before:                            Cortex A53     A72     A73
msac_decode_symbol_adapt4_neon:          63.2    52.8    53.3
msac_decode_symbol_adapt8_neon:          68.5    57.9    55.7
msac_decode_symbol_adapt16_neon:         92.8    59.7    62.8
After:
msac_decode_symbol_adapt4_neon:          63.3    48.3    50.0
msac_decode_symbol_adapt8_neon:          68.7    55.5    54.0
msac_decode_symbol_adapt16_neon:         88.6    58.8    60.0
--- a/src/arm/64/msac.S
+++ b/src/arm/64/msac.S
@@ -110,28 +110,13 @@
.endif
.endm
-.macro umull_n d0, d1, d2, d3, s0, s1, s2, s3, n
- umull \d0\().4s, \s0\().4h, \s2\().4h
-.if \n >= 8
- umull2 \d1\().4s, \s0\().8h, \s2\().8h
-.endif
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
+ sqdmulh \d0\sz, \s0\sz, \s2\sz
.if \n == 16
- umull \d2\().4s, \s1\().4h, \s3\().4h
- umull2 \d3\().4s, \s1\().8h, \s3\().8h
+ sqdmulh \d1\sz, \s1\sz, \s3\sz
.endif
.endm
-.macro shrn_n d0, d1, s0, s1, s2, s3, shift, n
- shrn \d0\().4h, \s0\().4s, \shift
-.if \n >= 8
- shrn2 \d0\().8h, \s1\().4s, \shift
-.endif
-.if \n == 16
- shrn \d1\().4h, \s2\().4s, \shift
- shrn2 \d1\().8h, \s3\().4s, \shift
-.endif
-.endm
-
.macro str_n idx0, idx1, dstreg, dstoff, n
str q\idx0, [\dstreg, \dstoff]
.if \n == 16
@@ -149,17 +134,19 @@
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 30
+ movi v31\sz, #0x7f, lsl #8 // 0x7f00
sub x9, x9, x2, lsl #1
- ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
+ mvni v30\sz, #0x3f // 0xffc0
+ and v7\szb, v4\szb, v31\szb // rng & 0x7f00
str h4, [sp, #14] // store original u = s->rng
- ushr v4\sz, v4\sz, #8 // r = rng >> 8
+ and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
- umull_n v16, v17, v18, v19, v4, v4, v2, v3, \n // r * (cdf >> EC_PROB_SHIFT)
ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
- shrn_n v2, v3, v16, v17, v18, v19, #1, \n // v >>= 7 - EC_PROB_SHIFT
+ sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add x8, x0, #DIF + 6
- add_n v4, v5, v2, v3, v4, v5, \sz, \n // v += EC_MIN_PROB * (n_symbols - ret)
+ add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
movrel x8, bits