shithub: dav1d

Download patch

ref: 8d574f70278873c74444e65f762902e20811bc27
parent: 9f084b0d267029817599e6a6a9692350f823c1ae
author: Martin Storsjö <martin@martin.st>
date: Sat Jan 4 19:41:59 EST 2020

arm64: msac: Avoid 32-bit intermediates in symbol_adapt

This gives small gains on A72 and A73, and on A53 for symbol_adapt16.

Before:                      Cortex A53    A72    A73
msac_decode_symbol_adapt4_neon:    63.2   52.8   53.3
msac_decode_symbol_adapt8_neon:    68.5   57.9   55.7
msac_decode_symbol_adapt16_neon:   92.8   59.7   62.8
After:
msac_decode_symbol_adapt4_neon:    63.3   48.3   50.0
msac_decode_symbol_adapt8_neon:    68.7   55.5   54.0
msac_decode_symbol_adapt16_neon:   88.6   58.8   60.0

--- a/src/arm/64/msac.S
+++ b/src/arm/64/msac.S
@@ -110,28 +110,13 @@
 .endif
 .endm
 
-.macro umull_n d0, d1, d2, d3, s0, s1, s2, s3, n
-        umull           \d0\().4s, \s0\().4h,  \s2\().4h
-.if \n >= 8
-        umull2          \d1\().4s, \s0\().8h,  \s2\().8h
-.endif
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
+        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
 .if \n == 16
-        umull           \d2\().4s, \s1\().4h,  \s3\().4h
-        umull2          \d3\().4s, \s1\().8h,  \s3\().8h
+        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
 .endif
 .endm
 
-.macro shrn_n d0, d1, s0, s1, s2, s3, shift, n
-        shrn            \d0\().4h,  \s0\().4s, \shift
-.if \n >= 8
-        shrn2           \d0\().8h,  \s1\().4s, \shift
-.endif
-.if \n == 16
-        shrn            \d1\().4h,  \s2\().4s, \shift
-        shrn2           \d1\().8h,  \s3\().4s, \shift
-.endif
-.endm
-
 .macro str_n            idx0, idx1, dstreg, dstoff, n
         str             q\idx0,  [\dstreg, \dstoff]
 .if \n == 16
@@ -149,17 +134,19 @@
         ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
         ld1r            {v4\sz},  [x8]                            // rng
         movrel          x9,  coeffs, 30
+        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
         sub             x9,  x9,  x2, lsl #1
-        ushr_n          v2,  v3,  v0,  v1,  #6, \sz, \n           // cdf >> EC_PROB_SHIFT
+        mvni            v30\sz, #0x3f                             // 0xffc0
+        and             v7\szb, v4\szb, v31\szb                   // rng & 0x7f00
         str             h4,  [sp, #14]                            // store original u = s->rng
-        ushr            v4\sz,  v4\sz,  #8                        // r = rng >> 8
+        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0
 
-        umull_n         v16, v17, v18, v19, v4,  v4,  v2,  v3, \n // r * (cdf >> EC_PROB_SHIFT)
         ld1_n           v4,  v5,  x9,  \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
-        shrn_n          v2,  v3,  v16, v17, v18, v19, #1, \n      // v >>= 7 - EC_PROB_SHIFT
+        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
         add             x8,  x0,  #DIF + 6
 
-        add_n           v4,  v5,  v2,  v3,  v4,  v5, \sz, \n      // v += EC_MIN_PROB * (n_symbols - ret)
+        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = (cdf & 0xffc0) + EC_MIN_PROB * (n_symbols - ret)
+        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = (((cdf >> EC_PROB_SHIFT) * r) >> 1) + EC_MIN_PROB * (n_symbols - ret)
 
         ld1r            {v6.8h},  [x8]                            // dif >> (EC_WIN_SIZE - 16)
         movrel          x8,  bits