ref: 8d574f70278873c74444e65f762902e20811bc27
parent: 9f084b0d267029817599e6a6a9692350f823c1ae
author: Martin Storsjö <martin@martin.st>
date: Sat Jan 4 19:41:59 EST 2020
arm64: msac: Avoid 32 bit intermediates in symbol_adapt

This gives small gains on A72 and A73, and on A53 on symbol_adapt16.

Before:                            Cortex A53     A72     A73
msac_decode_symbol_adapt4_neon:          63.2    52.8    53.3
msac_decode_symbol_adapt8_neon:          68.5    57.9    55.7
msac_decode_symbol_adapt16_neon:         92.8    59.7    62.8
After:
msac_decode_symbol_adapt4_neon:          63.3    48.3    50.0
msac_decode_symbol_adapt8_neon:          68.7    55.5    54.0
msac_decode_symbol_adapt16_neon:         88.6    58.8    60.0
--- a/src/arm/64/msac.S
+++ b/src/arm/64/msac.S
@@ -110,28 +110,13 @@
.endif
.endm
-.macro umull_n d0, d1, d2, d3, s0, s1, s2, s3, n
- umull \d0\().4s, \s0\().4h, \s2\().4h
-.if \n >= 8
- umull2 \d1\().4s, \s0\().8h, \s2\().8h
-.endif
+.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
+ sqdmulh \d0\sz, \s0\sz, \s2\sz
.if \n == 16
- umull \d2\().4s, \s1\().4h, \s3\().4h
- umull2 \d3\().4s, \s1\().8h, \s3\().8h
+ sqdmulh \d1\sz, \s1\sz, \s3\sz
.endif
.endm
-.macro shrn_n d0, d1, s0, s1, s2, s3, shift, n
- shrn \d0\().4h, \s0\().4s, \shift
-.if \n >= 8
- shrn2 \d0\().8h, \s1\().4s, \shift
-.endif
-.if \n == 16
- shrn \d1\().4h, \s2\().4s, \shift
- shrn2 \d1\().8h, \s3\().4s, \shift
-.endif
-.endm
-
.macro str_n idx0, idx1, dstreg, dstoff, n
str q\idx0, [\dstreg, \dstoff]
.if \n == 16
@@ -149,17 +134,19 @@
ld1_n v0, v1, x1, \sz, \n // cdf
ld1r {v4\sz}, [x8] // rng
movrel x9, coeffs, 30
+ movi v31\sz, #0x7f, lsl #8 // 0x7f00
sub x9, x9, x2, lsl #1
- ushr_n v2, v3, v0, v1, #6, \sz, \n // cdf >> EC_PROB_SHIFT
+ mvni v30\sz, #0x3f // 0xffc0
+ and v7\szb, v4\szb, v31\szb // rng & 0x7f00
str h4, [sp, #14] // store original u = s->rng
- ushr v4\sz, v4\sz, #8 // r = rng >> 8
+ and_n v2, v3, v0, v1, v30, v30, \szb, \n // cdf & 0xffc0
- umull_n v16, v17, v18, v19, v4, v4, v2, v3, \n // r * (cdf >> EC_PROB_SHIFT)
ld1_n v4, v5, x9, \sz, \n // EC_MIN_PROB * (n_symbols - ret)
- shrn_n v2, v3, v16, v17, v18, v19, #1, \n // v >>= 7 - EC_PROB_SHIFT
+ sqdmulh_n v6, v7, v2, v3, v7, v7, \sz, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
add x8, x0, #DIF + 6
- add_n v4, v5, v2, v3, v4, v5, \sz, \n // v += EC_MIN_PROB * (n_symbols - ret)
+ add_n v4, v5, v2, v3, v4, v5, \sz, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+ add_n v4, v5, v6, v7, v4, v5, \sz, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16)
movrel x8, bits