shithub: libvpx

ref: 429e6528097850b08b675e1fa8d75eef59a10e32
parent: 4b402746cad6f9dbebbb3c3b79a2da098385d46f
author: Linfeng Zhang <linfengz@google.com>
date: Tue Feb 14 07:44:57 EST 2017

Replace 14 with DCT_CONST_BITS in idct NEON functions' shifts

Change-Id: I2a39a3bb87516b04d273bc1c0f4a634e3fb6f0f6

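For context, DCT_CONST_BITS is the rounding-shift width shared by the scalar transform code, so the literal 14 in these intrinsics was the same quantity spelled as a magic number. A minimal scalar sketch of that equivalence (not part of this patch; the macro and helper definitions below are assumed to match vpx_dsp/vpx_dsp_common.h and vpx_dsp/txfm_common.h):

#include <stdint.h>
#include <stdio.h>

/* Assumed to match vpx_dsp/vpx_dsp_common.h and vpx_dsp/txfm_common.h. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define DCT_CONST_BITS 14

typedef int64_t tran_high_t;

/* Scalar reference for the rounding shift that the NEON
 * vrshrn_n_s64(x, DCT_CONST_BITS) / vrshrq_n_s32(x, DCT_CONST_BITS) calls
 * in this patch perform lane-wise. */
static tran_high_t dct_const_round_shift(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

int main(void) {
  const tran_high_t product = (tran_high_t)12345 * 15137; /* coeff * cospi */
  /* Writing 14 or DCT_CONST_BITS yields the identical result. */
  printf("%lld %lld\n", (long long)ROUND_POWER_OF_TWO(product, 14),
         (long long)dct_const_round_shift(product));
  return 0;
}
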
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -19,14 +19,14 @@
                                                      int32x4x2_t *const d1) {
   int32x2x2_t t32[4];
 
-  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], 14);
-  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], 14);
-  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], 14);
-  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], 14);
-  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], 14);
-  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], 14);
-  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], 14);
-  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], 14);
+  t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+  t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+  t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+  t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+  t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+  t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+  t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+  t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
   d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
   d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
   d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -82,10 +82,10 @@
   b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
   b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
   b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
-  b0 = vrshrq_n_s32(b0, 14);
-  b1 = vrshrq_n_s32(b1, 14);
-  b2 = vrshrq_n_s32(b2, 14);
-  b3 = vrshrq_n_s32(b3, 14);
+  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
   *a0 = vaddq_s32(b0, b3);
   *a1 = vaddq_s32(b1, b2);
   *a2 = vsubq_s32(b1, b2);
@@ -119,10 +119,14 @@
   c5 = vsubq_s64(c5, c9);
   c6 = vaddq_s64(c6, c10);
   c7 = vaddq_s64(c7, c11);
-  b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
-  b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
-  b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
-  b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
+                    vrshrn_n_s64(c1, DCT_CONST_BITS));
+  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
+                    vrshrn_n_s64(c3, DCT_CONST_BITS));
+  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
+                    vrshrn_n_s64(c5, DCT_CONST_BITS));
+  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
+                    vrshrn_n_s64(c7, DCT_CONST_BITS));
   *a0 = vaddq_s32(b0, b3);
   *a1 = vaddq_s32(b1, b2);
   *a2 = vsubq_s32(b1, b2);
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -82,18 +82,18 @@
   step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
   step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
   step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 
   // stage 2
   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
   step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
   step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 
   step2[4] = vaddq_s32(step1[4], step1[5]);
   step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -109,8 +109,8 @@
   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vaddq_s32(step1[0], step2[7]);
@@ -154,14 +154,14 @@
   t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
   t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
   t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step1[4] = vcombine_s32(t32[0], t32[1]);
   step1[5] = vcombine_s32(t32[2], t32[3]);
   step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -174,12 +174,12 @@
   t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
   t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
   t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step2[1] = vcombine_s32(t32[2], t32[3]);
   step2[2] = vcombine_s32(t32[4], t32[5]);
   step2[3] = vcombine_s32(t32[6], t32[7]);
@@ -205,10 +205,10 @@
       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
   t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                           vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
   step1[5] = vcombine_s32(t32[0], t32[1]);
   step1[6] = vcombine_s32(t32[2], t32[3]);
 
@@ -377,10 +377,10 @@
   step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
   step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
 
-  step1[4] = vrshrq_n_s32(step1[4], 14);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
-  step1[7] = vrshrq_n_s32(step1[7], 14);
+  step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+  step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
 
   // stage 2
   step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
@@ -392,10 +392,10 @@
   step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
   step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
 
-  step2[0] = vrshrq_n_s32(step2[0], 14);
-  step2[1] = vrshrq_n_s32(step2[1], 14);
-  step2[2] = vrshrq_n_s32(step2[2], 14);
-  step2[3] = vrshrq_n_s32(step2[3], 14);
+  step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+  step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+  step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+  step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
 
   step2[4] = vaddq_s32(step1[4], step1[5]);
   step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -411,8 +411,8 @@
   step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
   step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
   step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
-  step1[5] = vrshrq_n_s32(step1[5], 14);
-  step1[6] = vrshrq_n_s32(step1[6], 14);
+  step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+  step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vaddq_s32(step1[0], step2[7]);
@@ -473,14 +473,14 @@
   t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
   t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
   t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step1[4] = vcombine_s32(t32[0], t32[1]);
   step1[5] = vcombine_s32(t32[2], t32[3]);
   step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -501,14 +501,14 @@
   t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
   t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
   t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
-  t32[4] = vrshrn_n_s64(t64[4], 14);
-  t32[5] = vrshrn_n_s64(t64[5], 14);
-  t32[6] = vrshrn_n_s64(t64[6], 14);
-  t32[7] = vrshrn_n_s64(t64[7], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+  t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+  t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+  t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+  t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
   step2[0] = vcombine_s32(t32[0], t32[1]);
   step2[1] = vcombine_s32(t32[2], t32[3]);
   step2[2] = vcombine_s32(t32[4], t32[5]);
@@ -535,10 +535,10 @@
       vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
   t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
                           vget_high_s32(cospis0), 0);
-  t32[0] = vrshrn_n_s64(t64[0], 14);
-  t32[1] = vrshrn_n_s64(t64[1], 14);
-  t32[2] = vrshrn_n_s64(t64[2], 14);
-  t32[3] = vrshrn_n_s64(t64[3], 14);
+  t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+  t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+  t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+  t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
   step1[5] = vcombine_s32(t32[0], t32[1]);
   step1[6] = vcombine_s32(t32[2], t32[3]);
 
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -16,8 +16,8 @@
 
 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
                                 int16x4_t *const d1) {
-  *d0 = vrshrn_n_s32(t32[0], 14);
-  *d1 = vrshrn_n_s32(t32[1], 14);
+  *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
 }
 
 static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -147,8 +147,10 @@
   q11s32 = vaddq_s32(q12s32, q11s32);
   q10s32 = vaddq_s32(q10s32, q15s32);
 
-  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
-  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+  *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q9s32, DCT_CONST_BITS));
+  *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, DCT_CONST_BITS),
+                        vrshrn_n_s32(q10s32, DCT_CONST_BITS));
 }
 
 static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -15,6 +15,7 @@
 
 #include "./vpx_config.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
@@ -93,14 +94,14 @@
 
 //------------------------------------------------------------------------------
 
-// Multiply a by a_const. Saturate, shift and narrow by 14.
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                       const int16_t a_const) {
-  // Shift by 14 + rounding will be within 16 bits for well formed streams.
-  // See WRAPLOW and dct_const_round_shift for details.
+  // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+  // streams. See WRAPLOW and dct_const_round_shift for details.
   // This instruction doubles the result and returns the high half, essentially
   // resulting in a right shift by 15. By multiplying the constant first that
-  // becomes a right shift by 14.
+  // becomes a right shift by DCT_CONST_BITS.
   // The largest possible value used here is
   // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just*
   // within the range of int16_t (+32767 / -32768) even when negated.
@@ -107,7 +108,7 @@
   return vqrdmulhq_n_s16(a, a_const * 2);
 }
 
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
 static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   // In both add_ and it's pair, sub_, the input for well-formed streams will be
@@ -121,10 +122,12 @@
   int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
 static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
@@ -131,11 +134,12 @@
   int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
 static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
     const int16x8_t a, const int16_t a_const, const int16x8_t b,
     const int16_t b_const) {
@@ -143,7 +147,8 @@
   int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
   temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
   temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Shift the output down by 6 and add it to the destination buffer.
@@ -233,10 +238,10 @@
   c3 = vmull_lane_s16(b2, cospis, 1);
   c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
   c3 = vmlal_lane_s16(c3, b3, cospis, 3);
-  b0 = vrshrn_n_s32(c0, 14);
-  b1 = vrshrn_n_s32(c1, 14);
-  b2 = vrshrn_n_s32(c2, 14);
-  b3 = vrshrn_n_s32(c3, 14);
+  b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
+  b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
+  b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
+  b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
   d0 = vcombine_s16(b0, b1);
   d1 = vcombine_s16(b3, b2);
   *a0 = vaddq_s16(d0, d1);
@@ -278,8 +283,8 @@
   t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
   t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
   t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
-  step1[5] = vrshrn_n_s32(t32[0], 14);
-  step1[6] = vrshrn_n_s32(t32[1], 14);
+  step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
 
   // stage 4
   *io0 = vadd_s16(step1[0], step2[7]);
@@ -337,10 +342,10 @@
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   step1[5] = vcombine_s16(t16[0], t16[1]);
   step1[6] = vcombine_s16(t16[2], t16[3]);
 
@@ -405,14 +410,14 @@
   t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
   t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
   t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
   step1[4] = vcombine_s16(t16[0], t16[1]);
   step1[5] = vcombine_s16(t16[2], t16[3]);
   step1[6] = vcombine_s16(t16[4], t16[5]);
@@ -433,14 +438,14 @@
   t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
   t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
   t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
-  t16[4] = vrshrn_n_s32(t32[4], 14);
-  t16[5] = vrshrn_n_s32(t32[5], 14);
-  t16[6] = vrshrn_n_s32(t32[6], 14);
-  t16[7] = vrshrn_n_s32(t32[7], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+  t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+  t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+  t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+  t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
   step2[0] = vcombine_s16(t16[0], t16[1]);
   step2[1] = vcombine_s16(t16[2], t16[3]);
   step2[2] = vcombine_s16(t16[4], t16[5]);
@@ -463,10 +468,10 @@
   t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
   t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
   t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   step1[5] = vcombine_s16(t16[0], t16[1]);
   step1[6] = vcombine_s16(t16[2], t16[3]);
 
@@ -486,10 +491,10 @@
                                               int16x8_t *const d1) {
   int16x4_t t16[4];
 
-  t16[0] = vrshrn_n_s32(t32[0], 14);
-  t16[1] = vrshrn_n_s32(t32[1], 14);
-  t16[2] = vrshrn_n_s32(t32[2], 14);
-  t16[3] = vrshrn_n_s32(t32[3], 14);
+  t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+  t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+  t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+  t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
   *d0 = vcombine_s16(t16[0], t16[1]);
   *d1 = vcombine_s16(t16[2], t16[3]);
 }
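
The comments updated in multiply_shift_and_narrow_s16 above explain that vqrdmulhq_n_s16 doubles the product and keeps the high half (effectively a shift by 15), so pre-doubling the constant turns it into the DCT_CONST_BITS rounding shift. A scalar sketch of that identity (illustrative only, not libvpx code; it assumes the model below matches the NEON VQRDMULH instruction and that cospi_16_64 has its value from vpx_dsp/txfm_common.h):

#include <assert.h>
#include <stdint.h>

#define DCT_CONST_BITS 14

/* Scalar model of VQRDMULH (saturating rounding doubling multiply, high
 * half): (2*a*b + 2^15) >> 16, clamped to the int16_t range. */
static int16_t sqrdmulh_model_s16(int16_t a, int16_t b) {
  int64_t product = 2 * (int64_t)a * b + (1 << 15);
  product >>= 16;
  if (product > INT16_MAX) product = INT16_MAX;
  if (product < INT16_MIN) product = INT16_MIN;
  return (int16_t)product;
}

int main(void) {
  const int16_t a = 3210;            /* example coefficient */
  const int16_t cospi_16_64 = 11585; /* constant from vpx_dsp/txfm_common.h */
  /* Pre-doubling the constant, as multiply_shift_and_narrow_s16 does, makes
   * the high-half extraction equal a rounding shift by DCT_CONST_BITS:
   * (2*a*(2c) + 2^15) >> 16 == (a*c + 2^13) >> 14. */
  const int16_t via_vqrdmulh = sqrdmulh_model_s16(a, cospi_16_64 * 2);
  const int16_t reference =
      (int16_t)(((int32_t)a * cospi_16_64 + (1 << (DCT_CONST_BITS - 1))) >>
                DCT_CONST_BITS);
  assert(via_vqrdmulh == reference);
  return 0;
}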