shithub: libvpx

ref: 081b39f2b765f83406ac41a9ad8c8d6830e1707a
parent: a10a5cb3567a0cfcea73c2e3765bc86d427646e3
author: Linfeng Zhang <linfengz@google.com>
date: Tue May 2 06:44:12 EDT 2017

Clean CONVERT_TO_BYTEPTR/SHORTPTR in idct

BUG=webm:1388

Change-Id: Ida62c941f2b836d6c9e27b427a7d5008ab6dc112
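
This change switches the high bitdepth idct paths from the CONVERT_TO_* pointer-shifting macros to plain CAST_TO_* casts, with callers performing the one remaining CONVERT where the framebuffer pointer is still in the shifted form. For reference, the two macro families behave roughly as sketched below (a sketch assumed to match vpx_dsp/vpx_dsp_common.h; the exact definitions in the tree may differ):

    /* Minimal sketch of the pointer-representation macros involved
     * (assumption: mirrors vpx_dsp/vpx_dsp_common.h; details may differ).
     * High bitdepth buffers are uint16_t, but most APIs pass uint8_t *.
     * The CONVERT_* macros encode/decode a shifted "byte pointer" so that
     * byte offsets applied to the encoded pointer become uint16_t offsets
     * after decoding; the CAST_* macros are plain casts for pointers that
     * already address real uint16_t storage. */
    #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
    #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
    #define CAST_TO_SHORTPTR(x) ((uint16_t *)(x))
    #define CAST_TO_BYTEPTR(x) ((uint8_t *)(x))

    /* Caller-side pattern used in this patch: dst arrives in the shifted
     * representation, so it is decoded once and then re-typed without
     * shifting before being handed to the highbd idct. */
    uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));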

--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -353,7 +353,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -475,10 +475,10 @@
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+        inv_txfm_ref(output_ref_block, CAST_TO_BYTEPTR(ref16), pitch_,
                      tx_type_);
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(output_ref_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(output_ref_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
       if (bit_depth_ == VPX_BITS_8) {
@@ -530,8 +530,7 @@
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), 16));
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), 16));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
 
@@ -585,9 +584,9 @@
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
       } else {
 #if CONFIG_VP9_HIGHBITDEPTH
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
 
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -137,7 +137,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
       ASM_REGISTER_STATE_CHECK(
-          inv_txfm_(test_temp_block, CONVERT_TO_BYTEPTR(dst16), 32));
+          inv_txfm_(test_temp_block, CAST_TO_BYTEPTR(dst16), 32));
 #endif
     }
 
@@ -275,7 +275,7 @@
       ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
 #if CONFIG_VP9_HIGHBITDEPTH
     } else {
-      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CAST_TO_BYTEPTR(dst16), 32));
 #endif
     }
     for (int j = 0; j < kNumCoeffs; ++j) {
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -135,7 +135,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -249,7 +249,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -257,7 +257,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -340,7 +340,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -413,7 +413,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
@@ -497,9 +497,9 @@
         ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
 #if CONFIG_VP9_HIGHBITDEPTH
       } else {
-        ref_txfm(coeff, CONVERT_TO_BYTEPTR(ref16), pitch_);
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
         ASM_REGISTER_STATE_CHECK(
-            RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
 #endif
       }
 
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -45,7 +45,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 template <InvTxfmWithBdFunc fn>
 void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
-  fn(in, CONVERT_TO_BYTEPTR(out), stride, bd);
+  fn(in, out, stride, bd);
 }
 #endif
 
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -213,7 +213,7 @@
     { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },  // DCT_ADST = 2
     { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }  // ADST_ADST = 3
   };
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   int i, j;
   tran_low_t out[4 * 4];
@@ -252,7 +252,7 @@
   tran_low_t *outptr = out;
   tran_low_t temp_in[8], temp_out[8];
   const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // Inverse transform row vectors.
   for (i = 0; i < 8; ++i) {
@@ -286,7 +286,7 @@
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
   const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // Rows
   for (i = 0; i < 16; ++i) {
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -189,21 +189,22 @@
   assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
     if (xd->lossless) {
-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
     } else {
       switch (tx_size) {
         case TX_4X4:
-          vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_8X8:
-          vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct8x8_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_16X16:
-          vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct16x16_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_32X32:
-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         default: assert(0 && "Invalid transform size");
       }
@@ -256,21 +257,22 @@
   assert(eob > 0);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
     if (xd->lossless) {
-      vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+      vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
     } else {
       switch (tx_size) {
         case TX_4X4:
-          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_8X8:
-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_16X16:
-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, stride, eob, xd->bd);
           break;
         case TX_32X32:
-          vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, stride, eob, xd->bd);
           break;
         default: assert(0 && "Invalid transform size");
       }
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -637,17 +637,18 @@
   if (x->skip_encode || p->eobs[block] == 0) return;
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
     switch (tx_size) {
       case TX_32X32:
-        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                  xd->bd);
         break;
       case TX_16X16:
-        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct16x16_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                  xd->bd);
         break;
       case TX_8X8:
-        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                                xd->bd);
         break;
       case TX_4X4:
@@ -654,7 +655,7 @@
         // this is like vp9_short_idct4x4 but has a special case around eob<=1
         // which is significant (not just an optimization) for the lossless
         // case.
-        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
+        x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
                            xd->bd);
         break;
       default: assert(0 && "Invalid transform size");
@@ -699,7 +700,8 @@
   if (p->eobs[block] > 0) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
+      x->highbd_itxm_add(dqcoeff, CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)),
+                         pd->dst.stride, p->eobs[block], xd->bd);
       return;
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -799,6 +801,7 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
     switch (tx_size) {
       case TX_32X32:
         if (!x->skip_recode) {
@@ -814,7 +817,7 @@
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
-          vp9_highbd_idct32x32_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+          vp9_highbd_idct32x32_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
         }
         break;
       case TX_16X16:
@@ -834,7 +837,7 @@
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
-          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+          vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
                                   xd->bd);
         }
         break;
@@ -855,7 +858,7 @@
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
         }
         if (!x->skip_encode && *eob) {
-          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob,
+          vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst16, dst_stride, *eob,
                                 xd->bd);
         }
         break;
@@ -880,9 +883,10 @@
             // this is like vp9_short_idct4x4 but has a special case around
             // eob<=1 which is significant (not just an optimization) for the
             // lossless case.
-            x->highbd_itxm_add(dqcoeff, dst, dst_stride, *eob, xd->bd);
+            x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd);
           } else {
-            vp9_highbd_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type, xd->bd);
+            vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type,
+                                     xd->bd);
           }
         }
         break;
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -601,7 +601,7 @@
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
                                  32, NULL, 0, NULL, 0, bs, bs, xd->bd);
-        recon = CONVERT_TO_BYTEPTR(recon16);
+        recon = CAST_TO_BYTEPTR(recon16);
         if (xd->lossless) {
           vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
         } else {
@@ -621,6 +621,7 @@
             default: assert(0 && "Invalid transform size");
           }
         }
+        recon = CONVERT_TO_BYTEPTR(recon16);
       } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);
@@ -1004,6 +1005,7 @@
           const int block = (row + idy) * 2 + (col + idx);
           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
+          uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
           int16_t *const src_diff =
               vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
           tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1025,7 +1027,7 @@
             tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
               goto next_highbd;
-            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+            vp9_highbd_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst16,
                                    dst_stride, p->eobs[block], xd->bd);
           } else {
             int64_t unused;
@@ -1048,7 +1050,7 @@
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
               goto next_highbd;
             vp9_highbd_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
-                                  dst, dst_stride, p->eobs[block], xd->bd);
+                                  dst16, dst_stride, p->eobs[block], xd->bd);
           }
         }
       }
--- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -1270,7 +1270,7 @@
 
 void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
                                        int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   if (bd == 8) {
     int16_t row_idct_output[16 * 16];
@@ -1315,7 +1315,7 @@
 
 void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
                                       int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   if (bd == 8) {
     int16_t row_idct_output[16 * 16];
@@ -1351,7 +1351,7 @@
 
 void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8,
                                       int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   if (bd == 8) {
     int16_t row_idct_output[4 * 16];
@@ -1422,7 +1422,7 @@
       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
   const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   int i;
 
   if (a1 >= 0) {
--- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c
@@ -394,7 +394,7 @@
   int32_t pass2[32 * 32];
   int32_t *out;
   int32x4x2_t q[16];
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
+  uint16_t *dst = CAST_TO_SHORTPTR(dest);
 
   for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
        idct32_pass_loop++, input = pass1, out = pass2) {
--- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c
@@ -729,7 +729,7 @@
 void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,
                                        int stride, int bd) {
   int i;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   if (bd == 8) {
     int16_t temp[32 * 16];
--- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c
@@ -597,7 +597,7 @@
 void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,
                                       int stride, int bd) {
   int i;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   if (bd == 8) {
     int16_t temp[32 * 8];
--- a/vpx_dsp/arm/highbd_idct32x32_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -67,7 +67,7 @@
       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
   const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   int i;
 
   if (a1 >= 0) {
--- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -60,7 +60,7 @@
       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
   const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
   highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max);
@@ -140,7 +140,7 @@
   int32x4_t c1 = vld1q_s32(input + 4);
   int32x4_t c2 = vld1q_s32(input + 8);
   int32x4_t c3 = vld1q_s32(input + 12);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   int16x8_t a0, a1;
 
   if (bd == 8) {
--- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -44,7 +44,7 @@
       HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
   const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5);
   const int16x8_t dc = vdupq_n_s16(a1);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   if (a1 >= 0) {
     const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
@@ -294,7 +294,7 @@
 
 void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   int32x4_t a0 = vld1q_s32(input);
   int32x4_t a1 = vld1q_s32(input + 8);
   int32x4_t a2 = vld1q_s32(input + 16);
@@ -555,7 +555,7 @@
 
 void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   int32x4_t a0 = vld1q_s32(input);
   int32x4_t a1 = vld1q_s32(input + 4);
   int32x4_t a2 = vld1q_s32(input + 8);
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -517,7 +517,7 @@
   const int16_t *input_pass2 = pass1;  // input of pass2 is the result of pass1
   int16_t *out;
   int16x8_t q[16];
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dest);
+  uint16_t *dst = CAST_TO_SHORTPTR(dest);
 
   for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
        idct32_pass_loop++, out = pass2) {
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -1299,7 +1299,7 @@
   tran_high_t a1, b1, c1, d1, e1;
   const tran_low_t *ip = input;
   tran_low_t *op = output;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   for (i = 0; i < 4; i++) {
     a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1355,7 +1355,7 @@
   tran_low_t tmp[4];
   const tran_low_t *ip = in;
   tran_low_t *op = tmp;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   (void)bd;
 
   a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -1458,7 +1458,7 @@
   tran_low_t out[4 * 4];
   tran_low_t *outptr = out;
   tran_low_t temp_in[4], temp_out[4];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // Rows
   for (i = 0; i < 4; ++i) {
@@ -1484,7 +1484,7 @@
   tran_high_t a1;
   tran_low_t out =
       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -1642,7 +1642,7 @@
   tran_low_t out[8 * 8];
   tran_low_t *outptr = out;
   tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // First transform rows
   for (i = 0; i < 8; ++i) {
@@ -1668,7 +1668,7 @@
   tran_low_t out[8 * 8] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[8], temp_out[8];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // First transform rows
   // Only first 4 row has non-zero coefs
@@ -1695,7 +1695,7 @@
   tran_high_t a1;
   tran_low_t out =
       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 5);
@@ -2062,7 +2062,7 @@
   tran_low_t out[16 * 16];
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // First transform rows
   for (i = 0; i < 16; ++i) {
@@ -2088,7 +2088,7 @@
   tran_low_t out[16 * 16] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
-  uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *const dest = CAST_TO_SHORTPTR(dest8);
 
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 8x8 area, we only need to calculate first 8 rows here.
@@ -2117,7 +2117,7 @@
   tran_low_t out[16 * 16] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[16], temp_out[16];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // First transform rows. Since all non-zero dct coefficients are in
   // upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -2144,7 +2144,7 @@
   tran_high_t a1;
   tran_low_t out =
       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
   a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -2537,7 +2537,7 @@
   tran_low_t out[32 * 32];
   tran_low_t *outptr = out;
   tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // Rows
   for (i = 0; i < 32; ++i) {
@@ -2575,7 +2575,7 @@
   tran_low_t out[32 * 32] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[32], temp_out[32];
-  uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *const dest = CAST_TO_SHORTPTR(dest8);
 
   // Rows
   // Only upper-left 16x16 has non-zero coeff
@@ -2604,7 +2604,7 @@
   tran_low_t out[32 * 32] = { 0 };
   tran_low_t *outptr = out;
   tran_low_t temp_in[32], temp_out[32];
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
 
   // Rows
   // Only upper-left 8x8 has non-zero coeff
@@ -2629,7 +2629,7 @@
                                   int stride, int bd) {
   int i, j;
   int a1;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   tran_low_t out =
       HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
 
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -3373,7 +3373,7 @@
   __m128i sign_bits[2];
   __m128i temp_mm, min_input, max_input;
   int test;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   int optimised_cols = 0;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i eight = _mm_set1_epi16(8);
@@ -3486,7 +3486,7 @@
   int i, j, test;
   __m128i inptr[8];
   __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i sixteen = _mm_set1_epi16(16);
   const __m128i max = _mm_set1_epi16(6201);
@@ -3586,7 +3586,7 @@
   int i, j, test;
   __m128i inptr[8];
   __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i sixteen = _mm_set1_epi16(16);
   const __m128i max = _mm_set1_epi16(6201);
@@ -3689,7 +3689,7 @@
   int i, j, test;
   __m128i inptr[32];
   __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i rounding = _mm_set1_epi16(32);
   const __m128i max = _mm_set1_epi16(3155);
@@ -3802,7 +3802,7 @@
   int i, j, test;
   __m128i inptr[32];
   __m128i min_input, max_input, temp1, temp2, sign_bits;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i rounding = _mm_set1_epi16(32);
   const __m128i max = _mm_set1_epi16(3155);
@@ -3920,7 +3920,7 @@
   const __m128i one = _mm_set1_epi16(1);
   const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
   int a, i, j;
-  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  uint16_t *dest = CAST_TO_SHORTPTR(dest8);
   tran_low_t out;
 
   out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);