shithub: libvpx

--- a/test/partial_idct_test.cc

+++ b/test/partial_idct_test.cc

@@ -414,6 +414,24 @@

                    &vpx_highbd_idct4x4_16_add_sse2, TX_4X4, 1, 12)));

 #endif  // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE

+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE

+INSTANTIATE_TEST_CASE_P(

+    NEON, PartialIDctTest,  // Each tuple pairs a NEON inverse transform with its C counterpart.

+    ::testing::Values(

+        make_tuple(&vpx_highbd_fdct4x4_c, &vpx_highbd_idct4x4_16_add_c,

+                   &vpx_highbd_idct4x4_16_add_neon, TX_4X4, 16, 8),  // Last field presumably the bit depth -- confirm against the fixture.

+        make_tuple(&vpx_highbd_fdct4x4_c, &vpx_highbd_idct4x4_16_add_c,

+                   &vpx_highbd_idct4x4_16_add_neon, TX_4X4, 16, 10),

+        make_tuple(&vpx_highbd_fdct4x4_c, &vpx_highbd_idct4x4_16_add_c,

+                   &vpx_highbd_idct4x4_16_add_neon, TX_4X4, 16, 12),

+        make_tuple(&vpx_highbd_fdct4x4_c, &vpx_highbd_idct4x4_1_add_c,

+                   &vpx_highbd_idct4x4_1_add_neon, TX_4X4, 1, 8),  // Fifth field is 1 for the _1_add variants and 16 for _16_add.

+        make_tuple(&vpx_highbd_fdct4x4_c, &vpx_highbd_idct4x4_1_add_c,

+                   &vpx_highbd_idct4x4_1_add_neon, TX_4X4, 1, 10),

+        make_tuple(&vpx_highbd_fdct4x4_c, &vpx_highbd_idct4x4_1_add_c,

+                   &vpx_highbd_idct4x4_1_add_neon, TX_4X4, 1, 12)));

+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE

 #else  // !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

--- /dev/null

+++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c

@@ -1,0 +1,177 @@

+/*

+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <arm_neon.h>

+#include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/arm/idct_neon.h"

+#include "vpx_dsp/inv_txfm.h"

+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,  // DC-only path: only input[0] is read.

+                                   int dest_stride, int bd) {  // bd sets the upper clamp, (1 << bd) - 1.

+  int i;

+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);  // Upper clamp broadcast to all 8 lanes.

+  const tran_low_t out0 = dct_const_round_shift(input[0] * cospi_16_64);  // cospi_16_64 scaling is applied twice (here and on the next line).

+  const tran_low_t out1 = dct_const_round_shift(out0 * cospi_16_64);

+  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);

+  const int16x8_t dc = vdupq_n_s16(a1);  // The same offset is added to every pixel.

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // Pixels are read and written as 16-bit values.

+  int16x8_t a;

+  uint16x8_t b;

+  uint16x4_t d0, d1;

+  for (i = 0; i < 2; i++) {  // Each pass handles two rows of four pixels.

+    d0 = vld1_u16(dest);

+    d1 = vld1_u16(dest + dest_stride);

+    a = vreinterpretq_s16_u16(vcombine_u16(d0, d1));  // Two rows packed into one 8-lane vector.

+    a = vaddq_s16(dc, a);

+    a = vminq_s16(a, max);  // Clamp from above.

+    b = vqshluq_n_s16(a, 0);  // Shift by 0 with unsigned saturation: negative lanes become 0.

+    vst1_u16(dest, vget_low_u16(b));

+    dest += dest_stride;

+    vst1_u16(dest, vget_high_u16(b));

+    dest += dest_stride;

+  }

+}

+static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,  // One transpose-then-butterfly pass, updating a0..a3 in place.

+                                          int32x4_t *const a0,

+                                          int32x4_t *const a1,

+                                          int32x4_t *const a2,

+                                          int32x4_t *const a3) {

+  int32x4_t b0, b1, b2, b3;

+  transpose_s32_4x4(a0, a1, a2, a3);

+  b0 = vaddq_s32(*a0, *a2);

+  b1 = vsubq_s32(*a0, *a2);

+  b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);  // Multiply by cospis lane 2. NOTE(review): products stay 32-bit here, unlike the bd12 kernel -- presumably safe for the bd 10 range; confirm.

+  b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);

+  b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1);  // a1 * lane 3 ...

+  b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);  // a1 * lane 1 ...

+  b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);  // ... minus a3 * lane 1.

+  b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);  // ... plus a3 * lane 3.

+  b0 = vrshrq_n_s32(b0, 14);  // Rounding right shift by 14 on all four intermediates.

+  b1 = vrshrq_n_s32(b1, 14);

+  b2 = vrshrq_n_s32(b2, 14);

+  b3 = vrshrq_n_s32(b3, 14);

+  *a0 = vaddq_s32(b0, b3);  // Final butterfly writes the results back through the pointers.

+  *a1 = vaddq_s32(b1, b2);

+  *a2 = vsubq_s32(b1, b2);

+  *a3 = vsubq_s32(b0, b3);

+}

+static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,  // Same pass as the bd10 kernel, but with 64-bit products.

+                                          int32x4_t *const a0,

+                                          int32x4_t *const a1,

+                                          int32x4_t *const a2,

+                                          int32x4_t *const a3) {

+  int32x4_t b0, b1, b2, b3;

+  int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;  // Each 4-lane product is split into low/high 2-lane 64-bit halves.

+  transpose_s32_4x4(a0, a1, a2, a3);

+  b0 = vaddq_s32(*a0, *a2);

+  b1 = vsubq_s32(*a0, *a2);

+  c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);  // (a0 + a2) * lane 2, low half.

+  c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);  // (a0 + a2) * lane 2, high half.

+  c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);  // (a0 - a2) * lane 2, low half.

+  c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);  // (a0 - a2) * lane 2, high half.

+  c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);  // a1 * lane 3.

+  c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);

+  c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);  // a1 * lane 1.

+  c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);

+  c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);  // a3 * lane 1.

+  c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);

+  c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);  // a3 * lane 3.

+  c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);

+  c4 = vsubq_s64(c4, c8);  // a1 * lane 3 - a3 * lane 1.

+  c5 = vsubq_s64(c5, c9);

+  c6 = vaddq_s64(c6, c10);  // a1 * lane 1 + a3 * lane 3.

+  c7 = vaddq_s64(c7, c11);

+  b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));  // Rounding shift by 14, narrowing back to 32 bits.

+  b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));

+  b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));

+  b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));

+  *a0 = vaddq_s32(b0, b3);  // Final butterfly writes the results back through the pointers.

+  *a1 = vaddq_s32(b1, b2);

+  *a2 = vsubq_s32(b1, b2);

+  *a3 = vsubq_s32(b0, b3);

+}

+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8,  // Reads 16 coefficients, adds the result to a 4x4 block of 16-bit pixels.

+                                    int dest_stride, int bd) {  // bd selects the kernel and the upper clamp, (1 << bd) - 1.

+  DECLARE_ALIGNED(16, static const int32_t, kCospi32[4]) = { 0, 15137, 11585,

+                                                             6270 };  // Same values as kCospi, held as 32-bit lanes.

+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);

+  int32x4_t c0 = vld1q_s32(input);  // c0..c3 each hold four consecutive coefficients.

+  int32x4_t c1 = vld1q_s32(input + 4);

+  int32x4_t c2 = vld1q_s32(input + 8);

+  int32x4_t c3 = vld1q_s32(input + 12);

+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);  // Write cursor.

+  const uint16_t *dst = dest;  // Separate read cursor over the same rows.

+  int16x8_t a0, a1, d01, d32;

+  int16x4_t d0, d1, d2, d3;

+  uint16x8_t d01_u16, d32_u16;

+  if (bd == 8) {  // bd 8: narrow to 16 bits and run the 16-bit kernel twice.

+    const int16x4_t cospis = vld1_s16(kCospi);

+    // Rows

+    a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1));  // Narrow each 32-bit coefficient to 16 bits.

+    a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3));

+    idct4x4_16_kernel_bd8(cospis, &a0, &a1);

+    // Columns

+    a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));  // Swap the halves of a1 before the second pass.

+    idct4x4_16_kernel_bd8(cospis, &a0, &a1);

+    a0 = vrshrq_n_s16(a0, 4);  // Rounding right shift by 4.

+    a1 = vrshrq_n_s16(a1, 4);

+  } else {  // Any other bd: stay in 32 bits through both passes.

+    const int32x4_t cospis = vld1q_s32(kCospi32);

+    if (bd == 10) {

+      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);

+      idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3);

+    } else {  // bd other than 8 or 10 uses the 64-bit-product kernel.

+      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);

+      idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3);

+    }

+    // Note: In some profile tests, a0 and a1 are quite close to +/-32767.

+    // We use saturating narrow shift in case they could be even larger.

+    a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4));

+    a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4));  // c3 goes in the low half and c2 in the high half.

+  }

+  d0 = vreinterpret_s16_u16(vld1_u16(dst));  // Load the four destination rows.

+  dst += dest_stride;

+  d1 = vreinterpret_s16_u16(vld1_u16(dst));

+  dst += dest_stride;

+  d2 = vreinterpret_s16_u16(vld1_u16(dst));

+  dst += dest_stride;

+  d3 = vreinterpret_s16_u16(vld1_u16(dst));

+  d01 = vcombine_s16(d0, d1);

+  d32 = vcombine_s16(d3, d2);  // Row 3 low, row 2 high: mirrored by the stores at the end.

+  // Note: In some profile tests, a0 and a1 are quite close to +/-32767.

+  // We use saturating addition.

+  d01 = vqaddq_s16(a0, d01);

+  d32 = vqaddq_s16(a1, d32);

+  d01 = vminq_s16(d01, max);  // Clamp from above.

+  d32 = vminq_s16(d32, max);

+  d01_u16 = vqshluq_n_s16(d01, 0);  // Shift by 0 with unsigned saturation: negative lanes become 0.

+  d32_u16 = vqshluq_n_s16(d32, 0);

+  vst1_u16(dest, vget_low_u16(d01_u16));

+  dest += dest_stride;

+  vst1_u16(dest, vget_high_u16(d01_u16));

+  dest += dest_stride;

+  vst1_u16(dest, vget_high_u16(d32_u16));  // Row 2 comes from the high half of d32.

+  dest += dest_stride;

+  vst1_u16(dest, vget_low_u16(d32_u16));  // Row 3 comes from the low half of d32.

+}

--- a/vpx_dsp/arm/idct4x4_add_neon.c

+++ b/vpx_dsp/arm/idct4x4_add_neon.c

@@ -13,45 +13,12 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx_dsp/arm/idct_neon.h"

-#include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/txfm_common.h"

-static INLINE void idct4x4_16_kernel(const int16x4_t cospis, int16x8_t *a0,

-                                     int16x8_t *a1) {

-  int16x4_t b0, b1, b2, b3;

-  int32x4_t c0, c1, c2, c3;

-  int16x8_t d0, d1;

-  transpose_s16_4x4q(a0, a1);

-  b0 = vget_low_s16(*a0);

-  b1 = vget_high_s16(*a0);

-  b2 = vget_low_s16(*a1);

-  b3 = vget_high_s16(*a1);

-  c0 = vmull_lane_s16(b0, cospis, 2);

-  c2 = vmull_lane_s16(b1, cospis, 2);

-  c1 = vsubq_s32(c0, c2);

-  c0 = vaddq_s32(c0, c2);

-  c2 = vmull_lane_s16(b2, cospis, 3);

-  c3 = vmull_lane_s16(b2, cospis, 1);

-  c2 = vmlsl_lane_s16(c2, b3, cospis, 1);

-  c3 = vmlal_lane_s16(c3, b3, cospis, 3);

-  b0 = vrshrn_n_s32(c0, 14);

-  b1 = vrshrn_n_s32(c1, 14);

-  b2 = vrshrn_n_s32(c2, 14);

-  b3 = vrshrn_n_s32(c3, 14);

-  d0 = vcombine_s16(b0, b1);

-  d1 = vcombine_s16(b3, b2);

-  *a0 = vaddq_s16(d0, d1);

-  *a1 = vsubq_s16(d0, d1);

-}

 void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,

                              int dest_stride) {

-  DECLARE_ALIGNED(16, static const int16_t, cospi[4]) = {

-    0, (int16_t)cospi_8_64, (int16_t)cospi_16_64, (int16_t)cospi_24_64

-  };

   const uint8_t *dst = dest;

-  const int16x4_t cospis = vld1_s16(cospi);

+  const int16x4_t cospis = vld1_s16(kCospi);

   uint32x2_t dest01_u32 = vdup_n_u32(0);

   uint32x2_t dest32_u32 = vdup_n_u32(0);

   int16x8_t a0, a1;

@@ -64,11 +31,11 @@

   // Rows

   a0 = load_tran_low_to_s16q(input);

   a1 = load_tran_low_to_s16q(input + 8);

-  idct4x4_16_kernel(cospis, &a0, &a1);

+  idct4x4_16_kernel_bd8(cospis, &a0, &a1);

   // Columns

   a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1));

-  idct4x4_16_kernel(cospis, &a0, &a1);

+  idct4x4_16_kernel_bd8(cospis, &a0, &a1);

   a0 = vrshrq_n_s16(a0, 4);

   a1 = vrshrq_n_s16(a1, 4);

--- a/vpx_dsp/arm/idct_neon.h

+++ b/vpx_dsp/arm/idct_neon.h

@@ -17,6 +17,9 @@

 #include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/vpx_dsp_common.h"

+DECLARE_ALIGNED(16, static const int16_t, kCospi[4]) = { 0, 15137, 11585,

+                                                         6270 };

 //------------------------------------------------------------------------------

 // Helper functions used to load tran_low_t into int16, narrowing if necessary.

@@ -180,4 +183,35 @@

   b += b_stride;

   vst1_u8(b, b7);

+static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,  // One transpose-then-butterfly pass on a 4x4 block packed in two 8-lane vectors.

+                                         int16x8_t *const a0,

+                                         int16x8_t *const a1) {

+  int16x4_t b0, b1, b2, b3;

+  int32x4_t c0, c1, c2, c3;

+  int16x8_t d0, d1;

+  transpose_s16_4x4q(a0, a1);

+  b0 = vget_low_s16(*a0);  // Split the two vectors into four 4-lane halves.

+  b1 = vget_high_s16(*a0);

+  b2 = vget_low_s16(*a1);

+  b3 = vget_high_s16(*a1);

+  c0 = vmull_lane_s16(b0, cospis, 2);  // Widening multiplies by cospis lane 2.

+  c2 = vmull_lane_s16(b1, cospis, 2);

+  c1 = vsubq_s32(c0, c2);  // (b0 - b1) * lane 2.

+  c0 = vaddq_s32(c0, c2);  // (b0 + b1) * lane 2.

+  c2 = vmull_lane_s16(b2, cospis, 3);

+  c3 = vmull_lane_s16(b2, cospis, 1);

+  c2 = vmlsl_lane_s16(c2, b3, cospis, 1);  // b2 * lane 3 - b3 * lane 1.

+  c3 = vmlal_lane_s16(c3, b3, cospis, 3);  // b2 * lane 1 + b3 * lane 3.

+  b0 = vrshrn_n_s32(c0, 14);  // Rounding shift by 14, narrowing back to 16 bits.

+  b1 = vrshrn_n_s32(c1, 14);

+  b2 = vrshrn_n_s32(c2, 14);

+  b3 = vrshrn_n_s32(c3, 14);

+  d0 = vcombine_s16(b0, b1);

+  d1 = vcombine_s16(b3, b2);  // b3 first, so one add/sub pair yields all four butterfly outputs.

+  *a0 = vaddq_s16(d0, d1);  // Low half b0 + b3, high half b1 + b2.

+  *a1 = vsubq_s16(d0, d1);  // Low half b0 - b3, high half b1 - b2.

+}

 #endif  // VPX_DSP_ARM_IDCT_NEON_H_

--- a/vpx_dsp/arm/transpose_neon.h

+++ b/vpx_dsp/arm/transpose_neon.h

@@ -30,6 +30,13 @@

   return b0;

+static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {  // Regroups the 64-bit halves of a0 and a1.

+  int32x4x2_t b0;

+  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));  // Low half of a0 followed by low half of a1.

+  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));  // High half of a0 followed by high half of a1.

+  return b0;

+}

 static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) {

   uint8x16x2_t b0;

   b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),

@@ -170,6 +177,37 @@

   *a0 = d0.val[0];

   *a1 = d0.val[1];

+}

+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,

+                                     int32x4_t *a2, int32x4_t *a3) {

+  // Swap 32 bit elements. Goes from:

+  // a0: 00 01 02 03

+  // a1: 10 11 12 13

+  // a2: 20 21 22 23

+  // a3: 30 31 32 33

+  // to:

+  // b0.val[0]: 00 10 02 12

+  // b0.val[1]: 01 11 03 13

+  // b1.val[0]: 20 30 22 32

+  // b1.val[1]: 21 31 23 33

+  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);

+  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

+  // Swap 64 bit elements resulting in:

+  // c0.val[0]: 00 10 20 30

+  // c0.val[1]: 02 12 22 32

+  // c1.val[0]: 01 11 21 31

+  // c1.val[1]: 03 13 23 33

+  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);

+  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

+  *a0 = c0.val[0];

+  *a1 = c1.val[0];

+  *a2 = c0.val[1];

+  *a3 = c1.val[1];

 static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -226,6 +226,8 @@

 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c

 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c

 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c

+else  # CONFIG_VP9_HIGHBITDEPTH

+DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct4x4_add_neon.c

 endif  # !CONFIG_VP9_HIGHBITDEPTH

 ifeq ($(HAVE_NEON_ASM),yes)

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -618,6 +618,7 @@

   specialize qw/vpx_iwht4x4_16_add sse2/;

   add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

+  specialize qw/vpx_highbd_idct4x4_1_add neon/;

   add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

@@ -709,7 +710,7 @@

     specialize qw/vpx_idct32x32_1_add neon sse2/;

     add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

-    specialize qw/vpx_highbd_idct4x4_16_add sse2/;

+    specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;

     add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

     specialize qw/vpx_highbd_idct8x8_64_add sse2/;

--- a/vpx_dsp/x86/inv_txfm_sse2.c

+++ b/vpx_dsp/x86/inv_txfm_sse2.c

@@ -402,10 +402,10 @@

       MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1,      \

                              stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3)  \

-      stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);                                \

-      stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);                                \

-      stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);                                \

-      stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);                                \

+      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);                                 \

+      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);                                 \

+      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);                                 \

+      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);                                 \

     }                                                                         \

     /* Stage3 */                                                              \

@@ -413,10 +413,10 @@

       const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5);               \

       const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5);               \

-      stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);                                \

-      stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);                                \

-      stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);                                \

-      stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);                                \

+      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);                                 \

+      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);                                 \

+      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);                                 \

+      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);                                 \

       tmp0 = _mm_madd_epi16(lo_56, stg2_1);                                   \

       tmp1 = _mm_madd_epi16(hi_56, stg2_1);                                   \

@@ -438,14 +438,14 @@

     }                                                                         \

     /* Stage4  */                                                             \

-    out0 = _mm_adds_epi16(stp1_0, stp2_7);                                    \

-    out1 = _mm_adds_epi16(stp1_1, stp1_6);                                    \

-    out2 = _mm_adds_epi16(stp1_2, stp1_5);                                    \

-    out3 = _mm_adds_epi16(stp1_3, stp2_4);                                    \

-    out4 = _mm_subs_epi16(stp1_3, stp2_4);                                    \

-    out5 = _mm_subs_epi16(stp1_2, stp1_5);                                    \

-    out6 = _mm_subs_epi16(stp1_1, stp1_6);                                    \

-    out7 = _mm_subs_epi16(stp1_0, stp2_7);                                    \

+    out0 = _mm_add_epi16(stp1_0, stp2_7);                                     \

+    out1 = _mm_add_epi16(stp1_1, stp1_6);                                     \

+    out2 = _mm_add_epi16(stp1_2, stp1_5);                                     \

+    out3 = _mm_add_epi16(stp1_3, stp2_4);                                     \

+    out4 = _mm_sub_epi16(stp1_3, stp2_4);                                     \

+    out5 = _mm_sub_epi16(stp1_2, stp1_5);                                     \

+    out6 = _mm_sub_epi16(stp1_1, stp1_6);                                     \

+    out7 = _mm_sub_epi16(stp1_0, stp2_7);                                     \

 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,

@@ -866,8 +866,8 @@

     stp2_0 = _mm_packs_epi32(tmp0, tmp2);

     stp2_2 = _mm_packs_epi32(tmp6, tmp4);

-    tmp0 = _mm_adds_epi16(stp1_4, stp1_5);

-    tmp1 = _mm_subs_epi16(stp1_4, stp1_5);

+    tmp0 = _mm_add_epi16(stp1_4, stp1_5);

+    tmp1 = _mm_sub_epi16(stp1_4, stp1_5);

     stp2_4 = tmp0;

     stp2_5 = _mm_unpacklo_epi64(tmp1, zero);

@@ -878,8 +878,8 @@

     const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

-    tmp4 = _mm_adds_epi16(stp2_0, stp2_2);

-    tmp6 = _mm_subs_epi16(stp2_0, stp2_2);

+    tmp4 = _mm_add_epi16(stp2_0, stp2_2);

+    tmp6 = _mm_sub_epi16(stp2_0, stp2_2);

     stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);

     stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);

@@ -896,10 +896,10 @@

   // Stage4

-  tmp0 = _mm_adds_epi16(stp1_3, stp2_4);

-  tmp1 = _mm_adds_epi16(stp1_2, stp1_5);

-  tmp2 = _mm_subs_epi16(stp1_3, stp2_4);

-  tmp3 = _mm_subs_epi16(stp1_2, stp1_5);

+  tmp0 = _mm_add_epi16(stp1_3, stp2_4);

+  tmp1 = _mm_add_epi16(stp1_2, stp1_5);

+  tmp2 = _mm_sub_epi16(stp1_3, stp2_4);

+  tmp3 = _mm_sub_epi16(stp1_2, stp1_5);

   TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)

@@ -3449,7 +3449,7 @@

   __m128i ubounded, retval;

   const __m128i zero = _mm_set1_epi16(0);

   const __m128i one = _mm_set1_epi16(1);

-  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

   ubounded = _mm_cmpgt_epi16(value, max);

   retval = _mm_andnot_si128(ubounded, value);

   ubounded = _mm_and_si128(ubounded, max);

@@ -4012,7 +4012,7 @@

   __m128i dc_value, d;

   const __m128i zero = _mm_setzero_si128();

   const __m128i one = _mm_set1_epi16(1);

-  const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);

+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);

   int a, i, j;

   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

   tran_low_t out;

--

⑨