shithub: libvpx

ref: 231012fdab530a1fea51dbeeeadf2eaf97c76dc9
parent: 8fd648c78abfa89752f5843a3d4ac2a49b358402
author: Linfeng Zhang <linfengz@google.com>
date: Tue Jan 23 04:47:34 EST 2018

Add vp9_highbd_iht16x16_256_add_sse4_1()

BUG=webm:1413

Change-Id: I8d7eeae1bd219eb848c1a86071046a477f7a91af

--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -597,7 +597,9 @@
   TransHT() { fwd_txfm_ref = fht_ref; }
 };
 
-TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); }
+TEST_P(TransHT, AccuracyCheck) {
+  RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 ? 2 : 1);
+}
 
 TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); }
 
@@ -605,17 +607,6 @@
 
 TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
 
-/* TODO:(johannkoenig) Determine why these fail AccuracyCheck
-   make_tuple(&vp9_highbd_fht16x16_c,
-   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, VPX_BITS_12, 2),
-   make_tuple(&vp9_highbd_fht16x16_c,
-   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, VPX_BITS_12, 2),
-   make_tuple(&vp9_highbd_fht16x16_c,
-   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, VPX_BITS_12, 2),
-   make_tuple(&vp9_highbd_fht16x16_c,
-   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_12, 2),
-  */
-
 const DctParam c_ht_tests[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
   make_tuple(&vp9_highbd_fht16x16_c,
@@ -642,6 +633,19 @@
   make_tuple(&vp9_highbd_fht16x16_c,
              &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
              VPX_BITS_10, 2),
+  make_tuple(&vp9_highbd_fht16x16_c,
+             &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0,
+             VPX_BITS_12, 2),
+  make_tuple(&vp9_highbd_fht16x16_c,
+             &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1,
+             VPX_BITS_12, 2),
+  make_tuple(&vp9_highbd_fht16x16_c,
+             &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2,
+             VPX_BITS_12, 2),
+  make_tuple(&vp9_highbd_fht16x16_c,
+             &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3,
+             VPX_BITS_12, 2),
+
   make_tuple(&vp9_highbd_fht8x8_c,
              &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8, 0, VPX_BITS_8,
              2),
@@ -784,6 +788,43 @@
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, TransHT,
     ::testing::Values(
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   0, VPX_BITS_8, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   1, VPX_BITS_8, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   2, VPX_BITS_8, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   3, VPX_BITS_8, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   0, VPX_BITS_10, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   1, VPX_BITS_10, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   2, VPX_BITS_10, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   3, VPX_BITS_10, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   0, VPX_BITS_12, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   1, VPX_BITS_12, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   2, VPX_BITS_12, 2),
+        make_tuple(&vp9_highbd_fht16x16_c,
+                   &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16,
+                   3, VPX_BITS_12, 2),
+
         make_tuple(&vp9_highbd_fht8x8_c,
                    &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0,
                    VPX_BITS_8, 2),
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -105,6 +105,7 @@
   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
     specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
     specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
+    specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
   }
 }
 
--- /dev/null
+++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
@@ -1,0 +1,439 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
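+// Compute s[0..1] = 4 * c * in: the four 32-bit lanes of |in| are widened to
+// 64-bit products, lanes 0-1 in s[0] and lanes 2-3 in s[1]. The 4x scale
+// matches the rounding shift applied later by dct_const_round_shift_64bit().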
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+                                                      const int c,
+                                                      __m128i *const s) {
+  const __m128i pair_c = pair_set_epi32(4 * c, 0);
+  __m128i x[2];
+
+  extend_64bit(in, x);
+  s[0] = _mm_mul_epi32(pair_c, x[0]);
+  s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
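+// Full iadst butterfly in 64-bit precision:
+//   s0 = c0 * in0 + c1 * in1
+//   s1 = c1 * in0 - c0 * in1
+// with each result split across [0]/[1] as in the half butterfly above.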
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+                                                 const __m128i in1,
+                                                 const int c0, const int c1,
+                                                 __m128i *const s0,
+                                                 __m128i *const s1) {
+  const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+  const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+  __m128i t00[2], t01[2], t10[2], t11[2];
+  __m128i x0[2], x1[2];
+
+  extend_64bit(in0, x0);
+  extend_64bit(in1, x1);
+  t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+  t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+  t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+  t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+  t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+  t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+  t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+  t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+  s0[0] = _mm_add_epi64(t00[0], t11[0]);
+  s0[1] = _mm_add_epi64(t00[1], t11[1]);
+  s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+  s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
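+// 16-point inverse ADST on four columns of 32-bit coefficients in io[0..15].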
+static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+  __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
+      s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+  __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
+      x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
+
+  // stage 1
+  highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
+  highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
+  highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
+  highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
+  highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
+                                s11);
+  highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
+                                s13);
+  highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
+                                s15);
+
+  x0[0] = _mm_add_epi64(s0[0], s8[0]);
+  x0[1] = _mm_add_epi64(s0[1], s8[1]);
+  x1[0] = _mm_add_epi64(s1[0], s9[0]);
+  x1[1] = _mm_add_epi64(s1[1], s9[1]);
+  x2[0] = _mm_add_epi64(s2[0], s10[0]);
+  x2[1] = _mm_add_epi64(s2[1], s10[1]);
+  x3[0] = _mm_add_epi64(s3[0], s11[0]);
+  x3[1] = _mm_add_epi64(s3[1], s11[1]);
+  x4[0] = _mm_add_epi64(s4[0], s12[0]);
+  x4[1] = _mm_add_epi64(s4[1], s12[1]);
+  x5[0] = _mm_add_epi64(s5[0], s13[0]);
+  x5[1] = _mm_add_epi64(s5[1], s13[1]);
+  x6[0] = _mm_add_epi64(s6[0], s14[0]);
+  x6[1] = _mm_add_epi64(s6[1], s14[1]);
+  x7[0] = _mm_add_epi64(s7[0], s15[0]);
+  x7[1] = _mm_add_epi64(s7[1], s15[1]);
+  x8[0] = _mm_sub_epi64(s0[0], s8[0]);
+  x8[1] = _mm_sub_epi64(s0[1], s8[1]);
+  x9[0] = _mm_sub_epi64(s1[0], s9[0]);
+  x9[1] = _mm_sub_epi64(s1[1], s9[1]);
+  x10[0] = _mm_sub_epi64(s2[0], s10[0]);
+  x10[1] = _mm_sub_epi64(s2[1], s10[1]);
+  x11[0] = _mm_sub_epi64(s3[0], s11[0]);
+  x11[1] = _mm_sub_epi64(s3[1], s11[1]);
+  x12[0] = _mm_sub_epi64(s4[0], s12[0]);
+  x12[1] = _mm_sub_epi64(s4[1], s12[1]);
+  x13[0] = _mm_sub_epi64(s5[0], s13[0]);
+  x13[1] = _mm_sub_epi64(s5[1], s13[1]);
+  x14[0] = _mm_sub_epi64(s6[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s6[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s7[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s7[1], s15[1]);
+
+  x0[0] = dct_const_round_shift_64bit(x0[0]);
+  x0[1] = dct_const_round_shift_64bit(x0[1]);
+  x1[0] = dct_const_round_shift_64bit(x1[0]);
+  x1[1] = dct_const_round_shift_64bit(x1[1]);
+  x2[0] = dct_const_round_shift_64bit(x2[0]);
+  x2[1] = dct_const_round_shift_64bit(x2[1]);
+  x3[0] = dct_const_round_shift_64bit(x3[0]);
+  x3[1] = dct_const_round_shift_64bit(x3[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x8[0] = dct_const_round_shift_64bit(x8[0]);
+  x8[1] = dct_const_round_shift_64bit(x8[1]);
+  x9[0] = dct_const_round_shift_64bit(x9[0]);
+  x9[1] = dct_const_round_shift_64bit(x9[1]);
+  x10[0] = dct_const_round_shift_64bit(x10[0]);
+  x10[1] = dct_const_round_shift_64bit(x10[1]);
+  x11[0] = dct_const_round_shift_64bit(x11[0]);
+  x11[1] = dct_const_round_shift_64bit(x11[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x0[0] = pack_4(x0[0], x0[1]);
+  x1[0] = pack_4(x1[0], x1[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x8[0] = pack_4(x8[0], x8[1]);
+  x9[0] = pack_4(x9[0], x9[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 2
+  s0[0] = x0[0];
+  s1[0] = x1[0];
+  s2[0] = x2[0];
+  s3[0] = x3[0];
+  s4[0] = x4[0];
+  s5[0] = x5[0];
+  s6[0] = x6[0];
+  s7[0] = x7[0];
+  x0[0] = _mm_add_epi32(s0[0], s4[0]);
+  x1[0] = _mm_add_epi32(s1[0], s5[0]);
+  x2[0] = _mm_add_epi32(s2[0], s6[0]);
+  x3[0] = _mm_add_epi32(s3[0], s7[0]);
+  x4[0] = _mm_sub_epi32(s0[0], s4[0]);
+  x5[0] = _mm_sub_epi32(s1[0], s5[0]);
+  x6[0] = _mm_sub_epi32(s2[0], s6[0]);
+  x7[0] = _mm_sub_epi32(s3[0], s7[0]);
+
+  highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
+  highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
+                                s11);
+  highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
+                                s12);
+  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
+                                s14);
+
+  x8[0] = _mm_add_epi64(s8[0], s12[0]);
+  x8[1] = _mm_add_epi64(s8[1], s12[1]);
+  x9[0] = _mm_add_epi64(s9[0], s13[0]);
+  x9[1] = _mm_add_epi64(s9[1], s13[1]);
+  x10[0] = _mm_add_epi64(s10[0], s14[0]);
+  x10[1] = _mm_add_epi64(s10[1], s14[1]);
+  x11[0] = _mm_add_epi64(s11[0], s15[0]);
+  x11[1] = _mm_add_epi64(s11[1], s15[1]);
+  x12[0] = _mm_sub_epi64(s8[0], s12[0]);
+  x12[1] = _mm_sub_epi64(s8[1], s12[1]);
+  x13[0] = _mm_sub_epi64(s9[0], s13[0]);
+  x13[1] = _mm_sub_epi64(s9[1], s13[1]);
+  x14[0] = _mm_sub_epi64(s10[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s10[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s11[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s11[1], s15[1]);
+  x8[0] = dct_const_round_shift_64bit(x8[0]);
+  x8[1] = dct_const_round_shift_64bit(x8[1]);
+  x9[0] = dct_const_round_shift_64bit(x9[0]);
+  x9[1] = dct_const_round_shift_64bit(x9[1]);
+  x10[0] = dct_const_round_shift_64bit(x10[0]);
+  x10[1] = dct_const_round_shift_64bit(x10[1]);
+  x11[0] = dct_const_round_shift_64bit(x11[0]);
+  x11[1] = dct_const_round_shift_64bit(x11[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x8[0] = pack_4(x8[0], x8[1]);
+  x9[0] = pack_4(x9[0], x9[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 3
+  s0[0] = x0[0];
+  s1[0] = x1[0];
+  s2[0] = x2[0];
+  s3[0] = x3[0];
+  highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+  highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+  s8[0] = x8[0];
+  s9[0] = x9[0];
+  s10[0] = x10[0];
+  s11[0] = x11[0];
+  highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
+                                s13);
+  highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
+                                s14);
+
+  x0[0] = _mm_add_epi32(s0[0], s2[0]);
+  x1[0] = _mm_add_epi32(s1[0], s3[0]);
+  x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+  x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+  x4[0] = _mm_add_epi64(s4[0], s6[0]);
+  x4[1] = _mm_add_epi64(s4[1], s6[1]);
+  x5[0] = _mm_add_epi64(s5[0], s7[0]);
+  x5[1] = _mm_add_epi64(s5[1], s7[1]);
+  x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+  x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+  x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+  x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+  x4[0] = dct_const_round_shift_64bit(x4[0]);
+  x4[1] = dct_const_round_shift_64bit(x4[1]);
+  x5[0] = dct_const_round_shift_64bit(x5[0]);
+  x5[1] = dct_const_round_shift_64bit(x5[1]);
+  x6[0] = dct_const_round_shift_64bit(x6[0]);
+  x6[1] = dct_const_round_shift_64bit(x6[1]);
+  x7[0] = dct_const_round_shift_64bit(x7[0]);
+  x7[1] = dct_const_round_shift_64bit(x7[1]);
+  x4[0] = pack_4(x4[0], x4[1]);
+  x5[0] = pack_4(x5[0], x5[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x8[0] = _mm_add_epi32(s8[0], s10[0]);
+  x9[0] = _mm_add_epi32(s9[0], s11[0]);
+  x10[0] = _mm_sub_epi32(s8[0], s10[0]);
+  x11[0] = _mm_sub_epi32(s9[0], s11[0]);
+  x12[0] = _mm_add_epi64(s12[0], s14[0]);
+  x12[1] = _mm_add_epi64(s12[1], s14[1]);
+  x13[0] = _mm_add_epi64(s13[0], s15[0]);
+  x13[1] = _mm_add_epi64(s13[1], s15[1]);
+  x14[0] = _mm_sub_epi64(s12[0], s14[0]);
+  x14[1] = _mm_sub_epi64(s12[1], s14[1]);
+  x15[0] = _mm_sub_epi64(s13[0], s15[0]);
+  x15[1] = _mm_sub_epi64(s13[1], s15[1]);
+  x12[0] = dct_const_round_shift_64bit(x12[0]);
+  x12[1] = dct_const_round_shift_64bit(x12[1]);
+  x13[0] = dct_const_round_shift_64bit(x13[0]);
+  x13[1] = dct_const_round_shift_64bit(x13[1]);
+  x14[0] = dct_const_round_shift_64bit(x14[0]);
+  x14[1] = dct_const_round_shift_64bit(x14[1]);
+  x15[0] = dct_const_round_shift_64bit(x15[0]);
+  x15[1] = dct_const_round_shift_64bit(x15[1]);
+  x12[0] = pack_4(x12[0], x12[1]);
+  x13[0] = pack_4(x13[0], x13[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
+  // stage 4
+  s2[0] = _mm_add_epi32(x2[0], x3[0]);
+  s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+  s6[0] = _mm_add_epi32(x7[0], x6[0]);
+  s7[0] = _mm_sub_epi32(x7[0], x6[0]);
+  s10[0] = _mm_add_epi32(x11[0], x10[0]);
+  s11[0] = _mm_sub_epi32(x11[0], x10[0]);
+  s14[0] = _mm_add_epi32(x14[0], x15[0]);
+  s15[0] = _mm_sub_epi32(x14[0], x15[0]);
+  highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
+  highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+  highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+  highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+  highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
+  highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
+  highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
+  highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
+
+  x2[0] = dct_const_round_shift_64bit(s2[0]);
+  x2[1] = dct_const_round_shift_64bit(s2[1]);
+  x3[0] = dct_const_round_shift_64bit(s3[0]);
+  x3[1] = dct_const_round_shift_64bit(s3[1]);
+  x6[0] = dct_const_round_shift_64bit(s6[0]);
+  x6[1] = dct_const_round_shift_64bit(s6[1]);
+  x7[0] = dct_const_round_shift_64bit(s7[0]);
+  x7[1] = dct_const_round_shift_64bit(s7[1]);
+  x10[0] = dct_const_round_shift_64bit(s10[0]);
+  x10[1] = dct_const_round_shift_64bit(s10[1]);
+  x11[0] = dct_const_round_shift_64bit(s11[0]);
+  x11[1] = dct_const_round_shift_64bit(s11[1]);
+  x14[0] = dct_const_round_shift_64bit(s14[0]);
+  x14[1] = dct_const_round_shift_64bit(s14[1]);
+  x15[0] = dct_const_round_shift_64bit(s15[0]);
+  x15[1] = dct_const_round_shift_64bit(s15[1]);
+  x2[0] = pack_4(x2[0], x2[1]);
+  x3[0] = pack_4(x3[0], x3[1]);
+  x6[0] = pack_4(x6[0], x6[1]);
+  x7[0] = pack_4(x7[0], x7[1]);
+  x10[0] = pack_4(x10[0], x10[1]);
+  x11[0] = pack_4(x11[0], x11[1]);
+  x14[0] = pack_4(x14[0], x14[1]);
+  x15[0] = pack_4(x15[0], x15[1]);
+
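+  // Write back in iadst output order; io[1], io[3], io[13] and io[15] carry
+  // the sign flips of the final stage.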
+  io[0] = x0[0];
+  io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
+  io[2] = x12[0];
+  io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+  io[4] = x6[0];
+  io[5] = x14[0];
+  io[6] = x10[0];
+  io[7] = x2[0];
+  io[8] = x3[0];
+  io[9] = x11[0];
+  io[10] = x15[0];
+  io[11] = x7[0];
+  io[12] = x5[0];
+  io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
+  io[14] = x9[0];
+  io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
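+// 16x16 inverse hybrid transform (DCT/ADST per dimension, selected by
+// tx_type). For bd == 8 the intermediate values fit in 16 bits, so rows and
+// columns are packed and handled by the existing SSE2 8-column transforms;
+// 10/12-bit input stays in 32 bits and uses the 4-column SSE4.1 paths above.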
+void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                        int stride, int tx_type, int bd) {
+  int i;
+  __m128i out[16], *in;
+
+  if (bd == 8) {
+    __m128i l[16], r[16];
+
+    in = l;
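+    // First 1-D pass: pack to 16 bits and transform 8 columns at a time,
+    // rows 0-7 into l[] and rows 8-15 into r[].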
+    for (i = 0; i < 2; i++) {
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+        idct16_8col(in, in);
+      } else {
+        vpx_iadst16_8col_sse2(in);
+      }
+      in = r;
+      input += 128;
+    }
+
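+    // Second 1-D pass, then round, clip to the bit depth, and add to dest.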
+    for (i = 0; i < 16; i += 8) {
+      int j;
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
+      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+        idct16_8col(out, out);
+      } else {
+        vpx_iadst16_8col_sse2(out);
+      }
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
+      }
+      dest += 8;
+    }
+  } else {
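+    // 10/12-bit path: coefficients stay as 32-bit values, transformed 4
+    // columns per pass by the SSE4.1 4-column routines.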
+    __m128i all[4][16];
+
+    for (i = 0; i < 4; i++) {
+      in = all[i];
+      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+      if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+        vpx_highbd_idct16_4col_sse4_1(in);
+      } else {
+        highbd_iadst16_4col_sse4_1(in);
+      }
+      input += 4 * 16;
+    }
+
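+    // Second 1-D pass on 4-column slices, then write out.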
+    for (i = 0; i < 16; i += 4) {
+      int j;
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
+      if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+        vpx_highbd_idct16_4col_sse4_1(out);
+      } else {
+        highbd_iadst16_4col_sse4_1(out);
+      }
+
+      for (j = 0; j < 16; ++j) {
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
+      }
+      dest += 4;
+    }
+  }
+}
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -83,6 +83,7 @@
 else
 VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c
 VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c
 endif
 
 $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -53,7 +53,7 @@
   out[15] = in[15];
 }
 
-static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
   __m128i step1[16], step2[16];
 
   // stage 2
@@ -233,7 +233,7 @@
       in = all[i];
       highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
       highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
-      highbd_idct16_4col(in);
+      vpx_highbd_idct16_4col_sse4_1(in);
       input += 4 * 16;
     }
 
@@ -243,7 +243,7 @@
       transpose_32bit_4x4(all[1] + i, out + 4);
       transpose_32bit_4x4(all[2] + i, out + 8);
       transpose_32bit_4x4(all[3] + i, out + 12);
-      highbd_idct16_4col(out);
+      vpx_highbd_idct16_4col_sse4_1(out);
 
       for (j = 0; j < 16; ++j) {
         highbd_write_buffer_4(dest + j * stride, out[j], bd);
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -107,5 +107,6 @@
 }
 
 void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io);
+void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/);
 
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -514,7 +514,7 @@
   }
 }
 
-static void iadst16_8col(__m128i *const in) {
+void vpx_iadst16_8col_sse2(__m128i *const in) {
   // perform 16x16 1-D ADST for 8 columns
   __m128i s[16], x[16], u[32], v[32];
   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -874,8 +874,8 @@
 
 void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
   transpose_16bit_16x16(in0, in1);
-  iadst16_8col(in0);
-  iadst16_8col(in1);
+  vpx_iadst16_8col_sse2(in0);
+  vpx_iadst16_8col_sse2(in1);
 }
 
 // Group the coefficient calculation into smaller functions to prevent stack
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -701,6 +701,7 @@
 void idct16_sse2(__m128i *const in0, __m128i *const in1);
 void iadst4_sse2(__m128i *const in);
 void iadst8_sse2(__m128i *const in);
+void vpx_iadst16_8col_sse2(__m128i *const in);
 void iadst16_sse2(__m128i *const in0, __m128i *const in1);
 void idct32_1024_8x32(const __m128i *const in, __m128i *const out);
 void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out);