shithub: libvpx

ref: 69775d2f40161cf00d2df99ca4c896dd727ed27d
parent: 3f05a70c415c18e9fd2982dbfc59fe790aa1413d
author: Linfeng Zhang <linfengz@google.com>
date: Mon Aug 14 12:47:24 EDT 2017

Update highbd idct x86 optimizations.

BUG=webm:1412

Change-Id: Ia275940af7d7d8637e9a851a9e39d655bfbe4069
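
For reference, the patch renames highbd_multiplication_sse2 / highbd_multiplication_neg_sse2 and highbd_multiplication_sse4_1 to highbd_partial_butterfly_sse2 / highbd_partial_butterfly_neg_sse2 and highbd_partial_butterfly_sse4_1, factors the repeated load/transpose sequences into highbd_load_pack_transpose_32bit_8x8, highbd_load_transpose_32bit_8x4 and highbd_load_transpose_32bit_4x4, adds a highbd_add_sub_butterfly helper to highbd_inv_txfm_sse2.h, and asserts that the cosine constants passed to these helpers are non-negative. In rough scalar terms (a sketch for illustration only; the standalone function names below are hypothetical, and the real helpers operate on __m128i vectors of four 32-bit coefficients), the renamed and added helpers compute:

    #include <stdint.h>

    #define DCT_CONST_BITS 14 /* as in vpx_dsp/txfm_common.h */

    /* highbd_partial_butterfly_*(): one leg of the usual cosine rotation is
     * zero, so each output is the surviving input scaled by one constant and
     * round-shifted, using a 64-bit intermediate to avoid overflow at high
     * bit depths (the _neg_ SSE2 variant folds a negation into one of the
     * two products). */
    static void partial_butterfly(int32_t in, int c0, int c1,
                                  int32_t *out0, int32_t *out1) {
      const int64_t round = 1 << (DCT_CONST_BITS - 1);
      *out0 = (int32_t)(((int64_t)in * c0 + round) >> DCT_CONST_BITS);
      *out1 = (int32_t)(((int64_t)in * c1 + round) >> DCT_CONST_BITS);
    }

    /* highbd_add_sub_butterfly(): for size = 16 or 32, pair element i with
     * its mirror (size - 1 - i) and emit their sum and difference. */
    static void add_sub_butterfly(const int32_t *in, int32_t *out, int size) {
      int i;
      for (i = 0; i < size / 2; i++) {
        out[i] = in[i] + in[size - 1 - i];
        out[size - 1 - i] = in[i] - in[size - 1 - i];
      }
    }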

--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -106,20 +106,20 @@
   __m128i temp1[2], sign[2];
 
   // stage 2
-  highbd_multiplication_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
-                             &step2[8], &step2[15]);
-  highbd_multiplication_neg_sse2(io[7], (int)cospi_14_64, (int)cospi_18_64,
-                                 &step2[9], &step2[14]);
-  highbd_multiplication_sse2(io[5], (int)cospi_22_64, (int)cospi_10_64,
-                             &step2[10], &step2[13]);
-  highbd_multiplication_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
-                                 &step2[11], &step2[12]);
+  highbd_partial_butterfly_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                                &step2[8], &step2[15]);
+  highbd_partial_butterfly_neg_sse2(io[7], (int)cospi_14_64, (int)cospi_18_64,
+                                    &step2[9], &step2[14]);
+  highbd_partial_butterfly_sse2(io[5], (int)cospi_22_64, (int)cospi_10_64,
+                                &step2[10], &step2[13]);
+  highbd_partial_butterfly_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
+                                    &step2[11], &step2[12]);
 
   // stage 3
-  highbd_multiplication_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
-                             &step1[4], &step1[7]);
-  highbd_multiplication_neg_sse2(io[6], (int)cospi_12_64, (int)cospi_20_64,
-                                 &step1[5], &step1[6]);
+  highbd_partial_butterfly_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                                &step1[4], &step1[7]);
+  highbd_partial_butterfly_neg_sse2(io[6], (int)cospi_12_64, (int)cospi_20_64,
+                                    &step1[5], &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
   step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
@@ -133,8 +133,8 @@
   abs_extend_64bit_sse2(io[0], temp1, sign);
   step2[0] = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
   step2[1] = step2[0];
-  highbd_multiplication_sse2(io[4], (int)cospi_24_64, (int)cospi_8_64,
-                             &step2[2], &step2[3]);
+  highbd_partial_butterfly_sse2(io[4], (int)cospi_24_64, (int)cospi_8_64,
+                                &step2[2], &step2[3]);
   highbd_butterfly_sse2(step1[14], step1[9], (int)cospi_24_64, (int)cospi_8_64,
                         &step2[9], &step2[14]);
   highbd_butterfly_sse2(step1[10], step1[13], (int)cospi_8_64, (int)cospi_24_64,
@@ -158,14 +158,14 @@
   __m128i temp[2], sign[2];
 
   // stage 2
-  highbd_multiplication_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
-                             &step2[8], &step2[15]);
-  highbd_multiplication_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
-                                 &step2[11], &step2[12]);
+  highbd_partial_butterfly_sse2(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                                &step2[8], &step2[15]);
+  highbd_partial_butterfly_neg_sse2(io[3], (int)cospi_6_64, (int)cospi_26_64,
+                                    &step2[11], &step2[12]);
 
   // stage 3
-  highbd_multiplication_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
-                             &step1[4], &step1[7]);
+  highbd_partial_butterfly_sse2(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                                &step1[4], &step1[7]);
   step1[8] = step2[8];
   step1[9] = step2[8];
   step1[10] =
@@ -209,25 +209,8 @@
 
     in = l;
     for (i = 0; i < 2; i++) {
-      in[0] = load_pack_8_32bit(input + 0 * 16);
-      in[1] = load_pack_8_32bit(input + 1 * 16);
-      in[2] = load_pack_8_32bit(input + 2 * 16);
-      in[3] = load_pack_8_32bit(input + 3 * 16);
-      in[4] = load_pack_8_32bit(input + 4 * 16);
-      in[5] = load_pack_8_32bit(input + 5 * 16);
-      in[6] = load_pack_8_32bit(input + 6 * 16);
-      in[7] = load_pack_8_32bit(input + 7 * 16);
-      transpose_16bit_8x8(in, in);
-
-      in[8] = load_pack_8_32bit(input + 0 * 16 + 8);
-      in[9] = load_pack_8_32bit(input + 1 * 16 + 8);
-      in[10] = load_pack_8_32bit(input + 2 * 16 + 8);
-      in[11] = load_pack_8_32bit(input + 3 * 16 + 8);
-      in[12] = load_pack_8_32bit(input + 4 * 16 + 8);
-      in[13] = load_pack_8_32bit(input + 5 * 16 + 8);
-      in[14] = load_pack_8_32bit(input + 6 * 16 + 8);
-      in[15] = load_pack_8_32bit(input + 7 * 16 + 8);
-      transpose_16bit_8x8(in + 8, in + 8);
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
       idct16_8col(in, in);
       in = r;
       input += 128;
@@ -249,26 +232,8 @@
 
     for (i = 0; i < 4; i++) {
       in = all[i];
-      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
-      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
-      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
-      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
-      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
-      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
-      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
-      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
-      transpose_32bit_8x4(in, in);
-
-      in[8] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 8));
-      in[9] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 12));
-      in[10] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 8));
-      in[11] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 12));
-      in[12] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 8));
-      in[13] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 12));
-      in[14] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 8));
-      in[15] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 12));
-      transpose_32bit_8x4(in + 8, in + 8);
-
+      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
       highbd_idct16_4col(in);
       input += 4 * 16;
     }
@@ -275,26 +240,10 @@
 
     for (i = 0; i < 16; i += 4) {
       int j;
-      out[0] = all[0][i + 0];
-      out[1] = all[1][i + 0];
-      out[2] = all[0][i + 1];
-      out[3] = all[1][i + 1];
-      out[4] = all[0][i + 2];
-      out[5] = all[1][i + 2];
-      out[6] = all[0][i + 3];
-      out[7] = all[1][i + 3];
-      transpose_32bit_8x4(out, out);
-
-      out[8] = all[2][i + 0];
-      out[9] = all[3][i + 0];
-      out[10] = all[2][i + 1];
-      out[11] = all[3][i + 1];
-      out[12] = all[2][i + 2];
-      out[13] = all[3][i + 2];
-      out[14] = all[2][i + 3];
-      out[15] = all[3][i + 3];
-      transpose_32bit_8x4(out + 8, out + 8);
-
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
       highbd_idct16_4col(out);
 
       for (j = 0; j < 16; ++j) {
@@ -313,16 +262,7 @@
   if (bd == 8) {
     __m128i in[16], temp[16];
 
-    in[0] = load_pack_8_32bit(input + 0 * 16);
-    in[1] = load_pack_8_32bit(input + 1 * 16);
-    in[2] = load_pack_8_32bit(input + 2 * 16);
-    in[3] = load_pack_8_32bit(input + 3 * 16);
-    in[4] = load_pack_8_32bit(input + 4 * 16);
-    in[5] = load_pack_8_32bit(input + 5 * 16);
-    in[6] = load_pack_8_32bit(input + 6 * 16);
-    in[7] = load_pack_8_32bit(input + 7 * 16);
-    transpose_16bit_8x8(in, in);
-
+    highbd_load_pack_transpose_32bit_8x8(input, 16, in);
     for (i = 8; i < 16; i++) {
       in[i] = _mm_setzero_si128();
     }
@@ -343,15 +283,7 @@
 
     for (i = 0; i < 2; i++) {
       in = all[i];
-      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
-      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
-      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
-      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
-      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
-      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
-      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
-      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
-      transpose_32bit_8x4(in, in);
+      highbd_load_transpose_32bit_8x4(input, 16, in);
       highbd_idct16x16_38_4col(in);
       input += 4 * 16;
     }
@@ -358,15 +290,8 @@
 
     for (i = 0; i < 16; i += 4) {
       int j;
-      out[0] = all[0][i + 0];
-      out[1] = all[1][i + 0];
-      out[2] = all[0][i + 1];
-      out[3] = all[1][i + 1];
-      out[4] = all[0][i + 2];
-      out[5] = all[1][i + 2];
-      out[6] = all[0][i + 3];
-      out[7] = all[1][i + 3];
-      transpose_32bit_8x4(out, out);
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
       highbd_idct16x16_38_4col(out);
 
       for (j = 0; j < 16; ++j) {
@@ -406,11 +331,7 @@
 
     for (i = 0; i < 2; i++) {
       in = all[i];
-      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
-      in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
-      in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
-      in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
-      transpose_32bit_4x4(in, in);
+      highbd_load_transpose_32bit_4x4(input, 16, in);
       highbd_idct16x16_10_4col(in);
       input += 4 * 16;
     }
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c
@@ -107,20 +107,20 @@
   __m128i temp1[2];
 
   // stage 2
-  highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
-                               &step2[8], &step2[15]);
-  highbd_multiplication_sse4_1(io[7], -(int)cospi_18_64, (int)cospi_14_64,
-                               &step2[9], &step2[14]);
-  highbd_multiplication_sse4_1(io[5], (int)cospi_22_64, (int)cospi_10_64,
-                               &step2[10], &step2[13]);
-  highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
-                               &step2[11], &step2[12]);
+  highbd_partial_butterfly_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                                  &step2[8], &step2[15]);
+  highbd_partial_butterfly_sse4_1(io[7], -(int)cospi_18_64, (int)cospi_14_64,
+                                  &step2[9], &step2[14]);
+  highbd_partial_butterfly_sse4_1(io[5], (int)cospi_22_64, (int)cospi_10_64,
+                                  &step2[10], &step2[13]);
+  highbd_partial_butterfly_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
+                                  &step2[11], &step2[12]);
 
   // stage 3
-  highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
-                               &step1[4], &step1[7]);
-  highbd_multiplication_sse4_1(io[6], -(int)cospi_20_64, (int)cospi_12_64,
-                               &step1[5], &step1[6]);
+  highbd_partial_butterfly_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                                  &step1[4], &step1[7]);
+  highbd_partial_butterfly_sse4_1(io[6], -(int)cospi_20_64, (int)cospi_12_64,
+                                  &step1[5], &step1[6]);
   step1[8] = _mm_add_epi32(step2[8], step2[9]);
   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
   step1[10] = _mm_sub_epi32(step2[11], step2[10]);
@@ -134,8 +134,8 @@
   extend_64bit(io[0], temp1);
   step2[0] = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
   step2[1] = step2[0];
-  highbd_multiplication_sse4_1(io[4], (int)cospi_24_64, (int)cospi_8_64,
-                               &step2[2], &step2[3]);
+  highbd_partial_butterfly_sse4_1(io[4], (int)cospi_24_64, (int)cospi_8_64,
+                                  &step2[2], &step2[3]);
   highbd_butterfly_sse4_1(step1[14], step1[9], (int)cospi_24_64,
                           (int)cospi_8_64, &step2[9], &step2[14]);
   highbd_butterfly_sse4_1(step1[10], step1[13], -(int)cospi_8_64,
@@ -159,14 +159,14 @@
   __m128i temp[2];
 
   // stage 2
-  highbd_multiplication_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
-                               &step2[8], &step2[15]);
-  highbd_multiplication_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
-                               &step2[11], &step2[12]);
+  highbd_partial_butterfly_sse4_1(io[1], (int)cospi_30_64, (int)cospi_2_64,
+                                  &step2[8], &step2[15]);
+  highbd_partial_butterfly_sse4_1(io[3], -(int)cospi_26_64, (int)cospi_6_64,
+                                  &step2[11], &step2[12]);
 
   // stage 3
-  highbd_multiplication_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
-                               &step1[4], &step1[7]);
+  highbd_partial_butterfly_sse4_1(io[2], (int)cospi_28_64, (int)cospi_4_64,
+                                  &step1[4], &step1[7]);
   step1[8] = step2[8];
   step1[9] = step2[8];
   step1[10] = step2[11];
@@ -208,25 +208,8 @@
 
     in = l;
     for (i = 0; i < 2; i++) {
-      in[0] = load_pack_8_32bit(input + 0 * 16);
-      in[1] = load_pack_8_32bit(input + 1 * 16);
-      in[2] = load_pack_8_32bit(input + 2 * 16);
-      in[3] = load_pack_8_32bit(input + 3 * 16);
-      in[4] = load_pack_8_32bit(input + 4 * 16);
-      in[5] = load_pack_8_32bit(input + 5 * 16);
-      in[6] = load_pack_8_32bit(input + 6 * 16);
-      in[7] = load_pack_8_32bit(input + 7 * 16);
-      transpose_16bit_8x8(in, in);
-
-      in[8] = load_pack_8_32bit(input + 0 * 16 + 8);
-      in[9] = load_pack_8_32bit(input + 1 * 16 + 8);
-      in[10] = load_pack_8_32bit(input + 2 * 16 + 8);
-      in[11] = load_pack_8_32bit(input + 3 * 16 + 8);
-      in[12] = load_pack_8_32bit(input + 4 * 16 + 8);
-      in[13] = load_pack_8_32bit(input + 5 * 16 + 8);
-      in[14] = load_pack_8_32bit(input + 6 * 16 + 8);
-      in[15] = load_pack_8_32bit(input + 7 * 16 + 8);
-      transpose_16bit_8x8(in + 8, in + 8);
+      highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+      highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
       idct16_8col(in, in);
       in = r;
       input += 128;
@@ -248,26 +231,8 @@
 
     for (i = 0; i < 4; i++) {
       in = all[i];
-      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
-      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
-      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
-      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
-      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
-      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
-      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
-      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
-      transpose_32bit_8x4(in, in);
-
-      in[8] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 8));
-      in[9] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 12));
-      in[10] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 8));
-      in[11] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 12));
-      in[12] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 8));
-      in[13] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 12));
-      in[14] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 8));
-      in[15] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 12));
-      transpose_32bit_8x4(in + 8, in + 8);
-
+      highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+      highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
       highbd_idct16_4col(in);
       input += 4 * 16;
     }
@@ -274,26 +239,10 @@
 
     for (i = 0; i < 16; i += 4) {
       int j;
-      out[0] = all[0][i + 0];
-      out[1] = all[1][i + 0];
-      out[2] = all[0][i + 1];
-      out[3] = all[1][i + 1];
-      out[4] = all[0][i + 2];
-      out[5] = all[1][i + 2];
-      out[6] = all[0][i + 3];
-      out[7] = all[1][i + 3];
-      transpose_32bit_8x4(out, out);
-
-      out[8] = all[2][i + 0];
-      out[9] = all[3][i + 0];
-      out[10] = all[2][i + 1];
-      out[11] = all[3][i + 1];
-      out[12] = all[2][i + 2];
-      out[13] = all[3][i + 2];
-      out[14] = all[2][i + 3];
-      out[15] = all[3][i + 3];
-      transpose_32bit_8x4(out + 8, out + 8);
-
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
+      transpose_32bit_4x4(all[2] + i, out + 8);
+      transpose_32bit_4x4(all[3] + i, out + 12);
       highbd_idct16_4col(out);
 
       for (j = 0; j < 16; ++j) {
@@ -312,16 +261,7 @@
   if (bd == 8) {
     __m128i in[16], temp[16];
 
-    in[0] = load_pack_8_32bit(input + 0 * 16);
-    in[1] = load_pack_8_32bit(input + 1 * 16);
-    in[2] = load_pack_8_32bit(input + 2 * 16);
-    in[3] = load_pack_8_32bit(input + 3 * 16);
-    in[4] = load_pack_8_32bit(input + 4 * 16);
-    in[5] = load_pack_8_32bit(input + 5 * 16);
-    in[6] = load_pack_8_32bit(input + 6 * 16);
-    in[7] = load_pack_8_32bit(input + 7 * 16);
-    transpose_16bit_8x8(in, in);
-
+    highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
     for (i = 8; i < 16; i++) {
       in[i] = _mm_setzero_si128();
     }
@@ -342,15 +282,7 @@
 
     for (i = 0; i < 2; i++) {
       in = all[i];
-      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 0));
-      in[1] = _mm_load_si128((const __m128i *)(input + 0 * 16 + 4));
-      in[2] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 0));
-      in[3] = _mm_load_si128((const __m128i *)(input + 1 * 16 + 4));
-      in[4] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 0));
-      in[5] = _mm_load_si128((const __m128i *)(input + 2 * 16 + 4));
-      in[6] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 0));
-      in[7] = _mm_load_si128((const __m128i *)(input + 3 * 16 + 4));
-      transpose_32bit_8x4(in, in);
+      highbd_load_transpose_32bit_8x4(input, 16, in);
       highbd_idct16x16_38_4col(in);
       input += 4 * 16;
     }
@@ -357,15 +289,8 @@
 
     for (i = 0; i < 16; i += 4) {
       int j;
-      out[0] = all[0][i + 0];
-      out[1] = all[1][i + 0];
-      out[2] = all[0][i + 1];
-      out[3] = all[1][i + 1];
-      out[4] = all[0][i + 2];
-      out[5] = all[1][i + 2];
-      out[6] = all[0][i + 3];
-      out[7] = all[1][i + 3];
-      transpose_32bit_8x4(out, out);
+      transpose_32bit_4x4(all[0] + i, out + 0);
+      transpose_32bit_4x4(all[1] + i, out + 4);
       highbd_idct16x16_38_4col(out);
 
       for (j = 0; j < 16; ++j) {
@@ -405,11 +330,7 @@
 
     for (i = 0; i < 2; i++) {
       in = all[i];
-      in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
-      in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
-      in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
-      in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
-      transpose_32bit_4x4(in, in);
+      highbd_load_transpose_32bit_4x4(input, 16, in);
       highbd_idct16x16_10_4col(in);
       input += 4 * 16;
     }
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -16,6 +16,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
 static INLINE void extend_64bit(const __m128i in,
@@ -84,6 +85,7 @@
   const __m128i pair_c = pair_set_epi32(c << 2, 0);
   __m128i t0, t1;
 
+  assert(c >= 0);
   t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
   t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
   t0 = dct_const_round_shift_64bit(t0);
@@ -99,6 +101,7 @@
   const __m128i pair_c = pair_set_epi32(c << 2, 0);
   __m128i t0, t1;
 
+  assert(c >= 0);
   t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c);
   t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c);
   t0 = _mm_sub_epi64(_mm_setzero_si128(), t0);
@@ -118,6 +121,8 @@
   const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0);
   __m128i temp1[4], temp2[4], sign1[2], sign2[2];
 
+  assert(c0 >= 0);
+  assert(c1 >= 0);
   abs_extend_64bit_sse2(in0, temp1, sign1);
   abs_extend_64bit_sse2(in1, temp2, sign2);
   temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1);
@@ -140,26 +145,15 @@
   *out1 = pack_4(temp2[0], temp2[1]);
 }
 
-static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
-                                                 const __m128i in1,
+// Note: c0 and c1 must be non negative.
+static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0,
+                                                 const int c1,
                                                  __m128i *const out0,
                                                  __m128i *const out1) {
-  __m128i temp1[2], temp2, sign[2];
-
-  temp2 = _mm_add_epi32(in0, in1);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  *out0 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-  temp2 = _mm_sub_epi32(in0, in1);
-  abs_extend_64bit_sse2(temp2, temp1, sign);
-  *out1 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
-}
-
-// Note: c0 and c1 must be non negative.
-static INLINE void highbd_multiplication_sse2(const __m128i in, const int c0,
-                                              const int c1, __m128i *const out0,
-                                              __m128i *const out1) {
   __m128i temp[2], sign[2];
 
+  assert(c0 >= 0);
+  assert(c1 >= 0);
   abs_extend_64bit_sse2(in, temp, sign);
   *out0 = multiplication_round_shift_sse2(temp, sign, c0);
   *out1 = multiplication_round_shift_sse2(temp, sign, c1);
@@ -166,17 +160,46 @@
 }
 
 // Note: c0 and c1 must be non negative.
-static INLINE void highbd_multiplication_neg_sse2(const __m128i in,
-                                                  const int c0, const int c1,
-                                                  __m128i *const out0,
-                                                  __m128i *const out1) {
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in,
+                                                     const int c0, const int c1,
+                                                     __m128i *const out0,
+                                                     __m128i *const out1) {
   __m128i temp[2], sign[2];
 
+  assert(c0 >= 0);
+  assert(c1 >= 0);
   abs_extend_64bit_sse2(in, temp, sign);
   *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1);
   *out1 = multiplication_round_shift_sse2(temp, sign, c0);
 }
 
+static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0,
+                                                 const __m128i in1,
+                                                 __m128i *const out0,
+                                                 __m128i *const out1) {
+  __m128i temp1[2], temp2, sign[2];
+
+  temp2 = _mm_add_epi32(in0, in1);
+  abs_extend_64bit_sse2(temp2, temp1, sign);
+  *out0 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+  temp2 = _mm_sub_epi32(in0, in1);
+  abs_extend_64bit_sse2(temp2, temp1, sign);
+  *out1 = multiplication_round_shift_sse2(temp1, sign, (int)cospi_16_64);
+}
+
+// Only do addition and subtraction butterfly, size = 16, 32
+static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out,
+                                            int size) {
+  int i = 0;
+  const int num = size >> 1;
+  const int bound = size - 1;
+  while (i < num) {
+    out[i] = _mm_add_epi32(in[i], in[bound - i]);
+    out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]);
+    i++;
+  }
+}
+
 static INLINE void highbd_idct8_stage4(const __m128i *const in,
                                        __m128i *const out) {
   out[0] = _mm_add_epi32(in[0], in[7]);
@@ -311,6 +334,44 @@
   const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
   const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
   return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input,
+                                                        const int stride,
+                                                        __m128i *const in) {
+  in[0] = load_pack_8_32bit(input + 0 * stride);
+  in[1] = load_pack_8_32bit(input + 1 * stride);
+  in[2] = load_pack_8_32bit(input + 2 * stride);
+  in[3] = load_pack_8_32bit(input + 3 * stride);
+  in[4] = load_pack_8_32bit(input + 4 * stride);
+  in[5] = load_pack_8_32bit(input + 5 * stride);
+  in[6] = load_pack_8_32bit(input + 6 * stride);
+  in[7] = load_pack_8_32bit(input + 7 * stride);
+  transpose_16bit_8x8(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input,
+                                                   const int stride,
+                                                   __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+  in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4));
+  in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+  in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4));
+  in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+  in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4));
+  in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+  in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4));
+  transpose_32bit_8x4(in, in);
+}
+
+static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input,
+                                                   const int stride,
+                                                   __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+  transpose_32bit_4x4(in, in);
 }
 
 static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
--- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -73,10 +73,10 @@
   *out1 = multiplication_round_shift_sse4_1(temp1, (int)cospi_16_64);
 }
 
-static INLINE void highbd_multiplication_sse4_1(const __m128i in, const int c0,
-                                                const int c1,
-                                                __m128i *const out0,
-                                                __m128i *const out1) {
+static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in,
+                                                   const int c0, const int c1,
+                                                   __m128i *const out0,
+                                                   __m128i *const out1) {
   __m128i temp[2];
 
   extend_64bit(in, temp);