shithub: libvpx

ref: 1e3a93e72e9fe9048bcff1deeb86afebdbc04921
parent: 469986f96399cbd2cf929e7e6c418196184e7ffa
parent: c338f3635e5b259fec57b8406f1416a863a4b04b
author: Linfeng Zhang <linfengz@google.com>
date: Fri Jun 30 16:49:19 EDT 2017

Merge changes I5d038b4f,I9d00d1dd,I0722841d,I1f640db7

* changes:
  Add vpx_highbd_idct8x8_{12, 64}_add_sse4_1
  sse2: Add transpose_32bit_4x4x2() and update transpose_32bit_4x4()
  Refactor highbd idct 4x4 sse4.1 code and add highbd_inv_txfm_sse4.h
  Refactor vpx_idct8x8_12_add_ssse3() and add inv_txfm_ssse3.h
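
Note: the add_clamp() and wraplow_16bit_shift{4,5}() helpers introduced in this change replace clamp_high_sse2() and add_dc_clamp(); per output lane they apply a rounding shift and then clamp the reconstructed pixel into [0, (1 << bd) - 1]. A rough scalar model of that per-element arithmetic (illustration only, not part of the patch; the scalar_* names are made up):

#include <stdint.h>

/* Rounding shift modelled on wraplow_16bit_shift4/_shift5: shift is 4 for the
 * 4x4 idct (rounding term 8) and 5 for the 8x8 idct (rounding term 16). */
static int16_t scalar_wraplow_shift(int32_t in, int shift) {
  return (int16_t)((in + (1 << (shift - 1))) >> shift);
}

/* Modelled on add_clamp(): add the residual to the destination pixel and
 * clamp the result to the valid range of a bd-bit pixel. The SIMD version
 * uses a saturating 16-bit add followed by max/min against 0 and (1<<bd)-1. */
static uint16_t scalar_add_clamp(uint16_t dest, int16_t residual, int bd) {
  int32_t v = (int32_t)dest + residual;
  if (v < 0) v = 0;
  if (v > (1 << bd) - 1) v = (1 << bd) - 1;
  return (uint16_t)v;
}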

--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -210,6 +210,7 @@
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_txfm_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/inv_wht_sse2.asm
+DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.h
 DSP_SRCS-$(HAVE_SSSE3)  += x86/inv_txfm_ssse3.c
 
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
@@ -240,7 +241,9 @@
 DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct8x8_add_sse2.c
 DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct16x16_add_sse2.c
 DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_idct32x32_add_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h
 DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 ifeq ($(HAVE_NEON_ASM),yes)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -655,8 +655,8 @@
 
   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
     specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
-    specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
-    specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
+    specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/;
+    specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/;
     specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
     specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
     $vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2;
--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -105,8 +105,8 @@
         d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
         inptr[i] = _mm_srai_epi16(inptr[i], 6);
         inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+        d[0] = add_clamp(d[0], inptr[i], bd);
+        d[1] = add_clamp(d[1], inptr[i + 16], bd);
         // Store
         _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
         _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
@@ -222,8 +222,8 @@
         d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
         inptr[i] = _mm_srai_epi16(inptr[i], 6);
         inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
-        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
-        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
+        d[0] = add_clamp(d[0], inptr[i], bd);
+        d[1] = add_clamp(d[1], inptr[i + 16], bd);
         // Store
         _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
         _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -27,7 +27,7 @@
   const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
   __m128i temp1[4], temp2[4], step[4];
 
-  transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+  transpose_32bit_4x4(io, io);
 
   // Note: There is no 32-bit signed multiply SIMD instruction in SSE2.
   //       _mm_mul_epu32() is used which can only guarantee the lower 32-bit
@@ -98,7 +98,7 @@
       _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
   __m128i temp1[4], temp2[4], step[4], sign1[4], sign2[4];
 
-  transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+  transpose_32bit_4x4(io, io);
 
   // stage 1
   temp1[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
@@ -187,19 +187,15 @@
       highbd_idct4_large_sse2(io);
       highbd_idct4_large_sse2(io);
     }
-    io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
-    io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
+    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
   }
 
-  recon_and_store_4(dest, io, stride, bd);
+  recon_and_store_4(io, dest, stride, bd);
 }
 
 void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  // Faster than _mm_set1_epi16((1 << bd) - 1).
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
   int a1, i;
   tran_low_t out;
   __m128i dc, d;
@@ -211,7 +207,7 @@
 
   for (i = 0; i < 4; ++i) {
     d = _mm_loadl_epi64((const __m128i *)dest);
-    d = add_dc_clamp(&zero, &max, &dc, &d);
+    d = add_clamp(d, dc, bd);
     _mm_storel_epi64((__m128i *)dest, d);
     dest += stride;
   }
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -12,15 +12,10 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
 
-static INLINE void extend_64bit(const __m128i in,
-                                __m128i *const out /*out[2]*/) {
-  out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
-  out[1] = _mm_unpackhi_epi32(in, in);  // 2, 2, 3, 3
-}
-
 static INLINE void highbd_idct4(__m128i *const io) {
   const __m128i cospi_p16_p16 =
       _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
@@ -28,47 +23,20 @@
       _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
   const __m128i cospi_p24_p24 =
       _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
-  __m128i temp1[4], temp2[4], step[4];
+  __m128i temp1[4], step[4];
 
-  transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+  transpose_32bit_4x4(io, io);
 
   // stage 1
   temp1[0] = _mm_add_epi32(io[0], io[2]);  // input[0] + input[2]
-  temp2[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
   extend_64bit(temp1[0], temp1);
-  extend_64bit(temp2[0], temp2);
-  temp1[0] = _mm_mul_epi32(temp1[0], cospi_p16_p16);
-  temp1[1] = _mm_mul_epi32(temp1[1], cospi_p16_p16);
-  temp2[0] = _mm_mul_epi32(temp2[0], cospi_p16_p16);
-  temp2[1] = _mm_mul_epi32(temp2[1], cospi_p16_p16);
-  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
-  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
-  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
-  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
-  step[0] = pack_4(temp1[0], temp1[1]);
-  step[1] = pack_4(temp2[0], temp2[1]);
+  step[0] = multiplication_round_shift(temp1, cospi_p16_p16);
+  temp1[0] = _mm_sub_epi32(io[0], io[2]);  // input[0] - input[2]
+  extend_64bit(temp1[0], temp1);
+  step[1] = multiplication_round_shift(temp1, cospi_p16_p16);
+  multiplication_and_add_2_ssse4_1(&io[1], &io[3], &cospi_p24_p24,
+                                   &cospi_p08_p08, &step[2], &step[3]);
 
-  extend_64bit(io[1], temp1);
-  extend_64bit(io[3], temp2);
-  temp1[2] = _mm_mul_epi32(temp1[0], cospi_p08_p08);
-  temp1[3] = _mm_mul_epi32(temp1[1], cospi_p08_p08);
-  temp1[0] = _mm_mul_epi32(temp1[0], cospi_p24_p24);
-  temp1[1] = _mm_mul_epi32(temp1[1], cospi_p24_p24);
-  temp2[2] = _mm_mul_epi32(temp2[0], cospi_p24_p24);
-  temp2[3] = _mm_mul_epi32(temp2[1], cospi_p24_p24);
-  temp2[0] = _mm_mul_epi32(temp2[0], cospi_p08_p08);
-  temp2[1] = _mm_mul_epi32(temp2[1], cospi_p08_p08);
-  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);  // [1]*cospi_24 - [3]*cospi_8
-  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);  // [1]*cospi_24 - [3]*cospi_8
-  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);  // [1]*cospi_8 + [3]*cospi_24
-  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);  // [1]*cospi_8 + [3]*cospi_24
-  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
-  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
-  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
-  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
-  step[2] = pack_4(temp1[0], temp1[1]);
-  step[3] = pack_4(temp2[0], temp2[1]);
-
   // stage 2
   io[0] = _mm_add_epi32(step[0], step[3]);  // step[0] + step[3]
   io[1] = _mm_add_epi32(step[1], step[2]);  // step[1] + step[2]
@@ -99,9 +67,9 @@
   } else {
     highbd_idct4(io);
     highbd_idct4(io);
-    io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
-    io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
+    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
+    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
   }
 
-  recon_and_store_4(dest, io, stride, bd);
+  recon_and_store_4(io, dest, stride, bd);
 }
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -94,7 +94,7 @@
         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
         inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        d[i] = add_clamp(d[i], inptr[i], bd);
         // Store
         _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
       }
@@ -196,7 +196,7 @@
         inptr[i] = _mm_add_epi16(inptr[i], sixteen);
         d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
         inptr[i] = _mm_srai_epi16(inptr[i], 5);
-        d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+        d[i] = add_clamp(d[i], inptr[i], bd);
         // Store
         _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
       }
--- /dev/null
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -1,0 +1,280 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static void highbd_idct8x8_half1d(__m128i *const io) {
+  const __m128i cp_4q_4q =
+      _mm_setr_epi32(cospi_4_64 << 2, 0, cospi_4_64 << 2, 0);
+  const __m128i cp_8q_8q =
+      _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+  const __m128i cp_12q_12q =
+      _mm_setr_epi32(cospi_12_64 << 2, 0, cospi_12_64 << 2, 0);
+  const __m128i cp_16q_16q =
+      _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
+  const __m128i cp_20q_20q =
+      _mm_setr_epi32(cospi_20_64 << 2, 0, cospi_20_64 << 2, 0);
+  const __m128i cp_24q_24q =
+      _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+  const __m128i cp_28q_28q =
+      _mm_setr_epi32(cospi_28_64 << 2, 0, cospi_28_64 << 2, 0);
+  __m128i temp1[4], temp2[4], step1[8], step2[8];
+
+  transpose_32bit_4x4x2(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[2] = io[4];
+  step1[1] = io[2];
+  step1[3] = io[6];
+  multiplication_and_add_2_ssse4_1(&io[1], &io[7], &cp_28q_28q, &cp_4q_4q,
+                                   &step1[4], &step1[7]);
+  multiplication_and_add_2_ssse4_1(&io[5], &io[3], &cp_12q_12q, &cp_20q_20q,
+                                   &step1[5], &step1[6]);
+
+  // stage 2
+  temp2[0] = _mm_add_epi32(step1[0], step1[2]);
+  extend_64bit(temp2[0], temp1);
+  step2[0] = multiplication_round_shift(temp1, cp_16q_16q);
+  temp2[0] = _mm_sub_epi32(step1[0], step1[2]);
+  extend_64bit(temp2[0], temp1);
+  step2[1] = multiplication_round_shift(temp1, cp_16q_16q);
+  multiplication_and_add_2_ssse4_1(&step1[1], &step1[3], &cp_24q_24q, &cp_8q_8q,
+                                   &step2[2], &step2[3]);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
+  extend_64bit(temp2[0], temp1);
+  step1[5] = multiplication_round_shift(temp1, cp_16q_16q);
+  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
+  extend_64bit(temp2[0], temp1);
+  step1[6] = multiplication_round_shift(temp1, cp_16q_16q);
+  step1[7] = step2[7];
+
+  // stage 4
+  io[0] = _mm_add_epi32(step1[0], step1[7]);
+  io[1] = _mm_add_epi32(step1[1], step1[6]);
+  io[2] = _mm_add_epi32(step1[2], step1[5]);
+  io[3] = _mm_add_epi32(step1[3], step1[4]);
+  io[4] = _mm_sub_epi32(step1[3], step1[4]);
+  io[5] = _mm_sub_epi32(step1[2], step1[5]);
+  io[6] = _mm_sub_epi32(step1[1], step1[6]);
+  io[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+static void highbd_idct8x8_12_half1d(__m128i *const io) {
+  const __m128i cp_28q_28q =
+      _mm_setr_epi32(cospi_28_64 << 2, 0, cospi_28_64 << 2, 0);
+  const __m128i cp_4q_4q =
+      _mm_setr_epi32(cospi_4_64 << 2, 0, cospi_4_64 << 2, 0);
+  const __m128i cp_n20q_n20q =
+      _mm_setr_epi32(-cospi_20_64 << 2, 0, -cospi_20_64 << 2, 0);
+  const __m128i cp_12q_12q =
+      _mm_setr_epi32(cospi_12_64 << 2, 0, cospi_12_64 << 2, 0);
+  const __m128i cp_16q_16q =
+      _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
+  const __m128i cp_8q_8q =
+      _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+  const __m128i cp_24q_24q =
+      _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+  __m128i temp1[4], temp2[4], step1[8], step2[8];
+
+  transpose_32bit_4x4(io, io);
+
+  // stage 1
+  step1[0] = io[0];
+  step1[1] = io[2];
+  extend_64bit(io[1], temp1);
+  step1[4] = multiplication_round_shift(temp1, cp_28q_28q);
+  step1[7] = multiplication_round_shift(temp1, cp_4q_4q);
+  extend_64bit(io[3], temp1);
+  step1[5] = multiplication_round_shift(temp1, cp_n20q_n20q);
+  step1[6] = multiplication_round_shift(temp1, cp_12q_12q);
+
+  // stage 2
+  extend_64bit(step1[0], temp1);
+  step2[0] = multiplication_round_shift(temp1, cp_16q_16q);
+  extend_64bit(step1[1], temp1);
+  step2[2] = multiplication_round_shift(temp1, cp_24q_24q);
+  step2[3] = multiplication_round_shift(temp1, cp_8q_8q);
+  step2[4] = _mm_add_epi32(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
+  step2[7] = _mm_add_epi32(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi32(step2[0], step2[3]);
+  step1[1] = _mm_add_epi32(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi32(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
+  step1[4] = step2[4];
+  temp2[0] = _mm_sub_epi32(step2[6], step2[5]);
+  extend_64bit(temp2[0], temp1);
+  step1[5] = multiplication_round_shift(temp1, cp_16q_16q);
+  temp2[0] = _mm_add_epi32(step2[6], step2[5]);
+  extend_64bit(temp2[0], temp1);
+  step1[6] = multiplication_round_shift(temp1, cp_16q_16q);
+  step1[7] = step2[7];
+
+  // stage 4
+  io[0] = _mm_add_epi32(step1[0], step1[7]);
+  io[1] = _mm_add_epi32(step1[1], step1[6]);
+  io[2] = _mm_add_epi32(step1[2], step1[5]);
+  io[3] = _mm_add_epi32(step1[3], step1[4]);
+  io[4] = _mm_sub_epi32(step1[3], step1[4]);
+  io[5] = _mm_sub_epi32(step1[2], step1[5]);
+  io[6] = _mm_sub_epi32(step1[1], step1[6]);
+  io[7] = _mm_sub_epi32(step1[0], step1[7]);
+}
+
+void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+  io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], io[4]);
+    io_short[1] = _mm_packs_epi32(io[1], io[5]);
+    io_short[2] = _mm_packs_epi32(io[2], io[6]);
+    io_short[3] = _mm_packs_epi32(io[3], io[7]);
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    io_short[4] = _mm_packs_epi32(io[8], io[12]);
+    io_short[5] = _mm_packs_epi32(io[9], io[13]);
+    io_short[6] = _mm_packs_epi32(io[10], io[14]);
+    io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+    idct8_sse2(io_short);
+    idct8_sse2(io_short);
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    highbd_idct8x8_half1d(io);
+
+    io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+    io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+    io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+    io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+    io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+    io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+    io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+    io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+    highbd_idct8x8_half1d(&io[8]);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    io[4] = io[8];
+    io[5] = io[9];
+    io[6] = io[10];
+    io[7] = io[11];
+    highbd_idct8x8_half1d(io);
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    highbd_idct8x8_half1d(&io[8]);
+
+    io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+    io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+    io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+    io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+    io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+    io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+    io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+    io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+  }
+
+  recon_and_store_8(io, dest, stride, bd);
+}
+
+void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+                                      int stride, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i io[16];
+
+  io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+  io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+  io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+  io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+
+  if (bd == 8) {
+    __m128i io_short[8];
+
+    io_short[0] = _mm_packs_epi32(io[0], zero);
+    io_short[1] = _mm_packs_epi32(io[1], zero);
+    io_short[2] = _mm_packs_epi32(io[2], zero);
+    io_short[3] = _mm_packs_epi32(io[3], zero);
+
+    idct8x8_12_add_kernel_ssse3(io_short);
+    round_shift_8x8(io_short, io);
+  } else {
+    __m128i temp[4];
+
+    highbd_idct8x8_12_half1d(io);
+
+    temp[0] = io[4];
+    temp[1] = io[5];
+    temp[2] = io[6];
+    temp[3] = io[7];
+    highbd_idct8x8_12_half1d(io);
+
+    io[8] = temp[0];
+    io[9] = temp[1];
+    io[10] = temp[2];
+    io[11] = temp[3];
+    highbd_idct8x8_12_half1d(&io[8]);
+
+    io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16));
+    io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16));
+    io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16));
+    io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16));
+    io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16));
+    io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16));
+    io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
+    io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
+  }
+
+  recon_and_store_8(io, dest, stride, bd);
+}
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -17,8 +17,14 @@
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
-static INLINE __m128i wraplow_16bit(const __m128i in0, const __m128i in1,
-                                    const __m128i rounding) {
+static INLINE void extend_64bit(const __m128i in,
+                                __m128i *const out /*out[2]*/) {
+  out[0] = _mm_unpacklo_epi32(in, in);  // 0, 0, 1, 1
+  out[1] = _mm_unpackhi_epi32(in, in);  // 2, 2, 3, 3
+}
+
+static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1,
+                                           const __m128i rounding) {
   __m128i temp[2];
   temp[0] = _mm_add_epi32(in0, rounding);
   temp[1] = _mm_add_epi32(in1, rounding);
@@ -27,6 +33,16 @@
   return _mm_packs_epi32(temp[0], temp[1]);
 }
 
+static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1,
+                                           const __m128i rounding) {
+  __m128i temp[2];
+  temp[0] = _mm_add_epi32(in0, rounding);
+  temp[1] = _mm_add_epi32(in1, rounding);
+  temp[0] = _mm_srai_epi32(temp[0], 5);
+  temp[1] = _mm_srai_epi32(temp[1], 5);
+  return _mm_packs_epi32(temp[0], temp[1]);
+}
+
 static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
   const __m128i t = _mm_add_epi64(
       in,
@@ -40,24 +56,24 @@
   return _mm_unpacklo_epi32(t0, t1);                // 0, 1, 2, 3
 }
 
-static INLINE __m128i add_dc_clamp(const __m128i *const min,
-                                   const __m128i *const max,
-                                   const __m128i *const dc,
-                                   const __m128i *const in) {
-  __m128i out;
-  out = _mm_adds_epi16(*in, *dc);
-  out = _mm_max_epi16(out, *min);
-  out = _mm_min_epi16(out, *max);
-  return out;
+static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
+                                const int bd) {
+  const __m128i zero = _mm_set1_epi16(0);
+  // Faster than _mm_set1_epi16((1 << bd) - 1).
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  __m128i d;
+
+  d = _mm_adds_epi16(in0, in1);
+  d = _mm_max_epi16(d, zero);
+  d = _mm_min_epi16(d, max);
+
+  return d;
 }
 
 static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
                                             uint16_t *dest, int stride, int bd,
                                             const int size) {
-  const __m128i zero = _mm_setzero_si128();
-  // Faster than _mm_set1_epi16((1 << bd) - 1).
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
   int a1, i, j;
   tran_low_t out;
   __m128i dc, d;
@@ -70,7 +86,7 @@
   for (i = 0; i < size; ++i) {
     for (j = 0; j < (size >> 3); ++j) {
       d = _mm_load_si128((const __m128i *)(&dest[j * 8]));
-      d = add_dc_clamp(&zero, &max, &dc, &d);
+      d = add_clamp(d, dc, bd);
       _mm_store_si128((__m128i *)(&dest[j * 8]), d);
     }
     dest += stride;
@@ -77,36 +93,47 @@
   }
 }
 
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
-  __m128i ubounded, retval;
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
-  ubounded = _mm_cmpgt_epi16(value, max);
-  retval = _mm_andnot_si128(ubounded, value);
-  ubounded = _mm_and_si128(ubounded, max);
-  retval = _mm_or_si128(retval, ubounded);
-  retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
-  return retval;
+static INLINE void recon_and_store_4_dual(const __m128i in,
+                                          uint16_t *const dest,
+                                          const int stride, const int bd) {
+  __m128i d;
+
+  d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
+  d = _mm_castps_si128(
+      _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
+  d = add_clamp(d, in, bd);
+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d);
+  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
 }
 
-static INLINE void recon_and_store_4(uint16_t *const dest,
-                                     const __m128i *const io, const int stride,
-                                     int bd) {
-  __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
-  __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
-  d0 =
-      _mm_unpacklo_epi64(d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
-  d2 = _mm_unpacklo_epi64(
-      d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
-  d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);
-  d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
-  _mm_storel_epi64((__m128i *)dest, d0);
-  d0 = _mm_srli_si128(d0, 8);
-  _mm_storel_epi64((__m128i *)(dest + stride), d0);
-  _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
-  d2 = _mm_srli_si128(d2, 8);
-  _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+static INLINE void recon_and_store_4(const __m128i *const in, uint16_t *dest,
+                                     const int stride, const int bd) {
+  recon_and_store_4_dual(in[0], dest, stride, bd);
+  dest += 2 * stride;
+  recon_and_store_4_dual(in[1], dest, stride, bd);
+}
+
+static INLINE void recon_and_store_8_kernel(const __m128i in,
+                                            uint16_t **const dest,
+                                            const int stride, const int bd) {
+  __m128i d;
+
+  d = _mm_load_si128((const __m128i *)(*dest));
+  d = add_clamp(d, in, bd);
+  _mm_store_si128((__m128i *)(*dest), d);
+  *dest += stride;
+}
+
+static INLINE void recon_and_store_8(const __m128i *const in, uint16_t *dest,
+                                     const int stride, const int bd) {
+  recon_and_store_8_kernel(in[0], &dest, stride, bd);
+  recon_and_store_8_kernel(in[1], &dest, stride, bd);
+  recon_and_store_8_kernel(in[2], &dest, stride, bd);
+  recon_and_store_8_kernel(in[3], &dest, stride, bd);
+  recon_and_store_8_kernel(in[4], &dest, stride, bd);
+  recon_and_store_8_kernel(in[5], &dest, stride, bd);
+  recon_and_store_8_kernel(in[6], &dest, stride, bd);
+  recon_and_store_8_kernel(in[7], &dest, stride, bd);
 }
 
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
--- /dev/null
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h
@@ -1,0 +1,60 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
+
+#include <smmintrin.h>  // SSE4.1
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/inv_txfm.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE __m128i multiplication_round_shift(const __m128i *const in,
+                                                 const __m128i cospi) {
+  __m128i t0, t1;
+  t0 = _mm_mul_epi32(in[0], cospi);
+  t1 = _mm_mul_epi32(in[1], cospi);
+  t0 = dct_const_round_shift_64bit(t0);
+  t1 = dct_const_round_shift_64bit(t1);
+  return pack_4(t0, t1);
+}
+
+static INLINE void multiplication_and_add_2_ssse4_1(const __m128i *const in0,
+                                                    const __m128i *const in1,
+                                                    const __m128i *const cst0,
+                                                    const __m128i *const cst1,
+                                                    __m128i *const out0,
+                                                    __m128i *const out1) {
+  __m128i temp1[4], temp2[4];
+  extend_64bit(*in0, temp1);
+  extend_64bit(*in1, temp2);
+  temp1[2] = _mm_mul_epi32(temp1[0], *cst1);
+  temp1[3] = _mm_mul_epi32(temp1[1], *cst1);
+  temp1[0] = _mm_mul_epi32(temp1[0], *cst0);
+  temp1[1] = _mm_mul_epi32(temp1[1], *cst0);
+  temp2[2] = _mm_mul_epi32(temp2[0], *cst0);
+  temp2[3] = _mm_mul_epi32(temp2[1], *cst0);
+  temp2[0] = _mm_mul_epi32(temp2[0], *cst1);
+  temp2[1] = _mm_mul_epi32(temp2[1], *cst1);
+  temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]);
+  temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]);
+  temp2[0] = _mm_add_epi64(temp1[2], temp2[2]);
+  temp2[1] = _mm_add_epi64(temp1[3], temp2[3]);
+  temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+  temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+  temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+  temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+  *out0 = pack_4(temp1[0], temp1[1]);
+  *out1 = pack_4(temp2[0], temp2[1]);
+}
+
+#endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -152,28 +152,34 @@
   _mm_storel_epi64((__m128i *)(dest), d0);
 }
 
+static INLINE void round_shift_8x8(const __m128i *const in,
+                                   __m128i *const out) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+
+  out[0] = _mm_add_epi16(in[0], final_rounding);
+  out[1] = _mm_add_epi16(in[1], final_rounding);
+  out[2] = _mm_add_epi16(in[2], final_rounding);
+  out[3] = _mm_add_epi16(in[3], final_rounding);
+  out[4] = _mm_add_epi16(in[4], final_rounding);
+  out[5] = _mm_add_epi16(in[5], final_rounding);
+  out[6] = _mm_add_epi16(in[6], final_rounding);
+  out[7] = _mm_add_epi16(in[7], final_rounding);
+
+  out[0] = _mm_srai_epi16(out[0], 5);
+  out[1] = _mm_srai_epi16(out[1], 5);
+  out[2] = _mm_srai_epi16(out[2], 5);
+  out[3] = _mm_srai_epi16(out[3], 5);
+  out[4] = _mm_srai_epi16(out[4], 5);
+  out[5] = _mm_srai_epi16(out[5], 5);
+  out[6] = _mm_srai_epi16(out[6], 5);
+  out[7] = _mm_srai_epi16(out[7], 5);
+}
+
 static INLINE void write_buffer_8x8(const __m128i *const in,
                                     uint8_t *const dest, const int stride) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
   __m128i t[8];
-  // Final rounding and shift
-  t[0] = _mm_adds_epi16(in[0], final_rounding);
-  t[1] = _mm_adds_epi16(in[1], final_rounding);
-  t[2] = _mm_adds_epi16(in[2], final_rounding);
-  t[3] = _mm_adds_epi16(in[3], final_rounding);
-  t[4] = _mm_adds_epi16(in[4], final_rounding);
-  t[5] = _mm_adds_epi16(in[5], final_rounding);
-  t[6] = _mm_adds_epi16(in[6], final_rounding);
-  t[7] = _mm_adds_epi16(in[7], final_rounding);
 
-  t[0] = _mm_srai_epi16(t[0], 5);
-  t[1] = _mm_srai_epi16(t[1], 5);
-  t[2] = _mm_srai_epi16(t[2], 5);
-  t[3] = _mm_srai_epi16(t[3], 5);
-  t[4] = _mm_srai_epi16(t[4], 5);
-  t[5] = _mm_srai_epi16(t[5], 5);
-  t[6] = _mm_srai_epi16(t[6], 5);
-  t[7] = _mm_srai_epi16(t[7], 5);
+  round_shift_8x8(in, t);
 
   recon_and_store(dest + 0 * stride, t[0]);
   recon_and_store(dest + 1 * stride, t[1]);
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -12,103 +12,21 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_ssse3.h"
 #include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
 void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
                               int stride) {
-  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
-  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
-  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
-  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
-  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
-  const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
-  const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
-  const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
-  const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
-  const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
-  const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
-  __m128i in[8], step1[8], step2[8], tmp[4];
+  __m128i io[8];
 
-  in[0] = load_input_data4(input + 0 * 8);
-  in[1] = load_input_data4(input + 1 * 8);
-  in[2] = load_input_data4(input + 2 * 8);
-  in[3] = load_input_data4(input + 3 * 8);
+  io[0] = load_input_data4(input + 0 * 8);
+  io[1] = load_input_data4(input + 1 * 8);
+  io[2] = load_input_data4(input + 2 * 8);
+  io[3] = load_input_data4(input + 3 * 8);
 
-  // pass 1
-
-  transpose_16bit_4x4(in, in);
-  // in[0]: 00 10 20 30  01 11 21 31
-  // in[1]: 02 12 22 32  03 13 23 33
-
-  // stage 1
-  tmp[0] = _mm_unpacklo_epi64(in[0], in[0]);
-  tmp[1] = _mm_unpackhi_epi64(in[0], in[0]);
-  tmp[2] = _mm_unpacklo_epi64(in[1], in[1]);
-  tmp[3] = _mm_unpackhi_epi64(in[1], in[1]);
-  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
-  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
-
-  // stage 2
-  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
-  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
-  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
-  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
-  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
-
-  // stage 3
-  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
-  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
-  tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
-  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
-  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
-  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0
-
-  // stage 4
-  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
-  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
-  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
-  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
-
-  // pass 2
-
-  idct8x8_12_transpose_16bit_4x8(tmp, in);
-
-  // stage 1
-  step1[4] = _mm_mulhrs_epi16(in[1], cospi_28_64d);
-  step1[7] = _mm_mulhrs_epi16(in[1], cospi_4_64d);
-  step1[5] = _mm_mulhrs_epi16(in[3], cospi_n20_64d);
-  step1[6] = _mm_mulhrs_epi16(in[3], cospi_12_64d);
-
-  // stage 2
-  step2[0] = _mm_mulhrs_epi16(in[0], cospi_16_64d);  // step2[1] = step2[0]
-  step2[2] = _mm_mulhrs_epi16(in[2], cospi_24_64d);
-  step2[3] = _mm_mulhrs_epi16(in[2], cospi_8_64d);
-  step2[4] = _mm_add_epi16(step1[4], step1[5]);
-  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
-  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
-  step2[7] = _mm_add_epi16(step1[7], step1[6]);
-
-  // stage 3
-  step1[0] = _mm_add_epi16(step2[0], step2[3]);
-  step1[1] = _mm_add_epi16(step2[0], step2[2]);
-  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
-  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
-  multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
-                           &step1[5], &step1[6]);
-
-  // stage 4
-  in[0] = _mm_add_epi16(step1[0], step2[7]);
-  in[1] = _mm_add_epi16(step1[1], step1[6]);
-  in[2] = _mm_add_epi16(step1[2], step1[5]);
-  in[3] = _mm_add_epi16(step1[3], step2[4]);
-  in[4] = _mm_sub_epi16(step1[3], step2[4]);
-  in[5] = _mm_sub_epi16(step1[2], step1[5]);
-  in[6] = _mm_sub_epi16(step1[1], step1[6]);
-  in[7] = _mm_sub_epi16(step1[0], step2[7]);
-
-  write_buffer_8x8(in, dest, stride);
+  idct8x8_12_add_kernel_ssse3(io);
+  write_buffer_8x8(io, dest, stride);
 }
 
 static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
--- /dev/null
+++ b/vpx_dsp/x86/inv_txfm_ssse3.h
@@ -1,0 +1,109 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_
+#define VPX_DSP_X86_INV_TXFM_SSSE3_H_
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) {
+  const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+  const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+  const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+  const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
+  const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
+  const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
+  const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
+  const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
+  const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
+  const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
+  __m128i step1[8], step2[8], tmp[4];
+
+  // pass 1
+
+  transpose_16bit_4x4(io, io);
+  // io[0]: 00 10 20 30  01 11 21 31
+  // io[1]: 02 12 22 32  03 13 23 33
+
+  // stage 1
+  tmp[0] = _mm_unpacklo_epi64(io[0], io[0]);
+  tmp[1] = _mm_unpackhi_epi64(io[0], io[0]);
+  tmp[2] = _mm_unpacklo_epi64(io[1], io[1]);
+  tmp[3] = _mm_unpackhi_epi64(io[1], io[1]);
+  step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d);    // step1 4&7
+  step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d);  // step1 5&6
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d);  // step2 0&1
+  step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d);     // step2 3&2
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);       // step2 4&7
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);       // step2 5&6
+  step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]);  // step2 6
+
+  // stage 3
+  tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+  step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]);  // step1 5&6
+  tmp[0] = _mm_add_epi16(step2[0], step2[2]);                      // step1 0&1
+  tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                      // step1 3&2
+  step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                   // step1 2&1
+  step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                   // step1 3&0
+
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
+
+  // pass 2
+
+  idct8x8_12_transpose_16bit_4x8(tmp, io);
+
+  // stage 1
+  step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d);
+  step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d);
+  step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d);
+  step1[6] = _mm_mulhrs_epi16(io[3], cospi_12_64d);
+
+  // stage 2
+  step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d);  // step2[1] = step2[0]
+  step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d);
+  step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d);
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+  // stage 3
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[0], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+                           &step1[5], &step1[6]);
+
+  // stage 4
+  io[0] = _mm_add_epi16(step1[0], step2[7]);
+  io[1] = _mm_add_epi16(step1[1], step1[6]);
+  io[2] = _mm_add_epi16(step1[2], step1[5]);
+  io[3] = _mm_add_epi16(step1[3], step2[4]);
+  io[4] = _mm_sub_epi16(step1[3], step2[4]);
+  io[5] = _mm_sub_epi16(step1[2], step1[5]);
+  io[6] = _mm_sub_epi16(step1[1], step1[6]);
+  io[7] = _mm_sub_epi16(step1[0], step2[7]);
+}
+
+#endif  // VPX_DSP_X86_INV_TXFM_SSSE3_H_
--- a/vpx_dsp/x86/transpose_sse2.h
+++ b/vpx_dsp/x86/transpose_sse2.h
@@ -21,16 +21,16 @@
   // in[2]: 20 21 22 23  XX XX XX XX
   // in[3]: 30 31 32 33  XX XX XX XX
   // to:
-  // tr0_0: 00 10 01 11  02 12 03 13
-  // tr0_1: 20 30 21 31  22 32 23 33
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
 
   // Unpack 32 bit elements resulting in:
   // out[0]: 00 10 20 30  01 11 21 31
   // out[1]: 02 12 22 32  03 13 23 33
-  out[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  out[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  out[0] = _mm_unpacklo_epi32(a0, a1);
+  out[1] = _mm_unpackhi_epi32(a0, a1);
 }
 
 static INLINE void transpose_16bit_4x8(const __m128i *const in,
@@ -45,24 +45,24 @@
   // in[6]: 60 61 62 63  XX XX XX XX
   // in[7]: 70 71 72 73  XX XX XX XX
   // to:
-  // tr0_0: 00 10 01 11  02 12 03 13
-  // tr0_1: 20 30 21 31  22 32 23 33
-  // tr0_2: 40 50 41 51  42 52 43 53
-  // tr0_3: 60 70 61 71  62 72 63 73
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
 
   // Unpack 32 bit elements resulting in:
-  // tr1_0: 00 10 20 30  01 11 21 31
-  // tr1_1: 40 50 60 70  41 51 61 71
-  // tr1_2: 02 12 22 32  03 13 23 33
-  // tr1_3: 42 52 62 72  43 53 63 73
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 02 12 22 32  03 13 23 33
+  // b3: 42 52 62 72  43 53 63 73
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
 
   // Unpack 64 bit elements resulting in:
   // out[0]: 00 10 20 30  40 50 60 70
@@ -69,10 +69,10 @@
   // out[1]: 01 11 21 31  41 51 61 71
   // out[2]: 02 12 22 32  42 52 62 72
   // out[3]: 03 13 23 33  43 53 63 73
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b2, b3);
+  out[3] = _mm_unpackhi_epi64(b2, b3);
 }
 
 static INLINE void transpose_16bit_8x8(const __m128i *const in,
@@ -87,40 +87,40 @@
   // in[6]: 60 61 62 63  64 65 66 67
   // in[7]: 70 71 72 73  74 75 76 77
   // to:
-  // tr0_0: 00 10 01 11  02 12 03 13
-  // tr0_1: 20 30 21 31  22 32 23 33
-  // tr0_2: 40 50 41 51  42 52 43 53
-  // tr0_3: 60 70 61 71  62 72 63 73
-  // tr0_4: 04 14 05 15  06 16 07 17
-  // tr0_5: 24 34 25 35  26 36 27 37
-  // tr0_6: 44 54 45 55  46 56 47 57
-  // tr0_7: 64 74 65 75  66 76 67 77
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_2 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_3 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i tr0_4 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i tr0_5 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a2:    40 50 41 51  42 52 43 53
+  // a3:    60 70 61 71  62 72 63 73
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  // a6:    44 54 45 55  46 56 47 57
+  // a7:    64 74 65 75  66 76 67 77
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
 
   // Unpack 32 bit elements resulting in:
-  // tr1_0: 00 10 20 30  01 11 21 31
-  // tr1_1: 40 50 60 70  41 51 61 71
-  // tr1_2: 04 14 24 34  05 15 25 35
-  // tr1_3: 44 54 64 74  45 55 65 75
-  // tr1_4: 02 12 22 32  03 13 23 33
-  // tr1_5: 42 52 62 72  43 53 63 73
-  // tr1_6: 06 16 26 36  07 17 27 37
-  // tr1_7: 46 56 66 76  47 57 67 77
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_3 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-  const __m128i tr1_4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_5 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+  // b0: 00 10 20 30  01 11 21 31
+  // b1: 40 50 60 70  41 51 61 71
+  // b2: 04 14 24 34  05 15 25 35
+  // b3: 44 54 64 74  45 55 65 75
+  // b4: 02 12 22 32  03 13 23 33
+  // b5: 42 52 62 72  43 53 63 73
+  // b6: 06 16 26 36  07 17 27 37
+  // b7: 46 56 66 76  47 57 67 77
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
 
   // Unpack 64 bit elements resulting in:
   // out[0]: 00 10 20 30  40 50 60 70
@@ -131,14 +131,14 @@
   // out[5]: 05 15 25 35  45 55 65 75
   // out[6]: 06 16 26 36  46 56 66 76
   // out[7]: 07 17 27 37  47 57 67 77
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
-  out[2] = _mm_unpacklo_epi64(tr1_4, tr1_5);
-  out[3] = _mm_unpackhi_epi64(tr1_4, tr1_5);
-  out[4] = _mm_unpacklo_epi64(tr1_2, tr1_3);
-  out[5] = _mm_unpackhi_epi64(tr1_2, tr1_3);
-  out[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
-  out[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+  out[0] = _mm_unpacklo_epi64(b0, b1);
+  out[1] = _mm_unpackhi_epi64(b0, b1);
+  out[2] = _mm_unpacklo_epi64(b4, b5);
+  out[3] = _mm_unpackhi_epi64(b4, b5);
+  out[4] = _mm_unpacklo_epi64(b2, b3);
+  out[5] = _mm_unpackhi_epi64(b2, b3);
+  out[6] = _mm_unpacklo_epi64(b6, b7);
+  out[7] = _mm_unpackhi_epi64(b6, b7);
 }
 
 // Transpose in-place
@@ -160,33 +160,81 @@
   left[15] = tbuf[7];
 }
 
-static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
-                                       __m128i *const a2, __m128i *const a3) {
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+                                       __m128i *const out) {
   // Unpack 32 bit elements. Goes from:
-  // a0: 00 01 02 03
-  // a1: 10 11 12 13
-  // a2: 20 21 22 23
-  // a3: 30 31 32 33
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
   // to:
-  // b0: 00 10 01 11
-  // b1: 20 30 21 31
-  // b2: 02 12 03 13
-  // b3: 22 32 23 33
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
 
-  const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1);
-  const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3);
-  const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1);
-  const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3);
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
 
   // Unpack 64 bit elements resulting in:
-  // a0: 00 10 20 30
-  // a1: 01 11 21 31
-  // a2: 02 12 22 32
-  // a3: 03 13 23 33
-  *a0 = _mm_unpacklo_epi64(b0, b1);
-  *a1 = _mm_unpackhi_epi64(b0, b1);
-  *a2 = _mm_unpacklo_epi64(b2, b3);
-  *a3 = _mm_unpackhi_epi64(b2, b3);
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+                                         __m128i *const out) {
+  // Unpack 32 bit elements. Goes from:
+  // in[0]: 00 01 02 03
+  // in[1]: 10 11 12 13
+  // in[2]: 20 21 22 23
+  // in[3]: 30 31 32 33
+  // in[4]: 04 05 06 07
+  // in[5]: 14 15 16 17
+  // in[6]: 24 25 26 27
+  // in[7]: 34 35 36 37
+  // to:
+  // a0:    00 10 01 11
+  // a1:    20 30 21 31
+  // a2:    02 12 03 13
+  // a3:    22 32 23 33
+  // a4:    04 14 05 15
+  // a5:    24 34 25 35
+  // a6:    06 16 07 17
+  // a7:    26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
+  // out[4]: 04 14 24 34
+  // out[5]: 05 15 25 35
+  // out[6]: 06 16 26 36
+  // out[7]: 07 17 27 37
+  out[0] = _mm_unpacklo_epi64(a0, a1);
+  out[1] = _mm_unpackhi_epi64(a0, a1);
+  out[2] = _mm_unpacklo_epi64(a2, a3);
+  out[3] = _mm_unpackhi_epi64(a2, a3);
+  out[4] = _mm_unpacklo_epi64(a4, a5);
+  out[5] = _mm_unpackhi_epi64(a4, a5);
+  out[6] = _mm_unpacklo_epi64(a6, a7);
+  out[7] = _mm_unpackhi_epi64(a6, a7);
 }
 
 #endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_