shithub: libvpx

Download patch

ref: 18e8baa5c08cb388c5b53ce0c93c865b1851db2c
parent: 31cb852a908ef36277a98b1a1beff0a8f6a7dc4f
author: Linfeng Zhang <linfengz@google.com>
date: Tue May 16 12:10:43 EDT 2017

Add transpose_32bit_4x4() and rename transpose_4x4() for vpx_dsp/x86

Change-Id: Ib57377f6cf6573c04720d3cc5dea4285362b4220

--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -58,7 +58,7 @@
     test = _mm_movemask_epi8(temp_mm);
 
     if (test) {
-      transpose_4x4(inptr);
+      transpose_16bit_4x4(inptr);
       sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
       sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
       inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -77,7 +77,7 @@
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[8], v[8];
 
-  transpose_4x4(in);
+  transpose_16bit_4x4(in);
   // stage 1
   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
   u[1] = _mm_unpackhi_epi16(in[0], in[1]);
@@ -115,7 +115,7 @@
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[8], v[8], in7;
 
-  transpose_4x4(in);
+  transpose_16bit_4x4(in);
   in7 = _mm_srli_si128(in[1], 8);
   in7 = _mm_add_epi16(in7, in[0]);
   in7 = _mm_sub_epi16(in7, in[1]);
--- a/vpx_dsp/x86/transpose_sse2.h
+++ b/vpx_dsp/x86/transpose_sse2.h
@@ -15,12 +15,41 @@
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
-static INLINE void transpose_4x4(__m128i *res) {
+static INLINE void transpose_16bit_4x4(__m128i *res) {  // in/out: res[0..1]
   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
   const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
 
   res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
   res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
+static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
+                                       __m128i *const a2, __m128i *const a3) {
+  // In-place transpose, step 1: interleave 32-bit elements. Goes from:
+  // a0: 00 01 02 03
+  // a1: 10 11 12 13
+  // a2: 20 21 22 23
+  // a3: 30 31 32 33
+  // to:
+  // b0: 00 10 01 11
+  // b1: 20 30 21 31
+  // b2: 02 12 03 13
+  // b3: 22 32 23 33
+
+  const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1);
+  const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3);
+  const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1);
+  const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3);
+
+  // Step 2: interleave 64-bit halves, written back through a0..a3:
+  // a0: 00 10 20 30
+  // a1: 01 11 21 31
+  // a2: 02 12 22 32
+  // a3: 03 13 23 33
+  *a0 = _mm_unpacklo_epi64(b0, b1);
+  *a1 = _mm_unpackhi_epi64(b0, b1);
+  *a2 = _mm_unpacklo_epi64(b2, b3);
+  *a3 = _mm_unpackhi_epi64(b2, b3);
 }
 
 #endif  // VPX_DSP_X86_TRANSPOSE_SSE2_H_