shithub: libvpx

Download patch

ref: 0c613312446c1bb867fad1a093bb099ce29dcb14
parent: cbb83ba4aa99b40b0b4a2a407bfd6d0d8be87d1f
author: Linfeng Zhang <linfengz@google.com>
date: Fri Aug 4 11:10:12 EDT 2017

Update high bitdepth 16x16 idct x86 code

Prepare for high bitdepth 16x16 idct sse4.1 code.
Just moving and renaming functions.

BUG=webm:1412

Change-Id: Ie056fe4494b1f299491968beadcef990e2ab714a

--- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c
@@ -16,43 +16,6 @@
 #include "vpx_dsp/x86/transpose_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
-static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
-  const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
-  const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
-  return _mm_packs_epi32(t0, t1);
-}
-
-static INLINE void highbd_write_buffer_8x1(uint16_t *dest, const __m128i in,
-                                           const int bd) {
-  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-  __m128i out;
-
-  out = _mm_adds_epi16(in, final_rounding);
-  out = _mm_srai_epi16(out, 6);
-  recon_and_store_8_kernel(out, &dest, 0, bd);
-}
-
-static INLINE void recon_and_store_4_kernel(const __m128i in,
-                                            uint16_t *const dest,
-                                            const int bd) {
-  __m128i d;
-
-  d = _mm_loadl_epi64((const __m128i *)dest);
-  d = add_clamp(d, in, bd);
-  _mm_storel_epi64((__m128i *)dest, d);
-}
-
-static INLINE void highbd_write_buffer_4x1(uint16_t *const dest,
-                                           const __m128i in, const int bd) {
-  const __m128i final_rounding = _mm_set1_epi32(1 << 5);
-  __m128i out;
-
-  out = _mm_add_epi32(in, final_rounding);
-  out = _mm_srai_epi32(out, 6);
-  out = _mm_packs_epi32(out, out);
-  recon_and_store_4_kernel(out, dest, bd);
-}
-
 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
                                              __m128i *const out) {
   __m128i temp1[2], temp2, sign[2];
@@ -107,26 +70,6 @@
   out[15] = in[15];
 }
 
-static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
-                                             __m128i *const out) {
-  out[0] = _mm_add_epi32(in[0], in[15]);
-  out[1] = _mm_add_epi32(in[1], in[14]);
-  out[2] = _mm_add_epi32(in[2], in[13]);
-  out[3] = _mm_add_epi32(in[3], in[12]);
-  out[4] = _mm_add_epi32(in[4], in[11]);
-  out[5] = _mm_add_epi32(in[5], in[10]);
-  out[6] = _mm_add_epi32(in[6], in[9]);
-  out[7] = _mm_add_epi32(in[7], in[8]);
-  out[8] = _mm_sub_epi32(in[7], in[8]);
-  out[9] = _mm_sub_epi32(in[6], in[9]);
-  out[10] = _mm_sub_epi32(in[5], in[10]);
-  out[11] = _mm_sub_epi32(in[4], in[11]);
-  out[12] = _mm_sub_epi32(in[3], in[12]);
-  out[13] = _mm_sub_epi32(in[2], in[13]);
-  out[14] = _mm_sub_epi32(in[1], in[14]);
-  out[15] = _mm_sub_epi32(in[0], in[15]);
-}
-
 static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) {
   __m128i step1[16], step2[16];
   __m128i temp1[4], temp2, sign[2];
@@ -314,14 +257,14 @@
       input += 128;
     }
 
-    for (i = 0; i < 2; i++) {
+    for (i = 0; i < 16; i += 8) {
       int j;
-      transpose_16bit_8x8(l + i * 8, out);
-      transpose_16bit_8x8(r + i * 8, out + 8);
+      transpose_16bit_8x8(l + i, out);
+      transpose_16bit_8x8(r + i, out + 8);
       idct16_8col(out);
 
       for (j = 0; j < 16; ++j) {
-        highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
       }
       dest += 8;
     }
@@ -354,32 +297,32 @@
       input += 4 * 16;
     }
 
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < 16; i += 4) {
       int j;
-      out[0] = all[0][4 * i + 0];
-      out[1] = all[1][4 * i + 0];
-      out[2] = all[0][4 * i + 1];
-      out[3] = all[1][4 * i + 1];
-      out[4] = all[0][4 * i + 2];
-      out[5] = all[1][4 * i + 2];
-      out[6] = all[0][4 * i + 3];
-      out[7] = all[1][4 * i + 3];
+      out[0] = all[0][i + 0];
+      out[1] = all[1][i + 0];
+      out[2] = all[0][i + 1];
+      out[3] = all[1][i + 1];
+      out[4] = all[0][i + 2];
+      out[5] = all[1][i + 2];
+      out[6] = all[0][i + 3];
+      out[7] = all[1][i + 3];
       transpose_32bit_8x4(out, out);
 
-      out[8] = all[2][4 * i + 0];
-      out[9] = all[3][4 * i + 0];
-      out[10] = all[2][4 * i + 1];
-      out[11] = all[3][4 * i + 1];
-      out[12] = all[2][4 * i + 2];
-      out[13] = all[3][4 * i + 2];
-      out[14] = all[2][4 * i + 3];
-      out[15] = all[3][4 * i + 3];
+      out[8] = all[2][i + 0];
+      out[9] = all[3][i + 0];
+      out[10] = all[2][i + 1];
+      out[11] = all[3][i + 1];
+      out[12] = all[2][i + 2];
+      out[13] = all[3][i + 2];
+      out[14] = all[2][i + 3];
+      out[15] = all[3][i + 3];
       transpose_32bit_8x4(out + 8, out + 8);
 
       highbd_idct16_4col(out);
 
       for (j = 0; j < 16; ++j) {
-        highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
       }
       dest += 4;
     }
@@ -414,9 +357,9 @@
     in[15] = _mm_setzero_si128();
     idct16_8col(in);
 
-    for (i = 0; i < 2; i++) {
+    for (i = 0; i < 16; i += 8) {
       int j;
-      transpose_16bit_8x8(in + i * 8, out);
+      transpose_16bit_8x8(in + i, out);
       out[8] = _mm_setzero_si128();
       out[9] = _mm_setzero_si128();
       out[10] = _mm_setzero_si128();
@@ -428,7 +371,7 @@
       idct16_8col(out);
 
       for (j = 0; j < 16; ++j) {
-        highbd_write_buffer_8x1(dest + j * stride, out[j], bd);
+        highbd_write_buffer_8(dest + j * stride, out[j], bd);
       }
       dest += 8;
     }
@@ -450,21 +393,21 @@
       input += 4 * 16;
     }
 
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < 16; i += 4) {
       int j;
-      out[0] = all[0][4 * i + 0];
-      out[1] = all[1][4 * i + 0];
-      out[2] = all[0][4 * i + 1];
-      out[3] = all[1][4 * i + 1];
-      out[4] = all[0][4 * i + 2];
-      out[5] = all[1][4 * i + 2];
-      out[6] = all[0][4 * i + 3];
-      out[7] = all[1][4 * i + 3];
+      out[0] = all[0][i + 0];
+      out[1] = all[1][i + 0];
+      out[2] = all[0][i + 1];
+      out[3] = all[1][i + 1];
+      out[4] = all[0][i + 2];
+      out[5] = all[1][i + 2];
+      out[6] = all[0][i + 3];
+      out[7] = all[1][i + 3];
       transpose_32bit_8x4(out, out);
       highbd_idct16x16_38_4col(out);
 
       for (j = 0; j < 16; ++j) {
-        highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
       }
       dest += 4;
     }
@@ -486,12 +429,12 @@
 
     idct16x16_10_pass1(in, l);
 
-    for (i = 0; i < 2; i++) {
+    for (i = 0; i < 16; i += 8) {
       int j;
-      idct16x16_10_pass2(l + 8 * i, in);
+      idct16x16_10_pass2(l + i, in);
 
       for (j = 0; j < 16; ++j) {
-        highbd_write_buffer_8x1(dest + j * stride, in[j], bd);
+        highbd_write_buffer_8(dest + j * stride, in[j], bd);
       }
       dest += 8;
     }
@@ -509,13 +452,13 @@
       input += 4 * 16;
     }
 
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < 16; i += 4) {
       int j;
-      transpose_32bit_4x4(&all[0][4 * i], out);
+      transpose_32bit_4x4(&all[0][i], out);
       highbd_idct16x16_10_4col(out);
 
       for (j = 0; j < 16; ++j) {
-        highbd_write_buffer_4x1(dest + j * stride, out[j], bd);
+        highbd_write_buffer_4(dest + j * stride, out[j], bd);
       }
       dest += 4;
     }
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c
@@ -143,7 +143,7 @@
     io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
   }
 
-  recon_and_store_4(io, dest, stride, bd);
+  recon_and_store_4x4(io, dest, stride, bd);
 }
 
 void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
--- a/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct4x4_add_sse4.c
@@ -65,5 +65,5 @@
     io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
   }
 
-  recon_and_store_4(io, dest, stride, bd);
+  recon_and_store_4x4(io, dest, stride, bd);
 }
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c
@@ -176,7 +176,7 @@
     highbd_idct8x8_final_round(io);
   }
 
-  recon_and_store_8(io, dest, stride, bd);
+  recon_and_store_8x8(io, dest, stride, bd);
 }
 
 void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest,
@@ -219,7 +219,7 @@
     highbd_idct8x8_final_round(io);
   }
 
-  recon_and_store_8(io, dest, stride, bd);
+  recon_and_store_8x8(io, dest, stride, bd);
 }
 
 void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
--- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
+++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c
@@ -178,7 +178,7 @@
     highbd_idct8x8_final_round(io);
   }
 
-  recon_and_store_8(io, dest, stride, bd);
+  recon_and_store_8x8(io, dest, stride, bd);
 }
 
 void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest,
@@ -221,5 +221,5 @@
     highbd_idct8x8_final_round(io);
   }
 
-  recon_and_store_8(io, dest, stride, bd);
+  recon_and_store_8x8(io, dest, stride, bd);
 }
--- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -184,6 +184,27 @@
   io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16));
   io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16));
 }
+
+static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
+                                             __m128i *const out) {
+  out[0] = _mm_add_epi32(in[0], in[15]);
+  out[1] = _mm_add_epi32(in[1], in[14]);
+  out[2] = _mm_add_epi32(in[2], in[13]);
+  out[3] = _mm_add_epi32(in[3], in[12]);
+  out[4] = _mm_add_epi32(in[4], in[11]);
+  out[5] = _mm_add_epi32(in[5], in[10]);
+  out[6] = _mm_add_epi32(in[6], in[9]);
+  out[7] = _mm_add_epi32(in[7], in[8]);
+  out[8] = _mm_sub_epi32(in[7], in[8]);
+  out[9] = _mm_sub_epi32(in[6], in[9]);
+  out[10] = _mm_sub_epi32(in[5], in[10]);
+  out[11] = _mm_sub_epi32(in[4], in[11]);
+  out[12] = _mm_sub_epi32(in[3], in[12]);
+  out[13] = _mm_sub_epi32(in[2], in[13]);
+  out[14] = _mm_sub_epi32(in[1], in[14]);
+  out[15] = _mm_sub_epi32(in[0], in[15]);
+}
+
 static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
                                 const int bd) {
   const __m128i zero = _mm_set1_epi16(0);
@@ -221,11 +242,19 @@
   }
 }
 
-static INLINE void recon_and_store_4_dual(const __m128i in,
-                                          uint16_t *const dest,
-                                          const int stride, const int bd) {
+static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest,
+                                     const int bd) {
   __m128i d;
 
+  d = _mm_loadl_epi64((const __m128i *)dest);
+  d = add_clamp(d, in, bd);
+  _mm_storel_epi64((__m128i *)dest, d);
+}
+
+static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest,
+                                       const int stride, const int bd) {
+  __m128i d;
+
   d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
   d = _mm_castps_si128(
       _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride)));
@@ -234,16 +263,15 @@
   _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d));
 }
 
-static INLINE void recon_and_store_4(const __m128i *const in, uint16_t *dest,
-                                     const int stride, const int bd) {
-  recon_and_store_4_dual(in[0], dest, stride, bd);
+static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest,
+                                       const int stride, const int bd) {
+  recon_and_store_4x2(in[0], dest, stride, bd);
   dest += 2 * stride;
-  recon_and_store_4_dual(in[1], dest, stride, bd);
+  recon_and_store_4x2(in[1], dest, stride, bd);
 }
 
-static INLINE void recon_and_store_8_kernel(const __m128i in,
-                                            uint16_t **const dest,
-                                            const int stride, const int bd) {
+static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest,
+                                     const int stride, const int bd) {
   __m128i d;
 
   d = _mm_load_si128((const __m128i *)(*dest));
@@ -252,16 +280,43 @@
   *dest += stride;
 }
 
-static INLINE void recon_and_store_8(const __m128i *const in, uint16_t *dest,
-                                     const int stride, const int bd) {
-  recon_and_store_8_kernel(in[0], &dest, stride, bd);
-  recon_and_store_8_kernel(in[1], &dest, stride, bd);
-  recon_and_store_8_kernel(in[2], &dest, stride, bd);
-  recon_and_store_8_kernel(in[3], &dest, stride, bd);
-  recon_and_store_8_kernel(in[4], &dest, stride, bd);
-  recon_and_store_8_kernel(in[5], &dest, stride, bd);
-  recon_and_store_8_kernel(in[6], &dest, stride, bd);
-  recon_and_store_8_kernel(in[7], &dest, stride, bd);
+static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest,
+                                       const int stride, const int bd) {
+  recon_and_store_8(in[0], &dest, stride, bd);
+  recon_and_store_8(in[1], &dest, stride, bd);
+  recon_and_store_8(in[2], &dest, stride, bd);
+  recon_and_store_8(in[3], &dest, stride, bd);
+  recon_and_store_8(in[4], &dest, stride, bd);
+  recon_and_store_8(in[5], &dest, stride, bd);
+  recon_and_store_8(in[6], &dest, stride, bd);
+  recon_and_store_8(in[7], &dest, stride, bd);
+}
+
+static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) {
+  const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0));
+  const __m128i t1 = _mm_load_si128((const __m128i *)(input + 4));
+  return _mm_packs_epi32(t0, t1);
+}
+
+static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in,
+                                         const int bd) {
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+  __m128i out;
+
+  out = _mm_adds_epi16(in, final_rounding);
+  out = _mm_srai_epi16(out, 6);
+  recon_and_store_8(out, &dest, 0, bd);
+}
+
+static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in,
+                                         const int bd) {
+  const __m128i final_rounding = _mm_set1_epi32(1 << 5);
+  __m128i out;
+
+  out = _mm_add_epi32(in, final_rounding);
+  out = _mm_srai_epi32(out, 6);
+  out = _mm_packs_epi32(out, out);
+  recon_and_store_4(out, dest, bd);
 }
 
 #endif  // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_