shithub: libvpx

ref: b8a4b5dd8d2ae895ff08e88c2cd2b9b8c8bf17c5
parent: 0d1c78230618be3967bee4f71e5e00c8b2bbd8ac
author: Linfeng Zhang <linfengz@google.com>
date: Wed Jun 21 11:18:17 EDT 2017

Cosmetics, 8x8 idct SSE2 optimization

Change-Id: Id21fa94fd323e36cd19a2d890bf4a0cafb7d964d

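For readers unfamiliar with the SSE2 idiom these routines are built on, here is a minimal standalone sketch (plain C, SSE2 intrinsics) of the constant-pair butterfly rotation that pair_set_epi16(), multiplication_and_add() and idct_calc_wraplow_sse2() wrap in the patch below: interleave two rows with unpack, multiply-accumulate against a packed (c0, c1) constant pair with _mm_madd_epi16, then round and shift back down to 16 bits. The numeric cospi values and all helper names in the sketch are illustrative assumptions, not libvpx's exact definitions.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative constants, assumed to approximate libvpx's cospi_k_64
 * tables, i.e. roughly round(2^14 * cos(k*pi/64)). */
#define COSPI_4_64 16069
#define COSPI_28_64 3196
#define SHIFT_BITS 14
#define ROUNDING (1 << (SHIFT_BITS - 1))

/* Pack the 16-bit pair (a, b) into every 32-bit lane, mirroring what
 * pair_set_epi16() does in the patch (sketch helper, not the library's). */
static __m128i pair_set_epi16_sketch(int16_t a, int16_t b) {
  return _mm_set_epi16(b, a, b, a, b, a, b, a);
}

/* One butterfly rotation on eight (x, y) element pairs:
 *   out_lo = (x*c0_lo + y*c1_lo + ROUNDING) >> SHIFT_BITS
 *   out_hi = (x*c0_hi + y*c1_hi + ROUNDING) >> SHIFT_BITS
 * This is the unpack + madd + round-shift pattern behind
 * multiplication_and_add()/idct_calc_wraplow_sse2(). */
static void butterfly_sketch(__m128i x, __m128i y, __m128i c_lo, __m128i c_hi,
                             __m128i *out_lo, __m128i *out_hi) {
  const __m128i rounding = _mm_set1_epi32(ROUNDING);
  const __m128i lo = _mm_unpacklo_epi16(x, y); /* x0 y0 x1 y1 x2 y2 x3 y3 */
  const __m128i hi = _mm_unpackhi_epi16(x, y); /* x4 y4 ... x7 y7 */
  __m128i t0 = _mm_madd_epi16(lo, c_lo); /* x*c0 + y*c1 per 32-bit lane */
  __m128i t1 = _mm_madd_epi16(hi, c_lo);
  __m128i t2 = _mm_madd_epi16(lo, c_hi);
  __m128i t3 = _mm_madd_epi16(hi, c_hi);
  t0 = _mm_srai_epi32(_mm_add_epi32(t0, rounding), SHIFT_BITS);
  t1 = _mm_srai_epi32(_mm_add_epi32(t1, rounding), SHIFT_BITS);
  t2 = _mm_srai_epi32(_mm_add_epi32(t2, rounding), SHIFT_BITS);
  t3 = _mm_srai_epi32(_mm_add_epi32(t3, rounding), SHIFT_BITS);
  *out_lo = _mm_packs_epi32(t0, t1); /* back to 8 saturated int16 results */
  *out_hi = _mm_packs_epi32(t2, t3);
}

int main(void) {
  int16_t row1[8] = { 100, -50, 30, 7, 0, 12, -9, 64 };
  int16_t row7[8] = { 25, 8, -40, 3, 90, -1, 17, -6 };
  int16_t s4[8], s7[8];
  const __m128i cp_28_n4 = pair_set_epi16_sketch(COSPI_28_64, -COSPI_4_64);
  const __m128i cp_4_28 = pair_set_epi16_sketch(COSPI_4_64, COSPI_28_64);
  __m128i step1_4, step1_7;

  /* Rotate rows 1 and 7 by the (cospi_28, -cospi_4)/(cospi_4, cospi_28)
   * pairs, analogous to the stage-1 step1[4]/step1[7] computation in idct8(). */
  butterfly_sketch(_mm_loadu_si128((const __m128i *)row1),
                   _mm_loadu_si128((const __m128i *)row7), cp_28_n4, cp_4_28,
                   &step1_4, &step1_7);
  _mm_storeu_si128((__m128i *)s4, step1_4);
  _mm_storeu_si128((__m128i *)s7, step1_7);
  for (int i = 0; i < 8; ++i) printf("%d %d\n", s4[i], s7[i]);
  return 0;
}

With constant pairs substituted per stage, the same pattern covers every multiplication_and_add() call in the diff; the refactoring below only renames the constants (stg1_0 -> cp_28_n4, etc.) and moves them into per-stage scopes without changing the arithmetic.
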
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -175,46 +175,52 @@
 }
 
 static INLINE void idct8(const __m128i *const in, __m128i *const out) {
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
-  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
-  /* Stage1 */
-  multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &stg1_0, &stg1_1,
-                         &stg1_2, &stg1_3, &stp1_4, &stp1_7, &stp1_5, &stp1_6);
+  const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  __m128i step1[8], step2[8];
 
-  /* Stage2 */
-  multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &stg2_0, &stg2_1,
-                         &stg2_2, &stg2_3, &stp2_0, &stp2_1, &stp2_2, &stp2_3);
+  // stage 1
+  {
+    const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+    const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+    multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28,
+                           &cp_n20_12, &cp_12_20, &step1[4], &step1[7],
+                           &step1[5], &step1[6]);
+  }
 
-  stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
-  stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
-  stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
-  stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+  // stage 2
+  {
+    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16,
+                           &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0],
+                           &step2[1], &step2[2], &step2[3]);
+  }
 
-  /* Stage3 */
-  stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
-  stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
-  stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
-  stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-  multiplication_and_add_2(&stp2_6, &stp2_5, &stg2_1, &stg2_0, &stp1_5,
-                           &stp1_6);
+  step2[4] = _mm_add_epi16(step1[4], step1[5]);
+  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+  step2[7] = _mm_add_epi16(step1[7], step1[6]);
 
-  /* Stage4  */
-  out[0] = _mm_add_epi16(stp1_0, stp2_7);
-  out[1] = _mm_add_epi16(stp1_1, stp1_6);
-  out[2] = _mm_add_epi16(stp1_2, stp1_5);
-  out[3] = _mm_add_epi16(stp1_3, stp2_4);
-  out[4] = _mm_sub_epi16(stp1_3, stp2_4);
-  out[5] = _mm_sub_epi16(stp1_2, stp1_5);
-  out[6] = _mm_sub_epi16(stp1_1, stp1_6);
-  out[7] = _mm_sub_epi16(stp1_0, stp2_7);
+  // stage 3
+  step1[0] = _mm_add_epi16(step2[0], step2[3]);
+  step1[1] = _mm_add_epi16(step2[1], step2[2]);
+  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+  multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+                           &step1[5], &step1[6]);
+
+  // stage 4
+  out[0] = _mm_add_epi16(step1[0], step2[7]);
+  out[1] = _mm_add_epi16(step1[1], step1[6]);
+  out[2] = _mm_add_epi16(step1[2], step1[5]);
+  out[3] = _mm_add_epi16(step1[3], step2[4]);
+  out[4] = _mm_sub_epi16(step1[3], step2[4]);
+  out[5] = _mm_sub_epi16(step1[2], step1[5]);
+  out[6] = _mm_sub_epi16(step1[1], step1[6]);
+  out[7] = _mm_sub_epi16(step1[0], step2[7]);
 }
 
 void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -481,70 +487,59 @@
 void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
   const __m128i zero = _mm_setzero_si128();
-  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
-  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
-  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
-  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+  const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+  const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+  __m128i in[8], step1[8], step2[8], tmp[4];
 
-  __m128i in[8];
-  __m128i stp1_2, stp1_3, stp1_4, stp1_5;
-  __m128i stp2_0, stp2_2, stp2_4, stp2_5, stp2_6;
-  __m128i tmp[4];
+  in[0] = load_input_data(input + 0 * 8);
+  in[1] = load_input_data(input + 1 * 8);
+  in[2] = load_input_data(input + 2 * 8);
+  in[3] = load_input_data(input + 3 * 8);
 
-  // Rows. Load 4-row input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
-
-  // 8x4 Transpose
   transpose_16bit_4x4(in, in);
-  // Stage1
-  {
-    const __m128i lo_17 = _mm_unpackhi_epi16(in[0], zero);
-    const __m128i lo_35 = _mm_unpackhi_epi16(in[1], zero);
+  // in[0]: 00 10 20 30  01 11 21 31
+  // in[1]: 02 12 22 32  03 13 23 33
 
-    stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
-    stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
+  // stage 1
+  {
+    const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+    const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+    const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+    const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+    const __m128i lo_1 = _mm_unpackhi_epi16(in[0], zero);
+    const __m128i lo_3 = _mm_unpackhi_epi16(in[1], zero);
+    step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1);    // step1 4&7
+    step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3);  // step1 5&6
   }
 
-  // Stage2
+  // stage 2
   {
-    const __m128i lo_04 = _mm_unpacklo_epi16(in[0], zero);
-    const __m128i lo_26 = _mm_unpacklo_epi16(in[1], zero);
-
-    stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
-    stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);
-
-    tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
-    tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
-
-    stp2_4 = tmp[0];
-    stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
-    stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
+    const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+    const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+    const __m128i lo_0 = _mm_unpacklo_epi16(in[0], zero);
+    const __m128i lo_2 = _mm_unpacklo_epi16(in[1], zero);
+    step2[0] = idct_calc_wraplow_sse2(cp_16_16, cp_16_n16, lo_0);  // step2 0&1
+    step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2);    // step2 3&2
+    step2[4] = _mm_add_epi16(step1[4], step1[5]);                  // step2 4&7
+    step2[5] = _mm_sub_epi16(step1[4], step1[5]);                  // step2 5&6
+    step2[6] = _mm_unpackhi_epi64(step2[5], zero);                 // step2 6
   }
 
-  // Stage3
+  // stage 3
   {
-    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
-    tmp[0] = _mm_add_epi16(stp2_0, stp2_2);
-    tmp[1] = _mm_sub_epi16(stp2_0, stp2_2);
-    stp1_2 = _mm_unpackhi_epi64(tmp[1], tmp[0]);
-    stp1_3 = _mm_unpacklo_epi64(tmp[1], tmp[0]);
-    stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56);  // stg3_1 = stg2_0
+    const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+    tmp[0] = _mm_add_epi16(step2[0], step2[2]);                     // step1 0&1
+    tmp[1] = _mm_sub_epi16(step2[0], step2[2]);                     // step1 3&2
+    step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]);                  // step1 2&1
+    step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]);                  // step1 3&0
+    step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65);  // step1 5&6
   }
 
-  // Stage4
-  tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
-  tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
-  tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
-  tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
+  // stage 4
+  tmp[0] = _mm_add_epi16(step1[3], step2[4]);  // output 3&0
+  tmp[1] = _mm_add_epi16(step1[2], step1[5]);  // output 2&1
+  tmp[2] = _mm_sub_epi16(step1[3], step2[4]);  // output 4&7
+  tmp[3] = _mm_sub_epi16(step1[2], step1[5]);  // output 5&6
 
   idct8x8_12_transpose_16bit_4x8(tmp, in);
   in[4] = in[5] = in[6] = in[7] = zero;