ref: 0bb31a46a4f3a490a4cb483fe4db79d7bb20a0d3
parent: 39972d999dad4fe4d559635dab40dfcdb1181c7f
author: Linfeng Zhang <linfengz@google.com>
date: Mon Jun 26 10:57:41 EDT 2017
Update vpx_idct8x8_12_add_ssse3()

Move multiplication_and_add_2() into vpx_dsp/x86/inv_txfm_sse2.h so it
can be shared, and rewrite pass 1 to compute two butterfly outputs per
vector using dual_set_epi16() constant pairs.

Change-Id: I0f38801c391db87ddae168602a786a062cd34b1d
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -163,17 +163,6 @@
*res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3);
}
 
-static void multiplication_and_add_2(const __m128i *const in0,
- const __m128i *const in1,
- const __m128i *const cst0,
- const __m128i *const cst1,
- __m128i *const res0, __m128i *const res1) {
- const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
- const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
- *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
- *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
-}
-
static INLINE void idct8(const __m128i *const in, __m128i *const out) {
const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -76,6 +76,18 @@
return _mm_packs_epi32(t0, t1);
}
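 
+// Compute a butterfly rotation: multiply the interleaved lanes of *in0 and
+// *in1 by the constant pairs *cst0 and *cst1, round and shift by
+// DCT_CONST_BITS, then pack the two results into *res0 and *res1.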
+static INLINE void multiplication_and_add_2(const __m128i *const in0,
+ const __m128i *const in1,
+ const __m128i *const cst0,
+ const __m128i *const cst1,
+ __m128i *const res0,
+ __m128i *const res1) {
+ const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
+ *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
+ *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
+}
+
// Functions to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
static INLINE __m128i load_input_data4(const tran_low_t *data) {
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -17,135 +17,96 @@
void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
- const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
- const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
- const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
- const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
- const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
- const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
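+  // Each _mm_mulhrs_epi16(a, b) lane computes (a * b + (1 << 14)) >> 15; the
+  // cospi constants are already scaled by 1 << 14, so they are doubled here
+  // to match the usual round-and-shift by DCT_CONST_BITS (14).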
+ const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+ const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+ const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+ const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
+ const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
+ const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
+ const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
+ const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
+ const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
+ const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
+ __m128i in[8], step1[8], step2[8], tmp[4];
- __m128i in[8];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp[4];
-
- // Rows. Load 4-row input data.
in[0] = load_input_data4(input + 0 * 8);
in[1] = load_input_data4(input + 1 * 8);
in[2] = load_input_data4(input + 2 * 8);
in[3] = load_input_data4(input + 3 * 8);
- // 4x4 Transpose
+ // pass 1
+
transpose_16bit_4x4(in, in);
+ // in[0]: 00 10 20 30 01 11 21 31
+ // in[1]: 02 12 22 32 03 13 23 33
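+  // In pass 1 each vector holds two packed stage outputs; "step1 4&7" means
+  // step1[4] in the low half and step1[7] in the high half.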
- // Stage1
- tmp[0] = _mm_mulhrs_epi16(in[0], stg1_0);
- tmp[1] = _mm_mulhrs_epi16(in[0], stg1_1);
- tmp[2] = _mm_mulhrs_epi16(in[1], stg1_2);
- tmp[3] = _mm_mulhrs_epi16(in[1], stg1_3);
+ // stage 1
+ tmp[0] = _mm_unpacklo_epi64(in[0], in[0]);
+ tmp[1] = _mm_unpackhi_epi64(in[0], in[0]);
+ tmp[2] = _mm_unpacklo_epi64(in[1], in[1]);
+ tmp[3] = _mm_unpackhi_epi64(in[1], in[1]);
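+  // tmp[0]: 00 10 20 30 00 10 20 30
+  // tmp[1]: 01 11 21 31 01 11 21 31
+  // tmp[2]: 02 12 22 32 02 12 22 32
+  // tmp[3]: 03 13 23 33 03 13 23 33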
+ step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
+ step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
- stp1_4 = _mm_unpackhi_epi64(tmp[0], tmp[1]);
- stp1_5 = _mm_unpackhi_epi64(tmp[2], tmp[3]);
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
+ step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
- // Stage2
- tmp[0] = _mm_mulhrs_epi16(in[0], stg2_0);
- stp2_0 = _mm_unpacklo_epi64(tmp[0], tmp[0]);
+ // stage 3
+ tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
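+  // low half:  (step2[6] - step2[5]) * cospi_16_64 -> step1[5]
+  // high half: (step2[6] + step2[5]) * cospi_16_64 -> step1[6]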
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
- tmp[1] = _mm_mulhrs_epi16(in[1], stg2_2);
- tmp[2] = _mm_mulhrs_epi16(in[1], stg2_3);
- stp2_2 = _mm_unpacklo_epi64(tmp[2], tmp[1]);
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
- tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
- tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
+ // pass 2
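+  // Pass 2 works on full 8-wide rows, so step values are no longer packed in
+  // pairs.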
- stp2_4 = tmp[0];
- stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
- stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
-
- tmp[0] = _mm_unpacklo_epi16(stp2_5, stp2_6);
- tmp[1] = _mm_madd_epi16(tmp[0], stg3_0);
- tmp[2] = _mm_madd_epi16(tmp[0], stk2_0); // stg3_1 = stk2_0
-
- tmp[1] = _mm_add_epi32(tmp[1], rounding);
- tmp[2] = _mm_add_epi32(tmp[2], rounding);
- tmp[1] = _mm_srai_epi32(tmp[1], DCT_CONST_BITS);
- tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp[1], tmp[2]);
-
- // Stage3
- tmp[2] = _mm_add_epi16(stp2_0, stp2_2);
- tmp[3] = _mm_sub_epi16(stp2_0, stp2_2);
-
- stp1_2 = _mm_unpackhi_epi64(tmp[3], tmp[2]);
- stp1_3 = _mm_unpacklo_epi64(tmp[3], tmp[2]);
-
- // Stage4
- tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
- tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
- tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
- tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
-
idct8x8_12_transpose_16bit_4x8(tmp, in);
- /* Stage1 */
- stp1_4 = _mm_mulhrs_epi16(in[1], stg1_0);
- stp1_7 = _mm_mulhrs_epi16(in[1], stg1_1);
- stp1_5 = _mm_mulhrs_epi16(in[3], stg1_2);
- stp1_6 = _mm_mulhrs_epi16(in[3], stg1_3);
+ // stage 1
+ step1[4] = _mm_mulhrs_epi16(in[1], cospi_28_64d);
+ step1[7] = _mm_mulhrs_epi16(in[1], cospi_4_64d);
+ step1[5] = _mm_mulhrs_epi16(in[3], cospi_n20_64d);
+ step1[6] = _mm_mulhrs_epi16(in[3], cospi_12_64d);
- /* Stage2 */
- stp2_0 = _mm_mulhrs_epi16(in[0], stg2_0);
- stp2_1 = _mm_mulhrs_epi16(in[0], stg2_0);
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(in[0], cospi_16_64d); // step2[1] = step2[0]
+ step2[2] = _mm_mulhrs_epi16(in[2], cospi_24_64d);
+ step2[3] = _mm_mulhrs_epi16(in[2], cospi_8_64d);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
- stp2_2 = _mm_mulhrs_epi16(in[2], stg2_2);
- stp2_3 = _mm_mulhrs_epi16(in[2], stg2_3);
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+ &step1[5], &step1[6]);
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- /* Stage3 */
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp[0] = _mm_unpacklo_epi16(stp2_6, stp2_5);
- tmp[1] = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
- tmp[2] = _mm_madd_epi16(tmp[0], stk2_0);
- tmp[3] = _mm_madd_epi16(tmp[1], stk2_0);
- tmp[2] = _mm_add_epi32(tmp[2], rounding);
- tmp[3] = _mm_add_epi32(tmp[3], rounding);
- tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
- tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
- stp1_6 = _mm_packs_epi32(tmp[2], tmp[3]);
-
- tmp[2] = _mm_madd_epi16(tmp[0], stk2_1);
- tmp[3] = _mm_madd_epi16(tmp[1], stk2_1);
- tmp[2] = _mm_add_epi32(tmp[2], rounding);
- tmp[3] = _mm_add_epi32(tmp[3], rounding);
- tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
- tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
- stp1_5 = _mm_packs_epi32(tmp[2], tmp[3]);
-
- /* Stage4 */
- in[0] = _mm_add_epi16(stp1_0, stp2_7);
- in[1] = _mm_add_epi16(stp1_1, stp1_6);
- in[2] = _mm_add_epi16(stp1_2, stp1_5);
- in[3] = _mm_add_epi16(stp1_3, stp2_4);
- in[4] = _mm_sub_epi16(stp1_3, stp2_4);
- in[5] = _mm_sub_epi16(stp1_2, stp1_5);
- in[6] = _mm_sub_epi16(stp1_1, stp1_6);
- in[7] = _mm_sub_epi16(stp1_0, stp2_7);
+ // stage 4
+ in[0] = _mm_add_epi16(step1[0], step2[7]);
+ in[1] = _mm_add_epi16(step1[1], step1[6]);
+ in[2] = _mm_add_epi16(step1[2], step1[5]);
+ in[3] = _mm_add_epi16(step1[3], step2[4]);
+ in[4] = _mm_sub_epi16(step1[3], step2[4]);
+ in[5] = _mm_sub_epi16(step1[2], step1[5]);
+ in[6] = _mm_sub_epi16(step1[1], step1[6]);
+ in[7] = _mm_sub_epi16(step1[0], step2[7]);
write_buffer_8x8(in, dest, stride);
}