shithub: libvpx

--- a/vp9/common/vp9_idctllm.c

+++ b/vp9/common/vp9_idctllm.c

@@ -26,6 +26,7 @@

 #include <math.h>

 #include "./vpx_config.h"

+#include "./vp9_rtcd.h"

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_common.h"

@@ -109,7 +110,7 @@

-static void idct4_1d(int16_t *input, int16_t *output) {

+void vp9_idct4_1d_c(int16_t *input, int16_t *output) {

   int16_t step[4];

   int temp1, temp2;

   // stage 1

@@ -140,7 +141,7 @@

   for (i = 0; i < 4; ++i) {

     for (j = 0; j < 4; ++j)

       temp_in[j] = input[j];

-    idct4_1d(temp_in, outptr);

+    vp9_idct4_1d(temp_in, outptr);

     input += 4;

     outptr += 4;

@@ -149,7 +150,7 @@

   for (i = 0; i < 4; ++i) {

     for (j = 0; j < 4; ++j)

       temp_in[j] = out[j * 4 + i];

-    idct4_1d(temp_in, temp_out);

+    vp9_idct4_1d(temp_in, temp_out);

     for (j = 0; j < 4; ++j)

       output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);

@@ -205,7 +206,7 @@

   step1[6] = dct_const_round_shift(temp2);

   // stage 2 & stage 3 - even half

-  idct4_1d(step1, step1);

+  vp9_idct4_1d(step1, step1);

   // stage 2 - odd half

   step2[4] = step1[4] + step1[5];

@@ -298,24 +299,23 @@

   output[3] = dct_const_round_shift(s3);

-static const transform_2d IHT_4[] = {

-  { idct4_1d,  idct4_1d  },  // DCT_DCT  = 0

-  { iadst4_1d, idct4_1d  },  // ADST_DCT = 1

-  { idct4_1d,  iadst4_1d },  // DCT_ADST = 2

-  { iadst4_1d, iadst4_1d }   // ADST_ADST = 3

-};

 void vp9_short_iht4x4_c(int16_t *input, int16_t *output,

-                        int pitch, TX_TYPE tx_type) {

+                        int pitch, int tx_type) {

+  const transform_2d IHT_4[] = {

+    { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0

+    { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1

+    { vp9_idct4_1d,  iadst4_1d },      // DCT_ADST = 2

+    { iadst4_1d, iadst4_1d }           // ADST_ADST = 3

+  };

   int i, j;

   int16_t out[4 * 4];

   int16_t *outptr = out;

   int16_t temp_in[4], temp_out[4];

-  const transform_2d ht = IHT_4[tx_type];

   // inverse transform row vectors

   for (i = 0; i < 4; ++i) {

-    ht.rows(input, outptr);

+    IHT_4[tx_type].rows(input, outptr);

     input  += 4;

     outptr += 4;

@@ -324,7 +324,7 @@

   for (i = 0; i < 4; ++i) {

     for (j = 0; j < 4; ++j)

       temp_in[j] = out[j * 4 + i];

-    ht.cols(temp_in, temp_out);

+    IHT_4[tx_type].cols(temp_in, temp_out);

     for (j = 0; j < 4; ++j)

       output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);

@@ -415,7 +415,7 @@

};

 void vp9_short_iht8x8_c(int16_t *input, int16_t *output,

-                        int pitch, TX_TYPE tx_type) {

+                        int pitch, int tx_type) {

   int i, j;

   int16_t out[8 * 8];

   int16_t *outptr = out;

@@ -838,7 +838,7 @@

};

 void vp9_short_iht16x16_c(int16_t *input, int16_t *output,

-                          int pitch, TX_TYPE tx_type) {

+                          int pitch, int tx_type) {

   int i, j;

   int16_t out[16 * 16];

   int16_t *outptr = out;

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -322,6 +322,9 @@

 prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"

 specialize vp9_short_iht16x16

+prototype void vp9_idct4_1d "int16_t *input, int16_t *output"

+specialize vp9_idct4_1d sse2

 # dct and add

 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"

--- a/vp9/common/x86/vp9_idctllm_x86.c

+++ b/vp9/common/x86/vp9_idctllm_x86.c

@@ -77,10 +77,10 @@

 void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) {

   const __m128i zero = _mm_setzero_si128();

   const __m128i eight = _mm_set1_epi16(8);

-  const __m128i cst = _mm_setr_epi16((short)cospi_16_64, (short)cospi_16_64,

-                                     (short)cospi_16_64, (short)-cospi_16_64,

-                                     (short)cospi_24_64, (short)-cospi_8_64,

-                                     (short)cospi_8_64, (short)cospi_24_64);

+  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,

+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,

+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,

+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);

   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

   const int half_pitch = pitch >> 1;

   __m128i input0, input1, input2, input3;

@@ -198,4 +198,40 @@

   input3 = _mm_srli_si128(input3, 8);

   _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);

+void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {  // SSE2 vp9_idct4_1d

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,

+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,

+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,

+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);

+  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);  // stage-2 signs

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  __m128i in, temp;

+  // Load input[0..3] (called i0..i3 below) into the low 64 bits of the register.

+  in = _mm_loadl_epi64((__m128i *)input);

+  // Construct i0, i2, i0, i2, i1, i3, i1, i3 (lowest lane first, same as c1).

+  in = _mm_shufflelo_epi16(in, 0xd8);

+  in = _mm_unpacklo_epi32(in, in);

+  // Stage 1: multiply-add lane pairs with c1, round, shift, saturating pack.

+  in = _mm_madd_epi16(in, c1);  // (i0+i2)*c16, (i0-i2)*c16, i1*c24-i3*c8, ...

+  in = _mm_add_epi32(in, rounding);

+  in = _mm_srai_epi32(in, DCT_CONST_BITS);

+  in = _mm_packs_epi32(in, zero);  // s0, s1, s2, s3, 0, 0, 0, 0

+  // Stage 2: arrange s0, s3, s1, s2, s1, s2, s0, s3 and multiply-add with c2.

+  temp = _mm_shufflelo_epi16(in, 0x9c);  // s0, s3, s1, s2

+  in = _mm_shufflelo_epi16(in, 0xc9);  // s1, s2, s0, s3

+  in = _mm_unpacklo_epi64(temp, in);

+  in = _mm_madd_epi16(in, c2);  // s0+s3, s1+s2, s1-s2, s0-s3

+  in = _mm_packs_epi32(in, zero);

+  // Store the four 16-bit results to output[0..3].

+  _mm_storel_epi64((__m128i *)output, in);

+}

 #endif

--

⑨