shithub: libvpx

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -225,6 +225,19 @@

+void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {  // DC-only 8x8 inverse DCT fused with reconstruction: reads only input[0], writes an 8x8 area of dest

+  int i, j;

+  int a1;

+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);  // scale the DC coefficient by cospi_16_64 and round

+  out = dct_const_round_shift(out * cospi_16_64);  // same scaling applied a second time (presumably one per 1-D pass -- confirm)

+  a1 = ROUND_POWER_OF_TWO(out, 5);  // same final rounding shift (5) as the removed vp9_short_idct1_8x8_c

+  for (j = 0; j < 8; ++j) {  // 8 rows

+    for (i = 0; i < 8; ++i)  // 8 pixels per row

+      dest[i] = clip_pixel(dest[i] + a1);  // every pixel receives the same residual a1, passed through clip_pixel

+    dest += dest_stride;  // step to the next row of the destination

+  }

+}

 static void iadst4_1d(int16_t *input, int16_t *output) {

   int s0, s1, s2, s3, s4, s5, s6, s7;

@@ -431,12 +444,6 @@

       dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)

                                   + dest[j * dest_stride + i]);

-}

-void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {

-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

-  out = dct_const_round_shift(out * cospi_16_64);

-  output[0] = ROUND_POWER_OF_TWO(out, 5);

 static void idct16_1d(int16_t *input, int16_t *output) {

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -297,14 +297,14 @@

 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct4x4_add sse2

+prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct8x8_1_add sse2

 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct8x8_add sse2 neon

 prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct10_8x8_add sse2

-prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"

-specialize vp9_short_idct1_8x8

 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct16x16_add sse2

--- a/vp9/common/x86/vp9_idct_intrin_sse2.c

+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -523,9 +523,9 @@

   {                                                     \

      __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \

       d0 = _mm_unpacklo_epi8(d0, zero); \

-      in_x = _mm_add_epi16(in_x, d0); \

-      in_x = _mm_packus_epi16(in_x, in_x); \

-      _mm_storel_epi64((__m128i *)(dest), in_x); \

+      d0 = _mm_add_epi16(in_x, d0); \

+      d0 = _mm_packus_epi16(d0, d0); \

+      _mm_storel_epi64((__m128i *)(dest), d0); \

       dest += stride; \

@@ -595,6 +595,27 @@

   RECON_AND_STORE(dest, in5);

   RECON_AND_STORE(dest, in6);

   RECON_AND_STORE(dest, in7);

+}

+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {  // SSE2 version of the DC-only 8x8 inverse DCT + add; reads only input[0]

+  __m128i dc_value;

+  const __m128i zero = _mm_setzero_si128();  // referenced by name inside RECON_AND_STORE

+  int a;  // NOTE(review): plain int here, while the C version keeps the intermediate in int16_t -- confirm results match

+  a = dct_const_round_shift(input[0] * cospi_16_64);  // scale the DC coefficient by cospi_16_64 and round

+  a = dct_const_round_shift(a * cospi_16_64);  // same scaling applied a second time

+  a = ROUND_POWER_OF_TWO(a, 5);  // final rounding shift by 5, as in the C version

+  dc_value = _mm_set1_epi16(a);  // broadcast the residual into all eight 16-bit lanes

+  RECON_AND_STORE(dest, dc_value);  // one invocation per row: adds dc_value to 8 pixels, packs with unsigned saturation, stores

+  RECON_AND_STORE(dest, dc_value);  // the macro itself advances dest by stride

+  RECON_AND_STORE(dest, dc_value);  // reusing dc_value relies on the macro not modifying its second argument

+  RECON_AND_STORE(dest, dc_value);

+  RECON_AND_STORE(dest, dc_value);

+  RECON_AND_STORE(dest, dc_value);

+  RECON_AND_STORE(dest, dc_value);

+  RECON_AND_STORE(dest, dc_value);

 // perform 8x8 transpose

--- a/vp9/decoder/vp9_idct_blk.c

+++ b/vp9/decoder/vp9_idct_blk.c

@@ -93,15 +93,8 @@

   if (eob) {

     if (eob == 1) {

       // DC only DCT coefficient

-      int16_t in = input[0];

-      int16_t out;

-      // Note: the idct1 will need to be modified accordingly whenever

-      // vp9_short_idct8x8_c() is modified.

-      vp9_short_idct1_8x8_c(&in, &out);

+      vp9_short_idct8x8_1_add(input, dest, stride);

       input[0] = 0;

-      vp9_add_constant_residual_8x8(out, dest, stride);

     } else {

       vp9_short_idct8x8_add(input, dest, stride);

       vpx_memset(input, 0, 128);

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -47,6 +47,14 @@

     xd->inv_txm4x4_add(dqcoeff, dest, stride);

+static void inverse_transform_b_8x8_add(MACROBLOCKD *xd, int eob,  // chooses the 8x8 inverse DCT + add variant from eob; xd is not used in the body

+                                        int16_t *dqcoeff, uint8_t *dest,

+                                        int stride) {

+  if (eob <= 1)  // DC-only variant (the decoder takes the same path when eob == 1)

+    vp9_short_idct8x8_1_add(dqcoeff, dest, stride);

+  else

+    vp9_short_idct8x8_add(dqcoeff, dest, stride);  // general 8x8 path for eob > 1

+}

 static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {

   struct macroblock_plane *const p = &x->plane[plane];

@@ -533,7 +541,8 @@

       vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);

       break;

     case TX_8X8:

-      vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);

+      inverse_transform_b_8x8_add(xd, pd->eobs[block], dqcoeff,

+                                  dst, pd->dst.stride);

       break;

     case TX_4X4:

       // this is like vp9_short_idct4x4 but has a special case around eob<=1

@@ -711,7 +720,7 @@

                      pd->dequant, p->zbin_extra, eob, scan, iscan);

       if (!x->skip_encode && *eob) {

         if (tx_type == DCT_DCT)

-          vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);

+          inverse_transform_b_8x8_add(xd, *eob, dqcoeff, dst, pd->dst.stride);

         else

           vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);

@@ -746,8 +755,7 @@

           // this is like vp9_short_idct4x4 but has a special case around eob<=1

           // which is significant (not just an optimization) for the lossless

           // case.

-          inverse_transform_b_4x4_add(xd, *eob, dqcoeff,

-                                      dst, pd->dst.stride);

+          inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);

         else

           vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);

--

⑨