shithub: libvpx

--- a/test/fdct4x4_test.cc

+++ b/test/fdct4x4_test.cc

@@ -96,11 +96,15 @@

   for (int i = 0; i < count_test_block; ++i) {

     int16_t test_input_block[16];

     int16_t test_temp_block[16];

-    int16_t test_output_block[16];

+    uint8_t dst[16], src[16];

+    for (int j = 0; j < 16; ++j) {

+      src[j] = rnd.Rand8();

+      dst[j] = rnd.Rand8();

+    }

     // Initialize a test block with input range [-255, 255].

     for (int j = 0; j < 16; ++j)

-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();

+      test_input_block[j] = src[j] - dst[j];

     // TODO(Yaowu): this should be converted to a parameterized test

     // to test optimized versions of this function.

@@ -120,10 +124,10 @@

     // Because the bitstream is not frozen yet, use the idct in the codebase.

-    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);

+    vp9_short_idct4x4_add_c(test_temp_block, dst, 4);

     for (int j = 0; j < 16; ++j) {

-      const int diff = test_input_block[j] - test_output_block[j];

+      const int diff = dst[j] - src[j];

       const int error = diff * diff;

       if (max_error < error)

         max_error = error;

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -391,8 +391,8 @@

   int lossless;

   /* Inverse transform function pointers. */

-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);

-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);

+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);

+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);

   void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);

   void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,

     struct macroblockd *xd);

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -18,12 +18,12 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_idct.h"

-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {

+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int i;

+  int16_t output[16];

   int a1, b1, c1, d1;

   int16_t *ip = input;

   int16_t *op = output;

-  const int half_pitch = pitch >> 1;

   for (i = 0; i < 4; i++) {

     a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;

@@ -37,45 +37,52 @@

     op[3] = (d1 - c1) >> 1;

     ip += 4;

-    op += half_pitch;

+    op += 4;

   ip = output;

-  op = output;

   for (i = 0; i < 4; i++) {

-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];

-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];

-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];

-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];

+    a1 = ip[4 * 0] + ip[4 * 3];

+    b1 = ip[4 * 1] + ip[4 * 2];

+    c1 = ip[4 * 1] - ip[4 * 2];

+    d1 = ip[4 * 0] - ip[4 * 3];

-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;

-    op[half_pitch * 1] = (c1 + d1) >> 1;

-    op[half_pitch * 2] = (a1 - b1) >> 1;

-    op[half_pitch * 3] = (d1 - c1) >> 1;

+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +

+                                       ((a1 + b1 + 1) >> 1));

+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +

+                                       ((c1 + d1) >> 1));

+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +

+                                       ((a1 - b1) >> 1));

+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +

+                                       ((d1 - c1) >> 1));

     ip++;

-    op++;

+    dest++;

-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {

+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {

   int i;

   int16_t tmp[4];

   int16_t *ip = in;

   int16_t *op = tmp;

-  const int half_pitch = pitch >> 1;

   op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;

   op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;

   ip = tmp;

-  op = out;

   for (i = 0; i < 4; i++) {

-    op[half_pitch * 0] = (ip[0] + 1) >> 1;

-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;

+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +

+                                       ((ip[0] + 1) >> 1));

+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +

+                                       (ip[0] >> 1));

+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +

+                                       (ip[0] >> 1));

+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +

+                                       (ip[0] >> 1));

     ip++;

-    op++;

+    dest++;

@@ -82,18 +89,8 @@

 void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,

                                  uint8_t *dst_ptr,

                                  int pitch, int stride) {

-  int r, c;

   int16_t dc = input_dc;

-  int16_t tmp[4 * 4];

-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++)

-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);

-    dst_ptr += stride;

-    pred_ptr += pitch;

-  }

+  vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);

 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {

@@ -116,10 +113,9 @@

   output[3] = step[0] - step[3];

-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {

+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int16_t out[4 * 4];

   int16_t *outptr = out;

-  const int half_pitch = pitch >> 1;

   int i, j;

   int16_t temp_in[4], temp_out[4];

@@ -138,22 +134,24 @@

       temp_in[j] = out[j * 4 + i];

     vp9_idct4_1d(temp_in, temp_out);

     for (j = 0; j < 4; ++j)

-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);

+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)

+                                  + dest[j * dest_stride + i]);

-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {

+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int i;

   int a1;

-  int16_t *op = output;

-  const int half_pitch = pitch >> 1;

   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

   out = dct_const_round_shift(out * cospi_16_64);

   a1 = ROUND_POWER_OF_TWO(out, 4);

   for (i = 0; i < 4; i++) {

-    op[0] = op[1] = op[2] = op[3] = a1;

-    op += half_pitch;

+    dest[0] = clip_pixel(dest[0] + a1);

+    dest[1] = clip_pixel(dest[1] + a1);

+    dest[2] = clip_pixel(dest[2] + a1);

+    dest[3] = clip_pixel(dest[3] + a1);

+    dest += dest_stride;

@@ -285,8 +283,8 @@

   output[3] = dct_const_round_shift(s3);

-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,

-                        int pitch, int tx_type) {

+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,

+                            int tx_type) {

   const transform_2d IHT_4[] = {

     { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0

     { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1

@@ -312,10 +310,10 @@

       temp_in[j] = out[j * 4 + i];

     IHT_4[tx_type].cols(temp_in, temp_out);

     for (j = 0; j < 4; ++j)

-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);

+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)

+                                  + dest[j * dest_stride + i]);

 static void iadst8_1d(int16_t *input, int16_t *output) {

   int s0, s1, s2, s3, s4, s5, s6, s7;

--- a/vp9/common/vp9_invtrans.c

+++ b/vp9/common/vp9_invtrans.c

@@ -11,11 +11,10 @@

 #include "vp9/common/vp9_invtrans.h"

 #include "./vp9_rtcd.h"

-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,

-                                 int16_t *dqcoeff, int16_t *diff,

-                                 int pitch) {

+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,

+                                     uint8_t *dest, int stride) {

   if (eob <= 1)

-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);

+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);

   else

-    xd->inv_txm4x4(dqcoeff, diff, pitch);

+    xd->inv_txm4x4_add(dqcoeff, dest, stride);

--- a/vp9/common/vp9_invtrans.h

+++ b/vp9/common/vp9_invtrans.h

@@ -15,7 +15,6 @@

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_blockd.h"

-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,

-                                 int16_t *dqcoeff, int16_t *diff,

-                                 int pitch);

+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,

+                                     uint8_t *dest, int stride);

 #endif  // VP9_COMMON_VP9_INVTRANS_H_

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -85,9 +85,6 @@

 specialize vp9_intra4x4_predict;

 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then

-prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"

-specialize vp9_add_residual_4x4 sse2

 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"

 specialize vp9_add_constant_residual_8x8 sse2

@@ -179,11 +176,11 @@

 # dct

-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct4x4_1

+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct4x4_1_add

-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct4x4 sse2

+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_idct4x4_add sse2

 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct8x8_add sse2

@@ -212,12 +209,12 @@

 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct10_32x32_add

+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"

+specialize vp9_short_iht4x4_add

 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"

 specialize vp9_short_iht8x8_add

-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"

-specialize vp9_short_iht4x4

 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"

 specialize vp9_short_iht16x16_add

@@ -229,12 +226,11 @@

 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"

 specialize vp9_dc_only_idct_add sse2

-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_iwalsh4x4_1

-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_iwalsh4x4

-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"

-specialize vp9_dc_only_inv_walsh_add

+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_iwalsh4x4_1_add

+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_short_iwalsh4x4_add

 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"

 specialize vp9_sad32x3

--- a/vp9/common/x86/vp9_idct_intrin_sse2.c

+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -73,7 +73,7 @@

   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);

-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {

+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {

   const __m128i zero = _mm_setzero_si128();

   const __m128i eight = _mm_set1_epi16(8);

   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,

@@ -81,7 +81,6 @@

                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,

                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);

   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

-  const int half_pitch = pitch >> 1;

   __m128i input0, input1, input2, input3;

   // Rows

@@ -188,14 +187,23 @@

   input2 = _mm_srai_epi16(input2, 4);

   input3 = _mm_srai_epi16(input3, 4);

-  // Store results

-  _mm_storel_epi64((__m128i *)output, input2);

-  input2 = _mm_srli_si128(input2, 8);

-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);

+#define RECON_AND_STORE4X4(dest, in_x)                                   \

+  do { /* add low 4 words of in_x to 4 pixels at dest; step dest a row */ \

+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));               \

+    d0 = _mm_unpacklo_epi8(d0, zero);      /* widen bytes to 16 bits */  \

+    d0 = _mm_add_epi16((in_x), d0);        /* pixels + idct output */    \

+    d0 = _mm_packus_epi16(d0, d0);         /* saturate back to bytes */  \

+    *(int *)(dest) = _mm_cvtsi128_si32(d0);                             \

+    (dest) += stride;                                                   \

+  } while (0)

-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);

-  input3 = _mm_srli_si128(input3, 8);

-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);

+  input0 = _mm_srli_si128(input2, 8);

+  input1 = _mm_srli_si128(input3, 8);

+  RECON_AND_STORE4X4(dest, input2);

+  RECON_AND_STORE4X4(dest, input0);

+  RECON_AND_STORE4X4(dest, input1);

+  RECON_AND_STORE4X4(dest, input3);

 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {

--- a/vp9/decoder/vp9_decodframe.c

+++ b/vp9/decoder/vp9_decodframe.c

@@ -1006,14 +1006,10 @@

                  pc->uv_dc_delta_q == 0 &&

                  pc->uv_ac_delta_q == 0;

   if (xd->lossless) {

-    xd->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;

-    xd->inv_txm4x4        = vp9_short_iwalsh4x4;

     xd->itxm_add          = vp9_idct_add_lossless_c;

     xd->itxm_add_y_block  = vp9_idct_add_y_block_lossless_c;

     xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;

   } else {

-    xd->inv_txm4x4_1      = vp9_short_idct4x4_1;

-    xd->inv_txm4x4        = vp9_short_idct4x4;

     xd->itxm_add          = vp9_idct_add;

     xd->itxm_add_y_block  = vp9_idct_add_y_block;

     xd->itxm_add_uv_block = vp9_idct_add_uv_block;

--- a/vp9/decoder/vp9_idct_blk.c

+++ b/vp9/decoder/vp9_idct_blk.c

@@ -84,23 +84,6 @@

-static void add_residual(const int16_t *diff, uint8_t *dest, int stride,

-                         int width, int height) {

-  int r, c;

-  for (r = 0; r < height; r++) {

-    for (c = 0; c < width; c++)

-      dest[c] = clip_pixel(diff[c] + dest[c]);

-    dest += stride;

-    diff += width;

-  }

-}

-void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {

-  add_residual(diff, dest, stride, 4, 4);

-}

 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,

                                   int width, int height) {

   int r, c;

@@ -133,11 +116,8 @@

   if (tx_type == DCT_DCT) {

     vp9_idct_add(input, dest, stride, eob);

   } else {

-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

-    vp9_short_iht4x4(input, output, 4, tx_type);

+    vp9_short_iht4x4_add(input, dest, stride, tx_type);

     vpx_memset(input, 0, 32);

-    vp9_add_residual_4x4(output, dest, stride);

@@ -154,13 +134,9 @@

 void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

   if (eob > 1) {

-    // the idct halves ( >> 1) the pitch

-    vp9_short_idct4x4(input, output, 4 << 1);

+    vp9_short_idct4x4_add(input, dest, stride);

     vpx_memset(input, 0, 32);

-    vp9_add_residual_4x4(output, dest, stride);

   } else {

     vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);

     ((int *)input)[0] = 0;

@@ -168,26 +144,18 @@

 void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

   input[0] = dc;

-  // the idct halves ( >> 1) the pitch

-  vp9_short_idct4x4(input, output, 4 << 1);

+  vp9_short_idct4x4_add(input, dest, stride);

   vpx_memset(input, 0, 32);

-  vp9_add_residual_4x4(output, dest, stride);

 void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,

                              int eob) {

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

   if (eob > 1) {

-    vp9_short_iwalsh4x4_c(input, output, 4 << 1);

+    vp9_short_iwalsh4x4_add(input, dest, stride);

     vpx_memset(input, 0, 32);

-    vp9_add_residual_4x4(output, dest, stride);

   } else {

-    vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);

+    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);

     ((int *)input)[0] = 0;

@@ -194,12 +162,9 @@

 void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,

                                 int stride, int dc) {

-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

   input[0] = dc;

-  vp9_short_iwalsh4x4_c(input, output, 4 << 1);

+  vp9_short_iwalsh4x4_add(input, dest, stride);

   vpx_memset(input, 0, 32);

-  vp9_add_residual_4x4(output, dest, stride);

 void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {

--- a/vp9/decoder/x86/vp9_dequantize_sse2.c

+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c

@@ -15,49 +15,6 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_idct.h"

-void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {

-  const int width = 4;

-  const __m128i zero = _mm_setzero_si128();

-  // Diff data

-  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));

-  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));

-  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));

-  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));

-  // Prediction data.

-  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));

-  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));

-  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));

-  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));

-  p0 = _mm_unpacklo_epi8(p0, zero);

-  p1 = _mm_unpacklo_epi8(p1, zero);

-  p2 = _mm_unpacklo_epi8(p2, zero);

-  p3 = _mm_unpacklo_epi8(p3, zero);

-  p0 = _mm_add_epi16(p0, d0);

-  p1 = _mm_add_epi16(p1, d1);

-  p2 = _mm_add_epi16(p2, d2);

-  p3 = _mm_add_epi16(p3, d3);

-  p0 = _mm_packus_epi16(p0, p1);

-  p2 = _mm_packus_epi16(p2, p3);

-  *(int *)dest = _mm_cvtsi128_si32(p0);

-  dest += stride;

-  p0 = _mm_srli_si128(p0, 8);

-  *(int *)dest = _mm_cvtsi128_si32(p0);

-  dest += stride;

-  *(int *)dest = _mm_cvtsi128_si32(p2);

-  dest += stride;

-  p2 = _mm_srli_si128(p2, 8);

-  *(int *)dest = _mm_cvtsi128_si32(p2);

-}

 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,

                                         int stride) {

   uint8_t abs_diff;

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1169,8 +1169,8 @@

   if (lossless) {

     cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;

     cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;

-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;

-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;

+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;

     cpi->mb.optimize              = 0;

     cpi->common.filter_level      = 0;

     cpi->zbin_mode_boost_enabled  = 0;

@@ -1178,8 +1178,8 @@

   } else {

     cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;

     cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;

-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;

-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;

+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;

--- a/vp9/encoder/vp9_encodeintra.c

+++ b/vp9/encoder/vp9_encodeintra.c

@@ -53,9 +53,6 @@

   int16_t* const src_diff =

       raster_block_offset_int16(xd, bsize, 0, ib,

                                 x->plane[0].src_diff);

-  int16_t* const diff =

-      raster_block_offset_int16(xd, bsize, 0, ib,

-                                xd->plane[0].diff);

   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);

   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);

@@ -72,17 +69,15 @@

   if (tx_type != DCT_DCT) {

     vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);

     x->quantize_b_4x4(x, ib, tx_type, 16);

-    vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),

-                     diff, 4 << bwl, tx_type);

+    vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), dst,

+                         xd->plane[0].dst.stride, tx_type);

   } else {

     x->fwd_txm4x4(src_diff, coeff, 8 << bwl);

     x->quantize_b_4x4(x, ib, tx_type, 16);

-    vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],

+    vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib],

                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),

-                                diff, 8 << bwl);

+                                dst, xd->plane[0].dst.stride);

-  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);

 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -425,7 +425,6 @@

   VP9_COMMON *cm;

   MACROBLOCK *x;

   struct optimize_ctx *ctx;

-  int *wip_txfrm_size;  // for "work in progress" only... will remove once done

};

 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,

@@ -494,14 +493,9 @@

                          int ss_txfrm_size, void *arg) {

   struct encode_b_args* const args = arg;

   MACROBLOCK* const x = args->x;

-  int *wip_txfrm_size = args->wip_txfrm_size;

   MACROBLOCKD* const xd = &x->e_mbd;

-  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);

   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,

                                                        block, ss_txfrm_size);

-  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,

-                                                  raster_block,

-                                                  xd->plane[plane].diff);

   uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,

                                                  raster_block,

                                                  xd->plane[plane].dst.buf,

@@ -517,7 +511,6 @@

     case TX_32X32:

         vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,

                                 block, 16), dst, xd->plane[plane].dst.stride);

-        *wip_txfrm_size = 32;

       break;

     case TX_16X16:

       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;

@@ -529,7 +522,6 @@

                                block, 16), dst, xd->plane[plane].dst.stride,

                                tx_type);

-      *wip_txfrm_size = 16;

       break;

     case TX_8X8:

       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;

@@ -541,7 +533,6 @@

                              block, 16), dst, xd->plane[plane].dst.stride,

                              tx_type);

-      *wip_txfrm_size = 8;

       break;

     case TX_4X4:

       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;

@@ -549,13 +540,13 @@

         // this is like vp9_short_idct4x4 but has a special case around eob<=1

         // which is significant (not just an optimization) for the lossless

         // case.

-        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],

-            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);

+        vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block],

+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst,

+            xd->plane[plane].dst.stride);

       } else {

-        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

-                         diff, bw, tx_type);

+        vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),

+                             dst, xd->plane[plane].dst.stride, tx_type);

-      *wip_txfrm_size = 4;

       break;

@@ -563,16 +554,15 @@

 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,

                          BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

-  struct encode_b_args arg = {cm, x, NULL, NULL};

+  struct encode_b_args arg = {cm, x, NULL};

-  foreach_transformed_block_in_plane(xd, bsize, 0,

-                                     xform_quant, &arg);

+  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);

 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,

                          BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

-  struct encode_b_args arg = {cm, x, NULL, NULL};

+  struct encode_b_args arg = {cm, x, NULL};

   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);

@@ -581,17 +571,13 @@

                     BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

   struct optimize_ctx ctx;

-  int wip_txfrm_size = 0;

-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

+  struct encode_b_args arg = {cm, x, &ctx};

   vp9_subtract_sby(x, bsize);

   if (x->optimize)

     vp9_optimize_init(xd, bsize, &ctx);

-  foreach_transformed_block_in_plane(xd, bsize, 0,

-                                     encode_block, &arg);

-  if (wip_txfrm_size < 8)

-    vp9_recon_sby(xd, bsize);

+  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);

 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,

@@ -598,8 +584,7 @@

                      BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

   struct optimize_ctx ctx;

-  int wip_txfrm_size = 0;

-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

+  struct encode_b_args arg = {cm, x, &ctx};

   vp9_subtract_sbuv(x, bsize);

   if (x->optimize)

@@ -606,9 +591,6 @@

     vp9_optimize_init(xd, bsize, &ctx);

   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

-  if (wip_txfrm_size < 8)

-    vp9_recon_sbuv(xd, bsize);

 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,

@@ -615,27 +597,11 @@

                    BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD* const xd = &x->e_mbd;

   struct optimize_ctx ctx;

-  int wip_txfrm_size = 0;

-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};

+  struct encode_b_args arg = {cm, x, &ctx};

   vp9_subtract_sb(x, bsize);

   if (x->optimize)

     vp9_optimize_init(xd, bsize, &ctx);

-#if 0

-  foreach_transformed_block(xd, bsize, encode_block, &arg);

-  vp9_recon_sb(xd, bsize);

-#else

-  // wip version... will use foreach_transformed_block when done

-  foreach_transformed_block_in_plane(xd, bsize, 0,

-                                     encode_block, &arg);

-  if (wip_txfrm_size < 8)

-    vp9_recon_sby(xd, bsize);

-  wip_txfrm_size = 0;

-  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);

-  if (wip_txfrm_size < 8)

-    vp9_recon_sbuv(xd, bsize);

-#endif

+  foreach_transformed_block(xd, bsize, encode_block, &arg);

--- a/vp9/encoder/vp9_onyx_if.c

+++ b/vp9/encoder/vp9_onyx_if.c

@@ -1178,11 +1178,11 @@

   cpi->oxcf.lossless = oxcf->lossless;

   if (cpi->oxcf.lossless) {

-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;

-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_iwalsh4x4;

+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;

   } else {

-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;

-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_idct4x4;

+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;

   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -627,11 +627,6 @@

                                 BLOCK_SIZE_SB8X8,

                                 0, ib,

                                 x->plane[0].src_diff);

-  int16_t* const diff =

-      raster_block_offset_int16(xd,

-                                BLOCK_SIZE_SB8X8,

-                                0, ib,

-                                xd->plane[0].diff);

   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);

   uint8_t* const dst =

       raster_block_offset_uint8(xd,

@@ -703,18 +698,18 @@

   xd->mode_info_context->bmi[ib].as_mode.first =

     (B_PREDICTION_MODE)(*best_mode);

-  // inverse transform

-  if (best_tx_type != DCT_DCT)

-    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);

-  else

-    xd->inv_txm4x4(best_dqcoeff, diff, 16);

   vp9_intra4x4_predict(xd, ib,

                        BLOCK_SIZE_SB8X8,

                        *best_mode,

                        dst, xd->plane[0].dst.stride);

-  vp9_recon_b(dst, diff, 8,

-              dst, xd->plane[0].dst.stride);

+  // inverse transform

+  if (best_tx_type != DCT_DCT) {

+    vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride,

+                           best_tx_type);

+  } else {

+    xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride);

+  }

   return best_rd;

--

⑨