shithub: libvpx

ref: ba48a11130aa88cf20c2c54e43585968ce49e964
parent: 9aa37a51b28596137ba6fbcb1411c070287d6e11
author: Scott LaVarnway <slavarnway@google.com>
date: Mon May 20 09:03:17 EDT 2013

WIP: 4x4 idct/recon merge

This patch eliminates use of the intermediate diff buffer by
combining the short idct and the residual add into a single
function. The encoder can use the same code as well.

Change-Id: I296604bf73579c45105de0dd1adbcc91bcc53c22
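
In outline: before this patch, vp9_short_idct4x4_c wrote a rounded
residual into an int16_t scratch block and a second routine
(vp9_add_residual_4x4) clamped it into the prediction; after, the
idct's final column pass does the add itself. A minimal sketch of the
pass being folded away, assuming a 4x4 residual block and a
prediction/destination block at dest (names here are illustrative,
not the patch's own):

#include <stdint.h>

/* 8-bit clamp, as vp9_common's clip_pixel() does */
static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Old second pass: add the idct output to the prediction and clamp.
 * This pass, and the int16_t buffer feeding it, are what the patch
 * removes. */
static void add_residual_4x4(const int16_t *diff, uint8_t *dest,
                             int stride) {
  int r, c;
  for (r = 0; r < 4; r++, dest += stride, diff += 4)
    for (c = 0; c < 4; c++)
      dest[c] = clip_pixel(dest[c] + diff[c]);
}

/* New fused store, as in vp9_short_idct4x4_add_c's column loop:
 *   dest[j * dest_stride + i] =
 *       clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) +
 *                  dest[j * dest_stride + i]);
 */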

--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -96,11 +96,15 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[16];
     int16_t test_temp_block[16];
-    int16_t test_output_block[16];
+    uint8_t dst[16], src[16];
 
+    for (int j = 0; j < 16; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     // TODO(Yaowu): this should be converted to a parameterized test
     // to test optimized versions of this function.
@@ -120,10 +124,10 @@
     }
 
     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
 
     for (int j = 0; j < 16; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -393,8 +393,8 @@
 
   int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
   void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
   void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
     struct macroblockd *xd);
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,12 +18,12 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
+  int16_t output[16];
   int a1, b1, c1, d1;
   int16_t *ip = input;
   int16_t *op = output;
-  const int half_pitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
     a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
@@ -37,45 +37,52 @@
     op[3] = (d1 - c1) >> 1;
 
     ip += 4;
-    op += half_pitch;
+    op += 4;
   }
 
   ip = output;
-  op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+    a1 = ip[4 * 0] + ip[4 * 3];
+    b1 = ip[4 * 1] + ip[4 * 2];
+    c1 = ip[4 * 1] - ip[4 * 2];
+    d1 = ip[4 * 0] - ip[4 * 3];
 
 
-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((a1 + b1 + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       ((c1 + d1) >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       ((a1 - b1) >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       ((d1 - c1) >> 1));
 
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;
 
   op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
   op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
 
   ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((ip[0] + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       (ip[0] >> 1));
     ip++;
-    op++;
+    dest++;
   }
 }
 
@@ -82,18 +89,8 @@
 void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
                                  uint8_t *dst_ptr,
                                  int pitch, int stride) {
-  int r, c;
   int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
+  vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);
 }
 
 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
@@ -116,10 +113,9 @@
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -138,22 +134,24 @@
       temp_in[j] = out[j * 4 + i];
     vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
   }
 }
 
@@ -285,8 +283,8 @@
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   const transform_2d IHT_4[] = {
     { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
     { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
@@ -312,10 +310,10 @@
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
-
 static void iadst8_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,11 +11,10 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch) {
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride) {
   if (eob <= 1)
-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
   else
-    xd->inv_txm4x4(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,7 +15,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch);
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -85,9 +85,6 @@
 specialize vp9_intra4x4_predict;
 
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_4x4 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -179,11 +176,11 @@
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4_1
+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_1_add
 
-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4 sse2
+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_add sse2
 
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2
@@ -212,12 +209,12 @@
 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_32x32_add
 
+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht4x4_add
+
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht8x8_add
 
-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht4x4
-
 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16_add
 
@@ -229,12 +226,11 @@
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4_1
-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4
-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_inv_walsh_add
+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_1_add
+
+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_add
 
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -73,7 +73,7 @@
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
 
-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -81,7 +81,6 @@
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
 
   // Rows
@@ -188,14 +187,23 @@
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);
 
-  // Store results
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+#define RECON_AND_STORE4X4(dest, in_x) \
+  {                                                     \
+      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      *(int *)dest = _mm_cvtsi128_si32(d0); \
+      dest += stride; \
+  }
 
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+  input0 = _mm_srli_si128(input2, 8);
+  input1 = _mm_srli_si128(input3, 8);
+
+  RECON_AND_STORE4X4(dest, input2);
+  RECON_AND_STORE4X4(dest, input0);
+  RECON_AND_STORE4X4(dest, input1);
+  RECON_AND_STORE4X4(dest, input3);
 }
 
 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -998,14 +998,10 @@
                  pc->uv_dc_delta_q == 0 &&
                  pc->uv_ac_delta_q == 0;
   if (xd->lossless) {
-    xd->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;
-    xd->inv_txm4x4        = vp9_short_iwalsh4x4;
     xd->itxm_add          = vp9_idct_add_lossless_c;
     xd->itxm_add_y_block  = vp9_idct_add_y_block_lossless_c;
     xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
   } else {
-    xd->inv_txm4x4_1      = vp9_short_idct4x4_1;
-    xd->inv_txm4x4        = vp9_short_idct4x4;
     xd->itxm_add          = vp9_idct_add;
     xd->itxm_add_y_block  = vp9_idct_add_y_block;
     xd->itxm_add_uv_block = vp9_idct_add_uv_block;
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -84,23 +84,6 @@
   }
 }
 
-static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
-                         int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff[c] + dest[c]);
-
-    dest += stride;
-    diff += width;
-  }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 4, 4);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -133,11 +116,8 @@
   if (tx_type == DCT_DCT) {
     vp9_idct_add(input, dest, stride, eob);
   } else {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-    vp9_short_iht4x4(input, output, 4, tx_type);
+    vp9_short_iht4x4_add(input, dest, stride, tx_type);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   }
 }
 
@@ -154,13 +134,9 @@
 }
 
 void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4(input, output, 4 << 1);
+    vp9_short_idct4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
     vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
     ((int *)input)[0] = 0;
@@ -168,26 +144,18 @@
 }
 
 void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4(input, output, 4 << 1);
+  vp9_short_idct4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }
 
 void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
                              int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+    vp9_short_iwalsh4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
-    vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);
+    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
     ((int *)input)[0] = 0;
   }
 }
@@ -194,12 +162,9 @@
 
 void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
                                 int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+  vp9_short_iwalsh4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }
 
 void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,49 +15,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
-  const int width = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));
-  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));
-  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));
-  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  p0 = _mm_srli_si128(p0, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-  dest += stride;
-
-  p2 = _mm_srli_si128(p2, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1207,8 +1207,8 @@
   if (lossless) {
     cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
     cpi->mb.optimize              = 0;
     cpi->common.filter_level      = 0;
     cpi->zbin_mode_boost_enabled  = 0;
@@ -1216,8 +1216,8 @@
   } else {
     cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 }
 
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -53,9 +53,6 @@
   int16_t* const src_diff =
       raster_block_offset_int16(xd, bsize, 0, ib,
                                 x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, bsize, 0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
 
@@ -72,17 +69,15 @@
   if (tx_type != DCT_DCT) {
     vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 4 << bwl, tx_type);
+    vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), dst,
+                         xd->plane[0].dst.stride, tx_type);
   } else {
     x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
+    vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 8 << bwl);
+                                dst, xd->plane[0].dst.stride);
   }
-
-  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
 }
 
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -425,7 +425,6 @@
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
-  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };
 
 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -494,14 +493,9 @@
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
-  int *wip_txfrm_size = args->wip_txfrm_size;
   MACROBLOCKD* const xd = &x->e_mbd;
-  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
                                                        block, ss_txfrm_size);
-  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
-                                                  raster_block,
-                                                  xd->plane[plane].diff);
   uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
                                                  raster_block,
                                                  xd->plane[plane].dst.buf,
@@ -517,7 +511,6 @@
     case TX_32X32:
         vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
                                 block, 16), dst, xd->plane[plane].dst.stride);
-        *wip_txfrm_size = 32;
       break;
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
@@ -529,7 +522,6 @@
                                block, 16), dst, xd->plane[plane].dst.stride,
                                tx_type);
       }
-      *wip_txfrm_size = 16;
       break;
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -541,7 +533,6 @@
                              block, 16), dst, xd->plane[plane].dst.stride,
                              tx_type);
       }
-      *wip_txfrm_size = 8;
       break;
     case TX_4X4:
       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -549,13 +540,13 @@
         // this is like vp9_short_idct4x4 but has a special case around eob<=1
         // which is significant (not just an optimization) for the lossless
         // case.
-        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
-            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
+        vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block],
+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst,
+            xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                         diff, bw, tx_type);
+        vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                             dst, xd->plane[plane].dst.stride, tx_type);
       }
-      *wip_txfrm_size = 4;
       break;
   }
 }
@@ -563,16 +554,15 @@
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};
 
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     xform_quant, &arg);
+  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
 }
 
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};
 
   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
@@ -581,17 +571,13 @@
                     BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
 
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
+  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 }
 
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -598,8 +584,7 @@
                      BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sbuv(x, bsize);
   if (x->optimize)
@@ -606,9 +591,6 @@
     vp9_optimize_init(xd, bsize, &ctx);
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
 }
 
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
@@ -615,27 +597,11 @@
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sb(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
-#if 0
-  foreach_transformed_block(xd, bsize, encode_block, &arg);
 
-  vp9_recon_sb(xd, bsize);
-#else
-  // wip version... will use foreach_transformed_block when done
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
-  wip_txfrm_size = 0;
-
-  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
-#endif
+  foreach_transformed_block(xd, bsize, encode_block, &arg);
 }
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1129,11 +1129,11 @@
 
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -592,11 +592,6 @@
                                 BLOCK_SIZE_SB8X8,
                                 0, ib,
                                 x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd,
-                                BLOCK_SIZE_SB8X8,
-                                0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
       raster_block_offset_uint8(xd,
@@ -668,18 +663,18 @@
   xd->mode_info_context->bmi[ib].as_mode.first =
     (B_PREDICTION_MODE)(*best_mode);
 
-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
-  else
-    xd->inv_txm4x4(best_dqcoeff, diff, 16);
-
   vp9_intra4x4_predict(xd, ib,
                        BLOCK_SIZE_SB8X8,
                        *best_mode,
                        dst, xd->plane[0].dst.stride);
-  vp9_recon_b(dst, diff, 8,
-              dst, xd->plane[0].dst.stride);
+
+  // inverse transform
+  if (best_tx_type != DCT_DCT) {
+    vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride,
+                           best_tx_type);
+  } else {
+    xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride);
+  }
 
   return best_rd;
 }