shithub: libvpx

Download patch

ref: e60478d46d9a692e2e7b90b35355660682bfe58b
parent: 5d65614fdda553a7016d94848ee0564c41c9a5b1
author: Yunqing Wang <yunqingwang@google.com>
date: Fri Nov 9 12:50:13 EST 2012

Optimize 8x8 dequant and idct

Similar to 16x16 dequant and idct, based on the value of eobs, the
8x8 dequant and idct calculation was simplified to improve decoder
performance.

Combined vp9_dequant_idct_add_8x8 and vp9_dequant_dc_idct_add_8x8
to eliminate duplicate code.

Change-Id: Ia58e50ab27f7012b7379c495837c9c0b5ba9cf7f

--- a/vp9/common/generic/systemdependent.c
+++ b/vp9/common/generic/systemdependent.c
@@ -29,10 +29,11 @@
   rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
   rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
   rtcd->idct.idct8        = vp9_short_idct8x8_c;
+  rtcd->idct.idct10_8     = vp9_short_idct10_8x8_c;
   rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
   rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
   rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
-  rtcd->idct.idct10_16x16    = vp9_short_idct10_16x16_c;
+  rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;
 
   rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
   rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
--- a/vp9/common/idct.h
+++ b/vp9/common/idct.h
@@ -60,6 +60,11 @@
 #endif
 extern prototype_idct(vp9_idct_idct8);
 
+#ifndef vp9_idct_idct10_8
+#define vp9_idct_idct10_8 vp9_short_idct10_8x8_c
+#endif
+extern prototype_idct(vp9_idct_idct10_8);
+
 #ifndef vp9_idct_idct8_1
 #define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
 #endif
@@ -132,6 +137,7 @@
   vp9_second_order_fn_t iwalsh16;
 
   vp9_idct_fn_t            idct8;
+  vp9_idct_fn_t            idct10_8;
   vp9_idct_fn_t            idct8_1;
   vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
   vp9_idct_fn_t ihaar2;
--- a/vp9/common/idctllm.c
+++ b/vp9/common/idctllm.c
@@ -967,6 +967,127 @@
   }
 }
 
+/* Row IDCT for rows with only the first 4 coefficients non-zero; in place. */
+static void idctrow10(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut: blk[1..7] all test as zero, so every output is blk[0] << 3 */
+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
+           = blk[5] = blk[6] = blk[7] = blk[0] << 3;
+    return;
+  }
+
+  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */
+  /* first stage: reads only x4 (blk[1]) and x7 (blk[3]); x5/x6 overwritten */
+  x5 = W7 * x4;
+  x4 = W1 * x4;
+  x6 = W3 * x7;
+  x7 = -W5 * x7;
+
+  /* second stage: reads x3 (blk[2]); the loaded x1/x2 are overwritten here */
+  x2 = W6 * x3;
+  x3 = W2 * x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage: sums/differences; x2 and x4 scaled by 181/256 with rounding */
+  x7 = x0 + x3;
+  x8 = x0 - x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage: final sums/differences, scaled down by >> 8, into blk[] */
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
+}
+
+/* Column IDCT for columns with only rows 0..3 non-zero; in place, stride 8. */
+static void idctcol10(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut: rows 1..7 test as zero -> all outputs (blk[8 * 0] + 32) >> 6 */
+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
+        (x7 = blk[8 * 3]))) {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
+        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
+        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
+    return;
+  }
+
+  x0 = (blk[8 * 0] << 8) + 16384;
+
+  /* first stage: reads only x4 (row 1) and x7 (row 3); products rounded >> 3 */
+  x5 = (W7 * x4 + 4) >> 3;
+  x4 = (W1 * x4 + 4) >> 3;
+  x6 = (W3 * x7 + 4) >> 3;
+  x7 = (-W5 * x7 + 4) >> 3;
+
+  /* second stage: reads x3 (row 2); the loaded x1/x2 are overwritten here */
+  x2 = (W6 * x3 + 4) >> 3;
+  x3 = (W2 * x3 + 4) >> 3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage: sums/differences; x2 and x4 scaled by 181/256 with rounding */
+  x7 = x0 + x3;
+  x8 = x0 - x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage: final sums/differences, scaled down by >> 14, into blk[] */
+  blk[8 * 0] = (x7 + x1) >> 14;
+  blk[8 * 1] = (x3 + x2) >> 14;
+  blk[8 * 2] = (x0 + x4) >> 14;
+  blk[8 * 3] = (x8 + x6) >> 14;
+  blk[8 * 4] = (x8 - x6) >> 14;
+  blk[8 * 5] = (x0 - x4) >> 14;
+  blk[8 * 6] = (x3 - x2) >> 14;
+  blk[8 * 7] = (x7 - x1) >> 14;
+}
+/* 8x8 IDCT assuming non-zero coefficients only in the upper-left 4x4 area. */
+void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {
+  int X[TX_DIM * TX_DIM];       /* int workspace shared by both passes */
+  int i, j;
+  int shortpitch = pitch >> 1;  /* row step used when indexing block[] */
+
+  for (i = 0; i < TX_DIM; i++) {  /* load X: (c + 1 + (c < 0)) >> 2 per coef */
+    for (j = 0; j < TX_DIM; j++) {
+      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1
+                                + (coefs[i * TX_DIM + j] < 0)) >> 2;
+    }
+  }
+
+  /* Only the first 4 rows get a row pass: non-zero dct coefficients are all
+   * in the upper-left 4x4 area.  The "8 * i" step assumes TX_DIM == 8. */
+  for (i = 0; i < 4; i++)
+    idctrow10(X + 8 * i);
+
+  for (i = 0; i < 8; i++)  /* column pass over all 8 columns */
+    idctcol10(X + i);
+
+  for (i = 0; i < TX_DIM; i++) {  /* store X >> 1 into block[] */
+    for (j = 0; j < TX_DIM; j++) {
+      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;
+    }
+  }
+}
 
 void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
   int i;
--- a/vp9/common/rtcd_defs.sh
+++ b/vp9/common/rtcd_defs.sh
@@ -57,11 +57,8 @@
 prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
 specialize vp9_dequant_idct_add_16x16
 
-prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
 specialize vp9_dequant_idct_add_8x8
-
-prototype void vp9_dequant_dc_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
-specialize vp9_dequant_dc_idct_add_8x8
 
 prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
 specialize vp9_dequant_idct_add
--- a/vp9/decoder/decodframe.c
+++ b/vp9/decoder/decodframe.c
@@ -442,7 +442,8 @@
           vp9_ht_dequant_idct_add_8x8_c(tx_type,
                                         q, dq, pre, dst, 16, stride);
         } else {
-          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0,
+                                     xd->eobs[idx]);
         }
         q += 64;
       } else {
--- a/vp9/decoder/dequantize.c
+++ b/vp9/decoder/dequantize.c
@@ -19,8 +19,8 @@
 extern int dec_debug;
 #endif
 
-static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,
-                  int stride, int width, int height) {
+static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
+                         uint8_t *dest, int stride, int width, int height) {
   int r, c;
 
   for (r = 0; r < height; r++) {
@@ -41,12 +41,34 @@
   }
 }
 
+static void add_constant_residual(const int16_t diff, const uint8_t *pred,
+                                  int pitch, uint8_t *dest, int stride,
+                                  int width, int height) {
+  int r, c;
+  /* Write clamp(pred + diff, 0, 255) to dest for a width x height area. */
+  for (r = 0; r < height; r++) {
+    for (c = 0; c < width; c++) {
+      int a = diff + pred[c];  /* same diff is added to every pixel */
+
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+
+      dest[c] = (uint8_t) a;
+    }
+
+    dest += stride;  /* stride: step between rows of dest */
+    pred += pitch;   /* pitch: step between rows of pred */
+  }
+}
+
 void vp9_dequantize_b_c(BLOCKD *d) {
 
   int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
+  int16_t *DQ  = d->dqcoeff;
+  int16_t *Q   = d->qcoeff;
+  int16_t *DQC = d->dequant;
 
   for (i = 0; i < 16; i++) {
     DQ[i] = Q[i] * DQC[i];
@@ -54,11 +76,11 @@
 }
 
 
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,
-                               unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
+                               uint8_t *pred, uint8_t *dest,
                                int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
   int i;
 
   for (i = 0; i < 16; i++) {
@@ -69,18 +91,15 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
-                                   unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,
+                                   uint8_t *pred, uint8_t *dest,
                                    int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int b, r, c;
+  int16_t output[64];
+  int16_t *diff_ptr = output;
   int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
 
   input[0] = dq[0] * input[0];
   for (i = 1; i < 64; i++) {
@@ -91,35 +110,13 @@
 
   vpx_memset(input, 0, 128);
 
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    // shift buffer pointers to next 4x4 block in the submacroblock
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;
-  }
+  add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
 }
 
-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
-                            unsigned char *dest, int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+void vp9_dequant_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                            uint8_t *dest, int pitch, int stride) {
+  int16_t output[16];
+  int16_t *diff_ptr = output;
   int i;
 
   for (i = 0; i < 16; i++) {
@@ -131,17 +128,17 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
-                               unsigned char *dest, int pitch, int stride,
+void vp9_dequant_dc_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                               uint8_t *dest, int pitch, int stride,
                                int Dc) {
   int i;
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
 
-  input[0] = (short)Dc;
+  input[0] = (int16_t)Dc;
 
   for (i = 1; i < 16; i++) {
     input[i] = dq[i] * input[i];
@@ -152,15 +149,15 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
 #if CONFIG_LOSSLESS
-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
+void vp9_dequant_idct_add_lossless_c(int16_t *input, int16_t *dq,
+                                     uint8_t *pred, uint8_t *dest,
                                      int pitch, int stride) {
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
   int i;
 
   for (i = 0; i < 16; i++) {
@@ -171,18 +168,18 @@
 
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 
-void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,
-                                        unsigned char *pred,
-                                        unsigned char *dest,
+void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, int16_t *dq,
+                                        uint8_t *pred,
+                                        uint8_t *dest,
                                         int pitch, int stride, int dc) {
   int i;
-  short output[16];
-  short *diff_ptr = output;
+  int16_t output[16];
+  int16_t *diff_ptr = output;
 
-  input[0] = (short)dc;
+  input[0] = (int16_t)dc;
 
   for (i = 1; i < 16; i++) {
     input[i] = dq[i] * input[i];
@@ -191,18 +188,18 @@
   vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);
 }
 #endif
 
 void vp9_dequantize_b_2x2_c(BLOCKD *d) {
   int i;
-  short *DQ  = d->dqcoeff;
-  short *Q   = d->qcoeff;
-  short *DQC = d->dequant;
+  int16_t *DQ  = d->dqcoeff;
+  int16_t *Q   = d->qcoeff;
+  int16_t *DQC = d->dequant;
 
   for (i = 0; i < 16; i++) {
-    DQ[i] = (short)((Q[i] * DQC[i]));
+    DQ[i] = (int16_t)((Q[i] * DQC[i]));
   }
 #ifdef DEC_DEBUG
   if (dec_debug) {
@@ -216,14 +213,12 @@
 #endif
 }
 
-void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                unsigned char *dest, int pitch, int stride) {
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
+void vp9_dequant_idct_add_8x8_c(int16_t *input, int16_t *dq, uint8_t *pred,
+                                uint8_t *dest, int pitch, int stride,
+                                int dc, uint16_t eobs) {
+  int16_t output[64];
+  int16_t *diff_ptr = output;
   int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
 
 #ifdef DEC_DEBUG
   if (dec_debug) {
@@ -236,104 +231,60 @@
   }
 #endif
 
-  input[0] = input[0] * dq[0];
+  /* If dc is 1, input[0] is already the reconstructed value and does not
+   * need dequantization. Also, when dc is 1, dc is counted in eobs, namely
+   * eobs >= 1. */
+  if (!dc)
+    input[0] *= dq[0];
 
-  // recover quantizer for 4 4x4 blocks
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Input DQ 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
+  /* The calculation can be simplified if there are not many non-zero dct
+   * coefficients. Use eobs to decide what to do.
+   * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+   * Combine that with code here.
+   */
+  if (eobs == 0) {
+    /* All DCT coefficients are zero. */
+    vp9_copy_mem8x8(pred, pitch, dest, stride);
+  } else if (eobs == 1) {
+    /* DC only DCT coefficient. */
+    int16_t out;
 
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
-    printf("Output 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", output[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
+    /* Note: the idct1 will need to be modified accordingly whenever
+     * vp9_short_idct8x8_c() is modified. */
+    out = (input[0] + 1 + (input[0] < 0)) >> 2;
+    out = out << 3;
+    out = (out + 32) >> 7;
 
-  vpx_memset(input, 0, 128);// test what should i put here
+    input[0] = 0;
 
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
+    add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
+  } else if (eobs <= 10) {
+    input[1] = input[1] * dq[1];
+    input[2] = input[2] * dq[1];
+    input[3] = input[3] * dq[1];
+    input[8] = input[8] * dq[1];
+    input[9] = input[9] * dq[1];
+    input[10] = input[10] * dq[1];
+    input[16] = input[16] * dq[1];
+    input[17] = input[17] * dq[1];
+    input[24] = input[24] * dq[1];
 
-        if (a < 0)
-          a = 0;
+    vp9_short_idct10_8x8_c(input, output, 16);
 
-        if (a > 255)
-          a = 255;
+    input[0] = input[1] = input[2] = input[3] = 0;
+    input[8] = input[9] = input[10] = 0;
+    input[16] = input[17] = 0;
+    input[24] = 0;
 
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
+  } else {
+    // recover quantizer for 4 4x4 blocks
+    for (i = 1; i < 64; i++) {
+      input[i] = input[i] * dq[1];
     }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
 #ifdef DEC_DEBUG
   if (dec_debug) {
-    int k, j;
-    printf("Final 8x8\n");
-    for (j = 0; j < 8; j++) {
-      for (k = 0; k < 8; k++) {
-        printf("%d ", origdest[k]);
-      }
-      printf("\n");
-      origdest += stride;
-    }
-  }
-#endif
-}
-
-void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,
-                                   unsigned char *dest, int pitch, int stride,
-                                   int Dc) { // Dc for 1st order T in some rear case
-  short output[64];
-  short *diff_ptr = output;
-  int r, c, b;
-  int i;
-  unsigned char *origdest = dest;
-  unsigned char *origpred = pred;
-
-  input[0] = (short)Dc;// Dc is the reconstructed value, do not need dequantization
-  // dc value is recovered after dequantization, since dc need not quantization
-#ifdef DEC_DEBUG
-  if (dec_debug) {
     int j;
-    printf("Input 8x8\n");
-    for (j = 0; j < 64; j++) {
-      printf("%d ", input[j]);
-      if (j % 8 == 7) printf("\n");
-    }
-  }
-#endif
-  for (i = 1; i < 64; i++) {
-    input[i] = input[i] * dq[1];
-  }
-
-#ifdef DEC_DEBUG
-  if (dec_debug) {
-    int j;
     printf("Input DQ 8x8\n");
     for (j = 0; j < 64; j++) {
       printf("%d ", input[j]);
@@ -342,8 +293,8 @@
   }
 #endif
 
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct8x8_c(input, output, 16);
+    // the idct halves ( >> 1) the pitch
+    vp9_short_idct8x8_c(input, output, 16);
 #ifdef DEC_DEBUG
   if (dec_debug) {
     int j;
@@ -354,30 +305,11 @@
     }
   }
 #endif
-  vpx_memset(input, 0, 128);
 
-  for (b = 0; b < 4; b++) {
-    for (r = 0; r < 4; r++) {
-      for (c = 0; c < 4; c++) {
-        int a = diff_ptr[c] + pred[c];
+    vpx_memset(input, 0, 128);
 
-        if (a < 0)
-          a = 0;
+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);
 
-        if (a > 255)
-          a = 255;
-
-        dest[c] = (unsigned char) a;
-      }
-
-      dest += stride;
-      diff_ptr += 8;
-      pred += pitch;
-    }
-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;
-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;
-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;
-  }
 #ifdef DEC_DEBUG
   if (dec_debug) {
     int k, j;
@@ -391,13 +323,14 @@
     }
   }
 #endif
+  }
 }
 
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,
-                                     unsigned char *pred, unsigned char *dest,
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
+                                     int16_t *dq, uint8_t *pred, uint8_t *dest,
                                      int pitch, int stride) {
-  short output[256];
-  short *diff_ptr = output;
+  int16_t output[256];
+  int16_t *diff_ptr = output;
   int i;
 
   input[0]= input[0] * dq[0];
@@ -414,7 +347,7 @@
 
   vpx_memset(input, 0, 512);
 
-  recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+  add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
 }
 
 void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,
@@ -422,7 +355,7 @@
                                   uint16_t eobs) {
   int16_t output[256];
   int16_t *diff_ptr = output;
-  int r, c, i;
+  int i;
 
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
@@ -433,6 +366,8 @@
     /* DC only DCT coefficient. */
     int16_t out;
 
+    /* Note: the idct1 will need to be modified accordingly whenever
+     * vp9_short_idct16x16_c() is modified. */
     out = (input[0] * dq[0] + 2) >> 2;
     out = (out + 2) >> 2;
     out = (out + 4) >> 3;
@@ -439,22 +374,7 @@
 
     input[0] = 0;
 
-    for (r = 0; r < 16; r++) {
-      for (c = 0; c < 16; c++) {
-        int a = out + pred[c];
-
-        if (a < 0)
-          a = 0;
-        else if (a > 255)
-          a = 255;
-
-        dest[c] = (uint8_t) a;
-      }
-
-      dest += stride;
-      pred += pitch;
-    }
-
+    add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
   } else if (eobs <= 10) {
     input[0]= input[0] * dq[0];
     input[1] = input[1] * dq[1];
@@ -475,7 +395,7 @@
     input[32] = input[33] = 0;
     input[48] = 0;
 
-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
   } else {
     input[0]= input[0] * dq[0];
 
@@ -488,6 +408,6 @@
 
     vpx_memset(input, 0, 512);
 
-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);
+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
   }
 }
--- a/vp9/decoder/idct_blk.c
+++ b/vp9/decoder/idct_blk.c
@@ -177,12 +177,21 @@
                                            int stride, unsigned short *eobs,
                                            short *dc,
                                            MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
-                                dst + 8 * stride, 16, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
-                                dst + 8 * stride + 8, 16, stride, dc[8]);
+  q[0] = dc[0];
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);
+
+  q[64] = dc[1];
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, 1,
+                             xd->eobs[4]);
+
+  q[128] = dc[4];
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,
+                                dst + 8 * stride, 16, stride, 1, xd->eobs[8]);
+
+  q[192] = dc[8];
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,
+                                dst + 8 * stride + 8, 16, stride, 1,
+                                xd->eobs[12]);
 }
 
 #if CONFIG_SUPERBLOCKS
@@ -191,13 +200,22 @@
                                                    int stride,
                                                    unsigned short *eobs,
                                                    short *dc, MACROBLOCKD *xd) {
-  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,
-                                dst + 8, stride, stride, dc[1]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
-                                dst + 8 * stride, stride, stride, dc[4]);
-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
-                                dst + 8 * stride + 8, stride, stride, dc[8]);
+  q[0] = dc[0];
+  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);
+
+  q[64] = dc[1];
+  vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,
+                                dst + 8, stride, stride, 1, xd->eobs[4]);
+
+  q[128] = dc[4];
+  vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,
+                                dst + 8 * stride, stride, stride, 1,
+                                xd->eobs[8]);
+
+  q[192] = dc[8];
+  vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,
+                                dst + 8 * stride + 8, stride, stride, 1,
+                                xd->eobs[12]);
 }
 #endif
 
@@ -209,13 +227,14 @@
   unsigned char *origdest = dst;
   unsigned char *origpred = pre;
 
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);
   vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
-                             origdest + 8, 16, stride);
+                             origdest + 8, 16, stride, 0, xd->eobs[4]);
   vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,
-                             origdest + 8 * stride, 16, stride);
+                             origdest + 8 * stride, 16, stride, 0, xd->eobs[8]);
   vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,
-                             origdest + 8 * stride + 8, 16, stride);
+                             origdest + 8 * stride + 8, 16, stride, 0,
+                             xd->eobs[12]);
 }
 
 void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,
@@ -224,12 +243,12 @@
                                          unsigned char *dstv,
                                          int stride, unsigned short *eobs,
                                          MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);
 
   q    += 64;
   pre  += 64;
 
-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);
 }
 
 #if CONFIG_SUPERBLOCKS
@@ -239,11 +258,12 @@
                                                  int stride,
                                                  unsigned short *eobs,
                                                  MACROBLOCKD *xd) {
-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,
+                             xd->eobs[16]);
 
-  q    += 64;
-
-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+  q += 64;
+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,
+                             xd->eobs[20]);
 }
 #endif