shithub: libvpx

--- a/vp9/common/generic/systemdependent.c

+++ b/vp9/common/generic/systemdependent.c

@@ -29,10 +29,11 @@

   rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;

   rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;

   rtcd->idct.idct8        = vp9_short_idct8x8_c;

+  rtcd->idct.idct10_8     = vp9_short_idct10_8x8_c;

   rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;

   rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;

   rtcd->idct.idct16x16    = vp9_short_idct16x16_c;

-  rtcd->idct.idct10_16x16    = vp9_short_idct10_16x16_c;

+  rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;

   rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;

   rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;

--- a/vp9/common/idct.h

+++ b/vp9/common/idct.h

@@ -60,6 +60,11 @@

 #endif

 extern prototype_idct(vp9_idct_idct8);

+#ifndef vp9_idct_idct10_8

+#define vp9_idct_idct10_8 vp9_short_idct10_8x8_c

+#endif

+extern prototype_idct(vp9_idct_idct10_8);

 #ifndef vp9_idct_idct8_1

 #define vp9_idct_idct8_1 vp9_short_idct8x8_1_c

 #endif

@@ -132,6 +137,7 @@

   vp9_second_order_fn_t iwalsh16;

   vp9_idct_fn_t            idct8;

+  vp9_idct_fn_t            idct10_8;

   vp9_idct_fn_t            idct8_1;

   vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;

   vp9_idct_fn_t ihaar2;

--- a/vp9/common/idctllm.c

+++ b/vp9/common/idctllm.c

@@ -967,6 +967,127 @@

+/* Row IDCT on blk[0..7], in place, when only first 4 coefficients are non-zero. */

+static void idctrow10(int *blk) {

+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

+  /* shortcut: all of blk[1..7] zero, so every output equals blk[0] << 3 */

+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |

+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {

+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]

+           = blk[5] = blk[6] = blk[7] = blk[0] << 3;

+    return;

+  }

+  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */

+  /* first stage: uses only blk[1] (x4) and blk[3] (x7); x5, x6 are overwritten */

+  x5 = W7 * x4;

+  x4 = W1 * x4;

+  x6 = W3 * x7;

+  x7 = -W5 * x7;

+  /* second stage: uses only blk[2] (x3); x2, x1 are overwritten */

+  x2 = W6 * x3;

+  x3 = W2 * x3;

+  x1 = x4 + x6;

+  x4 -= x6;

+  x6 = x5 + x7;

+  x5 -= x7;

+  /* third stage: merge with x0; (181 * v + 128) >> 8 is a rounded scale by 181/256 */

+  x7 = x0 + x3;

+  x8 = x0 - x3;

+  x3 = x0 + x2;

+  x0 -= x2;

+  x2 = (181 * (x4 + x5) + 128) >> 8;

+  x4 = (181 * (x4 - x5) + 128) >> 8;

+  /* fourth stage: final butterflies, each result shifted right by 8 */

+  blk[0] = (x7 + x1) >> 8;

+  blk[1] = (x3 + x2) >> 8;

+  blk[2] = (x0 + x4) >> 8;

+  blk[3] = (x8 + x6) >> 8;

+  blk[4] = (x8 - x6) >> 8;

+  blk[5] = (x0 - x4) >> 8;

+  blk[6] = (x3 - x2) >> 8;

+  blk[7] = (x7 - x1) >> 8;

+}

+/* Column (vertical) IDCT on blk[8 * k], k = 0..7, in place, when only first 4 coefficients are non-zero. */

+static void idctcol10(int *blk) {

+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

+  /* shortcut: entries 1..7 of this column all zero, so every output is (blk[0] + 32) >> 6 */

+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |

+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |

+        (x7 = blk[8 * 3]))) {

+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]

+        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]

+        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);

+    return;

+  }

+  x0 = (blk[8 * 0] << 8) + 16384;  /* bias ends up in every output before the final >> 14 */

+  /* first stage: uses only blk[8 * 1] (x4) and blk[8 * 3] (x7); each product is (v + 4) >> 3 */

+  x5 = (W7 * x4 + 4) >> 3;

+  x4 = (W1 * x4 + 4) >> 3;

+  x6 = (W3 * x7 + 4) >> 3;

+  x7 = (-W5 * x7 + 4) >> 3;

+  /* second stage: uses only blk[8 * 2] (x3); x2, x1 are overwritten */

+  x2 = (W6 * x3 + 4) >> 3;

+  x3 = (W2 * x3 + 4) >> 3;

+  x1 = x4 + x6;

+  x4 -= x6;

+  x6 = x5 + x7;

+  x5 -= x7;

+  /* third stage: merge with x0; (181 * v + 128) >> 8 is a rounded scale by 181/256 */

+  x7 = x0 + x3;

+  x8 = x0 - x3;

+  x3 = x0 + x2;

+  x0 -= x2;

+  x2 = (181 * (x4 + x5) + 128) >> 8;

+  x4 = (181 * (x4 - x5) + 128) >> 8;

+  /* fourth stage: final butterflies, each result shifted right by 14 */

+  blk[8 * 0] = (x7 + x1) >> 14;

+  blk[8 * 1] = (x3 + x2) >> 14;

+  blk[8 * 2] = (x0 + x4) >> 14;

+  blk[8 * 3] = (x8 + x6) >> 14;

+  blk[8 * 4] = (x8 - x6) >> 14;

+  blk[8 * 5] = (x0 - x4) >> 14;

+  blk[8 * 6] = (x3 - x2) >> 14;

+  blk[8 * 7] = (x7 - x1) >> 14;

+}

+void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {  /* reduced 8x8 inverse DCT */

+  int X[TX_DIM * TX_DIM];  /* int scratch: scaled coefs in, row/column IDCT done in place */

+  int i, j;

+  int shortpitch = pitch >> 1;  /* row stride used to index block[] (pitch halved) */

+  for (i = 0; i < TX_DIM; i++) {

+    for (j = 0; j < TX_DIM; j++) {

+      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1

+                                + (coefs[i * TX_DIM + j] < 0)) >> 2;  /* (c + 1 + (c < 0)) >> 2 */

+    }

+  }

+  /* Run the row IDCT on the first 4 rows only, since non-zero DCT coefficients

+   * are assumed to be all in the upper-left 4x4 area. */

+  for (i = 0; i < 4; i++)

+    idctrow10(X + 8 * i);

+  for (i = 0; i < 8; i++)  /* column pass covers all 8 columns */

+    idctcol10(X + i);

+  for (i = 0; i < TX_DIM; i++) {

+    for (j = 0; j < TX_DIM; j++) {

+      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;  /* halve on store */

+    }

+  }

+}

 void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {

   int i;

--- a/vp9/common/rtcd_defs.sh

+++ b/vp9/common/rtcd_defs.sh

@@ -57,11 +57,8 @@

 prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"

 specialize vp9_dequant_idct_add_16x16

-prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"

+prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"

 specialize vp9_dequant_idct_add_8x8

-prototype void vp9_dequant_dc_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"

-specialize vp9_dequant_dc_idct_add_8x8

 prototype void vp9_dequant_idct_add "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"

 specialize vp9_dequant_idct_add

--- a/vp9/decoder/decodframe.c

+++ b/vp9/decoder/decodframe.c

@@ -442,7 +442,8 @@

           vp9_ht_dequant_idct_add_8x8_c(tx_type,

                                         q, dq, pre, dst, 16, stride);

         } else {

-          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);

+          vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0,

+                                     xd->eobs[idx]);

         q += 64;

       } else {

--- a/vp9/decoder/dequantize.c

+++ b/vp9/decoder/dequantize.c

@@ -19,8 +19,8 @@

 extern int dec_debug;

 #endif

-static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,

-                  int stride, int width, int height) {

+static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,

+                         uint8_t *dest, int stride, int width, int height) {

   int r, c;

   for (r = 0; r < height; r++) {

@@ -41,12 +41,34 @@

+static void add_constant_residual(const int16_t diff, const uint8_t *pred,

+                                  int pitch, uint8_t *dest, int stride,

+                                  int width, int height) {

+  int r, c;  /* dest[r][c] = clamp(diff + pred[r][c], 0, 255) over the whole area */

+  for (r = 0; r < height; r++) {

+    for (c = 0; c < width; c++) {

+      int a = diff + pred[c];  /* the same diff is added to every pixel */

+      if (a < 0)

+        a = 0;

+      else if (a > 255)

+        a = 255;

+      dest[c] = (uint8_t) a;  /* a is already within 0..255 here */

+    }

+    dest += stride;  /* next output row */

+    pred += pitch;  /* next predictor row */

+  }

+}

 void vp9_dequantize_b_c(BLOCKD *d) {

   int i;

-  short *DQ  = d->dqcoeff;

-  short *Q   = d->qcoeff;

-  short *DQC = d->dequant;

+  int16_t *DQ  = d->dqcoeff;

+  int16_t *Q   = d->qcoeff;

+  int16_t *DQC = d->dequant;

   for (i = 0; i < 16; i++) {

     DQ[i] = Q[i] * DQC[i];

@@ -54,11 +76,11 @@

-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, short *dq,

-                               unsigned char *pred, unsigned char *dest,

+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,

+                               uint8_t *pred, uint8_t *dest,

                                int pitch, int stride) {

-  short output[16];

-  short *diff_ptr = output;

+  int16_t output[16];

+  int16_t *diff_ptr = output;

   int i;

   for (i = 0; i < 16; i++) {

@@ -69,18 +91,15 @@

   vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,

-                                   unsigned char *pred, unsigned char *dest,

+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, int16_t *dq,

+                                   uint8_t *pred, uint8_t *dest,

                                    int pitch, int stride) {

-  short output[64];

-  short *diff_ptr = output;

-  int b, r, c;

+  int16_t output[64];

+  int16_t *diff_ptr = output;

   int i;

-  unsigned char *origdest = dest;

-  unsigned char *origpred = pred;

   input[0] = dq[0] * input[0];

   for (i = 1; i < 64; i++) {

@@ -91,35 +110,13 @@

   vpx_memset(input, 0, 128);

-  for (b = 0; b < 4; b++) {

-    for (r = 0; r < 4; r++) {

-      for (c = 0; c < 4; c++) {

-        int a = diff_ptr[c] + pred[c];

-        if (a < 0)

-          a = 0;

-        if (a > 255)

-          a = 255;

-        dest[c] = (unsigned char) a;

-      }

-      dest += stride;

-      diff_ptr += 8;

-      pred += pitch;

-    }

-    // shift buffer pointers to next 4x4 block in the submacroblock

-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + ((b + 1) % 2) * 4;

-    dest = origdest + (b + 1) / 2 * 4 * stride + ((b + 1) % 2) * 4;

-    pred = origpred + (b + 1) / 2 * 4 * pitch + ((b + 1) % 2) * 4;

-  }

+  add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);

-void vp9_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,

-                            unsigned char *dest, int pitch, int stride) {

-  short output[16];

-  short *diff_ptr = output;

+void vp9_dequant_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,

+                            uint8_t *dest, int pitch, int stride) {

+  int16_t output[16];

+  int16_t *diff_ptr = output;

   int i;

   for (i = 0; i < 16; i++) {

@@ -131,17 +128,17 @@

   vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

-void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,

-                               unsigned char *dest, int pitch, int stride,

+void vp9_dequant_dc_idct_add_c(int16_t *input, int16_t *dq, uint8_t *pred,

+                               uint8_t *dest, int pitch, int stride,

                                int Dc) {

   int i;

-  short output[16];

-  short *diff_ptr = output;

+  int16_t output[16];

+  int16_t *diff_ptr = output;

-  input[0] = (short)Dc;

+  input[0] = (int16_t)Dc;

   for (i = 1; i < 16; i++) {

     input[i] = dq[i] * input[i];

@@ -152,15 +149,15 @@

   vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

 #if CONFIG_LOSSLESS

-void vp9_dequant_idct_add_lossless_c(short *input, short *dq,

-                                     unsigned char *pred, unsigned char *dest,

+void vp9_dequant_idct_add_lossless_c(int16_t *input, int16_t *dq,

+                                     uint8_t *pred, uint8_t *dest,

                                      int pitch, int stride) {

-  short output[16];

-  short *diff_ptr = output;

+  int16_t output[16];

+  int16_t *diff_ptr = output;

   int i;

   for (i = 0; i < 16; i++) {

@@ -171,18 +168,18 @@

   vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

-void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,

-                                        unsigned char *pred,

-                                        unsigned char *dest,

+void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, int16_t *dq,

+                                        uint8_t *pred,

+                                        uint8_t *dest,

                                         int pitch, int stride, int dc) {

   int i;

-  short output[16];

-  short *diff_ptr = output;

+  int16_t output[16];

+  int16_t *diff_ptr = output;

-  input[0] = (short)dc;

+  input[0] = (int16_t)dc;

   for (i = 1; i < 16; i++) {

     input[i] = dq[i] * input[i];

@@ -191,18 +188,18 @@

   vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);

   vpx_memset(input, 0, 32);

-  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

 #endif

 void vp9_dequantize_b_2x2_c(BLOCKD *d) {

   int i;

-  short *DQ  = d->dqcoeff;

-  short *Q   = d->qcoeff;

-  short *DQC = d->dequant;

+  int16_t *DQ  = d->dqcoeff;

+  int16_t *Q   = d->qcoeff;

+  int16_t *DQC = d->dequant;

   for (i = 0; i < 16; i++) {

-    DQ[i] = (short)((Q[i] * DQC[i]));

+    DQ[i] = (int16_t)((Q[i] * DQC[i]));

 #ifdef DEC_DEBUG

   if (dec_debug) {

@@ -216,14 +213,12 @@

 #endif

-void vp9_dequant_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,

-                                unsigned char *dest, int pitch, int stride) {

-  short output[64];

-  short *diff_ptr = output;

-  int r, c, b;

+void vp9_dequant_idct_add_8x8_c(int16_t *input, int16_t *dq, uint8_t *pred,

+                                uint8_t *dest, int pitch, int stride,

+                                int dc, uint16_t eobs) {

+  int16_t output[64];

+  int16_t *diff_ptr = output;

   int i;

-  unsigned char *origdest = dest;

-  unsigned char *origpred = pred;

 #ifdef DEC_DEBUG

   if (dec_debug) {

@@ -236,104 +231,60 @@

 #endif

-  input[0] = input[0] * dq[0];

+  /* If dc is 1, then input[0] is the reconstructed value, do not need

+   * dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.

+   */

+  if (!dc)

+    input[0] *= dq[0];

-  // recover quantizer for 4 4x4 blocks

-  for (i = 1; i < 64; i++) {

-    input[i] = input[i] * dq[1];

-  }

-#ifdef DEC_DEBUG

-  if (dec_debug) {

-    int j;

-    printf("Input DQ 8x8\n");

-    for (j = 0; j < 64; j++) {

-      printf("%d ", input[j]);

-      if (j % 8 == 7) printf("\n");

-    }

-  }

-#endif

+  /* The calculation can be simplified if there are not many non-zero dct

+   * coefficients. Use eobs to decide what to do.

+   * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.

+   * Combine that with code here.

+   */

+  if (eobs == 0) {

+    /* All 0 DCT coefficient */

+    vp9_copy_mem8x8(pred, pitch, dest, stride);

+  } else if (eobs == 1) {

+    /* DC only DCT coefficient. */

+    int16_t out;

-  // the idct halves ( >> 1) the pitch

-  vp9_short_idct8x8_c(input, output, 16);

-#ifdef DEC_DEBUG

-  if (dec_debug) {

-    int j;

-    printf("Output 8x8\n");

-    for (j = 0; j < 64; j++) {

-      printf("%d ", output[j]);

-      if (j % 8 == 7) printf("\n");

-    }

-  }

-#endif

+    /* Note: the idct1 will need to be modified accordingly whenever

+     * vp9_short_idct8x8_c() is modified. */

+    out = (input[0] + 1 + (input[0] < 0)) >> 2;

+    out = out << 3;

+    out = (out + 32) >> 7;

-  vpx_memset(input, 0, 128);// test what should i put here

+    input[0] = 0;

-  for (b = 0; b < 4; b++) {

-    for (r = 0; r < 4; r++) {

-      for (c = 0; c < 4; c++) {

-        int a = diff_ptr[c] + pred[c];

+    add_constant_residual(out, pred, pitch, dest, stride, 8, 8);

+  } else if (eobs <= 10) {

+    input[1] = input[1] * dq[1];

+    input[2] = input[2] * dq[1];

+    input[3] = input[3] * dq[1];

+    input[8] = input[8] * dq[1];

+    input[9] = input[9] * dq[1];

+    input[10] = input[10] * dq[1];

+    input[16] = input[16] * dq[1];

+    input[17] = input[17] * dq[1];

+    input[24] = input[24] * dq[1];

-        if (a < 0)

-          a = 0;

+    vp9_short_idct10_8x8_c(input, output, 16);

-        if (a > 255)

-          a = 255;

+    input[0] = input[1] = input[2] = input[3] = 0;

+    input[8] = input[9] = input[10] = 0;

+    input[16] = input[17] = 0;

+    input[24] = 0;

-        dest[c] = (unsigned char) a;

-      }

-      dest += stride;

-      diff_ptr += 8;

-      pred += pitch;

+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);

+  } else {

+    // recover quantizer for 4 4x4 blocks

+    for (i = 1; i < 64; i++) {

+      input[i] = input[i] * dq[1];

-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;

-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;

-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;

-  }

 #ifdef DEC_DEBUG

   if (dec_debug) {

-    int k, j;

-    printf("Final 8x8\n");

-    for (j = 0; j < 8; j++) {

-      for (k = 0; k < 8; k++) {

-        printf("%d ", origdest[k]);

-      }

-      printf("\n");

-      origdest += stride;

-    }

-  }

-#endif

-}

-void vp9_dequant_dc_idct_add_8x8_c(short *input, short *dq, unsigned char *pred,

-                                   unsigned char *dest, int pitch, int stride,

-                                   int Dc) { // Dc for 1st order T in some rear case

-  short output[64];

-  short *diff_ptr = output;

-  int r, c, b;

-  int i;

-  unsigned char *origdest = dest;

-  unsigned char *origpred = pred;

-  input[0] = (short)Dc;// Dc is the reconstructed value, do not need dequantization

-  // dc value is recovered after dequantization, since dc need not quantization

-#ifdef DEC_DEBUG

-  if (dec_debug) {

     int j;

-    printf("Input 8x8\n");

-    for (j = 0; j < 64; j++) {

-      printf("%d ", input[j]);

-      if (j % 8 == 7) printf("\n");

-    }

-  }

-#endif

-  for (i = 1; i < 64; i++) {

-    input[i] = input[i] * dq[1];

-  }

-#ifdef DEC_DEBUG

-  if (dec_debug) {

-    int j;

     printf("Input DQ 8x8\n");

     for (j = 0; j < 64; j++) {

       printf("%d ", input[j]);

@@ -342,8 +293,8 @@

 #endif

-  // the idct halves ( >> 1) the pitch

-  vp9_short_idct8x8_c(input, output, 16);

+    // the idct halves ( >> 1) the pitch

+    vp9_short_idct8x8_c(input, output, 16);

 #ifdef DEC_DEBUG

   if (dec_debug) {

     int j;

@@ -354,30 +305,11 @@

 #endif

-  vpx_memset(input, 0, 128);

-  for (b = 0; b < 4; b++) {

-    for (r = 0; r < 4; r++) {

-      for (c = 0; c < 4; c++) {

-        int a = diff_ptr[c] + pred[c];

+    vpx_memset(input, 0, 128);

-        if (a < 0)

-          a = 0;

+    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);

-        if (a > 255)

-          a = 255;

-        dest[c] = (unsigned char) a;

-      }

-      dest += stride;

-      diff_ptr += 8;

-      pred += pitch;

-    }

-    diff_ptr = output + (b + 1) / 2 * 4 * 8 + (b + 1) % 2 * 4;

-    dest = origdest + (b + 1) / 2 * 4 * stride + (b + 1) % 2 * 4;

-    pred = origpred + (b + 1) / 2 * 4 * pitch + (b + 1) % 2 * 4;

-  }

 #ifdef DEC_DEBUG

   if (dec_debug) {

     int k, j;

@@ -391,13 +323,14 @@

 #endif

+  }

-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input, short *dq,

-                                     unsigned char *pred, unsigned char *dest,

+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,

+                                     int16_t *dq, uint8_t *pred, uint8_t *dest,

                                      int pitch, int stride) {

-  short output[256];

-  short *diff_ptr = output;

+  int16_t output[256];

+  int16_t *diff_ptr = output;

   int i;

   input[0]= input[0] * dq[0];

@@ -414,7 +347,7 @@

   vpx_memset(input, 0, 512);

-  recon(diff_ptr, pred, pitch, dest, stride, 16, 16);

+  add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);

 void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,

@@ -422,7 +355,7 @@

                                   uint16_t eobs) {

   int16_t output[256];

   int16_t *diff_ptr = output;

-  int r, c, i;

+  int i;

   /* The calculation can be simplified if there are not many non-zero dct

    * coefficients. Use eobs to separate different cases. */

@@ -433,6 +366,8 @@

     /* DC only DCT coefficient. */

     int16_t out;

+    /* Note: the idct1 will need to be modified accordingly whenever

+     * vp9_short_idct16x16_c() is modified. */

     out = (input[0] * dq[0] + 2) >> 2;

     out = (out + 2) >> 2;

     out = (out + 4) >> 3;

@@ -439,22 +374,7 @@

     input[0] = 0;

-    for (r = 0; r < 16; r++) {

-      for (c = 0; c < 16; c++) {

-        int a = out + pred[c];

-        if (a < 0)

-          a = 0;

-        else if (a > 255)

-          a = 255;

-        dest[c] = (uint8_t) a;

-      }

-      dest += stride;

-      pred += pitch;

-    }

+    add_constant_residual(out, pred, pitch, dest, stride, 16, 16);

   } else if (eobs <= 10) {

     input[0]= input[0] * dq[0];

     input[1] = input[1] * dq[1];

@@ -475,7 +395,7 @@

     input[32] = input[33] = 0;

     input[48] = 0;

-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);

+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);

   } else {

     input[0]= input[0] * dq[0];

@@ -488,6 +408,6 @@

     vpx_memset(input, 0, 512);

-    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);

+    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);

--- a/vp9/decoder/idct_blk.c

+++ b/vp9/decoder/idct_blk.c

@@ -177,12 +177,21 @@

                                            int stride, unsigned short *eobs,

                                            short *dc,

                                            MACROBLOCKD *xd) {

-  vp9_dequant_dc_idct_add_8x8_c(q, dq, pre, dst, 16, stride, dc[0]);

-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, dc[1]);

-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,

-                                dst + 8 * stride, 16, stride, dc[4]);

-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,

-                                dst + 8 * stride + 8, 16, stride, dc[8]);

+  q[0] = dc[0];

+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);

+  q[64] = dc[1];

+  vp9_dequant_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, 1,

+                             xd->eobs[4]);

+  q[128] = dc[4];

+  vp9_dequant_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,

+                                dst + 8 * stride, 16, stride, 1, xd->eobs[8]);

+  q[192] = dc[8];

+  vp9_dequant_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,

+                                dst + 8 * stride + 8, 16, stride, 1,

+                                xd->eobs[12]);

 #if CONFIG_SUPERBLOCKS

@@ -191,13 +200,22 @@

                                                    int stride,

                                                    unsigned short *eobs,

                                                    short *dc, MACROBLOCKD *xd) {

-  vp9_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);

-  vp9_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8,

-                                dst + 8, stride, stride, dc[1]);

-  vp9_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,

-                                dst + 8 * stride, stride, stride, dc[4]);

-  vp9_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,

-                                dst + 8 * stride + 8, stride, stride, dc[8]);

+  q[0] = dc[0];

+  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);

+  q[64] = dc[1];

+  vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,

+                                dst + 8, stride, stride, 1, xd->eobs[4]);

+  q[128] = dc[4];

+  vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,

+                                dst + 8 * stride, stride, stride, 1,

+                                xd->eobs[8]);

+  q[192] = dc[8];

+  vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,

+                                dst + 8 * stride + 8, stride, stride, 1,

+                                xd->eobs[12]);

 #endif

@@ -209,13 +227,14 @@

   unsigned char *origdest = dst;

   unsigned char *origpred = pre;

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride);

+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);

   vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,

-                             origdest + 8, 16, stride);

+                             origdest + 8, 16, stride, 0, xd->eobs[4]);

   vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,

-                             origdest + 8 * stride, 16, stride);

+                             origdest + 8 * stride, 16, stride, 0, xd->eobs[8]);

   vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,

-                             origdest + 8 * stride + 8, 16, stride);

+                             origdest + 8 * stride + 8, 16, stride, 0,

+                             xd->eobs[12]);

 void vp9_dequant_idct_add_uv_block_8x8_c(short *q, short *dq,

@@ -224,12 +243,12 @@

                                          unsigned char *dstv,

                                          int stride, unsigned short *eobs,

                                          MACROBLOCKD *xd) {

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride);

+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);

   q    += 64;

   pre  += 64;

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);

+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);

 #if CONFIG_SUPERBLOCKS

@@ -239,11 +258,12 @@

                                                  int stride,

                                                  unsigned short *eobs,

                                                  MACROBLOCKD *xd) {

-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);

+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,

+                             xd->eobs[16]);

-  q    += 64;

-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);

+  q += 64;

+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,

+                             xd->eobs[20]);

 #endif

--

⑨