shithub: libvpx

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -336,7 +336,7 @@

   int stride;

};

-struct mb_plane {

+struct macroblockd_plane {

   DECLARE_ALIGNED(16, int16_t,  qcoeff[64 * 64]);

   DECLARE_ALIGNED(16, int16_t,  dqcoeff[64 * 64]);

   DECLARE_ALIGNED(16, uint16_t, eobs[256]);

@@ -356,7 +356,7 @@

   BLOCK_OFFSET((x)->plane[2].field, ((i) - 20), 16))

 typedef struct macroblockd {

-  struct mb_plane plane[MAX_MB_PLANE];

+  struct macroblockd_plane plane[MAX_MB_PLANE];

   /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */

   BLOCKD block[24];

@@ -919,6 +919,18 @@

   for (plane = 1; plane < MAX_MB_PLANE; plane++) {

     foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg);

+}

+/* Returns the element offset of 4x4 block `block` within a raster-ordered

+ * buffer covering `plane` of a `bsize` block. The row stride is

+ * 4 << (b_width_log2(bsize) - subsampling_x) elements, and `block` counts

+ * 4x4 blocks in raster order across that width. Declared INLINE because it

+ * is defined in a header and not every includer calls it. */

+static INLINE int raster_block_offset(const MACROBLOCKD *xd,

+                                      BLOCK_SIZE_TYPE bsize,

+                                      int plane, int block) {

+  /* log2 of the plane width measured in 4x4 blocks */

+  const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x;

+  const int stride = 4 << bw;

+  const int y = 4 * (block >> bw);

+  const int x = 4 * (block & ((1 << bw) - 1));

+  return y * stride + x;

+}

+static int16_t* raster_block_offset_int16(MACROBLOCKD *xd,

+                                         BLOCK_SIZE_TYPE bsize,

+                                         int plane, int block, int16_t *base) {

+  return base + raster_block_offset(xd, bsize, plane, block);

 #if CONFIG_CODE_ZEROGROUP

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -561,12 +561,6 @@

 specialize vp9_block_error mmx sse2

 vp9_block_error_sse2=vp9_block_error_xmm

-prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"

-# TODO(jingning): The prototype function in c has been changed to remove

-# the use of predictor buffer in MACROBLOCKD. Need to modify the mmx and sse2

-# versions accordingly.

-specialize vp9_subtract_b

 # Structured Similarity (SSIM)

--- a/vp9/decoder/vp9_decodframe.c

+++ b/vp9/decoder/vp9_decodframe.c

@@ -305,7 +305,7 @@

 static INLINE void dequant_add_y(MACROBLOCKD *xd, TX_TYPE tx_type, int idx) {

   BLOCKD *const b = &xd->block[idx];

-  struct mb_plane *const y = &xd->plane[0];

+  struct macroblockd_plane *const y = &xd->plane[0];

   if (tx_type != DCT_DCT) {

     vp9_dequant_iht_add_c(tx_type,

                           BLOCK_OFFSET(y->qcoeff, idx, 16),

--- a/vp9/encoder/vp9_asm_enc_offsets.c

+++ b/vp9/encoder/vp9_asm_enc_offsets.c

@@ -20,7 +20,6 @@

 BEGIN

 /* regular quantize */

-DEFINE(vp9_block_coeff,                         offsetof(BLOCK, coeff));

 DEFINE(vp9_block_zbin,                          offsetof(BLOCK, zbin));

 DEFINE(vp9_block_round,                         offsetof(BLOCK, round));

 DEFINE(vp9_block_quant,                         offsetof(BLOCK, quant));

--- a/vp9/encoder/vp9_block.h

+++ b/vp9/encoder/vp9_block.h

@@ -25,7 +25,6 @@

 typedef struct block {

   // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries

-  int16_t *src_diff;

   int16_t *coeff;

   // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries

@@ -89,9 +88,13 @@

   unsigned int frames_with_high_error;

 } PICK_MODE_CONTEXT;

+struct macroblock_plane {

+  DECLARE_ALIGNED(16, int16_t, src_diff[64*64]);

+};

 typedef struct macroblock MACROBLOCK;

 struct macroblock {

-  DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]);

+  struct macroblock_plane plane[MAX_MB_PLANE];

   DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]);

   // 16 Y blocks, 4 U blocks, 4 V blocks,

   BLOCK block[24];

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1734,25 +1734,8 @@

 void vp9_setup_block_ptrs(MACROBLOCK *x) {

-  int r, c;

   int i;

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++)

-      x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;

-  }

-  for (r = 0; r < 2; r++) {

-    for (c = 0; c < 2; c++)

-      x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;

-  }

-  for (r = 0; r < 2; r++) {

-    for (c = 0; c < 2; c++)

-      x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;

-  }

   for (i = 0; i < 24; i++)

     x->block[i].coeff = x->coeff + i * 16;

@@ -2100,14 +2083,6 @@

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

-  const uint8_t *src = x->src.y_buffer;

-  uint8_t *dst = xd->plane[0].dst.buf;

-  const uint8_t *usrc = x->src.u_buffer;

-  uint8_t *udst = xd->plane[1].dst.buf;

-  const uint8_t *vsrc = x->src.v_buffer;

-  uint8_t *vdst = xd->plane[2].dst.buf;

-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->plane[0].dst.stride;

-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->plane[1].dst.stride;

   int n;

   MODE_INFO *mi = x->e_mbd.mode_info_context;

   unsigned int segment_id = mi->mbmi.segment_id;

@@ -2187,10 +2162,7 @@

   if (!x->skip) {

-    vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride,

-                         bsize);

-    vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,

-                          udst, vdst, dst_uv_stride, bsize);

+    vp9_subtract_sb(x, bsize);

     switch (xd->mode_info_context->mbmi.txfm_size) {

       case TX_32X32:

--- a/vp9/encoder/vp9_encodeintra.c

+++ b/vp9/encoder/vp9_encodeintra.c

@@ -37,7 +37,7 @@

-  return vp9_get_mb_ss(x->src_diff);

+  return vp9_get_mb_ss(x->plane[0].src_diff);

 static void encode_intra4x4block(MACROBLOCK *x, int ib) {

@@ -45,6 +45,9 @@

   BLOCK *be = &x->block[ib];

   MACROBLOCKD * const xd = &x->e_mbd;

   TX_TYPE tx_type;

+  int16_t* const src_diff =

+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,

+                                x->plane[0].src_diff);

   assert(ib < 16);

@@ -54,16 +57,18 @@

   vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first,

                        *(b->base_dst) + b->dst, b->dst_stride);

-  vp9_subtract_b(be, b, 16);

+  vp9_subtract_block(4, 4, src_diff, 16,

+                     *(be->base_src) + be->src, be->src_stride,

+                     *(b->base_dst) + b->dst, b->dst_stride);

   tx_type = get_tx_type_4x4(&x->e_mbd, ib);

   if (tx_type != DCT_DCT) {

-    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+    vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type);

     vp9_ht_quantize_b_4x4(x, ib, tx_type);

     vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),

                      b->diff, 16, tx_type);

   } else {

-    x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+    x->fwd_txm4x4(src_diff, be->coeff, 32);

     x->quantize_b_4x4(x, ib, 16);

     vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],

                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),

@@ -86,10 +91,7 @@

   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

   vp9_build_intra_predictors_sby_s(xd, BLOCK_SIZE_MB16X16);

-  vp9_subtract_sby_s_c(x->src_diff,

-                       x->src.y_buffer, x->src.y_stride,

-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride,

-                       BLOCK_SIZE_MB16X16);

+  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);

   switch (tx_size) {

     case TX_16X16:

@@ -123,11 +125,7 @@

   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

   vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_MB16X16);

-  vp9_subtract_sbuv_s_c(x->src_diff,

-                        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,

-                        xd->plane[1].dst.buf, xd->plane[2].dst.buf,

-                        xd->plane[1].dst.stride,

-                        BLOCK_SIZE_MB16X16);

+  vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);

   switch (tx_size) {

     case TX_4X4:

@@ -153,6 +151,9 @@

   MACROBLOCKD *xd = &x->e_mbd;

   BLOCKD *b = &xd->block[ib];

   BLOCK *be = &x->block[ib];

+  int16_t* const src_diff =

+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,

+                                x->plane[0].src_diff);

   const int iblock[4] = {0, 1, 4, 5};

   int i;

   TX_TYPE tx_type;

@@ -160,7 +161,9 @@

   vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first,

                        *(b->base_dst) + b->dst, b->dst_stride);

   // generate residual blocks

-  vp9_subtract_4b_c(be, b, 16);

+  vp9_subtract_block(8, 8, src_diff, 16,

+                     *(be->base_src) + be->src, be->src_stride,

+                     *(b->base_dst) + b->dst, b->dst_stride);

   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {

     int idx = (ib & 0x02) ? (ib + 2) : ib;

@@ -169,12 +172,12 @@

     assert(idx < 16);

     tx_type = get_tx_type_8x8(xd, ib);

     if (tx_type != DCT_DCT) {

-      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);

+      vp9_short_fht8x8(src_diff, (x->block + idx)->coeff, 16, tx_type);

       x->quantize_b_8x8(x, idx, tx_type, 16);

       vp9_short_iht8x8(dqcoeff, xd->block[ib].diff,

                             16, tx_type);

     } else {

-      x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);

+      x->fwd_txm8x8(src_diff, (x->block + idx)->coeff, 32);

       x->quantize_b_8x8(x, idx, DCT_DCT, 16);

       vp9_short_idct8x8(dqcoeff, xd->block[ib].diff, 32);

@@ -182,6 +185,9 @@

     for (i = 0; i < 4; i++) {

       int idx = ib + iblock[i];

       int16_t * const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16);

+      int16_t* const src_diff =

+          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, idx,

+                                    x->plane[0].src_diff);

       assert(idx < 16);

       b = &xd->block[ib + iblock[i]];

@@ -188,12 +194,12 @@

       be = &x->block[ib + iblock[i]];

       tx_type = get_tx_type_4x4(xd, ib + iblock[i]);

       if (tx_type != DCT_DCT) {

-        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+        vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type);

         vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);

         vp9_short_iht4x4(dqcoeff, b->diff, 16, tx_type);

       } else if (!(i & 1) &&

                  get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {

-        x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+        x->fwd_txm8x4(src_diff, be->coeff, 32);

         x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);

         vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],

                                     dqcoeff, b->diff, 32);

@@ -201,7 +207,7 @@

                                     dqcoeff + 16, (b + 1)->diff, 32);

         i++;

       } else {

-        x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+        x->fwd_txm4x4(src_diff, be->coeff, 32);

         x->quantize_b_4x4(x, ib + iblock[i], 16);

         vp9_inverse_transform_b_4x4(xd, xd->plane[0].eobs[ib + iblock[i]],

                                     dqcoeff, b->diff, 32);

@@ -231,14 +237,20 @@

   int16_t * const dqcoeff = MB_SUBBLOCK_FIELD(xd, dqcoeff, ib);

   const int plane = ib < 20 ? 1 : 2;

   const int block = ib < 20 ? ib - 16 : ib - 20;

+  int16_t* const src_diff =

+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, plane, block,

+                                x->plane[plane].src_diff);

   assert(ib >= 16 && ib < 24);

   vp9_intra_uv4x4_predict(&x->e_mbd, b, mode,

                           *(b->base_dst) + b->dst, b->dst_stride);

-  vp9_subtract_b(be, b, 8);

+  assert(xd->plane[1].subsampling_x == 1);

+  vp9_subtract_block(4, 4, src_diff, 8,

+                     *(be->base_src) + be->src, be->src_stride,

+                     *(b->base_dst) + b->dst, b->dst_stride);

-  x->fwd_txm4x4(be->src_diff, be->coeff, 16);

+  x->fwd_txm4x4(src_diff, be->coeff, 16);

   x->quantize_b_4x4(x, ib, 16);

   vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[plane].eobs[block],

                               dqcoeff, b->diff, 16);

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -20,102 +20,54 @@

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9_rtcd.h"

-void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {

-  uint8_t *src_ptr = (*(be->base_src) + be->src);

-  int16_t *diff_ptr = be->src_diff;

-  uint8_t *pred_ptr = *(bd->base_dst) + bd->dst;

-  int src_stride = be->src_stride;

-  int dst_stride = bd->dst_stride;

+void vp9_subtract_block(int rows, int cols,

+                        int16_t *diff_ptr, int diff_stride,

+                        const uint8_t *src_ptr, int src_stride,

+                        const uint8_t *pred_ptr, int pred_stride) {

   int r, c;

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++)

+  for (r = 0; r < rows; r++) {

+    for (c = 0; c < cols; c++)

       diff_ptr[c] = src_ptr[c] - pred_ptr[c];

-    diff_ptr += pitch;

-    pred_ptr += dst_stride;

+    diff_ptr += diff_stride;

+    pred_ptr += pred_stride;

     src_ptr  += src_stride;

-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {

-  uint8_t *src_ptr = (*(be->base_src) + be->src);

-  int16_t *diff_ptr = be->src_diff;

-  uint8_t *pred_ptr = *(bd->base_dst) + bd->dst;

-  int src_stride = be->src_stride;

-  int dst_stride = bd->dst_stride;

-  int r, c;

-  for (r = 0; r < 8; r++) {

-    for (c = 0; c < 8; c++)

-      diff_ptr[c] = src_ptr[c] - pred_ptr[c];

+static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {

+  const MACROBLOCKD * const xd = &x->e_mbd;

+  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);

+  const int bh = 4 << (b_height_log2(bsize) - xd->plane[plane].subsampling_y);

+  const uint8_t *src = plane == 0 ? x->src.y_buffer :

+                       plane == 1 ? x->src.u_buffer : x->src.v_buffer;

+  const int src_stride = plane == 0 ? x->src.y_stride : x->src.uv_stride;

-    diff_ptr += pitch;

-    pred_ptr += dst_stride;

-    src_ptr  += src_stride;

-  }

+  assert(plane < 3);

+  vp9_subtract_block(bh, bw,

+                     x->plane[plane].src_diff, bw, src, src_stride,

+                     xd->plane[plane].dst.buf, xd->plane[plane].dst.stride);

-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,

-                          const uint8_t *pred, int dst_stride,

-                          BLOCK_SIZE_TYPE bsize) {

-  const int bh = 16 << mb_height_log2(bsize), bw = 16 << mb_width_log2(bsize);

-  int r, c;

-  for (r = 0; r < bh; r++) {

-    for (c = 0; c < bw; c++)

-      diff[c] = src[c] - pred[c];

-    diff += bw;

-    pred += dst_stride;

-    src  += src_stride;

-  }

+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {

+  subtract_plane(x, bsize, 0);

-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,

-                           const uint8_t *vsrc, int src_stride,

-                           const uint8_t *upred,

-                           const uint8_t *vpred, int dst_stride,

-                           BLOCK_SIZE_TYPE bsize) {

-  const int bhl = mb_height_log2(bsize), bwl = mb_width_log2(bsize);

-  const int uoff = (16 * 16) << (bhl + bwl), voff = (uoff * 5) >> 2;

-  const int bw = 8 << bwl, bh = 8 << bhl;

-  int16_t *udiff = diff + uoff;

-  int16_t *vdiff = diff + voff;

-  int r, c;

+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {

+  int i;

-  for (r = 0; r < bh; r++) {

-    for (c = 0; c < bw; c++)

-      udiff[c] = usrc[c] - upred[c];

-    udiff += bw;

-    upred += dst_stride;

-    usrc  += src_stride;

-  }

-  for (r = 0; r < bh; r++) {

-    for (c = 0; c < bw; c++)

-      vdiff[c] = vsrc[c] - vpred[c];

-    vdiff += bw;

-    vpred += dst_stride;

-    vsrc  += src_stride;

-  }

+  for (i = 1; i < MAX_MB_PLANE; i++)

+    subtract_plane(x, bsize, i);

-static void subtract_mb(MACROBLOCK *x) {

-  MACROBLOCKD *xd = &x->e_mbd;

-  vp9_subtract_sby_s_c(x->src_diff, x->src.y_buffer, x->src.y_stride,

-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride,

-                       BLOCK_SIZE_MB16X16);

-  vp9_subtract_sbuv_s_c(x->src_diff, x->src.u_buffer, x->src.v_buffer,

-                        x->src.uv_stride,

-                        xd->plane[1].dst.buf, xd->plane[2].dst.buf,

-                        xd->plane[1].dst.stride,

-                        BLOCK_SIZE_MB16X16);

+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {

+  vp9_subtract_sby(x, bsize);

+  vp9_subtract_sbuv(x, bsize);

 void vp9_transform_sby_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {

   const int bwl = mb_width_log2(bsize) - 1, bw = 1 << bwl;

   const int bh = 1 << (mb_height_log2(bsize) - 1);

@@ -125,7 +77,7 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> bwl;

-    vp9_short_fdct32x32(x->src_diff + y_idx * stride * 32 + x_idx * 32,

+    vp9_short_fdct32x32(x->plane[0].src_diff + y_idx * stride * 32 + x_idx * 32,

                         x->coeff + n * 1024, stride * 2);

@@ -143,10 +95,11 @@

                                               (y_idx * bstride + x_idx) * 4);

     if (tx_type != DCT_DCT) {

-      vp9_short_fht16x16(x->src_diff + y_idx * stride * 16 + x_idx * 16,

+      vp9_short_fht16x16(x->plane[0].src_diff +

+                             y_idx * stride * 16 + x_idx * 16,

                          x->coeff + n * 256, stride, tx_type);

     } else {

-      x->fwd_txm16x16(x->src_diff + y_idx * stride * 16 + x_idx * 16,

+      x->fwd_txm16x16(x->plane[0].src_diff + y_idx * stride * 16 + x_idx * 16,

                       x->coeff + n * 256, stride * 2);

@@ -164,10 +117,10 @@

     const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * bstride + x_idx) * 2);

     if (tx_type != DCT_DCT) {

-      vp9_short_fht8x8(x->src_diff + y_idx * stride * 8 + x_idx * 8,

+      vp9_short_fht8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,

                        x->coeff + n * 64, stride, tx_type);

     } else {

-      x->fwd_txm8x8(x->src_diff + y_idx * stride * 8 + x_idx * 8,

+      x->fwd_txm8x8(x->plane[0].src_diff + y_idx * stride * 8 + x_idx * 8,

                     x->coeff + n * 64, stride * 2);

@@ -185,10 +138,10 @@

     const TX_TYPE tx_type = get_tx_type_4x4(xd, n);

     if (tx_type != DCT_DCT) {

-      vp9_short_fht4x4(x->src_diff + y_idx * stride * 4 + x_idx * 4,

+      vp9_short_fht4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,

                        x->coeff + n * 16, stride, tx_type);

     } else {

-      x->fwd_txm4x4(x->src_diff + y_idx * stride * 4 + x_idx * 4,

+      x->fwd_txm4x4(x->plane[0].src_diff + y_idx * stride * 4 + x_idx * 4,

                     x->coeff + n * 16, stride * 2);

@@ -197,9 +150,9 @@

 void vp9_transform_sbuv_32x32(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) {

   assert(bsize == BLOCK_SIZE_SB64X64);

   vp9_clear_system_state();

-  vp9_short_fdct32x32(x->src_diff + 4096,

+  vp9_short_fdct32x32(x->plane[1].src_diff,

                       x->coeff + 4096, 64);

-  vp9_short_fdct32x32(x->src_diff + 4096 + 1024,

+  vp9_short_fdct32x32(x->plane[2].src_diff,

                       x->coeff + 4096 + 1024, 64);

@@ -214,9 +167,9 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);

-    x->fwd_txm16x16(x->src_diff + uoff + y_idx * stride * 16 + x_idx * 16,

+    x->fwd_txm16x16(x->plane[1].src_diff + y_idx * stride * 16 + x_idx * 16,

                     x->coeff + uoff + n * 256, stride * 2);

-    x->fwd_txm16x16(x->src_diff + voff + y_idx * stride * 16 + x_idx * 16,

+    x->fwd_txm16x16(x->plane[2].src_diff + y_idx * stride * 16 + x_idx * 16,

                     x->coeff + voff + n * 256, stride * 2);

@@ -232,9 +185,9 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);

-    x->fwd_txm8x8(x->src_diff + uoff + y_idx * stride * 8 + x_idx * 8,

+    x->fwd_txm8x8(x->plane[1].src_diff + y_idx * stride * 8 + x_idx * 8,

                   x->coeff + uoff + n * 64, stride * 2);

-    x->fwd_txm8x8(x->src_diff + voff + y_idx * stride * 8 + x_idx * 8,

+    x->fwd_txm8x8(x->plane[2].src_diff + y_idx * stride * 8 + x_idx * 8,

                   x->coeff + voff + n * 64, stride * 2);

@@ -250,9 +203,9 @@

   for (n = 0; n < bw * bh; n++) {

     const int x_idx = n & (bw - 1), y_idx = n >> (bwl - 1);

-    x->fwd_txm4x4(x->src_diff + uoff + y_idx * stride * 4 + x_idx * 4,

+    x->fwd_txm4x4(x->plane[1].src_diff + y_idx * stride * 4 + x_idx * 4,

                   x->coeff + uoff + n * 16, stride * 2);

-    x->fwd_txm4x4(x->src_diff + voff + y_idx * stride * 4 + x_idx * 4,

+    x->fwd_txm4x4(x->plane[2].src_diff + y_idx * stride * 4 + x_idx * 4,

                   x->coeff + voff + n * 16, stride * 2);

@@ -826,7 +779,7 @@

   MACROBLOCKD *const xd = &x->e_mbd;

   vp9_build_inter_predictors_sb(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);

-  subtract_mb(x);

+  vp9_subtract_sb(x, BLOCK_SIZE_MB16X16);

   vp9_fidct_mb(cm, x);

   vp9_recon_sb(xd, BLOCK_SIZE_MB16X16);

@@ -836,9 +789,7 @@

   MACROBLOCKD *xd = &x->e_mbd;

   vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16);

-  vp9_subtract_sby_s_c(x->src_diff, x->src.y_buffer, x->src.y_stride,

-                       xd->plane[0].dst.buf, xd->plane[0].dst.stride,

-                       BLOCK_SIZE_MB16X16);

+  vp9_subtract_sby(x, BLOCK_SIZE_MB16X16);

   vp9_transform_sby_4x4(x, BLOCK_SIZE_MB16X16);

   vp9_quantize_sby_4x4(x, BLOCK_SIZE_MB16X16);

--- a/vp9/encoder/vp9_encodemb.h

+++ b/vp9/encoder/vp9_encodemb.h

@@ -56,15 +56,12 @@

 void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);

-void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);

-void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,

-                          const uint8_t *pred, int dst_stride,

-                          BLOCK_SIZE_TYPE bsize);

-void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,

-                           const uint8_t *vsrc, int src_stride,

-                           const uint8_t *upred,

-                           const uint8_t *vpred, int dst_stride,

-                           BLOCK_SIZE_TYPE bsize);

+void vp9_subtract_block(int rows, int cols,

+                        int16_t *diff_ptr, int diff_stride,

+                        const uint8_t *src_ptr, int src_stride,

+                        const uint8_t *pred_ptr, int pred_stride);

+void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);

+void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);

+// Computes the residual for luma, then chroma (subtract_sby + subtract_sbuv).

+void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);

 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -827,12 +827,9 @@

                             int *skip, BLOCK_SIZE_TYPE bs,

                             int64_t txfm_cache[NB_TXFM_MODES]) {

   VP9_COMMON *const cm = &cpi->common;

-  MACROBLOCKD *const xd = &x->e_mbd;

   int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];

-  uint8_t *src = x->src.y_buffer, *dst = xd->plane[0].dst.buf;

-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->plane[0].dst.stride;

-  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride, bs);

+  vp9_subtract_sby(x, bs);

   if (bs >= BLOCK_SIZE_SB32X32)

     super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],

@@ -859,7 +856,9 @@

   VP9_COMMON *const cm = &cpi->common;

   BLOCK *be = x->block + ib;

   BLOCKD *b = xd->block + ib;

+  int16_t* const src_diff =

+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,

+                                x->plane[0].src_diff);

   ENTROPY_CONTEXT ta = *a, tempa = *a;

   ENTROPY_CONTEXT tl = *l, templ = *l;

   TX_TYPE tx_type = DCT_DCT;

@@ -899,15 +898,17 @@

 #endif

     vp9_intra4x4_predict(xd, b, mode, *(b->base_dst) + b->dst, b->dst_stride);

-    vp9_subtract_b(be, b, 16);

+    vp9_subtract_block(4, 4, src_diff, 16,

+                       *(be->base_src) + be->src, be->src_stride,

+                       *(b->base_dst) + b->dst, b->dst_stride);

     b->bmi.as_mode.first = mode;

     tx_type = get_tx_type_4x4(xd, be - x->block);

     if (tx_type != DCT_DCT) {

-      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+      vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type);

       vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);

     } else {

-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+      x->fwd_txm4x4(src_diff, be->coeff, 32);

       x->quantize_b_4x4(x, be - x->block, 16);

@@ -1089,10 +1090,12 @@

   ENTROPY_CONTEXT_PLANES ta, tl;

   ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;

   ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;

   // perform transformation of dimension 8x8

   // note the input and output index mapping

   int idx = (ib & 0x02) ? (ib + 2) : ib;

+  int16_t* const src_diff =

+      raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,

+                                x->plane[0].src_diff);

   assert(ib < 16);

   for (mode = DC_PRED; mode <= TM_PRED; mode++) {

@@ -1105,14 +1108,16 @@

     vp9_intra8x8_predict(xd, b, mode, *(b->base_dst) + b->dst, b->dst_stride);

-    vp9_subtract_4b_c(be, b, 16);

+    vp9_subtract_block(8, 8, src_diff, 16,

+                       *(be->base_src) + be->src, be->src_stride,

+                       *(b->base_dst) + b->dst, b->dst_stride);

     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {

       TX_TYPE tx_type = get_tx_type_8x8(xd, ib);

       if (tx_type != DCT_DCT)

-        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);

+        vp9_short_fht8x8(src_diff, (x->block + idx)->coeff, 16, tx_type);

       else

-        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);

+        x->fwd_txm8x8(src_diff, (x->block + idx)->coeff, 32);

       x->quantize_b_8x8(x, idx, tx_type, 16);

       // compute quantization mse of 8x8 block

@@ -1144,20 +1149,24 @@

       distortion = 0;

       rate_t = 0;

       for (i = 0; i < 4; ++i) {

+        int16_t* const src_diff =

+            raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,

+                                      0, ib + iblock[i],

+                                      x->plane[0].src_diff);

         int do_two = 0;

         b = &xd->block[ib + iblock[i]];

         be = &x->block[ib + iblock[i]];

         tx_type = get_tx_type_4x4(xd, ib + iblock[i]);

         if (tx_type != DCT_DCT) {

-          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+          vp9_short_fht4x4(src_diff, be->coeff, 16, tx_type);

           vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);

         } else if (!(i & 1) &&

                    get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {

-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+          x->fwd_txm8x4(src_diff, be->coeff, 32);

           x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1, 16);

           do_two = 1;

         } else {

-          x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+          x->fwd_txm4x4(src_diff, be->coeff, 32);

           x->quantize_b_4x4(x, ib + iblock[i], 16);

         distortion += vp9_block_error_c(be->coeff,

@@ -1513,12 +1522,8 @@

                              BLOCK_SIZE_TYPE bsize) {

   MACROBLOCKD *const xd = &x->e_mbd;

   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

-  uint8_t *usrc = x->src.u_buffer, *udst = xd->plane[1].dst.buf;

-  uint8_t *vsrc = x->src.v_buffer, *vdst = xd->plane[2].dst.buf;

-  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->plane[1].dst.stride;

-  vp9_subtract_sbuv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,

-                        udst, vdst, dst_uv_stride, bsize);

+  vp9_subtract_sbuv(x, bsize);

   if (mbmi->txfm_size >= TX_32X32 && bsize >= BLOCK_SIZE_SB64X64) {

     super_block_uvrd_32x32(cm, x, rate, distortion, skippable, bsize);

@@ -1720,6 +1725,9 @@

     if (labels[i] == which_label) {

       BLOCKD *bd = &x->e_mbd.block[i];

       BLOCK *be = &x->block[i];

+      int16_t* const src_diff =

+          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, i,

+                                    x->plane[0].src_diff);

       int thisdistortion;

       vp9_build_inter_predictor(*(bd->base_pre) + bd->pre,

@@ -1741,8 +1749,10 @@

             &xd->subpix);

-      vp9_subtract_b(be, bd, 16);

-      x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+      vp9_subtract_block(4, 4, src_diff, 16,

+                         *(be->base_src) + be->src, be->src_stride,

+                         *(bd->base_dst) + bd->dst, bd->dst_stride);

+      x->fwd_txm4x4(src_diff, be->coeff, 32);

       x->quantize_b_4x4(x, i, 16);

       thisdistortion = vp9_block_error(be->coeff,

           BLOCK_OFFSET(xd->plane[0].dqcoeff, i, 16), 16);

@@ -1790,6 +1800,9 @@

       const int idx = (ib & 8) + ((ib & 2) << 1);

       BLOCKD *bd = &xd->block[ib];

       BLOCK *be = &x->block[ib], *be2 = &x->block[idx];

+      int16_t* const src_diff =

+          raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16, 0, ib,

+                                    x->plane[0].src_diff);

       int thisdistortion;

       assert(idx < 16);

@@ -1806,11 +1819,13 @@

             which_mv, &xd->subpix);

-      vp9_subtract_4b_c(be, bd, 16);

+      vp9_subtract_block(8, 8, src_diff, 16,

+                         *(be->base_src) + be->src, be->src_stride,

+                         *(bd->base_dst) + bd->dst, bd->dst_stride);

       if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {

         if (otherrd) {

-          x->fwd_txm8x8(be->src_diff, be2->coeff, 32);

+          x->fwd_txm8x8(src_diff, be2->coeff, 32);

           x->quantize_b_8x8(x, idx, DCT_DCT, 16);

           thisdistortion = vp9_block_error_c(be2->coeff,

               BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);

@@ -1823,9 +1838,13 @@

           xd->mode_info_context->mbmi.txfm_size = TX_4X4;

         for (j = 0; j < 4; j += 2) {

+          int16_t* const src_diff =

+              raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,

+                                        0, ib + iblock[j],

+                                        x->plane[0].src_diff);

           bd = &xd->block[ib + iblock[j]];

           be = &x->block[ib + iblock[j]];

-          x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+          x->fwd_txm8x4(src_diff, be->coeff, 32);

           x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);

           thisdistortion = vp9_block_error_c(be->coeff,

               BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);

@@ -1846,7 +1865,11 @@

         if (otherrd) {

           for (j = 0; j < 4; j += 2) {

             BLOCK *be = &x->block[ib + iblock[j]];

-            x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+            int16_t* const src_diff =

+                raster_block_offset_int16(xd, BLOCK_SIZE_MB16X16,

+                                          0, ib + iblock[j],

+                                          x->plane[0].src_diff);

+            x->fwd_txm8x4(src_diff, be->coeff, 32);

             x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1, 16);

             thisdistortion = vp9_block_error_c(be->coeff,

                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib + iblock[j], 16), 32);

@@ -1866,7 +1889,7 @@

             xd->mode_info_context->mbmi.txfm_size = TX_8X8;

-        x->fwd_txm8x8(be->src_diff, be2->coeff, 32);

+        x->fwd_txm8x8(src_diff, be2->coeff, 32);

         x->quantize_b_8x8(x, idx, DCT_DCT, 16);

         thisdistortion = vp9_block_error_c(be2->coeff,

             BLOCK_OFFSET(xd->plane[0].dqcoeff, idx, 16), 64);

@@ -3768,12 +3791,7 @@

         vp9_build_inter_predictors_sbuv(&x->e_mbd, mb_row, mb_col,

                                         BLOCK_SIZE_MB16X16);

-        vp9_subtract_sbuv_s_c(x->src_diff,

-                              x->src.u_buffer,

-                              x->src.v_buffer, x->src.uv_stride,

-                              xd->plane[1].dst.buf,

-                              xd->plane[2].dst.buf, xd->plane[1].dst.stride,

-                              BLOCK_SIZE_MB16X16);

+        vp9_subtract_sbuv(x, BLOCK_SIZE_MB16X16);

         super_block_uvrd_4x4(cm, x, &rate_uv, &distortion_uv,

                              &uv_skippable, BLOCK_SIZE_MB16X16);

--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c

+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c

@@ -17,7 +17,7 @@

 // TODO(jimbankoski) Consider rewriting the c to take the same values rather

 // than going through these pointer conversions

-#if HAVE_MMX

+#if 0 && HAVE_MMX

 void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {

   vp9_short_fdct4x4_mmx(input,   output,    pitch);

   vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);

@@ -38,7 +38,7 @@

 #endif

-#if HAVE_SSE2

+#if 0 && HAVE_SSE2

 void vp9_subtract_b_sse2_impl(unsigned char *z,  int src_stride,

                               short *diff, unsigned char *predictor,

                               int pitch);

--

⑨