shithub: libvpx

Download patch

ref: f153a5d06302eec5e2ada78e4576ca29f804c999
parent: 1f7d810a72a290f3c52e518a72e130d42e69f3e1
author: Jingning Han <jingning@google.com>
date: Mon May 20 12:04:28 EDT 2013

Make the intra rd search support 8x4/4x8

This commit makes the rate-distortion optimization of intra coding
capable of supporting the 8x4 and 4x8 partition settings.

It enables the entropy coding of intra modes in key frames using a
unified contextual probability model conditioned on each block's
above and left prediction modes.

Coding performance:
derf 0.464%

Change-Id: Ieed055084e11fcb64d5d5faeb0e706d30268ba18

--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -107,10 +107,10 @@
                                   [PARTITION_TYPES - 1] = {
   // FIXME(jingning,rbultje) put real probabilities here
 #if CONFIG_AB4X4
-  {105,  88,  252},
-  {113,  88,  249},
-  {113, 106,  251},
-  {126, 105,  107},
+  {202, 162, 107},
+  {16,  2,   169},
+  {3,   246,  19},
+  {104, 90,  134},
 #endif
   {202, 162, 107},
   {16,  2,   169},
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -153,7 +153,7 @@
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
       return ((cur_mb->bmi + 1 + b)->as_mode.first);
     } else {
-      return B_DC_PRED;
+      return DC_PRED;
     }
   }
   assert(b == 1 || b == 3);
@@ -171,7 +171,7 @@
     } else if (cur_mb->mbmi.mode == I4X4_PRED) {
       return ((cur_mb->bmi + 2 + b)->as_mode.first);
     } else {
-      return B_DC_PRED;
+      return DC_PRED;
     }
   }
 
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -103,6 +103,7 @@
                          vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MACROBLOCKD *const xd = &pbi->mb;
+  const int mis = cm->mode_info_stride;
   m->mbmi.ref_frame = INTRA_FRAME;
 
   // Read segmentation map if it is being updated explicitly this frame
@@ -119,11 +120,14 @@
 
   // luma mode
 #if CONFIG_AB4X4
-  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
-    m->mbmi.mode = read_kf_sb_ymode(r,
-                     cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]);
-  else
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis);
+    const MB_PREDICTION_MODE L = xd->left_available ?
+                                  left_block_mode(m, 0) : DC_PRED;
+    m->mbmi.mode = read_kf_bmode(r, cm->kf_bmode_prob[A][L]);
+  } else {
      m->mbmi.mode = I4X4_PRED;
+  }
 #else
   m->mbmi.mode = m->mbmi.sb_type > BLOCK_SIZE_SB8X8 ?
       read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]):
@@ -140,15 +144,25 @@
     int idx, idy;
     int bw = 1 << b_width_log2(m->mbmi.sb_type);
     int bh = 1 << b_height_log2(m->mbmi.sb_type);
-    // FIXME(jingning): fix intra4x4 rate-distortion optimization, then
-    // use bw and bh as the increment values.
-#if !CONFIG_AB4X4 || CONFIG_AB4X4
+
+#if !CONFIG_AB4X4
     bw = 1, bh = 1;
 #endif
-    for (idy = 0; idy < 2; idy += bh)
-      for (idx = 0; idx < 2; idx += bw)
-        m->bmi[idy * 2 + idx].as_mode.first =
-            read_kf_sb_ymode(r, cm->sb_kf_ymode_prob[cm->kf_ymode_probs_index]);
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int ib = idy * 2 + idx;
+        int k;
+        const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                      left_block_mode(m, ib) : DC_PRED;
+        m->bmi[ib].as_mode.first =
+            read_kf_bmode(r, cm->kf_bmode_prob[A][L]);
+        for (k = 1; k < bh; ++k)
+          m->bmi[ib + k * 2].as_mode.first = m->bmi[ib].as_mode.first;
+        for (k = 1; k < bw; ++k)
+          m->bmi[ib + k].as_mode.first = m->bmi[ib].as_mode.first;
+      }
+    }
   }
 
   m->mbmi.uv_mode = read_uv_mode(r, cm->kf_uv_mode_prob[m->mbmi.mode]);
@@ -858,16 +872,19 @@
     if (mbmi->mode == I4X4_PRED) {
 #endif
       int idx, idy;
-      // FIXME(jingning): fix intra4x4 rate-distortion optimization, then
-      // use bw and bh as the increment values.
-#if !CONFIG_AB4X4 || CONFIG_AB4X4
+#if !CONFIG_AB4X4
       bw = 1, bh = 1;
 #endif
       for (idy = 0; idy < 2; idy += bh) {
         for (idx = 0; idx < 2; idx += bw) {
+          int ib = idy * 2 + idx, k;
           int m = read_sb_ymode(r, cm->fc.sb_ymode_prob);
-          mi->bmi[idy * 2 + idx].as_mode.first = m;
+          mi->bmi[ib].as_mode.first = m;
           cm->fc.sb_ymode_counts[m]++;
+          for (k = 1; k < bh; ++k)
+            mi->bmi[ib + k * 2].as_mode.first = m;
+          for (k = 1; k < bw; ++k)
+            mi->bmi[ib + k].as_mode.first = m;
         }
       }
     }
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -740,9 +740,7 @@
       int idx, idy;
       int bw = 1 << b_width_log2(mi->sb_type);
       int bh = 1 << b_height_log2(mi->sb_type);
-      // FIXME(jingning): fix intra4x4 rate-distortion optimization, then
-      // use bw and bh as the increment values.
-#if !CONFIG_AB4X4 || CONFIG_AB4X4
+#if !CONFIG_AB4X4
       bw = 1, bh = 1;
 #endif
       for (idy = 0; idy < 2; idy += bh)
@@ -892,6 +890,7 @@
   const VP9_COMMON *const c = &cpi->common;
   const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   const int ym = m->mbmi.mode;
+  const int mis = c->mode_info_stride;
   const int segment_id = m->mbmi.segment_id;
   int skip_coeff;
 
@@ -906,8 +905,12 @@
   }
 
 #if CONFIG_AB4X4
-  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8)
-    sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+  if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
+    const B_PREDICTION_MODE A = above_block_mode(m, 0, mis);
+    const B_PREDICTION_MODE L = xd->left_available ?
+                                 left_block_mode(m, 0) : DC_PRED;
+    write_kf_bmode(bc, ym, c->kf_bmode_prob[A][L]);
+  }
 #else
   if (m->mbmi.sb_type > BLOCK_SIZE_SB8X8)
     sb_kfwrite_ymode(bc, ym, c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
@@ -923,15 +926,19 @@
     int idx, idy;
     int bw = 1 << b_width_log2(m->mbmi.sb_type);
     int bh = 1 << b_height_log2(m->mbmi.sb_type);
-    // FIXME(jingning): fix intra4x4 rate-distortion optimization, then
-    // use bw and bh as the increment values.
-#if !CONFIG_AB4X4 || CONFIG_AB4X4
+#if !CONFIG_AB4X4
     bw = 1, bh = 1;
 #endif
-    for (idy = 0; idy < 2; idy += bh)
-      for (idx = 0; idx < 2; idx += bw)
-        sb_kfwrite_ymode(bc, m->bmi[idy * 2 + idx].as_mode.first,
-                         c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+    for (idy = 0; idy < 2; idy += bh) {
+      for (idx = 0; idx < 2; idx += bw) {
+        int i = idy * 2 + idx;
+        const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
+        const B_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                     left_block_mode(m, i) : DC_PRED;
+        write_kf_bmode(bc, m->bmi[i].as_mode.first,
+                       c->kf_bmode_prob[A][L]);
+      }
+    }
   }
 
   write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1617,9 +1617,7 @@
     int idx, idy;
     int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type);
     int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type);
-    // FIXME(jingning): fix intra4x4 rate-distortion optimization, then
-    // use bw and bh as the increment values.
-#if !CONFIG_AB4X4 || CONFIG_AB4X4
+#if !CONFIG_AB4X4
     bw = 1, bh = 1;
 #endif
     for (idy = 0; idy < 2; idy += bh) {
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -33,10 +33,18 @@
                   x->fc.sub_mv_ref_prob[0], vp9_sub_mv_ref_tree);
 
   // TODO(rbultje) separate tables for superblock costing?
+#if CONFIG_AB4X4
+  vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.sb_ymode_prob,
+                  vp9_sb_ymode_tree);
+  vp9_cost_tokens(c->mb.mbmode_cost[0],
+                  x->sb_kf_ymode_prob[c->common.kf_ymode_probs_index],
+                  vp9_sb_ymode_tree);
+#else
   vp9_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp9_ymode_tree);
   vp9_cost_tokens(c->mb.mbmode_cost[0],
                   x->kf_ymode_prob[c->common.kf_ymode_probs_index],
                   vp9_kf_ymode_tree);
+#endif
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
                   x->fc.uv_mode_prob[VP9_YMODES - 1], vp9_uv_mode_tree);
   vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -580,8 +580,13 @@
                                      int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
-                                     int *bestdistortion) {
+                                     int *bestdistortion,
+                                     BLOCK_SIZE_TYPE bsize) {
+#if CONFIG_AB4X4
+  MB_PREDICTION_MODE mode;
+#else
   B_PREDICTION_MODE mode;
+#endif
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
   int rate = 0;
@@ -588,70 +593,91 @@
   int distortion;
   VP9_COMMON *const cm = &cpi->common;
   const int src_stride = x->plane[0].src.stride;
-  uint8_t* const src =
-      raster_block_offset_uint8(xd,
-                                BLOCK_SIZE_SB8X8,
-                                0, ib,
-                                x->plane[0].src.buf, src_stride);
-  int16_t* const src_diff =
-      raster_block_offset_int16(xd,
-                                BLOCK_SIZE_SB8X8,
-                                0, ib,
-                                x->plane[0].src_diff);
-  int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
-  uint8_t* const dst =
-      raster_block_offset_uint8(xd,
-                                BLOCK_SIZE_SB8X8,
-                                0, ib,
-                                xd->plane[0].dst.buf, xd->plane[0].dst.stride);
-  ENTROPY_CONTEXT ta = *a, tempa = *a;
-  ENTROPY_CONTEXT tl = *l, templ = *l;
+  uint8_t *src, *dst;
+  int16_t *src_diff, *coeff;
+
+  ENTROPY_CONTEXT ta[2], tempa[2];
+  ENTROPY_CONTEXT tl[2], templ[2];
   TX_TYPE tx_type = DCT_DCT;
   TX_TYPE best_tx_type = DCT_DCT;
-  /*
-   * The predictor buffer is a 2d buffer with a stride of 16.  Create
-   * a temp buffer that meets the stride requirements, but we are only
-   * interested in the left 4x4 block
-   * */
-  DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
+  int idx, idy, block;
+  DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]);
 
   assert(ib < 4);
+#if !CONFIG_AB4X4
+  bw = 1, bh = 1;
+#endif
 
+  vpx_memcpy(ta, a, sizeof(ta));
+  vpx_memcpy(tl, l, sizeof(tl));
   xd->mode_info_context->mbmi.txfm_size = TX_4X4;
+
+#if CONFIG_AB4X4
+  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+#else
   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
+#endif
     int64_t this_rd;
-    int ratey;
+    int ratey = 0;
 
     xd->mode_info_context->bmi[ib].as_mode.first = mode;
-    rate = bmode_costs[mode];
+    if (cm->frame_type == KEY_FRAME)
+      rate = bmode_costs[mode];
+    else
+      rate = x->mbmode_cost[cm->frame_type][mode];
+    distortion = 0;
 
-    vp9_intra4x4_predict(xd, ib,
-                         BLOCK_SIZE_SB8X8,
-                         mode, dst, xd->plane[0].dst.stride);
-    vp9_subtract_block(4, 4, src_diff, 8,
-                       src, src_stride,
-                       dst, xd->plane[0].dst.stride);
+    vpx_memcpy(tempa, ta, sizeof(ta));
+    vpx_memcpy(templ, tl, sizeof(tl));
 
-    xd->mode_info_context->bmi[ib].as_mode.first = mode;
-    tx_type = get_tx_type_4x4(xd, ib);
-    if (tx_type != DCT_DCT) {
-      vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
-      x->quantize_b_4x4(x, ib, tx_type, 16);
-    } else {
-      x->fwd_txm4x4(src_diff, coeff, 16);
-      x->quantize_b_4x4(x, ib, tx_type, 16);
-    }
+    for (idy = 0; idy < bh; ++idy) {
+      for (idx = 0; idx < bw; ++idx) {
+        block = ib + idy * 2 + idx;
+        xd->mode_info_context->bmi[block].as_mode.first = mode;
+        src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                        x->plane[0].src.buf, src_stride);
+        src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                             x->plane[0].src_diff);
+        coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16);
+        dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                        xd->plane[0].dst.buf,
+                                        xd->plane[0].dst.stride);
+        vp9_intra4x4_predict(xd, block,
+                             BLOCK_SIZE_SB8X8,
+                             mode, dst, xd->plane[0].dst.stride);
+        vp9_subtract_block(4, 4, src_diff, 8,
+                           src, src_stride,
+                           dst, xd->plane[0].dst.stride);
 
-    tempa = ta;
-    templ = tl;
+        tx_type = get_tx_type_4x4(xd, block);
+        if (tx_type != DCT_DCT) {
+          vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
+          x->quantize_b_4x4(x, block, tx_type, 16);
+        } else {
+          x->fwd_txm4x4(src_diff, coeff, 16);
+          x->quantize_b_4x4(x, block, tx_type, 16);
+        }
 
-    ratey = cost_coeffs(cm, x, 0, ib,
-                        PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4, 16);
-    rate += ratey;
-    distortion = vp9_block_error(coeff,
-                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                 16) >> 2;
+        ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC,
+                             tempa + idx, templ + idy, TX_4X4, 16);
+        distortion += vp9_block_error(coeff, BLOCK_OFFSET(xd->plane[0].dqcoeff,
+                                                         block, 16), 16) >> 2;
 
+        vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode,
+                             dst, xd->plane[0].dst.stride);
+
+        if (best_tx_type != DCT_DCT)
+          vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                               dst, xd->plane[0].dst.stride, best_tx_type);
+        else
+          xd->inv_txm4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                             dst, xd->plane[0].dst.stride);
+      }
+    }
+
+    rate += ratey;
     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
     if (this_rd < best_rd) {
@@ -661,25 +687,37 @@
       best_rd = this_rd;
       *best_mode = mode;
       best_tx_type = tx_type;
-      *a = tempa;
-      *l = templ;
-      vpx_memcpy(best_dqcoeff, BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), 32);
+      vpx_memcpy(a, tempa, sizeof(tempa));
+      vpx_memcpy(l, templ, sizeof(templ));
+      for (idy = 0; idy < bh; ++idy) {
+        for (idx = 0; idx < bw; ++idx) {
+          block = ib + idy * 2 + idx;
+          vpx_memcpy(best_dqcoeff[idy * 2 + idx],
+                     BLOCK_OFFSET(xd->plane[0].dqcoeff, block, 16),
+                     sizeof(best_dqcoeff[0]));
+        }
+      }
     }
   }
-  xd->mode_info_context->bmi[ib].as_mode.first =
-    (B_PREDICTION_MODE)(*best_mode);
 
-  vp9_intra4x4_predict(xd, ib,
-                       BLOCK_SIZE_SB8X8,
-                       *best_mode,
-                       dst, xd->plane[0].dst.stride);
+  for (idy = 0; idy < bh; ++idy) {
+    for (idx = 0; idx < bw; ++idx) {
+      block = ib + idy * 2 + idx;
+      xd->mode_info_context->bmi[block].as_mode.first = *best_mode;
+      dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block,
+                                      xd->plane[0].dst.buf,
+                                      xd->plane[0].dst.stride);
 
-  // inverse transform
-  if (best_tx_type != DCT_DCT) {
-    vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride,
-                           best_tx_type);
-  } else {
-    xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride);
+      vp9_intra4x4_predict(xd, block, BLOCK_SIZE_SB8X8, *best_mode,
+                           dst, xd->plane[0].dst.stride);
+      // inverse transform
+      if (best_tx_type != DCT_DCT)
+        vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst,
+                             xd->plane[0].dst.stride, best_tx_type);
+      else
+        xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst,
+                           xd->plane[0].dst.stride);
+    }
   }
 
   return best_rd;
@@ -688,8 +726,12 @@
 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
                                          int *Rate, int *rate_y,
                                          int *Distortion, int64_t best_rd) {
-  int i;
+  int i, j;
   MACROBLOCKD *const xd = &mb->e_mbd;
+  BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+  int bw = 1 << b_width_log2(bsize);
+  int bh = 1 << b_height_log2(bsize);
+  int idx, idy;
 #if CONFIG_AB4X4
   int cost = 0;
 #else
@@ -698,7 +740,7 @@
   int distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[2], t_left[2];
+  ENTROPY_CONTEXT t_above[4], t_left[4];
   int *bmode_costs;
 
   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
@@ -707,31 +749,43 @@
   xd->mode_info_context->mbmi.mode = I4X4_PRED;
   bmode_costs = mb->inter_bmode_costs;
 
-  for (i = 0; i < 4; i++) {
-    const int x_idx = i & 1, y_idx = i >> 1;
-    MODE_INFO *const mic = xd->mode_info_context;
-    const int mis = xd->mode_info_stride;
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-    int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+#if !CONFIG_AB4X4
+  bw = 1, bh = 1;
+#endif
 
-    if (xd->frame_type == KEY_FRAME) {
-      const B_PREDICTION_MODE A = above_block_mode(mic, i, mis);
-      const B_PREDICTION_MODE L = left_block_mode(mic, i);
+  for (idy = 0; idy < 2; idy += bh) {
+    for (idx = 0; idx < 2; idx += bw) {
+      MODE_INFO *const mic = xd->mode_info_context;
+      const int mis = xd->mode_info_stride;
+      B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+      int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
+      int UNINITIALIZED_IS_SAFE(d);
+      i = idy * 2 + idx;
 
-      bmode_costs  = mb->bmode_costs[A][L];
-    }
+      if (xd->frame_type == KEY_FRAME) {
+        const MB_PREDICTION_MODE A = above_block_mode(mic, i, mis);
+        const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
+                                     left_block_mode(mic, i) : DC_PRED;
 
-    total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
-                                      t_above + x_idx, t_left + y_idx,
-                                      &r, &ry, &d);
-    cost += r;
-    distortion += d;
-    tot_rate_y += ry;
+        bmode_costs  = mb->bmode_costs[A][L];
+      }
 
-    mic->bmi[i].as_mode.first = best_mode;
+      total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
+                                        t_above + idx, t_left + idy,
+                                        &r, &ry, &d, bsize);
+      cost += r;
+      distortion += d;
+      tot_rate_y += ry;
 
-    if (total_rd >= best_rd)
-      break;
+      mic->bmi[i].as_mode.first = best_mode;
+      for (j = 1; j < bh; ++j)
+        mic->bmi[i + j * 2].as_mode.first = best_mode;
+      for (j = 1; j < bw; ++j)
+        mic->bmi[i + j].as_mode.first = best_mode;
+
+      if (total_rd >= best_rd)
+        break;
+    }
   }
 
   if (total_rd >= best_rd)
@@ -751,6 +805,7 @@
                                       int64_t txfm_cache[NB_TXFM_MODES]) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  MACROBLOCKD *xd = &x->e_mbd;
   int this_rate, this_rate_tokenonly;
   int this_distortion, s;
   int64_t best_rd = INT64_MAX, this_rd;
@@ -770,13 +825,20 @@
   /* Y Search for 32x32 intra prediction mode */
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
     int64_t local_txfm_cache[NB_TXFM_MODES];
+    MODE_INFO *const mic = xd->mode_info_context;
+    const int mis = xd->mode_info_stride;
+    const MB_PREDICTION_MODE A = above_block_mode(mic, 0, mis);
+    const MB_PREDICTION_MODE L = xd->left_available ?
+                                 left_block_mode(mic, 0) : DC_PRED;
 
+    int *bmode_costs  = x->bmode_costs[A][L];
+
     x->e_mbd.mode_info_context->mbmi.mode = mode;
     vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
                     bsize, local_txfm_cache);
-    this_rate = this_rate_tokenonly + x->mbmode_cost[x->e_mbd.frame_type][mode];
+    this_rate = this_rate_tokenonly + bmode_costs[mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {