shithub: libvpx

ref: 5d4cffb35f4bc23462eedc95a4802c65e32d7d5a
parent: 319dd1c0f58d3db46713460ca10e920f2b18b605
author: Ronald S. Bultje <rbultje@google.com>
date: Mon Aug 20 10:43:34 EDT 2012

Superblock coding.

This commit adds a pick_sb_mode() function which selects the best 32x32
superblock coding mode. It then selects the best per-MB modes, compares
the rate-distortion cost of the two, and encodes the winner in the
bitstream.
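
In outline, that decision is a rate-distortion comparison between one
32x32 prediction and four independent 16x16 ones. A minimal sketch of
the idea (rd_pick_sb() and rd_pick_mb() are hypothetical stand-ins, not
the actual pick_sb_mode() / vp8cx_pick_mode_inter_macroblock()
signatures):

    /* Sketch only; helper names are stand-ins. */
    int sb_rate, sb_dist, mb_rate = 0, mb_dist = 0, n;
    rd_pick_sb(cpi, x, &sb_rate, &sb_dist);      /* one 32x32 mode   */
    for (n = 0; n < 4; n++) {                    /* four 16x16 modes */
      int r, d;
      rd_pick_mb(cpi, x, n, &r, &d);
      mb_rate += r;
      mb_dist += d;
    }
    x->e_mbd.mode_info_context->mbmi.encoded_as_sb =
        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
        RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist);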

The bitstream coding is rather simplistic right now. At the SB level,
we code one bit to indicate whether the block uses SB coding (32x32
prediction) or MB coding (anything else), and then follow with the
actual modes. This could and should be improved in the future, but that
work is omitted from this commit because it will likely involve
reorganizing much more code than just adding SB coding, so it is better
judged on its own merits.
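
On the write side this is literally one vp8_write() per superblock
ahead of the mode tokens; condensed from the bitstream.c hunks below
(a sketch, not the literal code):

    vp8_write(w, m->mbmi.encoded_as_sb, pc->sb_coded); /* 1 bool per SB */
    if (m->mbmi.encoded_as_sb)
      write_sb_mv_ref(w, mode, mv_ref_p);  /* SB tree: no SPLITMV leaf */
    else
      write_mv_ref(w, mode, mv_ref_p);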

Gains on derf: about even, YT/HD: +0.75%, STD/HD: +1.5%.

Change-Id: Iae313a7cbd8f75b3c66d04a68b991cb096eaaba6

--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -148,6 +148,7 @@
 #define VP8_YMODES  (B_PRED + 1)
 #define VP8_UV_MODES (TM_PRED + 1)
 #define VP8_I8X8_MODES (TM_PRED + 1)
+#define VP8_I32X32_MODES (TM_PRED + 1)
 
 #define VP8_MVREFS (1 + SPLITMV - NEARESTMV)
 
@@ -293,6 +294,11 @@
     INTERPOLATIONFILTERTYPE interp_filter;
 #endif
 
+#if CONFIG_SUPERBLOCKS
+  // FIXME need a SB array of 4 MB_MODE_INFOs that
+  // only needs one encoded_as_sb.
+  unsigned char encoded_as_sb;
+#endif
 } MB_MODE_INFO;
 
 typedef struct {
--- a/vp8/common/entropymode.c
+++ b/vp8/common/entropymode.c
@@ -227,6 +227,14 @@
   -NEWMV, -SPLITMV
 };
 
+#if CONFIG_SUPERBLOCKS
+const vp8_tree_index vp8_sb_mv_ref_tree[6] = {
+  -ZEROMV, 2,
+  -NEARESTMV, 4,
+  -NEARMV, -NEWMV
+};
+#endif
+
 const vp8_tree_index vp8_sub_mv_ref_tree[6] = {
   -LEFT4X4, 2,
   -ABOVE4X4, 4,
@@ -236,12 +244,18 @@
 
 struct vp8_token_struct vp8_bmode_encodings   [VP8_BINTRAMODES];
 struct vp8_token_struct vp8_ymode_encodings   [VP8_YMODES];
+#if CONFIG_SUPERBLOCKS
+struct vp8_token_struct vp8_sb_kf_ymode_encodings [VP8_I32X32_MODES];
+#endif
 struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES];
 struct vp8_token_struct vp8_uv_mode_encodings  [VP8_UV_MODES];
-struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_UV_MODES];
+struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_I8X8_MODES];
 struct vp8_token_struct vp8_mbsplit_encodings [VP8_NUMMBSPLITS];
 
 struct vp8_token_struct vp8_mv_ref_encoding_array    [VP8_MVREFS];
+#if CONFIG_SUPERBLOCKS
+struct vp8_token_struct vp8_sb_mv_ref_encoding_array  [VP8_MVREFS];
+#endif
 struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];
 
 
@@ -253,11 +267,18 @@
     vp8_ymode_tree, x->fc.ymode_prob, bct, y_mode_cts, 256, 1);
   {
     int i;
-    for (i = 0; i < 8; i++)
+    for (i = 0; i < 8; i++) {
       vp8_tree_probs_from_distribution(
         VP8_YMODES, vp8_kf_ymode_encodings, vp8_kf_ymode_tree,
         x->kf_ymode_prob[i], bct, kf_y_mode_cts[i],
         256, 1);
+#if CONFIG_SUPERBLOCKS
+      vp8_tree_probs_from_distribution(
+        VP8_I32X32_MODES, vp8_sb_kf_ymode_encodings, vp8_sb_ymode_tree,
+        x->sb_kf_ymode_prob[i], bct, kf_y_mode_cts[i],
+        256, 1);
+#endif
+    }
   }
   {
     int i;
@@ -360,6 +381,9 @@
   vp8_tokens_from_tree(vp8_bmode_encodings,   vp8_bmode_tree);
   vp8_tokens_from_tree(vp8_ymode_encodings,   vp8_ymode_tree);
   vp8_tokens_from_tree(vp8_kf_ymode_encodings, vp8_kf_ymode_tree);
+#if CONFIG_SUPERBLOCKS
+  vp8_tokens_from_tree(vp8_sb_kf_ymode_encodings, vp8_sb_ymode_tree);
+#endif
   vp8_tokens_from_tree(vp8_uv_mode_encodings,  vp8_uv_mode_tree);
   vp8_tokens_from_tree(vp8_i8x8_mode_encodings,  vp8_i8x8_mode_tree);
   vp8_tokens_from_tree(vp8_mbsplit_encodings, vp8_mbsplit_tree);
@@ -370,6 +394,10 @@
 
   vp8_tokens_from_tree_offset(vp8_mv_ref_encoding_array,
                               vp8_mv_ref_tree, NEARESTMV);
+#if CONFIG_SUPERBLOCKS
+  vp8_tokens_from_tree_offset(vp8_sb_mv_ref_encoding_array,
+                              vp8_sb_mv_ref_tree, NEARESTMV);
+#endif
   vp8_tokens_from_tree_offset(vp8_sub_mv_ref_encoding_array,
                               vp8_sub_mv_ref_tree, LEFT4X4);
 }
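
A note on the tree tables above: a vp8_tree_index array encodes a
binary tree in which non-negative entries index the next node pair and
negative entries are negated leaf tokens, so vp8_sb_mv_ref_tree is
simply the MV reference tree without the SPLITMV leaf (a 32x32 block is
never split-coded). Token reading mirrors vp8_treed_read(); a
self-contained sketch, assuming a read_bit(prob) boolean-decoder
primitive:

    /* Sketch of vp8-style tree reading; read_bit() is a stand-in for
     * the boolean decoder, probs[] holds one prob per internal node. */
    typedef signed char tree_index;
    static int treed_read(const tree_index *tree,
                          const unsigned char *probs) {
      int i = 0;
      do {
        i = tree[i + read_bit(probs[i >> 1])];  /* branch on one bit */
      } while (i > 0);
      return -i;  /* e.g. ZEROMV after a single 0 bit in the SB tree */
    }
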
--- a/vp8/common/entropymode.h
+++ b/vp8/common/entropymode.h
@@ -40,15 +40,18 @@
 extern const vp8_tree_index  vp8_ymode_tree[];
 extern const vp8_tree_index  vp8_kf_ymode_tree[];
 extern const vp8_tree_index  vp8_uv_mode_tree[];
+#define vp8_sb_ymode_tree vp8_uv_mode_tree
 extern const vp8_tree_index  vp8_i8x8_mode_tree[];
 extern const vp8_tree_index  vp8_mbsplit_tree[];
 extern const vp8_tree_index  vp8_mv_ref_tree[];
+extern const vp8_tree_index  vp8_sb_mv_ref_tree[];
 extern const vp8_tree_index  vp8_sub_mv_ref_tree[];
 
 extern struct vp8_token_struct vp8_bmode_encodings   [VP8_BINTRAMODES];
 extern struct vp8_token_struct vp8_ymode_encodings   [VP8_YMODES];
+extern struct vp8_token_struct vp8_sb_kf_ymode_encodings [VP8_I32X32_MODES];
 extern struct vp8_token_struct vp8_kf_ymode_encodings [VP8_YMODES];
-extern struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_UV_MODES];
+extern struct vp8_token_struct vp8_i8x8_mode_encodings  [VP8_I8X8_MODES];
 extern struct vp8_token_struct vp8_uv_mode_encodings  [VP8_UV_MODES];
 extern struct vp8_token_struct vp8_mbsplit_encodings  [VP8_NUMMBSPLITS];
 
@@ -55,6 +58,7 @@
 /* Inter mode values do not start at zero */
 
 extern struct vp8_token_struct vp8_mv_ref_encoding_array    [VP8_MVREFS];
+extern struct vp8_token_struct vp8_sb_mv_ref_encoding_array    [VP8_MVREFS];
 extern struct vp8_token_struct vp8_sub_mv_ref_encoding_array [VP8_SUBMVREFS];
 
 void vp8_entropy_mode_init(void);
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -47,6 +47,12 @@
   rtcd->recon.recon4      = vp8_recon4b_c;
   rtcd->recon.recon_mb    = vp8_recon_mb_c;
   rtcd->recon.recon_mby   = vp8_recon_mby_c;
+#if CONFIG_SUPERBLOCKS
+  rtcd->recon.build_intra_predictors_sby_s =
+    vp8_build_intra_predictors_sby_s;
+  rtcd->recon.build_intra_predictors_sbuv_s =
+    vp8_build_intra_predictors_sbuv_s;
+#endif
   rtcd->recon.build_intra_predictors_mby =
     vp8_build_intra_predictors_mby;
 #if CONFIG_COMP_INTRA_PRED
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -325,7 +325,13 @@
           lfi.lim = lfi_n->lim[filter_level];
           lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-          if (mb_col > 0)
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
             vp8_loop_filter_mbv_c
             (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
 
@@ -344,7 +350,13 @@
           }
 
           /* don't apply across umv border */
-          if (mb_row > 0)
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
             vp8_loop_filter_mbh_c
             (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
 
@@ -362,7 +374,13 @@
           }
         } else {
           // FIXME: Not 8x8 aware
-          if (mb_col > 0)
+          if (mb_col > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-1].mbmi.mb_skip_coeff)
+#endif
+              )
             LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
             (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
 
@@ -371,7 +389,13 @@
             (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
 
           /* don't apply across umv border */
-          if (mb_row > 0)
+          if (mb_row > 0
+#if CONFIG_SUPERBLOCKS
+              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
+                   mode_info_context[0].mbmi.mb_skip_coeff &&
+                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
+#endif
+              )
             LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
             (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
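
The condition added four times above is the same predicate each time:
an edge at an odd MB row or column lies inside a 32x32 superblock, and
if both macroblocks on either side of that internal edge skipped all
coefficients there is nothing to filter. Factored out for readability
(a hypothetical helper; the patch inlines it):

    /* nb_off is -1 for vertical edges, -mode_info_stride for
     * horizontal ones. Sketch only. */
    static int skip_internal_sb_edge(const MODE_INFO *mi, int pos,
                                     int nb_off) {
      return (pos & 1) && mi->mbmi.encoded_as_sb &&
             mi[0].mbmi.mb_skip_coeff && mi[nb_off].mbmi.mb_skip_coeff;
    }
    /* vertical use:
     * if (mb_col > 0 && !skip_internal_sb_edge(mi, mb_col, -1)) ... */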
 
--- a/vp8/common/onyxc_int.h
+++ b/vp8/common/onyxc_int.h
@@ -226,12 +226,15 @@
 
   /* Y,U,V,Y2 */
   ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context;  /* (up to) 4 contexts "" */
+  ENTROPY_CONTEXT_PLANES left_context[2];  /* (up to) 4 contexts "" */
 
   /* keyframe block modes are predicted by their above, left neighbors */
 
   vp8_prob kf_bmode_prob [VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES - 1];
   vp8_prob kf_ymode_prob[8][VP8_YMODES - 1]; /* keyframe "" */
+#if CONFIG_SUPERBLOCKS
+  vp8_prob sb_kf_ymode_prob[8][VP8_I32X32_MODES - 1];
+#endif
   int kf_ymode_probs_index;
   int kf_ymode_probs_update;
   vp8_prob kf_uv_mode_prob[VP8_YMODES] [VP8_UV_MODES - 1];
@@ -239,6 +242,9 @@
   vp8_prob prob_intra_coded;
   vp8_prob prob_last_coded;
   vp8_prob prob_gf_coded;
+#if CONFIG_SUPERBLOCKS
+  vp8_prob sb_coded;
+#endif
 
   // Context probabilities when using predictive coding of segment id
   vp8_prob segment_pred_probs[PREDICTION_PROBS];
--- a/vp8/common/pred_common.c
+++ b/vp8/common/pred_common.c
@@ -1,3 +1,4 @@
+
 /*
  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
  *
@@ -224,10 +225,24 @@
   switch (pred_id) {
     case PRED_SEG_ID:
       xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride].mbmi.seg_id_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride+1].mbmi.seg_id_predicted = pred_flag;
+      }
+#endif
       break;
 
     case PRED_REF:
       xd->mode_info_context->mbmi.ref_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride].mbmi.ref_predicted = pred_flag;
+        xd->mode_info_context[xd->mode_info_stride+1].mbmi.ref_predicted = pred_flag;
+      }
+#endif
       break;
 
     case PRED_MBSKIP:
--- a/vp8/common/recon.c
+++ b/vp8/common/recon.c
@@ -124,6 +124,52 @@
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y;
+  BLOCKD *b = &xd->block[0];
+  int stride = b->dst_stride;
+  short *diff = b->diff;
+
+  for (y = 0; y < 16; y++) {
+    for (x = 0; x < 16; x++) {
+      int a = dst[x] + diff[x];
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+      dst[x] = a;
+    }
+    dst += stride;
+    diff += 16;
+  }
+}
+
+void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, i;
+  uint8_t *dst = udst;
+
+  for (i = 0; i < 2; i++, dst = vdst) {
+    BLOCKD *b = &xd->block[16 + 4 * i];
+    int stride = b->dst_stride;
+    short *diff = b->diff;
+
+    for (y = 0; y < 8; y++) {
+      for (x = 0; x < 8; x++) {
+        int a = dst[x] + diff[x];
+        if (a < 0)
+          a = 0;
+        else if (a > 255)
+          a = 255;
+        dst[x] = a;
+      }
+      dst += stride;
+      diff += 8;
+    }
+  }
+}
+#endif
+
 void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) {
 #if ARCH_ARM
   BLOCKD *b = &xd->block[0];
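
The inner loops of the new "_s" recon functions above are the usual
reconstruction step, writing straight into the frame buffer instead of
the predictor scratch area. The clamp is equivalent to this helper
(illustration; the patch writes it inline):

    static unsigned char clip_255(int v) {
      return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
    /* so each loop body reads: dst[x] = clip_255(dst[x] + diff[x]); */
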
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -100,6 +100,11 @@
 #endif
 extern prototype_recon_macroblock(vp8_recon_recon_mby);
 
+#ifndef vp8_recon_build_intra_predictors_sby_s
+#define vp8_recon_build_intra_predictors_sby_s vp8_build_intra_predictors_sby_s
+#endif
+extern prototype_build_intra_predictors(vp8_recon_build_intra_predictors_sby_s);
+
 #ifndef vp8_recon_build_intra_predictors_mby
 #define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
 #endif
@@ -126,6 +131,11 @@
 extern prototype_build_intra_predictors\
 (vp8_recon_build_intra_predictors_mby_s);
 
+#ifndef vp8_recon_build_intra_predictors_sbuv_s
+#define vp8_recon_build_intra_predictors_sbuv_s vp8_build_intra_predictors_sbuv_s
+#endif
+extern prototype_build_intra_predictors(vp8_recon_build_intra_predictors_sbuv_s);
+
 #ifndef vp8_recon_build_intra_predictors_mbuv
 #define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
 #endif
@@ -214,10 +224,16 @@
   vp8_recon_fn_t       recon4;
   vp8_recon_mb_fn_t    recon_mb;
   vp8_recon_mb_fn_t    recon_mby;
+#if CONFIG_SUPERBLOCKS
+  vp8_build_intra_pred_fn_t  build_intra_predictors_sby_s;
+#endif
   vp8_build_intra_pred_fn_t  build_intra_predictors_mby_s;
   vp8_build_intra_pred_fn_t  build_intra_predictors_mby;
 #if CONFIG_COMP_INTRA_PRED
   vp8_build_intra_pred_fn_t  build_comp_intra_predictors_mby;
+#endif
+#if CONFIG_SUPERBLOCKS
+  vp8_build_intra_pred_fn_t  build_intra_predictors_sbuv_s;
 #endif
   vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv_s;
   vp8_build_intra_pred_fn_t  build_intra_predictors_mbuv;
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -759,6 +759,56 @@
   vp8_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride) {
+  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+          *v2 = x->second_pre.v_buffer;
+  int n;
+
+  for (n = 0; n < 4; n++)
+  {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+    vp8_build_1st_inter16x16_predictors_mb(x,
+      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+      dst_ystride, dst_uvstride);
+    if (x->mode_info_context->mbmi.second_ref_frame) {
+      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;
+      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;
+
+      vp8_build_2nd_inter16x16_predictors_mb(x,
+        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,
+        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,
+        dst_ystride, dst_uvstride);
+    }
+  }
+
+  x->pre.y_buffer = y1;
+  x->pre.u_buffer = u1;
+  x->pre.v_buffer = v1;
+
+  if (x->mode_info_context->mbmi.second_ref_frame) {
+    x->second_pre.y_buffer = y2;
+    x->second_pre.u_buffer = u2;
+    x->second_pre.v_buffer = v2;
+  }
+}
+#endif
+
 /*
  * The following functions should be called after an initial
  * call to vp8_build_inter16x16_predictors_mb() or _mby()/_mbuv().
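
The quadrant bookkeeping in vp8_build_inter32x32_predictors_sb() above
deserves a note: n & 1 and n >> 1 select the column and row of each
16x16 quadrant, and the pre/second_pre pointers are temporarily
advanced so the existing 16x16 predictors are reused unchanged. A
standalone check of that arithmetic (illustration only):

    #include <stdio.h>

    int main(void) {
      int n;
      for (n = 0; n < 4; n++) {
        const int x_idx = n & 1, y_idx = n >> 1;
        printf("MB %d: luma +(%2d,%2d)  chroma +(%d,%d)\n",
               n, x_idx * 16, y_idx * 16, x_idx * 8, y_idx * 8);
      }
      return 0;  /* luma offsets: (0,0) (16,0) (0,16) (16,16) */
    }
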
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -207,17 +207,18 @@
   }
 }
 
-void vp8_build_intra_predictors_mby_internal(MACROBLOCKD *xd,
-                                             unsigned char *ypred_ptr,
-                                             int y_stride, int mode) {
+void vp8_build_intra_predictors_internal(MACROBLOCKD *xd,
+                                         unsigned char *src, int src_stride,
+                                         unsigned char *ypred_ptr,
+                                         int y_stride, int mode, int bsize) {
 
-  unsigned char *yabove_row = xd->dst.y_buffer - xd->dst.y_stride;
-  unsigned char yleft_col[16];
+  unsigned char *yabove_row = src - src_stride;
+  unsigned char yleft_col[32];
   unsigned char ytop_left = yabove_row[-1];
   int r, c, i;
 
-  for (i = 0; i < 16; i++) {
-    yleft_col[i] = xd->dst.y_buffer [i * xd->dst.y_stride - 1];
+  for (i = 0; i < bsize; i++) {
+    yleft_col[i] = xd->dst.y_buffer [i * src_stride - 1];
   }
 
   /* for Y */
@@ -227,58 +228,58 @@
       int i;
       int shift;
       int average = 0;
+      int log2_bsize_minus_1;
 
+      assert(bsize == 8 || bsize == 16 || bsize == 32);
+      if (bsize == 8) {
+        log2_bsize_minus_1 = 2;
+      } else if (bsize == 16) {
+        log2_bsize_minus_1 = 3;
+      } else /* bsize == 32 */ {
+        log2_bsize_minus_1 = 4;
+      }
 
       if (xd->up_available || xd->left_available) {
         if (xd->up_available) {
-          for (i = 0; i < 16; i++) {
+          for (i = 0; i < bsize; i++) {
             average += yabove_row[i];
           }
         }
 
         if (xd->left_available) {
-          for (i = 0; i < 16; i++) {
+          for (i = 0; i < bsize; i++) {
             average += yleft_col[i];
           }
         }
-        shift = 3 + xd->up_available + xd->left_available;
+        shift = log2_bsize_minus_1 + xd->up_available + xd->left_available;
         expected_dc = (average + (1 << (shift - 1))) >> shift;
       } else {
         expected_dc = 128;
       }
 
-      for (r = 0; r < 16; r++) {
-        vpx_memset(ypred_ptr, expected_dc, 16);
-        ypred_ptr += y_stride; /*16;*/
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, expected_dc, bsize);
+        ypred_ptr += y_stride;
       }
     }
     break;
     case V_PRED: {
-
-      for (r = 0; r < 16; r++) {
-
-        ((int *)ypred_ptr)[0] = ((int *)yabove_row)[0];
-        ((int *)ypred_ptr)[1] = ((int *)yabove_row)[1];
-        ((int *)ypred_ptr)[2] = ((int *)yabove_row)[2];
-        ((int *)ypred_ptr)[3] = ((int *)yabove_row)[3];
+      for (r = 0; r < bsize; r++) {
+        memcpy(ypred_ptr, yabove_row, bsize);
         ypred_ptr += y_stride;
       }
     }
     break;
     case H_PRED: {
-
-      for (r = 0; r < 16; r++) {
-
-        vpx_memset(ypred_ptr, yleft_col[r], 16);
+      for (r = 0; r < bsize; r++) {
+        vpx_memset(ypred_ptr, yleft_col[r], bsize);
         ypred_ptr += y_stride;
       }
-
     }
     break;
     case TM_PRED: {
-
-      for (r = 0; r < 16; r++) {
-        for (c = 0; c < 16; c++) {
+      for (r = 0; r < bsize; r++) {
+        for (c = 0; c < bsize; c++) {
           int pred =  yleft_col[r] + yabove_row[ c] - ytop_left;
 
           if (pred < 0)
@@ -292,31 +293,30 @@
 
         ypred_ptr += y_stride;
       }
-
     }
     break;
     case D45_PRED: {
-      d45_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D135_PRED: {
-      d135_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D117_PRED: {
-      d117_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D153_PRED: {
-      d153_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D27_PRED: {
-      d27_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case D63_PRED: {
-      d63_predictor(ypred_ptr, y_stride, 16,  yabove_row, yleft_col);
+      d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);
     }
     break;
     case I8X8_PRED:
@@ -332,25 +332,36 @@
 }
 
 void vp8_build_intra_predictors_mby(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mby_internal(xd, xd->predictor, 16,
-                                          xd->mode_info_context->mbmi.mode);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->predictor, 16,
+                                      xd->mode_info_context->mbmi.mode, 16);
 }
 
 void vp8_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mby_internal(xd, xd->dst.y_buffer,
-                                          xd->dst.y_stride,
-                                          xd->mode_info_context->mbmi.mode);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 16);
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_build_intra_predictors_sby_s(MACROBLOCKD *x) {
+  vp8_build_intra_predictors_internal(x, x->dst.y_buffer, x->dst.y_stride,
+                                      x->dst.y_buffer, x->dst.y_stride,
+                                      x->mode_info_context->mbmi.mode, 32);
+}
+#endif
+
 #if CONFIG_COMP_INTRA_PRED
 void vp8_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
   unsigned char predictor[2][256];
   int i;
 
-  vp8_build_intra_predictors_mby_internal(
-    xd, predictor[0], 16, xd->mode_info_context->mbmi.mode);
-  vp8_build_intra_predictors_mby_internal(
-    xd, predictor[1], 16, xd->mode_info_context->mbmi.second_mode);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[0], 16,
+                                      xd->mode_info_context->mbmi.mode);
+  vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+                                      predictor[1], 16,
+                                      xd->mode_info_context->mbmi.second_mode);
 
   for (i = 0; i < 256; i++) {
     xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
@@ -362,172 +373,37 @@
                                               unsigned char *upred_ptr,
                                               unsigned char *vpred_ptr,
                                               int uv_stride,
-                                              int mode) {
-  YV12_BUFFER_CONFIG * dst = &xd->dst;
-  unsigned char *uabove_row = dst->u_buffer - dst->uv_stride;
-  unsigned char uleft_col[16];
-  unsigned char utop_left = uabove_row[-1];
-  unsigned char *vabove_row = dst->v_buffer - dst->uv_stride;
-  unsigned char vleft_col[20];
-  unsigned char vtop_left = vabove_row[-1];
-
-  int i, j;
-
-  for (i = 0; i < 8; i++) {
-    uleft_col[i] = dst->u_buffer [i * dst->uv_stride - 1];
-    vleft_col[i] = dst->v_buffer [i * dst->uv_stride - 1];
-  }
-
-  switch (mode) {
-    case DC_PRED: {
-      int expected_udc;
-      int expected_vdc;
-      int i;
-      int shift;
-      int Uaverage = 0;
-      int Vaverage = 0;
-
-      if (xd->up_available) {
-        for (i = 0; i < 8; i++) {
-          Uaverage += uabove_row[i];
-          Vaverage += vabove_row[i];
-        }
-      }
-
-      if (xd->left_available) {
-        for (i = 0; i < 8; i++) {
-          Uaverage += uleft_col[i];
-          Vaverage += vleft_col[i];
-        }
-      }
-
-      if (!xd->up_available && !xd->left_available) {
-        expected_udc = 128;
-        expected_vdc = 128;
-      } else {
-        shift = 2 + xd->up_available + xd->left_available;
-        expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
-        expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
-      }
-
-
-      /*vpx_memset(upred_ptr,expected_udc,64);*/
-      /*vpx_memset(vpred_ptr,expected_vdc,64);*/
-      for (i = 0; i < 8; i++) {
-        vpx_memset(upred_ptr, expected_udc, 8);
-        vpx_memset(vpred_ptr, expected_vdc, 8);
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-    }
-    break;
-    case V_PRED: {
-      int i;
-
-      for (i = 0; i < 8; i++) {
-        vpx_memcpy(upred_ptr, uabove_row, 8);
-        vpx_memcpy(vpred_ptr, vabove_row, 8);
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-
-    }
-    break;
-    case H_PRED: {
-      int i;
-
-      for (i = 0; i < 8; i++) {
-        vpx_memset(upred_ptr, uleft_col[i], 8);
-        vpx_memset(vpred_ptr, vleft_col[i], 8);
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-    }
-
-    break;
-    case TM_PRED: {
-      int i;
-
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-          int predu = uleft_col[i] + uabove_row[j] - utop_left;
-          int predv = vleft_col[i] + vabove_row[j] - vtop_left;
-
-          if (predu < 0)
-            predu = 0;
-
-          if (predu > 255)
-            predu = 255;
-
-          if (predv < 0)
-            predv = 0;
-
-          if (predv > 255)
-            predv = 255;
-
-          upred_ptr[j] = predu;
-          vpred_ptr[j] = predv;
-        }
-
-        upred_ptr += uv_stride; /*8;*/
-        vpred_ptr += uv_stride; /*8;*/
-      }
-
-    }
-    break;
-    case D45_PRED: {
-      d45_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d45_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D135_PRED: {
-      d135_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d135_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D117_PRED: {
-      d117_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d117_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D153_PRED: {
-      d153_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d153_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D27_PRED: {
-      d27_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d27_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case D63_PRED: {
-      d63_predictor(upred_ptr, uv_stride, 8,  uabove_row, uleft_col);
-      d63_predictor(vpred_ptr, uv_stride, 8,  vabove_row, vleft_col);
-    }
-    break;
-    case B_PRED:
-    case NEARESTMV:
-    case NEARMV:
-    case ZEROMV:
-    case NEWMV:
-    case SPLITMV:
-    case MB_MODE_COUNT:
-      break;
-  }
+                                              int mode, int bsize) {
+  vp8_build_intra_predictors_internal(xd, xd->dst.u_buffer, xd->dst.uv_stride,
+                                      upred_ptr, uv_stride, mode, bsize);
+  vp8_build_intra_predictors_internal(xd, xd->dst.v_buffer, xd->dst.uv_stride,
+                                      vpred_ptr, uv_stride, mode, bsize);
 }
 
 void vp8_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mbuv_internal(
-    xd, &xd->predictor[256], &xd->predictor[320],
-    8, xd->mode_info_context->mbmi.uv_mode);
+  vp8_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256],
+                                           &xd->predictor[320], 8,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
 }
 
 void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) {
-  vp8_build_intra_predictors_mbuv_internal(
-    xd, xd->dst.u_buffer, xd->dst.v_buffer,
-    xd->dst.uv_stride, xd->mode_info_context->mbmi.uv_mode);
+  vp8_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer,
+                                           xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           8);
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
+  vp8_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer, xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           16);
+}
+#endif
+
 #if CONFIG_COMP_INTRA_PRED
 void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
   unsigned char predictor[2][2][64];
@@ -541,7 +417,8 @@
     xd->mode_info_context->mbmi.second_uv_mode);
   for (i = 0; i < 64; i++) {
     xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
-    xd->predictor[256 + 64 + i] = (predictor[1][0][i] + predictor[1][1][i] + 1) >> 1;
+    xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
+                                   predictor[1][1][i] + 1) >> 1;
   }
 }
 #endif
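
The only non-obvious part of the bsize generalization above is the DC
rounding shift: with both neighbors available, 2 * bsize samples are
summed, so the shift must be log2(bsize) - 1 plus one per available
edge. A self-contained sketch of the same computation:

    /* DC_PRED average as generalized in the patch (sketch). */
    static int dc_predict(const unsigned char *above,
                          const unsigned char *left,
                          int bsize, int up_avail, int left_avail) {
      /* log2(bsize) - 1: 2 for 8, 3 for 16, 4 for 32 */
      int i, sum = 0, shift = (bsize == 8 ? 2 : bsize == 16 ? 3 : 4);
      if (!up_avail && !left_avail)
        return 128;                      /* no context: flat gray */
      if (up_avail)
        for (i = 0; i < bsize; i++) sum += above[i];
      if (left_avail)
        for (i = 0; i < bsize; i++) sum += left[i];
      shift += up_avail + left_avail;    /* e.g. 32x32, both: 6 */
      return (sum + (1 << (shift - 1))) >> shift;  /* rounded mean */
    }
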
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -29,34 +29,31 @@
 #endif
 
 static int vp8_read_bmode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_bmode_tree, p);
-
-  return i;
+  return vp8_treed_read(bc, vp8_bmode_tree, p);
 }
 
 
 static int vp8_read_ymode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_ymode_tree, p);
+  return vp8_treed_read(bc, vp8_ymode_tree, p);
+}
 
-  return i;
+#if CONFIG_SUPERBLOCKS
+static int vp8_sb_kfread_ymode(vp8_reader *bc, const vp8_prob *p) {
+  return vp8_treed_read(bc, vp8_uv_mode_tree, p);
 }
+#endif
 
 static int vp8_kfread_ymode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_kf_ymode_tree, p);
-
-  return i;
+  return vp8_treed_read(bc, vp8_kf_ymode_tree, p);
 }
-static int vp8_read_i8x8_mode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_i8x8_mode_tree, p);
 
-  return i;
+static int vp8_read_i8x8_mode(vp8_reader *bc, const vp8_prob *p) {
+  return vp8_treed_read(bc, vp8_i8x8_mode_tree, p);
 }
 
 
 static int vp8_read_uv_mode(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_uv_mode_tree, p);
-
-  return i;
+  return vp8_treed_read(bc, vp8_uv_mode_tree, p);
 }
 
 // This function reads the current macro block's segment id from the bitstream
@@ -112,8 +109,14 @@
       m->mbmi.mb_skip_coeff = 0;
   }
 
+#if CONFIG_SUPERBLOCKS
+  if (m->mbmi.encoded_as_sb) {
+    y_mode = (MB_PREDICTION_MODE) vp8_sb_kfread_ymode(bc,
+      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+  } else
+#endif
   y_mode = (MB_PREDICTION_MODE) vp8_kfread_ymode(bc,
-                                                 pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+    pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
 #if CONFIG_COMP_INTRA_PRED
   m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
 #endif
@@ -398,16 +401,18 @@
   return (MV_REFERENCE_FRAME)ref_frame;
 }
 
-static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_mv_ref_tree, p);
+#if CONFIG_SUPERBLOCKS
+static MB_PREDICTION_MODE read_sb_mv_ref(vp8_reader *bc, const vp8_prob *p) {
+  return (MB_PREDICTION_MODE) vp8_treed_read(bc, vp8_sb_mv_ref_tree, p);
+}
+#endif
 
-  return (MB_PREDICTION_MODE)i;
+static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p) {
+  return (MB_PREDICTION_MODE) vp8_treed_read(bc, vp8_mv_ref_tree, p);
 }
 
 static B_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p) {
-  const int i = vp8_treed_read(bc, vp8_sub_mv_ref_tree, p);
-
-  return (B_PREDICTION_MODE)i;
+  return (B_PREDICTION_MODE) vp8_treed_read(bc, vp8_sub_mv_ref_tree, p);
 }
 
 #ifdef VPX_MODE_COUNT
@@ -537,15 +542,36 @@
         // Else .... decode it explicitly
         else {
           vp8_read_mb_segid(bc, mbmi, xd);
-          cm->last_frame_seg_map[index] = mbmi->segment_id;
         }
-
       }
       // Normal unpredicted coding mode
       else {
         vp8_read_mb_segid(bc, mbmi, xd);
+      }
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        cm->last_frame_seg_map[index] =
+        cm->last_frame_seg_map[index + 1] =
+        cm->last_frame_seg_map[index + cm->mb_cols] =
+        cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
+      } else
+#endif
+      {
         cm->last_frame_seg_map[index] = mbmi->segment_id;
       }
+    } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        mbmi->segment_id =
+              cm->last_frame_seg_map[index] &&
+              cm->last_frame_seg_map[index + 1] &&
+              cm->last_frame_seg_map[index + cm->mb_cols] &&
+              cm->last_frame_seg_map[index + cm->mb_cols + 1];
+      } else
+#endif
+      {
+        mbmi->segment_id = cm->last_frame_seg_map[index];
+      }
     }
   } else {
     // The encoder explicitly sets the segment_id to 0
@@ -667,6 +693,11 @@
       mbmi->mode =
         get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
     } else {
+#if CONFIG_SUPERBLOCKS
+      if (mbmi->encoded_as_sb) {
+        mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
+      } else
+#endif
       mbmi->mode = read_mv_ref(bc, mv_ref_p);
 
       vp8_accum_mv_refs(&pbi->common, mbmi->mode, rct);
@@ -963,6 +994,7 @@
       mbmi->mode = (MB_PREDICTION_MODE)
                    get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
     else {
+      // FIXME write using SB mode tree
       mbmi->mode = (MB_PREDICTION_MODE)
                    vp8_read_ymode(bc, pbi->common.fc.ymode_prob);
       pbi->common.fc.ymode_counts[mbmi->mode]++;
@@ -1045,6 +1077,9 @@
     int mb_row = (sb_row << 1);
 
     for (sb_col = 0; sb_col < sb_cols; sb_col++) {
+#if CONFIG_SUPERBLOCKS
+      mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, cm->sb_coded);
+#endif
       for (i = 0; i < 4; i++) {
 
         int dy = row_delta[i];
@@ -1059,6 +1094,10 @@
           prev_mi += offset_extended;
           continue;
         }
+#if CONFIG_SUPERBLOCKS
+        if (i)
+          mi->mbmi.encoded_as_sb = 0;
+#endif
 
         // Make sure the MacroBlockD mode info pointer is set correctly
         xd->mode_info_context = mi;
@@ -1073,6 +1112,18 @@
         else
           read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row,
                            mb_col);
+
+#if CONFIG_SUPERBLOCKS
+        if (mi->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          mi[1] = mi[cm->mode_info_stride] =
+            mi[cm->mode_info_stride + 1] = mi[0];
+          mi += 2;
+          prev_mi += 2;
+          break;
+        }
+#endif
 
         /* next macroblock */
         mb_row += dy;
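
Condensing the decodemv.c control flow above: the encoded_as_sb flag is
read once per 32x32 block, the top-left macroblock's modes are decoded
normally, and if the flag is set its MODE_INFO is copied to the other
three positions and those quadrants are skipped. Roughly (a sketch,
with the mode decoding elided to comments):

    mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, cm->sb_coded);
    for (i = 0; i < 4; i++) {
      if (i)
        mi->mbmi.encoded_as_sb = 0; /* only quadrant 0 carries the flag */
      /* ... read_mb_modes_mv() / kf mode reading for this MB ... */
      if (mi->mbmi.encoded_as_sb) {
        mi[1] = mi[cm->mode_info_stride] =
          mi[cm->mode_info_stride + 1] = mi[0];
        mb_col += 2;
        mi += 2;
        break;  /* remaining three MBs share this MODE_INFO */
      }
      /* ... otherwise step to the next MB via row_delta/col_delta ... */
    }
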
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -175,10 +175,27 @@
  */
 static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) {
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sbuv_s)(xd);
+      RECON_INVOKE(&pbi->common.rtcd.recon,
+                   build_intra_predictors_sby_s)(xd);
+    } else {
+#endif
     RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
     RECON_INVOKE(&pbi->common.rtcd.recon,
                  build_intra_predictors_mby_s)(xd);
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
   } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else {
+#endif
     vp8_build_1st_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
                                            xd->dst.u_buffer, xd->dst.v_buffer,
                                            xd->dst.y_stride, xd->dst.uv_stride);
@@ -188,6 +205,9 @@
                                              xd->dst.u_buffer, xd->dst.v_buffer,
                                              xd->dst.y_stride, xd->dst.uv_stride);
     }
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
   }
 #ifdef DEC_DEBUG
   if (dec_debug) {
@@ -204,11 +224,15 @@
 
 extern const int vp8_i8x8_block[4];
 static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
-                              unsigned int mb_idx) {
+                              unsigned int mb_col) {
   int eobtotal = 0;
   MB_PREDICTION_MODE mode;
   int i;
   int tx_type;
+#if CONFIG_SUPERBLOCKS
+  VP8_COMMON *pc = &pbi->common;
+  int orig_skip_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+#endif
 
 #if CONFIG_HYBRIDTRANSFORM
   int QIndex = xd->q_index;
@@ -264,11 +288,25 @@
     xd->mode_info_context->mbmi.txfm_size = TX_8X8;
   }
 #endif
+#if CONFIG_SUPERBLOCKS
+  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+  }
+#endif
 
   tx_type = xd->mode_info_context->mbmi.txfm_size;
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
     vp8_reset_mb_tokens_context(xd);
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      xd->above_context++;
+      xd->left_context++;
+      vp8_reset_mb_tokens_context(xd);
+      xd->above_context--;
+      xd->left_context--;
+    }
+#endif
   } else if (!vp8dx_bool_error(xd->current_bc)) {
     for (i = 0; i < 25; i++) {
       xd->block[i].eob = 0;
@@ -311,8 +349,13 @@
      * */
     xd->mode_info_context->mbmi.mb_skip_coeff = 1;
 
-    skip_recon_mb(pbi, xd);
-    return;
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb || orig_skip_flag)
+#endif
+    {
+      skip_recon_mb(pbi, xd);
+      return;
+    }
   }
 
 #ifdef DEC_DEBUG
@@ -343,6 +386,12 @@
 
   /* do prediction */
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sby_s)(xd);
+      RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_sbuv_s)(xd);
+    } else
+#endif
     if (mode != I8X8_PRED) {
       RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd);
       if (mode != B_PRED) {
@@ -358,6 +407,13 @@
 #endif
     }
   } else {
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+    } else
+#endif
     vp8_build_inter_predictors_mb(xd);
   }
 
@@ -481,6 +537,32 @@
     else
 #endif
     if (tx_type == TX_8X8) {
+#if CONFIG_SUPERBLOCKS
+      void *orig = xd->mode_info_context;
+      int n, num = xd->mode_info_context->mbmi.encoded_as_sb ? 4 : 1;
+      for (n = 0; n < num; n++) {
+        if (n != 0) {
+          for (i = 0; i < 25; i++) {
+            xd->block[i].eob = 0;
+            xd->eobs[i] = 0;
+          }
+          xd->above_context = pc->above_context + mb_col + (n & 1);
+          xd->left_context = pc->left_context + (n >> 1);
+          xd->mode_info_context = orig;
+          xd->mode_info_context += (n & 1);
+          xd->mode_info_context += (n >> 1) * pc->mode_info_stride;
+          if (!orig_skip_flag) {
+            eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd);
+            if (eobtotal == 0) // skip loopfilter
+              xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+          } else {
+            vp8_reset_mb_tokens_context(xd);
+          }
+        }
+
+        if (xd->mode_info_context->mbmi.mb_skip_coeff)
+          continue; // only happens for SBs, which are already in dest buffer
+#endif
       DEQUANT_INVOKE(&pbi->dequant, block_2x2)(b);
 #ifdef DEC_DEBUG
       if (dec_debug) {
@@ -501,10 +583,27 @@
       ((int *)b->qcoeff)[5] = 0;
       ((int *)b->qcoeff)[6] = 0;
       ((int *)b->qcoeff)[7] = 0;
-      DEQUANT_INVOKE(&pbi->dequant, dc_idct_add_y_block_8x8)
-      (xd->qcoeff, xd->block[0].dequant,
-       xd->predictor, xd->dst.y_buffer,
-       xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        vp8_dequant_dc_idct_add_y_block_8x8_inplace_c(xd->qcoeff,
+          xd->block[0].dequant,
+          xd->dst.y_buffer + (n >> 1) * 16 * xd->dst.y_stride + (n & 1) * 16,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+        // do UV inline also
+        vp8_dequant_idct_add_uv_block_8x8_inplace_c(xd->qcoeff + 16 * 16,
+          xd->block[16].dequant,
+          xd->dst.u_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.v_buffer + (n >> 1) * 8 * xd->dst.uv_stride + (n & 1) * 8,
+          xd->dst.uv_stride, xd->eobs + 16, xd);
+      } else
+#endif
+        DEQUANT_INVOKE(&pbi->dequant, dc_idct_add_y_block_8x8)(xd->qcoeff,
+          xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
+          xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
+#if CONFIG_SUPERBLOCKS
+      }
+      xd->mode_info_context = orig;
+#endif
     } else {
       DEQUANT_INVOKE(&pbi->dequant, block)(b);
       if (xd->eobs[24] > 1) {
@@ -529,7 +628,10 @@
     }
   }
 
-  if (tx_type == TX_8X8
+#if CONFIG_SUPERBLOCKS
+    if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+      if (tx_type == TX_8X8
 #if CONFIG_TX16X16
       || tx_type == TX_16X16
 #endif
@@ -543,6 +645,9 @@
     (xd->qcoeff + 16 * 16, xd->block[16].dequant,
      xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,
      xd->dst.uv_stride, xd->eobs + 16);
+#if CONFIG_SUPERBLOCKS
+  }
+#endif
 }
 
 
@@ -582,15 +687,21 @@
   int row_delta[4] = { 0, +1,  0, -1};
   int col_delta[4] = { +1, -1, +1, +1};
   int sb_cols = (pc->mb_cols + 1) >> 1;
-  ENTROPY_CONTEXT_PLANES left_context[2];
 
   // For a SB there are 2 left contexts, each pertaining to a MB row within
-  vpx_memset(left_context, 0, sizeof(left_context));
+  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
 
   mb_row = mbrow;
   mb_col = 0;
 
   for (sb_col = 0; sb_col < sb_cols; sb_col++) {
+    MODE_INFO *mi = xd->mode_info_context;
+
+#if CONFIG_SUPERBLOCKS
+    if (pbi->interleaved_decoding)
+      mi->mbmi.encoded_as_sb = vp8_read(&pbi->bc, pc->sb_coded);
+#endif
+
     // Process the 4 MBs within the SB in the order:
     // top-left, top-right, bottom-left, bottom-right
     for (i = 0; i < 4; i++) {
@@ -598,6 +709,7 @@
       int dx = col_delta[i];
       int offset_extended = dy * xd->mode_info_stride + dx;
 
+      mi = xd->mode_info_context;
       if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
         // MB lies outside frame, skip on to next
         mb_row += dy;
@@ -610,13 +722,10 @@
 #ifdef DEC_DEBUG
       dec_debug = (pc->current_video_frame == 0 && mb_row == 0 && mb_col == 0);
 #endif
-      // Copy in the appropriate left context for this MB row
-      vpx_memcpy(&pc->left_context,
-                 &left_context[i >> 1],
-                 sizeof(ENTROPY_CONTEXT_PLANES));
 
       // Set above context pointer
       xd->above_context = pc->above_context + mb_col;
+      xd->left_context = pc->left_context + (i >> 1);
 
       /* Distance of Mb to the various image edges.
        * These are specified to 8th pel as they are always compared to
@@ -639,6 +748,10 @@
       xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
       xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
 
+#if CONFIG_SUPERBLOCKS
+      if (i)
+        mi->mbmi.encoded_as_sb = 0;
+#endif
       if(pbi->interleaved_decoding)
         vpx_decode_mb_mode_mv(pbi, xd, mb_row, mb_col);
 
@@ -681,15 +794,34 @@
         xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
       }
 
-      decode_macroblock(pbi, xd, mb_row * pc->mb_cols + mb_col);
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        mi[1] = mi[0];
+        mi[pc->mode_info_stride] = mi[0];
+        mi[pc->mode_info_stride + 1] = mi[0];
+      }
+#endif
+      decode_macroblock(pbi, xd, mb_col);
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        mi[1].mbmi.txfm_size = mi[0].mbmi.txfm_size;
+        mi[pc->mode_info_stride].mbmi.txfm_size = mi[0].mbmi.txfm_size;
+        mi[pc->mode_info_stride + 1].mbmi.txfm_size = mi[0].mbmi.txfm_size;
+      }
+#endif
 
       /* check if the boolean decoder has suffered an error */
       xd->corrupted |= vp8dx_bool_error(xd->current_bc);
 
-      // Store the modified left context for the MB row locally
-      vpx_memcpy(&left_context[i >> 1],
-                 &pc->left_context,
-                 sizeof(ENTROPY_CONTEXT_PLANES));
+#if CONFIG_SUPERBLOCKS
+      if (mi->mbmi.encoded_as_sb) {
+        assert(!i);
+        mb_col += 2;
+        xd->mode_info_context += 2;
+        xd->prev_mode_info_context += 2;
+        break;
+      }
+#endif
 
       // skip to next MB
       xd->mode_info_context += offset_extended;
@@ -806,7 +938,6 @@
     vp8_setup_interp_filters(xd, pc->mcomp_filter_type, pc);
   }
 
-  xd->left_context = &pc->left_context;
   xd->mode_info_context = pc->mi;
   xd->prev_mode_info_context = pc->prev_mi;
   xd->frame_type = pc->frame_type;
@@ -1150,6 +1281,10 @@
         pc->ref_pred_probs[i] = (vp8_prob)vp8_read_literal(bc, 8);
     }
   }
+
+#if CONFIG_SUPERBLOCKS
+  pc->sb_coded = vp8_read_literal(bc, 8);
+#endif
 
   /* Read the loop filter level and type */
   pc->txfm_mode = (TXFM_MODE) vp8_read_bit(bc);
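
One structural change above is easy to miss: the per-row entropy
context that decode_mb_rows() used to memcpy in and out of a local
left_context[2] now lives in VP8_COMMON (see the onyxc_int.h hunk), and
the decoder simply repoints xd at the correct row per quadrant:

    /* Per-quadrant context selection, as in the hunks above. */
    xd->above_context = pc->above_context + mb_col;   /* per MB column */
    xd->left_context  = pc->left_context + (i >> 1);  /* MB row 0/1 of SB */
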
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -127,6 +127,19 @@
 
 }
 
+#if CONFIG_SUPERBLOCKS
+void vp8_dequant_dc_idct_add_y_block_8x8_inplace_c
+(short *q, short *dq,
+ unsigned char *dst, int stride, char *eobs, short *dc, MACROBLOCKD *xd) {
+
+  vp8_dequant_dc_idct_add_8x8_c(q, dq, dst, dst, stride, stride, dc[0]);
+  vp8_dequant_dc_idct_add_8x8_c(&q[64], dq, dst + 8, dst + 8, stride, stride, dc[1]);
+  vp8_dequant_dc_idct_add_8x8_c(&q[128], dq, dst + 8 * stride, dst + 8 * stride, stride, stride, dc[4]);
+  vp8_dequant_dc_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8, dst + 8 * stride + 8, stride, stride, dc[8]);
+
+}
+#endif
+
 void vp8_dequant_idct_add_y_block_8x8_c
 (short *q, short *dq, unsigned char *pre,
  unsigned char *dst, int stride, char *eobs, MACROBLOCKD *xd) {
@@ -152,6 +165,18 @@
 
   vp8_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride);
 }
+
+#if CONFIG_SUPERBLOCKS
+void vp8_dequant_idct_add_uv_block_8x8_inplace_c
+(short *q, short *dq,
+ unsigned char *dstu, unsigned char *dstv, int stride, char *eobs, MACROBLOCKD *xd) {
+  vp8_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride);
+
+  q    += 64;
+
+  vp8_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride);
+}
+#endif
 
 #if CONFIG_LOSSLESS
 void vp8_dequant_dc_idct_add_y_block_lossless_c
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -149,7 +149,7 @@
 
   pbi->decoded_key_frame = 0;
 
-  pbi->interleaved_decoding = CONFIG_NEWBESTREFMV;
+  pbi->interleaved_decoding = CONFIG_NEWBESTREFMV || CONFIG_SUPERBLOCKS;
 
   return (VP8D_PTR) pbi;
 }
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -288,6 +288,12 @@
   vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
 }
 
+#if CONFIG_SUPERBLOCKS
+static void sb_kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p) {
+  vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_sb_kf_ymode_encodings + m);
+}
+#endif
+
 static void write_i8x8_mode(vp8_writer *bc, int m, const vp8_prob *p) {
   vp8_write_token(bc, vp8_i8x8_mode_tree, p, vp8_i8x8_mode_encodings + m);
 }
@@ -533,6 +539,16 @@
                   vp8_mv_ref_encoding_array - NEARESTMV + m);
 }
 
+#if CONFIG_SUPERBLOCKS
+static void write_sb_mv_ref(vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p) {
+#if CONFIG_DEBUG
+  assert(NEARESTMV <= m  &&  m < SPLITMV);
+#endif
+  vp8_write_token(w, vp8_sb_mv_ref_tree, p,
+                  vp8_sb_mv_ref_encoding_array - NEARESTMV + m);
+}
+#endif
+
 static void write_sub_mv_ref
 (
   vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
@@ -810,6 +826,9 @@
 
       // Process the 4 MBs in the order:
       // top-left, top-right, bottom-left, bottom-right
+#if CONFIG_SUPERBLOCKS
+      vp8_write(w, m->mbmi.encoded_as_sb, pc->sb_coded);
+#endif
       for (i = 0; i < 4; i++) {
         MB_MODE_INFO *mi;
         MV_REFERENCE_FRAME rf;
@@ -872,7 +891,15 @@
         if (pc->mb_no_coeff_skip &&
             (!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
              (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          vp8_encode_bool(w, mi->mb_skip_coeff,
+          int skip_coeff = mi->mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+          if (mi->encoded_as_sb) {
+            skip_coeff &= m[1].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+          }
+#endif
+          vp8_encode_bool(w, skip_coeff,
                           get_pred_prob(pc, xd, PRED_MBSKIP));
         }
 
@@ -884,6 +911,8 @@
           active_section = 6;
 #endif
 
+          // TODO(rbultje) write using SB tree structure
+
           if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
             write_ymode(w, mode, pc->fc.ymode_prob);
           }
@@ -949,7 +978,14 @@
 
           // Is the segment coding of mode enabled
           if (!segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-            write_mv_ref(w, mode, mv_ref_p);
+#if CONFIG_SUPERBLOCKS
+            if (mi->encoded_as_sb) {
+              write_sb_mv_ref(w, mode, mv_ref_p);
+            } else
+#endif
+            {
+              write_mv_ref(w, mode, mv_ref_p);
+            }
             vp8_accum_mv_refs(&cpi->common, mode, ct);
           }
 
@@ -1085,6 +1121,17 @@
           }
         }
 
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          cpi->mb.partition_info += 2;
+          prev_m += 2;
+          break;
+        }
+#endif
+
         // Next MB
         mb_row += dy;
         mb_col += dx;
@@ -1151,6 +1198,9 @@
 
     mb_col = 0;
     for (col = 0; col < c->mb_cols; col += 2) {
+#if CONFIG_SUPERBLOCKS
+      vp8_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
+#endif
       // Process the 4 MBs in the order:
       // top-left, top-right, bottom-left, bottom-right
       for (i = 0; i < 4; i++) {
@@ -1181,11 +1231,27 @@
         if (c->mb_no_coeff_skip &&
             (!segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
              (get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          vp8_encode_bool(bc, m->mbmi.mb_skip_coeff,
+              int skip_coeff = m->mbmi.mb_skip_coeff;
+#if CONFIG_SUPERBLOCKS
+              if (m->mbmi.encoded_as_sb) {
+                skip_coeff &= m[1].mbmi.mb_skip_coeff;
+                skip_coeff &= m[mis].mbmi.mb_skip_coeff;
+                skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
+              }
+#endif
+              vp8_encode_bool(bc, skip_coeff,
                           get_pred_prob(c, xd, PRED_MBSKIP));
         }
-        kfwrite_ymode(bc, ym,
-                      c->kf_ymode_prob[c->kf_ymode_probs_index]);
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          sb_kfwrite_ymode(bc, ym,
+                           c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
+        } else
+#endif
+        {
+          kfwrite_ymode(bc, ym,
+                        c->kf_ymode_prob[c->kf_ymode_probs_index]);
+        }
 
         if (ym == B_PRED) {
           const int mis = c->mode_info_stride;
@@ -1233,6 +1299,14 @@
         } else
           write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
+#if CONFIG_SUPERBLOCKS
+        if (m->mbmi.encoded_as_sb) {
+          assert(!i);
+          mb_col += 2;
+          m += 2;
+          break;
+        }
+#endif
         // Next MB
         mb_row += dy;
         mb_col += dx;
@@ -1793,7 +1867,7 @@
   } else
     vp8_write_bit(bc, 0);
 }
-extern const unsigned int kf_y_mode_cts[8][VP8_YMODES];
+
 static void decide_kf_ymode_entropy(VP8_COMP *cpi) {
 
   int mode_cost[MB_MODE_COUNT];
@@ -1808,6 +1882,13 @@
     for (j = 0; j < VP8_YMODES; j++) {
       cost += mode_cost[j] * cpi->ymode_count[j];
     }
+#if CONFIG_SUPERBLOCKS
+    vp8_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
+                    vp8_sb_ymode_tree);
+    for (j = 0; j < VP8_I32X32_MODES; j++) {
+      cost += mode_cost[j] * cpi->sb_ymode_count[j];
+    }
+#endif
     if (cost < bestcost) {
       bestindex = i;
       bestcost = cost;
@@ -1906,11 +1987,6 @@
       // Select the coding strategy (temporal or spatial)
       choose_segmap_coding_method(cpi);
 
-      // Take a copy of the segment map if it changed for
-      // future comparison
-      vpx_memcpy(pc->last_frame_seg_map,
-                 cpi->segmentation_map, pc->MBs);
-
       // Write out the chosen coding method.
       vp8_write_bit(bc, (pc->temporal_update) ? 1 : 0);
     }
@@ -2047,6 +2123,19 @@
         vp8_write_bit(bc, 0);
     }
   }
+
+#if CONFIG_SUPERBLOCKS
+  {
+    /* sb mode probability */
+    int sb_coded = 256 - (cpi->sb_count << 8) / (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
+    if (sb_coded <= 0)
+      sb_coded = 1;
+    else if (sb_coded >= 256)
+      sb_coded = 255;
+    pc->sb_coded = sb_coded;
+    vp8_write_literal(bc, pc->sb_coded, 8);
+  }
+#endif
 
   vp8_write_bit(bc, pc->txfm_mode);
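
The sb_coded value written above is the probability (in 1/256 units)
that the encoded_as_sb bit is zero, which is how vp8_write() interprets
its probability argument. A worked example with hypothetical counts:

    /* Illustration: say 100 of 400 SBs were coded as superblocks. */
    int sb_count = 100, num_sbs = 400;               /* hypothetical   */
    int sb_coded = 256 - (sb_count << 8) / num_sbs;  /* 256 - 64 = 192 */
    /* clamped to [1, 255]; P(encoded_as_sb) = (256 - 192) / 256 = 25% */
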
 
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -82,7 +82,9 @@
   int best_mode_index;
   int rddiv;
   int rdmult;
-
+  int hybrid_pred_diff;
+  int comp_pred_diff;
+  int single_pred_diff;
 } PICK_MODE_CONTEXT;
 
 typedef struct {
@@ -139,12 +141,6 @@
   int mv_col_max;
   int mv_row_min;
   int mv_row_max;
-#if CONFIG_SUPERBLOCKS
-  int mv_col_min_sb;
-  int mv_col_max_sb;
-  int mv_row_min_sb;
-  int mv_row_max_sb;
-#endif
 
   int skip;
 
@@ -162,8 +158,6 @@
 
   int optimize;
   int q_index;
-
-  int encode_as_sb;
 
   // Structure to hold context for each of the 4 MBs within a SB:
   // when encoded as 4 independent MBs:
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -57,16 +57,24 @@
                                       MB_ROW_COMP *mbr_ei,
                                       int mb_row,
                                       int count);
-extern int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+                              int recon_yoffset, int recon_uvoffset,
+                              int *returnrate, int *returndistortion);
+extern void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
                                             int recon_yoffset,
-                                            int recon_uvoffset);
+                                            int recon_uvoffset, int *r, int *d);
 void vp8_build_block_offsets(MACROBLOCK *x);
 void vp8_setup_block_ptrs(MACROBLOCK *x);
 void vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
                                    int recon_yoffset, int recon_uvoffset,
                                    int output_enabled);
+void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                   int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row);
 void vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
                                     TOKENEXTRA **t, int output_enabled);
+void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
+                                    MACROBLOCK *x,
+                                    TOKENEXTRA **t, int mb_col);
 static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
 
 
@@ -378,6 +386,13 @@
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
+#if CONFIG_SUPERBLOCKS
+  if (mi->mbmi.encoded_as_sb) {
+    vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
+    vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride, mi, sizeof(MODE_INFO));
+    vpx_memcpy(xd->mode_info_context + cpi->common.mode_info_stride + 1, mi, sizeof(MODE_INFO));
+  }
+#endif
 
   if (mb_mode == B_PRED) {
     for (i = 0; i < 16; i++) {
@@ -448,6 +463,10 @@
 
     cpi->prediction_error += ctx->distortion;
     cpi->intra_error += ctx->intra_error;
+
+    cpi->rd_comp_pred_diff[0] += ctx->single_pred_diff;
+    cpi->rd_comp_pred_diff[1] += ctx->comp_pred_diff;
+    cpi->rd_comp_pred_diff[2] += ctx->hybrid_pred_diff;
   }
 }
 
@@ -458,7 +477,8 @@
                           MACROBLOCK  *x,
                           MACROBLOCKD *xd,
                           TOKENEXTRA **tp,
-                          int *totalrate) {
+                          int *totalrate,
+                          int *totaldist) {
   int i;
   int map_index;
   int recon_yoffset, recon_uvoffset;
@@ -477,7 +497,7 @@
 
   /* Function should not modify L & A contexts; save and restore on exit */
   vpx_memcpy(left_context,
-             cpi->left_context,
+             cm->left_context,
              sizeof(left_context));
   vpx_memcpy(above_context,
              initial_above_context_ptr,
@@ -525,9 +545,7 @@
 
     // Restore the appropriate left context depending on which
     // row in the SB the MB is situated
-    vpx_memcpy(&cm->left_context,
-               &cpi->left_context[i >> 1],
-               sizeof(ENTROPY_CONTEXT_PLANES));
+    xd->left_context = cm->left_context + (i >> 1);
 
     // Set up distance of MB to edge of frame in 1/8th pel units
     xd->mb_to_top_edge    = -((mb_row * 16) << 3);
@@ -568,9 +586,11 @@
     // Is segmentation enabled
     if (xd->segmentation_enabled) {
       // Code to set segment id in xd->mbmi.segment_id
-      if (cpi->segmentation_map[map_index] <= 3)
+      if (xd->update_mb_segmentation_map)
         mbmi->segment_id = cpi->segmentation_map[map_index];
       else
+        mbmi->segment_id = cm->last_frame_seg_map[map_index];
+      if (mbmi->segment_id > 3)
         mbmi->segment_id = 0;
 
       vp8cx_mb_init_quantizer(cpi, x);
@@ -583,22 +603,29 @@
     /* force 4x4 transform for mode selection */
     mbmi->txfm_size = TX_4X4; // TODO IS this right??
 
+#if CONFIG_SUPERBLOCKS
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+
     cpi->update_context = 0;    // TODO Do we need this now??
 
     // Find best coding mode & reconstruct the MB so it is available
     // as a predictor for MBs that follow in the SB
     if (cm->frame_type == KEY_FRAME) {
-      *totalrate += vp8_rd_pick_intra_mode(cpi, x);
+      int r, d;
+      vp8_rd_pick_intra_mode(cpi, x, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
 
-      // Save the coding context
-      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
-                 sizeof(MODE_INFO));
-
       // Dummy encode, do not do the tokenization
       vp8cx_encode_intra_macro_block(cpi, x, tp, 0);
       // Note the encoder may have changed the segment_id
+
+      // Save the coding context
+      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+                 sizeof(MODE_INFO));
     } else {
-      int seg_id;
+      int seg_id, r, d;
 
       if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
           !segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
@@ -612,9 +639,10 @@
         cpi->seg0_progress = (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) / cm->MBs;
       }
 
-      *totalrate += vp8cx_pick_mode_inter_macroblock(cpi, x,
-                                                     recon_yoffset,
-                                                     recon_uvoffset);
+      vp8cx_pick_mode_inter_macroblock(cpi, x, recon_yoffset,
+                                       recon_uvoffset, &r, &d);
+      *totalrate += r;
+      *totaldist += d;
 
       // Dummy encode, do not do the tokenization
       vp8cx_encode_inter_macroblock(cpi, x, tp,
@@ -639,11 +667,6 @@
       }
     }
 
-    // Keep a copy of the updated left context
-    vpx_memcpy(&cpi->left_context[i >> 1],
-               &cm->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
-
     // Next MB
     mb_row += dy;
     mb_col += dx;
@@ -664,7 +687,7 @@
   }
 
   /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cpi->left_context,
+  vpx_memcpy(cm->left_context,
              left_context,
              sizeof(left_context));
   vpx_memcpy(initial_above_context_ptr,
@@ -672,6 +695,156 @@
              sizeof(above_context));
 }
 
+#if CONFIG_SUPERBLOCKS
+static void pick_sb_modes (VP8_COMP *cpi,
+                           VP8_COMMON *cm,
+                           int mb_row,
+                           int mb_col,
+                           MACROBLOCK  *x,
+                           MACROBLOCKD *xd,
+                           TOKENEXTRA **tp,
+                           int *totalrate,
+                           int *totaldist)
+{
+  int map_index;
+  int recon_yoffset, recon_uvoffset;
+  int ref_fb_idx = cm->lst_fb_idx;
+  int dst_fb_idx = cm->new_fb_idx;
+  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  ENTROPY_CONTEXT_PLANES left_context[2];
+  ENTROPY_CONTEXT_PLANES above_context[2];
+  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
+    + mb_col;
+
+  /* Function should not modify L & A contexts; save and restore on exit */
+  vpx_memcpy (left_context,
+              cm->left_context,
+              sizeof(left_context));
+  vpx_memcpy (above_context,
+              initial_above_context_ptr,
+              sizeof(above_context));
+
+  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
+  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
+  /* set above context pointer */
+  xd->above_context = cm->above_context + mb_col;
+
+  /* Restore the appropriate left context depending on which
+   * row in the SB the MB is situated */
+  xd->left_context = cm->left_context;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+
+  /* Set up limit values for MV components to prevent them from
+   * extending beyond the UMV borders assuming 16x16 block size */
+  x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                   (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+
+  xd->up_available   = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
+
+  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
+  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
+
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+#if 0 // FIXME
+  /* Copy current MB to a work buffer */
+  RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer,
+                                            x->src.y_stride,
+                                            x->thismb, 16);
+#endif
+  x->rddiv = cpi->RDDIV;
+  x->rdmult = cpi->RDMULT;
+  if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp8_activity_masking(cpi, x);
+  /* Is segmentation enabled */
+  if (xd->segmentation_enabled)
+  {
+    /* Code to set segment id in xd->mbmi.segment_id */
+    if (xd->update_mb_segmentation_map)
+      xd->mode_info_context->mbmi.segment_id =
+            cpi->segmentation_map[map_index] &&
+            cpi->segmentation_map[map_index + 1] &&
+            cpi->segmentation_map[map_index + cm->mb_cols] &&
+            cpi->segmentation_map[map_index + cm->mb_cols + 1];
+    else
+      xd->mode_info_context->mbmi.segment_id =
+            cm->last_frame_seg_map[map_index] &&
+            cm->last_frame_seg_map[map_index + 1] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
+            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
+    if (xd->mode_info_context->mbmi.segment_id > 3)
+      xd->mode_info_context->mbmi.segment_id = 0;
+
+    vp8cx_mb_init_quantizer(cpi, x);
+  }
+  else
+    /* Set to Segment 0 by default */
+    xd->mode_info_context->mbmi.segment_id = 0;
+
+  x->active_ptr = cpi->active_map + map_index;
+
+  cpi->update_context = 0;    // TODO Do we need this now??
+
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME)
+  {
+    vp8_rd_pick_intra_mode_sb(cpi, x,
+                              totalrate,
+                              totaldist);
+
+    /* Save the coding context */
+    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+               sizeof(MODE_INFO));
+  }
+  else
+  {
+    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+        !segfeature_active( xd, 0, SEG_LVL_REF_FRAME ) &&
+        segfeature_active( xd, 1, SEG_LVL_REF_FRAME ) &&
+        check_segref(xd, 1, INTRA_FRAME)  +
+        check_segref(xd, 1, LAST_FRAME)   +
+        check_segref(xd, 1, GOLDEN_FRAME) +
+        check_segref(xd, 1, ALTREF_FRAME) == 1)
+    {
+      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+    }
+    else
+    {
+      cpi->seg0_progress =
+        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
+    }
+
+    vp8_rd_pick_inter_mode_sb(cpi, x,
+                              recon_yoffset,
+                              recon_uvoffset,
+                              totalrate,
+                              totaldist);
+  }
+
+  /* Restore L & A coding context to those in place on entry */
+  vpx_memcpy (cm->left_context,
+              left_context,
+              sizeof(left_context));
+  vpx_memcpy (initial_above_context_ptr,
+              above_context,
+              sizeof(above_context));
+}
+#endif
+
 static void encode_sb(VP8_COMP *cpi,
                       VP8_COMMON *cm,
                       int mbrow,
@@ -679,6 +852,7 @@
                       MACROBLOCK  *x,
                       MACROBLOCKD *xd,
                       TOKENEXTRA **tp) {
+  VP8_COMMON *pc = cm;
   int i;
   int map_index;
   int mb_row, mb_col;
@@ -733,22 +907,19 @@
 
     // Restore MB state to that when it was picked
 #if CONFIG_SUPERBLOCKS
-    if (x->encode_as_sb)
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
       update_state(cpi, x, &x->sb_context[i]);
-    else
+      cpi->sb_count++;
+    } else
 #endif
       update_state(cpi, x, &x->mb_context[i]);
 
-    // Copy in the appropriate left context
-    vpx_memcpy(&cm->left_context,
-               &cpi->left_context[i >> 1],
-               sizeof(ENTROPY_CONTEXT_PLANES));
-
     map_index = (mb_row * cpi->common.mb_cols) + mb_col;
     x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
 
     // reset above block coeffs
     xd->above_context = cm->above_context + mb_col;
+    xd->left_context  = cm->left_context + (i >> 1);
 
     // Set up distance of MB to edge of the frame in 1/8th pel units
     xd->mb_to_top_edge    = -((mb_row * 16) << 3);
@@ -756,25 +927,29 @@
     xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
     xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
 
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 16x16 block size
-    x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                     (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
-
 #if CONFIG_SUPERBLOCKS
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 32x32 block size
-    x->mv_row_min_sb = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_col_min_sb = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
-    x->mv_row_max_sb = ((cm->mb_rows - mb_row) * 16 +
-                        (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
-    x->mv_col_max_sb = ((cm->mb_cols - mb_col) * 16 +
-                        (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 32x32 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 32 - INTERP_EXTEND));
+    } else {
 #endif
+      // Set up limit values for MV components to prevent them from
+      // extending beyond the UMV borders assuming 16x16 block size
+      x->mv_row_min = -((mb_row * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_col_min = -((mb_col * 16) + VP8BORDERINPIXELS - INTERP_EXTEND);
+      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                       (VP8BORDERINPIXELS - 16 - INTERP_EXTEND));
+#if CONFIG_SUPERBLOCKS
+    }
+#endif
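
[Editor's note: the only difference between the two branches is the block-size term subtracted from the border headroom: a 32x32 prediction reaches 16 pels further right and down than a 16x16 one, so its maximum MV is tighter by one macroblock. A compact sketch of the shared computation; BORDER and EXTEND are illustrative stand-ins for VP8BORDERINPIXELS and INTERP_EXTEND, not their real values.]

    /* Sketch: MV range keeping a bsize x bsize prediction inside the
     * padded border. Values are illustrative only. */
    enum { BORDER = 32, EXTEND = 4 };

    static void mv_limits(int mb_row, int mb_col, int mb_rows, int mb_cols,
                          int bsize,  /* 16 for a MB, 32 for a SB */
                          int *row_min, int *row_max,
                          int *col_min, int *col_max) {
      *row_min = -((mb_row * 16) + BORDER - EXTEND);
      *col_min = -((mb_col * 16) + BORDER - EXTEND);
      *row_max = (mb_rows - mb_row) * 16 + (BORDER - bsize - EXTEND);
      *col_max = (mb_cols - mb_col) * 16 + (BORDER - bsize - EXTEND);
    }
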
 
     xd->up_available = (mb_row != 0);
     xd->left_available = (mb_col != 0);
@@ -796,16 +971,8 @@
 
     // Is segmentation enabled
     if (xd->segmentation_enabled) {
-      // Code to set segment id in xd->mbmi.segment_id
-      if (cpi->segmentation_map[map_index] <= 3)
-        mbmi->segment_id = cpi->segmentation_map[map_index];
-      else
-        mbmi->segment_id = 0;
-
       vp8cx_mb_init_quantizer(cpi, x);
-    } else
-      // Set to Segment 0 by default
-      mbmi->segment_id = 0;
+    }
 
     x->active_ptr = cpi->active_map + map_index;
 
@@ -812,8 +979,13 @@
     cpi->update_context = 0;
 
     if (cm->frame_type == KEY_FRAME) {
-      vp8cx_encode_intra_macro_block(cpi, x, tp, 1);
-      // Note the encoder may have changed the segment_id
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp8cx_encode_intra_super_block(cpi, x, tp, mb_col);
+      else
+#endif
+        vp8cx_encode_intra_macro_block(cpi, x, tp, 1);
+        // Note the encoder may have changed the segment_id
 
 #ifdef MODE_STATS
       y_modes[mbmi->mode]++;
@@ -822,10 +994,26 @@
       unsigned char *segment_id;
       int seg_ref_active;
 
-      vp8cx_encode_inter_macroblock(cpi, x, tp,
-                                    recon_yoffset, recon_uvoffset, 1);
-      // Note the encoder may have changed the segment_id
+      if (xd->mode_info_context->mbmi.ref_frame) {
+        unsigned char pred_context;
 
+        pred_context = get_pred_context(cm, xd, PRED_COMP);
+
+        if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME)
+          cpi->single_pred_count[pred_context]++;
+        else
+          cpi->comp_pred_count[pred_context]++;
+      }
+
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb)
+        vp8cx_encode_inter_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset, mb_col, mb_row);
+      else
+#endif
+        vp8cx_encode_inter_macroblock(cpi, x, tp,
+                                      recon_yoffset, recon_uvoffset, 1);
+        // Note the encoder may have changed the segment_id
+
 #ifdef MODE_STATS
       inter_y_modes[mbmi->mode]++;
 
@@ -864,11 +1052,21 @@
     // TODO Partitioning is broken!
     cpi->tplist[mb_row].stop = *tp;
 
-    // Copy back updated left context
-    vpx_memcpy(&cpi->left_context[i >> 1],
-               &cm->left_context,
-               sizeof(ENTROPY_CONTEXT_PLANES));
+#if CONFIG_SUPERBLOCKS
+    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      x->src.y_buffer += 32;
+      x->src.u_buffer += 16;
+      x->src.v_buffer += 16;
 
+      x->gf_active_ptr      += 2;
+      x->partition_info     += 2;
+      xd->mode_info_context += 2;
+      xd->prev_mode_info_context += 2;
+
+      break;
+    }
+#endif
+
     // Next MB
     mb_row += dy;
     mb_col += dx;
@@ -911,14 +1109,13 @@
   int mb_cols = cm->mb_cols;
 
   // Initialize the left context for the new SB row
-  vpx_memset(cpi->left_context, 0, sizeof(cpi->left_context));
-  vpx_memset(&cm->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
 
   // Code each SB in the row
   for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
-    int mb_rate = 0;
+    int mb_rate = 0, mb_dist = 0;
 #if CONFIG_SUPERBLOCKS
-    int sb_rate = INT_MAX;
+    int sb_rate = INT_MAX, sb_dist;
 #endif
 
 #if CONFIG_DEBUG
@@ -930,8 +1127,14 @@
     unsigned char *vb = x->src.v_buffer;
 #endif
 
+#if CONFIG_SUPERBLOCKS
     // Pick modes assuming the SB is coded as 4 independent MBs
-    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate);
+    xd->mode_info_context->mbmi.encoded_as_sb = 0;
+#endif
+    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
+#if CONFIG_SUPERBLOCKS
+    mb_rate += vp8_cost_bit(cm->sb_coded, 0);
+#endif
 
     x->src.y_buffer -= 32;
     x->src.u_buffer -= 16;
@@ -952,21 +1155,40 @@
 #endif
 
 #if CONFIG_SUPERBLOCKS
-    // Pick a mode assuming that it applies all 4 of the MBs in the SB
-    pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, &sb_rate);
+    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
+          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
+      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
+      sb_rate += vp8_cost_bit(cm->sb_coded, 1);
+    }
 
-    // Decide whether to encode as a SB or 4xMBs
-    if (sb_rate < mb_rate) {
-      x->encode_as_sb = 1;
+    /* Decide whether to encode as a SB or 4xMBs */
+    if (sb_rate < INT_MAX &&
+        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+      xd->mode_info_context->mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
+      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
       *totalrate += sb_rate;
     } else
 #endif
     {
-      x->encode_as_sb = 0;
+#if CONFIG_SUPERBLOCKS
+      xd->mode_info_context->mbmi.encoded_as_sb = 0;
+      if (cm->mb_cols - 1 > mb_col)
+        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
+      if (cm->mb_rows - 1 > mb_row) {
+        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+        if (cm->mb_cols - 1 > mb_col)
+          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+      }
+#endif
       *totalrate += mb_rate;
     }
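
[Editor's note: the arbitration above is a plain rate-distortion comparison; by this point each candidate's rate already includes the cost of its own sb_coded bit (see the vp8_cost_bit() calls earlier in the loop), so the two RDCOST values are directly comparable. A sketch of the decision, with rd() standing in for the encoder's RDCOST macro as a generic Lagrangian cost.]

    #include <limits.h>

    /* Sketch: choose between one 32x32 mode and four 16x16 modes.
     * rd() is a stand-in for RDCOST: a Lagrangian cost D + lambda * R. */
    static double rd(double lambda, int rate, int dist) {
      return dist + lambda * rate;
    }

    static int use_superblock(double lambda, int sb_rate, int sb_dist,
                              int mb_rate, int mb_dist) {
      if (sb_rate == INT_MAX)  /* SB mode skipped (partial SB at frame edge) */
        return 0;
      return rd(lambda, sb_rate, sb_dist) < rd(lambda, mb_rate, mb_dist);
    }
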
 
-    // Encode SB using best computed mode(s)
+    /* Encode SB using best computed mode(s) */
     encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
 
 #if CONFIG_DEBUG
@@ -1038,8 +1260,6 @@
   xd->mode_info_context->mbmi.mode = DC_PRED;
   xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
-  xd->left_context = &cm->left_context;
-
   vp8_zero(cpi->count_mb_ref_frame_usage)
   vp8_zero(cpi->bmode_count)
   vp8_zero(cpi->ymode_count)
@@ -1049,6 +1269,10 @@
   vp8_zero(cpi->mbsplit_count)
   vp8_zero(cpi->common.fc.mv_ref_ct)
   vp8_zero(cpi->common.fc.mv_ref_ct_a)
+#if CONFIG_SUPERBLOCKS
+  vp8_zero(cpi->sb_ymode_count)
+  cpi->sb_count = 0;
+#endif
   // vp8_zero(cpi->uv_mode_count)
 
   x->mvc = cm->fc.mvc;
@@ -1380,7 +1604,12 @@
   }
 #endif
 
-  ++cpi->ymode_count[m];
+#if CONFIG_SUPERBLOCKS
+  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    ++cpi->sb_ymode_count[m];
+  } else
+#endif
+    ++cpi->ymode_count[m];
   if (m != I8X8_PRED)
     ++cpi->y_uv_mode_count[m][uvm];
   else {
@@ -1418,6 +1647,160 @@
 #endif
 }
 
+#if CONFIG_SUPERBLOCKS
+static void update_sb_skip_coeff_state(VP8_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       ENTROPY_CONTEXT_PLANES ta[4],
+                                       ENTROPY_CONTEXT_PLANES tl[4],
+                                       TOKENEXTRA *t[4],
+                                       TOKENEXTRA **tp,
+                                       int skip[4])
+{
+  TOKENEXTRA tokens[4][16 * 24];
+  int n_tokens[4], n;
+
+  // if there were no skips, we don't need to do anything
+  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+    return;
+
+  // if we don't do coeff skipping for this frame, we don't
+  // need to do anything here
+  if (!cpi->common.mb_no_coeff_skip)
+    return;
+
+  // if all 4 MBs skipped coeff coding, nothing to be done
+  if (skip[0] && skip[1] && skip[2] && skip[3])
+    return;
+
+  // so the situation now is that we want to skip coeffs
+  // for some MBs, but not all, and we didn't code EOB
+  // coefficients for them. However, the skip flag for this
+  // SB will be 0 overall, so we need to insert EOBs in the
+  // middle of the token tree. Do so here.
+  n_tokens[0] = t[1] - t[0];
+  n_tokens[1] = t[2] - t[1];
+  n_tokens[2] = t[3] - t[2];
+  n_tokens[3] = *tp  - t[3];
+  if (n_tokens[0])
+    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));
+  if (n_tokens[1])
+    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));
+  if (n_tokens[2])
+    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));
+  if (n_tokens[3])
+    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));
+
+  // reset pointer, stuff EOBs where necessary
+  *tp = t[0];
+  for (n = 0; n < 4; n++) {
+    TOKENEXTRA *tbak = *tp;
+    if (skip[n]) {
+      x->e_mbd.above_context = &ta[n];
+      x->e_mbd.left_context  = &tl[n];
+      vp8_stuff_mb_8x8(cpi, &x->e_mbd, tp, 0);
+    } else {
+      if (n_tokens[n]) {
+        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+      }
+      (*tp) += n_tokens[n];
+    }
+  }
+}
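
[Editor's note: the helper above only rewrites the token stream in the mixed case; if no MB skipped, or every MB skipped (so the SB-level skip flag covers them all), or per-MB skipping is disabled for the frame, the tokens are left alone. A sketch of that case analysis, with a hypothetical enum for the two outcomes.]

    /* Sketch: which action the SB skip fixup takes for a given
     * per-MB skip[] pattern. */
    typedef enum { LEAVE_AS_IS, STUFF_EOBS } sb_skip_action;

    static sb_skip_action sb_skip_fixup(const int skip[4],
                                        int mb_no_coeff_skip) {
      int n, all = 1, any = 0;
      for (n = 0; n < 4; n++) {
        all &= skip[n];
        any |= skip[n];
      }
      if (!any || !mb_no_coeff_skip || all)
        return LEAVE_AS_IS;  /* nothing skipped, or SB skip flag covers it */
      return STUFF_EOBS;     /* mixed: insert EOBs for the skipped MBs */
    }
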
+
+void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
+                                    MACROBLOCK *x,
+                                    TOKENEXTRA **t,
+                                    int mb_col) {
+  const int output_enabled = 1;
+  int n;
+  MACROBLOCKD *xd = &x->e_mbd;
+  VP8_COMMON *cm = &cpi->common;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  if ((cpi->oxcf.tuning == VP8_TUNE_SSIM) && output_enabled) {
+    adjust_act_zbin(cpi, x);
+    vp8_update_zbin_extra(cpi, x);
+  }
+
+  /* test code: set transform size based on mode selection */
+  if (cpi->common.txfm_mode == ALLOW_8X8) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+    x->e_mbd.mode_info_context[1].mbmi.txfm_size = TX_8X8;
+    x->e_mbd.mode_info_context[cm->mode_info_stride].mbmi.txfm_size = TX_8X8;
+    x->e_mbd.mode_info_context[cm->mode_info_stride+1].mbmi.txfm_size = TX_8X8;
+    cpi->t8x8_count++;
+  } else {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+    cpi->t4x4_count++;
+  }
+
+  RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
+  RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
+
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+  for (n = 0; n < 4; n++)
+  {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    xd->above_context = cm->above_context + mb_col + (n & 1);
+    xd->left_context = cm->left_context + (n >> 1);
+
+    vp8_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp8_transform_intra_mby_8x8(x);
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mby_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+    if (x->optimize) {
+      vp8_optimize_mby_8x8(x, rtcd);
+      vp8_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (output_enabled) {
+      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+      vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
+      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+    }
+  }
+
+  if (output_enabled) {
+    // Update stats and fix up the SB-level skip state
+    xd->mode_info_context = mi;
+    sum_intra_stats(cpi, x);
+    update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+  }
+}
+#endif
+
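
[Editor's note: both superblock encode paths walk the 32x32 block as four 16x16 quadrants, deriving each quadrant's position from the two bits of the loop index n. A standalone sketch of the offset arithmetic used throughout these loops; the strides in main() are arbitrary.]

    #include <stdio.h>

    /* Sketch: quadrant n of a superblock -> pixel offsets into the
     * luma and chroma planes, matching the x_idx/y_idx arithmetic. */
    static void sb_quadrant(int n, int y_stride, int uv_stride,
                            int *y_off, int *uv_off) {
      int x_idx = n & 1;   /* 0 = left column,  1 = right column */
      int y_idx = n >> 1;  /* 0 = top row,      1 = bottom row   */
      *y_off  = x_idx * 16 + y_idx * 16 * y_stride;
      *uv_off = x_idx *  8 + y_idx *  8 * uv_stride;
    }

    int main(void) {
      int n, yo, uvo;
      for (n = 0; n < 4; n++) {
        sb_quadrant(n, 640, 320, &yo, &uvo);
        printf("n=%d y_off=%d uv_off=%d\n", n, yo, uvo);
      }
      return 0;
    }
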
 void vp8cx_encode_intra_macro_block(VP8_COMP *cpi,
                                     MACROBLOCK *x,
                                     TOKENEXTRA **t,
@@ -1484,6 +1867,9 @@
   unsigned char ref_pred_flag;
 
   x->skip = 0;
+#if CONFIG_SUPERBLOCKS
+  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
+#endif
 
 #if CONFIG_SWITCHABLE_INTERP
   vp8_setup_interp_filters(xd, mbmi->interp_filter, cm);
@@ -1648,3 +2034,190 @@
     }
   }
 }
+
+#if CONFIG_SUPERBLOCKS
+void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+                                   int recon_yoffset, int recon_uvoffset, int mb_col, int mb_row) {
+  const int output_enabled = 1;
+  VP8_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
+  int mis = xd->mode_info_stride;
+  unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int seg_ref_active;
+  unsigned char ref_pred_flag;
+  int n;
+  TOKENEXTRA *tp[4];
+  int skip[4];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+
+  x->skip = 0;
+
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+    // Adjust the zbin based on this MB rate.
+    adjust_act_zbin(cpi, x);
+  }
+
+  {
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (xd->mode_info_context->mbmi.mode == SPLITMV)
+          cpi->zbin_mode_boost = 0;
+        else
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+      }
+    }
+
+    vp8_update_zbin_extra(cpi, x);
+  }
+
+  seg_ref_active = segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+  // SET VARIOUS PREDICTION FLAGS
+
+  // Did the chosen reference frame match its predicted value.
+  ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
+                    get_pred_ref(cm, xd)));
+  set_pred_flag(xd, PRED_REF, ref_pred_flag);
+
+  /* test code: set transform size based on mode selection */
+  if (cpi->common.txfm_mode == ALLOW_8X8
+      && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED
+      && x->e_mbd.mode_info_context->mbmi.mode != B_PRED
+      && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
+    cpi->t8x8_count++;
+  } else {
+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_4X4;
+    cpi->t4x4_count++;
+  }
+
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sby_s)(&x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_sbuv_s)(&x->e_mbd);
+  } else {
+    int ref_fb_idx;
+
+    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (xd->mode_info_context->mbmi.second_ref_frame) {
+      int second_ref_fb_idx;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +
+                                    recon_yoffset;
+      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +
+                                    recon_uvoffset;
+      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +
+                                    recon_uvoffset;
+    }
+
+    vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  assert(x->e_mbd.mode_info_context->mbmi.txfm_size == TX_8X8);
+  for (n = 0; n < 4; n++)
+  {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      vp8_transform_intra_mby_8x8(x);
+    } else {
+      vp8_transform_mby_8x8(x);
+    }
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mby_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+    if (x->optimize) {
+      vp8_optimize_mby_8x8(x, rtcd);
+      vp8_optimize_mbuv_8x8(x, rtcd);
+    }
+    vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+    vp8_recon_mby_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+    vp8_recon_mbuv_s_c(IF_RTCD(&rtcd->common->recon), &x->e_mbd,
+                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+    if (!x->skip) {
+      if (output_enabled) {
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
+        vp8_tokenize_mb(cpi, &x->e_mbd, t, 0);
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      }
+    } else {
+      int mb_skip_context =
+        cpi->common.mb_no_coeff_skip ?
+          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+            (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+          0;
+      if (cpi->common.mb_no_coeff_skip) {
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        xd->left_context = cm->left_context + (n >> 1);
+        xd->above_context = cm->above_context + mb_col + (n & 1);
+        memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+        memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+        tp[n] = *t;
+        cpi->skip_true_count[mb_skip_context]++;
+        vp8_fix_contexts(xd);
+      } else {
+        vp8_stuff_mb(cpi, xd, t, 0);
+        xd->mode_info_context->mbmi.mb_skip_coeff = 0;
+        cpi->skip_false_count[mb_skip_context]++;
+      }
+    }
+  }
+
+  xd->mode_info_context = mi;
+  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
+}
+#endif
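
[Editor's note: the mb_skip_context computed in the skip path above follows the usual VP8 neighbour rule: the context is the sum of the left and above MBs' skip flags, giving contexts 0, 1 and 2. A one-function sketch of that rule.]

    /* Sketch: skip-flag coding context from the two causal neighbours.
     * Returns 0 when per-MB skipping is disabled for the frame. */
    static int skip_context(int left_skip, int above_skip,
                            int mb_no_coeff_skip) {
      return mb_no_coeff_skip ? left_skip + above_skip : 0;
    }
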
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -67,11 +67,10 @@
   }
 }
 
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
+void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride,
+                           unsigned char *upred, unsigned char *vpred, int dst_stride) {
   short *udiff = diff + 256;
   short *vdiff = diff + 320;
-  unsigned char *upred = pred + 256;
-  unsigned char *vpred = pred + 320;
 
   int r, c;
 
@@ -81,8 +80,8 @@
     }
 
     udiff += 8;
-    upred += 8;
-    usrc  += stride;
+    upred += dst_stride;
+    usrc  += src_stride;
   }
 
   for (r = 0; r < 8; r++) {
@@ -91,12 +90,19 @@
     }
 
     vdiff += 8;
-    vpred += 8;
-    vsrc  += stride;
+    vpred += dst_stride;
+    vsrc  += src_stride;
   }
 }
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride) {
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
+  unsigned char *upred = pred + 256;
+  unsigned char *vpred = pred + 320;
+
+  vp8_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
+}
+
+void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int dst_stride) {
   int r, c;
 
   for (r = 0; r < 16; r++) {
@@ -105,9 +111,14 @@
     }
 
     diff += 16;
-    pred += 16;
-    src  += stride;
+    pred += dst_stride;
+    src  += src_stride;
   }
+}
+
+void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
+{
+  vp8_subtract_mby_s_c(diff, src, stride, pred, 16);
 }
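
[Editor's note: the refactor above turns each subtract routine into an _s_ core taking independent source and prediction strides, with the legacy fixed-stride entry point reduced to a thin wrapper; that is what lets the SB paths subtract directly against the reconstruction buffer. A self-contained sketch of the pattern on a single plane; the function name is illustrative.]

    /* Sketch: stride-parameterized residual, as in the _s_ variants. */
    static void subtract_plane_s(short *diff, const unsigned char *src,
                                 int src_stride, const unsigned char *pred,
                                 int pred_stride, int w, int h) {
      int r, c;
      for (r = 0; r < h; r++) {
        for (c = 0; c < w; c++)
          diff[c] = src[c] - pred[c];
        diff += w;
        src  += src_stride;   /* independent strides: pred may live in   */
        pred += pred_stride;  /* the recon frame (SB case) or a packed   */
      }                       /* 16-wide scratch buffer (legacy case).   */
    }
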
 
 static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -23,6 +23,9 @@
 void vp8_cmachine_specific_config(VP8_COMP *cpi) {
 #if CONFIG_RUNTIME_CPU_DETECT
   cpi->rtcd.common                    = &cpi->common.rtcd;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32              = vp8_sad32x32_c;
+#endif
   cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
   cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
   cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
@@ -29,6 +32,9 @@
   cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
   cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
 
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32x3            = vp8_sad32x32x3_c;
+#endif
   cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_c;
   cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_c;
   cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_c;
@@ -35,6 +41,9 @@
   cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_c;
   cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_c;
 
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32x8            = vp8_sad32x32x8_c;
+#endif
   cpi->rtcd.variance.sad16x16x8            = vp8_sad16x16x8_c;
   cpi->rtcd.variance.sad16x8x8             = vp8_sad16x8x8_c;
   cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_c;
@@ -41,6 +50,9 @@
   cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_c;
   cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_c;
 
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.sad32x32x4d           = vp8_sad32x32x4d_c;
+#endif
   cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_c;
   cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_c;
   cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_c;
@@ -54,6 +66,9 @@
   cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
   cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
   cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.var32x32              = vp8_variance32x32_c;
+#endif
 
   cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
   cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
@@ -60,10 +75,25 @@
   cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
   cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
   cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.subpixvar32x32        = vp8_sub_pixel_variance32x32_c;
+#endif
   cpi->rtcd.variance.halfpixvar16x16_h     = vp8_variance_halfpixvar16x16_h_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.halfpixvar32x32_h     = vp8_variance_halfpixvar32x32_h_c;
+#endif
   cpi->rtcd.variance.halfpixvar16x16_v     = vp8_variance_halfpixvar16x16_v_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.halfpixvar32x32_v     = vp8_variance_halfpixvar32x32_v_c;
+#endif
   cpi->rtcd.variance.halfpixvar16x16_hv    = vp8_variance_halfpixvar16x16_hv_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.halfpixvar32x32_hv    = vp8_variance_halfpixvar32x32_hv_c;
+#endif
   cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_c;
+#if CONFIG_SUPERBLOCKS
+  cpi->rtcd.variance.subpixmse32x32        = vp8_sub_pixel_mse32x32_c;
+#endif
 
   cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
   cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -243,7 +243,7 @@
   int y_stride;
   int offset;
 
-#if ARCH_X86 || ARCH_X86_64
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
   unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
   unsigned char *y;
   int buf_r1, buf_r2, buf_c1, buf_c2;
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -620,6 +620,42 @@
   fclose(statsfile);
 }
 
+static void update_reference_segmentation_map(VP8_COMP *cpi) {
+  VP8_COMMON *cm = &cpi->common;
+  int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
+  MODE_INFO *mi = cm->mi;
+  uint8_t *segmap = cpi->segmentation_map;
+  uint8_t *segcache = cm->last_frame_seg_map;
+
+  for (row = 0; row < sb_rows; row++) {
+    for (col = 0; col < sb_cols; col++) {
+      MODE_INFO *miptr = mi + col * 2;
+      uint8_t *seg = segmap + col * 2;
+      uint8_t *cache = segcache + col * 2;
+#if CONFIG_SUPERBLOCKS
+      if (miptr->mbmi.encoded_as_sb) {
+        cache[0] = cache[1] = cache[cm->mb_cols] = cache[cm->mb_cols + 1] =
+          miptr->mbmi.segment_id;
+      } else
+#endif
+      {
+        cache[0] = miptr[0].mbmi.segment_id;
+        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+          cache[1] = miptr[1].mbmi.segment_id;
+        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
+          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
+          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
+            cache[cm->mb_cols + 1] =
+              miptr[cm->mode_info_stride + 1].mbmi.segment_id;
+        }
+      }
+    }
+    segmap += 2 * cm->mb_cols;
+    segcache += 2 * cm->mb_cols;
+    mi += 2 * cm->mode_info_stride;
+  }
+}
+
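
[Editor's note: update_reference_segmentation_map() steps through the frame in 2x2-MB superblocks, so the grid dimensions are ceiling divisions and the copies for the second MB row/column must be guarded on odd-sized frames. A sketch of those two pieces as helpers; the names are hypothetical.]

    /* Sketch: SB grid size and right-column guard for odd MB counts. */
    static int sb_dim(int mbs) {
      return (mbs + 1) >> 1;  /* ceil(mbs / 2) */
    }

    static int has_right_col(int col, int mb_cols) {
      /* the SB's second MB column exists unless this is the trailing
       * half-SB of an odd-width frame */
      return !(mb_cols & 1) || col < sb_dim(mb_cols) - 1;
    }
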
 static void set_default_lf_deltas(VP8_COMP *cpi) {
   cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
   cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
@@ -1736,6 +1772,9 @@
   cm->prob_last_coded               = 128;
   cm->prob_gf_coded                 = 128;
   cm->prob_intra_coded              = 63;
+#if CONFIG_SUPERBLOCKS
+  cm->sb_coded                      = 200;
+#endif
   for (i = 0; i < COMP_PRED_CONTEXTS; i++)
     cm->prob_comppred[i]         = 128;
 
@@ -1919,6 +1958,18 @@
   init_mv_ref_counts();
 #endif
 
+#if CONFIG_SUPERBLOCKS
+  cpi->fn_ptr[BLOCK_32X32].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32);
+  cpi->fn_ptr[BLOCK_32X32].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32);
+  cpi->fn_ptr[BLOCK_32X32].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar32x32);
+  cpi->fn_ptr[BLOCK_32X32].svf_halfpix_h  = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_h);
+  cpi->fn_ptr[BLOCK_32X32].svf_halfpix_v  = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_v);
+  cpi->fn_ptr[BLOCK_32X32].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar32x32_hv);
+  cpi->fn_ptr[BLOCK_32X32].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x3);
+  cpi->fn_ptr[BLOCK_32X32].sdx8f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x8);
+  cpi->fn_ptr[BLOCK_32X32].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad32x32x4d);
+#endif
+
   cpi->fn_ptr[BLOCK_16X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
   cpi->fn_ptr[BLOCK_16X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
   cpi->fn_ptr[BLOCK_16X16].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
@@ -3617,6 +3668,10 @@
   // build the bitstream
   cpi->dummy_packing = 0;
   vp8_pack_bitstream(cpi, dest, size);
+
+  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+    update_reference_segmentation_map(cpi);
+  }
 
 #if CONFIG_PRED_FILTER
   // Select the prediction filtering mode to use for the
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -359,7 +359,9 @@
   BLOCK_8X8,
   BLOCK_4X4,
   BLOCK_16X16,
-  BLOCK_MAX_SEGMENTS
+  BLOCK_MAX_SEGMENTS,
+  BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_MAX_SB_SEGMENTS
 };
 
 typedef struct VP8_COMP {
@@ -528,6 +530,10 @@
 
   int cq_target_quality;
 
+#if CONFIG_SUPERBLOCKS
+  int sb_count;
+  int sb_ymode_count [VP8_I32X32_MODES];
+#endif
   int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
   int bmode_count [VP8_BINTRAMODES];
   int i8x8_mode_count [VP8_I8X8_MODES];
@@ -628,7 +634,7 @@
   vp8_full_search_fn_t full_search_sad;
   vp8_refining_search_fn_t refining_search_sad;
   vp8_diamond_search_fn_t diamond_search_sad;
-  vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
+  vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SB_SEGMENTS];
   uint64_t time_receive_data;
   uint64_t time_compress_data;
   uint64_t time_pick_lpf;
@@ -731,9 +737,6 @@
   int force_next_frame_intra;
 
   int droppable;
-
-  // Global store for SB left contexts, one for each MB row in the SB
-  ENTROPY_CONTEXT_PLANES left_context[2];
 
   // TODO Do we still need this??
   int update_context;
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -718,7 +718,7 @@
   *Rate = vp8_rdcost_mby(mb);
 }
 
-static int vp8_rdcost_mby_8x8(MACROBLOCK *mb) {
+static int vp8_rdcost_mby_8x8(MACROBLOCK *mb, int backup) {
   int cost = 0;
   int b;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -726,11 +726,16 @@
   ENTROPY_CONTEXT *ta;
   ENTROPY_CONTEXT *tl;
 
-  vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
 
   for (b = 0; b < 16; b += 4)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_Y_NO_DC,
@@ -775,7 +780,7 @@
 
   *Distortion = (d >> 2);
   // rate
-  *Rate = vp8_rdcost_mby_8x8(mb);
+  *Rate = vp8_rdcost_mby_8x8(mb, 1);
 }
 
 #if CONFIG_TX16X16
@@ -823,6 +828,66 @@
   d[12] = p[12];
 }
 
+#if CONFIG_SUPERBLOCKS
+static void super_block_yrd_8x8(MACROBLOCK *x,
+                                int *rate,
+                                int *distortion,
+                                const VP8_ENCODER_RTCD *rtcd, int *skip)
+{
+  MACROBLOCKD *const xd = &x->e_mbd;
+  BLOCK *const by2 = x->block + 24;
+  BLOCKD *const bdy2  = xd->block + 24;
+  int d = 0, r = 0, n;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+  ENTROPY_CONTEXT_PLANES t_above[2];
+  ENTROPY_CONTEXT_PLANES t_left[2];
+  int skippable = 1;
+
+  vpx_memcpy(t_above, xd->above_context, sizeof(t_above));
+  vpx_memcpy(t_left, xd->left_context, sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+    vp8_transform_mby_8x8(x);
+    vp8_quantize_mby_8x8(x);
+
+    /* remove 1st order dc to properly combine 1st/2nd order distortion */
+    x->coeff[  0] = 0;
+    x->coeff[ 64] = 0;
+    x->coeff[128] = 0;
+    x->coeff[192] = 0;
+    xd->dqcoeff[  0] = 0;
+    xd->dqcoeff[ 64] = 0;
+    xd->dqcoeff[128] = 0;
+    xd->dqcoeff[192] = 0;
+
+    d += ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, 0);
+    d += ENCODEMB_INVOKE(&rtcd->encodemb, berr)(by2->coeff, bdy2->dqcoeff, 16);
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += vp8_rdcost_mby_8x8(x, 0);
+    skippable = skippable && mby_is_skippable_8x8(xd);
+  }
+
+  *distortion = (d >> 2);
+  *rate       = r;
+  if (skip) *skip = skippable;
+  xd->above_context = ta;
+  xd->left_context = tl;
+  vpx_memcpy(xd->above_context, &t_above, sizeof(t_above));
+  vpx_memcpy(xd->left_context, &t_left, sizeof(t_left));
+}
+#endif
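
[Editor's note: super_block_yrd_8x8() reuses the per-MB rate function by repointing the above/left entropy contexts at the quadrant being costed (ta + x_idx, tl + y_idx) and restoring saved copies afterwards, so a rejected RD probe leaves the live contexts untouched. A small sketch of that save/point/restore discipline; CTX and cost_quadrant() are stand-ins for ENTROPY_CONTEXT_PLANES and vp8_rdcost_mby_8x8().]

    /* Sketch: cost four SB quadrants against per-quadrant entropy
     * contexts without perturbing the live ones. */
    typedef struct { int coeffs; } CTX;

    static int cost_quadrant(CTX *above, CTX *left) {
      return above->coeffs + left->coeffs;  /* dummy cost model */
    }

    static int cost_superblock(CTX above[2], CTX left[2]) {
      CTX sa[2] = { above[0], above[1] }, sl[2] = { left[0], left[1] };
      int n, r = 0;
      for (n = 0; n < 4; n++)
        r += cost_quadrant(above + (n & 1), left + (n >> 1));
      above[0] = sa[0]; above[1] = sa[1];  /* restore saved contexts */
      left[0]  = sl[0]; left[1]  = sl[1];
      return r;
    }
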
+
 static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
   const unsigned int *p = (const unsigned int *)predictor;
   unsigned int *d = (unsigned int *)dst;
@@ -1062,7 +1127,46 @@
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_pick_intra_sby_mode(VP8_COMP *cpi,
+                                      MACROBLOCK *x,
+                                      int *rate,
+                                      int *rate_tokenonly,
+                                      int *distortion) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int this_rate, this_rate_tokenonly;
+  int this_distortion;
+  int64_t best_rd = INT64_MAX, this_rd;
 
+  /* Y Search for 32x32 intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.mode = mode;
+    RECON_INVOKE(&cpi->common.rtcd.recon,
+                 build_intra_predictors_sby_s)(&x->e_mbd);
+
+    super_block_yrd_8x8(x, &this_rate_tokenonly,
+                        &this_distortion, IF_RTCD(&cpi->rtcd), NULL);
+    this_rate = this_rate_tokenonly +
+                x->mbmode_cost[x->e_mbd.frame_type]
+                              [x->e_mbd.mode_info_context->mbmi.mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+  return best_rd;
+}
+#endif
+
 static int64_t rd_pick_intra16x16mby_mode(VP8_COMP *cpi,
                                       MACROBLOCK *x,
                                       int *Rate,
@@ -1372,7 +1476,7 @@
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-static int rd_cost_mbuv_8x8(MACROBLOCK *mb) {
+static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {
   int b;
   int cost = 0;
   MACROBLOCKD *xd = &mb->e_mbd;
@@ -1379,11 +1483,16 @@
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
-  vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
 
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
+    ta = (ENTROPY_CONTEXT *)&t_above;
+    tl = (ENTROPY_CONTEXT *)&t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;
+    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;
+  }
 
   for (b = 16; b < 24; b += 4)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
@@ -1393,7 +1502,55 @@
   return cost;
 }
 
+#if CONFIG_SUPERBLOCKS
+static int64_t rd_inter32x32_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+                                int *distortion, int fullpixel, int *skip) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int n, r = 0, d = 0;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  int skippable = 1;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
 
+  memcpy(t_above, xd->above_context, sizeof(t_above));
+  memcpy(t_left, xd->left_context, sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+    d += ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+    skippable = skippable && mbuv_is_skippable_8x8(xd);
+  }
+
+  *rate = r;
+  *distortion = d;
+  if (skip) *skip = skippable;
+  xd->left_context = tl;
+  xd->above_context = ta;
+  memcpy(xd->above_context, t_above, sizeof(t_above));
+  memcpy(xd->left_context, t_left, sizeof(t_left));
+
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+#endif
+
 static int64_t rd_inter16x16_uv_8x8(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
                                     int *distortion, int fullpixel) {
   ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
@@ -1403,7 +1560,7 @@
 
   vp8_quantize_mbuv_8x8(x);
 
-  *rate       = rd_cost_mbuv_8x8(x);
+  *rate       = rd_cost_mbuv_8x8(x, 1);
   *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
 
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
@@ -1527,7 +1684,7 @@
 
     vp8_quantize_mbuv_8x8(x);
 
-    rate_to = rd_cost_mbuv_8x8(x);
+    rate_to = rd_cost_mbuv_8x8(x, 1);
     rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
     distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
@@ -1546,6 +1703,91 @@
   mbmi->uv_mode = mode_selected;
 }
 
+#if CONFIG_SUPERBLOCKS
+static void super_block_uvrd_8x8(MACROBLOCK *x,
+                                 int *rate,
+                                 int *distortion,
+                                 const VP8_ENCODER_RTCD *rtcd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int d = 0, r = 0, n;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
+
+  memcpy(t_above, xd->above_context, sizeof(t_above));
+  memcpy(t_left,  xd->left_context,  sizeof(t_left));
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+
+    vp8_subtract_mbuv_s_c(x->src_diff,
+                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                          src_uv_stride,
+                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                          dst_uv_stride);
+    vp8_transform_mbuv_8x8(x);
+    vp8_quantize_mbuv_8x8(x);
+
+    d += ENCODEMB_INVOKE(&rtcd->encodemb, mbuverr)(x) >> 2;
+    xd->above_context = ta + x_idx;
+    xd->left_context = tl + y_idx;
+    r += rd_cost_mbuv_8x8(x, 0);
+  }
+
+  *distortion = (d >> 2);
+  *rate       = r;
+
+  xd->left_context = tl;
+  xd->above_context = ta;
+  memcpy(xd->above_context, t_above, sizeof(t_above));
+  memcpy(xd->left_context,  t_left,  sizeof(t_left));
+}
+
+static int64_t rd_pick_intra_sbuv_mode(VP8_COMP *cpi,
+                                       MACROBLOCK *x,
+                                       int *rate,
+                                       int *rate_tokenonly,
+                                       int *distortion) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int64_t best_rd = INT64_MAX, this_rd;
+  int this_rate_tokenonly, this_rate;
+  int this_distortion;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+    RECON_INVOKE(&cpi->rtcd.common->recon,
+                 build_intra_predictors_sbuv_s)(&x->e_mbd);
+
+    super_block_uvrd_8x8(x, &this_rate_tokenonly,
+                         &this_distortion, IF_RTCD(&cpi->rtcd));
+    this_rate = this_rate_tokenonly +
+                x->intra_uv_mode_cost[x->e_mbd.frame_type]
+                                     [x->e_mbd.mode_info_context->mbmi.uv_mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+  return best_rd;
+}
+#endif
+
 int vp8_cost_mv_ref(VP8_COMP *cpi,
                     MB_PREDICTION_MODE m,
                     const int near_mv_ref_ct[4]) {
@@ -2568,25 +2810,33 @@
   }
 }
 
-static void store_coding_context(MACROBLOCK *x, int mb_index,
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                                  int mode_index,
                                  PARTITION_INFO *partition,
                                  int_mv *ref_mv,
-                                 int_mv *second_ref_mv) {
+                                 int_mv *second_ref_mv,
+                                 int single_pred_diff,
+                                 int comp_pred_diff,
+                                 int hybrid_pred_diff) {
   MACROBLOCKD *xd = &x->e_mbd;
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
-  x->mb_context[mb_index].best_mode_index = mode_index;
-  vpx_memcpy(&x->mb_context[mb_index].mic, xd->mode_info_context,
+  ctx->best_mode_index = mode_index;
+  vpx_memcpy(&ctx->mic, xd->mode_info_context,
              sizeof(MODE_INFO));
-  vpx_memcpy(&x->mb_context[mb_index].partition_info, partition,
-             sizeof(PARTITION_INFO));
-  x->mb_context[mb_index].best_ref_mv.as_int = ref_mv->as_int;
-  x->mb_context[mb_index].second_best_ref_mv.as_int = second_ref_mv->as_int;
+  if (partition)
+    vpx_memcpy(&ctx->partition_info, partition,
+               sizeof(PARTITION_INFO));
+  ctx->best_ref_mv.as_int = ref_mv->as_int;
+  ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
 
-  // x->mb_context[mb_index].rddiv = x->rddiv;
-  // x->mb_context[mb_index].rdmult = x->rdmult;
+  // ctx->rddiv = x->rddiv;
+  // ctx->rdmult = x->rdmult;
+
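+  // Store how far the best rd achievable under each prediction-type
+  // restriction (single, compound, hybrid) trails the overall best rd;
+  // the encoder accumulates these diffs when choosing the frame-level
+  // comp_pred_mode.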
+  ctx->single_pred_diff = single_pred_diff;
+  ctx->comp_pred_diff   = comp_pred_diff;
+  ctx->hybrid_pred_diff = hybrid_pred_diff;
 }
 
 static void inter_mode_cost(VP8_COMP *cpi, MACROBLOCK *x, int this_mode,
@@ -3464,7 +3714,7 @@
     }
 #endif
 
-    if (x->skip)
+    if (x->skip && !mode_excluded)
       break;
   }
 
@@ -3557,16 +3807,36 @@
   }
 
 end:
-  // TODO Save these to add in only if MB coding mode is selected?
-  for (i = 0; i < NB_PREDICTION_TYPES; ++i)
-    cpi->rd_comp_pred_diff[i] += best_pred_diff[i];
+  store_coding_context(x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+                       &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+                       best_pred_diff[0], best_pred_diff[1], best_pred_diff[2]);
+}
 
-  store_coding_context(x, xd->mb_index, best_mode_index, &best_partition,
-                       &frame_best_ref_mv[mbmi->ref_frame],
-                       &frame_best_ref_mv[mbmi->second_ref_frame]);
+#if CONFIG_SUPERBLOCKS
+void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+                               int *returnrate,
+                               int *returndist) {
+  int rate_y, rate_uv;
+  int rate_y_tokenonly, rate_uv_tokenonly;
+  int error_y, error_uv;
+  int dist_y, dist_uv;
+
+  x->e_mbd.mode_info_context->mbmi.txfm_size = TX_8X8;
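+  // Superblocks are currently coded with the 8x8 transform only
+  // (see the FIXME/TODO notes about other transform sizes below).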
+
+  error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                     &dist_uv);
+  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                   &dist_y);
+
+  // TODO(rbultje): add rate_uv
+  *returnrate = rate_y;
+  *returndist = dist_y + (dist_uv >> 2);
 }
+#endif
 
-int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x) {
+void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
+                            int *returnrate, int *returndist) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   int64_t error4x4, error16x16;
@@ -3585,6 +3855,8 @@
   int rate8x8, dist8x8;
   int mode16x16;
   int mode8x8[2][4];
+  int dist;
+  int rateuv8, rateuv_tokenonly8, distuv8;
 
   mbmi->ref_frame = INTRA_FRAME;
   rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
@@ -3646,9 +3918,11 @@
       rate += rate4x4;
 #endif
       mbmi->mode = B_PRED;
+      dist = dist4x4;
     } else {
       mbmi->mode = mode16x16;
       rate += rate16x16;
+      dist = dist16x16;
     }
   } else {
     if (error4x4 < error8x8) {
@@ -3663,18 +3937,728 @@
       rate += rate4x4;
 #endif
       mbmi->mode = B_PRED;
+      dist = dist4x4;
     } else {
       mbmi->mode = I8X8_PRED;
       set_i8x8_block_modes(x, mode8x8);
       rate += rate8x8;
+      dist = dist8x8;
     }
   }
-  return rate;
+
+  // TODO(rbultje): should add rateuv here also
+  *returnrate = rate - rateuv;
+  *returndist = dist + (distuv >> 2);
 }
 
-int vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
-                                     int recon_yoffset, int recon_uvoffset) {
+#if CONFIG_SUPERBLOCKS
+int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
+                                  int recon_yoffset, int recon_uvoffset,
+                                  int *returnrate, int *returndistortion) {
   VP8_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  BLOCK *b = &x->block[0];
+  BLOCKD *d = &xd->block[0];
+  MB_PREDICTION_MODE this_mode;
+  MV_REFERENCE_FRAME ref_frame;
+  int mis = xd->mode_info_stride;
+  unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
+  int comp_pred;
+  int_mv best_ref_mv, second_best_ref_mv;
+  int_mv mode_mv[MB_MODE_COUNT];
+  int_mv frame_nearest_mv[4];
+  int_mv frame_near_mv[4];
+  int_mv frame_best_ref_mv[4];
+  int_mv mc_search_result[4];
+  int frame_mdcounts[4][4];
+  unsigned char *y_buffer[4];
+  unsigned char *u_buffer[4];
+  unsigned char *v_buffer[4];
+  static const int flag_list[4] = { 0, VP8_LAST_FLAG, VP8_GOLD_FLAG, VP8_ALT_FLAG };
+  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx, cpi->common.alt_fb_idx };
+  int mdcounts[4];
+  int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  int saddone = 0;
+  int sr = 0;  // search range returned by vp8_mv_pred(), in step_param levels (0-7)
+  int64_t best_rd = INT64_MAX;
+  int64_t best_comp_rd = INT64_MAX;
+  int64_t best_single_rd = INT64_MAX;
+  int64_t best_hybrid_rd = INT64_MAX;
+  int64_t best_yrd = INT64_MAX;
+  MB_MODE_INFO best_mbmode;
+  int mode_index = 0;
+#if 0
+  PARTITION_INFO best_partition;
+  union b_mode_info best_bmodes[16];
+#endif
+  unsigned int ref_costs[MAX_REF_FRAMES];
+
+  xd->mode_info_context->mbmi.segment_id = segment_id;
+  vp8_estimate_ref_frame_costs(cpi, segment_id, ref_costs);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+      YV12_BUFFER_CONFIG *ref_buf = &cpi->common.yv12_fb[idx_list[ref_frame]];
+
+      vp8_find_near_mvs(xd, xd->mode_info_context,
+                        xd->prev_mode_info_context,
+                        &frame_nearest_mv[ref_frame], &frame_near_mv[ref_frame],
+                        &frame_best_ref_mv[ref_frame], frame_mdcounts[ref_frame],
+                        ref_frame, cpi->common.ref_frame_sign_bias);
+
+      y_buffer[ref_frame] = ref_buf->y_buffer + recon_yoffset;
+      u_buffer[ref_frame] = ref_buf->u_buffer + recon_uvoffset;
+      v_buffer[ref_frame] = ref_buf->v_buffer + recon_uvoffset;
+    }
+    mc_search_result[ref_frame].as_int = INVALID_MV;
+  }
+
+  for (mode_index = 0; mode_index < MAX_MODES; mode_index++) {
+    int_mv mvp;
+    int mode_excluded;
+    int64_t this_rd = INT64_MAX;
+    int disable_skip = 0;
+    int other_cost = 0;
+    int compmode_cost = 0;
+    int rate2 = 0;
+    int distortion2 = 0;
+    int rate_y = 0;
+    int rate_uv = 0;
+    int distortion_uv;
+    int distortion;
+    int skippable_y, skippable_uv;
+
+    // Test best rd so far against threshold for trying this mode.
+    if (best_rd <= cpi->rd_threshes[mode_index]) {
+      continue;
+    }
+
+    this_mode = vp8_mode_order[mode_index].mode;
+    ref_frame = vp8_mode_order[mode_index].ref_frame;
+    xd->mode_info_context->mbmi.ref_frame = ref_frame;
+    comp_pred = vp8_mode_order[mode_index].second_ref_frame != INTRA_FRAME;
+    xd->mode_info_context->mbmi.mode = this_mode;
+    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+#if 0 && CONFIG_PRED_FILTER
+    xd->mode_info_context->mbmi.pred_filter_enabled = 0;
+#endif
+
+#if 0 && CONFIG_COMP_INTRA_PRED
+    xd->mode_info_context->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+    xd->mode_info_context->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
+#endif
+
+    if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+      continue;
+
+    // intra coding is not yet supported, and SPLITMV is not a superblock mode
+    // TODO(rbultje): support intra coding
+    if (ref_frame == INTRA_FRAME || this_mode == SPLITMV)
+      continue;
+
+    if (comp_pred) {
+      int second_ref;
+
+      if (ref_frame == ALTREF_FRAME) {
+        second_ref = LAST_FRAME;
+      } else {
+        second_ref = ref_frame + 1;
+      }
+      if (!(cpi->ref_frame_flags & flag_list[second_ref]))
+        continue;
+      xd->mode_info_context->mbmi.second_ref_frame = second_ref;
+
+      xd->second_pre.y_buffer = y_buffer[second_ref];
+      xd->second_pre.u_buffer = u_buffer[second_ref];
+      xd->second_pre.v_buffer = v_buffer[second_ref];
+      second_best_ref_mv  = frame_best_ref_mv[second_ref];
+      mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+    } else {
+      xd->mode_info_context->mbmi.second_ref_frame = INTRA_FRAME;
+      mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+    }
+
+    xd->pre.y_buffer = y_buffer[ref_frame];
+    xd->pre.u_buffer = u_buffer[ref_frame];
+    xd->pre.v_buffer = v_buffer[ref_frame];
+    mode_mv[ZEROMV].as_int = 0;
+    mode_mv[NEARESTMV] = frame_nearest_mv[ref_frame];
+    mode_mv[NEARMV] = frame_near_mv[ref_frame];
+    best_ref_mv = frame_best_ref_mv[ref_frame];
+    vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
+
+    // If the segment reference frame feature is enabled....
+    // then do nothing if the current ref frame is not allowed..
+    if (segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+        !check_segref(xd, segment_id, ref_frame)) {
+      continue;
+    }
+    // If the segment mode feature is enabled....
+    // then do nothing if the current mode is not allowed..
+    else if (segfeature_active(xd, segment_id, SEG_LVL_MODE)  &&
+             (this_mode != get_segdata(xd, segment_id, SEG_LVL_MODE))) {
+      continue;
+    }
+    // Disable this drop out case if either the mode or ref frame
+    // segment level feature is enabled for this segment. This is to
+    // prevent the possibility that we end up unable to pick any mode.
+    else if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+             !segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+      // unless ARNR filtering is enabled, in which case we want
+      // an unfiltered alternative
+      if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+        if (this_mode != ZEROMV || ref_frame != ALTREF_FRAME) {
+          continue;
+        }
+      }
+    }
+
+    if (!comp_pred) {
+      switch (this_mode) {
+        case NEWMV: {
+          int thissme;
+          int bestsme = INT_MAX;
+          int step_param = cpi->sf.first_step;
+          int further_steps;
+          int n;
+          int do_refine = 1;   /* If last step (1-away) of n-step search doesn't pick the center point as the best match,
+                                  we will do a final 1-away diamond refining search  */
+          int num00;
+
+          int sadpb = x->sadperbit16;
+          int_mv mvp_full;
+
+          int col_min = (best_ref_mv.as_mv.col >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.col & 7) ? 1 : 0);
+          int row_min = (best_ref_mv.as_mv.row >> 3) - MAX_FULL_PEL_VAL + ((best_ref_mv.as_mv.row & 7) ? 1 : 0);
+          int col_max = (best_ref_mv.as_mv.col >> 3) + MAX_FULL_PEL_VAL;
+          int row_max = (best_ref_mv.as_mv.row >> 3) + MAX_FULL_PEL_VAL;
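+          // The search window spans MAX_FULL_PEL_VAL full-pel steps around
+          // the reference mv; the "+ 1" tightens the lower bound when the
+          // reference mv has a fractional (sub-pel) component.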
+
+          int tmp_col_min = x->mv_col_min;
+          int tmp_col_max = x->mv_col_max;
+          int tmp_row_min = x->mv_row_min;
+          int tmp_row_max = x->mv_row_max;
+
+          if (!saddone) {
+            vp8_cal_sad(cpi, xd, x, recon_yoffset, &near_sadidx[0]);
+            saddone = 1;
+          }
+
+          vp8_mv_pred(cpi, xd, xd->mode_info_context, &mvp,
+                      xd->mode_info_context->mbmi.ref_frame,
+                      cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
+
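+          // mvp is in 1/8-pel units; >> 3 converts it to the full-pel
+          // resolution used by the integer diamond search.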
+          mvp_full.as_mv.col = mvp.as_mv.col >> 3;
+          mvp_full.as_mv.row = mvp.as_mv.row >> 3;
+
+          // Get intersection of UMV window and valid MV window to reduce # of checks in diamond search.
+          if (x->mv_col_min < col_min)
+            x->mv_col_min = col_min;
+          if (x->mv_col_max > col_max)
+            x->mv_col_max = col_max;
+          if (x->mv_row_min < row_min)
+            x->mv_row_min = row_min;
+          if (x->mv_row_max > row_max)
+            x->mv_row_max = row_max;
+
+          // adjust search range according to sr from mv prediction
+          if (sr > step_param)
+            step_param = sr;
+
+          // Initial step/diamond search
+          {
+            bestsme = cpi->diamond_search_sad(x, b, d, &mvp_full, &d->bmi.as_mv.first,
+                                              step_param, sadpb, &num00,
+                                              &cpi->fn_ptr[BLOCK_32X32],
+                                              XMVCOST, &best_ref_mv);
+            mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+
+            // Further step/diamond searches as necessary
+            n = 0;
+            further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+            n = num00;
+            num00 = 0;
+
+            /* If there won't be more n-step search, check to see if refining search is needed. */
+            if (n > further_steps)
+              do_refine = 0;
+
+            while (n < further_steps) {
+              n++;
+
+              if (num00)
+                num00--;
+              else {
+                thissme = cpi->diamond_search_sad(x, b, d, &mvp_full,
+                                                  &d->bmi.as_mv.first, step_param + n, sadpb, &num00,
+                                                  &cpi->fn_ptr[BLOCK_32X32],
+                                                  XMVCOST, &best_ref_mv);
+
+                /* check to see if refining search is needed. */
+                if (num00 > (further_steps - n))
+                  do_refine = 0;
+
+                if (thissme < bestsme) {
+                  bestsme = thissme;
+                  mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+                } else {
+                  d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int;
+                }
+              }
+            }
+          }
+
+          /* final 1-away diamond refining search */
+          if (do_refine == 1) {
+            int search_range;
+
+            // Setting search_range this way may not be ideal; needs further investigation.
+            // search_range = MAXF(abs((mvp.row>>3) - d->bmi.mv.as_mv.row), abs((mvp.col>>3) - d->bmi.mv.as_mv.col));
+            search_range = 8;
+
+            thissme = cpi->refining_search_sad(x, b, d, &d->bmi.as_mv.first, sadpb,
+                                               search_range, &cpi->fn_ptr[BLOCK_32X32],
+                                               XMVCOST, &best_ref_mv);
+
+            if (thissme < bestsme) {
+              bestsme = thissme;
+              mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+            } else {
+              d->bmi.as_mv.first.as_int = mode_mv[NEWMV].as_int;
+            }
+          }
+
+          x->mv_col_min = tmp_col_min;
+          x->mv_col_max = tmp_col_max;
+          x->mv_row_min = tmp_row_min;
+          x->mv_row_max = tmp_row_max;
+
+          if (bestsme < INT_MAX) {
+            int dis; /* TODO: use dis in distortion calculation later. */
+            unsigned int sse;
+            cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first, &best_ref_mv,
+                                         x->errorperbit,
+                                         &cpi->fn_ptr[BLOCK_32X32],
+                                         XMVCOST, &dis, &sse);
+          }
+          mc_search_result[xd->mode_info_context->mbmi.ref_frame].as_int =
+            d->bmi.as_mv.first.as_int;
+
+          mode_mv[NEWMV].as_int = d->bmi.as_mv.first.as_int;
+
+          // Add the new motion vector cost to our rolling cost variable
+          rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv,
+                                   XMVCOST, 96,
+                                   xd->allow_high_precision_mv);
+        }
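+        // Note: no break here -- NEWMV intentionally falls through so the
+        // mv clamping and rd computation below are shared with the
+        // NEARESTMV/NEARMV/ZEROMV cases.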
+
+        case NEARESTMV:
+        case NEARMV:
+          // Clip "next_nearest" so that it does not extend to far out of image
+          vp8_clamp_mv2(&mode_mv[this_mode], xd);
+
+          // Do not bother proceeding if the vector (from newmv, nearest or near) is (0,0), as that case should be coded using the zeromv mode.
+          if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) && (mode_mv[this_mode].as_int == 0)) {
+            continue;
+          }
+
+        case ZEROMV:
+          // Trap vectors that reach beyond the UMV borders
+          // Note that ALL NEWMV, NEARESTMV, NEARMV and ZEROMV code drops through to this point
+          // because of the lack of break statements in the previous two cases.
+          if (((mode_mv[this_mode].as_mv.row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].as_mv.row >> 3) > x->mv_row_max) ||
+              ((mode_mv[this_mode].as_mv.col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].as_mv.col >> 3) > x->mv_col_max)) {
+            continue;
+          }
+
+          vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+
+#if CONFIG_PRED_FILTER
+          // Filtered prediction:
+          xd->mode_info_context->mbmi.pred_filter_enabled =
+          vp8_mode_order[mode_index].pred_filter_flag;
+          rate2 += vp8_cost_bit(cpi->common.prob_pred_filter_off,
+                                xd->mode_info_context->mbmi.pred_filter_enabled);
+#endif
+
+          vp8_build_inter32x32_predictors_sb(xd,
+                                             xd->dst.y_buffer,
+                                             xd->dst.u_buffer,
+                                             xd->dst.v_buffer,
+                                             xd->dst.y_stride,
+                                             xd->dst.uv_stride);
+
+          compmode_cost =
+            vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 0);
+
+          if (cpi->active_map_enabled && x->active_ptr[0] == 0) {
+            x->skip = 1;
+          } else if (x->encode_breakout) {
+            unsigned int sse;
+            unsigned int var;
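+            // The breakout threshold scales with the square of the AC
+            // dequantizer step, i.e. with the distortion quantization would
+            // introduce anyway; an sse below it suggests the residual would
+            // quantize to (almost) nothing.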
+            int threshold = (xd->block[0].dequant[1] *
+                             xd->block[0].dequant[1] >> 4);
+
+            if (threshold < x->encode_breakout)
+              threshold = x->encode_breakout;
+
+            var = VARIANCE_INVOKE(&cpi->rtcd.variance, var32x32)(*(b->base_src),
+              b->src_stride, xd->dst.y_buffer, xd->dst.y_stride, &sse);
+
+            if (sse < threshold) {
+              unsigned int q2dc = xd->block[24].dequant[0];
+              /* If there is no codeable 2nd order dc
+                or a very small uniform pixel change */
+              if ((sse - var < q2dc * q2dc >> 4) ||
+                  (sse / 2 > var && sse - var < 64)) {
+                // Check u and v to make sure skip is ok
+                unsigned int sse2, sse3;
+                int var2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+                                  (x->src.u_buffer, x->src.uv_stride,
+                                   xd->dst.u_buffer, xd->dst.uv_stride, &sse2);
+                int var3 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+                                  (x->src.v_buffer, x->src.uv_stride,
+                                   xd->dst.v_buffer, xd->dst.uv_stride, &sse3);
+                sse2 += sse3;
+                if (sse2 * 2 < threshold) {
+                  x->skip = 1;
+                  distortion2 = sse + sse2;
+                  rate2 = 500;
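+                  // nominal rate for a skipped block, matching the constant
+                  // used by the MB-level encode-breakout path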
+
+                  /* for best_yrd calculation */
+                  rate_uv = 0;
+                  distortion_uv = sse2;
+
+                  disable_skip = 1;
+                  this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+                  break;
+                }
+              }
+            }
+          }
+
+          // Add in the Mv/mode cost
+          rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
+
+          // Y cost and distortion - FIXME support other transform sizes
+          super_block_yrd_8x8(x, &rate_y, &distortion,
+                              IF_RTCD(&cpi->rtcd), &skippable_y);
+          rate2 += rate_y;
+          distortion2 += distortion;
+
+          rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv,
+                               cpi->common.full_pixel, &skippable_uv);
+
+          rate2 += rate_uv;
+          distortion2 += distortion_uv;
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+          break;
+
+        default:
+          break;
+      }
+    } else { /* xd->mode_info_context->mbmi.second_ref_frame != 0 */
+      int ref1 = xd->mode_info_context->mbmi.ref_frame;
+      int ref2 = xd->mode_info_context->mbmi.second_ref_frame;
+
+      mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+      switch (this_mode) {
+        case NEWMV:
+          if (mc_search_result[ref1].as_int == INVALID_MV ||
+              mc_search_result[ref2].as_int == INVALID_MV)
+            continue;
+          xd->mode_info_context->mbmi.mv[0].as_int = mc_search_result[ref1].as_int;
+          xd->mode_info_context->mbmi.mv[1].as_int = mc_search_result[ref2].as_int;
+          rate2 += vp8_mv_bit_cost(&mc_search_result[ref1],
+                                   &frame_best_ref_mv[ref1],
+                                   XMVCOST, 96,
+                                   xd->allow_high_precision_mv);
+          rate2 += vp8_mv_bit_cost(&mc_search_result[ref2],
+                                   &frame_best_ref_mv[ref2],
+                                   XMVCOST, 96,
+                                   xd->allow_high_precision_mv);
+          break;
+        case ZEROMV:
+          xd->mode_info_context->mbmi.mv[0].as_int = 0;
+          xd->mode_info_context->mbmi.mv[1].as_int = 0;
+          break;
+        case NEARMV:
+          if (frame_near_mv[ref1].as_int == 0 || frame_near_mv[ref2].as_int == 0) {
+            continue;
+          }
+          xd->mode_info_context->mbmi.mv[0].as_int = frame_near_mv[ref1].as_int;
+          xd->mode_info_context->mbmi.mv[1].as_int = frame_near_mv[ref2].as_int;
+          break;
+        case NEARESTMV:
+          if (frame_nearest_mv[ref1].as_int == 0 || frame_nearest_mv[ref2].as_int == 0) {
+            continue;
+          }
+          xd->mode_info_context->mbmi.mv[0].as_int = frame_nearest_mv[ref1].as_int;
+          xd->mode_info_context->mbmi.mv[1].as_int = frame_nearest_mv[ref2].as_int;
+          break;
+        default:
+          break;
+      }
+
+      /* Add in the Mv/mode cost */
+      rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);
+
+      vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[0], xd);
+      vp8_clamp_mv2(&xd->mode_info_context->mbmi.mv[1], xd);
+      if (((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) < x->mv_row_min) ||
+          ((xd->mode_info_context->mbmi.mv[0].as_mv.row >> 3) > x->mv_row_max) ||
+          ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) < x->mv_col_min) ||
+          ((xd->mode_info_context->mbmi.mv[0].as_mv.col >> 3) > x->mv_col_max) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) < x->mv_row_min) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.row >> 3) > x->mv_row_max) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) < x->mv_col_min) ||
+          ((xd->mode_info_context->mbmi.mv[1].as_mv.col >> 3) > x->mv_col_max)) {
+        continue;
+      }
+
+      /* build first and second prediction */
+      vp8_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,
+                                         xd->dst.u_buffer, xd->dst.v_buffer,
+                                         xd->dst.y_stride, xd->dst.uv_stride);
+
+      /* Y cost and distortion - TODO(rbultje) support other transform sizes */
+      super_block_yrd_8x8(x, &rate_y, &distortion,
+                          IF_RTCD(&cpi->rtcd), &skippable_y);
+
+      rate2 += rate_y;
+      distortion2 += distortion;
+
+      /* UV cost and distortion */
+      rd_inter32x32_uv_8x8(cpi, x, &rate_uv, &distortion_uv,
+                           cpi->common.full_pixel, &skippable_uv);
+
+      rate2 += rate_uv;
+      distortion2 += distortion_uv;
+
+      /* don't bother w/ skip, we would never have come here if skip were
+       * enabled */
+      xd->mode_info_context->mbmi.mode = this_mode;
+
+      /* We don't include the cost of the second reference here, because there
+       * are only three options: Last/Golden, ARF/Last or Golden/ARF. In other
+       * words, if the pairs are presented in that order, the second reference
+       * is always known once the first is known. */
+      compmode_cost = vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 1);
+    }
+
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      rate2 += compmode_cost;
+    }
+
+    // Estimate the reference frame signaling cost and add it
+    // to the rolling cost variable.
+    rate2 += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+    if (!disable_skip) {
+      // Test for the condition where the skip block will be activated
+      // because there are no non-zero coefficients, and make any
+      // necessary adjustment for rate. Ignore if skip is coded at the
+      // segment level, as the cost won't have been added in.
+      if (cpi->common.mb_no_coeff_skip) {
+        int mb_skippable = skippable_y && skippable_uv;
+        int mb_skip_allowed;
+
+        // Is MB-level skip allowed for this MB?
+        mb_skip_allowed =
+          !segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+          get_segdata(xd, segment_id, SEG_LVL_EOB);
+
+        if (mb_skippable) {
+          // Back out the coefficient coding costs
+          rate2 -= (rate_y + rate_uv);
+          // for best_yrd calculation
+          rate_uv = 0;
+
+          if (mb_skip_allowed) {
+            int prob_skip_cost;
+
+            // Cost the skip mb case
+            vp8_prob skip_prob =
+              get_pred_prob(cm, xd, PRED_MBSKIP);
+
+            if (skip_prob) {
+              prob_skip_cost = vp8_cost_bit(skip_prob, 1);
+              rate2 += prob_skip_cost;
+              other_cost += prob_skip_cost;
+            }
+          }
+        }
+        // Add in the cost of the no skip flag.
+        else if (mb_skip_allowed) {
+          int prob_skip_cost = vp8_cost_bit(get_pred_prob(cm, xd,
+                                                          PRED_MBSKIP), 0);
+          rate2 += prob_skip_cost;
+          other_cost += prob_skip_cost;
+        }
+      }
+
+      // Calculate the final RD estimate for this mode.
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    }
+
+#if 0
+    // Keep record of best intra distortion
+    if ((xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) &&
+        (this_rd < best_intra_rd)) {
+      best_intra_rd = this_rd;
+      *returnintra = distortion2;
+    }
+#endif
+
+    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+      if (this_rd < best_comp_rd)
+        best_comp_rd = this_rd;
+      if (this_rd < best_single_rd)
+        best_single_rd = this_rd;
+      if (this_rd < best_hybrid_rd)
+        best_hybrid_rd = this_rd;
+    }
+
+    // Did this mode help, i.e. is it the new best mode?
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+#if 0
+        // Note index of best mode so far
+        best_mode_index = mode_index;
+
+        if (this_mode <= B_PRED) {
+          xd->mode_info_context->mbmi.uv_mode = uv_intra_mode_8x8;
+          /* required for left and above block mv */
+          xd->mode_info_context->mbmi.mv.as_int = 0;
+        }
+#endif
+
+        other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
+
+        /* Calculate the final y RD estimate for this mode */
+        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+                          (distortion2 - distortion_uv));
+
+        *returnrate = rate2;
+        *returndistortion = distortion2;
+        best_rd = this_rd;
+        vpx_memcpy(&best_mbmode, &xd->mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+      }
+#if 0
+      // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+      cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+    // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+    else {
+#if 0
+      cpi->rd_thresh_mult[mode_index] += 4;
+
+      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+#endif
+    }
+
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+      int single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
+
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+
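+      // Remember the best rd achievable under each signaling assumption
+      // (single-only, compound-only, hybrid); the diffs against best_rd
+      // are passed to store_coding_context() at the end of this function.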
+      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME &&
+          single_rd < best_single_rd) {
+        best_single_rd = single_rd;
+      } else if (xd->mode_info_context->mbmi.second_ref_frame != INTRA_FRAME &&
+                 single_rd < best_comp_rd) {
+        best_comp_rd = single_rd;
+      }
+      if (hybrid_rd < best_hybrid_rd) {
+        best_hybrid_rd = hybrid_rd;
+      }
+    }
+
+    if (x->skip && !mode_excluded)
+      break;
+  }
+
+  // TODO(rbultje) integrate with RD thresholding
+#if 0
+  // Reduce the activation RD thresholds for the best choice mode
+  if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
+      (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
+    int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+    cpi->rd_thresh_mult[best_mode_index] =
+      (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ?
+      cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+    cpi->rd_threshes[best_mode_index] =
+      (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+  }
+#endif
+
+  // This code forces Altref, (0,0) and skip for the frame that overlays
+  // an altref, unless Altref is filtered. However, this is unsafe if
+  // segment-level coding of ref frame or mode is enabled for this
+  // segment.
+  if (!segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
+      !segfeature_active(xd, segment_id, SEG_LVL_MODE) &&
+      cpi->is_src_frame_alt_ref &&
+      (cpi->oxcf.arnr_max_frames == 0) &&
+      (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
+    xd->mode_info_context->mbmi.mode = ZEROMV;
+    xd->mode_info_context->mbmi.ref_frame = ALTREF_FRAME;
+    xd->mode_info_context->mbmi.mv[0].as_int = 0;
+    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+    xd->mode_info_context->mbmi.mb_skip_coeff =
+      (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+    xd->mode_info_context->mbmi.partitioning = 0;
+
+    xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+    if (best_rd != INT64_MAX)
+      store_coding_context(x, &x->sb_context[0], mode_index, NULL,
+        &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+        &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+        0, 0, 0);
+    return best_rd;
+  }
+
+  // macroblock modes
+  vpx_memcpy(&xd->mode_info_context->mbmi, &best_mbmode,
+             sizeof(MB_MODE_INFO));
+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
+
+  if (best_rd != INT64_MAX)
+    store_coding_context(x, &x->sb_context[0], mode_index, NULL,
+      &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
+      &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame],
+      (best_single_rd == INT64_MAX) ? INT_MIN : (best_rd - best_single_rd),
+      (best_comp_rd   == INT64_MAX) ? INT_MIN : (best_rd - best_comp_rd),
+      (best_hybrid_rd == INT64_MAX) ? INT_MIN : (best_rd - best_hybrid_rd));
+
+  return best_rd;
+}
+#endif
+
+void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
+                                      int recon_yoffset,
+                                      int recon_uvoffset,
+                                      int *totalrate, int *totaldist) {
+  VP8_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   int rate, distortion;
@@ -3694,17 +4678,6 @@
     vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,
                            &distortion, &intra_error);
 
-    if (mbmi->ref_frame) {
-      unsigned char pred_context;
-
-      pred_context = get_pred_context(cm, xd, PRED_COMP);
-
-      if (mbmi->second_ref_frame == INTRA_FRAME)
-        cpi->single_pred_count[pred_context]++;
-      else
-        cpi->comp_pred_count[pred_context]++;
-    }
-
     /* restore cpi->zbin_mode_boost_enabled */
     cpi->zbin_mode_boost_enabled = zbin_mode_boost_enabled;
   }
@@ -3717,5 +4690,6 @@
   x->mb_context[xd->mb_index].distortion  = distortion;
   x->mb_context[xd->mb_index].intra_error = intra_error;
 
-  return rate;
+  *totalrate = rate;
+  *totaldist = distortion;
 }
--- a/vp8/encoder/rdopt.h
+++ b/vp8/encoder/rdopt.h
@@ -18,7 +18,8 @@
 extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
 extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset,
                                    int *returnrate, int *returndistortion, int64_t *returnintra);
-extern int vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d);
+extern void vp8_rd_pick_intra_mode_sb(VP8_COMP *cpi, MACROBLOCK *x, int *r, int *d);
 
 extern void vp8_mv_pred
 (
--- a/vp8/encoder/sad_c.c
+++ b/vp8/encoder/sad_c.c
@@ -13,29 +13,6 @@
 #include "vpx_ports/config.h"
 #include "vpx/vpx_integer.h"
 
-unsigned int vp8_sad16x16_c(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int max_sad) {
-
-  int r, c;
-  unsigned int sad = 0;
-
-  for (r = 0; r < 16; r++) {
-    for (c = 0; c < 16; c++) {
-      sad += abs(src_ptr[c] - ref_ptr[c]);
-    }
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  return sad;
-}
-
-
 static __inline
 unsigned int sad_mx_n_c(
   const unsigned char *src_ptr,
@@ -60,7 +37,22 @@
   return sad;
 }
 
+unsigned int vp8_sad32x32_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
+}
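+// Note: like the other C reference SAD routines in this file, these
+// wrappers accept max_sad for signature compatibility but ignore it and
+// always compute the full SAD.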
 
+unsigned int vp8_sad16x16_c(const unsigned char *src_ptr,
+                            int  src_stride,
+                            const unsigned char *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
+}
+
 unsigned int vp8_sad8x8_c(
   const unsigned char *src_ptr,
   int  src_stride,
@@ -104,6 +96,7 @@
 
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
 }
+
 #if CONFIG_NEWBESTREFMV
 unsigned int vp8_sad2x16_c(
   const unsigned char *src_ptr,
@@ -122,6 +115,34 @@
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 2);
 }
 #endif
+
+void vp8_sad32x32x3_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array
+                      ) {
+  sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad32x32x8_c(const unsigned char *src_ptr,
+                      int  src_stride,
+                      const unsigned char *ref_ptr,
+                      int  ref_stride,
+                      unsigned short *sad_array
+                      ) {
+  sad_array[0] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+  sad_array[3] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, 0x7fffffff);
+  sad_array[4] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff);
+  sad_array[5] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
+  sad_array[6] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, 0x7fffffff);
+  sad_array[7] = (unsigned short)vp8_sad32x32_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
 void vp8_sad16x16x3_c(
   const unsigned char *src_ptr,
   int  src_stride,
@@ -265,6 +286,18 @@
   sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff);
   sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, 0x7fffffff);
   sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad32x32x4d_c(const unsigned char *src_ptr,
+                       int  src_stride,
+                       unsigned char *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array
+                       ) {
+  sad_array[0] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp8_sad32x32_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
 void vp8_sad16x16x4d_c(
--- a/vp8/encoder/segmentation.c
+++ b/vp8/encoder/segmentation.c
@@ -200,42 +200,59 @@
   // in the frame
   xd->mode_info_context = cm->mi;
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      segment_id = xd->mode_info_context->mbmi.segment_id;
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
+      for (i = 0; i < 4; i++) {
+        static const int dx[4] = { +1, -1, +1, +1 };
+        static const int dy[4] = {  0, +1,  0, -1 };
+        int x_idx = i & 1, y_idx = i >> 1;
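+        // dx/dy walk the 2x2 MBs of each superblock in the order
+        // (0,0), (1,0), (0,1), (1,1); the final step (+1, -1) lands on the
+        // top-left MB of the next superblock column.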
 
-      // Count the number of hits on each segment with no prediction
-      no_pred_segcounts[segment_id]++;
+        if (mb_col + x_idx >= cm->mb_cols ||
+            mb_row + y_idx >= cm->mb_rows) {
+          goto end;
+        }
 
-      // Temporal prediction not allowed on key frames
-      if (cm->frame_type != KEY_FRAME) {
-        // Test to see if the segment id matches the predicted value.
-        int seg_predicted =
-          (segment_id == get_pred_mb_segid(cm, segmap_index));
+        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
+        segment_id = xd->mode_info_context->mbmi.segment_id;
 
-        // Get the segment id prediction context
-        pred_context =
-          get_pred_context(cm, xd, PRED_SEG_ID);
+        // Count the number of hits on each segment with no prediction
+        no_pred_segcounts[segment_id]++;
 
-        // Store the prediction status for this mb and update counts
-        // as appropriate
-        set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
-        temporal_predictor_count[pred_context][seg_predicted]++;
+        // Temporal prediction not allowed on key frames
+        if (cm->frame_type != KEY_FRAME) {
+          // Test to see if the segment id matches the predicted value.
+          int seg_predicted =
+            (segment_id == get_pred_mb_segid(cm, segmap_index));
 
-        if (!seg_predicted)
-          // Update the "unpredicted" segment count
-          t_unpred_seg_counts[segment_id]++;
-      }
+          // Get the segment id prediction context
+          pred_context =
+            get_pred_context(cm, xd, PRED_SEG_ID);
 
-      // Step on to the next mb
-      xd->mode_info_context++;
+          // Store the prediction status for this mb and update counts
+          // as appropriate
+          set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+          temporal_predictor_count[pred_context][seg_predicted]++;
 
-      // Step on to the next entry in the segment maps
-      segmap_index++;
+          if (!seg_predicted)
+            // Update the "unpredicted" segment count
+            t_unpred_seg_counts[segment_id]++;
+        }
+
+#if CONFIG_SUPERBLOCKS
+        if (xd->mode_info_context->mbmi.encoded_as_sb) {
+          assert(!i);
+          xd->mode_info_context += 2;
+          break;
+        }
+#endif
+      end:
+        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
+      }
     }
 
     // this is to account for the border in mode_info_context
-    xd->mode_info_context++;
+    xd->mode_info_context -= mb_col;
+    xd->mode_info_context += cm->mode_info_stride * 2;
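+    // (rewind to the start of the row, then step down two MB rows; the
+    // mode_info border column is included in the stride)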
   }
 
   // Work out probability tree for coding segments without prediction
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -145,8 +145,18 @@
 #endif
 extern prototype_sad(vp8_variance_sad16x16);
 
+#ifndef vp8_variance_sad32x32
+#define vp8_variance_sad32x32 vp8_sad32x32_c
+#endif
+extern prototype_sad(vp8_variance_sad32x32);
+
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
+#ifndef vp8_variance_sad32x32x3
+#define vp8_variance_sad32x32x3 vp8_sad32x32x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad32x32x3);
+
 #ifndef vp8_variance_sad16x16x3
 #define vp8_variance_sad16x16x3 vp8_sad16x16x3_c
 #endif
@@ -172,6 +182,11 @@
 #endif
 extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
 
+#ifndef vp8_variance_sad32x32x8
+#define vp8_variance_sad32x32x8 vp8_sad32x32x8_c
+#endif
+extern prototype_sad_multi_same_address_1(vp8_variance_sad32x32x8);
+
 #ifndef vp8_variance_sad16x16x8
 #define vp8_variance_sad16x16x8 vp8_sad16x16x8_c
 #endif
@@ -199,6 +214,11 @@
 
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
+#ifndef vp8_variance_sad32x32x4d
+#define vp8_variance_sad32x32x4d vp8_sad32x32x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad32x32x4d);
+
 #ifndef vp8_variance_sad16x16x4d
 #define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c
 #endif
@@ -258,6 +278,11 @@
 #endif
 extern prototype_variance(vp8_variance_var16x16);
 
+#ifndef vp8_variance_var32x32
+#define vp8_variance_var32x32 vp8_variance32x32_c
+#endif
+extern prototype_variance(vp8_variance_var32x32);
+
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
 #ifndef vp8_variance_subpixvar4x4
@@ -285,26 +310,51 @@
 #endif
 extern prototype_subpixvariance(vp8_variance_subpixvar16x16);
 
+#ifndef vp8_variance_subpixvar32x32
+#define vp8_variance_subpixvar32x32 vp8_sub_pixel_variance32x32_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar32x32);
+
 #ifndef vp8_variance_halfpixvar16x16_h
 #define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_c
 #endif
 extern prototype_variance(vp8_variance_halfpixvar16x16_h);
 
+#ifndef vp8_variance_halfpixvar32x32_h
+#define vp8_variance_halfpixvar32x32_h vp8_variance_halfpixvar32x32_h_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_h);
+
 #ifndef vp8_variance_halfpixvar16x16_v
 #define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_c
 #endif
 extern prototype_variance(vp8_variance_halfpixvar16x16_v);
 
+#ifndef vp8_variance_halfpixvar32x32_v
+#define vp8_variance_halfpixvar32x32_v vp8_variance_halfpixvar32x32_v_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_v);
+
 #ifndef vp8_variance_halfpixvar16x16_hv
 #define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_c
 #endif
 extern prototype_variance(vp8_variance_halfpixvar16x16_hv);
 
+#ifndef vp8_variance_halfpixvar32x32_hv
+#define vp8_variance_halfpixvar32x32_hv vp8_variance_halfpixvar32x32_hv_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar32x32_hv);
+
 #ifndef vp8_variance_subpixmse16x16
 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c
 #endif
 extern prototype_subpixvariance(vp8_variance_subpixmse16x16);
 
+#ifndef vp8_variance_subpixmse32x32
+#define vp8_variance_subpixmse32x32 vp8_sub_pixel_mse32x32_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixmse32x32);
+
 // -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
 
 #ifndef vp8_variance_getmbss
@@ -349,6 +399,9 @@
   vp8_sad_fn_t             sad8x16;
   vp8_sad_fn_t             sad16x8;
   vp8_sad_fn_t             sad16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_fn_t             sad32x32;
+#endif
 
   vp8_variance_fn_t        var4x4;
   vp8_variance_fn_t        var8x8;
@@ -355,6 +408,9 @@
   vp8_variance_fn_t        var8x16;
   vp8_variance_fn_t        var16x8;
   vp8_variance_fn_t        var16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        var32x32;
+#endif
 
   vp8_subpixvariance_fn_t  subpixvar4x4;
   vp8_subpixvariance_fn_t  subpixvar8x8;
@@ -361,14 +417,30 @@
   vp8_subpixvariance_fn_t  subpixvar8x16;
   vp8_subpixvariance_fn_t  subpixvar16x8;
   vp8_subpixvariance_fn_t  subpixvar16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_subpixvariance_fn_t  subpixvar32x32;
+#endif
   vp8_variance_fn_t        halfpixvar16x16_h;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        halfpixvar32x32_h;
+#endif
   vp8_variance_fn_t        halfpixvar16x16_v;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        halfpixvar32x32_v;
+#endif
   vp8_variance_fn_t        halfpixvar16x16_hv;
+#if CONFIG_SUPERBLOCKS
+  vp8_variance_fn_t        halfpixvar32x32_hv;
+#endif
   vp8_subpixvariance_fn_t  subpixmse16x16;
+#if CONFIG_SUPERBLOCKS
+  vp8_subpixvariance_fn_t  subpixmse32x32;
+#endif
 
   vp8_getmbss_fn_t         getmbss;
   vp8_variance_fn_t        mse16x16;
 
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_multi_fn_t       sad32x32x3;
+#endif
   vp8_sad_multi_fn_t       sad16x16x3;
   vp8_sad_multi_fn_t       sad16x8x3;
   vp8_sad_multi_fn_t       sad8x16x3;
@@ -375,6 +447,9 @@
   vp8_sad_multi_fn_t       sad8x8x3;
   vp8_sad_multi_fn_t       sad4x4x3;
 
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_multi1_fn_t      sad32x32x8;
+#endif
   vp8_sad_multi1_fn_t      sad16x16x8;
   vp8_sad_multi1_fn_t      sad16x8x8;
   vp8_sad_multi1_fn_t      sad8x16x8;
@@ -381,6 +456,9 @@
   vp8_sad_multi1_fn_t      sad8x8x8;
   vp8_sad_multi1_fn_t      sad4x4x8;
 
+#if CONFIG_SUPERBLOCKS
+  vp8_sad_multi_d_fn_t     sad32x32x4d;
+#endif
   vp8_sad_multi_d_fn_t     sad16x16x4d;
   vp8_sad_multi_d_fn_t     sad16x8x4d;
   vp8_sad_multi_d_fn_t     sad8x16x4d;
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c
@@ -55,7 +55,21 @@
   }
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance32x32_c(const unsigned char *src_ptr,
+                                 int  source_stride,
+                                 const unsigned char *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
 
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
+  *sse = var;
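+  // variance = sse - sum^2 / N; "avg" holds the raw sum of differences,
+  // and N = 32 * 32 = 1024 = 2^10, hence the >> 10.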
+  return (var - ((avg * avg) >> 10));
+}
+#endif
+
 unsigned int vp8_variance16x16_c(
   const unsigned char *src_ptr,
   int  source_stride,
@@ -334,7 +348,28 @@
   return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_sub_pixel_variance32x32_c(const unsigned char  *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const unsigned char *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  unsigned short FData3[33 * 32]; // Temp data buffer used in filtering
+  unsigned char  temp2[36 * 32];
+  const short *HFilter, *VFilter;
 
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
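+  // The horizontal pass outputs 33 rows (32 + 1) so that the vertical
+  // bilinear pass has the extra edge row it needs for 32 output rows.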
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
+
+  return vp8_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
+
 unsigned int vp8_variance_halfpixvar16x16_h_c(
   const unsigned char *src_ptr,
   int  source_stride,
@@ -345,18 +380,39 @@
                                        ref_ptr, recon_stride, sse);
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
 
-unsigned int vp8_variance_halfpixvar16x16_v_c(
+
+unsigned int vp8_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
+                                              int  source_stride,
+                                              const unsigned char *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_v_c(
   const unsigned char *src_ptr,
   int  source_stride,
   const unsigned char *ref_ptr,
   int  recon_stride,
   unsigned int *sse) {
-  return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
+  return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
                                        ref_ptr, recon_stride, sse);
 }
+#endif
 
-
 unsigned int vp8_variance_halfpixvar16x16_hv_c(
   const unsigned char *src_ptr,
   int  source_stride,
@@ -367,6 +423,16 @@
                                        ref_ptr, recon_stride, sse);
 }
 
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
+                                               int  source_stride,
+                                               const unsigned char *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp8_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+#endif
 
 unsigned int vp8_sub_pixel_mse16x16_c
 (
@@ -381,6 +447,19 @@
   vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
   return *sse;
 }
+
+#if CONFIG_SUPERBLOCKS
+unsigned int vp8_sub_pixel_mse32x32_c(const unsigned char  *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const unsigned char *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp8_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+  return *sse;
+}
+#endif
 
 unsigned int vp8_sub_pixel_variance16x8_c
 (