shithub: libvpx

Download patch

ref: de5546c37297383aae7648e84fd38009a333b9cf
parent: 652589d56c60b7db1e4d0af28d585372335ea81e
parent: 9bf73f46f9ce98be0f62d5f858be3e2100ddae5d
author: John Koleszar <jkoleszar@google.com>
date: Mon Jan 14 11:25:26 EST 2013

Merge branch 'experimental' of review:webm/libvpx

Change-Id: Ib2c2236349c2ae8ee81bd01c5067dddcbac713ca

--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -436,10 +436,10 @@
 EOF
 
     if enabled rvct; then cat >> $1 << EOF
-fmt_deps = sed -e 's;^__image.axf;\$\${@:.d=.o} \$\$@;' #hide
+fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide
 EOF
     else cat >> $1 << EOF
-fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\$\${@:.d=.o} \$\$@;'
+fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
 EOF
     fi
 
--- a/configure
+++ b/configure
@@ -239,15 +239,17 @@
 "
 EXPERIMENT_LIST="
     csm
-    comp_intra_pred
-    superblocks
-    pred_filter
     lossless
-    subpelrefmv
     new_mvref
     implicit_segmentation
     newbintramodes
     comp_interintra_pred
+    tx64x64
+    dwtdcthybrid
+    cnvcontext
+    newcoefcontext
+    enable_6tap
+    abovesprefmv
 "
 CONFIG_LIST="
     external_build
--- /dev/null
+++ b/test/dct32x32_test.cc
@@ -1,0 +1,197 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+extern "C" {
+#include "vp9/common/vp9_entropy.h"
+#include "./vp9_rtcd.h"
+  void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
+  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+}
+
+#include "test/acm_random.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+#ifdef _MSC_VER
+static int round(double x) {
+  if (x < 0)
+    return (int)ceil(x - 0.5);
+  else
+    return (int)floor(x + 0.5);
+}
+#endif
+
+#if !CONFIG_DWTDCTHYBRID
+static const double kPi = 3.141592653589793238462643383279502884;
+static void reference2_32x32_idct_2d(double *input, double *output) {
+  double x;
+  for (int l = 0; l < 32; ++l) {
+    for (int k = 0; k < 32; ++k) {
+      double s = 0;
+      for (int i = 0; i < 32; ++i) {
+        for (int j = 0; j < 32; ++j) {
+          x = cos(kPi * j * (l + 0.5) / 32.0) *
+              cos(kPi * i * (k + 0.5) / 32.0) * input[i * 32 + j] / 1024;
+          if (i != 0)
+            x *= sqrt(2.0);
+          if (j != 0)
+            x *= sqrt(2.0);
+          s += x;
+        }
+      }
+      output[k * 32 + l] = s / 4;
+    }
+  }
+}
+
+static void reference_32x32_dct_1d(double in[32], double out[32], int stride) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 32; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 32; n++)
+      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 64.0);
+    if (k == 0)
+      out[k] = out[k] * kInvSqrt2;
+  }
+}
+
+static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
+  // First transform columns
+  for (int i = 0; i < 32; ++i) {
+    double temp_in[32], temp_out[32];
+    for (int j = 0; j < 32; ++j)
+      temp_in[j] = input[j*32 + i];
+    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    for (int j = 0; j < 32; ++j)
+      output[j * 32 + i] = temp_out[j];
+  }
+  // Then transform rows
+  for (int i = 0; i < 32; ++i) {
+    double temp_in[32], temp_out[32];
+    for (int j = 0; j < 32; ++j)
+      temp_in[j] = output[j + i*32];
+    reference_32x32_dct_1d(temp_in, temp_out, 1);
+    // Scale by some magic number
+    for (int j = 0; j < 32; ++j)
+      output[j + i * 32] = temp_out[j] / 4;
+  }
+}
+
+
+TEST(VP9Idct32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t in[1024], coeff[1024];
+    int16_t out_c[1024];
+    double out_r[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j)
+      in[j] = rnd.Rand8() - rnd.Rand8();
+
+    reference_32x32_dct_2d(in, out_r);
+    for (int j = 0; j < 1024; j++)
+      coeff[j] = round(out_r[j]);
+    vp9_short_idct32x32_c(coeff, out_c, 64);
+    for (int j = 0; j < 1024; ++j) {
+      const int diff = out_c[j] - in[j];
+      const int error = diff * diff;
+      EXPECT_GE(1, error)
+          << "Error: 32x32 IDCT has error " << error
+          << " at index " << j;
+    }
+
+    vp9_short_fdct32x32_c(in, out_c, 64);
+    for (int j = 0; j < 1024; ++j) {
+      const double diff = coeff[j] - out_c[j];
+      const double error = diff * diff;
+      EXPECT_GE(1.0, error)
+          << "Error: 32x32 FDCT has error " << error
+          << " at index " << j;
+    }
+  }
+}
+#else  // CONFIG_DWTDCTHYBRID
+  // TODO(rbultje/debargha): add DWT-specific tests
+#endif  // CONFIG_DWTDCTHYBRID
+TEST(VP9Fdct32x32Test, AccuracyCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  unsigned int max_error = 0;
+  int64_t total_error = 0;
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t test_input_block[1024];
+    int16_t test_temp_block[1024];
+    int16_t test_output_block[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j)
+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
+    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+
+    for (int j = 0; j < 1024; ++j) {
+      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned error = diff * diff;
+      if (max_error < error)
+        max_error = error;
+      total_error += error;
+    }
+  }
+
+  EXPECT_GE(1u, max_error)
+      << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";
+
+  EXPECT_GE(count_test_block/10, total_error)
+      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1/10 per block";
+}
+
+TEST(VP9Fdct32x32Test, CoeffSizeCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int count_test_block = 1000;
+  for (int i = 0; i < count_test_block; ++i) {
+    int16_t input_block[1024], input_extreme_block[1024];
+    int16_t output_block[1024], output_extreme_block[1024];
+
+    // Initialize a test block with input range [-255, 255].
+    for (int j = 0; j < 1024; ++j) {
+      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+    }
+    if (i == 0)
+      for (int j = 0; j < 1024; ++j)
+        input_extreme_block[j] = 255;
+
+    const int pitch = 64;
+    vp9_short_fdct32x32_c(input_block, output_block, pitch);
+    vp9_short_fdct32x32_c(input_extreme_block, output_extreme_block, pitch);
+
+    // The minimum quant value is 4.
+    for (int j = 0; j < 1024; ++j) {
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))
+          << "Error: 32x32 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))
+          << "Error: 32x32 FDCT extreme has coefficient larger than "
+             "4*DCT_MAX_VALUE";
+    }
+  }
+}
+}  // namespace
--- a/test/test.mk
+++ b/test/test.mk
@@ -69,6 +69,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 #LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
 endif # VP9
 
 
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -15,7 +15,7 @@
 #include "vpx_scale/yv12config.h"
 #include "postproc.h"
 #include "common.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "systemdependent.h"
 
 #include <limits.h>
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -21,7 +21,7 @@
 #include "vp8/common/alloccommon.h"
 #include "vp8/common/entropymode.h"
 #include "vp8/common/quant_common.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp8/common/setupintrarecon.h"
 
 #include "decodemv.h"
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c
@@ -26,7 +26,7 @@
 
 #include "vp8/common/quant_common.h"
 #include "./vpx_scale_rtcd.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp8/common/systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
 #include "detokenize.h"
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -21,7 +21,7 @@
 #include "vp8/common/systemdependent.h"
 #include "mcomp.h"
 #include "firstpass.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "encodemb.h"
 #include "vp8/common/extend.h"
 #include "vpx_mem/vpx_mem.h"
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -20,7 +20,7 @@
 #include "mcomp.h"
 #include "firstpass.h"
 #include "psnr.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
 #include "vp8/common/quant_common.h"
@@ -2588,7 +2588,7 @@
         Scale2Ratio(cm->horiz_scale, &hr, &hs);
         Scale2Ratio(cm->vert_scale, &vr, &vs);
 
-        vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
+        vpx_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
                         tmp_height, hs, hr, vs, vr, 0);
 
         vp8_yv12_extend_frame_borders(&cpi->scaled_source);
@@ -3466,7 +3466,7 @@
         /* Note that we should not throw out a key frame (especially when
          * spatial resampling is enabled).
          */
-        if ((cm->frame_type == KEY_FRAME))
+        if (cm->frame_type == KEY_FRAME)
         {
             cpi->decimation_count = cpi->decimation_factor;
         }
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -14,7 +14,7 @@
 #include "onyx_int.h"
 #include "quantize.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp8/common/alloccommon.h"
 #include "vp8/common/loopfilter.h"
 #if ARCH_ARM
--- a/vp8/encoder/psnr.c
+++ b/vp8/encoder/psnr.c
@@ -13,7 +13,7 @@
 #include "math.h"
 #include "vp8/common/systemdependent.h" /* for vp8_clear_system_state() */
 
-#define MAX_PSNR 60
+#define MAX_PSNR 100
 
 double vp8_mse2psnr(double Samples, double Peak, double Mse)
 {
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -17,7 +17,7 @@
 #include "mcomp.h"
 #include "firstpass.h"
 #include "psnr.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
 #include "vp8/common/quant_common.h"
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -1178,7 +1178,9 @@
     {
         int res;
         vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data ;
-        res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, scalemode.v_scaling_mode);
+        res = vp8_set_internal_size(ctx->cpi,
+                                    (VPX_SCALING)scalemode.h_scaling_mode,
+                                    (VPX_SCALING)scalemode.v_scaling_mode);
 
         if (!res)
         {
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -220,4 +220,8 @@
   vp9_entropy_mode_init();
 
   vp9_entropy_mv_init();
+
+#if CONFIG_NEWCOEFCONTEXT
+  vp9_init_neighbors();
+#endif
 }
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h
@@ -23,4 +23,4 @@
 void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base);
 void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi);
 
-#endif
+#endif  // VP9_COMMON_VP9_ALLOCCOMMON_H_
--- a/vp9/common/vp9_blockd.c
+++ b/vp9/common/vp9_blockd.c
@@ -12,18 +12,15 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
-
-const unsigned char vp9_block2left[25] = {
-  0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
+const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25] = {
+  {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8},
+  {0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}
 };
-const unsigned char vp9_block2above[25] = {
-  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8
+const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25] = {
+  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8},
+  {0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},
+  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}
 };
-
-const unsigned char vp9_block2left_8x8[25] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-const unsigned char vp9_block2above_8x8[25] = {
-  0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8
-};
-
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -45,7 +45,20 @@
 #define SEGMENT_DELTADATA   0
 #define SEGMENT_ABSDATA     1
 #define MAX_MV_REFS 9
+#define MAX_MV_REF_CANDIDATES 4
 
+#if CONFIG_DWTDCTHYBRID
+#define DWT_MAX_LENGTH     64
+#define DWT_TYPE           26    // 26/53/97
+#define DWT_PRECISION_BITS 2
+#define DWT_PRECISION_RND  ((1 << DWT_PRECISION_BITS) / 2)
+
+#define DWTDCT16X16        0
+#define DWTDCT16X16_LEAN   1
+#define DWTDCT8X8          2
+#define DWTDCT_TYPE        DWTDCT16X16_LEAN
+#endif
+
 typedef struct {
   int r, c;
 } POS;
@@ -65,11 +78,6 @@
   ENTROPY_CONTEXT y2;
 } ENTROPY_CONTEXT_PLANES;
 
-extern const unsigned char vp9_block2left[25];
-extern const unsigned char vp9_block2above[25];
-extern const unsigned char vp9_block2left_8x8[25];
-extern const unsigned char vp9_block2above_8x8[25];
-
 #define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \
   Dest = ((A)!=0) + ((B)!=0);
 
@@ -80,10 +88,13 @@
 
 typedef enum
 {
-  SIXTAP   = 0,
-  BILINEAR = 1,
-  EIGHTTAP = 2,
-  EIGHTTAP_SHARP = 3,
+#if CONFIG_ENABLE_6TAP
+  SIXTAP,
+#endif
+  EIGHTTAP_SMOOTH,
+  EIGHTTAP,
+  EIGHTTAP_SHARP,
+  BILINEAR,
   SWITCHABLE  /* should be the last one */
 } INTERPOLATIONFILTERTYPE;
 
@@ -101,13 +112,11 @@
   TM_PRED,            /* Truemotion prediction */
   I8X8_PRED,          /* 8x8 based prediction, each 8x8 has its own prediction mode */
   B_PRED,             /* block based prediction, each block has its own prediction mode */
-
   NEARESTMV,
   NEARMV,
   ZEROMV,
   NEWMV,
   SPLITMV,
-
   MB_MODE_COUNT
 } MB_PREDICTION_MODE;
 
@@ -120,15 +129,16 @@
   SEG_LVL_EOB = 4,                 // EOB end stop marker.
   SEG_LVL_TRANSFORM = 5,           // Block transform size.
   SEG_LVL_MAX = 6                  // Number of MB level features supported
-
 } SEG_LVL_FEATURES;
 
 // Segment level features.
 typedef enum {
-  TX_4X4,                      // 4x4 dct transform
-  TX_8X8,                      // 8x8 dct transform
-  TX_16X16,                    // 16x16 dct transform
-  TX_SIZE_MAX                  // Number of different transforms available
+  TX_4X4 = 0,                      // 4x4 dct transform
+  TX_8X8 = 1,                      // 8x8 dct transform
+  TX_16X16 = 2,                    // 16x16 dct transform
+  TX_SIZE_MAX_MB = 3,              // Number of different transforms available
+  TX_32X32 = TX_SIZE_MAX_MB,       // 32x32 dct transform
+  TX_SIZE_MAX_SB,                  // Number of transforms available to SBs
 } TX_SIZE;
 
 typedef enum {
@@ -205,9 +215,6 @@
   struct {
     B_PREDICTION_MODE first;
     TX_TYPE           tx_type;
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE second;
-#endif
 #if CONFIG_NEWBINTRAMODES
     B_PREDICTION_MODE context;
 #endif
@@ -227,11 +234,14 @@
   MAX_REF_FRAMES = 4
 } MV_REFERENCE_FRAME;
 
+typedef enum {
+  BLOCK_SIZE_MB16X16 = 0,
+  BLOCK_SIZE_SB32X32 = 1,
+  BLOCK_SIZE_SB64X64 = 2,
+} BLOCK_SIZE_TYPE;
+
 typedef struct {
   MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE second_mode, second_uv_mode;
-#endif
 #if CONFIG_COMP_INTERINTRA_PRED
   MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
 #endif
@@ -238,7 +248,7 @@
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   TX_SIZE txfm_size;
   int_mv mv[2]; // for each reference frame used
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int_mv best_mv, best_second_mv;
 #if CONFIG_NEW_MVREF
   int best_index, best_second_index;
@@ -261,17 +271,9 @@
   // a valid predictor
   unsigned char mb_in_image;
 
-#if CONFIG_PRED_FILTER
-  // Flag to turn prediction signal filter on(1)/off(0 ) at the MB level
-  unsigned int pred_filter_enabled;
-#endif
-    INTERPOLATIONFILTERTYPE interp_filter;
+  INTERPOLATIONFILTERTYPE interp_filter;
 
-#if CONFIG_SUPERBLOCKS
-  // FIXME need a SB array of 4 MB_MODE_INFOs that
-  // only needs one encoded_as_sb.
-  unsigned char encoded_as_sb;
-#endif
+  BLOCK_SIZE_TYPE sb_type;
 } MB_MODE_INFO;
 
 typedef struct {
@@ -280,19 +282,19 @@
 } MODE_INFO;
 
 typedef struct blockd {
-  short *qcoeff;
-  short *dqcoeff;
-  unsigned char  *predictor;
-  short *diff;
-  short *dequant;
+  int16_t *qcoeff;
+  int16_t *dqcoeff;
+  uint8_t *predictor;
+  int16_t *diff;
+  int16_t *dequant;
 
   /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-  unsigned char **base_pre;
-  unsigned char **base_second_pre;
+  uint8_t **base_pre;
+  uint8_t **base_second_pre;
   int pre;
   int pre_stride;
 
-  unsigned char **base_dst;
+  uint8_t **base_dst;
   int dst;
   int dst_stride;
 
@@ -301,13 +303,22 @@
   union b_mode_info bmi;
 } BLOCKD;
 
+typedef struct superblockd {
+  /* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */
+  DECLARE_ALIGNED(16, int16_t, diff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, int16_t, qcoeff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, int16_t, dqcoeff[32*32+16*16*2]);
+} SUPERBLOCKD;
+
 typedef struct macroblockd {
-  DECLARE_ALIGNED(16, short, diff[400]);      /* from idct diff */
-  DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
-  DECLARE_ALIGNED(16, short, qcoeff[400]);
-  DECLARE_ALIGNED(16, short, dqcoeff[400]);
-  DECLARE_ALIGNED(16, unsigned short,  eobs[25]);
+  DECLARE_ALIGNED(16, int16_t,  diff[400]);      /* from idct diff */
+  DECLARE_ALIGNED(16, uint8_t,  predictor[384]);
+  DECLARE_ALIGNED(16, int16_t,  qcoeff[400]);
+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[400]);
+  DECLARE_ALIGNED(16, uint16_t, eobs[25]);
 
+  SUPERBLOCKD sb_coeff_data;
+
   /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
   BLOCKD block[25];
   int fullpixel_mask;
@@ -350,7 +361,7 @@
   vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
 
 #if CONFIG_NEW_MVREF
-  vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
+  vp9_prob mb_mv_ref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
 #endif
 
   // Segment features
@@ -377,17 +388,17 @@
   unsigned int frames_till_alt_ref_frame;
 
   /* Inverse transform function pointers. */
-  void (*inv_xform4x4_1_x8)(short *input, short *output, int pitch);
-  void (*inv_xform4x4_x8)(short *input, short *output, int pitch);
-  void (*inv_walsh4x4_1)(short *in, short *out);
-  void (*inv_walsh4x4_lossless)(short *in, short *out);
+  void (*inv_xform4x4_1_x8)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_xform4x4_x8)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
+  void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
 
 
-  vp9_subpix_fn_t  subpixel_predict;
+  vp9_subpix_fn_t  subpixel_predict4x4;
   vp9_subpix_fn_t  subpixel_predict8x4;
   vp9_subpix_fn_t  subpixel_predict8x8;
   vp9_subpix_fn_t  subpixel_predict16x16;
-  vp9_subpix_fn_t  subpixel_predict_avg;
+  vp9_subpix_fn_t  subpixel_predict_avg4x4;
   vp9_subpix_fn_t  subpixel_predict_avg8x4;
   vp9_subpix_fn_t  subpixel_predict_avg8x8;
   vp9_subpix_fn_t  subpixel_predict_avg16x16;
@@ -395,14 +406,7 @@
 
   int corrupted;
 
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  /* This is an intermediate buffer currently used in sub-pixel motion search
-   * to keep a copy of the reference area. This buffer can be used for other
-   * purpose.
-   */
-  DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]);
-#endif
-
+  int sb_index;
   int mb_index;   // Index of the MB in the SB (0..3)
   int q_index;
 
@@ -490,6 +494,9 @@
   return tx_type;
 }
 
+extern const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25];
+extern const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25];
+
 #define USE_ADST_FOR_I16X16_8X8   0
 #define USE_ADST_FOR_I16X16_4X4   0
 #define USE_ADST_FOR_I8X8_4X4     1
@@ -502,11 +509,9 @@
   int ib = (int)(b - xd->block);
   if (ib >= 16)
     return tx_type;
-#if CONFIG_SUPERBLOCKS
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.encoded_as_sb)
+  if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
-#endif
   if (xd->mode_info_context->mbmi.mode == B_PRED &&
       xd->q_index < ACTIVE_HT) {
     tx_type = txfm_map(
@@ -559,11 +564,9 @@
   int ib = (int)(b - xd->block);
   if (ib >= 16)
     return tx_type;
-#if CONFIG_SUPERBLOCKS
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.encoded_as_sb)
+  if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
-#endif
   if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&
       xd->q_index < ACTIVE_HT8) {
     // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged
@@ -594,11 +597,9 @@
   int ib = (int)(b - xd->block);
   if (ib >= 16)
     return tx_type;
-#if CONFIG_SUPERBLOCKS
   // TODO(rbultje, debargha): Explore ADST usage for superblocks
-  if (xd->mode_info_context->mbmi.encoded_as_sb)
+  if (xd->mode_info_context->mbmi.sb_type)
     return tx_type;
-#endif
   if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
       xd->q_index < ACTIVE_HT16) {
     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
@@ -650,4 +651,4 @@
     }
   }
 }
-#endif  /* __INC_BLOCKD_H */
+#endif  // VP9_COMMON_VP9_BLOCKD_H_
--- a/vp9/common/vp9_coefupdateprobs.h
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP9_COMMON_VP9_COEFUPDATEPROBS_H_
+#define VP9_COMMON_VP9_COEFUPDATEPROBS_H_
 
 /* Update probabilities for the nodes in the token entropy tree.
    Generated file included by vp9_entropy.c */
@@ -14,3 +16,5 @@
 #define COEF_UPDATE_PROB 252
 #define COEF_UPDATE_PROB_8X8 252
 #define COEF_UPDATE_PROB_16X16 252
+
+#endif  // VP9_COMMON_VP9_COEFUPDATEPROBS_H_
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_COMMON_H_
 #define VP9_COMMON_VP9_COMMON_H_
 
@@ -17,25 +16,34 @@
 /* Interface header for common constant data structures and lookup tables */
 
 #include "vpx_mem/vpx_mem.h"
+#include "vpx/vpx_integer.h"
 
-#include "vp9/common/vp9_common_types.h"
+#define TRUE    1
+#define FALSE   0
 
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+
 /* Only need this for fixed-size arrays, for structs just assign. */
 
-#define vp9_copy( Dest, Src) { \
-    assert( sizeof( Dest) == sizeof( Src)); \
-    vpx_memcpy( Dest, Src, sizeof( Src)); \
+#define vp9_copy(Dest, Src) { \
+    assert(sizeof(Dest) == sizeof(Src)); \
+    vpx_memcpy(Dest, Src, sizeof(Src)); \
   }
 
 /* Use this for variably-sized arrays. */
 
-#define vp9_copy_array( Dest, Src, N) { \
-    assert( sizeof( *Dest) == sizeof( *Src)); \
-    vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+#define vp9_copy_array(Dest, Src, N) { \
+    assert(sizeof(*Dest) == sizeof(*Src)); \
+    vpx_memcpy(Dest, Src, N * sizeof(*Src)); \
   }
 
-#define vp9_zero( Dest)  vpx_memset( &Dest, 0, sizeof( Dest));
+#define vp9_zero(Dest) vpx_memset(&Dest, 0, sizeof(Dest));
 
-#define vp9_zero_array( Dest, N)  vpx_memset( Dest, 0, N * sizeof( *Dest));
+#define vp9_zero_array(Dest, N) vpx_memset(Dest, 0, N * sizeof(*Dest));
 
-#endif  /* common_h */
+static __inline uint8_t clip_pixel(int val) {
+  return (val > 255) ? 255u : (val < 0) ? 0u : val;
+}
+
+#endif  // VP9_COMMON_VP9_COMMON_H_
--- a/vp9/common/vp9_common_types.h
+++ /dev/null
@@ -1,18 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_VP9_COMMON_TYPES_H_
-#define VP9_COMMON_VP9_COMMON_TYPES_H_
-
-#define TRUE    1
-#define FALSE   0
-
-#endif
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -87,9 +87,6 @@
 
         if (mi[mb_index].mbmi.mode == B_PRED) {
           fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
-#if CONFIG_COMP_INTRA_PRED
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
-#endif
         } else
           fprintf(mvs, "xx ");
 
--- a/vp9/common/vp9_default_coef_probs.h
+++ b/vp9/common/vp9_default_coef_probs.h
@@ -12,1366 +12,1200 @@
 /*Generated file, included by vp9_entropy.c*/
 
 
-static const vp9_prob default_coef_probs [BLOCK_TYPES]
-                                         [COEF_BANDS]
-                                         [PREV_COEF_CONTEXTS]
-                                         [ENTROPY_NODES] = {
-  {
-    /* Block Type ( 0 ) */
-    {
-      /* Coeff Band ( 0 )*/
+static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES_4X4] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 224, 180, 254, 255, 234, 224, 255, 227, 128, 128, 128 },
+      { 187, 178, 250, 255, 226, 218, 255, 229, 255, 255, 128 },
+      { 145, 171, 243, 253, 219, 211, 254, 226, 255, 224, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      {   1, 187, 252, 255, 231, 220, 255, 229, 255, 255, 128 },
+      { 129, 174, 244, 254, 225, 216, 253, 219, 255, 255, 128 },
+      {  16, 131, 193, 251, 205, 205, 254, 222, 255, 255, 128 },
+      {   2,  93, 136, 236, 159, 179, 255, 197, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      {   1, 188, 254, 255, 241, 236, 254, 220, 255, 255, 128 },
+      { 133, 165, 249, 255, 236, 220, 252, 220, 255, 255, 128 },
+      {  20, 112, 203, 254, 217, 214, 255, 224, 255, 255, 128 },
+      {   4,  61, 106, 240, 155, 189, 252, 202, 255, 255, 128 }
+    }, { /* Coeff Band 4 */
+      {   1, 168, 252, 255, 239, 228, 253, 217, 255, 255, 128 },
+      { 158, 163, 247, 255, 231, 221, 255, 242, 128, 128, 128 },
+      {  23, 127, 205, 253, 212, 224, 255, 234, 255, 255, 128 },
+      {   2,  83, 141, 237, 176, 210, 245, 207, 255, 255, 128 }
+    }, { /* Coeff Band 5 */
+      {   1, 233, 254, 255, 243, 241, 255, 213, 128, 128, 128 },
+      { 155, 213, 253, 255, 240, 221, 216, 112, 255, 255, 128 },
+      {  41, 159, 237, 254, 229, 216, 255, 161, 128, 128, 128 },
+      {  11,  95, 176, 244, 194, 191, 255, 167, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      {   1, 160, 253, 255, 238, 231, 255, 230, 255, 255, 128 },
+      { 174, 152, 248, 255, 230, 223, 255, 223, 255, 255, 128 },
+      {  86, 125, 213, 253, 207, 207, 254, 224, 255, 171, 128 },
+      {  39,  89, 156, 240, 168, 190, 251, 181, 255, 255, 128 }
+    }, { /* Coeff Band 7 */
+      {   1, 101, 255, 255, 243, 244, 255, 255, 128, 128, 128 },
+      { 230,  66, 255, 255, 238, 238, 128, 128, 128, 128, 128 },
+      { 151,  92, 229, 255, 224, 197, 128, 128, 128, 128, 128 },
+      { 109,  57, 171, 255,  73, 255, 128, 128, 128, 128, 128 }
+    }
+  }, { /* block Type 1 */
+    { /* Coeff Band 0 */
+      { 148, 109, 219, 239, 203, 184, 222, 172, 238, 203, 192 },
+      { 101, 110, 206, 229, 181, 178, 224, 171, 250, 206, 180 },
+      {  67, 108, 186, 222, 172, 174, 216, 167, 246, 195, 221 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 184, 249, 254, 226, 220, 253, 241, 255, 255, 128 },
+      {  84, 182, 244, 254, 222, 218, 254, 217, 255, 255, 128 },
+      {  56, 147, 210, 252, 208, 210, 253, 218, 255, 255, 128 },
+      {  32, 124, 170, 233, 165, 178, 249, 196, 255, 253, 128 }
+    }, { /* Coeff Band 2 */
+      {   1, 182, 242, 245, 208, 194, 239, 179, 255, 238, 128 },
+      {  28, 170, 230, 241, 202, 192, 243, 171, 255, 243, 128 },
+      {  16, 109, 165, 231, 182, 184, 237, 168, 255, 249, 255 },
+      {   2,  76, 113, 202, 141, 172, 221, 160, 252, 227, 255 }
+    }, { /* Coeff Band 3 */
+      {   1, 195, 249, 254, 230, 239, 251, 211, 255, 255, 128 },
+      {  39, 164, 242, 254, 224, 222, 255, 235, 255, 255, 128 },
+      {  16, 111, 179, 251, 204, 197, 251, 234, 255, 209, 128 },
+      {   3,  84, 130, 225, 155, 176, 226, 196, 255, 238, 128 }
+    }, { /* Coeff Band 4 */
+      {   1, 180, 248, 254, 227, 219, 254, 211, 255, 255, 128 },
+      {  38, 170, 242, 253, 222, 214, 254, 242, 255, 255, 128 },
+      {   5, 111, 176, 250, 204, 197, 255, 208, 128, 128, 128 },
+      {   1,  75, 120, 233, 146, 186, 250, 203, 255, 255, 128 }
+    }, { /* Coeff Band 5 */
+      {   1, 183, 251, 255, 232, 223, 252, 229, 255, 255, 128 },
+      {  51, 158, 245, 255, 230, 224, 255, 239, 128, 128, 128 },
+      {  13,  80, 158, 253, 206, 216, 255, 233, 128, 128, 128 },
+      {   4,  39,  76, 212, 107, 153, 252, 206, 255, 255, 128 }
+    }, { /* Coeff Band 6 */
+      {   1, 181, 252, 254, 231, 214, 242, 225, 255, 236, 128 },
+      {  81, 167, 247, 254, 229, 217, 252, 226, 255, 255, 128 },
+      {  20, 122, 195, 253, 213, 212, 249, 211, 255, 238, 128 },
+      {  18, 100, 153, 231, 158, 182, 244, 203, 255, 219, 128 }
+    }, { /* Coeff Band 7 */
+      {   1, 100, 254, 255, 242, 246, 255, 230, 128, 128, 128 },
+      { 177,  62, 250, 255, 246, 210, 255, 255, 128, 128, 128 },
+      {  65,  58, 186, 255, 227, 241, 255, 219, 128, 128, 128 },
+      {  45,  23, 118, 244, 162, 208, 255, 228, 128, 128, 128 }
+    }
+  }, { /* block Type 2 */
+    { /* Coeff Band 0 */
+      { 242,  73, 238, 244, 198, 192, 241, 189, 253, 226, 247 },
+      { 171,  70, 204, 231, 180, 183, 228, 172, 247, 215, 221 },
+      {  73,  62, 144, 202, 153, 169, 207, 153, 245, 199, 230 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 163, 241, 245, 201, 192, 243, 191, 255, 229, 255 },
+      { 165, 147, 230, 245, 201, 193, 244, 193, 255, 231, 255 },
+      {  76, 109, 191, 243, 190, 193, 243, 192, 255, 231, 255 },
+      {  22,  63, 111, 202, 138, 164, 225, 164, 252, 218, 248 }
+    }, { /* Coeff Band 2 */
+      {   1, 113, 225, 245, 201, 195, 238, 185, 254, 225, 255 },
+      { 122, 105, 195, 236, 183, 186, 235, 180, 254, 227, 252 },
+      {  38,  79, 135, 217, 154, 172, 229, 171, 253, 220, 250 },
+      {   9,  53,  78, 161, 121, 151, 202, 141, 251, 207, 244 }
+    }, { /* Coeff Band 3 */
+      {   1, 150, 238, 250, 213, 202, 244, 194, 255, 236, 255 },
+      { 140, 132, 223, 247, 204, 199, 243, 193, 255, 234, 255 },
+      {  51, 101, 182, 240, 188, 189, 240, 186, 255, 232, 255 },
+      {   6,  59, 100, 201, 137, 165, 225, 161, 252, 221, 249 }
+    }, { /* Coeff Band 4 */
+      {   1, 151, 233, 248, 205, 199, 248, 196, 255, 243, 255 },
+      { 133, 140, 214, 244, 193, 193, 245, 194, 255, 236, 255 },
+      {  27, 104, 168, 235, 172, 183, 243, 187, 254, 235, 255 },
+      {   2,  61, 101, 202, 135, 164, 229, 167, 254, 223, 255 }
+    }, { /* Coeff Band 5 */
+      {   1, 227, 246, 254, 225, 215, 254, 217, 255, 255, 128 },
+      { 132, 195, 239, 253, 219, 210, 252, 212, 255, 255, 128 },
+      {  49, 143, 214, 251, 207, 204, 253, 212, 255, 238, 128 },
+      {  11,  93, 151, 235, 169, 185, 247, 190, 255, 238, 128 }
+    }, { /* Coeff Band 6 */
+      {   1, 143, 237, 251, 213, 203, 249, 203, 255, 243, 128 },
+      { 137, 120, 216, 246, 198, 196, 248, 199, 255, 240, 255 },
+      {  50,  94, 166, 233, 169, 181, 245, 189, 255, 240, 255 },
+      {   9,  56,  97, 190, 129, 158, 228, 159, 255, 226, 255 }
+    }, { /* Coeff Band 7 */
+      {   1,  96, 245, 254, 229, 216, 255, 212, 255, 255, 128 },
+      { 179,  81, 234, 253, 217, 209, 255, 230, 255, 255, 128 },
+      { 105,  56, 192, 248, 192, 197, 252, 212, 255, 205, 128 },
+      {  53,  32, 133, 228, 151, 177, 250, 192, 255, 255, 128 }
+    }
+  }, { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 209,  89, 216, 242, 191, 190, 245, 191, 240, 235, 168 },
+      { 142,  96, 196, 229, 173, 180, 233, 175, 247, 220, 174 },
+      {  66,  89, 157, 205, 155, 171, 209, 156, 243, 200, 197 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 159, 235, 246, 202, 197, 237, 186, 248, 223, 223 },
+      {  96, 137, 223, 247, 203, 198, 242, 188, 241, 202, 209 },
+      {  22,  95, 167, 243, 184, 196, 237, 187, 247, 221, 221 },
+      {   3,  51,  81, 192, 125, 158, 220, 164, 242, 211, 197 }
+    }, { /* Coeff Band 2 */
+      {   1, 145, 226, 244, 196, 194, 240, 191, 247, 225, 233 },
+      {  66, 127, 203, 240, 188, 189, 239, 188, 248, 225, 220 },
+      {   9,  83, 136, 224, 159, 176, 235, 177, 247, 223, 207 },
+      {   2,  46,  71, 169, 121, 152, 210, 149, 241, 212, 199 }
+    }, { /* Coeff Band 3 */
+      {   1, 174, 238, 249, 209, 201, 245, 198, 241, 196, 241 },
+      {  76, 151, 223, 247, 203, 197, 245, 194, 243, 202, 198 },
+      {  12, 102, 170, 240, 183, 187, 242, 191, 247, 225, 209 },
+      {   1,  52,  85, 202, 135, 162, 225, 168, 240, 209, 221 }
+    }, { /* Coeff Band 4 */
+      {   1, 140, 230, 247, 204, 198, 242, 190, 249, 209, 248 },
+      {  94, 126, 213, 244, 195, 194, 240, 190, 247, 210, 237 },
+      {  13,  95, 159, 232, 171, 181, 237, 179, 245, 205, 237 },
+      {   1,  51,  83, 186, 128, 158, 216, 154, 240, 193, 229 }
+    }, { /* Coeff Band 5 */
+      {   1, 218, 244, 251, 214, 202, 243, 199, 253, 214, 255 },
+      {  91, 194, 238, 249, 210, 200, 247, 203, 251, 223, 255 },
+      {  18, 140, 207, 247, 198, 194, 246, 203, 252, 213, 255 },
+      {   3,  76, 126, 223, 156, 172, 233, 185, 251, 206, 255 }
+    }, { /* Coeff Band 6 */
+      {   1, 135, 235, 250, 210, 203, 246, 206, 251, 219, 241 },
+      { 105, 120, 214, 246, 196, 196, 245, 195, 250, 216, 243 },
+      {  24,  91, 154, 231, 166, 180, 241, 183, 250, 214, 242 },
+      {   3,  53,  84, 183, 127, 157, 218, 153, 244, 195, 237 }
+    }, { /* Coeff Band 7 */
+      {   1,  83, 246, 252, 215, 208, 246, 206, 255, 237, 128 },
+      { 184,  61, 233, 250, 208, 204, 245, 198, 254, 227, 255 },
+      {  83,  58, 190, 246, 189, 195, 244, 198, 255, 229, 128 },
+      {  41,  38, 125, 214, 144, 169, 229, 171, 251, 216, 255 }
+    }
+  }
+};
+static const vp9_coeff_probs default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
-      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
-      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
-      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
-      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
-      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
-      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
-      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
-      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
-      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
-      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
-      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
-      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* Block Type ( 1 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
-      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
-      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
-      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
-      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
-      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
-      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
-      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
-      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
-      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
-      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
-      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
-      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
-      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
-      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
+  }, { /* block Type 1 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* Block Type ( 2 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
-      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
-      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
-      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
-      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
-      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
-      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
-      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
+  }, { /* block Type 2 */
+    { /* Coeff Band 0 */
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* Block Type ( 3 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
-      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
-      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
-      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
-      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
-      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
-      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
-      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
-      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
-      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
-      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+  }, { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 191,  34, 178, 193, 160, 173, 196, 142, 247, 191, 244 },
+      {  84,  45, 129, 187, 145, 170, 189, 145, 240, 186, 212 },
+      {  14,  36,  69, 149, 120, 154, 177, 136, 231, 177, 196 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1,  76, 169, 226, 167, 180, 227, 171, 247, 218, 226 },
+      {  72,  75, 162, 226, 166, 181, 231, 172, 242, 200, 219 },
+      {  30,  63, 130, 218, 153, 175, 226, 170, 247, 216, 219 },
+      {   5,  39,  67, 156, 119, 151, 194, 140, 239, 202, 216 }
+    }, { /* Coeff Band 2 */
+      {   1,  79, 182, 228, 175, 183, 224, 170, 247, 215, 220 },
+      {  69,  77, 168, 224, 170, 180, 223, 168, 246, 215, 223 },
+      {  24,  63, 126, 209, 153, 171, 219, 160, 247, 215, 225 },
+      {   3,  35,  58, 151, 115, 151, 191, 138, 240, 199, 220 }
+    }, { /* Coeff Band 3 */
+      {   1, 139, 213, 238, 194, 192, 234, 180, 244, 193, 236 },
+      {  82, 127, 204, 238, 190, 186, 234, 175, 244, 191, 235 },
+      {  26,  93, 161, 230, 173, 179, 233, 178, 249, 217, 241 },
+      {   3,  48,  78, 186, 132, 158, 212, 157, 244, 205, 233 }
+    }, { /* Coeff Band 4 */
+      {   1, 100, 208, 233, 180, 182, 238, 175, 250, 206, 225 },
+      {  84,  87, 184, 230, 175, 180, 236, 179, 250, 209, 243 },
+      {  14,  61, 111, 217, 146, 171, 236, 174, 249, 207, 245 },
+      {   1,  32,  49, 150, 106, 142, 212, 145, 242, 191, 237 }
+    }, { /* Coeff Band 5 */
+      {   1, 130, 223, 241, 192, 189, 231, 176, 250, 209, 246 },
+      { 101, 120, 207, 239, 188, 187, 240, 196, 250, 202, 255 },
+      {  19,  90, 155, 232, 169, 181, 238, 190, 250, 207, 249 },
+      {   1,  54,  86, 197, 130, 161, 220, 170, 248, 196, 248 }
+    }, { /* Coeff Band 6 */
+      {   1, 103, 208, 236, 183, 185, 235, 190, 243, 202, 219 },
+      {  95,  92, 185, 230, 175, 181, 233, 174, 242, 203, 225 },
+      {  24,  72, 131, 213, 152, 171, 226, 164, 241, 202, 220 },
+      {   3,  45,  74, 169, 123, 154, 204, 145, 238, 188, 222 }
+    }, { /* Coeff Band 7 */
+      {   1,  63, 236, 247, 205, 194, 241, 189, 252, 222, 255 },
+      { 151,  48, 224, 245, 200, 193, 240, 187, 255, 234, 255 },
+      {  76,  45, 178, 240, 180, 189, 239, 182, 253, 231, 255 },
+      {  38,  31, 111, 187, 125, 154, 217, 155, 253, 214, 255 }
     }
   }
 };
-
-static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES]
-                                                [COEF_BANDS]
-                                                [PREV_COEF_CONTEXTS]
-                                                [ENTROPY_NODES] = {
-  {
-    /* Block Type ( 0 ) */
-    {
-      /* Coeff Band ( 0 )*/
+static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES_8X8] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
-      { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
-      { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-      { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
-      { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
-      {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-      {  64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
-      { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
-      {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-      {  64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
-      { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
-      {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-      {  28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
-      { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
-      { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-      { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
-      { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
-      {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-      {  64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 179, 203, 246, 252, 217, 208, 249, 197, 238, 237, 255 },
+      { 136, 193, 232, 247, 202, 199, 245, 194, 255, 235, 255 },
+      {  66, 170, 209, 244, 190, 191, 250, 199, 255, 242, 192 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      {   1, 191, 232, 250, 204, 201, 248, 199, 254, 243, 213 },
+      {  50, 161, 209, 247, 196, 197, 250, 206, 253, 240, 213 },
+      {   6, 118, 160, 239, 173, 186, 249, 203, 254, 235, 255 },
+      {   2,  90, 110, 211, 141, 166, 242, 181, 254, 235, 255 }
+    }, { /* Coeff Band 3 */
+      {   1, 209, 242, 254, 223, 215, 253, 218, 255, 253, 128 },
+      {  58, 168, 227, 253, 216, 211, 254, 226, 255, 251, 128 },
+      {   7, 111, 178, 249, 195, 202, 253, 222, 254, 240, 255 },
+      {   2,  63, 103, 226, 142, 175, 250, 202, 255, 246, 128 }
+    }, { /* Coeff Band 4 */
+      {   1, 207, 241, 252, 213, 205, 252, 215, 255, 228, 255 },
+      {  55, 171, 225, 251, 209, 205, 251, 212, 254, 234, 255 },
+      {   5, 108, 173, 247, 187, 195, 251, 211, 255, 231, 128 },
+      {   2,  56,  97, 220, 138, 169, 248, 191, 253, 237, 255 }
+    }, { /* Coeff Band 5 */
+      {   1, 211, 245, 255, 227, 219, 255, 233, 255, 255, 128 },
+      {  58, 175, 228, 254, 217, 215, 255, 231, 255, 255, 128 },
+      {   6, 124, 181, 249, 191, 199, 255, 222, 255, 251, 128 },
+      {   2,  85, 122, 227, 149, 172, 250, 195, 255, 245, 128 }
+    }, { /* Coeff Band 6 */
+      {   1, 216, 246, 255, 231, 217, 254, 220, 255, 250, 128 },
+      {  74, 177, 236, 254, 222, 214, 254, 221, 255, 255, 128 },
+      {  13, 125, 192, 250, 200, 203, 254, 217, 255, 245, 128 },
+      {   2,  70, 114, 227, 147, 175, 251, 198, 255, 240, 128 }
+    }, { /* Coeff Band 7 */
+      {   1, 199, 246, 255, 238, 229, 255, 226, 255, 255, 128 },
+      { 132, 162, 240, 255, 229, 222, 255, 239, 255, 255, 128 },
+      {  79, 125, 207, 253, 213, 214, 255, 232, 255, 255, 128 },
+      {  41,  89, 149, 240, 161, 187, 250, 216, 255, 255, 128 }
     }
-  },
-  {
-    /* Block Type ( 1 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
-      { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
-      {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-      {  48,  32, 146, 208, 149, 167, 221, 162, 255, 223, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
-      { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
-      {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-      {  66,  90, 181, 242, 176, 190, 249, 202, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
-      {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
-      {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-      {  18,  80, 163, 242, 170, 187, 247, 210, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
-      { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
-      {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-      {  36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
-      {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
-      {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-      {  18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
-      { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
-      {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-      {  28,  70, 181, 251, 193, 211, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
-      { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
-      {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-      {  40,  90, 188, 251, 195, 217, 255, 224, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-      { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 },
-    }
-  },
-  {
-    /* Block Type ( 2 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
-      { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
-      {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-      {  64,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
-      { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
-      { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-      { 140,  70, 195, 248, 188, 195, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
-      { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
-      {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-      {  60,  40, 190, 239, 201, 218, 255, 228, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
-      { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
-      { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
-      {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      {  48,  85, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
+  }, { /* block Type 1 */
+    { /* Coeff Band 0 */
+      { 138,  65, 189, 212, 172, 169, 200, 153, 233, 182, 214 },
+      {  93,  60, 162, 203, 160, 169, 200, 153, 239, 190, 213 },
+      {  66,  55, 141, 195, 152, 166, 199, 152, 238, 190, 212 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 102, 221, 247, 205, 198, 248, 201, 255, 235, 128 },
+      { 122,  95, 215, 247, 200, 197, 248, 200, 254, 227, 255 },
+      {  60,  81, 166, 241, 177, 190, 245, 193, 255, 246, 255 },
+      {  32,  61, 108, 195, 133, 159, 230, 163, 254, 230, 238 }
+    }, { /* Coeff Band 2 */
+      {   1,  58, 203, 242, 194, 193, 229, 177, 253, 225, 249 },
+      { 113,  62, 192, 237, 184, 187, 231, 181, 253, 220, 249 },
+      {  50,  50, 135, 225, 159, 177, 229, 172, 254, 222, 241 },
+      {  24,  34,  82, 185, 125, 152, 223, 158, 253, 212, 219 }
+    }, { /* Coeff Band 3 */
+      {   1,   1, 220, 253, 218, 209, 251, 213, 255, 255, 128 },
+      { 154,   1, 216, 252, 211, 206, 252, 212, 255, 252, 128 },
+      { 102,   1, 157, 249, 184, 200, 253, 214, 255, 247, 128 },
+      {  68,   1, 101, 213, 129, 161, 247, 186, 255, 237, 255 }
+    }, { /* Coeff Band 4 */
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* Block Type ( 3 ) */
-    {
-      /* Coeff Band ( 0 )*/
-      { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
-      { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
-      {  63,  48, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-      {  54,  40, 138, 219, 151, 178, 240, 170, 255, 216, 128 },
-    },
-    {
-      /* Coeff Band ( 1 )*/
-      {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
-      { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
-      {  44,  84, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-      {  32,  70, 162, 232, 172, 180, 245, 178, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 2 )*/
-      {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
-      { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-      {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 3 )*/
-      {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
-      { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
-      {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-      {  26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 },
-    },
-    {
-      /* Coeff Band ( 4 )*/
-      {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
-      { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-      {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 5 )*/
-      {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
-      { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-      {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 6 )*/
-      {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
-      { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-      {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 },
-    },
-    {
-      /* Coeff Band ( 7 )*/
-      {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
-      { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+  }, { /* block Type 2 */
+    { /* Coeff Band 0 */
+      { 229,  64, 235, 236, 189, 190, 227, 179, 247, 203, 226 },
+      { 148,  70, 194, 228, 175, 182, 216, 170, 238, 192, 224 },
+      {  53,  63, 134, 207, 150, 169, 213, 161, 247, 204, 232 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 173, 234, 244, 201, 193, 239, 180, 252, 214, 255 },
+      { 160, 156, 222, 243, 200, 193, 237, 179, 253, 216, 255 },
+      {  55, 119, 187, 240, 189, 192, 236, 180, 253, 226, 255 },
+      {  14,  65, 105, 193, 142, 165, 205, 151, 249, 200, 250 }
+    }, { /* Coeff Band 2 */
+      {   1, 124, 218, 246, 195, 196, 242, 198, 254, 229, 255 },
+      {  85, 114, 180, 240, 179, 187, 239, 191, 253, 223, 239 },
+      {  18,  81, 128, 220, 152, 173, 232, 176, 252, 221, 254 },
+      {   2,  42,  64, 150, 115, 149, 192, 137, 247, 197, 247 }
+    }, { /* Coeff Band 3 */
+      {   1, 164, 230, 251, 210, 204, 245, 201, 255, 238, 255 },
+      {  96, 137, 210, 248, 199, 199, 244, 198, 254, 218, 255 },
+      {  20,  97, 169, 240, 179, 188, 242, 190, 254, 228, 255 },
+      {   2,  58,  95, 197, 137, 164, 220, 158, 252, 217, 248 }
+    }, { /* Coeff Band 4 */
+      {   1, 193, 236, 245, 203, 194, 243, 191, 254, 223, 255 },
+      {  86, 163, 217, 241, 190, 188, 242, 189, 253, 220, 255 },
+      {  14, 108, 161, 228, 167, 178, 238, 180, 253, 224, 255 },
+      {   1,  51,  84, 186, 127, 159, 216, 155, 251, 208, 243 }
+    }, { /* Coeff Band 5 */
+      {   1, 183, 235, 248, 209, 197, 244, 195, 253, 236, 239 },
+      {  79, 144, 208, 243, 193, 190, 244, 191, 254, 231, 255 },
+      {  13, 100, 151, 227, 163, 176, 240, 180, 255, 233, 244 },
+      {   1,  48,  77, 171, 121, 153, 214, 150, 252, 214, 245 }
+    }, { /* Coeff Band 6 */
+      {   1, 202, 234, 252, 215, 207, 248, 207, 254, 242, 255 },
+      {  75, 153, 216, 249, 203, 201, 248, 203, 255, 239, 255 },
+      {  11, 104, 168, 241, 179, 189, 245, 194, 255, 237, 128 },
+      {   1,  57,  95, 201, 134, 163, 229, 165, 254, 223, 246 }
+    }, { /* Coeff Band 7 */
+      {   1, 184, 236, 254, 222, 212, 254, 225, 255, 255, 128 },
+      {  74, 149, 220, 252, 210, 208, 253, 223, 255, 249, 128 },
+      {  18, 109, 175, 247, 184, 195, 253, 211, 255, 250, 128 },
+      {   3,  64, 113, 219, 144, 171, 246, 187, 255, 250, 128 }
     }
+  }, { /* block Type 3 */
+    { /* Coeff Band 0 */
+      { 140, 101, 214, 227, 176, 182, 218, 167, 233, 205, 164 },
+      {  96, 101, 176, 204, 161, 173, 193, 152, 223, 182, 182 },
+      {  27,  84, 123, 176, 140, 162, 190, 142, 238, 189, 210 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 178, 218, 240, 189, 189, 238, 184, 250, 232, 189 },
+      {  69, 146, 204, 239, 187, 189, 238, 183, 251, 226, 221 },
+      {  16,  98, 157, 234, 170, 185, 237, 183, 252, 220, 218 },
+      {   3,  49,  78, 172, 122, 154, 204, 150, 242, 198, 207 }
+    }, { /* Coeff Band 2 */
+      {   1, 165, 207, 230, 179, 181, 234, 172, 252, 228, 218 },
+      {  25, 130, 175, 224, 169, 177, 232, 169, 252, 230, 207 },
+      {   4,  81, 118, 205, 144, 167, 227, 162, 252, 225, 219 },
+      {   2,  51,  63, 150, 114, 148, 197, 138, 244, 202, 204 }
+    }, { /* Coeff Band 3 */
+      {   1, 181, 222, 247, 200, 197, 246, 199, 252, 232, 228 },
+      {  25, 142, 200, 244, 190, 193, 245, 195, 253, 233, 204 },
+      {   3,  90, 146, 233, 166, 181, 242, 188, 252, 229, 216 },
+      {   1,  47,  79, 188, 124, 157, 222, 162, 245, 213, 203 }
+    }, { /* Coeff Band 4 */
+      {   1, 179, 220, 242, 195, 191, 237, 182, 251, 217, 231 },
+      {  27, 144, 200, 241, 188, 190, 238, 185, 250, 224, 235 },
+      {   3,  93, 149, 230, 166, 180, 235, 180, 249, 222, 221 },
+      {   1,  47,  79, 181, 125, 157, 211, 154, 241, 205, 198 }
+    }, { /* Coeff Band 5 */
+      {   1, 176, 222, 247, 202, 198, 247, 199, 252, 234, 219 },
+      {  24, 139, 197, 244, 190, 192, 246, 196, 253, 232, 220 },
+      {   2,  89, 140, 229, 161, 178, 243, 185, 253, 233, 234 },
+      {   1,  49,  76, 176, 121, 154, 214, 153, 243, 209, 208 }
+    }, { /* Coeff Band 6 */
+      {   1, 197, 233, 251, 213, 205, 247, 206, 249, 222, 247 },
+      {  35, 159, 216, 249, 203, 201, 246, 203, 250, 222, 223 },
+      {   4, 108, 167, 240, 178, 188, 244, 195, 248, 220, 235 },
+      {   1,  58,  93, 198, 133, 161, 220, 167, 233, 195, 221 }
+    }, { /* Coeff Band 7 */
+      {   1, 188, 240, 253, 221, 209, 248, 207, 252, 223, 255 },
+      {  84, 153, 227, 251, 212, 205, 247, 205, 254, 215, 255 },
+      {  25, 117, 182, 244, 186, 192, 243, 198, 250, 209, 255 },
+      {   7,  72, 108, 197, 138, 162, 203, 161, 240, 178, 247 }
+    }
   }
 };
-
-static const vp9_prob
-default_coef_probs_8x8[BLOCK_TYPES_8X8]
-[COEF_BANDS]
-[PREV_COEF_CONTEXTS]
-[ENTROPY_NODES] = {
-  {
-    /* block Type 0 */
-    {
-      /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+static const vp9_coeff_probs default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* block Type 1 */
-    {
-      /* Coeff Band 0 */
-      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
-      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
-      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
-      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
-      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
-      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+  }, { /* block Type 1 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* block Type 2 */
-    {
-      /* Coeff Band 0 */
-      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
-      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
-      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
-      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
-      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
-      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
-      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
-      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
-      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+  }, { /* block Type 2 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  { /* block Type 3 */
+  }, { /* block Type 3 */
     { /* Coeff Band 0 */
-      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
-      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
-      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
-      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
-      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
-      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
-      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
-      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
-      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
-      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
-      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
-      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
-      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
-      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
-      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
-      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
-      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
-      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
-      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
-      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
-      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      { 118,  27, 105, 170, 137, 166, 183, 137, 243, 189, 241 },
+      {  44,  34,  85, 142, 127, 158, 161, 128, 232, 174, 213 },
+      {   8,  26,  47, 104, 108, 145, 143, 117, 226, 168, 207 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 134, 172, 217, 163, 175, 226, 167, 251, 220, 204 },
+      {  56, 129, 168, 217, 161, 174, 223, 164, 249, 218, 223 },
+      {  20, 110, 151, 215, 158, 174, 221, 165, 249, 209, 221 },
+      {   2,  59,  88, 169, 128, 157, 192, 143, 239, 189, 214 }
+    }, { /* Coeff Band 2 */
+      {   1,  65, 126, 191, 140, 163, 218, 153, 252, 218, 229 },
+      {  21,  57,  92, 175, 126, 156, 214, 148, 252, 218, 229 },
+      {   4,  44,  66, 148, 114, 148, 200, 136, 251, 211, 228 },
+      {   1,  28,  42, 108, 104, 141, 158, 119, 235, 180, 210 }
+    }, { /* Coeff Band 3 */
+      {   1, 114, 172, 227, 166, 177, 236, 178, 252, 226, 233 },
+      {  41,  94, 152, 218, 156, 172, 233, 172, 251, 223, 231 },
+      {   9,  69, 116, 202, 142, 165, 226, 162, 251, 221, 227 },
+      {   1,  36,  60, 151, 113, 148, 195, 140, 241, 198, 211 }
+    }, { /* Coeff Band 4 */
+      {   1, 186, 200, 227, 174, 178, 230, 169, 248, 210, 238 },
+      {  27, 148, 181, 221, 167, 176, 226, 166, 250, 218, 228 },
+      {   3,  96, 139, 208, 154, 170, 219, 161, 249, 214, 229 },
+      {   1,  44,  70, 156, 120, 152, 188, 139, 239, 193, 200 }
+    }, { /* Coeff Band 5 */
+      {   1, 169, 203, 238, 186, 186, 238, 184, 252, 224, 230 },
+      {  32, 119, 173, 232, 172, 181, 236, 182, 252, 222, 237 },
+      {   6,  84, 128, 215, 150, 170, 232, 172, 251, 221, 235 },
+      {   1,  49,  78, 167, 124, 154, 200, 145, 243, 198, 217 }
+    }, { /* Coeff Band 6 */
+      {   1, 193, 215, 244, 197, 195, 239, 192, 249, 213, 240 },
+      {  52, 136, 193, 239, 184, 189, 237, 189, 248, 211, 226 },
+      {  13,  90, 146, 227, 162, 178, 233, 182, 248, 211, 231 },
+      {   1,  49,  79, 177, 124, 156, 201, 154, 234, 188, 212 }
+    }, { /* Coeff Band 7 */
+      {   1, 189, 238, 248, 219, 196, 232, 180, 253, 211, 255 },
+      { 104, 148, 224, 245, 211, 194, 225, 171, 251, 206, 255 },
+      {  43, 116, 190, 231, 179, 183, 217, 168, 249, 199, 255 },
+      {  13,  65,  92, 154, 131, 152, 167, 132, 238, 174, 243 }
     }
   }
 };
-
-static const vp9_prob
-default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
-                             [COEF_BANDS]
-                             [PREV_COEF_CONTEXTS]
-                             [ENTROPY_NODES] = {
-  {
-    /* block Type 0 */
-    {
-      /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES_16X16] = {
+  { /* block Type 0 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* block Type 1 */
-    {
-      /* Coeff Band 0 */
-      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},
-      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},
-      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},
-      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},
-      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},
-      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},
-      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},
-      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},
-      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+  }, { /* block Type 1 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  {
-    /* block Type 2 */
-    {
-      /* Coeff Band 0 */
-      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},
-      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},
-      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 1 */
-      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},
-      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},
-      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 2 */
-      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},
-      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},
-      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 3 */
-      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},
-      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},
-      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 4 */
-      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},
-      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},
-      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}
-    },
-    {
-      /* Coeff Band 5 */
-      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},
-      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},
-      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 6 */
-      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},
-      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},
-      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}
-    },
-    {
-      /* Coeff Band 7 */
-      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},
-      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},
-      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}
+  }, { /* block Type 2 */
+    { /* Coeff Band 0 */
+      { 223,  34, 236, 234, 193, 185, 216, 169, 239, 189, 229 },
+      { 125,  40, 195, 221, 173, 175, 209, 165, 220, 181, 196 },
+      {  41,  37, 127, 185, 145, 162, 191, 150, 227, 180, 219 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 160, 224, 239, 193, 190, 213, 178, 244, 174, 255 },
+      { 199, 154, 212, 238, 190, 190, 210, 173, 246, 183, 249 },
+      {  88, 122, 178, 234, 180, 187, 213, 174, 244, 182, 247 },
+      {  27,  69, 100, 174, 139, 165, 159, 142, 225, 157, 240 }
+    }, { /* Coeff Band 2 */
+      {   1, 118, 207, 237, 179, 185, 234, 189, 241, 194, 237 },
+      {  86, 103, 161, 227, 163, 176, 231, 183, 241, 196, 234 },
+      {  19,  69, 113, 205, 140, 166, 220, 169, 240, 188, 242 },
+      {   3,  32,  49, 106, 111, 144, 132, 121, 225, 151, 237 }
+    }, { /* Coeff Band 3 */
+      {   1, 160, 218, 245, 197, 195, 235, 189, 254, 218, 255 },
+      {  90, 127, 193, 240, 186, 189, 235, 187, 251, 217, 230 },
+      {  18,  92, 148, 229, 164, 179, 228, 180, 254, 212, 229 },
+      {   2,  50,  79, 163, 126, 156, 186, 140, 247, 191, 236 }
+    }, { /* Coeff Band 4 */
+      {   1, 196, 231, 240, 203, 191, 225, 171, 253, 214, 255 },
+      {  71, 167, 210, 234, 194, 188, 218, 165, 253, 215, 236 },
+      {  11, 119, 165, 217, 171, 177, 213, 155, 252, 209, 255 },
+      {   1,  46,  70, 145, 121, 153, 180, 131, 249, 192, 246 }
+    }, { /* Coeff Band 5 */
+      {   1, 176, 223, 242, 202, 194, 222, 169, 253, 211, 244 },
+      {  62, 131, 191, 233, 185, 186, 219, 164, 251, 211, 252 },
+      {   7,  89, 133, 207, 156, 173, 211, 157, 251, 206, 247 },
+      {   1,  36,  56, 127, 113, 147, 166, 125, 243, 183, 242 }
+    }, { /* Coeff Band 6 */
+      {   1, 203, 232, 249, 213, 202, 245, 193, 254, 237, 255 },
+      {  51, 155, 212, 245, 199, 195, 244, 192, 254, 234, 255 },
+      {   7, 101, 158, 233, 170, 181, 244, 185, 253, 242, 255 },
+      {   1,  49,  82, 185, 123, 157, 226, 156, 252, 225, 240 }
+    }, { /* Coeff Band 7 */
+      {   1, 222, 233, 252, 220, 207, 247, 206, 255, 240, 128 },
+      {  40, 159, 216, 250, 205, 201, 248, 207, 249, 219, 255 },
+      {   6, 106, 163, 240, 176, 188, 247, 198, 251, 222, 255 },
+      {   1,  51,  88, 196, 127, 159, 232, 169, 252, 214, 255 }
     }
-  },
-  { /* block Type 3 */
+  }, { /* block Type 3 */
     { /* Coeff Band 0 */
-      { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255},
-      { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255},
-      { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128},
-      { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128},
-      { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128},
-      { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128},
-      { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128},
-      { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128},
-      { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128},
-      { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128},
-      { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128},
-      { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128},
-      { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128},
-      { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128},
-      { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128},
-      { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128},
-      { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128},
-      { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128},
-      { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128},
-      { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128},
-      { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128},
-      { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
+      {  14,  78, 225, 217, 173, 181, 198, 153, 228, 185, 176 },
+      {   9,  74, 179, 191, 157, 171, 178, 143, 229, 175, 209 },
+      {   3,  48,  92, 128, 130, 155, 135, 123, 220, 155, 219 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 178, 209, 214, 173, 175, 208, 152, 252, 210, 237 },
+      { 142, 151, 193, 212, 170, 175, 209, 151, 251, 208, 237 },
+      {  38, 105, 150, 206, 159, 173, 208, 151, 250, 209, 238 },
+      {   5,  44,  61, 128, 114, 147, 167, 125, 239, 184, 217 }
+    }, { /* Coeff Band 2 */
+      {   1, 154, 195, 202, 166, 173, 184, 144, 245, 184, 236 },
+      {  49, 110, 150, 188, 155, 168, 180, 141, 244, 183, 239 },
+      {   4,  63,  90, 158, 132, 157, 171, 134, 243, 179, 239 },
+      {   1,  25,  37,  93, 104, 141, 133, 114, 231, 161, 226 }
+    }, { /* Coeff Band 3 */
+      {   1, 184, 201, 223, 173, 177, 224, 164, 253, 220, 238 },
+      {  42, 127, 170, 215, 164, 173, 223, 162, 253, 219, 233 },
+      {   4,  75, 114, 195, 142, 164, 218, 155, 253, 217, 235 },
+      {   1,  32,  50, 128, 108, 144, 180, 127, 247, 197, 219 }
+    }, { /* Coeff Band 4 */
+      {   1, 190, 207, 232, 181, 184, 228, 172, 251, 216, 212 },
+      {  35, 136, 180, 227, 173, 180, 227, 171, 251, 216, 218 },
+      {   2,  85, 131, 214, 154, 173, 224, 166, 250, 214, 225 },
+      {   1,  44,  71, 162, 120, 153, 195, 143, 240, 195, 197 }
+    }, { /* Coeff Band 5 */
+      {   1, 185, 201, 230, 177, 180, 232, 172, 253, 225, 235 },
+      {  27, 122, 165, 221, 164, 175, 230, 169, 253, 224, 220 },
+      {   1,  72, 108, 197, 139, 163, 224, 159, 253, 224, 226 },
+      {   1,  33,  51, 132, 107, 144, 186, 130, 245, 201, 206 }
+    }, { /* Coeff Band 6 */
+      {   1, 203, 214, 240, 193, 191, 235, 178, 252, 225, 224 },
+      {  20, 140, 188, 235, 182, 186, 234, 177, 252, 226, 226 },
+      {   1,  85, 132, 218, 155, 174, 230, 170, 251, 224, 227 },
+      {   1,  39,  62, 154, 114, 150, 199, 141, 241, 203, 214 }
+    }, { /* Coeff Band 7 */
+      {   1, 217, 224, 244, 202, 193, 241, 187, 252, 227, 239 },
+      {  22, 151, 200, 239, 187, 188, 240, 184, 252, 226, 237 },
+      {   2,  90, 138, 222, 158, 174, 237, 176, 252, 226, 239 },
+      {   1,  41,  66, 163, 116, 151, 206, 146, 243, 201, 230 }
     }
   }
 };
-
-static const vp9_prob
-  default_coef_probs_16x16[BLOCK_TYPES_16X16]
-                          [COEF_BANDS]
-                          [PREV_COEF_CONTEXTS]
-                          [ENTROPY_NODES] = {
+static const vp9_coeff_probs default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] = {
   { /* block Type 0 */
     { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  { /* block Type 1 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 2 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 3 */
+  }, { /* block Type 1 */
     { /* Coeff Band 0 */
-      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
-      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
-      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
-      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
-      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
-      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
-    },
-    { /* Coeff Band 2 */
-      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
-      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
-      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
-      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
-    },
-    { /* Coeff Band 3 */
-      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
-      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
-      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
-      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
-    },
-    { /* Coeff Band 4 */
-      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
-      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
-      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
-      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
-      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
-      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
-      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
-      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
-      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
-      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
-    },
-    { /* Coeff Band 7 */
-      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
-      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
-      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
-      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
+  }, { /* block Type 2 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  }, { /* block Type 3 */
+    { /* Coeff Band 0 */
+      {   3,  29,  86, 140, 130, 163, 135, 131, 190, 148, 186 },
+      {   1,  26,  61, 105, 124, 156, 105, 119, 178, 138, 173 },
+      {   1,  15,  28,  60, 105, 142,  80, 105, 173, 128, 178 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 130, 142, 172, 141, 161, 191, 140, 244, 193, 216 },
+      {  61, 124, 141, 173, 141, 161, 190, 139, 244, 194, 215 },
+      {  28, 103, 124, 171, 138, 160, 190, 140, 243, 194, 225 },
+      {   1,  36,  51, 111, 109, 144, 152, 120, 227, 173, 205 }
+    }, { /* Coeff Band 2 */
+      {   1,  60, 125, 153, 143, 159, 156, 127, 234, 170, 233 },
+      {  22,  48,  78, 129, 124, 152, 151, 123, 234, 170, 233 },
+      {   3,  32,  46,  98, 107, 142, 138, 114, 232, 165, 232 },
+      {   1,  15,  23,  61,  96, 135, 101, 103, 210, 144, 213 }
+    }, { /* Coeff Band 3 */
+      {   1, 102, 144, 182, 146, 162, 194, 143, 246, 196, 239 },
+      {  34,  76, 116, 171, 136, 159, 192, 140, 246, 195, 239 },
+      {   4,  51,  81, 153, 124, 153, 184, 135, 246, 192, 239 },
+      {   1,  23,  37,  98, 102, 140, 142, 116, 230, 167, 227 }
+    }, { /* Coeff Band 4 */
+      {   1, 165, 171, 214, 163, 174, 214, 160, 245, 203, 219 },
+      {  16, 120, 154, 210, 158, 172, 212, 159, 245, 201, 219 },
+      {   1,  80, 122, 199, 147, 167, 208, 154, 244, 200, 223 },
+      {   1,  40,  65, 145, 118, 151, 171, 135, 226, 175, 202 }
+    }, { /* Coeff Band 5 */
+      {   1, 146, 162, 215, 159, 172, 226, 165, 251, 218, 231 },
+      {  16,  92, 131, 205, 147, 167, 224, 162, 252, 217, 228 },
+      {   2,  60,  92, 182, 129, 158, 216, 152, 251, 214, 234 },
+      {   1,  32,  50, 126, 107, 144, 176, 128, 240, 189, 216 }
+    }, { /* Coeff Band 6 */
+      {   1, 178, 186, 224, 172, 178, 224, 167, 251, 214, 232 },
+      {  14, 118, 158, 215, 160, 173, 223, 164, 250, 214, 228 },
+      {   2,  70, 109, 194, 139, 164, 217, 156, 250, 213, 227 },
+      {   1,  32,  51, 129, 108, 146, 175, 128, 240, 187, 218 }
+    }, { /* Coeff Band 7 */
+      {   1, 210, 214, 240, 192, 188, 235, 182, 251, 221, 228 },
+      {  22, 140, 187, 233, 177, 183, 234, 178, 251, 219, 233 },
+      {   3,  82, 130, 215, 152, 171, 229, 171, 250, 217, 232 },
+      {   1,  38,  63, 154, 115, 149, 195, 141, 240, 196, 219 }
+    }
   }
 };
-
-static const vp9_prob
-  default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
-                                 [COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS]
-                                 [ENTROPY_NODES] = {
+static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES_32X32] = {
   { /* block Type 0 */
     { /* Coeff Band 0 */
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},
-      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},
-      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}
-    },
-    { /* Coeff Band 2 */
-      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},
-      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},
-      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 3 */
-      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},
-      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},
-      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}
-    },
-    { /* Coeff Band 4 */
-      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},
-      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},
-      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},
-      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},
-      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},
-      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}
-    },
-    { /* Coeff Band 7 */
-      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},
-      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},
-      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
     }
-  },
-  { /* block Type 1 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 2 */
-      { /* Coeff Band 0 */
-        { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128},
-        { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255},
-        { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255},
-        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-      },
-      { /* Coeff Band 1 */
-        { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128},
-        { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128},
-        { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128},
-        { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128}
-      },
-      { /* Coeff Band 2 */
-        { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128},
-        { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128},
-        { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128},
-        { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128}
-      },
-      { /* Coeff Band 3 */
-        { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128},
-        { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128},
-        { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128},
-        { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128}
-      },
-      { /* Coeff Band 4 */
-        { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128},
-        { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128},
-        { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128},
-        { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128}
-      },
-      { /* Coeff Band 5 */
-        { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128},
-        { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128},
-        { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128},
-        { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128}
-      },
-      { /* Coeff Band 6 */
-        { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128},
-        { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128},
-        { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128},
-        { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128}
-      },
-      { /* Coeff Band 7 */
-        { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128},
-        { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128},
-        { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128},
-        { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128}
-      }
-  },
-  { /* block Type 3 */
+  }, { /* block Type 1 */
     { /* Coeff Band 0 */
-      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},
-      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},
-      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},
-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}
-    },
-    { /* Coeff Band 1 */
-      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},
-      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},
-      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},
-      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}
-    },
-    { /* Coeff Band 2 */
-      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},
-      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},
-      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},
-      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}
-    },
-    { /* Coeff Band 3 */
-      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},
-      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},
-      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},
-      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}
-    },
-    { /* Coeff Band 4 */
-      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},
-      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},
-      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},
-      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}
-    },
-    { /* Coeff Band 5 */
-      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},
-      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},
-      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},
-      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}
-    },
-    { /* Coeff Band 6 */
-      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},
-      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},
-      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},
-      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}
-    },
-    { /* Coeff Band 7 */
-      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},
-      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},
-      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},
-      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  }, { /* block Type 2 */
+    { /* Coeff Band 0 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 2 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 3 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 4 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 5 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 6 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 7 */
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }
+  }, { /* block Type 3 */
+    { /* Coeff Band 0 */
+      {   8,  40, 224, 217, 183, 181, 180, 148, 200, 180, 123 },
+      {   6,  37, 178, 193, 173, 171, 160, 139, 205, 166, 173 },
+      {   3,  27,  93, 133, 143, 159, 115, 125, 183, 141, 178 },
+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+    }, { /* Coeff Band 1 */
+      {   1, 170, 209, 202, 172, 175, 179, 143, 238, 181, 214 },
+      { 184, 164, 199, 199, 169, 173, 180, 143, 238, 184, 217 },
+      {  99, 128, 165, 194, 161, 171, 180, 142, 239, 182, 219 },
+      {  17,  49,  59, 102, 117, 148, 122, 116, 208, 152, 191 }
+    }, { /* Coeff Band 2 */
+      {   1, 136, 200, 197, 172, 172, 168, 142, 226, 170, 216 },
+      {  66, 104, 146, 175, 152, 165, 163, 139, 225, 170, 219 },
+      {  11,  52,  83, 144, 130, 156, 151, 130, 222, 165, 216 },
+      {   1,  16,  25,  65,  99, 137,  96, 106, 190, 138, 184 }
+    }, { /* Coeff Band 3 */
+      {   1, 180, 203, 198, 166, 170, 190, 143, 241, 190, 227 },
+      {  74, 125, 161, 187, 154, 165, 187, 142, 241, 189, 224 },
+      {  15,  70,  98, 163, 133, 157, 182, 137, 241, 187, 226 },
+      {   1,  25,  37,  89, 104, 140, 128, 113, 218, 158, 206 }
+    }, { /* Coeff Band 4 */
+      {   1, 191, 208, 213, 169, 173, 212, 156, 246, 206, 217 },
+      {  53, 136, 170, 205, 159, 170, 211, 156, 246, 205, 208 },
+      {   3,  75, 112, 189, 140, 163, 209, 151, 246, 205, 215 },
+      {   1,  32,  51, 127, 108, 145, 171, 128, 231, 183, 197 }
+    }, { /* Coeff Band 5 */
+      {   1, 183, 195, 202, 161, 168, 206, 150, 247, 202, 229 },
+      {  42, 113, 144, 190, 147, 163, 203, 148, 247, 202, 229 },
+      {   2,  56,  82, 160, 124, 153, 195, 140, 246, 200, 229 },
+      {   1,  22,  34,  93,  99, 138, 143, 115, 227, 170, 206 }
+    }, { /* Coeff Band 6 */
+      {   1, 202, 193, 221, 168, 175, 227, 167, 251, 217, 236 },
+      {  26, 122, 158, 213, 157, 171, 225, 165, 251, 216, 242 },
+      {   1,  68, 105, 194, 136, 162, 221, 158, 251, 215, 239 },
+      {   1,  32,  51, 131, 107, 145, 179, 130, 240, 188, 231 }
+    }, { /* Coeff Band 7 */
+      {   1, 234, 212, 243, 195, 192, 240, 187, 253, 226, 227 },
+      {  14, 141, 186, 237, 181, 186, 239, 184, 253, 226, 233 },
+      {   1,  85, 132, 221, 155, 174, 235, 176, 253, 224, 226 },
+      {   1,  39,  65, 159, 115, 150, 202, 144, 245, 202, 214 }
     }
   }
 };
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -17,20 +17,12 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vpx_mem/vpx_mem.h"
-
-#define uchar unsigned char     /* typedefs can clash */
-#define uint  unsigned int
-
-typedef const uchar cuchar;
-typedef const uint cuint;
-
-typedef vp9_prob Prob;
-
+#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_coefupdateprobs.h"
 
 const int vp9_i8x8_block[4] = {0, 2, 8, 10};
 
-DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
+DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -49,15 +41,15 @@
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]) = {
   0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
 };
 
-DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
+DECLARE_ALIGNED(16, const uint8_t, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
   0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
 };
 
-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {
   0,  1,  4,  8,
   5,  2,  3,  6,
   9, 12, 13, 10,
@@ -64,13 +56,14 @@
   7, 11, 14, 15,
 };
 
-DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
+DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = {
   0, 4,  8, 12,
   1, 5,  9, 13,
   2, 6, 10, 14,
   3, 7, 11, 15
 };
-DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
+
+DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {
   0,   1,  2,  3,
   4,   5,  6,  7,
   8,   9, 10, 11,
@@ -77,16 +70,17 @@
   12, 13, 14, 15
 };
 
+DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
+  5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7
+};
 
-DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5,
-                                                           5, 3, 6, 3, 5, 4, 6, 6,
-                                                           6, 5, 5, 6, 6, 6, 6, 6,
-                                                           6, 6, 6, 6, 6, 6, 6, 6,
-                                                           6, 6, 6, 6, 7, 7, 7, 7,
-                                                           7, 7, 7, 7, 7, 7, 7, 7,
-                                                           7, 7, 7, 7, 7, 7, 7, 7,
-                                                           7, 7, 7, 7, 7, 7, 7, 7
-                                                         };
 DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
   0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
@@ -96,43 +90,783 @@
 
 // Table can be optimized.
 DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
-    0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
-    6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-    6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
 };
+
 DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
-      0,   1,  16,  32,  17,   2,   3,  18,  33,  48,  64,  49,  34,  19,   4,   5,
-     20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,   6,   7,  22,  37,  52,
-     67,  82,  97, 112, 128, 113,  98,  83,  68,  53,  38,  23,   8,   9,  24,  39,
-     54,  69,  84,  99, 114, 129, 144, 160, 145, 130, 115, 100,  85,  70,  55,  40,
-     25,  10,  11,  26,  41,  56,  71,  86, 101, 116, 131, 146, 161, 176, 192, 177,
-    162, 147, 132, 117, 102,  87,  72,  57,  42,  27,  12,  13,  28,  43,  58,  73,
-     88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
-    119, 104,  89,  74,  59,  44,  29,  14,  15,  30,  45,  60,  75,  90, 105, 120,
-    135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
-    121, 106,  91,  76,  61,  46,  31,  47,  62,  77,  92, 107, 122, 137, 152, 167,
-    182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108,  93,
-     78,  63,  79,  94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
-    215, 200, 185, 170, 155, 140, 125, 110,  95, 111, 126, 141, 156, 171, 186, 201,
-    216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
-    203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
-    250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
+  0,   1,  16,  32,  17,   2,   3,  18,
+  33,  48,  64,  49,  34,  19,   4,   5,
+  20,  35,  50,  65,  80,  96,  81,  66,
+  51,  36,  21,   6,   7,  22,  37,  52,
+  67,  82,  97, 112, 128, 113,  98,  83,
+  68,  53,  38,  23,   8,   9,  24,  39,
+  54,  69,  84,  99, 114, 129, 144, 160,
+  145, 130, 115, 100,  85,  70,  55,  40,
+  25,  10,  11,  26,  41,  56,  71,  86,
+  101, 116, 131, 146, 161, 176, 192, 177,
+  162, 147, 132, 117, 102,  87,  72,  57,
+  42,  27,  12,  13,  28,  43,  58, 73,
+  88, 103, 118, 133, 148, 163, 178, 193,
+  208, 224, 209, 194, 179, 164, 149, 134,
+  119, 104,  89,  74,  59,  44,  29,  14,
+  15,  30, 45,  60,  75,  90, 105, 120,
+  135, 150, 165, 180, 195, 210, 225, 240,
+  241, 226, 211, 196, 181, 166, 151, 136,
+  121, 106,  91,  76,  61,  46,  31,  47,
+  62,  77, 92, 107, 122, 137, 152, 167,
+  182, 197, 212, 227, 242, 243, 228, 213,
+  198, 183, 168, 153, 138, 123, 108, 93,
+  78,  63,  79,  94, 109, 124, 139, 154,
+  169, 184, 199, 214, 229, 244, 245, 230,
+  215, 200, 185, 170, 155, 140, 125, 110,
+  95, 111, 126, 141, 156, 171, 186, 201,
+  216, 231, 246, 247, 232, 217, 202, 187,
+  172, 157, 142, 127, 143, 158, 173, 188,
+  203, 218, 233, 248, 249, 234, 219, 204,
+  189, 174, 159, 175, 190, 205, 220, 235,
+  250, 251, 236, 221, 206, 191, 207, 222,
+  237, 252, 253, 238, 223, 239, 254, 255,
 };
 
+#if CONFIG_DWTDCTHYBRID
 
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
+  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+  0,    1,   32,   64,   33,    2,    3,   34,
+  65,   96, 128,   97,   66,   35,    4,  5,
+  36,   67,   98,  129,  160,  192,  161,  130,
+  99,   68,   37,    6,    7,   38,   69,  100,
+  131,  162,  193,  224, 256,  225,  194,  163,
+  132,  101,   70,   39,    8,    9,   40,   71,
+  102,  133,  164,  195,  226,  257,  288,  320,
+  289,  258,  227,  196,  165,  134,  103,   72,
+  41,   10,   11,   42,   73,  104,  135,  166,
+  197,  228,  259,  290,  321,  352,  384,  353,
+  322,  291,  260,  229,  198,  167,  136,  105,
+  74,   43,   12,   13,   44,   75,  106,  137,
+  168,  199,  230,  261,  292,  323,  354,  385,
+  416,  448,  417,  386,  355,  324,  293,  262,
+  231,  200,  169,  138,  107,   76,   45,   14,
+  15,   46,   77,  108,  139,  170,  201,  232,
+  263,  294,  325,  356,  387,  418,  449,  480,
+  481,  450,  419,  388,  357,  326,  295,  264,
+  233,  202,  171,  140,  109,   78,   47,   79,
+  110,  141,  172,  203,  234,  265,  296,  327,
+  358,  389,  420,  451,  482,  483,  452,  421,
+  390,  359,  328,  297,  266,  235,  204,  173,
+  142,  111,  143,  174,  205,  236,  267,  298,
+  329,  360,  391,  422,  453,  484,  485,  454,
+  423,  392,  361,  330,  299,  268,  237,  206,
+  175,  207,  238,  269,  300,  331,  362,  393,
+  424,  455,  486,  487,  456,  425,  394,  363,
+  332,  301,  270,  239,  271,  302,  333,  364,
+  395,  426,  457,  488,  489,  458,  427,  396,
+  365,  334,  303,  335,  366,  397,  428,  459,
+  490,  491,  460,  429,  398,  367,  399,  430,
+  461,  492,  493,  462,  431,  463,  494,  495,
+
+  16,   512,  528, 17,  513,  529,   48,  544,
+  560, 80,  576,  592,   49,  545,  561,   18,
+  514,  530,   19,  515,  531,   50,  546,  562,
+  81,  577,  593,  112,  608,  624,  144,  640,
+  656,  113,  609,  625,   82,  578,  594,   51,
+  547,  563,   20,  516,  532,   21,  517,  533,
+  52,  548,  564,   83,  579,  595,  114,  610,
+  626,  145,  641,  657,  176,  672,  688,  208,
+  704,  720,  177,  673,  689,  146,  642,  658,
+  115,  611,  627,   84,  580,  596,   53,  549,
+  565,   22,  518,  534,   23,  519,  535,   54,
+  550,  566,   85,  581,  597,  116,  612,  628,
+  147,  643,  659,  178,  674,  690,  209,  705,
+  721,  240,  736,  752,  272,  768,  784,  241,
+  737,  753,  210,  706,  722,  179,  675,  691,
+  148,  644,  660,  117,  613,  629,   86,  582,
+  598,   55,  551,  567,   24,  520,  536,   25,
+  521,  537,   56,  552,  568,   87,  583,  599,
+  118,  614,  630,  149,  645,  661,  180,  676,
+  692,  211,  707,  723,  242,  738,  754,  273,
+  769,  785,  304,  800,  816,  336,  832,  848,
+  305,  801,  817,  274,  770,  786,  243,  739,
+  755,  212,  708,  724,  181,  677,  693,  150,
+  646,  662,  119,  615,  631,   88,  584,  600,
+  57,  553,  569,   26,  522,  538,   27,  523,
+  539,   58,  554,  570,   89,  585,  601,  120,
+  616,  632,  151,  647,  663,  182,  678,  694,
+  213,  709,  725,  244,  740,  756,  275,  771,
+  787,  306,  802,  818,  337,  833,  849,  368,
+  864,  880,  400,  896,  912,  369,  865,  881,
+  338,  834,  850,  307,  803,  819,  276,  772,
+  788,  245,  741,  757,  214,  710,  726,  183,
+
+  679,  695,  152,  648,  664,  121,  617,  633,
+  90,  586,  602,   59,  555,  571,   28,  524,
+  540,   29,  525,  541,   60,  556,  572,   91,
+  587,  603,  122,  618,  634,  153,  649,  665,
+  184,  680,  696,  215,  711,  727,  246,  742,
+  758,  277,  773,  789,  308,  804,  820,  339,
+  835,  851,  370,  866,  882,  401,  897,  913,
+  432,  928,  944,  464,  960,  976,  433,  929,
+  945,  402,  898,  914,  371,  867,  883,  340,
+  836,  852,  309,  805,  821,  278,  774,  790,
+  247,  743,  759,  216,  712,  728,  185,  681,
+  697,  154,  650,  666,  123,  619,  635,   92,
+  588,  604,   61,  557,  573,   30,  526,  542,
+  31,  527,  543,   62,  558,  574,   93,  589,
+  605,  124,  620,  636,  155,  651,  667,  186,
+  682,  698,  217,  713,  729,  248,  744,  760,
+  279,  775,  791,  310,  806,  822,  341,  837,
+  853,  372,  868,  884,  403,  899,  915,  434,
+  930,  946,  465,  961,  977,  496,  992, 1008,
+  497,  993, 1009,  466,  962,  978,  435,  931,
+  947,  404,  900,  916,  373,  869,  885,  342,
+  838,  854,  311,  807,  823,  280,  776,  792,
+  249,  745,  761,  218,  714,  730,  187,  683,
+  699,  156,  652,  668,  125,  621,  637,   94,
+  590,  606,   63,  559,  575,   95,  591,  607,
+  126,  622,  638,  157,  653,  669,  188,  684,
+  700,  219,  715,  731,  250,  746,  762,  281,
+  777,  793,  312,  808,  824,  343,  839,  855,
+  374,  870,  886,  405,  901,  917,  436,  932,
+  948,  467,  963,  979,  498,  994, 1010,  499,
+  995, 1011,  468,  964,  980,  437,  933,  949,
+  406,  902,  918,  375,  871,  887,  344,  840,
+
+  856,  313,  809,  825,  282,  778,  794,  251,
+  747,  763,  220,  716,  732,  189,  685,  701,
+  158,  654,  670,  127,  623,  639,  159,  655,
+  671,  190,  686,  702,  221,  717,  733,  252,
+  748,  764,  283,  779,  795,  314,  810,  826,
+  345,  841,  857,  376,  872,  888,  407,  903,
+  919,  438,  934,  950,  469,  965,  981,  500,
+  996, 1012,  501,  997, 1013,  470,  966,  982,
+  439,  935,  951,  408,  904,  920,  377,  873,
+  889,  346,  842,  858,  315,  811,  827,  284,
+  780,  796,  253,  749,  765,  222,  718,  734,
+  191,  687,  703,  223,  719,  735,  254,  750,
+  766,  285,  781,  797,  316,  812,  828,  347,
+  843,  859,  378,  874,  890,  409,  905,  921,
+  440,  936,  952,  471,  967,  983,  502,  998,
+  1014,  503,  999, 1015,  472,  968,  984,  441,
+  937,  953,  410,  906,  922,  379,  875,  891,
+  348,  844,  860,  317,  813,  829,  286,  782,
+  798,  255,  751,  767,  287,  783,  799,  318,
+  814,  830,  349,  845,  861,  380,  876,  892,
+  411,  907,  923,  442,  938,  954,  473,  969,
+  985,  504, 1000, 1016,  505, 1001, 1017,  474,
+  970,  986,  443,  939,  955,  412,  908,  924,
+  381,  877,  893,  350,  846,  862,  319,  815,
+  831,  351,  847,  863,  382,  878,  894,  413,
+  909,  925,  444,  940,  956,  475,  971,  987,
+  506, 1002, 1018,  507, 1003, 1019,  476,  972,
+  988,  445,  941,  957,  414,  910,  926,  383,
+  879,  895,  415,  911,  927,  446,  942,  958,
+  477,  973,  989,  508, 1004, 1020,  509, 1005,
+  1021,  478,  974,  990,  447,  943,  959,  479,
+  975,  991,  510, 1006, 1022,  511, 1007, 1023,
+};
+
+#elif DWTDCT_TYPE == DWTDCT16X16
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
+  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6,
+  6, 6, 6,
+  6,
+  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+  0,    1,   32,   64,   33,    2,    3,   34,
+  65,   96, 128,   97,   66,   35,    4,
+  16,   512,  528,
+  5,
+  36,   67,   98,  129,  160,  192,  161,  130,
+  99,   68,   37,    6,    7,   38,   69,  100,
+  131,  162,  193,  224, 256,  225,  194,  163,
+  132,  101,   70,   39,    8,    9,   40,   71,
+  102,  133,  164,  195,  226,  257,  288,  320,
+  289,  258,  227,  196,  165,  134,  103,   72,
+  41,   10,   11,   42,   73,  104,  135,  166,
+  197,  228,  259,  290,  321,  352,  384,  353,
+  322,  291,  260,  229,  198,  167,  136,  105,
+  74,   43,   12,   13,   44,   75,  106,  137,
+  168,  199,  230,  261,  292,  323,  354,  385,
+  416,  448,  417,  386,  355,  324,  293,  262,
+  231,  200,  169,  138,  107,   76,   45,   14,
+  15,   46,   77,  108,  139,  170,  201,  232,
+  263,  294,  325,  356,  387,  418,  449,  480,
+  481,  450,  419,  388,  357,  326,  295,  264,
+  233,  202,  171,  140,  109,   78,   47,   79,
+  110,  141,  172,  203,  234,  265,  296,  327,
+  358,  389,  420,  451,  482,  483,  452,  421,
+  390,  359,  328,  297,  266,  235,  204,  173,
+  142,  111,  143,  174,  205,  236,  267,  298,
+  329,  360,  391,  422,  453,  484,  485,  454,
+  423,  392,  361,  330,  299,  268,  237,  206,
+  175,  207,  238,  269,  300,  331,  362,  393,
+  424,  455,  486,  487,  456,  425,  394,  363,
+  332,  301,  270,  239,  271,  302,  333,  364,
+  395,  426,  457,  488,  489,  458,  427,  396,
+  365,  334,  303,  335,  366,  397,  428,  459,
+  490,  491,  460,  429,  398,  367,  399,  430,
+  461,  492,  493,  462,  431,  463,  494,  495,
+
+  17,  513,  529,   48,  544,
+  560, 80,  576,  592,   49,  545,  561,   18,
+  514,  530,   19,  515,  531,   50,  546,  562,
+  81,  577,  593,  112,  608,  624,  144,  640,
+  656,  113,  609,  625,   82,  578,  594,   51,
+  547,  563,   20,  516,  532,   21,  517,  533,
+  52,  548,  564,   83,  579,  595,  114,  610,
+  626,  145,  641,  657,  176,  672,  688,  208,
+  704,  720,  177,  673,  689,  146,  642,  658,
+  115,  611,  627,   84,  580,  596,   53,  549,
+  565,   22,  518,  534,   23,  519,  535,   54,
+  550,  566,   85,  581,  597,  116,  612,  628,
+  147,  643,  659,  178,  674,  690,  209,  705,
+  721,  240,  736,  752,  272,  768,  784,  241,
+  737,  753,  210,  706,  722,  179,  675,  691,
+  148,  644,  660,  117,  613,  629,   86,  582,
+  598,   55,  551,  567,   24,  520,  536,   25,
+  521,  537,   56,  552,  568,   87,  583,  599,
+  118,  614,  630,  149,  645,  661,  180,  676,
+  692,  211,  707,  723,  242,  738,  754,  273,
+  769,  785,  304,  800,  816,  336,  832,  848,
+  305,  801,  817,  274,  770,  786,  243,  739,
+  755,  212,  708,  724,  181,  677,  693,  150,
+  646,  662,  119,  615,  631,   88,  584,  600,
+  57,  553,  569,   26,  522,  538,   27,  523,
+  539,   58,  554,  570,   89,  585,  601,  120,
+  616,  632,  151,  647,  663,  182,  678,  694,
+  213,  709,  725,  244,  740,  756,  275,  771,
+  787,  306,  802,  818,  337,  833,  849,  368,
+  864,  880,  400,  896,  912,  369,  865,  881,
+  338,  834,  850,  307,  803,  819,  276,  772,
+  788,  245,  741,  757,  214,  710,  726,  183,
+
+  679,  695,  152,  648,  664,  121,  617,  633,
+  90,  586,  602,   59,  555,  571,   28,  524,
+  540,   29,  525,  541,   60,  556,  572,   91,
+  587,  603,  122,  618,  634,  153,  649,  665,
+  184,  680,  696,  215,  711,  727,  246,  742,
+  758,  277,  773,  789,  308,  804,  820,  339,
+  835,  851,  370,  866,  882,  401,  897,  913,
+  432,  928,  944,  464,  960,  976,  433,  929,
+  945,  402,  898,  914,  371,  867,  883,  340,
+  836,  852,  309,  805,  821,  278,  774,  790,
+  247,  743,  759,  216,  712,  728,  185,  681,
+  697,  154,  650,  666,  123,  619,  635,   92,
+  588,  604,   61,  557,  573,   30,  526,  542,
+  31,  527,  543,   62,  558,  574,   93,  589,
+  605,  124,  620,  636,  155,  651,  667,  186,
+  682,  698,  217,  713,  729,  248,  744,  760,
+  279,  775,  791,  310,  806,  822,  341,  837,
+  853,  372,  868,  884,  403,  899,  915,  434,
+  930,  946,  465,  961,  977,  496,  992, 1008,
+  497,  993, 1009,  466,  962,  978,  435,  931,
+  947,  404,  900,  916,  373,  869,  885,  342,
+  838,  854,  311,  807,  823,  280,  776,  792,
+  249,  745,  761,  218,  714,  730,  187,  683,
+  699,  156,  652,  668,  125,  621,  637,   94,
+  590,  606,   63,  559,  575,   95,  591,  607,
+  126,  622,  638,  157,  653,  669,  188,  684,
+  700,  219,  715,  731,  250,  746,  762,  281,
+  777,  793,  312,  808,  824,  343,  839,  855,
+  374,  870,  886,  405,  901,  917,  436,  932,
+  948,  467,  963,  979,  498,  994, 1010,  499,
+  995, 1011,  468,  964,  980,  437,  933,  949,
+  406,  902,  918,  375,  871,  887,  344,  840,
+
+  856,  313,  809,  825,  282,  778,  794,  251,
+  747,  763,  220,  716,  732,  189,  685,  701,
+  158,  654,  670,  127,  623,  639,  159,  655,
+  671,  190,  686,  702,  221,  717,  733,  252,
+  748,  764,  283,  779,  795,  314,  810,  826,
+  345,  841,  857,  376,  872,  888,  407,  903,
+  919,  438,  934,  950,  469,  965,  981,  500,
+  996, 1012,  501,  997, 1013,  470,  966,  982,
+  439,  935,  951,  408,  904,  920,  377,  873,
+  889,  346,  842,  858,  315,  811,  827,  284,
+  780,  796,  253,  749,  765,  222,  718,  734,
+  191,  687,  703,  223,  719,  735,  254,  750,
+  766,  285,  781,  797,  316,  812,  828,  347,
+  843,  859,  378,  874,  890,  409,  905,  921,
+  440,  936,  952,  471,  967,  983,  502,  998,
+  1014,  503,  999, 1015,  472,  968,  984,  441,
+  937,  953,  410,  906,  922,  379,  875,  891,
+  348,  844,  860,  317,  813,  829,  286,  782,
+  798,  255,  751,  767,  287,  783,  799,  318,
+  814,  830,  349,  845,  861,  380,  876,  892,
+  411,  907,  923,  442,  938,  954,  473,  969,
+  985,  504, 1000, 1016,  505, 1001, 1017,  474,
+  970,  986,  443,  939,  955,  412,  908,  924,
+  381,  877,  893,  350,  846,  862,  319,  815,
+  831,  351,  847,  863,  382,  878,  894,  413,
+  909,  925,  444,  940,  956,  475,  971,  987,
+  506, 1002, 1018,  507, 1003, 1019,  476,  972,
+  988,  445,  941,  957,  414,  910,  926,  383,
+  879,  895,  415,  911,  927,  446,  942,  958,
+  477,  973,  989,  508, 1004, 1020,  509, 1005,
+  1021,  478,  974,  990,  447,  943,  959,  479,
+  975,  991,  510, 1006, 1022,  511, 1007, 1023,
+};
+
+#elif DWTDCT_TYPE == DWTDCT8X8
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
+  5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+
+  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+  0,    1,   32,   64,   33,    2,    3,   34,
+  65,   96,  128,   97,   66,   35,    4,    5,
+  36,   67,   98,  129,  160,  192,  161,  130,
+  99,   68,   37,    6,    7,   38,   69,  100,
+  131,  162,  193,  224,  225,  194,  163,  132,
+  101,   70,   39,   71,  102,  133,  164,  195,
+  226,  227,  196,  165,  134,  103,  135,  166,
+  197,  228,  229,  198,  167,  199,  230,  231,
+
+  8,  256,  264,    9,  257,  265,   40,  288, 296, 72,  320,  328,
+  41,  289,  297,   10, 258,  266, 11,  259,  267,   42,  290,  298,
+  73,  321,  329,  104,  352,  360,  136,  384, 392,  105,  353,  361,
+  74,  322,  330,   43, 291,  299,   12,  260,  268,   13,  261,  269,
+  44,  292,  300,   75,  323,  331,  106,  354, 362,  137,  385,  393,
+  168,  416,  424,  200, 448,  456,  169,  417,  425,  138,  386,  394,
+  107,  355,  363,   76,  324,  332,   45,  293, 301,   14,  262,  270,
+  15,  263,  271,   46, 294,  302,   77,  325,  333,  108,  356,  364,
+  139,  387,  395,  170, 418,  426,  201,  449, 457,  232,  480,  488,
+  233,  481,  489,  202, 450,  458,  171,  419,  427,  140,  388,  396,
+  109,  357,  365,   78,  326,  334,   47,  295, 303,   79,  327,  335,
+  110,  358,  366,  141, 389,  397,  172,  420,  428,  203,  451,  459,
+  234,  482,  490,  235,  483,  491,  204,  452, 460,  173,  421,  429,
+  142,  390,  398,  111, 359,  367,  143,  391,  399,  174,  422,  430,
+  205,  453,  461,  236,  484,  492,  237,  485, 493,  206,  454,  462,
+  175,  423,  431,  207, 455,  463,  238,  486,  494,  239,  487,  495,
+
+  16,  512,  528,   17,  513,  529,   18,  514,
+  530,   19,  515,  531,   20,  516,  532,   21,
+  517,  533,   22,  518,  534,   23,  519,  535,
+  24,  520,  536,   25,  521,  537,   26,  522,
+  538,   27,  523,  539,   28,  524,  540,   29,
+  525,  541,   30,  526,  542,   31,  527,  543,
+  48,  544,  560,   49,  545,  561,   50,  546,
+  562,   51,  547,  563,   52,  548,  564,   53,
+  549,  565,   54,  550,  566,   55,  551,  567,
+  56,  552,  568,   57,  553,  569,   58,  554,
+  570,   59,  555,  571,   60,  556,  572,   61,
+  557,  573,   62,  558,  574,   63,  559,  575,
+  80,  576,  592,   81,  577,  593,   82,  578,
+  594,   83,  579,  595,   84,  580,  596,   85,
+  581,  597,   86,  582,  598,   87,  583,  599,
+  88,  584,  600,   89,  585,  601,   90,  586,
+  602,   91,  587,  603,   92,  588,  604,   93,
+  589,  605,   94,  590,  606,   95,  591,  607,
+  112,  608,  624,  113,  609,  625,  114,  610,
+  626,  115,  611,  627,  116,  612,  628,  117,
+  613,  629,  118,  614,  630,  119,  615,  631,
+  120,  616,  632,  121,  617,  633,  122,  618,
+  634,  123,  619,  635,  124,  620,  636,  125,
+  621,  637,  126,  622,  638,  127,  623,  639,
+  144,  640,  656,  145,  641,  657,  146,  642,
+  658,  147,  643,  659,  148,  644,  660,  149,
+  645,  661,  150,  646,  662,  151,  647,  663,
+  152,  648,  664,  153,  649,  665,  154,  650,
+  666,  155,  651,  667,  156,  652,  668,  157,
+  653,  669,  158,  654,  670,  159,  655,  671,
+  176,  672,  688,  177,  673,  689,  178,  674,
+  690,  179,  675,  691,  180,  676,  692,  181,
+  677,  693,  182,  678,  694,  183,  679,  695,
+  184,  680,  696,  185,  681,  697,  186,  682,
+  698,  187,  683,  699,  188,  684,  700,  189,
+  685,  701,  190,  686,  702,  191,  687,  703,
+  208,  704,  720,  209,  705,  721,  210,  706,
+  722,  211,  707,  723,  212,  708,  724,  213,
+  709,  725,  214,  710,  726,  215,  711,  727,
+  216,  712,  728,  217,  713,  729,  218,  714,
+  730,  219,  715,  731,  220,  716,  732,  221,
+  717,  733,  222,  718,  734,  223,  719,  735,
+  240,  736,  752,  241,  737,  753,  242,  738,
+  754,  243,  739,  755,  244,  740,  756,  245,
+  741,  757,  246,  742,  758,  247,  743,  759,
+  248,  744,  760,  249,  745,  761,  250,  746,
+  762,  251,  747,  763,  252,  748,  764,  253,
+  749,  765,  254,  750,  766,  255,  751,  767,
+  272,  768,  784,  273,  769,  785,  274,  770,
+  786,  275,  771,  787,  276,  772,  788,  277,
+  773,  789,  278,  774,  790,  279,  775,  791,
+  280,  776,  792,  281,  777,  793,  282,  778,
+  794,  283,  779,  795,  284,  780,  796,  285,
+  781,  797,  286,  782,  798,  287,  783,  799,
+  304,  800,  816,  305,  801,  817,  306,  802,
+  818,  307,  803,  819,  308,  804,  820,  309,
+  805,  821,  310,  806,  822,  311,  807,  823,
+  312,  808,  824,  313,  809,  825,  314,  810,
+  826,  315,  811,  827,  316,  812,  828,  317,
+  813,  829,  318,  814,  830,  319,  815,  831,
+  336,  832,  848,  337,  833,  849,  338,  834,
+  850,  339,  835,  851,  340,  836,  852,  341,
+  837,  853,  342,  838,  854,  343,  839,  855,
+  344,  840,  856,  345,  841,  857,  346,  842,
+  858,  347,  843,  859,  348,  844,  860,  349,
+  845,  861,  350,  846,  862,  351,  847,  863,
+  368,  864,  880,  369,  865,  881,  370,  866,
+  882,  371,  867,  883,  372,  868,  884,  373,
+  869,  885,  374,  870,  886,  375,  871,  887,
+  376,  872,  888,  377,  873,  889,  378,  874,
+  890,  379,  875,  891,  380,  876,  892,  381,
+  877,  893,  382,  878,  894,  383,  879,  895,
+  400,  896,  912,  401,  897,  913,  402,  898,
+  914,  403,  899,  915,  404,  900,  916,  405,
+  901,  917,  406,  902,  918,  407,  903,  919,
+  408,  904,  920,  409,  905,  921,  410,  906,
+  922,  411,  907,  923,  412,  908,  924,  413,
+  909,  925,  414,  910,  926,  415,  911,  927,
+  432,  928,  944,  433,  929,  945,  434,  930,
+  946,  435,  931,  947,  436,  932,  948,  437,
+  933,  949,  438,  934,  950,  439,  935,  951,
+  440,  936,  952,  441,  937,  953,  442,  938,
+  954,  443,  939,  955,  444,  940,  956,  445,
+  941,  957,  446,  942,  958,  447,  943,  959,
+  464,  960,  976,  465,  961,  977,  466,  962,
+  978,  467,  963,  979,  468,  964,  980,  469,
+  965,  981,  470,  966,  982,  471,  967,  983,
+  472,  968,  984,  473,  969,  985,  474,  970,
+  986,  475,  971,  987,  476,  972,  988,  477,
+  973,  989,  478,  974,  990,  479,  975,  991,
+  496,  992, 1008,  497,  993, 1009,  498,  994,
+  1010,  499,  995, 1011,  500,  996, 1012,  501,
+  997, 1013,  502,  998, 1014,  503,  999, 1015,
+  504, 1000, 1016,  505, 1001, 1017,  506, 1002,
+  1018,  507, 1003, 1019,  508, 1004, 1020,  509,
+  1005, 1021,  510, 1006, 1022,  511, 1007, 1023,
+};
+#endif
+
+#else
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {
+  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {
+    0,    1,   32,   64,   33,    2,    3,   34,   65,   96,  128,   97,   66,   35,    4,    5,   36,   67,   98,  129,  160,  192,  161,  130,   99,   68,   37,    6,    7,   38,   69,  100,
+  131,  162,  193,  224,  256,  225,  194,  163,  132,  101,   70,   39,    8,    9,   40,   71,  102,  133,  164,  195,  226,  257,  288,  320,  289,  258,  227,  196,  165,  134,  103,   72,
+   41,   10,   11,   42,   73,  104,  135,  166,  197,  228,  259,  290,  321,  352,  384,  353,  322,  291,  260,  229,  198,  167,  136,  105,   74,   43,   12,   13,   44,   75,  106,  137,
+  168,  199,  230,  261,  292,  323,  354,  385,  416,  448,  417,  386,  355,  324,  293,  262,  231,  200,  169,  138,  107,   76,   45,   14,   15,   46,   77,  108,  139,  170,  201,  232,
+  263,  294,  325,  356,  387,  418,  449,  480,  512,  481,  450,  419,  388,  357,  326,  295,  264,  233,  202,  171,  140,  109,   78,   47,   16,   17,   48,   79,  110,  141,  172,  203,
+  234,  265,  296,  327,  358,  389,  420,  451,  482,  513,  544,  576,  545,  514,  483,  452,  421,  390,  359,  328,  297,  266,  235,  204,  173,  142,  111,   80,   49,   18,   19,   50,
+   81,  112,  143,  174,  205,  236,  267,  298,  329,  360,  391,  422,  453,  484,  515,  546,  577,  608,  640,  609,  578,  547,  516,  485,  454,  423,  392,  361,  330,  299,  268,  237,
+  206,  175,  144,  113,   82,   51,   20,   21,   52,   83,  114,  145,  176,  207,  238,  269,  300,  331,  362,  393,  424,  455,  486,  517,  548,  579,  610,  641,  672,  704,  673,  642,
+  611,  580,  549,  518,  487,  456,  425,  394,  363,  332,  301,  270,  239,  208,  177,  146,  115,   84,   53,   22,   23,   54,   85,  116,  147,  178,  209,  240,  271,  302,  333,  364,
+  395,  426,  457,  488,  519,  550,  581,  612,  643,  674,  705,  736,  768,  737,  706,  675,  644,  613,  582,  551,  520,  489,  458,  427,  396,  365,  334,  303,  272,  241,  210,  179,
+  148,  117,   86,   55,   24,   25,   56,   87,  118,  149,  180,  211,  242,  273,  304,  335,  366,  397,  428,  459,  490,  521,  552,  583,  614,  645,  676,  707,  738,  769,  800,  832,
+  801,  770,  739,  708,  677,  646,  615,  584,  553,  522,  491,  460,  429,  398,  367,  336,  305,  274,  243,  212,  181,  150,  119,   88,   57,   26,   27,   58,   89,  120,  151,  182,
+  213,  244,  275,  306,  337,  368,  399,  430,  461,  492,  523,  554,  585,  616,  647,  678,  709,  740,  771,  802,  833,  864,  896,  865,  834,  803,  772,  741,  710,  679,  648,  617,
+  586,  555,  524,  493,  462,  431,  400,  369,  338,  307,  276,  245,  214,  183,  152,  121,   90,   59,   28,   29,   60,   91,  122,  153,  184,  215,  246,  277,  308,  339,  370,  401,
+  432,  463,  494,  525,  556,  587,  618,  649,  680,  711,  742,  773,  804,  835,  866,  897,  928,  960,  929,  898,  867,  836,  805,  774,  743,  712,  681,  650,  619,  588,  557,  526,
+  495,  464,  433,  402,  371,  340,  309,  278,  247,  216,  185,  154,  123,   92,   61,   30,   31,   62,   93,  124,  155,  186,  217,  248,  279,  310,  341,  372,  403,  434,  465,  496,
+  527,  558,  589,  620,  651,  682,  713,  744,  775,  806,  837,  868,  899,  930,  961,  992,  993,  962,  931,  900,  869,  838,  807,  776,  745,  714,  683,  652,  621,  590,  559,  528,
+  497,  466,  435,  404,  373,  342,  311,  280,  249,  218,  187,  156,  125,   94,   63,   95,  126,  157,  188,  219,  250,  281,  312,  343,  374,  405,  436,  467,  498,  529,  560,  591,
+  622,  653,  684,  715,  746,  777,  808,  839,  870,  901,  932,  963,  994,  995,  964,  933,  902,  871,  840,  809,  778,  747,  716,  685,  654,  623,  592,  561,  530,  499,  468,  437,
+  406,  375,  344,  313,  282,  251,  220,  189,  158,  127,  159,  190,  221,  252,  283,  314,  345,  376,  407,  438,  469,  500,  531,  562,  593,  624,  655,  686,  717,  748,  779,  810,
+  841,  872,  903,  934,  965,  996,  997,  966,  935,  904,  873,  842,  811,  780,  749,  718,  687,  656,  625,  594,  563,  532,  501,  470,  439,  408,  377,  346,  315,  284,  253,  222,
+  191,  223,  254,  285,  316,  347,  378,  409,  440,  471,  502,  533,  564,  595,  626,  657,  688,  719,  750,  781,  812,  843,  874,  905,  936,  967,  998,  999,  968,  937,  906,  875,
+  844,  813,  782,  751,  720,  689,  658,  627,  596,  565,  534,  503,  472,  441,  410,  379,  348,  317,  286,  255,  287,  318,  349,  380,  411,  442,  473,  504,  535,  566,  597,  628,
+  659,  690,  721,  752,  783,  814,  845,  876,  907,  938,  969, 1000, 1001,  970,  939,  908,  877,  846,  815,  784,  753,  722,  691,  660,  629,  598,  567,  536,  505,  474,  443,  412,
+  381,  350,  319,  351,  382,  413,  444,  475,  506,  537,  568,  599,  630,  661,  692,  723,  754,  785,  816,  847,  878,  909,  940,  971, 1002, 1003,  972,  941,  910,  879,  848,  817,
+  786,  755,  724,  693,  662,  631,  600,  569,  538,  507,  476,  445,  414,  383,  415,  446,  477,  508,  539,  570,  601,  632,  663,  694,  725,  756,  787,  818,  849,  880,  911,  942,
+  973, 1004, 1005,  974,  943,  912,  881,  850,  819,  788,  757,  726,  695,  664,  633,  602,  571,  540,  509,  478,  447,  479,  510,  541,  572,  603,  634,  665,  696,  727,  758,  789,
+  820,  851,  882,  913,  944,  975, 1006, 1007,  976,  945,  914,  883,  852,  821,  790,  759,  728,  697,  666,  635,  604,  573,  542,  511,  543,  574,  605,  636,  667,  698,  729,  760,
+  791,  822,  853,  884,  915,  946,  977, 1008, 1009,  978,  947,  916,  885,  854,  823,  792,  761,  730,  699,  668,  637,  606,  575,  607,  638,  669,  700,  731,  762,  793,  824,  855,
+  886,  917,  948,  979, 1010, 1011,  980,  949,  918,  887,  856,  825,  794,  763,  732,  701,  670,  639,  671,  702,  733,  764,  795,  826,  857,  888,  919,  950,  981, 1012, 1013,  982,
+  951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,
+  923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,
+};
+#endif  // CONFIG_DWTDCTHYBRID
+
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
 const vp9_tree_index vp9_coef_tree[ 22] =     /* corresponding _CONTEXT_NODEs */
@@ -155,15 +889,16 @@
 /* Trees for extra bits.  Probabilities are constant and
    do not depend on previously encoded bits */
 
-static const Prob Pcat1[] = { 159};
-static const Prob Pcat2[] = { 165, 145};
-static const Prob Pcat3[] = { 173, 148, 140};
-static const Prob Pcat4[] = { 176, 155, 140, 135};
-static const Prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const Prob Pcat6[] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129};
+static const vp9_prob Pcat1[] = { 159};
+static const vp9_prob Pcat2[] = { 165, 145};
+static const vp9_prob Pcat3[] = { 173, 148, 140};
+static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp9_prob Pcat6[] = {
+  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
 
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
 
 static void init_bit_tree(vp9_tree_index *p, int n) {
   int i = 0;
@@ -182,7 +917,7 @@
   init_bit_tree(cat3, 3);
   init_bit_tree(cat4, 4);
   init_bit_tree(cat5, 5);
-  init_bit_tree(cat6, 13);
+  init_bit_tree(cat6, 14);
 }
 
 vp9_extra_bit_struct vp9_extra_bits[12] = {
@@ -196,17 +931,157 @@
   { cat3, Pcat3, 3, 11},
   { cat4, Pcat4, 4, 19},
   { cat5, Pcat5, 5, 35},
-  { cat6, Pcat6, 13, 67},
+  { cat6, Pcat6, 14, 67},
   { 0, 0, 0, 0}
 };
 
 #include "vp9/common/vp9_default_coef_probs.h"
 
+#if CONFIG_NEWCOEFCONTEXT
+
+// Neighborhood 5-tuples for various scans and blocksizes,
+// in {top, left, topleft, topright, bottomleft} order
+// for each position in raster scan order.
+// -1 indicates the neighbor does not exist.
+DECLARE_ALIGNED(16, int,
+                vp9_default_zig_zag1d_4x4_neighbors[16 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int,
+                vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int,
+                vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int,
+                vp9_default_zig_zag1d_8x8_neighbors[64 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int,
+                vp9_default_zig_zag1d_16x16_neighbors[256 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, int,
+                vp9_default_zig_zag1d_32x32_neighbors[1024 * MAX_NEIGHBORS]);
+
+static int find_in_scan(const int *scan, int l, int m) {
+  int i, l2 = l * l;
+  for (i = 0; i < l2; ++i) {
+    if (scan[i] == m)
+      return i;
+  }
+  return -1;
+}
+
+static void init_scan_neighbors(const int *scan, int l, int *neighbors) {
+  int l2 = l * l;
+  int m, n, i, j, k;
+  for (n = 0; n < l2; ++n) {
+    int locn = find_in_scan(scan, l, n);
+    int z = -1;
+    i = n / l;
+    j = n % l;
+    for (k = 0; k < MAX_NEIGHBORS; ++k)
+      neighbors[MAX_NEIGHBORS * n + k] = -1;
+    if (i - 1 >= 0) {
+      m = (i - 1) * l + j;
+      if (find_in_scan(scan, l, m) < locn) {
+        neighbors[MAX_NEIGHBORS * n] = m;
+        if (m == 0) z = 0;
+      }
+    }
+    if (j - 1 >= 0) {
+      m = i * l + j - 1;
+      if (find_in_scan(scan, l, m) < locn) {
+        neighbors[MAX_NEIGHBORS * n + 1] = m;
+        if (m == 0) z = 1;
+      }
+    }
+    if (i - 1 >= 0 && j - 1 >= 0) {
+      m = (i - 1) * l + j - 1;
+      if (find_in_scan(scan, l, m) < locn) {
+        neighbors[MAX_NEIGHBORS * n + 2] = m;
+        if (m == 0) z = 2;
+      }
+    }
+    if (i - 1 >= 0 && j + 1 < l) {
+      m = (i - 1) * l + j + 1;
+      if (find_in_scan(scan, l, m) < locn) {
+        neighbors[MAX_NEIGHBORS * n + 3] = m;
+        if (m == 0) z = 3;
+      }
+    }
+    if (i + 1 < l && j - 1 >= 0) {
+       m = (i + 1) * l + j - 1;
+      if (find_in_scan(scan, l, m) < locn) {
+        neighbors[MAX_NEIGHBORS * n + 4] = m;
+        if (m == 0) z = 4;
+      }
+    }
+    if (z != -1) {  // zero exists
+      int v = 0;
+      for (k = 0; k < MAX_NEIGHBORS; ++k)
+        v += (neighbors[MAX_NEIGHBORS * n + k] > 0);
+      if (v) {
+        neighbors[MAX_NEIGHBORS * n + z] = -1;
+      }
+    }
+  }
+}
+
+void vp9_init_neighbors() {
+  init_scan_neighbors(vp9_default_zig_zag1d_4x4, 4,
+                      vp9_default_zig_zag1d_4x4_neighbors);
+  init_scan_neighbors(vp9_row_scan_4x4, 4,
+                      vp9_row_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_col_scan_4x4, 4,
+                      vp9_col_scan_4x4_neighbors);
+  init_scan_neighbors(vp9_default_zig_zag1d_8x8, 8,
+                      vp9_default_zig_zag1d_8x8_neighbors);
+  init_scan_neighbors(vp9_default_zig_zag1d_16x16, 16,
+                      vp9_default_zig_zag1d_16x16_neighbors);
+  init_scan_neighbors(vp9_default_zig_zag1d_32x32, 32,
+                      vp9_default_zig_zag1d_32x32_neighbors);
+}
+
+const int *vp9_get_coef_neighbors_handle(const int *scan) {
+  if (scan == vp9_default_zig_zag1d_4x4) {
+    return vp9_default_zig_zag1d_4x4_neighbors;
+  } else if (scan == vp9_row_scan_4x4) {
+    return vp9_row_scan_4x4_neighbors;
+  } else if (scan == vp9_col_scan_4x4) {
+    return vp9_col_scan_4x4_neighbors;
+  } else if (scan == vp9_default_zig_zag1d_8x8) {
+    return vp9_default_zig_zag1d_8x8_neighbors;
+  } else if (scan == vp9_default_zig_zag1d_16x16) {
+    return vp9_default_zig_zag1d_16x16_neighbors;
+  } else if (scan == vp9_default_zig_zag1d_32x32) {
+    return vp9_default_zig_zag1d_32x32_neighbors;
+  }
+  return vp9_default_zig_zag1d_4x4_neighbors;
+}
+
+int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,
+                                  const int *neigbor_handle, int rc) {
+  static int neighbors_used = MAX_NEIGHBORS;   // maximum is MAX_NEIGHBORS
+  const int *nb = neigbor_handle + rc * MAX_NEIGHBORS;
+  int i, v, val = 0, n = 0;
+  for (i = 0; i < neighbors_used; ++i) {
+    if (nb[i] == -1 || (nb[i] == 0 && nodc)) {
+      continue;
+    }
+    v = abs(qcoeff_ptr[nb[i]]);
+    val = (v > val ? v : val);
+    n++;
+  }
+  if (n == 0)
+    return 0;
+  else if (val <= 1)
+    return val;
+  else if (val < 4)
+    return 2;
+  else
+    return 3;
+}
+#endif  /* CONFIG_NEWCOEFCONTEXT */
+
 void vp9_default_coef_probs(VP9_COMMON *pc) {
-  vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
-             sizeof(pc->fc.coef_probs));
-  vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs,
-             sizeof(pc->fc.hybrid_coef_probs));
+  vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,
+             sizeof(pc->fc.coef_probs_4x4));
+  vpx_memcpy(pc->fc.hybrid_coef_probs_4x4, default_hybrid_coef_probs_4x4,
+             sizeof(pc->fc.hybrid_coef_probs_4x4));
 
   vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,
              sizeof(pc->fc.coef_probs_8x8));
@@ -218,6 +1093,8 @@
   vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,
              default_hybrid_coef_probs_16x16,
              sizeof(pc->fc.hybrid_coef_probs_16x16));
+  vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,
+             sizeof(pc->fc.coef_probs_32x32));
 }
 
 void vp9_coef_tree_initialize() {
@@ -234,13 +1111,40 @@
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
-void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+static void update_coef_probs(vp9_coeff_probs *dst_coef_probs,
+                              vp9_coeff_probs *pre_coef_probs,
+                              int block_types, vp9_coeff_count *coef_counts,
+                              int count_sat, int update_factor) {
   int t, i, j, k, count;
   unsigned int branch_ct[ENTROPY_NODES][2];
   vp9_prob coef_probs[ENTROPY_NODES];
-  int update_factor; /* denominator 256 */
   int factor;
+
+  for (i = 0; i < block_types; ++i)
+    for (j = 0; j < COEF_BANDS; ++j)
+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
+          continue;
+        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
+                                         vp9_coef_encodings, vp9_coef_tree,
+                                         coef_probs, branch_ct,
+                                         coef_counts[i][j][k]);
+        for (t = 0; t < ENTROPY_NODES; ++t) {
+          count = branch_ct[t][0] + branch_ct[t][1];
+          count = count > count_sat ? count_sat : count;
+          factor = (update_factor * count / count_sat);
+          dst_coef_probs[i][j][k][t] = weighted_prob(pre_coef_probs[i][j][k][t],
+                                                     coef_probs[t], factor);
+        }
+      }
+}
+
+void vp9_adapt_coef_probs(VP9_COMMON *cm) {
+#ifdef COEF_COUNT_TESTING
+  int t, i, j, k;
+#endif
   int count_sat;
+  int update_factor; /* denominator 256 */
 
   // printf("Frame type: %d\n", cm->frame_type);
   if (cm->frame_type == KEY_FRAME) {
@@ -313,135 +1217,28 @@
   }
 #endif
 
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255;
-          else cm->fc.coef_probs[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255;
-          else cm->fc.coef_probs_8x8[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_8X8; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k],
-          256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] *
-                  (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] *
-                  (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;
-          else cm->fc.coef_probs_16x16[i][j][k][t] = prob;
-        }
-      }
-
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i)
-    for (j = 0; j < COEF_BANDS; ++j)
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1);
-        for (t = 0; t < ENTROPY_NODES; ++t) {
-          int prob;
-          count = branch_ct[t][0] + branch_ct[t][1];
-          count = count > count_sat ? count_sat : count;
-          factor = (update_factor * count / count_sat);
-          prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) +
-                  (int)coef_probs[t] * factor + 128) >> 8;
-          if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1;
-          else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255;
-          else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob;
-        }
-      }
+  update_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,
+                    BLOCK_TYPES_4X4, cm->fc.coef_counts_4x4,
+                    count_sat, update_factor);
+  update_coef_probs(cm->fc.hybrid_coef_probs_4x4,
+                    cm->fc.pre_hybrid_coef_probs_4x4,
+                    BLOCK_TYPES_4X4, cm->fc.hybrid_coef_counts_4x4,
+                    count_sat, update_factor);
+  update_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,
+                    BLOCK_TYPES_8X8, cm->fc.coef_counts_8x8,
+                    count_sat, update_factor);
+  update_coef_probs(cm->fc.hybrid_coef_probs_8x8,
+                    cm->fc.pre_hybrid_coef_probs_8x8,
+                    BLOCK_TYPES_8X8, cm->fc.hybrid_coef_counts_8x8,
+                    count_sat, update_factor);
+  update_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,
+                    BLOCK_TYPES_16X16, cm->fc.coef_counts_16x16,
+                    count_sat, update_factor);
+  update_coef_probs(cm->fc.hybrid_coef_probs_16x16,
+                    cm->fc.pre_hybrid_coef_probs_16x16,
+                    BLOCK_TYPES_16X16, cm->fc.hybrid_coef_counts_16x16,
+                    count_sat, update_factor);
+  update_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,
+                    BLOCK_TYPES_32X32, cm->fc.coef_counts_32x32,
+                    count_sat, update_factor);
 }
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -8,10 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_ENTROPY_H_
 #define VP9_COMMON_VP9_ENTROPY_H_
 
+#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
@@ -55,24 +55,27 @@
 #define PROB_UPDATE_BASELINE_COST   7
 
 #define MAX_PROB                255
-#define DCT_MAX_VALUE           8192
+#define DCT_MAX_VALUE           16384
 
 /* Coefficients are predicted via a 3-dimensional probability table. */
 
 /* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */
-#define BLOCK_TYPES 4
+#define BLOCK_TYPES_4X4 4
 
 #define BLOCK_TYPES_8X8 4
 
 #define BLOCK_TYPES_16X16 4
 
+#define BLOCK_TYPES_32X32 4
+
 /* Middle dimension is a coarsening of the coefficient's
    position within the 4x4 DCT. */
 
 #define COEF_BANDS 8
-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]);
 extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);
 extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);
+extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]);
 
 /* Inside dimension is 3-valued measure of nearby complexity, that is,
    the extent to which nearby coefficients are nonzero.  For the first
@@ -91,24 +94,61 @@
    distinct bands). */
 
 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
-#define PREV_COEF_CONTEXTS       4
+#define PREV_COEF_CONTEXTS          4
 
+typedef unsigned int vp9_coeff_count[COEF_BANDS][PREV_COEF_CONTEXTS]
+                                    [MAX_ENTROPY_TOKENS];
+typedef unsigned int vp9_coeff_stats[COEF_BANDS][PREV_COEF_CONTEXTS]
+                                    [ENTROPY_NODES][2];
+typedef vp9_prob vp9_coeff_probs[COEF_BANDS][PREV_COEF_CONTEXTS]
+                                [ENTROPY_NODES];
+
 #define SUBEXP_PARAM                4   /* Subexponential code parameter */
 #define MODULUS_PARAM               13  /* Modulus parameter */
 
-extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
+extern DECLARE_ALIGNED(16, const uint8_t,
+                       vp9_prev_token_class[MAX_ENTROPY_TOKENS]);
 
 struct VP9Common;
 void vp9_default_coef_probs(struct VP9Common *);
-extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]);
 
-extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]);
-extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]);
+extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);
 
 extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);
-void vp9_coef_tree_initialize(void);
-
 extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);
+extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]);
+
+void vp9_coef_tree_initialize(void);
 void vp9_adapt_coef_probs(struct VP9Common *);
 
-#endif
+static void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
+  /* Clear entropy contexts */
+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+}
+
+#if CONFIG_NEWCOEFCONTEXT
+
+#define MAX_NEIGHBORS 5
+#define NEWCOEFCONTEXT_BAND_COND(b)   ((b) >= 1)
+void vp9_init_neighbors(void);
+
+const int *vp9_get_coef_neighbors_handle(const int *scan);
+int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,
+                                  const int *neigbor_handle, int rc);
+extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_4x4_neighbors[
+                       16 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int, vp9_row_scan_4x4_neighbors[
+                       16 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int, vp9_col_scan_4x4_neighbors[
+                       16 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_8x8_neighbors[
+                       64 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_16x16_neighbors[
+                       256 * MAX_NEIGHBORS]);
+extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_32x32_neighbors[
+                       1024 * MAX_NEIGHBORS]);
+#endif  // CONFIG_NEWCOEFCONTEXT
+#endif  // VP9_COMMON_VP9_ENTROPY_H_
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -272,13 +272,11 @@
   -NEWMV, -SPLITMV
 };
 
-#if CONFIG_SUPERBLOCKS
 const vp9_tree_index vp9_sb_mv_ref_tree[6] = {
   -ZEROMV, 2,
   -NEARESTMV, 4,
   -NEARMV, -NEWMV
 };
-#endif
 
 const vp9_tree_index vp9_sub_mv_ref_tree[6] = {
   -LEFT4X4, 2,
@@ -289,10 +287,8 @@
 struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES];
 struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES];
 struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES];
-#if CONFIG_SUPERBLOCKS
 struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES];
 struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES];
-#endif
 struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES];
 struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES];
 struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES];
@@ -299,9 +295,7 @@
 struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS];
 
 struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS];
-#if CONFIG_SUPERBLOCKS
 struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS];
-#endif
 struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS];
 
 void vp9_init_mbmode_probs(VP9_COMMON *x) {
@@ -309,25 +303,21 @@
 
   vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,
                                    vp9_ymode_tree, x->fc.ymode_prob,
-                                   bct, y_mode_cts, 256, 1);
-#if CONFIG_SUPERBLOCKS
+                                   bct, y_mode_cts);
   vp9_tree_probs_from_distribution(VP9_I32X32_MODES, vp9_sb_ymode_encodings,
                                    vp9_sb_ymode_tree, x->fc.sb_ymode_prob,
-                                   bct, y_mode_cts, 256, 1);
-#endif
+                                   bct, y_mode_cts);
   {
     int i;
     for (i = 0; i < 8; i++) {
       vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,
                                        vp9_kf_ymode_tree, x->kf_ymode_prob[i],
-                                       bct, kf_y_mode_cts[i], 256, 1);
-#if CONFIG_SUPERBLOCKS
+                                       bct, kf_y_mode_cts[i]);
       vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
                                        vp9_sb_kf_ymode_encodings,
                                        vp9_sb_kf_ymode_tree,
                                        x->sb_kf_ymode_prob[i], bct,
-                                       kf_y_mode_cts[i], 256, 1);
-#endif
+                                       kf_y_mode_cts[i]);
     }
   }
   {
@@ -335,16 +325,16 @@
     for (i = 0; i < VP9_YMODES; i++) {
       vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
                                        vp9_uv_mode_tree, x->kf_uv_mode_prob[i],
-                                       bct, kf_uv_mode_cts[i], 256, 1);
+                                       bct, kf_uv_mode_cts[i]);
       vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
                                        vp9_uv_mode_tree, x->fc.uv_mode_prob[i],
-                                       bct, uv_mode_cts[i], 256, 1);
+                                       bct, uv_mode_cts[i]);
     }
   }
 
   vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
                                    vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,
-                                   bct, i8x8_mode_cts, 256, 1);
+                                   bct, i8x8_mode_cts);
 
   vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,
              sizeof(vp9_sub_mv_ref_prob2));
@@ -362,7 +352,7 @@
   unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2],
   const unsigned int events[VP9_NKF_BINTRAMODES]) {
   vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings,
-    vp9_bmode_tree, p, branch_ct, events, 256, 1);
+                                   vp9_bmode_tree, p, branch_ct, events);
 }
 
 void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) {
@@ -375,7 +365,7 @@
   unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2],
   const unsigned int events[VP9_KF_BINTRAMODES]) {
   vp9_tree_probs_from_distribution(VP9_KF_BINTRAMODES, vp9_kf_bmode_encodings,
-    vp9_kf_bmode_tree, p, branch_ct, events, 256, 1);
+                                   vp9_kf_bmode_tree, p, branch_ct, events);
 }
 
 void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES]
@@ -398,9 +388,15 @@
   -1, -2
 };
 struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
+#if CONFIG_ENABLE_6TAP
 const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
-  EIGHTTAP, SIXTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1};
+  SIXTAP, EIGHTTAP, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE+1] = {0, -1, 1, 2, -1, -1};
+#else
+const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
+  EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP};
+const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1};
+#endif
 const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1]
                                           [VP9_SWITCHABLE_FILTERS-1] = {
   {248, 192}, { 32, 248}, { 32,  32}, {192, 160}
@@ -418,8 +414,12 @@
 };
 const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = {
   EIGHTTAP, EIGHTTAP_SHARP};
-const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; //8, 8s
+#if CONFIG_ENABLE_6TAP
+const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1, -1};
+#else
+const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1};
 #endif
+#endif
 
 void vp9_entropy_mode_init() {
   vp9_tokens_from_tree(vp9_kf_bmode_encodings,   vp9_kf_bmode_tree);
@@ -426,10 +426,8 @@
   vp9_tokens_from_tree(vp9_bmode_encodings,   vp9_bmode_tree);
   vp9_tokens_from_tree(vp9_ymode_encodings,   vp9_ymode_tree);
   vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree);
-#if CONFIG_SUPERBLOCKS
   vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree);
   vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree);
-#endif
   vp9_tokens_from_tree(vp9_uv_mode_encodings,  vp9_uv_mode_tree);
   vp9_tokens_from_tree(vp9_i8x8_mode_encodings,  vp9_i8x8_mode_tree);
   vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree);
@@ -438,10 +436,8 @@
 
   vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array,
                               vp9_mv_ref_tree, NEARESTMV);
-#if CONFIG_SUPERBLOCKS
   vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array,
                               vp9_sb_mv_ref_tree, NEARESTMV);
-#endif
   vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array,
                               vp9_sub_mv_ref_tree, LEFT4X4);
 }
@@ -495,17 +491,14 @@
 
   for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
     for (i = 0; i < 4; i++) {
-      int this_prob;
-      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-      int factor;
-      {
-        this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128;
-        count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
-        factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
-        this_prob = (pc->fc.vp9_mode_contexts[j][i] * (256 - factor) +
-                     this_prob * factor + 128) >> 8;
-        mode_context[j][i] = clip_prob(this_prob);
-      }
+      int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1], factor;
+
+      count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count;
+      factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT);
+      mode_context[j][i] = weighted_prob(pc->fc.vp9_mode_contexts[j][i],
+                                         get_binary_prob(mv_ref_ct[j][i][0],
+                                                         mv_ref_ct[j][i][1]),
+                                         factor);
     }
   }
 }
@@ -531,25 +524,33 @@
 }
 #endif
 
-// #define MODE_COUNT_TESTING
 #define MODE_COUNT_SAT 20
 #define MODE_MAX_UPDATE_FACTOR 144
+static void update_mode_probs(int n_modes, struct vp9_token_struct *encoding,
+                              const vp9_tree_index *tree, unsigned int *cnt,
+                              vp9_prob *pre_probs, vp9_prob *dst_probs) {
+#define MAX_PROBS 32
+  vp9_prob probs[MAX_PROBS];
+  unsigned int branch_ct[MAX_PROBS][2];
+  int t, count, factor;
+
+  assert(n_modes - 1 < MAX_PROBS);
+  vp9_tree_probs_from_distribution(n_modes, encoding, tree, probs,
+                                   branch_ct, cnt);
+  for (t = 0; t < n_modes - 1; ++t) {
+    count = branch_ct[t][0] + branch_ct[t][1];
+    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
+    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
+    dst_probs[t] = weighted_prob(pre_probs[t], probs[t], factor);
+  }
+}
+
+// #define MODE_COUNT_TESTING
 void vp9_adapt_mode_probs(VP9_COMMON *cm) {
-  int i, t, count, factor;
-  unsigned int branch_ct[32][2];
-  vp9_prob ymode_probs[VP9_YMODES - 1];
-#if CONFIG_SUPERBLOCKS
-  vp9_prob sb_ymode_probs[VP9_I32X32_MODES - 1];
-#endif
-  vp9_prob uvmode_probs[VP9_UV_MODES - 1];
-  vp9_prob bmode_probs[VP9_NKF_BINTRAMODES - 1];
-  vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1];
-  vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1];
-  vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1];
-#if CONFIG_COMP_INTERINTRA_PRED
-  vp9_prob interintra_prob;
-#endif
+  int i;
 #ifdef MODE_COUNT_TESTING
+  int t;
+
   printf("static const unsigned int\nymode_counts"
          "[VP9_YMODES] = {\n");
   for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]);
@@ -590,116 +591,43 @@
   printf("};\n");
 #endif
 #endif
-  vp9_tree_probs_from_distribution(
-    VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
-    ymode_probs, branch_ct, cm->fc.ymode_counts,
-    256, 1);
-  for (t = 0; t < VP9_YMODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) +
-            (int)ymode_probs[t] * factor + 128) >> 8;
-    cm->fc.ymode_prob[t] = clip_prob(prob);
-  }
-#if CONFIG_SUPERBLOCKS
-  vp9_tree_probs_from_distribution(VP9_I32X32_MODES,
-                                   vp9_sb_ymode_encodings, vp9_sb_ymode_tree,
-                                   sb_ymode_probs, branch_ct,
-                                   cm->fc.sb_ymode_counts,
-                                   256, 1);
-  for (t = 0; t < VP9_I32X32_MODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_sb_ymode_prob[t] * (256 - factor) +
-            (int)sb_ymode_probs[t] * factor + 128) >> 8;
-    cm->fc.sb_ymode_prob[t] = clip_prob(prob);
-  }
-#endif
+
+  update_mode_probs(VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
+                    cm->fc.ymode_counts, cm->fc.pre_ymode_prob,
+                    cm->fc.ymode_prob);
+  update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_encodings, vp9_sb_ymode_tree,
+                    cm->fc.sb_ymode_counts, cm->fc.pre_sb_ymode_prob,
+                    cm->fc.sb_ymode_prob);
   for (i = 0; i < VP9_YMODES; ++i) {
-    vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,
-                                     vp9_uv_mode_tree, uvmode_probs, branch_ct,
-                                     cm->fc.uv_mode_counts[i], 256, 1);
-    for (t = 0; t < VP9_UV_MODES - 1; ++t) {
-      int prob;
-      count = branch_ct[t][0] + branch_ct[t][1];
-      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-      prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) +
-              (int)uvmode_probs[t] * factor + 128) >> 8;
-      cm->fc.uv_mode_prob[i][t] = clip_prob(prob);
-    }
+    update_mode_probs(VP9_UV_MODES, vp9_uv_mode_encodings, vp9_uv_mode_tree,
+                      cm->fc.uv_mode_counts[i], cm->fc.pre_uv_mode_prob[i],
+                      cm->fc.uv_mode_prob[i]);
   }
-  vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings,
-                                   vp9_bmode_tree, bmode_probs, branch_ct,
-                                   cm->fc.bmode_counts, 256, 1);
-  for (t = 0; t < VP9_NKF_BINTRAMODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) +
-            (int)bmode_probs[t] * factor + 128) >> 8;
-    cm->fc.bmode_prob[t] = clip_prob(prob);
-  }
-  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
-                                   vp9_i8x8_mode_tree, i8x8_mode_probs,
-                                   branch_ct, cm->fc.i8x8_mode_counts, 256, 1);
-  for (t = 0; t < VP9_I8X8_MODES - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) +
-            (int)i8x8_mode_probs[t] * factor + 128) >> 8;
-    cm->fc.i8x8_mode_prob[t] = clip_prob(prob);
-  }
+  update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, vp9_bmode_tree,
+                    cm->fc.bmode_counts, cm->fc.pre_bmode_prob,
+                    cm->fc.bmode_prob);
+  update_mode_probs(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,
+                    vp9_i8x8_mode_tree, cm->fc.i8x8_mode_counts,
+                    cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob);
   for (i = 0; i < SUBMVREF_COUNT; ++i) {
-    vp9_tree_probs_from_distribution(VP9_SUBMVREFS,
-                                     vp9_sub_mv_ref_encoding_array,
-                                     vp9_sub_mv_ref_tree, sub_mv_ref_probs,
-                                     branch_ct, cm->fc.sub_mv_ref_counts[i],
-                                     256, 1);
-    for (t = 0; t < VP9_SUBMVREFS - 1; ++t) {
-      int prob;
-      count = branch_ct[t][0] + branch_ct[t][1];
-      count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-      factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-      prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) +
-              (int)sub_mv_ref_probs[t] * factor + 128) >> 8;
-      cm->fc.sub_mv_ref_prob[i][t] = clip_prob(prob);
-    }
+    update_mode_probs(VP9_SUBMVREFS, vp9_sub_mv_ref_encoding_array,
+                      vp9_sub_mv_ref_tree, cm->fc.sub_mv_ref_counts[i],
+                      cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i]);
   }
-  vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings,
-                                   vp9_mbsplit_tree, mbsplit_probs, branch_ct,
-                                   cm->fc.mbsplit_counts, 256, 1);
-  for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) {
-    int prob;
-    count = branch_ct[t][0] + branch_ct[t][1];
-    count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
-    factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) +
-            (int)mbsplit_probs[t] * factor + 128) >> 8;
-    cm->fc.mbsplit_prob[t] = clip_prob(prob);
-  }
+  update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_encodings, vp9_mbsplit_tree,
+                    cm->fc.mbsplit_counts, cm->fc.pre_mbsplit_prob,
+                    cm->fc.mbsplit_prob);
 #if CONFIG_COMP_INTERINTRA_PRED
   if (cm->use_interintra) {
-    int prob;
-    interintra_prob = vp9_bin_prob_from_distribution(cm->fc.interintra_counts);
+    int factor, interintra_prob, count;
+
+    interintra_prob = get_binary_prob(cm->fc.interintra_counts[0],
+                                      cm->fc.interintra_counts[1]);
     count = cm->fc.interintra_counts[0] + cm->fc.interintra_counts[1];
     count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;
     factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT);
-    prob = ((int)cm->fc.pre_interintra_prob * (256 - factor) +
-            (int)interintra_prob * factor + 128) >> 8;
-    if (prob <= 0)
-      cm->fc.interintra_prob = 1;
-    else if (prob > 255)
-      cm->fc.interintra_prob = 255;
-    else
-      cm->fc.interintra_prob = prob;
+    cm->fc.interintra_prob = weighted_prob(cm->fc.pre_interintra_prob,
+                                           interintra_prob, factor);
   }
 #endif
 }
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
 #define VP9_COMMON_VP9_ENTROPYMODE_H_
 
@@ -17,9 +16,6 @@
 
 #define SUBMVREF_COUNT 5
 #define VP9_NUMMBSPLITS 4
-#if CONFIG_COMP_INTRA_PRED
-#define DEFAULT_COMP_INTRA_PROB  32
-#endif
 
 #if CONFIG_COMP_INTERINTRA_PRED
 #define VP9_DEF_INTERINTRA_PROB 248
@@ -98,7 +94,7 @@
 
 void vp9_adapt_mode_probs(struct VP9Common *);
 
-#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */
+#define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */
 
 extern const  INTERPOLATIONFILTERTYPE vp9_switchable_interp
                   [VP9_SWITCHABLE_FILTERS];
@@ -114,4 +110,4 @@
 extern const  vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                                  [VP9_SWITCHABLE_FILTERS - 1];
 
-#endif
+#endif  // VP9_COMMON_VP9_ENTROPYMODE_H_
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -213,16 +213,12 @@
 
 static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,
                        unsigned int ct[2]) {
-  int factor;
-  int prob;
   int count = ct[0] + ct[1];
+
   if (count) {
     count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;
-    factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
-    prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8;
-    prob += !prob;
-    prob = (prob > 255 ? 255 : prob);
-    *dest = prob;
+    *dest = weighted_prob(prep, newp,
+                          MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);
   }
 }
 
@@ -251,11 +247,10 @@
                                    vp9_mv_joint_tree,
                                    prob->joints,
                                    branch_ct_joint,
-                                   NMVcount->joints,
-                                   256, 1);
+                                   NMVcount->joints);
   for (i = 0; i < 2; ++i) {
-    prob->comps[i].sign =
-        vp9_bin_prob_from_distribution(NMVcount->comps[i].sign);
+    prob->comps[i].sign = get_binary_prob(NMVcount->comps[i].sign[0],
+                                          NMVcount->comps[i].sign[1]);
     branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];
     branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];
     vp9_tree_probs_from_distribution(MV_CLASSES,
@@ -263,18 +258,16 @@
                                      vp9_mv_class_tree,
                                      prob->comps[i].classes,
                                      branch_ct_classes[i],
-                                     NMVcount->comps[i].classes,
-                                     256, 1);
+                                     NMVcount->comps[i].classes);
     vp9_tree_probs_from_distribution(CLASS0_SIZE,
                                      vp9_mv_class0_encodings,
                                      vp9_mv_class0_tree,
                                      prob->comps[i].class0,
                                      branch_ct_class0[i],
-                                     NMVcount->comps[i].class0,
-                                     256, 1);
+                                     NMVcount->comps[i].class0);
     for (j = 0; j < MV_OFFSET_BITS; ++j) {
-      prob->comps[i].bits[j] = vp9_bin_prob_from_distribution(
-          NMVcount->comps[i].bits[j]);
+      prob->comps[i].bits[j] = get_binary_prob(NMVcount->comps[i].bits[j][0],
+                                               NMVcount->comps[i].bits[j][1]);
       branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];
       branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];
     }
@@ -286,8 +279,7 @@
                                        vp9_mv_fp_tree,
                                        prob->comps[i].class0_fp[k],
                                        branch_ct_class0_fp[i][k],
-                                       NMVcount->comps[i].class0_fp[k],
-                                       256, 1);
+                                       NMVcount->comps[i].class0_fp[k]);
     }
     vp9_tree_probs_from_distribution(4,
                                      vp9_mv_fp_encodings,
@@ -294,18 +286,18 @@
                                      vp9_mv_fp_tree,
                                      prob->comps[i].fp,
                                      branch_ct_fp[i],
-                                     NMVcount->comps[i].fp,
-                                     256, 1);
+                                     NMVcount->comps[i].fp);
   }
   if (usehp) {
     for (i = 0; i < 2; ++i) {
-      prob->comps[i].class0_hp = vp9_bin_prob_from_distribution(
-          NMVcount->comps[i].class0_hp);
+      prob->comps[i].class0_hp =
+          get_binary_prob(NMVcount->comps[i].class0_hp[0],
+                          NMVcount->comps[i].class0_hp[1]);
       branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];
       branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];
 
-      prob->comps[i].hp =
-          vp9_bin_prob_from_distribution(NMVcount->comps[i].hp);
+      prob->comps[i].hp = get_binary_prob(NMVcount->comps[i].hp[0],
+                                          NMVcount->comps[i].hp[1]);
       branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];
       branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];
     }
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -25,6 +25,13 @@
 int vp9_use_nmv_hp(const MV *ref);
 
 #define VP9_NMV_UPDATE_PROB  255
+
+#if CONFIG_NEW_MVREF
+#define VP9_MVREF_UPDATE_PROB 252
+#define VP9_DEFAULT_MV_REF_PROB 192
+#define VP9_MV_REF_UPDATE_COST (14 << 8)
+#endif
+
 //#define MV_GROUP_UPDATE
 
 #define LOW_PRECISION_MV_UPDATE  /* Use 7 bit forward update */
@@ -126,4 +133,5 @@
     unsigned int (*branch_ct_class0_hp)[2],
     unsigned int (*branch_ct_hp)[2]);
 void vp9_counts_process(nmv_context_counts *NMVcount, int usehp);
-#endif
+
+#endif  // VP9_COMMON_VP9_ENTROPYMV_H_
--- a/vp9/common/vp9_extend.c
+++ b/vp9/common/vp9_extend.c
@@ -11,9 +11,9 @@
 #include "vp9/common/vp9_extend.h"
 #include "vpx_mem/vpx_mem.h"
 
-static void copy_and_extend_plane(unsigned char *s, /* source */
+static void copy_and_extend_plane(uint8_t *s,       /* source */
                                   int sp,           /* source pitch */
-                                  unsigned char *d, /* destination */
+                                  uint8_t *d,       /* destination */
                                   int dp,           /* destination pitch */
                                   int h,            /* height */
                                   int w,            /* width */
@@ -22,8 +22,8 @@
                                   int eb,           /* extend bottom border */
                                   int er) {         /* extend right border */
   int i;
-  unsigned char *src_ptr1, *src_ptr2;
-  unsigned char *dest_ptr1, *dest_ptr2;
+  uint8_t *src_ptr1, *src_ptr2;
+  uint8_t *dest_ptr1, *dest_ptr2;
   int linesize;
 
   /* copy the left and right most columns out */
@@ -143,8 +143,8 @@
 }
 
 /* note the extension is only for the last row, for intra prediction purpose */
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
-                       unsigned char *UPtr, unsigned char *VPtr) {
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr,
+                       uint8_t *UPtr, uint8_t *VPtr) {
   int i;
 
   YPtr += ybf->y_stride * 14;
--- a/vp9/common/vp9_extend.h
+++ b/vp9/common/vp9_extend.h
@@ -12,9 +12,10 @@
 #define VP9_COMMON_VP9_EXTEND_H_
 
 #include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
 
-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr,
-                       unsigned char *UPtr, unsigned char *VPtr);
+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr,
+                       uint8_t *UPtr, uint8_t *VPtr);
 
 void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
                                YV12_BUFFER_CONFIG *dst);
@@ -24,4 +25,4 @@
                                          int srcy, int srcx,
                                          int srch, int srcw);
 
-#endif  // __INC_EXTEND_H
+#endif  // VP9_COMMON_VP9_EXTEND_H_
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -13,8 +13,9 @@
 #include "vp9/common/vp9_filter.h"
 #include "vpx_ports/mem.h"
 #include "vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
 
-DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
   { 128,   0 },
   { 120,   8 },
   { 112,  16 },
@@ -35,7 +36,7 @@
 
 #define FILTER_ALPHA       0
 #define FILTER_ALPHA_SHARP 1
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
 #if FILTER_ALPHA == 0
   /* Lagrangian interpolation filter */
   { 0,   0,   0, 128,   0,   0,   0,  0},
@@ -81,7 +82,7 @@
 #endif  /* FILTER_ALPHA */
 };
 
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {
 #if FILTER_ALPHA_SHARP == 1
   /* dct based filter */
   {0,   0,   0, 128,   0,   0,   0, 0},
@@ -121,7 +122,29 @@
 #endif  /* FILTER_ALPHA_SHARP */
 };
 
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
+DECLARE_ALIGNED(16, const int16_t,
+                vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {
+  /* 8-tap lowpass filter */
+  /* Hamming window */
+  {-1, -7, 32, 80, 32, -7, -1,  0},
+  {-1, -8, 28, 80, 37, -7, -2,  1},
+  { 0, -8, 24, 79, 41, -7, -2,  1},
+  { 0, -8, 20, 78, 45, -5, -3,  1},
+  { 0, -8, 16, 76, 50, -4, -3,  1},
+  { 0, -7, 13, 74, 54, -3, -4,  1},
+  { 1, -7,  9, 71, 58, -1, -4,  1},
+  { 1, -6,  6, 68, 62,  1, -5,  1},
+  { 1, -6,  4, 65, 65,  4, -6,  1},
+  { 1, -5,  1, 62, 68,  6, -6,  1},
+  { 1, -4, -1, 58, 71,  9, -7,  1},
+  { 1, -4, -3, 54, 74, 13, -7,  0},
+  { 1, -3, -4, 50, 76, 16, -8,  0},
+  { 1, -3, -5, 45, 78, 20, -8,  0},
+  { 1, -2, -7, 41, 79, 24, -8,  0},
+  { 1, -2, -7, 37, 80, 28, -8, -1}
+};
+
+DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
   {0,   0, 128,   0,   0, 0},
   {1,  -5, 125,   8,  -2, 1},
   {1,  -8, 122,  17,  -5, 1},
@@ -140,19 +163,19 @@
   {1,  -2,   8, 125,  -5, 1}
 };
 
-static void filter_block2d_first_pass_6(unsigned char *src_ptr,
+static void filter_block2d_first_pass_6(uint8_t *src_ptr,
                                         int *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
-                                        const short *vp9_filter) {
+                                        const int16_t *vp9_filter) {
   unsigned int i, j;
-  int  Temp;
+  int temp;
 
   for (i = 0; i < output_height; i++) {
     for (j = 0; j < output_width; j++) {
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
              ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
              ((int)src_ptr[0]                    * vp9_filter[2]) +
              ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
@@ -161,14 +184,7 @@
              (VP9_FILTER_WEIGHT >> 1);      /* Rounding */
 
       /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = Temp;
+      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
       src_ptr++;
     }
 
@@ -179,20 +195,20 @@
 }
 
 static void filter_block2d_second_pass_6(int *src_ptr,
-                                         unsigned char *output_ptr,
+                                         uint8_t *output_ptr,
                                          int output_pitch,
                                          unsigned int src_pixels_per_line,
                                          unsigned int pixel_step,
                                          unsigned int output_height,
                                          unsigned int output_width,
-                                         const short *vp9_filter) {
+                                         const int16_t *vp9_filter) {
   unsigned int i, j;
-  int  Temp;
+  int temp;
 
   for (i = 0; i < output_height; i++) {
     for (j = 0; j < output_width; j++) {
       /* Apply filter */
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
              ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
              ((int)src_ptr[0]                    * vp9_filter[2]) +
              ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
@@ -201,14 +217,7 @@
              (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
 
       /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = (unsigned char)Temp;
+      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
       src_ptr++;
     }
 
@@ -227,20 +236,20 @@
  * ((filter_result + dest + 1) >> 1) and stores that in the output.
  */
 static void filter_block2d_second_pass_avg_6(int *src_ptr,
-                                             unsigned char *output_ptr,
+                                             uint8_t *output_ptr,
                                              int output_pitch,
                                              unsigned int src_pixels_per_line,
                                              unsigned int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
-                                             const short *vp9_filter) {
+                                             const int16_t *vp9_filter) {
   unsigned int i, j;
-  int  Temp;
+  int temp;
 
   for (i = 0; i < output_height; i++) {
     for (j = 0; j < output_width; j++) {
       /* Apply filter */
-      Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
+      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
              ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
              ((int)src_ptr[0]                    * vp9_filter[2]) +
              ((int)src_ptr[pixel_step]           * vp9_filter[3]) +
@@ -249,14 +258,8 @@
              (VP9_FILTER_WEIGHT >> 1);   /* Rounding */
 
       /* Normalize back to 0-255 */
-      Temp = Temp >> VP9_FILTER_SHIFT;
-
-      if (Temp < 0)
-        Temp = 0;
-      else if (Temp > 255)
-        Temp = 255;
-
-      output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1);
+      output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) +
+                       output_ptr[j] + 1) >> 1;
       src_ptr++;
     }
 
@@ -267,36 +270,39 @@
 }
 
 #define Interp_Extend 3
-static void filter_block2d_6(unsigned char  *src_ptr,
-                             unsigned char  *output_ptr,
+static void filter_block2d_6(uint8_t *src_ptr,
+                             uint8_t *output_ptr,
                              unsigned int src_pixels_per_line,
                              int output_pitch,
-                             const short  *HFilter,
-                             const short  *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+                             const int16_t *HFilter,
+                             const int16_t *VFilter) {
+  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
 
   /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 4, HFilter);
+  filter_block2d_first_pass_6(
+      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
 
-  /* then filter verticaly... */
-  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter);
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr,
+                               output_pitch, 4, 4, 4, 4, VFilter);
 }
 
 
-void vp9_sixtap_predict_c(unsigned char  *src_ptr,
-                          int   src_pixels_per_line,
-                          int  xoffset,
-                          int  yoffset,
-                          unsigned char *dst_ptr,
-                          int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
+void vp9_sixtap_predict4x4_c(uint8_t *src_ptr,
+                             int src_pixels_per_line,
+                             int xoffset,
+                             int yoffset,
+                             uint8_t *dst_ptr,
+                             int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
   VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
 
-  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
+  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
+                   VFilter);
 }
 
 /*
@@ -306,173 +312,154 @@
  * then averages that with the content already present in the output
  * ((filter_result + dest + 1) >> 1) and stores that in the output.
  */
-static void filter_block2d_avg_6(unsigned char  *src_ptr,
-                                 unsigned char  *output_ptr,
+static void filter_block2d_avg_6(uint8_t *src_ptr,
+                                 uint8_t *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  int output_pitch,
-                                 const short  *HFilter,
-                                 const short  *VFilter) {
-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */
+                                 const int16_t *HFilter,
+                                 const int16_t *VFilter) {
+  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
 
   /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line),
-                              FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 4, HFilter);
+  filter_block2d_first_pass_6(
+      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
 
-  /* then filter verticaly... */
+  /* then filter vertically... */
   filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
                                    output_pitch, 4, 4, 4, 4, VFilter);
 }
 
-void vp9_sixtap_predict_avg_c
-(
-  unsigned char  *src_ptr,
-  int   src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
+void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr,
+                                 int src_pixels_per_line,
+                                 int xoffset,
+                                 int yoffset,
+                                 uint8_t *dst_ptr,
+                                 int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
   VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
 
-  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line,
-                       dst_pitch, HFilter, VFilter);
+  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
+                       HFilter, VFilter);
 }
 
-void vp9_sixtap_predict8x8_c
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+void vp9_sixtap_predict8x8_c(uint8_t *src_ptr,
+                             int src_pixels_per_line,
+                             int xoffset,
+                             int yoffset,
+                             uint8_t *dst_ptr,
+                             int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
+  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
 
   HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
   VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
 
   /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              7 + Interp_Extend * 2, 8, HFilter);
+  filter_block2d_first_pass_6(
+      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
 
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
+                               dst_pitch, 8, 8, 8, 8, VFilter);
 
-  /* then filter verticaly... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
-
 }
 
-void vp9_sixtap_predict_avg8x8_c
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr,
+                                 int src_pixels_per_line,
+                                 int xoffset,
+                                 int yoffset,
+                                 uint8_t *dst_ptr,
+                                 int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
+  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
 
   HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
   VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
 
   /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              7 + Interp_Extend * 2, 8, HFilter);
+  filter_block2d_first_pass_6(
+      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
 
-  /* then filter verticaly... */
-  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter);
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
+                                   dst_pitch, 8, 8, 8, 8, VFilter);
 }
 
-void vp9_sixtap_predict8x4_c
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  // int FData[(7+Interp_Extend*2)*16];   /* Temp data buffer used in filtering */
-  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */
+void vp9_sixtap_predict8x4_c(uint8_t *src_ptr,
+                             int src_pixels_per_line,
+                             int xoffset,
+                             int yoffset,
+                             uint8_t *dst_ptr,
+                             int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
+  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */
 
   HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
   VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
 
   /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              3 + Interp_Extend * 2, 8, HFilter);
+  filter_block2d_first_pass_6(
+      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter);
 
-
-  /* then filter verticaly... */
-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter);
-
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
+                               dst_pitch, 8, 8, 4, 8, VFilter);
 }
 
-void vp9_sixtap_predict16x16_c
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  // int FData[(15+Interp_Extend*2)*24];   /* Temp data buffer used in filtering */
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+void vp9_sixtap_predict16x16_c(uint8_t *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               uint8_t *dst_ptr,
+                               int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
+  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
 
-
   HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
   VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
 
   /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1,
-                              15 + Interp_Extend * 2, 16, HFilter);
+  filter_block2d_first_pass_6(
+      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
 
-  /* then filter verticaly... */
-  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter);
-
+  /* then filter vertically... */
+  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
+                               dst_pitch, 16, 16, 16, 16, VFilter);
 }
 
-void vp9_sixtap_predict_avg16x16_c
-(
-  unsigned char  *src_ptr,
-  int  src_pixels_per_line,
-  int  xoffset,
-  int  yoffset,
-  unsigned char *dst_ptr,
-  int  dst_pitch
-) {
-  const short  *HFilter;
-  const short  *VFilter;
-  // int FData[(15+Interp_Extend*2)*24];   /* Temp data buffer used in filtering */
-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */
+void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   uint8_t *dst_ptr,
+                                   int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
+  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
 
   HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */
   VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */
 
   /* First filter 1-D horizontally... */
-  filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
-                              src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
+  filter_block2d_first_pass_6(
+      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
+      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
 
-  /* then filter verticaly... */
-  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch,
-                                   16, 16, 16, 16, VFilter);
+  /* then filter vertically... */
+  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
+                                   dst_pitch, 16, 16, 16, 16, VFilter);
 }
 
 typedef enum {
@@ -489,13 +476,13 @@
   {16,16},
 };
 
-static void filter_block2d_8_c(const unsigned char *src_ptr,
-                               const unsigned int   src_stride,
-                               const short *HFilter,
-                               const short *VFilter,
+static void filter_block2d_8_c(const uint8_t *src_ptr,
+                               const unsigned int src_stride,
+                               const int16_t *HFilter,
+                               const int16_t *VFilter,
                                const filter_size_t filter_size,
-                               unsigned char *dst_ptr,
-                               unsigned int   dst_stride) {
+                               uint8_t *dst_ptr,
+                               unsigned int dst_stride) {
   const unsigned int output_width = filter_size_to_wh[filter_size][0];
   const unsigned int output_height = filter_size_to_wh[filter_size][1];
 
@@ -514,12 +501,12 @@
    *                               = 23
    * and filter_max_width = 16
    */
-  unsigned char intermediate_buffer[23 * 16];
+  uint8_t intermediate_buffer[23 * 16];
   const int intermediate_next_stride = 1 - intermediate_height * output_width;
 
   // Horizontal pass (src -> transposed intermediate).
   {
-    unsigned char *output_ptr = intermediate_buffer;
+    uint8_t *output_ptr = intermediate_buffer;
     const int src_next_row_stride = src_stride - output_width;
     unsigned int i, j;
     src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
@@ -537,14 +524,8 @@
                    (VP9_FILTER_WEIGHT >> 1); // Rounding
 
         // Normalize back to 0-255...
-        temp >>= VP9_FILTER_SHIFT;
-        if (temp < 0) {
-          temp = 0;
-        } else if (temp > 255) {
-          temp = 255;
-        }
+        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
         src_ptr++;
-        *output_ptr = temp;
         output_ptr += intermediate_height;
       }
       src_ptr += src_next_row_stride;
@@ -554,7 +535,7 @@
 
   // Vertical pass (transposed intermediate -> dst).
   {
-    unsigned char *src_ptr = intermediate_buffer;
+    uint8_t *src_ptr = intermediate_buffer;
     const int dst_next_row_stride = dst_stride - output_width;
     unsigned int i, j;
     for (i = 0; i < output_height; i++) {
@@ -571,15 +552,8 @@
                    (VP9_FILTER_WEIGHT >> 1); // Rounding
 
         // Normalize back to 0-255...
-        temp >>= VP9_FILTER_SHIFT;
-        if (temp < 0) {
-          temp = 0;
-        } else if (temp > 255) {
-          temp = 255;
-        }
-
+        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
         src_ptr += intermediate_height;
-        *dst_ptr++ = (unsigned char)temp;
       }
       src_ptr += intermediate_next_stride;
       dst_ptr += dst_next_row_stride;
@@ -587,53 +561,49 @@
   }
 }
 
-void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr,
+void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr,
                                 const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
+                                const int16_t *HFilter_aligned16,
+                                const int16_t *VFilter_aligned16,
+                                uint8_t *dst_ptr,
                                 unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
                      VPX_FILTER_4x4, dst_ptr, dst_stride);
 }
 
-void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr,
+void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr,
                                 const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
+                                const int16_t *HFilter_aligned16,
+                                const int16_t *VFilter_aligned16,
+                                uint8_t *dst_ptr,
                                 unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
                      VPX_FILTER_8x4, dst_ptr, dst_stride);
 }
 
-void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr,
+void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr,
                                 const unsigned int src_stride,
-                                const short *HFilter_aligned16,
-                                const short *VFilter_aligned16,
-                                unsigned char *dst_ptr,
+                                const int16_t *HFilter_aligned16,
+                                const int16_t *VFilter_aligned16,
+                                uint8_t *dst_ptr,
                                 unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
                      VPX_FILTER_8x8, dst_ptr, dst_stride);
 }
 
-void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr,
+void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr,
                                   const unsigned int src_stride,
-                                  const short *HFilter_aligned16,
-                                  const short *VFilter_aligned16,
-                                  unsigned char *dst_ptr,
+                                  const int16_t *HFilter_aligned16,
+                                  const int16_t *VFilter_aligned16,
+                                  uint8_t *dst_ptr,
                                   unsigned int dst_stride) {
-  filter_block2d_8_c(src_ptr, src_stride,
-                     HFilter_aligned16, VFilter_aligned16,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
                      VPX_FILTER_16x16, dst_ptr, dst_stride);
 }
 
-static void block2d_average_c(unsigned char *src,
-                              unsigned int   src_stride,
-                              unsigned char *output_ptr,
+static void block2d_average_c(uint8_t *src,
+                              unsigned int src_stride,
+                              uint8_t *output_ptr,
                               unsigned int output_stride,
                               const filter_size_t filter_size) {
   const unsigned int output_width = filter_size_to_wh[filter_size][0];
@@ -650,231 +620,319 @@
 
 #define block2d_average block2d_average_c
 
-void vp9_eighttap_predict_c(unsigned char  *src_ptr,
-                            int   src_pixels_per_line,
-                            int  xoffset,
-                            int  yoffset,
-                            unsigned char *dst_ptr,
-                            int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
+void vp9_eighttap_predict4x4_c(uint8_t *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               uint8_t *dst_ptr,
+                               int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_sub_pel_filters_8[xoffset];
   VFilter = vp9_sub_pel_filters_8[yoffset];
 
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
                            dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict_avg4x4_c(unsigned char  *src_ptr,
-                                   int   src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
+void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   uint8_t *dst_ptr,
                                    int dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
-  unsigned char tmp[4 * 4];
+  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
+  uint8_t tmp[4 * 4];
 
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 4);
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
+                           4);
   block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
 }
 
-void vp9_eighttap_predict_sharp_c(unsigned char  *src_ptr,
-                                  int   src_pixels_per_line,
-                                  int  xoffset,
-                                  int  yoffset,
-                                  unsigned char *dst_ptr,
-                                  int dst_pitch) {
-  const short  *HFilter;
-  const short  *VFilter;
+void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr,
+                                     int src_pixels_per_line,
+                                     int xoffset,
+                                     int yoffset,
+                                     uint8_t *dst_ptr,
+                                     int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_sub_pel_filters_8s[xoffset];
   VFilter = vp9_sub_pel_filters_8s[yoffset];
 
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr,
+                                      int src_pixels_per_line,
+                                      int xoffset,
+                                      int yoffset,
+                                      uint8_t *dst_ptr,
+                                      int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
+
+  HFilter = vp9_sub_pel_filters_8lp[xoffset];
+  VFilter = vp9_sub_pel_filters_8lp[yoffset];
+
   vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
                            HFilter, VFilter,
                            dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char  *src_ptr,
-                                         int   src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         unsigned char *dst_ptr,
+void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr,
+                                         int src_pixels_per_line,
+                                         int xoffset,
+                                         int yoffset,
+                                         uint8_t *dst_ptr,
                                          int dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
-  unsigned char tmp[4 * 4];
+  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
+  uint8_t tmp[4 * 4];
 
-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 4);
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
+                           4);
   block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
 }
 
-void vp9_eighttap_predict8x8_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr,
+                                          int src_pixels_per_line,
+                                          int xoffset,
+                                          int yoffset,
+                                          uint8_t *dst_ptr,
+                                          int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
+  uint8_t tmp[4 * 4];
 
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
+  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
+                           4);
+  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
+}
+
+
+void vp9_eighttap_predict8x8_c(uint8_t *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               uint8_t *dst_ptr,
+                               int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
                            dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict8x8_sharp_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr,
+                                     int src_pixels_per_line,
+                                     int xoffset,
+                                     int yoffset,
+                                     uint8_t *dst_ptr,
+                                     int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
 
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
                            dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict_avg8x8_c(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  unsigned char tmp[8 * 8];
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr,
+                                      int src_pixels_per_line,
+                                      int xoffset,
+                                      int yoffset,
+                                      uint8_t *dst_ptr,
+                                      int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
 
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 8);
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                           dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   uint8_t *dst_ptr,
+                                   int dst_pitch) {
+  uint8_t tmp[8 * 8];
+  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
+                           8);
   block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
 }
 
-void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char  *src_ptr,
-                                         int  src_pixels_per_line,
-                                         int  xoffset,
-                                         int  yoffset,
-                                         unsigned char *dst_ptr,
-                                         int  dst_pitch) {
-  unsigned char tmp[8 * 8];
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr,
+                                         int src_pixels_per_line,
+                                         int xoffset,
+                                         int yoffset,
+                                         uint8_t *dst_ptr,
+                                         int dst_pitch) {
+  uint8_t tmp[8 * 8];
+  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
 
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 8);
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
+                           8);
   block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
 }
 
-void vp9_eighttap_predict8x4_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr,
+                                          int src_pixels_per_line,
+                                          int xoffset,
+                                          int yoffset,
+                                          uint8_t *dst_ptr,
+                                          int dst_pitch) {
+  uint8_t tmp[8 * 8];
+  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
 
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
+  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
+                           8);
+  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
+}
+
+void vp9_eighttap_predict8x4_c(uint8_t *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               uint8_t *dst_ptr,
+                               int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
                            dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict8x4_sharp_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr,
+                                     int src_pixels_per_line,
+                                     int xoffset,
+                                     int yoffset,
+                                     uint8_t *dst_ptr,
+                                     int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
 
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
+  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
                            dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict16x16_c(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr,
+                                      int src_pixels_per_line,
+                                      int xoffset,
+                                      int yoffset,
+                                      uint8_t *dst_ptr,
+                                      int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
 
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                       HFilter, VFilter,
-                       dst_ptr, dst_pitch);
+  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                           dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict16x16_sharp_c(unsigned char  *src_ptr,
-                                       int  src_pixels_per_line,
-                                       int  xoffset,
-                                       int  yoffset,
-                                       unsigned char *dst_ptr,
-                                       int  dst_pitch) {
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+void vp9_eighttap_predict16x16_c(uint8_t *src_ptr,
+                                 int src_pixels_per_line,
+                                 int xoffset,
+                                 int yoffset,
+                                 uint8_t *dst_ptr,
+                                 int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
 
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                       HFilter, VFilter,
-                       dst_ptr, dst_pitch);
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                             dst_ptr, dst_pitch);
 }
 
-void vp9_eighttap_predict_avg16x16_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short  *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8[yoffset];
+void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr,
+                                       int src_pixels_per_line,
+                                       int xoffset,
+                                       int yoffset,
+                                       uint8_t *dst_ptr,
+                                       int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
 
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                       HFilter, VFilter,
-                       tmp, 16);
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                             dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr,
+                                        int src_pixels_per_line,
+                                        int xoffset,
+                                        int yoffset,
+                                        uint8_t *dst_ptr,
+                                        int dst_pitch) {
+  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                             dst_ptr, dst_pitch);
+}
+
+void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr,
+                                     int src_pixels_per_line,
+                                     int xoffset,
+                                     int yoffset,
+                                     uint8_t *dst_ptr,
+                                     int dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
+  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                             tmp, 16);
   block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
 }
 
-void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char  *src_ptr,
-                                           int  src_pixels_per_line,
-                                           int  xoffset,
-                                           int  yoffset,
-                                           unsigned char *dst_ptr,
-                                           int  dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short  *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short  *VFilter = vp9_sub_pel_filters_8s[yoffset];
+void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr,
+                                           int src_pixels_per_line,
+                                           int xoffset,
+                                           int yoffset,
+                                           uint8_t *dst_ptr,
+                                           int dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
+  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
 
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                       HFilter, VFilter,
-                       tmp, 16);
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                             tmp, 16);
   block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
 }
 
+void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr,
+                                            int src_pixels_per_line,
+                                            int xoffset,
+                                            int yoffset,
+                                            uint8_t *dst_ptr,
+                                            int dst_pitch) {
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
+  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
+  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
+
+  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
+                             tmp, 16);
+  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
+}
+
 /****************************************************************************
  *
  *  ROUTINE       : filter_block2d_bil_first_pass
  *
- *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
- *                  UINT32  src_stride : Stride of source block.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT32  *vp9_filter : Array of 2 bi-linear filter taps.
+ *  INPUTS        : uint8_t  *src_ptr    : Pointer to source block.
+ *                  uint32_t  src_stride : Stride of source block.
+ *                  uint32_t  height     : Block height.
+ *                  uint32_t  width      : Block width.
+ *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : INT32  *dst_ptr    : Pointer to filtered block.
+ *  OUTPUTS       : int32_t  *dst_ptr    : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
@@ -882,16 +940,16 @@
  *                  in the horizontal direction to produce the filtered output
  *                  block. Used to implement first-pass of 2-D separable filter.
  *
- *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
+ *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
  *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
  *
  ****************************************************************************/
-static void filter_block2d_bil_first_pass(unsigned char  *src_ptr,
-                                          unsigned short *dst_ptr,
-                                          unsigned int    src_stride,
-                                          unsigned int    height,
-                                          unsigned int    width,
-                                          const short    *vp9_filter) {
+static void filter_block2d_bil_first_pass(uint8_t *src_ptr,
+                                          uint16_t *dst_ptr,
+                                          unsigned int src_stride,
+                                          unsigned int height,
+                                          unsigned int width,
+                                          const int16_t *vp9_filter) {
   unsigned int i, j;
 
   for (i = 0; i < height; i++) {
@@ -913,13 +971,13 @@
  *
  *  ROUTINE       : filter_block2d_bil_second_pass
  *
- *  INPUTS        : INT32  *src_ptr    : Pointer to source block.
- *                  UINT32  dst_pitch  : Destination block pitch.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT32  *vp9_filter : Array of 2 bi-linear filter taps.
+ *  INPUTS        : int32_t  *src_ptr    : Pointer to source block.
+ *                  uint32_t  dst_pitch  : Destination block pitch.
+ *                  uint32_t  height     : Block height.
+ *                  uint32_t  width      : Block width.
+ *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
+ *  OUTPUTS       : uint16_t *dst_ptr    : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
@@ -931,22 +989,22 @@
  *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
  *
  ****************************************************************************/
-static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
-                                           unsigned char  *dst_ptr,
-                                           int             dst_pitch,
-                                           unsigned int    height,
-                                           unsigned int    width,
-                                           const short    *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
+static void filter_block2d_bil_second_pass(uint16_t *src_ptr,
+                                           uint8_t *dst_ptr,
+                                           int dst_pitch,
+                                           unsigned int height,
+                                           unsigned int width,
+                                           const int16_t *vp9_filter) {
+  unsigned int i, j;
+  int temp;
 
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       /* Apply filter */
-      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
+      temp = ((int)src_ptr[0]     * vp9_filter[0]) +
              ((int)src_ptr[width] * vp9_filter[1]) +
              (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT);
+      dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT);
       src_ptr++;
     }
 
@@ -964,22 +1022,22 @@
  * with the values already present in the output and stores the result of
  * that back into the output ((filter_result + dest + 1) >> 1).
  */
-static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
-                                               unsigned char  *dst_ptr,
-                                               int             dst_pitch,
-                                               unsigned int    height,
-                                               unsigned int    width,
-                                               const short    *vp9_filter) {
-  unsigned int  i, j;
-  int  Temp;
+static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr,
+                                               uint8_t *dst_ptr,
+                                               int dst_pitch,
+                                               unsigned int height,
+                                               unsigned int width,
+                                               const int16_t *vp9_filter) {
+  unsigned int i, j;
+  int temp;
 
   for (i = 0; i < height; i++) {
     for (j = 0; j < width; j++) {
       /* Apply filter */
-      Temp = ((int)src_ptr[0]     * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned int)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
+      temp = (((int)src_ptr[0]     * vp9_filter[0]) +
+              ((int)src_ptr[width] * vp9_filter[1]) +
+              (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+      dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1);
       src_ptr++;
     }
 
@@ -992,15 +1050,15 @@
  *
  *  ROUTINE       : filter_block2d_bil
  *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32  src_pitch        : Stride of source block.
- *                  UINT32  dst_pitch        : Stride of destination block.
- *                  INT32  *HFilter          : Array of 2 horizontal filter taps.
- *                  INT32  *VFilter          : Array of 2 vertical filter taps.
- *                  INT32  Width             : Block width
- *                  INT32  Height            : Block height
+ *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
+ *                  uint32_t  src_pitch        : Stride of source block.
+ *                  uint32_t  dst_pitch        : Stride of destination block.
+ *                  int32_t  *HFilter          : Array of 2 horizontal filter taps.
+ *                  int32_t  *VFilter          : Array of 2 vertical filter taps.
+ *                  int32_t  Width             : Block width
+ *                  int32_t  Height            : Block height
  *
- *  OUTPUTS       : UINT16 *dst_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : uint16_t *dst_ptr       : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
@@ -1011,16 +1069,16 @@
  *  SPECIAL NOTES : The largest block size can be handled here is 16x16
  *
  ****************************************************************************/
-static void filter_block2d_bil(unsigned char *src_ptr,
-                               unsigned char *dst_ptr,
-                               unsigned int   src_pitch,
-                               unsigned int   dst_pitch,
-                               const short   *HFilter,
-                               const short   *VFilter,
-                               int            Width,
-                               int            Height) {
+static void filter_block2d_bil(uint8_t *src_ptr,
+                               uint8_t *dst_ptr,
+                               unsigned int src_pitch,
+                               unsigned int dst_pitch,
+                               const int16_t *HFilter,
+                               const int16_t *VFilter,
+                               int Width,
+                               int Height) {
 
-  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
+  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */
 
   /* First filter 1-D horizontally... */
   filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
@@ -1029,15 +1087,15 @@
   filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 
-static void filter_block2d_bil_avg(unsigned char *src_ptr,
-                                   unsigned char *dst_ptr,
-                                   unsigned int   src_pitch,
-                                   unsigned int   dst_pitch,
-                                   const short   *HFilter,
-                                   const short   *VFilter,
-                                   int            Width,
-                                   int            Height) {
-  unsigned short FData[17 * 16];  /* Temp data buffer used in filtering */
+static void filter_block2d_bil_avg(uint8_t *src_ptr,
+                                   uint8_t *dst_ptr,
+                                   unsigned int src_pitch,
+                                   unsigned int dst_pitch,
+                                   const int16_t *HFilter,
+                                   const int16_t *VFilter,
+                                   int Width,
+                                   int Height) {
+  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */
 
   /* First filter 1-D horizontally... */
   filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
@@ -1046,14 +1104,14 @@
   filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
 }
 
-void vp9_bilinear_predict4x4_c(unsigned char  *src_ptr,
-                               int   src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
+void vp9_bilinear_predict4x4_c(uint8_t *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               uint8_t *dst_ptr,
                                int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -1061,14 +1119,14 @@
   filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
 }
 
-void vp9_bilinear_predict_avg4x4_c(unsigned char  *src_ptr,
-                                   int   src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
+void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   uint8_t *dst_ptr,
                                    int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -1077,14 +1135,14 @@
                          dst_pitch, HFilter, VFilter, 4, 4);
 }
 
-void vp9_bilinear_predict8x8_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
+void vp9_bilinear_predict8x8_c(uint8_t *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               uint8_t *dst_ptr,
+                               int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -1093,14 +1151,14 @@
 
 }
 
-void vp9_bilinear_predict_avg8x8_c(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
+void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   uint8_t *dst_ptr,
+                                   int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -1109,14 +1167,14 @@
                          dst_pitch, HFilter, VFilter, 8, 8);
 }
 
-void vp9_bilinear_predict8x4_c(unsigned char  *src_ptr,
-                               int  src_pixels_per_line,
-                               int  xoffset,
-                               int  yoffset,
-                               unsigned char *dst_ptr,
-                               int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
+void vp9_bilinear_predict8x4_c(uint8_t *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               uint8_t *dst_ptr,
+                               int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -1125,14 +1183,14 @@
 
 }
 
-void vp9_bilinear_predict16x16_c(unsigned char  *src_ptr,
-                                 int  src_pixels_per_line,
-                                 int  xoffset,
-                                 int  yoffset,
-                                 unsigned char *dst_ptr,
-                                 int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
+void vp9_bilinear_predict16x16_c(uint8_t *src_ptr,
+                                 int src_pixels_per_line,
+                                 int xoffset,
+                                 int yoffset,
+                                 uint8_t *dst_ptr,
+                                 int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -1140,14 +1198,14 @@
   filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
 }
 
-void vp9_bilinear_predict_avg16x16_c(unsigned char  *src_ptr,
-                                     int  src_pixels_per_line,
-                                     int  xoffset,
-                                     int  yoffset,
-                                     unsigned char *dst_ptr,
-                                     int  dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
+void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr,
+                                     int src_pixels_per_line,
+                                     int xoffset,
+                                     int yoffset,
+                                     uint8_t *dst_ptr,
+                                     int dst_pitch) {
+  const int16_t *HFilter;
+  const int16_t *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -13,6 +13,7 @@
 
 #include "vpx_config.h"
 #include "vpx_scale/yv12config.h"
+#include "vpx/vpx_integer.h"
 
 #define BLOCK_HEIGHT_WIDTH 4
 #define VP9_FILTER_WEIGHT 128
@@ -20,9 +21,10 @@
 
 #define SUBPEL_SHIFTS 16
 
-extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2];
-extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
-extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
-extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
+extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];
+extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
+extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
+extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
+extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
 
-#endif // FILTER_H
+#endif  // VP9_COMMON_VP9_FILTER_H_
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -14,7 +14,7 @@
 #include "vp9/common/vp9_subpelvar.h"
 #include <limits.h>
 
-const unsigned char vp9_mbsplit_offset[4][16] = {
+const uint8_t vp9_mbsplit_offset[4][16] = {
   { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
   { 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
   { 0,  2,  8, 10,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},
@@ -42,23 +42,23 @@
 }
 
 #define SP(x) (((x) & 7) << 1)
-unsigned int vp9_sad3x16_c(const unsigned char *src_ptr,
+unsigned int vp9_sad3x16_c(const uint8_t *src_ptr,
                            int  src_stride,
-                           const unsigned char *ref_ptr,
+                           const uint8_t *ref_ptr,
                            int  ref_stride) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
 }
-unsigned int vp9_sad16x3_c(const unsigned char *src_ptr,
+unsigned int vp9_sad16x3_c(const uint8_t *src_ptr,
                            int  src_stride,
-                           const unsigned char *ref_ptr,
+                           const uint8_t *ref_ptr,
                            int  ref_stride) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
 }
 
-#if CONFIG_SUBPELREFMV
-unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
+
+unsigned int vp9_variance2x16_c(const uint8_t *src_ptr,
                                 const int  source_stride,
-                                const unsigned char *ref_ptr,
+                                const uint8_t *ref_ptr,
                                 const int  recon_stride,
                                 unsigned int *sse) {
   int sum;
@@ -66,9 +66,9 @@
   return (*sse - (((unsigned int)sum * sum) >> 5));
 }
 
-unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
+unsigned int vp9_variance16x2_c(const uint8_t *src_ptr,
                                 const int  source_stride,
-                                const unsigned char *ref_ptr,
+                                const uint8_t *ref_ptr,
                                 const int  recon_stride,
                                 unsigned int *sse) {
   int sum;
@@ -76,16 +76,16 @@
   return (*sse - (((unsigned int)sum * sum) >> 5));
 }
 
-unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
                                           const int  src_pixels_per_line,
                                           const int  xoffset,
                                           const int  yoffset,
-                                          const unsigned char *dst_ptr,
+                                          const uint8_t *dst_ptr,
                                           const int dst_pixels_per_line,
                                           unsigned int *sse) {
-  unsigned short FData3[16 * 3];  // Temp data buffer used in filtering
-  unsigned char  temp2[2 * 16];
-  const short *HFilter, *VFilter;
+  uint16_t FData3[16 * 3];  // Temp data buffer used in filtering
+  uint8_t temp2[2 * 16];
+  const int16_t *HFilter, *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -97,16 +97,16 @@
   return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
-unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
                                           const int  src_pixels_per_line,
                                           const int  xoffset,
                                           const int  yoffset,
-                                          const unsigned char *dst_ptr,
+                                          const uint8_t *dst_ptr,
                                           const int dst_pixels_per_line,
                                           unsigned int *sse) {
-  unsigned short FData3[2 * 17];  // Temp data buffer used in filtering
-  unsigned char  temp2[2 * 16];
-  const short *HFilter, *VFilter;
+  uint16_t FData3[2 * 17];  // Temp data buffer used in filtering
+  uint8_t temp2[2 * 16];
+  const int16_t *HFilter, *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -117,7 +117,6 @@
 
   return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
 }
-#endif
 
 /* check a list of motion vectors by sad score using a number rows of pixels
  * above and a number cols of pixels in the left to select the one with best
@@ -124,44 +123,40 @@
  * score to use as ref motion vector
  */
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
+                           uint8_t *ref_y_buffer,
                            int ref_y_stride,
                            int_mv *mvlist,
-                           int_mv *best_mv,
                            int_mv *nearest,
                            int_mv *near) {
   int i, j;
-  unsigned char *above_src;
-  unsigned char *left_src;
-  unsigned char *above_ref;
-  unsigned char *left_ref;
+  uint8_t *above_src;
+  uint8_t *above_ref;
+#if !CONFIG_ABOVESPREFMV
+  uint8_t *left_src;
+  uint8_t *left_ref;
+#endif
   unsigned int score;
-#if CONFIG_SUBPELREFMV
   unsigned int sse;
-#endif
-  unsigned int ref_scores[MAX_MV_REFS] = {0};
-  int_mv sorted_mvs[MAX_MV_REFS];
+  unsigned int ref_scores[MAX_MV_REF_CANDIDATES] = {0};
+  int_mv sorted_mvs[MAX_MV_REF_CANDIDATES];
   int zero_seen = FALSE;
 
   // Default all to 0,0 if nothing else available
-  best_mv->as_int = nearest->as_int = near->as_int = 0;
+  nearest->as_int = near->as_int = 0;
   vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
 
-#if CONFIG_SUBPELREFMV
   above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
-  left_src  = xd->dst.y_buffer - 2;
   above_ref = ref_y_buffer - ref_y_stride * 2;
-  left_ref  = ref_y_buffer - 2;
+#if CONFIG_ABOVESPREFMV
+  above_src -= 4;
+  above_ref -= 4;
 #else
-  above_src = xd->dst.y_buffer - xd->dst.y_stride * 3;
-  left_src  = xd->dst.y_buffer - 3;
-  above_ref = ref_y_buffer - ref_y_stride * 3;
-  left_ref  = ref_y_buffer - 3;
+  left_src  = xd->dst.y_buffer - 2;
+  left_ref  = ref_y_buffer - 2;
 #endif
 
-  //for(i = 0; i < MAX_MV_REFS; ++i) {
-  // Limit search to the predicted best 4
-  for(i = 0; i < 4; ++i) {
+  // Limit search to the predicted best few candidates
+  for(i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
     int_mv this_mv;
     int offset = 0;
     int row_offset, col_offset;
@@ -175,34 +170,54 @@
 
     zero_seen = zero_seen || !this_mv.as_int;
 
+#if !CONFIG_ABOVESPREFMV
     clamp_mv(&this_mv,
              xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,
              xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
              xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
              xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+#else
+    clamp_mv(&this_mv,
+             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,
+             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,
+             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+#endif
 
-#if CONFIG_SUBPELREFMV
     row_offset = this_mv.as_mv.row >> 3;
     col_offset = this_mv.as_mv.col >> 3;
     offset = ref_y_stride * row_offset + col_offset;
     score = 0;
     if (xd->up_available) {
-      vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
+      vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
+                                 SP(this_mv.as_mv.col),
+                                 SP(this_mv.as_mv.row),
+                                 above_src, xd->dst.y_stride, &sse);
+      score += sse;
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
+        vp9_sub_pixel_variance16x2(above_ref + offset + 16,
+                                   ref_y_stride,
                                    SP(this_mv.as_mv.col),
                                    SP(this_mv.as_mv.row),
-                                   above_src, xd->dst.y_stride, &sse);
-      score += sse;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        vp9_sub_pixel_variance16x2_c(above_ref + offset + 16,
-                                     ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src + 16, xd->dst.y_stride, &sse);
+                                   above_src + 16, xd->dst.y_stride, &sse);
         score += sse;
       }
-#endif
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+        vp9_sub_pixel_variance16x2(above_ref + offset + 32,
+                                   ref_y_stride,
+                                   SP(this_mv.as_mv.col),
+                                   SP(this_mv.as_mv.row),
+                                   above_src + 32, xd->dst.y_stride, &sse);
+        score += sse;
+        vp9_sub_pixel_variance16x2(above_ref + offset + 48,
+                                   ref_y_stride,
+                                   SP(this_mv.as_mv.col),
+                                   SP(this_mv.as_mv.row),
+                                   above_src + 48, xd->dst.y_stride, &sse);
+        score += sse;
+      }
     }
+#if !CONFIG_ABOVESPREFMV
     if (xd->left_available) {
       vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,
                                    SP(this_mv.as_mv.col),
@@ -209,8 +224,7 @@
                                    SP(this_mv.as_mv.row),
                                    left_src, xd->dst.y_stride, &sse);
       score += sse;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
         vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,
                                      ref_y_stride,
                                      SP(this_mv.as_mv.col),
@@ -219,38 +233,24 @@
                                      xd->dst.y_stride, &sse);
         score += sse;
       }
-#endif
-    }
-#else
-    row_offset = (this_mv.as_mv.row > 0) ?
-      ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3);
-    col_offset = (this_mv.as_mv.col > 0) ?
-      ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3);
-    offset = ref_y_stride * row_offset + col_offset;
-    score = 0;
-    if (xd->up_available) {
-      score += vp9_sad16x3(above_src, xd->dst.y_stride,
-                           above_ref + offset, ref_y_stride);
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        score += vp9_sad16x3(above_src + 16, xd->dst.y_stride,
-                             above_ref + offset + 16, ref_y_stride);
+      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
+        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     left_src + xd->dst.y_stride * 32,
+                                     xd->dst.y_stride, &sse);
+        score += sse;
+        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     left_src + xd->dst.y_stride * 48,
+                                     xd->dst.y_stride, &sse);
+        score += sse;
       }
-#endif
     }
-    if (xd->left_available) {
-      score += vp9_sad3x16(left_src, xd->dst.y_stride,
-                           left_ref + offset, ref_y_stride);
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        score += vp9_sad3x16(left_src + xd->dst.y_stride * 16,
-                             xd->dst.y_stride,
-                             left_ref + offset + ref_y_stride * 16,
-                             ref_y_stride);
-      }
 #endif
-    }
-#endif
     // Add the entry to our list and then resort the list on score.
     ref_scores[i] = score;
     sorted_mvs[i].as_int = this_mv.as_int;
@@ -268,13 +268,10 @@
   }
 
   // Make sure all the candidates are properly clamped etc
-  for (i = 0; i < 4; ++i) {
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
     lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
     clamp_mv2(&sorted_mvs[i], xd);
   }
-
-  // Set the best mv to the first entry in the sorted list
-  best_mv->as_int = sorted_mvs[0].as_int;
 
   // Provided that there are non zero vectors available there will not
   // be more than one 0,0 entry in the sorted list.
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -22,10 +22,9 @@
  * score to use as ref motion vector
  */
 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
+                           uint8_t *ref_y_buffer,
                            int ref_y_stride,
                            int_mv *mvlist,
-                           int_mv *best_mv,
                            int_mv *nearest,
                            int_mv *near);
 
@@ -82,7 +81,7 @@
                            vp9_prob p[VP9_MVREFS - 1],
                            const int context);
 
-extern const unsigned char vp9_mbsplit_offset[4][16];
+extern const uint8_t vp9_mbsplit_offset[4][16];
 
 static int left_block_mv(const MODE_INFO *cur_mb, int b) {
   if (!(b & 3)) {
@@ -182,4 +181,4 @@
   return (cur_mb->bmi + b - 4)->as_mode.first;
 }
 
-#endif
+#endif  // VP9_COMMON_VP9_FINDNEARMV_H_
--- a/vp9/common/vp9_header.h
+++ b/vp9/common/vp9_header.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_HEADER_H_
 #define VP9_COMMON_VP9_HEADER_H_
 
@@ -38,5 +37,4 @@
 #define VP9_HEADER_SIZE 3
 #endif
 
-
-#endif
+#endif  // VP9_COMMON_VP9_HEADER_H_
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -26,8 +26,8 @@
 #include <math.h>
 #include "./vpx_config.h"
 #include "vp9/common/vp9_systemdependent.h"
-
 #include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_common.h"
 
 static const int cospi8sqrt2minus1 = 20091;
 static const int sinpi8sqrt2      = 35468;
@@ -159,10 +159,10 @@
 
 
 /* Converted the transforms to integer form. */
-#define VERTICAL_SHIFT 14  // 16
-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
-#define HORIZONTAL_SHIFT 17  // 15
+#define HORIZONTAL_SHIFT 14  // 16
 #define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)
+#define VERTICAL_SHIFT 17  // 15
+#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)
 void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
                       TX_TYPE tx_type, int tx_dim, uint16_t eobs) {
   int i, j, k;
@@ -218,51 +218,57 @@
     }
   }
 
-  /* vertical transformation */
+  /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps
+   * from right to left:
+   * 1. horizontal transform: Y= Z*Transposed_M2
+   * 2. vertical transform: X = M1*Y
+   * In SIMD, doing this way could eliminate the transpose needed if it is
+   * calculated from left to right.
+   */
+  /* Horizontal transformation */
   for (j = 0; j < tx_dim; j++) {
     for (i = 0; i < nz_dim; i++) {
       int temp = 0;
 
       for (k = 0; k < nz_dim; k++) {
-        temp += ptv[k] * ip[(k * tx_dim)];
+        temp += ip[k] * pth[k];
       }
 
-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
-      ip++;
+      /* Calculate im and store it in its transposed position. */
+      im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
+      ip += tx_dim;
     }
-    im += tx_dim;  // 16
-    ptv += tx_dim;
+    im += tx_dim;
+    pth += tx_dim;
     ip = input;
   }
 
-  /* horizontal transformation */
+  /* Vertical transformation */
   im = &imbuf[0];
 
-  for (j = 0; j < tx_dim; j++) {
-    const int16_t *pthc = pth;
-
-    for (i = 0; i < tx_dim; i++) {
+  for (i = 0; i < tx_dim; i++) {
+    for (j = 0; j < tx_dim; j++) {
       int temp = 0;
 
       for (k = 0; k < nz_dim; k++) {
-        temp += im[k] * pthc[k];
+        temp += ptv[k] * im[k];
       }
 
-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);
-      pthc += tx_dim;
+      op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);
+      im += tx_dim;
     }
-
-    im += tx_dim;  // 16
+    im = &imbuf[0];
+    ptv += tx_dim;
     op += shortpitch;
   }
 }
 
-void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) {
+void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
 
-  short *ip = input;
-  short *op = output;
+  int16_t *ip = input;
+  int16_t *op = output;
   int temp1, temp2;
   int shortpitch = pitch >> 1;
 
@@ -314,10 +320,10 @@
   }
 }
 
-void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) {
+void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {
   int i;
   int a1;
-  short *op = output;
+  int16_t *op = output;
   int shortpitch = pitch >> 1;
   a1 = ((input[0] + 16) >> 5);
   for (i = 0; i < 4; i++) {
@@ -329,22 +335,14 @@
   }
 }
 
-void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
-                            unsigned char *dst_ptr, int pitch, int stride) {
+void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,
+                            uint8_t *dst_ptr, int pitch, int stride) {
   int a1 = ((input_dc + 16) >> 5);
   int r, c;
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 4; c++) {
-      int a = a1 + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
+      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
     }
 
     dst_ptr += stride;
@@ -352,11 +350,11 @@
   }
 }
 
-void vp9_short_inv_walsh4x4_c(short *input, short *output) {
+void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) {
   int i;
   int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
+  int16_t *ip = input;
+  int16_t *op = output;
 
   for (i = 0; i < 4; i++) {
     a1 = ((ip[0] + ip[3]));
@@ -389,11 +387,11 @@
   }
 }
 
-void vp9_short_inv_walsh4x4_1_c(short *in, short *out) {
+void vp9_short_inv_walsh4x4_1_c(int16_t *in, int16_t *out) {
   int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
+  int16_t tmp[4];
+  int16_t *ip = in;
+  int16_t *op = tmp;
 
   op[0] = (ip[0] + 1) >> 1;
   op[1] = op[2] = op[3] = (ip[0] >> 1);
@@ -409,11 +407,11 @@
 }
 
 #if CONFIG_LOSSLESS
-void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) {
+void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) {
   int i;
   int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
+  int16_t *ip = input;
+  int16_t *op = output;
 
   for (i = 0; i < 4; i++) {
     a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;
@@ -449,11 +447,11 @@
   }
 }
 
-void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) {
+void vp9_short_inv_walsh4x4_1_lossless_c(int16_t *in, int16_t *out) {
   int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
+  int16_t tmp[4];
+  int16_t *ip = in;
+  int16_t *op = tmp;
 
   op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;
   op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);
@@ -468,11 +466,11 @@
   }
 }
 
-void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) {
+void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {
   int i;
   int a1, b1, c1, d1;
-  short *ip = input;
-  short *op = output;
+  int16_t *ip = input;
+  int16_t *op = output;
   int shortpitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
@@ -509,11 +507,11 @@
   }
 }
 
-void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) {
+void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {
   int i;
-  short tmp[4];
-  short *ip = in;
-  short *op = tmp;
+  int16_t tmp[4];
+  int16_t *ip = in;
+  int16_t *op = tmp;
   int shortpitch = pitch >> 1;
 
   op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
@@ -530,8 +528,8 @@
   }
 }
 
-void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr,
-                                 unsigned char *dst_ptr,
+void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,
+                                 uint8_t *dst_ptr,
                                  int pitch, int stride) {
   int r, c;
   short tmp[16];
@@ -539,14 +537,7 @@
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 4; c++) {
-      int a = tmp[r * 4 + c] + pred_ptr[c];
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
+      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
     }
 
     dst_ptr += stride;
@@ -556,25 +547,17 @@
 #endif
 
 void vp9_dc_only_idct_add_8x8_c(short input_dc,
-                                unsigned char *pred_ptr,
-                                unsigned char *dst_ptr,
+                                uint8_t *pred_ptr,
+                                uint8_t *dst_ptr,
                                 int pitch, int stride) {
   int a1 = ((input_dc + 16) >> 5);
   int r, c, b;
-  unsigned char *orig_pred = pred_ptr;
-  unsigned char *orig_dst = dst_ptr;
+  uint8_t *orig_pred = pred_ptr;
+  uint8_t *orig_dst = dst_ptr;
   for (b = 0; b < 4; b++) {
     for (r = 0; r < 4; r++) {
       for (c = 0; c < 4; c++) {
-        int a = a1 + pred_ptr[c];
-
-        if (a < 0)
-          a = 0;
-
-        if (a > 255)
-          a = 255;
-
-        dst_ptr[c] = (unsigned char) a;
+        dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);
       }
 
       dst_ptr += stride;
@@ -662,8 +645,8 @@
         (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
         (x7 = blk[8 * 3]))) {
     blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
-                                           = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
-                                                                       = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
+        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
+        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
     return;
   }
 
@@ -708,7 +691,7 @@
 }
 
 #define TX_DIM 8
-void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) {
+void vp9_short_idct8x8_c(int16_t *coefs, int16_t *block, int pitch) {
   int X[TX_DIM * TX_DIM];
   int i, j;
   int shortpitch = pitch >> 1;
@@ -827,7 +810,7 @@
   blk[8 * 7] = (x7 - x1) >> 14;
 }
 
-void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) {
+void vp9_short_idct10_8x8_c(int16_t *coefs, int16_t *block, int pitch) {
   int X[TX_DIM * TX_DIM];
   int i, j;
   int shortpitch = pitch >> 1;
@@ -840,7 +823,7 @@
   }
 
   /* Do first 4 row idct only since non-zero dct coefficients are all in
-   *  upper-left 4x4 area. */
+   * upper-left 4x4 area. */
   for (i = 0; i < 4; i++)
     idctrow10(X + 8 * i);
 
@@ -854,10 +837,10 @@
   }
 }
 
-void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) {
+void vp9_short_ihaar2x2_c(int16_t *input, int16_t *output, int pitch) {
   int i;
-  short *ip = input; // 0,1, 4, 8
-  short *op = output;
+  int16_t *ip = input;  // 0, 1, 4, 8
+  int16_t *op = output;
   for (i = 0; i < 16; i++) {
     op[i] = 0;
   }
@@ -871,7 +854,7 @@
 
 #if 0
 // Keep a really bad float version as reference for now.
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
 
   vp9_clear_system_state(); // Make it simd safe : __asm emms;
   {
@@ -901,25 +884,25 @@
 
 #define TEST_INT_16x16_IDCT 1
 #if !TEST_INT_16x16_IDCT
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
 
-
 static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
 
+  static const double C1 = 0.995184726672197;
+  static const double C2 = 0.98078528040323;
+  static const double C3 = 0.956940335732209;
+  static const double C4 = 0.923879532511287;
+  static const double C5 = 0.881921264348355;
+  static const double C6 = 0.831469612302545;
+  static const double C7 = 0.773010453362737;
+  static const double C8 = 0.707106781186548;
+  static const double C9 = 0.634393284163646;
+  static const double C10 = 0.555570233019602;
+  static const double C11 = 0.471396736825998;
+  static const double C12 = 0.38268343236509;
+  static const double C13 = 0.290284677254462;
+  static const double C14 = 0.195090322016128;
+  static const double C15 = 0.098017140329561;
+
   vp9_clear_system_state(); // Make it simd safe : __asm emms;
   {
     double step[16];
@@ -1131,7 +1114,7 @@
 }
 #endif
 
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) {
+void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
 
   vp9_clear_system_state(); // Make it simd safe : __asm emms;
   {
@@ -1163,6 +1146,12 @@
 }
 
 #else
+
+#define INITIAL_SHIFT 2
+#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))
+#define RIGHT_SHIFT 14
+#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))
+
 static const int16_t C1 = 16305;
 static const int16_t C2 = 16069;
 static const int16_t C3 = 15679;
@@ -1179,212 +1168,207 @@
 static const int16_t C14 = 3196;
 static const int16_t C15 = 1606;
 
-#define INITIAL_SHIFT 2
-#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))
-#define RIGHT_SHIFT 14
-#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))
-
 static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16],
                                     int last_shift_bits) {
-    int16_t step[16];
-    int intermediate[16];
-    int temp1, temp2;
+  int16_t step[16];
+  int intermediate[16];
+  int temp1, temp2;
 
-    int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT;
-    int step1_rounding = 1 << (step1_shift - 1);
-    int last_rounding = 0;
+  int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT;
+  int step1_rounding = 1 << (step1_shift - 1);
+  int last_rounding = 0;
 
-    if (last_shift_bits > 0)
-      last_rounding = 1 << (last_shift_bits - 1);
+  if (last_shift_bits > 0)
+    last_rounding = 1 << (last_shift_bits - 1);
 
-    // step 1 and 2
-    step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
+  // step 1 and 2
+  step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
+  step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;
 
-    temp1 = input[4] * C12;
-    temp2 = input[12] * C4;
-    temp1 = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp1  *= C8;
-    step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift;
+  temp1 = input[4] * C12;
+  temp2 = input[12] * C4;
+  temp1 = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1  *= C8;
+  step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift;
 
-    temp1 = input[4] * C4;
-    temp2 = input[12] * C12;
-    temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp1 *= C8;
-    step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift;
+  temp1 = input[4] * C4;
+  temp2 = input[12] * C12;
+  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 *= C8;
+  step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift;
 
-    temp1 = input[2] * C8;
-    temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp2 = input[6] + input[10];
-    step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
+  temp1 = input[2] * C8;
+  temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp2 = input[6] + input[10];
+  step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
+  step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
 
-    temp1 = input[14] * C8;
-    temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp2 = input[6] - input[10];
-    step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
-    step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
+  temp1 = input[14] * C8;
+  temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp2 = input[6] - input[10];
+  step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
+  step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;
 
-    // for odd input
-    temp1 = input[3] * C12;
-    temp2 = input[13] * C4;
-    temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp1 *= C8;
-    intermediate[ 8] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  // for odd input
+  temp1 = input[3] * C12;
+  temp2 = input[13] * C4;
+  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 *= C8;
+  intermediate[ 8] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = input[3] * C4;
-    temp2 = input[13] * C12;
-    temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp2 *= C8;
-    intermediate[ 9] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = input[3] * C4;
+  temp2 = input[13] * C12;
+  temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp2 *= C8;
+  intermediate[ 9] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    intermediate[11] = input[15] - input[1];
-    intermediate[12] = input[15] + input[1];
-    intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  intermediate[11] = input[15] - input[1];
+  intermediate[12] = input[15] + input[1];
+  intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = input[11] * C12;
-    temp2 = input[5] * C4;
-    temp2 = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp2 *= C8;
-    intermediate[14] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = input[11] * C12;
+  temp2 = input[5] * C4;
+  temp2 = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp2 *= C8;
+  intermediate[14] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = input[11] * C4;
-    temp2 = input[5] * C12;
-    temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
-    temp1 *= C8;
-    intermediate[15] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = input[11] * C4;
+  temp2 = input[5] * C12;
+  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 *= C8;
+  intermediate[15] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
-    step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
-    step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
-    step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
-    step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
-    step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
-    step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
-    step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING)
-        >> INITIAL_SHIFT;
+  step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
+  step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
+  step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
+  step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
+  step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
+  step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
+  step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
+  step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING)
+      >> INITIAL_SHIFT;
 
-    // step 3
-    output[0] = step[ 0] + step[ 3];
-    output[1] = step[ 1] + step[ 2];
-    output[2] = step[ 1] - step[ 2];
-    output[3] = step[ 0] - step[ 3];
+  // step 3
+  output[0] = step[ 0] + step[ 3];
+  output[1] = step[ 1] + step[ 2];
+  output[2] = step[ 1] - step[ 2];
+  output[3] = step[ 0] - step[ 3];
 
-    temp1 = step[ 4] * C14;
-    temp2 = step[ 7] * C2;
-    output[4] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = step[ 4] * C14;
+  temp2 = step[ 7] * C2;
+  output[4] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = step[ 4] * C2;
-    temp2 = step[ 7] * C14;
-    output[7] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = step[ 4] * C2;
+  temp2 = step[ 7] * C14;
+  output[7] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = step[ 5] * C10;
-    temp2 = step[ 6] * C6;
-    output[5] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = step[ 5] * C10;
+  temp2 = step[ 6] * C6;
+  output[5] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = step[ 5] * C6;
-    temp2 = step[ 6] * C10;
-    output[6] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = step[ 5] * C6;
+  temp2 = step[ 6] * C10;
+  output[6] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    output[8] = step[ 8] + step[11];
-    output[9] = step[ 9] + step[10];
-    output[10] = step[ 9] - step[10];
-    output[11] = step[ 8] - step[11];
-    output[12] = step[12] + step[15];
-    output[13] = step[13] + step[14];
-    output[14] = step[13] - step[14];
-    output[15] = step[12] - step[15];
+  output[8] = step[ 8] + step[11];
+  output[9] = step[ 9] + step[10];
+  output[10] = step[ 9] - step[10];
+  output[11] = step[ 8] - step[11];
+  output[12] = step[12] + step[15];
+  output[13] = step[13] + step[14];
+  output[14] = step[13] - step[14];
+  output[15] = step[12] - step[15];
 
-    // output 4
-    step[ 0] = output[0] + output[7];
-    step[ 1] = output[1] + output[6];
-    step[ 2] = output[2] + output[5];
-    step[ 3] = output[3] + output[4];
-    step[ 4] = output[3] - output[4];
-    step[ 5] = output[2] - output[5];
-    step[ 6] = output[1] - output[6];
-    step[ 7] = output[0] - output[7];
+  // output 4
+  step[ 0] = output[0] + output[7];
+  step[ 1] = output[1] + output[6];
+  step[ 2] = output[2] + output[5];
+  step[ 3] = output[3] + output[4];
+  step[ 4] = output[3] - output[4];
+  step[ 5] = output[2] - output[5];
+  step[ 6] = output[1] - output[6];
+  step[ 7] = output[0] - output[7];
 
-    temp1 = output[8] * C7;
-    temp2 = output[15] * C9;
-    step[ 8] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[8] * C7;
+  temp2 = output[15] * C9;
+  step[ 8] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = output[9] * C11;
-    temp2 = output[14] * C5;
-    step[ 9] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[9] * C11;
+  temp2 = output[14] * C5;
+  step[ 9] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = output[10] * C3;
-    temp2 = output[13] * C13;
-    step[10] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[10] * C3;
+  temp2 = output[13] * C13;
+  step[10] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = output[11] * C15;
-    temp2 = output[12] * C1;
-    step[11] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[11] * C15;
+  temp2 = output[12] * C1;
+  step[11] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = output[11] * C1;
-    temp2 = output[12] * C15;
-    step[12] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[11] * C1;
+  temp2 = output[12] * C15;
+  step[12] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = output[10] * C13;
-    temp2 = output[13] * C3;
-    step[13] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[10] * C13;
+  temp2 = output[13] * C3;
+  step[13] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = output[9] * C5;
-    temp2 = output[14] * C11;
-    step[14] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[9] * C5;
+  temp2 = output[14] * C11;
+  step[14] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    temp1 = output[8] * C9;
-    temp2 = output[15] * C7;
-    step[15] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
+  temp1 = output[8] * C9;
+  temp2 = output[15] * C7;
+  step[15] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;
 
-    // step 5
-    output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
-    output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
-    output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
-    output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
-    output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
-    output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
-    output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
-    output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
+  // step 5
+  output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;
+  output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;
+  output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;
+  output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;
+  output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;
+  output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;
+  output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;
+  output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;
 
-    output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
-    output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
-    output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
-    output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
-    output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
-    output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
-    output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
-    output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
+  output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;
+  output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;
+  output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;
+  output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;
+  output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;
+  output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;
+  output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;
+  output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;
 }
 
 void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
-    int16_t out[16 * 16];
-    int16_t *outptr = &out[0];
-    const int short_pitch = pitch >> 1;
-    int i, j;
-    int16_t temp_in[16], temp_out[16];
+  int16_t out[16 * 16];
+  int16_t *outptr = &out[0];
+  const int short_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
 
-    // First transform rows
-    for (i = 0; i < 16; ++i) {
-      butterfly_16x16_idct_1d(input, outptr, 0);
-      input += short_pitch;
-      outptr += 16;
-    }
+  // First transform rows
+  for (i = 0; i < 16; ++i) {
+    butterfly_16x16_idct_1d(input, outptr, 0);
+    input += short_pitch;
+    outptr += 16;
+  }
 
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j * 16 + i];
-      butterfly_16x16_idct_1d(temp_in, temp_out, 3);
-      for (j = 0; j < 16; ++j)
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    butterfly_16x16_idct_1d(temp_in, temp_out, 3);
+    for (j = 0; j < 16; ++j)
         output[j * 16 + i] = temp_out[j];
     }
 }
@@ -1548,3 +1532,1139 @@
 #undef RIGHT_SHIFT
 #undef RIGHT_ROUNDING
 #endif
+
+#if !CONFIG_DWTDCTHYBRID
+#define DownshiftMultiplyBy2(x) x * 2
+#define DownshiftMultiply(x) x
+
+static void idct16(double *input, double *output, int stride) {
+  static const double C1 = 0.995184726672197;
+  static const double C2 = 0.98078528040323;
+  static const double C3 = 0.956940335732209;
+  static const double C4 = 0.923879532511287;
+  static const double C5 = 0.881921264348355;
+  static const double C6 = 0.831469612302545;
+  static const double C7 = 0.773010453362737;
+  static const double C8 = 0.707106781186548;
+  static const double C9 = 0.634393284163646;
+  static const double C10 = 0.555570233019602;
+  static const double C11 = 0.471396736825998;
+  static const double C12 = 0.38268343236509;
+  static const double C13 = 0.290284677254462;
+  static const double C14 = 0.195090322016128;
+  static const double C15 = 0.098017140329561;
+
+  double step[16];
+  double intermediate[16];
+  double temp1, temp2;
+
+  // step 1 and 2
+  step[ 0] = input[stride*0] + input[stride*8];
+  step[ 1] = input[stride*0] - input[stride*8];
+
+  temp1 = input[stride*4]*C12;
+  temp2 = input[stride*12]*C4;
+
+  temp1 -= temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+
+  step[ 2] = DownshiftMultiplyBy2(temp1);
+
+  temp1 = input[stride*4]*C4;
+  temp2 = input[stride*12]*C12;
+  temp1 += temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+  step[ 3] = DownshiftMultiplyBy2(temp1);
+
+  temp1 = input[stride*2]*C8;
+  temp1 = DownshiftMultiplyBy2(temp1);
+  temp2 = input[stride*6] + input[stride*10];
+
+  step[ 4] = temp1 + temp2;
+  step[ 5] = temp1 - temp2;
+
+  temp1 = input[stride*14]*C8;
+  temp1 = DownshiftMultiplyBy2(temp1);
+  temp2 = input[stride*6] - input[stride*10];
+
+  step[ 6] = temp2 - temp1;
+  step[ 7] = temp2 + temp1;
+
+  // for odd input
+  temp1 = input[stride*3]*C12;
+  temp2 = input[stride*13]*C4;
+  temp1 += temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+  intermediate[ 8] = DownshiftMultiplyBy2(temp1);
+
+  temp1 = input[stride*3]*C4;
+  temp2 = input[stride*13]*C12;
+  temp2 -= temp1;
+  temp2 = DownshiftMultiply(temp2);
+  temp2 *= C8;
+  intermediate[ 9] = DownshiftMultiplyBy2(temp2);
+
+  intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8);
+  intermediate[11] = input[stride*15] - input[stride*1];
+  intermediate[12] = input[stride*15] + input[stride*1];
+  intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8));
+
+  temp1 = input[stride*11]*C12;
+  temp2 = input[stride*5]*C4;
+  temp2 -= temp1;
+  temp2 = DownshiftMultiply(temp2);
+  temp2 *= C8;
+  intermediate[14] = DownshiftMultiplyBy2(temp2);
+
+  temp1 = input[stride*11]*C4;
+  temp2 = input[stride*5]*C12;
+  temp1 += temp2;
+  temp1 = DownshiftMultiply(temp1);
+  temp1 *= C8;
+  intermediate[15] = DownshiftMultiplyBy2(temp1);
+
+  step[ 8] = intermediate[ 8] + intermediate[14];
+  step[ 9] = intermediate[ 9] + intermediate[15];
+  step[10] = intermediate[10] + intermediate[11];
+  step[11] = intermediate[10] - intermediate[11];
+  step[12] = intermediate[12] + intermediate[13];
+  step[13] = intermediate[12] - intermediate[13];
+  step[14] = intermediate[ 8] - intermediate[14];
+  step[15] = intermediate[ 9] - intermediate[15];
+
+  // step 3
+  output[stride*0] = step[ 0] + step[ 3];
+  output[stride*1] = step[ 1] + step[ 2];
+  output[stride*2] = step[ 1] - step[ 2];
+  output[stride*3] = step[ 0] - step[ 3];
+
+  temp1 = step[ 4]*C14;
+  temp2 = step[ 7]*C2;
+  temp1 -= temp2;
+  output[stride*4] =  DownshiftMultiply(temp1);
+
+  temp1 = step[ 4]*C2;
+  temp2 = step[ 7]*C14;
+  temp1 += temp2;
+  output[stride*7] =  DownshiftMultiply(temp1);
+
+  temp1 = step[ 5]*C10;
+  temp2 = step[ 6]*C6;
+  temp1 -= temp2;
+  output[stride*5] =  DownshiftMultiply(temp1);
+
+  temp1 = step[ 5]*C6;
+  temp2 = step[ 6]*C10;
+  temp1 += temp2;
+  output[stride*6] =  DownshiftMultiply(temp1);
+
+  output[stride*8] = step[ 8] + step[11];
+  output[stride*9] = step[ 9] + step[10];
+  output[stride*10] = step[ 9] - step[10];
+  output[stride*11] = step[ 8] - step[11];
+  output[stride*12] = step[12] + step[15];
+  output[stride*13] = step[13] + step[14];
+  output[stride*14] = step[13] - step[14];
+  output[stride*15] = step[12] - step[15];
+
+  // output 4
+  step[ 0] = output[stride*0] + output[stride*7];
+  step[ 1] = output[stride*1] + output[stride*6];
+  step[ 2] = output[stride*2] + output[stride*5];
+  step[ 3] = output[stride*3] + output[stride*4];
+  step[ 4] = output[stride*3] - output[stride*4];
+  step[ 5] = output[stride*2] - output[stride*5];
+  step[ 6] = output[stride*1] - output[stride*6];
+  step[ 7] = output[stride*0] - output[stride*7];
+
+  temp1 = output[stride*8]*C7;
+  temp2 = output[stride*15]*C9;
+  temp1 -= temp2;
+  step[ 8] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*9]*C11;
+  temp2 = output[stride*14]*C5;
+  temp1 += temp2;
+  step[ 9] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*10]*C3;
+  temp2 = output[stride*13]*C13;
+  temp1 -= temp2;
+  step[10] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*11]*C15;
+  temp2 = output[stride*12]*C1;
+  temp1 += temp2;
+  step[11] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*11]*C1;
+  temp2 = output[stride*12]*C15;
+  temp2 -= temp1;
+  step[12] = DownshiftMultiply(temp2);
+
+  temp1 = output[stride*10]*C13;
+  temp2 = output[stride*13]*C3;
+  temp1 += temp2;
+  step[13] = DownshiftMultiply(temp1);
+
+  temp1 = output[stride*9]*C5;
+  temp2 = output[stride*14]*C11;
+  temp2 -= temp1;
+  step[14] = DownshiftMultiply(temp2);
+
+  temp1 = output[stride*8]*C9;
+  temp2 = output[stride*15]*C7;
+  temp1 += temp2;
+  step[15] = DownshiftMultiply(temp1);
+
+  // step 5
+  output[stride*0] = step[0] + step[15];
+  output[stride*1] = step[1] + step[14];
+  output[stride*2] = step[2] + step[13];
+  output[stride*3] = step[3] + step[12];
+  output[stride*4] = step[4] + step[11];
+  output[stride*5] = step[5] + step[10];
+  output[stride*6] = step[6] + step[ 9];
+  output[stride*7] = step[7] + step[ 8];
+
+  output[stride*15] = step[0] - step[15];
+  output[stride*14] = step[1] - step[14];
+  output[stride*13] = step[2] - step[13];
+  output[stride*12] = step[3] - step[12];
+  output[stride*11] = step[4] - step[11];
+  output[stride*10] = step[5] - step[10];
+  output[stride*9] = step[6] - step[ 9];
+  output[stride*8] = step[7] - step[ 8];
+}
+
+static void butterfly_32_idct_1d(double *input, double *output, int stride) {
+  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
+  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
+  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)
+  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)
+  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)
+  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
+  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
+  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
+  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
+  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
+  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
+  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
+  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
+  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
+  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
+  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
+  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)
+
+  double step1[32];
+  double step2[32];
+
+  step1[ 0] = input[stride*0];
+  step1[ 1] = input[stride*2];
+  step1[ 2] = input[stride*4];
+  step1[ 3] = input[stride*6];
+  step1[ 4] = input[stride*8];
+  step1[ 5] = input[stride*10];
+  step1[ 6] = input[stride*12];
+  step1[ 7] = input[stride*14];
+  step1[ 8] = input[stride*16];
+  step1[ 9] = input[stride*18];
+  step1[10] = input[stride*20];
+  step1[11] = input[stride*22];
+  step1[12] = input[stride*24];
+  step1[13] = input[stride*26];
+  step1[14] = input[stride*28];
+  step1[15] = input[stride*30];
+
+  step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16);
+  step1[17] = (input[stride*3] + input[stride*1]);
+  step1[18] = (input[stride*5] + input[stride*3]);
+  step1[19] = (input[stride*7] + input[stride*5]);
+  step1[20] = (input[stride*9] + input[stride*7]);
+  step1[21] = (input[stride*11] + input[stride*9]);
+  step1[22] = (input[stride*13] + input[stride*11]);
+  step1[23] = (input[stride*15] + input[stride*13]);
+  step1[24] = (input[stride*17] + input[stride*15]);
+  step1[25] = (input[stride*19] + input[stride*17]);
+  step1[26] = (input[stride*21] + input[stride*19]);
+  step1[27] = (input[stride*23] + input[stride*21]);
+  step1[28] = (input[stride*25] + input[stride*23]);
+  step1[29] = (input[stride*27] + input[stride*25]);
+  step1[30] = (input[stride*29] + input[stride*27]);
+  step1[31] = (input[stride*31] + input[stride*29]);
+
+  idct16(step1, step2, 1);
+  idct16(step1 + 16, step2 + 16, 1);
+
+  step2[16] = DownshiftMultiply(step2[16] / (2*C1));
+  step2[17] = DownshiftMultiply(step2[17] / (2*C3));
+  step2[18] = DownshiftMultiply(step2[18] / (2*C5));
+  step2[19] = DownshiftMultiply(step2[19] / (2*C7));
+  step2[20] = DownshiftMultiply(step2[20] / (2*C9));
+  step2[21] = DownshiftMultiply(step2[21] / (2*C11));
+  step2[22] = DownshiftMultiply(step2[22] / (2*C13));
+  step2[23] = DownshiftMultiply(step2[23] / (2*C15));
+  step2[24] = DownshiftMultiply(step2[24] / (2*C17));
+  step2[25] = DownshiftMultiply(step2[25] / (2*C19));
+  step2[26] = DownshiftMultiply(step2[26] / (2*C21));
+  step2[27] = DownshiftMultiply(step2[27] / (2*C23));
+  step2[28] = DownshiftMultiply(step2[28] / (2*C25));
+  step2[29] = DownshiftMultiply(step2[29] / (2*C27));
+  step2[30] = DownshiftMultiply(step2[30] / (2*C29));
+  step2[31] = DownshiftMultiply(step2[31] / (2*C31));
+
+  output[stride* 0] = step2[ 0] + step2[16];
+  output[stride* 1] = step2[ 1] + step2[17];
+  output[stride* 2] = step2[ 2] + step2[18];
+  output[stride* 3] = step2[ 3] + step2[19];
+  output[stride* 4] = step2[ 4] + step2[20];
+  output[stride* 5] = step2[ 5] + step2[21];
+  output[stride* 6] = step2[ 6] + step2[22];
+  output[stride* 7] = step2[ 7] + step2[23];
+  output[stride* 8] = step2[ 8] + step2[24];
+  output[stride* 9] = step2[ 9] + step2[25];
+  output[stride*10] = step2[10] + step2[26];
+  output[stride*11] = step2[11] + step2[27];
+  output[stride*12] = step2[12] + step2[28];
+  output[stride*13] = step2[13] + step2[29];
+  output[stride*14] = step2[14] + step2[30];
+  output[stride*15] = step2[15] + step2[31];
+  output[stride*16] = step2[15] - step2[(31 - 0)];
+  output[stride*17] = step2[14] - step2[(31 - 1)];
+  output[stride*18] = step2[13] - step2[(31 - 2)];
+  output[stride*19] = step2[12] - step2[(31 - 3)];
+  output[stride*20] = step2[11] - step2[(31 - 4)];
+  output[stride*21] = step2[10] - step2[(31 - 5)];
+  output[stride*22] = step2[ 9] - step2[(31 - 6)];
+  output[stride*23] = step2[ 8] - step2[(31 - 7)];
+  output[stride*24] = step2[ 7] - step2[(31 - 8)];
+  output[stride*25] = step2[ 6] - step2[(31 - 9)];
+  output[stride*26] = step2[ 5] - step2[(31 - 10)];
+  output[stride*27] = step2[ 4] - step2[(31 - 11)];
+  output[stride*28] = step2[ 3] - step2[(31 - 12)];
+  output[stride*29] = step2[ 2] - step2[(31 - 13)];
+  output[stride*30] = step2[ 1] - step2[(31 - 14)];
+  output[stride*31] = step2[ 0] - step2[(31 - 15)];
+}
+
+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    double out[32*32], out2[32*32];
+    const int short_pitch = pitch >> 1;
+    int i, j;
+    // First transform rows
+    for (i = 0; i < 32; ++i) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = input[j + i*short_pitch];
+      butterfly_32_idct_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; ++j)
+        out[j + i*32] = temp_out[j];
+    }
+    // Then transform columns
+    for (i = 0; i < 32; ++i) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = out[j*32 + i];
+      butterfly_32_idct_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; ++j)
+        out2[j*32 + i] = temp_out[j];
+    }
+    for (i = 0; i < 32*32; ++i)
+      output[i] = round(out2[i]/128);
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+#else  // !CONFIG_DWTDCTHYBRID
+
+#if DWT_TYPE == 53
+
+// Note: block length must be even for this implementation
+static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass,
+                             int16_t *x) {
+  int16_t r, *a, *b;
+  int n;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ -= (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *x++ = ((r = *a++) + 1) >> 1;
+    *x++ = *b++ + ((r + (*a) + 2) >> 2);
+  }
+  *x++ = ((r = *a) + 1) >> 1;
+  *x++ = *b + ((r + 1) >> 1);
+}
+
+static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass,
+                             int16_t *x) {
+  int16_t r, *a, *b;
+  int n;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ -= (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    r = *a++;
+    *x++ = r;
+    *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);
+  }
+  *x++ = *a;
+  *x++ = ((*b) << 1) + *a;
+}
+
+static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,
+                                 int pitch_c, int16_t *x, int pitch_x) {
+  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
+  short buffer[2 * DWT_MAX_LENGTH];
+
+  th[0] = hh;
+  tw[0] = hw;
+  for (i = 1; i <= levels; i++) {
+    th[i] = (th[i - 1] + 1) >> 1;
+    tw[i] = (tw[i - 1] + 1) >> 1;
+  }
+  for (lv = levels - 1; lv >= 0; lv--) {
+    nh = th[lv];
+    nw = tw[lv];
+    hh = th[lv + 1];
+    hw = tw[lv + 1];
+    if ((nh < 2) || (nw < 2)) continue;
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i] = c[i * pitch_c + j];
+      synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i + nh];
+    }
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
+      synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
+    }
+  }
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
+          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
+          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
+    }
+  }
+}
+
+#elif DWT_TYPE == 26
+
+// Note: block length must be even for this implementation
+static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass,
+                             int16_t *x) {
+  int16_t r, s, *a, *b;
+  int i, n = length >> 1;
+
+  if (n >= 4) {
+    a = lowpass;
+    b = highpass;
+    r = *lowpass;
+    while (--n) {
+      *b++ += (r - a[1] + 4) >> 3;
+      r = *a++;
+    }
+    *b += (r - *a + 4) >> 3;
+  }
+  a = lowpass;
+  b = highpass;
+  for (i = length >> 1; i; i--) {
+    s = *b++;
+    r = *a++;
+    *x++ = (r + s + 1) >> 1;
+    *x++ = (r - s + 1) >> 1;
+  }
+}
+
+static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass,
+                             int16_t *x) {
+  int16_t r, s, *a, *b;
+  int i, n = length >> 1;
+
+  if (n >= 4) {
+    a = lowpass;
+    b = highpass;
+    r = *lowpass;
+    while (--n) {
+      *b++ += (r - a[1] + 4) >> 3;
+      r = *a++;
+    }
+    *b += (r - *a + 4) >> 3;
+  }
+  a = lowpass;
+  b = highpass;
+  for (i = length >> 1; i; i--) {
+    s = *b++;
+    r = *a++;
+    *x++ = r + s;
+    *x++ = r - s;
+  }
+}
+
+static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,
+                                 int pitch_c, int16_t *x, int pitch_x) {
+  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
+  int16_t buffer[2 * DWT_MAX_LENGTH];
+
+  th[0] = hh;
+  tw[0] = hw;
+  for (i = 1; i <= levels; i++) {
+    th[i] = (th[i - 1] + 1) >> 1;
+    tw[i] = (tw[i - 1] + 1) >> 1;
+  }
+  for (lv = levels - 1; lv >= 0; lv--) {
+    nh = th[lv];
+    nw = tw[lv];
+    hh = th[lv + 1];
+    hw = tw[lv + 1];
+    if ((nh < 2) || (nw < 2)) continue;
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i] = c[i * pitch_c + j];
+      synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i + nh];
+    }
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));
+      synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);
+    }
+  }
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?
+          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :
+          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);
+    }
+  }
+}
+
+#elif DWT_TYPE == 97
+
+static void synthesis_97(int length, double *lowpass, double *highpass,
+                         double *x) {
+  static const double a_predict1 = -1.586134342;
+  static const double a_update1 = -0.05298011854;
+  static const double a_predict2 = 0.8829110762;
+  static const double a_update2 = 0.4435068522;
+  static const double s_low = 1.149604398;
+  static const double s_high = 1/1.149604398;
+  static const double inv_s_low = 1 / s_low;
+  static const double inv_s_high = 1 / s_high;
+  int i;
+  double y[DWT_MAX_LENGTH];
+  // Undo pack and scale
+  for (i = 0; i < length / 2; i++) {
+    y[i * 2] = lowpass[i] * inv_s_low;
+    y[i * 2 + 1] = highpass[i] * inv_s_high;
+  }
+  memcpy(x, y, sizeof(*y) * length);
+  // Undo update 2
+  for (i = 2; i < length; i += 2) {
+    x[i] -= a_update2 * (x[i-1] + x[i+1]);
+  }
+  x[0] -= 2 * a_update2 * x[1];
+  // Undo predict 2
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] -= 2 * a_predict2 * x[length - 2];
+  // Undo update 1
+  for (i = 2; i < length; i += 2) {
+    x[i] -= a_update1 * (x[i - 1] + x[i + 1]);
+  }
+  x[0] -= 2 * a_update1 * x[1];
+  // Undo predict 1
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] -= 2 * a_predict1 * x[length - 2];
+}
+
+static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,
+                                 int pitch_c, int16_t *x, int pitch_x) {
+  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;
+  double buffer[2 * DWT_MAX_LENGTH];
+  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
+
+  th[0] = hh;
+  tw[0] = hw;
+  for (i = 1; i <= levels; i++) {
+    th[i] = (th[i - 1] + 1) >> 1;
+    tw[i] = (tw[i - 1] + 1) >> 1;
+  }
+  for (lv = levels - 1; lv >= 0; lv--) {
+    nh = th[lv];
+    nw = tw[lv];
+    hh = th[lv + 1];
+    hw = tw[lv + 1];
+    if ((nh < 2) || (nw < 2)) continue;
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i] = c[i * pitch_c + j];
+      synthesis_97(nh, buffer, buffer + hh, buffer + nh);
+      for (i = 0; i < nh; i++)
+        y[i * DWT_MAX_LENGTH + j] = buffer[i + nh];
+    }
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
+      synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]);
+    }
+  }
+  for (i = 0; i < height; i++)
+    for (j = 0; j < width; j++)
+      x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] /
+                                 (1 << DWT_PRECISION_BITS));
+}
+
+#endif  // DWT_TYPE
+
+// TODO(debargha): Implement scaling differently so as not to have to use the
+// floating point 16x16 dct
+static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) {
+  static const double C1 = 0.995184726672197;
+  static const double C2 = 0.98078528040323;
+  static const double C3 = 0.956940335732209;
+  static const double C4 = 0.923879532511287;
+  static const double C5 = 0.881921264348355;
+  static const double C6 = 0.831469612302545;
+  static const double C7 = 0.773010453362737;
+  static const double C8 = 0.707106781186548;
+  static const double C9 = 0.634393284163646;
+  static const double C10 = 0.555570233019602;
+  static const double C11 = 0.471396736825998;
+  static const double C12 = 0.38268343236509;
+  static const double C13 = 0.290284677254462;
+  static const double C14 = 0.195090322016128;
+  static const double C15 = 0.098017140329561;
+
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+
+    // step 1 and 2
+    step[ 0] = input[0] + input[8];
+    step[ 1] = input[0] - input[8];
+
+    temp1 = input[4]*C12;
+    temp2 = input[12]*C4;
+
+    temp1 -= temp2;
+    temp1 *= C8;
+
+    step[ 2] = 2*(temp1);
+
+    temp1 = input[4]*C4;
+    temp2 = input[12]*C12;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    step[ 3] = 2*(temp1);
+
+    temp1 = input[2]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] + input[10];
+
+    step[ 4] = temp1 + temp2;
+    step[ 5] = temp1 - temp2;
+
+    temp1 = input[14]*C8;
+    temp1 = 2*(temp1);
+    temp2 = input[6] - input[10];
+
+    step[ 6] = temp2 - temp1;
+    step[ 7] = temp2 + temp1;
+
+    // for odd input
+    temp1 = input[3]*C12;
+    temp2 = input[13]*C4;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    intermediate[ 8] = 2*(temp1);
+
+    temp1 = input[3]*C4;
+    temp2 = input[13]*C12;
+    temp2 -= temp1;
+    temp2 = (temp2);
+    temp2 *= C8;
+    intermediate[ 9] = 2*(temp2);
+
+    intermediate[10] = 2*(input[9]*C8);
+    intermediate[11] = input[15] - input[1];
+    intermediate[12] = input[15] + input[1];
+    intermediate[13] = 2*((input[7]*C8));
+
+    temp1 = input[11]*C12;
+    temp2 = input[5]*C4;
+    temp2 -= temp1;
+    temp2 = (temp2);
+    temp2 *= C8;
+    intermediate[14] = 2*(temp2);
+
+    temp1 = input[11]*C4;
+    temp2 = input[5]*C12;
+    temp1 += temp2;
+    temp1 = (temp1);
+    temp1 *= C8;
+    intermediate[15] = 2*(temp1);
+
+    step[ 8] = intermediate[ 8] + intermediate[14];
+    step[ 9] = intermediate[ 9] + intermediate[15];
+    step[10] = intermediate[10] + intermediate[11];
+    step[11] = intermediate[10] - intermediate[11];
+    step[12] = intermediate[12] + intermediate[13];
+    step[13] = intermediate[12] - intermediate[13];
+    step[14] = intermediate[ 8] - intermediate[14];
+    step[15] = intermediate[ 9] - intermediate[15];
+
+    // step 3
+    output[0] = step[ 0] + step[ 3];
+    output[1] = step[ 1] + step[ 2];
+    output[2] = step[ 1] - step[ 2];
+    output[3] = step[ 0] - step[ 3];
+
+    temp1 = step[ 4]*C14;
+    temp2 = step[ 7]*C2;
+    temp1 -= temp2;
+    output[4] =  (temp1);
+
+    temp1 = step[ 4]*C2;
+    temp2 = step[ 7]*C14;
+    temp1 += temp2;
+    output[7] =  (temp1);
+
+    temp1 = step[ 5]*C10;
+    temp2 = step[ 6]*C6;
+    temp1 -= temp2;
+    output[5] =  (temp1);
+
+    temp1 = step[ 5]*C6;
+    temp2 = step[ 6]*C10;
+    temp1 += temp2;
+    output[6] =  (temp1);
+
+    output[8] = step[ 8] + step[11];
+    output[9] = step[ 9] + step[10];
+    output[10] = step[ 9] - step[10];
+    output[11] = step[ 8] - step[11];
+    output[12] = step[12] + step[15];
+    output[13] = step[13] + step[14];
+    output[14] = step[13] - step[14];
+    output[15] = step[12] - step[15];
+
+    // output 4
+    step[ 0] = output[0] + output[7];
+    step[ 1] = output[1] + output[6];
+    step[ 2] = output[2] + output[5];
+    step[ 3] = output[3] + output[4];
+    step[ 4] = output[3] - output[4];
+    step[ 5] = output[2] - output[5];
+    step[ 6] = output[1] - output[6];
+    step[ 7] = output[0] - output[7];
+
+    temp1 = output[8]*C7;
+    temp2 = output[15]*C9;
+    temp1 -= temp2;
+    step[ 8] = (temp1);
+
+    temp1 = output[9]*C11;
+    temp2 = output[14]*C5;
+    temp1 += temp2;
+    step[ 9] = (temp1);
+
+    temp1 = output[10]*C3;
+    temp2 = output[13]*C13;
+    temp1 -= temp2;
+    step[10] = (temp1);
+
+    temp1 = output[11]*C15;
+    temp2 = output[12]*C1;
+    temp1 += temp2;
+    step[11] = (temp1);
+
+    temp1 = output[11]*C1;
+    temp2 = output[12]*C15;
+    temp2 -= temp1;
+    step[12] = (temp2);
+
+    temp1 = output[10]*C13;
+    temp2 = output[13]*C3;
+    temp1 += temp2;
+    step[13] = (temp1);
+
+    temp1 = output[9]*C5;
+    temp2 = output[14]*C11;
+    temp2 -= temp1;
+    step[14] = (temp2);
+
+    temp1 = output[8]*C9;
+    temp2 = output[15]*C7;
+    temp1 += temp2;
+    step[15] = (temp1);
+
+    // step 5
+    output[0] = (step[0] + step[15]);
+    output[1] = (step[1] + step[14]);
+    output[2] = (step[2] + step[13]);
+    output[3] = (step[3] + step[12]);
+    output[4] = (step[4] + step[11]);
+    output[5] = (step[5] + step[10]);
+    output[6] = (step[6] + step[ 9]);
+    output[7] = (step[7] + step[ 8]);
+
+    output[15] = (step[0] - step[15]);
+    output[14] = (step[1] - step[14]);
+    output[13] = (step[2] - step[13]);
+    output[12] = (step[3] - step[12]);
+    output[11] = (step[4] - step[11]);
+    output[10] = (step[5] - step[10]);
+    output[9] = (step[6] - step[ 9]);
+    output[8] = (step[7] - step[ 8]);
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,
+                                    int scale) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    double out[16*16], out2[16*16];
+    const int short_pitch = pitch >> 1;
+    int i, j;
+      // First transform rows
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = input[j + i*short_pitch];
+      butterfly_16x16_idct_1d_f(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out[j + i*16] = temp_out[j];
+    }
+    // Then transform columns
+    for (i = 0; i < 16; ++i) {
+      double temp_in[16], temp_out[16];
+      for (j = 0; j < 16; ++j)
+        temp_in[j] = out[j*16 + i];
+      butterfly_16x16_idct_1d_f(temp_in, temp_out);
+      for (j = 0; j < 16; ++j)
+        out2[j*16 + i] = temp_out[j];
+    }
+    for (i = 0; i < 16*16; ++i)
+      output[i] = round(out2[i] / (128 >> scale));
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+static void idct8_1d(double *x) {
+  int i, j;
+  double t[8];
+  static const double idctmat[64] = {
+    0.35355339059327,  0.49039264020162,  0.46193976625564,  0.41573480615127,
+    0.35355339059327,   0.2777851165098,  0.19134171618254, 0.097545161008064,
+    0.35355339059327,  0.41573480615127,  0.19134171618254, -0.097545161008064,
+    -0.35355339059327, -0.49039264020161, -0.46193976625564,  -0.2777851165098,
+    0.35355339059327,   0.2777851165098, -0.19134171618254, -0.49039264020162,
+    -0.35355339059327, 0.097545161008064,  0.46193976625564,  0.41573480615127,
+    0.35355339059327, 0.097545161008063, -0.46193976625564,  -0.2777851165098,
+    0.35355339059327,  0.41573480615127, -0.19134171618254, -0.49039264020162,
+    0.35355339059327, -0.097545161008063, -0.46193976625564,   0.2777851165098,
+    0.35355339059327, -0.41573480615127, -0.19134171618255,  0.49039264020162,
+    0.35355339059327,  -0.2777851165098, -0.19134171618254,  0.49039264020161,
+    -0.35355339059327, -0.097545161008064,  0.46193976625564, -0.41573480615127,
+    0.35355339059327, -0.41573480615127,  0.19134171618254, 0.097545161008065,
+    -0.35355339059327,  0.49039264020162, -0.46193976625564,   0.2777851165098,
+    0.35355339059327, -0.49039264020162,  0.46193976625564, -0.41573480615127,
+    0.35355339059327,  -0.2777851165098,  0.19134171618255, -0.097545161008064
+  };
+  for (i = 0; i < 8; ++i) {
+    t[i] = 0;
+    for (j = 0; j < 8; ++j)
+      t[i] += idctmat[i * 8 + j] * x[j];
+  }
+  for (i = 0; i < 8; ++i) {
+    x[i] = t[i];
+  }
+}
+
+static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,
+                                  int scale) {
+  double X[8 * 8], Y[8];
+  int i, j;
+  int shortpitch = pitch >> 1;
+
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    for (i = 0; i < 8; i++) {
+      for (j = 0; j < 8; j++) {
+        X[i * 8 + j] = (double)coefs[i * shortpitch + j];
+      }
+    }
+    for (i = 0; i < 8; i++)
+      idct8_1d(X + 8 * i);
+    for (i = 0; i < 8; i++) {
+      for (j = 0; j < 8; ++j)
+        Y[j] = X[i + 8 * j];
+      idct8_1d(Y);
+      for (j = 0; j < 8; ++j)
+        X[i + 8 * j] = Y[j];
+    }
+    for (i = 0; i < 8; i++) {
+      for (j = 0; j < 8; j++) {
+        block[i * 8 + j] = (int16_t)round(X[i * 8 + j] / (8 >> scale));
+      }
+    }
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+#define multiply_bits(d, n) ((n) < 0 ? (d) >> (-(n)) : (d) << (n))
+
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+
+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+  // assume output is a 32x32 buffer
+  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
+  int16_t buffer[16 * 16];
+  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
+  int16_t buffer2[32 * 32];
+  // Note: pitch is in bytes, short_pitch is in short units
+  const int short_pitch = pitch >> 1;
+  int i, j;
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct16x16 function
+  vp9_short_idct16x16_c_f(input, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+  for (i = 16; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
+#endif
+}
+
+#elif DWTDCT_TYPE == DWTDCT16X16
+
+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+  // assume output is a 32x32 buffer
+  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
+  int16_t buffer[16 * 16];
+  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
+  int16_t buffer2[32 * 32];
+  // Note: pitch is in bytes, short_pitch is in short units
+  const int short_pitch = pitch >> 1;
+  int i, j;
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct16x16 function
+  vp9_short_idct16x16_c_f(input, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16,
+               sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
+                          1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16,
+               sizeof(*buffer2) * 16);
+  }
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);
+#endif
+}
+
+#elif DWTDCT_TYPE == DWTDCT8X8
+
+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+  // assume output is a 32x32 buffer
+  // Temporary buffer to hold an 8x8 block for 8x8 inverse dct
+  int16_t buffer[8 * 8];
+  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt
+  int16_t buffer2[32 * 32];
+  // Note: pitch is in bytes, short_pitch is in short units
+  const int short_pitch = pitch >> 1;
+  int i, j;
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct8x8 function
+  vp9_short_idct8x8_c_f(input, buffer, pitch,
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8);
+  }
+  vp9_short_idct8x8_c_f(input + 8, buffer, pitch,
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8);
+  }
+  vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch,
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8,
+               sizeof(*buffer2) * 8);
+  }
+  vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch,
+                        1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i) {
+    vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8,
+               sizeof(*buffer2) * 8);
+  }
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+  for (i = 16; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      buffer2[i * 32 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32);
+#endif
+}
+
+#endif
+
+#if CONFIG_TX64X64
+void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {
+  // assume output is a 64x64 buffer
+  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct
+  int16_t buffer[16 * 16];
+  // Temporary buffer to hold a 64x64 block for inverse 64x64 dwt
+  int16_t buffer2[64 * 64];
+  // Note: pitch is in bytes, short_pitch is in short units
+  const int short_pitch = pitch >> 1;
+  int i, j;
+
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the idct16x16 function
+  vp9_short_idct16x16_c_f(input, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 16; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+#elif DWTDCT_TYPE == DWTDCT16X16
+  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16,
+               sizeof(*buffer2) * 16);
+  }
+  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,
+                          2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i) {
+    vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16,
+               sizeof(*buffer2) * 16);
+  }
+
+  // Copying and scaling highest bands into buffer2
+  for (i = 0; i < 32; ++i) {
+    for (j = 32; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 32; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      buffer2[i * 64 + j] =
+          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+#endif  // DWTDCT_TYPE
+
+#if DWT_TYPE == 26
+  dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);
+#elif DWT_TYPE == 97
+  dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64);
+#elif DWT_TYPE == 53
+  dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);
+#endif
+}
+#endif  // CONFIG_TX64X64
+#endif  // !CONFIG_DWTDCTHYBRID
--- a/vp9/common/vp9_implicit_segmentation.c
+++ b/vp9/common/vp9_implicit_segmentation.c
@@ -33,8 +33,8 @@
   int min_y;
   int max_x;
   int max_y;
-  long long sum_x;
-  long long sum_y;
+  int64_t sum_x;
+  int64_t sum_y;
   int pixels;
   int seg_value;
   int label;
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -72,7 +72,7 @@
   vp9_inverse_transform_mbuv_4x4(xd);
 }
 
-void vp9_inverse_transform_b_8x8(short *input_dqcoeff, short *output_coeff,
+void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff, int16_t *output_coeff,
                                  int pitch) {
   vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch);
 }
@@ -125,8 +125,8 @@
   vp9_inverse_transform_mbuv_8x8(xd);
 }
 
-void vp9_inverse_transform_b_16x16(short *input_dqcoeff,
-                                   short *output_coeff, int pitch) {
+void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
+                                   int16_t *output_coeff, int pitch) {
   vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch);
 }
 
@@ -144,4 +144,15 @@
 void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) {
   vp9_inverse_transform_mby_16x16(xd);
   vp9_inverse_transform_mbuv_8x8(xd);
+}
+
+void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) {
+  vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64);
+}
+
+void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) {
+  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024,
+                                xd_sb->diff + 1024, 32);
+  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280,
+                                xd_sb->diff + 1280, 32);
 }
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -12,6 +12,7 @@
 #define VP9_COMMON_VP9_INVTRANS_H_
 
 #include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
 extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
@@ -22,8 +23,8 @@
 
 extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_b_8x8(short *input_dqcoeff,
-                                        short *output_coeff, int pitch);
+extern void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,
+                                        int16_t *output_coeff, int pitch);
 
 extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
 
@@ -31,11 +32,14 @@
 
 extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_b_16x16(short *input_dqcoeff,
-                                          short *output_coeff, int pitch);
+extern void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,
+                                          int16_t *output_coeff, int pitch);
 
 extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
 
 extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
 
-#endif  // __INC_INVTRANS_H
+extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);
+extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);
+
+#endif  // VP9_COMMON_VP9_INVTRANS_H_
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -176,46 +176,70 @@
   }
 }
 
-void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) {
+// Determine if we should skip inner-MB loop filtering within a MB
+// The current condition is that the loop filtering is skipped only
+// the MB uses a prediction size of 16x16 and either 16x16 transform
+// is used or there is no residue at all.
+static int mb_lf_skip(const MB_MODE_INFO *const mbmi) {
+  const MB_PREDICTION_MODE mode = mbmi->mode;
+  const int skip_coef = mbmi->mb_skip_coeff;
+  const int tx_size = mbmi->txfm_size;
+  return mode != B_PRED && mode != I8X8_PRED && mode != SPLITMV &&
+         (tx_size >= TX_16X16 || skip_coef);
+}
+
+// Determine if we should skip MB loop filtering on a MB edge within
+// a superblock, the current condition is that MB loop filtering is
+// skipped only when both MBs do not use inner MB loop filtering, and
+// same motion vector with same reference frame
+static int sb_mb_lf_skip(const MODE_INFO *const mip0,
+                         const MODE_INFO *const mip1) {
+  const MB_MODE_INFO *mbmi0 = &mip0->mbmi;
+  const MB_MODE_INFO *mbmi1 = &mip1->mbmi;
+  return mb_lf_skip(mbmi0) && mb_lf_skip(mbmi1) &&
+         (mbmi0->ref_frame == mbmi1->ref_frame) &&
+         (mbmi0->mv[mbmi0->ref_frame].as_int ==
+          mbmi1->mv[mbmi1->ref_frame].as_int) &&
+         mbmi0->ref_frame != INTRA_FRAME;
+}
+void vp9_loop_filter_frame(VP9_COMMON *cm,
+                           MACROBLOCKD *xd,
+                           int frame_filter_level,
+                           int y_only) {
   YV12_BUFFER_CONFIG *post = cm->frame_to_show;
   loop_filter_info_n *lfi_n = &cm->lf_info;
   struct loop_filter_info lfi;
+  const FRAME_TYPE frame_type = cm->frame_type;
+  int mb_row, mb_col;
+  uint8_t *y_ptr, *u_ptr, *v_ptr;
 
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  int mb_row;
-  int mb_col;
-
-  int filter_level;
-
-  unsigned char *y_ptr, *u_ptr, *v_ptr;
-
   /* Point at base of Mb MODE_INFO list */
   const MODE_INFO *mode_info_context = cm->mi;
+  const int mis = cm->mode_info_stride;
 
   /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, cm->filter_level);
-
+  vp9_loop_filter_frame_init(cm, xd, frame_filter_level);
   /* Set up the buffer pointers */
   y_ptr = post->y_buffer;
-  u_ptr = post->u_buffer;
-  v_ptr = post->v_buffer;
+  if (y_only) {
+    u_ptr = 0;
+    v_ptr = 0;
+  } else {
+    u_ptr = post->u_buffer;
+    v_ptr = post->v_buffer;
+  }
 
   /* vp9_filter each macro block */
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+      const MB_PREDICTION_MODE mode = mode_info_context->mbmi.mode;
+      const int mode_index = lfi_n->mode_lf_lut[mode];
       const int seg = mode_info_context->mbmi.segment_id;
       const int ref_frame = mode_info_context->mbmi.ref_frame;
-      int tx_type = mode_info_context->mbmi.txfm_size;
-      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
+      const int filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
       if (filter_level) {
+        const int skip_lf = mb_lf_skip(&mode_info_context->mbmi);
+        const int tx_size = mode_info_context->mbmi.txfm_size;
         if (cm->filter_type == NORMAL_LOOPFILTER) {
           const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
           lfi.mblim = lfi_n->mblim[filter_level];
@@ -223,198 +247,102 @@
           lfi.lim = lfi_n->lim[filter_level];
           lfi.hev_thr = lfi_n->hev_thr[hev_index];
 
-          if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                post->uv_stride, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                    post->uv_stride, &lfi);
+          if (mb_col > 0 &&
+              !((mb_col & 1) && mode_info_context->mbmi.sb_type &&
+                (sb_mb_lf_skip(mode_info_context - 1, mode_info_context) ||
+                 tx_size >= TX_32X32))
+              ) {
+            if (tx_size >= TX_16X16)
+              vp9_lpf_mbv_w(y_ptr, u_ptr, v_ptr, post->y_stride,
+                            post->uv_stride, &lfi);
             else
+              vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                  post->uv_stride, &lfi);
+          }
+          if (!skip_lf) {
+            if (tx_size >= TX_8X8) {
+              if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
+                vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                      post->uv_stride, &lfi);
+              else
+                vp9_loop_filter_bv8x8(y_ptr, NULL, NULL, post->y_stride,
+                                      post->uv_stride, &lfi);
+            } else {
               vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,
                                  post->uv_stride, &lfi);
+            }
 
           }
-
           /* don't apply across umv border */
-          if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
-              )
-            vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                post->uv_stride, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
-                                    post->uv_stride, &lfi);
+          if (mb_row > 0 &&
+              !((mb_row & 1) && mode_info_context->mbmi.sb_type &&
+                (sb_mb_lf_skip(mode_info_context - mis, mode_info_context) ||
+                tx_size >= TX_32X32))
+              ) {
+            if (tx_size >= TX_16X16)
+              vp9_lpf_mbh_w(y_ptr, u_ptr, v_ptr, post->y_stride,
+                            post->uv_stride, &lfi);
             else
+              vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                  post->uv_stride, &lfi);
+          }
+          if (!skip_lf) {
+            if (tx_size >= TX_8X8) {
+              if (tx_size == TX_8X8 && (mode == I8X8_PRED || mode == SPLITMV))
+                vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride,
+                                      post->uv_stride, &lfi);
+              else
+                vp9_loop_filter_bh8x8(y_ptr, NULL, NULL, post->y_stride,
+                                      post->uv_stride, &lfi);
+            } else {
               vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride,
                                  post->uv_stride, &lfi);
+            }
           }
         } else {
           // FIXME: Not 8x8 aware
-          if (mb_col > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-1].mbmi.mb_skip_coeff)
-#endif
-              )
+          if (mb_col > 0 &&
+              !(skip_lf && mb_lf_skip(&mode_info_context[-1].mbmi)) &&
+              !((mb_col & 1) && mode_info_context->mbmi.sb_type))
             vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
                                        lfi_n->mblim[filter_level]);
-
           if (!skip_lf)
             vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
                                       lfi_n->blim[filter_level]);
 
           /* don't apply across umv border */
-          if (mb_row > 0
-#if CONFIG_SUPERBLOCKS
-              && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb &&
-                   mode_info_context[0].mbmi.mb_skip_coeff &&
-                   mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff)
-#endif
-              )
+          if (mb_row > 0 &&
+              !(skip_lf && mb_lf_skip(&mode_info_context[-mis].mbmi)) &&
+              !((mb_row & 1) && mode_info_context->mbmi.sb_type))
             vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
                                        lfi_n->mblim[filter_level]);
-
           if (!skip_lf)
             vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
                                       lfi_n->blim[filter_level]);
         }
       }
-
       y_ptr += 16;
-      u_ptr += 8;
-      v_ptr += 8;
-
+      if (!y_only) {
+        u_ptr += 8;
+        v_ptr += 8;
+      }
       mode_info_context++;     /* step to next MB */
     }
-
     y_ptr += post->y_stride  * 16 - post->y_width;
-    u_ptr += post->uv_stride *  8 - post->uv_width;
-    v_ptr += post->uv_stride *  8 - post->uv_width;
-
+    if (!y_only) {
+      u_ptr += post->uv_stride *  8 - post->uv_width;
+      v_ptr += post->uv_stride *  8 - post->uv_width;
+    }
     mode_info_context++;         /* Skip border mb */
   }
 }
 
-void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                 int default_filt_lvl) {
-  YV12_BUFFER_CONFIG *post = cm->frame_to_show;
 
-  unsigned char *y_ptr;
-  int mb_row;
-  int mb_col;
-
-  loop_filter_info_n *lfi_n = &cm->lf_info;
-  struct loop_filter_info lfi;
-
-  int filter_level;
-  FRAME_TYPE frame_type = cm->frame_type;
-
-  /* Point at base of Mb MODE_INFO list */
-  const MODE_INFO *mode_info_context = cm->mi;
-
-#if 0
-  if (default_filt_lvl == 0) /* no filter applied */
-    return;
-#endif
-
-  /* Initialize the loop filter for this frame. */
-  vp9_loop_filter_frame_init(cm, xd, default_filt_lvl);
-
-  /* Set up the buffer pointers */
-  y_ptr = post->y_buffer;
-
-  /* vp9_filter each macro block */
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
-      int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
-                     mode_info_context->mbmi.mode != I8X8_PRED &&
-                     mode_info_context->mbmi.mode != SPLITMV &&
-                     mode_info_context->mbmi.mb_skip_coeff);
-
-      const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
-      const int seg = mode_info_context->mbmi.segment_id;
-      const int ref_frame = mode_info_context->mbmi.ref_frame;
-      int tx_type = mode_info_context->mbmi.txfm_size;
-      filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
-
-      if (filter_level) {
-        if (cm->filter_type == NORMAL_LOOPFILTER) {
-          const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
-          lfi.mblim = lfi_n->mblim[filter_level];
-          lfi.blim = lfi_n->blim[filter_level];
-          lfi.lim = lfi_n->lim[filter_level];
-          lfi.hev_thr = lfi_n->hev_thr[hev_index];
-
-          if (mb_col > 0)
-            vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-            else
-              vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-          }
-
-          /* don't apply across umv border */
-          if (mb_row > 0)
-            vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-
-          if (!skip_lf && tx_type != TX_16X16) {
-            if (tx_type == TX_8X8)
-              vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-            else
-              vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi);
-          }
-        } else {
-          // FIXME: Not 8x8 aware
-          if (mb_col > 0)
-            vp9_loop_filter_simple_mbv(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bv(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-
-          /* don't apply across umv border */
-          if (mb_row > 0)
-            vp9_loop_filter_simple_mbh(y_ptr, post->y_stride,
-                                       lfi_n->mblim[filter_level]);
-
-          if (!skip_lf)
-            vp9_loop_filter_simple_bh(y_ptr, post->y_stride,
-                                      lfi_n->blim[filter_level]);
-        }
-      }
-
-      y_ptr += 16;
-      mode_info_context++;        /* step to next MB */
-    }
-
-    y_ptr += post->y_stride  * 16 - post->y_width;
-    mode_info_context++;            /* Skip border mb */
-  }
-}
-
 void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    int default_filt_lvl) {
   YV12_BUFFER_CONFIG *post = cm->frame_to_show;
 
-  unsigned char *y_ptr;
+  uint8_t *y_ptr;
   int mb_row;
   int mb_col;
   int mb_cols = post->y_width  >> 4;
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -49,26 +49,26 @@
 };
 
 #define prototype_loopfilter(sym) \
-  void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+  void sym(uint8_t *src, int pitch, const unsigned char *blimit, \
            const unsigned char *limit, const unsigned char *thresh, int count)
 
 #define prototype_loopfilter_block(sym) \
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+  void sym(uint8_t *y, uint8_t *u, uint8_t *v, \
            int ystride, int uv_stride, struct loop_filter_info *lfi)
 
 #define prototype_simple_loopfilter(sym) \
-  void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+  void sym(uint8_t *y, int ystride, const unsigned char *blimit)
 
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/vp9_loopfilter_x86.h"
 #endif
 
-typedef void loop_filter_uvfunction(unsigned char *u,   /* source pointer */
+typedef void loop_filter_uvfunction(uint8_t *u,   /* source pointer */
                                     int p,              /* pitch */
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh,
-                                    unsigned char *v);
+                                    uint8_t *v);
 
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
@@ -80,17 +80,27 @@
                                 struct macroblockd *mbd,
                                 int default_filt_lvl);
 
-void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
+void vp9_loop_filter_frame(struct VP9Common *cm,
+                           struct macroblockd *mbd,
+                           int filter_level,
+                           int y_only);
 
 void vp9_loop_filter_partial_frame(struct VP9Common *cm,
                                    struct macroblockd *mbd,
                                    int default_filt_lvl);
 
-void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
-                                 struct macroblockd *mbd,
-                                 int default_filt_lvl);
-
 void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
                                       int sharpness_lvl);
 
-#endif  // loopfilter_h
+void vp9_mb_lpf_horizontal_edge_w(unsigned char *s, int p,
+                                  const unsigned char *blimit,
+                                  const unsigned char *limit,
+                                  const unsigned char *thresh,
+                                  int count);
+
+void vp9_mb_lpf_vertical_edge_w(unsigned char *s, int p,
+                                const unsigned char *blimit,
+                                const unsigned char *limit,
+                                const unsigned char *thresh,
+                                int count);
+#endif  // VP9_COMMON_VP9_LOOPFILTER_H_
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -13,20 +13,20 @@
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-typedef unsigned char uc;
-
-static __inline signed char signed_char_clamp(int t) {
+static __inline int8_t signed_char_clamp(int t) {
   t = (t < -128 ? -128 : t);
   t = (t > 127 ? 127 : t);
-  return (signed char) t;
+  return (int8_t) t;
 }
 
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char filter_mask(uc limit, uc blimit,
-                                        uc p3, uc p2, uc p1, uc p0,
-                                        uc q0, uc q1, uc q2, uc q3) {
-  signed char mask = 0;
+static __inline int8_t filter_mask(uint8_t limit, uint8_t blimit,
+                                   uint8_t p3, uint8_t p2,
+                                   uint8_t p1, uint8_t p0,
+                                   uint8_t q0, uint8_t q1,
+                                   uint8_t q2, uint8_t q3) {
+  int8_t mask = 0;
   mask |= (abs(p3 - p2) > limit) * -1;
   mask |= (abs(p2 - p1) > limit) * -1;
   mask |= (abs(p1 - p0) > limit) * -1;
@@ -39,27 +39,26 @@
 }
 
 /* is there high variance internal edge ( 11111111 yes, 00000000 no) */
-static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) {
-  signed char hev = 0;
+static __inline int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,
+                               uint8_t q0, uint8_t q1) {
+  int8_t hev = 0;
   hev  |= (abs(p1 - p0) > thresh) * -1;
   hev  |= (abs(q1 - q0) > thresh) * -1;
   return hev;
 }
 
-static __inline void filter(signed char mask, uc hev, uc *op1,
-                            uc *op0, uc *oq0, uc *oq1)
+static __inline void filter(int8_t mask, uint8_t hev, uint8_t *op1,
+                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+  int8_t ps0, qs0;
+  int8_t ps1, qs1;
+  int8_t filter, Filter1, Filter2;
+  int8_t u;
 
-{
-  signed char ps0, qs0;
-  signed char ps1, qs1;
-  signed char filter, Filter1, Filter2;
-  signed char u;
+  ps1 = (int8_t) *op1 ^ 0x80;
+  ps0 = (int8_t) *op0 ^ 0x80;
+  qs0 = (int8_t) *oq0 ^ 0x80;
+  qs1 = (int8_t) *oq1 ^ 0x80;
 
-  ps1 = (signed char) * op1 ^ 0x80;
-  ps0 = (signed char) * op0 ^ 0x80;
-  qs0 = (signed char) * oq0 ^ 0x80;
-  qs1 = (signed char) * oq1 ^ 0x80;
-
   /* add outer taps if we have high edge variance */
   filter = signed_char_clamp(ps1 - qs1);
   filter &= hev;
@@ -91,20 +90,16 @@
   *oq1 = u ^ 0x80;
   u = signed_char_clamp(ps1 + filter);
   *op1 = u ^ 0x80;
-
 }
 
-void vp9_loop_filter_horizontal_edge_c
-(
-  unsigned char *s,
-  int p, /* pitch */
-  const unsigned char *blimit,
-  const unsigned char *limit,
-  const unsigned char *thresh,
-  int count
-) {
-  int  hev = 0; /* high edge variance */
-  signed char mask = 0;
+void vp9_loop_filter_horizontal_edge_c(uint8_t *s,
+                                       int p, /* pitch */
+                                       const unsigned char *blimit,
+                                       const unsigned char *limit,
+                                       const unsigned char *thresh,
+                                       int count) {
+  int hev = 0; /* high edge variance */
+  int8_t mask = 0;
   int i = 0;
 
   /* loop filter designed to work using chars so that we can make maximum use
@@ -123,7 +118,7 @@
   } while (++i < count * 8);
 }
 
-void vp9_loop_filter_vertical_edge_c(unsigned char *s,
+void vp9_loop_filter_vertical_edge_c(uint8_t *s,
                                      int p,
                                      const unsigned char *blimit,
                                      const unsigned char *limit,
@@ -130,7 +125,7 @@
                                      const unsigned char *thresh,
                                      int count) {
   int  hev = 0; /* high edge variance */
-  signed char mask = 0;
+  int8_t mask = 0;
   int i = 0;
 
   /* loop filter designed to work using chars so that we can make maximum use
@@ -148,32 +143,36 @@
     s += p;
   } while (++i < count * 8);
 }
-static __inline signed char flatmask(uc thresh,
-                                     uc p4, uc p3, uc p2, uc p1, uc p0,
-                                     uc q0, uc q1, uc q2, uc q3, uc q4) {
-  signed char flat = 0;
-  flat |= (abs(p1 - p0) > 1) * -1;
-  flat |= (abs(q1 - q0) > 1) * -1;
-  flat |= (abs(p0 - p2) > 1) * -1;
-  flat |= (abs(q0 - q2) > 1) * -1;
-  flat |= (abs(p3 - p0) > 1) * -1;
-  flat |= (abs(q3 - q0) > 1) * -1;
-  flat |= (abs(p4 - p0) > 1) * -1;
-  flat |= (abs(q4 - q0) > 1) * -1;
+static __inline signed char flatmask(uint8_t thresh,
+                                     uint8_t p4, uint8_t p3, uint8_t p2,
+                                     uint8_t p1, uint8_t p0,
+                                     uint8_t q0, uint8_t q1, uint8_t q2,
+                                     uint8_t q3, uint8_t q4) {
+  int8_t flat = 0;
+  flat |= (abs(p1 - p0) > thresh) * -1;
+  flat |= (abs(q1 - q0) > thresh) * -1;
+  flat |= (abs(p0 - p2) > thresh) * -1;
+  flat |= (abs(q0 - q2) > thresh) * -1;
+  flat |= (abs(p3 - p0) > thresh) * -1;
+  flat |= (abs(q3 - q0) > thresh) * -1;
+  flat |= (abs(p4 - p0) > thresh) * -1;
+  flat |= (abs(q4 - q0) > thresh) * -1;
   flat = ~flat;
   return flat;
 }
 
-static __inline void mbfilter(signed char mask, uc hev, uc flat,
-                              uc *op4, uc *op3, uc *op2, uc *op1, uc *op0,
-                              uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) {
+static __inline void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,
+                              uint8_t *op4, uint8_t *op3, uint8_t *op2,
+                              uint8_t *op1, uint8_t *op0,
+                              uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
+                              uint8_t *oq3, uint8_t *oq4) {
   /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
   if (flat && mask) {
-    unsigned char p0, q0;
-    unsigned char p1, q1;
-    unsigned char p2, q2;
-    unsigned char p3, q3;
-    unsigned char p4, q4;
+    uint8_t p0, q0;
+    uint8_t p1, q1;
+    uint8_t p2, q2;
+    uint8_t p3, q3;
+    uint8_t p4, q4;
 
     p4 = *op4;
     p3 = *op3;
@@ -193,15 +192,15 @@
     *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
     *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
   } else {
-    signed char ps0, qs0;
-    signed char ps1, qs1;
-    signed char filter, Filter1, Filter2;
-    signed char u;
+    int8_t ps0, qs0;
+    int8_t ps1, qs1;
+    int8_t filter, Filter1, Filter2;
+    int8_t u;
 
-    ps1 = (signed char) * op1 ^ 0x80;
-    ps0 = (signed char) * op0 ^ 0x80;
-    qs0 = (signed char) * oq0 ^ 0x80;
-    qs1 = (signed char) * oq1 ^ 0x80;
+    ps1 = (int8_t) *op1 ^ 0x80;
+    ps0 = (int8_t) *op0 ^ 0x80;
+    qs0 = (int8_t) *oq0 ^ 0x80;
+    qs1 = (int8_t) *oq1 ^ 0x80;
 
     /* add outer taps if we have high edge variance */
     filter = signed_char_clamp(ps1 - qs1);
@@ -233,18 +232,16 @@
     *op1 = u ^ 0x80;
   }
 }
-void vp9_mbloop_filter_horizontal_edge_c
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit,
-  const unsigned char *limit,
-  const unsigned char *thresh,
-  int count
-) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
+
+void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s,
+                                         int p,
+                                         const unsigned char *blimit,
+                                         const unsigned char *limit,
+                                         const unsigned char *thresh,
+                                         int count) {
+  int8_t hev = 0; /* high edge variance */
+  int8_t mask = 0;
+  int8_t flat = 0;
   int i = 0;
 
   /* loop filter designed to work using chars so that we can make maximum use
@@ -251,7 +248,6 @@
    * of 8 bit simd instructions.
    */
   do {
-
     mask = filter_mask(limit[0], blimit[0],
                        s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
                        s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
@@ -258,7 +254,7 @@
 
     hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
 
-    flat = flatmask(thresh[0],
+    flat = flatmask(1,
                     s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
                     s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
     mbfilter(mask, hev, flat,
@@ -269,28 +265,25 @@
   } while (++i < count * 8);
 
 }
-void vp9_mbloop_filter_vertical_edge_c
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit,
-  const unsigned char *limit,
-  const unsigned char *thresh,
-  int count
-) {
-  signed char hev = 0; /* high edge variance */
-  signed char mask = 0;
-  signed char flat = 0;
+
+void vp9_mbloop_filter_vertical_edge_c(uint8_t *s,
+                                       int p,
+                                       const unsigned char *blimit,
+                                       const unsigned char *limit,
+                                       const unsigned char *thresh,
+                                       int count) {
+  int8_t hev = 0; /* high edge variance */
+  int8_t mask = 0;
+  int8_t flat = 0;
   int i = 0;
 
   do {
-
     mask = filter_mask(limit[0], blimit[0],
                        s[-4], s[-3], s[-2], s[-1],
                        s[0], s[1], s[2], s[3]);
 
     hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
-    flat = flatmask(thresh[0],
+    flat = flatmask(1,
                     s[-5], s[-4], s[-3], s[-2], s[-1],
                     s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
     mbfilter(mask, hev, flat,
@@ -302,26 +295,26 @@
 }
 
 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */
-static __inline signed char simple_filter_mask(uc blimit,
-                                               uc p1, uc p0,
-                                               uc q0, uc q1) {
+static __inline int8_t simple_filter_mask(uint8_t blimit,
+                                          uint8_t p1, uint8_t p0,
+                                          uint8_t q0, uint8_t q1) {
   /* Why does this cause problems for win32?
    * error C2143: syntax error : missing ';' before 'type'
    *  (void) limit;
    */
-  signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
+  int8_t mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;
   return mask;
 }
 
-static __inline void simple_filter(signed char mask,
-                                   uc *op1, uc *op0,
-                                   uc *oq0, uc *oq1) {
-  signed char filter, Filter1, Filter2;
-  signed char p1 = (signed char) * op1 ^ 0x80;
-  signed char p0 = (signed char) * op0 ^ 0x80;
-  signed char q0 = (signed char) * oq0 ^ 0x80;
-  signed char q1 = (signed char) * oq1 ^ 0x80;
-  signed char u;
+static __inline void simple_filter(int8_t mask,
+                                   uint8_t *op1, uint8_t *op0,
+                                   uint8_t *oq0, uint8_t *oq1) {
+  int8_t filter, Filter1, Filter2;
+  int8_t p1 = (int8_t) *op1 ^ 0x80;
+  int8_t p0 = (int8_t) *op0 ^ 0x80;
+  int8_t q0 = (int8_t) *oq0 ^ 0x80;
+  int8_t q1 = (int8_t) *oq1 ^ 0x80;
+  int8_t u;
 
   filter = signed_char_clamp(p1 - q1);
   filter = signed_char_clamp(filter + 3 * (q0 - p0));
@@ -339,13 +332,10 @@
   *op0 = u ^ 0x80;
 }
 
-void vp9_loop_filter_simple_horizontal_edge_c
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit
-) {
-  signed char mask = 0;
+void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s,
+                                              int p,
+                                              const unsigned char *blimit) {
+  int8_t mask = 0;
   int i = 0;
 
   do {
@@ -359,13 +349,10 @@
   } while (++i < 16);
 }
 
-void vp9_loop_filter_simple_vertical_edge_c
-(
-  unsigned char *s,
-  int p,
-  const unsigned char *blimit
-) {
-  signed char mask = 0;
+void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s,
+                                            int p,
+                                            const unsigned char *blimit) {
+  int8_t mask = 0;
   int i = 0;
 
   do {
@@ -373,12 +360,11 @@
     simple_filter(mask, s - 2, s - 1, s, s + 1);
     s += p;
   } while (++i < 16);
-
 }
 
 /* Vertical MB Filtering */
-void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
+void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr,
+                           uint8_t *v_ptr, int y_stride, int uv_stride,
                            struct loop_filter_info *lfi) {
   vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride,
                                     lfi->mblim, lfi->lim, lfi->hev_thr, 2);
@@ -393,8 +379,8 @@
 }
 
 /* Vertical B Filtering */
-void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
+void vp9_loop_filter_bv_c(uint8_t*y_ptr, uint8_t *u_ptr,
+                          uint8_t *v_ptr, int y_stride, int uv_stride,
                           struct loop_filter_info *lfi) {
   vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride,
                                   lfi->blim, lfi->lim, lfi->hev_thr, 2);
@@ -413,8 +399,8 @@
 }
 
 /* Horizontal MB filtering */
-void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                           unsigned char *v_ptr, int y_stride, int uv_stride,
+void vp9_loop_filter_mbh_c(uint8_t *y_ptr, uint8_t *u_ptr,
+                           uint8_t *v_ptr, int y_stride, int uv_stride,
                            struct loop_filter_info *lfi) {
   vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride,
                                       lfi->mblim, lfi->lim, lfi->hev_thr, 2);
@@ -429,8 +415,8 @@
 }
 
 /* Horizontal B Filtering */
-void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                          unsigned char *v_ptr, int y_stride, int uv_stride,
+void vp9_loop_filter_bh_c(uint8_t *y_ptr, uint8_t *u_ptr,
+                          uint8_t *v_ptr, int y_stride, int uv_stride,
                           struct loop_filter_info *lfi) {
   vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride,
                                     lfi->blim, lfi->lim, lfi->hev_thr, 2);
@@ -448,14 +434,22 @@
                                       lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
+void vp9_loop_filter_bh8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
+                             uint8_t *v_ptr, int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
   vp9_mbloop_filter_horizontal_edge_c(
     y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
+void vp9_loop_filter_bhs_c(uint8_t *y_ptr, int y_stride,
                            const unsigned char *blimit) {
   vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride,
                                            y_stride, blimit);
@@ -465,16 +459,263 @@
                                            y_stride, blimit);
 }
 
-void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr,
-                             unsigned char *v_ptr, int y_stride, int uv_stride,
+void vp9_loop_filter_bv8x8_c(uint8_t *y_ptr, uint8_t *u_ptr,
+                             uint8_t *v_ptr, int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
   vp9_mbloop_filter_vertical_edge_c(
     y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 1);
 }
 
-void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
+void vp9_loop_filter_bvs_c(uint8_t *y_ptr, int y_stride,
                            const unsigned char *blimit) {
   vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
   vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
   vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
 }
+
+static __inline void wide_mbfilter(int8_t mask, uint8_t hev,
+                                   uint8_t flat, uint8_t flat2,
+                                   uint8_t *op7, uint8_t *op6, uint8_t *op5,
+                                   uint8_t *op4, uint8_t *op3, uint8_t *op2,
+                                   uint8_t *op1, uint8_t *op0, uint8_t *oq0,
+                                   uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
+                                   uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,
+                                   uint8_t *oq7) {
+  /* use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line */
+  if (flat2 && flat && mask) {
+    uint8_t p0, q0;
+    uint8_t p1, q1;
+    uint8_t p2, q2;
+    uint8_t p3, q3;
+    uint8_t p4, q4;
+    uint8_t p5, q5;
+    uint8_t p6, q6;
+    uint8_t p7, q7;
+
+    p7 = *op7;
+    p6 = *op6;
+    p5 = *op5;
+    p4 = *op4;
+    p3 = *op3;
+    p2 = *op2;
+    p1 = *op1;
+    p0 = *op0;
+    q0 = *oq0;
+    q1 = *oq1;
+    q2 = *oq2;
+    q3 = *oq3;
+    q4 = *oq4;
+    q5 = *oq5;
+    q6 = *oq6;
+    q7 = *oq7;
+
+    *op6 = (p7 * 7 + p6 * 2 +
+            p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
+    *op5 = (p7 * 6 + p6 + p5 * 2 +
+            p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
+    *op4 = (p7 * 5 + p6 + p5 + p4 * 2 +
+            p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
+    *op3 = (p7 * 4 + p6 + p5 + p4 + p3 * 2 +
+            p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
+    *op2 = (p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 +
+            p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
+    *op1 = (p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+            p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
+    *op0 = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+            q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
+    *oq0 = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+            q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
+    *oq1 = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+            q2 + q3 + q4 + q5 + q6 + q7 * 2 + 8) >> 4;
+    *oq2 = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+            q3 + q4 + q5 + q6 + q7 * 3 + 8) >> 4;
+    *oq3 = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
+            q4 + q5 + q6 + q7 * 4 + 8) >> 4;
+    *oq4 = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
+            q5 + q6 + q7 * 5 + 8) >> 4;
+    *oq5 = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
+            q6 + q7 * 6 + 8) >> 4;
+    *oq6 = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
+            q7 * 7 + 8) >> 4;
+  } else if (flat && mask) {
+    unsigned char p0, q0;
+    unsigned char p1, q1;
+    unsigned char p2, q2;
+    unsigned char p3, q3;
+    unsigned char p4, q4;
+
+    p4 = *op4;
+    p3 = *op3;
+    p2 = *op2;
+    p1 = *op1;
+    p0 = *op0;
+    q0 = *oq0;
+    q1 = *oq1;
+    q2 = *oq2;
+    q3 = *oq3;
+    q4 = *oq4;
+
+    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
+    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;
+    *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;
+    *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;
+    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;
+    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;
+  } else {
+    signed char ps0, qs0;
+    signed char ps1, qs1;
+    signed char filter, Filter1, Filter2;
+    signed char u;
+
+    ps1 = (signed char) * op1 ^ 0x80;
+    ps0 = (signed char) * op0 ^ 0x80;
+    qs0 = (signed char) * oq0 ^ 0x80;
+    qs1 = (signed char) * oq1 ^ 0x80;
+
+    /* add outer taps if we have high edge variance */
+    filter = signed_char_clamp(ps1 - qs1);
+    filter &= hev;
+
+    /* inner taps */
+    filter = signed_char_clamp(filter + 3 * (qs0 - ps0));
+    filter &= mask;
+
+    Filter1 = signed_char_clamp(filter + 4);
+    Filter2 = signed_char_clamp(filter + 3);
+    Filter1 >>= 3;
+    Filter2 >>= 3;
+
+    u = signed_char_clamp(qs0 - Filter1);
+    *oq0 = u ^ 0x80;
+    u = signed_char_clamp(ps0 + Filter2);
+    *op0 = u ^ 0x80;
+    filter = Filter1;
+
+    /* outer tap adjustments */
+    filter += 1;
+    filter >>= 1;
+    filter &= ~hev;
+
+    u = signed_char_clamp(qs1 - filter);
+    *oq1 = u ^ 0x80;
+    u = signed_char_clamp(ps1 + filter);
+    *op1 = u ^ 0x80;
+  }
+}
+
+void vp9_mb_lpf_horizontal_edge_w
+(
+  unsigned char *s,
+  int p,
+  const unsigned char *blimit,
+  const unsigned char *limit,
+  const unsigned char *thresh,
+  int count
+) {
+  signed char hev = 0; /* high edge variance */
+  signed char mask = 0;
+  signed char flat = 0;
+  signed char flat2 = 0;
+  int i = 0;
+
+  /* loop filter designed to work using chars so that we can make maximum use
+   * of 8 bit simd instructions.
+   */
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                       s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);
+
+    hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);
+
+    flat = flatmask(1,
+                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],
+                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);
+
+    flat2 = flatmask(1,
+                    s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],
+                    s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);
+
+    wide_mbfilter(mask, hev, flat, flat2,
+             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,
+             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+             s,         s + 1 * p, s + 2 * p, s + 3 * p,
+             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);
+
+    ++s;
+  } while (++i < count * 8);
+}
+void vp9_mb_lpf_vertical_edge_w
+(
+  unsigned char *s,
+  int p,
+  const unsigned char *blimit,
+  const unsigned char *limit,
+  const unsigned char *thresh,
+  int count
+) {
+  signed char hev = 0; /* high edge variance */
+  signed char mask = 0;
+  signed char flat = 0;
+  signed char flat2 = 0;
+  int i = 0;
+
+  do {
+    mask = filter_mask(limit[0], blimit[0],
+                       s[-4], s[-3], s[-2], s[-1],
+                       s[0], s[1], s[2], s[3]);
+
+    hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
+    flat = flatmask(1,
+                    s[-5], s[-4], s[-3], s[-2], s[-1],
+                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);
+    flat2 = flatmask(1,
+                    s[-8], s[-7], s[-6], s[-5], s[-1],
+                    s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);
+
+    wide_mbfilter(mask, hev, flat, flat2,
+             s - 8, s - 7, s - 6, s - 5,
+             s - 4, s - 3, s - 2, s - 1,
+             s,     s + 1, s + 2, s + 3,
+             s + 4, s + 5, s + 6, s + 7);
+    s += p;
+  } while (++i < count * 8);
+}
+
+void vp9_lpf_mbv_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                   unsigned char *v_ptr, int y_stride, int uv_stride,
+                   struct loop_filter_info *lfi) {
+  vp9_mb_lpf_vertical_edge_w(y_ptr, y_stride,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+void vp9_lpf_mbh_w_c(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           struct loop_filter_info *lfi) {
+  vp9_mb_lpf_horizontal_edge_w(y_ptr, y_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride,
+                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+
+  if (v_ptr)
+    vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride,
+                                        lfi->mblim, lfi->lim, lfi->hev_thr, 1);
+}
+
--- a/vp9/common/vp9_mbpitch.c
+++ b/vp9/common/vp9_mbpitch.c
@@ -16,17 +16,13 @@
   DEST = 1
 } BLOCKSET;
 
-static void setup_block
-(
-  BLOCKD *b,
-  int mv_stride,
-  unsigned char **base,
-  unsigned char **base2,
-  int Stride,
-  int offset,
-  BLOCKSET bs
-) {
-
+static void setup_block(BLOCKD *b,
+                        int mv_stride,
+                        uint8_t **base,
+                        uint8_t **base2,
+                        int Stride,
+                        int offset,
+                        BLOCKSET bs) {
   if (bs == DEST) {
     b->dst_stride = Stride;
     b->dst = offset;
@@ -37,15 +33,13 @@
     b->base_pre = base;
     b->base_second_pre = base2;
   }
-
 }
 
-
 static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) {
   int block;
 
-  unsigned char **y, **u, **v;
-  unsigned char **y2 = NULL, **u2 = NULL, **v2 = NULL;
+  uint8_t **y, **u, **v;
+  uint8_t **y2 = NULL, **u2 = NULL, **v2 = NULL;
   BLOCKD *blockd = xd->block;
   int stride;
 
@@ -117,7 +111,6 @@
 }
 
 void vp9_build_block_doffsets(MACROBLOCKD *xd) {
-
   /* handle the destination pitch features */
   setup_macroblock(xd, DEST);
   setup_macroblock(xd, PRED);
--- a/vp9/common/vp9_modecont.h
+++ b/vp9/common/vp9_modecont.h
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_MODECONT_H_
 #define VP9_COMMON_VP9_MODECONT_H_
 
 extern const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4];
-#endif
+
+#endif  // VP9_COMMON_VP9_MODECONT_H_
--- a/vp9/common/vp9_mv.h
+++ b/vp9/common/vp9_mv.h
@@ -8,14 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_MV_H_
 #define VP9_COMMON_VP9_MV_H_
+
 #include "vpx/vpx_integer.h"
 
 typedef struct {
-  short row;
-  short col;
+  int16_t row;
+  int16_t col;
 } MV;
 
 typedef union int_mv {
@@ -23,4 +23,4 @@
   MV as_mv;
 } int_mv; /* facilitates faster equality tests and copies */
 
-#endif
+#endif  // VP9_COMMON_VP9_MV_H_
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -17,7 +17,6 @@
 };
 static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =
   { 3, 3, 2, 1, 1, 1, 1, 1 };
-#if CONFIG_SUPERBLOCKS
 static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
     {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
     {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
@@ -24,7 +23,7 @@
 };
 static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
   { 3, 3, 2, 2, 2, 1, 1, 1 };
-#endif
+
 // clamp_mv
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
 static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
@@ -40,10 +39,29 @@
     mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
 }
 
+// Gets a candidate refenence motion vector from the given mode info
+// structure if one exists that matches the given reference frame.
+static int get_matching_candidate(
+  const MODE_INFO *candidate_mi,
+  MV_REFERENCE_FRAME ref_frame,
+  int_mv *c_mv
+) {
+  int ret_val = TRUE;
 
-// Gets a best matching candidate refenence motion vector
-// from the given mode info structure (if available)
-static int get_candidate_mvref(
+  if (ref_frame == candidate_mi->mbmi.ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+  } else {
+    ret_val = FALSE;
+  }
+
+  return ret_val;
+}
+
+// Gets candidate refenence motion vector(s) from the given mode info
+// structure if they exists and do NOT match the given reference frame.
+static void get_non_matching_candidates(
   const MODE_INFO *candidate_mi,
   MV_REFERENCE_FRAME ref_frame,
   MV_REFERENCE_FRAME *c_ref_frame,
@@ -52,61 +70,29 @@
   int_mv *c2_mv
 ) {
 
-  int ret_val = FALSE;
+  c_mv->as_int = 0;
   c2_mv->as_int = 0;
+  *c_ref_frame = INTRA_FRAME;
   *c2_ref_frame = INTRA_FRAME;
 
-  // Target ref frame matches candidate first ref frame
-  if (ref_frame == candidate_mi->mbmi.ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non zero vector we can use.
-    if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+  // If first candidate not valid neither will be.
+  if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) {
+    // First candidate
+    if (candidate_mi->mbmi.ref_frame != ref_frame) {
+      *c_ref_frame = candidate_mi->mbmi.ref_frame;
+      c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
     }
 
-  // Target ref frame matches candidate second ref frame
-  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non zero vector we can use.
-    if ((candidate_mi->mbmi.ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[0].as_int != 0) &&
-        (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.ref_frame;
-    }
-
-  // No ref frame matches so use first ref mv as first choice
-  } else if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = candidate_mi->mbmi.ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non zero vector we can use.
+    // Second candidate
     if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+        (candidate_mi->mbmi.second_ref_frame != ref_frame)) {  // &&
+        // (candidate_mi->mbmi.mv[1].as_int != 0) &&
+        // (candidate_mi->mbmi.mv[1].as_int !=
+        // candidate_mi->mbmi.mv[0].as_int)) {
       *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
     }
-
-  // If only the second ref mv is valid:- (Should not trigger in current code
-  // base given current possible compound prediction options).
-  } else if (candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    ret_val = TRUE;
   }
-
-  return ret_val;
 }
 
 // Performs mv adjustment based on reference frame and clamps the MV
@@ -170,14 +156,20 @@
   int weight
 ) {
 
-  int i = *index;
+  int i;
+  int insert_point;
   int duplicate_found = FALSE;
 
-  // Check for duplicates. If there is one increment its score.
-  // Duplicate defined as being the same full pel vector with rounding.
+  // Check for duplicates. If there is one increase its score.
+  // We only compare vs the current top candidates.
+  insert_point = (*index < (MAX_MV_REF_CANDIDATES - 1))
+                 ? *index : (MAX_MV_REF_CANDIDATES - 1);
+
+  i = insert_point;
+  if (*index > i)
+    i++;
   while (i > 0) {
     i--;
-
     if (candidate_mv.as_int == mv_list[i].as_int) {
       duplicate_found = TRUE;
       mv_scores[i] += weight;
@@ -185,11 +177,13 @@
     }
   }
 
-  // If no duplicate was found add the new vector and give it a weight
-  if (!duplicate_found) {
-    mv_list[*index].as_int = candidate_mv.as_int;
-    mv_scores[*index] = weight;
-    i = *index;
+  // If no duplicate and the new candidate is good enough then add it.
+  if (!duplicate_found ) {
+    if (weight > mv_scores[insert_point]) {
+      mv_list[insert_point].as_int = candidate_mv.as_int;
+      mv_scores[insert_point] = weight;
+      i = insert_point;
+    }
     (*index)++;
   }
 
@@ -224,26 +218,23 @@
   int i;
   MODE_INFO *candidate_mi;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int_mv candidate_mvs[MAX_MV_REFS];
+  int_mv candidate_mvs[MAX_MV_REF_CANDIDATES];
   int_mv c_refmv;
-  MV_REFERENCE_FRAME c_ref_frame;
   int_mv c2_refmv;
+  MV_REFERENCE_FRAME c_ref_frame;
   MV_REFERENCE_FRAME c2_ref_frame;
-  int candidate_scores[MAX_MV_REFS];
+  int candidate_scores[MAX_MV_REF_CANDIDATES];
   int index = 0;
   int split_count = 0;
-  int ref_weight = 0;
-  int valid_mv_ref;
   int (*mv_ref_search)[2];
   int *ref_distance_weight;
 
   // Blank the reference vector lists and other local structures.
-  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
-  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
+  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
+  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
   vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
 
-#if CONFIG_SUPERBLOCKS
-  if (mbmi->encoded_as_sb) {
+  if (mbmi->sb_type) {
     mv_ref_search = sb_mv_ref_search;
     ref_distance_weight = sb_ref_distance_weight;
   } else {
@@ -250,12 +241,9 @@
     mv_ref_search = mb_mv_ref_search;
     ref_distance_weight = mb_ref_distance_weight;
   }
-#else
-  mv_ref_search = mb_mv_ref_search;
-  ref_distance_weight = mb_ref_distance_weight;
-#endif
-  // Populate a list with candidate reference vectors from the
-  // spatial neighbours.
+
+  // We first scan for candidate vectors that match the current reference frame
+  // Look at nearest neigbours
   for (i = 0; i < 2; ++i) {
     if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
         ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
@@ -263,95 +251,89 @@
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
-
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
-        split_count += (candidate_mi->mbmi.mode == SPLITMV);
-
+      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
+        clamp_mv(xd, &c_refmv);
         addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
-
-        // If there is a second valid mv then add it as well.
-        if (c2_ref_frame > INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
-
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
-        }
+                          &index, c_refmv, ref_distance_weight[i] + 16);
       }
+      split_count += (candidate_mi->mbmi.mode == SPLITMV);
     }
   }
-
-  // Look at the corresponding vector in the last frame
+  // Look in the last frame
   candidate_mi = lf_here;
-  valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                     &c_ref_frame, &c_refmv,
-                                     &c2_ref_frame, &c2_refmv);
-
-  // If there is a valid MV candidate then add it to the list
-  if (valid_mv_ref) {
-    scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-    ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
+  if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
+    clamp_mv(xd, &c_refmv);
     addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, ref_weight);
-
-    // If there is a second valid mv then add it as well.
-    if (c2_ref_frame > INTRA_FRAME) {
-      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-      ref_weight = ref_distance_weight[i] +
-                   ((c2_ref_frame == ref_frame) << 4);
-
-      addmv_and_shuffle(candidate_mvs, candidate_scores,
-                        &index, c2_refmv, ref_weight);
-    }
+                      &index, c_refmv, 18);
   }
-
-  // Populate a list with candidate reference vectors from the
-  // spatial neighbours.
-  for (i = 2; (i < MVREF_NEIGHBOURS) && (index < (MAX_MV_REFS - 2)); ++i) {
+  // More distant neigbours
+  for (i = 2; (i < MVREF_NEIGHBOURS) &&
+              (index < (MAX_MV_REF_CANDIDATES - 1)); ++i) {
     if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
         ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
       candidate_mi = here + mv_ref_search[i][0] +
                      (mv_ref_search[i][1] * xd->mode_info_stride);
 
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
+      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {
+        clamp_mv(xd, &c_refmv);
+        addmv_and_shuffle(candidate_mvs, candidate_scores,
+                          &index, c_refmv, ref_distance_weight[i] + 16);
+      }
+    }
+  }
 
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
+  // If we have not found enough candidates consider ones where the
+  // reference frame does not match. Break out when we have
+  // MAX_MV_REF_CANDIDATES candidates.
+  // Look first at spatial neighbours
+  if (index < (MAX_MV_REF_CANDIDATES - 1)) {
+    for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
+      if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+          ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
 
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
+        candidate_mi = here + mv_ref_search[i][0] +
+                       (mv_ref_search[i][1] * xd->mode_info_stride);
 
-        // If there is a second valid mv then add it as well.
-        if (c2_ref_frame > INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias );
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
+        get_non_matching_candidates(candidate_mi, ref_frame,
+                                    &c_ref_frame, &c_refmv,
+                                    &c2_ref_frame, &c2_refmv);
 
+        if (c_ref_frame != INTRA_FRAME) {
+          scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
           addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
+                            &index, c_refmv, ref_distance_weight[i]);
         }
+
+        if (c2_ref_frame != INTRA_FRAME) {
+          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+          addmv_and_shuffle(candidate_mvs, candidate_scores,
+                            &index, c2_refmv, ref_distance_weight[i]);
+        }
       }
+
+      if (index >= (MAX_MV_REF_CANDIDATES - 1)) {
+        break;
+      }
     }
   }
+  // Look at the last frame
+  if (index < (MAX_MV_REF_CANDIDATES - 1)) {
+    candidate_mi = lf_here;
+    get_non_matching_candidates(candidate_mi, ref_frame,
+                                &c_ref_frame, &c_refmv,
+                                &c2_ref_frame, &c2_refmv);
 
-  // Make sure we are able to add 0,0
-  if (index > (MAX_MV_REFS - 1)) {
-    index = (MAX_MV_REFS - 1);
+    if (c_ref_frame != INTRA_FRAME) {
+      scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+      addmv_and_shuffle(candidate_mvs, candidate_scores,
+                        &index, c_refmv, 2);
+    }
+
+    if (c2_ref_frame != INTRA_FRAME) {
+      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+      addmv_and_shuffle(candidate_mvs, candidate_scores,
+                        &index, c2_refmv, 2);
+    }
   }
 
   // Define inter mode coding context.
@@ -383,14 +365,12 @@
   }
 
   // 0,0 is always a valid reference.
-  for (i = 0; i < index; ++i) {
+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
     if (candidate_mvs[i].as_int == 0)
       break;
   }
-  if (i == index) {
-    c_refmv.as_int = 0;
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, candidate_scores[3]+1 );
+  if (i == MAX_MV_REF_CANDIDATES) {
+    candidate_mvs[MAX_MV_REF_CANDIDATES-1].as_int = 0;
   }
 
   // Copy over the candidate list.
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -11,18 +11,14 @@
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_blockd.h"
 
-
 #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_
 #define VP9_COMMON_VP9_MVREF_COMMON_H_
 
-void vp9_find_mv_refs(
-  MACROBLOCKD *xd,
-  MODE_INFO *here,
-  MODE_INFO *lf_here,
-  MV_REFERENCE_FRAME ref_frame,
-  int_mv * mv_ref_list,
-  int *ref_sign_bias
-);
+void vp9_find_mv_refs(MACROBLOCKD *xd,
+                      MODE_INFO *here,
+                      MODE_INFO *lf_here,
+                      MV_REFERENCE_FRAME ref_frame,
+                      int_mv *mv_ref_list,
+                      int *ref_sign_bias);
 
-#endif
-
+#endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
--- a/vp9/common/vp9_onyx.h
+++ b/vp9/common/vp9_onyx.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_ONYX_H_
 #define VP9_COMMON_VP9_ONYX_H_
 
@@ -20,7 +19,6 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
 #include "vpx_scale/yv12config.h"
-#include "vp9/common/vp9_type_aliases.h"
 #include "vp9/common/vp9_ppflags.h"
   typedef int *VP9_PTR;
 
@@ -222,4 +220,4 @@
 }
 #endif
 
-#endif  // __INC_ONYX_H
+#endif  // VP9_COMMON_VP9_ONYX_H_
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_ONYXC_INT_H_
 #define VP9_COMMON_VP9_ONYXC_INT_H_
 
@@ -45,27 +44,24 @@
 typedef struct frame_contexts {
   vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-#if CONFIG_SUPERBLOCKS
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
-#endif
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];
-  vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];
 
   nmv_context nmvc;
   nmv_context pre_nmvc;
   vp9_prob pre_bmode_prob[VP9_NKF_BINTRAMODES - 1];
   vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
-#if CONFIG_SUPERBLOCKS
   vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1];
-#endif
   vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
   vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1];
   vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];
@@ -72,44 +68,28 @@
   vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1];
   unsigned int bmode_counts[VP9_NKF_BINTRAMODES];
   unsigned int ymode_counts[VP9_YMODES];   /* interframe intra mode probs */
-#if CONFIG_SUPERBLOCKS
   unsigned int sb_ymode_counts[VP9_I32X32_MODES];
-#endif
   unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES];
   unsigned int i8x8_mode_counts[VP9_I8X8_MODES];   /* interframe intra probs */
   unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];
   unsigned int mbsplit_counts[VP9_NUMMBSPLITS];
 
-  vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs pre_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs pre_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs pre_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES_32X32];
 
-  vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
+  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];
 
-  vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-
-  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]
-      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
   nmv_context_counts NMVcount;
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
@@ -139,16 +119,17 @@
   ONLY_4X4            = 0,
   ALLOW_8X8           = 1,
   ALLOW_16X16         = 2,
-  TX_MODE_SELECT      = 3,
-  NB_TXFM_MODES       = 4,
+  ALLOW_32X32         = 3,
+  TX_MODE_SELECT      = 4,
+  NB_TXFM_MODES       = 5,
 } TXFM_MODE;
 
 typedef struct VP9Common {
   struct vpx_internal_error_info  error;
 
-  DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]);
-  DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, int16_t, Y1dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, int16_t, Y2dequant[QINDEX_RANGE][16]);
+  DECLARE_ALIGNED(16, int16_t, UVdequant[QINDEX_RANGE][16]);
 
   int Width;
   int Height;
@@ -234,7 +215,7 @@
 
   /* Y,U,V,Y2 */
   ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */
-  ENTROPY_CONTEXT_PLANES left_context[2];  /* (up to) 4 contexts "" */
+  ENTROPY_CONTEXT_PLANES left_context[4];  /* (up to) 4 contexts "" */
 
   /* keyframe block modes are predicted by their above, left neighbors */
 
@@ -242,9 +223,7 @@
                         [VP9_KF_BINTRAMODES]
                         [VP9_KF_BINTRAMODES - 1];
   vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */
-#if CONFIG_SUPERBLOCKS
   vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1];
-#endif
   int kf_ymode_probs_index;
   int kf_ymode_probs_update;
   vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1];
@@ -252,9 +231,8 @@
   vp9_prob prob_intra_coded;
   vp9_prob prob_last_coded;
   vp9_prob prob_gf_coded;
-#if CONFIG_SUPERBLOCKS
-  vp9_prob sb_coded;
-#endif
+  vp9_prob sb32_coded;
+  vp9_prob sb64_coded;
 
   // Context probabilities when using predictive coding of segment id
   vp9_prob segment_pred_probs[PREDICTION_PROBS];
@@ -268,7 +246,7 @@
   vp9_prob prob_comppred[COMP_PRED_CONTEXTS];
 
   // FIXME contextualize
-  vp9_prob prob_tx[TX_SIZE_MAX - 1];
+  vp9_prob prob_tx[TX_SIZE_MAX_SB - 1];
 
   vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];
 
@@ -290,13 +268,6 @@
   struct postproc_state  postproc_state;
 #endif
 
-#if CONFIG_PRED_FILTER
-  /* Prediction filter variables */
-  int pred_filter_mode;   // 0=disabled at the frame level (no MB filtered)
-  // 1=enabled at the frame level (all MB filtered)
-  // 2=specified per MB (1=filtered, 0=non-filtered)
-  vp9_prob prob_pred_filter_off;
-#endif
 #if CONFIG_COMP_INTERINTRA_PRED
   int use_interintra;
 #endif
@@ -303,4 +274,4 @@
 
 } VP9_COMMON;
 
-#endif  // __INC_ONYX_INT_H
+#endif  // VP9_COMMON_VP9_ONYXC_INT_H_
--- a/vp9/common/vp9_onyxd.h
+++ /dev/null
@@ -1,68 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_VP9_ONYXD_H_
-#define VP9_COMMON_VP9_ONYXD_H_
-
-
-/* Create/destroy static data structures. */
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "vp9/common/vp9_type_aliases.h"
-#include "vpx_scale/yv12config.h"
-#include "vp9/common/vp9_ppflags.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_codec.h"
-
-  typedef void   *VP9D_PTR;
-  typedef struct {
-    int     Width;
-    int     Height;
-    int     Version;
-    int     postprocess;
-    int     max_threads;
-    int     input_partition;
-  } VP9D_CONFIG;
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
-
-  void vp9_initialize_dec(void);
-
-  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
-                                  const unsigned char **dest,
-                                  int64_t time_stamp);
-
-  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
-                        vp9_ppflags_t *flags);
-
-  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
-
-  void vp9_remove_decompressor(VP9D_PTR comp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // __INC_ONYXD_H
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -13,7 +13,7 @@
 #include "vpx_scale/yv12config.h"
 #include "vp9/common/vp9_postproc.h"
 #include "vp9/common/vp9_textblit.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "./vp9_rtcd.h"
 #include "./vpx_scale_rtcd.h"
@@ -132,20 +132,20 @@
 
 /****************************************************************************
  */
-void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,
-                                     unsigned char *dst_ptr,
+void vp9_post_proc_down_and_across_c(uint8_t *src_ptr,
+                                     uint8_t *dst_ptr,
                                      int src_pixels_per_line,
                                      int dst_pixels_per_line,
                                      int rows,
                                      int cols,
                                      int flimit) {
-  unsigned char *p_src, *p_dst;
+  uint8_t *p_src, *p_dst;
   int row;
   int col;
   int i;
   int v;
   int pitch = src_pixels_per_line;
-  unsigned char d[8];
+  uint8_t d[8];
   (void)dst_pixels_per_line;
 
   for (row = 0; row < rows; row++) {
@@ -215,12 +215,12 @@
   return x * x / 3;
 }
 
-void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,
+void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
                                  int rows, int cols, int flimit) {
   int r, c, i;
 
-  unsigned char *s = src;
-  unsigned char d[16];
+  uint8_t *s = src;
+  uint8_t d[16];
 
 
   for (r = 0; r < rows; r++) {
@@ -253,16 +253,16 @@
   }
 }
 
-void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,
+void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
                             int rows, int cols, int flimit) {
   int r, c, i;
   const short *rv3 = &vp9_rv[63 & rand()];
 
   for (c = 0; c < cols; c++) {
-    unsigned char *s = &dst[c];
+    uint8_t *s = &dst[c];
     int sumsq = 0;
     int sum   = 0;
-    unsigned char d[16];
+    uint8_t d[16];
     const short *rv2 = rv3 + ((c * 17) & 127);
 
     for (i = -8; i <= 6; i++) {
@@ -439,7 +439,7 @@
  *  SPECIAL NOTES : None.
  *
  ****************************************************************************/
-void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
+void vp9_plane_add_noise_c(uint8_t *Start, char *noise,
                            char blackclamp[16],
                            char whiteclamp[16],
                            char bothclamp[16],
@@ -447,7 +447,7 @@
   unsigned int i, j;
 
   for (i = 0; i < Height; i++) {
-    unsigned char *Pos = Start + i * Pitch;
+    uint8_t *Pos = Start + i * Pitch;
     char  *Ref = (char *)(noise + (rand() & 0xff));
 
     for (j = 0; j < Width; j++) {
@@ -466,7 +466,7 @@
  * edges unblended to give distinction to macro blocks in areas
  * filled with the same color block.
  */
-void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
+void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v,
                           int y1, int u1, int v1, int alpha, int stride) {
   int i, j;
   int y1_const = y1 * ((1 << 16) - alpha);
@@ -499,7 +499,7 @@
 /* Blend only the edge of the macro block.  Leave center
  * unblended to allow for other visualizations to be layered.
  */
-void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
+void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v,
                           int y1, int u1, int v1, int alpha, int stride) {
   int i, j;
   int y1_const = y1 * ((1 << 16) - alpha);
@@ -554,7 +554,7 @@
   }
 }
 
-void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
+void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v,
                    int y1, int u1, int v1, int alpha, int stride) {
   int i, j;
   int y1_const = y1 * ((1 << 16) - alpha);
@@ -688,7 +688,7 @@
 
   if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
     int i, j;
-    unsigned char *y_ptr;
+    uint8_t *y_ptr;
     YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width  >> 4;
@@ -717,7 +717,7 @@
 
   if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
     int i, j;
-    unsigned char *y_ptr;
+    uint8_t *y_ptr;
     YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
     int mb_rows = post->y_height >> 4;
     int mb_cols = post->y_width  >> 4;
@@ -764,7 +764,7 @@
     YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
     int width  = post->y_width;
     int height = post->y_height;
-    unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+    uint8_t *y_buffer = oci->post_proc_buffer.y_buffer;
     int y_stride = oci->post_proc_buffer.y_stride;
     MODE_INFO *mi = oci->mi;
     int x0, y0;
@@ -906,9 +906,9 @@
     YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
     int width  = post->y_width;
     int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
     int y_stride = oci->post_proc_buffer.y_stride;
     MODE_INFO *mi = oci->mi;
 
@@ -920,7 +920,7 @@
             ((ppflags->display_mb_modes_flag & B_PRED) ||
              ppflags->display_b_modes_flag)) {
           int by, bx;
-          unsigned char *yl, *ul, *vl;
+          uint8_t *yl, *ul, *vl;
           union b_mode_info *bmi = mi->bmi;
 
           yl = y_ptr + x;
@@ -971,9 +971,9 @@
     YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
     int width  = post->y_width;
     int height = post->y_height;
-    unsigned char *y_ptr = oci->post_proc_buffer.y_buffer;
-    unsigned char *u_ptr = oci->post_proc_buffer.u_buffer;
-    unsigned char *v_ptr = oci->post_proc_buffer.v_buffer;
+    uint8_t *y_ptr = oci->post_proc_buffer.y_buffer;
+    uint8_t *u_ptr = oci->post_proc_buffer.u_buffer;
+    uint8_t *v_ptr = oci->post_proc_buffer.v_buffer;
     int y_stride = oci->post_proc_buffer.y_stride;
     MODE_INFO *mi = oci->mi;
 
--- a/vp9/common/vp9_postproc.h
+++ b/vp9/common/vp9_postproc.h
@@ -38,4 +38,5 @@
                  int                         q,
                  int                         low_var_thresh,
                  int                         flag);
-#endif
+
+#endif  // VP9_COMMON_VP9_POSTPROC_H_
--- a/vp9/common/vp9_ppflags.h
+++ b/vp9/common/vp9_ppflags.h
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_PPFLAGS_H_
 #define VP9_COMMON_VP9_PPFLAGS_H_
+
 enum {
   VP9D_NOFILTERING            = 0,
   VP9D_DEBLOCK                = 1 << 0,
@@ -35,4 +35,4 @@
   int display_mv_flag;
 } vp9_ppflags_t;
 
-#endif
+#endif  // VP9_COMMON_VP9_PPFLAGS_H_
--- a/vp9/common/vp9_pragmas.h
+++ b/vp9/common/vp9_pragmas.h
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP9_COMMON_VP9_PRAGMAS_H_
+#define VP9_COMMON_VP9_PRAGMAS_H_
 
-
-
 #ifdef __INTEL_COMPILER
 #pragma warning(disable:997 1011 170)
 #endif
@@ -17,3 +17,5 @@
 #ifdef _MSC_VER
 #pragma warning(disable:4799)
 #endif
+
+#endif  // VP9_COMMON_VP9_PRAGMAS_H_
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -9,8 +9,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_treecoder.h"
 
 // TBD prediction functions for various bitstream signals
 
@@ -221,54 +223,57 @@
 void vp9_set_pred_flag(MACROBLOCKD *const xd,
                        PRED_ID pred_id,
                        unsigned char pred_flag) {
-#if CONFIG_SUPERBLOCKS
   const int mis = xd->mode_info_stride;
-#endif
 
   switch (pred_id) {
     case PRED_SEG_ID:
       xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge >= 0)
-          xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge >= 0) {
-          xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
-          if (xd->mb_to_right_edge >= 0)
-            xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
+      if (xd->mode_info_context->mbmi.sb_type) {
+#define sub(a, b) (b) < 0 ? (a) + (b) : (a)
+        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
+        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
+        int x, y;
+
+        for (y = 0; y < y_mbs; y++) {
+          for (x = !y; x < x_mbs; x++) {
+            xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted =
+                pred_flag;
+          }
         }
       }
-#endif
       break;
 
     case PRED_REF:
       xd->mode_info_context->mbmi.ref_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge >= 0)
-          xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
-        if (xd->mb_to_bottom_edge >= 0) {
-          xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
-          if (xd->mb_to_right_edge >= 0)
-            xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
+      if (xd->mode_info_context->mbmi.sb_type) {
+        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
+        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
+        int x, y;
+
+        for (y = 0; y < y_mbs; y++) {
+          for (x = !y; x < x_mbs; x++) {
+            xd->mode_info_context[y * mis + x].mbmi.ref_predicted = pred_flag;
+          }
         }
       }
-#endif
       break;
 
     case PRED_MBSKIP:
       xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (xd->mb_to_right_edge >= 0)
-          xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
-        if (xd->mb_to_bottom_edge >= 0) {
-          xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
-          if (xd->mb_to_right_edge >= 0)
-            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
+      if (xd->mode_info_context->mbmi.sb_type) {
+        const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+        const int x_mbs = sub(n_mbs, xd->mb_to_right_edge >> 7);
+        const int y_mbs = sub(n_mbs, xd->mb_to_bottom_edge >> 7);
+        int x, y;
+
+        for (y = 0; y < y_mbs; y++) {
+          for (x = !y; x < x_mbs; x++) {
+            xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag;
+          }
         }
       }
-#endif
       break;
 
     default:
@@ -286,25 +291,25 @@
                                     const MACROBLOCKD *const xd, int MbIndex) {
   // Currently the prediction for the macroblock segment ID is
   // the value stored for this macroblock in the previous frame.
-#if CONFIG_SUPERBLOCKS
-  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
+  if (!xd->mode_info_context->mbmi.sb_type) {
     return cm->last_frame_seg_map[MbIndex];
-#if CONFIG_SUPERBLOCKS
   } else {
-    int seg_id = cm->last_frame_seg_map[MbIndex];
-    int mb_col = MbIndex % cm->mb_cols;
-    int mb_row = MbIndex / cm->mb_cols;
-    if (mb_col + 1 < cm->mb_cols)
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
-    if (mb_row + 1 < cm->mb_rows) {
-      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
-      if (mb_col + 1 < cm->mb_cols)
-        seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
+    const int n_mbs = 1 << xd->mode_info_context->mbmi.sb_type;
+    const int mb_col = MbIndex % cm->mb_cols;
+    const int mb_row = MbIndex / cm->mb_cols;
+    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
+    int x, y;
+    unsigned seg_id = -1;
+
+    for (y = mb_row; y < mb_row + y_mbs; y++) {
+      for (x = mb_col; x < mb_col + x_mbs; x++) {
+        seg_id = MIN(seg_id, cm->last_frame_seg_map[cm->mb_cols * y + x]);
+      }
     }
+
     return seg_id;
   }
-#endif
 }
 
 MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
@@ -383,26 +388,13 @@
   int tot_count;
 
   tot_count = count[0] + count[1] + count[2] + count[3];
-  if (tot_count) {
-    probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
-    probs[0] += !probs[0];
-  } else
-    probs[0] = 128;
+  probs[0] = get_prob(count[0], tot_count);
 
   tot_count -= count[0];
-  if (tot_count) {
-    probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
-    probs[1] += !probs[1];
-  } else
-    probs[1] = 128;
+  probs[1] = get_prob(count[1], tot_count);
 
   tot_count -= count[1];
-  if (tot_count) {
-    probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
-    probs[2] += !probs[2];
-  } else
-    probs[2] = 128;
-
+  probs[2] = get_prob(count[2], tot_count);
 }
 
 // Computes a set of modified conditional probabilities for the reference frame
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_type_aliases.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_blockd.h"
 
@@ -53,4 +52,4 @@
                                        const MACROBLOCKD *const xd);
 extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);
 
-#endif /* __INC_PRED_COMMON_H__ */
+#endif  // VP9_COMMON_VP9_PRED_COMMON_H_
--- a/vp9/common/vp9_quant_common.h
+++ b/vp9/common/vp9_quant_common.h
@@ -8,12 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_
+#define VP9_COMMON_VP9_QUANT_COMMON_H_
 
 #include "string.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
-extern void vp9_init_quant_tables();
+extern void vp9_init_quant_tables(void);
 extern int vp9_ac_yquant(int QIndex);
 extern int vp9_dc_quant(int QIndex, int Delta);
 extern int vp9_dc2quant(int QIndex, int Delta);
@@ -20,3 +22,5 @@
 extern int vp9_ac2quant(int QIndex, int Delta);
 extern int vp9_dc_uv_quant(int QIndex, int Delta);
 extern int vp9_ac_uv_quant(int QIndex, int Delta);
+
+#endif  // VP9_COMMON_VP9_QUANT_COMMON_H_
--- a/vp9/common/vp9_recon.c
+++ b/vp9/common/vp9_recon.c
@@ -13,26 +13,15 @@
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_recon_b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
+void vp9_recon_b_c(uint8_t *pred_ptr,
+                   int16_t *diff_ptr,
+                   uint8_t *dst_ptr,
+                   int stride) {
   int r, c;
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
+      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
     }
 
     dst_ptr += stride;
@@ -41,26 +30,15 @@
   }
 }
 
-void vp9_recon_uv_b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
+void vp9_recon_uv_b_c(uint8_t *pred_ptr,
+                      int16_t *diff_ptr,
+                      uint8_t *dst_ptr,
+                      int stride) {
   int r, c;
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 4; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
+      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
     }
 
     dst_ptr += stride;
@@ -68,26 +46,16 @@
     pred_ptr += 8;
   }
 }
-void vp9_recon4b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
+
+void vp9_recon4b_c(uint8_t *pred_ptr,
+                   int16_t *diff_ptr,
+                   uint8_t *dst_ptr,
+                   int stride) {
   int r, c;
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 16; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
+      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
     }
 
     dst_ptr += stride;
@@ -96,26 +64,15 @@
   }
 }
 
-void vp9_recon2b_c
-(
-  unsigned char *pred_ptr,
-  short *diff_ptr,
-  unsigned char *dst_ptr,
-  int stride
-) {
+void vp9_recon2b_c(uint8_t *pred_ptr,
+                   int16_t *diff_ptr,
+                   uint8_t *dst_ptr,
+                   int stride) {
   int r, c;
 
   for (r = 0; r < 4; r++) {
     for (c = 0; c < 8; c++) {
-      int a = diff_ptr[c] + pred_ptr[c];
-
-      if (a < 0)
-        a = 0;
-
-      if (a > 255)
-        a = 255;
-
-      dst_ptr[c] = (unsigned char) a;
+      dst_ptr[c] = clip_pixel(diff_ptr[c] + pred_ptr[c]);
     }
 
     dst_ptr += stride;
@@ -124,21 +81,15 @@
   }
 }
 
-#if CONFIG_SUPERBLOCKS
 void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
   int x, y;
   BLOCKD *b = &xd->block[0];
   int stride = b->dst_stride;
-  short *diff = b->diff;
+  int16_t *diff = b->diff;
 
   for (y = 0; y < 16; y++) {
     for (x = 0; x < 16; x++) {
-      int a = dst[x] + diff[x];
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-      dst[x] = a;
+      dst[x] = clip_pixel(dst[x] + diff[x]);
     }
     dst += stride;
     diff += 16;
@@ -152,16 +103,11 @@
   for (i = 0; i < 2; i++, dst = vdst) {
     BLOCKD *b = &xd->block[16 + 4 * i];
     int stride = b->dst_stride;
-    short *diff = b->diff;
+    int16_t *diff = b->diff;
 
     for (y = 0; y < 8; y++) {
       for (x = 0; x < 8; x++) {
-        int a = dst[x] + diff[x];
-        if (a < 0)
-          a = 0;
-        else if (a > 255)
-          a = 255;
-        dst[x] = a;
+        dst[x] = clip_pixel(dst[x] + diff[x]);
       }
       dst += stride;
       diff += 8;
@@ -168,7 +114,36 @@
     }
   }
 }
-#endif
+
+void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y, stride = xd->block[0].dst_stride;
+  int16_t *diff = xd->sb_coeff_data.diff;
+
+  for (y = 0; y < 32; y++) {
+    for (x = 0; x < 32; x++) {
+      dst[x] = clip_pixel(dst[x] + diff[x]);
+    }
+    dst += stride;
+    diff += 32;
+  }
+}
+
+void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, stride = xd->block[16].dst_stride;
+  int16_t *udiff = xd->sb_coeff_data.diff + 1024;
+  int16_t *vdiff = xd->sb_coeff_data.diff + 1280;
+
+  for (y = 0; y < 16; y++) {
+    for (x = 0; x < 16; x++) {
+      udst[x] = clip_pixel(udst[x] + udiff[x]);
+      vdst[x] = clip_pixel(vdst[x] + vdiff[x]);
+    }
+    udst += stride;
+    vdst += stride;
+    udiff += 16;
+    vdiff += 16;
+  }
+}
 
 void vp9_recon_mby_c(MACROBLOCKD *xd) {
   int i;
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -18,45 +18,58 @@
 void vp9_setup_interp_filters(MACROBLOCKD *xd,
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
+#if CONFIG_ENABLE_6TAP
   if (mcomp_filter_type == SIXTAP) {
-    xd->subpixel_predict        = vp9_sixtap_predict;
+    xd->subpixel_predict4x4     = vp9_sixtap_predict4x4;
     xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;
     xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;
     xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;
-    xd->subpixel_predict_avg    = vp9_sixtap_predict_avg;
+    xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;
     xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
     xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
-  } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
-    xd->subpixel_predict        = vp9_eighttap_predict;
+  } else {
+#endif
+  if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
+    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4;
     xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;
     xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;
     xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;
-    xd->subpixel_predict_avg    = vp9_eighttap_predict_avg4x4;
+    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;
     xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
     xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
+  } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {
+    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_smooth;
+    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_smooth;
+    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_smooth;
+    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_smooth;
+    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;
+    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;
+    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;
   } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
-    xd->subpixel_predict        = vp9_eighttap_predict_sharp;
+    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_sharp;
     xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;
     xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;
     xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;
-    xd->subpixel_predict_avg    = vp9_eighttap_predict_avg4x4_sharp;
+    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;
     xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
     xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;
-  }
-  else {
-    xd->subpixel_predict        = vp9_bilinear_predict4x4;
+  } else {
+    xd->subpixel_predict4x4     = vp9_bilinear_predict4x4;
     xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;
     xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;
     xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;
-    xd->subpixel_predict_avg    = vp9_bilinear_predict_avg4x4;
+    xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;
     xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
     xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
   }
+#if CONFIG_ENABLE_6TAP
+  }
+#endif
 }
 
-void vp9_copy_mem16x16_c(unsigned char *src,
+void vp9_copy_mem16x16_c(uint8_t *src,
                          int src_stride,
-                         unsigned char *dst,
+                         uint8_t *dst,
                          int dst_stride) {
   int r;
 
@@ -91,9 +104,9 @@
   }
 }
 
-void vp9_avg_mem16x16_c(unsigned char *src,
+void vp9_avg_mem16x16_c(uint8_t *src,
                         int src_stride,
-                        unsigned char *dst,
+                        uint8_t *dst,
                         int dst_stride) {
   int r;
 
@@ -109,9 +122,9 @@
   }
 }
 
-void vp9_copy_mem8x8_c(unsigned char *src,
+void vp9_copy_mem8x8_c(uint8_t *src,
                        int src_stride,
-                       unsigned char *dst,
+                       uint8_t *dst,
                        int dst_stride) {
   int r;
 
@@ -134,9 +147,9 @@
   }
 }
 
-void vp9_avg_mem8x8_c(unsigned char *src,
+void vp9_avg_mem8x8_c(uint8_t *src,
                       int src_stride,
-                      unsigned char *dst,
+                      uint8_t *dst,
                       int dst_stride) {
   int r;
 
@@ -152,9 +165,9 @@
   }
 }
 
-void vp9_copy_mem8x4_c(unsigned char *src,
+void vp9_copy_mem8x4_c(uint8_t *src,
                        int src_stride,
-                       unsigned char *dst,
+                       uint8_t *dst,
                        int dst_stride) {
   int r;
 
@@ -179,9 +192,9 @@
 
 void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
   int r;
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
+  uint8_t *ptr_base;
+  uint8_t *ptr;
+  uint8_t *pred_ptr = d->predictor;
   int_mv mv;
 
   ptr_base = *(d->base_pre);
@@ -221,9 +234,9 @@
 void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
                                       vp9_subpix_fn_t sppf) {
   int r;
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
+  uint8_t *ptr_base;
+  uint8_t *ptr;
+  uint8_t *pred_ptr = d->predictor;
   int_mv mv;
 
   ptr_base = *(d->base_second_pre);
@@ -251,9 +264,9 @@
 }
 
 void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
+  uint8_t *ptr_base;
+  uint8_t *ptr;
+  uint8_t *pred_ptr = d->predictor;
   int_mv mv;
 
   ptr_base = *(d->base_pre);
@@ -277,9 +290,9 @@
  */
 void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
                                       BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
+  uint8_t *ptr_base;
+  uint8_t *ptr;
+  uint8_t *pred_ptr = d->predictor;
   int_mv mv;
 
   ptr_base = *(d->base_second_pre);
@@ -296,9 +309,9 @@
 }
 
 static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
-  unsigned char *ptr_base;
-  unsigned char *ptr;
-  unsigned char *pred_ptr = d->predictor;
+  uint8_t *ptr_base;
+  uint8_t *ptr;
+  uint8_t *pred_ptr = d->predictor;
   int_mv mv;
 
   ptr_base = *(d->base_pre);
@@ -314,133 +327,7 @@
   }
 }
 
-
 /*encoder only*/
-#if CONFIG_PRED_FILTER
-
-// Select the thresholded or non-thresholded filter
-#define USE_THRESH_FILTER 0
-
-#define PRED_FILT_LEN 5
-
-static const int filt_shift = 4;
-static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1};
-// Alternative filter {1, 1, 4, 1, 1}
-
-#if !USE_THRESH_FILTER
-void filter_mb(unsigned char *src, int src_stride,
-               unsigned char *dst, int dst_stride,
-               int width, int height) {
-  int i, j, k;
-  unsigned int Temp[32 * 32];
-  unsigned int  *pTmp = Temp;
-  unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2);
-
-  // Horizontal
-  for (i = 0; i < height + PRED_FILT_LEN - 1; i++) {
-    for (j = 0; j < width; j++) {
-      int sum = 0;
-      for (k = 0; k < PRED_FILT_LEN; k++)
-        sum += pSrc[j + k] * pred_filter[k];
-      pTmp[j] = sum;
-    }
-
-    pSrc += src_stride;
-    pTmp += width;
-  }
-
-  // Vertical
-  pTmp = Temp;
-  for (i = 0; i < width; i++) {
-    unsigned char *pDst = dst + i;
-    for (j = 0; j < height; j++) {
-      int sum = 0;
-      for (k = 0; k < PRED_FILT_LEN; k++)
-        sum += pTmp[(j + k) * width] * pred_filter[k];
-      // Round
-      sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1);
-      pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 255 : sum);
-    }
-    ++pTmp;
-  }
-}
-#else
-// Based on vp9_post_proc_down_and_across_c (vp9_postproc.c)
-void filter_mb(unsigned char *src, int src_stride,
-               unsigned char *dst, int dst_stride,
-               int width, int height) {
-  unsigned char *pSrc, *pDst;
-  int row;
-  int col;
-  int i;
-  int v;
-  unsigned char d[8];
-
-  /* TODO flimit should be linked to the quantizer value */
-  int flimit = 7;
-
-  for (row = 0; row < height; row++) {
-    /* post_proc_down for one row */
-    pSrc = src;
-    pDst = dst;
-
-    for (col = 0; col < width; col++) {
-      int kernel = (1 << (filt_shift - 1));
-      int v = pSrc[col];
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - pSrc[col + i * src_stride]) > flimit)
-          goto down_skip_convolve;
-
-        kernel += pred_filter[2 + i] * pSrc[col + i * src_stride];
-      }
-
-      v = (kernel >> filt_shift);
-    down_skip_convolve:
-      pDst[col] = v;
-    }
-
-    /* now post_proc_across */
-    pSrc = dst;
-    pDst = dst;
-
-    for (i = 0; i < 8; i++)
-      d[i] = pSrc[i];
-
-    for (col = 0; col < width; col++) {
-      int kernel = (1 << (filt_shift - 1));
-      v = pSrc[col];
-
-      d[col & 7] = v;
-
-      for (i = -2; i <= 2; i++) {
-        if (abs(v - pSrc[col + i]) > flimit)
-          goto across_skip_convolve;
-
-        kernel += pred_filter[2 + i] * pSrc[col + i];
-      }
-
-      d[col & 7] = (kernel >> filt_shift);
-    across_skip_convolve:
-
-      if (col >= 2)
-        pDst[col - 2] = d[(col - 2) & 7];
-    }
-
-    /* handle the last two pixels */
-    pDst[col - 2] = d[(col - 2) & 7];
-    pDst[col - 1] = d[(col - 1) & 7];
-
-    /* next row */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-#endif  // !USE_THRESH_FILTER
-
-#endif  // CONFIG_PRED_FILTER
-
-/*encoder only*/
 void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
   int i, j;
   BLOCKD *blockd = xd->block;
@@ -524,13 +411,13 @@
     if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
       build_inter_predictors2b(xd, d0, 8);
     else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
+      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
     }
 
     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
+      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
     }
   }
 }
@@ -573,11 +460,11 @@
 
 /*encoder only*/
 void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             unsigned char *dst_y,
+                                             uint8_t *dst_y,
                                              int dst_ystride,
                                              int clamp_mvs) {
-  unsigned char *ptr_base = xd->pre.y_buffer;
-  unsigned char *ptr;
+  uint8_t *ptr_base = xd->pre.y_buffer;
+  uint8_t *ptr;
   int pre_stride = xd->block[0].pre_stride;
   int_mv ymv;
 
@@ -588,30 +475,7 @@
 
   ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
 
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
     if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
-      // Sub-pel filter needs extended input
-      int len = 15 + (VP9_INTERP_EXTEND << 1);
-      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-      unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
-
-      // Copy extended MB into Temp array, applying the spatial filter
-      filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                Temp, len, len, len);
-
-      // Sub-pel interpolation
-      xd->subpixel_predict16x16(pTemp, len,
-                                (ymv.as_mv.col & 7) << 1,
-                                (ymv.as_mv.row & 7) << 1,
-                                dst_y, dst_ystride);
-    } else {
-      // Apply spatial filter to create the prediction directly
-      filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16);
-    }
-  } else
-#endif
-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
       xd->subpixel_predict16x16(ptr, pre_stride,
                                 (ymv.as_mv.col & 7) << 1,
                                 (ymv.as_mv.row & 7) << 1,
@@ -622,11 +486,11 @@
 }
 
 void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
+                                              uint8_t *dst_u,
+                                              uint8_t *dst_v,
                                               int dst_uvstride) {
   int offset;
-  unsigned char *uptr, *vptr;
+  uint8_t *uptr, *vptr;
   int pre_stride = xd->block[0].pre_stride;
   int_mv _o16x16mv;
   int_mv _16x16mv;
@@ -659,37 +523,6 @@
   uptr = xd->pre.u_buffer + offset;
   vptr = xd->pre.v_buffer + offset;
 
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    int i;
-    unsigned char *pSrc = uptr;
-    unsigned char *pDst = dst_u;
-    int len = 7 + (VP9_INTERP_EXTEND << 1);
-    unsigned char Temp[32 * 32]; // Data required by the sub-pel filter
-    unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
-
-    // U & V
-    for (i = 0; i < 2; i++) {
-      if (_o16x16mv.as_int & 0x000f000f) {
-        // Copy extended MB into Temp array, applying the spatial filter
-        filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                  Temp, len, len, len);
-
-        // Sub-pel filter
-        xd->subpixel_predict8x8(pTemp, len,
-                                _o16x16mv.as_mv.col & 15,
-                                _o16x16mv.as_mv.row & 15,
-                                pDst, dst_uvstride);
-      } else {
-        filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8);
-      }
-
-      // V
-      pSrc = vptr;
-      pDst = dst_v;
-    }
-  } else
-#endif
     if (_o16x16mv.as_int & 0x000f000f) {
       xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
                               _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
@@ -703,9 +536,9 @@
 
 
 void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            unsigned char *dst_y,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
+                                            uint8_t *dst_y,
+                                            uint8_t *dst_u,
+                                            uint8_t *dst_v,
                                             int dst_ystride, int dst_uvstride) {
   vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
       xd->mode_info_context->mbmi.need_to_clamp_mvs);
@@ -712,11 +545,10 @@
   vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
 }
 
-#if CONFIG_SUPERBLOCKS
 void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                        unsigned char *dst_y,
-                                        unsigned char *dst_u,
-                                        unsigned char *dst_v,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
                                         int dst_ystride,
                                         int dst_uvstride) {
   uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
@@ -781,14 +613,77 @@
   }
 #endif
 }
+
+void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
+                                        uint8_t *dst_y,
+                                        uint8_t *dst_u,
+                                        uint8_t *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride) {
+  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+          *v2 = x->second_pre.v_buffer;
+  int edge[4], n;
+
+  edge[0] = x->mb_to_top_edge;
+  edge[1] = x->mb_to_bottom_edge;
+  edge[2] = x->mb_to_left_edge;
+  edge[3] = x->mb_to_right_edge;
+
+  for (n = 0; n < 4; n++) {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);
+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);
+    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);
+    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);
+
+    x->pre.y_buffer = y1 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
+    x->pre.u_buffer = u1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+    x->pre.v_buffer = v1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+
+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
+      x->second_pre.y_buffer = y2 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;
+      x->second_pre.u_buffer = u2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+      x->second_pre.v_buffer = v2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;
+    }
+
+    vp9_build_inter32x32_predictors_sb(x,
+        dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,
+        dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,
+        dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,
+        dst_ystride, dst_uvstride);
+  }
+
+  x->mb_to_top_edge    = edge[0];
+  x->mb_to_bottom_edge = edge[1];
+  x->mb_to_left_edge   = edge[2];
+  x->mb_to_right_edge  = edge[3];
+
+  x->pre.y_buffer = y1;
+  x->pre.u_buffer = u1;
+  x->pre.v_buffer = v1;
+
+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
+    x->second_pre.y_buffer = y2;
+    x->second_pre.u_buffer = u2;
+    x->second_pre.v_buffer = v2;
+  }
+
+#if CONFIG_COMP_INTERINTRA_PRED
+  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+    vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v,
+                                             dst_ystride, dst_uvstride);
+  }
 #endif
+}
 
 /*
  * The following functions should be called after an initial
  * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
- * It will run a second sixtap filter on a (different) ref
+ * It will run a second filter on a (different) ref
  * frame and average the result with the output of the
- * first sixtap filter. The second reference frame is stored
+ * first filter. The second reference frame is stored
  * in x->second_pre (the reference frame index is in
  * x->mode_info_context->mbmi.second_ref_frame). The second
  * motion vector is x->mode_info_context->mbmi.second_mv.
@@ -798,15 +693,15 @@
  * single reference framer.
  */
 void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                             unsigned char *dst_y,
+                                             uint8_t *dst_y,
                                              int dst_ystride) {
-  unsigned char *ptr;
+  uint8_t *ptr;
 
   int_mv _16x16mv;
   int mv_row;
   int mv_col;
 
-  unsigned char *ptr_base = xd->second_pre.y_buffer;
+  uint8_t *ptr_base = xd->second_pre.y_buffer;
   int pre_stride = xd->block[0].pre_stride;
 
   _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;
@@ -819,44 +714,20 @@
 
   ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
 
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    if ((mv_row | mv_col) & 7) {
-      // Sub-pel filter needs extended input
-      int len = 15 + (VP9_INTERP_EXTEND << 1);
-      unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-      unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
-
-      // Copy extended MB into Temp array, applying the spatial filter
-      filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                Temp, len, len, len);
-
-      // Sub-pel filter
-      xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1,
-                                    (mv_row & 7) << 1, dst_y, dst_ystride);
-    } else {
-      // TODO Needs to AVERAGE with the dst_y
-      // For now, do not apply the prediction filter in these cases!
-      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
-  } else
-#endif  // CONFIG_PRED_FILTER
-  {
-    if ((mv_row | mv_col) & 7) {
-      xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
-                                    (mv_row & 7) << 1, dst_y, dst_ystride);
-    } else {
-      vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
-    }
+  if ((mv_row | mv_col) & 7) {
+    xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
+                                  (mv_row & 7) << 1, dst_y, dst_ystride);
+  } else {
+    vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
   }
 }
 
 void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                              unsigned char *dst_u,
-                                              unsigned char *dst_v,
+                                              uint8_t *dst_u,
+                                              uint8_t *dst_v,
                                               int dst_uvstride) {
   int offset;
-  unsigned char *uptr, *vptr;
+  uint8_t *uptr, *vptr;
 
   int_mv _16x16mv;
   int mv_row;
@@ -887,37 +758,6 @@
   uptr = xd->second_pre.u_buffer + offset;
   vptr = xd->second_pre.v_buffer + offset;
 
-#if CONFIG_PRED_FILTER
-  if (xd->mode_info_context->mbmi.pred_filter_enabled) {
-    int i;
-    int len = 7 + (VP9_INTERP_EXTEND << 1);
-    unsigned char Temp[32 * 32]; // Data required by sub-pel filter
-    unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1);
-    unsigned char *pSrc = uptr;
-    unsigned char *pDst = dst_u;
-
-    // U & V
-    for (i = 0; i < 2; i++) {
-      if ((omv_row | omv_col) & 15) {
-        // Copy extended MB into Temp array, applying the spatial filter
-        filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride,
-                  Temp, len, len, len);
-
-        // Sub-pel filter
-        xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15,
-                                    omv_row & 15, pDst, dst_uvstride);
-      } else {
-        // TODO Needs to AVERAGE with the dst_[u|v]
-        // For now, do not apply the prediction filter here!
-        vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride);
-      }
-
-      // V
-      pSrc = vptr;
-      pDst = dst_v;
-    }
-  } else
-#endif  // CONFIG_PRED_FILTER
     if ((omv_row | omv_col) & 15) {
       xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
                                   omv_row & 15, dst_u, dst_uvstride);
@@ -930,9 +770,9 @@
 }
 
 void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                            unsigned char *dst_y,
-                                            unsigned char *dst_u,
-                                            unsigned char *dst_v,
+                                            uint8_t *dst_y,
+                                            uint8_t *dst_u,
+                                            uint8_t *dst_v,
                                             int dst_ystride,
                                             int dst_uvstride) {
   vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);
@@ -995,13 +835,13 @@
       if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
         build_inter_predictors2b(xd, d0, 16);
       else {
-        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict);
-        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict);
+        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);
+        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);
       }
 
       if (mbmi->second_ref_frame > 0) {
-        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg);
-        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg);
+        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);
+        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);
       }
     }
   }
@@ -1013,13 +853,13 @@
     if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
       build_inter_predictors2b(xd, d0, 8);
     else {
-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict);
-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict);
+      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
+      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
     }
 
     if (mbmi->second_ref_frame > 0) {
-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg);
-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg);
+      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
+      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
     }
   }
 }
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -11,50 +11,56 @@
 #ifndef VP9_COMMON_VP9_RECONINTER_H_
 #define VP9_COMMON_VP9_RECONINTER_H_
 
+#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
 extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    unsigned char *dst_y,
+                                                    uint8_t *dst_y,
                                                     int dst_ystride,
                                                     int clamp_mvs);
 
 extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
+                                                     uint8_t *dst_u,
+                                                     uint8_t *dst_v,
                                                      int dst_uvstride);
 
 extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   unsigned char *dst_y,
-                                                   unsigned char *dst_u,
-                                                   unsigned char *dst_v,
+                                                   uint8_t *dst_y,
+                                                   uint8_t *dst_u,
+                                                   uint8_t *dst_v,
                                                    int dst_ystride,
                                                    int dst_uvstride);
 
 extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
-                                                    unsigned char *dst_y,
+                                                    uint8_t *dst_y,
                                                     int dst_ystride);
 
 extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                     unsigned char *dst_u,
-                                                     unsigned char *dst_v,
+                                                     uint8_t *dst_u,
+                                                     uint8_t *dst_v,
                                                      int dst_uvstride);
 
 extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
-                                                   unsigned char *dst_y,
-                                                   unsigned char *dst_u,
-                                                   unsigned char *dst_v,
+                                                   uint8_t *dst_y,
+                                                   uint8_t *dst_u,
+                                                   uint8_t *dst_v,
                                                    int dst_ystride,
                                                    int dst_uvstride);
 
-#if CONFIG_SUPERBLOCKS
 extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
-                                               unsigned char *dst_y,
-                                               unsigned char *dst_u,
-                                               unsigned char *dst_v,
+                                               uint8_t *dst_y,
+                                               uint8_t *dst_u,
+                                               uint8_t *dst_v,
                                                int dst_ystride,
                                                int dst_uvstride);
-#endif
 
+extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
+                                               uint8_t *dst_y,
+                                               uint8_t *dst_u,
+                                               uint8_t *dst_v,
+                                               int dst_ystride,
+                                               int dst_uvstride);
+
 extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
 
 extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
@@ -75,4 +81,4 @@
                                      INTERPOLATIONFILTERTYPE filter,
                                      VP9_COMMON *cm);
 
-#endif  // __INC_RECONINTER_H
+#endif  // VP9_COMMON_VP9_RECONINTER_H_
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -124,18 +124,20 @@
     }
   }
   for (c = 0; c <= r; ++c) {
-    int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1];
-    int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1];
-    yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext));
-    yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext));
+    int yabove_ext = yabove_row[r];  // clip_pixel(2 * yabove_row[r] -
+                                     //            yabove_row[r - 1]);
+    int yleft_ext = yleft_col[r];  // clip_pixel(2 * yleft_col[r] -
+                                   //            yleft_col[r-1]);
     ypred_ptr[(r - c) * y_stride + c] =
       (yabove_ext * (c + 1) +
        yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);
   }
   for (r = 1; r < n; ++r) {
-    for (c = n - r; c < n; ++c)
-      ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] +
-                                     ypred_ptr[r * y_stride + c - 1] + 1) >> 1;
+    for (c = n - r; c < n; ++c) {
+      const int yabove_ext = ypred_ptr[(r - 1) * y_stride + c];
+      const int yleft_ext = ypred_ptr[r * y_stride + c - 1];
+      ypred_ptr[r * y_stride + c] = (yabove_ext + yleft_ext + 1) >> 1;
+    }
   }
 }
 
@@ -196,9 +198,9 @@
   }
 }
 
-static void corner_predictor(unsigned char *ypred_ptr, int y_stride, int n,
-                             unsigned char *yabove_row,
-                             unsigned char *yleft_col) {
+static void corner_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                             uint8_t *yabove_row,
+                             uint8_t *yleft_col) {
   int mh, mv, maxgradh, maxgradv, x, y, nx, ny;
   int i, j;
   int top_left = yabove_row[-1];
@@ -246,14 +248,14 @@
   }
 }
 
-void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride,
-                                         unsigned char *ypred_ptr,
+void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride,
+                                         uint8_t *ypred_ptr,
                                          int y_stride, int mode, int bsize,
                                          int up_available, int left_available) {
 
-  unsigned char *yabove_row = src - src_stride;
-  unsigned char yleft_col[32];
-  unsigned char ytop_left = yabove_row[-1];
+  uint8_t *yabove_row = src - src_stride;
+  uint8_t yleft_col[64];
+  uint8_t ytop_left = yabove_row[-1];
   int r, c, i;
 
   for (i = 0; i < bsize; i++) {
@@ -269,7 +271,8 @@
       int average = 0;
       int log2_bsize_minus_1;
 
-      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
+      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32 ||
+             bsize == 64);
       if (bsize == 4) {
         log2_bsize_minus_1 = 1;
       } else if (bsize == 8) {
@@ -276,8 +279,11 @@
         log2_bsize_minus_1 = 2;
       } else if (bsize == 16) {
         log2_bsize_minus_1 = 3;
-      } else /* bsize == 32 */ {
+      } else if (bsize == 32) {
         log2_bsize_minus_1 = 4;
+      } else {
+        assert(bsize == 64);
+        log2_bsize_minus_1 = 5;
       }
 
       if (up_available || left_available) {
@@ -321,15 +327,7 @@
     case TM_PRED: {
       for (r = 0; r < bsize; r++) {
         for (c = 0; c < bsize; c++) {
-          int pred =  yleft_col[r] + yabove_row[ c] - ytop_left;
-
-          if (pred < 0)
-            pred = 0;
-
-          if (pred > 255)
-            pred = 255;
-
-          ypred_ptr[c] = pred;
+          ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left);
         }
 
         ypred_ptr += y_stride;
@@ -374,9 +372,9 @@
 
 #if CONFIG_COMP_INTERINTRA_PRED
 static void combine_interintra(MB_PREDICTION_MODE mode,
-                               unsigned char *interpred,
+                               uint8_t *interpred,
                                int interstride,
-                               unsigned char *intrapred,
+                               uint8_t *intrapred,
                                int intrastride,
                                int size) {
   // TODO(debargha): Explore different ways of combining predictors
@@ -523,9 +521,10 @@
     71,  70,  70,  70,  69,  69,  69,  68,
     68,  68,  68,  68,  67,  67,  67,  67,
   };
-  int size_scale = (size == 32 ? 1 :
+  int size_scale = (size >= 32 ? 1 :
                     size == 16 ? 2 :
                     size == 8  ? 4 : 8);
+  int size_shift = size == 64 ? 1 : 0;
   int i, j;
   switch (mode) {
     case V_PRED:
@@ -532,7 +531,7 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights1d[i * size_scale];
+          int scale = weights1d[i * size_scale >> size_shift];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -545,7 +544,7 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights1d[j * size_scale];
+          int scale = weights1d[j * size_scale >> size_shift];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -559,8 +558,9 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = (weights2d[i * size_scale * 32 + j * size_scale] +
-                       weights1d[i * size_scale]) >> 1;
+          int scale = (weights2d[(i * size_scale * 32 +
+                                  j * size_scale) >> size_shift] +
+                       weights1d[i * size_scale >> size_shift]) >> 1;
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -574,8 +574,9 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = (weights2d[i * size_scale * 32 + j * size_scale] +
-                       weights1d[j * size_scale]) >> 1;
+          int scale = (weights2d[(i * size_scale * 32 +
+                                  j * size_scale) >> size_shift] +
+                       weights1d[j * size_scale >> size_shift]) >> 1;
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -588,7 +589,8 @@
       for (i = 0; i < size; ++i) {
         for (j = 0; j < size; ++j) {
           int k = i * interstride + j;
-          int scale = weights2d[i * size_scale * 32 + j * size_scale];
+          int scale = weights2d[(i * size_scale * 32 +
+                                 j * size_scale) >> size_shift];
           interpred[k] =
               ((scale_max - scale) * interpred[k] +
                scale * intrapred[i * intrastride + j] + scale_round)
@@ -613,9 +615,9 @@
 }
 
 void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                              unsigned char *ypred,
-                                              unsigned char *upred,
-                                              unsigned char *vpred,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
                                               int ystride, int uvstride) {
   vp9_build_interintra_16x16_predictors_mby(xd, ypred, ystride);
   vp9_build_interintra_16x16_predictors_mbuv(xd, upred, vpred, uvstride);
@@ -622,9 +624,9 @@
 }
 
 void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                               unsigned char *ypred,
+                                               uint8_t *ypred,
                                                int ystride) {
-  unsigned char intrapredictor[256];
+  uint8_t intrapredictor[256];
   vp9_build_intra_predictors_internal(
       xd->dst.y_buffer, xd->dst.y_stride,
       intrapredictor, 16,
@@ -635,11 +637,11 @@
 }
 
 void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                unsigned char *upred,
-                                                unsigned char *vpred,
+                                                uint8_t *upred,
+                                                uint8_t *vpred,
                                                 int uvstride) {
-  unsigned char uintrapredictor[64];
-  unsigned char vintrapredictor[64];
+  uint8_t uintrapredictor[64];
+  uint8_t vintrapredictor[64];
   vp9_build_intra_predictors_internal(
       xd->dst.u_buffer, xd->dst.uv_stride,
       uintrapredictor, 8,
@@ -656,11 +658,10 @@
                      vpred, uvstride, vintrapredictor, 8, 8);
 }
 
-#if CONFIG_SUPERBLOCKS
 void vp9_build_interintra_32x32_predictors_sby(MACROBLOCKD *xd,
-                                               unsigned char *ypred,
+                                               uint8_t *ypred,
                                                int ystride) {
-  unsigned char intrapredictor[1024];
+  uint8_t intrapredictor[1024];
   vp9_build_intra_predictors_internal(
       xd->dst.y_buffer, xd->dst.y_stride,
       intrapredictor, 32,
@@ -671,11 +672,11 @@
 }
 
 void vp9_build_interintra_32x32_predictors_sbuv(MACROBLOCKD *xd,
-                                                unsigned char *upred,
-                                                unsigned char *vpred,
+                                                uint8_t *upred,
+                                                uint8_t *vpred,
                                                 int uvstride) {
-  unsigned char uintrapredictor[256];
-  unsigned char vintrapredictor[256];
+  uint8_t uintrapredictor[256];
+  uint8_t vintrapredictor[256];
   vp9_build_intra_predictors_internal(
       xd->dst.u_buffer, xd->dst.uv_stride,
       uintrapredictor, 16,
@@ -693,17 +694,57 @@
 }
 
 void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                              unsigned char *ypred,
-                                              unsigned char *upred,
-                                              unsigned char *vpred,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
                                               int ystride,
                                               int uvstride) {
   vp9_build_interintra_32x32_predictors_sby(xd, ypred, ystride);
   vp9_build_interintra_32x32_predictors_sbuv(xd, upred, vpred, uvstride);
 }
-#endif
-#endif
 
+void vp9_build_interintra_64x64_predictors_sby(MACROBLOCKD *xd,
+                                               uint8_t *ypred,
+                                               int ystride) {
+  uint8_t intrapredictor[4096];
+  const int mode = xd->mode_info_context->mbmi.interintra_mode;
+  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+                                      intrapredictor, 64, mode, 64,
+                                      xd->up_available, xd->left_available);
+  combine_interintra(xd->mode_info_context->mbmi.interintra_mode,
+                     ypred, ystride, intrapredictor, 64, 64);
+}
+
+void vp9_build_interintra_64x64_predictors_sbuv(MACROBLOCKD *xd,
+                                                uint8_t *upred,
+                                                uint8_t *vpred,
+                                                int uvstride) {
+  uint8_t uintrapredictor[1024];
+  uint8_t vintrapredictor[1024];
+  const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;
+  vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
+                                      uintrapredictor, 32, mode, 32,
+                                      xd->up_available, xd->left_available);
+  vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
+                                      vintrapredictor, 32, mode, 32,
+                                      xd->up_available, xd->left_available);
+  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
+                     upred, uvstride, uintrapredictor, 32, 32);
+  combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,
+                     vpred, uvstride, vintrapredictor, 32, 32);
+}
+
+void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
+                                              uint8_t *ypred,
+                                              uint8_t *upred,
+                                              uint8_t *vpred,
+                                              int ystride,
+                                              int uvstride) {
+  vp9_build_interintra_64x64_predictors_sby(xd, ypred, ystride);
+  vp9_build_interintra_64x64_predictors_sbuv(xd, upred, vpred, uvstride);
+}
+#endif  // CONFIG_COMP_INTERINTRA_PRED
+
 void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->predictor, 16,
@@ -718,7 +759,6 @@
                                       xd->up_available, xd->left_available);
 }
 
-#if CONFIG_SUPERBLOCKS
 void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
                                       xd->dst.y_buffer, xd->dst.y_stride,
@@ -725,33 +765,17 @@
                                       xd->mode_info_context->mbmi.mode, 32,
                                       xd->up_available, xd->left_available);
 }
-#endif
 
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
-  unsigned char predictor[2][256];
-  int i;
-
+void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      predictor[0], 16,
-                                      xd->mode_info_context->mbmi.mode,
-                                      16, xd->up_available,
-                                      xd->left_available);
-  vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
-                                      predictor[1], 16,
-                                      xd->mode_info_context->mbmi.second_mode,
-                                      16, xd->up_available,
-                                      xd->left_available);
-
-  for (i = 0; i < 256; i++) {
-    xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
-  }
+                                      xd->dst.y_buffer, xd->dst.y_stride,
+                                      xd->mode_info_context->mbmi.mode, 64,
+                                      xd->up_available, xd->left_available);
 }
-#endif
 
 void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
-                                              unsigned char *upred_ptr,
-                                              unsigned char *vpred_ptr,
+                                              uint8_t *upred_ptr,
+                                              uint8_t *vpred_ptr,
                                               int uv_stride,
                                               int mode, int bsize) {
   vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
@@ -777,7 +801,6 @@
                                            8);
 }
 
-#if CONFIG_SUPERBLOCKS
 void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) {
   vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
                                            xd->dst.v_buffer, xd->dst.uv_stride,
@@ -784,78 +807,29 @@
                                            xd->mode_info_context->mbmi.uv_mode,
                                            16);
 }
-#endif
 
-#if CONFIG_COMP_INTRA_PRED
-void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
-  unsigned char predictor[2][2][64];
-  int i;
-
-  vp9_build_intra_predictors_mbuv_internal(
-      xd, predictor[0][0], predictor[1][0], 8,
-      xd->mode_info_context->mbmi.uv_mode, 8);
-  vp9_build_intra_predictors_mbuv_internal(
-      xd, predictor[0][1], predictor[1][1], 8,
-      xd->mode_info_context->mbmi.second_uv_mode, 8);
-  for (i = 0; i < 64; i++) {
-    xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1;
-    xd->predictor[256 + 64 + i] = (predictor[1][0][i] +
-                                   predictor[1][1][i] + 1) >> 1;
-  }
+void vp9_build_intra_predictors_sb64uv_s(MACROBLOCKD *xd) {
+  vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer,
+                                           xd->dst.v_buffer, xd->dst.uv_stride,
+                                           xd->mode_info_context->mbmi.uv_mode,
+                                           32);
 }
-#endif
 
 void vp9_intra8x8_predict(BLOCKD *xd,
                           int mode,
-                          unsigned char *predictor) {
+                          uint8_t *predictor) {
   vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
                                       xd->dst_stride, predictor, 16,
                                       mode, 8, 1, 1);
 }
 
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra8x8_predict(BLOCKD *xd,
-                               int mode, int second_mode,
-                               unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 16];
-  int i, j;
-
-  vp9_intra8x8_predict(xd, mode, predictor[0]);
-  vp9_intra8x8_predict(xd, second_mode, predictor[1]);
-
-  for (i = 0; i < 8 * 16; i += 16) {
-    for (j = i; j < i + 8; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
 void vp9_intra_uv4x4_predict(BLOCKD *xd,
                              int mode,
-                             unsigned char *predictor) {
+                             uint8_t *predictor) {
   vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
                                       xd->dst_stride, predictor, 8,
                                       mode, 4, 1, 1);
 }
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
-                                  int mode, int mode2,
-                                  unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 4];
-  int i, j;
-
-  vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
-  vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
-
-  for (i = 0; i < 4 * 8; i += 8) {
-    for (j = i; j < i + 4; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
 
 /* TODO: try different ways of use Y-UV mode correlation
    Current code assumes that a uv 4x4 block use same mode
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -11,34 +11,40 @@
 #ifndef VP9_COMMON_VP9_RECONINTRA_H_
 #define VP9_COMMON_VP9_RECONINTRA_H_
 
+#include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
 extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
-extern B_PREDICTION_MODE vp9_find_dominant_direction(unsigned char *ptr,
+extern B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
                                                      int stride, int n);
 extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
 #if CONFIG_COMP_INTERINTRA_PRED
 extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                                     unsigned char *ypred,
-                                                     unsigned char *upred,
-                                                     unsigned char *vpred,
+                                                     uint8_t *ypred,
+                                                     uint8_t *upred,
+                                                     uint8_t *vpred,
                                                      int ystride,
                                                      int uvstride);
 extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                                      unsigned char *ypred,
+                                                      uint8_t *ypred,
                                                       int ystride);
 extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                       unsigned char *upred,
-                                                       unsigned char *vpred,
+                                                       uint8_t *upred,
+                                                       uint8_t *vpred,
                                                        int uvstride);
-#if CONFIG_SUPERBLOCKS
+#endif  // CONFIG_COMP_INTERINTRA_PRED
+
 extern void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,
-                                                     unsigned char *ypred,
-                                                     unsigned char *upred,
-                                                     unsigned char *vpred,
+                                                     uint8_t *ypred,
+                                                     uint8_t *upred,
+                                                     uint8_t *vpred,
                                                      int ystride,
                                                      int uvstride);
-#endif
-#endif
+extern void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,
+                                                     uint8_t *ypred,
+                                                     uint8_t *upred,
+                                                     uint8_t *vpred,
+                                                     int ystride,
+                                                     int uvstride);
 
-#endif  // __INC_RECONINTRA_H
+#endif  // VP9_COMMON_VP9_RECONINTRA_H_
--- a/vp9/common/vp9_reconintra4x4.c
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -15,7 +15,7 @@
 #include "vp9_rtcd.h"
 
 #if CONFIG_NEWBINTRAMODES
-static int find_grad_measure(unsigned char *x, int stride, int n, int t,
+static int find_grad_measure(uint8_t *x, int stride, int n, int t,
                              int dx, int dy) {
   int i, j;
   int count = 0, gsum = 0, gdiv;
@@ -35,8 +35,8 @@
 }
 
 #if CONTEXT_PRED_REPLACEMENTS == 6
-B_PREDICTION_MODE vp9_find_dominant_direction(
-    unsigned char *ptr, int stride, int n) {
+B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
+                                              int stride, int n) {
   int g[8], i, imin, imax;
   g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);
   g[2] = find_grad_measure(ptr, stride, n, 4,  1, 1);
@@ -72,8 +72,8 @@
   }
 }
 #elif CONTEXT_PRED_REPLACEMENTS == 4
-B_PREDICTION_MODE vp9_find_dominant_direction(
-    unsigned char *ptr, int stride, int n) {
+B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
+                                              int stride, int n) {
   int g[8], i, imin, imax;
   g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);
   g[3] = find_grad_measure(ptr, stride, n, 4,  1, 2);
@@ -103,8 +103,8 @@
   }
 }
 #elif CONTEXT_PRED_REPLACEMENTS == 0
-B_PREDICTION_MODE vp9_find_dominant_direction(
-    unsigned char *ptr, int stride, int n) {
+B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,
+                                              int stride, int n) {
   int g[8], i, imin, imax;
   g[0] = find_grad_measure(ptr, stride, n, 4,  1, 0);
   g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);
@@ -145,7 +145,7 @@
 #endif
 
 B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {
-  unsigned char *ptr = *(x->base_dst) + x->dst;
+  uint8_t *ptr = *(x->base_dst) + x->dst;
   int stride = x->dst_stride;
   return vp9_find_dominant_direction(ptr, stride, 4);
 }
@@ -153,17 +153,17 @@
 
 void vp9_intra4x4_predict(BLOCKD *x,
                           int b_mode,
-                          unsigned char *predictor) {
+                          uint8_t *predictor) {
   int i, r, c;
 
-  unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
-  unsigned char Left[4];
-  unsigned char top_left = Above[-1];
+  uint8_t *above = *(x->base_dst) + x->dst - x->dst_stride;
+  uint8_t left[4];
+  uint8_t top_left = above[-1];
 
-  Left[0] = (*(x->base_dst))[x->dst - 1];
-  Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
-  Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
-  Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
+  left[0] = (*(x->base_dst))[x->dst - 1];
+  left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
+  left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
+  left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
 
 #if CONFIG_NEWBINTRAMODES
   if (b_mode == B_CONTEXT_PRED)
@@ -175,8 +175,8 @@
       int expected_dc = 0;
 
       for (i = 0; i < 4; i++) {
-        expected_dc += Above[i];
-        expected_dc += Left[i];
+        expected_dc += above[i];
+        expected_dc += left[i];
       }
 
       expected_dc = (expected_dc + 4) >> 3;
@@ -194,15 +194,7 @@
       /* prediction similar to true_motion prediction */
       for (r = 0; r < 4; r++) {
         for (c = 0; c < 4; c++) {
-          int pred = Above[c] - top_left + Left[r];
-
-          if (pred < 0)
-            pred = 0;
-
-          if (pred > 255)
-            pred = 255;
-
-          predictor[c] = pred;
+          predictor[c] = clip_pixel(above[c] - top_left + left[r]);
         }
 
         predictor += 16;
@@ -211,34 +203,31 @@
     break;
 
     case B_VE_PRED: {
-
       unsigned int ap[4];
-      ap[0] = Above[0];
-      ap[1] = Above[1];
-      ap[2] = Above[2];
-      ap[3] = Above[3];
 
+      ap[0] = above[0];
+      ap[1] = above[1];
+      ap[2] = above[2];
+      ap[3] = above[3];
+
       for (r = 0; r < 4; r++) {
         for (c = 0; c < 4; c++) {
-
           predictor[c] = ap[c];
         }
 
         predictor += 16;
       }
-
     }
     break;
 
-
     case B_HE_PRED: {
-
       unsigned int lp[4];
-      lp[0] = Left[0];
-      lp[1] = Left[1];
-      lp[2] = Left[2];
-      lp[3] = Left[3];
 
+      lp[0] = left[0];
+      lp[1] = left[1];
+      lp[2] = left[2];
+      lp[3] = left[3];
+
       for (r = 0; r < 4; r++) {
         for (c = 0; c < 4; c++) {
           predictor[c] = lp[r];
@@ -249,7 +238,8 @@
     }
     break;
     case B_LD_PRED: {
-      unsigned char *ptr = Above;
+      uint8_t *ptr = above;
+
       predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
       predictor[0 * 16 + 1] =
         predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
@@ -270,18 +260,17 @@
     }
     break;
     case B_RD_PRED: {
+      uint8_t pp[9];
 
-      unsigned char pp[9];
-
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
+      pp[0] = left[3];
+      pp[1] = left[2];
+      pp[2] = left[1];
+      pp[3] = left[0];
       pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
+      pp[5] = above[0];
+      pp[6] = above[1];
+      pp[7] = above[2];
+      pp[8] = above[3];
 
       predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
       predictor[3 * 16 + 1] =
@@ -303,20 +292,18 @@
     }
     break;
     case B_VR_PRED: {
+      uint8_t pp[9];
 
-      unsigned char pp[9];
-
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
+      pp[0] = left[3];
+      pp[1] = left[2];
+      pp[2] = left[1];
+      pp[3] = left[0];
       pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
+      pp[5] = above[0];
+      pp[6] = above[1];
+      pp[7] = above[2];
+      pp[8] = above[3];
 
-
       predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
       predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
       predictor[3 * 16 + 1] =
@@ -337,9 +324,8 @@
     }
     break;
     case B_VL_PRED: {
+      uint8_t *pp = above;
 
-      unsigned char *pp = Above;
-
       predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
       predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
       predictor[2 * 16 + 0] =
@@ -360,16 +346,17 @@
     break;
 
     case B_HD_PRED: {
-      unsigned char pp[9];
-      pp[0] = Left[3];
-      pp[1] = Left[2];
-      pp[2] = Left[1];
-      pp[3] = Left[0];
+      uint8_t pp[9];
+
+      pp[0] = left[3];
+      pp[1] = left[2];
+      pp[2] = left[1];
+      pp[3] = left[0];
       pp[4] = top_left;
-      pp[5] = Above[0];
-      pp[6] = Above[1];
-      pp[7] = Above[2];
-      pp[8] = Above[3];
+      pp[5] = above[0];
+      pp[6] = above[1];
+      pp[7] = above[2];
+      pp[8] = above[3];
 
 
       predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
@@ -393,7 +380,7 @@
 
 
     case B_HU_PRED: {
-      unsigned char *pp = Left;
+      uint8_t *pp = left;
       predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
       predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
       predictor[0 * 16 + 2] =
@@ -418,7 +405,7 @@
     break;
     /*
     case B_CORNER_PRED:
-    corner_predictor(predictor, 16, 4, Above, Left);
+    corner_predictor(predictor, 16, 4, above, left);
     break;
     */
 #endif
@@ -425,41 +412,31 @@
   }
 }
 
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra4x4_predict_c(BLOCKD *x,
-                               int b_mode, int b_mode2,
-                               unsigned char *out_predictor) {
-  unsigned char predictor[2][4 * 16];
-  int i, j;
-
-  vp9_intra4x4_predict(x, b_mode, predictor[0]);
-  vp9_intra4x4_predict(x, b_mode2, predictor[1]);
-
-  for (i = 0; i < 16 * 4; i += 16) {
-    for (j = i; j < i + 4; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
 /* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
  * to the right prediction have filled in pixels to use.
  */
 void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {
-  int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2);
-  unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
+  int extend_edge = xd->mb_to_right_edge == 0 && xd->mb_index < 2;
+  uint8_t *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -
                                xd->block[0].dst_stride + 16;
-  unsigned int *src_ptr = (unsigned int *)
-      (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0));
+  uint32_t *dst_ptr0 = (uint32_t *)above_right;
+  uint32_t *dst_ptr1 =
+    (uint32_t *)(above_right + 4 * xd->block[0].dst_stride);
+  uint32_t *dst_ptr2 =
+    (uint32_t *)(above_right + 8 * xd->block[0].dst_stride);
+  uint32_t *dst_ptr3 =
+    (uint32_t *)(above_right + 12 * xd->block[0].dst_stride);
 
-  unsigned int *dst_ptr0 = (unsigned int *)above_right;
-  unsigned int *dst_ptr1 =
-    (unsigned int *)(above_right + 4 * xd->block[0].dst_stride);
-  unsigned int *dst_ptr2 =
-    (unsigned int *)(above_right + 8 * xd->block[0].dst_stride);
-  unsigned int *dst_ptr3 =
-    (unsigned int *)(above_right + 12 * xd->block[0].dst_stride);
+  uint32_t *src_ptr = (uint32_t *) above_right;
+
+  if ((xd->sb_index >= 2 && xd->mb_to_right_edge == 0) ||
+      (xd->sb_index == 3 && xd->mb_index & 1))
+    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 32 *
+                                                    xd->block[0].dst_stride);
+  if (xd->mb_index == 3 ||
+      (xd->mb_to_right_edge == 0 && xd->mb_index == 2))
+    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 16 *
+                                                    xd->block[0].dst_stride);
 
   if (extend_edge) {
     *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;
--- a/vp9/common/vp9_reconintra4x4.h
+++ b/vp9/common/vp9_reconintra4x4.h
@@ -14,4 +14,4 @@
 
 extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);
 
-#endif
+#endif  // VP9_COMMON_VP9_RECONINTRA4X4_H_
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -4,6 +4,8 @@
  * VP9
  */
 
+#include "vpx/vpx_integer.h"
+
 struct loop_filter_info;
 struct blockd;
 struct macroblockd;
@@ -21,10 +23,10 @@
 }
 forward_decls vp9_common_forward_decls
 
-prototype void vp9_filter_block2d_4x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const unsigned char *src_ptr, const unsigned int src_stride, const short *HFilter_aligned16, const short *VFilter_aligned16, unsigned char *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
+prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
 
 # At the very least, MSVC 2008 has compiler bug exhibited by this code; code
 # compiles warning free but a dissassembly of generated code show bugs. To be
@@ -45,70 +47,76 @@
 prototype void vp9_dequantize_b_2x2 "struct blockd *x"
 specialize vp9_dequantize_b_2x2
 
-prototype void vp9_dequant_dc_idct_add_y_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, const short *dc, struct macroblockd *xd"
+prototype void vp9_dequant_dc_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dc, struct macroblockd *xd"
 specialize vp9_dequant_dc_idct_add_y_block_8x8
 
-prototype void vp9_dequant_idct_add_y_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, struct macroblockd *xd"
+prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, struct macroblockd *xd"
 specialize vp9_dequant_idct_add_y_block_8x8
 
-prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, const short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs, struct macroblockd *xd"
+prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs, struct macroblockd *xd"
 specialize vp9_dequant_idct_add_uv_block_8x8
 
-prototype void vp9_dequant_idct_add_16x16 "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"
+prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"
 specialize vp9_dequant_idct_add_16x16
 
-prototype void vp9_dequant_idct_add_8x8 "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int dc, unsigned short eobs"
+prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc, int eob"
 specialize vp9_dequant_idct_add_8x8
 
-prototype void vp9_dequant_idct_add "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"
+prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride"
 specialize vp9_dequant_idct_add
 
-prototype void vp9_dequant_dc_idct_add "short *input, const short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc"
+prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc"
 specialize vp9_dequant_dc_idct_add
 
-prototype void vp9_dequant_dc_idct_add_y_block "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs, const short *dc"
+prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs"
 specialize vp9_dequant_dc_idct_add_y_block
 
-prototype void vp9_dequant_idct_add_y_block "short *q, const short *dq, unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs"
+prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"
 specialize vp9_dequant_idct_add_y_block
 
-prototype void vp9_dequant_idct_add_uv_block "short *q, const short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs"
+prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
 specialize vp9_dequant_idct_add_uv_block
 
+prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob"
+specialize vp9_dequant_idct_add_32x32
+
+prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"
+specialize vp9_dequant_idct_add_uv_block_16x16
+
 #
 # RECON
 #
-prototype void vp9_copy_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem16x16 mmx sse2 dspr2
 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
 
-prototype void vp9_copy_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x8 mmx dspr2
 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
 
-prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_avg_mem16x16 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_avg_mem16x16
 
-prototype void vp9_avg_mem8x8 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_avg_mem8x8
 
-prototype void vp9_copy_mem8x4 "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"
+prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx dspr2
 vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
 
-prototype void vp9_recon_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_b
 
-prototype void vp9_recon_uv_b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+prototype void vp9_recon_uv_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon_uv_b
 
-prototype void vp9_recon2b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+prototype void vp9_recon2b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon2b sse2
 
-prototype void vp9_recon4b "unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride"
+prototype void vp9_recon4b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
 specialize vp9_recon4b sse2
 
 prototype void vp9_recon_mb "struct macroblockd *x"
@@ -117,12 +125,18 @@
 prototype void vp9_recon_mby "struct macroblockd *x"
 specialize vp9_recon_mby
 
-prototype void vp9_recon_mby_s "struct macroblockd *x, unsigned char *dst"
+prototype void vp9_recon_mby_s "struct macroblockd *x, uint8_t *dst"
 specialize vp9_recon_mby_s
 
-prototype void vp9_recon_mbuv_s "struct macroblockd *x, unsigned char *udst, unsigned char *vdst"
+prototype void vp9_recon_mbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
 specialize void vp9_recon_mbuv_s
 
+prototype void vp9_recon_sby_s "struct macroblockd *x, uint8_t *dst"
+specialize vp9_recon_sby_s
+
+prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"
+specialize void vp9_recon_sbuv_s
+
 prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
 specialize vp9_build_intra_predictors_mby_s
 
@@ -135,9 +149,6 @@
 prototype void vp9_build_intra_predictors_mby "struct macroblockd *x"
 specialize vp9_build_intra_predictors_mby;
 
-prototype void vp9_build_comp_intra_predictors_mby "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mby;
-
 prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"
 specialize vp9_build_intra_predictors_mby_s;
 
@@ -147,262 +158,283 @@
 prototype void vp9_build_intra_predictors_mbuv_s "struct macroblockd *x"
 specialize vp9_build_intra_predictors_mbuv_s;
 
-prototype void vp9_build_comp_intra_predictors_mbuv "struct macroblockd *x"
-specialize vp9_build_comp_intra_predictors_mbuv;
+prototype void vp9_build_intra_predictors_sb64y_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sb64y_s;
 
-prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x"
+specialize vp9_build_intra_predictors_sb64uv_s;
+
+prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra4x4_predict;
 
-prototype void vp9_comp_intra4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra4x4_predict;
-
-prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra8x8_predict;
 
-prototype void vp9_comp_intra8x8_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra8x8_predict;
-
-prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, unsigned char *predictor"
+prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra_uv4x4_predict;
 
-prototype void vp9_comp_intra_uv4x4_predict "struct blockd *x, int b_mode, int second_mode, unsigned char *predictor"
-specialize vp9_comp_intra_uv4x4_predict;
-
 #
 # Loopfilter
 #
-prototype void vp9_loop_filter_mbv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+prototype void vp9_loop_filter_mbv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_mbv sse2
 
-prototype void vp9_loop_filter_bv "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+prototype void vp9_loop_filter_bv "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_bv sse2
 
-prototype void vp9_loop_filter_bv8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+prototype void vp9_loop_filter_bv8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_bv8x8 sse2
 
-prototype void vp9_loop_filter_mbh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+prototype void vp9_loop_filter_mbh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_mbh sse2
 
-prototype void vp9_loop_filter_bh "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+prototype void vp9_loop_filter_bh "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_bh sse2
 
-prototype void vp9_loop_filter_bh8x8 "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
+prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
 specialize vp9_loop_filter_bh8x8 sse2
 
-prototype void vp9_loop_filter_simple_mbv "unsigned char *y, int ystride, const unsigned char *blimit"
+prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
 specialize vp9_loop_filter_simple_mbv mmx sse2
 vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
 vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
 vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
 
-prototype void vp9_loop_filter_simple_mbh "unsigned char *y, int ystride, const unsigned char *blimit"
+prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
 specialize vp9_loop_filter_simple_mbh mmx sse2
 vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
 vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
 vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
 
-prototype void vp9_loop_filter_simple_bv "unsigned char *y, int ystride, const unsigned char *blimit"
+prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
 specialize vp9_loop_filter_simple_bv mmx sse2
 vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
 vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
 vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
 
-prototype void vp9_loop_filter_simple_bh "unsigned char *y, int ystride, const unsigned char *blimit"
+prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
 specialize vp9_loop_filter_simple_bh mmx sse2
 vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
 vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
 vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
 
+prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_lpf_mbh_w sse2
+
+prototype void vp9_lpf_mbv_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
+specialize vp9_lpf_mbv_w sse2
+
 #
 # post proc
 #
 if [ "$CONFIG_POSTPROC" = "yes" ]; then
-prototype void vp9_mbpost_proc_down "unsigned char *dst, int pitch, int rows, int cols, int flimit"
+prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit"
 specialize vp9_mbpost_proc_down mmx sse2
 vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm
 
-prototype void vp9_mbpost_proc_across_ip "unsigned char *src, int pitch, int rows, int cols, int flimit"
+prototype void vp9_mbpost_proc_across_ip "uint8_t *src, int pitch, int rows, int cols, int flimit"
 specialize vp9_mbpost_proc_across_ip sse2
 vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm
 
-prototype void vp9_post_proc_down_and_across "unsigned char *src_ptr, unsigned char *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
+prototype void vp9_post_proc_down_and_across "uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"
 specialize vp9_post_proc_down_and_across mmx sse2
 vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm
 
-prototype void vp9_plane_add_noise "unsigned char *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
+prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"
 specialize vp9_plane_add_noise mmx sse2
 vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt
 fi
 
-prototype void vp9_blend_mb_inner "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
 specialize vp9_blend_mb_inner
 
-prototype void vp9_blend_mb_outer "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
 specialize vp9_blend_mb_outer
 
-prototype void vp9_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
+prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride"
 specialize vp9_blend_b
 
 #
 # sad 16x3, 3x16
 #
-prototype unsigned int vp9_sad16x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride"
+prototype unsigned int vp9_sad16x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
 specialize vp9_sad16x3 sse2
 
-prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride"
+prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"
 specialize vp9_sad3x16 sse2
 
+prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x2 sse2
+
 #
 # Sub Pixel Filters
 #
-prototype void vp9_eighttap_predict16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict16x16
 
-prototype void vp9_eighttap_predict8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict8x8
 
-prototype void vp9_eighttap_predict_avg16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict_avg16x16
 
-prototype void vp9_eighttap_predict_avg8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict_avg8x8
 
-prototype void vp9_eighttap_predict_avg4x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict_avg4x4
 
-prototype void vp9_eighttap_predict8x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict8x4
 
-prototype void vp9_eighttap_predict "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict
+prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict4x4
 
-prototype void vp9_eighttap_predict16x16_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict16x16_sharp
 
-prototype void vp9_eighttap_predict8x8_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict8x8_sharp
 
-prototype void vp9_eighttap_predict_avg16x16_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict_avg16x16_sharp
 
-prototype void vp9_eighttap_predict_avg8x8_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict_avg8x8_sharp
 
-prototype void vp9_eighttap_predict_avg4x4_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict_avg4x4_sharp
 
-prototype void vp9_eighttap_predict8x4_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_eighttap_predict8x4_sharp
 
-prototype void vp9_eighttap_predict_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_eighttap_predict_sharp
+prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict4x4_sharp
 
-prototype void vp9_sixtap_predict16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict16x16_smooth
+
+prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict8x8_smooth
+
+prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg16x16_smooth
+
+prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg8x8_smooth
+
+prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg4x4_smooth
+
+prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict8x4_smooth
+
+prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict4x4_smooth
+
+prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_sixtap_predict16x16
 
-prototype void vp9_sixtap_predict8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_sixtap_predict8x8
 
-prototype void vp9_sixtap_predict_avg16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_sixtap_predict_avg16x16
 
-prototype void vp9_sixtap_predict_avg8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_sixtap_predict_avg8x8
 
-prototype void vp9_sixtap_predict8x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_sixtap_predict8x4
 
-prototype void vp9_sixtap_predict "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict
+prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict4x4
 
-prototype void vp9_sixtap_predict_avg "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_sixtap_predict_avg
+prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict_avg4x4
 
-prototype void vp9_bilinear_predict16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict16x16 mmx sse2
+prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict16x16 sse2
 
-prototype void vp9_bilinear_predict8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict8x8 mmx sse2
+prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict8x8 sse2
 
-prototype void vp9_bilinear_predict_avg16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_bilinear_predict_avg16x16
 
-prototype void vp9_bilinear_predict_avg8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_bilinear_predict_avg8x8
 
-prototype void vp9_bilinear_predict8x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict8x4 mmx
+prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict8x4
 
-prototype void vp9_bilinear_predict4x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
-specialize vp9_bilinear_predict4x4 mmx
+prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict4x4
 
-prototype void vp9_bilinear_predict_avg4x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"
 specialize vp9_bilinear_predict_avg4x4
 
 #
 # dct
 #
-prototype void vp9_short_idct4x4llm_1 "short *input, short *output, int pitch"
+prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct4x4llm_1
 
-prototype void vp9_short_idct4x4llm "short *input, short *output, int pitch"
+prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct4x4llm
 
-prototype void vp9_short_idct8x8 "short *input, short *output, int pitch"
+prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct8x8
 
-prototype void vp9_short_idct10_8x8 "short *input, short *output, int pitch"
+prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct10_8x8
 
-prototype void vp9_short_ihaar2x2 "short *input, short *output, int pitch"
+prototype void vp9_short_ihaar2x2 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_ihaar2x2
 
-prototype void vp9_short_idct16x16 "short *input, short *output, int pitch"
+prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct16x16
 
-prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch"
+prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_idct10_16x16
 
-prototype void vp9_ihtllm "const short *input, short *output, int pitch, int tx_type, int tx_dim, short eobs"
+prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
+specialize vp9_short_idct32x32
+
+prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
 specialize vp9_ihtllm
 
 #
 # 2nd order
 #
-prototype void vp9_short_inv_walsh4x4_1 "short *in, short *out"
+prototype void vp9_short_inv_walsh4x4_1 "int16_t *in, int16_t *out"
 specialize vp9_short_inv_walsh4x4_1
 
-prototype void vp9_short_inv_walsh4x4 "short *in, short *out"
+prototype void vp9_short_inv_walsh4x4 "int16_t *in, int16_t *out"
 specialize vp9_short_inv_walsh4x4_
 
 
 # dct and add
-prototype void vp9_dc_only_idct_add_8x8 "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
+prototype void vp9_dc_only_idct_add_8x8 "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add_8x8
 
-prototype void vp9_dc_only_idct_add "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
+prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add
 
 if [ "$CONFIG_LOSSLESS" = "yes" ]; then
-prototype void vp9_short_inv_walsh4x4_1_x8 "short *input, short *output, int pitch"
-prototype void vp9_short_inv_walsh4x4_x8 "short *input, short *output, int pitch"
-prototype void vp9_dc_only_inv_walsh_add "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
-prototype void vp9_short_inv_walsh4x4_1_lossless "short *in, short *out"
-prototype void vp9_short_inv_walsh4x4_lossless "short *in, short *out"
+prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
+prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"
+prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
+prototype void vp9_short_inv_walsh4x4_1_lossless "int16_t *in, int16_t *out"
+prototype void vp9_short_inv_walsh4x4_lossless "int16_t *in, int16_t *out"
 fi
 
-
-
-if [ "$CONFIG_SUPERBLOCKS" = "yes" ]; then
-
-prototype unsigned int vp9_sad32x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
 
-prototype unsigned int vp9_sad3x32 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
+prototype unsigned int vp9_sad3x32 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad3x32
 
-fi
-
 #
 # Encoder functions below this point.
 #
@@ -412,154 +444,181 @@
 # variance
 [ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
 
-prototype unsigned int vp9_variance32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance32x32
 
-prototype unsigned int vp9_variance16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance64x64
+
+prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance16x16 mmx sse2
 vp9_variance16x16_sse2=vp9_variance16x16_wmt
 vp9_variance16x16_mmx=vp9_variance16x16_mmx
 
-prototype unsigned int vp9_variance16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance16x8 mmx sse2
 vp9_variance16x8_sse2=vp9_variance16x8_wmt
 vp9_variance16x8_mmx=vp9_variance16x8_mmx
 
-prototype unsigned int vp9_variance8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x16 mmx sse2
 vp9_variance8x16_sse2=vp9_variance8x16_wmt
 vp9_variance8x16_mmx=vp9_variance8x16_mmx
 
-prototype unsigned int vp9_variance8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance8x8 mmx sse2
 vp9_variance8x8_sse2=vp9_variance8x8_wmt
 vp9_variance8x8_mmx=vp9_variance8x8_mmx
 
-prototype unsigned int vp9_variance4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance4x4 mmx sse2
 vp9_variance4x4_sse2=vp9_variance4x4_wmt
 vp9_variance4x4_mmx=vp9_variance4x4_mmx
 
-prototype unsigned int vp9_sub_pixel_variance32x32 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance64x64
+
+prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_variance32x32
 
-prototype unsigned int vp9_sub_pixel_variance16x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
 vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt
 
-prototype unsigned int vp9_sub_pixel_variance8x16 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x16 sse2 mmx
 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
 
-prototype unsigned int vp9_sub_pixel_variance16x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
 
-prototype unsigned int vp9_sub_pixel_variance8x8 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_variance8x8 sse2 mmx
 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
 
-prototype unsigned int vp9_sub_pixel_variance4x4 "const unsigned char *src_ptr, int source_stride, int xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_variance4x4 sse2 mmx
 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
 
-prototype unsigned int vp9_sad32x32 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
+specialize vp9_sad64x64
+
+prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad32x32
 
-prototype unsigned int vp9_sad16x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad16x16 mmx sse2 sse3
 vp9_sad16x16_sse2=vp9_sad16x16_wmt
 
-prototype unsigned int vp9_sad16x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad16x8 mmx sse2
 vp9_sad16x8_sse2=vp9_sad16x8_wmt
 
-prototype unsigned int vp9_sad8x16 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad8x16 mmx sse2
 vp9_sad8x16_sse2=vp9_sad8x16_wmt
 
-prototype unsigned int vp9_sad8x8 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad8x8 mmx sse2
 vp9_sad8x8_sse2=vp9_sad8x8_wmt
 
-prototype unsigned int vp9_sad4x4 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int max_sad"
+prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"
 specialize vp9_sad4x4 mmx sse2
 vp9_sad4x4_sse2=vp9_sad4x4_wmt
 
-prototype unsigned int vp9_variance_halfpixvar16x16_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar16x16_h mmx sse2
 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
 
-prototype unsigned int vp9_variance_halfpixvar16x16_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar16x16_v mmx sse2
 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
 
-prototype unsigned int vp9_variance_halfpixvar16x16_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar16x16_hv mmx sse2
 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
 
-prototype unsigned int vp9_variance_halfpixvar32x32_h "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar64x64_h
+
+prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar64x64_v
+
+prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
+specialize vp9_variance_halfpixvar64x64_hv
+
+prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar32x32_h
 
-prototype unsigned int vp9_variance_halfpixvar32x32_v "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance_halfpixvar32x32_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar32x32_v
 
-prototype unsigned int vp9_variance_halfpixvar32x32_hv "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse"
+prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
 specialize vp9_variance_halfpixvar32x32_hv
 
-prototype void vp9_sad32x32x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad64x64x3
+
+prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x3
 
-prototype void vp9_sad16x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x16x3 sse3 ssse3
 
-prototype void vp9_sad16x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x8x3 sse3 ssse3
 
-prototype void vp9_sad8x16x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x16x3 sse3
 
-prototype void vp9_sad8x8x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x3 sse3
 
-prototype void vp9_sad4x4x3 "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x3 sse3
 
-prototype void vp9_sad32x32x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
+specialize vp9_sad64x64x8
+
+prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad32x32x8
 
-prototype void vp9_sad16x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad16x16x8 sse4
 
-prototype void vp9_sad16x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad16x8x8 sse4
 
-prototype void vp9_sad8x16x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad8x16x8 sse4
 
-prototype void vp9_sad8x8x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad8x8x8 sse4
 
-prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
+prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"
 specialize vp9_sad4x4x8 sse4
 
-prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
+specialize vp9_sad64x64x4d
+
+prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d
 
-prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x16x4d sse3
 
-prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x8x4d sse3
 
-prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x16x4d sse3
 
-prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x4d sse3
 
-prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse3
 
 #
@@ -567,23 +626,26 @@
 #
 case $arch in
     x86*)
-    prototype void vp9_copy32xn "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, int n"
+    prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n"
     specialize vp9_copy32xn sse2 sse3
     ;;
 esac
 
-prototype unsigned int vp9_sub_pixel_mse16x16 "const unsigned char  *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
 specialize vp9_sub_pixel_mse16x16 sse2 mmx
 vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt
 
-prototype unsigned int vp9_mse16x16 "const unsigned char *src_ptr, int  source_stride, const unsigned char *ref_ptr, int  recon_stride, unsigned int *sse"
+prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"
 specialize vp9_mse16x16 mmx sse2
 vp9_mse16x16_sse2=vp9_mse16x16_wmt
 
-prototype unsigned int vp9_sub_pixel_mse32x32 "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse"
+prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_mse64x64
+
+prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
 specialize vp9_sub_pixel_mse32x32
 
-prototype unsigned int vp9_get_mb_ss "const short *"
+prototype unsigned int vp9_get_mb_ss "const int16_t *"
 specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
 prototype int vp9_mbblock_error "struct macroblock *mb, int dc"
@@ -590,7 +652,7 @@
 specialize vp9_mbblock_error mmx sse2
 vp9_mbblock_error_sse2=vp9_mbblock_error_xmm
 
-prototype int vp9_block_error "short *coeff, short *dqcoeff, int block_size"
+prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
 specialize vp9_block_error mmx sse2
 vp9_block_error_sse2=vp9_block_error_xmm
 
@@ -604,10 +666,10 @@
 prototype void vp9_subtract_b "struct block *be, struct blockd *bd, int pitch"
 specialize vp9_subtract_b mmx sse2
 
-prototype void vp9_subtract_mby "short *diff, unsigned char *src, unsigned char *pred, int stride"
+prototype void vp9_subtract_mby "int16_t *diff, uint8_t *src, uint8_t *pred, int stride"
 specialize vp9_subtract_mby mmx sse2
 
-prototype void vp9_subtract_mbuv "short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride"
+prototype void vp9_subtract_mbuv "int16_t *diff, uint8_t *usrc, uint8_t *vsrc, uint8_t *pred, int stride"
 specialize vp9_subtract_mbuv mmx sse2
 
 #
@@ -616,42 +678,45 @@
 if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
     [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
 
-    prototype void vp9_ssim_parms_8x8 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
     specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
 
-    prototype void vp9_ssim_parms_16x16 "unsigned char *s, int sp, unsigned char *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
+    prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
     specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
 fi
 
 # fdct functions
-prototype void vp9_fht "const short *input, int pitch, short *output, int tx_type, int tx_dim"
+prototype void vp9_fht "const int16_t *input, int pitch, int16_t *output, int tx_type, int tx_dim"
 specialize vp9_fht
 
-prototype void vp9_short_fdct8x8 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct8x8
 
-prototype void vp9_short_fhaar2x2 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_fhaar2x2 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fhaar2x2
 
-prototype void vp9_short_fdct4x4 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct4x4
 
-prototype void vp9_short_fdct8x4 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct8x4
 
-prototype void vp9_short_walsh4x4 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_walsh4x4
 
-prototype void vp9_short_fdct16x16 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_fdct32x32
+
+prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct16x16
 
-prototype void vp9_short_walsh4x4_lossless "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_walsh4x4_lossless "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_walsh4x4_lossless
 
-prototype void vp9_short_walsh4x4_x8 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_walsh4x4_x8
 
-prototype void vp9_short_walsh8x4_x8 "short *InputData, short *OutputData, int pitch"
+prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_walsh8x4_x8
 
 #
@@ -670,7 +735,7 @@
 specialize vp9_diamond_search_sad sse3
 vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
 
-prototype void vp9_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
+prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
 specialize vp9_temporal_filter_apply sse2
 
 prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
--- a/vp9/common/vp9_sadmxn.h
+++ b/vp9/common/vp9_sadmxn.h
@@ -11,14 +11,14 @@
 #ifndef VP9_COMMON_VP9_SADMXN_H_
 #define VP9_COMMON_VP9_SADMXN_H_
 
-static __inline
-unsigned int sad_mx_n_c(
-  const unsigned char *src_ptr,
-  int  src_stride,
-  const unsigned char *ref_ptr,
-  int  ref_stride,
-  int m,
-  int n) {
+#include "vpx/vpx_integer.h"
+
+static __inline unsigned int sad_mx_n_c(const uint8_t *src_ptr,
+                                        int src_stride,
+                                        const uint8_t *ref_ptr,
+                                        int ref_stride,
+                                        int m,
+                                        int n) {
   int r, c;
   unsigned int sad = 0;
 
@@ -34,4 +34,4 @@
   return sad;
 }
 
-#endif
+#endif  // VP9_COMMON_VP9_SADMXN_H_
--- a/vp9/common/vp9_seg_common.c
+++ b/vp9/common/vp9_seg_common.c
@@ -14,7 +14,7 @@
 
 static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };
 static const int seg_feature_data_max[SEG_LVL_MAX] =
-                 { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX - 1};
+                 { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1};
 
 // These functions provide access to new segment level features.
 // Eventually these function may be "optimized out" but for the moment,
--- a/vp9/common/vp9_seg_common.h
+++ b/vp9/common/vp9_seg_common.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_type_aliases.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_blockd.h"
 
@@ -60,5 +59,5 @@
 
 int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);
 
-#endif /* __INC_SEG_COMMON_H__ */
+#endif  // VP9_COMMON_VP9_SEG_COMMON_H_
 
--- a/vp9/common/vp9_setupintrarecon.c
+++ b/vp9/common/vp9_setupintrarecon.c
@@ -18,14 +18,14 @@
   /* set up frame new frame for intra coded blocks */
   vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
   for (i = 0; i < ybf->y_height; i++)
-    ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129;
+    ybf->y_buffer[ybf->y_stride * i - 1] = (uint8_t) 129;
 
   vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
   for (i = 0; i < ybf->uv_height; i++)
-    ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+    ybf->u_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
 
   vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
   for (i = 0; i < ybf->uv_height; i++)
-    ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129;
+    ybf->v_buffer[ybf->uv_stride * i - 1] = (uint8_t) 129;
 
 }
--- a/vp9/common/vp9_setupintrarecon.h
+++ b/vp9/common/vp9_setupintrarecon.h
@@ -8,6 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP9_COMMON_VP9_SETUPINTRARECON_H_
+#define VP9_COMMON_VP9_SETUPINTRARECON_H_
 
 #include "vpx_scale/yv12config.h"
+
 extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
+
+#endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_
--- a/vp9/common/vp9_subpelvar.h
+++ b/vp9/common/vp9_subpelvar.h
@@ -8,14 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP9_COMMON_VP9_SUBPELVAR_H_
+#define VP9_COMMON_VP9_SUBPELVAR_H_
 
 #include "vp9/common/vp9_filter.h"
 
-
-
-static void variance(const unsigned char *src_ptr,
+static void variance(const uint8_t *src_ptr,
                      int  source_stride,
-                     const unsigned char *ref_ptr,
+                     const uint8_t *ref_ptr,
                      int  recon_stride,
                      int  w,
                      int  h,
@@ -43,14 +43,14 @@
  *
  *  ROUTINE       : filter_block2d_bil_first_pass
  *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp9_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
+ *                  uint32_t src_pixels_per_line : Stride of input block.
+ *                  uint32_t pixel_step        : Offset between filter input samples (see notes).
+ *                  uint32_t output_height     : Input block height.
+ *                  uint32_t output_width      : Input block width.
+ *                  int32_t  *vp9_filter          : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
+ *  OUTPUTS       : int32_t *output_ptr        : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
@@ -59,7 +59,7 @@
  *                  filtered output block. Used to implement first-pass
  *                  of 2-D separable filter.
  *
- *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
+ *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
  *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
  *                  pixel_step defines whether the filter is applied
  *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
@@ -67,13 +67,13 @@
  *                  to the next.
  *
  ****************************************************************************/
-static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
-                                              unsigned short *output_ptr,
+static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
+                                              uint16_t *output_ptr,
                                               unsigned int src_pixels_per_line,
                                               int pixel_step,
                                               unsigned int output_height,
                                               unsigned int output_width,
-                                              const short *vp9_filter) {
+                                              const int16_t *vp9_filter) {
   unsigned int i, j;
 
   for (i = 0; i < output_height; i++) {
@@ -95,14 +95,14 @@
  *
  *  ROUTINE       : filter_block2d_bil_second_pass
  *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp9_filter          : Array of 2 bi-linear filter taps.
+ *  INPUTS        : int32_t  *src_ptr          : Pointer to source block.
+ *                  uint32_t src_pixels_per_line : Stride of input block.
+ *                  uint32_t pixel_step        : Offset between filter input samples (see notes).
+ *                  uint32_t output_height     : Input block height.
+ *                  uint32_t output_width      : Input block width.
+ *                  int32_t  *vp9_filter          : Array of 2 bi-linear filter taps.
  *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *  OUTPUTS       : uint16_t *output_ptr       : Pointer to filtered block.
  *
  *  RETURNS       : void
  *
@@ -119,13 +119,13 @@
  *                  to the next.
  *
  ****************************************************************************/
-static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
-                                               unsigned char *output_ptr,
+static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
+                                               uint8_t *output_ptr,
                                                unsigned int src_pixels_per_line,
                                                unsigned int pixel_step,
                                                unsigned int output_height,
                                                unsigned int output_width,
-                                               const short *vp9_filter) {
+                                               const int16_t *vp9_filter) {
   unsigned int  i, j;
   int  Temp;
 
@@ -145,3 +145,4 @@
   }
 }
 
+#endif  // VP9_COMMON_VP9_SUBPELVAR_H_
--- a/vp9/common/vp9_subpixel.h
+++ b/vp9/common/vp9_subpixel.h
@@ -8,14 +8,13 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_SUBPIXEL_H_
 #define VP9_COMMON_VP9_SUBPIXEL_H_
 
 #define prototype_subpixel_predict(sym) \
-  void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
-           unsigned char *dst, int dst_pitch)
+  void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \
+           uint8_t *dst, int dst_pitch)
 
 typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
 
-#endif
+#endif  // VP9_COMMON_VP9_SUBPIXEL_H_
--- a/vp9/common/vp9_swapyv12buffer.c
+++ b/vp9/common/vp9_swapyv12buffer.c
@@ -12,7 +12,7 @@
 
 void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
                           YV12_BUFFER_CONFIG *last_frame) {
-  unsigned char *temp;
+  uint8_t *temp;
 
   temp = last_frame->buffer_alloc;
   last_frame->buffer_alloc = new_frame->buffer_alloc;
--- a/vp9/common/vp9_swapyv12buffer.h
+++ b/vp9/common/vp9_swapyv12buffer.h
@@ -16,4 +16,4 @@
 void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame,
                           YV12_BUFFER_CONFIG *last_frame);
 
-#endif  // __SWAPYV12_BUFFER_H
+#endif  // VP9_COMMON_VP9_SWAPYV12BUFFER_H_
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -7,9 +7,14 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 
+#ifdef _MSC_VER
+#include <math.h>
+#endif
+
 #include "./vpx_config.h"
 #if ARCH_X86 || ARCH_X86_64
 void vpx_reset_mmx_state(void);
@@ -18,6 +23,17 @@
 #define vp9_clear_system_state()
 #endif
 
+#ifdef _MSC_VER
+// round is not defined in MSVC
+static int round(double x) {
+  if (x < 0)
+    return (int)ceil(x - 0.5);
+  else
+    return (int)floor(x + 0.5);
+}
+#endif
+
 struct VP9Common;
 void vp9_machine_specific_config(struct VP9Common *);
-#endif
+
+#endif  // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
--- a/vp9/common/vp9_textblit.h
+++ b/vp9/common/vp9_textblit.h
@@ -16,4 +16,4 @@
 extern void vp9_blit_line(int x0, int x1, int y0, int y1,
                           unsigned char *image, const int pitch);
 
-#endif  // __INC_TEXTBLIT_H
+#endif  // VP9_COMMON_VP9_TEXTBLIT_H_
--- a/vp9/common/vp9_treecoder.c
+++ b/vp9/common/vp9_treecoder.c
@@ -100,9 +100,7 @@
   vp9_tree tree,
   vp9_prob probs          [ /* n-1 */ ],
   unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfac,
-  int rd
+  const unsigned int num_events[ /* n */ ]
 ) {
   const int tree_len = n - 1;
   int t = 0;
@@ -110,29 +108,6 @@
   branch_counts(n, tok, tree, branch_ct, num_events);
 
   do {
-    const unsigned int *const c = branch_ct[t];
-    const unsigned int tot = c[0] + c[1];
-
-#if CONFIG_DEBUG
-    assert(tot < (1 << 24));        /* no overflow below */
-#endif
-
-    if (tot) {
-      const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
-      probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
-    } else
-      probs[t] = vp9_prob_half;
+    probs[t] = get_binary_prob(branch_ct[t][0], branch_ct[t][1]);
   } while (++t < tree_len);
-}
-
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
-  int tot_count = counts[0] + counts[1];
-  vp9_prob prob;
-  if (tot_count) {
-    prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
-    prob += !prob;
-  } else {
-    prob = 128;
-  }
-  return prob;
 }
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -8,30 +8,19 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_COMMON_VP9_TREECODER_H_
 #define VP9_COMMON_VP9_TREECODER_H_
 
-typedef unsigned char vp9_prob;
+#include "vpx/vpx_integer.h"
 
-#define vp9_prob_half ( (vp9_prob) 128)
+typedef uint8_t vp9_prob;
 
-typedef signed char vp9_tree_index;
-struct bool_coder_spec;
+#define vp9_prob_half ((vp9_prob) 128)
 
-typedef struct bool_coder_spec bool_coder_spec;
-typedef struct bool_writer bool_writer;
-typedef struct bool_reader bool_reader;
+typedef int8_t vp9_tree_index;
 
-typedef const bool_coder_spec c_bool_coder_spec;
-typedef const bool_writer c_bool_writer;
-typedef const bool_reader c_bool_reader;
+#define vp9_complement(x) (255 - x)
 
-
-
-# define vp9_complement( x) (255 - x)
-
-
 /* We build coding trees compactly in arrays.
    Each node of the tree is a pair of vp9_tree_indices.
    Array index often references a corresponding probability table.
@@ -41,7 +30,6 @@
 
 typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
 
-
 typedef const struct vp9_token_struct {
   int value;
   int Len;
@@ -53,31 +41,33 @@
 void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
                                  int offset);
 
-
 /* Convert array of token occurrence counts into a table of probabilities
    for the associated binary encoding tree.  Also writes count of branches
    taken for each node on the tree; this facilitiates decisions as to
    probability updates. */
 
-void vp9_tree_probs_from_distribution(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok               [ /* n */ ],
-  vp9_tree tree,
-  vp9_prob probs          [ /* n-1 */ ],
-  unsigned int branch_ct       [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfactor,
-  int Round
-);
+void vp9_tree_probs_from_distribution(int n,  /* n = size of alphabet */
+                                      vp9_token tok[ /* n */ ],
+                                      vp9_tree tree,
+                                      vp9_prob probs[ /* n - 1 */ ],
+                                      unsigned int branch_ct[ /* n - 1 */ ][2],
+                                      const unsigned int num_events[ /* n */ ]);
 
-static __inline int clip_prob(int p) {
-  if (p > 255)
-    return 255;
-  else if (p < 1)
-    return 1;
-  return p;
+static __inline vp9_prob clip_prob(int p) {
+  return (p > 255) ? 255u : (p < 1) ? 1u : p;
 }
 
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
+static __inline vp9_prob get_prob(int num, int den) {
+  return (den == 0) ? 128u : clip_prob((num * 256 + (den >> 1)) / den);
+}
 
-#endif
+static __inline vp9_prob get_binary_prob(int n0, int n1) {
+  return get_prob(n0, n0 + n1);
+}
+
+/* this function assumes prob1 and prob2 are already within [1,255] range */
+static __inline vp9_prob weighted_prob(int prob1, int prob2, int factor) {
+  return (prob1 * (256 - factor) + prob2 * factor + 128) >> 8;
+}
+
+#endif  // VP9_COMMON_VP9_TREECODER_H_
--- a/vp9/common/vp9_type_aliases.h
+++ /dev/null
@@ -1,122 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-*   Module Title :     vp9_type_aliases.h
-*
-*   Description  :     Standard type aliases
-*
-****************************************************************************/
-#ifndef VP9_COMMON_VP9_TYPE_ALIASES_H_
-#define VP9_COMMON_VP9_TYPE_ALIASES_H_
-
-/****************************************************************************
-* Macros
-****************************************************************************/
-#define EXPORT
-#define IMPORT          extern      /* Used to declare imported data & routines */
-#define PRIVATE         static      /* Used to declare & define module-local data */
-#define LOCAL           static      /* Used to define all persistent routine-local data */
-#define STD_IN_PATH     0           /* Standard input path */
-#define STD_OUT_PATH    1           /* Standard output path */
-#define STD_ERR_PATH    2           /* Standard error path */
-#define STD_IN_FILE     stdin       /* Standard input file pointer */
-#define STD_OUT_FILE    stdout      /* Standard output file pointer */
-#define STD_ERR_FILE    stderr      /* Standard error file pointer */
-#define max_int         0x7FFFFFFF
-
-#define __export
-#define _export
-
-#define CCONV
-
-#ifndef NULL
-#ifdef __cplusplus
-#define NULL    0
-#else
-#define NULL    ((void *)0)
-#endif
-#endif
-
-#ifndef FALSE
-#define FALSE   0
-#endif
-
-#ifndef TRUE
-#define TRUE    1
-#endif
-
-/****************************************************************************
-* Typedefs
-****************************************************************************/
-#ifndef TYPE_INT8
-#define TYPE_INT8
-typedef signed char     INT8;
-#endif
-
-#ifndef TYPE_INT16
-/*#define TYPE_INT16*/
-typedef signed short    INT16;
-#endif
-
-#ifndef TYPE_INT32
-/*#define TYPE_INT32*/
-typedef signed int      INT32;
-#endif
-
-#ifndef TYPE_UINT8
-/*#define TYPE_UINT8*/
-typedef unsigned char   UINT8;
-#endif
-
-#ifndef TYPE_UINT32
-/*#define TYPE_UINT32*/
-typedef unsigned int    UINT32;
-#endif
-
-#ifndef TYPE_UINT16
-/*#define TYPE_UINT16*/
-typedef unsigned short  UINT16;
-#endif
-
-#ifndef TYPE_BOOL
-/*#define TYPE_BOOL*/
-typedef int             BOOL;
-#endif
-
-typedef unsigned char   BOOLEAN;
-
-#ifdef _MSC_VER
-typedef __int64 INT64;
-#if _MSC_VER < 1600
-#ifndef INT64_MAX
-#define INT64_MAX LLONG_MAX
-#endif
-#endif
-#else
-
-#ifndef TYPE_INT64
-#ifdef _TMS320C6X
-/* for now we only have 40bits */
-typedef long INT64;
-#else
-typedef long long INT64;
-#endif
-#endif
-
-#endif
-
-/* Floating point */
-typedef  double         FLOAT64;
-typedef  float          FLOAT32;
-
-#endif
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -15,8 +15,6 @@
 
 extern const short vp9_six_tap_mmx[8][6 * 8];
 
-extern const short vp9_bilinear_filters_8x_mmx[8][2 * 8];
-
 extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,
                                       unsigned short  *output_ptr,
                                       unsigned int     src_pixels_per_line,
@@ -95,8 +93,6 @@
                                              unsigned int   output_height,
                                              const short   *vp9_filter);
 
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-
 ///////////////////////////////////////////////////////////////////////////
 // the mmx function that does the bilinear filtering and var calculation //
 // int one pass                                                          //
@@ -231,26 +227,6 @@
                              16, 8, 4, 8, vfilter);
   vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
                              16, 8, 4, 8, vfilter);
-}
-
-void vp9_bilinear_predict16x16_mmx(unsigned char  *src_ptr,
-                                   int  src_pixels_per_line,
-                                   int  xoffset,
-                                   int  yoffset,
-                                   unsigned char *dst_ptr,
-                                   int  dst_pitch) {
-  vp9_bilinear_predict8x8_mmx(src_ptr,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + 8, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + dst_pitch * 8, dst_pitch);
-  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
-                              src_pixels_per_line, xoffset, yoffset,
-                              dst_ptr + dst_pitch * 8 + 8, dst_pitch);
 }
 #endif
 
--- a/vp9/common/x86/vp9_loopfilter_x86.c
+++ b/vp9/common/x86/vp9_loopfilter_x86.c
@@ -85,6 +85,480 @@
 #endif
 
 #if HAVE_SSE2
+
+void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
+                                       int p,
+                                       const unsigned char *_blimit,
+                                       const unsigned char *_limit,
+                                       const unsigned char *_thresh) {
+  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
+  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
+
+  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+  __m128i mask, hev, flat, flat2;
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i p7, p6, p5;
+  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+  __m128i q5, q6, q7;
+  int i = 0;
+  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
+  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
+  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
+  const __m128i thresh =
+      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
+  const __m128i limit =
+      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
+  const __m128i blimit =
+      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
+
+  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);
+    // mask |= (abs(p1 - p0) > limit) * -1;
+    // mask |= (abs(q1 - q0) > limit) * -1;
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);
+
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                     _mm_subs_epu8(p0, p2)),
+                         _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                      _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                     _mm_subs_epu8(p0, p3)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                      _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+                                     _mm_subs_epu8(p0, p4)),
+                         _mm_or_si128(_mm_subs_epu8(q4, q0),
+                                      _mm_subs_epu8(q0, q4)));
+    flat = _mm_max_epu8(work, flat);
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);
+    flat = _mm_and_si128(flat, mask);
+  }
+
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // calculate flat2
+  p4 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+  p3 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+  p2 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+//  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+//  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+  q4 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i one = _mm_set1_epi8(1);
+    __m128i work;
+    flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                     _mm_subs_epu8(p0, p2)),
+                         _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                      _mm_subs_epu8(q0, q2)));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                     _mm_subs_epu8(p0, p3)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                      _mm_subs_epu8(q0, q3)));
+    flat2 = _mm_max_epu8(work, flat2);
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+                                     _mm_subs_epu8(p0, p4)),
+                         _mm_or_si128(_mm_subs_epu8(q4, q0),
+                                      _mm_subs_epu8(q0, q4)));
+    flat2 = _mm_max_epu8(work, flat2);
+    flat2 = _mm_subs_epu8(flat2, one);
+    flat2 = _mm_cmpeq_epi8(flat2, zero);
+    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
+  }
+  // calculate flat2
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  {
+    const __m128i four = _mm_set1_epi16(4);
+    unsigned char *src = s;
+    i = 0;
+    do {
+      __m128i workp_a, workp_b, workp_shft;
+      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
+
+      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_op0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      src += 8;
+    } while (++i < 2);
+  }
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // wide flat
+  // TODO(slavarnway): interleave with the flat pixel calculations (see above)
+  {
+    const __m128i eight = _mm_set1_epi16(8);
+    unsigned char *src = s;
+    int i = 0;
+    do {
+      __m128i workp_a, workp_b, workp_shft;
+      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);
+      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);
+      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);
+      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
+      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);
+      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);
+      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);
+
+
+      workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
+      workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);
+      workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));
+      workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
+      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      src += 8;
+    } while (++i < 2);
+  }
+  // wide flat
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);
+    const __m128i te0 = _mm_set1_epi8(0xe0);
+    const __m128i t1f = _mm_set1_epi8(0x1f);
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);
+
+    __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    /* Filter1 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter1);
+    filter1 = _mm_srli_epi16(filter1, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);
+    filter1 = _mm_or_si128(filter1, work_a);
+
+    /* Filter2 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter2);
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+
+    /* filt >> 1 */
+    filt = _mm_adds_epi8(filter1, t1);
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);
+
+    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+    // write out op6 - op3
+    {
+      unsigned char *dst = (s - 7 * p);
+      for (i = 6; i > 2; i--) {
+        __m128i flat2_output;
+        work_a = _mm_loadu_si128((__m128i *)dst);
+        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
+        work_a = _mm_andnot_si128(flat2, work_a);
+        flat2_output = _mm_and_si128(flat2, flat2_output);
+        work_a = _mm_or_si128(work_a, flat2_output);
+        _mm_storeu_si128((__m128i *)dst, work_a);
+        dst += p;
+      }
+    }
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+    p2 = _mm_load_si128((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    work_a = _mm_or_si128(work_a, p2);
+    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
+    work_a = _mm_andnot_si128(flat2, work_a);
+    p2 = _mm_and_si128(flat2, p2);
+    p2 = _mm_or_si128(work_a, p2);
+    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+
+    p1 = _mm_load_si128((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, ps1);
+    p1 = _mm_and_si128(flat, p1);
+    work_a = _mm_or_si128(work_a, p1);
+    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
+    work_a = _mm_andnot_si128(flat2, work_a);
+    p1 = _mm_and_si128(flat2, p1);
+    p1 = _mm_or_si128(work_a, p1);
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+
+    p0 = _mm_load_si128((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, ps0);
+    p0 = _mm_and_si128(flat, p0);
+    work_a = _mm_or_si128(work_a, p0);
+    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
+    work_a = _mm_andnot_si128(flat2, work_a);
+    p0 = _mm_and_si128(flat2, p0);
+    p0 = _mm_or_si128(work_a, p0);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+
+    q0 = _mm_load_si128((__m128i *)flat_oq0);
+    work_a = _mm_andnot_si128(flat, qs0);
+    q0 = _mm_and_si128(flat, q0);
+    work_a = _mm_or_si128(work_a, q0);
+    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
+    work_a = _mm_andnot_si128(flat2, work_a);
+    q0 = _mm_and_si128(flat2, q0);
+    q0 = _mm_or_si128(work_a, q0);
+    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+    q1 = _mm_load_si128((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, qs1);
+    q1 = _mm_and_si128(flat, q1);
+    work_a = _mm_or_si128(work_a, q1);
+    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
+    work_a = _mm_andnot_si128(flat2, work_a);
+    q1 = _mm_and_si128(flat2, q1);
+    q1 = _mm_or_si128(work_a, q1);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+    q2 = _mm_load_si128((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    work_a = _mm_or_si128(work_a, q2);
+    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
+    work_a = _mm_andnot_si128(flat2, work_a);
+    q2 = _mm_and_si128(flat2, q2);
+    q2 = _mm_or_si128(work_a, q2);
+    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+    // write out oq3 - oq7
+    {
+      unsigned char *dst = (s + 3 * p);
+      for (i = 3; i < 7; i++) {
+        __m128i flat2_output;
+        work_a = _mm_loadu_si128((__m128i *)dst);
+        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
+        work_a = _mm_andnot_si128(flat2, work_a);
+        flat2_output = _mm_and_si128(flat2, flat2_output);
+        work_a = _mm_or_si128(work_a, flat2_output);
+        _mm_storeu_si128((__m128i *)dst, work_a);
+        dst += p;
+      }
+    }
+  }
+}
+
 void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                                             int p,
                                             const unsigned char *_blimit,
@@ -562,6 +1036,38 @@
   transpose(src, 16, dst, p, 2);
 }
 
+void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
+                                          int p,
+                                          const unsigned char *blimit,
+                                          const unsigned char *limit,
+                                          const unsigned char *thresh) {
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+  unsigned char *src[4];
+  unsigned char *dst[4];
+
+  /* Transpose 16x16 */
+  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
+  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
+
+  /* Loop filtering */
+  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
+                                           thresh);
+
+  src[0] = t_dst;
+  src[1] = t_dst + 8 * 16;
+  src[2] = t_dst + 8;
+  src[3] = t_dst + 8 * 16 + 8;
+
+  dst[0] = s - 8;
+  dst[1] = s - 8 + 8;
+  dst[2] = s - 8 + p * 8;
+  dst[3] = s - 8 + p * 8 + 8;
+
+  /* Transpose 16x16 */
+  transpose(src, 16, dst, p, 4);
+}
+
+
 void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
                                              int p,
                                              const unsigned char *blimit,
@@ -604,11 +1110,30 @@
                                               lfi->lim, lfi->hev_thr, v_ptr);
 }
 
+
+void vp9_lpf_mbh_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                           unsigned char *v_ptr, int y_stride, int uv_stride,
+                           struct loop_filter_info *lfi) {
+  vp9_mb_lpf_horizontal_edge_w_sse2(y_ptr, y_stride,
+                                      lfi->mblim, lfi->lim, lfi->hev_thr);
+
+  /* u,v */
+  if (u_ptr)
+    vp9_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
+                                              lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
 void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
   vp9_mbloop_filter_horizontal_edge_sse2(
     y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+
+  if (u_ptr)
+    vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride,
+                                            lfi->blim, lfi->lim, lfi->hev_thr,
+                                            v_ptr + 4 * uv_stride);
 }
 
 /* Vertical MB Filtering */
@@ -624,11 +1149,30 @@
                                             lfi->lim, lfi->hev_thr, v_ptr);
 }
 
+
+void vp9_lpf_mbv_w_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
+                   unsigned char *v_ptr, int y_stride, int uv_stride,
+                   struct loop_filter_info *lfi) {
+  vp9_mb_lpf_vertical_edge_w_sse2(y_ptr, y_stride,
+                                    lfi->mblim, lfi->lim, lfi->hev_thr);
+
+  /* u,v */
+  if (u_ptr)
+    vp9_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim,
+                                            lfi->lim, lfi->hev_thr, v_ptr);
+}
+
+
 void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr,
                              unsigned char *v_ptr, int y_stride, int uv_stride,
                              struct loop_filter_info *lfi) {
   vp9_mbloop_filter_vertical_edge_sse2(
     y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride,
+                                          lfi->blim, lfi->lim, lfi->hev_thr,
+                                          v_ptr + 4);
 }
 
 /* Horizontal B Filtering */
--- /dev/null
+++ b/vp9/common/x86/vp9_subpel_variance_impl_sse2.asm
@@ -1,0 +1,645 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+;void vp9_filter_block2d_bil_var_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int  xoffset,
+;    int  yoffset,
+;    int *sum,
+;    unsigned int *sumsquared;;
+;
+;)
+global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
+sym(vp9_filter_block2d_bil_var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+        pxor            xmm6,           xmm6                 ;
+        pxor            xmm7,           xmm7                 ;
+
+        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
+        movdqa          xmm4,           XMMWORD PTR [rsi]
+
+        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
+        movsxd          rax,            dword ptr arg(5)     ; xoffset
+
+        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
+        je              filter_block2d_bil_var_sse2_sp_only
+
+        shl             rax,            5                    ; point to filter coeff with xoffset
+        lea             rax,            [rax + rcx]          ; HFilter
+
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
+        je              filter_block2d_bil_var_sse2_fp_only
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        movq            xmm3,           QWORD PTR [rsi+1]    ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]                ;
+        punpcklbw       xmm3,           xmm0
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift     ;
+        movdqa          xmm5,           xmm1
+
+        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
+        lea             rsi,            [rsi + rbx]
+%if ABI_IS_32BIT=0
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_var_sse2_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        movq            xmm3,           QWORD PTR [rsi+1]             ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4               ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movdqa          xmm3,           xmm5                 ;
+        movdqa          xmm5,           xmm1                 ;
+
+        pmullw          xmm3,           [rdx]               ;
+        pmullw          xmm1,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
+%if ABI_IS_32BIT
+        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
+%else
+        lea             rdi,            [rdi + r9]
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_var_sse2_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_sp_only:
+        movsxd          rdx,            dword ptr arg(6)     ; yoffset
+
+        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
+        je              filter_block2d_bil_var_sse2_full_pixel
+
+        shl             rdx,            5
+        lea             rdx,            [rdx + rcx]          ; VFilter
+
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]      ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        lea             rsi,            [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+        movq            xmm3,           QWORD PTR [rsi]             ;
+        punpcklbw       xmm3,           xmm0                 ;
+        movdqa          xmm5,           xmm3
+
+        pmullw          xmm1,           [rdx]               ;
+        pmullw          xmm3,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4                 ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        movdqa          xmm1,           xmm5                 ;
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_sp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_full_pixel:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+        pxor            xmm0,           xmm0                 ;
+
+filter_block2d_bil_full_pixel_loop:
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        movq            xmm2,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm2,           xmm0                 ;
+
+        psubw           xmm1,           xmm2                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_full_pixel_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_var_sse2_fp_only:
+        mov             rsi,            arg(0)               ;ref_ptr
+        mov             rdi,            arg(2)               ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)     ;Height
+        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                 ;
+        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
+
+filter_block2d_bil_fp_only_loop:
+        movq            xmm1,           QWORD PTR [rsi]       ;
+        movq            xmm3,           QWORD PTR [rsi+1]     ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           xmm4  ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]     ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+        lea             rsi,            [rsi + rdx]
+        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_fp_only_loop       ;
+
+        jmp             filter_block2d_bil_variance
+
+filter_block2d_bil_variance:
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(7) ; sum
+        mov             rdi,            arg(8) ; sumsquared
+
+        movd            [rsi],          mm2    ; xsum
+        movd            [rdi],          mm4    ; xxsum
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void vp9_half_horiz_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
+sym(vp9_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
+
+        lea             rsi,            [rsi + rax]
+
+.half_horiz_vert_variance16x_h_1:
+        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
+        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
+
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+
+        movq            xmm3,           QWORD PTR [rdi+8]
+        punpcklbw       xmm3,           xmm0
+        psubw           xmm4,           xmm3
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_vert_variance16x_h_1    ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
+sym(vp9_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse eaccumulator
+        mov             rsi,            arg(0)              ;ref_ptr
+
+        mov             rdi,            arg(2)              ;src_ptr
+        movsxd          rcx,            dword ptr arg(4)    ;Height
+        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        movdqu          xmm5,           XMMWORD PTR [rsi]
+        lea             rsi,            [rsi + rax          ]
+        pxor            xmm0,           xmm0
+
+.half_vert_variance16x_h_1:
+        movdqu          xmm3,           XMMWORD PTR [rsi]
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm4,           xmm5
+        punpcklbw       xmm5,           xmm0
+        punpckhbw       xmm4,           xmm0
+
+        movq            xmm2,           QWORD PTR [rdi]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm5,           xmm2
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+        psubw           xmm4,           xmm2
+
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm4
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm4,           xmm4
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm4
+
+        movdqa          xmm5,           xmm3
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1
+        jnz             .half_vert_variance16x_h_1
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp9_half_horiz_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
+sym(vp9_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+.half_horiz_variance16x_h_1:
+        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
+        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        movdqa          xmm1,           xmm5
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+        punpckhbw       xmm1,           xmm0
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+        movq            xmm2,           QWORD PTR [rdi+8]
+        punpcklbw       xmm2,           xmm0
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        psubw           xmm1,           xmm2
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        paddw           xmm6,           xmm1
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        pmaddwd         xmm1,           xmm1
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+        paddd           xmm7,           xmm1
+
+        lea             rsi,            [rsi + rax]
+        lea             rdi,            [rdi + rdx]
+
+        sub             rcx,            1                   ;
+        jnz             .half_horiz_variance16x_h_1         ;
+
+        pxor        xmm1,           xmm1
+        pxor        xmm5,           xmm5
+
+        punpcklwd   xmm0,           xmm6
+        punpckhwd   xmm1,           xmm6
+        psrad       xmm0,           16
+        psrad       xmm1,           16
+        paddd       xmm0,           xmm1
+        movdqa      xmm1,           xmm0
+
+        movdqa      xmm6,           xmm7
+        punpckldq   xmm6,           xmm5
+        punpckhdq   xmm7,           xmm5
+        paddd       xmm6,           xmm7
+
+        punpckldq   xmm0,           xmm5
+        punpckhdq   xmm1,           xmm5
+        paddd       xmm0,           xmm1
+
+        movdqa      xmm7,           xmm6
+        movdqa      xmm1,           xmm0
+
+        psrldq      xmm7,           8
+        psrldq      xmm1,           8
+
+        paddd       xmm6,           xmm7
+        paddd       xmm0,           xmm1
+
+        mov         rsi,            arg(5) ;[Sum]
+        mov         rdi,            arg(6) ;[SSE]
+
+        movd        [rsi],       xmm0
+        movd        [rdi],       xmm6
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
+align 16
+xmm_bi_rd:
+    times 8 dw 64
+align 16
+bilinear_filters_sse2:
+    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
+    dw 120, 120, 120, 120, 120, 120, 120, 120,  8,  8,  8,  8,  8,  8,  8,  8
+    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
+    dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
+    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
+    dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
+    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
+    dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
+    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
+    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
+    dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
+    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
+    dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
+    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
+    dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ b/vp9/common/x86/vp9_subpixel_mmx.asm
@@ -202,438 +202,6 @@
     pop         rbp
     ret
 
-
-;void bilinear_predict8x8_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;   unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_mmx) PRIVATE
-sym(vp9_bilinear_predict8x8_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        shl         rax,        5 ; offset * 32
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-
-        shl         rax,        5 ; offset*32
-        add         rax,        rcx ; VFilter
-
-        lea         rcx,        [rdi+rdx*8]          ;
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x8:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8                  ;dst_pitch
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x8
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void bilinear_predict8x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x4_mmx) PRIVATE
-sym(vp9_bilinear_predict8x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-        shl         rax,        5
-
-        mov         rsi,        arg(0) ;src_ptr              ;
-        add         rax,        rcx
-
-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_8x4:
-        movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        movq        mm4,        mm3                 ; make a copy of current line
-
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-        punpckhbw   mm4,        mm0                 ;
-
-        pmullw      mm3,        mm1                 ;
-        pmullw      mm4,        mm1                 ;
-
-        movq        mm5,        [rsi+1]             ;
-        movq        mm6,        mm5                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0                 ;
-
-        pmullw      mm5,        mm2                 ;
-        pmullw      mm6,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-        movq        mm5,        mm7                 ;
-        movq        mm6,        mm7                 ;
-
-        punpcklbw   mm5,        mm0                 ;
-        punpckhbw   mm6,        mm0
-
-        pmullw      mm5,        [rax]               ;
-        pmullw      mm6,        [rax]               ;
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm4                 ;
-
-
-        pmullw      mm3,        [rax+16]            ;
-        pmullw      mm4,        [rax+16]            ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm4,        mm6                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        paddw       mm4,        [GLOBAL(rd)]                 ;
-        psraw       mm4,        VP9_FILTER_SHIFT        ;
-
-        packuswb    mm3,        mm4
-
-        movq        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_8x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void bilinear_predict4x4_mmx
-;(
-;    unsigned char  *src_ptr,
-;    int   src_pixels_per_line,
-;    int  xoffset,
-;    int  yoffset,
-;    unsigned char *dst_ptr,
-;    int dst_pitch
-;)
-global sym(vp9_bilinear_predict4x4_mmx) PRIVATE
-sym(vp9_bilinear_predict4x4_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
-
-        movsxd      rax,        dword ptr arg(2) ;xoffset
-        mov         rdi,        arg(4) ;dst_ptr           ;
-
-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
-        shl         rax,        5
-
-        add         rax,        rcx ; HFilter
-        mov         rsi,        arg(0) ;src_ptr              ;
-
-        movsxd      rdx,        dword ptr arg(5) ;ldst_pitch
-        movq        mm1,        [rax]               ;
-
-        movq        mm2,        [rax+16]            ;
-        movsxd      rax,        dword ptr arg(3) ;yoffset
-
-        pxor        mm0,        mm0                 ;
-        shl         rax,        5
-
-        add         rax,        rcx
-        lea         rcx,        [rdi+rdx*4]          ;
-
-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line    ;
-
-        ; get the first horizontal line done       ;
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        movq        mm7,        mm3                 ;
-        packuswb    mm7,        mm0                 ;
-
-        add         rsi,        rdx                 ; next line
-.next_row_4x4:
-        movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
-
-        pmullw      mm3,        mm1                 ;
-        movd        mm5,        [rsi+1]             ;
-
-        punpcklbw   mm5,        mm0                 ;
-        pmullw      mm5,        mm2                 ;
-
-        paddw       mm3,        mm5                 ;
-
-        movq        mm5,        mm7                 ;
-        punpcklbw   mm5,        mm0                 ;
-
-        pmullw      mm5,        [rax]               ;
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-        movq        mm7,        mm3                 ;
-
-        packuswb    mm7,        mm0                 ;
-
-        pmullw      mm3,        [rax+16]            ;
-        paddw       mm3,        mm5                 ;
-
-
-        paddw       mm3,        [GLOBAL(rd)]                 ; xmm3 += round value
-        psraw       mm3,        VP9_FILTER_SHIFT        ; xmm3 /= 128
-
-        packuswb    mm3,        mm0
-        movd        [rdi],      mm3                 ; store the results in the destination
-
-%if ABI_IS_32BIT
-        add         rsi,        rdx                 ; next line
-        add         rdi,        dword ptr arg(5) ;dst_pitch                   ;
-%else
-        movsxd      r8,         dword ptr arg(5) ;dst_pitch                   ;
-        add         rsi,        rdx                 ; next line
-        add         rdi,        r8
-%endif
-
-        cmp         rdi,        rcx                 ;
-        jne         .next_row_4x4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
 SECTION_RODATA
 align 16
 rd:
@@ -698,30 +266,3 @@
     times 8 dw -6
     times 8 dw 0
 
-
-align 16
-global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
-sym(vp9_bilinear_filters_8x_mmx):
-    times 8 dw 128
-    times 8 dw 0
-
-    times 8 dw 112
-    times 8 dw 16
-
-    times 8 dw 96
-    times 8 dw 32
-
-    times 8 dw 80
-    times 8 dw 48
-
-    times 8 dw 64
-    times 8 dw 64
-
-    times 8 dw 48
-    times 8 dw 80
-
-    times 8 dw 32
-    times 8 dw 96
-
-    times 8 dw 16
-    times 8 dw 112
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_variance_sse2.c
@@ -1,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define HALFNDX 8
+
+void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                       int ref_pixels_per_line,
+                                       const unsigned char *src_ptr,
+                                       int src_pixels_per_line,
+                                       unsigned int Height,
+                                       int *sum,
+                                       unsigned int *sumsquared);
+
+void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                      int ref_pixels_per_line,
+                                      const unsigned char *src_ptr,
+                                      int src_pixels_per_line,
+                                      unsigned int Height,
+                                      int *sum,
+                                      unsigned int *sumsquared);
+
+void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                            int ref_pixels_per_line,
+                                            const unsigned char *src_ptr,
+                                            int src_pixels_per_line,
+                                            unsigned int Height,
+                                            int *sum,
+                                            unsigned int *sumsquared);
+
+void vp9_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
+                                     int ref_pixels_per_line,
+                                     const unsigned char *src_ptr,
+                                     int src_pixels_per_line,
+                                     unsigned int Height,
+                                     int  xoffset,
+                                     int  yoffset,
+                                     int *sum,
+                                     unsigned int *sumsquared);
+
+unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char  *src_ptr,
+                                             int  src_pixels_per_line,
+                                             int  xoffset,
+                                             int  yoffset,
+                                             const unsigned char *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr + 8, src_pixels_per_line,
+      dst_ptr + 8, dst_pixels_per_line, 2,
+      xoffset, yoffset,
+      &xsum1, &xxsum1);
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 5));
+}
--- a/vp9/common/x86/vp9_subpixel_x86.h
+++ b/vp9/common/x86/vp9_subpixel_x86.h
@@ -25,11 +25,7 @@
 extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
 extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
 extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx);
 
-
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp9_subpix_sixtap16x16
 #define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
@@ -45,15 +41,6 @@
 
 #undef  vp9_subpix_bilinear16x16
 #define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#undef  vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx
-
-#undef  vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx
-
-#undef  vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx
 
 #endif
 #endif
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_DECODER_VP9_DBOOLHUFF_H_
 #define VP9_DECODER_VP9_DBOOLHUFF_H_
+
 #include <stddef.h>
 #include <limits.h>
 #include "./vpx_config.h"
@@ -33,7 +33,7 @@
   unsigned int         range;
 } BOOL_DECODER;
 
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
 int vp9_start_decode(BOOL_DECODER *br,
                      const unsigned char *source,
@@ -152,4 +152,4 @@
 
 extern int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);
 
-#endif
+#endif  // VP9_DECODER_VP9_DBOOLHUFF_H_
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -14,7 +14,7 @@
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_findnearmv.h"
-
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -51,7 +51,6 @@
   return treed_read(bc, vp9_ymode_tree, p);
 }
 
-#if CONFIG_SUPERBLOCKS
 static int read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
   return treed_read(bc, vp9_sb_ymode_tree, p);
 }
@@ -59,7 +58,6 @@
 static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {
   return treed_read(bc, vp9_uv_mode_tree, p);
 }
-#endif
 
 static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {
   return treed_read(bc, vp9_kf_ymode_tree, p);
@@ -122,7 +120,21 @@
   m->mbmi.segment_id = 0;
   if (pbi->mb.update_mb_segmentation_map) {
     read_mb_segid(bc, &m->mbmi, &pbi->mb);
-    pbi->common.last_frame_seg_map[map_index] = m->mbmi.segment_id;
+    if (m->mbmi.sb_type) {
+      const int nmbs = 1 << m->mbmi.sb_type;
+      const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
+      const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
+      int x, y;
+
+      for (y = 0; y < ymbs; y++) {
+        for (x = 0; x < xmbs; x++) {
+          cm->last_frame_seg_map[map_index + x + y * cm->mb_cols] =
+              m->mbmi.segment_id;
+        }
+      }
+    } else {
+      cm->last_frame_seg_map[map_index] = m->mbmi.segment_id;
+    }
   }
 
   m->mbmi.mb_skip_coeff = 0;
@@ -144,25 +156,18 @@
       m->mbmi.mb_skip_coeff = 0;
   }
 
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
+  if (m->mbmi.sb_type) {
     y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,
       pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-  } else
-#endif
-  y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
-    pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
-#if CONFIG_COMP_INTRA_PRED
-  m->mbmi.second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
+  } else {
+    y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,
+      pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);
+  }
 
   m->mbmi.ref_frame = INTRA_FRAME;
 
   if ((m->mbmi.mode = y_mode) == B_PRED) {
     int i = 0;
-#if CONFIG_COMP_INTRA_PRED
-    int use_comp_pred = vp9_read(bc, DEFAULT_COMP_INTRA_PROB);
-#endif
     do {
       const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
       const B_PREDICTION_MODE L = left_block_mode(m, i);
@@ -170,15 +175,6 @@
       m->bmi[i].as_mode.first =
         (B_PREDICTION_MODE) read_kf_bmode(
           bc, pbi->common.kf_bmode_prob [A] [L]);
-#if CONFIG_COMP_INTRA_PRED
-      if (use_comp_pred) {
-        m->bmi[i].as_mode.second =
-          (B_PREDICTION_MODE) read_kf_bmode(
-            bc, pbi->common.kf_bmode_prob [A] [L]);
-      } else {
-        m->bmi[i].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
-      }
-#endif
     } while (++i < 16);
   }
   if ((m->mbmi.mode = y_mode) == I8X8_PRED) {
@@ -191,26 +187,22 @@
       m->bmi[ib + 1].as_mode.first = mode8x8;
       m->bmi[ib + 4].as_mode.first = mode8x8;
       m->bmi[ib + 5].as_mode.first = mode8x8;
-#if CONFIG_COMP_INTRA_PRED
-      m->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-      m->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
     }
   } else
     m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,
                                                        pbi->common.kf_uv_mode_prob[m->mbmi.mode]);
-#if CONFIG_COMP_INTRA_PRED
-  m->mbmi.second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
 
   if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&
       m->mbmi.mode <= I8X8_PRED) {
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);
-    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED)
+    if (m->mbmi.txfm_size != TX_4X4 && m->mbmi.mode != I8X8_PRED) {
       m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[1]);
+      if (m->mbmi.txfm_size != TX_8X8 && m->mbmi.sb_type)
+        m->mbmi.txfm_size += vp9_read(bc, cm->prob_tx[2]);
+    }
+  } else if (cm->txfm_mode >= ALLOW_32X32 && m->mbmi.sb_type) {
+    m->mbmi.txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 && m->mbmi.mode <= TM_PRED) {
     m->mbmi.txfm_size = TX_16X16;
   } else if (cm->txfm_mode >= ALLOW_8X8 && m->mbmi.mode != B_PRED) {
@@ -478,11 +470,9 @@
   return (MV_REFERENCE_FRAME)ref_frame;
 }
 
-#if CONFIG_SUPERBLOCKS
 static MB_PREDICTION_MODE read_sb_mv_ref(vp9_reader *bc, const vp9_prob *p) {
   return (MB_PREDICTION_MODE) treed_read(bc, vp9_sb_mv_ref_tree, p);
 }
-#endif
 
 static MB_PREDICTION_MODE read_mv_ref(vp9_reader *bc, const vp9_prob *p) {
   return (MB_PREDICTION_MODE) treed_read(bc, vp9_mv_ref_tree, p);
@@ -532,12 +522,6 @@
     if (!cm->kf_ymode_probs_update)
       cm->kf_ymode_probs_index = vp9_read_literal(bc, 3);
   } else {
-#if CONFIG_PRED_FILTER
-    cm->pred_filter_mode = (vp9_prob)vp9_read_literal(bc, 2);
-
-    if (cm->pred_filter_mode == 2)
-      cm->prob_pred_filter_off = (vp9_prob)vp9_read_literal(bc, 8);
-#endif
     if (cm->mcomp_filter_type == SWITCHABLE)
       read_switchable_interp_probs(pbi, bc);
 #if CONFIG_COMP_INTERINTRA_PRED
@@ -572,7 +556,6 @@
       } while (++i < VP9_YMODES - 1);
     }
 
-#if CONFIG_SUPERBLOCKS
     if (vp9_read_bit(bc)) {
       int i = 0;
 
@@ -580,13 +563,7 @@
         cm->fc.sb_ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);
       } while (++i < VP9_I32X32_MODES - 1);
     }
-#endif
 
-#if CONFIG_NEW_MVREF
-  // Temp defaults probabilities for ecnoding the MV ref id signal
-  vpx_memset(xd->mb_mv_ref_id_probs, 192, sizeof(xd->mb_mv_ref_id_probs));
-#endif
-
     read_nmvprobs(bc, nmvc, xd->allow_high_precision_mv);
   }
 }
@@ -633,38 +610,38 @@
       else {
         read_mb_segid(bc, mbmi, xd);
       }
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        cm->last_frame_seg_map[index] = mbmi->segment_id;
-        if (mb_col + 1 < cm->mb_cols)
-          cm->last_frame_seg_map[index + 1] = mbmi->segment_id;
-        if (mb_row + 1 < cm->mb_rows) {
-          cm->last_frame_seg_map[index + cm->mb_cols] = mbmi->segment_id;
-          if (mb_col + 1 < cm->mb_cols)
-            cm->last_frame_seg_map[index + cm->mb_cols + 1] = mbmi->segment_id;
+      if (mbmi->sb_type) {
+        const int nmbs = 1 << mbmi->sb_type;
+        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
+        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
+        int x, y;
+
+        for (y = 0; y < ymbs; y++) {
+          for (x = 0; x < xmbs; x++) {
+            cm->last_frame_seg_map[index + x + y * cm->mb_cols] =
+                mbmi->segment_id;
+          }
         }
-      } else
-#endif
-      {
+      } else {
         cm->last_frame_seg_map[index] = mbmi->segment_id;
       }
     } else {
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        mbmi->segment_id = cm->last_frame_seg_map[index];
-        if (mb_col < cm->mb_cols - 1)
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + 1];
-        if (mb_row < cm->mb_rows - 1) {
-          mbmi->segment_id = mbmi->segment_id &&
-                             cm->last_frame_seg_map[index + cm->mb_cols];
-          if (mb_col < cm->mb_cols - 1)
-            mbmi->segment_id = mbmi->segment_id &&
-                               cm->last_frame_seg_map[index + cm->mb_cols + 1];
+      if (mbmi->sb_type) {
+        const int nmbs = 1 << mbmi->sb_type;
+        const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);
+        const int xmbs = MIN(cm->mb_cols - mb_col, nmbs);
+        unsigned segment_id = -1;
+        int x, y;
+
+        for (y = 0; y < ymbs; y++) {
+          for (x = 0; x < xmbs; x++) {
+            segment_id = MIN(segment_id,
+                             cm->last_frame_seg_map[index + x +
+                                                    y * cm->mb_cols]);
+          }
         }
-      } else
-#endif
-      {
+        mbmi->segment_id = segment_id;
+      } else {
         mbmi->segment_id = cm->last_frame_seg_map[index];
       }
     }
@@ -689,6 +666,7 @@
   int mb_to_right_edge;
   int mb_to_top_edge;
   int mb_to_bottom_edge;
+  const int mb_size = 1 << mi->mbmi.sb_type;
 
   mb_to_top_edge = xd->mb_to_top_edge;
   mb_to_bottom_edge = xd->mb_to_bottom_edge;
@@ -703,18 +681,8 @@
   xd->mb_to_left_edge =
     mb_to_left_edge = -((mb_col * 16) << 3);
   mb_to_left_edge -= LEFT_TOP_MARGIN;
-
-#if CONFIG_SUPERBLOCKS
-  if (mi->mbmi.encoded_as_sb) {
-    xd->mb_to_right_edge =
-      mb_to_right_edge = ((pbi->common.mb_cols - 2 - mb_col) * 16) << 3;
-  } else {
-#endif
-    xd->mb_to_right_edge =
-      mb_to_right_edge = ((pbi->common.mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-  }
-#endif
+  xd->mb_to_right_edge =
+      mb_to_right_edge = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;
   mb_to_right_edge += RIGHT_BOTTOM_MARGIN;
 
   // Make sure the MACROBLOCKD mode info pointer is pointed at the
@@ -756,10 +724,10 @@
 
     int recon_y_stride, recon_yoffset;
     int recon_uv_stride, recon_uvoffset;
+    MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
 
     {
       int ref_fb_idx;
-      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
 
       /* Select the appropriate reference frame for this MB */
       if (ref_frame == LAST_FRAME)
@@ -788,14 +756,33 @@
                        ref_frame, mbmi->ref_mvs[ref_frame],
                        cm->ref_frame_sign_bias);
 
-      vp9_find_best_ref_mvs(xd,
-                            xd->pre.y_buffer,
-                            recon_y_stride,
-                            mbmi->ref_mvs[ref_frame],
-                            &best_mv, &nearest, &nearby);
-
       vp9_mv_ref_probs(&pbi->common, mv_ref_p,
                        mbmi->mb_mode_context[ref_frame]);
+
+      // Is the segment level mode feature enabled for this segment
+      if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
+        mbmi->mode =
+          vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
+      } else {
+        if (mbmi->sb_type)
+          mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
+        else
+          mbmi->mode = read_mv_ref(bc, mv_ref_p);
+
+        vp9_accum_mv_refs(&pbi->common, mbmi->mode,
+                          mbmi->mb_mode_context[ref_frame]);
+      }
+
+      if (mbmi->mode != ZEROMV) {
+        vp9_find_best_ref_mvs(xd,
+                              xd->pre.y_buffer,
+                              recon_y_stride,
+                              mbmi->ref_mvs[ref_frame],
+                              &nearest, &nearby);
+
+        best_mv.as_int = (mbmi->ref_mvs[ref_frame][0]).as_int;
+      }
+
 #ifdef DEC_DEBUG
       if (dec_debug)
         printf("[D %d %d] %d %d %d %d\n", ref_frame,
@@ -804,32 +791,6 @@
 #endif
     }
 
-    // Is the segment level mode feature enabled for this segment
-    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
-      mbmi->mode =
-        vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
-    } else {
-#if CONFIG_SUPERBLOCKS
-      if (mbmi->encoded_as_sb) {
-        mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);
-      } else
-#endif
-      mbmi->mode = read_mv_ref(bc, mv_ref_p);
-
-      vp9_accum_mv_refs(&pbi->common, mbmi->mode,
-                        mbmi->mb_mode_context[mbmi->ref_frame]);
-    }
-
-#if CONFIG_PRED_FILTER
-    if (mbmi->mode >= NEARESTMV && mbmi->mode < SPLITMV) {
-      // Is the prediction filter enabled
-      if (cm->pred_filter_mode == 2)
-        mbmi->pred_filter_enabled =
-          vp9_read(bc, cm->prob_pred_filter_off);
-      else
-        mbmi->pred_filter_enabled = cm->pred_filter_mode;
-    }
-#endif
     if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV)
     {
       if (cm->mcomp_filter_type == SWITCHABLE) {
@@ -877,13 +838,15 @@
                          mbmi->ref_mvs[mbmi->second_ref_frame],
                          cm->ref_frame_sign_bias);
 
-        vp9_find_best_ref_mvs(xd,
-                              xd->second_pre.y_buffer,
-                              recon_y_stride,
-                              mbmi->ref_mvs[mbmi->second_ref_frame],
-                              &best_mv_second,
-                              &nearest_second,
-                              &nearby_second);
+        if (mbmi->mode != ZEROMV) {
+          vp9_find_best_ref_mvs(xd,
+                                xd->second_pre.y_buffer,
+                                recon_y_stride,
+                                mbmi->ref_mvs[mbmi->second_ref_frame],
+                                &nearest_second,
+                                &nearby_second);
+          best_mv_second = mbmi->ref_mvs[mbmi->second_ref_frame][0];
+        }
       }
 
     } else {
@@ -916,6 +879,29 @@
 #endif
     }
 
+#if CONFIG_NEW_MVREF
+    // if ((mbmi->mode == NEWMV) || (mbmi->mode == SPLITMV))
+    if (mbmi->mode == NEWMV) {
+      int best_index;
+      MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+      // Encode the index of the choice.
+      best_index =
+        vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
+
+      best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
+
+      if (mbmi->second_ref_frame > 0) {
+        ref_frame = mbmi->second_ref_frame;
+
+        // Encode the index of the choice.
+        best_index =
+          vp9_read_mv_ref_id(bc, xd->mb_mv_ref_probs[ref_frame]);
+        best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
+      }
+    }
+#endif
+
     mbmi->uv_mode = DC_PRED;
     switch (mbmi->mode) {
       case SPLITMV: {
@@ -1072,19 +1058,6 @@
 
       case NEWMV:
 
-#if CONFIG_NEW_MVREF
-        {
-          int best_index;
-          MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
-
-          // Encode the index of the choice.
-          best_index =
-            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
-
-          best_mv.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-        }
-#endif
-
         read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);
         read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,
                     xd->allow_high_precision_mv);
@@ -1106,18 +1079,6 @@
                                                   mb_to_bottom_edge);
 
         if (mbmi->second_ref_frame > 0) {
-#if CONFIG_NEW_MVREF
-        {
-          int best_index;
-          MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
-
-          // Encode the index of the choice.
-          best_index =
-            vp9_read_mv_ref_id(bc, xd->mb_mv_ref_id_probs[ref_frame]);
-          best_mv_second.as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-        }
-#endif
-
           read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);
           read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,
                       xd->allow_high_precision_mv);
@@ -1144,27 +1105,19 @@
     if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {
       mbmi->mode = (MB_PREDICTION_MODE)
                    vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);
-#if CONFIG_SUPERBLOCKS
-    } else if (mbmi->encoded_as_sb) {
+    } else if (mbmi->sb_type) {
       mbmi->mode = (MB_PREDICTION_MODE)
                    read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);
       pbi->common.fc.sb_ymode_counts[mbmi->mode]++;
-#endif
     } else {
       mbmi->mode = (MB_PREDICTION_MODE)
                    read_ymode(bc, pbi->common.fc.ymode_prob);
       pbi->common.fc.ymode_counts[mbmi->mode]++;
     }
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
 
     // If MB mode is BPRED read the block modes
     if (mbmi->mode == B_PRED) {
       int j = 0;
-#if CONFIG_COMP_INTRA_PRED
-      int use_comp_pred = vp9_read(bc, DEFAULT_COMP_INTRA_PROB);
-#endif
       do {
         int m;
         m = mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)
@@ -1173,13 +1126,6 @@
         if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;
 #endif
         pbi->common.fc.bmode_counts[m]++;
-#if CONFIG_COMP_INTRA_PRED
-        if (use_comp_pred) {
-          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)read_bmode(bc, pbi->common.fc.bmode_prob);
-        } else {
-          mi->bmi[j].as_mode.second = (B_PREDICTION_MODE)(B_DC_PRED - 1);
-        }
-#endif
       } while (++j < 16);
     }
 
@@ -1194,12 +1140,6 @@
         mi->bmi[ib + 4].as_mode.first = mode8x8;
         mi->bmi[ib + 5].as_mode.first = mode8x8;
         pbi->common.fc.i8x8_mode_counts[mode8x8]++;
-#if CONFIG_COMP_INTRA_PRED
-        mi->bmi[ib + 0].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 1].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 4].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-        mi->bmi[ib + 5].as_mode.second = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
       }
     } else {
       mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(
@@ -1206,10 +1146,6 @@
         bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);
       pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
     }
-
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
   }
 
   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&
@@ -1219,8 +1155,13 @@
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     mbmi->txfm_size = vp9_read(bc, cm->prob_tx[0]);
     if (mbmi->txfm_size != TX_4X4 && mbmi->mode != I8X8_PRED &&
-        mbmi->mode != SPLITMV)
+        mbmi->mode != SPLITMV) {
       mbmi->txfm_size += vp9_read(bc, cm->prob_tx[1]);
+      if (mbmi->sb_type && mbmi->txfm_size != TX_8X8)
+        mbmi->txfm_size += vp9_read(bc, cm->prob_tx[2]);
+    }
+  } else if (mbmi->sb_type && cm->txfm_mode >= ALLOW_32X32) {
+    mbmi->txfm_size = TX_32X32;
   } else if (cm->txfm_mode >= ALLOW_16X16 &&
       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= TM_PRED) ||
        (mbmi->ref_frame != INTRA_FRAME && mbmi->mode != SPLITMV))) {
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#ifndef VP9_DECODER_VP9_DECODEMV_H_
+#define VP9_DECODER_VP9_DECODEMV_H_
 
 #include "vp9/decoder/vp9_onyxd_int.h"
 
@@ -17,3 +19,5 @@
                            int mb_col,
                            BOOL_DECODER* const bc);
 void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, BOOL_DECODER* const bc);
+
+#endif  // VP9_DECODER_VP9_DECODEMV_H_
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -10,10 +10,12 @@
 
 
 #include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_header.h"
 #include "vp9/common/vp9_reconintra.h"
 #include "vp9/common/vp9_reconintra4x4.h"
 #include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/decoder/vp9_decodframe.h"
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/common/vp9_invtrans.h"
@@ -20,7 +22,7 @@
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_setupintrarecon.h"
 
 #include "vp9/decoder/vp9_decodemv.h"
@@ -81,17 +83,17 @@
   VP9_COMMON *const pc = &pbi->common;
 
   for (Q = 0; Q < QINDEX_RANGE; Q++) {
-    pc->Y1dequant[Q][0] = (short)vp9_dc_quant(Q, pc->y1dc_delta_q);
-    pc->Y2dequant[Q][0] = (short)vp9_dc2quant(Q, pc->y2dc_delta_q);
-    pc->UVdequant[Q][0] = (short)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
+    pc->Y1dequant[Q][0] = (int16_t)vp9_dc_quant(Q, pc->y1dc_delta_q);
+    pc->Y2dequant[Q][0] = (int16_t)vp9_dc2quant(Q, pc->y2dc_delta_q);
+    pc->UVdequant[Q][0] = (int16_t)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);
 
     /* all the ac values =; */
     for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d[i];
+      int rc = vp9_default_zig_zag1d_4x4[i];
 
-      pc->Y1dequant[Q][rc] = (short)vp9_ac_yquant(Q);
-      pc->Y2dequant[Q][rc] = (short)vp9_ac2quant(Q, pc->y2ac_delta_q);
-      pc->UVdequant[Q][rc] = (short)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
+      pc->Y1dequant[Q][rc] = (int16_t)vp9_ac_yquant(Q);
+      pc->Y2dequant[Q][rc] = (int16_t)vp9_ac2quant(Q, pc->y2ac_delta_q);
+      pc->UVdequant[Q][rc] = (int16_t)vp9_ac_uv_quant(Q, pc->uvac_delta_q);
     }
   }
 }
@@ -170,20 +172,25 @@
  */
 static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+      vp9_build_intra_predictors_sb64uv_s(xd);
+      vp9_build_intra_predictors_sb64y_s(xd);
+    } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
       vp9_build_intra_predictors_sbuv_s(xd);
       vp9_build_intra_predictors_sby_s(xd);
     } else {
-#endif
-    vp9_build_intra_predictors_mbuv_s(xd);
-    vp9_build_intra_predictors_mby_s(xd);
-#if CONFIG_SUPERBLOCKS
+      vp9_build_intra_predictors_mbuv_s(xd);
+      vp9_build_intra_predictors_mby_s(xd);
     }
-#endif
   } else {
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
+    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+      vp9_build_inter64x64_predictors_sb(xd,
+                                         xd->dst.y_buffer,
+                                         xd->dst.u_buffer,
+                                         xd->dst.v_buffer,
+                                         xd->dst.y_stride,
+                                         xd->dst.uv_stride);
+    } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {
       vp9_build_inter32x32_predictors_sb(xd,
                                          xd->dst.y_buffer,
                                          xd->dst.u_buffer,
@@ -191,35 +198,32 @@
                                          xd->dst.y_stride,
                                          xd->dst.uv_stride);
     } else {
-#endif
-    vp9_build_1st_inter16x16_predictors_mb(xd,
-                                           xd->dst.y_buffer,
-                                           xd->dst.u_buffer,
-                                           xd->dst.v_buffer,
-                                           xd->dst.y_stride,
-                                           xd->dst.uv_stride);
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      vp9_build_2nd_inter16x16_predictors_mb(xd,
+      vp9_build_1st_inter16x16_predictors_mb(xd,
                                              xd->dst.y_buffer,
                                              xd->dst.u_buffer,
                                              xd->dst.v_buffer,
                                              xd->dst.y_stride,
                                              xd->dst.uv_stride);
-    }
-#if CONFIG_COMP_INTERINTRA_PRED
-    else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
-      vp9_build_interintra_16x16_predictors_mb(xd,
+
+      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+        vp9_build_2nd_inter16x16_predictors_mb(xd,
                                                xd->dst.y_buffer,
                                                xd->dst.u_buffer,
                                                xd->dst.v_buffer,
                                                xd->dst.y_stride,
                                                xd->dst.uv_stride);
-    }
+      }
+#if CONFIG_COMP_INTERINTRA_PRED
+      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {
+        vp9_build_interintra_16x16_predictors_mb(xd,
+                                                 xd->dst.y_buffer,
+                                                 xd->dst.u_buffer,
+                                                 xd->dst.v_buffer,
+                                                 xd->dst.y_stride,
+                                                 xd->dst.uv_stride);
+      }
 #endif
-#if CONFIG_SUPERBLOCKS
     }
-#endif
   }
 }
 
@@ -283,10 +287,10 @@
     for (i = 0; i < 4; i++) {
       int ib = vp9_i8x8_block[i];
       int idx = (ib & 0x02) ? (ib + 2) : ib;
-      short *q  = xd->block[idx].qcoeff;
-      short *dq = xd->block[0].dequant;
-      unsigned char *pre = xd->block[ib].predictor;
-      unsigned char *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
+      int16_t *q  = xd->block[idx].qcoeff;
+      int16_t *dq = xd->block[0].dequant;
+      uint8_t *pre = xd->block[ib].predictor;
+      uint8_t *dst = *(xd->block[ib].base_dst) + xd->block[ib].dst;
       int stride = xd->dst.y_stride;
       BLOCKD *b = &xd->block[ib];
       if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {
@@ -414,9 +418,6 @@
     assert(get_2nd_order_usage(xd) == 0);
     for (i = 0; i < 16; i++) {
       int b_mode;
-#if CONFIG_COMP_INTRA_PRED
-      int b_mode2;
-#endif
       BLOCKD *b = &xd->block[i];
       b_mode = xd->mode_info_context->bmi[i].as_mode.first;
 #if CONFIG_NEWBINTRAMODES
@@ -425,17 +426,8 @@
 #endif
       if (!xd->mode_info_context->mbmi.mb_skip_coeff)
         eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
-#if CONFIG_COMP_INTRA_PRED
-      b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
 
-      if (b_mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-        vp9_intra4x4_predict(b, b_mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        vp9_comp_intra4x4_predict(b, b_mode, b_mode2, b->predictor);
-      }
-#endif
+      vp9_intra4x4_predict(b, b_mode, b->predictor);
       tx_type = get_tx_type_4x4(xd, b);
       if (tx_type != DCT_DCT) {
         vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,
@@ -446,12 +438,12 @@
         vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,
                              *(b->base_dst) + b->dst, 16, b->dst_stride);
       }
-      xd->above_context->y2 = 1;
-      xd->left_context->y2 = 1;
     }
     if (!xd->mode_info_context->mbmi.mb_skip_coeff) {
       vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc);
     }
+    xd->above_context->y2 = 0;
+    xd->left_context->y2 = 0;
     vp9_build_intra_predictors_mbuv(xd);
     pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,
                            xd->block[16].dequant,
@@ -546,10 +538,10 @@
   }
 }
 
-#if CONFIG_SUPERBLOCKS
 static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                            BOOL_DECODER* const bc, int n) {
-  int x_idx = n & 1, y_idx = n >> 1;
+                            BOOL_DECODER* const bc, int n,
+                            int maska, int shiftb) {
+  int x_idx = n & maska, y_idx = n >> shiftb;
   TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     vp9_ht_dequant_idct_add_16x16_c(
@@ -573,9 +565,10 @@
 };
 
 static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc, int n) {
+                          BOOL_DECODER* const bc, int n,
+                          int maska, int shiftb) {
+  int x_idx = n & maska, y_idx = n >> shiftb;
   BLOCKD *b = &xd->block[24];
-  int x_idx = n & 1, y_idx = n >> 1;
   TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     int i;
@@ -582,8 +575,8 @@
     for (i = 0; i < 4; i++) {
       int ib = vp9_i8x8_block[i];
       int idx = (ib & 0x02) ? (ib + 2) : ib;
-      short *q  = xd->block[idx].qcoeff;
-      short *dq = xd->block[0].dequant;
+      int16_t *q  = xd->block[idx].qcoeff;
+      int16_t *dq = xd->block[0].dequant;
       int stride = xd->dst.y_stride;
       BLOCKD *b = &xd->block[ib];
       tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
@@ -634,9 +627,10 @@
 };
 
 static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                          BOOL_DECODER* const bc, int n) {
+                          BOOL_DECODER* const bc, int n,
+                          int maska, int shiftb) {
+  int x_idx = n & maska, y_idx = n >> shiftb;
   BLOCKD *b = &xd->block[24];
-  int x_idx = n & 1, y_idx = n >> 1;
   TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[0]);
   if (tx_type != DCT_DCT) {
     int i;
@@ -689,15 +683,16 @@
       xd->dst.uv_stride, xd->eobs + 16, xd);
 };
 
-static void decode_superblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
-                              int mb_row, unsigned int mb_col,
-                              BOOL_DECODER* const bc) {
+static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                                int mb_row, unsigned int mb_col,
+                                BOOL_DECODER* const bc) {
   int i, n, eobtotal;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
   VP9_COMMON *const pc = &pbi->common;
   MODE_INFO *orig_mi = xd->mode_info_context;
+  const int mis = pc->mode_info_stride;
 
-  assert(xd->mode_info_context->mbmi.encoded_as_sb);
+  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64);
 
   if (pbi->common.frame_type != KEY_FRAME)
     vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
@@ -707,7 +702,133 @@
     mb_init_dequantizer(pbi, xd);
 
   if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    int n;
+
     vp9_reset_mb_tokens_context(xd);
+    for (n = 1; n <= 3; n++) {
+      if (mb_col < pc->mb_cols - n)
+        xd->above_context += n;
+      if (mb_row < pc->mb_rows - n)
+        xd->left_context += n;
+      vp9_reset_mb_tokens_context(xd);
+      if (mb_col < pc->mb_cols - n)
+        xd->above_context -= n;
+      if (mb_row < pc->mb_rows - n)
+        xd->left_context -= n;
+    }
+
+    /* Special case:  Force the loopfilter to skip when eobtotal and
+     * mb_skip_coeff are zero.
+     */
+    skip_recon_mb(pbi, xd);
+    return;
+  }
+
+  /* do prediction */
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sb64y_s(xd);
+    vp9_build_intra_predictors_sb64uv_s(xd);
+  } else {
+    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  /* dequantization and idct */
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
+    for (n = 0; n < 4; n++) {
+      const int x_idx = n & 1, y_idx = n >> 1;
+
+      if (mb_col + x_idx * 2 >= pc->mb_cols ||
+          mb_row + y_idx * 2 >= pc->mb_rows)
+        continue;
+
+      xd->left_context = pc->left_context + (y_idx << 1);
+      xd->above_context = pc->above_context + mb_col + (x_idx << 1);
+      xd->mode_info_context = orig_mi + x_idx * 2 + y_idx * 2 * mis;
+      eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
+      if (eobtotal == 0) {  // skip loopfilter
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        if (mb_col + 1 < pc->mb_cols)
+          xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
+        if (mb_row + 1 < pc->mb_rows) {
+          xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
+          if (mb_col + 1 < pc->mb_cols)
+            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
+        }
+      } else {
+        vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
+                                   xd->dst.y_buffer + x_idx * 32 +
+                                       xd->dst.y_stride * y_idx * 32,
+                                   xd->dst.y_buffer + x_idx * 32 +
+                                       xd->dst.y_stride * y_idx * 32,
+                                   xd->dst.y_stride, xd->dst.y_stride,
+                                   xd->eobs[0]);
+        vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
+                                              xd->block[16].dequant,
+                                              xd->dst.u_buffer + x_idx * 16 +
+                                                xd->dst.uv_stride * y_idx * 16,
+                                              xd->dst.v_buffer + x_idx * 16 +
+                                                xd->dst.uv_stride * y_idx * 16,
+                                              xd->dst.uv_stride, xd->eobs + 16);
+      }
+    }
+  } else {
+    for (n = 0; n < 16; n++) {
+      int x_idx = n & 3, y_idx = n >> 2;
+
+      if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
+        continue;
+
+      xd->above_context = pc->above_context + mb_col + x_idx;
+      xd->left_context = pc->left_context + y_idx;
+      xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
+      for (i = 0; i < 25; i++) {
+        xd->block[i].eob = 0;
+        xd->eobs[i] = 0;
+      }
+
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+      if (eobtotal == 0) {  // skip loopfilter
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        continue;
+      }
+
+      if (tx_size == TX_16X16) {
+        decode_16x16_sb(pbi, xd, bc, n, 3, 2);
+      } else if (tx_size == TX_8X8) {
+        decode_8x8_sb(pbi, xd, bc, n, 3, 2);
+      } else {
+        decode_4x4_sb(pbi, xd, bc, n, 3, 2);
+      }
+    }
+  }
+
+  xd->above_context = pc->above_context + mb_col;
+  xd->left_context = pc->left_context;
+  xd->mode_info_context = orig_mi;
+}
+
+static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,
+                                int mb_row, unsigned int mb_col,
+                                BOOL_DECODER* const bc) {
+  int i, n, eobtotal;
+  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  VP9_COMMON *const pc = &pbi->common;
+  MODE_INFO *orig_mi = xd->mode_info_context;
+  const int mis = pc->mode_info_stride;
+
+  assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+
+  if (pbi->common.frame_type != KEY_FRAME)
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, pc);
+
+  // re-initialize macroblock dequantizer before detokenization
+  if (xd->segmentation_enabled)
+    mb_init_dequantizer(pbi, xd);
+
+  if (xd->mode_info_context->mbmi.mb_skip_coeff) {
+    vp9_reset_mb_tokens_context(xd);
     if (mb_col < pc->mb_cols - 1)
       xd->above_context++;
     if (mb_row < pc->mb_rows - 1)
@@ -736,41 +857,62 @@
   }
 
   /* dequantization and idct */
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
+    eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);
+    if (eobtotal == 0) {  // skip loopfilter
+      xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+      if (mb_col + 1 < pc->mb_cols)
+        xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;
+      if (mb_row + 1 < pc->mb_rows) {
+        xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;
+        if (mb_col + 1 < pc->mb_cols)
+          xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;
+      }
+    } else {
+      vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,
+                                 xd->dst.y_buffer, xd->dst.y_buffer,
+                                 xd->dst.y_stride, xd->dst.y_stride,
+                                 xd->eobs[0]);
+      vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,
+                                            xd->block[16].dequant,
+                                            xd->dst.u_buffer, xd->dst.v_buffer,
+                                            xd->dst.uv_stride, xd->eobs + 16);
+    }
+  } else {
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
 
-    if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
-      continue;
+      if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)
+        continue;
 
+      xd->above_context = pc->above_context + mb_col + x_idx;
+      xd->left_context = pc->left_context + y_idx + (mb_row & 2);
+      xd->mode_info_context = orig_mi + x_idx + y_idx * mis;
+      for (i = 0; i < 25; i++) {
+        xd->block[i].eob = 0;
+        xd->eobs[i] = 0;
+      }
 
-    xd->above_context = pc->above_context + mb_col + x_idx;
-    xd->left_context = pc->left_context + y_idx;
-    xd->mode_info_context = orig_mi + x_idx + y_idx * pc->mode_info_stride;
-    for (i = 0; i < 25; i++) {
-      xd->block[i].eob = 0;
-      xd->eobs[i] = 0;
-    }
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
+      if (eobtotal == 0) {  // skip loopfilter
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        continue;
+      }
 
-    eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
-    if (eobtotal == 0) {  // skip loopfilter
-      xd->mode_info_context->mbmi.mb_skip_coeff = 1;
-      continue;
+      if (tx_size == TX_16X16) {
+        decode_16x16_sb(pbi, xd, bc, n, 1, 1);
+      } else if (tx_size == TX_8X8) {
+        decode_8x8_sb(pbi, xd, bc, n, 1, 1);
+      } else {
+        decode_4x4_sb(pbi, xd, bc, n, 1, 1);
+      }
     }
 
-    if (tx_size == TX_16X16) {
-      decode_16x16_sb(pbi, xd, bc, n);
-    } else if (tx_size == TX_8X8) {
-      decode_8x8_sb(pbi, xd, bc, n);
-    } else {
-      decode_4x4_sb(pbi, xd, bc, n);
-    }
+    xd->above_context = pc->above_context + mb_col;
+    xd->left_context = pc->left_context + (mb_row & 2);
+    xd->mode_info_context = orig_mi;
   }
-
-  xd->above_context = pc->above_context + mb_col;
-  xd->left_context = pc->left_context;
-  xd->mode_info_context = orig_mi;
 }
-#endif
 
 static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd,
                               int mb_row, unsigned int mb_col,
@@ -780,9 +922,7 @@
   int i;
   int tx_size;
 
-#if CONFIG_SUPERBLOCKS
-  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
-#endif
+  assert(!xd->mode_info_context->mbmi.sb_type);
 
   // re-initialize macroblock dequantizer before detokenization
   if (xd->segmentation_enabled)
@@ -904,192 +1044,176 @@
 FILE *vpxlog = 0;
 #endif
 
-/* Decode a row of Superblocks (2x2 region of MBs) */
-static void
-decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, int mbrow, MACROBLOCKD *xd,
-              BOOL_DECODER* const bc) {
-  int i;
-  int sb_col;
-  int mb_row, mb_col;
-  int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = pc->lst_fb_idx;
-  int dst_fb_idx = pc->new_fb_idx;
-  int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-  int sb_cols = (pc->mb_cols + 1) >> 1;
+static void set_offsets(VP9D_COMP *pbi, int block_size,
+                        int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const int mis = cm->mode_info_stride;
+  const int idx = mis * mb_row + mb_col;
+  const int dst_fb_idx = cm->new_fb_idx;
+  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
+  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
+  const int recon_yoffset = mb_row * 16 * recon_y_stride + 16 * mb_col;
+  const int recon_uvoffset = mb_row * 8 * recon_uv_stride + 8 * mb_col;
 
-  // For a SB there are 2 left contexts, each pertaining to a MB row within
-  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
+  xd->mode_info_context = cm->mi + idx;
+  xd->mode_info_context->mbmi.sb_type = block_size >> 5;
+  xd->prev_mode_info_context = cm->prev_mi + idx;
+  xd->above_context = cm->above_context + mb_col;
+  xd->left_context = cm->left_context + (mb_row & 3);
 
-  mb_row = mbrow;
-  mb_col = 0;
+  /* Distance of Mb to the various image edges.
+   * These are specified to 8th pel as they are always compared to
+   * values that are in 1/8th pel units
+   */
+  block_size >>= 4;  // in mb units
+  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+  xd->mb_to_left_edge = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+  xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
 
-  for (sb_col = 0; sb_col < sb_cols; sb_col++) {
-#if CONFIG_SUPERBLOCKS
-    MODE_INFO *mi = xd->mode_info_context;
+  xd->up_available = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
 
-    mi->mbmi.encoded_as_sb = vp9_read(bc, pc->sb_coded);
-#endif
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+}
 
-    // Process the 4 MBs within the SB in the order:
-    // top-left, top-right, bottom-left, bottom-right
-    for (i = 0; i < 4; i++) {
-      int dy = row_delta[i];
-      int dx = col_delta[i];
-      int offset_extended = dy * xd->mode_info_stride + dx;
+static void set_refs(VP9D_COMP *pbi, int block_size,
+                     int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-      xd->mb_index = i;
+  if (mbmi->ref_frame > INTRA_FRAME) {
+    int ref_fb_idx, ref_yoffset, ref_uvoffset, ref_y_stride, ref_uv_stride;
 
-#if CONFIG_SUPERBLOCKS
-      mi = xd->mode_info_context;
-#endif
-      if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-        // MB lies outside frame, skip on to next
-        mb_row += dy;
-        mb_col += dx;
-        xd->mode_info_context += offset_extended;
-        xd->prev_mode_info_context += offset_extended;
-        continue;
-      }
-#if CONFIG_SUPERBLOCKS
-      if (i)
-        mi->mbmi.encoded_as_sb = 0;
-#endif
+    /* Select the appropriate reference frame for this MB */
+    if (mbmi->ref_frame == LAST_FRAME)
+      ref_fb_idx = cm->lst_fb_idx;
+    else if (mbmi->ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cm->gld_fb_idx;
+    else
+      ref_fb_idx = cm->alt_fb_idx;
 
-      // Set above context pointer
-      xd->above_context = pc->above_context + mb_col;
-      xd->left_context = pc->left_context + (i >> 1);
+    ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+    ref_yoffset = mb_row * 16 * ref_y_stride + 16 * mb_col;
+    xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + ref_yoffset;
+    ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+    ref_uvoffset = mb_row * 8 * ref_uv_stride + 8 * mb_col;
+    xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + ref_uvoffset;
+    xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + ref_uvoffset;
 
-      /* Distance of Mb to the various image edges.
-       * These are specified to 8th pel as they are always compared to
-       * values that are in 1/8th pel units
-       */
-      xd->mb_to_top_edge = -((mb_row * 16)) << 3;
-      xd->mb_to_left_edge = -((mb_col * 16) << 3);
-#if CONFIG_SUPERBLOCKS
-      if (mi->mbmi.encoded_as_sb) {
-        xd->mb_to_bottom_edge = ((pc->mb_rows - 2 - mb_row) * 16) << 3;
-        xd->mb_to_right_edge = ((pc->mb_cols - 2 - mb_col) * 16) << 3;
-      } else {
-#endif
-        xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-        xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-      }
-#endif
-#ifdef DEC_DEBUG
-      dec_debug = (pbi->common.current_video_frame == 1 &&
-                   mb_row == 2 && mb_col == 8);
-      if (dec_debug)
-#if CONFIG_SUPERBLOCKS
-        printf("Enter Debug %d %d sb %d\n", mb_row, mb_col,
-               mi->mbmi.encoded_as_sb);
-#else
-        printf("Enter Debug %d %d\n", mb_row, mb_col);
-#endif
-#endif
-      xd->up_available = (mb_row != 0);
-      xd->left_available = (mb_col != 0);
+    /* propagate errors from reference frames */
+    xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;
 
+    if (mbmi->second_ref_frame > INTRA_FRAME) {
+      int second_ref_fb_idx;
 
-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
-
-      xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-      xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-      xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
-
-      update_blockd_bmi(xd);
-#ifdef DEC_DEBUG
-      if (dec_debug)
-        printf("Hello\n");
-#endif
-
       /* Select the appropriate reference frame for this MB */
-      if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-        ref_fb_idx = pc->lst_fb_idx;
-      else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-        ref_fb_idx = pc->gld_fb_idx;
+      if (mbmi->second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cm->lst_fb_idx;
+      else if (mbmi->second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cm->gld_fb_idx;
       else
-        ref_fb_idx = pc->alt_fb_idx;
+        second_ref_fb_idx = cm->alt_fb_idx;
 
-      xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
-      xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
-      xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+      xd->second_pre.y_buffer =
+          cm->yv12_fb[second_ref_fb_idx].y_buffer + ref_yoffset;
+      xd->second_pre.u_buffer =
+          cm->yv12_fb[second_ref_fb_idx].u_buffer + ref_uvoffset;
+      xd->second_pre.v_buffer =
+          cm->yv12_fb[second_ref_fb_idx].v_buffer + ref_uvoffset;
 
-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-        int second_ref_fb_idx;
+      /* propagate errors from reference frames */
+      xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;
+    }
+  }
 
-        /* Select the appropriate reference frame for this MB */
-        if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-          second_ref_fb_idx = pc->lst_fb_idx;
-        else if (xd->mode_info_context->mbmi.second_ref_frame ==
-                 GOLDEN_FRAME)
-          second_ref_fb_idx = pc->gld_fb_idx;
-        else
-          second_ref_fb_idx = pc->alt_fb_idx;
+  if (mbmi->sb_type) {
+    const int n_mbs = 1 << mbmi->sb_type;
+    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);
+    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);
+    const int mis = cm->mode_info_stride;
+    int x, y;
 
-        xd->second_pre.y_buffer =
-          pc->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
-        xd->second_pre.u_buffer =
-          pc->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
-        xd->second_pre.v_buffer =
-          pc->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+    for (y = 0; y < y_mbs; y++) {
+      for (x = !y; x < x_mbs; x++) {
+        mi[y * mis + x] = *mi;
       }
+    }
+  }
+}
 
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        /* propagate errors from reference frames */
-        xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
-      }
+/* Decode a row of Superblocks (2x2 region of MBs) */
+static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc,
+                          int mb_row, MACROBLOCKD *xd,
+                          BOOL_DECODER* const bc) {
+  int mb_col;
 
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        if (mb_col < pc->mb_cols - 1)
-          mi[1] = mi[0];
-        if (mb_row < pc->mb_rows - 1) {
-          mi[pc->mode_info_stride] = mi[0];
-          if (mb_col < pc->mb_cols - 1)
-            mi[pc->mode_info_stride + 1] = mi[0];
-        }
-      }
-      if (xd->mode_info_context->mbmi.encoded_as_sb) {
-        decode_superblock(pbi, xd, mb_row, mb_col, bc);
-      } else {
-#endif
-        vp9_intra_prediction_down_copy(xd);
-        decode_macroblock(pbi, xd, mb_row, mb_col, bc);
-#if CONFIG_SUPERBLOCKS
-      }
-#endif
+  // For a SB there are 2 left contexts, each pertaining to a MB row within
+  vpx_memset(pc->left_context, 0, sizeof(pc->left_context));
 
-      /* check if the boolean decoder has suffered an error */
+  for (mb_col = 0; mb_col < pc->mb_cols; mb_col += 4) {
+    if (vp9_read(bc, pc->sb64_coded)) {
+      set_offsets(pbi, 64, mb_row, mb_col);
+      vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);
+      set_refs(pbi, 64, mb_row, mb_col);
+      decode_superblock64(pbi, xd, mb_row, mb_col, bc);
       xd->corrupted |= bool_error(bc);
+    } else {
+      int j;
 
-#if CONFIG_SUPERBLOCKS
-      if (mi->mbmi.encoded_as_sb) {
-        assert(!i);
-        mb_col += 2;
-        xd->mode_info_context += 2;
-        xd->prev_mode_info_context += 2;
-        break;
-      }
-#endif
+      for (j = 0; j < 4; j++) {
+        const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
 
-      // skip to next MB
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-      mb_row += dy;
-      mb_col += dx;
+        if (mb_row + y_idx_sb >= pc->mb_rows ||
+            mb_col + x_idx_sb >= pc->mb_cols) {
+          // MB lies outside frame, skip on to next
+          continue;
+        }
+
+        xd->sb_index = j;
+
+        if (vp9_read(bc, pc->sb32_coded)) {
+          set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
+          vp9_decode_mb_mode_mv(pbi,
+                                xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
+          set_refs(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);
+          decode_superblock32(pbi,
+                              xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);
+          xd->corrupted |= bool_error(bc);
+        } else {
+          int i;
+
+          // Process the 4 MBs within the SB in the order:
+          // top-left, top-right, bottom-left, bottom-right
+          for (i = 0; i < 4; i++) {
+            const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
+
+            if (mb_row + y_idx >= pc->mb_rows ||
+                mb_col + x_idx >= pc->mb_cols) {
+              // MB lies outside frame, skip on to next
+              continue;
+            }
+
+            set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);
+            xd->mb_index = i;
+            vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);
+            update_blockd_bmi(xd);
+            set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);
+            vp9_intra_prediction_down_copy(xd);
+            decode_macroblock(pbi, xd, mb_row, mb_col, bc);
+
+            /* check if the boolean decoder has suffered an error */
+            xd->corrupted |= bool_error(bc);
+          }
+        }
+      }
     }
   }
-
-  /* skip prediction column */
-  xd->mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
-  xd->prev_mode_info_context += 1 - (pc->mb_cols & 0x1) + xd->mode_info_stride;
 }
 
 static unsigned int read_partition_size(const unsigned char *cx_size) {
@@ -1212,14 +1336,13 @@
 
 }
 
-static void read_coef_probs_common(
-    BOOL_DECODER* const bc,
-    vp9_prob coef_probs[BLOCK_TYPES][COEF_BANDS]
-                       [PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+static void read_coef_probs_common(BOOL_DECODER* const bc,
+                                   vp9_coeff_probs *coef_probs,
+                                   int block_types) {
   int i, j, k, l;
 
   if (vp9_read_bit(bc)) {
-    for (i = 0; i < BLOCK_TYPES; i++) {
+    for (i = 0; i < block_types; i++) {
       for (j = !i; j < COEF_BANDS; j++) {
         /* NB: This j loop starts from 1 on block type i == 0 */
         for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
@@ -1242,17 +1365,21 @@
 static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {
   VP9_COMMON *const pc = &pbi->common;
 
-  read_coef_probs_common(bc, pc->fc.coef_probs);
-  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs);
+  read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES_4X4);
+  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);
 
   if (pbi->common.txfm_mode != ONLY_4X4) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_8x8);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8);
+    read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES_8X8);
+    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);
   }
   if (pbi->common.txfm_mode > ALLOW_8X8) {
-    read_coef_probs_common(bc, pc->fc.coef_probs_16x16);
-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16);
+    read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES_16X16);
+    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16,
+                           BLOCK_TYPES_16X16);
   }
+  if (pbi->common.txfm_mode > ALLOW_16X16) {
+    read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES_32X32);
+  }
 }
 
 int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
@@ -1437,15 +1564,17 @@
     }
   }
 
-#if CONFIG_SUPERBLOCKS
-  pc->sb_coded = vp9_read_literal(&header_bc, 8);
-#endif
+  pc->sb64_coded = vp9_read_literal(&header_bc, 8);
+  pc->sb32_coded = vp9_read_literal(&header_bc, 8);
 
   /* Read the loop filter level and type */
   pc->txfm_mode = vp9_read_literal(&header_bc, 2);
+  if (pc->txfm_mode == 3)
+    pc->txfm_mode += vp9_read_bit(&header_bc);
   if (pc->txfm_mode == TX_MODE_SELECT) {
     pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);
     pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);
+    pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);
   }
 
   pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);
@@ -1577,6 +1706,33 @@
     }
   }
 
+#if CONFIG_NEW_MVREF
+  // If Key frame reset mv ref id probabilities to defaults
+  if (pc->frame_type == KEY_FRAME) {
+    // Defaults probabilities for encoding the MV ref id signal
+    vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
+               sizeof(xd->mb_mv_ref_probs));
+  } else {
+    // Read any mv_ref index probability updates
+    int i, j;
+
+    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+      // Skip the dummy entry for intra ref frame.
+      if (i == INTRA_FRAME) {
+        continue;
+      }
+
+      // Read any updates to probabilities
+      for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
+        if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) {
+          xd->mb_mv_ref_probs[i][j] =
+            (vp9_prob)vp9_read_literal(&header_bc, 8);
+        }
+      }
+    }
+  }
+#endif
+
   if (0) {
     FILE *z = fopen("decodestats.stt", "a");
     fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
@@ -1589,10 +1745,10 @@
     fclose(z);
   }
 
-  vp9_copy(pbi->common.fc.pre_coef_probs,
-           pbi->common.fc.coef_probs);
-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs,
-           pbi->common.fc.hybrid_coef_probs);
+  vp9_copy(pbi->common.fc.pre_coef_probs_4x4,
+           pbi->common.fc.coef_probs_4x4);
+  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_4x4,
+           pbi->common.fc.hybrid_coef_probs_4x4);
   vp9_copy(pbi->common.fc.pre_coef_probs_8x8,
            pbi->common.fc.coef_probs_8x8);
   vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,
@@ -1601,10 +1757,10 @@
            pbi->common.fc.coef_probs_16x16);
   vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,
            pbi->common.fc.hybrid_coef_probs_16x16);
+  vp9_copy(pbi->common.fc.pre_coef_probs_32x32,
+           pbi->common.fc.coef_probs_32x32);
   vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);
-#if CONFIG_SUPERBLOCKS
   vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob);
-#endif
   vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);
   vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);
   vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);
@@ -1614,16 +1770,15 @@
   pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob;
 #endif
   pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;
-  vp9_zero(pbi->common.fc.coef_counts);
-  vp9_zero(pbi->common.fc.hybrid_coef_counts);
+  vp9_zero(pbi->common.fc.coef_counts_4x4);
+  vp9_zero(pbi->common.fc.hybrid_coef_counts_4x4);
   vp9_zero(pbi->common.fc.coef_counts_8x8);
   vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);
   vp9_zero(pbi->common.fc.coef_counts_16x16);
   vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);
+  vp9_zero(pbi->common.fc.coef_counts_32x32);
   vp9_zero(pbi->common.fc.ymode_counts);
-#if CONFIG_SUPERBLOCKS
   vp9_zero(pbi->common.fc.sb_ymode_counts);
-#endif
   vp9_zero(pbi->common.fc.uv_mode_counts);
   vp9_zero(pbi->common.fc.bmode_counts);
   vp9_zero(pbi->common.fc.i8x8_mode_counts);
@@ -1662,12 +1817,8 @@
 
   vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
 
-  // Resset the macroblock mode info context to the start of the list
-  xd->mode_info_context = pc->mi;
-  xd->prev_mode_info_context = pc->prev_mi;
-
   /* Decode a row of superblocks */
-  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 2) {
+  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) {
     decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);
   }
   corrupt_tokens |= xd->corrupted;
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodframe.h
@@ -16,4 +16,4 @@
 
 extern void vp9_init_de_quantizer(struct VP9Decompressor *pbi);
 
-#endif  // __INC_DECODFRAME_H
+#endif  // VP9_DECODER_VP9_DECODFRAME_H_
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -13,6 +13,7 @@
 #include "vp9/decoder/vp9_dequantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/common/vp9_common.h"
 static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
                          uint8_t *dest, int stride, int width, int height) {
   int r, c;
@@ -19,14 +20,7 @@
 
   for (r = 0; r < height; r++) {
     for (c = 0; c < width; c++) {
-      int a = diff[c] + pred[c];
-
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-
-      dest[c] = (uint8_t) a;
+      dest[c] = clip_pixel(diff[c] + pred[c]);
     }
 
     dest += stride;
@@ -42,14 +36,7 @@
 
   for (r = 0; r < height; r++) {
     for (c = 0; c < width; c++) {
-      int a = diff + pred[c];
-
-      if (a < 0)
-        a = 0;
-      else if (a > 255)
-        a = 255;
-
-      dest[c] = (uint8_t) a;
+      dest[c] = clip_pixel(diff + pred[c]);
     }
 
     dest += stride;
@@ -204,7 +191,7 @@
 
 void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
                                 uint8_t *pred, uint8_t *dest, int pitch,
-                                int stride, int dc, uint16_t eobs) {
+                                int stride, int dc, int eob) {
   int16_t output[64];
   int16_t *diff_ptr = output;
   int i;
@@ -220,10 +207,10 @@
    * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
    * Combine that with code here.
    */
-  if (eobs == 0) {
+  if (eob == 0) {
     /* All 0 DCT coefficient */
     vp9_copy_mem8x8(pred, pitch, dest, stride);
-  } else if (eobs == 1) {
+  } else if (eob == 1) {
     /* DC only DCT coefficient. */
     int16_t out;
 
@@ -236,7 +223,7 @@
     input[0] = 0;
 
     add_constant_residual(out, pred, pitch, dest, stride, 8, 8);
-  } else if (eobs <= 10) {
+  } else if (eob <= 10) {
     input[1] = input[1] * dq[1];
     input[2] = input[2] * dq[1];
     input[3] = input[3] * dq[1];
@@ -301,7 +288,7 @@
 
 void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
                                   uint8_t *pred, uint8_t *dest, int pitch,
-                                  int stride, uint16_t eobs) {
+                                  int stride, int eob) {
   int16_t output[256];
   int16_t *diff_ptr = output;
   int i;
@@ -308,10 +295,10 @@
 
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
-  if (eobs == 0) {
+  if (eob == 0) {
     /* All 0 DCT coefficient */
     vp9_copy_mem16x16(pred, pitch, dest, stride);
-  } else if (eobs == 1) {
+  } else if (eob == 1) {
     /* DC only DCT coefficient. */
     int16_t out;
 
@@ -324,7 +311,7 @@
     input[0] = 0;
 
     add_constant_residual(out, pred, pitch, dest, stride, 16, 16);
-  } else if (eobs <= 10) {
+  } else if (eob <= 10) {
     input[0]= input[0] * dq[0];
     input[1] = input[1] * dq[1];
     input[2] = input[2] * dq[1];
@@ -359,4 +346,29 @@
 
     add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);
   }
+}
+
+void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
+                                  uint8_t *pred, uint8_t *dest, int pitch,
+                                  int stride, int eob) {
+  int16_t output[1024];
+  int i;
+
+  input[0]= input[0] * dq[0] / 2;
+  for (i = 1; i < 1024; i++)
+    input[i] = input[i] * dq[1] / 2;
+  vp9_short_idct32x32_c(input, output, 64);
+  vpx_memset(input, 0, 2048);
+
+  add_residual(output, pred, pitch, dest, stride, 32, 32);
+}
+
+void vp9_dequant_idct_add_uv_block_16x16_c(int16_t *q, const int16_t *dq,
+                                           uint8_t *dstu,
+                                           uint8_t *dstv,
+                                           int stride,
+                                           uint16_t *eobs) {
+  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]);
+  vp9_dequant_idct_add_16x16_c(q + 256, dq,
+                               dstv, dstv, stride, stride, eobs[4]);
 }
--- a/vp9/decoder/vp9_dequantize.h
+++ b/vp9/decoder/vp9_dequantize.h
@@ -14,90 +14,88 @@
 #include "vp9/common/vp9_blockd.h"
 
 #if CONFIG_LOSSLESS
-extern void vp9_dequant_idct_add_lossless_c(short *input, const short *dq,
+extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
                                             unsigned char *pred,
                                             unsigned char *output,
                                             int pitch, int stride);
-extern void vp9_dequant_dc_idct_add_lossless_c(short *input, const short *dq,
+extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
                                                unsigned char *pred,
                                                unsigned char *output,
                                                int pitch, int stride, int dc);
-extern void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q,
-                                                       const short *dq,
+extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,
+                                                       const int16_t *dq,
                                                        unsigned char *pre,
                                                        unsigned char *dst,
                                                        int stride,
-                                                       unsigned short *eobs,
-                                                       const short *dc);
-extern void vp9_dequant_idct_add_y_block_lossless_c(short *q, const short *dq,
+                                                       uint16_t *eobs,
+                                                       const int16_t *dc);
+extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
                                                     unsigned char *pre,
                                                     unsigned char *dst,
                                                     int stride,
-                                                    unsigned short *eobs);
-extern void vp9_dequant_idct_add_uv_block_lossless_c(short *q, const short *dq,
+                                                    uint16_t *eobs);
+extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
                                                      unsigned char *pre,
                                                      unsigned char *dst_u,
                                                      unsigned char *dst_v,
                                                      int stride,
-                                                     unsigned short *eobs);
+                                                     uint16_t *eobs);
 #endif
 
-typedef void (*vp9_dequant_idct_add_fn_t)(short *input, const short *dq,
+typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,
     unsigned char *pred, unsigned char *output, int pitch, int stride);
-typedef void(*vp9_dequant_dc_idct_add_fn_t)(short *input, const short *dq,
+typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,
     unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);
 
-typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(short *q, const short *dq,
-    unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs,
-    const short *dc);
-typedef void(*vp9_dequant_idct_add_y_block_fn_t)(short *q, const short *dq,
-    unsigned char *pre, unsigned char *dst, int stride, unsigned short *eobs);
-typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(short *q, const short *dq,
+typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
+    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,
+    const int16_t *dc);
+typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,
+    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);
+typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,
     unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,
-    unsigned short *eobs);
+    uint16_t *eobs);
 
-void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, short *input, const short *dq,
+void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,
                                     unsigned char *pred, unsigned char *dest,
                                     int pitch, int stride, uint16_t eobs);
 
-void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input,
-                                   const short *dq, unsigned char *pred,
+void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
+                                   const int16_t *dq, unsigned char *pred,
                                    unsigned char *dest, int pitch, int stride,
                                    uint16_t eobs);
 
-void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, short *input,
-                                     const short *dq, unsigned char *pred,
+void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
+                                     const int16_t *dq, unsigned char *pred,
                                      unsigned char *dest,
                                      int pitch, int stride, uint16_t eobs);
 
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, const short *dq,
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
                                                    unsigned char *dst,
                                                    int stride,
-                                                   unsigned short *eobs,
-                                                   const short *dc,
+                                                   uint16_t *eobs,
+                                                   const int16_t *dc,
                                                    MACROBLOCKD *xd);
 
-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, const short *dq,
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
                                                    unsigned char *dst,
                                                    int stride,
-                                                   unsigned short *eobs,
-                                                   const short *dc,
+                                                   uint16_t *eobs,
+                                                   const int16_t *dc,
                                                    MACROBLOCKD *xd);
 
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, const short *dq,
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
                                                  unsigned char *dstu,
                                                  unsigned char *dstv,
                                                  int stride,
-                                                 unsigned short *eobs,
+                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd);
 
-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, const short *dq,
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
                                                  unsigned char *dstu,
                                                  unsigned char *dstv,
                                                  int stride,
-                                                 unsigned short *eobs,
+                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd);
-#endif
 
 #endif
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -9,13 +9,11 @@
  */
 
 
-#include "vp9/common/vp9_type_aliases.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vp9/decoder/vp9_detokenize.h"
-
 #include "vp9/common/vp9_seg_common.h"
 
 #define EOB_CONTEXT_NODE            0
@@ -55,59 +53,38 @@
 #define CAT5_PROB3 157
 #define CAT5_PROB4 180
 
-static const unsigned char cat6_prob[14] =
-{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
+static const vp9_prob cat6_prob[15] = {
+  254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
+};
 
-void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {
-  /* Clear entropy contexts */
-  if ((xd->mode_info_context->mbmi.mode != B_PRED &&
-       xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-       xd->mode_info_context->mbmi.mode != SPLITMV)
-      || xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  } else {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    xd->above_context->y2 = 1;
-    xd->left_context->y2 = 1;
-  }
-}
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
 
-DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
-
 static int get_signed(BOOL_DECODER *br, int value_to_sign) {
-  const int split = (br->range + 1) >> 1;
-  const VP9_BD_VALUE bigsplit = (VP9_BD_VALUE)split << (VP9_BD_VALUE_SIZE - 8);
-  int v;
-
-  if (br->count < 0)
-    vp9_bool_decoder_fill(br);
-
-  if (br->value < bigsplit) {
-    br->range = split;
-    v = value_to_sign;
-  } else {
-    br->range = br->range - split;
-    br->value = br->value - bigsplit;
-    v = -value_to_sign;
-  }
-  br->range += br->range;
-  br->value += br->value;
-  --br->count;
-
-  return v;
+  return decode_bool(br, 128) ? -value_to_sign : value_to_sign;
 }
 
+#if CONFIG_NEWCOEFCONTEXT
+#define PT pn
+#define INCREMENT_COUNT(token)                       \
+  do {                                               \
+    coef_counts[type][coef_bands[c]][pn][token]++;   \
+    pn = pt = vp9_prev_token_class[token];           \
+    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(coef_bands[c + 1]))  \
+      pn = vp9_get_coef_neighbor_context(            \
+          qcoeff_ptr, nodc, neighbors, scan[c + 1]); \
+  } while (0)
+#else
+#define PT pt
 #define INCREMENT_COUNT(token)               \
   do {                                       \
-    coef_counts[coef_bands[c]][pt][token]++; \
-    pt = vp9_prev_token_class[token];        \
+    coef_counts[type][coef_bands[c]][pt][token]++; \
+    pt = vp9_prev_token_class[token];              \
   } while (0)
+#endif  /* CONFIG_NEWCOEFCONTEXT */
 
 #define WRITE_COEF_CONTINUE(val, token)                       \
   {                                                           \
-    qcoeff_ptr[scan[c]] = (INT16) get_signed(br, val);        \
+    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);        \
     INCREMENT_COUNT(token);                                   \
     c++;                                                      \
     continue;                                                 \
@@ -116,7 +93,7 @@
 #define ADJUST_COEF(prob, bits_count)  \
   do {                                 \
     if (vp9_read(br, prob))            \
-      val += (UINT16)(1 << bits_count);\
+      val += (uint16_t)(1 << bits_count);\
   } while (0);
 
 static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
@@ -124,51 +101,65 @@
                         ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                         PLANE_TYPE type,
                         TX_TYPE tx_type,
-                        int seg_eob, INT16 *qcoeff_ptr,
+                        int seg_eob, int16_t *qcoeff_ptr,
                         const int *const scan, TX_SIZE txfm_size,
                         const int *coef_bands) {
   FRAME_CONTEXT *const fc = &dx->common.fc;
-  int pt, c = (type == PLANE_TYPE_Y_NO_DC);
-  vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][ENTROPY_NODES], *prob;
-  unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+#if CONFIG_NEWCOEFCONTEXT
+  const int *neighbors;
+  int pn;
+#endif
+  int nodc = (type == PLANE_TYPE_Y_NO_DC);
+  int pt, c = nodc;
+  vp9_coeff_probs *coef_probs;
+  vp9_prob *prob;
+  vp9_coeff_count *coef_counts;
 
   switch (txfm_size) {
     default:
     case TX_4X4:
       if (tx_type == DCT_DCT) {
-        coef_probs  = fc->coef_probs[type];
-        coef_counts = fc->coef_counts[type];
+        coef_probs  = fc->coef_probs_4x4;
+        coef_counts = fc->coef_counts_4x4;
       } else {
-        coef_probs  = fc->hybrid_coef_probs[type];
-        coef_counts = fc->hybrid_coef_counts[type];
+        coef_probs  = fc->hybrid_coef_probs_4x4;
+        coef_counts = fc->hybrid_coef_counts_4x4;
       }
       break;
     case TX_8X8:
       if (tx_type == DCT_DCT) {
-        coef_probs  = fc->coef_probs_8x8[type];
-        coef_counts = fc->coef_counts_8x8[type];
+        coef_probs  = fc->coef_probs_8x8;
+        coef_counts = fc->coef_counts_8x8;
       } else {
-        coef_probs  = fc->hybrid_coef_probs_8x8[type];
-        coef_counts = fc->hybrid_coef_counts_8x8[type];
+        coef_probs  = fc->hybrid_coef_probs_8x8;
+        coef_counts = fc->hybrid_coef_counts_8x8;
       }
       break;
     case TX_16X16:
       if (tx_type == DCT_DCT) {
-        coef_probs  = fc->coef_probs_16x16[type];
-        coef_counts = fc->coef_counts_16x16[type];
+        coef_probs  = fc->coef_probs_16x16;
+        coef_counts = fc->coef_counts_16x16;
       } else {
-        coef_probs  = fc->hybrid_coef_probs_16x16[type];
-        coef_counts = fc->hybrid_coef_counts_16x16[type];
+        coef_probs  = fc->hybrid_coef_probs_16x16;
+        coef_counts = fc->hybrid_coef_counts_16x16;
       }
       break;
+    case TX_32X32:
+      coef_probs = fc->coef_probs_32x32;
+      coef_counts = fc->coef_counts_32x32;
+      break;
   }
 
   VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+#if CONFIG_NEWCOEFCONTEXT
+  pn = pt;
+  neighbors = vp9_get_coef_neighbors_handle(scan);
+#endif
   while (1) {
     int val;
     const uint8_t *cat6 = cat6_prob;
     if (c >= seg_eob) break;
-    prob = coef_probs[coef_bands[c]][pt];
+    prob = coef_probs[type][coef_bands[c]][PT];
     if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))
       break;
 SKIP_START:
@@ -176,7 +167,7 @@
     if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       ++c;
-      prob = coef_probs[coef_bands[c]][pt];
+      prob = coef_probs[type][coef_bands[c]][PT];
       goto SKIP_START;
     }
     // ONE_CONTEXT_NODE_0_
@@ -240,7 +231,7 @@
   }
 
   if (c < seg_eob)
-    coef_counts[coef_bands[c]][pt][DCT_EOB_TOKEN]++;
+    coef_counts[type][coef_bands[c]][PT][DCT_EOB_TOKEN]++;
 
   a[0] = l[0] = (c > !type);
 
@@ -256,38 +247,120 @@
   return eob;
 }
 
+int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
+                         MACROBLOCKD* const xd,
+                         BOOL_DECODER* const bc) {
+  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
+  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
+  ENTROPY_CONTEXT* const A1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]);
+  ENTROPY_CONTEXT* const L1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]);
+  uint16_t *const eobs = xd->eobs;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int c, i, eobtotal = 0, seg_eob;
 
+  // Luma block
+#if CONFIG_CNVCONTEXT
+  ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3] +
+                              A1[0] + A1[1] + A1[2] + A1[3]) != 0;
+  ENTROPY_CONTEXT left_ec =  (L[0] + L[1] + L[2] + L[3] +
+                              L1[0] + L1[1] + L1[2] + L1[3]) != 0;
+#else
+  ENTROPY_CONTEXT above_ec = A[0];
+  ENTROPY_CONTEXT left_ec =  L[0];
+#endif
+  eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,
+                             PLANE_TYPE_Y_WITH_DC,
+                             DCT_DCT, get_eob(xd, segment_id, 1024),
+                             xd->sb_coeff_data.qcoeff,
+                             vp9_default_zig_zag1d_32x32,
+                             TX_32X32, vp9_coef_bands_32x32);
+  A[1] = A[2] = A[3] = A[0] = above_ec;
+  L[1] = L[2] = L[3] = L[0] = left_ec;
+  A1[1] = A1[2] = A1[3] = A1[0] = above_ec;
+  L1[1] = L1[2] = L1[3] = L1[0] = left_ec;
+
+  eobtotal += c;
+
+  // 16x16 chroma blocks
+  seg_eob = get_eob(xd, segment_id, 256);
+
+  for (i = 16; i < 24; i += 4) {
+    ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_16X16][i];
+    ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_16X16][i];
+    ENTROPY_CONTEXT* const a1 = A1 + vp9_block2above[TX_16X16][i];
+    ENTROPY_CONTEXT* const l1 = L1 + vp9_block2left[TX_16X16][i];
+#if CONFIG_CNVCONTEXT
+    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
+    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
+#else
+    above_ec = a[0];
+    left_ec = l[0];
+#endif
+
+    eobs[i] = c = decode_coefs(pbi, xd, bc,
+                               &above_ec, &left_ec,
+                               PLANE_TYPE_UV,
+                               DCT_DCT, seg_eob,
+                               xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
+                               vp9_default_zig_zag1d_16x16,
+                               TX_16X16, vp9_coef_bands_16x16);
+
+    a1[1] = a1[0] = a[1] = a[0] = above_ec;
+    l1[1] = l1[0] = l[1] = l[0] = left_ec;
+    eobtotal += c;
+  }
+  // no Y2 block
+  A[8] = L[8] = A1[8] = L1[8] = 0;
+  return eobtotal;
+}
+
 static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
                                       MACROBLOCKD* const xd,
                                       BOOL_DECODER* const bc) {
   ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
   ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
-  unsigned short* const eobs = xd->eobs;
+  uint16_t *const eobs = xd->eobs;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
   int c, i, eobtotal = 0, seg_eob;
-
   // Luma block
-  eobs[0] = c = decode_coefs(pbi, xd, bc, A, L, PLANE_TYPE_Y_WITH_DC,
+
+#if CONFIG_CNVCONTEXT
+  ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;
+  ENTROPY_CONTEXT left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;
+#else
+  ENTROPY_CONTEXT above_ec = A[0];
+  ENTROPY_CONTEXT left_ec = L[0];
+#endif
+  eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,
+                             PLANE_TYPE_Y_WITH_DC,
                              get_tx_type(xd, &xd->block[0]),
                              get_eob(xd, segment_id, 256),
                              xd->qcoeff, vp9_default_zig_zag1d_16x16,
                              TX_16X16, vp9_coef_bands_16x16);
-  A[1] = A[2] = A[3] = A[0];
-  L[1] = L[2] = L[3] = L[0];
+  A[1] = A[2] = A[3] = A[0] = above_ec;
+  L[1] = L[2] = L[3] = L[0] = left_ec;
   eobtotal += c;
 
   // 8x8 chroma blocks
   seg_eob = get_eob(xd, segment_id, 64);
   for (i = 16; i < 24; i += 4) {
-    ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
-    ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
-
-    eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
+    ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_8X8][i];
+    ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_8X8][i];
+#if CONFIG_CNVCONTEXT
+    above_ec = (a[0] + a[1]) != 0;
+    left_ec = (l[0] + l[1]) != 0;
+#else
+    above_ec = a[0];
+    left_ec = l[0];
+#endif
+    eobs[i] = c = decode_coefs(pbi, xd, bc,
+                               &above_ec, &left_ec,
+                               PLANE_TYPE_UV,
                                DCT_DCT, seg_eob, xd->block[i].qcoeff,
                                vp9_default_zig_zag1d_8x8,
                                TX_8X8, vp9_coef_bands_8x8);
-    a[1] = a[0];
-    l[1] = l[0];
+    a[1] = a[0] = above_ec;
+    l[1] = l[0] = left_ec;
     eobtotal += c;
   }
   A[8] = 0;
@@ -300,7 +373,7 @@
                                     BOOL_DECODER* const bc) {
   ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
   ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-  unsigned short *const eobs = xd->eobs;
+  uint16_t *const eobs = xd->eobs;
   PLANE_TYPE type;
   int c, i, eobtotal = 0, seg_eob;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
@@ -308,18 +381,19 @@
   int has_2nd_order = get_2nd_order_usage(xd);
   // 2nd order DC block
   if (has_2nd_order) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
+    ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][24];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][24];
 
     eobs[24] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_Y2,
                                 DCT_DCT, get_eob(xd, segment_id, 4),
                                 xd->block[24].qcoeff,
-                                vp9_default_zig_zag1d, TX_8X8, vp9_coef_bands);
+                                vp9_default_zig_zag1d_4x4, TX_8X8,
+                                vp9_coef_bands_4x4);
     eobtotal += c - 4;
     type = PLANE_TYPE_Y_NO_DC;
   } else {
-    xd->above_context->y2 = 1;
-    xd->left_context->y2 = 1;
+    xd->above_context->y2 = 0;
+    xd->left_context->y2 = 0;
     eobs[24] = 0;
     type = PLANE_TYPE_Y_WITH_DC;
   }
@@ -327,17 +401,23 @@
   // luma blocks
   seg_eob = get_eob(xd, segment_id, 64);
   for (i = 0; i < 16; i += 4) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
-
-    eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, type,
+    ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];
+#if CONFIG_CNVCONTEXT
+    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
+    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
+#else
+    ENTROPY_CONTEXT above_ec = a[0];
+    ENTROPY_CONTEXT left_ec = l[0];
+#endif
+    eobs[i] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec, type,
                                type == PLANE_TYPE_Y_WITH_DC ?
                                get_tx_type(xd, xd->block + i) : DCT_DCT,
                                seg_eob, xd->block[i].qcoeff,
                                vp9_default_zig_zag1d_8x8,
                                TX_8X8, vp9_coef_bands_8x8);
-    a[1] = a[0];
-    l[1] = l[0];
+    a[1] = a[0] = above_ec;
+    l[1] = l[0] = left_ec;
     eobtotal += c;
   }
 
@@ -347,25 +427,34 @@
     // use 4x4 transform for U, V components in I8X8/splitmv prediction mode
     seg_eob = get_eob(xd, segment_id, 16);
     for (i = 16; i < 24; i++) {
-      ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
-      ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
+      ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];
+      ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];
 
       eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
                                  DCT_DCT, seg_eob, xd->block[i].qcoeff,
-                                 vp9_default_zig_zag1d, TX_4X4, vp9_coef_bands);
+                                 vp9_default_zig_zag1d_4x4, TX_4X4,
+                                 vp9_coef_bands_4x4);
       eobtotal += c;
     }
   } else {
     for (i = 16; i < 24; i += 4) {
-      ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
-      ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
-
-      eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
+      ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];
+      ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];
+#if CONFIG_CNVCONTEXT
+      ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
+      ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
+#else
+      ENTROPY_CONTEXT above_ec = a[0];
+      ENTROPY_CONTEXT left_ec = l[0];
+#endif
+      eobs[i] = c = decode_coefs(pbi, xd, bc,
+                                 &above_ec, &left_ec,
+                                 PLANE_TYPE_UV,
                                  DCT_DCT, seg_eob, xd->block[i].qcoeff,
                                  vp9_default_zig_zag1d_8x8,
                                  TX_8X8, vp9_coef_bands_8x8);
-      a[1] = a[0];
-      l[1] = l[0];
+      a[1] = a[0] = above_ec;
+      l[1] = l[0] = left_ec;
       eobtotal += c;
     }
   }
@@ -373,75 +462,109 @@
   return eobtotal;
 }
 
-int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                         BOOL_DECODER* const bc,
-                         PLANE_TYPE type, int i) {
+static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
+                            BOOL_DECODER* const bc,
+                            PLANE_TYPE type, int i, int seg_eob,
+                            TX_TYPE tx_type, const int *scan) {
   ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
   ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-  ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
-  ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-  const int *scan = vp9_default_zig_zag1d;
-  unsigned short *const eobs = xd->eobs;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
-  int c, seg_eob = get_eob(xd, segment_id, 16);
-  TX_TYPE tx_type = DCT_DCT;
+  ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];
+  ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];
+  uint16_t *const eobs = xd->eobs;
+  int c;
 
-  if (type == PLANE_TYPE_Y_WITH_DC)
-    tx_type = get_tx_type_4x4(xd, &xd->block[i]);
+  c = decode_coefs(dx, xd, bc, a, l, type, tx_type, seg_eob,
+                   xd->block[i].qcoeff, scan, TX_4X4, vp9_coef_bands_4x4);
+  eobs[i] = c;
+
+  return c;
+}
+
+static int decode_coefs_4x4_y(VP9D_COMP *dx, MACROBLOCKD *xd,
+                              BOOL_DECODER* const bc,
+                              PLANE_TYPE type, int i, int seg_eob) {
+  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, &xd->block[i]) : DCT_DCT;
+  const int *scan;
+
   switch (tx_type) {
-    case ADST_DCT :
-      scan = vp9_row_scan;
+    case ADST_DCT:
+      scan = vp9_row_scan_4x4;
       break;
-
-    case DCT_ADST :
-      scan = vp9_col_scan;
+    case DCT_ADST:
+      scan = vp9_col_scan_4x4;
       break;
-
-    default :
-      scan = vp9_default_zig_zag1d;
+    default:
+      scan = vp9_default_zig_zag1d_4x4;
       break;
   }
-  eobs[i] = c = decode_coefs(dx, xd, bc, a, l, type,
-                             tx_type, seg_eob, qcoeff_ptr + i * 16,
-                             scan, TX_4X4, vp9_coef_bands);
-  return c;
+
+  return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob, tx_type, scan);
 }
 
-int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
-                                MACROBLOCKD* const xd,
-                                BOOL_DECODER* const bc) {
+int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
+                         BOOL_DECODER* const bc,
+                         PLANE_TYPE type, int i) {
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int seg_eob = get_eob(xd, segment_id, 16);
+
+  return decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);
+}
+
+static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
+                                   MACROBLOCKD* const xd,
+                                   BOOL_DECODER* const bc,
+                                   int seg_eob) {
   int eobtotal = 0, i;
 
-  for (i = 16; i < 24; i++)
-    eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i);
+  // chroma blocks
+  for (i = 16; i < 24; i++) {
+    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob,
+                                 DCT_DCT, vp9_default_zig_zag1d_4x4);
+  }
 
   return eobtotal;
 }
 
+int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
+                                MACROBLOCKD* const xd,
+                                BOOL_DECODER* const bc) {
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int seg_eob = get_eob(xd, segment_id, 16);
+
+  return decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
+}
+
 static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
                                     MACROBLOCKD* const xd,
                                     BOOL_DECODER* const bc) {
   int i, eobtotal = 0;
   PLANE_TYPE type;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int seg_eob = get_eob(xd, segment_id, 16);
+  const int has_2nd_order = get_2nd_order_usage(xd);
 
-  int has_2nd_order = get_2nd_order_usage(xd);
-
+  // 2nd order DC block
   if (has_2nd_order) {
-    eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24) - 16;
+    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24, seg_eob,
+                                 DCT_DCT, vp9_default_zig_zag1d_4x4) - 16;
     type = PLANE_TYPE_Y_NO_DC;
   } else {
-    xd->above_context->y2 = 1;
-    xd->left_context->y2 = 1;
+    xd->above_context->y2 = 0;
+    xd->left_context->y2 = 0;
     xd->eobs[24] = 0;
     type = PLANE_TYPE_Y_WITH_DC;
   }
 
+  // luma blocks
   for (i = 0; i < 16; ++i) {
-    eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, type, i);
+    eobtotal += decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);
   }
 
-  return eobtotal + vp9_decode_mb_tokens_4x4_uv(dx, xd, bc);
+  // chroma blocks
+  eobtotal += decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);
+
+  return eobtotal;
 }
 
 int vp9_decode_mb_tokens(VP9D_COMP* const dx,
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -23,7 +23,11 @@
 int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
                          BOOL_DECODER* const);
 
+int vp9_decode_sb_tokens(VP9D_COMP* const pbi,
+                         MACROBLOCKD* const xd,
+                         BOOL_DECODER* const bc);
+
 int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
                                 BOOL_DECODER* const bc);
 
-#endif /* DETOKENIZE_H */
+#endif  // VP9_DECODER_VP9_DETOKENIZE_H_
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -14,11 +14,11 @@
 #include "vp9/decoder/vp9_dequantize.h"
 #endif
 
-void vp9_dequant_dc_idct_add_y_block_c(short *q, const short *dq,
-                                       unsigned char *pre,
-                                       unsigned char *dst,
-                                       int stride, unsigned short *eobs,
-                                       const short *dc) {
+void vp9_dequant_dc_idct_add_y_block_c(int16_t *q, const int16_t *dq,
+                                       uint8_t *pre,
+                                       uint8_t *dst,
+                                       int stride, uint16_t *eobs,
+                                       const int16_t *dc) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
@@ -39,12 +39,12 @@
   }
 }
 
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(short *q, const short *dq,
-                                                   unsigned char *dst,
+void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,
+                                                   const int16_t *dq,
+                                                   uint8_t *dst,
                                                    int stride,
-                                                   unsigned short *eobs,
-                                                   const short *dc,
+                                                   uint16_t *eobs,
+                                                   const int16_t *dc,
                                                    MACROBLOCKD *xd) {
   int i, j;
 
@@ -63,12 +63,11 @@
     dst += 4 * stride - 16;
   }
 }
-#endif
 
-void vp9_dequant_idct_add_y_block_c(short *q, const short *dq,
-                                    unsigned char *pre,
-                                    unsigned char *dst,
-                                    int stride, unsigned short *eobs) {
+void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,
+                                    uint8_t *pre,
+                                    uint8_t *dst,
+                                    int stride, uint16_t *eobs) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
@@ -90,10 +89,10 @@
   }
 }
 
-void vp9_dequant_idct_add_uv_block_c(short *q, const short *dq,
-                                     unsigned char *pre, unsigned char *dstu,
-                                     unsigned char *dstv, int stride,
-                                     unsigned short *eobs) {
+void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,
+                                     uint8_t *pre, uint8_t *dstu,
+                                     uint8_t *dstv, int stride,
+                                     uint16_t *eobs) {
   int i, j;
 
   for (i = 0; i < 2; i++) {
@@ -133,12 +132,11 @@
   }
 }
 
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_idct_add_uv_block_4x4_inplace_c(short *q, const short *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
+void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,
+                                                 uint8_t *dstu,
+                                                 uint8_t *dstv,
                                                  int stride,
-                                                 unsigned short *eobs,
+                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd) {
   int i, j;
 
@@ -174,13 +172,12 @@
     dstv += 4 * stride - 8;
   }
 }
-#endif
 
-void vp9_dequant_dc_idct_add_y_block_8x8_c(short *q, const short *dq,
-                                           unsigned char *pre,
-                                           unsigned char *dst,
-                                           int stride, unsigned short *eobs,
-                                           const short *dc,
+void vp9_dequant_dc_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
+                                           uint8_t *pre,
+                                           uint8_t *dst,
+                                           int stride, uint16_t *eobs,
+                                           const int16_t *dc,
                                            MACROBLOCKD *xd) {
   q[0] = dc[0];
   vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);
@@ -199,12 +196,12 @@
                                 xd->eobs[12]);
 }
 
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, const short *dq,
-                                                   unsigned char *dst,
+void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q,
+                                                   const int16_t *dq,
+                                                   uint8_t *dst,
                                                    int stride,
-                                                   unsigned short *eobs,
-                                                   const short *dc,
+                                                   uint16_t *eobs,
+                                                   const int16_t *dc,
                                                    MACROBLOCKD *xd) {
   q[0] = dc[0];
   vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);
@@ -223,15 +220,14 @@
                                 dst + 8 * stride + 8, stride, stride, 1,
                                 xd->eobs[12]);
 }
-#endif
 
-void vp9_dequant_idct_add_y_block_8x8_c(short *q, const short *dq,
-                                        unsigned char *pre,
-                                        unsigned char *dst,
-                                        int stride, unsigned short *eobs,
+void vp9_dequant_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,
+                                        uint8_t *pre,
+                                        uint8_t *dst,
+                                        int stride, uint16_t *eobs,
                                         MACROBLOCKD *xd) {
-  unsigned char *origdest = dst;
-  unsigned char *origpred = pre;
+  uint8_t *origdest = dst;
+  uint8_t *origpred = pre;
 
   vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);
   vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,
@@ -243,11 +239,11 @@
                              xd->eobs[12]);
 }
 
-void vp9_dequant_idct_add_uv_block_8x8_c(short *q, const short *dq,
-                                         unsigned char *pre,
-                                         unsigned char *dstu,
-                                         unsigned char *dstv,
-                                         int stride, unsigned short *eobs,
+void vp9_dequant_idct_add_uv_block_8x8_c(int16_t *q, const int16_t *dq,
+                                         uint8_t *pre,
+                                         uint8_t *dstu,
+                                         uint8_t *dstv,
+                                         int stride, uint16_t *eobs,
                                          MACROBLOCKD *xd) {
   vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);
 
@@ -257,12 +253,11 @@
   vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);
 }
 
-#if CONFIG_SUPERBLOCKS
-void vp9_dequant_idct_add_uv_block_8x8_inplace_c(short *q, const short *dq,
-                                                 unsigned char *dstu,
-                                                 unsigned char *dstv,
+void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,
+                                                 uint8_t *dstu,
+                                                 uint8_t *dstv,
                                                  int stride,
-                                                 unsigned short *eobs,
+                                                 uint16_t *eobs,
                                                  MACROBLOCKD *xd) {
   vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,
                              xd->eobs[16]);
@@ -271,15 +266,14 @@
   vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,
                              xd->eobs[20]);
 }
-#endif
 
 #if CONFIG_LOSSLESS
-void vp9_dequant_dc_idct_add_y_block_lossless_c(short *q, const short *dq,
-                                                unsigned char *pre,
-                                                unsigned char *dst,
+void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
+                                                uint8_t *pre,
+                                                uint8_t *dst,
                                                 int stride,
-                                                unsigned short *eobs,
-                                                const short *dc) {
+                                                uint16_t *eobs,
+                                                const int16_t *dc) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
@@ -300,10 +294,10 @@
   }
 }
 
-void vp9_dequant_idct_add_y_block_lossless_c(short *q, const short *dq,
-                                             unsigned char *pre,
-                                             unsigned char *dst,
-                                             int stride, unsigned short *eobs) {
+void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,
+                                             uint8_t *pre,
+                                             uint8_t *dst,
+                                             int stride, uint16_t *eobs) {
   int i, j;
 
   for (i = 0; i < 4; i++) {
@@ -325,12 +319,12 @@
   }
 }
 
-void vp9_dequant_idct_add_uv_block_lossless_c(short *q, const short *dq,
-                                              unsigned char *pre,
-                                              unsigned char *dstu,
-                                              unsigned char *dstv,
+void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,
+                                              uint8_t *pre,
+                                              uint8_t *dstu,
+                                              uint8_t *dstv,
                                               int stride,
-                                              unsigned short *eobs) {
+                                              uint16_t *eobs) {
   int i, j;
 
   for (i = 0; i < 2; i++) {
--- /dev/null
+++ b/vp9/decoder/vp9_onyxd.h
@@ -1,0 +1,64 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_ONYXD_H_
+#define VP9_COMMON_VP9_ONYXD_H_
+
+/* Create/destroy static data structures. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+
+  typedef void   *VP9D_PTR;
+  typedef struct {
+    int     Width;
+    int     Height;
+    int     Version;
+    int     postprocess;
+    int     max_threads;
+    int     input_partition;
+  } VP9D_CONFIG;
+  typedef enum {
+    VP9_LAST_FLAG = 1,
+    VP9_GOLD_FLAG = 2,
+    VP9_ALT_FLAG = 4
+  } VP9_REFFRAME;
+
+  void vp9_initialize_dec(void);
+
+  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
+                                  const unsigned char **dest,
+                                  int64_t time_stamp);
+
+  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+                        int64_t *time_stamp, int64_t *time_end_stamp,
+                        vp9_ppflags_t *flags);
+
+  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+
+  void vp9_remove_decompressor(VP9D_PTR comp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP9_COMMON_VP9_ONYXD_H_
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -13,7 +13,7 @@
 #if CONFIG_POSTPROC
 #include "vp9/common/vp9_postproc.h"
 #endif
-#include "vp9/common/vp9_onyxd.h"
+#include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/common/vp9_alloccommon.h"
@@ -23,7 +23,7 @@
 #include <assert.h>
 
 #include "vp9/common/vp9_quant_common.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vp9/decoder/vp9_decodframe.h"
@@ -37,7 +37,7 @@
 #if WRITE_RECON_BUFFER == 1
 static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {
   FILE *yuv_file = fopen((char *)name, "ab");
-  unsigned char *src = s->y_buffer;
+  uint8_t *src = s->y_buffer;
   int h = s->y_height;
 
   do {
@@ -382,7 +382,7 @@
 
     if (cm->filter_level) {
       /* Apply the loop filter if appropriate. */
-      vp9_loop_filter_frame(cm, &pbi->mb);
+      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0);
     }
     vp8_yv12_extend_frame_borders(cm->frame_to_show);
   }
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -8,11 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_DECODER_VP9_ONYXD_INT_H_
 #define VP9_DECODER_VP9_ONYXD_INT_H_
 #include "./vpx_config.h"
-#include "vp9/common/vp9_onyxd.h"
+#include "vp9/decoder/vp9_onyxd.h"
 #include "vp9/decoder/vp9_treereader.h"
 #include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/decoder/vp9_dequantize.h"
@@ -35,22 +34,22 @@
 typedef struct {
   int const *scan;
   int const *scan_8x8;
-  UINT8 const *ptr_block2leftabove;
+  uint8_t const *ptr_block2leftabove;
   vp9_tree_index const *vp9_coef_tree_ptr;
   unsigned char *norm_ptr;
-  UINT8 *ptr_coef_bands_x;
-  UINT8 *ptr_coef_bands_x_8x8;
+  uint8_t *ptr_coef_bands_x;
+  uint8_t *ptr_coef_bands_x_8x8;
 
   ENTROPY_CONTEXT_PLANES *A;
   ENTROPY_CONTEXT_PLANES *L;
 
-  INT16 *qcoeff_start_ptr;
+  int16_t *qcoeff_start_ptr;
 
-  vp9_prob const *coef_probs[BLOCK_TYPES];
+  vp9_prob const *coef_probs_4x4[BLOCK_TYPES_4X4];
   vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];
   vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];
 
-  UINT8 eob[25];
+  uint8_t eob[25];
 
 } DETOK;
 
@@ -103,4 +102,4 @@
   } while(0)
 #endif
 
-#endif  // __INC_ONYXD_INT_H
+#endif  // VP9_DECODER_VP9_TREEREADER_H_
--- a/vp9/decoder/vp9_reconintra_mt.h
+++ /dev/null
@@ -1,15 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_RECONINTRA_MT_H_
-#define VP9_DECODER_VP9_RECONINTRA_MT_H_
-
-#endif
--- a/vp9/decoder/vp9_treereader.h
+++ b/vp9/decoder/vp9_treereader.h
@@ -34,4 +34,4 @@
   return -i;
 }
 
-#endif /* tree_reader_h */
+#endif  // VP9_DECODER_VP9_TREEREADER_H_
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -12,6 +12,7 @@
 #include "vp9/common/vp9_header.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/common/vp9_systemdependent.h"
@@ -30,6 +31,7 @@
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_treecoder.h"
 
 #if defined(SECTIONBITS_OUTPUT)
 unsigned __int64 Sectionbits[500];
@@ -39,30 +41,13 @@
 int intra_mode_stats[VP9_KF_BINTRAMODES]
                     [VP9_KF_BINTRAMODES]
                     [VP9_KF_BINTRAMODES];
-unsigned int tree_update_hist [BLOCK_TYPES]
-                              [COEF_BANDS]
-                              [PREV_COEF_CONTEXTS]
-                              [ENTROPY_NODES][2];
-unsigned int hybrid_tree_update_hist [BLOCK_TYPES]
-                                     [COEF_BANDS]
-                                     [PREV_COEF_CONTEXTS]
-                                     [ENTROPY_NODES][2];
-unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]
-                                  [COEF_BANDS]
-                                  [PREV_COEF_CONTEXTS]
-                                  [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_8x8 [BLOCK_TYPES_8X8]
-                                         [COEF_BANDS]
-                                         [PREV_COEF_CONTEXTS]
-                                         [ENTROPY_NODES] [2];
-unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]
-                                    [COEF_BANDS]
-                                    [PREV_COEF_CONTEXTS]
-                                    [ENTROPY_NODES] [2];
-unsigned int hybrid_tree_update_hist_16x16 [BLOCK_TYPES_16X16]
-                                           [COEF_BANDS]
-                                           [PREV_COEF_CONTEXTS]
-                                           [ENTROPY_NODES] [2];
+vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4];
+vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4];
+vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8];
+vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8];
+vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16];
+vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16];
+vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];
 
 extern unsigned int active_section;
 #endif
@@ -127,11 +112,8 @@
   unsigned int new_b = 0, old_b = 0;
   int i = 0;
 
-  vp9_tree_probs_from_distribution(
-    n--, tok, tree,
-    Pnew, bct, num_events,
-    256, 1
-  );
+  vp9_tree_probs_from_distribution(n--, tok, tree,
+                                   Pnew, bct, num_events);
 
   do {
     new_b += cost_branch(bct[i], Pnew[i]);
@@ -164,26 +146,12 @@
       bc, VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,
       Pnew, cm->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
     );
-#if CONFIG_SUPERBLOCKS
     update_mode(bc, VP9_I32X32_MODES, vp9_sb_ymode_encodings,
                 vp9_sb_ymode_tree, Pnew, cm->fc.sb_ymode_prob, bct,
                 (unsigned int *)cpi->sb_ymode_count);
-#endif
   }
 }
 
-static int get_prob(int num, int den) {
-  int p;
-  if (den <= 0)
-    return 128;
-  p = (num * 255 + (den >> 1)) / den;
-  return clip_prob(p);
-}
-
-static int get_binary_prob(int n0, int n1) {
-  return get_prob(n0, n0 + n1);
-}
-
 void vp9_update_skip_probs(VP9_COMP *cpi) {
   VP9_COMMON *const pc = &cpi->common;
   int k;
@@ -204,7 +172,7 @@
         VP9_SWITCHABLE_FILTERS,
         vp9_switchable_interp_encodings, vp9_switchable_interp_tree,
         pc->fc.switchable_interp_prob[j], branch_ct,
-        cpi->switchable_interp_count[j], 256, 1);
+        cpi->switchable_interp_count[j]);
     for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
       if (pc->fc.switchable_interp_prob[j][i] < 1)
         pc->fc.switchable_interp_prob[j][i] = 1;
@@ -274,13 +242,11 @@
 
   for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
     for (j = 0; j < 4; j++) {
-      int new_prob, count, old_cost, new_cost;
+      int new_prob, old_cost, new_cost;
 
       // Work out cost of coding branches with the old and optimal probability
       old_cost = cost_branch256(mv_ref_ct[i][j], mode_context[i][j]);
-      count = mv_ref_ct[i][j][0] + mv_ref_ct[i][j][1];
-      new_prob = count > 0 ? (255 * mv_ref_ct[i][j][0]) / count : 128;
-      new_prob = (new_prob > 0) ? new_prob : 1;
+      new_prob = get_binary_prob(mv_ref_ct[i][j][0], mv_ref_ct[i][j][1]);
       new_cost = cost_branch256(mv_ref_ct[i][j], new_prob);
 
       // If cost saving is >= 14 bits then update the mode probability.
@@ -292,6 +258,56 @@
     }
   }
 }
+
+#if CONFIG_NEW_MVREF
+static void update_mv_ref_probs(VP9_COMP *cpi,
+                                int mvref_probs[MAX_REF_FRAMES]
+                                               [MAX_MV_REF_CANDIDATES-1]) {
+  MACROBLOCKD *xd = &cpi->mb.e_mbd;
+  int rf;     // Reference frame
+  int ref_c;  // Motion reference candidate
+  int node;   // Probability node index
+
+  for (rf = 0; rf < MAX_REF_FRAMES; ++rf) {
+    int count = 0;
+
+    // Skip the dummy entry for intra ref frame.
+    if (rf == INTRA_FRAME) {
+      continue;
+    }
+
+    // Sum the counts for all candidates
+    for (ref_c = 0; ref_c < MAX_MV_REF_CANDIDATES; ++ref_c) {
+      count += cpi->mb_mv_ref_count[rf][ref_c];
+    }
+
+    // Calculate the tree node probabilities
+    for (node = 0; node < MAX_MV_REF_CANDIDATES-1; ++node) {
+      int new_prob, old_cost, new_cost;
+      unsigned int branch_cnts[2];
+
+      // How many hits on each branch at this node
+      branch_cnts[0] = cpi->mb_mv_ref_count[rf][node];
+      branch_cnts[1] = count - cpi->mb_mv_ref_count[rf][node];
+
+      // Work out cost of coding branches with the old and optimal probability
+      old_cost = cost_branch256(branch_cnts, xd->mb_mv_ref_probs[rf][node]);
+      new_prob = get_prob(branch_cnts[0], count);
+      new_cost = cost_branch256(branch_cnts, new_prob);
+
+      // Take current 0 branch cases out of residual count
+      count -= cpi->mb_mv_ref_count[rf][node];
+
+      if ((new_cost + VP9_MV_REF_UPDATE_COST) <= old_cost) {
+        mvref_probs[rf][node] = new_prob;
+      } else {
+        mvref_probs[rf][node] = xd->mb_mv_ref_probs[rf][node];
+      }
+    }
+  }
+}
+#endif
+
 static void write_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_ymode_tree, p, vp9_ymode_encodings + m);
 }
@@ -300,7 +316,6 @@
   write_token(bc, vp9_kf_ymode_tree, p, vp9_kf_ymode_encodings + m);
 }
 
-#if CONFIG_SUPERBLOCKS
 static void write_sb_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_sb_ymode_tree, p, vp9_sb_ymode_encodings + m);
 }
@@ -308,7 +323,6 @@
 static void sb_kfwrite_ymode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_uv_mode_tree, p, vp9_sb_kf_ymode_encodings + m);
 }
-#endif
 
 static void write_i8x8_mode(vp9_writer *bc, int m, const vp9_prob *p) {
   write_token(bc, vp9_i8x8_mode_tree, p, vp9_i8x8_mode_encodings + m);
@@ -397,11 +411,6 @@
 static void pack_mb_tokens(vp9_writer* const bc,
                            TOKENEXTRA **tp,
                            const TOKENEXTRA *const stop) {
-  unsigned int split;
-  unsigned int shift;
-  int count = bc->count;
-  unsigned int range = bc->range;
-  unsigned int lowvalue = bc->lowvalue;
   TOKENEXTRA *p = *tp;
 
   while (p < stop) {
@@ -427,42 +436,8 @@
 
     do {
       const int bb = (v >> --n) & 1;
-      split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+      encode_bool(bc, bb, pp[i >> 1]);
       i = vp9_coef_tree[i + bb];
-
-      if (bb) {
-        lowvalue += split;
-        range = range - split;
-      } else {
-        range = split;
-      }
-
-      shift = vp9_norm[range];
-      range <<= shift;
-      count += shift;
-
-      if (count >= 0) {
-        int offset = shift - count;
-
-        if ((lowvalue << (offset - 1)) & 0x80000000) {
-          int x = bc->pos - 1;
-
-          while (x >= 0 && bc->buffer[x] == 0xff) {
-            bc->buffer[x] = (unsigned char)0;
-            x--;
-          }
-
-          bc->buffer[x] += 1;
-        }
-
-        bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
-        lowvalue <<= offset;
-        shift = count;
-        lowvalue &= 0xffffff;
-        count -= 8;
-      }
-
-      lowvalue <<= shift;
     } while (n);
 
 
@@ -477,87 +452,16 @@
 
         do {
           const int bb = (v >> --n) & 1;
-          split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+          encode_bool(bc, bb, pp[i >> 1]);
           i = b->tree[i + bb];
-
-          if (bb) {
-            lowvalue += split;
-            range = range - split;
-          } else {
-            range = split;
-          }
-
-          shift = vp9_norm[range];
-          range <<= shift;
-          count += shift;
-
-          if (count >= 0) {
-            int offset = shift - count;
-
-            if ((lowvalue << (offset - 1)) & 0x80000000) {
-              int x = bc->pos - 1;
-
-              while (x >= 0 && bc->buffer[x] == 0xff) {
-                bc->buffer[x] = (unsigned char)0;
-                x--;
-              }
-
-              bc->buffer[x] += 1;
-            }
-
-            bc->buffer[bc->pos++] = (lowvalue >> (24 - offset));
-            lowvalue <<= offset;
-            shift = count;
-            lowvalue &= 0xffffff;
-            count -= 8;
-          }
-
-          lowvalue <<= shift;
         } while (n);
       }
 
-
-      {
-
-        split = (range + 1) >> 1;
-
-        if (e & 1) {
-          lowvalue += split;
-          range = range - split;
-        } else {
-          range = split;
-        }
-
-        range <<= 1;
-
-        if ((lowvalue & 0x80000000)) {
-          int x = bc->pos - 1;
-
-          while (x >= 0 && bc->buffer[x] == 0xff) {
-            bc->buffer[x] = (unsigned char)0;
-            x--;
-          }
-
-          bc->buffer[x] += 1;
-
-        }
-
-        lowvalue  <<= 1;
-
-        if (!++count) {
-          count = -8;
-          bc->buffer[bc->pos++] = (lowvalue >> 24);
-          lowvalue &= 0xffffff;
-        }
-      }
-
+      encode_bool(bc, e & 1, 128);
     }
     ++p;
   }
 
-  bc->count = count;
-  bc->lowvalue = lowvalue;
-  bc->range = range;
   *tp = p;
 }
 
@@ -584,7 +488,6 @@
               vp9_mv_ref_encoding_array - NEARESTMV + m);
 }
 
-#if CONFIG_SUPERBLOCKS
 static void write_sb_mv_ref(vp9_writer *bc, MB_PREDICTION_MODE m,
                             const vp9_prob *p) {
 #if CONFIG_DEBUG
@@ -593,7 +496,6 @@
   write_token(bc, vp9_sb_mv_ref_tree, p,
               vp9_sb_mv_ref_encoding_array - NEARESTMV + m);
 }
-#endif
 
 static void write_sub_mv_ref
 (
@@ -654,19 +556,7 @@
                            const MB_MODE_INFO *mi, const MACROBLOCKD *xd) {
   // Encode the MB segment id.
   int seg_id = mi->segment_id;
-#if CONFIG_SUPERBLOCKS
-  if (mi->encoded_as_sb) {
-    if (xd->mb_to_right_edge >= 0)
-      seg_id = seg_id && xd->mode_info_context[1].mbmi.segment_id;
-    if (xd->mb_to_bottom_edge >= 0) {
-      seg_id = seg_id &&
-               xd->mode_info_context[xd->mode_info_stride].mbmi.segment_id;
-      if (xd->mb_to_right_edge >= 0)
-        seg_id = seg_id &&
-                xd->mode_info_context[xd->mode_info_stride + 1].mbmi.segment_id;
-    }
-  }
-#endif
+
   if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {
     switch (seg_id) {
       case 0:
@@ -795,510 +685,336 @@
   vp9_compute_mod_refprobs(cm);
 }
 
-static void pack_inter_mode_mvs(VP9_COMP *const cpi, vp9_writer *const bc) {
+static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m,
+                                vp9_writer *bc,
+                                int mb_rows_left, int mb_cols_left) {
   VP9_COMMON *const pc = &cpi->common;
   const nmv_context *nmvc = &pc->fc.nmvc;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  MODE_INFO *prev_m;
-  TOKENEXTRA *tok = cpi->tok;
-  TOKENEXTRA *tok_end = tok + cpi->tok_count;
-
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   const int mis = pc->mode_info_stride;
-  int mb_row, mb_col;
-  int row, col;
+  MB_MODE_INFO *const mi = &m->mbmi;
+  const MV_REFERENCE_FRAME rf = mi->ref_frame;
+  const MB_PREDICTION_MODE mode = mi->mode;
+  const int segment_id = mi->segment_id;
+  const int mb_size = 1 << mi->sb_type;
+  int skip_coeff;
 
-  // Values used in prediction model coding
-  vp9_prob pred_prob;
-  unsigned char prediction_flag;
+  int mb_row = pc->mb_rows - mb_rows_left;
+  int mb_col = pc->mb_cols - mb_cols_left;
+  xd->prev_mode_info_context = pc->prev_mi + (m - pc->mi);
+  x->partition_info = x->pi + (m - pc->mi);
 
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
+  // Distance of Mb to the various image edges.
+  // These specified to 8th pel as they are always compared to MV
+  // values that are in 1/8th pel units
+  xd->mb_to_left_edge = -((mb_col * 16) << 3);
+  xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+  xd->mb_to_right_edge = ((pc->mb_cols - mb_size - mb_col) * 16) << 3;
+  xd->mb_to_bottom_edge = ((pc->mb_rows - mb_size - mb_row) * 16) << 3;
 
-  cpi->mb.partition_info = cpi->mb.pi;
-
-  mb_row = 0;
-  for (row = 0; row < pc->mb_rows; row += 2) {
-    m = pc->mi + row * mis;
-    prev_m = pc->prev_mi + row * mis;
-
-    mb_col = 0;
-    for (col = 0; col < pc->mb_cols; col += 2) {
-      int i;
-
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
-#if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, pc->sb_coded);
+#ifdef ENTROPY_STATS
+  active_section = 9;
 #endif
-      for (i = 0; i < 4; i++) {
-        MB_MODE_INFO *mi;
-        MV_REFERENCE_FRAME rf;
-        MB_PREDICTION_MODE mode;
-        int segment_id, skip_coeff;
 
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
+  if (cpi->mb.e_mbd.update_mb_segmentation_map) {
+    // Is temporal coding of the segment map enabled
+    if (pc->temporal_update) {
+      unsigned char prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
+      vp9_prob pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
 
-        if ((mb_row >= pc->mb_rows) || (mb_col >= pc->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          prev_m += offset_extended;
-          cpi->mb.partition_info += offset_extended;
-          continue;
-        }
+      // Code the segment id prediction flag for this mb
+      vp9_write(bc, prediction_flag, pred_prob);
 
-        mi = &m->mbmi;
-        rf = mi->ref_frame;
-        mode = mi->mode;
-        segment_id = mi->segment_id;
+      // If the mb segment id wasn't predicted code explicitly
+      if (!prediction_flag)
+        write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+    } else {
+      // Normal unpredicted coding
+      write_mb_segid(bc, mi, &cpi->mb.e_mbd);
+    }
+  }
 
-        // Distance of Mb to the various image edges.
-        // These specified to 8th pel as they are always compared to MV
-        // values that are in 1/8th pel units
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-        xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+  if (!pc->mb_no_coeff_skip) {
+    skip_coeff = 0;
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+    skip_coeff = 1;
+  } else {
+    const int nmbs = mb_size;
+    const int xmbs = MIN(nmbs, mb_cols_left);
+    const int ymbs = MIN(nmbs, mb_rows_left);
+    int x, y;
 
-#if CONFIG_SUPERBLOCKS
-        if (mi->encoded_as_sb) {
-          xd->mb_to_right_edge = ((pc->mb_cols - 2 - mb_col) * 16) << 3;
-          xd->mb_to_bottom_edge = ((pc->mb_rows - 2 - mb_row) * 16) << 3;
-        } else {
-#endif
-          xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
-          xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-        }
-#endif
+    skip_coeff = 1;
+    for (y = 0; y < ymbs; y++) {
+      for (x = 0; x < xmbs; x++) {
+        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
+      }
+    }
 
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
-        xd->prev_mode_info_context = prev_m;
+    vp9_write(bc, skip_coeff,
+              vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
+  }
 
-#ifdef ENTROPY_STATS
-        active_section = 9;
-#endif
-        if (cpi->mb.e_mbd.update_mb_segmentation_map) {
-          // Is temporal coding of the segment map enabled
-          if (pc->temporal_update) {
-            prediction_flag = vp9_get_pred_flag(xd, PRED_SEG_ID);
-            pred_prob = vp9_get_pred_prob(pc, xd, PRED_SEG_ID);
+  // Encode the reference frame.
+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)
+      || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) {
+    encode_ref_frame(bc, pc, xd, segment_id, rf);
+  } else {
+    assert(rf == INTRA_FRAME);
+  }
 
-            // Code the segment id prediction flag for this mb
-            vp9_write(bc, prediction_flag, pred_prob);
-
-            // If the mb segment id wasn't predicted code explicitly
-            if (!prediction_flag)
-              write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          } else {
-            // Normal unpredicted coding
-            write_mb_segid(bc, mi, &cpi->mb.e_mbd);
-          }
-        }
-
-        skip_coeff = 1;
-        if (pc->mb_no_coeff_skip &&
-            (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-             (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-          skip_coeff = mi->mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
-          if (mi->encoded_as_sb) {
-            skip_coeff &= m[1].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-            skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-          }
-#endif
-          vp9_write(bc, skip_coeff,
-                    vp9_get_pred_prob(pc, xd, PRED_MBSKIP));
-        }
-
-        // Encode the reference frame.
-        if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)
-            || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) {
-          encode_ref_frame(bc, pc, xd, segment_id, rf);
-        } else {
-          assert(rf == INTRA_FRAME);
-        }
-
-        if (rf == INTRA_FRAME) {
+  if (rf == INTRA_FRAME) {
 #ifdef ENTROPY_STATS
-          active_section = 6;
+    active_section = 6;
 #endif
 
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-#if CONFIG_SUPERBLOCKS
-            if (m->mbmi.encoded_as_sb)
-              write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
-            else
-#endif
-            write_ymode(bc, mode, pc->fc.ymode_prob);
-          }
-          if (mode == B_PRED) {
-            int j = 0;
-#if CONFIG_COMP_INTRA_PRED
-            int uses_second =
-              m->bmi[0].as_mode.second !=
-              (B_PREDICTION_MODE)(B_DC_PRED - 1);
-            vp9_write(bc, uses_second, DEFAULT_COMP_INTRA_PROB);
-#endif
-            do {
-#if CONFIG_COMP_INTRA_PRED
-              B_PREDICTION_MODE mode2 = m->bmi[j].as_mode.second;
-#endif
-              write_bmode(bc, m->bmi[j].as_mode.first,
-                          pc->fc.bmode_prob);
-#if CONFIG_COMP_INTRA_PRED
-              if (uses_second) {
-                write_bmode(bc, mode2, pc->fc.bmode_prob);
-              }
-#endif
-            } while (++j < 16);
-          }
-          if (mode == I8X8_PRED) {
-            write_i8x8_mode(bc, m->bmi[0].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[2].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[8].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-            write_i8x8_mode(bc, m->bmi[10].as_mode.first,
-                            pc->fc.i8x8_mode_prob);
-          } else {
-            write_uv_mode(bc, mi->uv_mode,
-                          pc->fc.uv_mode_prob[mode]);
-          }
-        } else {
-          int_mv best_mv, best_second_mv;
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      if (m->mbmi.sb_type)
+        write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);
+      else
+        write_ymode(bc, mode, pc->fc.ymode_prob);
+    }
+    if (mode == B_PRED) {
+      int j = 0;
+      do {
+        write_bmode(bc, m->bmi[j].as_mode.first,
+                    pc->fc.bmode_prob);
+      } while (++j < 16);
+    }
+    if (mode == I8X8_PRED) {
+      write_i8x8_mode(bc, m->bmi[0].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+      write_i8x8_mode(bc, m->bmi[2].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+      write_i8x8_mode(bc, m->bmi[8].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+      write_i8x8_mode(bc, m->bmi[10].as_mode.first,
+                      pc->fc.i8x8_mode_prob);
+    } else {
+      write_uv_mode(bc, mi->uv_mode,
+                    pc->fc.uv_mode_prob[mode]);
+    }
+  } else {
+    vp9_prob mv_ref_p[VP9_MVREFS - 1];
 
-          vp9_prob mv_ref_p [VP9_MVREFS - 1];
+    vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
 
-          {
-            best_mv.as_int = mi->ref_mvs[rf][0].as_int;
-
-            vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);
-
+    // #ifdef ENTROPY_STATS
 #ifdef ENTROPY_STATS
-            accum_mv_refs(mode, ct);
+    accum_mv_refs(mode, ct);
+    active_section = 3;
 #endif
-          }
 
-#ifdef ENTROPY_STATS
-          active_section = 3;
-#endif
+    // Is the segment coding of mode enabled
+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
+      if (mi->sb_type) {
+        write_sb_mv_ref(bc, mode, mv_ref_p);
+      } else {
+        write_mv_ref(bc, mode, mv_ref_p);
+      }
+      vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
+    }
 
-          // Is the segment coding of mode enabled
-          if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {
-#if CONFIG_SUPERBLOCKS
-            if (mi->encoded_as_sb) {
-              write_sb_mv_ref(bc, mode, mv_ref_p);
-            } else
-#endif
-            {
-              write_mv_ref(bc, mode, mv_ref_p);
-            }
-            vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]);
-          }
+    if (mode >= NEARESTMV && mode <= SPLITMV) {
+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {
+        write_token(bc, vp9_switchable_interp_tree,
+                    vp9_get_pred_probs(&cpi->common, xd,
+                                       PRED_SWITCHABLE_INTERP),
+                    vp9_switchable_interp_encodings +
+                    vp9_switchable_interp_map[mi->interp_filter]);
+      } else {
+        assert(mi->interp_filter == cpi->common.mcomp_filter_type);
+      }
+    }
 
-#if CONFIG_PRED_FILTER
-          // Is the prediction filter enabled
-          if (mode >= NEARESTMV && mode < SPLITMV) {
-            if (cpi->common.pred_filter_mode == 2)
-              vp9_write(bc, mi->pred_filter_enabled,
-                        pc->prob_pred_filter_off);
-            else
-              assert(mi->pred_filter_enabled ==
-                     cpi->common.pred_filter_mode);
-          }
-#endif
-          if (mode >= NEARESTMV && mode <= SPLITMV)
-          {
-            if (cpi->common.mcomp_filter_type == SWITCHABLE) {
-              write_token(bc, vp9_switchable_interp_tree,
-                          vp9_get_pred_probs(&cpi->common, xd,
-                                             PRED_SWITCHABLE_INTERP),
-                          vp9_switchable_interp_encodings +
-                              vp9_switchable_interp_map[mi->interp_filter]);
-            } else {
-              assert (mi->interp_filter ==
-                      cpi->common.mcomp_filter_type);
-            }
-          }
-
-          if (mi->second_ref_frame > 0 &&
-              (mode == NEWMV || mode == SPLITMV)) {
-
-            best_second_mv.as_int =
-              mi->ref_mvs[mi->second_ref_frame][0].as_int;
-          }
-
-          // does the feature use compound prediction or not
-          // (if not specified at the frame/segment level)
-          if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-            vp9_write(bc, mi->second_ref_frame > INTRA_FRAME,
-                      vp9_get_pred_prob(pc, xd, PRED_COMP));
-          }
+    // does the feature use compound prediction or not
+    // (if not specified at the frame/segment level)
+    if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+      vp9_write(bc, mi->second_ref_frame > INTRA_FRAME,
+                vp9_get_pred_prob(pc, xd, PRED_COMP));
+    }
 #if CONFIG_COMP_INTERINTRA_PRED
-          if (cpi->common.use_interintra &&
-              mode >= NEARESTMV && mode < SPLITMV &&
-              mi->second_ref_frame <= INTRA_FRAME) {
-            vp9_write(bc, mi->second_ref_frame == INTRA_FRAME,
-                      pc->fc.interintra_prob);
-            // if (!cpi->dummy_packing)
-            //   printf("-- %d (%d)\n", mi->second_ref_frame == INTRA_FRAME,
-            //          pc->fc.interintra_prob);
-            if (mi->second_ref_frame == INTRA_FRAME) {
-              // if (!cpi->dummy_packing)
-              //   printf("** %d %d\n", mi->interintra_mode,
-                       // mi->interintra_uv_mode);
-              write_ymode(bc, mi->interintra_mode, pc->fc.ymode_prob);
+    if (cpi->common.use_interintra &&
+        mode >= NEARESTMV && mode < SPLITMV &&
+        mi->second_ref_frame <= INTRA_FRAME) {
+      vp9_write(bc, mi->second_ref_frame == INTRA_FRAME,
+                pc->fc.interintra_prob);
+      // if (!cpi->dummy_packing)
+      //   printf("-- %d (%d)\n", mi->second_ref_frame == INTRA_FRAME,
+      //          pc->fc.interintra_prob);
+      if (mi->second_ref_frame == INTRA_FRAME) {
+        // if (!cpi->dummy_packing)
+        //   printf("** %d %d\n", mi->interintra_mode,
+        // mi->interintra_uv_mode);
+        write_ymode(bc, mi->interintra_mode, pc->fc.ymode_prob);
 #if SEPARATE_INTERINTRA_UV
-              write_uv_mode(bc, mi->interintra_uv_mode,
-                            pc->fc.uv_mode_prob[mi->interintra_mode]);
+        write_uv_mode(bc, mi->interintra_uv_mode,
+                      pc->fc.uv_mode_prob[mi->interintra_mode]);
 #endif
-            }
-          }
+      }
+    }
 #endif
 
-          {
-            switch (mode) { /* new, split require MVs */
-              case NEWMV:
-#ifdef ENTROPY_STATS
-                active_section = 5;
-#endif
-
 #if CONFIG_NEW_MVREF
-                {
-                  unsigned int best_index;
+    // if ((mode == NEWMV) || (mode == SPLITMV)) {
+    if (mode == NEWMV) {
+      // Encode the index of the choice.
+      vp9_write_mv_ref_id(bc,
+                          xd->mb_mv_ref_probs[rf], mi->best_index);
 
-                  // Choose the best mv reference
-                  /*
-                  best_index = pick_best_mv_ref(x, rf, mi->mv[0],
-                                                mi->ref_mvs[rf], &best_mv);
-                  assert(best_index == mi->best_index);
-                  assert(best_mv.as_int == mi->best_mv.as_int);
-                  */
-                  best_index = mi->best_index;
-                  best_mv.as_int = mi->best_mv.as_int;
-
-                  // Encode the index of the choice.
-                  vp9_write_mv_ref_id(bc,
-                                      xd->mb_mv_ref_id_probs[rf], best_index);
-
-                  cpi->best_ref_index_counts[rf][best_index]++;
-
-                }
+      if (mi->second_ref_frame > 0) {
+        // Encode the index of the choice.
+        vp9_write_mv_ref_id(
+                            bc, xd->mb_mv_ref_probs[mi->second_ref_frame],
+                            mi->best_second_index);
+      }
+    }
 #endif
 
-                write_nmv(bc, &mi->mv[0].as_mv, &best_mv,
-                          (const nmv_context*) nmvc,
-                          xd->allow_high_precision_mv);
-
-                if (mi->second_ref_frame > 0) {
-#if CONFIG_NEW_MVREF
-                  unsigned int best_index;
-                  MV_REFERENCE_FRAME sec_ref_frame = mi->second_ref_frame;
-
-                  /*
-                  best_index =
-                    pick_best_mv_ref(x, sec_ref_frame, mi->mv[1],
-                                     mi->ref_mvs[sec_ref_frame],
-                                     &best_second_mv);
-                  assert(best_index == mi->best_second_index);
-                  assert(best_second_mv.as_int == mi->best_second_mv.as_int);
-                  */
-                  best_index = mi->best_second_index;
-                  best_second_mv.as_int = mi->best_second_mv.as_int;
-
-                  // Encode the index of the choice.
-                  vp9_write_mv_ref_id(bc,
-                                      xd->mb_mv_ref_id_probs[sec_ref_frame],
-                                      best_index);
-
-                  cpi->best_ref_index_counts[sec_ref_frame][best_index]++;
+    switch (mode) { /* new, split require MVs */
+      case NEWMV:
+#ifdef ENTROPY_STATS
+        active_section = 5;
 #endif
-                  write_nmv(bc, &mi->mv[1].as_mv, &best_second_mv,
-                            (const nmv_context*) nmvc,
-                            xd->allow_high_precision_mv);
-                }
-                break;
-              case SPLITMV: {
-                int j = 0;
+        write_nmv(bc, &mi->mv[0].as_mv, &mi->best_mv,
+                  (const nmv_context*) nmvc,
+                  xd->allow_high_precision_mv);
 
+        if (mi->second_ref_frame > 0) {
+          write_nmv(bc, &mi->mv[1].as_mv, &mi->best_second_mv,
+                    (const nmv_context*) nmvc,
+                    xd->allow_high_precision_mv);
+        }
+        break;
+      case SPLITMV: {
+        int j = 0;
+
 #ifdef MODE_STATS
-                ++count_mb_seg [mi->partitioning];
+        ++count_mb_seg[mi->partitioning];
 #endif
 
-                write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
-                cpi->mbsplit_count[mi->partitioning]++;
+        write_split(bc, mi->partitioning, cpi->common.fc.mbsplit_prob);
+        cpi->mbsplit_count[mi->partitioning]++;
 
-                do {
-                  B_PREDICTION_MODE blockmode;
-                  int_mv blockmv;
-                  const int *const  L =
-                    vp9_mbsplits [mi->partitioning];
-                  int k = -1;  /* first block in subset j */
-                  int mv_contz;
-                  int_mv leftmv, abovemv;
+        do {
+          B_PREDICTION_MODE blockmode;
+          int_mv blockmv;
+          const int *const  L = vp9_mbsplits[mi->partitioning];
+          int k = -1;  /* first block in subset j */
+          int mv_contz;
+          int_mv leftmv, abovemv;
 
-                  blockmode = cpi->mb.partition_info->bmi[j].mode;
-                  blockmv = cpi->mb.partition_info->bmi[j].mv;
+          blockmode = cpi->mb.partition_info->bmi[j].mode;
+          blockmv = cpi->mb.partition_info->bmi[j].mv;
 #if CONFIG_DEBUG
-                  while (j != L[++k])
-                    if (k >= 16)
-                      assert(0);
+          while (j != L[++k])
+            if (k >= 16)
+              assert(0);
 #else
-                  while (j != L[++k]);
+          while (j != L[++k]);
 #endif
-                  leftmv.as_int = left_block_mv(m, k);
-                  abovemv.as_int = above_block_mv(m, k, mis);
-                  mv_contz = vp9_mv_cont(&leftmv, &abovemv);
+          leftmv.as_int = left_block_mv(m, k);
+          abovemv.as_int = above_block_mv(m, k, mis);
+          mv_contz = vp9_mv_cont(&leftmv, &abovemv);
 
-                  write_sub_mv_ref(bc, blockmode,
-                                   cpi->common.fc.sub_mv_ref_prob [mv_contz]);
-                  cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
-                  if (blockmode == NEW4X4) {
+          write_sub_mv_ref(bc, blockmode,
+                           cpi->common.fc.sub_mv_ref_prob[mv_contz]);
+          cpi->sub_mv_ref_count[mv_contz][blockmode - LEFT4X4]++;
+          if (blockmode == NEW4X4) {
 #ifdef ENTROPY_STATS
-                    active_section = 11;
+            active_section = 11;
 #endif
-                    write_nmv(bc, &blockmv.as_mv, &best_mv,
-                              (const nmv_context*) nmvc,
-                              xd->allow_high_precision_mv);
+            write_nmv(bc, &blockmv.as_mv, &mi->best_mv,
+                      (const nmv_context*) nmvc,
+                      xd->allow_high_precision_mv);
 
-                    if (mi->second_ref_frame > 0) {
-                      write_nmv(bc,
-                                &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
-                                &best_second_mv,
-                                (const nmv_context*) nmvc,
-                                xd->allow_high_precision_mv);
-                    }
-                  }
-                } while (++j < cpi->mb.partition_info->count);
-              }
-              break;
-              default:
-                break;
+            if (mi->second_ref_frame > 0) {
+              write_nmv(bc,
+                        &cpi->mb.partition_info->bmi[j].second_mv.as_mv,
+                        &mi->best_second_mv,
+                        (const nmv_context*) nmvc,
+                        xd->allow_high_precision_mv);
             }
           }
-          /* This is not required if the counts in cpi are consistent with the
-           * final packing pass */
-          // if (!cpi->dummy_packing)
-          //   vp9_update_nmv_count(cpi, x, &best_mv, &best_second_mv);
-        }
-
-        if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
-             (rf != INTRA_FRAME && !(mode == SPLITMV &&
-                                     mi->partitioning == PARTITIONING_4X4))) &&
-            pc->txfm_mode == TX_MODE_SELECT &&
-            !((pc->mb_no_coeff_skip && skip_coeff) ||
-              (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-               vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-          TX_SIZE sz = mi->txfm_size;
-          // FIXME(rbultje) code ternary symbol once all experiments are merged
-          vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
-          if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV)
-            vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
-        }
-
-#ifdef ENTROPY_STATS
-        active_section = 1;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
-
-#if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          cpi->mb.partition_info += 2;
-          prev_m += 2;
-          break;
-        }
-#endif
-
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
-        prev_m += offset_extended;
-        cpi->mb.partition_info += offset_extended;
-#if CONFIG_DEBUG
-        assert((prev_m - cpi->common.prev_mip) == (m - cpi->common.mip));
-        assert((prev_m - cpi->common.prev_mi) == (m - cpi->common.mi));
-#endif
+        } while (++j < cpi->mb.partition_info->count);
+        break;
       }
+      default:
+        break;
     }
+  }
 
-    // Next SB
-    mb_row += 2;
-    m += mis + (1 - (pc->mb_cols & 0x1));
-    prev_m += mis + (1 - (pc->mb_cols & 0x1));
-    cpi->mb.partition_info += mis + (1 - (pc->mb_cols & 0x1));
+  if (((rf == INTRA_FRAME && mode <= I8X8_PRED) ||
+       (rf != INTRA_FRAME && !(mode == SPLITMV &&
+                               mi->partitioning == PARTITIONING_4X4))) &&
+      pc->txfm_mode == TX_MODE_SELECT &&
+      !((pc->mb_no_coeff_skip && skip_coeff) ||
+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+    TX_SIZE sz = mi->txfm_size;
+    // FIXME(rbultje) code ternary symbol once all experiments are merged
+    vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);
+    if (sz != TX_4X4 && mode != I8X8_PRED && mode != SPLITMV) {
+      vp9_write(bc, sz != TX_8X8, pc->prob_tx[1]);
+      if (mi->sb_type && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, pc->prob_tx[2]);
+    }
   }
 }
 
+static void write_mb_modes_kf(const VP9_COMP *cpi,
+                              const MODE_INFO *m,
+                              vp9_writer *bc,
+                              int mb_rows_left, int mb_cols_left) {
+  const VP9_COMMON *const c = &cpi->common;
+  const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const int mis = c->mode_info_stride;
+  const int ym = m->mbmi.mode;
+  const int segment_id = m->mbmi.segment_id;
+  int skip_coeff;
 
-static void write_mb_modes_kf(const VP9_COMMON  *c,
-                              const MACROBLOCKD *xd,
-                              const MODE_INFO   *m,
-                              int                mode_info_stride,
-                              vp9_writer *const  bc) {
-  int ym;
-  int segment_id;
-
-  ym = m->mbmi.mode;
-  segment_id = m->mbmi.segment_id;
-
   if (xd->update_mb_segmentation_map) {
     write_mb_segid(bc, &m->mbmi, xd);
   }
 
-  if (c->mb_no_coeff_skip &&
-      (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
-       (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0))) {
-        int skip_coeff = m->mbmi.mb_skip_coeff;
-#if CONFIG_SUPERBLOCKS
-        const int mis = mode_info_stride;
-        if (m->mbmi.encoded_as_sb) {
-          skip_coeff &= m[1].mbmi.mb_skip_coeff;
-          skip_coeff &= m[mis].mbmi.mb_skip_coeff;
-          skip_coeff &= m[mis + 1].mbmi.mb_skip_coeff;
-        }
-#endif
-        vp9_write(bc, skip_coeff,
-                  vp9_get_pred_prob(c, xd, PRED_MBSKIP));
+  if (!c->mb_no_coeff_skip) {
+    skip_coeff = 0;
+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {
+    skip_coeff = 1;
+  } else {
+    const int nmbs = 1 << m->mbmi.sb_type;
+    const int xmbs = MIN(nmbs, mb_cols_left);
+    const int ymbs = MIN(nmbs, mb_rows_left);
+    int x, y;
+
+    skip_coeff = 1;
+    for (y = 0; y < ymbs; y++) {
+      for (x = 0; x < xmbs; x++) {
+        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;
+      }
+    }
+
+    vp9_write(bc, skip_coeff,
+              vp9_get_pred_prob(c, xd, PRED_MBSKIP));
   }
 
-#if CONFIG_SUPERBLOCKS
-  if (m->mbmi.encoded_as_sb) {
+  if (m->mbmi.sb_type) {
     sb_kfwrite_ymode(bc, ym,
                      c->sb_kf_ymode_prob[c->kf_ymode_probs_index]);
-  } else
-#endif
-  {
+  } else {
     kfwrite_ymode(bc, ym,
                   c->kf_ymode_prob[c->kf_ymode_probs_index]);
   }
 
   if (ym == B_PRED) {
-    const int mis = c->mode_info_stride;
     int i = 0;
-#if CONFIG_COMP_INTRA_PRED
-    int uses_second =
-      m->bmi[0].as_mode.second !=
-      (B_PREDICTION_MODE)(B_DC_PRED - 1);
-    vp9_write(bc, uses_second, DEFAULT_COMP_INTRA_PROB);
-#endif
     do {
       const B_PREDICTION_MODE A = above_block_mode(m, i, mis);
       const B_PREDICTION_MODE L = left_block_mode(m, i);
       const int bm = m->bmi[i].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-      const int bm2 = m->bmi[i].as_mode.second;
-#endif
 
 #ifdef ENTROPY_STATS
       ++intra_mode_stats [A] [L] [bm];
@@ -1305,11 +1021,6 @@
 #endif
 
       write_kf_bmode(bc, bm, c->kf_bmode_prob[A][L]);
-#if CONFIG_COMP_INTRA_PRED
-      if (uses_second) {
-        write_kf_bmode(bc, bm2, c->kf_bmode_prob[A][L]);
-      }
-#endif
     } while (++i < 16);
   }
   if (ym == I8X8_PRED) {
@@ -1329,91 +1040,107 @@
     write_uv_mode(bc, m->mbmi.uv_mode, c->kf_uv_mode_prob[ym]);
 
   if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&
-      !((c->mb_no_coeff_skip && m->mbmi.mb_skip_coeff) ||
+      !((c->mb_no_coeff_skip && skip_coeff) ||
         (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
     TX_SIZE sz = m->mbmi.txfm_size;
     // FIXME(rbultje) code ternary symbol once all experiments are merged
     vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);
-    if (sz != TX_4X4 && ym <= TM_PRED)
+    if (sz != TX_4X4 && ym <= TM_PRED) {
       vp9_write(bc, sz != TX_8X8, c->prob_tx[1]);
+      if (m->mbmi.sb_type && sz != TX_8X8)
+        vp9_write(bc, sz != TX_16X16, c->prob_tx[2]);
+    }
   }
 }
 
-static void write_kfmodes(VP9_COMP* const cpi, vp9_writer* const bc) {
+static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,
+                          TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+                          int mb_row, int mb_col) {
   VP9_COMMON *const c = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+
+  xd->mode_info_context = m;
+  if (c->frame_type == KEY_FRAME) {
+    write_mb_modes_kf(cpi, m, bc,
+                      c->mb_rows - mb_row, c->mb_cols - mb_col);
+#ifdef ENTROPY_STATS
+    active_section = 8;
+#endif
+  } else {
+    pack_inter_mode_mvs(cpi, m, bc,
+                        c->mb_rows - mb_row, c->mb_cols - mb_col);
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
+  }
+
+  assert(*tok < tok_end);
+  pack_mb_tokens(bc, tok, tok_end);
+}
+
+static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {
+  VP9_COMMON *const c = &cpi->common;
   const int mis = c->mode_info_stride;
-  MACROBLOCKD *xd = &cpi->mb.e_mbd;
-  MODE_INFO *m;
-  int i;
-  int row, col;
-  int mb_row, mb_col;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
+  MODE_INFO *m, *m_ptr = c->mi;
+  int i, mb_row, mb_col;
   TOKENEXTRA *tok = cpi->tok;
   TOKENEXTRA *tok_end = tok + cpi->tok_count;
 
-  mb_row = 0;
-  for (row = 0; row < c->mb_rows; row += 2) {
-    m = c->mi + row * mis;
+  for (mb_row = 0; mb_row < c->mb_rows; mb_row += 4, m_ptr += 4 * mis) {
+    m = m_ptr;
+    for (mb_col = 0; mb_col < c->mb_cols; mb_col += 4, m += 4) {
+      vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded);
+      if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+        write_modes_b(cpi, m, bc, &tok, tok_end, mb_row, mb_col);
+      } else {
+        int j;
 
-    mb_col = 0;
-    for (col = 0; col < c->mb_cols; col += 2) {
-#if CONFIG_SUPERBLOCKS
-      vp9_write(bc, m->mbmi.encoded_as_sb, c->sb_coded);
-#endif
-      // Process the 4 MBs in the order:
-      // top-left, top-right, bottom-left, bottom-right
-      for (i = 0; i < 4; i++) {
-        int dy = row_delta[i];
-        int dx = col_delta[i];
-        int offset_extended = dy * mis + dx;
+        for (j = 0; j < 4; j++) {
+          const int x_idx_sb = (j & 1) << 1, y_idx_sb = j & 2;
+          MODE_INFO *sb_m = m + y_idx_sb * mis + x_idx_sb;
 
-        if ((mb_row >= c->mb_rows) || (mb_col >= c->mb_cols)) {
-          // MB lies outside frame, move on
-          mb_row += dy;
-          mb_col += dx;
-          m += offset_extended;
-          continue;
-        }
+          if (mb_col + x_idx_sb >= c->mb_cols ||
+              mb_row + y_idx_sb >= c->mb_rows)
+            continue;
 
-        // Make sure the MacroBlockD mode info pointer is set correctly
-        xd->mode_info_context = m;
+          vp9_write(bc, sb_m->mbmi.sb_type, c->sb32_coded);
+          if (sb_m->mbmi.sb_type) {
+            assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+            write_modes_b(cpi, sb_m, bc, &tok, tok_end,
+                          mb_row + y_idx_sb, mb_col + x_idx_sb);
+          } else {
+            // Process the 4 MBs in the order:
+            // top-left, top-right, bottom-left, bottom-right
+            for (i = 0; i < 4; i++) {
+              const int x_idx = x_idx_sb + (i & 1), y_idx = y_idx_sb + (i >> 1);
+              MODE_INFO *mb_m = m + x_idx + y_idx * mis;
 
-        write_mb_modes_kf(c, xd, m, mis, bc);
-#ifdef ENTROPY_STATS
-        active_section = 8;
-#endif
-        assert(tok < tok_end);
-        pack_mb_tokens(bc, &tok, tok_end);
+              if (mb_row + y_idx >= c->mb_rows ||
+                  mb_col + x_idx >= c->mb_cols) {
+                // MB lies outside frame, move on
+                continue;
+              }
 
-#if CONFIG_SUPERBLOCKS
-        if (m->mbmi.encoded_as_sb) {
-          assert(!i);
-          mb_col += 2;
-          m += 2;
-          break;
+              assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16);
+              write_modes_b(cpi, mb_m, bc, &tok, tok_end,
+                            mb_row + y_idx, mb_col + x_idx);
+            }
+          }
         }
-#endif
-        // Next MB
-        mb_row += dy;
-        mb_col += dx;
-        m += offset_extended;
       }
     }
-    mb_row += 2;
   }
 }
 
 
 /* This function is used for debugging probability trees. */
-static void print_prob_tree(vp9_prob
-                            coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) {
+static void print_prob_tree(vp9_coeff_probs *coef_probs) {
   /* print coef probability tree */
   int i, j, k, l;
   FILE *f = fopen("enc_tree_probs.txt", "a");
   fprintf(f, "{\n");
-  for (i = 0; i < BLOCK_TYPES; i++) {
+  for (i = 0; i < BLOCK_TYPES_4X4; i++) {
     fprintf(f, "  {\n");
     for (j = 0; j < COEF_BANDS; j++) {
       fprintf(f, "    {\n");
@@ -1433,151 +1160,93 @@
   fclose(f);
 }
 
-static void build_coeff_contexts(VP9_COMP *cpi) {
+static void build_tree_distribution(vp9_coeff_probs *coef_probs,
+                                    vp9_coeff_count *coef_counts,
+#ifdef ENTROPY_STATS
+                                    VP9_COMP *cpi,
+                                    vp9_coeff_accum *context_counters,
+#endif
+                                    vp9_coeff_stats *coef_branch_ct,
+                                    int block_types) {
   int i = 0, j, k;
 #ifdef ENTROPY_STATS
   int t = 0;
 #endif
-  for (i = 0; i < BLOCK_TYPES; ++i) {
+
+  for (i = 0; i < block_types; ++i) {
     for (j = 0; j < COEF_BANDS; ++j) {
       for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
         if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
           continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_coef_probs [i][j][k],
-          cpi->frame_branch_ct [i][j][k],
-          cpi->coef_counts [i][j][k],
-          256, 1
-        );
+        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
+                                         vp9_coef_encodings, vp9_coef_tree,
+                                         coef_probs[i][j][k],
+                                         coef_branch_ct[i][j][k],
+                                         coef_counts[i][j][k]);
 #ifdef ENTROPY_STATS
         if (!cpi->dummy_packing)
           for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];
+            context_counters[i][j][k][t] += coef_counts[i][j][k][t];
 #endif
       }
     }
   }
-  for (i = 0; i < BLOCK_TYPES; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_hybrid_coef_probs [i][j][k],
-          cpi->frame_hybrid_branch_ct [i][j][k],
-          cpi->hybrid_coef_counts [i][j][k],
-          256, 1
-        );
+}
+
+static void build_coeff_contexts(VP9_COMP *cpi) {
+  build_tree_distribution(cpi->frame_coef_probs_4x4,
+                          cpi->coef_counts_4x4,
 #ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            hybrid_context_counters[i][j][k][t] += cpi->hybrid_coef_counts[i][j][k][t];
+                          cpi, context_counters_4x4,
 #endif
-      }
-    }
-  }
-
-  if (cpi->common.txfm_mode != ONLY_4X4) {
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          /* at every context */
-          /* calc probs and branch cts for this frame only */
-          // vp9_prob new_p           [ENTROPY_NODES];
-          // unsigned int branch_ct   [ENTROPY_NODES] [2];
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_coef_probs_8x8 [i][j][k],
-            cpi->frame_branch_ct_8x8 [i][j][k],
-            cpi->coef_counts_8x8 [i][j][k],
-            256, 1
-          );
+                          cpi->frame_branch_ct_4x4, BLOCK_TYPES_4X4);
+  build_tree_distribution(cpi->frame_hybrid_coef_probs_4x4,
+                          cpi->hybrid_coef_counts_4x4,
 #ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];
+                          cpi, hybrid_context_counters_4x4,
 #endif
-        }
-      }
-    }
-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          /* at every context */
-          /* calc probs and branch cts for this frame only */
-          // vp9_prob new_p           [ENTROPY_NODES];
-          // unsigned int branch_ct   [ENTROPY_NODES] [2];
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_hybrid_coef_probs_8x8 [i][j][k],
-            cpi->frame_hybrid_branch_ct_8x8 [i][j][k],
-            cpi->hybrid_coef_counts_8x8 [i][j][k],
-            256, 1
-          );
+                          cpi->frame_hybrid_branch_ct_4x4, BLOCK_TYPES_4X4);
+  build_tree_distribution(cpi->frame_coef_probs_8x8,
+                          cpi->coef_counts_8x8,
 #ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              hybrid_context_counters_8x8[i][j][k][t] += cpi->hybrid_coef_counts_8x8[i][j][k][t];
+                          cpi, context_counters_8x8,
 #endif
-        }
-      }
-    }
-  }
-
-  if (cpi->common.txfm_mode > ALLOW_8X8) {
-    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
-      for (j = 0; j < COEF_BANDS; ++j) {
-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-            continue;
-          vp9_tree_probs_from_distribution(
-            MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-            cpi->frame_coef_probs_16x16[i][j][k],
-            cpi->frame_branch_ct_16x16[i][j][k],
-            cpi->coef_counts_16x16[i][j][k], 256, 1);
+                          cpi->frame_branch_ct_8x8, BLOCK_TYPES_8X8);
+  build_tree_distribution(cpi->frame_hybrid_coef_probs_8x8,
+                          cpi->hybrid_coef_counts_8x8,
 #ifdef ENTROPY_STATS
-          if (!cpi->dummy_packing)
-            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-              context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];
+                          cpi, hybrid_context_counters_8x8,
 #endif
-        }
-      }
-    }
-  }
-  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {
-    for (j = 0; j < COEF_BANDS; ++j) {
-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))
-          continue;
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          cpi->frame_hybrid_coef_probs_16x16[i][j][k],
-          cpi->frame_hybrid_branch_ct_16x16[i][j][k],
-          cpi->hybrid_coef_counts_16x16[i][j][k], 256, 1);
+                          cpi->frame_hybrid_branch_ct_8x8, BLOCK_TYPES_8X8);
+  build_tree_distribution(cpi->frame_coef_probs_16x16,
+                          cpi->coef_counts_16x16,
 #ifdef ENTROPY_STATS
-        if (!cpi->dummy_packing)
-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-            hybrid_context_counters_16x16[i][j][k][t] += cpi->hybrid_coef_counts_16x16[i][j][k][t];
+                          cpi, context_counters_16x16,
 #endif
-      }
-    }
-  }
+                          cpi->frame_branch_ct_16x16, BLOCK_TYPES_16X16);
+  build_tree_distribution(cpi->frame_hybrid_coef_probs_16x16,
+                          cpi->hybrid_coef_counts_16x16,
+#ifdef ENTROPY_STATS
+                          cpi, hybrid_context_counters_16x16,
+#endif
+                          cpi->frame_hybrid_branch_ct_16x16, BLOCK_TYPES_16X16);
+  build_tree_distribution(cpi->frame_coef_probs_32x32,
+                          cpi->coef_counts_32x32,
+#ifdef ENTROPY_STATS
+                          cpi, context_counters_32x32,
+#endif
+                          cpi->frame_branch_ct_32x32, BLOCK_TYPES_32X32);
 }
 
-static void update_coef_probs_common(
-    vp9_writer* const bc,
-    vp9_prob new_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
-    vp9_prob old_frame_coef_probs[BLOCK_TYPES][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][ENTROPY_NODES],
-    unsigned int frame_branch_ct[BLOCK_TYPES][COEF_BANDS]
-                                [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]) {
+static void update_coef_probs_common(vp9_writer* const bc,
+#ifdef ENTROPY_STATS
+                                     VP9_COMP *cpi,
+                                     vp9_coeff_stats *tree_update_hist,
+#endif
+                                     vp9_coeff_probs *new_frame_coef_probs,
+                                     vp9_coeff_probs *old_frame_coef_probs,
+                                     vp9_coeff_stats *frame_branch_ct,
+                                     int block_types) {
   int i, j, k, t;
   int update[2] = {0, 0};
   int savings;
@@ -1585,7 +1254,7 @@
 
   /* dry run to see if there is any udpate at all needed */
   savings = 0;
-  for (i = 0; i < BLOCK_TYPES; ++i) {
+  for (i = 0; i < block_types; ++i) {
     for (j = !i; j < COEF_BANDS; ++j) {
       int prev_coef_savings[ENTROPY_NODES] = {0};
       for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
@@ -1629,7 +1298,7 @@
     vp9_write_bit(bc, 0);
   } else {
     vp9_write_bit(bc, 1);
-    for (i = 0; i < BLOCK_TYPES; ++i) {
+    for (i = 0; i < block_types; ++i) {
       for (j = !i; j < COEF_BANDS; ++j) {
         int prev_coef_savings[ENTROPY_NODES] = {0};
         for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {
@@ -1659,7 +1328,7 @@
             vp9_write(bc, u, upd);
 #ifdef ENTROPY_STATS
             if (!cpi->dummy_packing)
-              ++ tree_update_hist [i][j][k][t] [u];
+              ++tree_update_hist[i][j][k][t][u];
 #endif
             if (u) {
               /* send/use new probability */
@@ -1680,38 +1349,80 @@
   build_coeff_contexts(cpi);
 
   update_coef_probs_common(bc,
-                           cpi->frame_coef_probs,
-                           cpi->common.fc.coef_probs,
-                           cpi->frame_branch_ct);
+#ifdef ENTROPY_STATS
+                           cpi,
+                           tree_update_hist_4x4,
+#endif
+                           cpi->frame_coef_probs_4x4,
+                           cpi->common.fc.coef_probs_4x4,
+                           cpi->frame_branch_ct_4x4,
+                           BLOCK_TYPES_4X4);
 
   update_coef_probs_common(bc,
-                           cpi->frame_hybrid_coef_probs,
-                           cpi->common.fc.hybrid_coef_probs,
-                           cpi->frame_hybrid_branch_ct);
+#ifdef ENTROPY_STATS
+                           cpi,
+                           hybrid_tree_update_hist_4x4,
+#endif
+                           cpi->frame_hybrid_coef_probs_4x4,
+                           cpi->common.fc.hybrid_coef_probs_4x4,
+                           cpi->frame_hybrid_branch_ct_4x4,
+                           BLOCK_TYPES_4X4);
 
   /* do not do this if not even allowed */
   if (cpi->common.txfm_mode != ONLY_4X4) {
     update_coef_probs_common(bc,
+#ifdef ENTROPY_STATS
+                             cpi,
+                             tree_update_hist_8x8,
+#endif
                              cpi->frame_coef_probs_8x8,
                              cpi->common.fc.coef_probs_8x8,
-                             cpi->frame_branch_ct_8x8);
+                             cpi->frame_branch_ct_8x8,
+                             BLOCK_TYPES_8X8);
 
     update_coef_probs_common(bc,
+#ifdef ENTROPY_STATS
+                             cpi,
+                             hybrid_tree_update_hist_8x8,
+#endif
                              cpi->frame_hybrid_coef_probs_8x8,
                              cpi->common.fc.hybrid_coef_probs_8x8,
-                             cpi->frame_hybrid_branch_ct_8x8);
+                             cpi->frame_hybrid_branch_ct_8x8,
+                             BLOCK_TYPES_8X8);
   }
 
   if (cpi->common.txfm_mode > ALLOW_8X8) {
     update_coef_probs_common(bc,
+#ifdef ENTROPY_STATS
+                             cpi,
+                             tree_update_hist_16x16,
+#endif
                              cpi->frame_coef_probs_16x16,
                              cpi->common.fc.coef_probs_16x16,
-                             cpi->frame_branch_ct_16x16);
+                             cpi->frame_branch_ct_16x16,
+                             BLOCK_TYPES_16X16);
     update_coef_probs_common(bc,
+#ifdef ENTROPY_STATS
+                             cpi,
+                             hybrid_tree_update_hist_16x16,
+#endif
                              cpi->frame_hybrid_coef_probs_16x16,
                              cpi->common.fc.hybrid_coef_probs_16x16,
-                             cpi->frame_hybrid_branch_ct_16x16);
+                             cpi->frame_hybrid_branch_ct_16x16,
+                             BLOCK_TYPES_16X16);
   }
+
+  if (cpi->common.txfm_mode > ALLOW_16X16) {
+    update_coef_probs_common(bc,
+#ifdef ENTROPY_STATS
+                             cpi,
+                             tree_update_hist_32x32,
+#endif
+                             cpi->frame_coef_probs_32x32,
+                             cpi->common.fc.coef_probs_32x32,
+                             cpi->frame_branch_ct_32x32,
+                             BLOCK_TYPES_32X32);
+  }
 }
 
 #ifdef PACKET_TESTING
@@ -1745,13 +1456,11 @@
     for (j = 0; j < VP9_YMODES; j++) {
       cost += mode_cost[j] * cpi->ymode_count[j];
     }
-#if CONFIG_SUPERBLOCKS
     vp9_cost_tokens(mode_cost, cpi->common.sb_kf_ymode_prob[i],
                     vp9_sb_ymode_tree);
     for (j = 0; j < VP9_I32X32_MODES; j++) {
       cost += mode_cost[j] * cpi->sb_ymode_count[j];
     }
-#endif
     if (cost < bestcost) {
       bestindex = i;
       bestcost = cost;
@@ -1941,30 +1650,48 @@
     }
   }
 
-#if CONFIG_SUPERBLOCKS
-  {
-    /* sb mode probability */
-    const int sb_max = (((pc->mb_rows + 1) >> 1) * ((pc->mb_cols + 1) >> 1));
+  pc->sb64_coded = get_binary_prob(cpi->sb64_count[0], cpi->sb64_count[1]);
+  vp9_write_literal(&header_bc, pc->sb64_coded, 8);
+  pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);
+  vp9_write_literal(&header_bc, pc->sb32_coded, 8);
 
-    pc->sb_coded = get_prob(sb_max - cpi->sb_count, sb_max);
-    vp9_write_literal(&header_bc, pc->sb_coded, 8);
-  }
-#endif
-
   {
     if (pc->txfm_mode == TX_MODE_SELECT) {
-      pc->prob_tx[0] = get_prob(cpi->txfm_count[0] + cpi->txfm_count_8x8p[0],
-                                cpi->txfm_count[0] + cpi->txfm_count[1] + cpi->txfm_count[2] +
-                                cpi->txfm_count_8x8p[0] + cpi->txfm_count_8x8p[1]);
-      pc->prob_tx[1] = get_prob(cpi->txfm_count[1], cpi->txfm_count[1] + cpi->txfm_count[2]);
+      pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +
+                                cpi->txfm_count_16x16p[TX_4X4] +
+                                cpi->txfm_count_8x8p[TX_4X4],
+                                cpi->txfm_count_32x32p[TX_4X4] +
+                                cpi->txfm_count_32x32p[TX_8X8] +
+                                cpi->txfm_count_32x32p[TX_16X16] +
+                                cpi->txfm_count_32x32p[TX_32X32] +
+                                cpi->txfm_count_16x16p[TX_4X4] +
+                                cpi->txfm_count_16x16p[TX_8X8] +
+                                cpi->txfm_count_16x16p[TX_16X16] +
+                                cpi->txfm_count_8x8p[TX_4X4] +
+                                cpi->txfm_count_8x8p[TX_8X8]);
+      pc->prob_tx[1] = get_prob(cpi->txfm_count_32x32p[TX_8X8] +
+                                cpi->txfm_count_16x16p[TX_8X8],
+                                cpi->txfm_count_32x32p[TX_8X8] +
+                                cpi->txfm_count_32x32p[TX_16X16] +
+                                cpi->txfm_count_32x32p[TX_32X32] +
+                                cpi->txfm_count_16x16p[TX_8X8] +
+                                cpi->txfm_count_16x16p[TX_16X16]);
+      pc->prob_tx[2] = get_prob(cpi->txfm_count_32x32p[TX_16X16],
+                                cpi->txfm_count_32x32p[TX_16X16] +
+                                cpi->txfm_count_32x32p[TX_32X32]);
     } else {
       pc->prob_tx[0] = 128;
       pc->prob_tx[1] = 128;
+      pc->prob_tx[2] = 128;
     }
-    vp9_write_literal(&header_bc, pc->txfm_mode, 2);
+    vp9_write_literal(&header_bc, pc->txfm_mode <= 3 ? pc->txfm_mode : 3, 2);
+    if (pc->txfm_mode > ALLOW_16X16) {
+      vp9_write_bit(&header_bc, pc->txfm_mode == TX_MODE_SELECT);
+    }
     if (pc->txfm_mode == TX_MODE_SELECT) {
       vp9_write_literal(&header_bc, pc->prob_tx[0], 8);
       vp9_write_literal(&header_bc, pc->prob_tx[1], 8);
+      vp9_write_literal(&header_bc, pc->prob_tx[2], 8);
     }
   }
 
@@ -2118,7 +1845,7 @@
 
   // If appropriate update the inter mode probability context and code the
   // changes in the bitstream.
-  if ((pc->frame_type != KEY_FRAME)) {
+  if (pc->frame_type != KEY_FRAME) {
     int i, j;
     int new_context[INTER_MODE_CONTEXTS][4];
     update_mode_probs(pc, new_context);
@@ -2140,17 +1867,54 @@
     }
   }
 
+#if CONFIG_NEW_MVREF
+  if ((pc->frame_type != KEY_FRAME)) {
+    int new_mvref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];
+    int i, j;
+
+    update_mv_ref_probs(cpi, new_mvref_probs);
+
+    for (i = 0; i < MAX_REF_FRAMES; ++i) {
+      // Skip the dummy entry for intra ref frame.
+      if (i == INTRA_FRAME) {
+        continue;
+      }
+
+      // Encode any mandated updates to probabilities
+      for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {
+        if (new_mvref_probs[i][j] != xd->mb_mv_ref_probs[i][j]) {
+          vp9_write(&header_bc, 1, VP9_MVREF_UPDATE_PROB);
+          vp9_write_literal(&header_bc, new_mvref_probs[i][j], 8);
+
+          // Only update the persistent copy if this is the "real pack"
+          if (!cpi->dummy_packing) {
+            xd->mb_mv_ref_probs[i][j] = new_mvref_probs[i][j];
+          }
+        } else {
+          vp9_write(&header_bc, 0, VP9_MVREF_UPDATE_PROB);
+        }
+      }
+    }
+  }
+#endif
+
   vp9_clear_system_state();  // __asm emms;
 
-  vp9_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs, cpi->common.fc.hybrid_coef_probs);
-  vp9_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8, cpi->common.fc.hybrid_coef_probs_8x8);
-  vp9_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);
-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16, cpi->common.fc.hybrid_coef_probs_16x16);
-#if CONFIG_SUPERBLOCKS
+  vp9_copy(cpi->common.fc.pre_coef_probs_4x4,
+           cpi->common.fc.coef_probs_4x4);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_4x4,
+           cpi->common.fc.hybrid_coef_probs_4x4);
+  vp9_copy(cpi->common.fc.pre_coef_probs_8x8,
+           cpi->common.fc.coef_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8,
+           cpi->common.fc.hybrid_coef_probs_8x8);
+  vp9_copy(cpi->common.fc.pre_coef_probs_16x16,
+           cpi->common.fc.coef_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16,
+           cpi->common.fc.hybrid_coef_probs_16x16);
+  vp9_copy(cpi->common.fc.pre_coef_probs_32x32,
+           cpi->common.fc.coef_probs_32x32);
   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);
-#endif
   vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);
   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);
   vp9_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);
@@ -2193,15 +1957,6 @@
     active_section = 1;
 #endif
 
-#if CONFIG_PRED_FILTER
-    // Write the prediction filter mode used for this frame
-    vp9_write_literal(&header_bc, pc->pred_filter_mode, 2);
-
-    // Write prediction filter on/off probability if signaling at MB level
-    if (pc->pred_filter_mode == 2)
-      vp9_write_literal(&header_bc, pc->prob_pred_filter_off, 8);
-
-#endif
     if (pc->mcomp_filter_type == SWITCHABLE)
       update_switchable_interp_probs(cpi, &header_bc);
 
@@ -2261,12 +2016,12 @@
 
   if (pc->frame_type == KEY_FRAME) {
     decide_kf_ymode_entropy(cpi);
-    write_kfmodes(cpi, &residual_bc);
+    write_modes(cpi, &residual_bc);
   } else {
     /* This is not required if the counts in cpi are consistent with the
      * final packing pass */
     // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount);
-    pack_inter_mode_mvs(cpi, &residual_bc);
+    write_modes(cpi, &residual_bc);
 
     vp9_update_mode_context(&cpi->common);
   }
@@ -2277,18 +2032,13 @@
 }
 
 #ifdef ENTROPY_STATS
-void print_tree_update_probs() {
+static void print_tree_update_for_type(FILE *f,
+                                       vp9_coeff_stats *tree_update_hist,
+                                       int block_types, const char *header) {
   int i, j, k, l;
-  FILE *f = fopen("coefupdprob.h", "w");
-  int Sum;
-  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
 
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs[BLOCK_TYPES]\n"
-          "                     [COEF_BANDS]\n"
-          "                     [PREV_COEF_CONTEXTS]\n"
-          "                     [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES; i++) {
+  fprintf(f, "const vp9_coeff_prob %s = {\n", header);
+  for (i = 0; i < block_types; i++) {
     fprintf(f, "  { \n");
     for (j = 0; j < COEF_BANDS; j++) {
       fprintf(f, "    {\n");
@@ -2295,9 +2045,9 @@
       for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
         fprintf(f, "      {");
         for (l = 0; l < ENTROPY_NODES; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist[i][j][k][l][0],
-                              tree_update_hist[i][j][k][l][1]));
+          fprintf(f, "%3d, ",
+                  get_binary_prob(tree_update_hist[i][j][k][l][0],
+                                  tree_update_hist[i][j][k][l][1]));
         }
         fprintf(f, "},\n");
       }
@@ -2306,56 +2056,31 @@
     fprintf(f, "  },\n");
   }
   fprintf(f, "};\n");
+}
 
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]\n"
-          "                         [COEF_BANDS]\n"
-          "                         [PREV_COEF_CONTEXTS]\n"
-          "                         [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES_8X8; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist_8x8[i][j][k][l][0],
-                              tree_update_hist_8x8[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
+void print_tree_update_probs() {
+  FILE *f = fopen("coefupdprob.h", "w");
+  fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
 
-  fprintf(f, "const vp9_prob\n"
-          "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"
-          "                           [COEF_BANDS]\n"
-          "                           [PREV_COEF_CONTEXTS]\n"
-          "                           [ENTROPY_NODES] = {\n");
-  for (i = 0; i < BLOCK_TYPES_16X16; i++) {
-    fprintf(f, "  { \n");
-    for (j = 0; j < COEF_BANDS; j++) {
-      fprintf(f, "    {\n");
-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {
-        fprintf(f, "      {");
-        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {
-          fprintf(f, "%3ld, ",
-              get_binary_prob(tree_update_hist_16x16[i][j][k][l][0],
-                              tree_update_hist_16x16[i][j][k][l][1]));
-        }
-        fprintf(f, "},\n");
-      }
-      fprintf(f, "    },\n");
-    }
-    fprintf(f, "  },\n");
-  }
+  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES_4X4,
+                             "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");
+  print_tree_update_for_type(f, hybrid_tree_update_hist_4x4, BLOCK_TYPES_4X4,
+                             "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");
+  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES_8X8,
+                             "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");
+  print_tree_update_for_type(f, hybrid_tree_update_hist_8x8, BLOCK_TYPES_8X8,
+                             "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");
+  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES_16X16,
+                             "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");
+  print_tree_update_for_type(f, hybrid_tree_update_hist_16x16,
+                             BLOCK_TYPES_16X16,
+                             "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");
+  print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES_32X32,
+                             "vp9_coef_update_probs_32x32[BLOCK_TYPES_32X32]");
 
   fclose(f);
   f = fopen("treeupdate.bin", "wb");
-  fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
+  fwrite(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
   fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
   fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
   fclose(f);
--- a/vp9/encoder/vp9_bitstream.h
+++ b/vp9/encoder/vp9_bitstream.h
@@ -14,4 +14,4 @@
 
 void vp9_update_skip_probs(VP9_COMP *cpi);
 
-#endif
+#endif  // VP9_ENCODER_VP9_BITSTREAM_H_
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_BLOCK_H_
 #define VP9_ENCODER_VP9_BLOCK_H_
 
@@ -26,26 +25,28 @@
 
 typedef struct block {
   // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  short *src_diff;
-  short *coeff;
+  int16_t *src_diff;
+  int16_t *coeff;
 
   // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
-  short *quant;
-  short *quant_fast;      // fast quant deprecated for now
-  unsigned char *quant_shift;
-  short *zbin;
-  short *zbin_8x8;
-  short *zbin_16x16;
-  short *zrun_zbin_boost;
-  short *zrun_zbin_boost_8x8;
-  short *zrun_zbin_boost_16x16;
-  short *round;
+  int16_t *quant;
+  int16_t *quant_fast;      // fast quant deprecated for now
+  uint8_t *quant_shift;
+  int16_t *zbin;
+  int16_t *zbin_8x8;
+  int16_t *zbin_16x16;
+  int16_t *zbin_32x32;
+  int16_t *zrun_zbin_boost;
+  int16_t *zrun_zbin_boost_8x8;
+  int16_t *zrun_zbin_boost_16x16;
+  int16_t *zrun_zbin_boost_32x32;
+  int16_t *round;
 
   // Zbin Over Quant value
   short zbin_extra;
 
-  unsigned char **base_src;
-  unsigned char **base_second_src;
+  uint8_t **base_src;
+  uint8_t **base_second_src;
   int src;
   int src_stride;
 
@@ -52,6 +53,7 @@
   int eob_max_offset;
   int eob_max_offset_8x8;
   int eob_max_offset_16x16;
+  int eob_max_offset_32x32;
 } BLOCK;
 
 typedef struct {
@@ -68,9 +70,10 @@
 typedef struct {
   MODE_INFO mic;
   PARTITION_INFO partition_info;
+  int skip;
   int_mv best_ref_mv;
   int_mv second_best_ref_mv;
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
   int rate;
   int distortion;
   int64_t intra_error;
@@ -83,18 +86,20 @@
   int64_t txfm_rd_diff[NB_TXFM_MODES];
 } PICK_MODE_CONTEXT;
 
-typedef struct macroblock {
-  DECLARE_ALIGNED(16, short, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-  DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
-#if !CONFIG_SUPERBLOCKS
-  DECLARE_ALIGNED(16, unsigned char, thismb[256]);    // 16x16 Y
+typedef struct superblock {
+  DECLARE_ALIGNED(16, int16_t, src_diff[32*32+16*16*2]);
+  DECLARE_ALIGNED(16, int16_t, coeff[32*32+16*16*2]);
+} SUPERBLOCK;
 
-  unsigned char *thismb_ptr;
-#endif
+typedef struct macroblock {
+  DECLARE_ALIGNED(16, int16_t, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+  DECLARE_ALIGNED(16, int16_t, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
   // 16 Y blocks, 4 U blocks, 4 V blocks,
   // 1 DC 2nd order block each with 16 entries
   BLOCK block[25];
 
+  SUPERBLOCK sb_coeff_data;
+
   YV12_BUFFER_CONFIG src;
 
   MACROBLOCKD e_mbd;
@@ -115,6 +120,8 @@
   int *mb_norm_activity_ptr;
   signed int act_zbin_adj;
 
+  int mv_best_ref_index[MAX_REF_FRAMES];
+
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
   int *nmvcost[2];
@@ -153,34 +160,29 @@
 
   unsigned char *active_ptr;
 
-  unsigned int token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
-    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  unsigned int hybrid_token_costs[TX_SIZE_MAX][BLOCK_TYPES][COEF_BANDS]
-    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];
+  vp9_coeff_count hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];
 
   int optimize;
 
   // Structure to hold context for each of the 4 MBs within a SB:
   // when encoded as 4 independent MBs:
-  PICK_MODE_CONTEXT mb_context[4];
-#if CONFIG_SUPERBLOCKS
+  PICK_MODE_CONTEXT mb_context[4][4];
   // when 4 MBs share coding parameters:
-  PICK_MODE_CONTEXT sb_context[4];
-#endif
+  PICK_MODE_CONTEXT sb32_context[4];
+  PICK_MODE_CONTEXT sb64_context;
 
-  void (*vp9_short_fdct4x4)(short *input, short *output, int pitch);
-  void (*vp9_short_fdct8x4)(short *input, short *output, int pitch);
-  void (*short_walsh4x4)(short *input, short *output, int pitch);
+  void (*vp9_short_fdct4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*vp9_short_fdct8x4)(int16_t *input, int16_t *output, int pitch);
+  void (*short_walsh4x4)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
-  void (*vp9_short_fdct8x8)(short *input, short *output, int pitch);
-  void (*vp9_short_fdct16x16)(short *input, short *output, int pitch);
-  void (*short_fhaar2x2)(short *input, short *output, int pitch);
+  void (*vp9_short_fdct8x8)(int16_t *input, int16_t *output, int pitch);
+  void (*vp9_short_fdct16x16)(int16_t *input, int16_t *output, int pitch);
+  void (*short_fhaar2x2)(int16_t *input, int16_t *output, int pitch);
   void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);
   void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);
-
 } MACROBLOCK;
 
-
-#endif
+#endif  // VP9_ENCODER_VP9_BLOCK_H_
--- a/vp9/encoder/vp9_boolhuff.h
+++ b/vp9/encoder/vp9_boolhuff.h
@@ -109,4 +109,4 @@
   br->range = range;
 }
 
-#endif
+#endif  // VP9_ENCODER_VP9_BOOLHUFF_H_
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -902,23 +902,24 @@
 
 #define TEST_INT_16x16_DCT 1
 #if !TEST_INT_16x16_DCT
-static const double C1 = 0.995184726672197;
-static const double C2 = 0.98078528040323;
-static const double C3 = 0.956940335732209;
-static const double C4 = 0.923879532511287;
-static const double C5 = 0.881921264348355;
-static const double C6 = 0.831469612302545;
-static const double C7 = 0.773010453362737;
-static const double C8 = 0.707106781186548;
-static const double C9 = 0.634393284163646;
-static const double C10 = 0.555570233019602;
-static const double C11 = 0.471396736825998;
-static const double C12 = 0.38268343236509;
-static const double C13 = 0.290284677254462;
-static const double C14 = 0.195090322016128;
-static const double C15 = 0.098017140329561;
 
 static void dct16x16_1d(double input[16], double output[16]) {
+  static const double C1 = 0.995184726672197;
+  static const double C2 = 0.98078528040323;
+  static const double C3 = 0.956940335732209;
+  static const double C4 = 0.923879532511287;
+  static const double C5 = 0.881921264348355;
+  static const double C6 = 0.831469612302545;
+  static const double C7 = 0.773010453362737;
+  static const double C8 = 0.707106781186548;
+  static const double C9 = 0.634393284163646;
+  static const double C10 = 0.555570233019602;
+  static const double C11 = 0.471396736825998;
+  static const double C12 = 0.38268343236509;
+  static const double C13 = 0.290284677254462;
+  static const double C14 = 0.195090322016128;
+  static const double C15 = 0.098017140329561;
+
   vp9_clear_system_state(); // Make it simd safe : __asm emms;
   {
     double step[16];
@@ -1330,3 +1331,1058 @@
 #undef RIGHT_SHIFT
 #undef ROUNDING
 #endif
+
+#if !CONFIG_DWTDCTHYBRID
+static void dct32_1d(double *input, double *output, int stride) {
+  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)
+  static const double C2 = 0.995184726672;  // cos(pi * 2 / 64)
+  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)
+  static const double C4 = 0.980785280403;  // cos(pi * 4 / 64)
+  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)
+  static const double C6 = 0.956940335732;  // cos(pi * 6 / 64)
+  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)
+  static const double C8 = 0.923879532511;  // cos(pi * 8 / 64)
+  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)
+  static const double C10 = 0.881921264348;  // cos(pi * 10 / 64)
+  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)
+  static const double C12 = 0.831469612303;  // cos(pi * 12 / 64)
+  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)
+  static const double C14 = 0.773010453363;  // cos(pi * 14 / 64)
+  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)
+  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)
+  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)
+  static const double C18 = 0.634393284164;  // cos(pi * 18 / 64)
+  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)
+  static const double C20 = 0.555570233020;  // cos(pi * 20 / 64)
+  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)
+  static const double C22 = 0.471396736826;  // cos(pi * 22 / 64)
+  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)
+  static const double C24 = 0.382683432365;  // cos(pi * 24 / 64)
+  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)
+  static const double C26 = 0.290284677254;  // cos(pi * 26 / 64)
+  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)
+  static const double C28 = 0.195090322016;  // cos(pi * 28 / 64)
+  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)
+  static const double C30 = 0.098017140330;  // cos(pi * 30 / 64)
+  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)
+
+  double step[32];
+
+  // Stage 1
+  step[0] = input[stride*0] + input[stride*(32 - 1)];
+  step[1] = input[stride*1] + input[stride*(32 - 2)];
+  step[2] = input[stride*2] + input[stride*(32 - 3)];
+  step[3] = input[stride*3] + input[stride*(32 - 4)];
+  step[4] = input[stride*4] + input[stride*(32 - 5)];
+  step[5] = input[stride*5] + input[stride*(32 - 6)];
+  step[6] = input[stride*6] + input[stride*(32 - 7)];
+  step[7] = input[stride*7] + input[stride*(32 - 8)];
+  step[8] = input[stride*8] + input[stride*(32 - 9)];
+  step[9] = input[stride*9] + input[stride*(32 - 10)];
+  step[10] = input[stride*10] + input[stride*(32 - 11)];
+  step[11] = input[stride*11] + input[stride*(32 - 12)];
+  step[12] = input[stride*12] + input[stride*(32 - 13)];
+  step[13] = input[stride*13] + input[stride*(32 - 14)];
+  step[14] = input[stride*14] + input[stride*(32 - 15)];
+  step[15] = input[stride*15] + input[stride*(32 - 16)];
+  step[16] = -input[stride*16] + input[stride*(32 - 17)];
+  step[17] = -input[stride*17] + input[stride*(32 - 18)];
+  step[18] = -input[stride*18] + input[stride*(32 - 19)];
+  step[19] = -input[stride*19] + input[stride*(32 - 20)];
+  step[20] = -input[stride*20] + input[stride*(32 - 21)];
+  step[21] = -input[stride*21] + input[stride*(32 - 22)];
+  step[22] = -input[stride*22] + input[stride*(32 - 23)];
+  step[23] = -input[stride*23] + input[stride*(32 - 24)];
+  step[24] = -input[stride*24] + input[stride*(32 - 25)];
+  step[25] = -input[stride*25] + input[stride*(32 - 26)];
+  step[26] = -input[stride*26] + input[stride*(32 - 27)];
+  step[27] = -input[stride*27] + input[stride*(32 - 28)];
+  step[28] = -input[stride*28] + input[stride*(32 - 29)];
+  step[29] = -input[stride*29] + input[stride*(32 - 30)];
+  step[30] = -input[stride*30] + input[stride*(32 - 31)];
+  step[31] = -input[stride*31] + input[stride*(32 - 32)];
+
+  // Stage 2
+  output[stride*0] = step[0] + step[16 - 1];
+  output[stride*1] = step[1] + step[16 - 2];
+  output[stride*2] = step[2] + step[16 - 3];
+  output[stride*3] = step[3] + step[16 - 4];
+  output[stride*4] = step[4] + step[16 - 5];
+  output[stride*5] = step[5] + step[16 - 6];
+  output[stride*6] = step[6] + step[16 - 7];
+  output[stride*7] = step[7] + step[16 - 8];
+  output[stride*8] = -step[8] + step[16 - 9];
+  output[stride*9] = -step[9] + step[16 - 10];
+  output[stride*10] = -step[10] + step[16 - 11];
+  output[stride*11] = -step[11] + step[16 - 12];
+  output[stride*12] = -step[12] + step[16 - 13];
+  output[stride*13] = -step[13] + step[16 - 14];
+  output[stride*14] = -step[14] + step[16 - 15];
+  output[stride*15] = -step[15] + step[16 - 16];
+
+  output[stride*16] = step[16];
+  output[stride*17] = step[17];
+  output[stride*18] = step[18];
+  output[stride*19] = step[19];
+
+  output[stride*20] = (-step[20] + step[27])*C16;
+  output[stride*21] = (-step[21] + step[26])*C16;
+  output[stride*22] = (-step[22] + step[25])*C16;
+  output[stride*23] = (-step[23] + step[24])*C16;
+
+  output[stride*24] = (step[24] + step[23])*C16;
+  output[stride*25] = (step[25] + step[22])*C16;
+  output[stride*26] = (step[26] + step[21])*C16;
+  output[stride*27] = (step[27] + step[20])*C16;
+
+  output[stride*28] = step[28];
+  output[stride*29] = step[29];
+  output[stride*30] = step[30];
+  output[stride*31] = step[31];
+
+  // Stage 3
+  step[0] = output[stride*0] + output[stride*(8 - 1)];
+  step[1] = output[stride*1] + output[stride*(8 - 2)];
+  step[2] = output[stride*2] + output[stride*(8 - 3)];
+  step[3] = output[stride*3] + output[stride*(8 - 4)];
+  step[4] = -output[stride*4] + output[stride*(8 - 5)];
+  step[5] = -output[stride*5] + output[stride*(8 - 6)];
+  step[6] = -output[stride*6] + output[stride*(8 - 7)];
+  step[7] = -output[stride*7] + output[stride*(8 - 8)];
+  step[8] = output[stride*8];
+  step[9] = output[stride*9];
+  step[10] = (-output[stride*10] + output[stride*13])*C16;
+  step[11] = (-output[stride*11] + output[stride*12])*C16;
+  step[12] = (output[stride*12] + output[stride*11])*C16;
+  step[13] = (output[stride*13] + output[stride*10])*C16;
+  step[14] = output[stride*14];
+  step[15] = output[stride*15];
+
+  step[16] = output[stride*16] + output[stride*23];
+  step[17] = output[stride*17] + output[stride*22];
+  step[18] = output[stride*18] + output[stride*21];
+  step[19] = output[stride*19] + output[stride*20];
+  step[20] = -output[stride*20] + output[stride*19];
+  step[21] = -output[stride*21] + output[stride*18];
+  step[22] = -output[stride*22] + output[stride*17];
+  step[23] = -output[stride*23] + output[stride*16];
+  step[24] = -output[stride*24] + output[stride*31];
+  step[25] = -output[stride*25] + output[stride*30];
+  step[26] = -output[stride*26] + output[stride*29];
+  step[27] = -output[stride*27] + output[stride*28];
+  step[28] = output[stride*28] + output[stride*27];
+  step[29] = output[stride*29] + output[stride*26];
+  step[30] = output[stride*30] + output[stride*25];
+  step[31] = output[stride*31] + output[stride*24];
+
+  // Stage 4
+  output[stride*0] = step[0] + step[3];
+  output[stride*1] = step[1] + step[2];
+  output[stride*2] = -step[2] + step[1];
+  output[stride*3] = -step[3] + step[0];
+  output[stride*4] = step[4];
+  output[stride*5] = (-step[5] + step[6])*C16;
+  output[stride*6] = (step[6] + step[5])*C16;
+  output[stride*7] = step[7];
+  output[stride*8] = step[8] + step[11];
+  output[stride*9] = step[9] + step[10];
+  output[stride*10] = -step[10] + step[9];
+  output[stride*11] = -step[11] + step[8];
+  output[stride*12] = -step[12] + step[15];
+  output[stride*13] = -step[13] + step[14];
+  output[stride*14] = step[14] + step[13];
+  output[stride*15] = step[15] + step[12];
+
+  output[stride*16] = step[16];
+  output[stride*17] = step[17];
+  output[stride*18] = step[18]*-C8 + step[29]*C24;
+  output[stride*19] = step[19]*-C8 + step[28]*C24;
+  output[stride*20] = step[20]*-C24 + step[27]*-C8;
+  output[stride*21] = step[21]*-C24 + step[26]*-C8;
+  output[stride*22] = step[22];
+  output[stride*23] = step[23];
+  output[stride*24] = step[24];
+  output[stride*25] = step[25];
+  output[stride*26] = step[26]*C24 + step[21]*-C8;
+  output[stride*27] = step[27]*C24 + step[20]*-C8;
+  output[stride*28] = step[28]*C8 + step[19]*C24;
+  output[stride*29] = step[29]*C8 + step[18]*C24;
+  output[stride*30] = step[30];
+  output[stride*31] = step[31];
+
+  // Stage 5
+  step[0] = (output[stride*0] + output[stride*1]) * C16;
+  step[1] = (-output[stride*1] + output[stride*0]) * C16;
+  step[2] = output[stride*2]*C24 + output[stride*3] * C8;
+  step[3] = output[stride*3]*C24 - output[stride*2] * C8;
+  step[4] = output[stride*4] + output[stride*5];
+  step[5] = -output[stride*5] + output[stride*4];
+  step[6] = -output[stride*6] + output[stride*7];
+  step[7] = output[stride*7] + output[stride*6];
+  step[8] = output[stride*8];
+  step[9] = output[stride*9]*-C8 + output[stride*14]*C24;
+  step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;
+  step[11] = output[stride*11];
+  step[12] = output[stride*12];
+  step[13] = output[stride*13]*C24 + output[stride*10]*-C8;
+  step[14] = output[stride*14]*C8 + output[stride*9]*C24;
+  step[15] = output[stride*15];
+
+  step[16] = output[stride*16] + output[stride*19];
+  step[17] = output[stride*17] + output[stride*18];
+  step[18] = -output[stride*18] + output[stride*17];
+  step[19] = -output[stride*19] + output[stride*16];
+  step[20] = -output[stride*20] + output[stride*23];
+  step[21] = -output[stride*21] + output[stride*22];
+  step[22] = output[stride*22] + output[stride*21];
+  step[23] = output[stride*23] + output[stride*20];
+  step[24] = output[stride*24] + output[stride*27];
+  step[25] = output[stride*25] + output[stride*26];
+  step[26] = -output[stride*26] + output[stride*25];
+  step[27] = -output[stride*27] + output[stride*24];
+  step[28] = -output[stride*28] + output[stride*31];
+  step[29] = -output[stride*29] + output[stride*30];
+  step[30] = output[stride*30] + output[stride*29];
+  step[31] = output[stride*31] + output[stride*28];
+
+  // Stage 6
+  output[stride*0] = step[0];
+  output[stride*1] = step[1];
+  output[stride*2] = step[2];
+  output[stride*3] = step[3];
+  output[stride*4] = step[4]*C28 + step[7]*C4;
+  output[stride*5] = step[5]*C12 + step[6]*C20;
+  output[stride*6] = step[6]*C12 + step[5]*-C20;
+  output[stride*7] = step[7]*C28 + step[4]*-C4;
+  output[stride*8] = step[8] + step[9];
+  output[stride*9] = -step[9] + step[8];
+  output[stride*10] = -step[10] + step[11];
+  output[stride*11] = step[11] + step[10];
+  output[stride*12] = step[12] + step[13];
+  output[stride*13] = -step[13] + step[12];
+  output[stride*14] = -step[14] + step[15];
+  output[stride*15] = step[15] + step[14];
+
+  output[stride*16] = step[16];
+  output[stride*17] = step[17]*-C4 + step[30]*C28;
+  output[stride*18] = step[18]*-C28 + step[29]*-C4;
+  output[stride*19] = step[19];
+  output[stride*20] = step[20];
+  output[stride*21] = step[21]*-C20 + step[26]*C12;
+  output[stride*22] = step[22]*-C12 + step[25]*-C20;
+  output[stride*23] = step[23];
+  output[stride*24] = step[24];
+  output[stride*25] = step[25]*C12 + step[22]*-C20;
+  output[stride*26] = step[26]*C20 + step[21]*C12;
+  output[stride*27] = step[27];
+  output[stride*28] = step[28];
+  output[stride*29] = step[29]*C28 + step[18]*-C4;
+  output[stride*30] = step[30]*C4 + step[17]*C28;
+  output[stride*31] = step[31];
+
+  // Stage 7
+  step[0] = output[stride*0];
+  step[1] = output[stride*1];
+  step[2] = output[stride*2];
+  step[3] = output[stride*3];
+  step[4] = output[stride*4];
+  step[5] = output[stride*5];
+  step[6] = output[stride*6];
+  step[7] = output[stride*7];
+  step[8] = output[stride*8]*C30 + output[stride*15]*C2;
+  step[9] = output[stride*9]*C14 + output[stride*14]*C18;
+  step[10] = output[stride*10]*C22 + output[stride*13]*C10;
+  step[11] = output[stride*11]*C6 + output[stride*12]*C26;
+  step[12] = output[stride*12]*C6 + output[stride*11]*-C26;
+  step[13] = output[stride*13]*C22 + output[stride*10]*-C10;
+  step[14] = output[stride*14]*C14 + output[stride*9]*-C18;
+  step[15] = output[stride*15]*C30 + output[stride*8]*-C2;
+
+  step[16] = output[stride*16] + output[stride*17];
+  step[17] = -output[stride*17] + output[stride*16];
+  step[18] = -output[stride*18] + output[stride*19];
+  step[19] = output[stride*19] + output[stride*18];
+  step[20] = output[stride*20] + output[stride*21];
+  step[21] = -output[stride*21] + output[stride*20];
+  step[22] = -output[stride*22] + output[stride*23];
+  step[23] = output[stride*23] + output[stride*22];
+  step[24] = output[stride*24] + output[stride*25];
+  step[25] = -output[stride*25] + output[stride*24];
+  step[26] = -output[stride*26] + output[stride*27];
+  step[27] = output[stride*27] + output[stride*26];
+  step[28] = output[stride*28] + output[stride*29];
+  step[29] = -output[stride*29] + output[stride*28];
+  step[30] = -output[stride*30] + output[stride*31];
+  step[31] = output[stride*31] + output[stride*30];
+
+  // Final stage --- outputs indices are bit-reversed.
+  output[stride*0] = step[0];
+  output[stride*16] = step[1];
+  output[stride*8] = step[2];
+  output[stride*24] = step[3];
+  output[stride*4] = step[4];
+  output[stride*20] = step[5];
+  output[stride*12] = step[6];
+  output[stride*28] = step[7];
+  output[stride*2] = step[8];
+  output[stride*18] = step[9];
+  output[stride*10] = step[10];
+  output[stride*26] = step[11];
+  output[stride*6] = step[12];
+  output[stride*22] = step[13];
+  output[stride*14] = step[14];
+  output[stride*30] = step[15];
+
+  output[stride*1] = step[16]*C31 + step[31]*C1;
+  output[stride*17] = step[17]*C15 + step[30]*C17;
+  output[stride*9] = step[18]*C23 + step[29]*C9;
+  output[stride*25] = step[19]*C7 + step[28]*C25;
+  output[stride*5] = step[20]*C27 + step[27]*C5;
+  output[stride*21] = step[21]*C11 + step[26]*C21;
+  output[stride*13] = step[22]*C19 + step[25]*C13;
+  output[stride*29] = step[23]*C3 + step[24]*C29;
+  output[stride*3] = step[24]*C3 + step[23]*-C29;
+  output[stride*19] = step[25]*C19 + step[22]*-C13;
+  output[stride*11] = step[26]*C11 + step[21]*-C21;
+  output[stride*27] = step[27]*C27 + step[20]*-C5;
+  output[stride*7] = step[28]*C7 + step[19]*-C25;
+  output[stride*23] = step[29]*C23 + step[18]*-C9;
+  output[stride*15] = step[30]*C15 + step[17]*-C17;
+  output[stride*31] = step[31]*C31 + step[16]*-C1;
+}
+
+// Reference forward 32x32 DCT (floating-point implementation).
+// `pitch` is halved before indexing int16_t samples (presumably it is
+// given in bytes -- confirm against callers).  The 2-D transform is a
+// separable pass over all 32 columns, then all 32 rows, after which
+// every coefficient is divided by 4 and rounded into `out` (raster
+// order, 32x32 = 1024 entries).
+void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    int shortpitch = pitch >> 1;
+    int i, j;
+    double output[1024];
+    // First transform columns
+    for (i = 0; i < 32; i++) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; j++)
+        temp_in[j] = input[j*shortpitch + i];
+      dct32_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; j++)
+        output[j*32 + i] = temp_out[j];
+    }
+    // Then transform rows
+    for (i = 0; i < 32; ++i) {
+      double temp_in[32], temp_out[32];
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = output[j + i*32];
+      dct32_1d(temp_in, temp_out, 1);
+      for (j = 0; j < 32; ++j)
+        output[j + i*32] = temp_out[j];
+    }
+    // Scale by some magic number
+    for (i = 0; i < 1024; i++) {
+      out[i] = (short)round(output[i]/4);
+    }
+  }
+
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+#else  // CONFIG_DWTDCTHYBRID
+
+#if DWT_TYPE == 53
+
+// 5/3 lifting analysis of one row.  The first loop is the predict
+// step: each even sample is copied to lowpass with one extra bit of
+// precision (<< 1) and each odd sample becomes a highpass (detail)
+// value, predicted as the rounded average of its two even neighbors.
+// The trailing pair handles the right boundary, where only the left
+// even neighbor exists.  The second loop is the update step, folding
+// the highpass values back into the lowpass samples (the left edge is
+// handled by seeding r with the first highpass value).
+// Note: block length must be even for this implementation
+static void analysis_53_row(int length, short *x,
+                            short *lowpass, short *highpass) {
+  int n;
+  short r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++) << 1;
+    *b++ = *x - ((r + x[1] + 1) >> 1);
+    x++;
+  }
+  *a = (r = *x++) << 1;
+  *b = *x - r;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+// 5/3 lifting analysis of one column (same structure as
+// analysis_53_row, length must be even).  Unlike the row pass, the
+// lowpass samples are NOT pre-scaled by << 1, and the predict step
+// uses a rounded >> 2 -- presumably to cancel the extra precision bit
+// carried in from the preceding row transform (TODO confirm).
+static void analysis_53_col(int length, short *x,
+                            short *lowpass, short *highpass) {
+  int n;
+  short r, *a, *b;
+
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  while (--n) {
+    *a++ = (r = *x++);
+    *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;
+    x++;
+  }
+  // Right boundary: predict the last odd sample from its single even
+  // neighbor.
+  *a = (r = *x++);
+  *b = (*x - r + 1) >> 1;
+
+  // Update step: fold highpass back into the lowpass samples.
+  n = length >> 1;
+  b = highpass;
+  a = lowpass;
+  r = *highpass;
+  while (n--) {
+    *a++ += (r + (*b) + 1) >> 1;
+    r = *b++;
+  }
+}
+
+// Dyadic (Mallat) 5/3 wavelet decomposition, done in place in c.
+// The source block x is first copied into c with DWT_PRECISION_BITS of
+// headroom.  Each level runs the row analysis in place and the column
+// analysis through a scratch buffer; the resulting low-low band
+// (hw x hh, top-left) becomes the input of the next level.  Returns
+// early if a dimension would drop below 2.
+static void dyadic_analyze_53(int levels, int width, int height,
+                              short *x, int pitch_x, short *c, int pitch_c) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  short buffer[2 * DWT_MAX_LENGTH];
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
+    }
+  }
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
+      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      // Gather the column into the upper half of the scratch buffer,
+      // transform it into the lower half, then scatter it back.
+      for (i = 0; i < nh; i++)
+        buffer[i + nh] = c[i * pitch_c + j];
+      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+#elif DWT_TYPE == 26
+
+// 2/6 wavelet row analysis.  The first loop is a Haar step: sum of
+// each sample pair goes to lowpass, difference to highpass.  When at
+// least 4 lowpass samples exist, a second pass refines each highpass
+// value using the slope between neighboring lowpass samples (the last
+// element reuses the final lowpass pair as its neighborhood).
+static void analysis_26_row(int length, short *x,
+                            short *lowpass, short *highpass) {
+  int i, n;
+  short r, s, *a, *b;
+  a = lowpass;
+  b = highpass;
+  for (i = length >> 1; i; i--) {
+    r = *x++;
+    s = *x++;
+    *a++ = r + s;
+    *b++ = r - s;
+  }
+  n = length >> 1;
+  if (n >= 4) {
+    a = lowpass;
+    b = highpass;
+    r = *lowpass;
+    while (--n) {
+      *b++ -= (r - a[1] + 4) >> 3;
+      r = *a++;
+    }
+    *b -= (r - *a + 4) >> 3;
+  }
+}
+
+// 2/6 wavelet column analysis.  Identical structure to
+// analysis_26_row, except the Haar step stores the rounded half-sum /
+// half-difference ((r +/- s + 1) >> 1) instead of the raw sum and
+// difference, keeping column magnitudes in the same range as the
+// input.  The highpass refinement pass is the same as in the row case.
+static void analysis_26_col(int length, short *x,
+                            short *lowpass, short *highpass) {
+  int i, n;
+  short r, s, *a, *b;
+  a = lowpass;
+  b = highpass;
+  for (i = length >> 1; i; i--) {
+    r = *x++;
+    s = *x++;
+    *a++ = (r + s + 1) >> 1;
+    *b++ = (r - s + 1) >> 1;
+  }
+  n = length >> 1;
+  if (n >= 4) {
+    a = lowpass;
+    b = highpass;
+    r = *lowpass;
+    while (--n) {
+      *b++ -= (r - a[1] + 4) >> 3;
+      r = *a++;
+    }
+    *b -= (r - *a + 4) >> 3;
+  }
+}
+
+// Dyadic 2/6 wavelet decomposition; identical driver structure to
+// dyadic_analyze_53 (copy into c with DWT_PRECISION_BITS of headroom,
+// then per level: rows in place, columns via the scratch buffer, LL
+// band feeds the next level; bail out below 2 samples per dimension).
+static void dyadic_analyze_26(int levels, int width, int height,
+                              short *x, int pitch_x, short *c, int pitch_c) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  short buffer[2 * DWT_MAX_LENGTH];
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
+    }
+  }
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));
+      analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i + nh] = c[i * pitch_c + j];
+      analysis_26_col(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = buffer[i];
+    }
+  }
+}
+
+#elif DWT_TYPE == 97
+
+// One 9/7 lifting analysis pass over x (the constants are the
+// standard CDF 9/7 lifting coefficients).  Two predict/update pairs
+// run in place, each with explicit symmetric handling of the first or
+// last sample; the even/odd samples are then scaled and de-interleaved
+// into lowpass/highpass.  `length` is assumed even (odd indices run to
+// length - 1 and x[i + 1] is read up to x[length - 1]).
+static void analysis_97(int length, double *x,
+                        double *lowpass, double *highpass) {
+  static const double a_predict1 = -1.586134342;
+  static const double a_update1 = -0.05298011854;
+  static const double a_predict2 = 0.8829110762;
+  static const double a_update2 = 0.4435068522;
+  static const double s_low = 1.149604398;
+  static const double s_high = 1/1.149604398;
+  int i;
+  double y[DWT_MAX_LENGTH];
+  // Predict 1
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] += a_predict1 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] += 2 * a_predict1 * x[length - 2];
+  // Update 1
+  for (i = 2; i < length; i += 2) {
+    x[i] += a_update1 * (x[i - 1] + x[i + 1]);
+  }
+  x[0] += 2 * a_update1 * x[1];
+  // Predict 2
+  for (i = 1; i < length - 2; i += 2) {
+    x[i] += a_predict2 * (x[i - 1] + x[i + 1]);
+  }
+  x[length - 1] += 2 * a_predict2 * x[length - 2];
+  // Update 2
+  for (i = 2; i < length; i += 2) {
+    x[i] += a_update2 * (x[i - 1] + x[i + 1]);
+  }
+  x[0] += 2 * a_update2 * x[1];
+  memcpy(y, x, sizeof(*y) * length);
+  // Scale and pack
+  for (i = 0; i < length / 2; i++) {
+    lowpass[i] = y[2 * i] * s_low;
+    highpass[i] = y[2 * i + 1] * s_high;
+  }
+}
+
+// Dyadic 9/7 decomposition in double precision.  The input x is
+// copied into the working plane y (fixed DWT_MAX_LENGTH stride) with
+// DWT_PRECISION_BITS of headroom; rounded results are written to c.
+// NOTE(review): the column pass rounds its output into c[] but never
+// writes it back into y[], so with levels > 1 the next level's row
+// pass reads data that is missing the previous column transform.  The
+// 53/26 variants transform fully in place -- verify this asymmetry is
+// intentional.  Also note y[] is a large stack array (DWT_MAX_LENGTH^2
+// doubles).
+static void dyadic_analyze_97(int levels, int width, int height,
+                             short *x, int pitch_x, short *c, int pitch_c) {
+  int lv, i, j, nh, nw, hh = height, hw = width;
+  double buffer[2 * DWT_MAX_LENGTH];
+  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;
+    }
+  }
+  for (lv = 0; lv < levels; lv++) {
+    nh = hh;
+    hh = (hh + 1) >> 1;
+    nw = hw;
+    hw = (hw + 1) >> 1;
+    if ((nh < 2) || (nw < 2)) return;
+    for (i = 0; i < nh; i++) {
+      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));
+      analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],
+                  &y[i * DWT_MAX_LENGTH] + hw);
+    }
+    for (j = 0; j < nw; j++) {
+      for (i = 0; i < nh; i++)
+        buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];
+      analysis_97(nh, buffer + nh, buffer, buffer + hh);
+      for (i = 0; i < nh; i++)
+        c[i * pitch_c + j] = round(buffer[i]);
+    }
+  }
+}
+
+#endif  // DWT_TYPE
+
+// TODO(debargha): Implement the scaling differently so as not to have to
+// use the floating point dct
+// 16-point forward DCT butterfly on doubles.  The constants are
+// Ck = cos(k * pi / 32); outputs land in natural (frequency) order in
+// output[0..15].  Pure function of input except for the surrounding
+// vp9_clear_system_state() calls (x87/MMX state hygiene).
+static void dct16x16_1d_f(double input[16], double output[16]) {
+  static const double C1 = 0.995184726672197;
+  static const double C2 = 0.98078528040323;
+  static const double C3 = 0.956940335732209;
+  static const double C4 = 0.923879532511287;
+  static const double C5 = 0.881921264348355;
+  static const double C6 = 0.831469612302545;
+  static const double C7 = 0.773010453362737;
+  static const double C8 = 0.707106781186548;
+  static const double C9 = 0.634393284163646;
+  static const double C10 = 0.555570233019602;
+  static const double C11 = 0.471396736825998;
+  static const double C12 = 0.38268343236509;
+  static const double C13 = 0.290284677254462;
+  static const double C14 = 0.195090322016128;
+  static const double C15 = 0.098017140329561;
+
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    double step[16];
+    double intermediate[16];
+    double temp1, temp2;
+
+    // step 1: initial butterflies (sums feed the even half, differences
+    // the odd half).
+    step[ 0] = input[0] + input[15];
+    step[ 1] = input[1] + input[14];
+    step[ 2] = input[2] + input[13];
+    step[ 3] = input[3] + input[12];
+    step[ 4] = input[4] + input[11];
+    step[ 5] = input[5] + input[10];
+    step[ 6] = input[6] + input[ 9];
+    step[ 7] = input[7] + input[ 8];
+    step[ 8] = input[7] - input[ 8];
+    step[ 9] = input[6] - input[ 9];
+    step[10] = input[5] - input[10];
+    step[11] = input[4] - input[11];
+    step[12] = input[3] - input[12];
+    step[13] = input[2] - input[13];
+    step[14] = input[1] - input[14];
+    step[15] = input[0] - input[15];
+
+    // step 2
+    output[0] = step[0] + step[7];
+    output[1] = step[1] + step[6];
+    output[2] = step[2] + step[5];
+    output[3] = step[3] + step[4];
+    output[4] = step[3] - step[4];
+    output[5] = step[2] - step[5];
+    output[6] = step[1] - step[6];
+    output[7] = step[0] - step[7];
+
+    temp1 = step[ 8]*C7;
+    temp2 = step[15]*C9;
+    output[ 8] = temp1 + temp2;
+
+    temp1 = step[ 9]*C11;
+    temp2 = step[14]*C5;
+    output[ 9] = temp1 - temp2;
+
+    temp1 = step[10]*C3;
+    temp2 = step[13]*C13;
+    output[10] = temp1 + temp2;
+
+    temp1 = step[11]*C15;
+    temp2 = step[12]*C1;
+    output[11] = temp1 - temp2;
+
+    temp1 = step[11]*C1;
+    temp2 = step[12]*C15;
+    output[12] = temp2 + temp1;
+
+    temp1 = step[10]*C13;
+    temp2 = step[13]*C3;
+    output[13] = temp2 - temp1;
+
+    temp1 = step[ 9]*C5;
+    temp2 = step[14]*C11;
+    output[14] = temp2 + temp1;
+
+    temp1 = step[ 8]*C9;
+    temp2 = step[15]*C7;
+    output[15] = temp2 - temp1;
+
+    // step 3
+    step[ 0] = output[0] + output[3];
+    step[ 1] = output[1] + output[2];
+    step[ 2] = output[1] - output[2];
+    step[ 3] = output[0] - output[3];
+
+    temp1 = output[4]*C14;
+    temp2 = output[7]*C2;
+    step[ 4] = temp1 + temp2;
+
+    temp1 = output[5]*C10;
+    temp2 = output[6]*C6;
+    step[ 5] = temp1 + temp2;
+
+    temp1 = output[5]*C6;
+    temp2 = output[6]*C10;
+    step[ 6] = temp2 - temp1;
+
+    temp1 = output[4]*C2;
+    temp2 = output[7]*C14;
+    step[ 7] = temp2 - temp1;
+
+    step[ 8] = output[ 8] + output[11];
+    step[ 9] = output[ 9] + output[10];
+    step[10] = output[ 9] - output[10];
+    step[11] = output[ 8] - output[11];
+
+    step[12] = output[12] + output[15];
+    step[13] = output[13] + output[14];
+    step[14] = output[13] - output[14];
+    step[15] = output[12] - output[15];
+
+    // step 4: final rotations; results are written directly into their
+    // natural-order output slots.
+    output[ 0] = (step[ 0] + step[ 1]);
+    output[ 8] = (step[ 0] - step[ 1]);
+
+    temp1 = step[2]*C12;
+    temp2 = step[3]*C4;
+    temp1 = temp1 + temp2;
+    output[ 4] = 2*(temp1*C8);
+
+    temp1 = step[2]*C4;
+    temp2 = step[3]*C12;
+    temp1 = temp2 - temp1;
+    output[12] = 2*(temp1*C8);
+
+    output[ 2] = 2*((step[4] + step[ 5])*C8);
+    output[14] = 2*((step[7] - step[ 6])*C8);
+
+    temp1 = step[4] - step[5];
+    temp2 = step[6] + step[7];
+    output[ 6] = (temp1 + temp2);
+    output[10] = (temp1 - temp2);
+
+    intermediate[8] = step[8] + step[14];
+    intermediate[9] = step[9] + step[15];
+
+    temp1 = intermediate[8]*C12;
+    temp2 = intermediate[9]*C4;
+    temp1 = temp1 - temp2;
+    output[3] = 2*(temp1*C8);
+
+    temp1 = intermediate[8]*C4;
+    temp2 = intermediate[9]*C12;
+    temp1 = temp2 + temp1;
+    output[13] = 2*(temp1*C8);
+
+    output[ 9] = 2*((step[10] + step[11])*C8);
+
+    intermediate[11] = step[10] - step[11];
+    intermediate[12] = step[12] + step[13];
+    intermediate[13] = step[12] - step[13];
+    intermediate[14] = step[ 8] - step[14];
+    intermediate[15] = step[ 9] - step[15];
+
+    output[15] = (intermediate[11] + intermediate[12]);
+    output[ 1] = -(intermediate[11] - intermediate[12]);
+
+    output[ 7] = 2*(intermediate[13]*C8);
+
+    temp1 = intermediate[14]*C12;
+    temp2 = intermediate[15]*C4;
+    temp1 = temp1 - temp2;
+    output[11] = -2*(temp1*C8);
+
+    temp1 = intermediate[14]*C4;
+    temp2 = intermediate[15]*C12;
+    temp1 = temp2 + temp1;
+    output[ 5] = 2*(temp1*C8);
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+// Separable 16x16 forward DCT in doubles (columns then rows), with the
+// final coefficients divided by (2 << scale) and rounded into `out`
+// (256 entries, raster order).  `pitch` is halved for short indexing.
+// Used by the DWT/DCT hybrid paths below, where `scale` removes the
+// DWT precision headroom.
+static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,
+                                    int scale) {
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+  {
+    int shortpitch = pitch >> 1;
+    int i, j;
+    double output[256];
+    // First transform columns
+    for (i = 0; i < 16; i++) {
+        double temp_in[16], temp_out[16];
+        for (j = 0; j < 16; j++)
+            temp_in[j] = input[j*shortpitch + i];
+        dct16x16_1d_f(temp_in, temp_out);
+        for (j = 0; j < 16; j++)
+            output[j*16 + i] = temp_out[j];
+    }
+    // Then transform rows
+    for (i = 0; i < 16; ++i) {
+        double temp_in[16], temp_out[16];
+        for (j = 0; j < 16; ++j)
+            temp_in[j] = output[j + i*16];
+        dct16x16_1d_f(temp_in, temp_out);
+        for (j = 0; j < 16; ++j)
+            output[j + i*16] = temp_out[j];
+    }
+    // Scale by some magic number
+    for (i = 0; i < 256; i++)
+        out[i] = (short)round(output[i] / (2 << scale));
+  }
+  vp9_clear_system_state();  // Make it simd safe : __asm emms;
+}
+
+// Float 8x8 forward DCT with an input pre-scale of << (3 - scale).
+// The constants are f0 = 1/sqrt(2) and fk = cos(k * pi / 16) / 2 for
+// k = 1..7.  Horizontal passes fill d[][], the vertical passes rework
+// it in place, and the result is rounded (floor(x + 0.5)) into `coefs`
+// in raster order.  `pitch` is halved for short indexing.
+void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) {
+  int j1, i, j, k;
+  float b[8];
+  float b1[8];
+  float d[8][8];
+  float f0 = (float) .7071068;
+  float f1 = (float) .4903926;
+  float f2 = (float) .4619398;
+  float f3 = (float) .4157348;
+  float f4 = (float) .3535534;
+  float f5 = (float) .2777851;
+  float f6 = (float) .1913417;
+  float f7 = (float) .0975452;
+  pitch = pitch / 2;
+  for (i = 0, k = 0; i < 8; i++, k += pitch) {
+    for (j = 0; j < 8; j++) {
+      b[j] = (float)(block[k + j] << (3 - scale));
+    }
+    /* Horizontal transform */
+    for (j = 0; j < 4; j++) {
+      j1 = 7 - j;
+      b1[j] = b[j] + b[j1];
+      b1[j1] = b[j] - b[j1];
+    }
+    b[0] = b1[0] + b1[3];
+    b[1] = b1[1] + b1[2];
+    b[2] = b1[1] - b1[2];
+    b[3] = b1[0] - b1[3];
+    b[4] = b1[4];
+    b[5] = (b1[6] - b1[5]) * f0;
+    b[6] = (b1[6] + b1[5]) * f0;
+    b[7] = b1[7];
+    d[i][0] = (b[0] + b[1]) * f4;
+    d[i][4] = (b[0] - b[1]) * f4;
+    d[i][2] = b[2] * f6 + b[3] * f2;
+    d[i][6] = b[3] * f6 - b[2] * f2;
+    b1[4] = b[4] + b[5];
+    b1[7] = b[7] + b[6];
+    b1[5] = b[4] - b[5];
+    b1[6] = b[7] - b[6];
+    d[i][1] = b1[4] * f7 + b1[7] * f1;
+    d[i][5] = b1[5] * f3 + b1[6] * f5;
+    d[i][7] = b1[7] * f7 - b1[4] * f1;
+    d[i][3] = b1[6] * f3 - b1[5] * f5;
+  }
+  /* Vertical transform */
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 4; j++) {
+      j1 = 7 - j;
+      b1[j] = d[j][i] + d[j1][i];
+      b1[j1] = d[j][i] - d[j1][i];
+    }
+    b[0] = b1[0] + b1[3];
+    b[1] = b1[1] + b1[2];
+    b[2] = b1[1] - b1[2];
+    b[3] = b1[0] - b1[3];
+    b[4] = b1[4];
+    b[5] = (b1[6] - b1[5]) * f0;
+    b[6] = (b1[6] + b1[5]) * f0;
+    b[7] = b1[7];
+    d[0][i] = (b[0] + b[1]) * f4;
+    d[4][i] = (b[0] - b[1]) * f4;
+    d[2][i] = b[2] * f6 + b[3] * f2;
+    d[6][i] = b[3] * f6 - b[2] * f2;
+    b1[4] = b[4] + b[5];
+    b1[7] = b[7] + b[6];
+    b1[5] = b[4] - b[5];
+    b1[6] = b[7] - b[6];
+    d[1][i] = b1[4] * f7 + b1[7] * f1;
+    d[5][i] = b1[5] * f3 + b1[6] * f5;
+    d[7][i] = b1[7] * f7 - b1[4] * f1;
+    d[3][i] = b1[6] * f3 - b1[5] * f5;
+  }
+  for (i = 0; i < 8; i++) {
+    for (j = 0; j < 8; j++) {
+      *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5);
+    }
+  }
+  return;
+}
+
+// Arithmetic downshift by n bits; a negative count is intended to
+// scale up instead.
+// NOTE(review): for n < 0 this expands to (d) << (n) with a NEGATIVE
+// shift count, which is undefined behavior in C -- it should
+// presumably be (d) << (-(n)).  All call sites pass
+// DWT_PRECISION_BITS - 1 or - 2, so this is only safe if
+// DWT_PRECISION_BITS >= 2; confirm and fix upstream.
+#define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n))
+
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+
+// Hybrid 32x32 transform, DWTDCT16X16_LEAN variant: one DWT level on
+// the 32x32 block, a 16x16 DCT on the low-low band only, and a plain
+// rescale (divide_bits) of the three detail bands to strip the DWT
+// precision headroom.  The 16x16 DCT reads from `out` with pitch 64
+// bytes (= 32 shorts) and its result is copied back over the LL band.
+void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
+  // assume out is a 32x32 buffer
+  short buffer[16 * 16];
+  int i, j;
+  const int short_pitch = pitch >> 1;
+#if DWT_TYPE == 26
+  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
+#endif
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
+  // Rescale the top-right detail band...
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 32; ++j) {
+      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+  // ...and the bottom half (both lower bands).
+  for (i = 16; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+}
+
+#elif DWTDCT_TYPE == DWTDCT16X16
+
+// Hybrid 32x32 transform, DWTDCT16X16 variant: one DWT level followed
+// by a 16x16 DCT on EACH of the four subbands (offsets 0, 16, 32*16
+// and 33*16 select the LL/HL/LH/HH quadrants of the 32x32 plane).
+// No divide_bits rescale is needed because the DCT's scale argument
+// already strips the DWT precision headroom.
+void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
+  // assume out is a 32x32 buffer
+  short buffer[16 * 16];
+  int i, j;  // NOTE(review): j is unused in this variant.
+  const int short_pitch = pitch >> 1;
+#if DWT_TYPE == 26
+  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);
+#endif
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);
+  vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);
+}
+
+#elif DWTDCT_TYPE == DWTDCT8X8
+
+// Hybrid 32x32 transform, DWTDCT8X8 variant: two DWT levels, then an
+// 8x8 DCT on each of the four second-level subbands (offsets 0, 8,
+// 32*8 and 33*8 are the quadrants of the 16x16 first-level LL band).
+// The first-level detail bands (right and bottom halves) get no DCT,
+// only a divide_bits rescale to strip the DWT precision headroom.
+void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {
+  // assume out is a 32x32 buffer
+  short buffer[8 * 8];
+  int i, j;
+  const int short_pitch = pitch >> 1;
+#if DWT_TYPE == 26
+  dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32);
+#endif
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i)
+    vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8);
+
+  vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i)
+    vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8);
+
+  vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i)
+    vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8);
+
+  vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);
+  for (i = 0; i < 8; ++i)
+    vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8);
+
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 32; ++j) {
+      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+  for (i = 16; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);
+    }
+  }
+}
+
+#endif
+
+#if CONFIG_TX64X64
+// Hybrid 64x64 transform: two DWT levels, then 16x16 DCTs on the
+// second-level subbands.  LEAN keeps the DCT only on the low-low band
+// and rescales everything else; DWTDCT16X16 runs a DCT on all four
+// 16x16 second-level bands and rescales the first-level detail bands.
+// NOTE(review): there is no branch for DWTDCT_TYPE == DWTDCT8X8, in
+// which case the non-LL bands would be left unscaled -- confirm that
+// configuration is never combined with CONFIG_TX64X64.
+void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {
+  // assume out is a 64x64 buffer
+  short buffer[16 * 16];
+  int i, j;
+  const int short_pitch = pitch >> 1;
+#if DWT_TYPE == 26
+  dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);
+#elif DWT_TYPE == 97
+  dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);
+#elif DWT_TYPE == 53
+  dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);
+#endif
+  // TODO(debargha): Implement more efficiently by adding output pitch
+  // argument to the dct16x16 function
+  vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);
+
+#if DWTDCT_TYPE == DWTDCT16X16_LEAN
+  for (i = 0; i < 16; ++i) {
+    for (j = 16; j < 48; ++j) {
+      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 16; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+#elif DWTDCT_TYPE == DWTDCT16X16
+  vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);
+
+  vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);
+  for (i = 0; i < 16; ++i)
+    vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);
+
+  // There is no dct used on the highest bands for now.
+  // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS
+  // TODO(debargha): experiment with turning these coeffs to 0
+  for (i = 0; i < 32; ++i) {
+    for (j = 32; j < 64; ++j) {
+      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+  for (i = 32; i < 64; ++i) {
+    for (j = 0; j < 64; ++j) {
+      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);
+    }
+  }
+#endif  // DWTDCT_TYPE
+}
+#endif  // CONFIG_TX64X64
+#endif  // CONFIG_DWTDCTHYBRID
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -16,6 +16,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_extend.h"
+#include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/encoder/vp9_segmentation.h"
@@ -44,15 +45,20 @@
 int enc_debug = 0;
 #endif
 
-static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int output_enabled,
-                              int mb_col, int mb_row);
+extern void select_interp_filter_type(VP9_COMP *cpi);
 
-static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int mb_col, int mb_row);
+static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int recon_yoffset, int recon_uvoffset,
+                              int output_enabled, int mb_row, int mb_col);
 
+static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col);
+
+static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col);
+
 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
 
 #ifdef MODE_STATS
@@ -79,7 +85,7 @@
  * Eventually this should be replaced by custom no-reference routines,
  *  which will be faster.
  */
-static const unsigned char VP9_VAR_OFFS[16] = {
+static const uint8_t VP9_VAR_OFFS[16] = {
   128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
 };
 
@@ -279,10 +285,6 @@
       xd->left_available = (mb_col != 0);
       recon_yoffset += 16;
 #endif
-#if !CONFIG_SUPERBLOCKS
-      // Copy current mb to a buffer
-      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
 
       // measure activity
       mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
@@ -391,12 +393,11 @@
   MACROBLOCKD *xd = &x->e_mbd;
   int max_mv = MV_MAX;
 
-  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], 0) +
+  cost = vp9_cost_mv_ref_id(xd->mb_mv_ref_probs[ref_frame], 0) +
          vp9_mv_bit_cost(&target_mv, &mv_ref_list[0], x->nmvjointcost,
                          x->mvcost, 96, xd->allow_high_precision_mv);
 
-  // Use 4 for now : for (i = 1; i < MAX_MV_REFS; ++i ) {
-  for (i = 1; i < 4; ++i) {
+  for (i = 1; i < MAX_MV_REF_CANDIDATES; ++i) {
     // If we see a 0,0 reference vector for a second time we have reached
     // the end of the list of valid candidate vectors.
     if (!mv_ref_list[i].as_int) {
@@ -413,7 +414,7 @@
       continue;
     }
 
-    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_id_probs[ref_frame], i) +
+    cost2 = vp9_cost_mv_ref_id(xd->mb_mv_ref_probs[ref_frame], i) +
             vp9_mv_bit_cost(&target_mv, &mv_ref_list[i], x->nmvjointcost,
                             x->mvcost, 96, xd->allow_high_precision_mv);
 
@@ -422,7 +423,6 @@
       best_index = i;
     }
   }
-
   best_ref->as_int = mv_ref_list[best_index].as_int;
 
   return best_index;
@@ -429,14 +429,18 @@
 }
 #endif
 
-static void update_state(VP9_COMP *cpi, MACROBLOCK *x,
-                         PICK_MODE_CONTEXT *ctx) {
-  int i;
-  MACROBLOCKD *xd = &x->e_mbd;
+static void update_state(VP9_COMP *cpi,
+                         PICK_MODE_CONTEXT *ctx, int block_size,
+                         int output_enabled) {
+  int i, x_idx, y;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *mi = &ctx->mic;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int mb_mode = mi->mbmi.mode;
   int mb_mode_index = ctx->best_mode_index;
+  const int mis = cpi->common.mode_info_stride;
+  int mb_block_size = 1 << mi->mbmi.sb_type;
 
 #if CONFIG_DEBUG
   assert(mb_mode < MB_MODE_COUNT);
@@ -443,22 +447,23 @@
   assert(mb_mode_index < MAX_MODES);
   assert(mi->mbmi.ref_frame < MAX_REF_FRAMES);
 #endif
+  assert(mi->mbmi.sb_type == (block_size >> 5));
 
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
-  vpx_memcpy(xd->mode_info_context, mi, sizeof(MODE_INFO));
-#if CONFIG_SUPERBLOCKS
-  if (mi->mbmi.encoded_as_sb) {
-    const int mis = cpi->common.mode_info_stride;
-    if (xd->mb_to_right_edge >= 0)
-      vpx_memcpy(xd->mode_info_context + 1, mi, sizeof(MODE_INFO));
-    if (xd->mb_to_bottom_edge >= 0) {
-      vpx_memcpy(xd->mode_info_context + mis, mi, sizeof(MODE_INFO));
-      if (xd->mb_to_right_edge >= 0)
-        vpx_memcpy(xd->mode_info_context + mis + 1, mi, sizeof(MODE_INFO));
+  for (y = 0; y < mb_block_size; y++) {
+    for (x_idx = 0; x_idx < mb_block_size; x_idx++) {
+      if ((xd->mb_to_right_edge >> 7) + mb_block_size > x_idx &&
+          (xd->mb_to_bottom_edge >> 7) + mb_block_size > y) {
+        MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
+
+        vpx_memcpy(mi_addr, mi, sizeof(MODE_INFO));
+      }
     }
   }
-#endif
+  if (block_size == 16) {
+    ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16];
+  }
 
   if (mb_mode == B_PRED) {
     for (i = 0; i < 16; i++) {
@@ -477,6 +482,10 @@
     mbmi->mv[1].as_int = x->partition_info->bmi[15].second_mv.as_int;
   }
 
+  x->skip = ctx->skip;
+  if (!output_enabled)
+    return;
+
   {
     int segment_id = mbmi->segment_id;
     if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
@@ -550,6 +559,7 @@
         best_index = pick_best_mv_ref(x, rf, mbmi->mv[0],
                                       mbmi->ref_mvs[rf], &best_mv);
         mbmi->best_index = best_index;
+        ++cpi->mb_mv_ref_count[rf][best_index];
 
         if (mbmi->second_ref_frame > 0) {
           unsigned int best_index;
@@ -558,6 +568,7 @@
                                mbmi->ref_mvs[sec_ref_frame],
                                &best_second_mv);
           mbmi->best_second_index = best_index;
+          ++cpi->mb_mv_ref_count[sec_ref_frame][best_index];
         }
 #endif
       }
@@ -578,6 +589,7 @@
         ++cpi->interintra_count[0];
       }
     }
+#endif
     if (cpi->common.mcomp_filter_type == SWITCHABLE &&
         mbmi->mode >= NEARESTMV &&
         mbmi->mode <= SPLITMV) {
@@ -585,7 +597,6 @@
           [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
           [vp9_switchable_interp_map[mbmi->interp_filter]];
     }
-#endif
 
     cpi->prediction_error += ctx->distortion;
     cpi->intra_error += ctx->intra_error;
@@ -596,34 +607,154 @@
   }
 }
 
+static unsigned find_seg_id(uint8_t *buf, int block_size,
+                            int start_y, int height, int start_x, int width) {
+  const int end_x = MIN(start_x + block_size, width);
+  const int end_y = MIN(start_y + block_size, height);
+  int x, y;
+  unsigned seg_id = -1;
+
+  buf += width * start_y;
+  for (y = start_y; y < end_y; y++, buf += width) {
+    for (x = start_x; x < end_x; x++) {
+      seg_id = MIN(seg_id, buf[x]);
+    }
+  }
+
+  return seg_id;
+}
+
+static void set_offsets(VP9_COMP *cpi,
+                        int mb_row, int mb_col, int block_size,
+                        int *ref_yoffset, int *ref_uvoffset) {
+  MACROBLOCK *const x = &cpi->mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+  const int dst_fb_idx = cm->new_fb_idx;
+  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;
+  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;
+  const int recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;
+  const int recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;
+  const int src_y_stride = x->src.y_stride;
+  const int src_uv_stride = x->src.uv_stride;
+  const int src_yoffset = 16 * mb_row * src_y_stride + 16 * mb_col;
+  const int src_uvoffset = 8 * mb_row * src_uv_stride + 8 * mb_col;
+  const int ref_fb_idx = cm->lst_fb_idx;
+  const int ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
+  const int ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+  const int idx_map = mb_row * cm->mb_cols + mb_col;
+  const int idx_str = xd->mode_info_stride * mb_row + mb_col;
+
+  // entropy context structures
+  xd->above_context = cm->above_context + mb_col;
+  xd->left_context  = cm->left_context + (mb_row & 3);
+
+  // GF active flags data structure
+  x->gf_active_ptr = (signed char *)&cpi->gf_active_flags[idx_map];
+
+  // Activity map pointer
+  x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
+  x->active_ptr = cpi->active_map + idx_map;
+
+  /* pointers to mode info contexts */
+  x->partition_info          = x->pi + idx_str;
+  xd->mode_info_context      = cm->mi + idx_str;
+  mbmi = &xd->mode_info_context->mbmi;
+  xd->prev_mode_info_context = cm->prev_mi + idx_str;
+
+  // Set up destination pointers
+  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
+  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
+  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+
+  /* Set up limit values for MV components to prevent them from
+   * extending beyond the UMV borders assuming 16x16 block size */
+  x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
+                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
+  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
+                   (VP9BORDERINPIXELS - block_size - VP9_INTERP_EXTEND));
+
+  // Set up distance of MB to edge of frame in 1/8th pel units
+  block_size >>= 4;  // in macroblock units
+  assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1)));
+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;
+
+  // Are edges available for intra prediction?
+  xd->up_available   = (mb_row != 0);
+  xd->left_available = (mb_col != 0);
+
+  /* Reference buffer offsets */
+  *ref_yoffset  = (mb_row * ref_y_stride * 16) + (mb_col * 16);
+  *ref_uvoffset = (mb_row * ref_uv_stride * 8) + (mb_col *  8);
+
+  /* set up source buffers */
+  x->src.y_buffer = cpi->Source->y_buffer + src_yoffset;
+  x->src.u_buffer = cpi->Source->u_buffer + src_uvoffset;
+  x->src.v_buffer = cpi->Source->v_buffer + src_uvoffset;
+
+  /* R/D setup */
+  x->rddiv = cpi->RDDIV;
+  x->rdmult = cpi->RDMULT;
+
+  /* segment ID */
+  if (xd->segmentation_enabled) {
+    if (xd->update_mb_segmentation_map) {
+      mbmi->segment_id = find_seg_id(cpi->segmentation_map, block_size,
+                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
+    } else {
+      mbmi->segment_id = find_seg_id(cm->last_frame_seg_map, block_size,
+                                     mb_row, cm->mb_rows, mb_col, cm->mb_cols);
+    }
+    assert(mbmi->segment_id <= 3);
+    vp9_mb_init_quantizer(cpi, x);
+
+    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
+        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
+        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
+        vp9_check_segref(xd, 1, INTRA_FRAME)  +
+        vp9_check_segref(xd, 1, LAST_FRAME)   +
+        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
+        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
+      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
+    } else {
+      const int y = mb_row & ~3;
+      const int x = mb_col & ~3;
+      const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);
+      const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
+
+      cpi->seg0_progress =
+          ((y * cm->mb_cols + x * 4 + p32 + p16) << 16) / cm->MBs;
+    }
+  } else {
+    mbmi->segment_id = 0;
+  }
+}
+
 static void pick_mb_modes(VP9_COMP *cpi,
-                          VP9_COMMON *cm,
                           int mb_row,
                           int mb_col,
-                          MACROBLOCK  *x,
-                          MACROBLOCKD *xd,
                           TOKENEXTRA **tp,
                           int *totalrate,
                           int *totaldist) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int i;
-  int map_index;
   int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
   ENTROPY_CONTEXT_PLANES left_context[2];
   ENTROPY_CONTEXT_PLANES above_context[2];
   ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
                                                       + mb_col;
 
-  // Offsets to move pointers from MB to MB within a SB in raster order
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
-
   /* Function should not modify L & A contexts; save and restore on exit */
   vpx_memcpy(left_context,
-             cm->left_context,
+             cm->left_context + (mb_row & 2),
              sizeof(left_context));
   vpx_memcpy(above_context,
              initial_above_context_ptr,
@@ -631,113 +762,29 @@
 
   /* Encode MBs in raster order within the SB */
   for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_unextended = dy * cm->mb_cols + dx;
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+    const int x_idx = i & 1, y_idx = i >> 1;
+    MB_MODE_INFO *mbmi;
 
-    // TODO Many of the index items here can be computed more efficiently!
-
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
+    if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {
       // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
-
-      // Update pointers
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-      x->gf_active_ptr += offset_unextended;
-      x->partition_info += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
-#endif
       continue;
     }
 
     // Index of the MB in the SB 0..3
     xd->mb_index = i;
+    set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,
+                &recon_yoffset, &recon_uvoffset);
 
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-    // set above context pointer
-    xd->above_context = cm->above_context + mb_col;
-
-    // Restore the appropriate left context depending on which
-    // row in the SB the MB is situated
-    xd->left_context = cm->left_context + (i >> 1);
-
-    // Set up distance of MB to edge of frame in 1/8th pel units
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-    xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-    xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 16x16 block size
-    x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-    x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                     (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
-    x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                     (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
-
-    xd->up_available   = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
-
-    recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-
-#if !CONFIG_SUPERBLOCKS
-    // Copy current MB to a work buffer
-    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-
-    x->rddiv = cpi->RDDIV;
-    x->rdmult = cpi->RDMULT;
-
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
       vp9_activity_masking(cpi, x);
 
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      // Code to set segment id in xd->mbmi.segment_id
-      if (xd->update_mb_segmentation_map)
-        mbmi->segment_id = cpi->segmentation_map[map_index];
-      else
-        mbmi->segment_id = cm->last_frame_seg_map[map_index];
-      if (mbmi->segment_id > 3)
-        mbmi->segment_id = 0;
+    mbmi = &xd->mode_info_context->mbmi;
+    mbmi->sb_type = BLOCK_SIZE_MB16X16;
 
-      vp9_mb_init_quantizer(cpi, x);
-    } else
-      // Set to Segment 0 by default
-      mbmi->segment_id = 0;
-
-    x->active_ptr = cpi->active_map + map_index;
-
-#if CONFIG_SUPERBLOCKS
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
-
     cpi->update_context = 0;    // TODO Do we need this now??
 
     vp9_intra_prediction_down_copy(xd);
 
-#ifdef ENC_DEBUG
-      enc_debug = (cpi->common.current_video_frame == 46 &&
-                   mb_row == 5 && mb_col == 2);
-#endif
     // Find best coding mode & reconstruct the MB so it is available
     // as a predictor for MBs that follow in the SB
     if (cm->frame_type == KEY_FRAME) {
@@ -751,28 +798,16 @@
       *totaldist += d;
 
       // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, x, tp,
-                        recon_yoffset, recon_uvoffset, 0, mb_col, mb_row);
+      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,
+                        mb_row + y_idx, mb_col + x_idx);
       // Note the encoder may have changed the segment_id
 
       // Save the coding context
-      vpx_memcpy(&x->mb_context[i].mic, xd->mode_info_context,
+      vpx_memcpy(&x->mb_context[xd->sb_index][i].mic, xd->mode_info_context,
                  sizeof(MODE_INFO));
     } else {
       int seg_id, r, d;
 
-      if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-          !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-          vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-          vp9_check_segref(xd, 1, INTRA_FRAME)  +
-          vp9_check_segref(xd, 1, LAST_FRAME)   +
-          vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-          vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-        cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-      } else {
-        cpi->seg0_progress = (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols + i) << 16) / cm->MBs;
-      }
-
 #ifdef ENC_DEBUG
       if (enc_debug)
         printf("inter pick_mb_modes %d %d\n", mb_row, mb_col);
@@ -783,8 +818,8 @@
       *totaldist += d;
 
       // Dummy encode, do not do the tokenization
-      encode_macroblock(cpi, x, tp,
-                        recon_yoffset, recon_uvoffset, 0, mb_col, mb_row);
+      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,
+                        mb_row + y_idx, mb_col + x_idx);
 
       seg_id = mbmi->segment_id;
       if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {
@@ -804,28 +839,10 @@
         cpi->ref_pred_count[pred_context][pred_flag]++;
       }
     }
-
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
-
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-    x->gf_active_ptr += offset_unextended;
-    x->partition_info += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
   }
 
   /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy(cm->left_context,
+  vpx_memcpy(cm->left_context + (mb_row & 2),
              left_context,
              sizeof(left_context));
   vpx_memcpy(initial_above_context_ptr,
@@ -833,393 +850,193 @@
              sizeof(above_context));
 }
 
-#if CONFIG_SUPERBLOCKS
-static void pick_sb_modes (VP9_COMP *cpi,
-                           VP9_COMMON *cm,
-                           int mb_row,
-                           int mb_col,
-                           MACROBLOCK  *x,
-                           MACROBLOCKD *xd,
-                           TOKENEXTRA **tp,
-                           int *totalrate,
-                           int *totaldist)
-{
-  int map_index;
+static void pick_sb_modes(VP9_COMP *cpi,
+                          int mb_row,
+                          int mb_col,
+                          TOKENEXTRA **tp,
+                          int *totalrate,
+                          int *totaldist) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  ENTROPY_CONTEXT_PLANES left_context[2];
-  ENTROPY_CONTEXT_PLANES above_context[2];
-  ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context
-    + mb_col;
 
-  /* Function should not modify L & A contexts; save and restore on exit */
-  vpx_memcpy (left_context,
-              cm->left_context,
-              sizeof(left_context));
-  vpx_memcpy (above_context,
-              initial_above_context_ptr,
-              sizeof(above_context));
-
-  map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-  x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-
-  /* set above context pointer */
-  xd->above_context = cm->above_context + mb_col;
-
-  /* Restore the appropriate left context depending on which
-   * row in the SB the MB is situated */
-  xd->left_context = cm->left_context;
-
-  // Set up distance of MB to edge of frame in 1/8th pel units
-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-  xd->mb_to_bottom_edge = ((cm->mb_rows - 2 - mb_row) * 16) << 3;
-  xd->mb_to_right_edge  = ((cm->mb_cols - 2 - mb_col) * 16) << 3;
-
-  /* Set up limit values for MV components to prevent them from
-   * extending beyond the UMV borders assuming 16x16 block size */
-  x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-  x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                   (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
-  x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                   (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
-
-  xd->up_available   = (mb_row != 0);
-  xd->left_available = (mb_col != 0);
-
-  recon_yoffset  = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-  recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col *  8);
-
-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
-#if 0 // FIXME
-  /* Copy current MB to a work buffer */
-  vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-  x->rddiv = cpi->RDDIV;
-  x->rdmult = cpi->RDMULT;
-  if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
+  set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
     vp9_activity_masking(cpi, x);
-  /* Is segmentation enabled */
-  if (xd->segmentation_enabled)
-  {
-    /* Code to set segment id in xd->mbmi.segment_id */
-    if (xd->update_mb_segmentation_map)
-      xd->mode_info_context->mbmi.segment_id =
-            cpi->segmentation_map[map_index] &&
-            cpi->segmentation_map[map_index + 1] &&
-            cpi->segmentation_map[map_index + cm->mb_cols] &&
-            cpi->segmentation_map[map_index + cm->mb_cols + 1];
-    else
-      xd->mode_info_context->mbmi.segment_id =
-            cm->last_frame_seg_map[map_index] &&
-            cm->last_frame_seg_map[map_index + 1] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols] &&
-            cm->last_frame_seg_map[map_index + cm->mb_cols + 1];
-    if (xd->mode_info_context->mbmi.segment_id > 3)
-      xd->mode_info_context->mbmi.segment_id = 0;
-
-    vp9_mb_init_quantizer(cpi, x);
-  }
-  else
-    /* Set to Segment 0 by default */
-    xd->mode_info_context->mbmi.segment_id = 0;
-
-  x->active_ptr = cpi->active_map + map_index;
-
   cpi->update_context = 0;    // TODO Do we need this now??
 
   /* Find best coding mode & reconstruct the MB so it is available
    * as a predictor for MBs that follow in the SB */
-  if (cm->frame_type == KEY_FRAME)
-  {
-    vp9_rd_pick_intra_mode_sb(cpi, x,
-                              totalrate,
-                              totaldist);
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb32(cpi, x,
+                                totalrate,
+                                totaldist);
 
     /* Save the coding context */
-    vpx_memcpy(&x->sb_context[0].mic, xd->mode_info_context,
+    vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context,
                sizeof(MODE_INFO));
   } else {
-    if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
-        !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
-        vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME) &&
-        vp9_check_segref(xd, 1, INTRA_FRAME)  +
-        vp9_check_segref(xd, 1, LAST_FRAME)   +
-        vp9_check_segref(xd, 1, GOLDEN_FRAME) +
-        vp9_check_segref(xd, 1, ALTREF_FRAME) == 1) {
-      cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
-    } else {
-      cpi->seg0_progress =
-        (((mb_col & ~1) * 2 + (mb_row & ~1) * cm->mb_cols) << 16) / cm->MBs;
-    }
-
-    vp9_rd_pick_inter_mode_sb(cpi, x,
-                              recon_yoffset,
-                              recon_uvoffset,
-                              totalrate,
-                              totaldist);
+    vp9_rd_pick_inter_mode_sb32(cpi, x,
+                                recon_yoffset,
+                                recon_uvoffset,
+                                totalrate,
+                                totaldist);
   }
-
-  /* Restore L & A coding context to those in place on entry */
-  vpx_memcpy (cm->left_context,
-              left_context,
-              sizeof(left_context));
-  vpx_memcpy (initial_above_context_ptr,
-              above_context,
-              sizeof(above_context));
 }
-#endif
 
-static void encode_sb(VP9_COMP *cpi,
-                      VP9_COMMON *cm,
-                      int mbrow,
-                      int mbcol,
-                      MACROBLOCK  *x,
-                      MACROBLOCKD *xd,
-                      TOKENEXTRA **tp) {
-  int i;
-  int map_index;
-  int mb_row, mb_col;
+static void pick_sb64_modes(VP9_COMP *cpi,
+                            int mb_row,
+                            int mb_col,
+                            TOKENEXTRA **tp,
+                            int *totalrate,
+                            int *totaldist) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int recon_yoffset, recon_uvoffset;
-  int ref_fb_idx = cm->lst_fb_idx;
-  int dst_fb_idx = cm->new_fb_idx;
-  int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
-  int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
-  int row_delta[4] = { 0, +1,  0, -1};
-  int col_delta[4] = { +1, -1, +1, +1};
 
-  mb_row = mbrow;
-  mb_col = mbcol;
+  set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);
+  xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
+  if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+    vp9_activity_masking(cpi, x);
+  cpi->update_context = 0;    // TODO(rbultje) Do we need this now??
 
-  /* Encode MBs in raster order within the SB */
-  for (i = 0; i < 4; i++) {
-    int dy = row_delta[i];
-    int dx = col_delta[i];
-    int offset_extended   = dy * xd->mode_info_stride + dx;
-    int offset_unextended = dy * cm->mb_cols + dx;
-    MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  /* Find best coding mode & reconstruct the MB so it is available
+   * as a predictor for MBs that follow in the SB */
+  if (cm->frame_type == KEY_FRAME) {
+    vp9_rd_pick_intra_mode_sb64(cpi, x,
+                                totalrate,
+                                totaldist);
 
-    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {
-      // MB lies outside frame, move on
-      mb_row += dy;
-      mb_col += dx;
+    /* Save the coding context */
+    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context,
+               sizeof(MODE_INFO));
+  } else {
+    vp9_rd_pick_inter_mode_sb64(cpi, x,
+                                recon_yoffset,
+                                recon_uvoffset,
+                                totalrate,
+                                totaldist);
+  }
+}
 
-      x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-      x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-      x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
+static void update_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
 
-      x->gf_active_ptr      += offset_unextended;
-      x->partition_info     += offset_extended;
-      xd->mode_info_context += offset_extended;
-      xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-      assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-             (xd->mode_info_context - cpi->common.mip));
+  if (cm->frame_type == KEY_FRAME) {
+#ifdef MODE_STATS
+    y_modes[mbmi->mode]++;
 #endif
-      continue;
-    }
+  } else {
+    int segment_id, seg_ref_active;
 
-    xd->mb_index = i;
+    if (mbmi->ref_frame) {
+      int pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
 
-    // Restore MB state to that when it was picked
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      update_state(cpi, x, &x->sb_context[i]);
-      cpi->sb_count++;
-    } else
-#endif
-      update_state(cpi, x, &x->mb_context[i]);
+      if (mbmi->second_ref_frame <= INTRA_FRAME)
+        cpi->single_pred_count[pred_context]++;
+      else
+        cpi->comp_pred_count[pred_context]++;
+    }
 
-    map_index = (mb_row * cpi->common.mb_cols) + mb_col;
-    x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+#ifdef MODE_STATS
+    inter_y_modes[mbmi->mode]++;
 
-    // reset above block coeffs
-    xd->above_context = cm->above_context + mb_col;
-    xd->left_context  = cm->left_context + (i >> 1);
+    if (mbmi->mode == SPLITMV) {
+      int b;
 
-    // Set up distance of MB to edge of the frame in 1/8th pel units
-    // Set up limit values for MV components to prevent them from
-    // extending beyond the UMV borders assuming 32x32 block size
-    x->mv_row_min = -((mb_row * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-    x->mv_col_min = -((mb_col * 16) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
-
-    xd->mb_to_top_edge    = -((mb_row * 16) << 3);
-    xd->mb_to_left_edge   = -((mb_col * 16) << 3);
-
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP9BORDERINPIXELS - 32 - VP9_INTERP_EXTEND));
-
-      xd->mb_to_bottom_edge = ((cm->mb_rows - 2 - mb_row) * 16) << 3;
-      xd->mb_to_right_edge  = ((cm->mb_cols - 2 - mb_col) * 16) << 3;
-    } else {
+      for (b = 0; b < x->partition_info->count; b++) {
+        inter_b_modes[x->partition_info->bmi[b].mode]++;
+      }
+    }
 #endif
-      x->mv_row_max = ((cm->mb_rows - mb_row) * 16 +
-                       (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
-      x->mv_col_max = ((cm->mb_cols - mb_col) * 16 +
-                       (VP9BORDERINPIXELS - 16 - VP9_INTERP_EXTEND));
 
-      xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-      xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
+    // If we have just a single reference frame coded for a segment then
+    // exclude from the reference frame counts used to work out
+    // probabilities. NOTE: At the moment we dont support custom trees
+    // for the reference frame coding for each segment but this is a
+    // possible future action.
+    segment_id = mbmi->segment_id;
+    seg_ref_active = vp9_segfeature_active(xd, segment_id,
+                                           SEG_LVL_REF_FRAME);
+    if (!seg_ref_active ||
+        ((vp9_check_segref(xd, segment_id, INTRA_FRAME) +
+          vp9_check_segref(xd, segment_id, LAST_FRAME) +
+          vp9_check_segref(xd, segment_id, GOLDEN_FRAME) +
+          vp9_check_segref(xd, segment_id, ALTREF_FRAME)) > 1)) {
+      cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
     }
-#endif
+    // Count of last ref frame 0,0 usage
+    if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
+      cpi->inter_zz_count++;
+  }
+}
 
-    xd->up_available = (mb_row != 0);
-    xd->left_available = (mb_col != 0);
+static void encode_sb(VP9_COMP *cpi,
+                      int mb_row,
+                      int mb_col,
+                      int output_enabled,
+                      TOKENEXTRA **tp, int is_sb) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int recon_yoffset, recon_uvoffset;
 
-    recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);
+  cpi->sb32_count[is_sb]++;
+  if (is_sb) {
+    set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);
+    update_state(cpi, &x->sb32_context[xd->sb_index], 32, output_enabled);
 
-    xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
-    xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
-    xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
+    encode_superblock32(cpi, tp, recon_yoffset, recon_uvoffset,
+                        output_enabled, mb_row, mb_col);
+    if (output_enabled)
+      update_stats(cpi);
 
-#if !CONFIG_SUPERBLOCKS
-    // Copy current MB to a work buffer
-    vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-
-    if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-      vp9_activity_masking(cpi, x);
-
-    // Is segmentation enabled
-    if (xd->segmentation_enabled) {
-      vp9_mb_init_quantizer(cpi, x);
+    if (output_enabled) {
+      (*tp)->Token = EOSB_TOKEN;
+      (*tp)++;
+      if (mb_row < cm->mb_rows)
+        cpi->tplist[mb_row].stop = *tp;
     }
+  } else {
+    int i;
 
-    x->active_ptr = cpi->active_map + map_index;
+    for (i = 0; i < 4; i++) {
+      const int x_idx = i & 1, y_idx = i >> 1;
 
-    cpi->update_context = 0;
-
-#if CONFIG_SUPERBLOCKS
-    if (!xd->mode_info_context->mbmi.encoded_as_sb)
-#endif
-      vp9_intra_prediction_down_copy(xd);
-
-    if (cm->frame_type == KEY_FRAME) {
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        encode_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
-                          mb_col, mb_row);
-      else
-#endif
-        encode_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, 1,
-                          mb_col, mb_row);
-        // Note the encoder may have changed the segment_id
-
-#ifdef MODE_STATS
-      y_modes[mbmi->mode]++;
-#endif
-    } else {
-      unsigned char *segment_id;
-      int seg_ref_active;
-
-      if (xd->mode_info_context->mbmi.ref_frame) {
-        unsigned char pred_context;
-
-        pred_context = vp9_get_pred_context(cm, xd, PRED_COMP);
-
-        if (xd->mode_info_context->mbmi.second_ref_frame <= INTRA_FRAME)
-          cpi->single_pred_count[pred_context]++;
-        else
-          cpi->comp_pred_count[pred_context]++;
+      if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {
+        // MB lies outside frame, move on
+        continue;
       }
 
-#if CONFIG_SUPERBLOCKS
-      if (xd->mode_info_context->mbmi.encoded_as_sb)
-        encode_superblock(cpi, x, tp, recon_yoffset, recon_uvoffset,
-                          mb_col, mb_row);
-      else
-#endif
-        encode_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset, 1,
-                          mb_col, mb_row);
-        // Note the encoder may have changed the segment_id
+      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,
+                  &recon_yoffset, &recon_uvoffset);
+      xd->mb_index = i;
+      update_state(cpi, &x->mb_context[xd->sb_index][i], 16, output_enabled);
 
-#ifdef MODE_STATS
-      inter_y_modes[mbmi->mode]++;
+      if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
+        vp9_activity_masking(cpi, x);
 
-      if (mbmi->mode == SPLITMV) {
-        int b;
+      vp9_intra_prediction_down_copy(xd);
 
-        for (b = 0; b < x->partition_info->count; b++) {
-          inter_b_modes[x->partition_info->bmi[b].mode]++;
-        }
-      }
+      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset,
+                        output_enabled, mb_row + y_idx, mb_col + x_idx);
+      if (output_enabled)
+        update_stats(cpi);
 
-#endif
-
-      // If we have just a single reference frame coded for a segment then
-      // exclude from the reference frame counts used to work out
-      // probabilities. NOTE: At the moment we dont support custom trees
-      // for the reference frame coding for each segment but this is a
-      // possible future action.
-      segment_id = &mbmi->segment_id;
-      seg_ref_active = vp9_segfeature_active(xd, *segment_id,
-                                             SEG_LVL_REF_FRAME);
-      if (!seg_ref_active ||
-          ((vp9_check_segref(xd, *segment_id, INTRA_FRAME) +
-            vp9_check_segref(xd, *segment_id, LAST_FRAME) +
-            vp9_check_segref(xd, *segment_id, GOLDEN_FRAME) +
-            vp9_check_segref(xd, *segment_id, ALTREF_FRAME)) > 1)) {
-        {
-          cpi->count_mb_ref_frame_usage[mbmi->ref_frame]++;
-        }
+      if (output_enabled) {
+        (*tp)->Token = EOSB_TOKEN;
+        (*tp)++;
+        if (mb_row + y_idx < cm->mb_rows)
+          cpi->tplist[mb_row + y_idx].stop = *tp;
       }
-
-      // Count of last ref frame 0,0 usage
-      if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
-        cpi->inter_zz_count++;
     }
-
-#if CONFIG_SUPERBLOCKS
-    if (xd->mode_info_context->mbmi.encoded_as_sb) {
-      x->src.y_buffer += 32;
-      x->src.u_buffer += 16;
-      x->src.v_buffer += 16;
-
-      x->gf_active_ptr      += 2;
-      x->partition_info     += 2;
-      xd->mode_info_context += 2;
-      xd->prev_mode_info_context += 2;
-
-      (*tp)->Token = EOSB_TOKEN;
-      (*tp)++;
-      if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
-      break;
-    }
-#endif
-
-    // Next MB
-    mb_row += dy;
-    mb_col += dx;
-
-    x->src.y_buffer += 16 * (dx + dy * x->src.y_stride);
-    x->src.u_buffer += 8  * (dx + dy * x->src.uv_stride);
-    x->src.v_buffer += 8  * (dx + dy * x->src.uv_stride);
-
-    x->gf_active_ptr      += offset_unextended;
-    x->partition_info     += offset_extended;
-    xd->mode_info_context += offset_extended;
-    xd->prev_mode_info_context += offset_extended;
-
-#if CONFIG_DEBUG
-    assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-           (xd->mode_info_context - cpi->common.mip));
-#endif
-    (*tp)->Token = EOSB_TOKEN;
-    (*tp)++;
-    if (mb_row < cm->mb_rows) cpi->tplist[mb_row].stop = *tp;
   }
 
   // debug output
@@ -1233,14 +1050,53 @@
 #endif
 }
 
-static
-void encode_sb_row(VP9_COMP *cpi,
-                   VP9_COMMON *cm,
-                   int mb_row,
-                   MACROBLOCK  *x,
-                   MACROBLOCKD *xd,
-                   TOKENEXTRA **tp,
-                   int *totalrate) {
+static void encode_sb64(VP9_COMP *cpi,
+                        int mb_row,
+                        int mb_col,
+                        TOKENEXTRA **tp, int is_sb[4]) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  cpi->sb64_count[is_sb[0] == 2]++;
+  if (is_sb[0] == 2) {
+    int recon_yoffset, recon_uvoffset;
+
+    set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);
+    update_state(cpi, &x->sb64_context, 64, 1);
+    encode_superblock64(cpi, tp, recon_yoffset, recon_uvoffset,
+                        1, mb_row, mb_col);
+    update_stats(cpi);
+
+    (*tp)->Token = EOSB_TOKEN;
+    (*tp)++;
+    if (mb_row < cm->mb_rows)
+      cpi->tplist[mb_row].stop = *tp;
+  } else {
+    int i;
+
+    for (i = 0; i < 4; i++) {
+      const int x_idx = i & 1, y_idx = i >> 1;
+
+      if (mb_row + y_idx * 2 >= cm->mb_rows ||
+          mb_col + x_idx * 2 >= cm->mb_cols) {
+        // MB lies outside frame, move on
+        continue;
+      }
+      xd->sb_index = i;
+      encode_sb(cpi, mb_row + 2 * y_idx, mb_col + 2 * x_idx, 1, tp,
+                is_sb[i]);
+    }
+  }
+}
+
+static void encode_sb_row(VP9_COMP *cpi,
+                          int mb_row,
+                          TOKENEXTRA **tp,
+                          int *totalrate) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
   int mb_col;
   int mb_cols = cm->mb_cols;
 
@@ -1248,105 +1104,84 @@
   vpx_memset(cm->left_context, 0, sizeof(cm->left_context));
 
   // Code each SB in the row
-  for (mb_col = 0; mb_col < mb_cols; mb_col += 2) {
-    int mb_rate = 0, mb_dist = 0;
-#if CONFIG_SUPERBLOCKS
-    int sb_rate = INT_MAX, sb_dist;
-#endif
+  for (mb_col = 0; mb_col < mb_cols; mb_col += 4) {
+    int i;
+    int sb32_rate = 0, sb32_dist = 0;
+    int is_sb[4];
+    int sb64_rate = INT_MAX, sb64_dist;
+    ENTROPY_CONTEXT_PLANES l[4], a[4];
+    TOKENEXTRA *tp_orig = *tp;
 
-#if CONFIG_DEBUG
-    MODE_INFO *mic = xd->mode_info_context;
-    PARTITION_INFO *pi = x->partition_info;
-    signed char  *gfa = x->gf_active_ptr;
-    unsigned char *yb = x->src.y_buffer;
-    unsigned char *ub = x->src.u_buffer;
-    unsigned char *vb = x->src.v_buffer;
-#endif
+    memcpy(&a, cm->above_context + mb_col, sizeof(a));
+    memcpy(&l, cm->left_context, sizeof(l));
+    for (i = 0; i < 4; i++) {
+      const int x_idx = (i & 1) << 1, y_idx = i & 2;
+      int mb_rate = 0, mb_dist = 0;
+      int sb_rate = INT_MAX, sb_dist;
 
-#if CONFIG_SUPERBLOCKS
-    // Pick modes assuming the SB is coded as 4 independent MBs
-    xd->mode_info_context->mbmi.encoded_as_sb = 0;
-#endif
-    pick_mb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &mb_rate, &mb_dist);
-#if CONFIG_SUPERBLOCKS
-    mb_rate += vp9_cost_bit(cm->sb_coded, 0);
-#endif
+      if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols)
+        continue;
 
-    x->src.y_buffer -= 32;
-    x->src.u_buffer -= 16;
-    x->src.v_buffer -= 16;
+      xd->sb_index = i;
 
-    x->gf_active_ptr -= 2;
-    x->partition_info -= 2;
-    xd->mode_info_context -= 2;
-    xd->prev_mode_info_context -= 2;
+      pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
+                    tp, &mb_rate, &mb_dist);
+      mb_rate += vp9_cost_bit(cm->sb32_coded, 0);
 
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa);
-    assert(x->partition_info == pi);
-    assert(xd->mode_info_context == mic);
-    assert(x->src.y_buffer == yb);
-    assert(x->src.u_buffer == ub);
-    assert(x->src.v_buffer == vb);
-#endif
+      if (!(((    mb_cols & 1) && mb_col + x_idx ==     mb_cols - 1) ||
+            ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {
+        /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
+        pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx,
+                      tp, &sb_rate, &sb_dist);
+        sb_rate += vp9_cost_bit(cm->sb32_coded, 1);
+      }
 
-#if CONFIG_SUPERBLOCKS
-    if (!(((    mb_cols & 1) && mb_col ==     mb_cols - 1) ||
-          ((cm->mb_rows & 1) && mb_row == cm->mb_rows - 1))) {
-      /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      pick_sb_modes(cpi, cm, mb_row, mb_col, x, xd, tp, &sb_rate, &sb_dist);
-      sb_rate += vp9_cost_bit(cm->sb_coded, 1);
-    }
-
-    /* Decide whether to encode as a SB or 4xMBs */
-    if (sb_rate < INT_MAX &&
-        RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
-          RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
-      xd->mode_info_context->mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 1;
-      *totalrate += sb_rate;
-    } else
-#endif
-    {
-#if CONFIG_SUPERBLOCKS
-      xd->mode_info_context->mbmi.encoded_as_sb = 0;
-      if (cm->mb_cols - 1 > mb_col)
-        xd->mode_info_context[1].mbmi.encoded_as_sb = 0;
-      if (cm->mb_rows - 1 > mb_row) {
-        xd->mode_info_context[cm->mode_info_stride].mbmi.encoded_as_sb = 0;
-        if (cm->mb_cols - 1 > mb_col)
-          xd->mode_info_context[1 + cm->mode_info_stride].mbmi.encoded_as_sb = 0;
+      /* Decide whether to encode as a SB or 4xMBs */
+      if (sb_rate < INT_MAX &&
+          RDCOST(x->rdmult, x->rddiv, sb_rate, sb_dist) <
+              RDCOST(x->rdmult, x->rddiv, mb_rate, mb_dist)) {
+        is_sb[i] = 1;
+        sb32_rate += sb_rate;
+        sb32_dist += sb_dist;
+      } else {
+        is_sb[i] = 0;
+        sb32_rate += mb_rate;
+        sb32_dist += mb_dist;
       }
-#endif
-      *totalrate += mb_rate;
+
+      /* Encode SB using best computed mode(s) */
+      // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
+      // for each level that we go up, we can just keep tokens and recon
+      // pixels of the lower level; also, inverting SB/MB order (big->small
+      // instead of small->big) means we can use as threshold for small, which
+      // may enable breakouts if RD is not good enough (i.e. faster)
+      encode_sb(cpi, mb_row + y_idx, mb_col + x_idx, 0, tp, is_sb[i]);
     }
 
-    /* Encode SB using best computed mode(s) */
-    encode_sb(cpi, cm, mb_row, mb_col, x, xd, tp);
+    memcpy(cm->above_context + mb_col, &a, sizeof(a));
+    memcpy(cm->left_context, &l, sizeof(l));
+    sb32_rate += vp9_cost_bit(cm->sb64_coded, 0);
 
-#if CONFIG_DEBUG
-    assert(x->gf_active_ptr == gfa + 2);
-    assert(x->partition_info == pi + 2);
-    assert(xd->mode_info_context == mic + 2);
-    assert(x->src.y_buffer == yb + 32);
-    assert(x->src.u_buffer == ub + 16);
-    assert(x->src.v_buffer == vb + 16);
-#endif
-  }
+    if (!(((    mb_cols & 3) && mb_col + 3 >=     mb_cols) ||
+          ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) {
+      pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist);
+      sb64_rate += vp9_cost_bit(cm->sb64_coded, 1);
+    }
 
-  // this is to account for the border
-  x->gf_active_ptr += mb_cols - (mb_cols & 0x1);
-  x->partition_info += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
-  xd->prev_mode_info_context += xd->mode_info_stride + 1 - (mb_cols & 0x1);
+    /* Decide whether to encode as a SB or 4xMBs */
+    if (sb64_rate < INT_MAX &&
+        RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist) <
+            RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
+      is_sb[0] = 2;
+      *totalrate += sb64_rate;
+    } else {
+      *totalrate += sb32_rate;
+    }
 
-#if CONFIG_DEBUG
-  assert((xd->prev_mode_info_context - cpi->common.prev_mip) ==
-         (xd->mode_info_context - cpi->common.mip));
-#endif
+    assert(tp_orig == *tp);
+    encode_sb64(cpi, mb_row, mb_col, tp, is_sb);
+    assert(tp_orig < *tp);
+  }
 }
 
 static void init_encode_frame_mb_context(VP9_COMP *cpi) {
@@ -1354,22 +1189,11 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  // GF active flags data structure
-  x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
-
-  // Activity map pointer
-  x->mb_activity_ptr = cpi->mb_activity_map;
-
   x->act_zbin_adj = 0;
   cpi->seg0_idx = 0;
   vpx_memset(cpi->ref_pred_count, 0, sizeof(cpi->ref_pred_count));
 
-  x->partition_info = x->pi;
-
-  xd->mode_info_context = cm->mi;
   xd->mode_info_stride = cm->mode_info_stride;
-  xd->prev_mode_info_context = cm->prev_mi;
-
   xd->frame_type = cm->frame_type;
 
   xd->frames_since_golden = cm->frames_since_golden;
@@ -1380,7 +1204,7 @@
     vp9_init_mbmode_probs(cm);
 
   // Copy data over into macro block data structures.
-  x->src = * cpi->Source;
+  x->src = *cpi->Source;
   xd->pre = cm->yv12_fb[cm->lst_fb_idx];
   xd->dst = cm->yv12_fb[cm->new_fb_idx];
 
@@ -1404,10 +1228,9 @@
   vp9_zero(cpi->sub_mv_ref_count)
   vp9_zero(cpi->mbsplit_count)
   vp9_zero(cpi->common.fc.mv_ref_ct)
-#if CONFIG_SUPERBLOCKS
   vp9_zero(cpi->sb_ymode_count)
-  cpi->sb_count = 0;
-#endif
+  vp9_zero(cpi->sb32_count);
+  vp9_zero(cpi->sb64_count);
 #if CONFIG_COMP_INTERINTRA_PRED
   vp9_zero(cpi->interintra_count);
   vp9_zero(cpi->interintra_select_count);
@@ -1438,11 +1261,6 @@
   // this frame which may be updated with each iteration of the recode loop.
   vp9_compute_mod_refprobs(cm);
 
-#if CONFIG_NEW_MVREF
-  // temp stats reset
-  vp9_zero( cpi->best_ref_index_counts );
-#endif
-
 // debug output
 #if DBG_PRNT_SEGMAP
   {
@@ -1466,28 +1284,23 @@
   cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;
   cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;
 
-#if CONFIG_PRED_FILTER
-  if (cm->current_video_frame == 0) {
-    // Initially assume that we'll signal the prediction filter
-    // state at the frame level and that it is off.
-    cpi->common.pred_filter_mode = 0;
-    cpi->common.prob_pred_filter_off = 128;
-  }
-  cpi->pred_filter_on_count = 0;
-  cpi->pred_filter_off_count = 0;
-#endif
   vp9_zero(cpi->switchable_interp_count);
+  vp9_zero(cpi->best_switchable_interp_count);
 
   xd->mode_info_context = cm->mi;
   xd->prev_mode_info_context = cm->prev_mi;
 
   vp9_zero(cpi->NMVcount);
-  vp9_zero(cpi->coef_counts);
-  vp9_zero(cpi->hybrid_coef_counts);
+  vp9_zero(cpi->coef_counts_4x4);
+  vp9_zero(cpi->hybrid_coef_counts_4x4);
   vp9_zero(cpi->coef_counts_8x8);
   vp9_zero(cpi->hybrid_coef_counts_8x8);
   vp9_zero(cpi->coef_counts_16x16);
   vp9_zero(cpi->hybrid_coef_counts_16x16);
+  vp9_zero(cpi->coef_counts_32x32);
+#if CONFIG_NEW_MVREF
+  vp9_zero(cpi->mb_mv_ref_count);
+#endif
 
   vp9_frame_init_quantizer(cpi);
 
@@ -1508,7 +1321,8 @@
   vpx_memset(cpi->rd_comp_pred_diff, 0, sizeof(cpi->rd_comp_pred_diff));
   vpx_memset(cpi->single_pred_count, 0, sizeof(cpi->single_pred_count));
   vpx_memset(cpi->comp_pred_count, 0, sizeof(cpi->comp_pred_count));
-  vpx_memset(cpi->txfm_count, 0, sizeof(cpi->txfm_count));
+  vpx_memset(cpi->txfm_count_32x32p, 0, sizeof(cpi->txfm_count_32x32p));
+  vpx_memset(cpi->txfm_count_16x16p, 0, sizeof(cpi->txfm_count_16x16p));
   vpx_memset(cpi->txfm_count_8x8p, 0, sizeof(cpi->txfm_count_8x8p));
   vpx_memset(cpi->rd_tx_select_diff, 0, sizeof(cpi->rd_tx_select_diff));
   {
@@ -1517,15 +1331,8 @@
 
     {
       // For each row of SBs in the frame
-      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-        int offset = (cm->mb_cols + 1) & ~0x1;
-
-        encode_sb_row(cpi, cm, mb_row, x, xd, &tp, &totalrate);
-
-        // adjust to the next row of SBs
-        x->src.y_buffer += 32 * x->src.y_stride - 16 * offset;
-        x->src.u_buffer += 16 * x->src.uv_stride - 8 * offset;
-        x->src.v_buffer += 16 * x->src.uv_stride - 8 * offset;
+      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {
+        encode_sb_row(cpi, mb_row, &tp, &totalrate);
       }
 
       cpi->tok_count = (unsigned int)(tp - cpi->tok);
@@ -1570,78 +1377,136 @@
   }
 }
 
+static void reset_skip_txfm_size_mb(VP9_COMP *cpi,
+                                    MODE_INFO *mi, TX_SIZE txfm_max) {
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  if (mbmi->txfm_size > txfm_max) {
+    VP9_COMMON *const cm = &cpi->common;
+    MACROBLOCK *const x = &cpi->mb;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const int segment_id = mbmi->segment_id;
+
+    xd->mode_info_context = mi;
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+           (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
+    mbmi->txfm_size = txfm_max;
+  }
+}
+
+static int get_skip_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs) {
+  int x, y;
+
+  for (y = 0; y < ymbs; y++) {
+    for (x = 0; x < xmbs; x++) {
+      if (!mi[y * mis + x].mbmi.mb_skip_coeff)
+        return 0;
+    }
+  }
+
+  return 1;
+}
+
+static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
+                          TX_SIZE txfm_size) {
+  int x, y;
+
+  for (y = 0; y < ymbs; y++) {
+    for (x = 0; x < xmbs; x++) {
+      mi[y * mis + x].mbmi.txfm_size = txfm_size;
+    }
+  }
+}
+
+static void reset_skip_txfm_size_sb32(VP9_COMP *cpi, MODE_INFO *mi,
+                                      int mis, TX_SIZE txfm_max,
+                                      int mb_rows_left, int mb_cols_left) {
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  if (mbmi->txfm_size > txfm_max) {
+    VP9_COMMON *const cm = &cpi->common;
+    MACROBLOCK *const x = &cpi->mb;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const int segment_id = mbmi->segment_id;
+    const int ymbs = MIN(2, mb_rows_left);
+    const int xmbs = MIN(2, mb_cols_left);
+
+    xd->mode_info_context = mi;
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
+    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+  }
+}
+
+static void reset_skip_txfm_size_sb64(VP9_COMP *cpi, MODE_INFO *mi,
+                                      int mis, TX_SIZE txfm_max,
+                                      int mb_rows_left, int mb_cols_left) {
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+  if (mbmi->txfm_size > txfm_max) {
+    VP9_COMMON *const cm = &cpi->common;
+    MACROBLOCK *const x = &cpi->mb;
+    MACROBLOCKD *const xd = &x->e_mbd;
+    const int segment_id = mbmi->segment_id;
+    const int ymbs = MIN(4, mb_rows_left);
+    const int xmbs = MIN(4, mb_cols_left);
+
+    xd->mode_info_context = mi;
+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
+           (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));
+    set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
+  }
+}
+
 static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
-  VP9_COMMON *cm = &cpi->common;
-  int mb_row, mb_col, mis = cm->mode_info_stride, segment_id;
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_row, mb_col;
+  const int mis = cm->mode_info_stride;
   MODE_INFO *mi, *mi_ptr = cm->mi;
-#if CONFIG_SUPERBLOCKS
-  int skip;
-  MODE_INFO *sb_mi_ptr = cm->mi, *sb_mi;
-  MB_MODE_INFO *sb_mbmi;
-#endif
-  MB_MODE_INFO *mbmi;
-  MACROBLOCK *x = &cpi->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++, mi_ptr += mis) {
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
     mi = mi_ptr;
-#if CONFIG_SUPERBLOCKS
-    sb_mi = sb_mi_ptr;
-#endif
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++, mi++) {
-      mbmi = &mi->mbmi;
-#if CONFIG_SUPERBLOCKS
-      sb_mbmi = &sb_mi->mbmi;
-#endif
-      if (mbmi->txfm_size > txfm_max) {
-#if CONFIG_SUPERBLOCKS
-        if (sb_mbmi->encoded_as_sb) {
-          if (!((mb_col & 1) || (mb_row & 1))) {
-            segment_id = mbmi->segment_id;
-            skip = mbmi->mb_skip_coeff;
-            if (mb_col < cm->mb_cols - 1) {
-              segment_id = segment_id && mi[1].mbmi.segment_id;
-              skip = skip && mi[1].mbmi.mb_skip_coeff;
-            }
-            if (mb_row < cm->mb_rows - 1) {
-              segment_id = segment_id &&
-                           mi[cm->mode_info_stride].mbmi.segment_id;
-              skip = skip && mi[cm->mode_info_stride].mbmi.mb_skip_coeff;
-              if (mb_col < cm->mb_cols - 1) {
-                segment_id = segment_id &&
-                             mi[cm->mode_info_stride + 1].mbmi.segment_id;
-                skip = skip && mi[cm->mode_info_stride + 1].mbmi.mb_skip_coeff;
-              }
-            }
-            xd->mode_info_context = mi;
-            assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                    vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-                   (cm->mb_no_coeff_skip && skip));
-            mbmi->txfm_size = txfm_max;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
+      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+        reset_skip_txfm_size_sb64(cpi, mi, mis, txfm_max,
+                                  cm->mb_rows - mb_row, cm->mb_cols - mb_col);
+      } else {
+        int i;
+
+        for (i = 0; i < 4; i++) {
+          const int x_idx_sb = (i & 1) << 1, y_idx_sb = i & 2;
+          MODE_INFO *sb_mi = mi + y_idx_sb * mis + x_idx_sb;
+
+          if (mb_row + y_idx_sb >= cm->mb_rows ||
+              mb_col + x_idx_sb >= cm->mb_cols)
+            continue;
+
+          if (sb_mi->mbmi.sb_type) {
+            reset_skip_txfm_size_sb32(cpi, sb_mi, mis, txfm_max,
+                                      cm->mb_rows - mb_row - y_idx_sb,
+                                      cm->mb_cols - mb_col - x_idx_sb);
           } else {
-            mbmi->txfm_size = sb_mbmi->txfm_size;
+            int m;
+
+            for (m = 0; m < 4; m++) {
+              const int x_idx = x_idx_sb + (m & 1), y_idx = y_idx_sb + (m >> 1);
+              MODE_INFO *mb_mi;
+
+              if (mb_col + x_idx >= cm->mb_cols ||
+                  mb_row + y_idx >= cm->mb_rows)
+                continue;
+
+              mb_mi = mi + y_idx * mis + x_idx;
+              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
+              reset_skip_txfm_size_mb(cpi, mb_mi, txfm_max);
+            }
           }
-        } else {
-#endif
-          segment_id = mbmi->segment_id;
-          xd->mode_info_context = mi;
-          assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-                  vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||
-                 (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));
-          mbmi->txfm_size = txfm_max;
-#if CONFIG_SUPERBLOCKS
         }
-#endif
       }
-#if CONFIG_SUPERBLOCKS
-      if (mb_col & 1)
-        sb_mi += 2;
-#endif
     }
-#if CONFIG_SUPERBLOCKS
-    if (mb_row & 1)
-      sb_mi_ptr += 2 * mis;
-#endif
   }
 }
 
@@ -1701,7 +1566,7 @@
      * keyframe's probabilities as an estimate of what the current keyframe's
      * coefficient cost distributions may look like. */
     if (frame_type == 0) {
-      txfm_type = ALLOW_16X16;
+      txfm_type = ALLOW_32X32;
     } else
 #if 0
     /* FIXME (rbultje)
@@ -1732,9 +1597,9 @@
     } else
       txfm_type = ALLOW_8X8;
 #else
-    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
-                 cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
-    ALLOW_16X16 : TX_MODE_SELECT;
+    txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >=
+                  cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+                    ALLOW_32X32 : TX_MODE_SELECT;
 #endif
     cpi->common.txfm_mode = txfm_type;
     if (txfm_type != TX_MODE_SELECT) {
@@ -1754,7 +1619,8 @@
       int64_t pd = cpi->rd_tx_select_diff[i];
       int diff;
       if (i == TX_MODE_SELECT)
-        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZE_MAX - 1), 0);
+        pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
+                     2048 * (TX_SIZE_MAX_SB - 1), 0);
       diff = (int)(pd / cpi->common.MBs);
       cpi->rd_tx_select_threshes[frame_type][i] += diff;
       cpi->rd_tx_select_threshes[frame_type][i] /= 2;
@@ -1777,21 +1643,35 @@
     }
 
     if (cpi->common.txfm_mode == TX_MODE_SELECT) {
-      const int count4x4 = cpi->txfm_count[TX_4X4] + cpi->txfm_count_8x8p[TX_4X4];
-      const int count8x8 = cpi->txfm_count[TX_8X8];
+      const int count4x4 = cpi->txfm_count_16x16p[TX_4X4] +
+                           cpi->txfm_count_32x32p[TX_4X4] +
+                           cpi->txfm_count_8x8p[TX_4X4];
+      const int count8x8_lp = cpi->txfm_count_32x32p[TX_8X8] +
+                              cpi->txfm_count_16x16p[TX_8X8];
       const int count8x8_8x8p = cpi->txfm_count_8x8p[TX_8X8];
-      const int count16x16 = cpi->txfm_count[TX_16X16];
+      const int count16x16_16x16p = cpi->txfm_count_16x16p[TX_16X16];
+      const int count16x16_lp = cpi->txfm_count_32x32p[TX_16X16];
+      const int count32x32 = cpi->txfm_count_32x32p[TX_32X32];
 
-      if (count4x4 == 0 && count16x16 == 0) {
+      if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+          count32x32 == 0) {
         cpi->common.txfm_mode = ALLOW_8X8;
         reset_skip_txfm_size(cpi, TX_8X8);
-      } else if (count8x8 == 0 && count16x16 == 0 && count8x8_8x8p == 0) {
+      } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
+                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
         cpi->common.txfm_mode = ONLY_4X4;
         reset_skip_txfm_size(cpi, TX_4X4);
-      } else if (count8x8 == 0 && count4x4 == 0) {
+      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+        cpi->common.txfm_mode = ALLOW_32X32;
+      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
         cpi->common.txfm_mode = ALLOW_16X16;
+        reset_skip_txfm_size(cpi, TX_16X16);
       }
     }
+
+    // Update interpolation filter strategy for next frame.
+    if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter))
+      select_interp_filter_type(cpi);
   } else {
     encode_frame_internal(cpi);
   }
@@ -1835,9 +1715,6 @@
 
   vp9_build_block_doffsets(&x->e_mbd);
 
-#if !CONFIG_SUPERBLOCKS
-  // y blocks
-  x->thismb_ptr = &x->thismb[0];
   for (br = 0; br < 4; br++) {
     for (bc = 0; bc < 4; bc++) {
       BLOCK *this_block = &x->block[block];
@@ -1844,19 +1721,6 @@
       // this_block->base_src = &x->src.y_buffer;
       // this_block->src_stride = x->src.y_stride;
       // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
-      this_block->base_src = &x->thismb_ptr;
-      this_block->src_stride = 16;
-      this_block->src = 4 * br * 16 + 4 * bc;
-      ++block;
-    }
-  }
-#else
-  for (br = 0; br < 4; br++) {
-    for (bc = 0; bc < 4; bc++) {
-      BLOCK *this_block = &x->block[block];
-      // this_block->base_src = &x->src.y_buffer;
-      // this_block->src_stride = x->src.y_stride;
-      // this_block->src = 4 * br * this_block->src_stride + 4 * bc;
       this_block->base_src = &x->src.y_buffer;
       this_block->src_stride = x->src.y_stride;
       this_block->src = 4 * br * this_block->src_stride + 4 * bc;
@@ -1863,7 +1727,6 @@
       ++block;
     }
   }
-#endif
 
   // u blocks
   for (br = 0; br < 2; br++) {
@@ -1917,12 +1780,11 @@
   }
 #endif
 
-#if CONFIG_SUPERBLOCKS
-  if (xd->mode_info_context->mbmi.encoded_as_sb) {
+  if (xd->mode_info_context->mbmi.sb_type) {
     ++cpi->sb_ymode_count[m];
-  } else
-#endif
+  } else {
     ++cpi->ymode_count[m];
+  }
   if (m != I8X8_PRED)
     ++cpi->y_uv_mode_count[m][uvm];
   else {
@@ -1964,16 +1826,14 @@
 #endif
 }
 
-#if CONFIG_SUPERBLOCKS
 static void update_sb_skip_coeff_state(VP9_COMP *cpi,
-                                       MACROBLOCK *x,
                                        ENTROPY_CONTEXT_PLANES ta[4],
                                        ENTROPY_CONTEXT_PLANES tl[4],
                                        TOKENEXTRA *t[4],
                                        TOKENEXTRA **tp,
-                                       int skip[4])
-{
-  TOKENEXTRA tokens[4][16 * 24];
+                                       int skip[4], int output_enabled) {
+  MACROBLOCK *const x = &cpi->mb;
+  TOKENEXTRA tokens[4][16 * 25];
   int n_tokens[4], n;
 
   // if there were no skips, we don't need to do anything
@@ -2013,7 +1873,7 @@
     if (skip[n]) {
       x->e_mbd.above_context = &ta[n];
       x->e_mbd.left_context  = &tl[n];
-      vp9_stuff_mb(cpi, &x->e_mbd, tp, 0);
+      vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
     } else {
       if (n_tokens[n]) {
         memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
@@ -2022,21 +1882,129 @@
     }
   }
 }
-#endif /* CONFIG_SUPERBLOCKS */
 
-static void encode_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int output_enabled,
-                              int mb_col, int mb_row) {
-  VP9_COMMON *cm = &cpi->common;
+static void update_sb64_skip_coeff_state(VP9_COMP *cpi,
+                                         ENTROPY_CONTEXT_PLANES ta[16],
+                                         ENTROPY_CONTEXT_PLANES tl[16],
+                                         TOKENEXTRA *t[16],
+                                         TOKENEXTRA **tp,
+                                         int skip[16], int output_enabled) {
+  MACROBLOCK *const x = &cpi->mb;
+
+  if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_32X32) {
+    TOKENEXTRA tokens[4][1024+512];
+    int n_tokens[4], n;
+
+    // if there were no skips, we don't need to do anything
+    if (!skip[0] && !skip[1] && !skip[2] && !skip[3])
+      return;
+
+    // if we don't do coeff skipping for this frame, we don't
+    // need to do anything here
+    if (!cpi->common.mb_no_coeff_skip)
+      return;
+
+    // if all 4 MBs skipped coeff coding, nothing to be done
+    if (skip[0] && skip[1] && skip[2] && skip[3])
+      return;
+
+    // so the situation now is that we want to skip coeffs
+    // for some MBs, but not all, and we didn't code EOB
+    // coefficients for them. However, the skip flag for this
+    // SB will be 0 overall, so we need to insert EOBs in the
+    // middle of the token tree. Do so here.
+    for (n = 0; n < 4; n++) {
+      if (n < 3) {
+        n_tokens[n] = t[n + 1] - t[n];
+      } else {
+        n_tokens[n] = *tp - t[3];
+      }
+      if (n_tokens[n]) {
+        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
+      }
+    }
+
+    // reset pointer, stuff EOBs where necessary
+    *tp = t[0];
+    for (n = 0; n < 4; n++) {
+      if (skip[n]) {
+        x->e_mbd.above_context = &ta[n * 2];
+        x->e_mbd.left_context  = &tl[n * 2];
+        vp9_stuff_sb(cpi, &x->e_mbd, tp, !output_enabled);
+      } else {
+        if (n_tokens[n]) {
+          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+        }
+        (*tp) += n_tokens[n];
+      }
+    }
+  } else {
+    TOKENEXTRA tokens[16][16 * 25];
+    int n_tokens[16], n;
+
+    // if there were no skips, we don't need to do anything
+    if (!skip[ 0] && !skip[ 1] && !skip[ 2] && !skip[ 3] &&
+        !skip[ 4] && !skip[ 5] && !skip[ 6] && !skip[ 7] &&
+        !skip[ 8] && !skip[ 9] && !skip[10] && !skip[11] &&
+        !skip[12] && !skip[13] && !skip[14] && !skip[15])
+      return;
+
+    // if we don't do coeff skipping for this frame, we don't
+    // need to do anything here
+    if (!cpi->common.mb_no_coeff_skip)
+      return;
+
+    // if all 4 MBs skipped coeff coding, nothing to be done
+    if (skip[ 0] && skip[ 1] && skip[ 2] && skip[ 3] &&
+        skip[ 4] && skip[ 5] && skip[ 6] && skip[ 7] &&
+        skip[ 8] && skip[ 9] && skip[10] && skip[11] &&
+        skip[12] && skip[13] && skip[14] && skip[15])
+      return;
+
+    // so the situation now is that we want to skip coeffs
+    // for some MBs, but not all, and we didn't code EOB
+    // coefficients for them. However, the skip flag for this
+    // SB will be 0 overall, so we need to insert EOBs in the
+    // middle of the token tree. Do so here.
+    for (n = 0; n < 16; n++) {
+      if (n < 15) {
+        n_tokens[n] = t[n + 1] - t[n];
+      } else {
+        n_tokens[n] = *tp - t[15];
+      }
+      if (n_tokens[n]) {
+        memcpy(tokens[n], t[n], n_tokens[n] * sizeof(*t[0]));
+      }
+    }
+
+    // reset pointer, stuff EOBs where necessary
+    *tp = t[0];
+    for (n = 0; n < 16; n++) {
+      if (skip[n]) {
+        x->e_mbd.above_context = &ta[n];
+        x->e_mbd.left_context  = &tl[n];
+        vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);
+      } else {
+        if (n_tokens[n]) {
+          memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);
+        }
+        (*tp) += n_tokens[n];
+      }
+    }
+  }
+}
+
+static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,
+                              int recon_yoffset, int recon_uvoffset,
+                              int output_enabled,
+                              int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   unsigned char ref_pred_flag;
 
-  x->skip = 0;
-#if CONFIG_SUPERBLOCKS
-  assert(!xd->mode_info_context->mbmi.encoded_as_sb);
-#endif
+  assert(!xd->mode_info_context->mbmi.sb_type);
 
 #ifdef ENC_DEBUG
   enc_debug = (cpi->common.current_video_frame == 46 &&
@@ -2246,7 +2214,7 @@
       mbmi->mb_skip_coeff = 1;
       if (output_enabled)
         cpi->skip_true_count[mb_skip_context]++;
-      vp9_fix_contexts(xd);
+      vp9_reset_mb_tokens_context(xd);
     } else {
       vp9_stuff_mb(cpi, xd, t, !output_enabled);
       mbmi->mb_skip_coeff = 0;
@@ -2261,9 +2229,10 @@
         !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||
           (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&
            vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {
+      assert(mbmi->txfm_size <= TX_16X16);
       if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&
           mbmi->mode != SPLITMV) {
-        cpi->txfm_count[mbmi->txfm_size]++;
+        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
       } else if (mbmi->mode == I8X8_PRED ||
                  (mbmi->mode == SPLITMV &&
                   mbmi->partitioning != PARTITIONING_4X4)) {
@@ -2283,11 +2252,11 @@
   }
 }
 
-#if CONFIG_SUPERBLOCKS
-static void encode_superblock(VP9_COMP *cpi, MACROBLOCK *x,
-                              TOKENEXTRA **t, int recon_yoffset,
-                              int recon_uvoffset, int mb_col, int mb_row) {
+static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col) {
   VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const uint8_t *src = x->src.y_buffer;
   uint8_t *dst = xd->dst.y_buffer;
@@ -2297,7 +2266,6 @@
   uint8_t *vdst = xd->dst.v_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  int seg_ref_active;
   unsigned char ref_pred_flag;
   int n;
   TOKENEXTRA *tp[4];
@@ -2305,9 +2273,8 @@
   MODE_INFO *mi = x->e_mbd.mode_info_context;
   unsigned int segment_id = mi->mbmi.segment_id;
   ENTROPY_CONTEXT_PLANES ta[4], tl[4];
+  const int mis = cm->mode_info_stride;
 
-  x->skip = 0;
-
   if (cm->frame_type == KEY_FRAME) {
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       adjust_act_zbin(cpi, x);
@@ -2340,10 +2307,7 @@
 
     vp9_update_zbin_extra(cpi, x);
 
-    seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
     // SET VARIOUS PREDICTION FLAGS
-
     // Did the chosen reference frame match its predicted value.
     ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
                       vp9_get_pred_ref(cm, xd)));
@@ -2354,7 +2318,8 @@
   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
     vp9_build_intra_predictors_sby_s(&x->e_mbd);
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
-    sum_intra_stats(cpi, x);
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
   } else {
     int ref_fb_idx;
 
@@ -2394,74 +2359,391 @@
                                        xd->dst.y_stride, xd->dst.uv_stride);
   }
 
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-
-    xd->left_context = cm->left_context + y_idx;
-    xd->above_context = cm->above_context + mb_col + x_idx;
-    memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
-    memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
-    tp[n] = *t;
-    xd->mode_info_context = mi + x_idx + y_idx * cm->mode_info_stride;
-
-    vp9_subtract_mby_s_c(x->src_diff,
-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
-                         src_y_stride,
-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
-                         dst_y_stride);
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_fidct_mb(x);
-    vp9_recon_mby_s_c(&x->e_mbd,
-                      dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
-    vp9_recon_mbuv_s_c(&x->e_mbd,
-                       udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                       vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
-
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
     if (!x->skip) {
-      vp9_tokenize_mb(cpi, &x->e_mbd, t, 0);
-      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
+                           dst, dst_y_stride);
+      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                            usrc, vsrc, src_uv_stride,
+                            udst, vdst, dst_uv_stride);
+      vp9_transform_sby_32x32(x);
+      vp9_transform_sbuv_16x16(x);
+      vp9_quantize_sby_32x32(x);
+      vp9_quantize_sbuv_16x16(x);
+      // TODO(rbultje): trellis optimize
+      vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);
+      vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);
+      vp9_recon_sby_s_c(&x->e_mbd, dst);
+      vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst);
+
+      vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);
     } else {
       int mb_skip_context =
-        cpi->common.mb_no_coeff_skip ?
-          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
-            (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :
+          cpi->common.mb_no_coeff_skip ?
+          (mi - 1)->mbmi.mb_skip_coeff +
+          (mi - mis)->mbmi.mb_skip_coeff :
           0;
-      xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
-      if (cpi->common.mb_no_coeff_skip) {
-        // TODO(rbultje) this should be done per-sb instead of per-mb?
-        cpi->skip_true_count[mb_skip_context]++;
-        vp9_fix_contexts(xd);
+      mi->mbmi.mb_skip_coeff = 1;
+      if (cm->mb_no_coeff_skip) {
+        if (output_enabled)
+          cpi->skip_true_count[mb_skip_context]++;
+        vp9_fix_contexts_sb(xd);
       } else {
-        vp9_stuff_mb(cpi, xd, t, 0);
-        // TODO(rbultje) this should be done per-sb instead of per-mb?
-        cpi->skip_false_count[mb_skip_context]++;
+        vp9_stuff_sb(cpi, xd, t, !output_enabled);
+        if (output_enabled)
+          cpi->skip_false_count[mb_skip_context]++;
       }
     }
-  }
 
-  xd->mode_info_context = mi;
-  update_sb_skip_coeff_state(cpi, x, ta, tl, tp, t, skip);
-  if (cm->txfm_mode == TX_MODE_SELECT &&
-      !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
-    cpi->txfm_count[mi->mbmi.txfm_size]++;
-  } else {
-    TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_16X16 : cm->txfm_mode;
-    mi->mbmi.txfm_size = sz;
+    // copy skip flag on all mb_mode_info contexts in this SB
+    // if this was a skip at this txfm size
     if (mb_col < cm->mb_cols - 1)
-      mi[1].mbmi.txfm_size = sz;
+      mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
     if (mb_row < cm->mb_rows - 1) {
-      mi[cm->mode_info_stride].mbmi.txfm_size = sz;
+      mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
       if (mb_col < cm->mb_cols - 1)
-        mi[cm->mode_info_stride + 1].mbmi.txfm_size = sz;
+        mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;
     }
+    skip[0] = skip[2] = skip[1] = skip[3] = mi->mbmi.mb_skip_coeff;
+  } else {
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
+
+      xd->left_context = cm->left_context + y_idx + (mb_row & 2);
+      xd->above_context = cm->above_context + mb_col + x_idx;
+      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx + y_idx * mis;
+
+      if (!x->skip) {
+        vp9_subtract_mby_s_c(x->src_diff,
+                             src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                             src_y_stride,
+                             dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                             dst_y_stride);
+        vp9_subtract_mbuv_s_c(x->src_diff,
+                              usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                              vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                              src_uv_stride,
+                              udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                              vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                              dst_uv_stride);
+        vp9_fidct_mb(x);
+        vp9_recon_mby_s_c(&x->e_mbd,
+                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+        vp9_recon_mbuv_s_c(&x->e_mbd,
+                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+        vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      } else {
+        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+            (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+            (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff :
+            0;
+        xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
+        if (cpi->common.mb_no_coeff_skip) {
+          // TODO(rbultje) this should be done per-sb instead of per-mb?
+          if (output_enabled)
+            cpi->skip_true_count[mb_skip_context]++;
+          vp9_reset_mb_tokens_context(xd);
+        } else {
+          vp9_stuff_mb(cpi, xd, t, !output_enabled);
+          // TODO(rbultje) this should be done per-sb instead of per-mb?
+          if (output_enabled)
+            cpi->skip_false_count[mb_skip_context]++;
+        }
+      }
+    }
+
+    xd->mode_info_context = mi;
+    update_sb_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled);
   }
+
+  if (output_enabled) {
+    if (cm->txfm_mode == TX_MODE_SELECT &&
+        !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
+    } else {
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
+                      TX_32X32 :
+                      cm->txfm_mode;
+      mi->mbmi.txfm_size = sz;
+      if (mb_col < cm->mb_cols - 1)
+        mi[1].mbmi.txfm_size = sz;
+      if (mb_row < cm->mb_rows - 1) {
+        mi[mis].mbmi.txfm_size = sz;
+        if (mb_col < cm->mb_cols - 1)
+          mi[mis + 1].mbmi.txfm_size = sz;
+      }
+    }
+  }
 }
-#endif
+
+static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,
+                                int recon_yoffset, int recon_uvoffset,
+                                int output_enabled, int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &cpi->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const uint8_t *src = x->src.y_buffer;
+  uint8_t *dst = xd->dst.y_buffer;
+  const uint8_t *usrc = x->src.u_buffer;
+  uint8_t *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer;
+  uint8_t *vdst = xd->dst.v_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  unsigned char ref_pred_flag;
+  int n;
+  TOKENEXTRA *tp[16];
+  int skip[16];
+  MODE_INFO *mi = x->e_mbd.mode_info_context;
+  unsigned int segment_id = mi->mbmi.segment_id;
+  ENTROPY_CONTEXT_PLANES ta[16], tl[16];
+  const int mis = cm->mode_info_stride;
+
+  if (cm->frame_type == KEY_FRAME) {
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+      adjust_act_zbin(cpi, x);
+      vp9_update_zbin_extra(cpi, x);
+    }
+  } else {
+    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);
+
+    if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
+      // Adjust the zbin based on this MB rate.
+      adjust_act_zbin(cpi, x);
+    }
+
+    // Experimental code. Special case for gf and arf zeromv modes.
+    // Increase zbin size to suppress noise
+    cpi->zbin_mode_boost = 0;
+    if (cpi->zbin_mode_boost_enabled) {
+      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
+        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
+          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+            cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+          else
+            cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
+        } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
+          cpi->zbin_mode_boost = 0;
+        } else {
+          cpi->zbin_mode_boost = MV_ZBIN_BOOST;
+        }
+      }
+    }
+
+    vp9_update_zbin_extra(cpi, x);
+
+    // Did the chosen reference frame match its predicted value.
+    ref_pred_flag = ((xd->mode_info_context->mbmi.ref_frame ==
+                      vp9_get_pred_ref(cm, xd)));
+    vp9_set_pred_flag(xd, PRED_REF, ref_pred_flag);
+  }
+
+  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
+    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
+    if (output_enabled)
+      sum_intra_stats(cpi, x);
+  } else {
+    int ref_fb_idx;
+
+    assert(cm->frame_type != KEY_FRAME);
+
+    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
+      ref_fb_idx = cpi->common.lst_fb_idx;
+    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
+      ref_fb_idx = cpi->common.gld_fb_idx;
+    else
+      ref_fb_idx = cpi->common.alt_fb_idx;
+
+    xd->pre.y_buffer =
+        cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
+    xd->pre.u_buffer =
+        cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
+    xd->pre.v_buffer =
+        cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+
+    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
+      int second_ref_fb_idx;
+
+      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
+        second_ref_fb_idx = cpi->common.lst_fb_idx;
+      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
+        second_ref_fb_idx = cpi->common.gld_fb_idx;
+      else
+        second_ref_fb_idx = cpi->common.alt_fb_idx;
+
+      xd->second_pre.y_buffer =
+          cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;
+      xd->second_pre.u_buffer =
+          cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;
+      xd->second_pre.v_buffer =
+          cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;
+    }
+
+    vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,
+                                       xd->dst.u_buffer, xd->dst.v_buffer,
+                                       xd->dst.y_stride, xd->dst.uv_stride);
+  }
+
+  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {
+    int n;
+
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
+
+      xd->mode_info_context = mi + x_idx * 2 + mis * y_idx * 2;
+      xd->left_context = cm->left_context + (y_idx << 1);
+      xd->above_context = cm->above_context + mb_col + (x_idx << 1);
+      memcpy(&ta[n * 2], xd->above_context, sizeof(*ta) * 2);
+      memcpy(&tl[n * 2], xd->left_context, sizeof(*tl) * 2);
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx * 2 + y_idx * mis * 2;
+      if (!x->skip) {
+        vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,
+                             src + x_idx * 32 + y_idx * 32 * src_y_stride,
+                             src_y_stride,
+                             dst + x_idx * 32 + y_idx * 32 * dst_y_stride,
+                             dst_y_stride);
+        vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                              usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                              vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                              src_uv_stride,
+                              udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                              vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                              dst_uv_stride);
+        vp9_transform_sby_32x32(x);
+        vp9_transform_sbuv_16x16(x);
+        vp9_quantize_sby_32x32(x);
+        vp9_quantize_sbuv_16x16(x);
+        // TODO(rbultje): trellis optimize
+        vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);
+        vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);
+        vp9_recon_sby_s_c(&x->e_mbd,
+                          dst + 32 * x_idx + 32 * y_idx * dst_y_stride);
+        vp9_recon_sbuv_s_c(&x->e_mbd,
+                           udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                           vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride);
+
+        vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);
+      } else {
+        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+                              (mi - 1)->mbmi.mb_skip_coeff +
+                                  (mi - mis)->mbmi.mb_skip_coeff : 0;
+        xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+        if (cm->mb_no_coeff_skip) {
+          if (output_enabled)
+            cpi->skip_true_count[mb_skip_context]++;
+          vp9_fix_contexts_sb(xd);
+        } else {
+          vp9_stuff_sb(cpi, xd, t, !output_enabled);
+          if (output_enabled)
+            cpi->skip_false_count[mb_skip_context]++;
+        }
+      }
+
+      // copy skip flag on all mb_mode_info contexts in this SB
+      // if this was a skip at this txfm size
+      if (mb_col + x_idx * 2 < cm->mb_cols - 1)
+        mi[mis * y_idx * 2 + x_idx * 2 + 1].mbmi.mb_skip_coeff =
+            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
+      if (mb_row + y_idx * 2 < cm->mb_rows - 1) {
+        mi[mis * y_idx * 2 + x_idx * 2 + mis].mbmi.mb_skip_coeff =
+            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
+        if (mb_col + x_idx * 2 < cm->mb_cols - 1)
+          mi[mis * y_idx * 2 + x_idx * 2 + mis + 1].mbmi.mb_skip_coeff =
+              mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;
+      }
+      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+    }
+  } else {
+    for (n = 0; n < 16; n++) {
+      const int x_idx = n & 3, y_idx = n >> 2;
+
+      xd->left_context = cm->left_context + y_idx;
+      xd->above_context = cm->above_context + mb_col + x_idx;
+      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));
+      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));
+      tp[n] = *t;
+      xd->mode_info_context = mi + x_idx + y_idx * mis;
+
+      if (!x->skip) {
+        vp9_subtract_mby_s_c(x->src_diff,
+                             src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                             src_y_stride,
+                             dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                             dst_y_stride);
+        vp9_subtract_mbuv_s_c(x->src_diff,
+                              usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                              vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                              src_uv_stride,
+                              udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                              vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                              dst_uv_stride);
+        vp9_fidct_mb(x);
+        vp9_recon_mby_s_c(&x->e_mbd,
+                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
+        vp9_recon_mbuv_s_c(&x->e_mbd,
+                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);
+
+        vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);
+        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;
+      } else {
+        int mb_skip_context = cpi->common.mb_no_coeff_skip ?
+          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +
+          (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff : 0;
+        xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;
+        if (cpi->common.mb_no_coeff_skip) {
+          // TODO(rbultje) this should be done per-sb instead of per-mb?
+          if (output_enabled)
+            cpi->skip_true_count[mb_skip_context]++;
+          vp9_reset_mb_tokens_context(xd);
+        } else {
+          vp9_stuff_mb(cpi, xd, t, !output_enabled);
+          // TODO(rbultje) this should be done per-sb instead of per-mb?
+          if (output_enabled)
+            cpi->skip_false_count[mb_skip_context]++;
+        }
+      }
+    }
+  }
+
+  xd->mode_info_context = mi;
+  update_sb64_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled);
+
+  if (output_enabled) {
+    if (cm->txfm_mode == TX_MODE_SELECT &&
+        !((cm->mb_no_coeff_skip &&
+           ((mi->mbmi.txfm_size == TX_32X32 &&
+             skip[0] && skip[1] && skip[2] && skip[3]) ||
+            (mi->mbmi.txfm_size != TX_32X32 &&
+             skip[0] && skip[1] && skip[2] && skip[3] &&
+             skip[4] && skip[5] && skip[6] && skip[7] &&
+             skip[8] && skip[9] && skip[10] && skip[11] &&
+             skip[12] && skip[13] && skip[14] && skip[15]))) ||
+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&
+           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {
+      cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
+    } else {
+      int x, y;
+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?
+                    TX_32X32 :
+                    cm->txfm_mode;
+      for (y = 0; y < 4; y++) {
+        for (x = 0; x < 4; x++) {
+          if (mb_col + x < cm->mb_cols && mb_row + y < cm->mb_rows) {
+            mi[mis * y + x].mbmi.txfm_size = sz;
+          }
+        }
+      }
+    }
+  }
+}
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -18,4 +18,4 @@
 
 extern void vp9_setup_block_ptrs(struct macroblock *x);
 
-#endif  // __INC_ENCODEFRAME_H
+#endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -25,9 +25,6 @@
 
   if (use_16x16_pred) {
     mbmi->mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
 
@@ -53,17 +50,7 @@
   b->bmi.as_mode.context = vp9_find_bpred_context(b);
 #endif
 
-#if CONFIG_COMP_INTRA_PRED
-  if (b->bmi.as_mode.second == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-    vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra4x4_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
-                              b->predictor);
-  }
-#endif
-
+  vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);
   vp9_subtract_b(be, b, 16);
 
   tx_type = get_tx_type_4x4(&x->e_mbd, b);
@@ -93,14 +80,7 @@
   BLOCK *b = &x->block[0];
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
-#if CONFIG_COMP_INTRA_PRED
-  if (xd->mode_info_context->mbmi.second_mode == (MB_PREDICTION_MODE)(DC_PRED - 1))
-#endif
-    vp9_build_intra_predictors_mby(xd);
-#if CONFIG_COMP_INTRA_PRED
-  else
-    vp9_build_comp_intra_predictors_mby(xd);
-#endif
+  vp9_build_intra_predictors_mby(xd);
 
   vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);
 
@@ -131,15 +111,7 @@
   MACROBLOCKD *xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
-#if CONFIG_COMP_INTRA_PRED
-  if (xd->mode_info_context->mbmi.second_uv_mode == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-    vp9_build_intra_predictors_mbuv(xd);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_build_comp_intra_predictors_mbuv(xd);
-  }
-#endif
+  vp9_build_intra_predictors_mbuv(xd);
 
   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                     xd->predictor, x->src.uv_stride);
@@ -169,16 +141,7 @@
   int i;
   TX_TYPE tx_type;
 
-#if CONFIG_COMP_INTRA_PRED
-  if (b->bmi.as_mode.second == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-    vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra8x8_predict(b, b->bmi.as_mode.first, b->bmi.as_mode.second,
-                              b->predictor);
-  }
-#endif
+  vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);
   // generate residual blocks
   vp9_subtract_4b_c(be, b, 16);
 
@@ -231,20 +194,12 @@
   }
 }
 
-void vp9_encode_intra_uv4x4(MACROBLOCK *x, int ib,
-                            int mode, int second) {
+static void encode_intra_uv4x4(MACROBLOCK *x, int ib,
+                               int mode) {
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
 
-#if CONFIG_COMP_INTRA_PRED
-  if (second == -1) {
-#endif
-    vp9_intra_uv4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-  } else {
-    vp9_comp_intra_uv4x4_predict(b, mode, second, b->predictor);
-  }
-#endif
+  vp9_intra_uv4x4_predict(b, mode, b->predictor);
 
   vp9_subtract_b(be, b, 8);
 
@@ -257,7 +212,7 @@
 }
 
 void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
-  int i, ib, mode, second;
+  int i, ib, mode;
   BLOCKD *b;
 
   for (i = 0; i < 4; i++) {
@@ -264,14 +219,10 @@
     ib = vp9_i8x8_block[i];
     b = &x->e_mbd.block[ib];
     mode = b->bmi.as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-    second = b->bmi.as_mode.second;
-#else
-    second = -1;
-#endif
+
     /*u */
-    vp9_encode_intra_uv4x4(x, i + 16, mode, second);
+    encode_intra_uv4x4(x, i + 16, mode);
     /*v */
-    vp9_encode_intra_uv4x4(x, i + 20, mode, second);
+    encode_intra_uv4x4(x, i + 20, mode);
   }
 }
--- a/vp9/encoder/vp9_encodeintra.h
+++ b/vp9/encoder/vp9_encodeintra.h
@@ -22,4 +22,4 @@
 void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
 void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
 
-#endif  // __ENCODEINTRA_H_
+#endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -21,9 +21,9 @@
 #include "vp9_rtcd.h"
 
 void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *src_ptr = (*(be->base_src) + be->src);
-  short *diff_ptr = be->src_diff;
-  unsigned char *pred_ptr = bd->predictor;
+  uint8_t *src_ptr = (*(be->base_src) + be->src);
+  int16_t *diff_ptr = be->src_diff;
+  uint8_t *pred_ptr = bd->predictor;
   int src_stride = be->src_stride;
 
   int r, c;
@@ -40,9 +40,9 @@
 }
 
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
-  unsigned char *src_ptr = (*(be->base_src) + be->src);
-  short *diff_ptr = be->src_diff;
-  unsigned char *pred_ptr = bd->predictor;
+  uint8_t *src_ptr = (*(be->base_src) + be->src);
+  int16_t *diff_ptr = be->src_diff;
+  uint8_t *pred_ptr = bd->predictor;
   int src_stride = be->src_stride;
   int r, c;
 
@@ -56,12 +56,12 @@
   }
 }
 
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
-                           const unsigned char *vsrc, int src_stride,
-                           const unsigned char *upred,
-                           const unsigned char *vpred, int dst_stride) {
-  short *udiff = diff + 256;
-  short *vdiff = diff + 320;
+void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
+                           const uint8_t *vsrc, int src_stride,
+                           const uint8_t *upred,
+                           const uint8_t *vpred, int dst_stride) {
+  int16_t *udiff = diff + 256;
+  int16_t *vdiff = diff + 320;
   int r, c;
 
   for (r = 0; r < 8; r++) {
@@ -85,16 +85,16 @@
   }
 }
 
-void vp9_subtract_mbuv_c(short *diff, unsigned char *usrc,
-                         unsigned char *vsrc, unsigned char *pred, int stride) {
-  unsigned char *upred = pred + 256;
-  unsigned char *vpred = pred + 320;
+void vp9_subtract_mbuv_c(int16_t *diff, uint8_t *usrc,
+                         uint8_t *vsrc, uint8_t *pred, int stride) {
+  uint8_t *upred = pred + 256;
+  uint8_t *vpred = pred + 320;
 
   vp9_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
 }
 
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
-                          const unsigned char *pred, int dst_stride) {
+void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
+                          const uint8_t *pred, int dst_stride) {
   int r, c;
 
   for (r = 0; r < 16; r++) {
@@ -108,8 +108,52 @@
   }
 }
 
-void vp9_subtract_mby_c(short *diff, unsigned char *src,
-                        unsigned char *pred, int stride) {
+void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
+                          const uint8_t *pred, int dst_stride) {
+  int r, c;
+
+  for (r = 0; r < 32; r++) {
+    for (c = 0; c < 32; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += 32;
+    pred += dst_stride;
+    src  += src_stride;
+  }
+}
+
+void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
+                           const uint8_t *vsrc, int src_stride,
+                           const uint8_t *upred,
+                           const uint8_t *vpred, int dst_stride) {
+  int16_t *udiff = diff + 1024;
+  int16_t *vdiff = diff + 1024 + 256;
+  int r, c;
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      udiff[c] = usrc[c] - upred[c];
+    }
+
+    udiff += 16;
+    upred += dst_stride;
+    usrc  += src_stride;
+  }
+
+  for (r = 0; r < 16; r++) {
+    for (c = 0; c < 16; c++) {
+      vdiff[c] = vsrc[c] - vpred[c];
+    }
+
+    vdiff += 16;
+    vpred += dst_stride;
+    vsrc  += src_stride;
+  }
+}
+
+void vp9_subtract_mby_c(int16_t *diff, uint8_t *src,
+                        uint8_t *pred, int stride) {
   vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
 }
 
@@ -123,7 +167,7 @@
 }
 
 static void build_dcblock_4x4(MACROBLOCK *x) {
-  short *src_diff_ptr = &x->src_diff[384];
+  int16_t *src_diff_ptr = &x->src_diff[384];
   int i;
 
   for (i = 0; i < 16; i++) {
@@ -265,6 +309,20 @@
   vp9_transform_mbuv_8x8(x);
 }
 
+void vp9_transform_sby_32x32(MACROBLOCK *x) {
+  SUPERBLOCK * const x_sb = &x->sb_coeff_data;
+  vp9_short_fdct32x32(x_sb->src_diff, x_sb->coeff, 64);
+}
+
+void vp9_transform_sbuv_16x16(MACROBLOCK *x) {
+  SUPERBLOCK * const x_sb = &x->sb_coeff_data;
+  vp9_clear_system_state();
+  x->vp9_short_fdct16x16(x_sb->src_diff + 1024,
+                         x_sb->coeff + 1024, 32);
+  x->vp9_short_fdct16x16(x_sb->src_diff + 1280,
+                         x_sb->coeff + 1280, 32);
+}
+
 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )
 typedef struct vp9_token_state vp9_token_state;
@@ -302,41 +360,31 @@
 static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                        int tx_size) {
-  BLOCK *b;
-  BLOCKD *d;
-  vp9_token_state tokens[65][2];
-  uint64_t best_mask[2];
-  const short *dequant_ptr;
-  const short *coeff_ptr;
-  short *qcoeff_ptr;
-  short *dqcoeff_ptr;
-  int eob;
-  int i0;
-  int rc;
-  int x;
-  int sz = 0;
-  int next;
-  int rdmult;
-  int rddiv;
-  int final_eob;
-  int64_t rd_cost0, rd_cost1;
-  int rate0, rate1;
-  int error0, error1;
-  int t0, t1;
-  int best;
-  int band;
-  int pt;
+  BLOCK *b = &mb->block[i];
+  BLOCKD *d = &mb->e_mbd.block[i];
+  vp9_token_state tokens[257][2];
+  unsigned best_index[257][2];
+  const int16_t *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int eob = d->eob, final_eob, sz = 0;
+  int i0 = (type == PLANE_TYPE_Y_NO_DC);
+  int rc, x, next;
+  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1, t0, t1;
+  int best, band, pt;
   int err_mult = plane_rd_mult[type];
   int default_eob;
   int const *scan, *bands;
+#if CONFIG_NEWCOEFCONTEXT
+  const int *neighbors;
+#endif
 
-  b = &mb->block[i];
-  d = &mb->e_mbd.block[i];
   switch (tx_size) {
     default:
     case TX_4X4:
-      scan = vp9_default_zig_zag1d;
-      bands = vp9_coef_bands;
+      scan = vp9_default_zig_zag1d_4x4;
+      bands = vp9_coef_bands_4x4;
       default_eob = 16;
       // TODO: this isn't called (for intra4x4 modes), but will be left in
       // since it could be used later
@@ -345,19 +393,19 @@
         if (tx_type != DCT_DCT) {
           switch (tx_type) {
             case ADST_DCT:
-              scan = vp9_row_scan;
+              scan = vp9_row_scan_4x4;
               break;
 
             case DCT_ADST:
-              scan = vp9_col_scan;
+              scan = vp9_col_scan_4x4;
               break;
 
             default:
-              scan = vp9_default_zig_zag1d;
+              scan = vp9_default_zig_zag1d_4x4;
               break;
           }
         } else {
-          scan = vp9_default_zig_zag1d;
+          scan = vp9_default_zig_zag1d_4x4;
         }
       }
       break;
@@ -366,21 +414,22 @@
       bands = vp9_coef_bands_8x8;
       default_eob = 64;
       break;
+    case TX_16X16:
+      scan = vp9_default_zig_zag1d_16x16;
+      bands = vp9_coef_bands_16x16;
+      default_eob = 256;
+      break;
   }
+#if CONFIG_NEWCOEFCONTEXT
+  neighbors = vp9_get_coef_neighbors_handle(scan);
+#endif
 
-  dequant_ptr = d->dequant;
-  coeff_ptr = b->coeff;
-  qcoeff_ptr = d->qcoeff;
-  dqcoeff_ptr = d->dqcoeff;
-  i0 = (type == PLANE_TYPE_Y_NO_DC);
-  eob = d->eob;
-
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   rdmult = mb->rdmult * err_mult;
   if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
     rdmult = (rdmult * 9) >> 4;
   rddiv = mb->rddiv;
-  best_mask[0] = best_mask[1] = 0;
+  memset(best_index, 0, sizeof(best_index));
   /* Initialize the sentinel node of the trellis. */
   tokens[eob][0].rate = 0;
   tokens[eob][0].error = 0;
@@ -390,9 +439,7 @@
   *(tokens[eob] + 1) = *(tokens[eob] + 0);
   next = eob;
   for (i = eob; i-- > i0;) {
-    int base_bits;
-    int d2;
-    int dx;
+    int base_bits, d2, dx;
 
     rc = scan[i];
     x = qcoeff_ptr[rc];
@@ -409,6 +456,11 @@
       if (next < default_eob) {
         band = bands[i + 1];
         pt = vp9_prev_token_class[t0];
+#if CONFIG_NEWCOEFCONTEXT
+        if (NEWCOEFCONTEXT_BAND_COND(band))
+          pt = vp9_get_coef_neighbor_context(
+              qcoeff_ptr, i0, neighbors, scan[i + 1]);
+#endif
         rate0 +=
           mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];
         rate1 +=
@@ -425,7 +477,7 @@
       tokens[i][0].next = next;
       tokens[i][0].token = t0;
       tokens[i][0].qc = x;
-      best_mask[0] |= best << i;
+      best_index[i][0] = best;
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
@@ -456,12 +508,34 @@
       if (next < default_eob) {
         band = bands[i + 1];
         if (t0 != DCT_EOB_TOKEN) {
+#if CONFIG_NEWCOEFCONTEXT
+          int tmp = qcoeff_ptr[scan[i]];
+          qcoeff_ptr[scan[i]] = x;
+          if (NEWCOEFCONTEXT_BAND_COND(band))
+            pt = vp9_get_coef_neighbor_context(
+                qcoeff_ptr, i0, neighbors, scan[i + 1]);
+          else
+            pt = vp9_prev_token_class[t0];
+          qcoeff_ptr[scan[i]] = tmp;
+#else
           pt = vp9_prev_token_class[t0];
+#endif
           rate0 += mb->token_costs[tx_size][type][band][pt][
               tokens[next][0].token];
         }
         if (t1 != DCT_EOB_TOKEN) {
+#if CONFIG_NEWCOEFCONTEXT
+          int tmp = qcoeff_ptr[scan[i]];
+          qcoeff_ptr[scan[i]] = x;
+          if (NEWCOEFCONTEXT_BAND_COND(band))
+            pt = vp9_get_coef_neighbor_context(
+                qcoeff_ptr, i0, neighbors, scan[i + 1]);
+          else
+            pt = vp9_prev_token_class[t1];
+          qcoeff_ptr[scan[i]] = tmp;
+#else
           pt = vp9_prev_token_class[t1];
+#endif
           rate1 += mb->token_costs[tx_size][type][band][pt][
               tokens[next][1].token];
         }
@@ -481,7 +555,7 @@
       tokens[i][1].next = next;
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
-      best_mask[1] |= best << i;
+      best_index[i][1] = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
     }
@@ -528,7 +602,7 @@
     dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);
 
     next = tokens[i][best].next;
-    best = (best_mask[best] >> i) & 1;
+    best = best_index[i][best];
   }
   final_eob++;
 
@@ -556,7 +630,7 @@
     return;
 
   for (i = 0; i < bd->eob; i++) {
-    int coef = bd->dqcoeff[vp9_default_zig_zag1d[i]];
+    int coef = bd->dqcoeff[vp9_default_zig_zag1d_4x4[i]];
     sum += (coef >= 0) ? coef : -coef;
     if (sum >= SUM_2ND_COEFF_THRESH)
       return;
@@ -564,7 +638,7 @@
 
   if (sum < SUM_2ND_COEFF_THRESH) {
     for (i = 0; i < bd->eob; i++) {
-      int rc = vp9_default_zig_zag1d[i];
+      int rc = vp9_default_zig_zag1d_4x4[i];
       bd->qcoeff[rc] = 0;
       bd->dqcoeff[rc] = 0;
     }
@@ -626,15 +700,18 @@
 
   for (b = 0; b < 16; b++) {
     optimize_b(x, b, type,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], TX_4X4);
+               ta + vp9_block2above[TX_4X4][b],
+               tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
 
   if (has_2nd_order) {
     b = 24;
     optimize_b(x, b, PLANE_TYPE_Y2,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], TX_4X4);
+               ta + vp9_block2above[TX_4X4][b],
+               tl + vp9_block2left[TX_4X4][b], TX_4X4);
     check_reset_2nd_coeffs(&x->e_mbd,
-                           ta + vp9_block2above[b], tl + vp9_block2left[b]);
+                           ta + vp9_block2above[TX_4X4][b],
+                           tl + vp9_block2left[TX_4X4][b]);
   }
 }
 
@@ -655,7 +732,8 @@
 
   for (b = 16; b < 24; b++) {
     optimize_b(x, b, PLANE_TYPE_UV,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], TX_4X4);
+               ta + vp9_block2above[TX_4X4][b],
+               tl + vp9_block2left[TX_4X4][b], TX_4X4);
   }
 }
 
@@ -682,42 +760,47 @@
   tl = (ENTROPY_CONTEXT *)&t_left;
   type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
   for (b = 0; b < 16; b += 4) {
-    optimize_b(x, b, type,
-               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               TX_8X8);
-    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
-    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
+    ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
+    ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
+#if CONFIG_CNVCONTEXT
+    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
+    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
+#else
+    ENTROPY_CONTEXT above_ec = a[0];
+    ENTROPY_CONTEXT left_ec = l[0];
+#endif
+    optimize_b(x, b, type, &above_ec, &left_ec, TX_8X8);
+    a[1] = a[0] = above_ec;
+    l[1] = l[0] = left_ec;
   }
 
-  // 8x8 always have 2nd roder haar block
+  // 8x8 always has a 2nd order block
   if (has_2nd_order) {
     check_reset_8x8_2nd_coeffs(&x->e_mbd,
-                               ta + vp9_block2above_8x8[24],
-                               tl + vp9_block2left_8x8[24]);
+                               ta + vp9_block2above[TX_8X8][24],
+                               tl + vp9_block2left[TX_8X8][24]);
   }
 }
 
 void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
   int b;
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta;
-  ENTROPY_CONTEXT *tl;
+  ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;
+  ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;
 
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+  if (!ta || !tl)
     return;
 
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-
   for (b = 16; b < 24; b += 4) {
-    optimize_b(x, b, PLANE_TYPE_UV,
-               ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               TX_8X8);
-    ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
-    tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
+    ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];
+    ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];
+#if CONFIG_CNVCONTEXT
+    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;
+    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;
+#else
+    ENTROPY_CONTEXT above_ec = a[0];
+    ENTROPY_CONTEXT left_ec = l[0];
+#endif
+    optimize_b(x, b, PLANE_TYPE_UV, &above_ec, &left_ec, TX_8X8);
   }
 }
 
@@ -726,192 +809,22 @@
   vp9_optimize_mbuv_8x8(x);
 }
 
-static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
-                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  BLOCK *b = &mb->block[i];
-  BLOCKD *d = &mb->e_mbd.block[i];
-  vp9_token_state tokens[257][2];
-  unsigned best_index[257][2];
-  const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
-  short *qcoeff_ptr = qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = dqcoeff_ptr = d->dqcoeff;
-  int eob = d->eob, final_eob, sz = 0;
-  int rc, x, next;
-  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1, t0, t1;
-  int best, band, pt;
-  int err_mult = plane_rd_mult[type];
-
-  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  rdmult = mb->rdmult * err_mult;
-  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
-      rdmult = (rdmult * 9)>>4;
-  rddiv = mb->rddiv;
-  memset(best_index, 0, sizeof(best_index));
-  /* Initialize the sentinel node of the trellis. */
-  tokens[eob][0].rate = 0;
-  tokens[eob][0].error = 0;
-  tokens[eob][0].next = 256;
-  tokens[eob][0].token = DCT_EOB_TOKEN;
-  tokens[eob][0].qc = 0;
-  *(tokens[eob] + 1) = *(tokens[eob] + 0);
-  next = eob;
-  for (i = eob; i-- > 0;) {
-    int base_bits, d2, dx;
-
-    rc = vp9_default_zig_zag1d_16x16[i];
-    x = qcoeff_ptr[rc];
-    /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      int shortcut = 0;
-      error0 = tokens[next][0].error;
-      error1 = tokens[next][1].error;
-      /* Evaluate the first possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-      t0 = (vp9_dct_value_tokens_ptr + x)->Token;
-      /* Consider both possible successor states. */
-      if (next < 256) {
-        band = vp9_coef_bands_16x16[i + 1];
-        pt = vp9_prev_token_class[t0];
-        rate0 += mb->token_costs[TX_16X16][type][band][pt][tokens[next][0].token];
-        rate1 += mb->token_costs[TX_16X16][type][band][pt][tokens[next][1].token];
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
-      d2 = dx*dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][0].error = d2 + (best ? error1 : error0);
-      tokens[i][0].next = next;
-      tokens[i][0].token = t0;
-      tokens[i][0].qc = x;
-      best_index[i][0] = best;
-      /* Evaluate the second possibility for this state. */
-      rate0 = tokens[next][0].rate;
-      rate1 = tokens[next][1].rate;
-
-      if((abs(x)*dequant_ptr[rc!=0]>abs(coeff_ptr[rc])) &&
-         (abs(x)*dequant_ptr[rc!=0]<abs(coeff_ptr[rc])+dequant_ptr[rc!=0]))
-        shortcut = 1;
-      else
-        shortcut = 0;
-
-      if (shortcut) {
-        sz = -(x < 0);
-        x -= 2*sz + 1;
-      }
-
-      /* Consider both possible successor states. */
-      if (!x) {
-        /* If we reduced this coefficient to zero, check to see if
-         *  we need to move the EOB back here.
-         */
-        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
-             DCT_EOB_TOKEN : ZERO_TOKEN;
-      }
-      else
-        t0=t1 = (vp9_dct_value_tokens_ptr + x)->Token;
-      if (next < 256) {
-        band = vp9_coef_bands_16x16[i + 1];
-        if (t0 != DCT_EOB_TOKEN) {
-            pt = vp9_prev_token_class[t0];
-            rate0 += mb->token_costs[TX_16X16][type][band][pt]
-                [tokens[next][0].token];
-        }
-        if (t1!=DCT_EOB_TOKEN) {
-            pt = vp9_prev_token_class[t1];
-            rate1 += mb->token_costs[TX_16X16][type][band][pt]
-                [tokens[next][1].token];
-        }
-      }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = *(vp9_dct_value_cost_ptr + x);
-
-      if(shortcut) {
-        dx -= (dequant_ptr[rc!=0] + sz) ^ sz;
-        d2 = dx*dx;
-      }
-      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
-      tokens[i][1].error = d2 + (best ? error1 : error0);
-      tokens[i][1].next = next;
-      tokens[i][1].token = best ? t1 : t0;
-      tokens[i][1].qc = x;
-      best_index[i][1] = best;
-      /* Finally, make this the new head of the trellis. */
-      next = i;
-    }
-    /* There's no choice to make for a zero coefficient, so we don't
-     *  add a new trellis node, but we do need to update the costs.
-     */
-    else {
-      band = vp9_coef_bands_16x16[i + 1];
-      t0 = tokens[next][0].token;
-      t1 = tokens[next][1].token;
-      /* Update the cost of each path if we're past the EOB token. */
-      if (t0 != DCT_EOB_TOKEN) {
-        tokens[next][0].rate += mb->token_costs[TX_16X16][type][band][0][t0];
-        tokens[next][0].token = ZERO_TOKEN;
-      }
-      if (t1 != DCT_EOB_TOKEN) {
-        tokens[next][1].rate += mb->token_costs[TX_16X16][type][band][0][t1];
-        tokens[next][1].token = ZERO_TOKEN;
-      }
-      /* Don't update next, because we didn't add a new node. */
-    }
-  }
-
-  /* Now pick the best path through the whole trellis. */
-  band = vp9_coef_bands_16x16[i + 1];
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  rate0 = tokens[next][0].rate;
-  rate1 = tokens[next][1].rate;
-  error0 = tokens[next][0].error;
-  error1 = tokens[next][1].error;
-  t0 = tokens[next][0].token;
-  t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[TX_16X16][type][band][pt][t0];
-  rate1 += mb->token_costs[TX_16X16][type][band][pt][t1];
-  UPDATE_RD_COST();
-  best = rd_cost1 < rd_cost0;
-  final_eob = -1;
-
-  for (i = next; i < eob; i = next) {
-    x = tokens[i][best].qc;
-    if (x)
-      final_eob = i;
-    rc = vp9_default_zig_zag1d_16x16[i];
-    qcoeff_ptr[rc] = x;
-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc!=0]);
-
-    next = tokens[i][best].next;
-    best = best_index[i][best];
-  }
-  final_eob++;
-
-  d->eob = final_eob;
-  *a = *l = (d->eob > !type);
-}
-
 void vp9_optimize_mby_16x16(MACROBLOCK *x) {
-  ENTROPY_CONTEXT_PLANES t_above, t_left;
-  ENTROPY_CONTEXT *ta, *tl;
+  ENTROPY_CONTEXT_PLANES *const t_above = x->e_mbd.above_context;
+  ENTROPY_CONTEXT_PLANES *const t_left = x->e_mbd.left_context;
+  ENTROPY_CONTEXT ta, tl;
 
-  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+  if (!t_above || !t_left)
     return;
 
-  vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
-  vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
-
-  ta = (ENTROPY_CONTEXT *)&t_above;
-  tl = (ENTROPY_CONTEXT *)&t_left;
-  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl);
+#if CONFIG_CNVCONTEXT
+  ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;
+  tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;
+#else
+  ta = t_above->y1[0];
+  tl = t_left->y1[0];
+#endif
+  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, &ta, &tl, TX_16X16);
 }
 
 static void optimize_mb_16x16(MACROBLOCK *x) {
@@ -971,11 +884,6 @@
 void vp9_encode_inter16x16y(MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
-
-#if CONFIG_PRED_FILTER
-  // Disable the prediction filter for firstpass
-  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
 
   vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
 
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_ENCODEMB_H_
 #define VP9_ENCODER_VP9_ENCODEMB_H_
 
@@ -19,9 +18,6 @@
   MB_PREDICTION_MODE mode;
   MV_REFERENCE_FRAME ref_frame;
   MV_REFERENCE_FRAME second_ref_frame;
-#if CONFIG_PRED_FILTER
-  int pred_filter_flag;
-#endif
 } MODE_DEFINITION;
 
 
@@ -47,18 +43,25 @@
 void vp9_transform_mby_16x16(MACROBLOCK *x);
 void vp9_optimize_mby_16x16(MACROBLOCK *x);
 
+void vp9_transform_sby_32x32(MACROBLOCK *x);
+void vp9_transform_sbuv_16x16(MACROBLOCK *x);
+
 void vp9_fidct_mb(MACROBLOCK *x);
 
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
-#if CONFIG_SUPERBLOCKS
-void vp9_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
-                           const unsigned char *vsrc, int src_stride,
-                           const unsigned char *upred,
-                           const unsigned char *vpred, int dst_stride);
-void vp9_subtract_mby_s_c(short *diff, const unsigned char *src,
-                          int src_stride, const unsigned char *pred,
+void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,
+                           const uint8_t *vsrc, int src_stride,
+                           const uint8_t *upred,
+                           const uint8_t *vpred, int dst_stride);
+void vp9_subtract_mby_s_c(int16_t *diff, const uint8_t *src,
+                          int src_stride, const uint8_t *pred,
                           int dst_stride);
-#endif
+void vp9_subtract_sby_s_c(int16_t *diff, const uint8_t *src, int src_stride,
+                          const uint8_t *pred, int dst_stride);
+void vp9_subtract_sbuv_s_c(int16_t *diff, const uint8_t *usrc,
+                           const uint8_t *vsrc, int src_stride,
+                           const uint8_t *upred,
+                           const uint8_t *vpred, int dst_stride);
 
-#endif
+#endif  // VP9_ENCODER_VP9_ENCODEMB_H_
--- a/vp9/encoder/vp9_encodemv.h
+++ b/vp9/encoder/vp9_encodemv.h
@@ -30,4 +30,5 @@
                           int_mv *best_ref_mv, int_mv *second_best_ref_mv);
 
 void print_nmvcounts(nmv_context_counts tnmvcounts);
-#endif
+
+#endif  // VP9_ENCODER_VP9_ENCODEMV_H_
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -17,7 +17,7 @@
 #include "vp9/common/vp9_setupintrarecon.h"
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/common/vp9_extend.h"
@@ -296,7 +296,7 @@
 static double simple_weight(YV12_BUFFER_CONFIG *source) {
   int i, j;
 
-  unsigned char *src = source->y_buffer;
+  uint8_t *src = source->y_buffer;
   double sum_weights = 0.0;
 
   // Loop throught the Y plane raw examining levels and creating a weight for the image
@@ -345,15 +345,15 @@
   BLOCK *b = &x->block[0];
   BLOCKD *d = &x->e_mbd.block[0];
 
-  unsigned char *src_ptr = (*(b->base_src) + b->src);
+  uint8_t *src_ptr = (*(b->base_src) + b->src);
   int src_stride = b->src_stride;
-  unsigned char *ref_ptr;
+  uint8_t *ref_ptr;
   int ref_stride = d->pre_stride;
 
   // Set up pointers for this macro block recon buffer
   xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
 
-  ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
+  ref_ptr = (uint8_t *)(*(d->base_pre) + d->pre);
 
   vp9_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
                (unsigned int *)(best_motion_err));
@@ -516,11 +516,6 @@
       xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
       xd->left_available = (mb_col != 0);
 
-#if !CONFIG_SUPERBLOCKS
-      // Copy current mb to a buffer
-      vp9_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
-#endif
-
       // do intra 16x16 prediction
       this_error = vp9_encode_intra(cpi, x, use_dc_pred);
 
@@ -799,7 +794,7 @@
   return -(log(prob) / log(2.0));
 }
 
-static long long estimate_modemvcost(VP9_COMP *cpi,
+static int64_t estimate_modemvcost(VP9_COMP *cpi,
                                      FIRSTPASS_STATS *fpstats) {
 #if 0
   int mv_cost;
@@ -1235,7 +1230,7 @@
   int still_interval,
   double loop_decay_rate,
   double last_decay_rate) {
-  BOOL trans_to_still = FALSE;
+  int trans_to_still = FALSE;
 
   // Break clause to detect very still sections after motion
   // For example a static image after a fade or other transition
@@ -1273,10 +1268,10 @@
 // This function detects a flash through the high relative pcnt_second_ref
 // score in the frame following a flash frame. The offset passed in should
 // reflect this
-static BOOL detect_flash(VP9_COMP *cpi, int offset) {
+static int detect_flash(VP9_COMP *cpi, int offset) {
   FIRSTPASS_STATS next_frame;
 
-  BOOL flash_detected = FALSE;
+  int flash_detected = FALSE;
 
   // Read the frame data.
   // The return is FALSE (no flash detected) if not a valid frame
@@ -1388,7 +1383,7 @@
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
   int arf_boost;
-  BOOL flash_detected = FALSE;
+  int flash_detected = FALSE;
 
   // Search forward from the proposed arf/next gf position
   for (i = 0; i < f_frames; i++) {
@@ -1543,7 +1538,7 @@
 
   int f_boost = 0;
   int b_boost = 0;
-  BOOL flash_detected;
+  int flash_detected;
 
   cpi->twopass.gf_group_bits = 0;
 
@@ -2096,8 +2091,11 @@
 }
 
 
-static BOOL test_candidate_kf(VP9_COMP *cpi,  FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame) {
-  BOOL is_viable_kf = FALSE;
+static int test_candidate_kf(VP9_COMP *cpi,
+                             FIRSTPASS_STATS *last_frame,
+                             FIRSTPASS_STATS *this_frame,
+                             FIRSTPASS_STATS *next_frame) {
+  int is_viable_kf = FALSE;
 
   // Does the frame satisfy the primary criteria of a key frame
   //      If so, then examine how well it predicts subsequent frames
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -8,8 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-#if !defined __INC_FIRSTPASS_H
+#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
 #define VP9_ENCODER_VP9_FIRSTPASS_H_
 
 extern void vp9_init_first_pass(VP9_COMP *cpi);
@@ -20,4 +19,4 @@
 extern void vp9_second_pass(VP9_COMP *cpi);
 extern void vp9_end_second_pass(VP9_COMP *cpi);
 
-#endif
+#endif  // VP9_ENCODER_VP9_FIRSTPASS_H_
--- a/vp9/encoder/vp9_lookahead.h
+++ b/vp9/encoder/vp9_lookahead.h
@@ -7,8 +7,10 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #ifndef VP9_ENCODER_VP9_LOOKAHEAD_H_
 #define VP9_ENCODER_VP9_LOOKAHEAD_H_
+
 #include "vpx_scale/yv12config.h"
 #include "vpx/vpx_integer.h"
 
@@ -101,5 +103,4 @@
 unsigned int
 vp9_lookahead_depth(struct lookahead_ctx *ctx);
 
-
-#endif
+#endif  // VP9_ENCODER_VP9_LOOKAHEAD_H_
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -71,11 +71,6 @@
         & distortion, &sse);
   }
 
-#if CONFIG_PRED_FILTER
-  // Disable the prediction filter
-  xd->mode_info_context->mbmi.pred_filter_enabled = 0;
-#endif
-
   vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);
   vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
   best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,
--- a/vp9/encoder/vp9_mbgraph.h
+++ b/vp9/encoder/vp9_mbgraph.h
@@ -13,4 +13,4 @@
 
 extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);
 
-#endif /* __INC_MBGRAPH_H__ */
+#endif  // VP9_ENCODER_VP9_MBGRAPH_H_
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -17,6 +17,7 @@
 #include <limits.h>
 #include <math.h>
 #include "vp9/common/vp9_findnearmv.h"
+#include "vp9/common/vp9_common.h"
 
 #ifdef ENTROPY_STATS
 static int mv_ref_ct [31] [4] [2];
@@ -241,9 +242,6 @@
     },                                                                   \
     v = INT_MAX;)
 
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-
 int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int_mv *bestmv, int_mv *ref_mv,
                                              int error_per_bit,
@@ -251,7 +249,7 @@
                                              int *mvjcost, int *mvcost[2],
                                              int *distortion,
                                              unsigned int *sse1) {
-  unsigned char *z = (*(b->base_src) + b->src);
+  uint8_t *z = (*(b->base_src) + b->src);
   MACROBLOCKD *xd = &x->e_mbd;
 
   int rr, rc, br, bc, hstep;
@@ -269,29 +267,9 @@
   int offset;
   int usehp = xd->allow_high_precision_mv;
 
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-  int buf_r1, buf_r2, buf_c1, buf_c2;
-
-  // Clamping to avoid out-of-range data access
-  buf_r1 = ((bestmv->as_mv.row - VP9_INTERP_EXTEND) < x->mv_row_min) ?
-      (bestmv->as_mv.row - x->mv_row_min) : VP9_INTERP_EXTEND - 1;
-  buf_r2 = ((bestmv->as_mv.row + VP9_INTERP_EXTEND) > x->mv_row_max) ?
-      (x->mv_row_max - bestmv->as_mv.row) : VP9_INTERP_EXTEND - 1;
-  buf_c1 = ((bestmv->as_mv.col - VP9_INTERP_EXTEND) < x->mv_col_min) ?
-      (bestmv->as_mv.col - x->mv_col_min) : VP9_INTERP_EXTEND - 1;
-  buf_c2 = ((bestmv->as_mv.col + VP9_INTERP_EXTEND) > x->mv_col_max) ?
-      (x->mv_col_max - bestmv->as_mv.col) : VP9_INTERP_EXTEND - 1;
-  y_stride = 32;
-
-  /* Copy to intermediate buffer before searching. */
-  vfp->copymem(y0 - buf_c1 - d->pre_stride * buf_r1, d->pre_stride, xd->y_buf, y_stride, 16 + buf_r1 + buf_r2);
-  y = xd->y_buf + y_stride * buf_r1 + buf_c1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  uint8_t *y = *(d->base_pre) + d->pre +
+               (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
   y_stride = d->pre_stride;
-#endif
 
   rr = ref_mv->as_mv.row;
   rc = ref_mv->as_mv.col;
@@ -454,7 +432,7 @@
   int_mv this_mv;
   int_mv orig_mv;
   int yrow_movedback = 0, ycol_movedback = 0;
-  unsigned char *z = (*(b->base_src) + b->src);
+  uint8_t *z = (*(b->base_src) + b->src);
   int left, right, up, down, diag;
   unsigned int sse;
   int whichdir;
@@ -463,18 +441,9 @@
   MACROBLOCKD *xd = &x->e_mbd;
   int usehp = xd->allow_high_precision_mv;
 
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-
-  y_stride = 32;
-  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
-  y = xd->y_buf + y_stride + 1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
+  uint8_t *y = *(d->base_pre) + d->pre +
+               (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
   y_stride = d->pre_stride;
-#endif
 
   // central mv
   bestmv->as_mv.row <<= 3;
@@ -933,7 +902,7 @@
   int bestmse = INT_MAX;
   int_mv startmv;
   int_mv this_mv;
-  unsigned char *z = (*(b->base_src) + b->src);
+  uint8_t *z = (*(b->base_src) + b->src);
   int left, right, up, down, diag;
   unsigned int sse;
   int whichdir;
@@ -941,20 +910,9 @@
   int y_stride;
   MACROBLOCKD *xd = &x->e_mbd;
 
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  unsigned char *y0 = *(d->base_pre) + d->pre +
+  uint8_t *y = *(d->base_pre) + d->pre +
       (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
-  unsigned char *y;
-
-  y_stride = 32;
-  /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */
-  vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18);
-  y = xd->y_buf + y_stride + 1;
-#else
-  unsigned char *y = *(d->base_pre) + d->pre +
-      (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col;
   y_stride = d->pre_stride;
-#endif
 
   // central mv
   bestmv->as_mv.row <<= 3;
@@ -1118,7 +1076,7 @@
   MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}};
   int i, j;
 
-  unsigned char *what = (*(b->base_src) + b->src);
+  uint8_t *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
   int in_what_stride = d->pre_stride;
   int br, bc;
@@ -1125,8 +1083,8 @@
   int_mv this_mv;
   unsigned int bestsad = 0x7fffffff;
   unsigned int thissad;
-  unsigned char *base_offset;
-  unsigned char *this_offset;
+  uint8_t *base_offset;
+  uint8_t *this_offset;
   int k = -1;
   int all_in;
   int best_site = -1;
@@ -1141,7 +1099,7 @@
   bc = ref_mv->as_mv.col;
 
   // Work out the start point for the search
-  base_offset = (unsigned char *)(*(d->base_pre) + d->pre);
+  base_offset = (uint8_t *)(*(d->base_pre) + d->pre);
   this_offset = base_offset + (br * (d->pre_stride)) + bc;
   this_mv.as_mv.row = br;
   this_mv.as_mv.col = bc;
@@ -1264,11 +1222,11 @@
                              int *mvcost[2], int_mv *center_mv) {
   int i, j, step;
 
-  unsigned char *what = (*(b->base_src) + b->src);
+  uint8_t *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
-  unsigned char *in_what;
+  uint8_t *in_what;
   int in_what_stride = d->pre_stride;
-  unsigned char *best_address;
+  uint8_t *best_address;
 
   int tot_steps;
   int_mv this_mv;
@@ -1281,7 +1239,7 @@
   int this_row_offset, this_col_offset;
   search_site *ss;
 
-  unsigned char *check_here;
+  uint8_t *check_here;
   int thissad;
   MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
@@ -1300,7 +1258,8 @@
   best_mv->as_mv.col = ref_col;
 
   // Work out the start point for the search
-  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+  in_what = (uint8_t *)(*(d->base_pre) + d->pre +
+                        (ref_row * (d->pre_stride)) + ref_col);
   best_address = in_what;
 
   // Check the starting position
@@ -1374,11 +1333,11 @@
                              int *mvjcost, int *mvcost[2], int_mv *center_mv) {
   int i, j, step;
 
-  unsigned char *what = (*(b->base_src) + b->src);
+  uint8_t *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
-  unsigned char *in_what;
+  uint8_t *in_what;
   int in_what_stride = d->pre_stride;
-  unsigned char *best_address;
+  uint8_t *best_address;
 
   int tot_steps;
   int_mv this_mv;
@@ -1393,7 +1352,7 @@
   int this_col_offset;
   search_site *ss;
 
-  unsigned char *check_here;
+  uint8_t *check_here;
   unsigned int thissad;
   MACROBLOCKD *xd = &x->e_mbd;
   int_mv fcenter_mv;
@@ -1412,7 +1371,8 @@
   best_mv->as_mv.col = ref_col;
 
   // Work out the start point for the search
-  in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+  in_what = (uint8_t *)(*(d->base_pre) + d->pre +
+                        (ref_row * (d->pre_stride)) + ref_col);
   best_address = in_what;
 
   // Check the starting position
@@ -1580,18 +1540,18 @@
                           vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
                           int *mvcost[2],
                           int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
+  uint8_t *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
-  unsigned char *in_what;
+  uint8_t *in_what;
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
+  uint8_t *bestaddress;
   int_mv *best_mv = &d->bmi.as_mv.first;
   int_mv this_mv;
   int bestsad = INT_MAX;
   int r, c;
 
-  unsigned char *check_here;
+  uint8_t *check_here;
   int thissad;
   MACROBLOCKD *xd = &x->e_mbd;
 
@@ -1675,18 +1635,18 @@
                           int sad_per_bit, int distance,
                           vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
                           int *mvcost[2], int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
+  uint8_t *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
-  unsigned char *in_what;
+  uint8_t *in_what;
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
+  uint8_t *bestaddress;
   int_mv *best_mv = &d->bmi.as_mv.first;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
 
-  unsigned char *check_here;
+  uint8_t *check_here;
   unsigned int thissad;
   MACROBLOCKD *xd = &x->e_mbd;
 
@@ -1804,18 +1764,18 @@
                           vp9_variance_fn_ptr_t *fn_ptr,
                           int *mvjcost, int *mvcost[2],
                           int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
+  uint8_t *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
-  unsigned char *in_what;
+  uint8_t *in_what;
   int in_what_stride = d->pre_stride;
   int mv_stride = d->pre_stride;
-  unsigned char *bestaddress;
+  uint8_t *bestaddress;
   int_mv *best_mv = &d->bmi.as_mv.first;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
   int r, c;
 
-  unsigned char *check_here;
+  uint8_t *check_here;
   unsigned int thissad;
   MACROBLOCKD *xd = &x->e_mbd;
 
@@ -1827,7 +1787,7 @@
   int col_min = ref_col - distance;
   int col_max = ref_col + distance;
 
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);
   unsigned int sad_array[3];
   int_mv fcenter_mv;
 
@@ -1959,14 +1919,15 @@
                               int *mvjcost, int *mvcost[2], int_mv *center_mv) {
   MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   int i, j;
-  short this_row_offset, this_col_offset;
+  int this_row_offset, this_col_offset;
 
   int what_stride = b->src_stride;
   int in_what_stride = d->pre_stride;
-  unsigned char *what = (*(b->base_src) + b->src);
-  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
-  unsigned char *check_here;
+  uint8_t *what = (*(b->base_src) + b->src);
+  uint8_t *best_address = (uint8_t *)(*(d->base_pre) + d->pre +
+                                      (ref_mv->as_mv.row * (d->pre_stride)) +
+                                      ref_mv->as_mv.col);
+  uint8_t *check_here;
   unsigned int thissad;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
@@ -2036,14 +1997,15 @@
                               int *mvjcost, int *mvcost[2], int_mv *center_mv) {
   MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
   int i, j;
-  short this_row_offset, this_col_offset;
+  int this_row_offset, this_col_offset;
 
   int what_stride = b->src_stride;
   int in_what_stride = d->pre_stride;
-  unsigned char *what = (*(b->base_src) + b->src);
-  unsigned char *best_address = (unsigned char *)(*(d->base_pre) + d->pre +
-                                                  (ref_mv->as_mv.row * (d->pre_stride)) + ref_mv->as_mv.col);
-  unsigned char *check_here;
+  uint8_t *what = (*(b->base_src) + b->src);
+  uint8_t *best_address = (uint8_t *)(*(d->base_pre) + d->pre +
+                                      (ref_mv->as_mv.row * (d->pre_stride)) +
+                                      ref_mv->as_mv.col);
+  uint8_t *check_here;
   unsigned int thissad;
   int_mv this_mv;
   unsigned int bestsad = INT_MAX;
@@ -2153,17 +2115,10 @@
     fprintf(f, "    ");
     for (i = 0; i < 4; i++) {
       int this_prob;
-      int count;
 
       // context probs
-      count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-      if (count)
-        this_prob = 256 * mv_ref_ct[j][i][0] / count;
-      else
-        this_prob = 128;
+      this_prob = get_binary_prob(mv_ref_ct[j][i][0], mv_ref_ct[j][i][1]);
 
-      if (this_prob == 0)
-        this_prob = 1;
       fprintf(f, "%5d, ", this_prob);
     }
     fprintf(f, "  },\n");
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -18,6 +18,7 @@
 #ifdef ENTROPY_STATS
 extern void init_mv_ref_counts();
 extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+void print_mode_context(void);
 #endif
 
 
@@ -82,4 +83,4 @@
                                        int_mv *center_mv);
 
 
-#endif
+#endif  // VP9_ENCODER_VP9_MCOMP_H_
--- a/vp9/encoder/vp9_modecosts.h
+++ b/vp9/encoder/vp9_modecosts.h
@@ -14,4 +14,4 @@
 
 void vp9_init_mode_costs(VP9_COMP *x);
 
-#endif
+#endif  // VP9_ENCODER_VP9_MODECOSTS_H_
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -18,7 +18,7 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_extend.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_quant_common.h"
@@ -49,7 +49,8 @@
 
 static void set_default_lf_deltas(VP9_COMP *cpi);
 
-#define DEFAULT_INTERP_FILTER EIGHTTAP  /* SWITCHABLE for better performance */
+#define DEFAULT_INTERP_FILTER SWITCHABLE
+
 #define SEARCH_BEST_FILTER 0            /* to search exhaustively for
                                            best filter */
 #define RESET_FOREACH_FILTER 0          /* whether to reset the encoder state
@@ -118,7 +119,7 @@
 extern unsigned __int64 Sectionbits[500];
 #endif
 #ifdef MODE_STATS
-extern INT64 Sectionbits[500];
+extern int64_t Sectionbits[500];
 extern unsigned int y_modes[VP9_YMODES];
 extern unsigned int i8x8_modes[VP9_I8X8_MODES];
 extern unsigned int uv_modes[VP9_UV_MODES];
@@ -214,7 +215,7 @@
 static void init_base_skip_probs(void) {
   int i;
   double q;
-  int skip_prob, t;
+  int t;
 
   for (i = 0; i < QINDEX_RANGE; i++) {
     q = vp9_convert_qindex_to_q(i);
@@ -223,26 +224,9 @@
     // Based on crude best fit of old table.
     t = (int)(564.25 * pow(2.71828, (-0.012 * q)));
 
-    skip_prob = t;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][1] = skip_prob;
-
-    skip_prob = t * 3 / 4;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][2] = skip_prob;
-
-    skip_prob = t * 5 / 4;
-    if (skip_prob < 1)
-      skip_prob = 1;
-    else if (skip_prob > 255)
-      skip_prob = 255;
-    base_skip_false_prob[i][0] = skip_prob;
+    base_skip_false_prob[i][1] = clip_prob(t);
+    base_skip_false_prob[i][2] = clip_prob(t * 3 / 4);
+    base_skip_false_prob[i][0] = clip_prob(t * 5 / 4);
   }
 }
 
@@ -571,43 +555,19 @@
 }
 
 static void update_reference_segmentation_map(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int row, col, sb_rows = (cm->mb_rows + 1) >> 1, sb_cols = (cm->mb_cols + 1) >> 1;
-  MODE_INFO *mi = cm->mi;
-  uint8_t *segmap = cpi->segmentation_map;
-  uint8_t *segcache = cm->last_frame_seg_map;
+  VP9_COMMON *const cm = &cpi->common;
+  int row, col;
+  MODE_INFO *mi, *mi_ptr = cm->mi;
+  uint8_t *cache_ptr = cm->last_frame_seg_map, *cache;
 
-  for (row = 0; row < sb_rows; row++) {
-    for (col = 0; col < sb_cols; col++) {
-      MODE_INFO *miptr = mi + col * 2;
-      uint8_t *cache = segcache + col * 2;
-#if CONFIG_SUPERBLOCKS
-      if (miptr->mbmi.encoded_as_sb) {
-        cache[0] = miptr->mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr->mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr->mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[cm->mb_cols + 1] = miptr->mbmi.segment_id;
-        }
-      } else
-#endif
-      {
-        cache[0] = miptr[0].mbmi.segment_id;
-        if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-          cache[1] = miptr[1].mbmi.segment_id;
-        if (!(cm->mb_rows & 1) || row < sb_rows - 1) {
-          cache[cm->mb_cols] = miptr[cm->mode_info_stride].mbmi.segment_id;
-          if (!(cm->mb_cols & 1) || col < sb_cols - 1)
-            cache[1] = miptr[1].mbmi.segment_id;
-          cache[cm->mb_cols + 1] = miptr[cm->mode_info_stride + 1].mbmi.segment_id;
-        }
-      }
+  for (row = 0; row < cm->mb_rows; row++) {
+    mi = mi_ptr;
+    cache = cache_ptr;
+    for (col = 0; col < cm->mb_cols; col++, mi++, cache++) {
+      cache[0] = mi->mbmi.segment_id;
     }
-    segmap += 2 * cm->mb_cols;
-    segcache += 2 * cm->mb_cols;
-    mi += 2 * cm->mode_info_stride;
+    mi_ptr += cm->mode_info_stride;
+    cache_ptr += cm->mb_cols;
   }
 }
 
@@ -666,7 +626,6 @@
 
   sf->first_step = 0;
   sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
-  sf->improved_mv_pred = 1;
 
   // default thresholds to 0
   for (i = 0; i < MAX_MODES; i++)
@@ -674,47 +633,6 @@
 
   switch (Mode) {
     case 0: // best quality mode
-#if CONFIG_PRED_FILTER
-      sf->thresh_mult[THR_ZEROMV        ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_ZEROG         ] = 0;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
-      sf->thresh_mult[THR_ZEROA         ] = 0;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV     ] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_NEARESTG      ] = 0;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
-      sf->thresh_mult[THR_NEARESTA      ] = 0;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
-      sf->thresh_mult[THR_NEARMV        ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_NEARG         ] = 0;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARA         ] = 0;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
-
-      sf->thresh_mult[THR_DC       ] = 0;
-
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2000;
-      sf->thresh_mult[THR_I8X8_PRED] = 2000;
-      sf->thresh_mult[THR_TM       ] = 1000;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
-      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
-#else
       sf->thresh_mult[THR_ZEROMV   ] = 0;
       sf->thresh_mult[THR_ZEROG    ] = 0;
       sf->thresh_mult[THR_ZEROA    ] = 0;
@@ -742,7 +660,7 @@
       sf->thresh_mult[THR_NEWMV    ] = 1000;
       sf->thresh_mult[THR_NEWG     ] = 1000;
       sf->thresh_mult[THR_NEWA     ] = 1000;
-#endif
+
       sf->thresh_mult[THR_SPLITMV  ] = 2500;
       sf->thresh_mult[THR_SPLITG   ] = 5000;
       sf->thresh_mult[THR_SPLITA   ] = 5000;
@@ -785,14 +703,10 @@
       sf->search_best_filter = SEARCH_BEST_FILTER;
       break;
     case 1:
-#if CONFIG_PRED_FILTER
       sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
       sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
       sf->thresh_mult[THR_DC       ] = 0;
       sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
       sf->thresh_mult[THR_V_PRED   ] = 1000;
       sf->thresh_mult[THR_H_PRED   ] = 1000;
       sf->thresh_mult[THR_D45_PRED ] = 1000;
@@ -806,18 +720,12 @@
       sf->thresh_mult[THR_TM       ] = 1000;
 
       sf->thresh_mult[THR_NEARESTG ] = 1000;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 1000;
       sf->thresh_mult[THR_NEARESTA ] = 1000;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 1000;
 
       sf->thresh_mult[THR_ZEROG    ] = 1000;
       sf->thresh_mult[THR_ZEROA    ] = 1000;
       sf->thresh_mult[THR_NEARG    ] = 1000;
       sf->thresh_mult[THR_NEARA    ] = 1000;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 1000;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 1000;
 
       sf->thresh_mult[THR_ZEROMV   ] = 0;
       sf->thresh_mult[THR_ZEROG    ] = 0;
@@ -828,61 +736,11 @@
       sf->thresh_mult[THR_NEARMV   ] = 0;
       sf->thresh_mult[THR_NEARG    ] = 0;
       sf->thresh_mult[THR_NEARA    ] = 0;
-      sf->thresh_mult[THR_ZEROMV_FILT   ] = 0;
-      sf->thresh_mult[THR_ZEROG_FILT    ] = 0;
-      sf->thresh_mult[THR_ZEROA_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV_FILT] = 0;
-      sf->thresh_mult[THR_NEARESTG_FILT ] = 0;
-      sf->thresh_mult[THR_NEARESTA_FILT ] = 0;
-      sf->thresh_mult[THR_NEARMV_FILT   ] = 0;
-      sf->thresh_mult[THR_NEARG_FILT    ] = 0;
-      sf->thresh_mult[THR_NEARA_FILT    ] = 0;
 
       sf->thresh_mult[THR_NEWMV    ] = 1000;
       sf->thresh_mult[THR_NEWG     ] = 1000;
       sf->thresh_mult[THR_NEWA     ] = 1000;
-      sf->thresh_mult[THR_NEWMV_FILT    ] = 1000;
-      sf->thresh_mult[THR_NEWG_FILT     ] = 1000;
-      sf->thresh_mult[THR_NEWA_FILT     ] = 1000;
-#else
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_DC       ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_V_PRED   ] = 1000;
-      sf->thresh_mult[THR_H_PRED   ] = 1000;
-      sf->thresh_mult[THR_D45_PRED ] = 1000;
-      sf->thresh_mult[THR_D135_PRED] = 1000;
-      sf->thresh_mult[THR_D117_PRED] = 1000;
-      sf->thresh_mult[THR_D153_PRED] = 1000;
-      sf->thresh_mult[THR_D27_PRED ] = 1000;
-      sf->thresh_mult[THR_D63_PRED ] = 1000;
-      sf->thresh_mult[THR_B_PRED   ] = 2500;
-      sf->thresh_mult[THR_I8X8_PRED] = 2500;
-      sf->thresh_mult[THR_TM       ] = 1000;
 
-      sf->thresh_mult[THR_NEARESTG ] = 1000;
-      sf->thresh_mult[THR_NEARESTA ] = 1000;
-
-      sf->thresh_mult[THR_ZEROG    ] = 1000;
-      sf->thresh_mult[THR_ZEROA    ] = 1000;
-      sf->thresh_mult[THR_NEARG    ] = 1000;
-      sf->thresh_mult[THR_NEARA    ] = 1000;
-
-      sf->thresh_mult[THR_ZEROMV   ] = 0;
-      sf->thresh_mult[THR_ZEROG    ] = 0;
-      sf->thresh_mult[THR_ZEROA    ] = 0;
-      sf->thresh_mult[THR_NEARESTMV] = 0;
-      sf->thresh_mult[THR_NEARESTG ] = 0;
-      sf->thresh_mult[THR_NEARESTA ] = 0;
-      sf->thresh_mult[THR_NEARMV   ] = 0;
-      sf->thresh_mult[THR_NEARG    ] = 0;
-      sf->thresh_mult[THR_NEARA    ] = 0;
-
-      sf->thresh_mult[THR_NEWMV    ] = 1000;
-      sf->thresh_mult[THR_NEWG     ] = 1000;
-      sf->thresh_mult[THR_NEWA     ] = 1000;
-#endif
       sf->thresh_mult[THR_SPLITMV  ] = 1700;
       sf->thresh_mult[THR_SPLITG   ] = 4500;
       sf->thresh_mult[THR_SPLITA   ] = 4500;
@@ -958,9 +816,6 @@
 
         if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
           sf->thresh_mult[THR_NEWMV    ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
-#endif
           sf->thresh_mult[THR_SPLITMV  ] = 10000;
           sf->thresh_mult[THR_COMP_SPLITLG  ] = 20000;
         }
@@ -970,12 +825,6 @@
           sf->thresh_mult[THR_ZEROG    ] = 1500;
           sf->thresh_mult[THR_NEARG    ] = 1500;
           sf->thresh_mult[THR_NEWG     ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTG_FILT ] = 1500;
-          sf->thresh_mult[THR_ZEROG_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEARG_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEWG_FILT     ] = 2000;
-#endif
           sf->thresh_mult[THR_SPLITG   ] = 20000;
           sf->thresh_mult[THR_COMP_SPLITGA  ] = 20000;
         }
@@ -985,12 +834,6 @@
           sf->thresh_mult[THR_ZEROA    ] = 1500;
           sf->thresh_mult[THR_NEARA    ] = 1500;
           sf->thresh_mult[THR_NEWA     ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTA_FILT ] = 1500;
-          sf->thresh_mult[THR_ZEROA_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEARA_FILT    ] = 1500;
-          sf->thresh_mult[THR_NEWA_FILT     ] = 2000;
-#endif
           sf->thresh_mult[THR_SPLITA   ] = 20000;
           sf->thresh_mult[THR_COMP_SPLITLA  ] = 10000;
         }
@@ -1047,9 +890,6 @@
 
         if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
           sf->thresh_mult[THR_NEWMV    ] = 2000;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEWMV_FILT    ] = 2000;
-#endif
           sf->thresh_mult[THR_SPLITMV  ] = 25000;
           sf->thresh_mult[THR_COMP_SPLITLG  ] = 50000;
         }
@@ -1059,12 +899,6 @@
           sf->thresh_mult[THR_ZEROG    ] = 2000;
           sf->thresh_mult[THR_NEARG    ] = 2000;
           sf->thresh_mult[THR_NEWG     ] = 2500;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTG_FILT ] = 2000;
-          sf->thresh_mult[THR_ZEROG_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEARG_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEWG_FILT     ] = 2500;
-#endif
           sf->thresh_mult[THR_SPLITG   ] = 50000;
           sf->thresh_mult[THR_COMP_SPLITGA  ] = 50000;
         }
@@ -1074,12 +908,6 @@
           sf->thresh_mult[THR_ZEROA    ] = 2000;
           sf->thresh_mult[THR_NEARA    ] = 2000;
           sf->thresh_mult[THR_NEWA     ] = 2500;
-#if CONFIG_PRED_FILTER
-          sf->thresh_mult[THR_NEARESTA_FILT ] = 2000;
-          sf->thresh_mult[THR_ZEROA_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEARA_FILT    ] = 2000;
-          sf->thresh_mult[THR_NEWA_FILT     ] = 2500;
-#endif
           sf->thresh_mult[THR_SPLITA   ] = 50000;
           sf->thresh_mult[THR_COMP_SPLITLA  ] = 25000;
         }
@@ -1130,12 +958,6 @@
     sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
     sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
     sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEWMV_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARESTMV_FILT] = INT_MAX;
-    sf->thresh_mult[THR_ZEROMV_FILT   ] = INT_MAX;
-    sf->thresh_mult[THR_NEARMV_FILT   ] = INT_MAX;
-#endif
     sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
   }
 
@@ -1144,12 +966,6 @@
     sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
     sf->thresh_mult[THR_NEARG    ] = INT_MAX;
     sf->thresh_mult[THR_NEWG     ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEARESTG_FILT ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROG_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARG_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWG_FILT     ] = INT_MAX;
-#endif
 #if CONFIG_COMP_INTERINTRA_PRED
     sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = INT_MAX;
     sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX;
@@ -1164,12 +980,6 @@
     sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
     sf->thresh_mult[THR_NEARA    ] = INT_MAX;
     sf->thresh_mult[THR_NEWA     ] = INT_MAX;
-#if CONFIG_PRED_FILTER
-    sf->thresh_mult[THR_NEARESTA_FILT ] = INT_MAX;
-    sf->thresh_mult[THR_ZEROA_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEARA_FILT    ] = INT_MAX;
-    sf->thresh_mult[THR_NEWA_FILT     ] = INT_MAX;
-#endif
 #if CONFIG_COMP_INTERINTRA_PRED
     sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = INT_MAX;
     sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX;
@@ -1803,12 +1613,11 @@
   cm->prob_last_coded               = 128;
   cm->prob_gf_coded                 = 128;
   cm->prob_intra_coded              = 63;
-#if CONFIG_SUPERBLOCKS
-  cm->sb_coded                      = 200;
-#endif
+  cm->sb32_coded                    = 200;
+  cm->sb64_coded                    = 200;
   for (i = 0; i < COMP_PRED_CONTEXTS; i++)
     cm->prob_comppred[i]         = 128;
-  for (i = 0; i < TX_SIZE_MAX - 1; i++)
+  for (i = 0; i < TX_SIZE_MAX_SB - 1; i++)
     cm->prob_tx[i]               = 128;
 
   // Prime the recent reference frame useage counters.
@@ -1918,10 +1727,7 @@
 
 #endif
 
-#ifndef LLONG_MAX
-#define LLONG_MAX  9223372036854775807LL
-#endif
-  cpi->first_time_stamp_ever = LLONG_MAX;
+  cpi->first_time_stamp_ever = INT64_MAX;
 
   cpi->frames_till_gf_update_due      = 0;
   cpi->key_frame_count              = 1;
@@ -2005,13 +1811,16 @@
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
 
-#if CONFIG_SUPERBLOCKS
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
       vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
       vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
-#endif
 
+  BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
+      vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+      vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
+      vp9_sad64x64x4d)
+
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
        vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
        vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
@@ -2221,11 +2030,11 @@
       fprintf(fmode, "[VP9_KF_BINTRAMODES][VP9_KF_BINTRAMODES]"
                      "[VP9_KF_BINTRAMODES] =\n{\n");
 
-      for (i = 0; i < VP8_KF_BINTRAMODES; i++) {
+      for (i = 0; i < VP9_KF_BINTRAMODES; i++) {
 
         fprintf(fmode, "    { // Above Mode :  %d\n", i);
 
-        for (j = 0; j < VP8_KF_BINTRAMODES; j++) {
+        for (j = 0; j < VP9_KF_BINTRAMODES; j++) {
 
           fprintf(fmode, "        {");
 
@@ -2310,8 +2119,8 @@
 }
 
 
-static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
-                                 unsigned char *recon, int recon_stride,
+static uint64_t calc_plane_error(uint8_t *orig, int orig_stride,
+                                 uint8_t *recon, int recon_stride,
                                  unsigned int cols, unsigned int rows) {
   unsigned int row, col;
   uint64_t total_sse = 0;
@@ -2327,9 +2136,9 @@
 
     /* Handle odd-sized width */
     if (col < cols) {
-      unsigned int   border_row, border_col;
-      unsigned char *border_orig = orig;
-      unsigned char *border_recon = recon;
+      unsigned int border_row, border_col;
+      uint8_t *border_orig = orig;
+      uint8_t *border_recon = recon;
 
       for (border_row = 0; border_row < 16; border_row++) {
         for (border_col = col; border_col < cols; border_col++) {
@@ -2488,7 +2297,7 @@
 
 #ifdef OUTPUT_YUV_SRC
 void vp9_write_yuv_frame(YV12_BUFFER_CONFIG *s) {
-  unsigned char *src = s->y_buffer;
+  uint8_t *src = s->y_buffer;
   int h = s->y_height;
 
   do {
@@ -2517,7 +2326,7 @@
 #ifdef OUTPUT_YUV_REC
 void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
   YV12_BUFFER_CONFIG *s = cm->frame_to_show;
-  unsigned char *src = s->y_buffer;
+  uint8_t *src = s->y_buffer;
   int h = cm->Height;
 
   do {
@@ -2690,9 +2499,9 @@
   int i, j;
   int num_edge_pels = 0;
   int num_pels = (frame->y_height - 2) * (frame->y_width - 2);
-  unsigned char *prev = frame->y_buffer + 1;
-  unsigned char *curr = frame->y_buffer + 1 + frame->y_stride;
-  unsigned char *next = frame->y_buffer + 1 + 2 * frame->y_stride;
+  uint8_t *prev = frame->y_buffer + 1;
+  uint8_t *curr = frame->y_buffer + 1 + frame->y_stride;
+  uint8_t *next = frame->y_buffer + 1 + 2 * frame->y_stride;
   for (i = 1; i < frame->y_height - 1; i++) {
     for (j = 1; j < frame->y_width - 1; j++) {
       /* Sobel hor and ver gradients */
@@ -2714,10 +2523,10 @@
 
 // Function to test for conditions that indicate we should loop
 // back and recode a frame.
-static BOOL recode_loop_test(VP9_COMP *cpi,
-                             int high_limit, int low_limit,
-                             int q, int maxq, int minq) {
-  BOOL    force_recode = FALSE;
+static int recode_loop_test(VP9_COMP *cpi,
+                            int high_limit, int low_limit,
+                            int q, int maxq, int minq) {
+  int force_recode = FALSE;
   VP9_COMMON *cm = &cpi->common;
 
   // Is frame recode allowed at all
@@ -2850,7 +2659,7 @@
 
   if (cm->filter_level > 0) {
     vp9_set_alt_lf_level(cpi, cm->filter_level);
-    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd);
+    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0);
   }
 
   vp8_yv12_extend_frame_borders(cm->frame_to_show);
@@ -2857,50 +2666,45 @@
 
 }
 
-#if CONFIG_PRED_FILTER
-void select_pred_filter_mode(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
+void select_interp_filter_type(VP9_COMP *cpi) {
+  int i;
+  int high_filter_index = 0;
+  unsigned int thresh;
+  unsigned int high_count = 0;
+  unsigned int count_sum = 0;
+  unsigned int *hist = cpi->best_switchable_interp_count;
 
-  int prob_pred_filter_off = cm->prob_pred_filter_off;
+  if (DEFAULT_INTERP_FILTER != SWITCHABLE) {
+    cpi->common.mcomp_filter_type = DEFAULT_INTERP_FILTER;
+    return;
+  }
 
-  // Force filter on/off if probability is extreme
-  if (prob_pred_filter_off >= 255 * 0.95)
-    cm->pred_filter_mode = 0;   // Off at the frame level
-  else if (prob_pred_filter_off <= 255 * 0.05)
-    cm->pred_filter_mode = 1;   // On at the frame level
-  else
-    cm->pred_filter_mode = 2;   // Selectable at the MB level
-}
+  // TODO(agrange): Look at using RD criteria to select the interpolation
+  // filter to use for the next frame rather than this simpler counting scheme.
 
-void update_pred_filt_prob(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int prob_pred_filter_off;
+  // Select the interpolation filter mode for the next frame
+  // based on the selection frequency seen in the current frame.
+  for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
+    unsigned int count = hist[i];
+    count_sum += count;
+    if (count > high_count) {
+      high_count = count;
+      high_filter_index = i;
+    }
+  }
 
-  // Based on the selection in the previous frame determine what mode
-  // to use for the current frame and work out the signaling probability
-  if (cpi->pred_filter_on_count + cpi->pred_filter_off_count) {
-    prob_pred_filter_off = cpi->pred_filter_off_count * 256 /
-                           (cpi->pred_filter_on_count + cpi->pred_filter_off_count);
+  thresh = (unsigned int)(0.80 * count_sum);
 
-    if (prob_pred_filter_off < 1)
-      prob_pred_filter_off = 1;
-
-    if (prob_pred_filter_off > 255)
-      prob_pred_filter_off = 255;
-
-    cm->prob_pred_filter_off = prob_pred_filter_off;
-  } else
-    cm->prob_pred_filter_off = 128;
-  /*
-      {
-        FILE *fp = fopen("filt_use.txt", "a");
-        fprintf (fp, "%d %d prob=%d\n", cpi->pred_filter_off_count,
-                 cpi->pred_filter_on_count, cm->prob_pred_filter_off);
-        fclose(fp);
-      }
-  */
+  if (high_count > thresh) {
+    // One filter accounts for 80+% of cases so force the next
+    // frame to use this filter exclusively using frame-level flag.
+    cpi->common.mcomp_filter_type = vp9_switchable_interp[high_filter_index];
+  } else {
+    // Use a MB-level switchable filter selection strategy.
+    cpi->common.mcomp_filter_type = SWITCHABLE;
+  }
 }
-#endif
+
 #if CONFIG_COMP_INTERINTRA_PRED
 static void select_interintra_mode(VP9_COMP *cpi) {
   static const double threshold = 0.01;
@@ -2915,13 +2719,10 @@
 }
 #endif
 
-static void encode_frame_to_data_rate
-(
-  VP9_COMP *cpi,
-  unsigned long *size,
-  unsigned char *dest,
-  unsigned int *frame_flags
-) {
+static void encode_frame_to_data_rate(VP9_COMP *cpi,
+                                      unsigned long *size,
+                                      unsigned char *dest,
+                                      unsigned int *frame_flags) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &cpi->mb.e_mbd;
 
@@ -2961,12 +2762,16 @@
 
   /* list of filters to search over */
   int mcomp_filters_to_search[] = {
-    EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
+#if CONFIG_ENABLE_6TAP
+      EIGHTTAP, EIGHTTAP_SHARP, SIXTAP, SWITCHABLE
+#else
+      EIGHTTAP, EIGHTTAP_SHARP, EIGHTTAP_SMOOTH, SWITCHABLE
+#endif
   };
   int mcomp_filters = sizeof(mcomp_filters_to_search) /
       sizeof(*mcomp_filters_to_search);
   int mcomp_filter_index = 0;
-  INT64 mcomp_filter_cost[4];
+  int64_t mcomp_filter_cost[4];
 
   // Clear down mmx registers to allow floating point in what follows
   vp9_clear_system_state();
@@ -3020,11 +2825,6 @@
   // Set default state for segment based loop filter update flags
   xd->mode_ref_lf_delta_update = 0;
 
-#if CONFIG_NEW_MVREF
-  // Temp defaults probabilities for ecnoding the MV ref id signal
-  vpx_memset(xd->mb_mv_ref_id_probs, 192,
-             sizeof(xd->mb_mv_ref_id_probs));
-#endif
 
   // Set various flags etc to special state if it is a key frame
   if (cm->frame_type == KEY_FRAME) {
@@ -3164,7 +2964,7 @@
   if (cpi->active_worst_quality < cpi->active_best_quality)
     cpi->active_worst_quality = cpi->active_best_quality;
 
-  // Specuial case code to try and match quality with forced key frames
+  // Special case code to try and match quality with forced key frames
   if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
     Q = cpi->last_boosted_qindex;
   } else {
@@ -3216,7 +3016,7 @@
 #if CONFIG_POSTPROC
 
   if (cpi->oxcf.noise_sensitivity > 0) {
-    unsigned char *src;
+    uint8_t *src;
     int l = 0;
 
     switch (cpi->oxcf.noise_sensitivity) {
@@ -3340,13 +3140,6 @@
 
     vp9_clear_system_state();  // __asm emms;
 
-#if CONFIG_PRED_FILTER
-    // Update prediction filter on/off probability based on
-    // selection made for the current frame
-    if (cm->frame_type != KEY_FRAME)
-      update_pred_filt_prob(cpi);
-#endif
-
     // Dummy pack of the bitstream using up to date stats to get an
     // accurate estimate of output frame size to determine if we need
     // to recode.
@@ -3521,42 +3314,11 @@
     if (cpi->is_src_frame_alt_ref)
       Loop = FALSE;
 
-    if (cm->frame_type != KEY_FRAME &&
-        !sf->search_best_filter &&
-        cm->mcomp_filter_type == SWITCHABLE) {
-      int interp_factor = Q / 3;  /* denominator is 256 */
-      int count[VP9_SWITCHABLE_FILTERS];
-      int tot_count = 0, c = 0, thr;
-      int i, j;
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        count[i] = 0;
-        for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {
-          count[i] += cpi->switchable_interp_count[j][i];
-        }
-        tot_count += count[i];
-      }
-
-      thr = ((tot_count * interp_factor + 128) >> 8);
-      for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-        c += (count[i] >= thr);
-      }
-      if (c == 1) {
-        /* Mostly one filter is used. So set the filter at frame level */
-        for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
-          if (count[i]) {
-            cm->mcomp_filter_type = vp9_switchable_interp[i];
-            Loop = TRUE;  /* Make sure to loop since the filter changed */
-            break;
-          }
-        }
-      }
-    }
-
     if (Loop == FALSE && cm->frame_type != KEY_FRAME && sf->search_best_filter) {
       if (mcomp_filter_index < mcomp_filters) {
-        INT64 err = vp9_calc_ss_err(cpi->Source,
+        int64_t err = vp9_calc_ss_err(cpi->Source,
                                     &cm->yv12_fb[cm->new_fb_idx]);
-        INT64 rate = cpi->projected_frame_size << 8;
+        int64_t rate = cpi->projected_frame_size << 8;
         mcomp_filter_cost[mcomp_filter_index] =
           (RDCOST(cpi->RDMULT, cpi->RDDIV, rate, err));
         mcomp_filter_index++;
@@ -3566,7 +3328,7 @@
           Loop = TRUE;
         } else {
           int f;
-          INT64 best_cost = mcomp_filter_cost[0];
+          int64_t best_cost = mcomp_filter_cost[0];
           int mcomp_best_filter = mcomp_filters_to_search[0];
           for (f = 1; f < mcomp_filters; f++) {
             if (mcomp_filter_cost[f] < best_cost) {
@@ -3606,6 +3368,7 @@
 
     if (Loop == TRUE) {
       loop_count++;
+
 #if CONFIG_INTERNAL_STATS
       cpi->tot_recode_hits++;
 #endif
@@ -3681,26 +3444,20 @@
     update_reference_segmentation_map(cpi);
   }
 
-#if CONFIG_PRED_FILTER
-  // Select the prediction filtering mode to use for the
-  // next frame based on the current frame selections
-  if (cm->frame_type != KEY_FRAME)
-    select_pred_filter_mode(cpi);
-#endif
-
   update_reference_frames(cm);
-  vp9_copy(cpi->common.fc.coef_counts, cpi->coef_counts);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts, cpi->hybrid_coef_counts);
+  vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts_4x4,
+           cpi->hybrid_coef_counts_4x4);
   vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);
-  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8, cpi->hybrid_coef_counts_8x8);
+  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8,
+           cpi->hybrid_coef_counts_8x8);
   vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);
   vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,
            cpi->hybrid_coef_counts_16x16);
+  vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);
   vp9_adapt_coef_probs(&cpi->common);
   if (cpi->common.frame_type != KEY_FRAME) {
-#if CONFIG_SUPERBLOCKS
     vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);
-#endif
     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);
     vp9_copy(cpi->common.fc.uv_mode_counts, cpi->y_uv_mode_count);
     vp9_copy(cpi->common.fc.bmode_counts, cpi->bmode_count);
@@ -3812,19 +3569,6 @@
   // in this frame.
   update_base_skip_probs(cpi);
 
-#if 0 //CONFIG_NEW_MVREF && CONFIG_INTERNAL_STATS
-  {
-    FILE *f = fopen("mv_ref_dist.stt", "a");
-    unsigned int i;
-    for (i = 0; i < MAX_MV_REFS; ++i) {
-      fprintf(f, "%10d", cpi->best_ref_index_counts[0][i]);
-    }
-    fprintf(f, "\n" );
-
-    fclose(f);
-  }
-#endif
-
 #if 0// 1 && CONFIG_INTERNAL_STATS
   {
     FILE *f = fopen("tmp.stt", "a");
@@ -4493,8 +4237,8 @@
   int i, j;
   int Total = 0;
 
-  unsigned char *src = source->y_buffer;
-  unsigned char *dst = dest->y_buffer;
+  uint8_t *src = source->y_buffer;
+  uint8_t *dst = dest->y_buffer;
 
   // Loop through the Y plane raw and reconstruction data summing (square differences)
   for (i = 0; i < source->y_height; i += 16) {
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -41,19 +41,11 @@
 #define AF_THRESH2  100
 #define ARF_DECAY_THRESH 12
 
-#if CONFIG_PRED_FILTER
 #if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 66
-#else
 #define MAX_MODES 54
-#endif
-#else  // CONFIG_PRED_FILTER
-#if CONFIG_COMP_INTERINTRA_PRED
-#define MAX_MODES 54
 #else
 #define MAX_MODES 42
 #endif
-#endif  // CONFIG_PRED_FILTER
 
 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
@@ -94,24 +86,15 @@
   // 0 = BPRED, ZERO_MV, MV, SPLIT
   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
 
-  vp9_prob coef_probs[BLOCK_TYPES]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs[BLOCK_TYPES]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];
 
-  vp9_prob coef_probs_8x8[BLOCK_TYPES_8X8]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-  vp9_prob coef_probs_16x16[BLOCK_TYPES_16X16]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  vp9_prob hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]
-      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-
-#if CONFIG_SUPERBLOCKS
   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];
-#endif
   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */
   vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1];
   vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1];
@@ -180,31 +163,21 @@
   MBGRAPH_MB_STATS *mb_stats;
 } MBGRAPH_FRAME_STATS;
 
-#if CONFIG_PRED_FILTER
 typedef enum {
   THR_ZEROMV,
-  THR_ZEROMV_FILT,
   THR_DC,
 
   THR_NEARESTMV,
-  THR_NEARESTMV_FILT,
   THR_NEARMV,
-  THR_NEARMV_FILT,
 
   THR_ZEROG,
-  THR_ZEROG_FILT,
   THR_NEARESTG,
-  THR_NEARESTG_FILT,
 
   THR_ZEROA,
-  THR_ZEROA_FILT,
   THR_NEARESTA,
-  THR_NEARESTA_FILT,
 
   THR_NEARG,
-  THR_NEARG_FILT,
   THR_NEARA,
-  THR_NEARA_FILT,
 
   THR_V_PRED,
   THR_H_PRED,
@@ -217,11 +190,8 @@
   THR_TM,
 
   THR_NEWMV,
-  THR_NEWMV_FILT,
   THR_NEWG,
-  THR_NEWG_FILT,
   THR_NEWA,
-  THR_NEWA_FILT,
 
   THR_SPLITMV,
   THR_SPLITG,
@@ -267,83 +237,7 @@
 #endif
 }
 THR_MODES;
-#else
-typedef enum {
-  THR_ZEROMV,
-  THR_DC,
 
-  THR_NEARESTMV,
-  THR_NEARMV,
-
-  THR_ZEROG,
-  THR_NEARESTG,
-
-  THR_ZEROA,
-  THR_NEARESTA,
-
-  THR_NEARG,
-  THR_NEARA,
-
-  THR_V_PRED,
-  THR_H_PRED,
-  THR_D45_PRED,
-  THR_D135_PRED,
-  THR_D117_PRED,
-  THR_D153_PRED,
-  THR_D27_PRED,
-  THR_D63_PRED,
-  THR_TM,
-
-  THR_NEWMV,
-  THR_NEWG,
-  THR_NEWA,
-
-  THR_SPLITMV,
-  THR_SPLITG,
-  THR_SPLITA,
-
-  THR_B_PRED,
-  THR_I8X8_PRED,
-
-  THR_COMP_ZEROLG,
-  THR_COMP_NEARESTLG,
-  THR_COMP_NEARLG,
-
-  THR_COMP_ZEROLA,
-  THR_COMP_NEARESTLA,
-  THR_COMP_NEARLA,
-
-  THR_COMP_ZEROGA,
-  THR_COMP_NEARESTGA,
-  THR_COMP_NEARGA,
-
-  THR_COMP_NEWLG,
-  THR_COMP_NEWLA,
-  THR_COMP_NEWGA,
-
-  THR_COMP_SPLITLG,
-  THR_COMP_SPLITLA,
-  THR_COMP_SPLITGA,
-#if CONFIG_COMP_INTERINTRA_PRED
-  THR_COMP_INTERINTRA_ZEROL,
-  THR_COMP_INTERINTRA_NEARESTL,
-  THR_COMP_INTERINTRA_NEARL,
-  THR_COMP_INTERINTRA_NEWL,
-
-  THR_COMP_INTERINTRA_ZEROG,
-  THR_COMP_INTERINTRA_NEARESTG,
-  THR_COMP_INTERINTRA_NEARG,
-  THR_COMP_INTERINTRA_NEWG,
-
-  THR_COMP_INTERINTRA_ZEROA,
-  THR_COMP_INTERINTRA_NEARESTA,
-  THR_COMP_INTERINTRA_NEARA,
-  THR_COMP_INTERINTRA_NEWA,
-#endif
-}
-THR_MODES;
-#endif
-
 typedef enum {
   DIAMOND = 0,
   NSTEP = 1,
@@ -364,7 +258,6 @@
   int first_step;
   int optimize_coefficients;
   int no_skip_block4x4_search;
-  int improved_mv_pred;
   int search_best_filter;
 
 } SPEED_FEATURES;
@@ -397,6 +290,7 @@
   BLOCK_16X16,
   BLOCK_MAX_SEGMENTS,
   BLOCK_32X32 = BLOCK_MAX_SEGMENTS,
+  BLOCK_64X64,
   BLOCK_MAX_SB_SEGMENTS,
 };
 
@@ -435,6 +329,13 @@
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);
   DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);
 
+  DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]);
+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]);
+
   MACROBLOCK mb;
   VP9_COMMON common;
   VP9_CONFIG oxcf;
@@ -483,8 +384,9 @@
   int comp_pred_count[COMP_PRED_CONTEXTS];
   int single_pred_count[COMP_PRED_CONTEXTS];
   // FIXME contextualize
-  int txfm_count[TX_SIZE_MAX];
-  int txfm_count_8x8p[TX_SIZE_MAX - 1];
+  int txfm_count_32x32p[TX_SIZE_MAX_SB];
+  int txfm_count_16x16p[TX_SIZE_MAX_MB];
+  int txfm_count_8x8p[TX_SIZE_MAX_MB - 1];
   int64_t rd_tx_select_diff[NB_TXFM_MODES];
   int rd_tx_select_threshes[4][NB_TXFM_MODES];
 
@@ -566,10 +468,9 @@
 
   int cq_target_quality;
 
-#if CONFIG_SUPERBLOCKS
-  int sb_count;
+  int sb32_count[2];
+  int sb64_count[2];
   int sb_ymode_count [VP9_I32X32_MODES];
-#endif
   int ymode_count[VP9_YMODES];        /* intra MB type cts this frame */
   int bmode_count[VP9_NKF_BINTRAMODES];
   int i8x8_mode_count[VP9_I8X8_MODES];
@@ -583,27 +484,31 @@
 
   nmv_context_counts NMVcount;
 
-  unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_probs frame_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];
+  vp9_coeff_stats frame_hybrid_branch_ct_4x4[BLOCK_TYPES_4X4];
 
-  unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_probs frame_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];
+  vp9_coeff_stats frame_hybrid_branch_ct_8x8[BLOCK_TYPES_8X8];
 
-  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
-  unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */
-  vp9_prob frame_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
-  unsigned int frame_hybrid_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];
+  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_probs frame_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];
+  vp9_coeff_stats frame_hybrid_branch_ct_16x16[BLOCK_TYPES_16X16];
 
+  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];
+  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES_32X32];
+  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES_32X32];
+
   int gfu_boost;
   int last_boost;
   int kf_boost;
@@ -783,14 +688,12 @@
 
   int dummy_packing;    /* flag to indicate if packing is dummy */
 
-#if CONFIG_PRED_FILTER
-  int pred_filter_on_count;
-  int pred_filter_off_count;
-#endif
   unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]
                                       [VP9_SWITCHABLE_FILTERS];
+  unsigned int best_switchable_interp_count[VP9_SWITCHABLE_FILTERS];
+
 #if CONFIG_NEW_MVREF
-  unsigned int best_ref_index_counts[MAX_REF_FRAMES][MAX_MV_REFS];
+  unsigned int mb_mv_ref_count[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
 #endif
 
 } VP9_COMP;
@@ -825,4 +728,5 @@
                          "Failed to allocate "#lval);\
   } while(0)
 #endif
-#endif  // __INC_ONYX_INT_H
+
+#endif  // VP9_ENCODER_VP9_ONYX_INT_H_
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -14,7 +14,7 @@
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "./vpx_scale_rtcd.h"
@@ -21,7 +21,7 @@
 
 void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
                                    YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
-  unsigned char *src_y, *dst_y;
+  uint8_t *src_y, *dst_y;
   int yheight;
   int ystride;
   int yoffset;
@@ -49,8 +49,8 @@
   int i, j;
   int Total = 0;
   int srcoffset, dstoffset;
-  unsigned char *src = source->y_buffer;
-  unsigned char *dst = dest->y_buffer;
+  uint8_t *src = source->y_buffer;
+  uint8_t *dst = dest->y_buffer;
 
   int linestocopy = (source->y_height >> (Fraction + 4));
 
@@ -266,7 +266,7 @@
 
   // Get baseline error score
   vp9_set_alt_lf_level(cpi, filt_mid);
-  vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid);
+  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1);
 
   best_err = vp9_calc_ss_err(sd, cm->frame_to_show);
   filt_best = filt_mid;
@@ -291,7 +291,7 @@
     if ((filt_direction <= 0) && (filt_low != filt_mid)) {
       // Get Low filter error score
       vp9_set_alt_lf_level(cpi, filt_low);
-      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
@@ -311,7 +311,7 @@
     // Now look at filt_high
     if ((filt_direction >= 0) && (filt_high != filt_mid)) {
       vp9_set_alt_lf_level(cpi, filt_high);
-      vp9_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high);
+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1);
 
       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);
 
--- a/vp9/encoder/vp9_picklpf.h
+++ b/vp9/encoder/vp9_picklpf.h
@@ -23,4 +23,4 @@
 extern void vp9_pick_filter_level(struct yv12_buffer_config *sd,
                                   struct VP9_COMP *cpi);
 
-#endif  // __INC_PICKLPF_H
+#endif  // VP9_ENCODER_VP9_PICKLPF_H_
--- a/vp9/encoder/vp9_psnr.h
+++ b/vp9/encoder/vp9_psnr.h
@@ -14,4 +14,4 @@
 
 extern double vp9_mse2psnr(double Samples, double Peak, double Mse);
 
-#endif
+#endif  // VP9_ENCODER_VP9_PSNR_H_
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -25,30 +25,30 @@
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  short *zbin_boost_ptr  = b->zrun_zbin_boost;
-  short *coeff_ptr       = b->coeff;
-  short *zbin_ptr        = b->zbin;
-  short *round_ptr       = b->round;
-  short *quant_ptr       = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr      = d->qcoeff;
-  short *dqcoeff_ptr     = d->dqcoeff;
-  short *dequant_ptr     = d->dequant;
-  short zbin_oq_value    = b->zbin_extra;
+  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
+  int16_t *coeff_ptr       = b->coeff;
+  int16_t *zbin_ptr        = b->zbin;
+  int16_t *round_ptr       = b->round;
+  int16_t *quant_ptr       = b->quant;
+  uint8_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr      = d->qcoeff;
+  int16_t *dqcoeff_ptr     = d->dqcoeff;
+  int16_t *dequant_ptr     = d->dequant;
+  int zbin_oq_value        = b->zbin_extra;
 
   int const *pt_scan ;
 
   switch (tx_type) {
-    case ADST_DCT :
-      pt_scan = vp9_row_scan;
+    case ADST_DCT:
+      pt_scan = vp9_row_scan_4x4;
       break;
 
-    case DCT_ADST :
-      pt_scan = vp9_col_scan;
+    case DCT_ADST:
+      pt_scan = vp9_col_scan_4x4;
       break;
 
-    default :
-      pt_scan = vp9_default_zig_zag1d;
+    default:
+      pt_scan = vp9_default_zig_zag1d_4x4;
       break;
   }
 
@@ -89,16 +89,16 @@
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  short *zbin_boost_ptr  = b->zrun_zbin_boost;
-  short *coeff_ptr       = b->coeff;
-  short *zbin_ptr        = b->zbin;
-  short *round_ptr       = b->round;
-  short *quant_ptr       = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr      = d->qcoeff;
-  short *dqcoeff_ptr     = d->dqcoeff;
-  short *dequant_ptr     = d->dequant;
-  short zbin_oq_value    = b->zbin_extra;
+  int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;
+  int16_t *coeff_ptr       = b->coeff;
+  int16_t *zbin_ptr        = b->zbin;
+  int16_t *round_ptr       = b->round;
+  int16_t *quant_ptr       = b->quant;
+  uint8_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr      = d->qcoeff;
+  int16_t *dqcoeff_ptr     = d->dqcoeff;
+  int16_t *dequant_ptr     = d->dequant;
+  int zbin_oq_value        = b->zbin_extra;
 
   vpx_memset(qcoeff_ptr, 0, 32);
   vpx_memset(dqcoeff_ptr, 0, 32);
@@ -106,7 +106,7 @@
   eob = -1;
 
   for (i = 0; i < b->eob_max_offset; i++) {
-    rc   = vp9_default_zig_zag1d[i];
+    rc   = vp9_default_zig_zag1d_4x4[i];
     z    = coeff_ptr[rc];
 
     zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
@@ -174,17 +174,17 @@
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost;
+  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
   int zbin_zrun_index = 0;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
+  int16_t *coeff_ptr  = b->coeff;
+  int16_t *zbin_ptr   = b->zbin;
+  int16_t *round_ptr  = b->round;
+  int16_t *quant_ptr  = b->quant;
+  uint8_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int16_t *dequant_ptr = d->dequant;
+  int zbin_oq_value    = b->zbin_extra;
   // double q2nd = 4;
   vpx_memset(qcoeff_ptr, 0, 32);
   vpx_memset(dqcoeff_ptr, 0, 32);
@@ -192,7 +192,7 @@
   eob = -1;
 
   for (i = 0; i < b->eob_max_offset_8x8; i++) {
-    rc   = vp9_default_zig_zag1d[i];
+    rc   = vp9_default_zig_zag1d_4x4[i];
     z    = coeff_ptr[rc];
 
     zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];
@@ -224,19 +224,19 @@
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin_8x8;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
+  int16_t *zbin_boost_ptr = b->zrun_zbin_boost_8x8;
+  int16_t *coeff_ptr  = b->coeff;
+  int16_t *zbin_ptr   = b->zbin_8x8;
+  int16_t *round_ptr  = b->round;
+  int16_t *quant_ptr  = b->quant;
+  uint8_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int16_t *dequant_ptr = d->dequant;
+  int zbin_oq_value = b->zbin_extra;
 
-  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(short));
-  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(short));
+  vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));
 
   eob = -1;
 
@@ -323,28 +323,25 @@
   vp9_quantize_mbuv_8x8(x);
 }
 
-void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+static void quantize(int16_t *zbin_boost_orig_ptr,
+                     int16_t *coeff_ptr, int n_coeffs, int max_coeffs,
+                     int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
+                     uint8_t *quant_shift_ptr,
+                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     int16_t *dequant_ptr, int zbin_oq_value,
+                     int *eob_ptr, const int *scan, int mul) {
   int i, rc, eob;
   int zbin;
   int x, y, z, sz;
-  short *zbin_boost_ptr = b->zrun_zbin_boost_16x16;
-  short *coeff_ptr  = b->coeff;
-  short *zbin_ptr   = b->zbin_16x16;
-  short *round_ptr  = b->round;
-  short *quant_ptr  = b->quant;
-  unsigned char *quant_shift_ptr = b->quant_shift;
-  short *qcoeff_ptr = d->qcoeff;
-  short *dqcoeff_ptr = d->dqcoeff;
-  short *dequant_ptr = d->dequant;
-  short zbin_oq_value = b->zbin_extra;
+  int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
 
-  vpx_memset(qcoeff_ptr, 0, 256*sizeof(short));
-  vpx_memset(dqcoeff_ptr, 0, 256*sizeof(short));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
 
   eob = -1;
-  for (i = 0; i < b->eob_max_offset_16x16; i++) {
-    rc   = vp9_default_zig_zag1d_16x16[i];
-    z    = coeff_ptr[rc];
+  for (i = 0; i < max_coeffs; i++) {
+    rc   = scan[i];
+    z    = coeff_ptr[rc] * mul;
 
     zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);
     zbin_boost_ptr ++;
@@ -354,22 +351,68 @@
 
     if (x >= zbin) {
       x += (round_ptr[rc!=0]);
-      y  = ((int)(((int)(x * quant_ptr[rc!=0]) >> 16) + x))
+      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
           >> quant_shift_ptr[rc!=0];              // quantize (x)
       x  = (y ^ sz) - sz;                         // get the sign back
       qcoeff_ptr[rc]  = x;                        // write to destination
-      dqcoeff_ptr[rc] = x * dequant_ptr[rc!=0];   // dequantized value
+      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value
 
       if (y) {
         eob = i;                                  // last nonzero coeffs
-        zbin_boost_ptr = b->zrun_zbin_boost_16x16;
+        zbin_boost_ptr = zbin_boost_orig_ptr;
       }
     }
   }
 
-  d->eob = eob + 1;
+  *eob_ptr = eob + 1;
 }
 
+void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+  quantize(b->zrun_zbin_boost_16x16,
+           b->coeff,
+           256, b->eob_max_offset_16x16,
+           b->zbin_16x16, b->round, b->quant, b->quant_shift,
+           d->qcoeff,
+           d->dqcoeff,
+           d->dequant,
+           b->zbin_extra,
+           &d->eob, vp9_default_zig_zag1d_16x16, 1);
+}
+
+void vp9_quantize_sby_32x32(MACROBLOCK *x) {
+  x->e_mbd.block[0].eob = 0;
+  quantize(x->block[0].zrun_zbin_boost_32x32,
+           x->sb_coeff_data.coeff,
+           1024, x->block[0].eob_max_offset_32x32,
+           x->block[0].zbin_32x32,
+           x->block[0].round, x->block[0].quant, x->block[0].quant_shift,
+           x->e_mbd.sb_coeff_data.qcoeff,
+           x->e_mbd.sb_coeff_data.dqcoeff,
+           x->e_mbd.block[0].dequant,
+           x->block[0].zbin_extra,
+           &x->e_mbd.block[0].eob,
+           vp9_default_zig_zag1d_32x32, 2);
+}
+
+void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {
+  int i;
+
+  x->e_mbd.block[16].eob = 0;
+  x->e_mbd.block[20].eob = 0;
+  for (i = 16; i < 24; i += 4)
+    quantize(x->block[i].zrun_zbin_boost_16x16,
+             x->sb_coeff_data.coeff + 1024 + (i - 16) * 64,
+             256, x->block[i].eob_max_offset_16x16,
+             x->block[i].zbin_16x16,
+             x->block[i].round, x->block[0].quant, x->block[i].quant_shift,
+             x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,
+             x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64,
+             x->e_mbd.block[i].dequant,
+             x->block[i].zbin_extra,
+             &x->e_mbd.block[i].eob,
+             vp9_default_zig_zag1d_16x16, 1);
+}
+
 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
  * these two C functions if corresponding optimized routine is not available.
  * NEON optimized version implements currently the fast quantization for pair
@@ -380,8 +423,8 @@
   vp9_regular_quantize_b_4x4(b2, d2);
 }
 
-static void invert_quant(short *quant,
-                         unsigned char *shift, short d) {
+static void invert_quant(int16_t *quant,
+                         uint8_t *shift, int d) {
   unsigned t;
   int l;
   t = d;
@@ -388,7 +431,7 @@
   for (l = 0; t > 1; l++)
     t >>= 1;
   t = 1 + (1 << (16 + l)) / d;
-  *quant = (short)(t - (1 << 16));
+  *quant = (int16_t)(t - (1 << 16));
   *shift = l;
 }
 
@@ -427,6 +470,72 @@
     48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
     48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
   };
+  static const int zbin_boost_32x32[1024] = {
+    0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,
+    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+  };
   int qrounding_factor = 48;
 
 
@@ -454,7 +563,11 @@
     cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_y1_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y1_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+    cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;
+    cpi->zrun_zbin_boost_y1_32x32[Q][0] =
+     ((quant_val * zbin_boost_32x32[0]) + 64) >> 7;
 
 
     quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);
@@ -468,7 +581,8 @@
     cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_y2_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_y2_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
 
     quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
     invert_quant(cpi->UVquant[Q] + 0,
@@ -481,11 +595,12 @@
     cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
     cpi->zrun_zbin_boost_uv_8x8[Q][0] =
       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;
-    cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
+    cpi->zrun_zbin_boost_uv_16x16[Q][0] =
+      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;
 
     // all the 4x4 ac values =;
     for (i = 1; i < 16; i++) {
-      int rc = vp9_default_zig_zag1d[i];
+      int rc = vp9_default_zig_zag1d_4x4[i];
 
       quant_val = vp9_ac_yquant(Q);
       invert_quant(cpi->Y1quant[Q] + rc,
@@ -543,16 +658,28 @@
 
       quant_val = vp9_ac_yquant(Q);
       cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
 
       quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);
       cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+      cpi->zrun_zbin_boost_y2_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
 
       quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);
       cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
-      cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
+      cpi->zrun_zbin_boost_uv_16x16[Q][i] =
+        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;
     }
+    // 32x32 structures. Same comment above applies.
+    for (i = 1; i < 1024; i++) {
+      int rc = vp9_default_zig_zag1d_32x32[i];
+
+      quant_val = vp9_ac_yquant(Q);
+      cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;
+      cpi->zrun_zbin_boost_y1_32x32[Q][i] =
+        ((quant_val * zbin_boost_32x32[i]) + 64) >> 7;
+    }
   }
 }
 
@@ -592,12 +719,14 @@
     x->block[i].zbin = cpi->Y1zbin[QIndex];
     x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];
     x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];
+    x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex];
     x->block[i].round = cpi->Y1round[QIndex];
     x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
     x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];
     x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];
-    x->block[i].zbin_extra = (short)zbin_extra;
+    x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex];
+    x->block[i].zbin_extra = (int16_t)zbin_extra;
 
     // Segment max eob offset feature.
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
@@ -607,10 +736,13 @@
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
       x->block[i].eob_max_offset_16x16 =
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_32x32 =
+      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
     } else {
       x->block[i].eob_max_offset = 16;
       x->block[i].eob_max_offset_8x8 = 64;
       x->block[i].eob_max_offset_16x16 = 256;
+      x->block[i].eob_max_offset_32x32 = 1024;
     }
   }
 
@@ -632,7 +764,7 @@
     x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];
     x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];
 
-    x->block[i].zbin_extra = (short)zbin_extra;
+    x->block[i].zbin_extra = (int16_t)zbin_extra;
 
     // Segment max eob offset feature.
     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {
@@ -640,9 +772,12 @@
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
       x->block[i].eob_max_offset_8x8 =
         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+      x->block[i].eob_max_offset_16x16 =
+      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
     } else {
       x->block[i].eob_max_offset = 16;
       x->block[i].eob_max_offset_8x8 = 64;
+      x->block[i].eob_max_offset_16x16 = 256;
     }
   }
 
@@ -662,7 +797,7 @@
   x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
   x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];
   x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];
-  x->block[24].zbin_extra = (short)zbin_extra;
+  x->block[24].zbin_extra = (int16_t)zbin_extra;
 
   // TBD perhaps not use for Y2
   // Segment max eob offset feature.
@@ -691,7 +826,7 @@
                  cpi->zbin_mode_boost +
                  x->act_zbin_adj)) >> 7;
   for (i = 0; i < 16; i++) {
-    x->block[i].zbin_extra = (short)zbin_extra;
+    x->block[i].zbin_extra = (int16_t)zbin_extra;
   }
 
   // UV
@@ -701,7 +836,7 @@
                  x->act_zbin_adj)) >> 7;
 
   for (i = 16; i < 24; i++) {
-    x->block[i].zbin_extra = (short)zbin_extra;
+    x->block[i].zbin_extra = (int16_t)zbin_extra;
   }
 
   // Y2
@@ -710,7 +845,7 @@
                  cpi->zbin_mode_boost +
                  x->act_zbin_adj)) >> 7;
 
-  x->block[24].zbin_extra = (short)zbin_extra;
+  x->block[24].zbin_extra = (int16_t)zbin_extra;
 }
 
 void vp9_frame_init_quantizer(VP9_COMP *cpi) {
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -78,6 +78,9 @@
 extern prototype_quantize_block(vp9_quantize_quantb_16x16);
 extern prototype_quantize_mb(vp9_quantize_mby_16x16);
 
+void vp9_quantize_sby_32x32(MACROBLOCK *x);
+void vp9_quantize_sbuv_16x16(MACROBLOCK *x);
+
 struct VP9_COMP;
 
 extern void vp9_set_quantizer(struct VP9_COMP *cpi, int Q);
@@ -90,4 +93,4 @@
 
 extern void vp9_init_quantizer(struct VP9_COMP *cpi);
 
-#endif
+#endif  // VP9_ENCODER_VP9_QUANTIZE_H_
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -139,9 +139,7 @@
   vp9_copy(cc->vp9_mode_contexts, cm->fc.vp9_mode_contexts);
 
   vp9_copy(cc->ymode_prob, cm->fc.ymode_prob);
-#if CONFIG_SUPERBLOCKS
   vp9_copy(cc->sb_ymode_prob, cm->fc.sb_ymode_prob);
-#endif
   vp9_copy(cc->bmode_prob, cm->fc.bmode_prob);
   vp9_copy(cc->uv_mode_prob, cm->fc.uv_mode_prob);
   vp9_copy(cc->i8x8_mode_prob, cm->fc.i8x8_mode_prob);
@@ -169,12 +167,13 @@
   vp9_copy(cc->last_ref_lf_deltas, xd->last_ref_lf_deltas);
   vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);
 
-  vp9_copy(cc->coef_probs, cm->fc.coef_probs);
-  vp9_copy(cc->hybrid_coef_probs, cm->fc.hybrid_coef_probs);
+  vp9_copy(cc->coef_probs_4x4, cm->fc.coef_probs_4x4);
+  vp9_copy(cc->hybrid_coef_probs_4x4, cm->fc.hybrid_coef_probs_4x4);
   vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);
   vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);
   vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);
   vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);
+  vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);
   vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);
 #if CONFIG_COMP_INTERINTRA_PRED
   cc->interintra_prob = cm->fc.interintra_prob;
@@ -197,9 +196,7 @@
   vp9_copy(cm->fc.vp9_mode_contexts, cc->vp9_mode_contexts);
 
   vp9_copy(cm->fc.ymode_prob, cc->ymode_prob);
-#if CONFIG_SUPERBLOCKS
   vp9_copy(cm->fc.sb_ymode_prob, cc->sb_ymode_prob);
-#endif
   vp9_copy(cm->fc.bmode_prob, cc->bmode_prob);
   vp9_copy(cm->fc.i8x8_mode_prob, cc->i8x8_mode_prob);
   vp9_copy(cm->fc.uv_mode_prob, cc->uv_mode_prob);
@@ -228,12 +225,13 @@
   vp9_copy(xd->last_ref_lf_deltas, cc->last_ref_lf_deltas);
   vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);
 
-  vp9_copy(cm->fc.coef_probs, cc->coef_probs);
-  vp9_copy(cm->fc.hybrid_coef_probs, cc->hybrid_coef_probs);
+  vp9_copy(cm->fc.coef_probs_4x4, cc->coef_probs_4x4);
+  vp9_copy(cm->fc.hybrid_coef_probs_4x4, cc->hybrid_coef_probs_4x4);
   vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);
   vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);
   vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);
   vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);
+  vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);
   vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);
 #if CONFIG_COMP_INTERINTRA_PRED
   cm->fc.interintra_prob = cc->interintra_prob;
@@ -274,6 +272,16 @@
 
   vp9_update_mode_info_border(cm, cm->mip);
   vp9_update_mode_info_in_image(cm, cm->mi);
+
+#if CONFIG_NEW_MVREF
+  if (1) {
+    MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+    // Defaults probabilities for encoding the MV ref id signal
+    vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,
+               sizeof(xd->mb_mv_ref_probs));
+  }
+#endif
 }
 
 void vp9_setup_inter_frame(VP9_COMP *cpi) {
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -35,4 +35,4 @@
 extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);
 void vp9_setup_inter_frame(VP9_COMP *cpi);
 
-#endif
+#endif  // VP9_ENCODER_VP9_RATECTRL_H_
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -41,6 +41,7 @@
 #include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
 #include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_common.h"
 
 #define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
 
@@ -69,98 +70,7 @@
   105
 };
 
-#if CONFIG_PRED_FILTER
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {ZEROMV,    LAST_FRAME,   NONE,  0},
-  {ZEROMV,    LAST_FRAME,   NONE,  1},
-  {DC_PRED,   INTRA_FRAME,  NONE,  0},
-
-  {NEARESTMV, LAST_FRAME,   NONE,  0},
-  {NEARESTMV, LAST_FRAME,   NONE,  1},
-  {NEARMV,    LAST_FRAME,   NONE,  0},
-  {NEARMV,    LAST_FRAME,   NONE,  1},
-
-  {ZEROMV,    GOLDEN_FRAME, NONE,  0},
-  {ZEROMV,    GOLDEN_FRAME, NONE,  1},
-  {NEARESTMV, GOLDEN_FRAME, NONE,  0},
-  {NEARESTMV, GOLDEN_FRAME, NONE,  1},
-
-  {ZEROMV,    ALTREF_FRAME, NONE,  0},
-  {ZEROMV,    ALTREF_FRAME, NONE,  1},
-  {NEARESTMV, ALTREF_FRAME, NONE,  0},
-  {NEARESTMV, ALTREF_FRAME, NONE,  1},
-
-  {NEARMV,    GOLDEN_FRAME, NONE,  0},
-  {NEARMV,    GOLDEN_FRAME, NONE,  1},
-  {NEARMV,    ALTREF_FRAME, NONE,  0},
-  {NEARMV,    ALTREF_FRAME, NONE,  1},
-
-  {V_PRED,    INTRA_FRAME,  NONE,  0},
-  {H_PRED,    INTRA_FRAME,  NONE,  0},
-  {D45_PRED,  INTRA_FRAME,  NONE,  0},
-  {D135_PRED, INTRA_FRAME,  NONE,  0},
-  {D117_PRED, INTRA_FRAME,  NONE,  0},
-  {D153_PRED, INTRA_FRAME,  NONE,  0},
-  {D27_PRED,  INTRA_FRAME,  NONE,  0},
-  {D63_PRED,  INTRA_FRAME,  NONE,  0},
-
-  {TM_PRED,   INTRA_FRAME,  NONE,  0},
-
-  {NEWMV,     LAST_FRAME,   NONE,  0},
-  {NEWMV,     LAST_FRAME,   NONE,  1},
-  {NEWMV,     GOLDEN_FRAME, NONE,  0},
-  {NEWMV,     GOLDEN_FRAME, NONE,  1},
-  {NEWMV,     ALTREF_FRAME, NONE,  0},
-  {NEWMV,     ALTREF_FRAME, NONE,  1},
-
-  {SPLITMV,   LAST_FRAME,   NONE,  0},
-  {SPLITMV,   GOLDEN_FRAME, NONE,  0},
-  {SPLITMV,   ALTREF_FRAME, NONE,  0},
-
-  {B_PRED,    INTRA_FRAME,  NONE,  0},
-  {I8X8_PRED, INTRA_FRAME,  NONE,  0},
-
-  /* compound prediction modes */
-  {ZEROMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEARESTMV, LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEARMV,    LAST_FRAME,   GOLDEN_FRAME, 0},
-
-  {ZEROMV,    ALTREF_FRAME, LAST_FRAME,   0},
-  {NEARESTMV, ALTREF_FRAME, LAST_FRAME,   0},
-  {NEARMV,    ALTREF_FRAME, LAST_FRAME,   0},
-
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME, 0},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME, 0},
-
-  {NEWMV,     LAST_FRAME,   GOLDEN_FRAME, 0},
-  {NEWMV,     ALTREF_FRAME, LAST_FRAME,   0},
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME, 0},
-
-  {SPLITMV,   LAST_FRAME,   GOLDEN_FRAME, 0},
-  {SPLITMV,   ALTREF_FRAME, LAST_FRAME,   0},
-  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME, 0},
-
-#if CONFIG_COMP_INTERINTRA_PRED
-  /* compound inter-intra prediction */
-  {ZEROMV,    LAST_FRAME,   INTRA_FRAME, 0},
-  {NEARESTMV, LAST_FRAME,   INTRA_FRAME, 0},
-  {NEARMV,    LAST_FRAME,   INTRA_FRAME, 0},
-  {NEWMV,     LAST_FRAME,   INTRA_FRAME, 0},
-
-  {ZEROMV,    GOLDEN_FRAME,   INTRA_FRAME, 0},
-  {NEARESTMV, GOLDEN_FRAME,   INTRA_FRAME, 0},
-  {NEARMV,    GOLDEN_FRAME,   INTRA_FRAME, 0},
-  {NEWMV,     GOLDEN_FRAME,   INTRA_FRAME, 0},
-
-  {ZEROMV,    ALTREF_FRAME,   INTRA_FRAME, 0},
-  {NEARESTMV, ALTREF_FRAME,   INTRA_FRAME, 0},
-  {NEARMV,    ALTREF_FRAME,   INTRA_FRAME, 0},
-  {NEWMV,     ALTREF_FRAME,   INTRA_FRAME, 0},
-#endif
-};
-#else
-const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
   {ZEROMV,    LAST_FRAME,   NONE},
   {DC_PRED,   INTRA_FRAME,  NONE},
 
@@ -237,12 +147,10 @@
   {NEWMV,     ALTREF_FRAME,   INTRA_FRAME},
 #endif
 };
-#endif
 
-static void fill_token_costs(
-  unsigned int (*c)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS],
-  const vp9_prob(*p)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES],
-  int block_type_counts) {
+static void fill_token_costs(vp9_coeff_count *c,
+                             vp9_coeff_probs *p,
+                             int block_type_counts) {
   int i, j, k;
 
   for (i = 0; i < block_type_counts; i++)
@@ -370,42 +278,29 @@
     }
   }
 
-  fill_token_costs(
-    cpi->mb.token_costs[TX_4X4],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs,
-    BLOCK_TYPES);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_4X4],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs,
-    BLOCK_TYPES);
+  fill_token_costs(cpi->mb.token_costs[TX_4X4],
+                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES_4X4);
+  fill_token_costs(cpi->mb.hybrid_token_costs[TX_4X4],
+                   cpi->common.fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);
 
-  fill_token_costs(
-    cpi->mb.token_costs[TX_8X8],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,
-    BLOCK_TYPES_8X8);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_8X8],
-    (const vp9_prob( *)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs_8x8,
-    BLOCK_TYPES_8X8);
+  fill_token_costs(cpi->mb.token_costs[TX_8X8],
+                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES_8X8);
+  fill_token_costs(cpi->mb.hybrid_token_costs[TX_8X8],
+                   cpi->common.fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);
 
-  fill_token_costs(
-    cpi->mb.token_costs[TX_16X16],
-    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,
-    BLOCK_TYPES_16X16);
-  fill_token_costs(
-    cpi->mb.hybrid_token_costs[TX_16X16],
-    (const vp9_prob(*)[8][PREV_COEF_CONTEXTS][11])
-    cpi->common.fc.hybrid_coef_probs_16x16,
-    BLOCK_TYPES_16X16);
+  fill_token_costs(cpi->mb.token_costs[TX_16X16],
+                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES_16X16);
+  fill_token_costs(cpi->mb.hybrid_token_costs[TX_16X16],
+                   cpi->common.fc.hybrid_coef_probs_16x16, BLOCK_TYPES_16X16);
 
+  fill_token_costs(cpi->mb.token_costs[TX_32X32],
+                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES_32X32);
+
   /*rough estimate for costing*/
   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;
   vp9_init_mode_costs(cpi);
 
-  if (cpi->common.frame_type != KEY_FRAME)
-  {
+  if (cpi->common.frame_type != KEY_FRAME) {
     vp9_build_nmv_cost_table(
         cpi->mb.nmvjointcost,
         cpi->mb.e_mbd.allow_high_precision_mv ?
@@ -415,7 +310,7 @@
   }
 }
 
-int vp9_block_error_c(short *coeff, short *dqcoeff, int block_size) {
+int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
   int i, error = 0;
 
   for (i = 0; i < block_size; i++) {
@@ -481,9 +376,9 @@
 }
 
 int vp9_uvsse(MACROBLOCK *x) {
-  unsigned char *uptr, *vptr;
-  unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
-  unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+  uint8_t *uptr, *vptr;
+  uint8_t *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+  uint8_t *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
   int uv_stride = x->block[16].src_stride;
 
   unsigned int sse1 = 0;
@@ -525,117 +420,105 @@
 
 }
 
-static int cost_coeffs_2x2(MACROBLOCK *mb,
-                           BLOCKD *b, PLANE_TYPE type,
-                           ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
-  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
-  int eob = b->eob;
-  int pt;    /* surrounding block/prev coef predictor */
-  int cost = 0;
-  short *qcoeff_ptr = b->qcoeff;
-
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
-  assert(eob <= 4);
-
-  for (; c < eob; c++) {
-    int v = qcoeff_ptr[vp9_default_zig_zag1d[c]];
-    int t = vp9_dct_value_tokens_ptr[v].Token;
-    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]][pt][t];
-    cost += vp9_dct_value_cost_ptr[v];
-    pt = vp9_prev_token_class[t];
-  }
-
-  if (c < 4)
-    cost += mb->token_costs[TX_8X8][type][vp9_coef_bands[c]]
-            [pt] [DCT_EOB_TOKEN];
-  // is eob first coefficient;
-  pt = (c > !type);
-  *a = *l = pt;
-  return cost;
-}
-
-static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       int tx_size) {
+#if CONFIG_NEWCOEFCONTEXT
+#define PT pn
+#else
+#define PT pt
+#endif
+static int cost_coeffs(MACROBLOCK *mb,
+                       BLOCKD *b, PLANE_TYPE type,
+                       ENTROPY_CONTEXT *a,
+                       ENTROPY_CONTEXT *l,
+                       TX_SIZE tx_size) {
+  int pt;
   const int eob = b->eob;
-  int c = (type == PLANE_TYPE_Y_NO_DC); /* start at coef 0, unless Y with Y2 */
-  int cost = 0, default_eob, seg_eob;
-  int pt;                     /* surrounding block/prev coef predictor */
-  int const *scan, *band;
-  short *qcoeff_ptr = b->qcoeff;
   MACROBLOCKD *xd = &mb->e_mbd;
-  MB_MODE_INFO *mbmi = &mb->e_mbd.mode_info_context->mbmi;
-  TX_TYPE tx_type = DCT_DCT;
-  int segment_id = mbmi->segment_id;
-  scan = vp9_default_zig_zag1d;
-  band = vp9_coef_bands;
-  default_eob = 16;
+  const int ib = (int)(b - xd->block);
+  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+  int cost = 0, seg_eob;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int *scan, *band;
+  int16_t *qcoeff_ptr = b->qcoeff;
+  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
+                          get_tx_type(xd, b) : DCT_DCT;
+#if CONFIG_NEWCOEFCONTEXT
+  const int *neighbors;
+  int pn;
+#endif
 
+  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+
   switch (tx_size) {
     case TX_4X4:
+      scan = vp9_default_zig_zag1d_4x4;
+      band = vp9_coef_bands_4x4;
+      seg_eob = 16;
       if (type == PLANE_TYPE_Y_WITH_DC) {
-        tx_type = get_tx_type_4x4(xd, b);
-        if (tx_type != DCT_DCT) {
-          switch (tx_type) {
-            case ADST_DCT:
-              scan = vp9_row_scan;
-              break;
-
-            case DCT_ADST:
-              scan = vp9_col_scan;
-              break;
-
-            default:
-              scan = vp9_default_zig_zag1d;
-              break;
-          }
+        if (tx_type == ADST_DCT) {
+          scan = vp9_row_scan_4x4;
+        } else if (tx_type == DCT_ADST) {
+          scan = vp9_col_scan_4x4;
         }
       }
-
       break;
     case TX_8X8:
-      scan = vp9_default_zig_zag1d_8x8;
-      band = vp9_coef_bands_8x8;
-      default_eob = 64;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        BLOCKD *bb;
-        int ib = (int)(b - xd->block);
-        if (ib < 16) {
-          ib = (ib & 8) + ((ib & 4) >> 1);
-          bb = xd->block + ib;
-          tx_type = get_tx_type_8x8(xd, bb);
-        }
+      if (type == PLANE_TYPE_Y2) {
+        scan = vp9_default_zig_zag1d_4x4;
+        band = vp9_coef_bands_4x4;
+        seg_eob = 4;
+      } else {
+        scan = vp9_default_zig_zag1d_8x8;
+        band = vp9_coef_bands_8x8;
+        seg_eob = 64;
       }
       break;
     case TX_16X16:
       scan = vp9_default_zig_zag1d_16x16;
       band = vp9_coef_bands_16x16;
-      default_eob = 256;
-      if (type == PLANE_TYPE_Y_WITH_DC) {
-        tx_type = get_tx_type_16x16(xd, b);
+      seg_eob = 256;
+      if (type == PLANE_TYPE_UV) {
+        const int uv_idx = ib - 16;
+        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx;
       }
       break;
+    case TX_32X32:
+      scan = vp9_default_zig_zag1d_32x32;
+      band = vp9_coef_bands_32x32;
+      seg_eob = 1024;
+      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+      break;
     default:
+      abort();
       break;
   }
-  if (vp9_segfeature_active(&mb->e_mbd, segment_id, SEG_LVL_EOB))
-    seg_eob = vp9_get_segdata(&mb->e_mbd, segment_id, SEG_LVL_EOB);
-  else
-    seg_eob = default_eob;
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
+#if CONFIG_NEWCOEFCONTEXT
+  neighbors = vp9_get_coef_neighbors_handle(scan);
+  pn = pt;
+#endif
 
+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
+    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
+
   if (tx_type != DCT_DCT) {
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
       int t = vp9_dct_value_tokens_ptr[v].Token;
-      cost += mb->hybrid_token_costs[tx_size][type][band[c]][pt][t];
+      cost += mb->hybrid_token_costs[tx_size][type][band[c]][PT][t];
       cost += vp9_dct_value_cost_ptr[v];
       pt = vp9_prev_token_class[t];
+#if CONFIG_NEWCOEFCONTEXT
+      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
+        pn = vp9_get_coef_neighbor_context(
+            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
+      else
+        pn = pt;
+#endif
     }
     if (c < seg_eob)
       cost += mb->hybrid_token_costs[tx_size][type][band[c]]
-          [pt][DCT_EOB_TOKEN];
+          [PT][DCT_EOB_TOKEN];
   } else {
     for (; c < eob; c++) {
       int v = qcoeff_ptr[scan[c]];
@@ -643,10 +526,17 @@
       cost += mb->token_costs[tx_size][type][band[c]][pt][t];
       cost += vp9_dct_value_cost_ptr[v];
       pt = vp9_prev_token_class[t];
+#if CONFIG_NEWCOEFCONTEXT
+      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))
+        pn = vp9_get_coef_neighbor_context(
+            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
+      else
+        pn = pt;
+#endif
     }
     if (c < seg_eob)
       cost += mb->token_costs[tx_size][type][band[c]]
-          [pt][DCT_EOB_TOKEN];
+          [PT][DCT_EOB_TOKEN];
   }
 
   // is eob first coefficient;
@@ -678,12 +568,14 @@
     cost += cost_coeffs(mb, xd->block + b,
                         (has_2nd_order ?
                          PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),
-                        ta + vp9_block2above[b], tl + vp9_block2left[b],
+                        ta + vp9_block2above[TX_4X4][b],
+                        tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
 
   if (has_2nd_order)
     cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
-                        ta + vp9_block2above[24], tl + vp9_block2left[24],
+                        ta + vp9_block2above[TX_4X4][24],
+                        tl + vp9_block2left[TX_4X4][24],
                         TX_4X4);
 
   return cost;
@@ -736,12 +628,15 @@
     cost += cost_coeffs(mb, xd->block + b,
                         (has_2nd_order ?
                          PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),
-                        ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
+                        ta + vp9_block2above[TX_8X8][b],
+                        tl + vp9_block2left[TX_8X8][b],
                         TX_8X8);
 
   if (has_2nd_order)
-    cost += cost_coeffs_2x2(mb, xd->block + 24, PLANE_TYPE_Y2,
-                            ta + vp9_block2above[24], tl + vp9_block2left[24]);
+    cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,
+                            ta + vp9_block2above[TX_8X8][24],
+                            tl + vp9_block2left[TX_8X8][24],
+                            TX_8X8);
   return cost;
 }
 
@@ -813,23 +708,28 @@
 }
 
 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
-                                     int r[2][TX_SIZE_MAX], int *rate,
-                                     int d[TX_SIZE_MAX], int *distortion,
-                                     int s[TX_SIZE_MAX], int *skip,
-                                     int64_t txfm_cache[NB_TXFM_MODES]) {
+                                     int (*r)[2], int *rate,
+                                     int *d, int *distortion,
+                                     int *s, int *skip,
+                                     int64_t txfm_cache[NB_TXFM_MODES],
+                                     TX_SIZE max_txfm_size) {
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   vp9_prob skip_prob = cm->mb_no_coeff_skip ?
                        vp9_get_pred_prob(cm, xd, PRED_MBSKIP) : 128;
-  int64_t rd[2][TX_SIZE_MAX];
-  int n;
+  int64_t rd[TX_SIZE_MAX_SB][2];
+  int n, m;
 
-  r[1][TX_16X16] = r[0][TX_16X16] + vp9_cost_one(cm->prob_tx[0]) +
-                   vp9_cost_one(cm->prob_tx[1]);
-  r[1][TX_8X8]   = r[0][TX_8X8] + vp9_cost_one(cm->prob_tx[0]) +
-                   vp9_cost_zero(cm->prob_tx[1]);
-  r[1][TX_4X4]   = r[0][TX_4X4] + vp9_cost_zero(cm->prob_tx[0]);
+  for (n = TX_4X4; n <= max_txfm_size; n++) {
+    r[n][1] = r[n][0];
+    for (m = 0; m <= n - (n == max_txfm_size); m++) {
+      if (m == n)
+        r[n][1] += vp9_cost_zero(cm->prob_tx[m]);
+      else
+        r[n][1] += vp9_cost_one(cm->prob_tx[m]);
+    }
+  }
 
   if (cm->mb_no_coeff_skip) {
     int s0, s1;
@@ -838,46 +738,58 @@
     s0 = vp9_cost_bit(skip_prob, 0);
     s1 = vp9_cost_bit(skip_prob, 1);
 
-    for (n = TX_4X4; n <= TX_16X16; n++) {
+    for (n = TX_4X4; n <= max_txfm_size; n++) {
       if (s[n]) {
-        rd[0][n] = rd[1][n] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+        rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
       } else {
-        rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n] + s0, d[n]);
-        rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n] + s0, d[n]);
+        rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
+        rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
       }
     }
   } else {
-    for (n = TX_4X4; n <= TX_16X16; n++) {
-      rd[0][n] = RDCOST(x->rdmult, x->rddiv, r[0][n], d[n]);
-      rd[1][n] = RDCOST(x->rdmult, x->rddiv, r[1][n], d[n]);
+    for (n = TX_4X4; n <= max_txfm_size; n++) {
+      rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0], d[n]);
+      rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1], d[n]);
     }
   }
 
-  if ( cm->txfm_mode == ALLOW_16X16 ||
-      (cm->txfm_mode == TX_MODE_SELECT &&
-       rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])) {
+  if (max_txfm_size == TX_32X32 &&
+      (cm->txfm_mode == ALLOW_32X32 ||
+       (cm->txfm_mode == TX_MODE_SELECT &&
+        rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
+        rd[TX_32X32][1] < rd[TX_4X4][1]))) {
+    mbmi->txfm_size = TX_32X32;
+  } else if ( cm->txfm_mode == ALLOW_16X16 ||
+             (max_txfm_size == TX_16X16 && cm->txfm_mode == ALLOW_32X32) ||
+             (cm->txfm_mode == TX_MODE_SELECT &&
+              rd[TX_16X16][1] < rd[TX_8X8][1] &&
+              rd[TX_16X16][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_16X16;
   } else if (cm->txfm_mode == ALLOW_8X8 ||
-           (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_8X8] < rd[1][TX_4X4])) {
+           (cm->txfm_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
     mbmi->txfm_size = TX_8X8;
   } else {
-    assert(cm->txfm_mode == ONLY_4X4 ||
-          (cm->txfm_mode == TX_MODE_SELECT && rd[1][TX_4X4] <= rd[1][TX_8X8]));
+    assert(cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT);
     mbmi->txfm_size = TX_4X4;
   }
 
   *distortion = d[mbmi->txfm_size];
-  *rate       = r[cm->txfm_mode == TX_MODE_SELECT][mbmi->txfm_size];
+  *rate       = r[mbmi->txfm_size][cm->txfm_mode == TX_MODE_SELECT];
   *skip       = s[mbmi->txfm_size];
 
-  txfm_cache[ONLY_4X4] = rd[0][TX_4X4];
-  txfm_cache[ALLOW_8X8] = rd[0][TX_8X8];
-  txfm_cache[ALLOW_16X16] = rd[0][TX_16X16];
-  if (rd[1][TX_16X16] < rd[1][TX_8X8] && rd[1][TX_16X16] < rd[1][TX_4X4])
-    txfm_cache[TX_MODE_SELECT] = rd[1][TX_16X16];
+  txfm_cache[ONLY_4X4] = rd[TX_4X4][0];
+  txfm_cache[ALLOW_8X8] = rd[TX_8X8][0];
+  txfm_cache[ALLOW_16X16] = rd[TX_16X16][0];
+  txfm_cache[ALLOW_32X32] = rd[max_txfm_size][0];
+  if (max_txfm_size == TX_32X32 &&
+      rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
+      rd[TX_32X32][1] < rd[TX_4X4][1])
+    txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
+  else if (rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
+    txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
   else
-    txfm_cache[TX_MODE_SELECT] = rd[1][TX_4X4] < rd[1][TX_8X8] ?
-                                 rd[1][TX_4X4] : rd[1][TX_8X8];
+    txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
+                                 rd[TX_4X4][1] : rd[TX_8X8][1];
 }
 
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
@@ -884,21 +796,20 @@
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX];
+  int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];
 
   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,
                    x->block[0].src_stride);
 
-  macro_block_yrd_16x16(x, &r[0][TX_16X16], &d[TX_16X16],
-                        &s[TX_16X16], 1);
-  macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8], &s[TX_8X8], 1);
-  macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4], &s[TX_4X4], 1);
+  macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);
+  macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);
+  macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
-                           txfm_cache);
+                           txfm_cache, TX_16X16);
 }
 
-static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {
+static void copy_predictor(uint8_t *dst, const uint8_t *predictor) {
   const unsigned int *p = (const unsigned int *)predictor;
   unsigned int *d = (unsigned int *)dst;
   d[0] = p[0];
@@ -907,26 +818,96 @@
   d[12] = p[12];
 }
 
-#if CONFIG_SUPERBLOCKS
+static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {
+  MACROBLOCKD * const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  if (backup) {
+    ta = (ENTROPY_CONTEXT *) &t_above,
+    tl = (ENTROPY_CONTEXT *) &t_left;
+
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));
+  } else {
+    ta = (ENTROPY_CONTEXT *) xd->above_context;
+    tl = (ENTROPY_CONTEXT *) xd->left_context;
+  }
+
+  return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);
+}
+
+static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+                                int block_size) {
+  int i;
+  int64_t error = 0;
+
+  for (i = 0; i < block_size; i++) {
+    unsigned int this_diff = coeff[i] - dqcoeff[i];
+    error += this_diff * this_diff;
+  }
+
+  return error > INT_MAX ? INT_MAX : error;
+}
+
+#define DEBUG_ERROR 0
+static void super_block_yrd_32x32(MACROBLOCK *x,
+                                  int *rate, int *distortion, int *skippable,
+                                  int backup) {
+  SUPERBLOCK  * const x_sb = &x->sb_coeff_data;
+  MACROBLOCKD * const xd = &x->e_mbd;
+  SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;
+#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
+  int16_t out[1024];
+#endif
+
+  vp9_transform_sby_32x32(x);
+  vp9_quantize_sby_32x32(x);
+#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID
+  vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);
+#endif
+
+#if !CONFIG_DWTDCTHYBRID
+  *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);
+#else
+  *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;
+#endif
+#if DEBUG_ERROR
+  printf("IDCT/FDCT error 32x32: %d (d: %d)\n",
+         vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);
+#endif
+  *rate       = rdcost_sby_32x32(x, backup);
+  *skippable  = vp9_sby_is_skippable_32x32(&x->e_mbd);
+}
+
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion,
                             int *skip,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX], n;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
-  ENTROPY_CONTEXT_PLANES t_above[3][2], *orig_above = xd->above_context;
-  ENTROPY_CONTEXT_PLANES t_left[3][2], *orig_left = xd->left_context;
+  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2],
+                        *orig_above = xd->above_context;
+  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2],
+                        *orig_left = xd->left_context;
 
-  for (n = TX_4X4; n <= TX_16X16; n++) {
+  for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) {
     vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
     vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
-    r[0][n] = 0;
+    r[n][0] = 0;
     d[n] = 0;
     s[n] = 1;
   }
 
+  vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,
+                       dst, dst_y_stride);
+  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);
+
+#if DEBUG_ERROR
+  int err[3] = { 0, 0, 0 };
+#endif
   for (n = 0; n < 4; n++) {
     int x_idx = n & 1, y_idx = n >> 1;
     int r_tmp, d_tmp, s_tmp;
@@ -941,32 +922,144 @@
     xd->left_context = &t_left[TX_16X16][y_idx];
     macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_16X16] += d_tmp;
-    r[0][TX_16X16] += r_tmp;
+    r[TX_16X16][0] += r_tmp;
     s[TX_16X16] = s[TX_16X16] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_16x16(xd);
+    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
 
     xd->above_context = &t_above[TX_4X4][x_idx];
     xd->left_context = &t_left[TX_4X4][y_idx];
     macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_4X4] += d_tmp;
-    r[0][TX_4X4] += r_tmp;
+    r[TX_4X4][0] += r_tmp;
     s[TX_4X4] = s[TX_4X4] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_4x4(xd);
+    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
 
     xd->above_context = &t_above[TX_8X8][x_idx];
     xd->left_context = &t_left[TX_8X8][y_idx];
     macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_8X8] += d_tmp;
-    r[0][TX_8X8] += r_tmp;
+    r[TX_8X8][0] += r_tmp;
     s[TX_8X8] = s[TX_8X8] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_8x8(xd);
+    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
   }
+#if DEBUG_ERROR
+  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
+  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
+  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
+#endif
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
+                           TX_SIZE_MAX_SB - 1);
 
-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache);
-
   xd->above_context = orig_above;
   xd->left_context = orig_left;
 }
+
+static void super_block_64_yrd(VP9_COMP *cpi,
+                               MACROBLOCK *x, int *rate, int *distortion,
+                               int *skip,
+                               int64_t txfm_cache[NB_TXFM_MODES]) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;
+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
+  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_SB][4],
+                        *orig_above = xd->above_context;
+  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_SB][4],
+                        *orig_left = xd->left_context;
+
+  for (n = TX_4X4; n < TX_SIZE_MAX_SB; n++) {
+    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));
+    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));
+    r[n][0] = 0;
+    d[n] = 0;
+    s[n] = 1;
+  }
+
+  for (n = 0; n < 4; n++) {
+    int x_idx = n & 1, y_idx = n >> 1;
+    int r_tmp, d_tmp, s_tmp;
+
+    xd->above_context = &t_above[TX_32X32][x_idx << 1];
+    xd->left_context = &t_left[TX_32X32][y_idx << 1];
+    vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,
+                         src + 32 * x_idx + 32 * y_idx * src_y_stride,
+                         src_y_stride,
+                         dst + 32 * x_idx + 32 * y_idx * dst_y_stride,
+                         dst_y_stride);
+    super_block_yrd_32x32(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    r[TX_32X32][0] += r_tmp;
+    d[TX_32X32] += d_tmp;
+    s[TX_32X32] = s[TX_32X32] && s_tmp;
+  }
+
+#if DEBUG_ERROR
+  int err[3] = { 0, 0, 0 };
 #endif
+  for (n = 0; n < 16; n++) {
+    int x_idx = n & 3, y_idx = n >> 2;
+    int r_tmp, d_tmp, s_tmp;
 
-static void copy_predictor_8x8(unsigned char *dst, const unsigned char *predictor) {
+    vp9_subtract_mby_s_c(x->src_diff,
+                         src + x_idx * 16 + y_idx * 16 * src_y_stride,
+                         src_y_stride,
+                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,
+                         dst_y_stride);
+
+    xd->above_context = &t_above[TX_16X16][x_idx];
+    xd->left_context = &t_left[TX_16X16][y_idx];
+    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    d[TX_16X16] += d_tmp;
+    r[TX_16X16][0] += r_tmp;
+    s[TX_16X16] = s[TX_16X16] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_16x16(xd);
+    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
+
+    xd->above_context = &t_above[TX_4X4][x_idx];
+    xd->left_context = &t_left[TX_4X4][y_idx];
+    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    d[TX_4X4] += d_tmp;
+    r[TX_4X4][0] += r_tmp;
+    s[TX_4X4] = s[TX_4X4] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_4x4(xd);
+    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
+
+    xd->above_context = &t_above[TX_8X8][x_idx];
+    xd->left_context = &t_left[TX_8X8][y_idx];
+    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
+    d[TX_8X8] += d_tmp;
+    r[TX_8X8][0] += r_tmp;
+    s[TX_8X8] = s[TX_8X8] && s_tmp;
+#if DEBUG_ERROR
+    vp9_inverse_transform_mby_8x8(xd);
+    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);
+#endif
+  }
+#if DEBUG_ERROR
+  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);
+  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);
+  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);
+#endif
+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,
+                           TX_SIZE_MAX_SB - 1);
+
+  xd->above_context = orig_above;
+  xd->left_context = orig_left;
+}
+
+static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {
   const unsigned int *p = (const unsigned int *)predictor;
   unsigned int *d = (unsigned int *)dst;
   d[0] = p[0];
@@ -989,10 +1082,6 @@
 
 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be,
                                      BLOCKD *b, B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                                     B_PREDICTION_MODE *best_second_mode,
-                                     int allow_comp,
-#endif
                                      int *bmode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
@@ -999,10 +1088,6 @@
                                      int *bestdistortion) {
   B_PREDICTION_MODE mode;
   MACROBLOCKD *xd = &x->e_mbd;
-
-#if CONFIG_COMP_INTRA_PRED
-  B_PREDICTION_MODE mode2;
-#endif
   int64_t best_rd = INT64_MAX;
   int rate = 0;
   int distortion;
@@ -1016,107 +1101,70 @@
    * a temp buffer that meets the stride requirements, but we are only
    * interested in the left 4x4 block
    * */
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 4);
-  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, best_predictor, 16 * 4);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);
 
 #if CONFIG_NEWBINTRAMODES
   b->bmi.as_mode.context = vp9_find_bpred_context(b);
 #endif
   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = (allow_comp ? 0 : (B_DC_PRED - 1));
-                   mode2 != (allow_comp ? (mode + 1) : 0); mode2++) {
-#endif
-      int64_t this_rd;
-      int ratey;
+    int64_t this_rd;
+    int ratey;
 
 #if CONFIG_NEWBINTRAMODES
-      if (xd->frame_type == KEY_FRAME) {
-        if (mode == B_CONTEXT_PRED) continue;
-#if CONFIG_COMP_INTRA_PRED
-        if (mode2 == B_CONTEXT_PRED) continue;
+    if (xd->frame_type == KEY_FRAME) {
+      if (mode == B_CONTEXT_PRED) continue;
+    } else {
+      if (mode >= B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS &&
+          mode < B_CONTEXT_PRED)
+        continue;
+    }
 #endif
-      } else {
-        if (mode >= B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS &&
-            mode < B_CONTEXT_PRED)
-          continue;
-#if CONFIG_COMP_INTRA_PRED
-        if (mode2 >= B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS &&
-            mode2 < B_CONTEXT_PRED)
-          continue;
-#endif
-      }
-#endif
 
-      b->bmi.as_mode.first = mode;
+    b->bmi.as_mode.first = mode;
 #if CONFIG_NEWBINTRAMODES
-      rate = bmode_costs[
-          mode == B_CONTEXT_PRED ? mode - CONTEXT_PRED_REPLACEMENTS : mode];
+    rate = bmode_costs[
+        mode == B_CONTEXT_PRED ? mode - CONTEXT_PRED_REPLACEMENTS : mode];
 #else
-      rate = bmode_costs[mode];
+    rate = bmode_costs[mode];
 #endif
 
-#if CONFIG_COMP_INTRA_PRED
-      if (mode2 == (B_PREDICTION_MODE)(B_DC_PRED - 1)) {
-#endif
-        vp9_intra4x4_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        vp9_comp_intra4x4_predict(b, mode, mode2, b->predictor);
-#if CONFIG_NEWBINTRAMODES
-        rate += bmode_costs[
-            mode2 == B_CONTEXT_PRED ?
-            mode2 - CONTEXT_PRED_REPLACEMENTS : mode2];
-#else
-        rate += bmode_costs[mode2];
-#endif
-      }
-#endif
-      vp9_subtract_b(be, b, 16);
+    vp9_intra4x4_predict(b, mode, b->predictor);
+    vp9_subtract_b(be, b, 16);
 
-      b->bmi.as_mode.first = mode;
-      tx_type = get_tx_type_4x4(xd, b);
-      if (tx_type != DCT_DCT) {
-        vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
-        vp9_ht_quantize_b_4x4(be, b, tx_type);
-      } else {
-        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-        x->quantize_b_4x4(be, b);
-      }
+    b->bmi.as_mode.first = mode;
+    tx_type = get_tx_type_4x4(xd, b);
+    if (tx_type != DCT_DCT) {
+      vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);
+      vp9_ht_quantize_b_4x4(be, b, tx_type);
+    } else {
+      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+      x->quantize_b_4x4(be, b);
+    }
 
-      tempa = ta;
-      templ = tl;
+    tempa = ta;
+    templ = tl;
 
-      ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
-      rate += ratey;
-      distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
+    ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);
+    rate += ratey;
+    distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;
 
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = ratey;
-        *bestdistortion = distortion;
-        best_rd = this_rd;
-        *best_mode = mode;
-        best_tx_type = tx_type;
-
-#if CONFIG_COMP_INTRA_PRED
-        *best_second_mode = mode2;
-#endif
-        *a = tempa;
-        *l = templ;
-        copy_predictor(best_predictor, b->predictor);
-        vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
-      }
-#if CONFIG_COMP_INTRA_PRED
+    if (this_rd < best_rd) {
+      *bestrate = rate;
+      *bestratey = ratey;
+      *bestdistortion = distortion;
+      best_rd = this_rd;
+      *best_mode = mode;
+      best_tx_type = tx_type;
+      *a = tempa;
+      *l = templ;
+      copy_predictor(best_predictor, b->predictor);
+      vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
     }
-#endif
   }
   b->bmi.as_mode.first = (B_PREDICTION_MODE)(*best_mode);
-#if CONFIG_COMP_INTRA_PRED
-  b->bmi.as_mode.second = (B_PREDICTION_MODE)(*best_second_mode);
-#endif
 
   // inverse transform
   if (best_tx_type != DCT_DCT)
@@ -1129,12 +1177,10 @@
   return best_rd;
 }
 
-static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate,
-                                     int *rate_y, int *Distortion, int64_t best_rd,
-#if CONFIG_COMP_INTRA_PRED
-                                     int allow_comp,
-#endif
-                                     int update_contexts) {
+static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
+                                         int *Rate, int *rate_y,
+                                         int *Distortion, int64_t best_rd,
+                                         int update_contexts) {
   int i;
   MACROBLOCKD *const xd = &mb->e_mbd;
   int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
@@ -1165,9 +1211,6 @@
     MODE_INFO *const mic = xd->mode_info_context;
     const int mis = xd->mode_info_stride;
     B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
     int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
 
     if (xd->frame_type == KEY_FRAME) {
@@ -1182,11 +1225,8 @@
 
     total_rd += rd_pick_intra4x4block(
                   cpi, mb, mb->block + i, xd->block + i, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                  & best_second_mode, allow_comp,
-#endif
-                  bmode_costs, ta + vp9_block2above[i],
-                  tl + vp9_block2left[i], &r, &ry, &d);
+                  bmode_costs, ta + vp9_block2above[TX_4X4][i],
+                  tl + vp9_block2left[TX_4X4][i], &r, &ry, &d);
 
     cost += r;
     distortion += d;
@@ -1193,9 +1233,6 @@
     tot_rate_y += ry;
 
     mic->bmi[i].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
-    mic->bmi[i].as_mode.second = best_second_mode;
-#endif
 
 #if 0  // CONFIG_NEWBINTRAMODES
     printf("%d %d\n", mic->bmi[i].as_mode.first, mic->bmi[i].as_mode.context);
@@ -1208,9 +1245,6 @@
   if (total_rd >= best_rd)
     return INT64_MAX;
 
-#if CONFIG_COMP_INTRA_PRED
-  cost += vp9_cost_bit(128, allow_comp);
-#endif
   *Rate = cost;
   *rate_y = tot_rate_y;
   *Distortion = distortion;
@@ -1218,7 +1252,6 @@
   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
 }
 
-#if CONFIG_SUPERBLOCKS
 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi,
                                       MACROBLOCK *x,
                                       int *rate,
@@ -1258,8 +1291,47 @@
 
   return best_rd;
 }
-#endif
 
+static int64_t rd_pick_intra_sb64y_mode(VP9_COMP *cpi,
+                                        MACROBLOCK *x,
+                                        int *rate,
+                                        int *rate_tokenonly,
+                                        int *distortion,
+                                        int *skippable,
+                                        int64_t txfm_cache[NB_TXFM_MODES]) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int this_rate, this_rate_tokenonly;
+  int this_distortion, s;
+  int64_t best_rd = INT64_MAX, this_rd;
+
+  /* Y Search for 32x32 intra prediction mode */
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.mode = mode;
+    vp9_build_intra_predictors_sb64y_s(&x->e_mbd);
+
+    super_block_64_yrd(cpi, x, &this_rate_tokenonly,
+                       &this_distortion, &s, txfm_cache);
+    this_rate = this_rate_tokenonly +
+                x->mbmode_cost[x->e_mbd.frame_type]
+                              [x->e_mbd.mode_info_context->mbmi.mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.mode = mode_selected;
+
+  return best_rd;
+}
+
 static int64_t rd_pick_intra16x16mby_mode(VP9_COMP *cpi,
                                           MACROBLOCK *x,
                                           int *Rate,
@@ -1270,10 +1342,6 @@
   MB_PREDICTION_MODE mode;
   TX_SIZE txfm_size = 0;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   int rate, ratey;
@@ -1291,59 +1359,38 @@
 
     mbmi->mode = mode;
 
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-      mbmi->second_mode = mode2;
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_build_intra_predictors_mby(xd);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue; // i.e. disable for now
-        vp9_build_comp_intra_predictors_mby(xd);
-      }
-#endif
+    vp9_build_intra_predictors_mby(xd);
 
-      macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
+    macro_block_yrd(cpi, x, &ratey, &distortion, &skip, local_txfm_cache);
 
-      // FIXME add compoundmode cost
-      // FIXME add rate for mode2
-      rate = ratey + x->mbmode_cost[xd->frame_type][mbmi->mode];
+    // FIXME add compoundmode cost
+    // FIXME add rate for mode2
+    rate = ratey + x->mbmode_cost[xd->frame_type][mbmi->mode];
 
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
-      if (this_rd < best_rd) {
-        mode_selected = mode;
-        txfm_size = mbmi->txfm_size;
-#if CONFIG_COMP_INTRA_PRED
-        mode2_selected = mode2;
-#endif
-        best_rd = this_rd;
-        *Rate = rate;
-        *rate_y = ratey;
-        *Distortion = distortion;
-        *skippable = skip;
-      }
+    if (this_rd < best_rd) {
+      mode_selected = mode;
+      txfm_size = mbmi->txfm_size;
+      best_rd = this_rd;
+      *Rate = rate;
+      *rate_y = ratey;
+      *Distortion = distortion;
+      *skippable = skip;
+    }
 
-      for (i = 0; i < NB_TXFM_MODES; i++) {
-        int64_t adj_rd = this_rd + local_txfm_cache[i] -
-                          local_txfm_cache[cpi->common.txfm_mode];
-        if (adj_rd < txfm_cache[i]) {
-          txfm_cache[i] = adj_rd;
-        }
+    for (i = 0; i < NB_TXFM_MODES; i++) {
+      int64_t adj_rd = this_rd + local_txfm_cache[i] -
+                        local_txfm_cache[cpi->common.txfm_mode];
+      if (adj_rd < txfm_cache[i]) {
+        txfm_cache[i] = adj_rd;
       }
-
-#if CONFIG_COMP_INTRA_PRED
     }
-#endif
   }
 
   mbmi->txfm_size = txfm_size;
   mbmi->mode = mode_selected;
 
-#if CONFIG_COMP_INTRA_PRED
-  mbmi->second_mode = mode2_selected;
-#endif
   return best_rd;
 }
 
@@ -1350,17 +1397,11 @@
 
 static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                      B_PREDICTION_MODE *best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                                     B_PREDICTION_MODE *best_second_mode,
-#endif
                                      int *mode_costs,
                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                                      int *bestrate, int *bestratey,
                                      int *bestdistortion) {
   MB_PREDICTION_MODE mode;
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-#endif
   MACROBLOCKD *xd = &x->e_mbd;
   int64_t best_rd = INT64_MAX;
   int distortion = 0, rate = 0;
@@ -1374,8 +1415,8 @@
    * a temp buffer that meets the stride requirements, but we are only
    * interested in the left 8x8 block
    * */
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  best_predictor, 16 * 8);
-  DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16 * 4);
+  DECLARE_ALIGNED_ARRAY(16, uint8_t, best_predictor, 16 * 8);
+  DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16 * 4);
 
   // perform transformation of dimension 8x8
   // note the input and output index mapping
@@ -1382,119 +1423,98 @@
   int idx = (ib & 0x02) ? (ib + 2) : ib;
 
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
-      int64_t this_rd;
-      int rate_t = 0;
+    int64_t this_rd;
+    int rate_t = 0;
 
-      // FIXME rate for compound mode and second intrapred mode
-      rate = mode_costs[mode];
-      b->bmi.as_mode.first = mode;
+    // FIXME rate for compound mode and second intrapred mode
+    rate = mode_costs[mode];
+    b->bmi.as_mode.first = mode;
 
-#if CONFIG_COMP_INTRA_PRED
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_intra8x8_predict(b, mode, b->predictor);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue; // i.e. disable for now
-        vp9_comp_intra8x8_predict(b, mode, mode2, b->predictor);
-      }
-#endif
+    vp9_intra8x8_predict(b, mode, b->predictor);
 
-      vp9_subtract_4b_c(be, b, 16);
+    vp9_subtract_4b_c(be, b, 16);
 
-      assert(get_2nd_order_usage(xd) == 0);
-      if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-        TX_TYPE tx_type = get_tx_type_8x8(xd, b);
-        if (tx_type != DCT_DCT)
-          vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
-        else
-          x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
-        x->quantize_b_8x8(x->block + idx, xd->block + idx);
+    assert(get_2nd_order_usage(xd) == 0);
+    if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+      TX_TYPE tx_type = get_tx_type_8x8(xd, b);
+      if (tx_type != DCT_DCT)
+        vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);
+      else
+        x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
+      x->quantize_b_8x8(x->block + idx, xd->block + idx);
 
-        // compute quantization mse of 8x8 block
-        distortion = vp9_block_error_c((x->block + idx)->coeff,
-                                       (xd->block + idx)->dqcoeff, 64);
-        ta0 = a[vp9_block2above_8x8[idx]];
-        tl0 = l[vp9_block2left_8x8[idx]];
+      // compute quantization mse of 8x8 block
+      distortion = vp9_block_error_c((x->block + idx)->coeff,
+                                     (xd->block + idx)->dqcoeff, 64);
+      ta0 = a[vp9_block2above[TX_8X8][idx]];
+      tl0 = l[vp9_block2left[TX_8X8][idx]];
 
-        rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
-                             &ta0, &tl0, TX_8X8);
+      rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,
+                           &ta0, &tl0, TX_8X8);
 
-        rate += rate_t;
-        ta1 = ta0;
-        tl1 = tl0;
-      } else {
-        static const int iblock[4] = {0, 1, 4, 5};
-        TX_TYPE tx_type;
-        int i;
-        ta0 = a[vp9_block2above[ib]];
-        ta1 = a[vp9_block2above[ib + 1]];
-        tl0 = l[vp9_block2left[ib]];
-        tl1 = l[vp9_block2left[ib + 4]];
-        distortion = 0;
-        rate_t = 0;
-        for (i = 0; i < 4; ++i) {
-          b = &xd->block[ib + iblock[i]];
-          be = &x->block[ib + iblock[i]];
-          tx_type = get_tx_type_4x4(xd, b);
-          if (tx_type != DCT_DCT) {
-            vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
-            vp9_ht_quantize_b_4x4(be, b, tx_type);
-          } else {
-            x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
-            x->quantize_b_4x4(be, b);
-          }
-          distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16);
-          rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,
-                                // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
-                                &ta0, &tl0,
-                                TX_4X4);
+      rate += rate_t;
+      ta1 = ta0;
+      tl1 = tl0;
+    } else {
+      static const int iblock[4] = {0, 1, 4, 5};
+      TX_TYPE tx_type;
+      int i;
+      ta0 = a[vp9_block2above[TX_4X4][ib]];
+      ta1 = a[vp9_block2above[TX_4X4][ib + 1]];
+      tl0 = l[vp9_block2left[TX_4X4][ib]];
+      tl1 = l[vp9_block2left[TX_4X4][ib + 4]];
+      distortion = 0;
+      rate_t = 0;
+      for (i = 0; i < 4; ++i) {
+        b = &xd->block[ib + iblock[i]];
+        be = &x->block[ib + iblock[i]];
+        tx_type = get_tx_type_4x4(xd, b);
+        if (tx_type != DCT_DCT) {
+          vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);
+          vp9_ht_quantize_b_4x4(be, b, tx_type);
+        } else {
+          x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
+          x->quantize_b_4x4(be, b);
         }
-        rate += rate_t;
+        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16);
+        rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,
+                              // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,
+                              &ta0, &tl0,
+                              TX_4X4);
       }
+      rate += rate_t;
+    }
 
-      distortion >>= 2;
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
-      if (this_rd < best_rd) {
-        *bestrate = rate;
-        *bestratey = rate_t;
-        *bestdistortion = distortion;
-        besta0 = ta0;
-        besta1 = ta1;
-        bestl0 = tl0;
-        bestl1 = tl1;
-        best_rd = this_rd;
-        *best_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
-        *best_second_mode = mode2;
-#endif
-        copy_predictor_8x8(best_predictor, b->predictor);
-        vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
-        vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
-#if CONFIG_COMP_INTRA_PRED
-      }
-#endif
+    distortion >>= 2;
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+    if (this_rd < best_rd) {
+      *bestrate = rate;
+      *bestratey = rate_t;
+      *bestdistortion = distortion;
+      besta0 = ta0;
+      besta1 = ta1;
+      bestl0 = tl0;
+      bestl1 = tl1;
+      best_rd = this_rd;
+      *best_mode = mode;
+      copy_predictor_8x8(best_predictor, b->predictor);
+      vpx_memcpy(best_dqcoeff, b->dqcoeff, 64);
+      vpx_memcpy(best_dqcoeff + 32, b->dqcoeff + 64, 64);
     }
   }
   b->bmi.as_mode.first = (*best_mode);
-#if CONFIG_COMP_INTRA_PRED
-  b->bmi.as_mode.second = (*best_second_mode);
-#endif
   vp9_encode_intra8x8(x, ib);
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    a[vp9_block2above_8x8[idx]]     = besta0;
-    a[vp9_block2above_8x8[idx] + 1] = besta1;
-    l[vp9_block2left_8x8[idx]]      = bestl0;
-    l[vp9_block2left_8x8[idx] + 1]  = bestl1;
+    a[vp9_block2above[TX_8X8][idx]]     = besta0;
+    a[vp9_block2above[TX_8X8][idx] + 1] = besta1;
+    l[vp9_block2left[TX_8X8][idx]]      = bestl0;
+    l[vp9_block2left[TX_8X8][idx] + 1]  = bestl1;
   } else {
-    a[vp9_block2above[ib]]     = besta0;
-    a[vp9_block2above[ib + 1]] = besta1;
-    l[vp9_block2left[ib]]      = bestl0;
-    l[vp9_block2left[ib + 4]]  = bestl1;
+    a[vp9_block2above[TX_4X4][ib]]     = besta0;
+    a[vp9_block2above[TX_4X4][ib + 1]] = besta1;
+    l[vp9_block2left[TX_4X4][ib]]      = bestl0;
+    l[vp9_block2left[TX_4X4][ib + 4]]  = bestl1;
   }
 
   return best_rd;
@@ -1508,7 +1528,7 @@
   int cost = mb->mbmode_cost [xd->frame_type] [I8X8_PRED];
   int distortion = 0;
   int tot_rate_y = 0;
-  long long total_rd = 0;
+  int64_t total_rd = 0;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
   int *i8x8mode_costs;
@@ -1525,25 +1545,16 @@
   for (i = 0; i < 4; i++) {
     MODE_INFO *const mic = xd->mode_info_context;
     B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_second_mode);
-#endif
     int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
 
     ib = vp9_i8x8_block[i];
     total_rd += rd_pick_intra8x8block(
                   cpi, mb, ib, &best_mode,
-#if CONFIG_COMP_INTRA_PRED
-                  & best_second_mode,
-#endif
                   i8x8mode_costs, ta, tl, &r, &ry, &d);
     cost += r;
     distortion += d;
     tot_rate_y += ry;
     mic->bmi[ib].as_mode.first = best_mode;
-#if CONFIG_COMP_INTRA_PRED
-    mic->bmi[ib].as_mode.second = best_second_mode;
-#endif
   }
 
   *Rate = cost;
@@ -1572,7 +1583,8 @@
 
   for (b = 16; b < 24; b++)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
-                        ta + vp9_block2above[b], tl + vp9_block2left[b],
+                        ta + vp9_block2above[TX_4X4][b],
+                        tl + vp9_block2left[TX_4X4][b],
                         TX_4X4);
 
   return cost;
@@ -1612,8 +1624,8 @@
 
   for (b = 16; b < 24; b += 4)
     cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,
-                        ta + vp9_block2above_8x8[b],
-                        tl + vp9_block2left_8x8[b], TX_8X8);
+                        ta + vp9_block2above[TX_8X8][b],
+                        tl + vp9_block2left[TX_8X8][b], TX_8X8);
 
   return cost;
 }
@@ -1631,59 +1643,113 @@
   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
-#if CONFIG_SUPERBLOCKS
+static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {
+  int b;
+  int cost = 0;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  ENTROPY_CONTEXT_PLANES t_above, t_left;
+  ENTROPY_CONTEXT *ta, *tl;
+
+  if (backup) {
+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+
+    ta = (ENTROPY_CONTEXT *) &t_above;
+    tl = (ENTROPY_CONTEXT *) &t_left;
+  } else {
+    ta = (ENTROPY_CONTEXT *)xd->above_context;
+    tl = (ENTROPY_CONTEXT *)xd->left_context;
+  }
+
+  for (b = 16; b < 24; b += 4)
+    cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV,
+                        ta + vp9_block2above[TX_8X8][b],
+                        tl + vp9_block2left[TX_8X8][b], TX_16X16);
+
+  return cost;
+}
+
+static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,
+                                   int *distortion, int *skip,
+                                   int backup) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  vp9_transform_sbuv_16x16(x);
+  vp9_quantize_sbuv_16x16(x);
+
+  *rate       = rd_cost_sbuv_16x16(x, backup);
+  *distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024,
+                                   xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2;
+  *skip       = vp9_sbuv_is_skippable_16x16(xd);
+}
+
 static int64_t rd_inter32x32_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                                 int *distortion, int fullpixel, int *skip) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
-  int n, r = 0, d = 0;
   const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  int skippable = 1;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
 
-  memcpy(t_above, xd->above_context, sizeof(t_above));
-  memcpy(t_left, xd->left_context, sizeof(t_left));
+  if (mbmi->txfm_size == TX_32X32) {
+    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                          usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride);
+    rd_inter32x32_uv_16x16(x, rate, distortion, skip, 1);
+  } else {
+    int n, r = 0, d = 0;
+    int skippable = 1;
+    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+    ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
+    ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
 
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
-    int d_tmp, s_tmp, r_tmp;
+    memcpy(t_above, xd->above_context, sizeof(t_above));
+    memcpy(t_left, xd->left_context, sizeof(t_left));
 
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
+      int d_tmp, s_tmp, r_tmp;
 
-    if (mbmi->txfm_size == TX_4X4) {
-      rd_inter16x16_uv_4x4(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
-    } else {
-      rd_inter16x16_uv_8x8(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
+      xd->above_context = ta + x_idx;
+      xd->left_context = tl + y_idx;
+      vp9_subtract_mbuv_s_c(x->src_diff,
+                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            dst_uv_stride);
+
+      if (mbmi->txfm_size == TX_4X4) {
+        rd_inter16x16_uv_4x4(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
+      } else {
+        rd_inter16x16_uv_8x8(cpi, x, &r_tmp, &d_tmp, fullpixel, &s_tmp, 0);
+      }
+
+      r += r_tmp;
+      d += d_tmp;
+      skippable = skippable && s_tmp;
     }
 
-    r += r_tmp;
-    d += d_tmp;
-    skippable = skippable && s_tmp;
+    *rate = r;
+    *distortion = d;
+    *skip = skippable;
+    xd->left_context = tl;
+    xd->above_context = ta;
+    memcpy(xd->above_context, t_above, sizeof(t_above));
+    memcpy(xd->left_context, t_left, sizeof(t_left));
   }
 
-  *rate = r;
-  *distortion = d;
-  *skip = skippable;
-  xd->left_context = tl;
-  xd->above_context = ta;
-  memcpy(xd->above_context, t_above, sizeof(t_above));
-  memcpy(xd->left_context, t_left, sizeof(t_left));
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
 
-  return RDCOST(x->rdmult, x->rddiv, r, d);
+static void super_block_64_uvrd(MACROBLOCK *x, int *rate,
+                                int *distortion, int *skip);
+static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
+                                int *distortion, int fullpixel, int *skip) {
+  super_block_64_uvrd(x, rate, distortion, skip);
+  return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
-#endif
 
 static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                               int *distortion, int *skip, int fullpixel) {
@@ -1701,10 +1767,6 @@
                                     int *skippable) {
   MB_PREDICTION_MODE mode;
   MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE mode2;
-  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode2_selected);
-#endif
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   int64_t best_rd = INT64_MAX;
@@ -1712,50 +1774,33 @@
   int rate_to, UNINITIALIZED_IS_SAFE(skip);
 
   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-#if CONFIG_COMP_INTRA_PRED
-    for (mode2 = DC_PRED - 1; mode2 != TM_PRED + 1; mode2++) {
-#endif
-      int rate;
-      int distortion;
-      int64_t this_rd;
+    int rate;
+    int distortion;
+    int64_t this_rd;
 
-      mbmi->uv_mode = mode;
-#if CONFIG_COMP_INTRA_PRED
-      mbmi->second_uv_mode = mode2;
-      if (mode2 == (MB_PREDICTION_MODE)(DC_PRED - 1)) {
-#endif
-        vp9_build_intra_predictors_mbuv(&x->e_mbd);
-#if CONFIG_COMP_INTRA_PRED
-      } else {
-        continue;
-        vp9_build_comp_intra_predictors_mbuv(&x->e_mbd);
-      }
-#endif
+    mbmi->uv_mode = mode;
+    vp9_build_intra_predictors_mbuv(&x->e_mbd);
 
-      vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
-                        x->e_mbd.predictor, x->src.uv_stride);
-      vp9_transform_mbuv_4x4(x);
-      vp9_quantize_mbuv_4x4(x);
+    vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
+                      x->e_mbd.predictor, x->src.uv_stride);
+    vp9_transform_mbuv_4x4(x);
+    vp9_quantize_mbuv_4x4(x);
 
-      rate_to = rd_cost_mbuv_4x4(x, 1);
-      rate = rate_to
-             + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
+    rate_to = rd_cost_mbuv_4x4(x, 1);
+    rate = rate_to
+           + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];
 
-      distortion = vp9_mbuverror(x) / 4;
+    distortion = vp9_mbuverror(x) / 4;
 
-      this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
-      if (this_rd < best_rd) {
-        skip = vp9_mbuv_is_skippable_4x4(xd);
-        best_rd = this_rd;
-        d = distortion;
-        r = rate;
-        *rate_tokenonly = rate_to;
-        mode_selected = mode;
-#if CONFIG_COMP_INTRA_PRED
-        mode2_selected = mode2;
-      }
-#endif
+    if (this_rd < best_rd) {
+      skip = vp9_mbuv_is_skippable_4x4(xd);
+      best_rd = this_rd;
+      d = distortion;
+      r = rate;
+      *rate_tokenonly = rate_to;
+      mode_selected = mode;
     }
   }
 
@@ -1764,9 +1809,6 @@
   *skippable = skip;
 
   mbmi->uv_mode = mode_selected;
-#if CONFIG_COMP_INTRA_PRED
-  mbmi->second_uv_mode = mode2_selected;
-#endif
 }
 
 static void rd_pick_intra_mbuv_mode_8x8(VP9_COMP *cpi,
@@ -1817,53 +1859,147 @@
   mbmi->uv_mode = mode_selected;
 }
 
-#if CONFIG_SUPERBLOCKS
-static void super_block_uvrd_8x8(MACROBLOCK *x,
-                                 int *rate,
-                                 int *distortion,
-                                 int *skippable) {
+// TODO(rbultje) very similar to rd_inter32x32_uv(), merge?
+static void super_block_uvrd(MACROBLOCK *x,
+                             int *rate,
+                             int *distortion,
+                             int *skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  int d = 0, r = 0, n, s = 1;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
   const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
-  ENTROPY_CONTEXT_PLANES *ta = xd->above_context;
-  ENTROPY_CONTEXT_PLANES *tl = xd->left_context;
 
+  if (mbmi->txfm_size == TX_32X32) {
+    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                          usrc, vsrc, src_uv_stride,
+                          udst, vdst, dst_uv_stride);
+    rd_inter32x32_uv_16x16(x, rate, distortion, skippable, 1);
+  } else {
+    int d = 0, r = 0, n, s = 1;
+    ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];
+    ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
+    ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
+
+    memcpy(t_above, xd->above_context, sizeof(t_above));
+    memcpy(t_left,  xd->left_context,  sizeof(t_left));
+
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
+
+      vp9_subtract_mbuv_s_c(x->src_diff,
+                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            dst_uv_stride);
+      if (mbmi->txfm_size == TX_4X4) {
+        vp9_transform_mbuv_4x4(x);
+        vp9_quantize_mbuv_4x4(x);
+        s &= vp9_mbuv_is_skippable_4x4(xd);
+      } else {
+        vp9_transform_mbuv_8x8(x);
+        vp9_quantize_mbuv_8x8(x);
+        s &= vp9_mbuv_is_skippable_8x8(xd);
+      }
+
+      d += vp9_mbuverror(x) >> 2;
+      xd->above_context = t_above + x_idx;
+      xd->left_context = t_left + y_idx;
+      if (mbmi->txfm_size == TX_4X4) {
+        r += rd_cost_mbuv_4x4(x, 0);
+      } else {
+        r += rd_cost_mbuv_8x8(x, 0);
+      }
+    }
+
+    xd->above_context = ta_orig;
+    xd->left_context = tl_orig;
+
+    *distortion = d;
+    *rate       = r;
+    *skippable  = s;
+  }
+}
+
+static void super_block_64_uvrd(MACROBLOCK *x,
+                                int *rate,
+                                int *distortion,
+                                int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+  const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
+  const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+  int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];
+  ENTROPY_CONTEXT_PLANES *ta_orig = xd->above_context;
+  ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;
+  int d = 0, r = 0, n, s = 1;
+
   memcpy(t_above, xd->above_context, sizeof(t_above));
   memcpy(t_left,  xd->left_context,  sizeof(t_left));
 
-  for (n = 0; n < 4; n++) {
-    int x_idx = n & 1, y_idx = n >> 1;
+  if (mbmi->txfm_size == TX_32X32) {
+    /* NOTE(review): removed a redundant inner 'int n;' that shadowed the outer 'n'. */
 
-    vp9_subtract_mbuv_s_c(x->src_diff,
-                          usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
-                          src_uv_stride,
-                          udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
-                          dst_uv_stride);
-    vp9_transform_mbuv_8x8(x);
-    vp9_quantize_mbuv_8x8(x);
-    s &= vp9_mbuv_is_skippable_8x8(xd);
+    *rate = 0;
+    for (n = 0; n < 4; n++) {
+      int x_idx = n & 1, y_idx = n >> 1;
+      int r_tmp, d_tmp, s_tmp;
 
-    d += vp9_mbuverror(x) >> 2;
-    xd->above_context = ta + x_idx;
-    xd->left_context = tl + y_idx;
-    r += rd_cost_mbuv_8x8(x, 0);
+      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,
+                            usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                            vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                            vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,
+                            dst_uv_stride);
+      xd->above_context = t_above + x_idx * 2;
+      xd->left_context = t_left + y_idx * 2;
+      rd_inter32x32_uv_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
+      r += r_tmp;
+      d += d_tmp;
+      s = s && s_tmp;
+    }
+  } else {
+    for (n = 0; n < 16; n++) {
+      int x_idx = n & 3, y_idx = n >> 2;
+
+      vp9_subtract_mbuv_s_c(x->src_diff,
+                            usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,
+                            src_uv_stride,
+                            udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
+                            dst_uv_stride);
+      if (mbmi->txfm_size == TX_4X4) {
+        vp9_transform_mbuv_4x4(x);
+        vp9_quantize_mbuv_4x4(x);
+        s &= vp9_mbuv_is_skippable_4x4(xd);
+      } else {
+        vp9_transform_mbuv_8x8(x);
+        vp9_quantize_mbuv_8x8(x);
+        s &= vp9_mbuv_is_skippable_8x8(xd);
+      }
+
+      xd->above_context = t_above + x_idx;
+      xd->left_context = t_left + y_idx;
+      d += vp9_mbuverror(x) >> 2;
+      if (mbmi->txfm_size == TX_4X4) {
+        r += rd_cost_mbuv_4x4(x, 0);
+      } else {
+        r += rd_cost_mbuv_8x8(x, 0);
+      }
+    }
   }
 
-  xd->above_context = ta;
-  xd->left_context = tl;
   *distortion = d;
   *rate       = r;
   *skippable  = s;
 
-  xd->left_context = tl;
-  xd->above_context = ta;
-  memcpy(xd->above_context, t_above, sizeof(t_above));
-  memcpy(xd->left_context,  t_left,  sizeof(t_left));
+  xd->left_context = tl_orig;
+  xd->above_context = ta_orig;
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi,
@@ -1882,8 +2018,8 @@
     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
-    super_block_uvrd_8x8(x, &this_rate_tokenonly,
-                         &this_distortion, &s);
+    super_block_uvrd(x, &this_rate_tokenonly,
+                     &this_distortion, &s);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -1902,8 +2038,44 @@
 
   return best_rd;
 }
-#endif
 
+static int64_t rd_pick_intra_sb64uv_mode(VP9_COMP *cpi,
+                                         MACROBLOCK *x,
+                                         int *rate,
+                                         int *rate_tokenonly,
+                                         int *distortion,
+                                         int *skippable) {
+  MB_PREDICTION_MODE mode;
+  MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+  int64_t best_rd = INT64_MAX, this_rd;
+  int this_rate_tokenonly, this_rate;
+  int this_distortion, s;
+
+  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
+    x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
+    vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);
+
+    super_block_64_uvrd(x, &this_rate_tokenonly,
+                        &this_distortion, &s);
+    this_rate = this_rate_tokenonly +
+                x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < best_rd) {
+      mode_selected   = mode;
+      best_rd         = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+    }
+  }
+
+  x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected;
+
+  return best_rd;
+}
+
 int vp9_cost_mv_ref(VP9_COMP *cpi,
                     MB_PREDICTION_MODE m,
                     const int mode_context) {
@@ -2058,9 +2230,9 @@
       BLOCK *be = &x->block[i];
       int thisdistortion;
 
-      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict);
+      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4);
       if (xd->mode_info_context->mbmi.second_ref_frame > 0)
-        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg);
+        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4);
       vp9_subtract_b(be, bd, 16);
       x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
       x->quantize_b_4x4(be, bd);
@@ -2067,8 +2239,8 @@
       thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);
       *distortion += thisdistortion;
       *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
-                                 ta + vp9_block2above[i],
-                                 tl + vp9_block2left[i], TX_4X4);
+                                 ta + vp9_block2above[TX_4X4][i],
+                                 tl + vp9_block2left[TX_4X4][i], TX_4X4);
     }
   }
   *distortion >>= 2;
@@ -2119,8 +2291,9 @@
           thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
           otherdist += thisdistortion;
           othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above_8x8[idx],
-                                     tlcp + vp9_block2left_8x8[idx], TX_8X8);
+                                     tacp + vp9_block2above[TX_8X8][idx],
+                                     tlcp + vp9_block2left[TX_8X8][idx],
+                                     TX_8X8);
         }
         for (j = 0; j < 4; j += 2) {
           bd = &xd->block[ib + iblock[j]];
@@ -2130,13 +2303,13 @@
           thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
           *distortion += thisdistortion;
           *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
-                                     ta + vp9_block2above[ib + iblock[j]],
-                                     tl + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
+                           ta + vp9_block2above[TX_4X4][ib + iblock[j]],
+                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],
+                           TX_4X4);
           *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
-                                     ta + vp9_block2above[ib + iblock[j] + 1],
-                                     tl + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
+                           ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
+                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],
+                           TX_4X4);
         }
       } else /* 8x8 */ {
         if (otherrd) {
@@ -2148,13 +2321,13 @@
             thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);
             otherdist += thisdistortion;
             othercost += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above[ib + iblock[j]],
-                                     tlcp + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
+                           tacp + vp9_block2above[TX_4X4][ib + iblock[j]],
+                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
+                           TX_4X4);
             othercost += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,
-                                     tacp + vp9_block2above[ib + iblock[j] + 1],
-                                     tlcp + vp9_block2left[ib + iblock[j]],
-                                     TX_4X4);
+                           tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],
+                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],
+                           TX_4X4);
           }
         }
         x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);
@@ -2162,8 +2335,8 @@
         thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);
         *distortion += thisdistortion;
         *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,
-                                   ta + vp9_block2above_8x8[idx],
-                                   tl + vp9_block2left_8x8[idx], TX_8X8);
+                                   ta + vp9_block2above[TX_8X8][idx],
+                                   tl + vp9_block2left[TX_8X8][idx], TX_8X8);
       }
     }
   }
@@ -2738,8 +2911,8 @@
 }
 
 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
-                    unsigned char *ref_y_buffer, int ref_y_stride,
-                    int_mv *mvp, int ref_frame, enum BlockSize block_size ) {
+                    uint8_t *ref_y_buffer, int ref_y_stride,
+                    int ref_frame, enum BlockSize block_size ) {
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int_mv this_mv;
@@ -2750,8 +2923,8 @@
   int this_sad = INT_MAX;
 
   BLOCK *b = &x->block[0];
-  unsigned char *src_y_ptr = *(b->base_src);
-  unsigned char *ref_y_ptr;
+  uint8_t *src_y_ptr = *(b->base_src);
+  uint8_t *ref_y_ptr;
   int row_offset, col_offset;
 
   // Get the sad for each candidate reference mv
@@ -2779,29 +2952,21 @@
     }
   }
 
-  // Return the mv that had the best sad for use in the motion search.
-  mvp->as_int = mbmi->ref_mvs[ref_frame][best_index].as_int;
-  clamp_mv2(mvp, xd);
+  // Note the index of the mv that worked best in the reference list.
+  x->mv_best_ref_index[ref_frame] = best_index;
 }
 
-static void set_i8x8_block_modes(MACROBLOCK *x, int modes[2][4]) {
+static void set_i8x8_block_modes(MACROBLOCK *x, int modes[4]) {
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
   for (i = 0; i < 4; i++) {
     int ib = vp9_i8x8_block[i];
-    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[0][i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[0][i];
-#if CONFIG_COMP_INTRA_PRED
-    xd->mode_info_context->bmi[ib + 0].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 1].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 4].as_mode.second = modes[1][i];
-    xd->mode_info_context->bmi[ib + 5].as_mode.second = modes[1][i];
-#endif
-    // printf("%d,%d,%d,%d %d,%d,%d,%d\n",
-    //       modes[0][0], modes[0][1], modes[0][2], modes[0][3],
-    //       modes[1][0], modes[1][1], modes[1][2], modes[1][3]);
+    xd->mode_info_context->bmi[ib + 0].as_mode.first = modes[i];
+    xd->mode_info_context->bmi[ib + 1].as_mode.first = modes[i];
+    xd->mode_info_context->bmi[ib + 4].as_mode.first = modes[i];
+    xd->mode_info_context->bmi[ib + 5].as_mode.first = modes[i];
+    // printf("%d,%d,%d,%d\n",
+    //       modes[0], modes[1], modes[2], modes[3]);
   }
 
   for (i = 0; i < 16; i++) {
@@ -2869,7 +3034,6 @@
   int pred_flag;
   int pred_ctx;
   int i;
-  int tot_count;
 
   vp9_prob pred_prob, new_pred_prob;
   int seg_ref_active;
@@ -2893,13 +3057,8 @@
 
   // Predict probability for current frame based on stats so far
   pred_ctx = vp9_get_pred_context(cm, xd, PRED_REF);
-  tot_count = cpi->ref_pred_count[pred_ctx][0] + cpi->ref_pred_count[pred_ctx][1];
-  if (tot_count) {
-    new_pred_prob =
-      (cpi->ref_pred_count[pred_ctx][0] * 255 + (tot_count >> 1)) / tot_count;
-    new_pred_prob += !new_pred_prob;
-  } else
-    new_pred_prob = 128;
+  new_pred_prob = get_binary_prob(cpi->ref_pred_count[pred_ctx][0],
+                                  cpi->ref_pred_count[pred_ctx][1]);
 
   // Get the set of probabilities to use if prediction fails
   mod_refprobs = cm->mod_refprobs[pred_ref];
@@ -2954,6 +3113,7 @@
 
   // Take a snapshot of the coding context so it can be
   // restored if we decide to encode this way
+  ctx->skip = x->skip;
   ctx->best_mode_index = mode_index;
   vpx_memcpy(&ctx->mic, xd->mode_info_context,
              sizeof(MODE_INFO));
@@ -2963,9 +3123,6 @@
   ctx->best_ref_mv.as_int = ref_mv->as_int;
   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
 
-  // ctx[mb_index].rddiv = x->rddiv;
-  // ctx[mb_index].rdmult = x->rdmult;
-
   ctx->single_pred_diff = comp_pred_diff[SINGLE_PREDICTION_ONLY];
   ctx->comp_pred_diff   = comp_pred_diff[COMP_PREDICTION_ONLY];
   ctx->hybrid_pred_diff = comp_pred_diff[HYBRID_PREDICTION];
@@ -3000,8 +3157,6 @@
   *skippable = y_skippable && uv_skippable;
 }
 
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
                                int idx, MV_REFERENCE_FRAME frame_type,
                                int block_size,
@@ -3008,12 +3163,10 @@
                                int recon_yoffset, int recon_uvoffset,
                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
                                int_mv frame_near_mv[MAX_REF_FRAMES],
-                               int_mv frame_best_ref_mv[MAX_REF_FRAMES],
-                               int_mv mv_search_ref[MAX_REF_FRAMES],
                                int frame_mdcounts[4][4],
-                               unsigned char *y_buffer[4],
-                               unsigned char *u_buffer[4],
-                               unsigned char *v_buffer[4]) {
+                               uint8_t *y_buffer[4],
+                               uint8_t *u_buffer[4],
+                               uint8_t *v_buffer[4]) {
   YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -3033,7 +3186,6 @@
   vp9_find_best_ref_mvs(xd, y_buffer[frame_type],
                         yv12->y_stride,
                         mbmi->ref_mvs[frame_type],
-                        &frame_best_ref_mv[frame_type],
                         &frame_nearest_mv[frame_type],
                         &frame_near_mv[frame_type]);
 
@@ -3041,13 +3193,8 @@
   // Further refinement that is encode side only to test the top few candidates
   // in full and choose the best as the centre point for subsequent searches.
   mv_pred(cpi, x, y_buffer[frame_type], yv12->y_stride,
-          &mv_search_ref[frame_type], frame_type, block_size);
+          frame_type, block_size);
 
-#if CONFIG_NEW_MVREF
-  // TODO(paulwilkins): Final choice of which of the best 4 candidates from
-  // above gives lowest error score when used in isolation. This stage encoder
-  // and sets the reference MV
-#endif
 }
 
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -3063,9 +3210,8 @@
                                  int *rate_uv, int *distortion_uv,
                                  int *mode_excluded, int *disable_skip,
                                  int recon_yoffset, int mode_index,
-                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-                                 int_mv frame_best_ref_mv[MAX_REF_FRAMES],
-                                 int_mv mv_search_ref[MAX_REF_FRAMES]) {
+                                 int_mv frame_mv[MB_MODE_COUNT]
+                                                [MAX_REF_FRAMES]) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
@@ -3081,20 +3227,24 @@
   int refs[2] = { mbmi->ref_frame,
                   (mbmi->second_ref_frame < 0 ? 0 : mbmi->second_ref_frame) };
   int_mv cur_mv[2];
+  int_mv ref_mv[2];
   int64_t this_rd = 0;
 
   switch (this_mode) {
     case NEWMV:
+      ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
+      ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
+
       if (is_comp_pred) {
         if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
             frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
           return INT64_MAX;
         *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[0]],
-                                  &frame_best_ref_mv[refs[0]],
+                                  &ref_mv[0],
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
         *rate2 += vp9_mv_bit_cost(&frame_mv[NEWMV][refs[1]],
-                                  &frame_best_ref_mv[refs[1]],
+                                  &ref_mv[1],
                                   x->nmvjointcost, x->mvcost, 96,
                                   x->e_mbd.allow_high_precision_mv);
       } else {
@@ -3109,11 +3259,18 @@
         int tmp_row_min = x->mv_row_min;
         int tmp_row_max = x->mv_row_max;
 
-        vp9_clamp_mv_min_max(x, &frame_best_ref_mv[refs[0]]);
+        vp9_clamp_mv_min_max(x, &ref_mv[0]);
 
-        mvp_full.as_mv.col = mv_search_ref[mbmi->ref_frame].as_mv.col >> 3;
-        mvp_full.as_mv.row = mv_search_ref[mbmi->ref_frame].as_mv.row >> 3;
+        // Start the full-pixel search from the best-SAD candidate noted by mv_pred().
+        mvp_full.as_int =
+         mbmi->ref_mvs[refs[0]][x->mv_best_ref_index[refs[0]]].as_int;
 
+        mvp_full.as_mv.col >>= 3;
+        mvp_full.as_mv.row >>= 3;
+        // NOTE(review): removed a dead self-comparison block here; the
+        // condition (mvp_full.as_int != mvp_full.as_int) is tautologically
+        // false, so its body (a self-assignment) could never execute.
+
         // adjust search range according to sr from mv prediction
         step_param = MAX(step_param, sr);
 
@@ -3123,7 +3280,7 @@
         bestsme = vp9_full_pixel_diamond(cpi, x, b, d, &mvp_full, step_param,
                                          sadpb, further_steps, 1,
                                          &cpi->fn_ptr[block_size],
-                                         &frame_best_ref_mv[refs[0]], &tmp_mv);
+                                         &ref_mv[0], &tmp_mv);
 
         x->mv_col_min = tmp_col_min;
         x->mv_col_max = tmp_col_max;
@@ -3134,7 +3291,7 @@
           int dis; /* TODO: use dis in distortion calculation later. */
           unsigned int sse;
           cpi->find_fractional_mv_step(x, b, d, &tmp_mv,
-                                       &frame_best_ref_mv[refs[0]],
+                                       &ref_mv[0],
                                        x->errorperbit,
                                        &cpi->fn_ptr[block_size],
                                        x->nmvjointcost, x->mvcost,
@@ -3144,7 +3301,7 @@
         frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
 
         // Add the new motion vector cost to our rolling cost variable
-        *rate2 += vp9_mv_bit_cost(&tmp_mv, &frame_best_ref_mv[refs[0]],
+        *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
                                   x->nmvjointcost, x->mvcost,
                                   96, xd->allow_high_precision_mv);
       }
@@ -3169,12 +3326,6 @@
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
 
-#if CONFIG_PRED_FILTER
-  // Filtered prediction:
-  mbmi->pred_filter_enabled = vp9_mode_order[mode_index].pred_filter_flag;
-  *rate2 += vp9_cost_bit(cpi->common.prob_pred_filter_off,
-                         mbmi->pred_filter_enabled);
-#endif
   if (cpi->common.mcomp_filter_type == SWITCHABLE) {
     const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
     const int m = vp9_switchable_interp_map[mbmi->interp_filter];
@@ -3204,7 +3355,22 @@
   }
 #endif
 
-  if (block_size == BLOCK_16X16) {
+  if (block_size == BLOCK_64X64) {
+    vp9_build_inter64x64_predictors_sb(xd,
+                                       xd->dst.y_buffer,
+                                       xd->dst.u_buffer,
+                                       xd->dst.v_buffer,
+                                       xd->dst.y_stride,
+                                       xd->dst.uv_stride);
+  } else if (block_size == BLOCK_32X32) {
+    vp9_build_inter32x32_predictors_sb(xd,
+                                       xd->dst.y_buffer,
+                                       xd->dst.u_buffer,
+                                       xd->dst.v_buffer,
+                                       xd->dst.y_stride,
+                                       xd->dst.uv_stride);
+  } else {
+    assert(block_size == BLOCK_16X16);
     vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);
     if (is_comp_pred)
       vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);
@@ -3213,15 +3379,6 @@
       vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);
     }
 #endif
-  } else {
-#if CONFIG_SUPERBLOCKS
-    vp9_build_inter32x32_predictors_sb(xd,
-                                       xd->dst.y_buffer,
-                                       xd->dst.u_buffer,
-                                       xd->dst.v_buffer,
-                                       xd->dst.y_stride,
-                                       xd->dst.uv_stride);
-#endif
   }
 
   if (cpi->active_map_enabled && x->active_ptr[0] == 0)
@@ -3234,14 +3391,16 @@
     if (threshold < x->encode_breakout)
       threshold = x->encode_breakout;
 
-    if (block_size == BLOCK_16X16) {
-      var = vp9_variance16x16(*(b->base_src), b->src_stride,
-                              xd->predictor, 16, &sse);
-    } else {
-#if CONFIG_SUPERBLOCKS
+    if (block_size == BLOCK_64X64) {
+      var = vp9_variance64x64(*(b->base_src), b->src_stride,
+                              xd->dst.y_buffer, xd->dst.y_stride, &sse);
+    } else if (block_size == BLOCK_32X32) {
       var = vp9_variance32x32(*(b->base_src), b->src_stride,
                               xd->dst.y_buffer, xd->dst.y_stride, &sse);
-#endif
+    } else {
+      assert(block_size == BLOCK_16X16);
+      var = vp9_variance16x16(*(b->base_src), b->src_stride,
+                              xd->predictor, 16, &sse);
     }
 
     if ((int)sse < threshold) {
@@ -3253,15 +3412,23 @@
         // Check u and v to make sure skip is ok
         int sse2;
 
-        if (block_size == BLOCK_16X16) {
-          sse2 = vp9_uvsse(x);
-        } else {
+        if (block_size == BLOCK_64X64) {
           unsigned int sse2u, sse2v;
+          var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,
+                                  xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
+          var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,
+                                  xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
+          sse2 = sse2u + sse2v;
+        } else if (block_size == BLOCK_32X32) {
+          unsigned int sse2u, sse2v;
           var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,
                                   xd->dst.u_buffer, xd->dst.uv_stride, &sse2u);
           var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,
                                   xd->dst.v_buffer, xd->dst.uv_stride, &sse2v);
           sse2 = sse2u + sse2v;
+        } else {
+          assert(block_size == BLOCK_16X16);
+          sse2 = vp9_uvsse(x);
         }
 
         if (sse2 * 2 < threshold) {
@@ -3280,36 +3447,37 @@
     }
   }
 
-  if (is_comp_pred) {
-    *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
-  } else {
-    *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
-  }
+  if (!(*mode_excluded)) {
+    if (is_comp_pred) {
+      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
+    } else {
+      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
+    }
 #if CONFIG_COMP_INTERINTRA_PRED
-  if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;
+    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;
 #endif
+  }
 
   if (!x->skip) {
-    if (block_size == BLOCK_16X16) {
-      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                               &xd->predictor[320], 8);
-      if (is_comp_pred)
-        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                 &xd->predictor[320], 8);
-#if CONFIG_COMP_INTERINTRA_PRED
-      if (is_comp_interintra_pred) {
-        vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],
-                                                   &xd->predictor[320], 8);
-      }
-#endif
-      inter_mode_cost(cpi, x, rate2, distortion,
-                      rate_y, distortion_y, rate_uv, distortion_uv,
-                      skippable, txfm_cache);
-    } else {
-#if CONFIG_SUPERBLOCKS
+    if (block_size == BLOCK_64X64) {
       int skippable_y, skippable_uv;
 
       // Y cost and distortion
+      super_block_64_yrd(cpi, x, rate_y, distortion_y,
+                         &skippable_y, txfm_cache);
+      *rate2 += *rate_y;
+      *distortion += *distortion_y;
+
+      rd_inter64x64_uv(cpi, x, rate_uv, distortion_uv,
+                       cm->full_pixel, &skippable_uv);
+
+      *rate2 += *rate_uv;
+      *distortion += *distortion_uv;
+      *skippable = skippable_y && skippable_uv;
+    } else if (block_size == BLOCK_32X32) {
+      int skippable_y, skippable_uv;
+
+      // Y cost and distortion
       super_block_yrd(cpi, x, rate_y, distortion_y,
                       &skippable_y, txfm_cache);
       *rate2 += *rate_y;
@@ -3321,7 +3489,23 @@
       *rate2 += *rate_uv;
       *distortion += *distortion_uv;
       *skippable = skippable_y && skippable_uv;
+    } else {
+      assert(block_size == BLOCK_16X16);
+
+      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                               &xd->predictor[320], 8);
+      if (is_comp_pred)
+        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                                 &xd->predictor[320], 8);
+#if CONFIG_COMP_INTERINTRA_PRED
+      if (is_comp_interintra_pred) {
+        vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],
+                                                   &xd->predictor[320], 8);
+      }
 #endif
+      inter_mode_cost(cpi, x, rate2, distortion,
+                      rate_y, distortion_y, rate_uv, distortion_uv,
+                      skippable, txfm_cache);
     }
   }
   return this_rd;  // if 0, this will be re-calculated by caller
@@ -3338,9 +3522,10 @@
   PARTITION_INFO best_partition;
   int_mv best_ref_mv, second_best_ref_mv;
   MB_PREDICTION_MODE this_mode;
+  MB_PREDICTION_MODE best_mode = DC_PRED;
   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
   int i, best_mode_index = 0;
-  int mode8x8[2][4];
+  int mode8x8[4];
   unsigned char segment_id = mbmi->segment_id;
 
   int mode_index;
@@ -3358,6 +3543,7 @@
   int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;
 #endif
   int64_t best_overall_rd = INT64_MAX;
+  INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
   int uv_intra_skippable = 0;
   int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
@@ -3365,9 +3551,6 @@
   int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
   int distortion_uv = INT_MAX;
   int64_t best_yrd = INT64_MAX;
-#if CONFIG_PRED_FILTER
-  int best_filter_state = 0;
-#endif
   int switchable_filter_index = 0;
 
   MB_PREDICTION_MODE uv_intra_mode;
@@ -3377,10 +3560,8 @@
   int saddone = 0;
 
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int_mv frame_best_ref_mv[MAX_REF_FRAMES];
-  int_mv mv_search_ref[MAX_REF_FRAMES];
   int frame_mdcounts[4][4];
-  unsigned char *y_buffer[4], *u_buffer[4], *v_buffer[4];
+  uint8_t *y_buffer[4], *u_buffer[4], *v_buffer[4];
 
   unsigned int ref_costs[MAX_REF_FRAMES];
   int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];
@@ -3392,7 +3573,8 @@
   vpx_memset(&frame_mv, 0, sizeof(frame_mv));
   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
   vpx_memset(&best_bmodes, 0, sizeof(best_bmodes));
-  vpx_memset(&x->mb_context[xd->mb_index], 0, sizeof(PICK_MODE_CONTEXT));
+  vpx_memset(&x->mb_context[xd->sb_index][xd->mb_index], 0,
+             sizeof(PICK_MODE_CONTEXT));
 
   for (i = 0; i < MAX_REF_FRAMES; i++)
     frame_mv[NEWMV][i].as_int = INVALID_MV;
@@ -3412,31 +3594,26 @@
   if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
     setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,
                        BLOCK_16X16, recon_yoffset, recon_uvoffset,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV], frame_best_ref_mv,
-                       mv_search_ref, frame_mdcounts,
-                       y_buffer, u_buffer, v_buffer);
+                       frame_mv[NEARESTMV], frame_mv[NEARMV],
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
   }
 
   if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
     setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,
                        BLOCK_16X16, recon_yoffset, recon_uvoffset,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV], frame_best_ref_mv,
-                       mv_search_ref, frame_mdcounts,
-                       y_buffer, u_buffer, v_buffer);
+                       frame_mv[NEARESTMV], frame_mv[NEARMV],
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
   }
 
   if (cpi->ref_frame_flags & VP9_ALT_FLAG) {
     setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,
                        BLOCK_16X16, recon_yoffset, recon_uvoffset,
-                       frame_mv[NEARESTMV], frame_mv[NEARMV], frame_best_ref_mv,
-                       mv_search_ref, frame_mdcounts,
-                       y_buffer, u_buffer, v_buffer);
+                       frame_mv[NEARESTMV], frame_mv[NEARMV],
+                       frame_mdcounts, y_buffer, u_buffer, v_buffer);
   }
 
   *returnintra = INT64_MAX;
 
-  x->skip = 0;
-
   mbmi->ref_frame = INTRA_FRAME;
 
   /* Initialize zbin mode boost for uv costing */
@@ -3479,24 +3656,27 @@
     rate_y = 0;
     rate_uv = 0;
 
+    x->skip = 0;
+
     this_mode = vp9_mode_order[mode_index].mode;
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;
     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
-#if CONFIG_PRED_FILTER
-    mbmi->pred_filter_enabled = 0;
-#endif
-    if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-        this_mode >= NEARESTMV && this_mode <= SPLITMV) {
+
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    if (this_mode >= NEARESTMV && this_mode <= SPLITMV) {
       mbmi->interp_filter =
           vp9_switchable_interp[switchable_filter_index++];
       if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
         switchable_filter_index = 0;
-    } else {
-      mbmi->interp_filter = cpi->common.mcomp_filter_type;
+      if ((cm->mcomp_filter_type != SWITCHABLE) &&
+          (cm->mcomp_filter_type != mbmi->interp_filter)) {
+        mode_excluded = 1;
+      }
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
     }
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
     // Test best rd so far against threshold for trying this mode.
     if (best_rd <= cpi->rd_threshes[mode_index])
@@ -3503,10 +3683,6 @@
       continue;
 
     // current coding mode under rate-distortion optimization test loop
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
 #if CONFIG_COMP_INTERINTRA_PRED
     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
     mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
@@ -3546,7 +3722,7 @@
       xd->pre.y_buffer = y_buffer[ref];
       xd->pre.u_buffer = u_buffer[ref];
       xd->pre.v_buffer = v_buffer[ref];
-      best_ref_mv = frame_best_ref_mv[ref];
+      best_ref_mv = mbmi->ref_mvs[ref][0];
       vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));
     }
 
@@ -3556,7 +3732,7 @@
       xd->second_pre.y_buffer = y_buffer[ref];
       xd->second_pre.u_buffer = u_buffer[ref];
       xd->second_pre.v_buffer = v_buffer[ref];
-      second_best_ref_mv  = frame_best_ref_mv[ref];
+      second_best_ref_mv = mbmi->ref_mvs[ref][0];
     }
 
     // Experimental code. Special case for gf and arf zeromv modes.
@@ -3621,11 +3797,9 @@
           // Note the rate value returned here includes the cost of coding
           // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];
           mbmi->txfm_size = TX_4X4;
-          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion, best_yrd,
-#if CONFIG_COMP_INTRA_PRED
-                                             0,
-#endif
-                                             0);
+          tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,
+                                             &distortion, best_yrd,
+                                             cpi->update_context);
           rate2 += rate;
           rate2 += intra_cost_penalty;
           distortion2 += distortion;
@@ -3650,16 +3824,10 @@
           mbmi->txfm_size = TX_4X4;
           tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,
                                                  &d4x4, best_yrd);
-          mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-          mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-          mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-          mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-          mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-          mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-          mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-          mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
+          mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+          mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+          mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+          mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
           mbmi->txfm_size = TX_8X8;
           tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,
                                                  &d8x8, best_yrd);
@@ -3683,16 +3851,10 @@
               mbmi->txfm_size = TX_8X8;
               tmp_rd = tmp_rd_8x8s;
 
-              mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-              mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-              mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-              mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-              mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-              mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-              mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-              mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
+              mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+              mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+              mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+              mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
             }
           } else if (cm->txfm_mode == ONLY_4X4) {
             rate = r4x4;
@@ -3707,16 +3869,10 @@
             mbmi->txfm_size = TX_8X8;
             tmp_rd = tmp_rd_8x8;
 
-            mode8x8[0][0] = xd->mode_info_context->bmi[0].as_mode.first;
-            mode8x8[0][1] = xd->mode_info_context->bmi[2].as_mode.first;
-            mode8x8[0][2] = xd->mode_info_context->bmi[8].as_mode.first;
-            mode8x8[0][3] = xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-            mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-            mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-            mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-            mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
+            mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;
+            mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;
+            mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;
+            mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;
           }
 
           rate2 += rate;
@@ -3765,6 +3921,7 @@
         rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs
             [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
                 [vp9_switchable_interp_map[mbmi->interp_filter]];
+
       // If even the 'Y' rd value of split is higher than best so far
       // then dont bother looking at UV
       if (tmp_rd < best_yrd) {
@@ -3780,10 +3937,12 @@
         disable_skip = 1;
       }
 
-      if (is_comp_pred)
-        mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
-      else
-        mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+      if (!mode_excluded) {
+        if (is_comp_pred)
+          mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
+        else
+          mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
+      }
 
       compmode_cost =
         vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
@@ -3811,8 +3970,7 @@
                                   &rate_y, &distortion,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip, recon_yoffset,
-                                  mode_index, frame_mv, frame_best_ref_mv,
-                                  mv_search_ref);
+                                  mode_index, frame_mv);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -3897,7 +4055,6 @@
     }
 #endif
 
-
     if (!disable_skip && mbmi->ref_frame == INTRA_FRAME)
       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
@@ -3904,144 +4061,135 @@
 
     if (this_rd < best_overall_rd) {
       best_overall_rd = this_rd;
-#if CONFIG_PRED_FILTER
-      best_filter_state = mbmi->pred_filter_enabled;
-#endif
+      best_filter = mbmi->interp_filter;
+      best_mode = this_mode;
 #if CONFIG_COMP_INTERINTRA_PRED
       is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
 #endif
     }
 
-#if CONFIG_PRED_FILTER
-    // Ignore modes where the prediction filter state doesn't
-    // match the state signaled at the frame level
-    if ((cm->pred_filter_mode == 2) ||
-        (cm->pred_filter_mode ==
-         mbmi->pred_filter_enabled)) {
-#endif
-      // Did this mode help.. i.e. is it the new best mode
-      if (this_rd < best_rd || x->skip) {
-        if (!mode_excluded) {
-          /*
-          if (mbmi->second_ref_frame == INTRA_FRAME) {
-            printf("rd %d best %d bestintra16 %d\n", this_rd, best_rd, best_intra16_rd);
-          }
-          */
-          // Note index of best mode so far
-          best_mode_index = mode_index;
+    // Did this mode help.. i.e. is it the new best mode
+    if (this_rd < best_rd || x->skip) {
+      if (!mode_excluded) {
+        /*
+        if (mbmi->second_ref_frame == INTRA_FRAME) {
+          printf("rd %d best %d bestintra16 %d\n", this_rd, best_rd, best_intra16_rd);
+        }
+        */
+        // Note index of best mode so far
+        best_mode_index = mode_index;
 
-          if (this_mode <= B_PRED) {
-            if (mbmi->txfm_size != TX_4X4
-                && this_mode != B_PRED
-                && this_mode != I8X8_PRED)
-              mbmi->uv_mode = uv_intra_mode_8x8;
-            else
-              mbmi->uv_mode = uv_intra_mode;
-            /* required for left and above block mv */
-            mbmi->mv[0].as_int = 0;
-          }
+        if (this_mode <= B_PRED) {
+          if (mbmi->txfm_size != TX_4X4
+              && this_mode != B_PRED
+              && this_mode != I8X8_PRED)
+            mbmi->uv_mode = uv_intra_mode_8x8;
+          else
+            mbmi->uv_mode = uv_intra_mode;
+          /* required for left and above block mv */
+          mbmi->mv[0].as_int = 0;
+        }
 
-          other_cost += ref_costs[mbmi->ref_frame];
+        other_cost += ref_costs[mbmi->ref_frame];
 
-          /* Calculate the final y RD estimate for this mode */
-          best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                            (distortion2 - distortion_uv));
+        /* Calculate the final y RD estimate for this mode */
+        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
+                          (distortion2 - distortion_uv));
 
-          *returnrate = rate2;
-          *returndistortion = distortion2;
-          best_rd = this_rd;
-          vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
-          vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
+        *returnrate = rate2;
+        *returndistortion = distortion2;
+        best_rd = this_rd;
+        vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
+        vpx_memcpy(&best_partition, x->partition_info, sizeof(PARTITION_INFO));
 
-          if ((this_mode == B_PRED)
-              || (this_mode == I8X8_PRED)
-              || (this_mode == SPLITMV))
-            for (i = 0; i < 16; i++) {
-              best_bmodes[i] = xd->block[i].bmi;
-            }
-        }
-
-        // Testing this mode gave rise to an improvement in best error score.
-        // Lower threshold a bit for next time
-        cpi->rd_thresh_mult[mode_index] =
-            (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
-            cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-        cpi->rd_threshes[mode_index] =
-            (cpi->rd_baseline_thresh[mode_index] >> 7) *
-            cpi->rd_thresh_mult[mode_index];
+        if ((this_mode == B_PRED)
+            || (this_mode == I8X8_PRED)
+            || (this_mode == SPLITMV))
+          for (i = 0; i < 16; i++) {
+            best_bmodes[i] = xd->block[i].bmi;
+          }
       }
+
+      // Testing this mode gave rise to an improvement in best error score.
+      // Lower threshold a bit for next time
+      cpi->rd_thresh_mult[mode_index] =
+          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+          cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7) *
+          cpi->rd_thresh_mult[mode_index];
+    } else {
       // If the mode did not help improve the best error case then raise the
       // threshold for testing that mode next time around.
-      else {
-        cpi->rd_thresh_mult[mode_index] += 4;
+      cpi->rd_thresh_mult[mode_index] += 4;
 
-        if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
-          cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+      if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+        cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
 
-        cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
-      }
+      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7)
+          * cpi->rd_thresh_mult[mode_index];
+    }
 
-      /* keep record of best compound/single-only prediction */
-      if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
-        int64_t single_rd, hybrid_rd;
-        int single_rate, hybrid_rate;
+    /* keep record of best compound/single-only prediction */
+    if (!disable_skip && mbmi->ref_frame != INTRA_FRAME) {
+      int64_t single_rd, hybrid_rd;
+      int single_rate, hybrid_rate;
 
-        if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
-          single_rate = rate2 - compmode_cost;
-          hybrid_rate = rate2;
-        } else {
-          single_rate = rate2;
-          hybrid_rate = rate2 + compmode_cost;
-        }
+      if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+        single_rate = rate2 - compmode_cost;
+        hybrid_rate = rate2;
+      } else {
+        single_rate = rate2;
+        hybrid_rate = rate2 + compmode_cost;
+      }
 
-        single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
-        hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
+      single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
+      hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
 
-        if (mbmi->second_ref_frame <= INTRA_FRAME &&
-            single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
-          best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
-        } else if (mbmi->second_ref_frame > INTRA_FRAME &&
-                   single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
-          best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
-        }
-        if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
-          best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+      if (mbmi->second_ref_frame <= INTRA_FRAME &&
+          single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
+        best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+      } else if (mbmi->second_ref_frame > INTRA_FRAME &&
+                 single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
+        best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
       }
+      if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
+        best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+    }
 
-      /* keep record of best txfm size */
-      if (!mode_excluded && this_rd != INT64_MAX) {
-        for (i = 0; i < NB_TXFM_MODES; i++) {
-          int64_t adj_rd;
-          if (this_mode != B_PRED) {
-            const int64_t txfm_mode_diff =
-                txfm_cache[i] - txfm_cache[cm->txfm_mode];
-            adj_rd = this_rd + txfm_mode_diff;
-          } else {
-            adj_rd = this_rd;
-          }
-          if (adj_rd < best_txfm_rd[i])
-            best_txfm_rd[i] = adj_rd;
+    /* keep record of best txfm size */
+    if (!mode_excluded && this_rd != INT64_MAX) {
+      for (i = 0; i < NB_TXFM_MODES; i++) {
+        int64_t adj_rd;
+        if (this_mode != B_PRED) {
+          const int64_t txfm_mode_diff =
+              txfm_cache[i] - txfm_cache[cm->txfm_mode];
+          adj_rd = this_rd + txfm_mode_diff;
+        } else {
+          adj_rd = this_rd;
         }
+        if (adj_rd < best_txfm_rd[i])
+          best_txfm_rd[i] = adj_rd;
       }
-#if CONFIG_PRED_FILTER
     }
-#endif
 
     if (x->skip && !mode_excluded)
       break;
-  }
+    }
 
-#if CONFIG_PRED_FILTER
-  // Update counts for prediction filter usage
-  if (best_filter_state != 0)
-    ++cpi->pred_filter_on_count;
-  else
-    ++cpi->pred_filter_off_count;
-#endif
+  assert((cm->mcomp_filter_type == SWITCHABLE) ||
+         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
+         (best_mbmode.mode <= B_PRED));
+
 #if CONFIG_COMP_INTERINTRA_PRED
   ++cpi->interintra_select_count[is_best_interintra];
 #endif
 
+  // Accumulate filter usage stats
+  // TODO(agrange): Use RD criteria to select interpolation filter mode.
+  if ((best_mode >= NEARESTMV) && (best_mode <= SPLITMV))
+    ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
+
   // Reduce the activation RD thresholds for the best choice mode
   if ((cpi->rd_baseline_thresh[best_mode_index] > 0) &&
       (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2))) {
@@ -4066,7 +4214,7 @@
       (cpi->oxcf.arnr_max_frames == 0) &&
       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {
     mbmi->mode = ZEROMV;
-    if (cm->txfm_mode != TX_MODE_SELECT)
+    if (cm->txfm_mode <= ALLOW_8X8)
       mbmi->txfm_size = cm->txfm_mode;
     else
       mbmi->txfm_size = TX_16X16;
@@ -4126,32 +4274,28 @@
   }
 
 end:
-  store_coding_context(
-      x, &x->mb_context[xd->mb_index], best_mode_index, &best_partition,
-      &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame],
-      &frame_best_ref_mv[xd->mode_info_context->mbmi.second_ref_frame < 0 ?
-                         0 : xd->mode_info_context->mbmi.second_ref_frame],
-      best_pred_diff, best_txfm_diff);
+  store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],
+                       best_mode_index, &best_partition,
+                       &mbmi->ref_mvs[mbmi->ref_frame][0],
+                       &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
+                                      mbmi->second_ref_frame][0],
+                       best_pred_diff, best_txfm_diff);
 }
 
-#if CONFIG_SUPERBLOCKS
-void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                               int *returnrate,
-                               int *returndist) {
+void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                 int *returnrate,
+                                 int *returndist) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
-  int rate_y, rate_uv;
-  int rate_y_tokenonly, rate_uv_tokenonly;
-  int error_y, error_uv;
-  int dist_y, dist_uv;
-  int y_skip, uv_skip;
+  int rate_y = 0, rate_uv;
+  int rate_y_tokenonly = 0, rate_uv_tokenonly;
+  int dist_y = 0, dist_uv;
+  int y_skip = 0, uv_skip;
   int64_t txfm_cache[NB_TXFM_MODES];
 
-  xd->mode_info_context->mbmi.txfm_size = TX_8X8;
-
-  error_y = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+  rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
                                    &dist_y, &y_skip, txfm_cache);
-  error_uv = rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+  rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
                                      &dist_uv, &uv_skip);
 
   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
@@ -4165,8 +4309,35 @@
     *returndist = dist_y + (dist_uv >> 2);
   }
 }
-#endif
 
+void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                 int *returnrate,
+                                 int *returndist) {
+  VP9_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  int rate_y = 0, rate_uv;
+  int rate_y_tokenonly = 0, rate_uv_tokenonly;
+  int dist_y = 0, dist_uv;
+  int y_skip = 0, uv_skip;
+  int64_t txfm_cache[NB_TXFM_MODES];
+
+  rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
+                                     &dist_y, &y_skip, txfm_cache);
+  rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+                                       &dist_uv, &uv_skip);
+
+  if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {
+    *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
+    vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
+    *returndist = dist_y + (dist_uv >> 2);
+  } else {
+    *returnrate = rate_y + rate_uv;
+    if (cm->mb_no_coeff_skip)
+      *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
+    *returndist = dist_y + (dist_uv >> 2);
+  }
+}
+
 void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             int *returnrate, int *returndist) {
   VP9_COMMON *cm = &cpi->common;
@@ -4173,10 +4344,6 @@
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
   int64_t error4x4, error16x16;
-#if CONFIG_COMP_INTRA_PRED
-  int64_t error4x4d;
-  int rate4x4d, dist4x4d;
-#endif
   int rate4x4, rate16x16 = 0, rateuv, rateuv8x8;
   int dist4x4 = 0, dist16x16 = 0, distuv = 0, distuv8x8 = 0;
   int rate;
@@ -4187,7 +4354,7 @@
   int rate8x8_tokenonly=0;
   int rate8x8, dist8x8;
   int mode16x16;
-  int mode8x8[2][4];
+  int mode8x8[4];
   int dist;
   int modeuv, uv_intra_skippable, uv_intra_skippable_8x8;
   int y_intra16x16_skippable = 0;
@@ -4220,29 +4387,15 @@
   mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;
   error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,
                                        &dist8x8, error16x16);
-  mode8x8[0][0]= xd->mode_info_context->bmi[0].as_mode.first;
-  mode8x8[0][1]= xd->mode_info_context->bmi[2].as_mode.first;
-  mode8x8[0][2]= xd->mode_info_context->bmi[8].as_mode.first;
-  mode8x8[0][3]= xd->mode_info_context->bmi[10].as_mode.first;
-#if CONFIG_COMP_INTRA_PRED
-  mode8x8[1][0] = xd->mode_info_context->bmi[0].as_mode.second;
-  mode8x8[1][1] = xd->mode_info_context->bmi[2].as_mode.second;
-  mode8x8[1][2] = xd->mode_info_context->bmi[8].as_mode.second;
-  mode8x8[1][3] = xd->mode_info_context->bmi[10].as_mode.second;
-#endif
+  mode8x8[0]= xd->mode_info_context->bmi[0].as_mode.first;
+  mode8x8[1]= xd->mode_info_context->bmi[2].as_mode.first;
+  mode8x8[2]= xd->mode_info_context->bmi[8].as_mode.first;
+  mode8x8[3]= xd->mode_info_context->bmi[10].as_mode.first;
 
   error4x4 = rd_pick_intra4x4mby_modes(cpi, x,
                                        &rate4x4, &rate4x4_tokenonly,
                                        &dist4x4, error16x16,
-#if CONFIG_COMP_INTRA_PRED
-                                       0,
-#endif
-                                       0);
-#if CONFIG_COMP_INTRA_PRED
-  error4x4d = rd_pick_intra4x4mby_modes(cpi, x,
-                                        &rate4x4d, &rate4x4_tokenonly,
-                                        &dist4x4d, error16x16, 1, 0);
-#endif
+                                       cpi->update_context);
 
   mbmi->mb_skip_coeff = 0;
   if (cpi->common.mb_no_coeff_skip &&
@@ -4253,27 +4406,18 @@
     rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +
            vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);
     dist = dist16x16 + (distuv8x8 >> 2);
+
     mbmi->txfm_size = txfm_size_16x16;
-    memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-           sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+    memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+           sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
   } else if (error8x8 > error16x16) {
     if (error4x4 < error16x16) {
-      rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
-      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
-      if (error4x4d >= error4x4) // FIXME save original modes etc.
-        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
-                                             &rate4x4_tokenonly,
-                                             &dist4x4, error16x16, 0,
-                                             cpi->update_context);
-#else
-      rate += rate4x4;
-#endif
+      rate = rateuv + rate4x4;
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     } else {
       mbmi->txfm_size = txfm_size_16x16;
       mbmi->mode = mode16x16;
@@ -4280,7 +4424,8 @@
       rate = rate16x16 + rateuv8x8;
       dist = dist16x16 + (distuv8x8 >> 2);
       for (i = 0; i < NB_TXFM_MODES; i++) {
-        x->mb_context[xd->mb_index].txfm_rd_diff[i] = error16x16 - txfm_cache[i];
+        x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =
+            error16x16 - txfm_cache[i];
       }
     }
     if (cpi->common.mb_no_coeff_skip)
@@ -4287,22 +4432,12 @@
       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
   } else {
     if (error4x4 < error8x8) {
-      rate = rateuv;
-#if CONFIG_COMP_INTRA_PRED
-      rate += (error4x4d < error4x4) ? rate4x4d : rate4x4;
-      if (error4x4d >= error4x4) // FIXME save original modes etc.
-        error4x4 = rd_pick_intra4x4mby_modes(cpi, x, &rate4x4,
-                                             &rate4x4_tokenonly,
-                                             &dist4x4, error16x16, 0,
-                                             cpi->update_context);
-#else
-      rate += rate4x4;
-#endif
+      rate = rateuv + rate4x4;
       mbmi->mode = B_PRED;
       mbmi->txfm_size = TX_4X4;
       dist = dist4x4 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     } else {
       // FIXME(rbultje) support transform-size selection
       mbmi->mode = I8X8_PRED;
@@ -4310,8 +4445,8 @@
       set_i8x8_block_modes(x, mode8x8);
       rate = rate8x8 + rateuv;
       dist = dist8x8 + (distuv >> 2);
-      memset(x->mb_context[xd->mb_index].txfm_rd_diff, 0,
-             sizeof(x->mb_context[xd->mb_index].txfm_rd_diff));
+      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,
+             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));
     }
     if (cpi->common.mb_no_coeff_skip)
       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);
@@ -4321,24 +4456,24 @@
   *returndist = dist;
 }
 
-#if CONFIG_SUPERBLOCKS
-int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                  int recon_yoffset, int recon_uvoffset,
-                                  int *returnrate, int *returndistortion) {
+static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+                                         int recon_yoffset, int recon_uvoffset,
+                                         int *returnrate,
+                                         int *returndistortion,
+                                         int block_size) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   MB_PREDICTION_MODE this_mode;
+  MB_PREDICTION_MODE best_mode = DC_PRED;
   MV_REFERENCE_FRAME ref_frame;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  int_mv frame_best_ref_mv[MAX_REF_FRAMES];
-  int_mv mv_search_ref[MAX_REF_FRAMES];
   int frame_mdcounts[4][4];
-  unsigned char *y_buffer[4];
-  unsigned char *u_buffer[4];
-  unsigned char *v_buffer[4];
+  uint8_t *y_buffer[4];
+  uint8_t *u_buffer[4];
+  uint8_t *v_buffer[4];
   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                     VP9_ALT_FLAG };
   int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,
@@ -4347,7 +4482,6 @@
   int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
   int saddone = 0;
   int64_t best_rd = INT64_MAX;
-  int64_t best_yrd = INT64_MAX;
   int64_t best_txfm_rd[NB_TXFM_MODES];
   int64_t best_txfm_diff[NB_TXFM_MODES];
   int64_t best_pred_diff[NB_PREDICTION_TYPES];
@@ -4361,11 +4495,15 @@
   int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;
 #endif
   int64_t best_overall_rd = INT64_MAX;
+  INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
   int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0,
       rate_uv_tokenonly_8x8 = 0;
   int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;
   MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;
   int switchable_filter_index = 0;
+  int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;
+  int dist_uv_16x16 = 0, uv_skip_16x16 = 0;
+  MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;
 
   x->skip = 0;
   xd->mode_info_context->mbmi.segment_id = segment_id;
@@ -4379,28 +4517,58 @@
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
-      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, BLOCK_32X32,
+      setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
                          recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],
-                         frame_mv[NEARMV], frame_best_ref_mv, mv_search_ref,
-                         frame_mdcounts, y_buffer, u_buffer, v_buffer);
+                         frame_mv[NEARMV], frame_mdcounts,
+                         y_buffer, u_buffer, v_buffer);
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
-  mbmi->mode = DC_PRED;
-  if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
-    mbmi->txfm_size = TX_4X4;
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
-                            &dist_uv_4x4, &uv_skip_4x4);
-    mode_uv_4x4 = mbmi->uv_mode;
+  if (block_size == BLOCK_64X64) {
+    mbmi->mode = DC_PRED;
+    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
+                                &dist_uv_4x4, &uv_skip_4x4);
+      mode_uv_4x4 = mbmi->uv_mode;
+    }
+    if (cm->txfm_mode != ONLY_4X4) {
+      mbmi->txfm_size = TX_8X8;
+      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
+                                &dist_uv_8x8, &uv_skip_8x8);
+      mode_uv_8x8 = mbmi->uv_mode;
+    }
+    if (cm->txfm_mode >= ALLOW_32X32) {
+      mbmi->txfm_size = TX_32X32;
+      rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv_16x16,
+                                &rate_uv_tokenonly_16x16,
+                                &dist_uv_16x16, &uv_skip_16x16);
+      mode_uv_16x16 = mbmi->uv_mode;
+    }
+  } else {
+    assert(block_size == BLOCK_32X32);
+    mbmi->mode = DC_PRED;
+    if (cm->txfm_mode == ONLY_4X4 || cm->txfm_mode == TX_MODE_SELECT) {
+      mbmi->txfm_size = TX_4X4;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_4x4, &rate_uv_tokenonly_4x4,
+                              &dist_uv_4x4, &uv_skip_4x4);
+      mode_uv_4x4 = mbmi->uv_mode;
+    }
+    if (cm->txfm_mode != ONLY_4X4) {
+      mbmi->txfm_size = TX_8X8;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
+                              &dist_uv_8x8, &uv_skip_8x8);
+      mode_uv_8x8 = mbmi->uv_mode;
+    }
+    if (cm->txfm_mode >= ALLOW_32X32) {
+      mbmi->txfm_size = TX_32X32;
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_16x16, &rate_uv_tokenonly_16x16,
+                              &dist_uv_16x16, &uv_skip_16x16);
+      mode_uv_16x16 = mbmi->uv_mode;
+    }
   }
-  if (cm->txfm_mode != ONLY_4X4) {
-    mbmi->txfm_size = TX_8X8;
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_8x8, &rate_uv_tokenonly_8x8,
-                            &dist_uv_8x8, &uv_skip_8x8);
-    mode_uv_8x8 = mbmi->uv_mode;
-  }
 
   for (mode_index = 0; mode_index < MAX_MODES;
        mode_index += (!switchable_filter_index)) {
@@ -4420,6 +4588,7 @@
     // Test best rd so far against threshold for trying this mode.
     if (best_rd <= cpi->rd_threshes[mode_index] ||
         cpi->rd_threshes[mode_index] == INT_MAX) {
+      switchable_filter_index = 0;
       continue;
     }
 
@@ -4434,24 +4603,23 @@
     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;
     mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
-#if CONFIG_COMP_INTRA_PRED
-    mbmi->second_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-    mbmi->second_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
-#endif
 #if CONFIG_COMP_INTERINTRA_PRED
     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
     mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);
 #endif
-    if (cpi->common.mcomp_filter_type == SWITCHABLE &&
-        this_mode >= NEARESTMV && this_mode <= SPLITMV) {
+    // Evaluate all sub-pel filters irrespective of whether we can use
+    // them for this frame.
+    if (this_mode >= NEARESTMV && this_mode <= SPLITMV) {
       mbmi->interp_filter =
           vp9_switchable_interp[switchable_filter_index++];
       if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)
         switchable_filter_index = 0;
-    } else {
-      mbmi->interp_filter = cpi->common.mcomp_filter_type;
+      if ((cm->mcomp_filter_type != SWITCHABLE) &&
+          (cm->mcomp_filter_type != mbmi->interp_filter)) {
+        mode_excluded = 1;
+      }
+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
     }
-    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
     // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
     //  continue;
@@ -4476,15 +4644,19 @@
       xd->second_pre.y_buffer = y_buffer[second_ref];
       xd->second_pre.u_buffer = u_buffer[second_ref];
       xd->second_pre.v_buffer = v_buffer[second_ref];
-      mode_excluded = cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+      mode_excluded =
+          mode_excluded ?
+              mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
     } else {
       // mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
       if (ref_frame != INTRA_FRAME) {
         if (mbmi->second_ref_frame != INTRA_FRAME)
-          mode_excluded = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+          mode_excluded =
+              mode_excluded ?
+                  mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
 #if CONFIG_COMP_INTERINTRA_PRED
         else
-          mode_excluded = !cm->use_interintra;
+          mode_excluded = mode_excluded ? mode_excluded : !cm->use_interintra;
 #endif
       }
     }
@@ -4520,14 +4692,26 @@
     }
 
     if (ref_frame == INTRA_FRAME) {
-      vp9_build_intra_predictors_sby_s(xd);
-      super_block_yrd(cpi, x, &rate_y, &distortion_y,
-                      &skippable, txfm_cache);
+      if (block_size == BLOCK_64X64) {
+        vp9_build_intra_predictors_sb64y_s(xd);
+        super_block_64_yrd(cpi, x, &rate_y, &distortion_y,
+                           &skippable, txfm_cache);
+      } else {
+        assert(block_size == BLOCK_32X32);
+        vp9_build_intra_predictors_sby_s(xd);
+        super_block_yrd(cpi, x, &rate_y, &distortion_y,
+                        &skippable, txfm_cache);
+      }
       if (mbmi->txfm_size == TX_4X4) {
         rate_uv = rate_uv_4x4;
         distortion_uv = dist_uv_4x4;
         skippable = skippable && uv_skip_4x4;
         mbmi->uv_mode = mode_uv_4x4;
+      } else if (mbmi->txfm_size == TX_32X32) {
+        rate_uv = rate_uv_16x16;
+        distortion_uv = dist_uv_16x16;
+        skippable = skippable && uv_skip_16x16;
+        mbmi->uv_mode = mode_uv_16x16;
       } else {
         rate_uv = rate_uv_8x8;
         distortion_uv = dist_uv_8x8;
@@ -4549,7 +4733,7 @@
 #endif
       }
 #endif
-      this_rd = handle_inter_mode(cpi, x, BLOCK_32X32,
+      this_rd = handle_inter_mode(cpi, x, block_size,
                                   &saddone, near_sadidx, mdcounts, txfm_cache,
                                   &rate2, &distortion2, &skippable,
                                   &compmode_cost,
@@ -4559,8 +4743,7 @@
                                   &rate_y, &distortion_y,
                                   &rate_uv, &distortion_uv,
                                   &mode_excluded, &disable_skip, recon_yoffset,
-                                  mode_index, frame_mv, frame_best_ref_mv,
-                                  mv_search_ref);
+                                  mode_index, frame_mv);
       if (this_rd == INT64_MAX)
         continue;
     }
@@ -4649,6 +4832,8 @@
 
     if (this_rd < best_overall_rd) {
       best_overall_rd = this_rd;
+      best_filter = mbmi->interp_filter;
+      best_mode = this_mode;
 #if CONFIG_COMP_INTERINTRA_PRED
       is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);
 #endif
@@ -4666,11 +4851,6 @@
         }
 
         other_cost += ref_costs[xd->mode_info_context->mbmi.ref_frame];
-
-        /* Calculate the final y RD estimate for this mode */
-        best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2 - rate_uv - other_cost),
-                          (distortion2 - distortion_uv));
-
         *returnrate = rate2;
         *returndistortion = distortion2;
         best_rd = this_rd;
@@ -4677,13 +4857,18 @@
         vpx_memcpy(&best_mbmode, mbmi, sizeof(MB_MODE_INFO));
       }
 #if 0
-      // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
-      cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+      // Testing this mode gave rise to an improvement in best error score.
+      // Lower threshold a bit for next time
+      cpi->rd_thresh_mult[mode_index] =
+          (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ?
+              cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7)
+              * cpi->rd_thresh_mult[mode_index];
 #endif
-    }
-    // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
-    else {
+    } else {
+      // If the mode did not help improve the best error case then
+      // raise the threshold for testing that mode next time around.
 #if 0
       cpi->rd_thresh_mult[mode_index] += 4;
 
@@ -4690,7 +4875,9 @@
       if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
         cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
 
-      cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+      cpi->rd_threshes[mode_index] =
+          (cpi->rd_baseline_thresh[mode_index] >> 7)
+              * cpi->rd_thresh_mult[mode_index];
 #endif
     }
 
@@ -4738,11 +4925,20 @@
       break;
   }
 
+  assert((cm->mcomp_filter_type == SWITCHABLE) ||
+         (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
+         (best_mbmode.mode <= B_PRED));
+
 #if CONFIG_COMP_INTERINTRA_PRED
   ++cpi->interintra_select_count[is_best_interintra];
   // if (is_best_interintra)  printf("best_interintra\n");
 #endif
 
+  // Accumulate filter usage stats
+  // TODO(agrange): Use RD criteria to select interpolation filter mode.
+  if ((best_mode >= NEARESTMV) && (best_mode <= SPLITMV))
+    ++cpi->best_switchable_interp_count[vp9_switchable_interp_map[best_filter]];
+
   // TODO(rbultje) integrate with RD thresholding
 #if 0
   // Reduce the activation RD thresholds for the best choice mode
@@ -4804,16 +5000,36 @@
   }
 
  end:
-  store_coding_context(x, &x->sb_context[0], best_mode_index, NULL,
-                       &frame_best_ref_mv[mbmi->ref_frame],
-                       &frame_best_ref_mv[mbmi->second_ref_frame < 0 ?
-                                          0 : mbmi->second_ref_frame],
-                       best_pred_diff, best_txfm_diff);
+  {
+    PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ?
+                            &x->sb32_context[xd->sb_index] :
+                            &x->sb64_context;
+    store_coding_context(x, p, best_mode_index, NULL,
+                         &mbmi->ref_mvs[mbmi->ref_frame][0],
+                         &mbmi->ref_mvs[mbmi->second_ref_frame < 0 ? 0 :
+                             mbmi->second_ref_frame][0],
+                         best_pred_diff, best_txfm_diff);
+  }
 
   return best_rd;
 }
-#endif
 
+int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int recon_yoffset, int recon_uvoffset,
+                                    int *returnrate,
+                                    int *returndistortion) {
+  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
+                                   returnrate, returndistortion, BLOCK_32X32);
+}
+
+int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                    int recon_yoffset, int recon_uvoffset,
+                                    int *returnrate,
+                                    int *returndistortion) {
+  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,
+                                   returnrate, returndistortion, BLOCK_64X64);
+}
+
 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
                                     int recon_yoffset,
                                     int recon_uvoffset,
@@ -4846,8 +5062,8 @@
   //    vp9_pick_inter_mode
 
   // Store metrics so they can be added in to totals if this mode is picked
-  x->mb_context[xd->mb_index].distortion  = distortion;
-  x->mb_context[xd->mb_index].intra_error = intra_error;
+  x->mb_context[xd->sb_index][xd->mb_index].distortion  = distortion;
+  x->mb_context[xd->sb_index][xd->mb_index].intra_error = intra_error;
 
   *totalrate = rate;
   *totaldist = distortion;
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -22,20 +22,27 @@
 extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                    int *r, int *d);
 
-extern void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                      int *r, int *d);
+extern void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                        int *r, int *d);
 
+extern void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                        int *r, int *d);
+
 extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,
-                                           int recon_yoffset,
-                                           int recon_uvoffset, int *r, int *d);
+                                           int ref_yoffset, int ref_uvoffset,
+                                           int *r, int *d);
 
-extern int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
-                                         int recon_yoffset, int recon_uvoffset,
-                                         int *returnrate, int *returndist);
+extern int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int ref_yoffset, int ref_uvoffset,
+                                           int *r, int *d);
 
+extern int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,
+                                           int ref_yoffset, int ref_uvoffset,
+                                           int *r, int *d);
+
 extern void vp9_init_me_luts();
 
 extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,
                                    MB_PREDICTION_MODE mb, int_mv *mv);
 
-#endif
+#endif  // VP9_ENCODER_VP9_RDOPT_H_
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -14,25 +14,33 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-unsigned int vp9_sad32x32_c(const unsigned char *src_ptr,
+unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,
                             int  src_stride,
-                            const unsigned char *ref_ptr,
+                            const uint8_t *ref_ptr,
                             int  ref_stride,
                             int max_sad) {
+  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);
+}
+
+unsigned int vp9_sad32x32_c(const uint8_t *src_ptr,
+                            int  src_stride,
+                            const uint8_t *ref_ptr,
+                            int  ref_stride,
+                            int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);
 }
 
-unsigned int vp9_sad16x16_c(const unsigned char *src_ptr,
+unsigned int vp9_sad16x16_c(const uint8_t *src_ptr,
                             int  src_stride,
-                            const unsigned char *ref_ptr,
+                            const uint8_t *ref_ptr,
                             int  ref_stride,
                             int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);
 }
 
-unsigned int vp9_sad8x8_c(const unsigned char *src_ptr,
+unsigned int vp9_sad8x8_c(const uint8_t *src_ptr,
                           int  src_stride,
-                          const unsigned char *ref_ptr,
+                          const uint8_t *ref_ptr,
                           int  ref_stride,
                           int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
@@ -39,17 +47,17 @@
 }
 
 
-unsigned int vp9_sad16x8_c(const unsigned char *src_ptr,
+unsigned int vp9_sad16x8_c(const uint8_t *src_ptr,
                            int  src_stride,
-                           const unsigned char *ref_ptr,
+                           const uint8_t *ref_ptr,
                            int  ref_stride,
                            int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
 }
 
-unsigned int vp9_sad8x16_c(const unsigned char *src_ptr,
+unsigned int vp9_sad8x16_c(const uint8_t *src_ptr,
                            int  src_stride,
-                           const unsigned char *ref_ptr,
+                           const uint8_t *ref_ptr,
                            int  ref_stride,
                            int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
@@ -56,20 +64,32 @@
 }
 
 
-unsigned int vp9_sad4x4_c(const unsigned char *src_ptr,
+unsigned int vp9_sad4x4_c(const uint8_t *src_ptr,
                           int  src_stride,
-                          const unsigned char *ref_ptr,
+                          const uint8_t *ref_ptr,
                           int  ref_stride,
                           int max_sad) {
   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
 }
 
-void vp9_sad32x32x3_c(const unsigned char *src_ptr,
+void vp9_sad64x64x3_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      const unsigned char *ref_ptr,
+                      const uint8_t *ref_ptr,
                       int  ref_stride,
-                      unsigned int *sad_array
-                      ) {
+                      unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr, ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr + 1, ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp9_sad32x32x3_c(const uint8_t *src_ptr,
+                      int  src_stride,
+                      const uint8_t *ref_ptr,
+                      int  ref_stride,
+                      unsigned int *sad_array) {
   sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
                                 ref_ptr, ref_stride, 0x7fffffff);
   sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
@@ -78,42 +98,72 @@
                                 ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
-void vp9_sad32x32x8_c(const unsigned char *src_ptr,
+void vp9_sad64x64x8_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      const unsigned char *ref_ptr,
+                      const uint8_t *ref_ptr,
                       int  ref_stride,
-                      unsigned short *sad_array
-                      ) {
-  sad_array[0] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr, ref_stride,
-                                                0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 1, ref_stride,
-                                                0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 2, ref_stride,
-                                                0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 3, ref_stride,
-                                                0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 4, ref_stride,
-                                                0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 5, ref_stride,
-                                                0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 6, ref_stride,
-                                                0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad32x32_c(src_ptr, src_stride,
-                                                ref_ptr + 7, ref_stride,
-                                                0x7fffffff);
+                      uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr, ref_stride,
+                                          0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 1, ref_stride,
+                                          0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 2, ref_stride,
+                                          0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 3, ref_stride,
+                                          0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 4, ref_stride,
+                                          0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 5, ref_stride,
+                                          0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 6, ref_stride,
+                                          0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,
+                                          ref_ptr + 7, ref_stride,
+                                          0x7fffffff);
 }
 
-void vp9_sad16x16x3_c(const unsigned char *src_ptr,
+void vp9_sad32x32x8_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      const unsigned char *ref_ptr,
+                      const uint8_t *ref_ptr,
                       int  ref_stride,
+                      uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr, ref_stride,
+                                          0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr + 1, ref_stride,
+                                          0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr + 2, ref_stride,
+                                          0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr + 3, ref_stride,
+                                          0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr + 4, ref_stride,
+                                          0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr + 5, ref_stride,
+                                          0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr + 6, ref_stride,
+                                          0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,
+                                          ref_ptr + 7, ref_stride,
+                                          0x7fffffff);
+}
+
+void vp9_sad16x16x3_c(const uint8_t *src_ptr,
+                      int  src_stride,
+                      const uint8_t *ref_ptr,
+                      int  ref_stride,
                       unsigned int *sad_array) {
   sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
                                 ref_ptr, ref_stride, 0x7fffffff);
@@ -123,40 +173,40 @@
                                 ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
-void vp9_sad16x16x8_c(const unsigned char *src_ptr,
+void vp9_sad16x16x8_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      const unsigned char *ref_ptr,
+                      const uint8_t *ref_ptr,
                       int  ref_stride,
-                      unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr, ref_stride,
-                                                0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 1, ref_stride,
-                                                0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 2, ref_stride,
-                                                0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 3, ref_stride,
-                                                0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 4, ref_stride,
-                                                0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 5, ref_stride,
-                                                0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 6, ref_stride,
-                                                0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad16x16_c(src_ptr, src_stride,
-                                                ref_ptr + 7, ref_stride,
-                                                0x7fffffff);
+                      uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr, ref_stride,
+                                          0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr + 1, ref_stride,
+                                          0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr + 2, ref_stride,
+                                          0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr + 3, ref_stride,
+                                          0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr + 4, ref_stride,
+                                          0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr + 5, ref_stride,
+                                          0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr + 6, ref_stride,
+                                          0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,
+                                          ref_ptr + 7, ref_stride,
+                                          0x7fffffff);
 }
 
-void vp9_sad16x8x3_c(const unsigned char *src_ptr,
+void vp9_sad16x8x3_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     const unsigned char *ref_ptr,
+                     const uint8_t *ref_ptr,
                      int  ref_stride,
                      unsigned int *sad_array) {
   sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
@@ -167,40 +217,40 @@
                                ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
-void vp9_sad16x8x8_c(const unsigned char *src_ptr,
+void vp9_sad16x8x8_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     const unsigned char *ref_ptr,
+                     const uint8_t *ref_ptr,
                      int  ref_stride,
-                     unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr, ref_stride,
-                                               0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 1, ref_stride,
-                                               0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 2, ref_stride,
-                                               0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 3, ref_stride,
-                                               0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 4, ref_stride,
-                                               0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 5, ref_stride,
-                                               0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 6, ref_stride,
-                                               0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad16x8_c(src_ptr, src_stride,
-                                               ref_ptr + 7, ref_stride,
-                                               0x7fffffff);
+                     uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr, ref_stride,
+                                         0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr + 1, ref_stride,
+                                         0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr + 2, ref_stride,
+                                         0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr + 3, ref_stride,
+                                         0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr + 4, ref_stride,
+                                         0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr + 5, ref_stride,
+                                         0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr + 6, ref_stride,
+                                         0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,
+                                         ref_ptr + 7, ref_stride,
+                                         0x7fffffff);
 }
 
-void vp9_sad8x8x3_c(const unsigned char *src_ptr,
+void vp9_sad8x8x3_c(const uint8_t *src_ptr,
                     int  src_stride,
-                    const unsigned char *ref_ptr,
+                    const uint8_t *ref_ptr,
                     int  ref_stride,
                     unsigned int *sad_array) {
   sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
@@ -211,40 +261,40 @@
                               ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
-void vp9_sad8x8x8_c(const unsigned char *src_ptr,
+void vp9_sad8x8x8_c(const uint8_t *src_ptr,
                     int  src_stride,
-                    const unsigned char *ref_ptr,
+                    const uint8_t *ref_ptr,
                     int  ref_stride,
-                    unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr, ref_stride,
-                                              0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 1, ref_stride,
-                                              0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 2, ref_stride,
-                                              0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 3, ref_stride,
-                                              0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 4, ref_stride,
-                                              0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 5, ref_stride,
-                                              0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 6, ref_stride,
-                                              0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad8x8_c(src_ptr, src_stride,
-                                              ref_ptr + 7, ref_stride,
-                                              0x7fffffff);
+                    uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr, ref_stride,
+                                        0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr + 1, ref_stride,
+                                        0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr + 2, ref_stride,
+                                        0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr + 3, ref_stride,
+                                        0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr + 4, ref_stride,
+                                        0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr + 5, ref_stride,
+                                        0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr + 6, ref_stride,
+                                        0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,
+                                        ref_ptr + 7, ref_stride,
+                                        0x7fffffff);
 }
 
-void vp9_sad8x16x3_c(const unsigned char *src_ptr,
+void vp9_sad8x16x3_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     const unsigned char *ref_ptr,
+                     const uint8_t *ref_ptr,
                      int  ref_stride,
                      unsigned int *sad_array) {
   sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
@@ -255,40 +305,40 @@
                                ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
-void vp9_sad8x16x8_c(const unsigned char *src_ptr,
+void vp9_sad8x16x8_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     const unsigned char *ref_ptr,
+                     const uint8_t *ref_ptr,
                      int  ref_stride,
-                     unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr, ref_stride,
-                                               0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 1, ref_stride,
-                                               0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 2, ref_stride,
-                                               0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 3, ref_stride,
-                                               0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 4, ref_stride,
-                                               0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 5, ref_stride,
-                                               0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 6, ref_stride,
-                                               0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad8x16_c(src_ptr, src_stride,
-                                               ref_ptr + 7, ref_stride,
-                                               0x7fffffff);
+                     uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr, ref_stride,
+                                         0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr + 1, ref_stride,
+                                         0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr + 2, ref_stride,
+                                         0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr + 3, ref_stride,
+                                         0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr + 4, ref_stride,
+                                         0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr + 5, ref_stride,
+                                         0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr + 6, ref_stride,
+                                         0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,
+                                         ref_ptr + 7, ref_stride,
+                                         0x7fffffff);
 }
 
-void vp9_sad4x4x3_c(const unsigned char *src_ptr,
+void vp9_sad4x4x3_c(const uint8_t *src_ptr,
                     int  src_stride,
-                    const unsigned char *ref_ptr,
+                    const uint8_t *ref_ptr,
                     int  ref_stride,
                     unsigned int *sad_array) {
   sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
@@ -299,43 +349,57 @@
                               ref_ptr + 2, ref_stride, 0x7fffffff);
 }
 
-void vp9_sad4x4x8_c(const unsigned char *src_ptr,
+void vp9_sad4x4x8_c(const uint8_t *src_ptr,
                     int  src_stride,
-                    const unsigned char *ref_ptr,
+                    const uint8_t *ref_ptr,
                     int  ref_stride,
-                    unsigned short *sad_array) {
-  sad_array[0] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr, ref_stride,
-                                              0x7fffffff);
-  sad_array[1] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 1, ref_stride,
-                                              0x7fffffff);
-  sad_array[2] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 2, ref_stride,
-                                              0x7fffffff);
-  sad_array[3] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 3, ref_stride,
-                                              0x7fffffff);
-  sad_array[4] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 4, ref_stride,
-                                              0x7fffffff);
-  sad_array[5] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 5, ref_stride,
-                                              0x7fffffff);
-  sad_array[6] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 6, ref_stride,
-                                              0x7fffffff);
-  sad_array[7] = (unsigned short)vp9_sad4x4_c(src_ptr, src_stride,
-                                              ref_ptr + 7, ref_stride,
-                                              0x7fffffff);
+                    uint16_t *sad_array) {
+  sad_array[0] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr, ref_stride,
+                                        0x7fffffff);
+  sad_array[1] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr + 1, ref_stride,
+                                        0x7fffffff);
+  sad_array[2] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr + 2, ref_stride,
+                                        0x7fffffff);
+  sad_array[3] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr + 3, ref_stride,
+                                        0x7fffffff);
+  sad_array[4] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr + 4, ref_stride,
+                                        0x7fffffff);
+  sad_array[5] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr + 5, ref_stride,
+                                        0x7fffffff);
+  sad_array[6] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr + 6, ref_stride,
+                                        0x7fffffff);
+  sad_array[7] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,
+                                        ref_ptr + 7, ref_stride,
+                                        0x7fffffff);
 }
 
-void vp9_sad32x32x4d_c(const unsigned char *src_ptr,
+void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       unsigned char *ref_ptr[],
+                       uint8_t *ref_ptr[],
                        int  ref_stride,
-                       unsigned int *sad_array
-                       ) {
+                       unsigned int *sad_array) {
+  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[0], ref_stride, 0x7fffffff);
+  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[1], ref_stride, 0x7fffffff);
+  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[2], ref_stride, 0x7fffffff);
+  sad_array[3] = vp9_sad64x64_c(src_ptr, src_stride,
+                                ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp9_sad32x32x4d_c(const uint8_t *src_ptr,
+                       int  src_stride,
+                       uint8_t *ref_ptr[],
+                       int  ref_stride,
+                       unsigned int *sad_array) {
   sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,
                                 ref_ptr[0], ref_stride, 0x7fffffff);
   sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,
@@ -346,9 +410,9 @@
                                 ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-void vp9_sad16x16x4d_c(const unsigned char *src_ptr,
+void vp9_sad16x16x4d_c(const uint8_t *src_ptr,
                        int  src_stride,
-                       unsigned char *ref_ptr[],
+                       uint8_t *ref_ptr[],
                        int  ref_stride,
                        unsigned int *sad_array) {
   sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,
@@ -361,9 +425,9 @@
                                 ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-void vp9_sad16x8x4d_c(const unsigned char *src_ptr,
+void vp9_sad16x8x4d_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      unsigned char *ref_ptr[],
+                      uint8_t *ref_ptr[],
                       int  ref_stride,
                       unsigned int *sad_array) {
   sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,
@@ -376,9 +440,9 @@
                                ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-void vp9_sad8x8x4d_c(const unsigned char *src_ptr,
+void vp9_sad8x8x4d_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     unsigned char *ref_ptr[],
+                     uint8_t *ref_ptr[],
                      int  ref_stride,
                      unsigned int *sad_array) {
   sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,
@@ -391,9 +455,9 @@
                               ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-void vp9_sad8x16x4d_c(const unsigned char *src_ptr,
+void vp9_sad8x16x4d_c(const uint8_t *src_ptr,
                       int  src_stride,
-                      unsigned char *ref_ptr[],
+                      uint8_t *ref_ptr[],
                       int  ref_stride,
                       unsigned int *sad_array) {
   sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,
@@ -406,9 +470,9 @@
                                ref_ptr[3], ref_stride, 0x7fffffff);
 }
 
-void vp9_sad4x4x4d_c(const unsigned char *src_ptr,
+void vp9_sad4x4x4d_c(const uint8_t *src_ptr,
                      int  src_stride,
-                     unsigned char *ref_ptr[],
+                     uint8_t *ref_ptr[],
                      int  ref_stride,
                      unsigned int *sad_array) {
   sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,
@@ -422,9 +486,9 @@
 }
 
 /* Copy 2 macroblocks to a buffer */
-void vp9_copy32xn_c(unsigned char *src_ptr,
+void vp9_copy32xn_c(uint8_t *src_ptr,
                     int  src_stride,
-                    unsigned char *dst_ptr,
+                    uint8_t *dst_ptr,
                     int  dst_stride,
                     int height) {
   int r;
--- a/vp9/encoder/vp9_satd_c.c
+++ b/vp9/encoder/vp9_satd_c.c
@@ -11,16 +11,17 @@
 #include <stdlib.h>
 #include "vpx_ports/mem.h"
 #include "./vp9_rtcd.h"
-unsigned int vp9_satd16x16_c(const unsigned char *src_ptr,
+
+unsigned int vp9_satd16x16_c(const uint8_t *src_ptr,
                              int  src_stride,
-                             const unsigned char *ref_ptr,
+                             const uint8_t *ref_ptr,
                              int  ref_stride,
                              unsigned int *psatd) {
   int r, c, i;
   unsigned int satd = 0;
-  DECLARE_ALIGNED(16, short, diff_in[256]);
-  DECLARE_ALIGNED(16, short, diff_out[16]);
-  short *in;
+  DECLARE_ALIGNED(16, int16_t, diff_in[256]);
+  DECLARE_ALIGNED(16, int16_t, diff_out[16]);
+  int16_t *in;
 
   for (r = 0; r < 16; r++) {
     for (c = 0; c < 16; c++) {
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -107,31 +107,15 @@
                                int *segcounts,
                                vp9_prob *segment_tree_probs) {
   int count1, count2;
-  int tot_count;
-  int i;
 
-  // Blank the strtucture to start with
-  vpx_memset(segment_tree_probs, 0,
-             MB_FEATURE_TREE_PROBS * sizeof(*segment_tree_probs));
-
   // Total count for all segments
   count1 = segcounts[0] + segcounts[1];
   count2 = segcounts[2] + segcounts[3];
-  tot_count = count1 + count2;
 
   // Work out probabilities of each segment
-  if (tot_count)
-    segment_tree_probs[0] = (count1 * 255) / tot_count;
-  if (count1 > 0)
-    segment_tree_probs[1] = (segcounts[0] * 255) / count1;
-  if (count2 > 0)
-    segment_tree_probs[2] = (segcounts[2] * 255) / count2;
-
-  // Clamp probabilities to minimum allowed value
-  for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {
-    if (segment_tree_probs[i] == 0)
-      segment_tree_probs[i] = 1;
-  }
+  segment_tree_probs[0] = get_binary_prob(count1, count2);
+  segment_tree_probs[1] = get_prob(segcounts[0], count1);
+  segment_tree_probs[2] = get_prob(segcounts[2], count2);
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
@@ -157,7 +141,46 @@
             segcounts[3] * vp9_cost_one(probs[2]);
 
   return cost;
+}
 
+static void count_segs(VP9_COMP *cpi,
+                       MODE_INFO *mi,
+                       int *no_pred_segcounts,
+                       int (*temporal_predictor_count)[2],
+                       int *t_unpred_seg_counts,
+                       int mb_size, int mb_row, int mb_col) {
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+  const int segmap_index = mb_row * cm->mb_cols + mb_col;
+  const int segment_id = mi->mbmi.segment_id;
+
+  xd->mode_info_context = mi;
+  xd->mb_to_top_edge = -((mb_row * 16) << 3);
+  xd->mb_to_left_edge = -((mb_col * 16) << 3);
+  xd->mb_to_bottom_edge = ((cm->mb_rows - mb_size - mb_row) * 16) << 3;
+  xd->mb_to_right_edge  = ((cm->mb_cols - mb_size - mb_col) * 16) << 3;
+
+  // Count the number of hits on each segment with no prediction
+  no_pred_segcounts[segment_id]++;
+
+  // Temporal prediction not allowed on key frames
+  if (cm->frame_type != KEY_FRAME) {
+    // Test to see if the segment id matches the predicted value.
+    const int seg_predicted =
+        (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
+
+    // Get the segment id prediction context
+    const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID);
+
+    // Store the prediction status for this mb and update counts
+    // as appropriate
+    vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
+    temporal_predictor_count[pred_context][seg_predicted]++;
+
+    if (!seg_predicted)
+      // Update the "unpredicted" segment count
+      t_unpred_seg_counts[segment_id]++;
+  }
 }
 
 void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
@@ -164,15 +187,11 @@
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
 
-  int i;
-  int tot_count;
   int no_pred_cost;
   int t_pred_cost = INT_MAX;
-  int pred_context;
 
+  int i;
   int mb_row, mb_col;
-  int segmap_index = 0;
-  unsigned char segment_id;
 
   int temporal_predictor_count[PREDICTION_PROBS][2];
   int no_pred_segcounts[MAX_MB_SEGMENTS];
@@ -182,9 +201,8 @@
   vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];
   vp9_prob t_nopred_prob[PREDICTION_PROBS];
 
-#if CONFIG_SUPERBLOCKS
   const int mis = cm->mode_info_stride;
-#endif
+  MODE_INFO *mi_ptr = cm->mi, *mi;
 
   // Set default state for the segment tree probabilities and the
   // temporal coding probabilities
@@ -200,87 +218,47 @@
   // First of all generate stats regarding how well the last segment map
   // predicts this one
 
-  // Initialize macroblock decoder mode info context for the first mb
-  // in the frame
-  xd->mode_info_context = cm->mi;
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {
+    mi = mi_ptr;
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {
+      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {
+        count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,
+                   t_unpred_seg_counts, 4, mb_row, mb_col);
+      } else {
+        for (i = 0; i < 4; i++) {
+          int x_idx = (i & 1) << 1, y_idx = i & 2;
+          MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;
 
-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 2) {
-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 2) {
-      for (i = 0; i < 4; i++) {
-        static const int dx[4] = { +1, -1, +1, +1 };
-        static const int dy[4] = {  0, +1,  0, -1 };
-        int x_idx = i & 1, y_idx = i >> 1;
-
-        if (mb_col + x_idx >= cm->mb_cols ||
-            mb_row + y_idx >= cm->mb_rows) {
-          goto end;
-        }
-
-        xd->mb_to_top_edge = -((mb_row * 16) << 3);
-        xd->mb_to_left_edge = -((mb_col * 16) << 3);
-
-        segmap_index = (mb_row + y_idx) * cm->mb_cols + mb_col + x_idx;
-        segment_id = xd->mode_info_context->mbmi.segment_id;
-#if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          if (mb_col + 1 < cm->mb_cols)
-            segment_id = segment_id &&
-                         xd->mode_info_context[1].mbmi.segment_id;
-          if (mb_row + 1 < cm->mb_rows) {
-            segment_id = segment_id &&
-                         xd->mode_info_context[mis].mbmi.segment_id;
-            if (mb_col + 1 < cm->mb_cols)
-              segment_id = segment_id &&
-                           xd->mode_info_context[mis + 1].mbmi.segment_id;
+          if (mb_col + x_idx >= cm->mb_cols ||
+              mb_row + y_idx >= cm->mb_rows) {
+            continue;
           }
-          xd->mb_to_bottom_edge = ((cm->mb_rows - 2 - mb_row) * 16) << 3;
-          xd->mb_to_right_edge  = ((cm->mb_cols - 2 - mb_col) * 16) << 3;
-        } else {
-#endif
-          xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
-          xd->mb_to_right_edge  = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
-#if CONFIG_SUPERBLOCKS
-        }
-#endif
 
-        // Count the number of hits on each segment with no prediction
-        no_pred_segcounts[segment_id]++;
+          if (sb_mi->mbmi.sb_type) {
+            assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);
+            count_segs(cpi, sb_mi, no_pred_segcounts, temporal_predictor_count,
+                       t_unpred_seg_counts, 2, mb_row + y_idx, mb_col + x_idx);
+          } else {
+            int j;
 
-        // Temporal prediction not allowed on key frames
-        if (cm->frame_type != KEY_FRAME) {
-          // Test to see if the segment id matches the predicted value.
-          int seg_predicted =
-            (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));
+            for (j = 0; j < 4; j++) {
+              const int x_idx_mb = x_idx + (j & 1), y_idx_mb = y_idx + (j >> 1);
+              MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;
 
-          // Get the segment id prediction context
-          pred_context =
-            vp9_get_pred_context(cm, xd, PRED_SEG_ID);
+              if (mb_col + x_idx_mb >= cm->mb_cols ||
+                  mb_row + y_idx_mb >= cm->mb_rows) {
+                continue;
+              }
 
-          // Store the prediction status for this mb and update counts
-          // as appropriate
-          vp9_set_pred_flag(xd, PRED_SEG_ID, seg_predicted);
-          temporal_predictor_count[pred_context][seg_predicted]++;
-
-          if (!seg_predicted)
-            // Update the "unpredicted" segment count
-            t_unpred_seg_counts[segment_id]++;
+              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);
+              count_segs(cpi, mb_mi, no_pred_segcounts,
+                         temporal_predictor_count, t_unpred_seg_counts,
+                         1, mb_row + y_idx_mb, mb_col + x_idx_mb);
+            }
+          }
         }
-
-#if CONFIG_SUPERBLOCKS
-        if (xd->mode_info_context->mbmi.encoded_as_sb) {
-          assert(!i);
-          xd->mode_info_context += 2;
-          break;
-        }
-#endif
-      end:
-        xd->mode_info_context += dx[i] + dy[i] * cm->mode_info_stride;
       }
     }
-
-    // this is to account for the border in mode_info_context
-    xd->mode_info_context -= mb_col;
-    xd->mode_info_context += cm->mode_info_stride * 2;
   }
 
   // Work out probability tree for coding segments without prediction
@@ -297,20 +275,8 @@
 
     // Add in the cost of the signalling for each prediction context
     for (i = 0; i < PREDICTION_PROBS; i++) {
-      tot_count = temporal_predictor_count[i][0] +
-                  temporal_predictor_count[i][1];
-
-      // Work out the context probabilities for the segment
-      // prediction flag
-      if (tot_count) {
-        t_nopred_prob[i] = (temporal_predictor_count[i][0] * 255) /
-                           tot_count;
-
-        // Clamp to minimum allowed value
-        if (t_nopred_prob[i] < 1)
-          t_nopred_prob[i] = 1;
-      } else
-        t_nopred_prob[i] = 1;
+      t_nopred_prob[i] = get_binary_prob(temporal_predictor_count[i][0],
+                                         temporal_predictor_count[i][1]);
 
       // Add in the predictor signaling cost
       t_pred_cost += (temporal_predictor_count[i][0] *
--- a/vp9/encoder/vp9_segmentation.h
+++ b/vp9/encoder/vp9_segmentation.h
@@ -43,4 +43,4 @@
 
 extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);
 
-#endif /* __INC_SEGMENTATION_H__ */
+#endif  // VP9_ENCODER_VP9_SEGMENTATION_H_
--- a/vp9/encoder/vp9_ssim.c
+++ b/vp9/encoder/vp9_ssim.c
@@ -11,7 +11,7 @@
 
 #include "vp9/encoder/vp9_onyx_int.h"
 
-void vp9_ssim_parms_16x16_c(unsigned char *s, int sp, unsigned char *r,
+void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
                             int rp, unsigned long *sum_s, unsigned long *sum_r,
                             unsigned long *sum_sq_s, unsigned long *sum_sq_r,
                             unsigned long *sum_sxr) {
@@ -26,7 +26,7 @@
     }
   }
 }
-void vp9_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
+void vp9_ssim_parms_8x8_c(uint8_t *s, int sp, uint8_t *r, int rp,
                           unsigned long *sum_s, unsigned long *sum_r,
                           unsigned long *sum_sq_s, unsigned long *sum_sq_r,
                           unsigned long *sum_sxr) {
@@ -65,13 +65,13 @@
   return ssim_n * 1.0 / ssim_d;
 }
 
-static double ssim_16x16(unsigned char *s, int sp, unsigned char *r, int rp) {
+static double ssim_16x16(uint8_t *s, int sp, uint8_t *r, int rp) {
   unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
   vp9_ssim_parms_16x16(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
                        &sum_sxr);
   return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
 }
-static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+static double ssim_8x8(uint8_t *s, int sp, uint8_t *r, int rp) {
   unsigned long sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
   vp9_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
                      &sum_sxr);
@@ -81,7 +81,7 @@
 // We are using a 8x8 moving window with starting location of each 8x8 window
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
-double vp9_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+double vp9_ssim2(uint8_t *img1, uint8_t *img2, int stride_img1,
                  int stride_img2, int width, int height) {
   int i, j;
   int samples = 0;
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -17,7 +17,7 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_extend.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/common/vp9_quant_common.h"
@@ -35,19 +35,16 @@
 #if VP9_TEMPORAL_ALT_REF
 
 
-static void temporal_filter_predictors_mb_c
-(
-  MACROBLOCKD *xd,
-  unsigned char *y_mb_ptr,
-  unsigned char *u_mb_ptr,
-  unsigned char *v_mb_ptr,
-  int stride,
-  int mv_row,
-  int mv_col,
-  unsigned char *pred
-) {
+static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
+                                            uint8_t *y_mb_ptr,
+                                            uint8_t *u_mb_ptr,
+                                            uint8_t *v_mb_ptr,
+                                            int stride,
+                                            int mv_row,
+                                            int mv_col,
+                                            uint8_t *pred) {
   int offset;
-  unsigned char *yptr, *uptr, *vptr;
+  uint8_t *yptr, *uptr, *vptr;
   int omv_row, omv_col;
 
   // Y
@@ -75,23 +72,20 @@
                            (omv_col & 15), (omv_row & 15), &pred[256], 8);
     xd->subpixel_predict8x8(vptr, stride,
                            (omv_col & 15), (omv_row & 15), &pred[320], 8);
-  }
-  else {
+  } else {
     vp9_copy_mem8x8(uptr, stride, &pred[256], 8);
     vp9_copy_mem8x8(vptr, stride, &pred[320], 8);
   }
 }
-void vp9_temporal_filter_apply_c
-(
-  unsigned char *frame1,
-  unsigned int stride,
-  unsigned char *frame2,
-  unsigned int block_size,
-  int strength,
-  int filter_weight,
-  unsigned int *accumulator,
-  unsigned short *count
-) {
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1,
+                                 unsigned int stride,
+                                 uint8_t *frame2,
+                                 unsigned int block_size,
+                                 int strength,
+                                 int filter_weight,
+                                 unsigned int *accumulator,
+                                 uint16_t *count) {
   unsigned int i, j, k;
   int modifier;
   int byte = 0;
@@ -129,14 +123,11 @@
 
 #if ALT_REF_MC_ENABLED
 
-static int temporal_filter_find_matching_mb_c
-(
-  VP9_COMP *cpi,
-  YV12_BUFFER_CONFIG *arf_frame,
-  YV12_BUFFER_CONFIG *frame_ptr,
-  int mb_offset,
-  int error_thresh
-) {
+static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
+                                              YV12_BUFFER_CONFIG *arf_frame,
+                                              YV12_BUFFER_CONFIG *frame_ptr,
+                                              int mb_offset,
+                                              int error_thresh) {
   MACROBLOCK *x = &cpi->mb;
   int step_param;
   int sadpb = x->sadperbit16;
@@ -148,10 +139,10 @@
   int_mv best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
 
   // Save input state
-  unsigned char **base_src = b->base_src;
+  uint8_t **base_src = b->base_src;
   int src = b->src;
   int src_stride = b->src_stride;
-  unsigned char **base_pre = d->base_pre;
+  uint8_t **base_pre = d->base_pre;
   int pre = d->pre;
   int pre_stride = d->pre_stride;
 
@@ -212,13 +203,10 @@
 }
 #endif
 
-static void temporal_filter_iterate_c
-(
-  VP9_COMP *cpi,
-  int frame_count,
-  int alt_ref_index,
-  int strength
-) {
+static void temporal_filter_iterate_c(VP9_COMP *cpi,
+                                      int frame_count,
+                                      int alt_ref_index,
+                                      int strength) {
   int byte;
   int frame;
   int mb_col, mb_row;
@@ -228,16 +216,16 @@
   int mb_y_offset = 0;
   int mb_uv_offset = 0;
   DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 + 8 * 8 + 8 * 8);
-  DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16 * 16 + 8 * 8 + 8 * 8);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 + 8 * 8 + 8 * 8);
   MACROBLOCKD *mbd = &cpi->mb.e_mbd;
   YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
-  unsigned char *dst1, *dst2;
-  DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
+  uint8_t *dst1, *dst2;
+  DECLARE_ALIGNED_ARRAY(16, uint8_t,  predictor, 16 * 16 + 8 * 8 + 8 * 8);
 
   // Save input state
-  unsigned char *y_buffer = mbd->pre.y_buffer;
-  unsigned char *u_buffer = mbd->pre.u_buffer;
-  unsigned char *v_buffer = mbd->pre.v_buffer;
+  uint8_t *y_buffer = mbd->pre.y_buffer;
+  uint8_t *u_buffer = mbd->pre.u_buffer;
+  uint8_t *v_buffer = mbd->pre.v_buffer;
 
   for (mb_row = 0; mb_row < mb_rows; mb_row++) {
 #if ALT_REF_MC_ENABLED
@@ -262,7 +250,7 @@
       int stride;
 
       vpx_memset(accumulator, 0, 384 * sizeof(unsigned int));
-      vpx_memset(count, 0, 384 * sizeof(unsigned short));
+      vpx_memset(count, 0, 384 * sizeof(uint16_t));
 
 #if ALT_REF_MC_ENABLED
       cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
@@ -337,7 +325,7 @@
           pval *= cpi->fixed_divide[count[k]];
           pval >>= 19;
 
-          dst1[byte] = (unsigned char)pval;
+          dst1[byte] = (uint8_t)pval;
 
           // move to next pixel
           byte++;
@@ -358,13 +346,13 @@
           unsigned int pval = accumulator[k] + (count[k] >> 1);
           pval *= cpi->fixed_divide[count[k]];
           pval >>= 19;
-          dst1[byte] = (unsigned char)pval;
+          dst1[byte] = (uint8_t)pval;
 
           // V
           pval = accumulator[m] + (count[m] >> 1);
           pval *= cpi->fixed_divide[count[m]];
           pval >>= 19;
-          dst2[byte] = (unsigned char)pval;
+          dst2[byte] = (uint8_t)pval;
 
           // move to next pixel
           byte++;
--- a/vp9/encoder/vp9_temporal_filter.h
+++ b/vp9/encoder/vp9_temporal_filter.h
@@ -13,4 +13,4 @@
 
 extern void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);
 
-#endif
+#endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -25,27 +25,21 @@
    compressions, then generating vp9_context.c = initial stats. */
 
 #ifdef ENTROPY_STATS
-INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
+vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4];
+vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4];
+vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8];
+vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8];
+vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16];
+vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16];
+vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];
 
-INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-INT64 hybrid_context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];
-
-extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int hybrid_tree_update_hist[BLOCK_TYPES][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];
-extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
-extern unsigned int hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];
+extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4];
+extern vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4];
+extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8];
+extern vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8];
+extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16];
+extern vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16];
+extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];
 #endif  /* ENTROPY_STATS */
 
 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
@@ -106,54 +100,78 @@
   vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;
 }
 
+#if CONFIG_NEWCOEFCONTEXT
+#define PT pn
+#else
+#define PT pt
+#endif
+
 static void tokenize_b(VP9_COMP *cpi,
                        MACROBLOCKD *xd,
-                       const BLOCKD * const b,
+                       const int ib,
                        TOKENEXTRA **tp,
                        PLANE_TYPE type,
-                       ENTROPY_CONTEXT *a,
-                       ENTROPY_CONTEXT *l,
                        TX_SIZE tx_size,
                        int dry_run) {
   int pt; /* near block/prev token context index */
   int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;
+  const BLOCKD * const b = xd->block + ib;
   const int eob = b->eob;     /* one beyond last nonzero coeff */
   TOKENEXTRA *t = *tp;        /* store tokens starting here */
-  const short *qcoeff_ptr = b->qcoeff;
+  int16_t *qcoeff_ptr = b->qcoeff;
   int seg_eob;
-  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  const int segment_id = xd->mode_info_context->mbmi.segment_id;
   const int *bands, *scan;
-  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_coeff_count *counts;
+  vp9_coeff_probs *probs;
   const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
                           get_tx_type(xd, b) : DCT_DCT;
+#if CONFIG_NEWCOEFCONTEXT
+  const int *neighbors;
+  int pn;
+#endif
 
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +
+      vp9_block2above[tx_size][ib];
+  ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context +
+      vp9_block2left[tx_size][ib];
+  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+
+  ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) +
+      vp9_block2above[tx_size][ib];
+  ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) +
+      vp9_block2left[tx_size][ib];
+
+
   switch (tx_size) {
     default:
     case TX_4X4:
       seg_eob = 16;
-      bands = vp9_coef_bands;
-      scan = vp9_default_zig_zag1d;
+      bands = vp9_coef_bands_4x4;
+      scan = vp9_default_zig_zag1d_4x4;
       if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts;
-        probs = cpi->common.fc.hybrid_coef_probs;
+        counts = cpi->hybrid_coef_counts_4x4;
+        probs = cpi->common.fc.hybrid_coef_probs_4x4;
         if (tx_type == ADST_DCT) {
-          scan = vp9_row_scan;
+          scan = vp9_row_scan_4x4;
         } else if (tx_type == DCT_ADST) {
-          scan = vp9_col_scan;
+          scan = vp9_col_scan_4x4;
         }
       } else {
-        counts = cpi->coef_counts;
-        probs = cpi->common.fc.coef_probs;
+        counts = cpi->coef_counts_4x4;
+        probs = cpi->common.fc.coef_probs_4x4;
       }
       break;
     case TX_8X8:
       if (type == PLANE_TYPE_Y2) {
         seg_eob = 4;
-        bands = vp9_coef_bands;
-        scan = vp9_default_zig_zag1d;
+        bands = vp9_coef_bands_4x4;
+        scan = vp9_default_zig_zag1d_4x4;
       } else {
+#if CONFIG_CNVCONTEXT
+        a_ec = (a[0] + a[1]) != 0;
+        l_ec = (l[0] + l[1]) != 0;
+#endif
         seg_eob = 64;
         bands = vp9_coef_bands_8x8;
         scan = vp9_default_zig_zag1d_8x8;
@@ -167,6 +185,15 @@
       }
       break;
     case TX_16X16:
+#if CONFIG_CNVCONTEXT
+      if (type != PLANE_TYPE_UV) {
+        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
+      }
+#endif
       seg_eob = 256;
       bands = vp9_coef_bands_16x16;
       scan = vp9_default_zig_zag1d_16x16;
@@ -177,9 +204,35 @@
         counts = cpi->coef_counts_16x16;
         probs = cpi->common.fc.coef_probs_16x16;
       }
+      if (type == PLANE_TYPE_UV) {
+        int uv_idx = (ib - 16) >> 2;
+        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx;
+      }
       break;
+    case TX_32X32:
+#if CONFIG_CNVCONTEXT
+      a_ec = a[0] + a[1] + a[2] + a[3] +
+             a1[0] + a1[1] + a1[2] + a1[3];
+      l_ec = l[0] + l[1] + l[2] + l[3] +
+             l1[0] + l1[1] + l1[2] + l1[3];
+      a_ec = a_ec != 0;
+      l_ec = l_ec != 0;
+#endif
+      seg_eob = 1024;
+      bands = vp9_coef_bands_32x32;
+      scan = vp9_default_zig_zag1d_32x32;
+      counts = cpi->coef_counts_32x32;
+      probs = cpi->common.fc.coef_probs_32x32;
+      qcoeff_ptr = xd->sb_coeff_data.qcoeff;
+      break;
   }
 
+  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
+#if CONFIG_NEWCOEFCONTEXT
+  neighbors = vp9_get_coef_neighbors_handle(scan);
+  pn = pt;
+#endif
+
   if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))
     seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
 
@@ -190,7 +243,6 @@
     if (c < eob) {
       const int rc = scan[c];
       const int v = qcoeff_ptr[rc];
-
       assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);
 
       t->Extra = vp9_dct_value_tokens_ptr[v].Extra;
@@ -200,19 +252,46 @@
     }
 
     t->Token = token;
-    t->context_tree = probs[type][band][pt];
+    t->context_tree = probs[type][band][PT];
     t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||
                                      (band > 1 && type == PLANE_TYPE_Y_NO_DC));
     assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);
     if (!dry_run) {
-      ++counts[type][band][pt][token];
+      ++counts[type][band][PT][token];
     }
     pt = vp9_prev_token_class[token];
+#if CONFIG_NEWCOEFCONTEXT
+    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(bands[c + 1]))
+      pn = vp9_get_coef_neighbor_context(
+          qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);
+    else
+      pn = pt;
+#endif
     ++t;
   } while (c < eob && ++c < seg_eob);
 
   *tp = t;
-  *a = *l = (c > !type); /* 0 <-> all coeff data is zero */
+  a_ec = l_ec = (c > !type); /* 0 <-> all coeff data is zero */
+  a[0] = a_ec;
+  l[0] = l_ec;
+
+  if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) {
+    a[1] = a_ec;
+    l[1] = l_ec;
+  } else if (tx_size == TX_16X16) {
+    if (type != PLANE_TYPE_UV) {
+      a[1] = a[2] = a[3] = a_ec;
+      l[1] = l[2] = l[3] = l_ec;
+    } else {
+      a1[0] = a1[1] = a[1] = a_ec;
+      l1[0] = l1[1] = l[1] = l_ec;
+    }
+  } else if (tx_size == TX_32X32) {
+    a[1] = a[2] = a[3] = a_ec;
+    l[1] = l[2] = l[3] = l_ec;
+    a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
+    l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
+  }
 }
 
 int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) {
@@ -283,6 +362,68 @@
   return (vp9_mby_is_skippable_16x16(xd) & vp9_mbuv_is_skippable_8x8(xd));
 }
 
+int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {
+  int skip = 1;
+  skip &= !xd->block[0].eob;
+  return skip;
+}
+
+int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {
+  return (!xd->block[16].eob) & (!xd->block[20].eob);
+}
+
+static int sb_is_skippable_32x32(MACROBLOCKD *xd) {
+  return vp9_sby_is_skippable_32x32(xd) &&
+         vp9_sbuv_is_skippable_16x16(xd);
+}
+
+void vp9_tokenize_sb(VP9_COMP *cpi,
+                     MACROBLOCKD *xd,
+                     TOKENEXTRA **t,
+                     int dry_run) {
+  VP9_COMMON * const cm = &cpi->common;
+  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
+  TOKENEXTRA *t_backup = *t;
+  ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->above_context + 1), };
+  ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),
+                            (ENTROPY_CONTEXT *) (xd->left_context + 1), };
+  const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);
+  const int segment_id = mbmi->segment_id;
+  const int skip_inc =  !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||
+                        (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0);
+  int b;
+
+  mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);
+
+  if (mbmi->mb_skip_coeff) {
+    if (!dry_run)
+      cpi->skip_true_count[mb_skip_context] += skip_inc;
+    if (!cm->mb_no_coeff_skip) {
+      vp9_stuff_sb(cpi, xd, t, dry_run);
+    } else {
+      vp9_fix_contexts_sb(xd);
+    }
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    cpi->skip_false_count[mb_skip_context] += skip_inc;
+
+  tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC,
+             TX_32X32, dry_run);
+
+  for (b = 16; b < 24; b += 4) {
+    tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,
+               TX_16X16, dry_run);
+  }
+  A[0][8] = L[0][8] = A[1][8] = L[1][8] = 0;
+  if (dry_run)
+    *t = t_backup;
+}
+
 void vp9_tokenize_mb(VP9_COMP *cpi,
                      MACROBLOCKD *xd,
                      TOKENEXTRA **t,
@@ -293,8 +434,6 @@
   int tx_size = xd->mode_info_context->mbmi.txfm_size;
   int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);
   TOKENEXTRA *t_backup = *t;
-  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *) xd->above_context;
-  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *) xd->left_context;
 
   // If the MB is going to be skipped because of a segment level flag
   // exclude this from the skip count stats used to calculate the
@@ -312,6 +451,7 @@
 
   switch (tx_size) {
     case TX_16X16:
+
       xd->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(xd);
       break;
     case TX_8X8:
@@ -336,8 +476,9 @@
     if (!cpi->common.mb_no_coeff_skip) {
       vp9_stuff_mb(cpi, xd, t, dry_run);
     } else {
-      vp9_fix_contexts(xd);
+      vp9_reset_mb_tokens_context(xd);
     }
+
     if (dry_run)
       *t = t_backup;
     return;
@@ -347,120 +488,106 @@
     cpi->skip_false_count[mb_skip_context] += skip_inc;
 
   if (has_2nd_order) {
-    if (tx_size == TX_8X8) {
-      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-                 A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
-                 TX_8X8, dry_run);
-    } else {
-      tokenize_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-                 A + vp9_block2above[24], L + vp9_block2left[24],
-                 TX_4X4, dry_run);
-    }
-
+    tokenize_b(cpi, xd, 24, t, PLANE_TYPE_Y2, tx_size, dry_run);
     plane_type = PLANE_TYPE_Y_NO_DC;
   } else {
-    xd->above_context->y2 = 1;
-    xd->left_context->y2 = 1;
+    xd->above_context->y2 = 0;
+    xd->left_context->y2 = 0;
     plane_type = PLANE_TYPE_Y_WITH_DC;
   }
 
   if (tx_size == TX_16X16) {
-    tokenize_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC,
-               A, L, TX_16X16, dry_run);
-    A[1] = A[2] = A[3] = A[0];
-    L[1] = L[2] = L[3] = L[0];
-
+    tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
     for (b = 16; b < 24; b += 4) {
-      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                 TX_8X8, dry_run);
-      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+      tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
     }
-    A[8] = 0;
-    L[8] = 0;
   } else if (tx_size == TX_8X8) {
     for (b = 0; b < 16; b += 4) {
-      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
-                 A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                 TX_8X8, dry_run);
-      A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-      L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+      tokenize_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);
     }
     if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
         xd->mode_info_context->mbmi.mode == SPLITMV) {
       for (b = 16; b < 24; b++) {
-        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                   A + vp9_block2above[b], L + vp9_block2left[b],
-                   TX_4X4, dry_run);
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
       }
     } else {
       for (b = 16; b < 24; b += 4) {
-        tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                   A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-                   TX_8X8, dry_run);
-        A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-        L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
       }
     }
   } else {
-    for (b = 0; b < 16; b++) {
-      tokenize_b(cpi, xd, xd->block + b, t, plane_type,
-                 A + vp9_block2above[b], L + vp9_block2left[b],
-                 TX_4X4, dry_run);
+    for (b = 0; b < 24; b++) {
+      if (b >= 16)
+        plane_type = PLANE_TYPE_UV;
+      tokenize_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run);
     }
-
-    for (b = 16; b < 24; b++) {
-      tokenize_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-                 A + vp9_block2above[b], L + vp9_block2left[b],
-                 TX_4X4, dry_run);
-    }
   }
   if (dry_run)
     *t = t_backup;
 }
 
-
 #ifdef ENTROPY_STATS
 void init_context_counters(void) {
   FILE *f = fopen("context.bin", "rb");
   if (!f) {
-    vpx_memset(context_counters, 0, sizeof(context_counters));
+    vpx_memset(context_counters_4x4, 0, sizeof(context_counters_4x4));
+    vpx_memset(hybrid_context_counters_4x4, 0,
+               sizeof(hybrid_context_counters_4x4));
     vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));
+    vpx_memset(hybrid_context_counters_8x8, 0,
+               sizeof(hybrid_context_counters_8x8));
     vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));
+    vpx_memset(hybrid_context_counters_16x16, 0,
+               sizeof(hybrid_context_counters_16x16));
+    vpx_memset(context_counters_32x32, 0, sizeof(context_counters_32x32));
   } else {
-    fread(context_counters, sizeof(context_counters), 1, f);
+    fread(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
+    fread(hybrid_context_counters_4x4,
+          sizeof(hybrid_context_counters_4x4), 1, f);
     fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+    fread(hybrid_context_counters_8x8,
+          sizeof(hybrid_context_counters_8x8), 1, f);
     fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+    fread(hybrid_context_counters_16x16,
+          sizeof(hybrid_context_counters_16x16), 1, f);
+    fread(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
     fclose(f);
   }
 
   f = fopen("treeupdate.bin", "rb");
   if (!f) {
-    vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
+    vpx_memset(tree_update_hist_4x4, 0, sizeof(tree_update_hist_4x4));
+    vpx_memset(hybrid_tree_update_hist_4x4, 0,
+               sizeof(hybrid_tree_update_hist_4x4));
     vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));
+    vpx_memset(hybrid_tree_update_hist_8x8, 0,
+               sizeof(hybrid_tree_update_hist_8x8));
     vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));
+    vpx_memset(hybrid_tree_update_hist_16x16, 0,
+               sizeof(hybrid_tree_update_hist_16x16));
+    vpx_memset(tree_update_hist_32x32, 0, sizeof(tree_update_hist_32x32));
   } else {
-    fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
+    fread(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);
+    fread(hybrid_tree_update_hist_4x4,
+          sizeof(hybrid_tree_update_hist_4x4), 1, f);
     fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);
+    fread(hybrid_tree_update_hist_8x8,
+          sizeof(hybrid_tree_update_hist_8x8), 1, f);
     fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);
+    fread(hybrid_tree_update_hist_16x16,
+          sizeof(hybrid_tree_update_hist_16x16), 1, f);
+    fread(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);
     fclose(f);
   }
 }
 
-void print_context_counters() {
+static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
+                          int block_types, const char *header) {
   int type, band, pt, t;
-  FILE *f = fopen("vp9_context.c", "w");
 
-  fprintf(f, "#include \"vp9_entropy.h\"\n");
-  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-  fprintf(f, "static const unsigned int\n"
-          "vp9_default_coef_counts[BLOCK_TYPES]\n"
-          "                      [COEF_BANDS]\n"
-          "                      [PREV_COEF_CONTEXTS]\n"
-          "                      [MAX_ENTROPY_TOKENS]={\n");
+  fprintf(f, "static const vp9_coeff_count %s = {\n", header);
 
-# define Comma( X) (X? ",":"")
+#define Comma(X) (X ? "," : "")
   type = 0;
   do {
     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
@@ -473,192 +600,114 @@
 
         t = 0;
         do {
-          const INT64 x = context_counters [type] [band] [pt] [t];
+          const int64_t x = context_counters[type][band][pt][t];
           const int y = (int) x;
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
-        } while (++t < MAX_ENTROPY_TOKENS);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES);
-  fprintf(f, "\n};\n");
 
-  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_8x8"
-          "[BLOCK_TYPES_8X8] [COEF_BANDS]"
-          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-        t = 0;
-        do {
-          const INT64 x = context_counters_8x8 [type] [band] [pt] [t];
-          const int y = (int) x;
-
-          assert(x == (INT64) y);  /* no overflow handling yet */
+          assert(x == (int64_t) y);  /* no overflow handling yet */
           fprintf(f, "%s %d", Comma(t), y);
-
         } while (++t < MAX_ENTROPY_TOKENS);
-
         fprintf(f, "}");
       } while (++pt < PREV_COEF_CONTEXTS);
-
       fprintf(f, "\n    }");
-
     } while (++band < COEF_BANDS);
-
     fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_8X8);
+  } while (++type < block_types);
   fprintf(f, "\n};\n");
+}
 
-  fprintf(f, "static const unsigned int\nvp9_default_coef_counts_16x16"
-          "[BLOCK_TYPES_16X16] [COEF_BANDS]"
-          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        fprintf(f, "%s\n      {", Comma(pt));
-        t = 0;
-        do {
-          const INT64 x = context_counters_16x16 [type] [band] [pt] [t];
-          const int y = (int) x;
+static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
+                        int block_types, const char *header) {
+  int type, band, pt, t;
 
-          assert(x == (INT64) y);  /* no overflow handling yet */
-          fprintf(f, "%s %d", Comma(t), y);
+  fprintf(f, "static const vp9_coeff_probs %s = {", header);
 
-        } while (++t < MAX_ENTROPY_TOKENS);
-
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-
-      fprintf(f, "\n    }");
-
-    } while (++band < COEF_BANDS);
-
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_16X16);
-  fprintf(f, "\n};\n");
-
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
   type = 0;
+#define Newline(x, spaces) (x ? " " : "\n" spaces)
   do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+    fprintf(f, "%s%s{ /* block Type %d */",
+            Comma(type), Newline(type, "  "), type);
     band = 0;
     do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+      fprintf(f, "%s%s{ /* Coeff Band %d */",
+              Comma(band), Newline(band, "    "), band);
       pt = 0;
       do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
+        unsigned int branch_ct[ENTROPY_NODES][2];
         unsigned int coef_counts[MAX_ENTROPY_TOKENS];
         vp9_prob coef_probs[ENTROPY_NODES];
+
         for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters [type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
+          coef_counts[t] = context_counters[type][band][pt][t];
+        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,
+                                         vp9_coef_encodings, vp9_coef_tree,
+                                         coef_probs, branch_ct, coef_counts);
         fprintf(f, "%s\n      {", Comma(pt));
 
         t = 0;
         do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-
+          fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
         } while (++t < ENTROPY_NODES);
 
-        fprintf(f, "}");
+        fprintf(f, " }");
       } while (++pt < PREV_COEF_CONTEXTS);
       fprintf(f, "\n    }");
     } while (++band < COEF_BANDS);
     fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES);
+  } while (++type < block_types);
   fprintf(f, "\n};\n");
+}
 
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters_8x8[type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
+void print_context_counters() {
+  FILE *f = fopen("vp9_context.c", "w");
 
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-        } while (++t < ENTROPY_NODES);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_8X8);
-  fprintf(f, "\n};\n");
+  fprintf(f, "#include \"vp9_entropy.h\"\n");
+  fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
 
-  fprintf(f, "static const vp9_prob\n"
-          "vp9_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"
-          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");
-  type = 0;
-  do {
-    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
-    band = 0;
-    do {
-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
-      pt = 0;
-      do {
-        unsigned int branch_ct [ENTROPY_NODES] [2];
-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];
-        vp9_prob coef_probs[ENTROPY_NODES];
-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
-          coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];
-        vp9_tree_probs_from_distribution(
-          MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree,
-          coef_probs, branch_ct, coef_counts, 256, 1);
-        fprintf(f, "%s\n      {", Comma(pt));
+  /* print counts */
+  print_counter(f, context_counters_4x4, BLOCK_TYPES_4X4,
+                "vp9_default_coef_counts_4x4[BLOCK_TYPES_4X4]");
+  print_counter(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4,
+                "vp9_default_hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]");
+  print_counter(f, context_counters_8x8, BLOCK_TYPES_8X8,
+                "vp9_default_coef_counts_8x8[BLOCK_TYPES_8X8]");
+  print_counter(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8,
+                "vp9_default_hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]");
+  print_counter(f, context_counters_16x16, BLOCK_TYPES_16X16,
+                "vp9_default_coef_counts_16x16[BLOCK_TYPES_16X16]");
+  print_counter(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16,
+                "vp9_default_hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]");
+  print_counter(f, context_counters_32x32, BLOCK_TYPES_32X32,
+                "vp9_default_coef_counts_32x32[BLOCK_TYPES_32X32]");
 
-        t = 0;
-        do {
-          fprintf(f, "%s %d", Comma(t), coef_probs[t]);
-        } while (++t < ENTROPY_NODES);
-        fprintf(f, "}");
-      } while (++pt < PREV_COEF_CONTEXTS);
-      fprintf(f, "\n    }");
-    } while (++band < COEF_BANDS);
-    fprintf(f, "\n  }");
-  } while (++type < BLOCK_TYPES_16X16);
-  fprintf(f, "\n};\n");
+  /* print coefficient probabilities */
+  print_probs(f, context_counters_4x4, BLOCK_TYPES_4X4,
+              "default_coef_probs_4x4[BLOCK_TYPES_4X4]");
+  print_probs(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4,
+              "default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]");
+  print_probs(f, context_counters_8x8, BLOCK_TYPES_8X8,
+              "default_coef_probs_8x8[BLOCK_TYPES_8X8]");
+  print_probs(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8,
+              "default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]");
+  print_probs(f, context_counters_16x16, BLOCK_TYPES_16X16,
+              "default_coef_probs_16x16[BLOCK_TYPES_16X16]");
+  print_probs(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16,
+              "default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]");
+  print_probs(f, context_counters_32x32, BLOCK_TYPES_32X32,
+              "default_coef_probs_32x32[BLOCK_TYPES_32X32]");
 
   fclose(f);
 
   f = fopen("context.bin", "wb");
-  fwrite(context_counters, sizeof(context_counters), 1, f);
+  fwrite(context_counters_4x4, sizeof(context_counters_4x4), 1, f);
+  fwrite(hybrid_context_counters_4x4,
+         sizeof(hybrid_context_counters_4x4), 1, f);
   fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);
+  fwrite(hybrid_context_counters_8x8,
+         sizeof(hybrid_context_counters_8x8), 1, f);
   fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);
+  fwrite(hybrid_context_counters_16x16,
+         sizeof(hybrid_context_counters_16x16), 1, f);
+  fwrite(context_counters_32x32, sizeof(context_counters_32x32), 1, f);
   fclose(f);
 }
 #endif
@@ -669,35 +718,48 @@
 
 static __inline void stuff_b(VP9_COMP *cpi,
                              MACROBLOCKD *xd,
-                             const BLOCKD * const b,
+                             const int ib,
                              TOKENEXTRA **tp,
                              PLANE_TYPE type,
-                             ENTROPY_CONTEXT *a,
-                             ENTROPY_CONTEXT *l,
                              TX_SIZE tx_size,
                              int dry_run) {
+  const BLOCKD * const b = xd->block + ib;
   const int *bands;
-  unsigned int (*counts)[COEF_BANDS][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-  vp9_prob (*probs)[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
+  vp9_coeff_count *counts;
+  vp9_coeff_probs *probs;
   int pt, band;
   TOKENEXTRA *t = *tp;
   const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?
                           get_tx_type(xd, b) : DCT_DCT;
-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +
+      vp9_block2above[tx_size][ib];
+  ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context +
+      vp9_block2left[tx_size][ib];
+  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;
+  ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) +
+      vp9_block2above[tx_size][ib];
+  ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) +
+      vp9_block2left[tx_size][ib];
 
   switch (tx_size) {
     default:
     case TX_4X4:
-      bands = vp9_coef_bands;
+      bands = vp9_coef_bands_4x4;
       if (tx_type != DCT_DCT) {
-        counts = cpi->hybrid_coef_counts;
-        probs = cpi->common.fc.hybrid_coef_probs;
+        counts = cpi->hybrid_coef_counts_4x4;
+        probs = cpi->common.fc.hybrid_coef_probs_4x4;
       } else {
-        counts = cpi->coef_counts;
-        probs = cpi->common.fc.coef_probs;
+        counts = cpi->coef_counts_4x4;
+        probs = cpi->common.fc.coef_probs_4x4;
       }
       break;
     case TX_8X8:
+#if CONFIG_CNVCONTEXT
+      if (type != PLANE_TYPE_Y2) {
+        a_ec = (a[0] + a[1]) != 0;
+        l_ec = (l[0] + l[1]) != 0;
+      }
+#endif
       bands = vp9_coef_bands_8x8;
       if (tx_type != DCT_DCT) {
         counts = cpi->hybrid_coef_counts_8x8;
@@ -708,6 +770,15 @@
       }
       break;
     case TX_16X16:
+#if CONFIG_CNVCONTEXT
+      if (type != PLANE_TYPE_UV) {
+        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;
+        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;
+      } else {
+        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;
+        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;
+      }
+#endif
       bands = vp9_coef_bands_16x16;
       if (tx_type != DCT_DCT) {
         counts = cpi->hybrid_coef_counts_16x16;
@@ -717,7 +788,23 @@
         probs = cpi->common.fc.coef_probs_16x16;
       }
       break;
+    case TX_32X32:
+#if CONFIG_CNVCONTEXT
+      a_ec = a[0] + a[1] + a[2] + a[3] +
+             a1[0] + a1[1] + a1[2] + a1[3];
+      l_ec = l[0] + l[1] + l[2] + l[3] +
+             l1[0] + l1[1] + l1[2] + l1[3];
+      a_ec = a_ec != 0;
+      l_ec = l_ec != 0;
+#endif
+      bands = vp9_coef_bands_32x32;
+      counts = cpi->coef_counts_32x32;
+      probs = cpi->common.fc.coef_probs_32x32;
+      break;
   }
+
+  VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);
+
   band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];
   t->Token = DCT_EOB_TOKEN;
   t->context_tree = probs[type][band][pt];
@@ -725,6 +812,24 @@
   ++t;
   *tp = t;
   *a = *l = 0;
+  if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) {
+    a[1] = 0;
+    l[1] = 0;
+  } else if (tx_size == TX_16X16) {
+    if (type != PLANE_TYPE_UV) {
+      a[1] = a[2] = a[3] = 0;
+      l[1] = l[2] = l[3] = 0;
+    } else {
+      a1[0] = a1[1] = a[1] = a_ec;
+      l1[0] = l1[1] = l[1] = l_ec;
+    }
+  } else if (tx_size == TX_32X32) {
+    a[1] = a[2] = a[3] = a_ec;
+    l[1] = l[2] = l[3] = l_ec;
+    a1[0] = a1[1] = a1[2] = a1[3] = a_ec;
+    l1[0] = l1[1] = l1[2] = l1[3] = l_ec;
+  }
+
   if (!dry_run) {
     ++counts[type][band][pt][DCT_EOB_TOKEN];
   }
@@ -732,119 +837,86 @@
 
 static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,
                          TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
   PLANE_TYPE plane_type;
   int b;
   int has_2nd_order = get_2nd_order_usage(xd);
 
   if (has_2nd_order) {
-    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-            A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
-            TX_8X8, dry_run);
+    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run);
     plane_type = PLANE_TYPE_Y_NO_DC;
   } else {
-    xd->above_context->y2 = 1;
-    xd->left_context->y2 = 1;
+#if CONFIG_CNVCONTEXT
+    xd->above_context->y2 = 0;
+    xd->left_context->y2 = 0;
+#endif
     plane_type = PLANE_TYPE_Y_WITH_DC;
   }
 
-  for (b = 0; b < 16; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above_8x8[b],
-            L + vp9_block2left_8x8[b], TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+  for (b = 0; b < 24; b += 4) {
+    if (b >= 16)
+      plane_type = PLANE_TYPE_UV;
+    stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);
   }
-
-  for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
-  }
 }
 
 static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,
                            TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)xd->left_context;
   int b;
+  stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);
 
-  stuff_b(cpi, xd, xd->block, t, PLANE_TYPE_Y_WITH_DC, A, L, TX_16X16, dry_run);
-  A[1] = A[2] = A[3] = A[0];
-  L[1] = L[2] = L[3] = L[0];
   for (b = 16; b < 24; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
-            L + vp9_block2above_8x8[b], TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);
   }
-  vpx_memset(&A[8], 0, sizeof(A[8]));
-  vpx_memset(&L[8], 0, sizeof(L[8]));
+#if CONFIG_CNVCONTEXT
+  xd->above_context->y2 = 0;
+  xd->left_context->y2 = 0;
+#endif
 }
 
 static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,
                          TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
   int b;
   PLANE_TYPE plane_type;
-  int has_2nd_order = (xd->mode_info_context->mbmi.mode != B_PRED &&
-                      xd->mode_info_context->mbmi.mode != I8X8_PRED &&
-                      xd->mode_info_context->mbmi.mode != SPLITMV);
-  if (has_2nd_order && get_tx_type(xd, &xd->block[0]) != DCT_DCT)
-    has_2nd_order = 0;
+  int has_2nd_order = get_2nd_order_usage(xd);
 
   if (has_2nd_order) {
-    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2, A + vp9_block2above[24],
-            L + vp9_block2left[24], TX_4X4, dry_run);
+    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_4X4, dry_run);
     plane_type = PLANE_TYPE_Y_NO_DC;
   } else {
-    xd->above_context->y2 = 1;
-    xd->left_context->y2 = 1;
+    xd->above_context->y2 = 0;
+    xd->left_context->y2 = 0;
     plane_type = PLANE_TYPE_Y_WITH_DC;
   }
 
-  for (b = 0; b < 16; b++)
-    stuff_b(cpi, xd, xd->block + b, t, plane_type, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
+  for (b = 0; b < 24; b++) {
+    if (b >= 16)
+      plane_type = PLANE_TYPE_UV;
+    stuff_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run);
+  }
 }
 
 static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,
                                TOKENEXTRA **t, int dry_run) {
-  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)xd->above_context;
-  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)xd->left_context;
   PLANE_TYPE plane_type;
   int b;
 
   int has_2nd_order = get_2nd_order_usage(xd);
   if (has_2nd_order) {
-    stuff_b(cpi, xd, xd->block + 24, t, PLANE_TYPE_Y2,
-            A + vp9_block2above_8x8[24], L + vp9_block2left_8x8[24],
-            TX_8X8, dry_run);
+    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run);
     plane_type = PLANE_TYPE_Y_NO_DC;
   } else {
+    xd->above_context->y2 = 0;
+    xd->left_context->y2 = 0;
     plane_type = PLANE_TYPE_Y_WITH_DC;
   }
 
   for (b = 0; b < 16; b += 4) {
-    stuff_b(cpi, xd, xd->block + b, t, plane_type,
-            A + vp9_block2above_8x8[b], L + vp9_block2left_8x8[b],
-            TX_8X8, dry_run);
-    A[vp9_block2above_8x8[b] + 1] = A[vp9_block2above_8x8[b]];
-    L[vp9_block2left_8x8[b] + 1]  = L[vp9_block2left_8x8[b]];
+    stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);
   }
 
-  for (b = 16; b < 24; b++)
-    stuff_b(cpi, xd, xd->block + b, t, PLANE_TYPE_UV, A + vp9_block2above[b],
-            L + vp9_block2left[b], TX_4X4, dry_run);
-  xd->above_context->y2 = 1;
-  xd->left_context->y2 = 1;
+  for (b = 16; b < 24; b++) {
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);
+  }
 }
 
 void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
@@ -869,19 +941,27 @@
   }
 }
 
-void vp9_fix_contexts(MACROBLOCKD *xd) {
-  /* Clear entropy contexts for blocks */
-  if ((xd->mode_info_context->mbmi.mode != B_PRED
-       && xd->mode_info_context->mbmi.mode != I8X8_PRED
-       && xd->mode_info_context->mbmi.mode != SPLITMV)
-      || xd->mode_info_context->mbmi.txfm_size == TX_16X16
-      ) {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
-  } else {
-    vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) - 1);
-    xd->above_context->y2 = 1;
-    xd->left_context->y2 = 1;
+static void stuff_sb_32x32(VP9_COMP *cpi, MACROBLOCKD *xd,
+                               TOKENEXTRA **t, int dry_run) {
+  int b;
+
+  stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);
+  for (b = 16; b < 24; b += 4) {
+    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);
   }
+}
+
+void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {
+  TOKENEXTRA * const t_backup = *t;
+
+  stuff_sb_32x32(cpi, xd, t, dry_run);
+
+  if (dry_run) {
+    *t = t_backup;
+  }
+}
+
+void vp9_fix_contexts_sb(MACROBLOCKD *xd) {
+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);
 }
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_TOKENIZE_H_
 #define VP9_ENCODER_VP9_TOKENIZE_H_
 
@@ -18,43 +17,53 @@
 void vp9_tokenize_initialize();
 
 typedef struct {
-  short Token;
-  short Extra;
+  int16_t Token;
+  int16_t Extra;
 } TOKENVALUE;
 
 typedef struct {
   const vp9_prob *context_tree;
-  short           Extra;
-  unsigned char   Token;
-  unsigned char   skip_eob_node;
+  int16_t         Extra;
+  uint8_t         Token;
+  uint8_t         skip_eob_node;
 } TOKENEXTRA;
 
+typedef int64_t vp9_coeff_accum[COEF_BANDS][PREV_COEF_CONTEXTS]
+                               [MAX_ENTROPY_TOKENS];
+
 extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);
 extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);
 extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);
 extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);
 extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);
+extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);
+extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);
 
 struct VP9_COMP;
 
 extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                             TOKENEXTRA **t, int dry_run);
+extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                            TOKENEXTRA **t, int dry_run);
 
 extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
                          TOKENEXTRA **t, int dry_run);
+extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,
+                         TOKENEXTRA **t, int dry_run);
 
-extern void vp9_fix_contexts(MACROBLOCKD *xd);
-
+extern void vp9_fix_contexts_sb(MACROBLOCKD *xd);
 #ifdef ENTROPY_STATS
 void init_context_counters();
 void print_context_counters();
 
-extern INT64 context_counters[BLOCK_TYPES][COEF_BANDS]
-                             [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_8x8[BLOCK_TYPES_8X8][COEF_BANDS]
-                                 [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
-extern INT64 context_counters_16x16[BLOCK_TYPES_16X16][COEF_BANDS]
-                                   [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4];
+extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8];
+extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16];
+extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];
+
+extern vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4];
+extern vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8];
+extern vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16];
 #endif
 
 extern const int *vp9_dct_value_cost_ptr;
@@ -64,4 +73,4 @@
  */
 extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
 
-#endif  /* tokenize_h */
+#endif  // VP9_ENCODER_VP9_TOKENIZE_H_
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h
@@ -105,4 +105,4 @@
 
 void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t);
 
-#endif
+#endif  // VP9_ENCODER_VP9_TREEWRITER_H_
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -8,54 +8,55 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_VARIANCE_H_
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
-typedef unsigned int(*vp9_sad_fn_t)(const unsigned char *src_ptr,
+#include "vpx/vpx_integer.h"
+
+typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
-                                    const unsigned char *ref_ptr,
+                                    const uint8_t *ref_ptr,
                                     int ref_stride,
                                     unsigned int max_sad);
 
-typedef void (*vp9_copy32xn_fn_t)(const unsigned char *src_ptr,
+typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr,
                                   int source_stride,
-                                  const unsigned char *ref_ptr,
+                                  const uint8_t *ref_ptr,
                                   int ref_stride,
                                   int n);
 
-typedef void (*vp9_sad_multi_fn_t)(const unsigned char *src_ptr,
+typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
                                    int source_stride,
-                                   const unsigned char *ref_ptr,
+                                   const uint8_t *ref_ptr,
                                    int  ref_stride,
                                    unsigned int *sad_array);
 
-typedef void (*vp9_sad_multi1_fn_t)(const unsigned char *src_ptr,
+typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
                                     int source_stride,
-                                    const unsigned char *ref_ptr,
+                                    const uint8_t *ref_ptr,
                                     int  ref_stride,
                                     unsigned short *sad_array);
 
-typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
+typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
                                      int source_stride,
-                                     const unsigned char ** ref_ptr,
+                                     const uint8_t ** ref_ptr,
                                      int  ref_stride, unsigned int *sad_array);
 
-typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
+typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
                                           int source_stride,
-                                          const unsigned char *ref_ptr,
+                                          const uint8_t *ref_ptr,
                                           int ref_stride,
                                           unsigned int *sse);
 
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const unsigned char  *src_ptr,
+typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
                                                 int source_stride,
                                                 int xoffset,
                                                 int yoffset,
-                                                const unsigned char *ref_ptr,
+                                                const uint8_t *ref_ptr,
                                                 int Refstride,
                                                 unsigned int *sse);
 
-typedef void (*vp9_ssimpf_fn_t)(unsigned char *s, int sp, unsigned char *r,
+typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
                                 int rp, unsigned long *sum_s,
                                 unsigned long *sum_r, unsigned long *sum_sq_s,
                                 unsigned long *sum_sq_r,
@@ -63,9 +64,9 @@
 
 typedef unsigned int (*vp9_getmbss_fn_t)(const short *);
 
-typedef unsigned int (*vp9_get16x16prederror_fn_t)(const unsigned char *src_ptr,
+typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
                                                    int source_stride,
-                                                   const unsigned char *ref_ptr,
+                                                   const uint8_t *ref_ptr,
                                                    int  ref_stride);
 
 typedef struct variance_vtable {
@@ -81,4 +82,4 @@
     vp9_copy32xn_fn_t       copymem;
 } vp9_variance_fn_ptr_t;
 
-#endif
+#endif  // VP9_ENCODER_VP9_VARIANCE_H_
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -14,7 +14,7 @@
 #include "vp9/common/vp9_subpelvar.h"
 #include "vpx/vpx_integer.h"
 
-unsigned int vp9_get_mb_ss_c(const short *src_ptr) {
+unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   unsigned int i, sum = 0;
 
   for (i = 0; i < 256; i++) {
@@ -24,12 +24,22 @@
   return sum;
 }
 
+unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
+                                 int  source_stride,
+                                 const uint8_t *ref_ptr,
+                                 int  recon_stride,
+                                 unsigned int *sse) {
+  unsigned int var;
+  int avg;
 
+  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, &var, &avg);
+  *sse = var;
+  return (var - (((int64_t)avg * avg) >> 12));
+}
 
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance32x32_c(const unsigned char *src_ptr,
+unsigned int vp9_variance32x32_c(const uint8_t *src_ptr,
                                  int  source_stride,
-                                 const unsigned char *ref_ptr,
+                                 const uint8_t *ref_ptr,
                                  int  recon_stride,
                                  unsigned int *sse) {
   unsigned int var;
@@ -39,11 +49,10 @@
   *sse = var;
   return (var - (((int64_t)avg * avg) >> 10));
 }
-#endif
 
-unsigned int vp9_variance16x16_c(const unsigned char *src_ptr,
+unsigned int vp9_variance16x16_c(const uint8_t *src_ptr,
                                  int  source_stride,
-                                 const unsigned char *ref_ptr,
+                                 const uint8_t *ref_ptr,
                                  int  recon_stride,
                                  unsigned int *sse) {
   unsigned int var;
@@ -54,9 +63,9 @@
   return (var - (((unsigned int)avg * avg) >> 8));
 }
 
-unsigned int vp9_variance8x16_c(const unsigned char *src_ptr,
+unsigned int vp9_variance8x16_c(const uint8_t *src_ptr,
                                 int  source_stride,
-                                const unsigned char *ref_ptr,
+                                const uint8_t *ref_ptr,
                                 int  recon_stride,
                                 unsigned int *sse) {
   unsigned int var;
@@ -67,9 +76,9 @@
   return (var - (((unsigned int)avg * avg) >> 7));
 }
 
-unsigned int vp9_variance16x8_c(const unsigned char *src_ptr,
+unsigned int vp9_variance16x8_c(const uint8_t *src_ptr,
                                 int  source_stride,
-                                const unsigned char *ref_ptr,
+                                const uint8_t *ref_ptr,
                                 int  recon_stride,
                                 unsigned int *sse) {
   unsigned int var;
@@ -81,9 +90,9 @@
 }
 
 
-unsigned int vp9_variance8x8_c(const unsigned char *src_ptr,
+unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
                                int  source_stride,
-                               const unsigned char *ref_ptr,
+                               const uint8_t *ref_ptr,
                                int  recon_stride,
                                unsigned int *sse) {
   unsigned int var;
@@ -94,9 +103,9 @@
   return (var - (((unsigned int)avg * avg) >> 6));
 }
 
-unsigned int vp9_variance4x4_c(const unsigned char *src_ptr,
+unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
                                int  source_stride,
-                               const unsigned char *ref_ptr,
+                               const uint8_t *ref_ptr,
                                int  recon_stride,
                                unsigned int *sse) {
   unsigned int var;
@@ -108,9 +117,9 @@
 }
 
 
-unsigned int vp9_mse16x16_c(const unsigned char *src_ptr,
+unsigned int vp9_mse16x16_c(const uint8_t *src_ptr,
                             int  source_stride,
-                            const unsigned char *ref_ptr,
+                            const uint8_t *ref_ptr,
                             int  recon_stride,
                             unsigned int *sse) {
   unsigned int var;
@@ -122,16 +131,16 @@
 }
 
 
-unsigned int vp9_sub_pixel_variance4x4_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
                                          int  xoffset,
                                          int  yoffset,
-                                         const unsigned char *dst_ptr,
+                                         const uint8_t *dst_ptr,
                                          int dst_pixels_per_line,
                                          unsigned int *sse) {
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
-  unsigned short FData3[5 * 4]; // Temp data bufffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *HFilter, *VFilter;
+  uint16_t FData3[5 * 4];  // Temp data buffer used in filtering
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -146,16 +155,16 @@
 }
 
 
-unsigned int vp9_sub_pixel_variance8x8_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
                                          int  src_pixels_per_line,
                                          int  xoffset,
                                          int  yoffset,
-                                         const unsigned char *dst_ptr,
+                                         const uint8_t *dst_ptr,
                                          int dst_pixels_per_line,
                                          unsigned int *sse) {
-  unsigned short FData3[9 * 8]; // Temp data bufffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
+  uint16_t FData3[9 * 8];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *HFilter, *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -166,16 +175,16 @@
   return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
 
-unsigned int vp9_sub_pixel_variance16x16_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
                                            int  yoffset,
-                                           const unsigned char *dst_ptr,
+                                           const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  unsigned short FData3[17 * 16]; // Temp data bufffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
+  uint16_t FData3[17 * 16];  // Temp data buffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *HFilter, *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -186,31 +195,50 @@
   return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_variance32x32_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
                                            int  src_pixels_per_line,
                                            int  xoffset,
                                            int  yoffset,
-                                           const unsigned char *dst_ptr,
+                                           const uint8_t *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
-  unsigned short FData3[33 * 32]; // Temp data bufffer used in filtering
-  unsigned char  temp2[36 * 32];
-  const short *HFilter, *VFilter;
+  uint16_t FData3[65 * 64];  // Temp data buffer used in filtering
+  uint8_t temp2[68 * 64];
+  const int16_t *HFilter, *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
 
+  var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,
+                                    1, 65, 64, HFilter);
+  var_filter_block2d_bil_second_pass(FData3, temp2, 64, 64, 64, 64, VFilter);
+
+  return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
+                                           int  src_pixels_per_line,
+                                           int  xoffset,
+                                           int  yoffset,
+                                           const uint8_t *dst_ptr,
+                                           int dst_pixels_per_line,
+                                           unsigned int *sse) {
+  uint16_t FData3[33 * 32];  // Temp data buffer used in filtering
+  uint8_t temp2[36 * 32];
+  const int16_t *HFilter, *VFilter;
+
+  HFilter = vp9_bilinear_filters[xoffset];
+  VFilter = vp9_bilinear_filters[yoffset];
+
   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);
   var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);
 
   return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
 }
-#endif
 
-unsigned int vp9_variance_halfpixvar16x16_h_c(const unsigned char *src_ptr,
+unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
-                                              const unsigned char *ref_ptr,
+                                              const uint8_t *ref_ptr,
                                               int  recon_stride,
                                               unsigned int *sse) {
   return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
@@ -217,21 +245,27 @@
                                        ref_ptr, recon_stride, sse);
 }
 
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_h_c(const unsigned char *src_ptr,
+unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr,
                                               int  source_stride,
-                                              const unsigned char *ref_ptr,
+                                              const uint8_t *ref_ptr,
                                               int  recon_stride,
                                               unsigned int *sse) {
   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
                                        ref_ptr, recon_stride, sse);
 }
-#endif
 
+unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr,
+                                              int  source_stride,
+                                              const uint8_t *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0,
+                                       ref_ptr, recon_stride, sse);
+}
 
-unsigned int vp9_variance_halfpixvar16x16_v_c(const unsigned char *src_ptr,
+unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr,
                                               int  source_stride,
-                                              const unsigned char *ref_ptr,
+                                              const uint8_t *ref_ptr,
                                               int  recon_stride,
                                               unsigned int *sse) {
   return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
@@ -238,20 +272,27 @@
                                        ref_ptr, recon_stride, sse);
 }
 
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_v_c(const unsigned char *src_ptr,
+unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr,
                                               int  source_stride,
-                                              const unsigned char *ref_ptr,
+                                              const uint8_t *ref_ptr,
                                               int  recon_stride,
                                               unsigned int *sse) {
   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
                                        ref_ptr, recon_stride, sse);
 }
-#endif
 
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const unsigned char *src_ptr,
+unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr,
+                                              int  source_stride,
+                                              const uint8_t *ref_ptr,
+                                              int  recon_stride,
+                                              unsigned int *sse) {
+  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr,
                                                int  source_stride,
-                                               const unsigned char *ref_ptr,
+                                               const uint8_t *ref_ptr,
                                                int  recon_stride,
                                                unsigned int *sse) {
   return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
@@ -258,22 +299,29 @@
                                        ref_ptr, recon_stride, sse);
 }
 
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const unsigned char *src_ptr,
+unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr,
                                                int  source_stride,
-                                               const unsigned char *ref_ptr,
+                                               const uint8_t *ref_ptr,
                                                int  recon_stride,
                                                unsigned int *sse) {
   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
                                        ref_ptr, recon_stride, sse);
 }
-#endif
 
-unsigned int vp9_sub_pixel_mse16x16_c(const unsigned char  *src_ptr,
+unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr,
+                                               int  source_stride,
+                                               const uint8_t *ref_ptr,
+                                               int  recon_stride,
+                                               unsigned int *sse) {
+  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8,
+                                       ref_ptr, recon_stride, sse);
+}
+
+unsigned int vp9_sub_pixel_mse16x16_c(const uint8_t *src_ptr,
                                       int  src_pixels_per_line,
                                       int  xoffset,
                                       int  yoffset,
-                                      const unsigned char *dst_ptr,
+                                      const uint8_t *dst_ptr,
                                       int dst_pixels_per_line,
                                       unsigned int *sse) {
   vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
@@ -282,12 +330,11 @@
   return *sse;
 }
 
-#if CONFIG_SUPERBLOCKS
-unsigned int vp9_sub_pixel_mse32x32_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr,
                                       int  src_pixels_per_line,
                                       int  xoffset,
                                       int  yoffset,
-                                      const unsigned char *dst_ptr,
+                                      const uint8_t *dst_ptr,
                                       int dst_pixels_per_line,
                                       unsigned int *sse) {
   vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
@@ -295,18 +342,30 @@
                                 dst_pixels_per_line, sse);
   return *sse;
 }
-#endif
 
-unsigned int vp9_sub_pixel_variance16x8_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr,
+                                      int  src_pixels_per_line,
+                                      int  xoffset,
+                                      int  yoffset,
+                                      const uint8_t *dst_ptr,
+                                      int dst_pixels_per_line,
+                                      unsigned int *sse) {
+  vp9_sub_pixel_variance64x64_c(src_ptr, src_pixels_per_line,
+                                xoffset, yoffset, dst_ptr,
+                                dst_pixels_per_line, sse);
+  return *sse;
+}
+
+unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
                                           int  yoffset,
-                                          const unsigned char *dst_ptr,
+                                          const uint8_t *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse) {
-  unsigned short FData3[16 * 9];  // Temp data bufffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
+  uint16_t FData3[16 * 9];  // Temp data bufffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *HFilter, *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
@@ -317,16 +376,16 @@
   return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
-unsigned int vp9_sub_pixel_variance8x16_c(const unsigned char  *src_ptr,
+unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
                                           int  src_pixels_per_line,
                                           int  xoffset,
                                           int  yoffset,
-                                          const unsigned char *dst_ptr,
+                                          const uint8_t *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse) {
-  unsigned short FData3[9 * 16];  // Temp data bufffer used in filtering
-  unsigned char  temp2[20 * 16];
-  const short *HFilter, *VFilter;
+  uint16_t FData3[9 * 16];  // Temp data bufffer used in filtering
+  uint8_t temp2[20 * 16];
+  const int16_t *HFilter, *VFilter;
 
   HFilter = vp9_bilinear_filters[xoffset];
   VFilter = vp9_bilinear_filters[yoffset];
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -400,286 +400,6 @@
     pop         rbp
     ret
 
-;void vp9_filter_block2d_bil_var_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared;;
-;
-;)
-global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp9_filter_block2d_bil_var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-        pxor            xmm6,           xmm6                 ;
-        pxor            xmm7,           xmm7                 ;
-
-        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
-        movdqa          xmm4,           XMMWORD PTR [rsi]
-
-        lea             rcx,            [GLOBAL(bilinear_filters_sse2)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_sse2_sp_only
-
-        shl             rax,            5                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_sse2_fp_only
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        movq            xmm3,           QWORD PTR [rsi+1]    ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]                ;
-        punpcklbw       xmm3,           xmm0
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift     ;
-        movdqa          xmm5,           xmm1
-
-        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
-        lea             rsi,            [rsi + rbx]
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        movq            xmm3,           QWORD PTR [rsi+1]             ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4               ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movdqa          xmm3,           xmm5                 ;
-        movdqa          xmm5,           xmm1                 ;
-
-        pmullw          xmm3,           [rdx]               ;
-        pmullw          xmm1,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_sse2_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
-        je              filter_block2d_bil_var_sse2_full_pixel
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        lea             rsi,            [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
-        movq            xmm3,           QWORD PTR [rsi]             ;
-        punpcklbw       xmm3,           xmm0                 ;
-        movdqa          xmm5,           xmm3
-
-        pmullw          xmm1,           [rdx]               ;
-        pmullw          xmm3,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        movdqa          xmm1,           xmm5                 ;
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_sp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0                 ;
-
-filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movq            xmm2,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm2,           xmm0                 ;
-
-        psubw           xmm1,           xmm2                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_full_pixel_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
-        movq            xmm1,           QWORD PTR [rsi]       ;
-        movq            xmm3,           QWORD PTR [rsi+1]     ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]     ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-        lea             rsi,            [rsi + rdx]
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_fp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(7) ; sum
-        mov             rdi,            arg(8) ; sumsquared
-
-        movd            [rsi],          mm2    ; xsum
-        movd            [rdi],          mm4    ; xxsum
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void vp9_half_horiz_vert_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
@@ -802,122 +522,6 @@
     pop         rbp
     ret
 
-;void vp9_half_horiz_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vp9_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-.half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_vert_variance16x_h_1    ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void vp9_half_vert_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
@@ -1025,114 +629,7 @@
     pop         rbp
     ret
 
-;void vp9_half_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
-sym(vp9_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
 
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0)              ;ref_ptr
-
-        mov             rdi,            arg(2)              ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)    ;Height
-        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax          ]
-        pxor            xmm0,           xmm0
-
-.half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             .half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void vp9_half_horiz_variance8x_h_sse2
 ;(
 ;    unsigned char *ref_ptr,
@@ -1238,109 +735,6 @@
     pop         rbp
     ret
 
-;void vp9_half_horiz_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vp9_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-.half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             .half_horiz_variance16x_h_1         ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
 
 SECTION_RODATA
 ;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -9,11 +9,9 @@
 ##
 
 VP9_COMMON_SRCS-yes += vp9_common.mk
-VP9_COMMON_SRCS-yes += common/vp9_type_aliases.h
 VP9_COMMON_SRCS-yes += common/vp9_pragmas.h
 VP9_COMMON_SRCS-yes += common/vp9_ppflags.h
 VP9_COMMON_SRCS-yes += common/vp9_onyx.h
-VP9_COMMON_SRCS-yes += common/vp9_onyxd.h
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
 VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c
 VP9_COMMON_SRCS-yes += common/vp9_blockd.c
@@ -32,7 +30,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h
 VP9_COMMON_SRCS-yes += common/vp9_blockd.h
 VP9_COMMON_SRCS-yes += common/vp9_common.h
-VP9_COMMON_SRCS-yes += common/vp9_common_types.h
 VP9_COMMON_SRCS-yes += common/vp9_entropy.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
@@ -94,11 +91,13 @@
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
 ifeq ($(CONFIG_POSTPROC),yes)
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -15,7 +15,7 @@
 #include "vpx/vp8dx.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx_version.h"
-#include "common/vp9_onyxd.h"
+#include "decoder/vp9_onyxd.h"
 #include "decoder/vp9_onyxd_int.h"
 
 #define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
@@ -573,7 +573,8 @@
 
     image2yuvconfig(&frame->img, &sd);
 
-    return vp9_set_reference_dec(ctx->pbi, frame->frame_type, &sd);
+    return vp9_set_reference_dec(ctx->pbi,
+                                 (VP9_REFFRAME)frame->frame_type, &sd);
   } else
     return VPX_CODEC_INVALID_PARAM;
 
@@ -591,7 +592,8 @@
 
     image2yuvconfig(&frame->img, &sd);
 
-    return vp9_get_reference_dec(ctx->pbi, frame->frame_type, &sd);
+    return vp9_get_reference_dec(ctx->pbi,
+                                 (VP9_REFFRAME)frame->frame_type, &sd);
   } else
     return VPX_CODEC_INVALID_PARAM;
 
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -28,6 +28,7 @@
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
 VP9_DX_SRCS-yes += decoder/vp9_dequantize.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
+VP9_DX_SRCS-yes += decoder/vp9_onyxd.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_int.h
 VP9_DX_SRCS-yes += decoder/vp9_treereader.h
 VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
--- a/vpx/vpx_integer.h
+++ b/vpx/vpx_integer.h
@@ -27,6 +27,7 @@
 #if (defined(_MSC_VER) && (_MSC_VER < 1600))
 typedef signed __int64   int64_t;
 typedef unsigned __int64 uint64_t;
+#define INT64_MAX _I64_MAX
 #endif
 
 #ifndef _UINTPTR_T_DEFINED
--- a/vpx_mem/vpx_mem.h
+++ b/vpx_mem/vpx_mem.h
@@ -101,14 +101,7 @@
   /* some defines for backward compatibility */
 #define DMEM_GENERAL 0
 
-#define duck_memalign(X,Y,Z) vpx_memalign(X,Y)
-#define duck_malloc(X,Y) vpx_malloc(X)
-#define duck_calloc(X,Y,Z) vpx_calloc(X,Y)
-#define duck_realloc  vpx_realloc
-#define duck_free     vpx_free
-#define duck_memcpy   vpx_memcpy
-#define duck_memmove  vpx_memmove
-#define duck_memset   vpx_memset
+// (*)<
 
 #if REPLACE_BUILTIN_FUNCTIONS
 # ifndef __VPX_MEM_C__
--- a/vpx_ports/vpxtypes.h
+++ /dev/null
@@ -1,166 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __VPXTYPES_H__
-#define __VPXTYPES_H__
-
-#include "vpx_config.h"
-
-// #include <sys/types.h>
-#ifdef _MSC_VER
-# include <basetsd.h>
-typedef SSIZE_T ssize_t;
-#endif
-
-#if defined(HAVE_STDINT_H) && HAVE_STDINT_H
-/* C99 types are preferred to vpx integer types */
-# include <stdint.h>
-#endif
-
-/*!\defgroup basetypes Base Types
-  @{*/
-#if !defined(HAVE_STDINT_H) && !defined(INT_T_DEFINED)
-# ifdef STRICTTYPES
-typedef signed char  int8_t;
-typedef signed short int16_t;
-typedef signed int   int32_t;
-# else
-typedef char         int8_t;
-typedef short        int16_t;
-typedef int          int32_t;
-# endif
-typedef unsigned char  uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int   uint32_t;
-#endif
-
-typedef int8_t     vpxs8;
-typedef uint8_t    vpxu8;
-typedef int16_t    vpxs16;
-typedef uint16_t   vpxu16;
-typedef int32_t    vpxs32;
-typedef uint32_t   vpxu32;
-typedef int32_t    vpxbool;
-
-enum {vpxfalse, vpxtrue};
-
-/*!\def OTC
-   \brief a macro suitable for declaring a constant #vpxtc*/
-/*!\def VPXTC
-   \brief printf format string suitable for printing an #vpxtc*/
-#ifdef UNICODE
-# ifdef NO_WCHAR
-#  error "no non-wchar support added yet"
-# else
-#  include <wchar.h>
-typedef wchar_t vpxtc;
-#  define OTC(str) L ## str
-#  define VPXTC "ls"
-# endif /*NO_WCHAR*/
-#else
-typedef char vpxtc;
-# define OTC(str) (vpxtc*)str
-# define VPXTC "s"
-#endif /*UNICODE*/
-/*@} end - base types*/
-
-/*!\addtogroup basetypes
-  @{*/
-/*!\def VPX64
-   \brief printf format string suitable for printing an #vpxs64*/
-#if defined(HAVE_STDINT_H)
-# define VPX64 PRId64
-typedef int64_t vpxs64;
-#elif defined(HASLONGLONG)
-# undef  PRId64
-# define PRId64 "lld"
-# define VPX64 PRId64
-typedef long long vpxs64;
-#elif defined(WIN32) || defined(_WIN32_WCE)
-# undef  PRId64
-# define PRId64 "I64d"
-# define VPX64 PRId64
-typedef __int64 vpxs64;
-typedef unsigned __int64 vpxu64;
-#elif defined(__uClinux__) && defined(CHIP_DM642)
-# include <lddk.h>
-# undef  PRId64
-# define PRId64 "lld"
-# define VPX64 PRId64
-typedef long vpxs64;
-#else
-# error "64 bit integer type undefined for this platform!"
-#endif
-#if !defined(HAVE_STDINT_H) && !defined(INT_T_DEFINED)
-typedef vpxs64 int64_t;
-typedef vpxu64 uint64_t;
-#endif
-/*!@} end - base types*/
-
-/*!\ingroup basetypes
-   \brief Common return type*/
-typedef enum {
-  VPX_NOT_FOUND        = -404,
-  VPX_BUFFER_EMPTY     = -202,
-  VPX_BUFFER_FULL      = -201,
-
-  VPX_CONNREFUSED      = -102,
-  VPX_TIMEDOUT         = -101,
-  VPX_WOULDBLOCK       = -100,
-
-  VPX_NET_ERROR        = -9,
-  VPX_INVALID_VERSION  = -8,
-  VPX_INPROGRESS       = -7,
-  VPX_NOT_SUPP         = -6,
-  VPX_NO_MEM           = -3,
-  VPX_INVALID_PARAMS   = -2,
-  VPX_ERROR            = -1,
-  VPX_OK               = 0,
-  VPX_DONE             = 1
-} vpxsc;
-
-#if defined(WIN32) || defined(_WIN32_WCE)
-# define DLLIMPORT __declspec(dllimport)
-# define DLLEXPORT __declspec(dllexport)
-# define DLLLOCAL
-#elif defined(LINUX)
-# define DLLIMPORT
-/*visibility attribute support is available in 3.4 and later.
-  see: http:// gcc.gnu.org/wiki/Visibility for more info*/
-# if defined(__GNUC__) && ((__GNUC__<<16|(__GNUC_MINOR__&0xff)) >= (3<<16|4))
-#  define GCC_HASCLASSVISIBILITY
-# endif /*defined(__GNUC__) && __GNUC_PREREQ(3,4)*/
-# ifdef GCC_HASCLASSVISIBILITY
-#  define DLLEXPORT   __attribute__ ((visibility("default")))
-#  define DLLLOCAL __attribute__ ((visibility("hidden")))
-# else
-#  define DLLEXPORT
-#  define DLLLOCAL
-# endif /*GCC_HASCLASSVISIBILITY*/
-#endif /*platform ifdefs*/
-
-#endif /*__VPXTYPES_H__*/
-
-#undef VPXAPI
-/*!\def VPXAPI
-   \brief library calling convention/storage class attributes.
-
-   Specifies whether the function is imported through a dll
-   or is from a static library.*/
-#ifdef VPXDLL
-# ifdef VPXDLLEXPORT
-#  define VPXAPI DLLEXPORT
-# else
-#  define VPXAPI DLLIMPORT
-# endif /*VPXDLLEXPORT*/
-#else
-# define VPXAPI
-#endif /*VPXDLL*/
--- a/vpx_scale/generic/gen_scalers.c
+++ b/vpx_scale/generic/gen_scalers.c
@@ -9,7 +9,7 @@
  */
 
 
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vpx_mem/vpx_mem.h"
 /****************************************************************************
 *  Imports
--- /dev/null
+++ b/vpx_scale/generic/vpx_scale.c
@@ -1,0 +1,530 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ *
+ *   Module Title :     scale.c
+ *
+ *   Description  :     Image scaling functions.
+ *
+ ***************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+#include "./vpx_scale_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12config.h"
+
+typedef struct {
+  int     expanded_frame_width;
+  int     expanded_frame_height;
+
+  int HScale;
+  int HRatio;
+  int VScale;
+  int VRatio;
+
+  YV12_BUFFER_CONFIG *src_yuv_config;
+  YV12_BUFFER_CONFIG *dst_yuv_config;
+
+} SCALE_VARS;
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_i
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source (UNUSED).
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination (UNUSED).
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 interpolated scaling.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_i
+(
+  const unsigned char *source,
+  int source_step,
+  unsigned int source_scale,
+  unsigned int source_length,
+  unsigned char *dest,
+  int dest_step,
+  unsigned int dest_scale,
+  unsigned int dest_length
+) {
+  unsigned int i, j;
+  unsigned int temp;
+  int source_pitch = source_step;
+  (void) source_length;
+  (void) source_scale;
+  (void) dest_scale;
+
+  source_step *= 2;
+  dest[0] = source[0];
+
+  for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step) {
+    temp = 8;
+    temp += 3 * source[j - source_pitch];
+    temp += 10 * source[j];
+    temp += 3 * source[j + source_pitch];
+    temp >>= 4;
+    dest[i] = (char)(temp);
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_2t1_ps
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source (UNUSED).
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination (UNUSED).
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-to-1 point subsampled scaling.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void scale1d_2t1_ps
+(
+  const unsigned char *source,
+  int source_step,
+  unsigned int source_scale,
+  unsigned int source_length,
+  unsigned char *dest,
+  int dest_step,
+  unsigned int dest_scale,
+  unsigned int dest_length
+) {
+  unsigned int i, j;
+
+  (void) source_length;
+  (void) source_scale;
+  (void) dest_scale;
+
+  source_step *= 2;
+  j = 0;
+
+  for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step)
+    dest[i] = source[j];
+}
+/****************************************************************************
+ *
+ *  ROUTINE       : scale1d_c
+ *
+ *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
+ *                  int source_step              : Number of pixels to step on in source.
+ *                  unsigned int source_scale    : Scale for source.
+ *                  unsigned int source_length   : Length of source (UNUSED).
+ *                  unsigned char *dest         : Pointer to output data array.
+ *                  int dest_step                : Number of pixels to step on in destination.
+ *                  unsigned int dest_scale      : Scale for destination.
+ *                  unsigned int dest_length     : Length of destination.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs linear interpolation in one dimension.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+static
+void scale1d_c
+(
+  const unsigned char *source,
+  int source_step,
+  unsigned int source_scale,
+  unsigned int source_length,
+  unsigned char *dest,
+  int dest_step,
+  unsigned int dest_scale,
+  unsigned int dest_length
+) {
+  unsigned int i;
+  unsigned int round_value = dest_scale / 2;
+  unsigned int left_modifier = dest_scale;
+  unsigned int right_modifier = 0;
+  unsigned char left_pixel = *source;
+  unsigned char right_pixel = *(source + source_step);
+
+  (void) source_length;
+
+  /* These asserts are needed if there are boundary issues... */
+  /*assert ( dest_scale > source_scale );*/
+  /*assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );*/
+
+  for (i = 0; i < dest_length * dest_step; i += dest_step) {
+    dest[i] = (char)((left_modifier * left_pixel + right_modifier * right_pixel + round_value) / dest_scale);
+
+    right_modifier += source_scale;
+
+    while (right_modifier > dest_scale) {
+      right_modifier -= dest_scale;
+      source += source_step;
+      left_pixel = *source;
+      right_pixel = *(source + source_step);
+    }
+
+    left_modifier = dest_scale - right_modifier;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : Scale2D
+ *
+ *  INPUTS        : const unsigned char *source  : Pointer to data to be scaled.
+ *                  int source_pitch              : Stride of source image.
+ *                  unsigned int source_width     : Width of input image.
+ *                  unsigned int source_height    : Height of input image.
+ *                  unsigned char *dest          : Pointer to output data array.
+ *                  int dest_pitch                : Stride of destination image.
+ *                  unsigned int dest_width       : Width of destination image.
+ *                  unsigned int dest_height      : Height of destination image.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal scale factor numerator.
+ *                  unsigned int hratio          : Horizontal scale factor denominator.
+ *                  unsigned int vscale          : Vertical scale factor numerator.
+ *                  unsigned int vratio          : Vertical scale factor denominator.
+ *                  unsigned int interlaced      : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+static
+void Scale2D
+(
+  /*const*/
+  unsigned char *source,
+  int source_pitch,
+  unsigned int source_width,
+  unsigned int source_height,
+  unsigned char *dest,
+  int dest_pitch,
+  unsigned int dest_width,
+  unsigned int dest_height,
+  unsigned char *temp_area,
+  unsigned char temp_area_height,
+  unsigned int hscale,
+  unsigned int hratio,
+  unsigned int vscale,
+  unsigned int vratio,
+  unsigned int interlaced
+) {
+  /*unsigned*/
+  int i, j, k;
+  int bands;
+  int dest_band_height;
+  int source_band_height;
+
+  typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
+                          unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
+
+  Scale1D Scale1Dv = scale1d_c;
+  Scale1D Scale1Dh = scale1d_c;
+
+  void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
+  void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *, unsigned int, unsigned int) = NULL;
+
+  int ratio_scalable = 1;
+  int interpolation = 0;
+
+  unsigned char *source_base; /* = (unsigned char *) ((source_pitch >= 0) ? source : (source + ((source_height-1) * source_pitch))); */
+  unsigned char *line_src;
+
+
+  source_base = (unsigned char *)source;
+
+  if (source_pitch < 0) {
+    int offset;
+
+    offset = (source_height - 1);
+    offset *= source_pitch;
+
+    source_base += offset;
+  }
+
+  /* find out the ratio for each direction */
+  switch (hratio * 10 / hscale) {
+    case 8:
+      /* 4-5 Scale in Width direction */
+      horiz_line_scale = vp8_horizontal_line_5_4_scale;
+      break;
+    case 6:
+      /* 3-5 Scale in Width direction */
+      horiz_line_scale = vp8_horizontal_line_5_3_scale;
+      break;
+    case 5:
+      /* 1-2 Scale in Width direction */
+      horiz_line_scale = vp8_horizontal_line_2_1_scale;
+      break;
+    default:
+      /* The ratio is not acceptable now */
+      /* throw("The ratio is not acceptable for now!"); */
+      ratio_scalable = 0;
+      break;
+  }
+
+  switch (vratio * 10 / vscale) {
+    case 8:
+      /* 4-5 Scale in vertical direction */
+      vert_band_scale     = vp8_vertical_band_5_4_scale;
+      source_band_height  = 5;
+      dest_band_height    = 4;
+      break;
+    case 6:
+      /* 3-5 Scale in vertical direction */
+      vert_band_scale     = vp8_vertical_band_5_3_scale;
+      source_band_height  = 5;
+      dest_band_height    = 3;
+      break;
+    case 5:
+      /* 1-2 Scale in vertical direction */
+
+      if (interlaced) {
+        /* if the content is interlaced, point sampling is used */
+        vert_band_scale     = vp8_vertical_band_2_1_scale;
+      } else {
+
+        interpolation = 1;
+        /* if the content is progressive, interplo */
+        vert_band_scale     = vp8_vertical_band_2_1_scale_i;
+
+      }
+
+      source_band_height  = 2;
+      dest_band_height    = 1;
+      break;
+    default:
+      /* The ratio is not acceptable now */
+      /* throw("The ratio is not acceptable for now!"); */
+      ratio_scalable = 0;
+      break;
+  }
+
+  if (ratio_scalable) {
+    if (source_height == dest_height) {
+      /* for each band of the image */
+      for (k = 0; k < (int)dest_height; k++) {
+        horiz_line_scale(source, source_width, dest, dest_width);
+        source += source_pitch;
+        dest   += dest_pitch;
+      }
+
+      return;
+    }
+
+    if (interpolation) {
+      if (source < source_base)
+        source = source_base;
+
+      horiz_line_scale(source, source_width, temp_area, dest_width);
+    }
+
+    for (k = 0; k < (int)(dest_height + dest_band_height - 1) / dest_band_height; k++) {
+      /* scale one band horizontally */
+      for (i = 0; i < source_band_height; i++) {
+        /* Trap case where we could read off the base of the source buffer */
+
+        line_src = (unsigned char *)source + i * source_pitch;
+
+        if (line_src < source_base)
+          line_src = source_base;
+
+        horiz_line_scale(line_src, source_width,
+                         temp_area + (i + 1)*dest_pitch, dest_width);
+      }
+
+      /* Vertical scaling is in place */
+      vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
+
+      if (interpolation)
+        vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
+
+      /* Next band... */
+      source += (unsigned long) source_band_height  * source_pitch;
+      dest   += (unsigned long) dest_band_height * dest_pitch;
+    }
+
+    return;
+  }
+
+  if (hscale == 2 && hratio == 1)
+    Scale1Dh = scale1d_2t1_ps;
+
+  if (vscale == 2 && vratio == 1) {
+    if (interlaced)
+      Scale1Dv = scale1d_2t1_ps;
+    else
+      Scale1Dv = scale1d_2t1_i;
+  }
+
+  if (source_height == dest_height) {
+    /* for each band of the image */
+    for (k = 0; k < (int)dest_height; k++) {
+      Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
+      source += source_pitch;
+      dest   += dest_pitch;
+    }
+
+    return;
+  }
+
+  if (dest_height > source_height) {
+    dest_band_height   = temp_area_height - 1;
+    source_band_height = dest_band_height * source_height / dest_height;
+  } else {
+    source_band_height = temp_area_height - 1;
+    dest_band_height   = source_band_height * vratio / vscale;
+  }
+
+  /* first row needs to be done so that we can stay one row ahead for vertical zoom */
+  Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
+
+  /* for each band of the image */
+  bands = (dest_height + dest_band_height - 1) / dest_band_height;
+
+  for (k = 0; k < bands; k++) {
+    /* scale one band horizontally */
+    for (i = 1; i < source_band_height + 1; i++) {
+      if (k * source_band_height + i < (int) source_height) {
+        Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
+                 temp_area + i * dest_pitch, 1, hratio, dest_width);
+      } else { /*  Duplicate the last row */
+        /* copy temp_area row 0 over from last row in the past */
+        vpx_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+      }
+    }
+
+    /* scale one band vertically */
+    for (j = 0; j < (int)dest_width; j++) {
+      Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
+               &dest[j], dest_pitch, vratio, dest_band_height);
+    }
+
+    /* copy temp_area row 0 over from last row in the past */
+    vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+
+    /* move to the next band */
+    source += source_band_height * source_pitch;
+    dest   += dest_band_height * dest_pitch;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : vpx_scale_frame
+ *
+ *  INPUTS        : YV12_BUFFER_CONFIG *src       : Pointer to frame to be scaled.
+ *                  YV12_BUFFER_CONFIG *dst       : Pointer to buffer to hold scaled frame.
+ *                  unsigned char *temp_area      : Pointer to temp work area.
+ *                  unsigned char temp_area_height : Height of temp work area.
+ *                  unsigned int hscale          : Horizontal scale factor numerator.
+ *                  unsigned int hratio          : Horizontal scale factor denominator.
+ *                  unsigned int vscale          : Vertical scale factor numerator.
+ *                  unsigned int vratio          : Vertical scale factor denominator.
+ *                  unsigned int interlaced      : Interlace flag.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
+ *
+ *  SPECIAL NOTES : Expansion is performed one band at a time to help with
+ *                  caching.
+ *
+ ****************************************************************************/
+void vpx_scale_frame
+(
+  YV12_BUFFER_CONFIG *src,
+  YV12_BUFFER_CONFIG *dst,
+  unsigned char *temp_area,
+  unsigned char temp_height,
+  unsigned int hscale,
+  unsigned int hratio,
+  unsigned int vscale,
+  unsigned int vratio,
+  unsigned int interlaced
+) {
+  int i;
+  int dw = (hscale - 1 + src->y_width * hratio) / hscale;
+  int dh = (vscale - 1 + src->y_height * vratio) / vscale;
+
+  /* call our internal scaling routines!! */
+  Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
+          (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
+          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+  if (dw < (int)dst->y_width)
+    for (i = 0; i < dh; i++)
+      vpx_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
+
+  if (dh < (int)dst->y_height)
+    for (i = dh - 1; i < (int)dst->y_height; i++)
+      vpx_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+
+  Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
+          (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
+          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+  if (dw / 2 < (int)dst->uv_width)
+    for (i = 0; i < dst->uv_height; i++)
+      vpx_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+
+  if (dh / 2 < (int)dst->uv_height)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+      vpx_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+
+  Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
+          (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
+          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
+
+  if (dw / 2 < (int)dst->uv_width)
+    for (i = 0; i < dst->uv_height; i++)
+      vpx_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+
+  if (dh / 2 < (int) dst->uv_height)
+    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
+      vpx_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+}
--- a/vpx_scale/generic/vpxscale.c
+++ /dev/null
@@ -1,530 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- *
- *   Module Title :     scale.c
- *
- *   Description  :     Image scaling functions.
- *
- ***************************************************************************/
-
-/****************************************************************************
-*  Header Files
-****************************************************************************/
-#include "./vpx_scale_rtcd.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/yv12config.h"
-
-typedef struct {
-  int     expanded_frame_width;
-  int     expanded_frame_height;
-
-  int HScale;
-  int HRatio;
-  int VScale;
-  int VRatio;
-
-  YV12_BUFFER_CONFIG *src_yuv_config;
-  YV12_BUFFER_CONFIG *dst_yuv_config;
-
-} SCALE_VARS;
-
-/****************************************************************************
- *
- *  ROUTINE       : scale1d_2t1_i
- *
- *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
- *                  int source_step              : Number of pixels to step on in source.
- *                  unsigned int source_scale    : Scale for source (UNUSED).
- *                  unsigned int source_length   : Length of source (UNUSED).
- *                  unsigned char *dest         : Pointer to output data array.
- *                  int dest_step                : Number of pixels to step on in destination.
- *                  unsigned int dest_scale      : Scale for destination (UNUSED).
- *                  unsigned int dest_length     : Length of destination.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-to-1 interpolated scaling.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void scale1d_2t1_i
-(
-  const unsigned char *source,
-  int source_step,
-  unsigned int source_scale,
-  unsigned int source_length,
-  unsigned char *dest,
-  int dest_step,
-  unsigned int dest_scale,
-  unsigned int dest_length
-) {
-  unsigned int i, j;
-  unsigned int temp;
-  int source_pitch = source_step;
-  (void) source_length;
-  (void) source_scale;
-  (void) dest_scale;
-
-  source_step *= 2;
-  dest[0] = source[0];
-
-  for (i = dest_step, j = source_step; i < dest_length * dest_step; i += dest_step, j += source_step) {
-    temp = 8;
-    temp += 3 * source[j - source_pitch];
-    temp += 10 * source[j];
-    temp += 3 * source[j + source_pitch];
-    temp >>= 4;
-    dest[i] = (char)(temp);
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : scale1d_2t1_ps
- *
- *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
- *                  int source_step              : Number of pixels to step on in source.
- *                  unsigned int source_scale    : Scale for source (UNUSED).
- *                  unsigned int source_length   : Length of source (UNUSED).
- *                  unsigned char *dest         : Pointer to output data array.
- *                  int dest_step                : Number of pixels to step on in destination.
- *                  unsigned int dest_scale      : Scale for destination (UNUSED).
- *                  unsigned int dest_length     : Length of destination.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-to-1 point subsampled scaling.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void scale1d_2t1_ps
-(
-  const unsigned char *source,
-  int source_step,
-  unsigned int source_scale,
-  unsigned int source_length,
-  unsigned char *dest,
-  int dest_step,
-  unsigned int dest_scale,
-  unsigned int dest_length
-) {
-  unsigned int i, j;
-
-  (void) source_length;
-  (void) source_scale;
-  (void) dest_scale;
-
-  source_step *= 2;
-  j = 0;
-
-  for (i = 0; i < dest_length * dest_step; i += dest_step, j += source_step)
-    dest[i] = source[j];
-}
-/****************************************************************************
- *
- *  ROUTINE       : scale1d_c
- *
- *  INPUTS        : const unsigned char *source : Pointer to data to be scaled.
- *                  int source_step              : Number of pixels to step on in source.
- *                  unsigned int source_scale    : Scale for source.
- *                  unsigned int source_length   : Length of source (UNUSED).
- *                  unsigned char *dest         : Pointer to output data array.
- *                  int dest_step                : Number of pixels to step on in destination.
- *                  unsigned int dest_scale      : Scale for destination.
- *                  unsigned int dest_length     : Length of destination.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs linear interpolation in one dimension.
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void scale1d_c
-(
-  const unsigned char *source,
-  int source_step,
-  unsigned int source_scale,
-  unsigned int source_length,
-  unsigned char *dest,
-  int dest_step,
-  unsigned int dest_scale,
-  unsigned int dest_length
-) {
-  unsigned int i;
-  unsigned int round_value = dest_scale / 2;
-  unsigned int left_modifier = dest_scale;
-  unsigned int right_modifier = 0;
-  unsigned char left_pixel = *source;
-  unsigned char right_pixel = *(source + source_step);
-
-  (void) source_length;
-
-  /* These asserts are needed if there are boundary issues... */
-  /*assert ( dest_scale > source_scale );*/
-  /*assert ( (source_length-1) * dest_scale >= (dest_length-1) * source_scale );*/
-
-  for (i = 0; i < dest_length * dest_step; i += dest_step) {
-    dest[i] = (char)((left_modifier * left_pixel + right_modifier * right_pixel + round_value) / dest_scale);
-
-    right_modifier += source_scale;
-
-    while (right_modifier > dest_scale) {
-      right_modifier -= dest_scale;
-      source += source_step;
-      left_pixel = *source;
-      right_pixel = *(source + source_step);
-    }
-
-    left_modifier = dest_scale - right_modifier;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : Scale2D
- *
- *  INPUTS        : const unsigned char *source  : Pointer to data to be scaled.
- *                  int source_pitch              : Stride of source image.
- *                  unsigned int source_width     : Width of input image.
- *                  unsigned int source_height    : Height of input image.
- *                  unsigned char *dest          : Pointer to output data array.
- *                  int dest_pitch                : Stride of destination image.
- *                  unsigned int dest_width       : Width of destination image.
- *                  unsigned int dest_height      : Height of destination image.
- *                  unsigned char *temp_area      : Pointer to temp work area.
- *                  unsigned char temp_area_height : Height of temp work area.
- *                  unsigned int hscale          : Horizontal scale factor numerator.
- *                  unsigned int hratio          : Horizontal scale factor denominator.
- *                  unsigned int vscale          : Vertical scale factor numerator.
- *                  unsigned int vratio          : Vertical scale factor denominator.
- *                  unsigned int interlaced      : Interlace flag.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
- *
- *  SPECIAL NOTES : Expansion is performed one band at a time to help with
- *                  caching.
- *
- ****************************************************************************/
-static
-void Scale2D
-(
-  /*const*/
-  unsigned char *source,
-  int source_pitch,
-  unsigned int source_width,
-  unsigned int source_height,
-  unsigned char *dest,
-  int dest_pitch,
-  unsigned int dest_width,
-  unsigned int dest_height,
-  unsigned char *temp_area,
-  unsigned char temp_area_height,
-  unsigned int hscale,
-  unsigned int hratio,
-  unsigned int vscale,
-  unsigned int vratio,
-  unsigned int interlaced
-) {
-  /*unsigned*/
-  int i, j, k;
-  int bands;
-  int dest_band_height;
-  int source_band_height;
-
-  typedef void (*Scale1D)(const unsigned char * source, int source_step, unsigned int source_scale, unsigned int source_length,
-                          unsigned char * dest, int dest_step, unsigned int dest_scale, unsigned int dest_length);
-
-  Scale1D Scale1Dv = scale1d_c;
-  Scale1D Scale1Dh = scale1d_c;
-
-  void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *, unsigned int) = NULL;
-  void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *, unsigned int, unsigned int) = NULL;
-
-  int ratio_scalable = 1;
-  int interpolation = 0;
-
-  unsigned char *source_base; /* = (unsigned char *) ((source_pitch >= 0) ? source : (source + ((source_height-1) * source_pitch))); */
-  unsigned char *line_src;
-
-
-  source_base = (unsigned char *)source;
-
-  if (source_pitch < 0) {
-    int offset;
-
-    offset = (source_height - 1);
-    offset *= source_pitch;
-
-    source_base += offset;
-  }
-
-  /* find out the ratio for each direction */
-  switch (hratio * 10 / hscale) {
-    case 8:
-      /* 4-5 Scale in Width direction */
-      horiz_line_scale = vp8_horizontal_line_5_4_scale;
-      break;
-    case 6:
-      /* 3-5 Scale in Width direction */
-      horiz_line_scale = vp8_horizontal_line_5_3_scale;
-      break;
-    case 5:
-      /* 1-2 Scale in Width direction */
-      horiz_line_scale = vp8_horizontal_line_2_1_scale;
-      break;
-    default:
-      /* The ratio is not acceptable now */
-      /* throw("The ratio is not acceptable for now!"); */
-      ratio_scalable = 0;
-      break;
-  }
-
-  switch (vratio * 10 / vscale) {
-    case 8:
-      /* 4-5 Scale in vertical direction */
-      vert_band_scale     = vp8_vertical_band_5_4_scale;
-      source_band_height  = 5;
-      dest_band_height    = 4;
-      break;
-    case 6:
-      /* 3-5 Scale in vertical direction */
-      vert_band_scale     = vp8_vertical_band_5_3_scale;
-      source_band_height  = 5;
-      dest_band_height    = 3;
-      break;
-    case 5:
-      /* 1-2 Scale in vertical direction */
-
-      if (interlaced) {
-        /* if the content is interlaced, point sampling is used */
-        vert_band_scale     = vp8_vertical_band_2_1_scale;
-      } else {
-
-        interpolation = 1;
-        /* if the content is progressive, interplo */
-        vert_band_scale     = vp8_vertical_band_2_1_scale_i;
-
-      }
-
-      source_band_height  = 2;
-      dest_band_height    = 1;
-      break;
-    default:
-      /* The ratio is not acceptable now */
-      /* throw("The ratio is not acceptable for now!"); */
-      ratio_scalable = 0;
-      break;
-  }
-
-  if (ratio_scalable) {
-    if (source_height == dest_height) {
-      /* for each band of the image */
-      for (k = 0; k < (int)dest_height; k++) {
-        horiz_line_scale(source, source_width, dest, dest_width);
-        source += source_pitch;
-        dest   += dest_pitch;
-      }
-
-      return;
-    }
-
-    if (interpolation) {
-      if (source < source_base)
-        source = source_base;
-
-      horiz_line_scale(source, source_width, temp_area, dest_width);
-    }
-
-    for (k = 0; k < (int)(dest_height + dest_band_height - 1) / dest_band_height; k++) {
-      /* scale one band horizontally */
-      for (i = 0; i < source_band_height; i++) {
-        /* Trap case where we could read off the base of the source buffer */
-
-        line_src = (unsigned char *)source + i * source_pitch;
-
-        if (line_src < source_base)
-          line_src = source_base;
-
-        horiz_line_scale(line_src, source_width,
-                         temp_area + (i + 1)*dest_pitch, dest_width);
-      }
-
-      /* Vertical scaling is in place */
-      vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
-
-      if (interpolation)
-        vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
-
-      /* Next band... */
-      source += (unsigned long) source_band_height  * source_pitch;
-      dest   += (unsigned long) dest_band_height * dest_pitch;
-    }
-
-    return;
-  }
-
-  if (hscale == 2 && hratio == 1)
-    Scale1Dh = scale1d_2t1_ps;
-
-  if (vscale == 2 && vratio == 1) {
-    if (interlaced)
-      Scale1Dv = scale1d_2t1_ps;
-    else
-      Scale1Dv = scale1d_2t1_i;
-  }
-
-  if (source_height == dest_height) {
-    /* for each band of the image */
-    for (k = 0; k < (int)dest_height; k++) {
-      Scale1Dh(source, 1, hscale, source_width + 1, dest, 1, hratio, dest_width);
-      source += source_pitch;
-      dest   += dest_pitch;
-    }
-
-    return;
-  }
-
-  if (dest_height > source_height) {
-    dest_band_height   = temp_area_height - 1;
-    source_band_height = dest_band_height * source_height / dest_height;
-  } else {
-    source_band_height = temp_area_height - 1;
-    dest_band_height   = source_band_height * vratio / vscale;
-  }
-
-  /* first row needs to be done so that we can stay one row ahead for vertical zoom */
-  Scale1Dh(source, 1, hscale, source_width + 1, temp_area, 1, hratio, dest_width);
-
-  /* for each band of the image */
-  bands = (dest_height + dest_band_height - 1) / dest_band_height;
-
-  for (k = 0; k < bands; k++) {
-    /* scale one band horizontally */
-    for (i = 1; i < source_band_height + 1; i++) {
-      if (k * source_band_height + i < (int) source_height) {
-        Scale1Dh(source + i * source_pitch, 1, hscale, source_width + 1,
-                 temp_area + i * dest_pitch, 1, hratio, dest_width);
-      } else { /*  Duplicate the last row */
-        /* copy temp_area row 0 over from last row in the past */
-        duck_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
-      }
-    }
-
-    /* scale one band vertically */
-    for (j = 0; j < (int)dest_width; j++) {
-      Scale1Dv(&temp_area[j], dest_pitch, vscale, source_band_height + 1,
-               &dest[j], dest_pitch, vratio, dest_band_height);
-    }
-
-    /* copy temp_area row 0 over from last row in the past */
-    duck_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
-
-    /* move to the next band */
-    source += source_band_height * source_pitch;
-    dest   += dest_band_height * dest_pitch;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       :
- *
- *  INPUTS        : YV12_BUFFER_CONFIG *src       : Pointer to frame to be scaled.
- *                  YV12_BUFFER_CONFIG *dst       : Pointer to buffer to hold scaled frame.
- *                  unsigned char *temp_area      : Pointer to temp work area.
- *                  unsigned char temp_area_height : Height of temp work area.
- *                  unsigned int hscale          : Horizontal scale factor numerator.
- *                  unsigned int hratio          : Horizontal scale factor denominator.
- *                  unsigned int vscale          : Vertical scale factor numerator.
- *                  unsigned int vratio          : Vertical scale factor denominator.
- *                  unsigned int interlaced      : Interlace flag.
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Performs 2-tap linear interpolation in two dimensions.
- *
- *  SPECIAL NOTES : Expansion is performed one band at a time to help with
- *                  caching.
- *
- ****************************************************************************/
-void vp8_scale_frame
-(
-  YV12_BUFFER_CONFIG *src,
-  YV12_BUFFER_CONFIG *dst,
-  unsigned char *temp_area,
-  unsigned char temp_height,
-  unsigned int hscale,
-  unsigned int hratio,
-  unsigned int vscale,
-  unsigned int vratio,
-  unsigned int interlaced
-) {
-  int i;
-  int dw = (hscale - 1 + src->y_width * hratio) / hscale;
-  int dh = (vscale - 1 + src->y_height * vratio) / vscale;
-
-  /* call our internal scaling routines!! */
-  Scale2D((unsigned char *) src->y_buffer, src->y_stride, src->y_width, src->y_height,
-          (unsigned char *) dst->y_buffer, dst->y_stride, dw, dh,
-          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
-
-  if (dw < (int)dst->y_width)
-    for (i = 0; i < dh; i++)
-      duck_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
-
-  if (dh < (int)dst->y_height)
-    for (i = dh - 1; i < (int)dst->y_height; i++)
-      duck_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
-
-  Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
-          (unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
-          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
-
-  if (dw / 2 < (int)dst->uv_width)
-    for (i = 0; i < dst->uv_height; i++)
-      duck_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
-
-  if (dh / 2 < (int)dst->uv_height)
-    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
-      duck_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
-
-  Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
-          (unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
-          temp_area, temp_height, hscale, hratio, vscale, vratio, interlaced);
-
-  if (dw / 2 < (int)dst->uv_width)
-    for (i = 0; i < dst->uv_height; i++)
-      duck_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
-
-  if (dh / 2 < (int) dst->uv_height)
-    for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
-      duck_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
-}
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -11,7 +11,7 @@
 
 #include "vpx_scale/yv12config.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 
 /****************************************************************************
 *  Exports
--- /dev/null
+++ b/vpx_scale/vpx_scale.h
@@ -1,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VPXSCALE_H
+#define VPXSCALE_H
+
+#include "vpx_scale/yv12config.h"
+
+extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src,
+                            YV12_BUFFER_CONFIG *dst,
+                            unsigned char *temp_area,
+                            unsigned char temp_height,
+                            unsigned int hscale,
+                            unsigned int hratio,
+                            unsigned int vscale,
+                            unsigned int vratio,
+                            unsigned int interlaced);
+
+#endif
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -1,7 +1,7 @@
 SCALE_SRCS-yes += vpx_scale.mk
 SCALE_SRCS-yes += yv12config.h
-SCALE_SRCS-yes += vpxscale.h
-SCALE_SRCS-yes += generic/vpxscale.c
+SCALE_SRCS-yes += vpx_scale.h
+SCALE_SRCS-yes += generic/vpx_scale.c
 SCALE_SRCS-yes += generic/yv12config.c
 SCALE_SRCS-yes += generic/yv12extend.c
 SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
--- a/vpx_scale/vpxscale.h
+++ /dev/null
@@ -1,27 +1,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPXSCALE_H
-#define VPXSCALE_H
-
-#include "vpx_scale/yv12config.h"
-
-extern void vp8_scale_frame(YV12_BUFFER_CONFIG *src,
-                            YV12_BUFFER_CONFIG *dst,
-                            unsigned char *temp_area,
-                            unsigned char temp_height,
-                            unsigned int hscale,
-                            unsigned int hratio,
-                            unsigned int vscale,
-                            unsigned int vratio,
-                            unsigned int interlaced);
-
-#endif
--- a/vpx_scale/win32/scaleopt.c
+++ b/vpx_scale/win32/scaleopt.c
@@ -23,7 +23,7 @@
 ****************************************************************************/
 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
 
-#include "vpx_scale/vpxscale.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vpx_mem/vpx_mem.h"
 
 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -8,14 +8,15 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef YV12_CONFIG_H
 #define YV12_CONFIG_H
+
 #ifdef __cplusplus
-extern "C"
-{
+extern "C" {
 #endif
 
+#include "vpx/vpx_integer.h"
+
 #define VP8BORDERINPIXELS       32
 #define VP9BORDERINPIXELS       64
 #define VP9_INTERP_EXTEND        4
@@ -49,11 +50,11 @@
     int   uv_stride;
     /*    int   uvinternal_width; */
 
-    unsigned char *y_buffer;
-    unsigned char *u_buffer;
-    unsigned char *v_buffer;
+    uint8_t *y_buffer;
+    uint8_t *u_buffer;
+    uint8_t *v_buffer;
 
-    unsigned char *buffer_alloc;
+    uint8_t *buffer_alloc;
     int border;
     int frame_size;
     YUV_TYPE clrtype;
@@ -62,7 +63,8 @@
     int flags;
   } YV12_BUFFER_CONFIG;
 
-  int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border);
+  int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
+                                  int width, int height, int border);
   int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
 
 #ifdef __cplusplus
@@ -69,5 +71,4 @@
 }
 #endif
 
-
-#endif /*YV12_CONFIG_H*/
+#endif  // YV12_CONFIG_H