shithub: libvpx

--- a/.gitignore

+++ b/.gitignore

@@ -3,6 +3,8 @@

*.d

*.o

*~

+/*.ivf

+/*.ivf.md5

 /*-*.mk

 /*.asm

 /*.doxy

--- a/build/make/configure.sh

+++ b/build/make/configure.sh

@@ -460,6 +460,7 @@

 #ifndef VPX_CONFIG_H

 #define VPX_CONFIG_H

 #define RESTRICT    ${RESTRICT}

+#define INLINE      ${INLINE}

EOF

     print_config_h ARCH   "${TMP_H}" ${ARCH_LIST}

     print_config_h HAVE   "${TMP_H}" ${HAVE_LIST}

@@ -1005,12 +1006,6 @@

 #error "not x32"

 #endif

EOF

-        soft_enable runtime_cpu_detect

-        soft_enable mmx

-        soft_enable sse

-        soft_enable sse2

-        soft_enable sse3

-        soft_enable ssse3

         case  ${tgt_os} in

             win*)

@@ -1064,9 +1059,15 @@

;;

         esac

+        soft_enable runtime_cpu_detect

+        soft_enable mmx

+        soft_enable sse

+        soft_enable sse2

+        soft_enable sse3

+        soft_enable ssse3

         # We can't use 'check_cflags' until the compiler is configured and CC is

         # populated.

-        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4.1; then

+        if enabled gcc && ! disabled sse4_1 && ! check_cflags -msse4; then

             RTCD_OPTIONS="${RTCD_OPTIONS}--disable-sse4_1 "

         else

             soft_enable sse4_1

@@ -1173,6 +1174,14 @@

EOF

     [ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |

         grep '4f *32 *42 *45' >/dev/null 2>&1 && enable big_endian

+    # Try to find which inline keywords are supported

+    check_cc <<EOF && INLINE="inline"

+    static inline function() {}

+EOF

+    check_cc <<EOF && INLINE="__attribute__((always_inline))"

+    static __attribute__((always_inline)) function() {}

+EOF

     # Almost every platform uses pthreads.

     if enabled multithread; then

--- a/configure

+++ b/configure

@@ -239,17 +239,18 @@

 EXPERIMENT_LIST="

csm

-    lossless

     new_mvref

     implicit_segmentation

     newbintramodes

     comp_interintra_pred

-    tx64x64

-    dwtdcthybrid

-    cnvcontext

-    newcoefcontext

     enable_6tap

     abovesprefmv

+    code_nonzerocount

+    useselectrefmv

+    modelcoefprob

+    loop_dering

+    implicit_compoundinter_weight

+    scatterscan

 CONFIG_LIST="

     external_build

@@ -647,6 +648,7 @@

              enable solution

              vs_version=${tgt_cc##vs}

              all_targets="${all_targets} solution"

+             INLINE="__forceinline"

;;

     esac

--- a/test/altref_test.cc

+++ b/test/altref_test.cc

@@ -8,9 +8,10 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

 #include "test/i420_video_source.h"

+#include "test/util.h"

 namespace {

 // lookahead range: [kLookAheadMin, kLookAheadMax).

@@ -17,10 +18,10 @@

 const int kLookAheadMin = 5;

 const int kLookAheadMax = 26;

-class AltRefTest : public libvpx_test::EncoderTest,

-    public ::testing::TestWithParam<int> {

+class AltRefTest : public ::libvpx_test::EncoderTest,

+    public ::libvpx_test::CodecTestWithParam<int> {

  protected:

-  AltRefTest() : altref_count_(0) {}

+  AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}

   virtual ~AltRefTest() {}

   virtual void SetUp() {

@@ -58,7 +59,7 @@

   const vpx_rational timebase = { 33333333, 1000000000 };

   cfg_.g_timebase = timebase;

   cfg_.rc_target_bitrate = 1000;

-  cfg_.g_lag_in_frames = GetParam();

+  cfg_.g_lag_in_frames = GET_PARAM(1);

   libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,

                                      timebase.den, timebase.num, 0, 30);

@@ -66,6 +67,7 @@

   EXPECT_GE(altref_count(), 1);

-INSTANTIATE_TEST_CASE_P(NonZeroLag, AltRefTest,

-                        ::testing::Range(kLookAheadMin, kLookAheadMax));

+VP8_INSTANTIATE_TEST_CASE(AltRefTest,

+                          ::testing::Range(kLookAheadMin, kLookAheadMax));

 }  // namespace

--- /dev/null

+++ b/test/codec_factory.h

@@ -1,0 +1,232 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef TEST_CODEC_FACTORY_H_

+#define TEST_CODEC_FACTORY_H_

+extern "C" {

+#include "./vpx_config.h"

+#include "vpx/vpx_decoder.h"

+#include "vpx/vpx_encoder.h"

+#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER

+#include "vpx/vp8cx.h"

+#endif

+#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER

+#include "vpx/vp8dx.h"

+#endif

+}

+#include "test/decode_test_driver.h"

+#include "test/encode_test_driver.h"

+namespace libvpx_test {

+class CodecFactory {

+ public:

+  CodecFactory() {}

+  virtual ~CodecFactory() {}

+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,

+                                 unsigned long deadline) const = 0;

+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,

+                                 unsigned long deadline,

+                                 const unsigned long init_flags,

+                                 TwopassStatsStore *stats) const = 0;

+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,

+                                               int usage) const = 0;

+};

+/* Provide CodecTestWith<n>Params classes for a variable number of parameters

+ * to avoid having to include a pointer to the CodecFactory in every test

+ * definition.

+ */

+template<class T1>

+class CodecTestWithParam : public ::testing::TestWithParam<

+    std::tr1::tuple< const libvpx_test::CodecFactory*, T1 > > {

+};

+template<class T1, class T2>

+class CodecTestWith2Params : public ::testing::TestWithParam<

+    std::tr1::tuple< const libvpx_test::CodecFactory*, T1, T2 > > {

+};

+template<class T1, class T2, class T3>

+class CodecTestWith3Params : public ::testing::TestWithParam<

+    std::tr1::tuple< const libvpx_test::CodecFactory*, T1, T2, T3 > > {

+};

+/*

+ * VP8 Codec Definitions

+ */

+#if CONFIG_VP8

+class VP8Decoder : public Decoder {

+ public:

+  VP8Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)

+      : Decoder(cfg, deadline) {}

+ protected:

+  virtual const vpx_codec_iface_t* CodecInterface() const {

+#if CONFIG_VP8_DECODER

+    return &vpx_codec_vp8_dx_algo;

+#else

+    return NULL;

+#endif

+  }

+};

+class VP8Encoder : public Encoder {

+ public:

+  VP8Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,

+             const unsigned long init_flags, TwopassStatsStore *stats)

+      : Encoder(cfg, deadline, init_flags, stats) {}

+ protected:

+  virtual const vpx_codec_iface_t* CodecInterface() const {

+#if CONFIG_VP8_ENCODER

+    return &vpx_codec_vp8_cx_algo;

+#else

+    return NULL;

+#endif

+  }

+};

+class VP8CodecFactory : public CodecFactory {

+ public:

+  VP8CodecFactory() : CodecFactory() {}

+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,

+                                 unsigned long deadline) const {

+#if CONFIG_VP8_DECODER

+    return new VP8Decoder(cfg, deadline);

+#else

+    return NULL;

+#endif

+  }

+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,

+                                 unsigned long deadline,

+                                 const unsigned long init_flags,

+                                 TwopassStatsStore *stats) const {

+#if CONFIG_VP8_ENCODER

+    return new VP8Encoder(cfg, deadline, init_flags, stats);

+#else

+    return NULL;

+#endif

+  }

+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,

+                                               int usage) const {

+#if CONFIG_VP8_ENCODER

+    return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage);

+#else

+    return VPX_CODEC_INCAPABLE;

+#endif

+  }

+};

+const libvpx_test::VP8CodecFactory kVP8;

+#define VP8_INSTANTIATE_TEST_CASE(test, params)\

+  INSTANTIATE_TEST_CASE_P(VP8, test, \

+      ::testing::Combine( \

+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \

+              &libvpx_test::kVP8)), \

+          params))

+#else

+#define VP8_INSTANTIATE_TEST_CASE(test, params)

+#endif  // CONFIG_VP8

+/*

+ * VP9 Codec Definitions

+ */

+#if CONFIG_VP9

+class VP9Decoder : public Decoder {

+ public:

+  VP9Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)

+      : Decoder(cfg, deadline) {}

+ protected:

+  virtual const vpx_codec_iface_t* CodecInterface() const {

+#if CONFIG_VP9_DECODER

+    return &vpx_codec_vp9_dx_algo;

+#else

+    return NULL;

+#endif

+  }

+};

+class VP9Encoder : public Encoder {

+ public:

+  VP9Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,

+             const unsigned long init_flags, TwopassStatsStore *stats)

+      : Encoder(cfg, deadline, init_flags, stats) {}

+ protected:

+  virtual const vpx_codec_iface_t* CodecInterface() const {

+#if CONFIG_VP9_ENCODER

+    return &vpx_codec_vp9_cx_algo;

+#else

+    return NULL;

+#endif

+  }

+};

+class VP9CodecFactory : public CodecFactory {

+ public:

+  VP9CodecFactory() : CodecFactory() {}

+  virtual Decoder* CreateDecoder(vpx_codec_dec_cfg_t cfg,

+                                 unsigned long deadline) const {

+#if CONFIG_VP9_DECODER

+    return new VP9Decoder(cfg, deadline);

+#else

+    return NULL;

+#endif

+  }

+  virtual Encoder* CreateEncoder(vpx_codec_enc_cfg_t cfg,

+                                 unsigned long deadline,

+                                 const unsigned long init_flags,

+                                 TwopassStatsStore *stats) const {

+#if CONFIG_VP9_ENCODER

+    return new VP9Encoder(cfg, deadline, init_flags, stats);

+#else

+    return NULL;

+#endif

+  }

+  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,

+                                               int usage) const {

+#if CONFIG_VP9_ENCODER

+    return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);

+#else

+    return VPX_CODEC_INCAPABLE;

+#endif

+  }

+};

+const libvpx_test::VP9CodecFactory kVP9;

+#define VP9_INSTANTIATE_TEST_CASE(test, params)\

+  INSTANTIATE_TEST_CASE_P(VP9, test, \

+      ::testing::Combine( \

+          ::testing::Values(static_cast<const libvpx_test::CodecFactory*>( \

+               &libvpx_test::kVP9)), \

+          params))

+#else

+#define VP9_INSTANTIATE_TEST_CASE(test, params)

+#endif  // CONFIG_VP9

+}  // namespace libvpx_test

+#endif  // TEST_CODEC_FACTORY_H_

--- a/test/config_test.cc

+++ b/test/config_test.cc

@@ -8,20 +8,22 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

+#include "test/util.h"

 #include "test/video_source.h"

 namespace {

 class ConfigTest : public ::libvpx_test::EncoderTest,

-    public ::testing::TestWithParam<enum libvpx_test::TestMode> {

- public:

-  ConfigTest() : frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {}

+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {

  protected:

+  ConfigTest() : EncoderTest(GET_PARAM(0)),

+                 frame_count_in_(0), frame_count_out_(0), frame_count_max_(0) {}

   virtual void SetUp() {

     InitializeConfig();

-    SetMode(GetParam());

+    SetMode(GET_PARAM(1));

   virtual void BeginPassHook(unsigned int /*pass*/) {

@@ -57,5 +59,5 @@

   EXPECT_EQ(frame_count_in_, frame_count_out_);

-INSTANTIATE_TEST_CASE_P(OnePassModes, ConfigTest, ONE_PASS_TEST_MODES);

+VP8_INSTANTIATE_TEST_CASE(ConfigTest, ONE_PASS_TEST_MODES);

 }  // namespace

--- /dev/null

+++ b/test/convolve_test.cc

@@ -1,0 +1,509 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+extern "C" {

+#include "./vpx_config.h"

+#include "./vp9_rtcd.h"

+#include "vp9/common/vp9_filter.h"

+#include "vpx_mem/vpx_mem.h"

+#include "vpx_ports/mem.h"

+}

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/acm_random.h"

+#include "test/register_state_check.h"

+#include "test/util.h"

+namespace {

+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int filter_x_stride,

+                              const int16_t *filter_y, int filter_y_stride,

+                              int w, int h);

+struct ConvolveFunctions {

+  ConvolveFunctions(convolve_fn_t h8, convolve_fn_t h8_avg,

+                    convolve_fn_t v8, convolve_fn_t v8_avg,

+                    convolve_fn_t hv8, convolve_fn_t hv8_avg)

+      : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),

+        hv8_avg_(hv8_avg) {}

+  convolve_fn_t h8_;

+  convolve_fn_t v8_;

+  convolve_fn_t hv8_;

+  convolve_fn_t h8_avg_;

+  convolve_fn_t v8_avg_;

+  convolve_fn_t hv8_avg_;

+};

+// Reference 8-tap subpixel filter, slightly modified to fit into this test.

+#define VP9_FILTER_WEIGHT 128

+#define VP9_FILTER_SHIFT 7

+static uint8_t clip_pixel(int x) {

+  return x < 0 ? 0 :

+         x > 255 ? 255 :

+         x;

+}

+static void filter_block2d_8_c(const uint8_t *src_ptr,

+                               const unsigned int src_stride,

+                               const int16_t *HFilter,

+                               const int16_t *VFilter,

+                               uint8_t *dst_ptr,

+                               unsigned int dst_stride,

+                               unsigned int output_width,

+                               unsigned int output_height) {

+  // Between passes, we use an intermediate buffer whose height is extended to

+  // have enough horizontally filtered values as input for the vertical pass.

+  // This buffer is allocated to be big enough for the largest block type we

+  // support.

+  const int kInterp_Extend = 4;

+  const unsigned int intermediate_height =

+    (kInterp_Extend - 1) +     output_height + kInterp_Extend;

+  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,

+   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height

+   *                                 + kInterp_Extend

+   *                               = 3 + 16 + 4

+   *                               = 23

+   * and filter_max_width = 16

+   */

+  uint8_t intermediate_buffer[23 * 16];

+  const int intermediate_next_stride = 1 - intermediate_height * output_width;

+  // Horizontal pass (src -> transposed intermediate).

+  {

+    uint8_t *output_ptr = intermediate_buffer;

+    const int src_next_row_stride = src_stride - output_width;

+    unsigned int i, j;

+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

+    for (i = 0; i < intermediate_height; ++i) {

+      for (j = 0; j < output_width; ++j) {

+        // Apply filter...

+        int temp = ((int)src_ptr[0] * HFilter[0]) +

+                   ((int)src_ptr[1] * HFilter[1]) +

+                   ((int)src_ptr[2] * HFilter[2]) +

+                   ((int)src_ptr[3] * HFilter[3]) +

+                   ((int)src_ptr[4] * HFilter[4]) +

+                   ((int)src_ptr[5] * HFilter[5]) +

+                   ((int)src_ptr[6] * HFilter[6]) +

+                   ((int)src_ptr[7] * HFilter[7]) +

+                   (VP9_FILTER_WEIGHT >> 1);  // Rounding

+        // Normalize back to 0-255...

+        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);

+        ++src_ptr;

+        output_ptr += intermediate_height;

+      }

+      src_ptr += src_next_row_stride;

+      output_ptr += intermediate_next_stride;

+    }

+  }

+  // Vertical pass (transposed intermediate -> dst).

+  {

+    uint8_t *src_ptr = intermediate_buffer;

+    const int dst_next_row_stride = dst_stride - output_width;

+    unsigned int i, j;

+    for (i = 0; i < output_height; ++i) {

+      for (j = 0; j < output_width; ++j) {

+        // Apply filter...

+        int temp = ((int)src_ptr[0] * VFilter[0]) +

+                   ((int)src_ptr[1] * VFilter[1]) +

+                   ((int)src_ptr[2] * VFilter[2]) +

+                   ((int)src_ptr[3] * VFilter[3]) +

+                   ((int)src_ptr[4] * VFilter[4]) +

+                   ((int)src_ptr[5] * VFilter[5]) +

+                   ((int)src_ptr[6] * VFilter[6]) +

+                   ((int)src_ptr[7] * VFilter[7]) +

+                   (VP9_FILTER_WEIGHT >> 1);  // Rounding

+        // Normalize back to 0-255...

+        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);

+        src_ptr += intermediate_height;

+      }

+      src_ptr += intermediate_next_stride;

+      dst_ptr += dst_next_row_stride;

+    }

+  }

+}

+static void block2d_average_c(uint8_t *src,

+                              unsigned int src_stride,

+                              uint8_t *output_ptr,

+                              unsigned int output_stride,

+                              unsigned int output_width,

+                              unsigned int output_height) {

+  unsigned int i, j;

+  for (i = 0; i < output_height; ++i) {

+    for (j = 0; j < output_width; ++j) {

+      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;

+    }

+    output_ptr += output_stride;

+  }

+}

+static void filter_average_block2d_8_c(const uint8_t *src_ptr,

+                                       const unsigned int src_stride,

+                                       const int16_t *HFilter,

+                                       const int16_t *VFilter,

+                                       uint8_t *dst_ptr,

+                                       unsigned int dst_stride,

+                                       unsigned int output_width,

+                                       unsigned int output_height) {

+  uint8_t tmp[16*16];

+  assert(output_width <= 16);

+  assert(output_height <= 16);

+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 16,

+                     output_width, output_height);

+  block2d_average_c(tmp, 16, dst_ptr, dst_stride,

+                    output_width, output_height);

+}

+class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

+ public:

+  static void SetUpTestCase() {

+    // Force input_ to be unaligned, output to be 16 byte aligned.

+    input_ = reinterpret_cast<uint8_t*>(

+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))

+        + 1;

+    output_ = reinterpret_cast<uint8_t*>(

+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));

+  }

+  static void TearDownTestCase() {

+    vpx_free(input_ - 1);

+    input_ = NULL;

+    vpx_free(output_);

+    output_ = NULL;

+  }

+  protected:

+    static const int kDataAlignment = 16;

+    static const int kOuterBlockSize = 32;

+    static const int kInputStride = kOuterBlockSize;

+    static const int kOutputStride = kOuterBlockSize;

+    static const int kMaxDimension = 16;

+    int Width() const { return GET_PARAM(0); }

+    int Height() const { return GET_PARAM(1); }

+    int BorderLeft() const {

+      const int center = (kOuterBlockSize - Width()) / 2;

+      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);

+    }

+    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }

+    bool IsIndexInBorder(int i) {

+      return (i < BorderTop() * kOuterBlockSize ||

+              i >= (BorderTop() + Height()) * kOuterBlockSize ||

+              i % kOuterBlockSize < BorderLeft() ||

+              i % kOuterBlockSize >= (BorderLeft() + Width()));

+    }

+    virtual void SetUp() {

+      UUT_ = GET_PARAM(2);

+      memset(input_, 0, sizeof(input_));

+      /* Set up guard blocks for an inner block cetered in the outer block */

+      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {

+        if (IsIndexInBorder(i))

+          output_[i] = 255;

+        else

+          output_[i] = 0;

+      }

+      ::libvpx_test::ACMRandom prng;

+      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)

+        input_[i] = prng.Rand8();

+    }

+    void CheckGuardBlocks() {

+      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {

+        if (IsIndexInBorder(i))

+          EXPECT_EQ(255, output_[i]);

+      }

+    }

+    uint8_t* input() {

+      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();

+    }

+    uint8_t* output() {

+      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();

+    }

+    const ConvolveFunctions* UUT_;

+    static uint8_t* input_;

+    static uint8_t* output_;

+};

+uint8_t* ConvolveTest::input_ = NULL;

+uint8_t* ConvolveTest::output_ = NULL;

+TEST_P(ConvolveTest, GuardBlocks) {

+  CheckGuardBlocks();

+}

+TEST_P(ConvolveTest, CopyHoriz) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};

+  REGISTER_STATE_CHECK(

+      UUT_->h8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,

+                Width(), Height()));

+  CheckGuardBlocks();

+  for (int y = 0; y < Height(); ++y)

+    for (int x = 0; x < Width(); ++x)

+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])

+          << "(" << x << "," << y << ")";

+}

+TEST_P(ConvolveTest, CopyVert) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};

+  REGISTER_STATE_CHECK(

+      UUT_->v8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,

+                Width(), Height()));

+  CheckGuardBlocks();

+  for (int y = 0; y < Height(); ++y)

+    for (int x = 0; x < Width(); ++x)

+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])

+          << "(" << x << "," << y << ")";

+}

+TEST_P(ConvolveTest, Copy2D) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};

+  REGISTER_STATE_CHECK(

+      UUT_->hv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,

+                 Width(), Height()));

+  CheckGuardBlocks();

+  for (int y = 0; y < Height(); ++y)

+    for (int x = 0; x < Width(); ++x)

+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])

+          << "(" << x << "," << y << ")";

+}

+const int16_t (*kTestFilterList[])[8] = {

+  vp9_bilinear_filters,

+  vp9_sub_pel_filters_6,

+  vp9_sub_pel_filters_8,

+  vp9_sub_pel_filters_8s,

+  vp9_sub_pel_filters_8lp

+};

+const int16_t kInvalidFilter[8] = { 0 };

+TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  uint8_t ref[kOutputStride * kMaxDimension];

+  const int kNumFilterBanks = sizeof(kTestFilterList) /

+      sizeof(kTestFilterList[0]);

+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {

+    const int16_t (*filters)[8] = kTestFilterList[filter_bank];

+    const int kNumFilters = 16;

+    for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {

+      for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {

+        filter_block2d_8_c(in, kInputStride,

+                           filters[filter_x], filters[filter_y],

+                           ref, kOutputStride,

+                           Width(), Height());

+        if (filters == vp9_sub_pel_filters_8lp || (filter_x && filter_y))

+          REGISTER_STATE_CHECK(

+              UUT_->hv8_(in, kInputStride, out, kOutputStride,

+                         filters[filter_x], 16, filters[filter_y], 16,

+                         Width(), Height()));

+        else if (filter_y)

+          REGISTER_STATE_CHECK(

+              UUT_->v8_(in, kInputStride, out, kOutputStride,

+                        kInvalidFilter, 16, filters[filter_y], 16,

+                        Width(), Height()));

+        else

+          REGISTER_STATE_CHECK(

+              UUT_->h8_(in, kInputStride, out, kOutputStride,

+                        filters[filter_x], 16, kInvalidFilter, 16,

+                        Width(), Height()));

+        CheckGuardBlocks();

+        for (int y = 0; y < Height(); ++y)

+          for (int x = 0; x < Width(); ++x)

+            ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])

+                << "mismatch at (" << x << "," << y << "), "

+                << "filters (" << filter_bank << ","

+                << filter_x << "," << filter_y << ")";

+      }

+    }

+  }

+}

+TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  uint8_t ref[kOutputStride * kMaxDimension];

+  // Populate ref and out with some random data

+  ::libvpx_test::ACMRandom prng;

+  for (int y = 0; y < Height(); ++y) {

+    for (int x = 0; x < Width(); ++x) {

+      const uint8_t r = prng.Rand8();

+      out[y * kOutputStride + x] = r;

+      ref[y * kOutputStride + x] = r;

+    }

+  }

+  const int kNumFilterBanks = sizeof(kTestFilterList) /

+      sizeof(kTestFilterList[0]);

+  for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {

+    const int16_t (*filters)[8] = kTestFilterList[filter_bank];

+    const int kNumFilters = 16;

+    for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {

+      for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {

+        filter_average_block2d_8_c(in, kInputStride,

+                                   filters[filter_x], filters[filter_y],

+                                   ref, kOutputStride,

+                                   Width(), Height());

+        if (filters == vp9_sub_pel_filters_8lp || (filter_x && filter_y))

+          REGISTER_STATE_CHECK(

+              UUT_->hv8_avg_(in, kInputStride, out, kOutputStride,

+                             filters[filter_x], 16, filters[filter_y], 16,

+                             Width(), Height()));

+        else if (filter_y)

+          REGISTER_STATE_CHECK(

+              UUT_->v8_avg_(in, kInputStride, out, kOutputStride,

+                            filters[filter_x], 16, filters[filter_y], 16,

+                            Width(), Height()));

+        else

+          REGISTER_STATE_CHECK(

+              UUT_->h8_avg_(in, kInputStride, out, kOutputStride,

+                            filters[filter_x], 16, filters[filter_y], 16,

+                            Width(), Height()));

+        CheckGuardBlocks();

+        for (int y = 0; y < Height(); ++y)

+          for (int x = 0; x < Width(); ++x)

+            ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])

+                << "mismatch at (" << x << "," << y << "), "

+                << "filters (" << filter_bank << ","

+                << filter_x << "," << filter_y << ")";

+      }

+    }

+  }

+}

+DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {

+    { 0,   0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0, 128},

+    { 0,   0,   0, 128},

+    { 0,   0, 128},

+    { 0, 128},

+    { 128},

+    { 0,   0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0, 128},

+    { 0,   0,   0, 128},

+    { 0,   0, 128},

+    { 0, 128},

+    { 128}

+};

+TEST_P(ConvolveTest, ChangeFilterWorks) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,

+                                 kChangeFilters[8], 17, kChangeFilters[4], 16,

+                                 Width(), Height()));

+  for (int x = 0; x < Width(); ++x) {

+    if (x < 8)

+      ASSERT_EQ(in[4], out[x]) << "x == " << x;

+    else

+      ASSERT_EQ(in[12], out[x]) << "x == " << x;

+  }

+  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,

+                                 kChangeFilters[4], 16, kChangeFilters[8], 17,

+                                 Width(), Height()));

+  for (int y = 0; y < Height(); ++y) {

+    if (y < 8)

+      ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y;

+    else

+      ASSERT_EQ(in[12 * kInputStride], out[y * kOutputStride]) << "y == " << y;

+  }

+  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,

+                                  kChangeFilters[8], 17, kChangeFilters[8], 17,

+                                  Width(), Height()));

+  for (int y = 0; y < Height(); ++y) {

+    for (int x = 0; x < Width(); ++x) {

+      const int ref_x = x < 8 ? 4 : 12;

+      const int ref_y = y < 8 ? 4 : 12;

+      ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])

+          << "x == " << x << ", y == " << y;

+    }

+  }

+}

+using std::tr1::make_tuple;

+const ConvolveFunctions convolve8_c(

+    vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,

+    vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,

+    vp9_convolve8_c, vp9_convolve8_avg_c);

+INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(

+    make_tuple(4, 4, &convolve8_c),

+    make_tuple(8, 4, &convolve8_c),

+    make_tuple(8, 8, &convolve8_c),

+    make_tuple(16, 8, &convolve8_c),

+    make_tuple(16, 16, &convolve8_c)));

+}

+#if HAVE_SSSE3

+const ConvolveFunctions convolve8_ssse3(

+    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,

+    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,

+    vp9_convolve8_ssse3, vp9_convolve8_avg_c);

+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(

+    make_tuple(4, 4, &convolve8_ssse3),

+    make_tuple(8, 4, &convolve8_ssse3),

+    make_tuple(8, 8, &convolve8_ssse3),

+    make_tuple(16, 8, &convolve8_ssse3),

+    make_tuple(16, 16, &convolve8_ssse3)));

+#endif

--- a/test/cq_test.cc

+++ b/test/cq_test.cc

@@ -9,9 +9,13 @@

*/

 #include <cmath>

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

 #include "test/i420_video_source.h"

+#include "test/util.h"

+namespace {

 // CQ level range: [kCQLevelMin, kCQLevelMax).

 const int kCQLevelMin = 4;

 const int kCQLevelMax = 63;

@@ -18,12 +22,13 @@

 const int kCQLevelStep = 8;

 const int kCQTargetBitrate = 2000;

-namespace {

-class CQTest : public libvpx_test::EncoderTest,

-    public ::testing::TestWithParam<int> {

+class CQTest : public ::libvpx_test::EncoderTest,

+    public ::libvpx_test::CodecTestWithParam<int> {

  protected:

-  CQTest() : cq_level_(GetParam()) { init_flags_ = VPX_CODEC_USE_PSNR; }

+  CQTest() : EncoderTest(GET_PARAM(0)), cq_level_(GET_PARAM(1)) {

+    init_flags_ = VPX_CODEC_USE_PSNR;

+  }

   virtual ~CQTest() {}

   virtual void SetUp() {

@@ -100,7 +105,7 @@

   EXPECT_GE(cq_psnr_lin, vbr_psnr_lin);

-INSTANTIATE_TEST_CASE_P(CQLevelRange, CQTest,

-                        ::testing::Range(kCQLevelMin, kCQLevelMax,

-                                         kCQLevelStep));

+VP8_INSTANTIATE_TEST_CASE(CQTest,

+                          ::testing::Range(kCQLevelMin, kCQLevelMax,

+                                           kCQLevelStep));

 }  // namespace

--- a/test/datarate_test.cc

+++ b/test/datarate_test.cc

@@ -7,17 +7,23 @@

  *  in the file PATENTS.  All contributing project authors may

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

 #include "test/i420_video_source.h"

-#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/util.h"

 namespace {

 class DatarateTest : public ::libvpx_test::EncoderTest,

-    public ::testing::TestWithParam<enum libvpx_test::TestMode> {

+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {

+ public:

+  DatarateTest() : EncoderTest(GET_PARAM(0)) {}

  protected:

   virtual void SetUp() {

     InitializeConfig();

-    SetMode(GetParam());

+    SetMode(GET_PARAM(1));

     ResetModel();

@@ -174,5 +180,6 @@

-INSTANTIATE_TEST_CASE_P(AllModes, DatarateTest, ALL_TEST_MODES);

+VP8_INSTANTIATE_TEST_CASE(DatarateTest, ALL_TEST_MODES);

 }  // namespace

--- a/test/dct16x16_test.cc

+++ b/test/dct16x16_test.cc

@@ -15,7 +15,7 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

 extern "C" {

-#include "vp9/common/entropy.h"

+#include "vp9/common/vp9_entropy.h"

 #include "vp9_rtcd.h"

@@ -26,6 +26,15 @@

 namespace {

+#ifdef _MSC_VER

+static int round(double x) {

+  if (x < 0)

+    return (int)ceil(x - 0.5);

+  else

+    return (int)floor(x + 0.5);

+}

+#endif

 const double PI = 3.1415926535898;

 void reference2_16x16_idct_2d(double *input, double *output) {

   double x;

@@ -278,18 +287,10 @@

           << "Error: 16x16 IDCT has error " << error

           << " at index " << j;

-    vp9_short_fdct16x16_c(in, out_c, 32);

-    for (int j = 0; j < 256; ++j) {

-      const double diff = coeff[j] - out_c[j];

-      const double error = diff * diff;

-      EXPECT_GE(1.0, error)

-          << "Error: 16x16 FDCT has error " << error

-          << " at index " << j;

-    }

+#if 1

+// we need enable fdct test once we re-do the 16 point fdct.

 TEST(VP9Fdct16x16Test, AccuracyCheck) {

   ACMRandom rnd(ACMRandom::DeterministicSeed());

   int max_error = 0;

@@ -318,10 +319,10 @@

   EXPECT_GE(1, max_error)

-      << "Error: 16x16 FDCT/IDCT has an individual roundtrip error > 1";

+      << "Error: 16x16 FDCT/IDCT has an individual round trip error > 1";

-  EXPECT_GE(count_test_block/10, total_error)

-      << "Error: 16x16 FDCT/IDCT has average roundtrip error > 1/10 per block";

+  EXPECT_GE(count_test_block , total_error)

+      << "Error: 16x16 FDCT/IDCT has average round trip error > 1 per block";

 TEST(VP9Fdct16x16Test, CoeffSizeCheck) {

@@ -353,4 +354,6 @@

+#endif

 }  // namespace

--- a/test/dct32x32_test.cc

+++ b/test/dct32x32_test.cc

@@ -36,7 +36,6 @@

 #endif

-#if !CONFIG_DWTDCTHYBRID

 static const double kPi = 3.141592653589793238462643383279502884;

 static void reference2_32x32_idct_2d(double *input, double *output) {

   double x;

@@ -116,20 +115,9 @@

           << "Error: 3x32 IDCT has error " << error

           << " at index " << j;

-    vp9_short_fdct32x32_c(in, out_c, 64);

-    for (int j = 0; j < 1024; ++j) {

-      const double diff = coeff[j] - out_c[j];

-      const double error = diff * diff;

-      EXPECT_GE(1.0, error)

-          << "Error: 32x32 FDCT has error " << error

-          << " at index " << j;

-    }

-#else  // CONFIG_DWTDCTHYBRID

-  // TODO(rbultje/debargha): add DWT-specific tests

-#endif  // CONFIG_DWTDCTHYBRID

 TEST(VP9Fdct32x32Test, AccuracyCheck) {

   ACMRandom rnd(ACMRandom::DeterministicSeed());

   unsigned int max_error = 0;

@@ -160,8 +148,8 @@

   EXPECT_GE(1u, max_error)

       << "Error: 32x32 FDCT/IDCT has an individual roundtrip error > 1";

-  EXPECT_GE(count_test_block/10, total_error)

-      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1/10 per block";

+  EXPECT_GE(count_test_block, total_error)

+      << "Error: 32x32 FDCT/IDCT has average roundtrip error > 1 per block";

 TEST(VP9Fdct32x32Test, CoeffSizeCheck) {

--- a/test/decode_test_driver.cc

+++ b/test/decode_test_driver.cc

@@ -7,6 +7,7 @@

  *  in the file PATENTS.  All contributing project authors may

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include "test/codec_factory.h"

 #include "test/decode_test_driver.h"

 #include "third_party/googletest/src/include/gtest/gtest.h"

 #include "test/register_state_check.h"

@@ -13,10 +14,10 @@

 #include "test/video_source.h"

 namespace libvpx_test {

-#if CONFIG_VP8_DECODER

 vpx_codec_err_t Decoder::DecodeFrame(const uint8_t *cxdata, int size) {

   vpx_codec_err_t res_dec;

+  InitOnce();

   REGISTER_STATE_CHECK(res_dec = vpx_codec_decode(&decoder_,

                                                   cxdata, size, NULL, 0));

   return res_dec;

@@ -24,15 +25,16 @@

 void DecoderTest::RunLoop(CompressedVideoSource *video) {

   vpx_codec_dec_cfg_t dec_cfg = {0};

-  Decoder decoder(dec_cfg, 0);

+  Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);

+  ASSERT_TRUE(decoder != NULL);

   // Decode frames.

   for (video->Begin(); video->cxdata(); video->Next()) {

-    vpx_codec_err_t res_dec = decoder.DecodeFrame(video->cxdata(),

-                                                  video->frame_size());

-    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder.DecodeError();

+    vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),

+                                                   video->frame_size());

+    ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();

-    DxDataIterator dec_iter = decoder.GetDxData();

+    DxDataIterator dec_iter = decoder->GetDxData();

     const vpx_image_t *img = NULL;

     // Get decompressed data

@@ -39,6 +41,7 @@

     while ((img = dec_iter.Next()))

       DecompressedFrameHook(*img, video->frame_number());

+  delete decoder;

-#endif

 }  // namespace libvpx_test

--- a/test/decode_test_driver.h

+++ b/test/decode_test_driver.h

@@ -14,10 +14,10 @@

 #include "third_party/googletest/src/include/gtest/gtest.h"

 #include "vpx_config.h"

 #include "vpx/vpx_decoder.h"

-#include "vpx/vp8dx.h"

 namespace libvpx_test {

+class CodecFactory;

 class CompressedVideoSource;

 // Provides an object to handle decoding output

@@ -42,12 +42,11 @@

 class Decoder {

  public:

   Decoder(vpx_codec_dec_cfg_t cfg, unsigned long deadline)

-      : cfg_(cfg), deadline_(deadline) {

+      : cfg_(cfg), deadline_(deadline), init_done_(false) {

     memset(&decoder_, 0, sizeof(decoder_));

-    Init();

-  ~Decoder() {

+  virtual ~Decoder() {

     vpx_codec_destroy(&decoder_);

@@ -62,37 +61,45 @@

   void Control(int ctrl_id, int arg) {

+    InitOnce();

     const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);

     ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();

   void Control(int ctrl_id, const void *arg) {

+    InitOnce();

     const vpx_codec_err_t res = vpx_codec_control_(&decoder_, ctrl_id, arg);

     ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();

-  const char *DecodeError() {

+  const char* DecodeError() {

     const char *detail = vpx_codec_error_detail(&decoder_);

     return detail ? detail : vpx_codec_error(&decoder_);

  protected:

-  void Init() {

-    const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,

-                                                   &vpx_codec_vp8_dx_algo,

-                                                   &cfg_, 0);

-    ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();

+  virtual const vpx_codec_iface_t* CodecInterface() const = 0;

+  void InitOnce() {

+    if (!init_done_) {

+      const vpx_codec_err_t res = vpx_codec_dec_init(&decoder_,

+                                                     CodecInterface(),

+                                                     &cfg_, 0);

+      ASSERT_EQ(VPX_CODEC_OK, res) << DecodeError();

+      init_done_ = true;

+    }

   vpx_codec_ctx_t     decoder_;

   vpx_codec_dec_cfg_t cfg_;

   unsigned int        deadline_;

+  bool                init_done_;

};

 // Common test functionality for all Decoder tests.

 class DecoderTest {

  public:

-  // Main loop.

+  // Main decoding loop

   virtual void RunLoop(CompressedVideoSource *video);

   // Hook to be called on every decompressed frame.

@@ -100,9 +107,11 @@

                                      const unsigned int frame_number) {}

  protected:

-  DecoderTest() {}

+  explicit DecoderTest(const CodecFactory *codec) : codec_(codec) {}

   virtual ~DecoderTest() {}

+  const CodecFactory *codec_;

};

 }  // namespace libvpx_test

--- a/test/encode_test_driver.cc

+++ b/test/encode_test_driver.cc

@@ -7,11 +7,11 @@

  *  in the file PATENTS.  All contributing project authors may

  *  be found in the AUTHORS file in the root of the source tree.

*/

 #include "vpx_config.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

-#if CONFIG_VP8_DECODER

 #include "test/decode_test_driver.h"

-#endif

 #include "test/register_state_check.h"

 #include "test/video_source.h"

 #include "third_party/googletest/src/include/gtest/gtest.h"

@@ -45,7 +45,7 @@

     cfg_.g_h = img->d_h;

     cfg_.g_timebase = video.timebase();

     cfg_.rc_twopass_stats_in = stats_->buf();

-    res = vpx_codec_enc_init(&encoder_, &vpx_codec_vp8_cx_algo, &cfg_,

+    res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_,

                              init_flags_);

     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();

@@ -72,6 +72,11 @@

   ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();

+void EncoderTest::InitializeConfig() {

+  const vpx_codec_err_t res = codec_->DefaultEncoderConfig(&cfg_, 0);

+  ASSERT_EQ(VPX_CODEC_OK, res);

+}

 void EncoderTest::SetMode(TestMode mode) {

   switch (mode) {

     case kRealTime:

@@ -125,13 +130,17 @@

   return match;

+void EncoderTest::MismatchHook(const vpx_image_t *img1,

+                               const vpx_image_t *img2) {

+  ASSERT_TRUE(0) << "Encode/Decode mismatch found";

+}

 void EncoderTest::RunLoop(VideoSource *video) {

-#if CONFIG_VP8_DECODER

   vpx_codec_dec_cfg_t dec_cfg = {0};

-#endif

   stats_.Reset();

+  ASSERT_TRUE(passes_ == 1 || passes_ == 2);

   for (unsigned int pass = 0; pass < passes_; pass++) {

     last_pts_ = 0;

@@ -143,34 +152,34 @@

       cfg_.g_pass = VPX_RC_LAST_PASS;

     BeginPassHook(pass);

-    Encoder encoder(cfg_, deadline_, init_flags_, &stats_);

-#if CONFIG_VP8_DECODER

-    Decoder decoder(dec_cfg, 0);

-    bool has_cxdata = false;

-#endif

+    Encoder* const encoder = codec_->CreateEncoder(cfg_, deadline_, init_flags_,

+                                                   &stats_);

+    ASSERT_TRUE(encoder != NULL);

+    Decoder* const decoder = codec_->CreateDecoder(dec_cfg, 0);

     bool again;

     for (again = true, video->Begin(); again; video->Next()) {

       again = video->img() != NULL;

       PreEncodeFrameHook(video);

-      PreEncodeFrameHook(video, &encoder);

-      encoder.EncodeFrame(video, frame_flags_);

+      PreEncodeFrameHook(video, encoder);

+      encoder->EncodeFrame(video, frame_flags_);

-      CxDataIterator iter = encoder.GetCxData();

+      CxDataIterator iter = encoder->GetCxData();

+      bool has_cxdata = false;

+      bool has_dxdata = false;

       while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {

+        pkt = MutateEncoderOutputHook(pkt);

         again = true;

-#if CONFIG_VP8_DECODER

-        vpx_codec_err_t res_dec;

-#endif

         switch (pkt->kind) {

           case VPX_CODEC_CX_FRAME_PKT:

-#if CONFIG_VP8_DECODER

             has_cxdata = true;

-            res_dec = decoder.DecodeFrame((const uint8_t*)pkt->data.frame.buf,

-                                          pkt->data.frame.sz);

-            ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder.DecodeError();

-#endif

+            if (decoder && DoDecode()) {

+              vpx_codec_err_t res_dec = decoder->DecodeFrame(

+                  (const uint8_t*)pkt->data.frame.buf, pkt->data.frame.sz);

+              ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();

+              has_dxdata = true;

+            }

             ASSERT_GE(pkt->data.frame.pts, last_pts_);

             last_pts_ = pkt->data.frame.pts;

             FramePktHook(pkt);

@@ -185,17 +194,19 @@

-#if CONFIG_VP8_DECODER

-      if (has_cxdata) {

-        const vpx_image_t *img_enc = encoder.GetPreviewFrame();

-        DxDataIterator dec_iter = decoder.GetDxData();

+      if (has_dxdata && has_cxdata) {

+        const vpx_image_t *img_enc = encoder->GetPreviewFrame();

+        DxDataIterator dec_iter = decoder->GetDxData();

         const vpx_image_t *img_dec = dec_iter.Next();

-        if(img_enc && img_dec) {

+        if (img_enc && img_dec) {

           const bool res = compare_img(img_enc, img_dec);

-          ASSERT_TRUE(res)<< "Encoder/Decoder mismatch found.";

+          if (!res) {  // Mismatch

+            MismatchHook(img_enc, img_dec);

+          }

+        if (img_dec)

+          DecompressedFrameHook(*img_dec, video->pts());

-#endif

       if (!Continue())

         break;

@@ -202,8 +213,13 @@

     EndPassHook();

+    if (decoder)

+      delete decoder;

+    delete encoder;

     if (!Continue())

       break;

 }  // namespace libvpx_test

--- a/test/encode_test_driver.h

+++ b/test/encode_test_driver.h

@@ -9,14 +9,16 @@

*/

 #ifndef TEST_ENCODE_TEST_DRIVER_H_

 #define TEST_ENCODE_TEST_DRIVER_H_

+#include "./vpx_config.h"

 #include <string>

 #include <vector>

 #include "third_party/googletest/src/include/gtest/gtest.h"

 #include "vpx/vpx_encoder.h"

-#include "vpx/vp8cx.h"

 namespace libvpx_test {

+class CodecFactory;

 class VideoSource;

 enum TestMode {

@@ -36,7 +38,10 @@

                                               ::libvpx_test::kOnePassGood, \

                                               ::libvpx_test::kOnePassBest)

+#define TWO_PASS_TEST_MODES ::testing::Values(::libvpx_test::kTwoPassGood, \

+                                              ::libvpx_test::kTwoPassBest)

 // Provides an object to handle the libvpx get_cx_data() iteration pattern

 class CxDataIterator {

  public:

@@ -83,7 +88,7 @@

  public:

   Encoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,

           const unsigned long init_flags, TwopassStatsStore *stats)

-    : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {

+      : cfg_(cfg), deadline_(deadline), init_flags_(init_flags), stats_(stats) {

     memset(&encoder_, 0, sizeof(encoder_));

@@ -112,11 +117,18 @@

     ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();

+  void Control(int ctrl_id, struct vpx_scaling_mode *arg) {

+    const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg);

+    ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();

+  }

   void set_deadline(unsigned long deadline) {

     deadline_ = deadline;

  protected:

+  virtual const vpx_codec_iface_t* CodecInterface() const = 0;

   const char *EncoderError() {

     const char *detail = vpx_codec_error_detail(&encoder_);

     return detail ? detail : vpx_codec_error(&encoder_);

@@ -145,22 +157,19 @@

 // classes directly, so that tests can be parameterized differently.

 class EncoderTest {

  protected:

-  EncoderTest() : abort_(false), init_flags_(0), frame_flags_(0),

-                  last_pts_(0) {}

+  explicit EncoderTest(const CodecFactory *codec)

+      : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),

+        last_pts_(0) {}

   virtual ~EncoderTest() {}

   // Initialize the cfg_ member with the default configuration.

-  void InitializeConfig() {

-    const vpx_codec_err_t res = vpx_codec_enc_config_default(

-                                    &vpx_codec_vp8_cx_algo, &cfg_, 0);

-    ASSERT_EQ(VPX_CODEC_OK, res);

-  }

+  void InitializeConfig();

   // Map the TestMode enum to the deadline_ and passes_ variables.

   void SetMode(TestMode mode);

-  // Main loop.

+  // Main loop

   virtual void RunLoop(VideoSource *video);

   // Hook to be called at the beginning of a pass.

@@ -181,6 +190,24 @@

   // Hook to determine whether the encode loop should continue.

   virtual bool Continue() const { return !abort_; }

+  const CodecFactory   *codec_;

+  // Hook to determine whether to decode frame after encoding

+  virtual bool DoDecode() const { return 1; }

+  // Hook to handle encode/decode mismatch

+  virtual void MismatchHook(const vpx_image_t *img1,

+                            const vpx_image_t *img2);

+  // Hook to be called on every decompressed frame.

+  virtual void DecompressedFrameHook(const vpx_image_t& img,

+                                     vpx_codec_pts_t pts) {}

+  // Hook that can modify the encoder's output data

+  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(

+      const vpx_codec_cx_pkt_t *pkt) {

+    return pkt;

+  }

   bool                 abort_;

   vpx_codec_enc_cfg_t  cfg_;

--- a/test/error_resilience_test.cc

+++ b/test/error_resilience_test.cc

@@ -7,22 +7,37 @@

   in the file PATENTS.  All contributing project authors may

   be found in the AUTHORS file in the root of the source tree.

*/

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

 #include "test/i420_video_source.h"

+#include "test/util.h"

 namespace {

-class ErrorResilienceTest : public libvpx_test::EncoderTest,

-    public ::testing::TestWithParam<int> {

+const int kMaxErrorFrames = 8;

+const int kMaxDroppableFrames = 8;

+class ErrorResilienceTest : public ::libvpx_test::EncoderTest,

+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {

  protected:

-  ErrorResilienceTest() {

-    psnr_ = 0.0;

-    nframes_ = 0;

-    encoding_mode_ = static_cast<libvpx_test::TestMode>(GetParam());

+  ErrorResilienceTest() : EncoderTest(GET_PARAM(0)),

+                          psnr_(0.0),

+                          nframes_(0),

+                          mismatch_psnr_(0.0),

+                          mismatch_nframes_(0),

+                          encoding_mode_(GET_PARAM(1)) {

+    Reset();

   virtual ~ErrorResilienceTest() {}

+  void Reset() {

+    error_nframes_ = 0;

+    droppable_nframes_ = 0;

+  }

   virtual void SetUp() {

     InitializeConfig();

     SetMode(encoding_mode_);

@@ -31,6 +46,8 @@

   virtual void BeginPassHook(unsigned int /*pass*/) {

     psnr_ = 0.0;

     nframes_ = 0;

+    mismatch_psnr_ = 0.0;

+    mismatch_nframes_ = 0;

   virtual bool Continue() const {

@@ -42,6 +59,25 @@

     nframes_++;

+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) {

+    frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST |

+                      VP8_EFLAG_NO_UPD_GF |

+                      VP8_EFLAG_NO_UPD_ARF);

+    if (droppable_nframes_ > 0 &&

+        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {

+      for (unsigned int i = 0; i < droppable_nframes_; ++i) {

+        if (droppable_frames_[i] == nframes_) {

+          std::cout << "             Encoding droppable frame: "

+                    << droppable_frames_[i] << "\n";

+          frame_flags_ |= (VP8_EFLAG_NO_UPD_LAST |

+                           VP8_EFLAG_NO_UPD_GF |

+                           VP8_EFLAG_NO_UPD_ARF);

+          return;

+        }

+      }

+    }

+  }

   double GetAveragePsnr() const {

     if (nframes_)

       return psnr_ / nframes_;

@@ -48,9 +84,67 @@

     return 0.0;

+  double GetAverageMismatchPsnr() const {

+    if (mismatch_nframes_)

+      return mismatch_psnr_ / mismatch_nframes_;

+    return 0.0;

+  }

+  virtual bool DoDecode() const {

+    if (error_nframes_ > 0 &&

+        (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) {

+      for (unsigned int i = 0; i < error_nframes_; ++i) {

+        if (error_frames_[i] == nframes_ - 1) {

+          std::cout << "             Skipping decoding frame: "

+                    << error_frames_[i] << "\n";

+          return 0;

+        }

+      }

+    }

+    return 1;

+  }

+  virtual void MismatchHook(const vpx_image_t *img1,

+                            const vpx_image_t *img2) {

+    double mismatch_psnr = compute_psnr(img1, img2);

+    mismatch_psnr_ += mismatch_psnr;

+    ++mismatch_nframes_;

+    // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";

+  }

+  void SetErrorFrames(int num, unsigned int *list) {

+    if (num > kMaxErrorFrames)

+      num = kMaxErrorFrames;

+    else if (num < 0)

+      num = 0;

+    error_nframes_ = num;

+    for (unsigned int i = 0; i < error_nframes_; ++i)

+      error_frames_[i] = list[i];

+  }

+  void SetDroppableFrames(int num, unsigned int *list) {

+    if (num > kMaxDroppableFrames)

+      num = kMaxDroppableFrames;

+    else if (num < 0)

+      num = 0;

+    droppable_nframes_ = num;

+    for (unsigned int i = 0; i < droppable_nframes_; ++i)

+      droppable_frames_[i] = list[i];

+  }

+  unsigned int GetMismatchFrames() {

+    return mismatch_nframes_;

+  }

  private:

   double psnr_;

   unsigned int nframes_;

+  unsigned int error_nframes_;

+  unsigned int droppable_nframes_;

+  double mismatch_psnr_;

+  unsigned int mismatch_nframes_;

+  unsigned int error_frames_[kMaxErrorFrames];

+  unsigned int droppable_frames_[kMaxDroppableFrames];

   libvpx_test::TestMode encoding_mode_;

};

@@ -85,6 +179,49 @@

-INSTANTIATE_TEST_CASE_P(OnOffTest, ErrorResilienceTest,

-                        ONE_PASS_TEST_MODES);

+TEST_P(ErrorResilienceTest, DropFramesWithoutRecovery) {

+  const vpx_rational timebase = { 33333333, 1000000000 };

+  cfg_.g_timebase = timebase;

+  cfg_.rc_target_bitrate = 500;

+  init_flags_ = VPX_CODEC_USE_PSNR;

+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,

+                                     timebase.den, timebase.num, 0, 30);

+  // Error resilient mode ON.

+  cfg_.g_error_resilient = 1;

+  // Set an arbitrary set of error frames same as droppable frames

+  unsigned int num_droppable_frames = 2;

+  unsigned int droppable_frame_list[] = {5, 16};

+  SetDroppableFrames(num_droppable_frames, droppable_frame_list);

+  SetErrorFrames(num_droppable_frames, droppable_frame_list);

+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

+  // Test that no mismatches have been found

+  std::cout << "             Mismatch frames: "

+            << GetMismatchFrames() << "\n";

+  EXPECT_EQ(GetMismatchFrames(), (unsigned int) 0);

+  // reset previously set error/droppable frames

+  Reset();

+  // Now set an arbitrary set of error frames that are non-droppable

+  unsigned int num_error_frames = 3;

+  unsigned int error_frame_list[] = {3, 10, 20};

+  SetErrorFrames(num_error_frames, error_frame_list);

+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

+  // Test that dropping an arbitrary set of inter frames does not hurt too much

+  // Note the Average Mismatch PSNR is the average of the PSNR between

+  // decoded frame and encoder's version of the same frame for all frames

+  // with mismatch.

+  const double psnr_resilience_mismatch = GetAverageMismatchPsnr();

+  std::cout << "             Mismatch PSNR: "

+            << psnr_resilience_mismatch << "\n";

+  EXPECT_GT(psnr_resilience_mismatch, 20.0);

+}

+VP8_INSTANTIATE_TEST_CASE(ErrorResilienceTest, ONE_PASS_TEST_MODES);

+VP9_INSTANTIATE_TEST_CASE(ErrorResilienceTest, ONE_PASS_TEST_MODES);

 }  // namespace

--- a/test/fdct4x4_test.cc

+++ b/test/fdct4x4_test.cc

@@ -25,7 +25,7 @@

 namespace {

-TEST(Vp9FdctTest, SignBiasCheck) {

+TEST(Vp9Fdct4x4Test, SignBiasCheck) {

   ACMRandom rnd(ACMRandom::DeterministicSeed());

   int16_t test_input_block[16];

   int16_t test_output_block[16];

@@ -88,7 +88,7 @@

};

-TEST(Vp9FdctTest, RoundTripErrorCheck) {

+TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {

   ACMRandom rnd(ACMRandom::DeterministicSeed());

   int max_error = 0;

   double total_error = 0;

@@ -120,7 +120,7 @@

     // Because the bitstream is not frozen yet, use the idct in the codebase.

-    vp9_short_idct4x4llm_c(test_temp_block, test_output_block, pitch);

+    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);

     for (int j = 0; j < 16; ++j) {

       const int diff = test_input_block[j] - test_output_block[j];

--- a/test/fdct8x8_test.cc

+++ b/test/fdct8x8_test.cc

@@ -149,7 +149,7 @@

     // Initialize a test block with input range {-255, 255}.

     for (int j = 0; j < 64; ++j)

-      test_input_block[j] = rnd.Rand8() % 2 ? 255 : -255;

+      test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;

     const int pitch = 16;

     vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);

--- a/test/idct8x8_test.cc

+++ b/test/idct8x8_test.cc

@@ -120,31 +120,6 @@

       input[j] = rnd.Rand8() - rnd.Rand8();

     const int pitch = 16;

-    vp9_short_fdct8x8_c(input, output_c, pitch);

-    reference_dct_2d(input, output_r);

-    for (int j = 0; j < 64; ++j) {

-      const double diff = output_c[j] - output_r[j];

-      const double error = diff * diff;

-      // An error in a DCT coefficient isn't that bad.

-      // We care more about the reconstructed pixels.

-      EXPECT_GE(2.0, error)

-          << "Error: 8x8 FDCT/IDCT has error " << error

-          << " at index " << j;

-    }

-#if 0

-    // Tests that the reference iDCT and fDCT match.

-    reference_dct_2d(input, output_r);

-    reference_idct_2d(output_r, output_c);

-    for (int j = 0; j < 64; ++j) {

-      const int diff = output_c[j] -input[j];

-      const int error = diff * diff;

-      EXPECT_EQ(0, error)

-          << "Error: 8x8 FDCT/IDCT has error " << error

-          << " at index " << j;

-    }

-#endif

     reference_dct_2d(input, output_r);

     for (int j = 0; j < 64; ++j)

       coeff[j] = round(output_r[j]);

--- /dev/null

+++ b/test/idct_test.cc

@@ -1,0 +1,118 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+extern "C" {

+#include "./vpx_config.h"

+#include "./vp8_rtcd.h"

+}

+#include "test/register_state_check.h"

+#include "third_party/googletest/src/include/gtest/gtest.h"

+typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,

+                          int pred_stride, unsigned char *dst_ptr,

+                          int dst_stride);

+namespace {

+class IDCTTest : public ::testing::TestWithParam<idct_fn_t> {

+  protected:

+    virtual void SetUp() {

+        int i;

+        UUT = GetParam();

+        memset(input, 0, sizeof(input));

+        /* Set up guard blocks */

+        for (i = 0; i < 256; i++)

+            output[i] = ((i & 0xF) < 4 && (i < 64)) ? 0 : -1;

+    }

+    idct_fn_t UUT;

+    short input[16];

+    unsigned char output[256];

+    unsigned char predict[256];

+};

+TEST_P(IDCTTest, TestGuardBlocks) {

+    int i;

+    for (i = 0; i < 256; i++)

+        if ((i & 0xF) < 4 && i < 64)

+            EXPECT_EQ(0, output[i]) << i;

+        else

+            EXPECT_EQ(255, output[i]);

+}

+TEST_P(IDCTTest, TestAllZeros) {

+    int i;

+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

+    for (i = 0; i < 256; i++)

+        if ((i & 0xF) < 4 && i < 64)

+            EXPECT_EQ(0, output[i]) << "i==" << i;

+        else

+            EXPECT_EQ(255, output[i]) << "i==" << i;

+}

+TEST_P(IDCTTest, TestAllOnes) {

+    int i;

+    input[0] = 4;

+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

+    for (i = 0; i < 256; i++)

+        if ((i & 0xF) < 4 && i < 64)

+            EXPECT_EQ(1, output[i]) << "i==" << i;

+        else

+            EXPECT_EQ(255, output[i]) << "i==" << i;

+}

+TEST_P(IDCTTest, TestAddOne) {

+    int i;

+    for (i = 0; i < 256; i++)

+        predict[i] = i;

+    input[0] = 4;

+    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));

+    for (i = 0; i < 256; i++)

+        if ((i & 0xF) < 4 && i < 64)

+            EXPECT_EQ(i+1, output[i]) << "i==" << i;

+        else

+            EXPECT_EQ(255, output[i]) << "i==" << i;

+}

+TEST_P(IDCTTest, TestWithData) {

+    int i;

+    for (i = 0; i < 16; i++)

+        input[i] = i;

+    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

+    for (i = 0; i < 256; i++)

+        if ((i & 0xF) > 3 || i > 63)

+            EXPECT_EQ(255, output[i]) << "i==" << i;

+        else if (i == 0)

+            EXPECT_EQ(11, output[i]) << "i==" << i;

+        else if (i == 34)

+            EXPECT_EQ(1, output[i]) << "i==" << i;

+        else if (i == 2 || i == 17 || i == 32)

+            EXPECT_EQ(3, output[i]) << "i==" << i;

+        else

+            EXPECT_EQ(0, output[i]) << "i==" << i;

+}

+INSTANTIATE_TEST_CASE_P(C, IDCTTest,

+                        ::testing::Values(vp8_short_idct4x4llm_c));

+#if HAVE_MMX

+INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,

+                        ::testing::Values(vp8_short_idct4x4llm_mmx));

+#endif

+}

--- a/test/idctllm_test.cc

+++ /dev/null

@@ -1,126 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-extern "C" {

-#include "vpx_config.h"

-#include "vp8_rtcd.h"

-}

-#include "test/register_state_check.h"

-#include "third_party/googletest/src/include/gtest/gtest.h"

-typedef void (*idct_fn_t)(short *input, unsigned char *pred_ptr,

-                          int pred_stride, unsigned char *dst_ptr,

-                          int dst_stride);

-namespace {

-class IDCTTest : public ::testing::TestWithParam<idct_fn_t>

-{

-  protected:

-    virtual void SetUp()

-    {

-        int i;

-        UUT = GetParam();

-        memset(input, 0, sizeof(input));

-        /* Set up guard blocks */

-        for(i=0; i<256; i++)

-            output[i] = ((i&0xF)<4&&(i<64))?0:-1;

-    }

-    idct_fn_t UUT;

-    short input[16];

-    unsigned char output[256];

-    unsigned char predict[256];

-};

-TEST_P(IDCTTest, TestGuardBlocks)

-{

-    int i;

-    for(i=0; i<256; i++)

-        if((i&0xF) < 4 && i<64)

-            EXPECT_EQ(0, output[i]) << i;

-        else

-            EXPECT_EQ(255, output[i]);

-}

-TEST_P(IDCTTest, TestAllZeros)

-{

-    int i;

-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-    for(i=0; i<256; i++)

-        if((i&0xF) < 4 && i<64)

-            EXPECT_EQ(0, output[i]) << "i==" << i;

-        else

-            EXPECT_EQ(255, output[i]) << "i==" << i;

-}

-TEST_P(IDCTTest, TestAllOnes)

-{

-    int i;

-    input[0] = 4;

-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-    for(i=0; i<256; i++)

-        if((i&0xF) < 4 && i<64)

-            EXPECT_EQ(1, output[i]) << "i==" << i;

-        else

-            EXPECT_EQ(255, output[i]) << "i==" << i;

-}

-TEST_P(IDCTTest, TestAddOne)

-{

-    int i;

-    for(i=0; i<256; i++)

-        predict[i] = i;

-    input[0] = 4;

-    REGISTER_STATE_CHECK(UUT(input, predict, 16, output, 16));

-    for(i=0; i<256; i++)

-        if((i&0xF) < 4 && i<64)

-            EXPECT_EQ(i+1, output[i]) << "i==" << i;

-        else

-            EXPECT_EQ(255, output[i]) << "i==" << i;

-}

-TEST_P(IDCTTest, TestWithData)

-{

-    int i;

-    for(i=0; i<16; i++)

-        input[i] = i;

-    REGISTER_STATE_CHECK(UUT(input, output, 16, output, 16));

-    for(i=0; i<256; i++)

-        if((i&0xF) > 3 || i>63)

-            EXPECT_EQ(255, output[i]) << "i==" << i;

-        else if(i == 0)

-            EXPECT_EQ(11, output[i]) << "i==" << i;

-        else if(i == 34)

-            EXPECT_EQ(1, output[i]) << "i==" << i;

-        else if(i == 2 || i == 17 || i == 32)

-            EXPECT_EQ(3, output[i]) << "i==" << i;

-        else

-            EXPECT_EQ(0, output[i]) << "i==" << i;

-}

-INSTANTIATE_TEST_CASE_P(C, IDCTTest,

-                        ::testing::Values(vp8_short_idct4x4llm_c));

-#if HAVE_MMX

-INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,

-                        ::testing::Values(vp8_short_idct4x4llm_mmx));

-#endif

-}

--- a/test/keyframe_test.cc

+++ b/test/keyframe_test.cc

@@ -9,18 +9,22 @@

*/

 #include <climits>

 #include <vector>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

 #include "test/i420_video_source.h"

-#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/util.h"

 namespace {

 class KeyframeTest : public ::libvpx_test::EncoderTest,

-    public ::testing::TestWithParam<enum libvpx_test::TestMode> {

+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {

  protected:

+  KeyframeTest() : EncoderTest(GET_PARAM(0)) {}

   virtual void SetUp() {

     InitializeConfig();

-    SetMode(GetParam());

+    SetMode(GET_PARAM(1));

     kf_count_ = 0;

     kf_count_max_ = INT_MAX;

     kf_do_force_kf_ = false;

@@ -64,7 +68,7 @@

   // In realtime mode - auto placed keyframes are exceedingly rare,  don't

   // bother with this check   if(GetParam() > 0)

-  if(GetParam() > 0)

+  if (GET_PARAM(1) > 0)

     EXPECT_GT(kf_count_, 1);

@@ -126,7 +130,7 @@

   // In realtime mode - auto placed keyframes are exceedingly rare,  don't

   // bother with this check

-  if(GetParam() > 0)

+  if (GET_PARAM(1) > 0)

     EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes ";

   // Verify that keyframes match the file keyframes in the file.

@@ -141,5 +145,5 @@

-INSTANTIATE_TEST_CASE_P(AllModes, KeyframeTest, ALL_TEST_MODES);

+VP8_INSTANTIATE_TEST_CASE(KeyframeTest, ALL_TEST_MODES);

 }  // namespace

--- /dev/null

+++ b/test/md5_helper.h

@@ -1,0 +1,64 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef LIBVPX_TEST_MD5_HELPER_H_

+#define LIBVPX_TEST_MD5_HELPER_H_

+extern "C" {

+#include "./md5_utils.h"

+#include "vpx/vpx_decoder.h"

+}

+namespace libvpx_test {

+class MD5 {

+ public:

+  MD5() {

+    MD5Init(&md5_);

+  }

+  void Add(const vpx_image_t *img) {

+    for (int plane = 0; plane < 3; ++plane) {

+      uint8_t *buf = img->planes[plane];

+      const int h = plane ? (img->d_h + 1) >> 1 : img->d_h;

+      const int w = plane ? (img->d_w + 1) >> 1 : img->d_w;

+      for (int y = 0; y < h; ++y) {

+        MD5Update(&md5_, buf, w);

+        buf += img->stride[plane];

+      }

+    }

+  }

+  const char *Get(void) {

+    static const char hex[16] = {

+      '0', '1', '2', '3', '4', '5', '6', '7',

+      '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',

+    };

+    uint8_t tmp[16];

+    MD5Context ctx_tmp = md5_;

+    MD5Final(tmp, &ctx_tmp);

+    for (int i = 0; i < 16; i++) {

+      res_[i * 2 + 0]  = hex[tmp[i] >> 4];

+      res_[i * 2 + 1]  = hex[tmp[i] & 0xf];

+    }

+    res_[32] = 0;

+    return res_;

+  }

+ protected:

+  char res_[33];

+  MD5Context md5_;

+};

+}  // namespace libvpx_test

+#endif  // LIBVPX_TEST_MD5_HELPER_H_

--- a/test/resize_test.cc

+++ b/test/resize_test.cc

@@ -9,9 +9,12 @@

*/

 #include <climits>

 #include <vector>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/encode_test_driver.h"

+#include "test/i420_video_source.h"

 #include "test/video_source.h"

-#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/util.h"

 namespace {

@@ -49,8 +52,10 @@

};

 class ResizeTest : public ::libvpx_test::EncoderTest,

-  public ::testing::TestWithParam<enum libvpx_test::TestMode> {

+  public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {

  protected:

+  ResizeTest() : EncoderTest(GET_PARAM(0)) {}

   struct FrameInfo {

     FrameInfo(vpx_codec_pts_t _pts, unsigned int _w, unsigned int _h)

         : pts(_pts), w(_w), h(_h) {}

@@ -62,7 +67,7 @@

   virtual void SetUp() {

     InitializeConfig();

-    SetMode(GetParam());

+    SetMode(GET_PARAM(1));

   virtual bool Continue() const {

@@ -69,15 +74,9 @@

     return !HasFatalFailure() && !abort_;

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {

-    if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {

-      const unsigned char *buf =

-          reinterpret_cast<const unsigned char *>(pkt->data.frame.buf);

-      const unsigned int w = (buf[6] | (buf[7] << 8)) & 0x3fff;

-      const unsigned int h = (buf[8] | (buf[9] << 8)) & 0x3fff;

-      frame_info_list_.push_back(FrameInfo(pkt->data.frame.pts, w, h));

-    }

+  virtual void DecompressedFrameHook(const vpx_image_t &img,

+                                     vpx_codec_pts_t pts) {

+    frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));

   std::vector< FrameInfo > frame_info_list_;

@@ -100,5 +99,53 @@

-INSTANTIATE_TEST_CASE_P(OnePass, ResizeTest, ONE_PASS_TEST_MODES);

+class ResizeInternalTest : public ResizeTest {

+ protected:

+  ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {}

+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,

+                                  libvpx_test::Encoder *encoder) {

+    if (video->frame() == 3) {

+      struct vpx_scaling_mode mode = {VP8E_FOURFIVE, VP8E_THREEFIVE};

+      encoder->Control(VP8E_SET_SCALEMODE, &mode);

+    }

+    if (video->frame() == 6) {

+      struct vpx_scaling_mode mode = {VP8E_NORMAL, VP8E_NORMAL};

+      encoder->Control(VP8E_SET_SCALEMODE, &mode);

+    }

+  }

+  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {

+    if (!frame0_psnr_)

+      frame0_psnr_ = pkt->data.psnr.psnr[0];

+    EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.0);

+  }

+  double frame0_psnr_;

+};

+TEST_P(ResizeInternalTest, TestInternalResizeWorks) {

+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,

+                                       30, 1, 0, 10);

+  init_flags_ = VPX_CODEC_USE_PSNR;

+  // q picked such that initial keyframe on this clip is ~30dB PSNR

+  cfg_.rc_min_quantizer = cfg_.rc_max_quantizer = 48;

+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

+  for (std::vector<FrameInfo>::iterator info = frame_info_list_.begin();

+       info != frame_info_list_.end(); ++info) {

+    const vpx_codec_pts_t pts = info->pts;

+    if (pts >= 3 && pts < 6) {

+      ASSERT_EQ(282U, info->w) << "Frame " << pts << " had unexpected width";

+      ASSERT_EQ(173U, info->h) << "Frame " << pts << " had unexpected height";

+    } else {

+      EXPECT_EQ(352U, info->w) << "Frame " << pts << " had unexpected width";

+      EXPECT_EQ(288U, info->h) << "Frame " << pts << " had unexpected height";

+    }

+  }

+}

+VP8_INSTANTIATE_TEST_CASE(ResizeTest, ONE_PASS_TEST_MODES);

+VP9_INSTANTIATE_TEST_CASE(ResizeInternalTest,

+                          ::testing::Values(::libvpx_test::kOnePassBest));

 }  // namespace

--- a/test/sad_test.cc

+++ b/test/sad_test.cc

@@ -15,8 +15,13 @@

 extern "C" {

 #include "./vpx_config.h"

+#if CONFIG_VP8_ENCODER

 #include "./vp8_rtcd.h"

-#include "vp8/common/blockd.h"

+//#include "vp8/common/blockd.h"

+#endif

+#if CONFIG_VP9_ENCODER

+#include "./vp9_rtcd.h"

+#endif

 #include "vpx_mem/vpx_mem.h"

@@ -32,14 +37,22 @@

                                         int reference_stride,

                                         unsigned int max_sad);

+typedef void (*sad_n_by_n_by_4_fn_t)(const uint8_t *src_ptr,

+                                     int src_stride,

+                                     const unsigned char * const ref_ptr[],

+                                     int ref_stride,

+                                     unsigned int *sad_array);

 using libvpx_test::ACMRandom;

 namespace {

-class SADTest : public PARAMS(int, int, sad_m_by_n_fn_t) {

+class SADTestBase : public ::testing::Test {

  public:

+  SADTestBase(int width, int height) : width_(width), height_(height) {}

   static void SetUpTestCase() {

     source_data_ = reinterpret_cast<uint8_t*>(

-        vpx_memalign(kDataAlignment, kDataBufferSize));

+        vpx_memalign(kDataAlignment, kDataBlockSize));

     reference_data_ = reinterpret_cast<uint8_t*>(

         vpx_memalign(kDataAlignment, kDataBufferSize));

@@ -52,36 +65,31 @@

  protected:

+  // Handle blocks up to 4 blocks 64x64 with stride up to 128

   static const int kDataAlignment = 16;

-  static const int kDataBufferSize = 16 * 32;

+  static const int kDataBlockSize = 64 * 128;

+  static const int kDataBufferSize = 4 * kDataBlockSize;

   virtual void SetUp() {

-    sad_fn_ = GET_PARAM(2);

-    height_ = GET_PARAM(1);

-    width_ = GET_PARAM(0);

-    source_stride_ = width_ * 2;

+    source_stride_ = (width_ + 31) & ~31;

     reference_stride_ = width_ * 2;

     rnd_.Reset(ACMRandom::DeterministicSeed());

-  sad_m_by_n_fn_t sad_fn_;

-  virtual unsigned int SAD(unsigned int max_sad) {

-    unsigned int ret;

-    REGISTER_STATE_CHECK(ret = sad_fn_(source_data_, source_stride_,

-                                       reference_data_, reference_stride_,

-                                       max_sad));

-    return ret;

+  virtual uint8_t* GetReference(int block_idx) {

+    return reference_data_ + block_idx * kDataBlockSize;

   // Sum of Absolute Differences. Given two blocks, calculate the absolute

   // difference between two pixels in the same relative location; accumulate.

-  unsigned int ReferenceSAD(unsigned int max_sad) {

+  unsigned int ReferenceSAD(unsigned int max_sad, int block_idx = 0) {

     unsigned int sad = 0;

+    const uint8_t* const reference = GetReference(block_idx);

     for (int h = 0; h < height_; ++h) {

       for (int w = 0; w < width_; ++w) {

         sad += abs(source_data_[h * source_stride_ + w]

-               - reference_data_[h * reference_stride_ + w]);

+               - reference[h * reference_stride_ + w]);

       if (sad > max_sad) {

         break;

@@ -106,6 +114,32 @@

+  int width_, height_;

+  static uint8_t* source_data_;

+  int source_stride_;

+  static uint8_t* reference_data_;

+  int reference_stride_;

+  ACMRandom rnd_;

+};

+class SADTest : public SADTestBase,

+    public ::testing::WithParamInterface<

+        std::tr1::tuple<int, int, sad_m_by_n_fn_t> > {

+ public:

+  SADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}

+ protected:

+  unsigned int SAD(unsigned int max_sad, int block_idx = 0) {

+    unsigned int ret;

+    const uint8_t* const reference = GetReference(block_idx);

+    REGISTER_STATE_CHECK(ret = GET_PARAM(2)(source_data_, source_stride_,

+                                            reference, reference_stride_,

+                                            max_sad));

+    return ret;

+  }

   void CheckSad(unsigned int max_sad) {

     unsigned int reference_sad, exp_sad;

@@ -119,19 +153,38 @@

       ASSERT_GE(exp_sad, reference_sad);

+};

-  // Handle blocks up to 16x16 with stride up to 32

-  int height_, width_;

-  static uint8_t* source_data_;

-  int source_stride_;

-  static uint8_t* reference_data_;

-  int reference_stride_;

+class SADx4Test : public SADTestBase,

+    public ::testing::WithParamInterface<

+        std::tr1::tuple<int, int, sad_n_by_n_by_4_fn_t> > {

+ public:

+  SADx4Test() : SADTestBase(GET_PARAM(0), GET_PARAM(1)) {}

-  ACMRandom rnd_;

+ protected:

+  void SADs(unsigned int *results) {

+    const uint8_t* refs[] = {GetReference(0), GetReference(1),

+                             GetReference(2), GetReference(3)};

+    REGISTER_STATE_CHECK(GET_PARAM(2)(source_data_, source_stride_,

+                                      refs, reference_stride_,

+                                      results));

+  }

+  void CheckSADs() {

+    unsigned int reference_sad, exp_sad[4];

+    SADs(exp_sad);

+    for (int block = 0; block < 4; block++) {

+      reference_sad = ReferenceSAD(UINT_MAX, block);

+      EXPECT_EQ(exp_sad[block], reference_sad) << "block " << block;

+    }

+  }

};

-uint8_t* SADTest::source_data_ = NULL;

-uint8_t* SADTest::reference_data_ = NULL;

+uint8_t* SADTestBase::source_data_ = NULL;

+uint8_t* SADTestBase::reference_data_ = NULL;

 TEST_P(SADTest, MaxRef) {

   FillConstant(source_data_, source_stride_, 0);

@@ -139,6 +192,15 @@

   CheckSad(UINT_MAX);

+TEST_P(SADx4Test, MaxRef) {

+  FillConstant(source_data_, source_stride_, 0);

+  FillConstant(GetReference(0), reference_stride_, 255);

+  FillConstant(GetReference(1), reference_stride_, 255);

+  FillConstant(GetReference(2), reference_stride_, 255);

+  FillConstant(GetReference(3), reference_stride_, 255);

+  CheckSADs();

+}

 TEST_P(SADTest, MaxSrc) {

   FillConstant(source_data_, source_stride_, 255);

   FillConstant(reference_data_, reference_stride_, 0);

@@ -145,6 +207,15 @@

   CheckSad(UINT_MAX);

+TEST_P(SADx4Test, MaxSrc) {

+  FillConstant(source_data_, source_stride_, 255);

+  FillConstant(GetReference(0), reference_stride_, 0);

+  FillConstant(GetReference(1), reference_stride_, 0);

+  FillConstant(GetReference(2), reference_stride_, 0);

+  FillConstant(GetReference(3), reference_stride_, 0);

+  CheckSADs();

+}

 TEST_P(SADTest, ShortRef) {

   int tmp_stride = reference_stride_;

   reference_stride_ >>= 1;

@@ -154,6 +225,18 @@

   reference_stride_ = tmp_stride;

+TEST_P(SADx4Test, ShortRef) {

+  int tmp_stride = reference_stride_;

+  reference_stride_ >>= 1;

+  FillRandom(source_data_, source_stride_);

+  FillRandom(GetReference(0), reference_stride_);

+  FillRandom(GetReference(1), reference_stride_);

+  FillRandom(GetReference(2), reference_stride_);

+  FillRandom(GetReference(3), reference_stride_);

+  CheckSADs();

+  reference_stride_ = tmp_stride;

+}

 TEST_P(SADTest, UnalignedRef) {

   // The reference frame, but not the source frame, may be unaligned for

   // certain types of searches.

@@ -165,6 +248,20 @@

   reference_stride_ = tmp_stride;

+TEST_P(SADx4Test, UnalignedRef) {

+  // The reference frame, but not the source frame, may be unaligned for

+  // certain types of searches.

+  int tmp_stride = reference_stride_;

+  reference_stride_ -= 1;

+  FillRandom(source_data_, source_stride_);

+  FillRandom(GetReference(0), reference_stride_);

+  FillRandom(GetReference(1), reference_stride_);

+  FillRandom(GetReference(2), reference_stride_);

+  FillRandom(GetReference(3), reference_stride_);

+  CheckSADs();

+  reference_stride_ = tmp_stride;

+}

 TEST_P(SADTest, ShortSrc) {

   int tmp_stride = source_stride_;

   source_stride_ >>= 1;

@@ -174,6 +271,18 @@

   source_stride_ = tmp_stride;

+TEST_P(SADx4Test, ShortSrc) {

+  int tmp_stride = source_stride_;

+  source_stride_ >>= 1;

+  FillRandom(source_data_, source_stride_);

+  FillRandom(GetReference(0), reference_stride_);

+  FillRandom(GetReference(1), reference_stride_);

+  FillRandom(GetReference(2), reference_stride_);

+  FillRandom(GetReference(3), reference_stride_);

+  CheckSADs();

+  source_stride_ = tmp_stride;

+}

 TEST_P(SADTest, MaxSAD) {

   // Verify that, when max_sad is set, the implementation does not return a

   // value lower than the reference.

@@ -184,18 +293,62 @@

 using std::tr1::make_tuple;

+#if CONFIG_VP8_ENCODER && CONFIG_VP9_ENCODER

+#define VP8_VP9_SEPARATOR ,

+#else

+#define VP8_VP9_SEPARATOR

+#endif

+#if CONFIG_VP8_ENCODER

 const sad_m_by_n_fn_t sad_16x16_c = vp8_sad16x16_c;

 const sad_m_by_n_fn_t sad_8x16_c = vp8_sad8x16_c;

 const sad_m_by_n_fn_t sad_16x8_c = vp8_sad16x8_c;

 const sad_m_by_n_fn_t sad_8x8_c = vp8_sad8x8_c;

 const sad_m_by_n_fn_t sad_4x4_c = vp8_sad4x4_c;

+#endif

+#if CONFIG_VP9_ENCODER

+const sad_m_by_n_fn_t sad_64x64_c_vp9 = vp9_sad64x64_c;

+const sad_m_by_n_fn_t sad_32x32_c_vp9 = vp9_sad32x32_c;

+const sad_m_by_n_fn_t sad_16x16_c_vp9 = vp9_sad16x16_c;

+const sad_m_by_n_fn_t sad_8x16_c_vp9 = vp9_sad8x16_c;

+const sad_m_by_n_fn_t sad_16x8_c_vp9 = vp9_sad16x8_c;

+const sad_m_by_n_fn_t sad_8x8_c_vp9 = vp9_sad8x8_c;

+const sad_m_by_n_fn_t sad_4x4_c_vp9 = vp9_sad4x4_c;

+#endif

 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::Values(

+#if CONFIG_VP8_ENCODER

                         make_tuple(16, 16, sad_16x16_c),

                         make_tuple(8, 16, sad_8x16_c),

                         make_tuple(16, 8, sad_16x8_c),

                         make_tuple(8, 8, sad_8x8_c),

-                        make_tuple(4, 4, sad_4x4_c)));

+                        make_tuple(4, 4, sad_4x4_c)

+#endif

+                        VP8_VP9_SEPARATOR

+#if CONFIG_VP9_ENCODER

+                        make_tuple(64, 64, sad_64x64_c_vp9),

+                        make_tuple(32, 32, sad_32x32_c_vp9),

+                        make_tuple(16, 16, sad_16x16_c_vp9),

+                        make_tuple(8, 16, sad_8x16_c_vp9),

+                        make_tuple(16, 8, sad_16x8_c_vp9),

+                        make_tuple(8, 8, sad_8x8_c_vp9),

+                        make_tuple(4, 4, sad_4x4_c_vp9)

+#endif

+                        ));

+#if CONFIG_VP9_ENCODER

+const sad_n_by_n_by_4_fn_t sad_64x64x4d_c = vp9_sad64x64x4d_c;

+const sad_n_by_n_by_4_fn_t sad_32x32x4d_c = vp9_sad32x32x4d_c;

+const sad_n_by_n_by_4_fn_t sad_16x16x4d_c = vp9_sad16x16x4d_c;

+const sad_n_by_n_by_4_fn_t sad_8x8x4d_c = vp9_sad8x8x4d_c;

+const sad_n_by_n_by_4_fn_t sad_4x4x4d_c = vp9_sad4x4x4d_c;

+INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::Values(

+                        make_tuple(64, 64, sad_64x64x4d_c),

+                        make_tuple(32, 32, sad_32x32x4d_c),

+                        make_tuple(16, 16, sad_16x16x4d_c),

+                        make_tuple(8, 8, sad_8x8x4d_c),

+                        make_tuple(4, 4, sad_4x4x4d_c)));

+#endif

 // ARM tests

 #if HAVE_MEDIA

 const sad_m_by_n_fn_t sad_16x16_armv6 = vp8_sad16x16_armv6;

@@ -219,31 +372,120 @@

 // X86 tests

 #if HAVE_MMX

+#if CONFIG_VP8_ENCODER

 const sad_m_by_n_fn_t sad_16x16_mmx = vp8_sad16x16_mmx;

 const sad_m_by_n_fn_t sad_8x16_mmx = vp8_sad8x16_mmx;

 const sad_m_by_n_fn_t sad_16x8_mmx = vp8_sad16x8_mmx;

 const sad_m_by_n_fn_t sad_8x8_mmx = vp8_sad8x8_mmx;

 const sad_m_by_n_fn_t sad_4x4_mmx = vp8_sad4x4_mmx;

+#endif

+#if CONFIG_VP9_ENCODER

+const sad_m_by_n_fn_t sad_16x16_mmx_vp9 = vp9_sad16x16_mmx;

+const sad_m_by_n_fn_t sad_8x16_mmx_vp9 = vp9_sad8x16_mmx;

+const sad_m_by_n_fn_t sad_16x8_mmx_vp9 = vp9_sad16x8_mmx;

+const sad_m_by_n_fn_t sad_8x8_mmx_vp9 = vp9_sad8x8_mmx;

+const sad_m_by_n_fn_t sad_4x4_mmx_vp9 = vp9_sad4x4_mmx;

+#endif

 INSTANTIATE_TEST_CASE_P(MMX, SADTest, ::testing::Values(

+#if CONFIG_VP8_ENCODER

                         make_tuple(16, 16, sad_16x16_mmx),

                         make_tuple(8, 16, sad_8x16_mmx),

                         make_tuple(16, 8, sad_16x8_mmx),

                         make_tuple(8, 8, sad_8x8_mmx),

-                        make_tuple(4, 4, sad_4x4_mmx)));

+                        make_tuple(4, 4, sad_4x4_mmx)

 #endif

+                        VP8_VP9_SEPARATOR

+#if CONFIG_VP9_ENCODER

+                        make_tuple(16, 16, sad_16x16_mmx_vp9),

+                        make_tuple(8, 16, sad_8x16_mmx_vp9),

+                        make_tuple(16, 8, sad_16x8_mmx_vp9),

+                        make_tuple(8, 8, sad_8x8_mmx_vp9),

+                        make_tuple(4, 4, sad_4x4_mmx_vp9)

+#endif

+                        ));

+#endif

+#if HAVE_SSE

+#if CONFIG_VP9_ENCODER

+const sad_m_by_n_fn_t sad_4x4_sse_vp9 = vp9_sad4x4_sse;

+INSTANTIATE_TEST_CASE_P(SSE, SADTest, ::testing::Values(

+                        make_tuple(4, 4, sad_4x4_sse_vp9)));

+const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse = vp9_sad4x4x4d_sse;

+INSTANTIATE_TEST_CASE_P(SSE, SADx4Test, ::testing::Values(

+                        make_tuple(4, 4, sad_4x4x4d_sse)));

+#endif

+#endif

 #if HAVE_SSE2

+#if CONFIG_VP8_ENCODER

 const sad_m_by_n_fn_t sad_16x16_wmt = vp8_sad16x16_wmt;

 const sad_m_by_n_fn_t sad_8x16_wmt = vp8_sad8x16_wmt;

 const sad_m_by_n_fn_t sad_16x8_wmt = vp8_sad16x8_wmt;

 const sad_m_by_n_fn_t sad_8x8_wmt = vp8_sad8x8_wmt;

 const sad_m_by_n_fn_t sad_4x4_wmt = vp8_sad4x4_wmt;

+#endif

+#if CONFIG_VP9_ENCODER

+const sad_m_by_n_fn_t sad_64x64_sse2_vp9 = vp9_sad64x64_sse2;

+const sad_m_by_n_fn_t sad_32x32_sse2_vp9 = vp9_sad32x32_sse2;

+const sad_m_by_n_fn_t sad_16x16_sse2_vp9 = vp9_sad16x16_sse2;

+const sad_m_by_n_fn_t sad_8x16_sse2_vp9 = vp9_sad8x16_sse2;

+const sad_m_by_n_fn_t sad_16x8_sse2_vp9 = vp9_sad16x8_sse2;

+const sad_m_by_n_fn_t sad_8x8_sse2_vp9 = vp9_sad8x8_sse2;

+#endif

 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::Values(

+#if CONFIG_VP8_ENCODER

                         make_tuple(16, 16, sad_16x16_wmt),

                         make_tuple(8, 16, sad_8x16_wmt),

                         make_tuple(16, 8, sad_16x8_wmt),

                         make_tuple(8, 8, sad_8x8_wmt),

-                        make_tuple(4, 4, sad_4x4_wmt)));

+                        make_tuple(4, 4, sad_4x4_wmt)

 #endif

+                        VP8_VP9_SEPARATOR

+#if CONFIG_VP9_ENCODER

+                        make_tuple(64, 64, sad_64x64_sse2_vp9),

+                        make_tuple(32, 32, sad_32x32_sse2_vp9),

+                        make_tuple(16, 16, sad_16x16_sse2_vp9),

+                        make_tuple(8, 16, sad_8x16_sse2_vp9),

+                        make_tuple(16, 8, sad_16x8_sse2_vp9),

+                        make_tuple(8, 8, sad_8x8_sse2_vp9)

+#endif

+                        ));

+#if CONFIG_VP9_ENCODER

+const sad_n_by_n_by_4_fn_t sad_64x64x4d_sse2 = vp9_sad64x64x4d_sse2;

+const sad_n_by_n_by_4_fn_t sad_32x32x4d_sse2 = vp9_sad32x32x4d_sse2;

+const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse2 = vp9_sad16x16x4d_sse2;

+const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse2 = vp9_sad16x8x4d_sse2;

+const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse2 = vp9_sad8x16x4d_sse2;

+const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse2 = vp9_sad8x8x4d_sse2;

+INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::Values(

+                        make_tuple(64, 64, sad_64x64x4d_sse2),

+                        make_tuple(32, 32, sad_32x32x4d_sse2),

+                        make_tuple(16, 16, sad_16x16x4d_sse2),

+                        make_tuple(16, 8, sad_16x8x4d_sse2),

+                        make_tuple(8, 16, sad_8x16x4d_sse2),

+                        make_tuple(8, 8, sad_8x8x4d_sse2)));

+#endif

+#endif

+#if HAVE_SSE3

+#if CONFIG_VP8_ENCODER

+const sad_n_by_n_by_4_fn_t sad_16x16x4d_sse3 = vp8_sad16x16x4d_sse3;

+const sad_n_by_n_by_4_fn_t sad_16x8x4d_sse3 = vp8_sad16x8x4d_sse3;

+const sad_n_by_n_by_4_fn_t sad_8x16x4d_sse3 = vp8_sad8x16x4d_sse3;

+const sad_n_by_n_by_4_fn_t sad_8x8x4d_sse3 = vp8_sad8x8x4d_sse3;

+const sad_n_by_n_by_4_fn_t sad_4x4x4d_sse3 = vp8_sad4x4x4d_sse3;

+INSTANTIATE_TEST_CASE_P(SSE3, SADx4Test, ::testing::Values(

+                        make_tuple(16, 16, sad_16x16x4d_sse3),

+                        make_tuple(16, 8, sad_16x8x4d_sse3),

+                        make_tuple(8, 16, sad_8x16x4d_sse3),

+                        make_tuple(8, 8, sad_8x8x4d_sse3),

+                        make_tuple(4, 4, sad_4x4x4d_sse3)));

+#endif

+#endif

 #if HAVE_SSSE3

 const sad_m_by_n_fn_t sad_16x16_sse3 = vp8_sad16x16_sse3;

 INSTANTIATE_TEST_CASE_P(SSE3, SADTest, ::testing::Values(

--- /dev/null

+++ b/test/superframe_test.cc

@@ -1,0 +1,100 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <climits>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

+#include "test/encode_test_driver.h"

+#include "test/i420_video_source.h"

+#include "test/util.h"

+namespace {

+class SuperframeTest : public ::libvpx_test::EncoderTest,

+    public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {

+ protected:

+  SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(NULL),

+      last_sf_pts_(0) {}

+  virtual void SetUp() {

+    InitializeConfig();

+    SetMode(GET_PARAM(1));

+    sf_count_ = 0;

+    sf_count_max_ = INT_MAX;

+  }

+  virtual void TearDown() {

+    delete modified_buf_;

+  }

+  virtual bool Continue() const {

+    return !HasFatalFailure() && !abort_;

+  }

+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,

+                                  libvpx_test::Encoder *encoder) {

+    if (video->frame() == 1) {

+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);

+    }

+  }

+  virtual const vpx_codec_cx_pkt_t * MutateEncoderOutputHook(

+      const vpx_codec_cx_pkt_t *pkt) {

+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)

+      return pkt;

+    const uint8_t *buffer = reinterpret_cast<uint8_t*>(pkt->data.frame.buf);

+    const uint8_t marker = buffer[pkt->data.frame.sz - 1];

+    const int frames = (marker & 0x7) + 1;

+    const int mag = ((marker >> 3) & 3) + 1;

+    const unsigned int index_sz = 2 + mag  * frames;

+    if ((marker & 0xe0) == 0xc0 &&

+        pkt->data.frame.sz >= index_sz &&

+        buffer[pkt->data.frame.sz - index_sz] == marker) {

+      // frame is a superframe. strip off the index.

+      if (modified_buf_)

+        delete modified_buf_;

+      modified_buf_ = new uint8_t[pkt->data.frame.sz - index_sz];

+      memcpy(modified_buf_, pkt->data.frame.buf,

+             pkt->data.frame.sz - index_sz);

+      modified_pkt_ = *pkt;

+      modified_pkt_.data.frame.buf = modified_buf_;

+      modified_pkt_.data.frame.sz -= index_sz;

+      sf_count_++;

+      last_sf_pts_ = pkt->data.frame.pts;

+      return &modified_pkt_;

+    }

+    // Make sure we do a few frames after the last SF

+    abort_ |= sf_count_ > sf_count_max_ &&

+              pkt->data.frame.pts - last_sf_pts_ >= 5;

+    return pkt;

+  }

+  int sf_count_;

+  int sf_count_max_;

+  vpx_codec_cx_pkt_t modified_pkt_;

+  uint8_t *modified_buf_;

+  vpx_codec_pts_t last_sf_pts_;

+};

+TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {

+  sf_count_max_ = 0;  // early exit on successful test.

+  cfg_.g_lag_in_frames = 25;

+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,

+                                       30, 1, 0, 40);

+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

+  EXPECT_EQ(sf_count_, 1);

+}

+VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Values(

+    ::libvpx_test::kTwoPassGood));

+}  // namespace

--- a/test/test.mk

+++ b/test/test.mk

@@ -1,7 +1,8 @@

 LIBVPX_TEST_SRCS-yes += register_state_check.h

 LIBVPX_TEST_SRCS-yes += test.mk

 LIBVPX_TEST_SRCS-yes += acm_random.h

+LIBVPX_TEST_SRCS-yes += md5_helper.h

+LIBVPX_TEST_SRCS-yes += codec_factory.h

 LIBVPX_TEST_SRCS-yes += test_libvpx.cc

 LIBVPX_TEST_SRCS-yes += util.h

 LIBVPX_TEST_SRCS-yes += video_source.h

@@ -15,17 +16,20 @@

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += datarate_test.cc

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.cc

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += encode_test_driver.h

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += error_resilience_test.cc

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += i420_video_source.h

+LIBVPX_TEST_SRCS-yes                   += encode_test_driver.cc

+LIBVPX_TEST_SRCS-yes                   += encode_test_driver.h

+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += error_resilience_test.cc

+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += i420_video_source.h

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += resize_test.cc

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ../md5_utils.h ../md5_utils.c

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.cc

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += decode_test_driver.h

-LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += ivf_video_source.h

+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ../md5_utils.h ../md5_utils.c

+LIBVPX_TEST_SRCS-yes                   += decode_test_driver.cc

+LIBVPX_TEST_SRCS-yes                   += decode_test_driver.h

+LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += ivf_video_source.h

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += test_vector_test.cc

##

@@ -44,10 +48,10 @@

 LIBVPX_TEST_SRCS-yes                   += vp8_boolcoder_test.cc

 endif

-LIBVPX_TEST_SRCS-yes                   += idctllm_test.cc

+LIBVPX_TEST_SRCS-yes                   += idct_test.cc

 LIBVPX_TEST_SRCS-yes                   += intrapred_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc

-LIBVPX_TEST_SRCS-yes                   += sad_test.cc

+LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS)    += sad_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc

 LIBVPX_TEST_SRCS-yes                   += sixtap_predict_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc

@@ -66,13 +70,18 @@

 # IDCT test currently depends on FDCT function

 LIBVPX_TEST_SRCS-yes                   += idct8x8_test.cc

+LIBVPX_TEST_SRCS-yes                   += superframe_test.cc

+LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc

 endif

+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc

-#LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc

+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc

 endif # VP9

@@ -82,7 +91,8 @@

##

 ## TEST DATA

##

-LIBVPX_TEST_DATA-$(CONFIG_VP8_ENCODER) += hantro_collage_w352h288.yuv

+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv

 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf

 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf

 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf

--- a/test/test_vector_test.cc

+++ b/test/test_vector_test.cc

@@ -12,17 +12,15 @@

 #include <cstdlib>

 #include <string>

 #include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

 #include "test/decode_test_driver.h"

 #include "test/ivf_video_source.h"

+#include "test/util.h"

+#include "test/md5_helper.h"

 extern "C" {

-#include "./md5_utils.h"

 #include "vpx_mem/vpx_mem.h"

-#if defined(_MSC_VER)

-#define snprintf sprintf_s

-#endif

 namespace {

 // There are 61 test vectors in total.

 const char *kTestVectors[] = {

@@ -59,10 +57,10 @@

   "vp80-05-sharpness-1440.ivf", "vp80-05-sharpness-1443.ivf"

};

-class TestVectorTest : public libvpx_test::DecoderTest,

-    public ::testing::TestWithParam<const char*> {

+class TestVectorTest : public ::libvpx_test::DecoderTest,

+    public ::libvpx_test::CodecTestWithParam<const char*> {

  protected:

-  TestVectorTest() : md5_file_(NULL) {}

+  TestVectorTest() : DecoderTest(GET_PARAM(0)), md5_file_(NULL) {}

   virtual ~TestVectorTest() {

     if (md5_file_)

@@ -85,31 +83,10 @@

     ASSERT_NE(res, EOF) << "Read md5 data failed";

     expected_md5[32] = '\0';

-    MD5Context md5;

-    MD5Init(&md5);

+    ::libvpx_test::MD5 md5_res;

+    md5_res.Add(&img);

+    const char *actual_md5 = md5_res.Get();

-    // Compute and update md5 for each raw in decompressed data.

-    for (int plane = 0; plane < 3; ++plane) {

-      uint8_t *buf = img.planes[plane];

-      for (unsigned int y = 0; y < (plane ? (img.d_h + 1) >> 1 : img.d_h);

-           ++y) {

-        MD5Update(&md5, buf, (plane ? (img.d_w + 1) >> 1 : img.d_w));

-        buf += img.stride[plane];

-      }

-    }

-    uint8_t md5_sum[16];

-    MD5Final(md5_sum, &md5);

-    char actual_md5[33];

-    // Convert to get the actual md5.

-    for (int i = 0; i < 16; i++) {

-      snprintf(&actual_md5[i * 2], sizeof(actual_md5) - i * 2, "%02x",

-               md5_sum[i]);

-    }

-    actual_md5[32] = '\0';

     // Check md5 match.

     ASSERT_STREQ(expected_md5, actual_md5)

         << "Md5 checksums don't match: frame number = " << frame_number;

@@ -124,7 +101,7 @@

 // checksums match the correct md5 data, then the test is passed. Otherwise,

 // the test failed.

 TEST_P(TestVectorTest, MD5Match) {

-  const std::string filename = GetParam();

+  const std::string filename = GET_PARAM(1);

   // Open compressed video file.

   libvpx_test::IVFVideoSource video(filename);

@@ -138,7 +115,7 @@

   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

-INSTANTIATE_TEST_CASE_P(TestVectorSequence, TestVectorTest,

-                        ::testing::ValuesIn(kTestVectors));

+VP8_INSTANTIATE_TEST_CASE(TestVectorTest,

+                          ::testing::ValuesIn(kTestVectors));

 }  // namespace

--- /dev/null

+++ b/test/tile_independence_test.cc

@@ -1,0 +1,102 @@

+/*

+ Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ Use of this source code is governed by a BSD-style license

+ that can be found in the LICENSE file in the root of the source

+ tree. An additional intellectual property rights grant can be found

+ in the file PATENTS.  All contributing project authors may

+ be found in the AUTHORS file in the root of the source tree.

+ */

+#include <cstdio>

+#include <cstdlib>

+#include <string>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/codec_factory.h"

+#include "test/encode_test_driver.h"

+#include "test/i420_video_source.h"

+#include "test/util.h"

+#include "test/md5_helper.h"

+extern "C" {

+#include "vpx_mem/vpx_mem.h"

+}

+namespace {

+class TileIndependenceTest : public ::libvpx_test::EncoderTest,

+    public ::libvpx_test::CodecTestWithParam<int> {

+ protected:

+  TileIndependenceTest() : EncoderTest(GET_PARAM(0)), n_tiles_(GET_PARAM(1)),

+      md5_fw_order_(), md5_inv_order_() {

+    init_flags_ = VPX_CODEC_USE_PSNR;

+    vpx_codec_dec_cfg_t cfg;

+    cfg.w = 704;

+    cfg.h = 144;

+    cfg.threads = 1;

+    fw_dec_ = codec_->CreateDecoder(cfg, 0);

+    inv_dec_ = codec_->CreateDecoder(cfg, 0);

+    inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);

+  }

+  virtual ~TileIndependenceTest() {

+    delete fw_dec_;

+    delete inv_dec_;

+  }

+  virtual void SetUp() {

+    InitializeConfig();

+    SetMode(libvpx_test::kTwoPassGood);

+  }

+  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,

+                                  libvpx_test::Encoder *encoder) {

+    if (video->frame() == 1) {

+      encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_);

+    }

+  }

+  void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,

+                 ::libvpx_test::MD5 *md5) {

+    dec->DecodeFrame((uint8_t *) pkt->data.frame.buf, pkt->data.frame.sz);

+    const vpx_image_t *img = dec->GetDxData().Next();

+    md5->Add(img);

+  }

+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {

+    UpdateMD5(fw_dec_, pkt, &md5_fw_order_);

+    UpdateMD5(inv_dec_, pkt, &md5_inv_order_);

+  }

+ private:

+  int n_tiles_;

+ protected:

+  ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;

+  ::libvpx_test::Decoder *fw_dec_, *inv_dec_;

+};

+// run an encode with 2 or 4 tiles, and do the decode both in normal and

+// inverted tile ordering. Ensure that the MD5 of the output in both cases

+// is identical. If so, tiles are considered independent and the test passes.

+TEST_P(TileIndependenceTest, MD5Match) {

+  const vpx_rational timebase = { 33333333, 1000000000 };

+  cfg_.g_timebase = timebase;

+  cfg_.rc_target_bitrate = 500;

+  cfg_.g_lag_in_frames = 25;

+  cfg_.rc_end_usage = VPX_VBR;

+  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 144,

+                                     timebase.den, timebase.num, 0, 30);

+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

+  const char *md5_fw_str  = md5_fw_order_.Get();

+  const char *md5_inv_str = md5_inv_order_.Get();

+  // could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer

+  // output if it fails. Not sure if it's helpful since it's really just

+  // a MD5...

+  ASSERT_STREQ(md5_fw_str, md5_inv_str);

+}

+VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest,

+                          ::testing::Range(0, 2, 1));

+}  // namespace

--- a/test/util.h

+++ b/test/util.h

@@ -11,8 +11,38 @@

 #ifndef TEST_UTIL_H_

 #define TEST_UTIL_H_

+#include <stdio.h>

+#include <math.h>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "vpx/vpx_image.h"

 // Macros

 #define PARAMS(...) ::testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >

 #define GET_PARAM(k) std::tr1::get< k >(GetParam())

+static double compute_psnr(const vpx_image_t *img1,

+                           const vpx_image_t *img2) {

+  assert((img1->fmt == img2->fmt) &&

+         (img1->d_w == img2->d_w) &&

+         (img1->d_h == img2->d_h));

+  const unsigned int width_y  = img1->d_w;

+  const unsigned int height_y = img1->d_h;

+  unsigned int i, j;

+  int64_t sqrerr = 0;

+  for (i = 0; i < height_y; ++i)

+    for (j = 0; j < width_y; ++j) {

+      int64_t d = img1->planes[VPX_PLANE_Y][i * img1->stride[VPX_PLANE_Y] + j] -

+                  img2->planes[VPX_PLANE_Y][i * img2->stride[VPX_PLANE_Y] + j];

+      sqrerr += d * d;

+    }

+  double mse = sqrerr / (width_y * height_y);

+  double psnr = 100.0;

+  if (mse > 0.0) {

+    psnr = 10 * log10(255.0 * 255.0 / mse);

+  }

+  return psnr;

+}

 #endif  // TEST_UTIL_H_

--- a/vp8/decoder/onyxd_if.c

+++ b/vp8/decoder/onyxd_if.c

@@ -302,7 +302,7 @@

     return 1;

 int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,

                                   const uint8_t *source,

                                   int64_t time_stamp)

--- a/vp8/encoder/bitstream.c

+++ b/vp8/encoder/bitstream.c

@@ -50,7 +50,7 @@

 unsigned __int64 Sectionbits[500];

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 int intra_mode_stats[10][10][10];

 static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] [2];

 extern unsigned int active_section;

@@ -531,7 +531,7 @@

     vp8_convert_rfct_to_prob(cpi);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

     active_section = 1;

 #endif

@@ -580,7 +580,7 @@

             xd->mb_to_top_edge = -((mb_row * 16)) << 3;

             xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

             active_section = 9;

 #endif

@@ -593,7 +593,7 @@

             if (rf == INTRA_FRAME)

                 vp8_write(w, 0, cpi->prob_intra_coded);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                 active_section = 6;

 #endif

                 write_ymode(w, mode, pc->fc.ymode_prob);

@@ -633,13 +633,13 @@

                     vp8_mv_ref_probs(mv_ref_p, ct);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                     accum_mv_refs(mode, ct);

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                 active_section = 3;

 #endif

@@ -649,7 +649,7 @@

                 case NEWMV:

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                     active_section = 5;

 #endif

@@ -692,7 +692,7 @@

                         if (blockmode == NEW4X4)

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                             active_section = 11;

 #endif

                             write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *) mvc);

@@ -769,7 +769,7 @@

                     const B_PREDICTION_MODE L = left_block_mode(m, i);

                     const int bm = m->bmi[i].as_mode;

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                     ++intra_mode_stats [A] [L] [bm];

 #endif

@@ -1160,7 +1160,7 @@

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                     ++ tree_update_hist [i][j][k][t] [u];

 #endif

@@ -1181,7 +1181,7 @@

                 while (++t < ENTROPY_NODES);

                 /* Accum token counts for generation of default statistics */

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

                 t = 0;

do

@@ -1527,7 +1527,7 @@

     if (pc->frame_type != KEY_FRAME)

         vp8_write_bit(bc, pc->refresh_last_frame);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

     if (pc->frame_type == INTER_FRAME)

         active_section = 0;

@@ -1550,7 +1550,7 @@

     vp8_update_coef_probs(cpi);

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

     active_section = 2;

 #endif

@@ -1561,7 +1561,7 @@

         write_kfmodes(cpi);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

         active_section = 8;

 #endif

@@ -1569,7 +1569,7 @@

         pack_inter_mode_mvs(cpi);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

         active_section = 1;

 #endif

@@ -1687,7 +1687,7 @@

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 void print_tree_update_probs()

     int i, j, k, l;

--- a/vp8/encoder/boolhuff.c

+++ b/vp8/encoder/boolhuff.c

@@ -16,7 +16,7 @@

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 unsigned int active_section = 0;

 #endif

--- a/vp8/encoder/boolhuff.h

+++ b/vp8/encoder/boolhuff.h

@@ -67,7 +67,7 @@

     unsigned int lowvalue = br->lowvalue;

     register unsigned int shift;

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 #if defined(SECTIONBITS_OUTPUT)

     if (bit)

--- a/vp8/encoder/encodemv.c

+++ b/vp8/encoder/encodemv.c

@@ -16,7 +16,7 @@

 #include <math.h>

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 extern unsigned int active_section;

 #endif

@@ -359,7 +359,7 @@

     vp8_writer *const w  = cpi->bc;

     MV_CONTEXT *mvc = cpi->common.fc.mvc;

     int flags[2] = {0, 0};

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

     active_section = 4;

 #endif

     write_component_probs(

@@ -374,7 +374,7 @@

     if (flags[0] || flags[1])

         vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

     active_section = 5;

 #endif

--- a/vp8/encoder/mcomp.c

+++ b/vp8/encoder/mcomp.c

@@ -18,7 +18,7 @@

 #include <math.h>

 #include "vp8/common/findnearmv.h"

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 static int mv_ref_ct [31] [4] [2];

 static int mv_mode_cts [4] [2];

 #endif

@@ -1912,7 +1912,7 @@

            + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 void print_mode_context(void)

     FILE *f = fopen("modecont.c", "w");

@@ -1965,8 +1965,8 @@

     fclose(f);

-/* MV ref count ENTROPY_STATS stats code */

-#ifdef ENTROPY_STATS

+/* MV ref count VP8_ENTROPY_STATS stats code */

+#ifdef VP8_ENTROPY_STATS

 void init_mv_ref_counts()

     vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));

@@ -2020,6 +2020,6 @@

-#endif/* END MV ref count ENTROPY_STATS stats code */

+#endif/* END MV ref count VP8_ENTROPY_STATS stats code */

 #endif

--- a/vp8/encoder/mcomp.h

+++ b/vp8/encoder/mcomp.h

@@ -15,7 +15,7 @@

 #include "block.h"

 #include "vp8/common/variance.h"

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 extern void init_mv_ref_counts();

 extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);

 #endif

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -111,7 +111,7 @@

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 extern int intra_mode_stats[10][10][10];

 #endif

@@ -1805,7 +1805,7 @@

     else

         cpi->cyclic_refresh_map = (signed char *) NULL;

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

     init_context_counters();

 #endif

@@ -1923,7 +1923,7 @@

         cpi->mb.rd_thresh_mult[i] = 128;

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

     init_mv_ref_counts();

 #endif

@@ -2060,7 +2060,7 @@

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

         print_context_counters();

         print_tree_update_probs();

         print_mode_context();

@@ -2242,7 +2242,7 @@

 #endif

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

             int i, j, k;

             FILE *fmode = fopen("modecontext.c", "w");

--- a/vp8/encoder/tokenize.c

+++ b/vp8/encoder/tokenize.c

@@ -20,7 +20,7 @@

 /* Global event counters used for accumulating statistics across several

    compressions, then generating context.c = initial stats. */

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

 #endif

 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) ;

@@ -413,7 +413,7 @@

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 void init_context_counters(void)

--- a/vp8/encoder/tokenize.h

+++ b/vp8/encoder/tokenize.h

@@ -33,7 +33,7 @@

 int rd_cost_mby(MACROBLOCKD *);

-#ifdef ENTROPY_STATS

+#ifdef VP8_ENTROPY_STATS

 void init_context_counters();

 void print_context_counters();

--- a/vp8/encoder/x86/quantize_sse2.c

+++ /dev/null

@@ -1,229 +1,0 @@

-/*

- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "vpx_config.h"

-#include "vp8_rtcd.h"

-#include "vpx_ports/x86.h"

-#include "vpx_mem/vpx_mem.h"

-#include "vp8/encoder/block.h"

-#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

-#include <mmintrin.h> /* MMX */

-#include <xmmintrin.h> /* SSE */

-#include <emmintrin.h> /* SSE2 */

-#define SELECT_EOB(i, z) \

-    do { \

-        short boost = *zbin_boost_ptr; \

-        int cmp = (x[z] < boost) | (y[z] == 0); \

-        zbin_boost_ptr++; \

-        if (cmp) \

-            goto select_eob_end_##i; \

-        qcoeff_ptr[z] = y[z]; \

-        eob = i; \

-        zbin_boost_ptr = b->zrun_zbin_boost; \

-        select_eob_end_##i:; \

-    } while (0)

-void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)

-{

-    char eob = 0;

-    short *zbin_boost_ptr  = b->zrun_zbin_boost;

-    short *qcoeff_ptr      = d->qcoeff;

-    DECLARE_ALIGNED_ARRAY(16, short, x, 16);

-    DECLARE_ALIGNED_ARRAY(16, short, y, 16);

-    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;

-    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));

-    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));

-    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));

-    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));

-    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);

-    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));

-    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));

-    __m128i round0 = _mm_load_si128((__m128i *)(b->round));

-    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));

-    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));

-    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));

-    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));

-    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

-    vpx_memset(qcoeff_ptr, 0, 32);

-    /* Duplicate to all lanes. */

-    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);

-    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

-    /* Sign of z: z >> 15 */

-    sz0 = _mm_srai_epi16(z0, 15);

-    sz1 = _mm_srai_epi16(z1, 15);

-    /* x = abs(z): (z ^ sz) - sz */

-    x0 = _mm_xor_si128(z0, sz0);

-    x1 = _mm_xor_si128(z1, sz1);

-    x0 = _mm_sub_epi16(x0, sz0);

-    x1 = _mm_sub_epi16(x1, sz1);

-    /* zbin[] + zbin_extra */

-    zbin0 = _mm_add_epi16(zbin0, zbin_extra);

-    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

-    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance

-     * the equation because boost is the only value which can change:

-     * x - (zbin[] + extra) >= boost */

-    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);

-    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

-    _mm_store_si128((__m128i *)(x), x_minus_zbin0);

-    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

-    /* All the remaining calculations are valid whether they are done now with

-     * simd or later inside the loop one at a time. */

-    x0 = _mm_add_epi16(x0, round0);

-    x1 = _mm_add_epi16(x1, round1);

-    y0 = _mm_mulhi_epi16(x0, quant0);

-    y1 = _mm_mulhi_epi16(x1, quant1);

-    y0 = _mm_add_epi16(y0, x0);

-    y1 = _mm_add_epi16(y1, x1);

-    /* Instead of shifting each value independently we convert the scaling

-     * factor with 1 << (16 - shift) so we can use multiply/return high half. */

-    y0 = _mm_mulhi_epi16(y0, quant_shift0);

-    y1 = _mm_mulhi_epi16(y1, quant_shift1);

-    /* Return the sign: (y ^ sz) - sz */

-    y0 = _mm_xor_si128(y0, sz0);

-    y1 = _mm_xor_si128(y1, sz1);

-    y0 = _mm_sub_epi16(y0, sz0);

-    y1 = _mm_sub_epi16(y1, sz1);

-    _mm_store_si128((__m128i *)(y), y0);

-    _mm_store_si128((__m128i *)(y + 8), y1);

-    zbin_boost_ptr = b->zrun_zbin_boost;

-    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */

-    SELECT_EOB(1, 0);

-    SELECT_EOB(2, 1);

-    SELECT_EOB(3, 4);

-    SELECT_EOB(4, 8);

-    SELECT_EOB(5, 5);

-    SELECT_EOB(6, 2);

-    SELECT_EOB(7, 3);

-    SELECT_EOB(8, 6);

-    SELECT_EOB(9, 9);

-    SELECT_EOB(10, 12);

-    SELECT_EOB(11, 13);

-    SELECT_EOB(12, 10);

-    SELECT_EOB(13, 7);

-    SELECT_EOB(14, 11);

-    SELECT_EOB(15, 14);

-    SELECT_EOB(16, 15);

-    y0 = _mm_load_si128((__m128i *)(d->qcoeff));

-    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

-    /* dqcoeff = qcoeff * dequant */

-    y0 = _mm_mullo_epi16(y0, dequant0);

-    y1 = _mm_mullo_epi16(y1, dequant1);

-    _mm_store_si128((__m128i *)(d->dqcoeff), y0);

-    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

-    *d->eob = eob;

-}

-void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)

-{

-  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));

-  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));

-  __m128i round0 = _mm_load_si128((__m128i *)(b->round));

-  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));

-  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));

-  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));

-  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));

-  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

-  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));

-  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

-  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

-  /* sign of z: z >> 15 */

-  sz0 = _mm_srai_epi16(z0, 15);

-  sz1 = _mm_srai_epi16(z1, 15);

-  /* x = abs(z): (z ^ sz) - sz */

-  x0 = _mm_xor_si128(z0, sz0);

-  x1 = _mm_xor_si128(z1, sz1);

-  x0 = _mm_sub_epi16(x0, sz0);

-  x1 = _mm_sub_epi16(x1, sz1);

-  /* x += round */

-  x0 = _mm_add_epi16(x0, round0);

-  x1 = _mm_add_epi16(x1, round1);

-  /* y = (x * quant) >> 16 */

-  y0 = _mm_mulhi_epi16(x0, quant_fast0);

-  y1 = _mm_mulhi_epi16(x1, quant_fast1);

-  /* x = abs(y) = (y ^ sz) - sz */

-  y0 = _mm_xor_si128(y0, sz0);

-  y1 = _mm_xor_si128(y1, sz1);

-  x0 = _mm_sub_epi16(y0, sz0);

-  x1 = _mm_sub_epi16(y1, sz1);

-  /* qcoeff = x */

-  _mm_store_si128((__m128i *)(d->qcoeff), x0);

-  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

-  /* x * dequant */

-  xdq0 = _mm_mullo_epi16(x0, dequant0);

-  xdq1 = _mm_mullo_epi16(x1, dequant1);

-  /* dqcoeff = x * dequant */

-  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);

-  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

-  /* build a mask for the zig zag */

-  zeros = _mm_setzero_si128();

-  x0 = _mm_cmpeq_epi16(x0, zeros);

-  x1 = _mm_cmpeq_epi16(x1, zeros);

-  ones = _mm_cmpeq_epi16(zeros, zeros);

-  x0 = _mm_xor_si128(x0, ones);

-  x1 = _mm_xor_si128(x1, ones);

-  x0 = _mm_and_si128(x0, inv_zig_zag0);

-  x1 = _mm_and_si128(x1, inv_zig_zag1);

-  x0 = _mm_max_epi16(x0, x1);

-  /* now down to 8 */

-  x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110

-  x0 = _mm_max_epi16(x0, x1);

-  /* only 4 left */

-  x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110

-  x0 = _mm_max_epi16(x0, x1);

-  /* okay, just 2! */

-  x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001

-  x0 = _mm_max_epi16(x0, x1);

-  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);

-}

--- /dev/null

+++ b/vp8/encoder/x86/quantize_sse2_intrinsics.c

@@ -1,0 +1,229 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vpx_config.h"

+#include "vp8_rtcd.h"

+#include "vpx_ports/x86.h"

+#include "vpx_mem/vpx_mem.h"

+#include "vp8/encoder/block.h"

+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

+#include <mmintrin.h> /* MMX */

+#include <xmmintrin.h> /* SSE */

+#include <emmintrin.h> /* SSE2 */

+#define SELECT_EOB(i, z) \

+    do { \

+        short boost = *zbin_boost_ptr; \

+        int cmp = (x[z] < boost) | (y[z] == 0); \

+        zbin_boost_ptr++; \

+        if (cmp) \

+            goto select_eob_end_##i; \

+        qcoeff_ptr[z] = y[z]; \

+        eob = i; \

+        zbin_boost_ptr = b->zrun_zbin_boost; \

+        select_eob_end_##i:; \

+    } while (0)

+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)

+{

+    char eob = 0;

+    short *zbin_boost_ptr  = b->zrun_zbin_boost;

+    short *qcoeff_ptr      = d->qcoeff;

+    DECLARE_ALIGNED_ARRAY(16, short, x, 16);

+    DECLARE_ALIGNED_ARRAY(16, short, y, 16);

+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;

+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));

+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));

+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));

+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));

+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);

+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));

+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));

+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));

+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));

+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));

+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));

+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));

+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

+    vpx_memset(qcoeff_ptr, 0, 32);

+    /* Duplicate to all lanes. */

+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);

+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

+    /* Sign of z: z >> 15 */

+    sz0 = _mm_srai_epi16(z0, 15);

+    sz1 = _mm_srai_epi16(z1, 15);

+    /* x = abs(z): (z ^ sz) - sz */

+    x0 = _mm_xor_si128(z0, sz0);

+    x1 = _mm_xor_si128(z1, sz1);

+    x0 = _mm_sub_epi16(x0, sz0);

+    x1 = _mm_sub_epi16(x1, sz1);

+    /* zbin[] + zbin_extra */

+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);

+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

+    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance

+     * the equation because boost is the only value which can change:

+     * x - (zbin[] + extra) >= boost */

+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);

+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);

+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

+    /* All the remaining calculations are valid whether they are done now with

+     * simd or later inside the loop one at a time. */

+    x0 = _mm_add_epi16(x0, round0);

+    x1 = _mm_add_epi16(x1, round1);

+    y0 = _mm_mulhi_epi16(x0, quant0);

+    y1 = _mm_mulhi_epi16(x1, quant1);

+    y0 = _mm_add_epi16(y0, x0);

+    y1 = _mm_add_epi16(y1, x1);

+    /* Instead of shifting each value independently we convert the scaling

+     * factor with 1 << (16 - shift) so we can use multiply/return high half. */

+    y0 = _mm_mulhi_epi16(y0, quant_shift0);

+    y1 = _mm_mulhi_epi16(y1, quant_shift1);

+    /* Return the sign: (y ^ sz) - sz */

+    y0 = _mm_xor_si128(y0, sz0);

+    y1 = _mm_xor_si128(y1, sz1);

+    y0 = _mm_sub_epi16(y0, sz0);

+    y1 = _mm_sub_epi16(y1, sz1);

+    _mm_store_si128((__m128i *)(y), y0);

+    _mm_store_si128((__m128i *)(y + 8), y1);

+    zbin_boost_ptr = b->zrun_zbin_boost;

+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */

+    SELECT_EOB(1, 0);

+    SELECT_EOB(2, 1);

+    SELECT_EOB(3, 4);

+    SELECT_EOB(4, 8);

+    SELECT_EOB(5, 5);

+    SELECT_EOB(6, 2);

+    SELECT_EOB(7, 3);

+    SELECT_EOB(8, 6);

+    SELECT_EOB(9, 9);

+    SELECT_EOB(10, 12);

+    SELECT_EOB(11, 13);

+    SELECT_EOB(12, 10);

+    SELECT_EOB(13, 7);

+    SELECT_EOB(14, 11);

+    SELECT_EOB(15, 14);

+    SELECT_EOB(16, 15);

+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));

+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

+    /* dqcoeff = qcoeff * dequant */

+    y0 = _mm_mullo_epi16(y0, dequant0);

+    y1 = _mm_mullo_epi16(y1, dequant1);

+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);

+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

+    *d->eob = eob;

+}

+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)

+{

+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));

+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));

+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));

+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));

+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));

+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));

+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));

+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

+  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));

+  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

+  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

+  /* sign of z: z >> 15 */

+  sz0 = _mm_srai_epi16(z0, 15);

+  sz1 = _mm_srai_epi16(z1, 15);

+  /* x = abs(z): (z ^ sz) - sz */

+  x0 = _mm_xor_si128(z0, sz0);

+  x1 = _mm_xor_si128(z1, sz1);

+  x0 = _mm_sub_epi16(x0, sz0);

+  x1 = _mm_sub_epi16(x1, sz1);

+  /* x += round */

+  x0 = _mm_add_epi16(x0, round0);

+  x1 = _mm_add_epi16(x1, round1);

+  /* y = (x * quant) >> 16 */

+  y0 = _mm_mulhi_epi16(x0, quant_fast0);

+  y1 = _mm_mulhi_epi16(x1, quant_fast1);

+  /* x = abs(y) = (y ^ sz) - sz */

+  y0 = _mm_xor_si128(y0, sz0);

+  y1 = _mm_xor_si128(y1, sz1);

+  x0 = _mm_sub_epi16(y0, sz0);

+  x1 = _mm_sub_epi16(y1, sz1);

+  /* qcoeff = x */

+  _mm_store_si128((__m128i *)(d->qcoeff), x0);

+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

+  /* x * dequant */

+  xdq0 = _mm_mullo_epi16(x0, dequant0);

+  xdq1 = _mm_mullo_epi16(x1, dequant1);

+  /* dqcoeff = x * dequant */

+  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);

+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

+  /* build a mask for the zig zag */

+  zeros = _mm_setzero_si128();

+  x0 = _mm_cmpeq_epi16(x0, zeros);

+  x1 = _mm_cmpeq_epi16(x1, zeros);

+  ones = _mm_cmpeq_epi16(zeros, zeros);

+  x0 = _mm_xor_si128(x0, ones);

+  x1 = _mm_xor_si128(x1, ones);

+  x0 = _mm_and_si128(x0, inv_zig_zag0);

+  x1 = _mm_and_si128(x1, inv_zig_zag1);

+  x0 = _mm_max_epi16(x0, x1);

+  /* now down to 8 */

+  x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110

+  x0 = _mm_max_epi16(x0, x1);

+  /* only 4 left */

+  x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110

+  x0 = _mm_max_epi16(x0, x1);

+  /* okay, just 2! */

+  x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001

+  x0 = _mm_max_epi16(x0, x1);

+  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);

+}

--- a/vp8/vp8_cx_iface.c

+++ b/vp8/vp8_cx_iface.c

@@ -684,6 +684,8 @@

     yv12->u_buffer = img->planes[VPX_PLANE_U];

     yv12->v_buffer = img->planes[VPX_PLANE_V];

+    yv12->y_crop_width  = img->d_w;

+    yv12->y_crop_height = img->d_h;

     yv12->y_width  = img->d_w;

     yv12->y_height = img->d_h;

     yv12->uv_width = (1 + yv12->y_width) / 2;

--- a/vp8/vp8_dx_iface.c

+++ b/vp8/vp8_dx_iface.c

@@ -790,6 +790,8 @@

     yv12->u_buffer = img->planes[VPX_PLANE_U];

     yv12->v_buffer = img->planes[VPX_PLANE_V];

+    yv12->y_crop_width  = img->d_w;

+    yv12->y_crop_height = img->d_h;

     yv12->y_width  = img->d_w;

     yv12->y_height = img->d_h;

     yv12->uv_width = yv12->y_width / 2;

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -89,12 +89,12 @@

 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm

-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c

+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2_intrinsics.c

 # TODO(johann) make this generic

 ifeq ($(HAVE_SSE2),yes)

-vp8/encoder/x86/quantize_sse2.c.o: CFLAGS += -msse2

-vp8/encoder/x86/quantize_sse2.c.d: CFLAGS += -msse2

+vp8/encoder/x86/quantize_sse2_intrinsics.c.o: CFLAGS += -msse2

+vp8/encoder/x86/quantize_sse2_intrinsics.c.d: CFLAGS += -msse2

 endif

 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)

--- a/vp9/common/generic/vp9_systemdependent.c

+++ b/vp9/common/generic/vp9_systemdependent.c

@@ -11,8 +11,6 @@

 #include "./vpx_config.h"

 #include "vp9_rtcd.h"

-#include "vp9/common/vp9_subpixel.h"

-#include "vp9/common/vp9_loopfilter.h"

 #include "vp9/common/vp9_onyxc_int.h"

 void vp9_machine_specific_config(VP9_COMMON *ctx) {

--- /dev/null

+++ b/vp9/common/ppc/vp9_idct_altivec.asm

@@ -1,0 +1,189 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+    .globl short_idct4x4_ppc

+.macro load_c V, LABEL, OFF, R0, R1

+    lis     \R0, \LABEL@ha

+    la      \R1, \LABEL@l(\R0)

+    lvx     \V, \OFF, \R1

+.endm

+;# r3 short *input

+;# r4 short *output

+;# r5 int pitch

+    .align 2

+short_idct4x4_ppc:

+    mfspr   r11, 256            ;# get old VRSAVE

+    oris    r12, r11, 0xfff8

+    mtspr   256, r12            ;# set VRSAVE

+    load_c v8, sinpi8sqrt2, 0, r9, r10

+    load_c v9, cospi8sqrt2minus1, 0, r9, r10

+    load_c v10, hi_hi, 0, r9, r10

+    load_c v11, lo_lo, 0, r9, r10

+    load_c v12, shift_16, 0, r9, r10

+    li      r10,  16

+    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]

+    lvx     v1, r10, r3         ;# input ip[8], ip[12]

+    ;# first pass

+    vupkhsh v2, v0

+    vupkhsh v3, v1

+    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]

+    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]

+    vupklsh v0, v0

+    vmulosh v4, v0, v8

+    vsraw   v4, v4, v12

+    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)

+    vupklsh v1, v1

+    vmulosh v5, v1, v9

+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)

+    vaddsws v5, v5, v1

+    vsubsws v4, v4, v5          ;# c1

+    vmulosh v3, v1, v8

+    vsraw   v3, v3, v12

+    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)

+    vmulosh v5, v0, v9

+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)

+    vaddsws v5, v5, v0

+    vaddsws v3, v3, v5          ;# d1

+    vaddsws v0, v6, v3          ;# a1 + d1

+    vsubsws v3, v6, v3          ;# a1 - d1

+    vaddsws v1, v7, v4          ;# b1 + c1

+    vsubsws v2, v7, v4          ;# b1 - c1

+    ;# transpose input

+    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1

+    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1

+    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3

+    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3

+    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0

+    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1

+    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2

+    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3

+    ;# second pass

+    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]

+    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]

+    vmulosh v4, v1, v8

+    vsraw   v4, v4, v12

+    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)

+    vmulosh v5, v3, v9

+    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)

+    vaddsws v5, v5, v3

+    vsubsws v4, v4, v5          ;# c1

+    vmulosh v2, v3, v8

+    vsraw   v2, v2, v12

+    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)

+    vmulosh v5, v1, v9

+    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)

+    vaddsws v5, v5, v1

+    vaddsws v3, v2, v5          ;# d1

+    vaddsws v0, v6, v3          ;# a1 + d1

+    vsubsws v3, v6, v3          ;# a1 - d1

+    vaddsws v1, v7, v4          ;# b1 + c1

+    vsubsws v2, v7, v4          ;# b1 - c1

+    vspltish v6, 4

+    vspltish v7, 3

+    vpkswss v0, v0, v1

+    vpkswss v1, v2, v3

+    vaddshs v0, v0, v6

+    vaddshs v1, v1, v6

+    vsrah   v0, v0, v7

+    vsrah   v1, v1, v7

+    ;# transpose output

+    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3

+    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3

+    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1

+    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3

+    stwu    r1,-416(r1)         ;# create space on the stack

+    stvx    v0,  0, r1

+    lwz     r6, 0(r1)

+    stw     r6, 0(r4)

+    lwz     r6, 4(r1)

+    stw     r6, 4(r4)

+    add     r4, r4, r5

+    lwz     r6,  8(r1)

+    stw     r6,  0(r4)

+    lwz     r6, 12(r1)

+    stw     r6,  4(r4)

+    add     r4, r4, r5

+    stvx    v1,  0, r1

+    lwz     r6, 0(r1)

+    stw     r6, 0(r4)

+    lwz     r6, 4(r1)

+    stw     r6, 4(r4)

+    add     r4, r4, r5

+    lwz     r6,  8(r1)

+    stw     r6,  0(r4)

+    lwz     r6, 12(r1)

+    stw     r6,  4(r4)

+    addi    r1, r1, 416         ;# recover stack

+    mtspr   256, r11            ;# reset old VRSAVE

+    blr

+    .align 4

+sinpi8sqrt2:

+    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

+    .align 4

+cospi8sqrt2minus1:

+    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

+    .align 4

+shift_16:

+    .long      16,    16,    16,    16

+    .align 4

+hi_hi:

+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

+    .align 4

+lo_lo:

+    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31

--- a/vp9/common/ppc/vp9_idctllm_altivec.asm

+++ /dev/null

@@ -1,189 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-    .globl short_idct4x4llm_ppc

-.macro load_c V, LABEL, OFF, R0, R1

-    lis     \R0, \LABEL@ha

-    la      \R1, \LABEL@l(\R0)

-    lvx     \V, \OFF, \R1

-.endm

-;# r3 short *input

-;# r4 short *output

-;# r5 int pitch

-    .align 2

-short_idct4x4llm_ppc:

-    mfspr   r11, 256            ;# get old VRSAVE

-    oris    r12, r11, 0xfff8

-    mtspr   256, r12            ;# set VRSAVE

-    load_c v8, sinpi8sqrt2, 0, r9, r10

-    load_c v9, cospi8sqrt2minus1, 0, r9, r10

-    load_c v10, hi_hi, 0, r9, r10

-    load_c v11, lo_lo, 0, r9, r10

-    load_c v12, shift_16, 0, r9, r10

-    li      r10,  16

-    lvx     v0,   0, r3         ;# input ip[0], ip[ 4]

-    lvx     v1, r10, r3         ;# input ip[8], ip[12]

-    ;# first pass

-    vupkhsh v2, v0

-    vupkhsh v3, v1

-    vaddsws v6, v2, v3          ;# a1 = ip[0]+ip[8]

-    vsubsws v7, v2, v3          ;# b1 = ip[0]-ip[8]

-    vupklsh v0, v0

-    vmulosh v4, v0, v8

-    vsraw   v4, v4, v12

-    vaddsws v4, v4, v0          ;# ip[ 4] * sin(pi/8) * sqrt(2)

-    vupklsh v1, v1

-    vmulosh v5, v1, v9

-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)

-    vaddsws v5, v5, v1

-    vsubsws v4, v4, v5          ;# c1

-    vmulosh v3, v1, v8

-    vsraw   v3, v3, v12

-    vaddsws v3, v3, v1          ;# ip[12] * sin(pi/8) * sqrt(2)

-    vmulosh v5, v0, v9

-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)

-    vaddsws v5, v5, v0

-    vaddsws v3, v3, v5          ;# d1

-    vaddsws v0, v6, v3          ;# a1 + d1

-    vsubsws v3, v6, v3          ;# a1 - d1

-    vaddsws v1, v7, v4          ;# b1 + c1

-    vsubsws v2, v7, v4          ;# b1 - c1

-    ;# transpose input

-    vmrghw  v4, v0, v1          ;# a0 b0 a1 b1

-    vmrghw  v5, v2, v3          ;# c0 d0 c1 d1

-    vmrglw  v6, v0, v1          ;# a2 b2 a3 b3

-    vmrglw  v7, v2, v3          ;# c2 d2 c3 d3

-    vperm   v0, v4, v5, v10     ;# a0 b0 c0 d0

-    vperm   v1, v4, v5, v11     ;# a1 b1 c1 d1

-    vperm   v2, v6, v7, v10     ;# a2 b2 c2 d2

-    vperm   v3, v6, v7, v11     ;# a3 b3 c3 d3

-    ;# second pass

-    vaddsws v6, v0, v2          ;# a1 = ip[0]+ip[8]

-    vsubsws v7, v0, v2          ;# b1 = ip[0]-ip[8]

-    vmulosh v4, v1, v8

-    vsraw   v4, v4, v12

-    vaddsws v4, v4, v1          ;# ip[ 4] * sin(pi/8) * sqrt(2)

-    vmulosh v5, v3, v9

-    vsraw   v5, v5, v12         ;# ip[12] * cos(pi/8) * sqrt(2)

-    vaddsws v5, v5, v3

-    vsubsws v4, v4, v5          ;# c1

-    vmulosh v2, v3, v8

-    vsraw   v2, v2, v12

-    vaddsws v2, v2, v3          ;# ip[12] * sin(pi/8) * sqrt(2)

-    vmulosh v5, v1, v9

-    vsraw   v5, v5, v12         ;# ip[ 4] * cos(pi/8) * sqrt(2)

-    vaddsws v5, v5, v1

-    vaddsws v3, v2, v5          ;# d1

-    vaddsws v0, v6, v3          ;# a1 + d1

-    vsubsws v3, v6, v3          ;# a1 - d1

-    vaddsws v1, v7, v4          ;# b1 + c1

-    vsubsws v2, v7, v4          ;# b1 - c1

-    vspltish v6, 4

-    vspltish v7, 3

-    vpkswss v0, v0, v1

-    vpkswss v1, v2, v3

-    vaddshs v0, v0, v6

-    vaddshs v1, v1, v6

-    vsrah   v0, v0, v7

-    vsrah   v1, v1, v7

-    ;# transpose output

-    vmrghh  v2, v0, v1          ;# a0 c0 a1 c1 a2 c2 a3 c3

-    vmrglh  v3, v0, v1          ;# b0 d0 b1 d1 b2 d2 b3 d3

-    vmrghh  v0, v2, v3          ;# a0 b0 c0 d0 a1 b1 c1 d1

-    vmrglh  v1, v2, v3          ;# a2 b2 c2 d2 a3 b3 c3 d3

-    stwu    r1,-416(r1)         ;# create space on the stack

-    stvx    v0,  0, r1

-    lwz     r6, 0(r1)

-    stw     r6, 0(r4)

-    lwz     r6, 4(r1)

-    stw     r6, 4(r4)

-    add     r4, r4, r5

-    lwz     r6,  8(r1)

-    stw     r6,  0(r4)

-    lwz     r6, 12(r1)

-    stw     r6,  4(r4)

-    add     r4, r4, r5

-    stvx    v1,  0, r1

-    lwz     r6, 0(r1)

-    stw     r6, 0(r4)

-    lwz     r6, 4(r1)

-    stw     r6, 4(r4)

-    add     r4, r4, r5

-    lwz     r6,  8(r1)

-    stw     r6,  0(r4)

-    lwz     r6, 12(r1)

-    stw     r6,  4(r4)

-    addi    r1, r1, 416         ;# recover stack

-    mtspr   256, r11            ;# reset old VRSAVE

-    blr

-    .align 4

-sinpi8sqrt2:

-    .short  35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468

-    .align 4

-cospi8sqrt2minus1:

-    .short  20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091

-    .align 4

-shift_16:

-    .long      16,    16,    16,    16

-    .align 4

-hi_hi:

-    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

-    .align 4

-lo_lo:

-    .byte     8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31

--- a/vp9/common/ppc/vp9_systemdependent.c

+++ b/vp9/common/ppc/vp9_systemdependent.c

@@ -8,7 +8,6 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

-#include "vp9/common/vp9_subpixel.h"

 #include "vp9/common/vp9_loopfilter.h"

 #include "recon.h"

 #include "vp9/common/vp9_onyxc_int.h"

@@ -17,33 +16,29 @@

 void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);

 void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);

-extern void (*vp9_post_proc_down_and_across)(

-  unsigned char *src_ptr,

-  unsigned char *dst_ptr,

-  int src_pixels_per_line,

-  int dst_pixels_per_line,

-  int rows,

-  int cols,

-  int flimit

-);

+extern void (*vp9_post_proc_down_and_across)(unsigned char *src_ptr,

+                                             unsigned char *dst_ptr,

+                                             int src_pixels_per_line,

+                                             int dst_pixels_per_line,

+                                             int rows, int cols, int flimit);

-extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);

-extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);

-extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);

-extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);

+extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch,

+                                    int rows, int cols, int flimit);

+extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch,

+                                   int rows, int cols, int flimit);

+extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch,

+                                         int rows, int cols, int flimit);

+extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch,

+                                        int rows, int cols, int flimit);

+extern void vp9_post_proc_down_and_across_c(unsigned char *src_ptr,

+                                            unsigned char *dst_ptr,

+                                            int src_pixels_per_line,

+                                            int dst_pixels_per_line,

+                                            int rows, int cols, int flimit);

+void vp9_plane_add_noise_c(unsigned char *start,

+                           unsigned int width, unsigned int height,

+                           int pitch, int q, int a);

-extern void vp9_post_proc_down_and_across_c

-(

-  unsigned char *src_ptr,

-  unsigned char *dst_ptr,

-  int src_pixels_per_line,

-  int dst_pixels_per_line,

-  int rows,

-  int cols,

-  int flimit

-);

-void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);

 extern copy_mem_block_function *vp9_copy_mem16x16;

 extern copy_mem_block_function *vp9_copy_mem8x8;

 extern copy_mem_block_function *vp9_copy_mem8x4;

@@ -60,11 +55,14 @@

 extern copy_mem_block_function copy_mem16x16_ppc;

-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);

-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);

-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);

+void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr,

+                 unsigned char *dst_ptr, int stride);

+void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr,

+                 unsigned char *dst_ptr, int stride);

+void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr,

+                 unsigned char *dst_ptr, int stride);

-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);

+extern void short_idct4x4_ppc(short *input, short *output, int pitch);

 // Generic C

 extern subpixel_predict_function vp9_sixtap_predict_c;

@@ -80,12 +78,15 @@

 extern copy_mem_block_function vp9_copy_mem8x8_c;

 extern copy_mem_block_function vp9_copy_mem8x4_c;

-void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);

-void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);

-void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);

+void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr,

+                   unsigned char *dst_ptr, int stride);

+void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr,

+                   unsigned char *dst_ptr, int stride);

+void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr,

+                   unsigned char *dst_ptr, int stride);

-extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch);

-extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);

+extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch);

+extern void vp9_short_idct4x4_c(short *input, short *output, int pitch);

 extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);

 // PPC

@@ -140,8 +141,8 @@

   vp9_sixtap_predict8x4                = sixtap_predict8x4_ppc;

   vp9_sixtap_predict                   = sixtap_predict_ppc;

-  vp8_short_idct4x4_1                  = vp9_short_idct4x4llm_1_c;

-  vp8_short_idct4x4                    = short_idct4x4llm_ppc;

+  vp8_short_idct4x4_1                  = vp9_short_idct4x4_1_c;

+  vp8_short_idct4x4                    = short_idct4x4_ppc;

   vp8_dc_only_idct                      = vp8_dc_only_idct_c;

   vp8_lf_mbvfull                       = loop_filter_mbv_ppc;

--- a/vp9/common/vp9_alloccommon.c

+++ b/vp9/common/vp9_alloccommon.c

@@ -67,20 +67,16 @@

 int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) {

   int i;

+  int aligned_width, aligned_height;

   vp9_de_alloc_frame_buffers(oci);

   /* our internal buffers are always multiples of 16 */

-  if ((width & 0xf) != 0)

-    width += 16 - (width & 0xf);

+  aligned_width = (width + 15) & ~15;

+  aligned_height = (height + 15) & ~15;

-  if ((height & 0xf) != 0)

-    height += 16 - (height & 0xf);

   for (i = 0; i < NUM_YV12_BUFFERS; i++) {

     oci->fb_idx_ref_cnt[i] = 0;

-    oci->yv12_fb[i].flags = 0;

     if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height,

                                     VP9BORDERINPIXELS) < 0) {

       vp9_de_alloc_frame_buffers(oci);

@@ -88,16 +84,17 @@

-  oci->new_fb_idx = 0;

-  oci->lst_fb_idx = 1;

-  oci->gld_fb_idx = 2;

-  oci->alt_fb_idx = 3;

+  oci->new_fb_idx = NUM_YV12_BUFFERS - 1;

+  oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1;

-  oci->fb_idx_ref_cnt[0] = 1;

-  oci->fb_idx_ref_cnt[1] = 1;

-  oci->fb_idx_ref_cnt[2] = 1;

-  oci->fb_idx_ref_cnt[3] = 1;

+  for (i = 0; i < 3; i++)

+    oci->active_ref_idx[i] = i;

+  for (i = 0; i < NUM_REF_FRAMES; i++) {

+    oci->ref_frame_map[i] = i;

+    oci->fb_idx_ref_cnt[i] = 1;

+  }

   if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16,

                                   VP9BORDERINPIXELS) < 0) {

     vp9_de_alloc_frame_buffers(oci);

@@ -110,8 +107,8 @@

     return 1;

-  oci->mb_rows = height >> 4;

-  oci->mb_cols = width >> 4;

+  oci->mb_rows = aligned_height >> 4;

+  oci->mb_cols = aligned_width >> 4;

   oci->MBs = oci->mb_rows * oci->mb_cols;

   oci->mode_info_stride = oci->mb_cols + 1;

   oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));

@@ -134,7 +131,8 @@

   oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;

-  oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);

+  oci->above_context =

+    vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * (3 + oci->mb_cols), 1);

   if (!oci->above_context) {

     vp9_de_alloc_frame_buffers(oci);

@@ -146,6 +144,7 @@

   return 0;

 void vp9_setup_version(VP9_COMMON *cm) {

   if (cm->version & 0x4) {

     if (!CONFIG_EXPERIMENTAL)

@@ -204,9 +203,6 @@

   /* Initialise reference frame sign bias structure to defaults */

   vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));

-  /* Default disable buffer to buffer copying */

-  oci->copy_buffer_to_gf = 0;

-  oci->copy_buffer_to_arf = 0;

   oci->kf_ymode_probs_update = 0;

@@ -220,8 +216,4 @@

   vp9_entropy_mode_init();

   vp9_entropy_mv_init();

-#if CONFIG_NEWCOEFCONTEXT

-  vp9_init_neighbors();

-#endif

--- a/vp9/common/vp9_blockd.c

+++ b/vp9/common/vp9_blockd.c

@@ -12,15 +12,431 @@

 #include "vp9/common/vp9_blockd.h"

 #include "vpx_mem/vpx_mem.h"

-const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25] = {

-  {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8},

-  {0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},

-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},

-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}

+const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24] = {

+  { 0, 0, 0, 0,

+    1, 1, 1, 1,

+    2, 2, 2, 2,

+    3, 3, 3, 3,

+    4, 4,

+    5, 5,

+    6, 6,

+    7, 7 },

+  { 0, 0, 0, 0,

+    0, 0, 0, 0,

+    2, 2, 2, 2,

+    2, 2, 2, 2,

+    4, 4,

+    4, 4,

+    6, 6,

+    6, 6 },

+  { 0, 0, 0, 0,

+    0, 0, 0, 0,

+    0, 0, 0, 0,

+    0, 0, 0, 0 },

};

-const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25] = {

-  {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8},

-  {0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8},

-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8},

-  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 6, 6, 6, 6, 8}

+const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24] = {

+  { 0, 1, 2, 3,

+    0, 1, 2, 3,

+    0, 1, 2, 3,

+    0, 1, 2, 3,

+    4, 5,

+    4, 5,

+    6, 7,

+    6, 7 },

+  { 0, 0, 0, 0,

+    2, 2, 2, 2,

+    0, 0, 0, 0,

+    2, 2, 2, 2,

+    4, 4,

+    4, 4,

+    6, 6,

+    6, 6 },

+  { 0, 0, 0, 0,

+    0, 0, 0, 0,

+    0, 0, 0, 0,

+    0, 0, 0, 0 },

};

+#define S(x) x + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT)

+const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96] = {

+  { 0, 0, 0, 0, 0, 0, 0, 0,

+    1, 1, 1, 1, 1, 1, 1, 1,

+    2, 2, 2, 2, 2, 2, 2, 2,

+    3, 3, 3, 3, 3, 3, 3, 3,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),

+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),

+    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),

+    4, 4, 4, 4,

+    5, 5, 5, 5,

+    S(4), S(4), S(4), S(4),

+    S(5), S(5), S(5), S(5),

+    6, 6, 6, 6,

+    7, 7, 7, 7,

+    S(6), S(6), S(6), S(6),

+    S(7), S(7), S(7), S(7) },

+  { 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    2, 2, 2, 2, 2, 2, 2, 2,

+    2, 2, 2, 2, 2, 2, 2, 2,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),

+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),

+    4, 4, 4, 4,

+    4, 4, 4, 4,

+    S(4), S(4), S(4), S(4),

+    S(4), S(4), S(4), S(4),

+    6, 6, 6, 6,

+    6, 6, 6, 6,

+    S(6), S(6), S(6), S(6),

+    S(6), S(6), S(6), S(6) },

+  { 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    4, 4, 4, 4,

+    4, 4, 4, 4,

+    4, 4, 4, 4,

+    4, 4, 4, 4,

+    6, 6, 6, 6,

+    6, 6, 6, 6,

+    6, 6, 6, 6,

+    6, 6, 6, 6 },

+  { 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0 },

+};

+const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96] = {

+  { 0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3),

+    4, 5, S(4), S(5),

+    4, 5, S(4), S(5),

+    4, 5, S(4), S(5),

+    4, 5, S(4), S(5),

+    6, 7, S(6), S(7),

+    6, 7, S(6), S(7),

+    6, 7, S(6), S(7),

+    6, 7, S(6), S(7) },

+  { 0, 0, 0, 0, 2, 2, 2, 2,

+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    0, 0, 0, 0, 2, 2, 2, 2,

+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    0, 0, 0, 0, 2, 2, 2, 2,

+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    0, 0, 0, 0, 2, 2, 2, 2,

+    S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    4, 4, 4, 4,

+    S(4), S(4), S(4), S(4),

+    4, 4, 4, 4,

+    S(4), S(4), S(4), S(4),

+    6, 6, 6, 6,

+    S(6), S(6), S(6), S(6),

+    6, 6, 6, 6,

+    S(6), S(6), S(6), S(6) },

+  { 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    4, 4, 4, 4,

+    4, 4, 4, 4,

+    4, 4, 4, 4,

+    4, 4, 4, 4,

+    6, 6, 6, 6,

+    6, 6, 6, 6,

+    6, 6, 6, 6,

+    6, 6, 6, 6 },

+  { 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0 },

+};

+#define T(x) x + 2 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT))

+#define U(x) x + 3 * (sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT))

+const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384] = {

+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1), S(1),

+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),

+    S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3), S(3),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1),

+    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),

+    T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3), T(3),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1), U(1),

+    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),

+    U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3), U(3),

+    4, 4, 4, 4, 4, 4, 4, 4,

+    5, 5, 5, 5, 5, 5, 5, 5,

+    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),

+    S(5), S(5), S(5), S(5), S(5), S(5), S(5), S(5),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    T(5), T(5), T(5), T(5), T(5), T(5), T(5), T(5),

+    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),

+    U(5), U(5), U(5), U(5), U(5), U(5), U(5), U(5),

+    6, 6, 6, 6, 6, 6, 6, 6,

+    7, 7, 7, 7, 7, 7, 7, 7,

+    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),

+    S(7), S(7), S(7), S(7), S(7), S(7), S(7), S(7),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    T(7), T(7), T(7), T(7), T(7), T(7), T(7), T(7),

+    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),

+    U(7), U(7), U(7), U(7), U(7), U(7), U(7), U(7) },

+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),

+    S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),

+    T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2), T(2),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),

+    U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2), U(2),

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),

+    S(4), S(4), S(4), S(4), S(4), S(4), S(4), S(4),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),

+    U(4), U(4), U(4), U(4), U(4), U(4), U(4), U(4),

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),

+    S(6), S(6), S(6), S(6), S(6), S(6), S(6), S(6),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6),

+    U(6), U(6), U(6), U(6), U(6), U(6), U(6), U(6) },

+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },

+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6 },

+};

+const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384] = {

+  { 0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    0, 1, 2, 3, S(0), S(1), S(2), S(3), T(0), T(1), T(2), T(3), U(0), U(1), U(2), U(3),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    4, 5, S(4), S(5), T(4), T(5), U(4), U(5),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7),

+    6, 7, S(6), S(7), T(6), T(7), U(6), U(7) },

+  { 0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    0, 0, 0, 0, 2, 2, 2, 2, S(0), S(0), S(0), S(0), S(2), S(2), S(2), S(2),

+    T(0), T(0), T(0), T(0), T(2), T(2), T(2), T(2), U(0), U(0), U(0), U(0), U(2), U(2), U(2), U(2),

+    4, 4, 4, 4, S(4), S(4), S(4), S(4),

+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),

+    4, 4, 4, 4, S(4), S(4), S(4), S(4),

+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),

+    4, 4, 4, 4, S(4), S(4), S(4), S(4),

+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),

+    4, 4, 4, 4, S(4), S(4), S(4), S(4),

+    T(4), T(4), T(4), T(4), U(4), U(4), U(4), U(4),

+    6, 6, 6, 6, S(6), S(6), S(6), S(6),

+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),

+    6, 6, 6, 6, S(6), S(6), S(6), S(6),

+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),

+    6, 6, 6, 6, S(6), S(6), S(6), S(6),

+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6),

+    6, 6, 6, 6, S(6), S(6), S(6), S(6),

+    T(6), T(6), T(6), T(6), U(6), U(6), U(6), U(6) },

+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0), S(0),

+    T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0), T(0),

+    U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0), U(0),

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    T(4), T(4), T(4), T(4), T(4), T(4), T(4), T(4),

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6),

+    T(6), T(6), T(6), T(6), T(6), T(6), T(6), T(6) },

+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    4, 4, 4, 4, 4, 4, 4, 4,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 6, 6, 6, 6 },

+};

+#undef U

+#undef T

+#undef S

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -16,9 +16,9 @@

 #include "./vpx_config.h"

 #include "vpx_scale/yv12config.h"

+#include "vp9/common/vp9_convolve.h"

 #include "vp9/common/vp9_mv.h"

 #include "vp9/common/vp9_treecoder.h"

-#include "vp9/common/vp9_subpixel.h"

 #include "vpx_ports/mem.h"

 #include "vp9/common/vp9_common.h"

@@ -47,27 +47,13 @@

 #define MAX_MV_REFS 9

 #define MAX_MV_REF_CANDIDATES 4

-#if CONFIG_DWTDCTHYBRID

-#define DWT_MAX_LENGTH     64

-#define DWT_TYPE           26    // 26/53/97

-#define DWT_PRECISION_BITS 2

-#define DWT_PRECISION_RND  ((1 << DWT_PRECISION_BITS) / 2)

-#define DWTDCT16X16        0

-#define DWTDCT16X16_LEAN   1

-#define DWTDCT8X8          2

-#define DWTDCT_TYPE        DWTDCT16X16_LEAN

-#endif

 typedef struct {

   int r, c;

 } POS;

-typedef enum PlaneType {

-  PLANE_TYPE_Y_NO_DC = 0,

-  PLANE_TYPE_Y2,

-  PLANE_TYPE_UV,

+typedef enum {

   PLANE_TYPE_Y_WITH_DC,

+  PLANE_TYPE_UV,

 } PLANE_TYPE;

 typedef char ENTROPY_CONTEXT;

@@ -75,10 +61,9 @@

   ENTROPY_CONTEXT y1[4];

   ENTROPY_CONTEXT u[2];

   ENTROPY_CONTEXT v[2];

-  ENTROPY_CONTEXT y2;

 } ENTROPY_CONTEXT_PLANES;

-#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \

+#define VP9_COMBINEENTROPYCONTEXTS(Dest, A, B) \

   Dest = ((A)!=0) + ((B)!=0);

 typedef enum {

@@ -86,8 +71,7 @@

   INTER_FRAME = 1

 } FRAME_TYPE;

-typedef enum

-{

+typedef enum {

 #if CONFIG_ENABLE_6TAP

   SIXTAP,

 #endif

@@ -98,8 +82,7 @@

   SWITCHABLE  /* should be the last one */

 } INTERPOLATIONFILTERTYPE;

-typedef enum

-{

+typedef enum {

   DC_PRED,            /* average of above and left pixels */

   V_PRED,             /* vertical prediction */

   H_PRED,             /* horizontal prediction */

@@ -125,10 +108,8 @@

   SEG_LVL_ALT_Q = 0,               // Use alternate Quantizer ....

   SEG_LVL_ALT_LF = 1,              // Use alternate loop filter value...

   SEG_LVL_REF_FRAME = 2,           // Optional Segment reference frame

-  SEG_LVL_MODE = 3,                // Optional Segment mode

-  SEG_LVL_EOB = 4,                 // EOB end stop marker.

-  SEG_LVL_TRANSFORM = 5,           // Block transform size.

-  SEG_LVL_MAX = 6                  // Number of MB level features supported

+  SEG_LVL_SKIP = 3,                // Optional Segment (0,0) + skip mode

+  SEG_LVL_MAX = 4                  // Number of MB level features supported

 } SEG_LVL_FEATURES;

 // Segment level features.

@@ -155,10 +136,7 @@

 #define VP9_MVREFS (1 + SPLITMV - NEARESTMV)

-#if CONFIG_LOSSLESS

-#define WHT_UPSCALE_FACTOR 3

-#define Y2_WHT_UPSCALE_FACTOR 2

-#endif

+#define WHT_UPSCALE_FACTOR 2

 typedef enum {

   B_DC_PRED,          /* average of above and left pixels */

@@ -219,10 +197,7 @@

     B_PREDICTION_MODE context;

 #endif

   } as_mode;

-  struct {

-    int_mv first;

-    int_mv second;

-  } as_mv;

+  int_mv as_mv[2];  // first, second inter predictor motion vectors

};

 typedef enum {

@@ -274,6 +249,9 @@

   INTERPOLATIONFILTERTYPE interp_filter;

   BLOCK_SIZE_TYPE sb_type;

+#if CONFIG_CODE_NONZEROCOUNT

+  uint16_t nzcs[256+64*2];

+#endif

 } MB_MODE_INFO;

 typedef struct {

@@ -298,36 +276,44 @@

   int dst;

   int dst_stride;

-  int eob;

   union b_mode_info bmi;

 } BLOCKD;

-typedef struct superblockd {

-  /* 32x32 Y and 16x16 U/V. No 2nd order transform yet. */

-  DECLARE_ALIGNED(16, int16_t, diff[32*32+16*16*2]);

-  DECLARE_ALIGNED(16, int16_t, qcoeff[32*32+16*16*2]);

-  DECLARE_ALIGNED(16, int16_t, dqcoeff[32*32+16*16*2]);

-} SUPERBLOCKD;

+struct scale_factors {

+  int x_num;

+  int x_den;

+  int x_offset_q4;

+  int x_step_q4;

+  int y_num;

+  int y_den;

+  int y_offset_q4;

+  int y_step_q4;

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+  convolve_fn_t predict[2][2][8];  // horiz, vert, weight (0 - 7)

+#else

+  convolve_fn_t predict[2][2][2];  // horiz, vert, avg

+#endif

+};

 typedef struct macroblockd {

-  DECLARE_ALIGNED(16, int16_t,  diff[400]);      /* from idct diff */

-  DECLARE_ALIGNED(16, uint8_t,  predictor[384]);

-  DECLARE_ALIGNED(16, int16_t,  qcoeff[400]);

-  DECLARE_ALIGNED(16, int16_t,  dqcoeff[400]);

-  DECLARE_ALIGNED(16, uint16_t, eobs[25]);

+  DECLARE_ALIGNED(16, int16_t,  diff[64*64+32*32*2]);      /* from idct diff */

+  DECLARE_ALIGNED(16, uint8_t,  predictor[384]);  // unused for superblocks

+  DECLARE_ALIGNED(16, int16_t,  qcoeff[64*64+32*32*2]);

+  DECLARE_ALIGNED(16, int16_t,  dqcoeff[64*64+32*32*2]);

+  DECLARE_ALIGNED(16, uint16_t, eobs[256+64*2]);

+#if CONFIG_CODE_NONZEROCOUNT

+  DECLARE_ALIGNED(16, uint16_t, nzcs[256+64*2]);

+#endif

-  SUPERBLOCKD sb_coeff_data;

-  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */

-  BLOCKD block[25];

+  /* 16 Y blocks, 4 U, 4 V, each with 16 entries. */

+  BLOCKD block[24];

   int fullpixel_mask;

   YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */

-  struct {

-    uint8_t *y_buffer, *u_buffer, *v_buffer;

-  } second_pre;

+  YV12_BUFFER_CONFIG second_pre;

   YV12_BUFFER_CONFIG dst;

+  struct scale_factors scale_factor[2];

+  struct scale_factors scale_factor_uv[2];

   MODE_INFO *prev_mode_info_context;

   MODE_INFO *mode_info_context;

@@ -337,8 +323,9 @@

   int up_available;

   int left_available;

+  int right_available;

-  /* Y,U,V,Y2 */

+  /* Y,U,V */

   ENTROPY_CONTEXT_PLANES *above_context;

   ENTROPY_CONTEXT_PLANES *left_context;

@@ -359,6 +346,7 @@

   // Probability Tree used to code Segment number

   vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];

+  vp9_prob mb_segment_mispred_tree_probs[MAX_MB_SEGMENTS];

 #if CONFIG_NEW_MVREF

   vp9_prob mb_mv_ref_probs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES-1];

@@ -387,21 +375,20 @@

   unsigned int frames_since_golden;

   unsigned int frames_till_alt_ref_frame;

+  int lossless;

   /* Inverse transform function pointers. */

-  void (*inv_xform4x4_1_x8)(int16_t *input, int16_t *output, int pitch);

-  void (*inv_xform4x4_x8)(int16_t *input, int16_t *output, int pitch);

-  void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);

-  void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);

+  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);

+  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);

+  void (*itxm_add)(int16_t *input, const int16_t *dq,

+    uint8_t *pred, uint8_t *output, int pitch, int stride, int eob);

+  void (*itxm_add_y_block)(int16_t *q, const int16_t *dq,

+    uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd);

+  void (*itxm_add_uv_block)(int16_t *q, const int16_t *dq,

+    uint8_t *pre, uint8_t *dst_u, uint8_t *dst_v, int stride,

+    struct macroblockd *xd);

+  struct subpix_fn_table  subpix;

-  vp9_subpix_fn_t  subpixel_predict4x4;

-  vp9_subpix_fn_t  subpixel_predict8x4;

-  vp9_subpix_fn_t  subpixel_predict8x8;

-  vp9_subpix_fn_t  subpixel_predict16x16;

-  vp9_subpix_fn_t  subpixel_predict_avg4x4;

-  vp9_subpix_fn_t  subpixel_predict_avg8x4;

-  vp9_subpix_fn_t  subpixel_predict_avg8x8;

-  vp9_subpix_fn_t  subpixel_predict_avg16x16;

   int allow_high_precision_mv;

   int corrupted;

@@ -412,74 +399,46 @@

 } MACROBLOCKD;

-#define ACTIVE_HT 110                // quantization stepsize threshold

+#define ACTIVE_HT   110                // quantization stepsize threshold

-#define ACTIVE_HT8 300

+#define ACTIVE_HT8  300

 #define ACTIVE_HT16 300

 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE

 static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {

-  B_PREDICTION_MODE b_mode;

   switch (mode) {

-    case DC_PRED:

-      b_mode = B_DC_PRED;

-      break;

-    case V_PRED:

-      b_mode = B_VE_PRED;

-      break;

-    case H_PRED:

-      b_mode = B_HE_PRED;

-      break;

-    case TM_PRED:

-      b_mode = B_TM_PRED;

-      break;

-    case D45_PRED:

-      b_mode = B_LD_PRED;

-      break;

-    case D135_PRED:

-      b_mode = B_RD_PRED;

-      break;

-    case D117_PRED:

-      b_mode = B_VR_PRED;

-      break;

-    case D153_PRED:

-      b_mode = B_HD_PRED;

-      break;

-    case D27_PRED:

-      b_mode = B_HU_PRED;

-      break;

-    case D63_PRED:

-      b_mode = B_VL_PRED;

-      break;

-    default :

-      // for debug purpose, to be removed after full testing

-      assert(0);

-      break;

+    case DC_PRED: return B_DC_PRED;

+    case V_PRED: return B_VE_PRED;

+    case H_PRED: return B_HE_PRED;

+    case TM_PRED: return B_TM_PRED;

+    case D45_PRED: return B_LD_PRED;

+    case D135_PRED: return B_RD_PRED;

+    case D117_PRED: return B_VR_PRED;

+    case D153_PRED: return B_HD_PRED;

+    case D27_PRED: return B_HU_PRED;

+    case D63_PRED: return B_VL_PRED;

+    default:

+       assert(0);

+       return B_MODE_COUNT;  // Dummy value

-  return b_mode;

 // transform mapping

 static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) {

-  // map transform type

-  TX_TYPE tx_type;

   switch (bmode) {

     case B_TM_PRED :

     case B_RD_PRED :

-      tx_type = ADST_ADST;

-      break;

+      return ADST_ADST;

     case B_VE_PRED :

     case B_VR_PRED :

-      tx_type = ADST_DCT;

-      break;

+      return ADST_DCT;

     case B_HE_PRED :

     case B_HD_PRED :

     case B_HU_PRED :

-      tx_type = DCT_ADST;

-      break;

+      return DCT_ADST;

 #if CONFIG_NEWBINTRAMODES

     case B_CONTEXT_PRED:

@@ -487,33 +446,41 @@

       break;

 #endif

-    default :

-      tx_type = DCT_DCT;

-      break;

+    default:

+      return DCT_DCT;

-  return tx_type;

-extern const uint8_t vp9_block2left[TX_SIZE_MAX_SB][25];

-extern const uint8_t vp9_block2above[TX_SIZE_MAX_SB][25];

+extern const uint8_t vp9_block2left[TX_SIZE_MAX_MB][24];

+extern const uint8_t vp9_block2above[TX_SIZE_MAX_MB][24];

+extern const uint8_t vp9_block2left_sb[TX_SIZE_MAX_SB][96];

+extern const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96];

+extern const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384];

+extern const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384];

-#define USE_ADST_FOR_I16X16_8X8   0

-#define USE_ADST_FOR_I16X16_4X4   0

+#define USE_ADST_FOR_I16X16_8X8   1

+#define USE_ADST_FOR_I16X16_4X4   1

 #define USE_ADST_FOR_I8X8_4X4     1

 #define USE_ADST_PERIPHERY_ONLY   1

+#define USE_ADST_FOR_SB           1

+#define USE_ADST_FOR_REMOTE_EDGE  0

-static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) {

+static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) {

   // TODO(debargha): explore different patterns for ADST usage when blocksize

   // is smaller than the prediction size

   TX_TYPE tx_type = DCT_DCT;

-  int ib = (int)(b - xd->block);

-  if (ib >= 16)

+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+#if !USE_ADST_FOR_SB

+  if (sb_type)

     return tx_type;

-  // TODO(rbultje, debargha): Explore ADST usage for superblocks

-  if (xd->mode_info_context->mbmi.sb_type)

+#endif

+  if (ib >= (16 << (2 * sb_type)))  // no chroma adst

     return tx_type;

+  if (xd->lossless)

+    return DCT_DCT;

   if (xd->mode_info_context->mbmi.mode == B_PRED &&

       xd->q_index < ACTIVE_HT) {

+    const BLOCKD *b = &xd->block[ib];

     tx_type = txfm_map(

 #if CONFIG_NEWBINTRAMODES

         b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context :

@@ -521,16 +488,32 @@

         b->bmi.as_mode.first);

   } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&

              xd->q_index < ACTIVE_HT) {

+    const BLOCKD *b = &xd->block[ib];

+    const int ic = (ib & 10);

 #if USE_ADST_FOR_I8X8_4X4

 #if USE_ADST_PERIPHERY_ONLY

     // Use ADST for periphery blocks only

-    int ic = (ib & 10);

+    const int inner = ib & 5;

     b += ic - ib;

-    tx_type = (ic != 10) ?

-         txfm_map(pred_mode_conv((MB_PREDICTION_MODE)b->bmi.as_mode.first)) :

-         DCT_DCT;

+    tx_type = txfm_map(pred_mode_conv(

+        (MB_PREDICTION_MODE)b->bmi.as_mode.first));

+#if USE_ADST_FOR_REMOTE_EDGE

+    if (inner == 5)

+      tx_type = DCT_DCT;

 #else

+    if (inner == 1) {

+      if (tx_type == ADST_ADST) tx_type = ADST_DCT;

+      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;

+    } else if (inner == 4) {

+      if (tx_type == ADST_ADST) tx_type = DCT_ADST;

+      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;

+    } else if (inner == 5) {

+      tx_type = DCT_DCT;

+    }

+#endif

+#else

     // Use ADST

+    b += ic - ib;

     tx_type = txfm_map(pred_mode_conv(

         (MB_PREDICTION_MODE)b->bmi.as_mode.first));

 #endif

@@ -542,10 +525,23 @@

              xd->q_index < ACTIVE_HT) {

 #if USE_ADST_FOR_I16X16_4X4

 #if USE_ADST_PERIPHERY_ONLY

-    // Use ADST for periphery blocks only

-    tx_type = (ib < 4 || ((ib & 3) == 0)) ?

-        txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT;

+    const int hmax = 4 << sb_type;

+    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));

+#if USE_ADST_FOR_REMOTE_EDGE

+    if ((ib & (hmax - 1)) != 0 && ib >= hmax)

+      tx_type = DCT_DCT;

 #else

+    if (ib >= 1 && ib < hmax) {

+      if (tx_type == ADST_ADST) tx_type = ADST_DCT;

+      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;

+    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {

+      if (tx_type == ADST_ADST) tx_type = DCT_ADST;

+      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;

+    } else if (ib != 0) {

+      tx_type = DCT_DCT;

+    }

+#endif

+#else

     // Use ADST

     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));

 #endif

@@ -557,18 +553,20 @@

   return tx_type;

-static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) {

+static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) {

   // TODO(debargha): explore different patterns for ADST usage when blocksize

   // is smaller than the prediction size

   TX_TYPE tx_type = DCT_DCT;

-  int ib = (int)(b - xd->block);

-  if (ib >= 16)

+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+#if !USE_ADST_FOR_SB

+  if (sb_type)

     return tx_type;

-  // TODO(rbultje, debargha): Explore ADST usage for superblocks

-  if (xd->mode_info_context->mbmi.sb_type)

+#endif

+  if (ib >= (16 << (2 * sb_type)))  // no chroma adst

     return tx_type;

   if (xd->mode_info_context->mbmi.mode == I8X8_PRED &&

       xd->q_index < ACTIVE_HT8) {

+    const BLOCKD *b = &xd->block[ib];

     // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged

     // or the relationship otherwise modified to address this type conversion.

     tx_type = txfm_map(pred_mode_conv(

@@ -575,12 +573,25 @@

            (MB_PREDICTION_MODE)b->bmi.as_mode.first));

   } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&

              xd->q_index < ACTIVE_HT8) {

-#if USE_ADST_FOR_I8X8_4X4

+#if USE_ADST_FOR_I16X16_8X8

 #if USE_ADST_PERIPHERY_ONLY

-    // Use ADST for periphery blocks only

-    tx_type = (ib != 10) ?

-        txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT;

+    const int hmax = 4 << sb_type;

+    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));

+#if USE_ADST_FOR_REMOTE_EDGE

+    if ((ib & (hmax - 1)) != 0 && ib >= hmax)

+      tx_type = DCT_DCT;

 #else

+    if (ib >= 1 && ib < hmax) {

+      if (tx_type == ADST_ADST) tx_type = ADST_DCT;

+      else if (tx_type == DCT_ADST) tx_type = DCT_DCT;

+    } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {

+      if (tx_type == ADST_ADST) tx_type = DCT_ADST;

+      else if (tx_type == ADST_DCT) tx_type = DCT_DCT;

+    } else if (ib != 0) {

+      tx_type = DCT_DCT;

+    }

+#endif

+#else

     // Use ADST

     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));

 #endif

@@ -592,63 +603,73 @@

   return tx_type;

-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {

+static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) {

   TX_TYPE tx_type = DCT_DCT;

-  int ib = (int)(b - xd->block);

-  if (ib >= 16)

+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+#if !USE_ADST_FOR_SB

+  if (sb_type)

     return tx_type;

-  // TODO(rbultje, debargha): Explore ADST usage for superblocks

-  if (xd->mode_info_context->mbmi.sb_type)

+#endif

+  if (ib >= (16 << (2 * sb_type)))

     return tx_type;

   if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&

       xd->q_index < ACTIVE_HT16) {

     tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));

+#if USE_ADST_PERIPHERY_ONLY

+    if (sb_type) {

+      const int hmax = 4 << sb_type;

+#if USE_ADST_FOR_REMOTE_EDGE

+      if ((ib & (hmax - 1)) != 0 && ib >= hmax)

+        tx_type = DCT_DCT;

+#else

+      if (ib >= 1 && ib < hmax) {

+        if (tx_type == ADST_ADST) tx_type = ADST_DCT;

+        else if (tx_type == DCT_ADST) tx_type = DCT_DCT;

+      } else if (ib >= 1 && (ib & (hmax - 1)) == 0) {

+        if (tx_type == ADST_ADST) tx_type = DCT_ADST;

+        else if (tx_type == ADST_DCT) tx_type = DCT_DCT;

+      } else if (ib != 0) {

+        tx_type = DCT_DCT;

+      }

+#endif

+    }

+#endif

   return tx_type;

-static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {

-  TX_TYPE tx_type = DCT_DCT;

-  int ib = (int)(b - xd->block);

-  if (ib >= 16)

-    return tx_type;

-  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {

-    tx_type = get_tx_type_16x16(xd, b);

-  }

-  if (xd->mode_info_context->mbmi.txfm_size  == TX_8X8) {

-    ib = (ib & 8) + ((ib & 4) >> 1);

-    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);

-  }

-  if (xd->mode_info_context->mbmi.txfm_size  == TX_4X4) {

-    tx_type = get_tx_type_4x4(xd, b);

-  }

-  return tx_type;

-}

+void vp9_build_block_doffsets(MACROBLOCKD *xd);

+void vp9_setup_block_dptrs(MACROBLOCKD *xd);

-static int get_2nd_order_usage(const MACROBLOCKD *xd) {

-  int has_2nd_order = (xd->mode_info_context->mbmi.mode != SPLITMV &&

-                       xd->mode_info_context->mbmi.mode != I8X8_PRED &&

-                       xd->mode_info_context->mbmi.mode != B_PRED &&

-                       xd->mode_info_context->mbmi.txfm_size != TX_16X16);

-  if (has_2nd_order)

-    has_2nd_order = (get_tx_type(xd, xd->block) == DCT_DCT);

-  return has_2nd_order;

-}

-extern void vp9_build_block_doffsets(MACROBLOCKD *xd);

-extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);

 static void update_blockd_bmi(MACROBLOCKD *xd) {

-  int i;

-  int is_4x4;

-  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||

-           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||

-           (xd->mode_info_context->mbmi.mode == B_PRED);

+  const MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;

-  if (is_4x4) {

-    for (i = 0; i < 16; i++) {

+  if (mode == SPLITMV || mode == I8X8_PRED || mode == B_PRED) {

+    int i;

+    for (i = 0; i < 16; i++)

       xd->block[i].bmi = xd->mode_info_context->bmi[i];

-    }

+}

+static TX_SIZE get_uv_tx_size(const MACROBLOCKD *xd) {

+  TX_SIZE tx_size_uv;

+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

+    tx_size_uv = xd->mode_info_context->mbmi.txfm_size;

+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {

+    if (xd->mode_info_context->mbmi.txfm_size == TX_32X32)

+      tx_size_uv = TX_16X16;

+    else

+      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;

+  } else {

+    if (xd->mode_info_context->mbmi.txfm_size == TX_16X16)

+      tx_size_uv = TX_8X8;

+    else if (xd->mode_info_context->mbmi.txfm_size == TX_8X8 &&

+             (xd->mode_info_context->mbmi.mode == I8X8_PRED ||

+              xd->mode_info_context->mbmi.mode == SPLITMV))

+      tx_size_uv = TX_4X4;

+    else

+      tx_size_uv = xd->mode_info_context->mbmi.txfm_size;

+  }

+  return tx_size_uv;

 #endif  // VP9_COMMON_VP9_BLOCKD_H_

--- a/vp9/common/vp9_coefupdateprobs.h

+++ b/vp9/common/vp9_coefupdateprobs.h

@@ -9,12 +9,25 @@

*/

 #ifndef VP9_COMMON_VP9_COEFUPDATEPROBS_H_

-#define VP9_COMMON_VP9_COEFUPDATEPROBS_H__

+#define VP9_COMMON_VP9_COEFUPDATEPROBS_H_

 /* Update probabilities for the nodes in the token entropy tree.

    Generated file included by vp9_entropy.c */

-#define COEF_UPDATE_PROB 252

-#define COEF_UPDATE_PROB_8X8 252

-#define COEF_UPDATE_PROB_16X16 252

+static const vp9_prob vp9_coef_update_prob[ENTROPY_NODES] = {

+  252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252

+};

+#if CONFIG_CODE_NONZEROCOUNT

+#define NZC_UPDATE_PROB_4X4     252

+#define NZC_UPDATE_PROB_8X8     252

+#define NZC_UPDATE_PROB_16X16   252

+#define NZC_UPDATE_PROB_32X32   252

+#define NZC_UPDATE_PROB_PCAT    252

+#endif

+#if CONFIG_MODELCOEFPROB

+#define COEF_MODEL_UPDATE_PROB   16

+#endif

 #endif  // VP9_COMMON_VP9_COEFUPDATEPROBS_H__

--- a/vp9/common/vp9_common.h

+++ b/vp9/common/vp9_common.h

@@ -11,10 +11,11 @@

 #ifndef VP9_COMMON_VP9_COMMON_H_

 #define VP9_COMMON_VP9_COMMON_H_

-#include <assert.h>

-#include "vpx_config.h"

 /* Interface header for common constant data structures and lookup tables */

+#include <assert.h>

+#include "./vpx_config.h"

 #include "vpx_mem/vpx_mem.h"

 #include "vpx/vpx_integer.h"

@@ -24,26 +25,34 @@

 #define MIN(x, y) (((x) < (y)) ? (x) : (y))

 #define MAX(x, y) (((x) > (y)) ? (x) : (y))

-/* Only need this for fixed-size arrays, for structs just assign. */

+#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

-#define vp9_copy(Dest, Src) { \

-    assert(sizeof(Dest) == sizeof(Src)); \

-    vpx_memcpy(Dest, Src, sizeof(Src)); \

+/* If we don't want to use ROUND_POWER_OF_TWO macro

+static INLINE int16_t round_power_of_two(int16_t value, int n) {

+  return (value + (1 << (n - 1))) >> n;

+}*/

+// Only need this for fixed-size arrays, for structs just assign.

+#define vp9_copy(dest, src) {            \

+    assert(sizeof(dest) == sizeof(src)); \

+    vpx_memcpy(dest, src, sizeof(src));  \

-/* Use this for variably-sized arrays. */

-#define vp9_copy_array(Dest, Src, N) { \

-    assert(sizeof(*Dest) == sizeof(*Src)); \

-    vpx_memcpy(Dest, Src, N * sizeof(*Src)); \

+// Use this for variably-sized arrays.

+#define vp9_copy_array(dest, src, n) {       \

+    assert(sizeof(*dest) == sizeof(*src));   \

+    vpx_memcpy(dest, src, n * sizeof(*src)); \

-#define vp9_zero(Dest) vpx_memset(&Dest, 0, sizeof(Dest));

+#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest));

+#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest));

-#define vp9_zero_array(Dest, N) vpx_memset(Dest, 0, N * sizeof(*Dest));

-static __inline uint8_t clip_pixel(int val) {

+static INLINE uint8_t clip_pixel(int val) {

   return (val > 255) ? 255u : (val < 0) ? 0u : val;

+}

+static INLINE int clamp(int value, int low, int high) {

+  return value < low ? low : (value > high ? high : value);

 #endif  // VP9_COMMON_VP9_COMMON_H_

--- /dev/null

+++ b/vp9/common/vp9_convolve.c

@@ -1,0 +1,850 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vp9/common/vp9_convolve.h"

+#include <assert.h>

+#include "./vpx_config.h"

+#include "./vp9_rtcd.h"

+#include "vp9/common/vp9_common.h"

+#include "vpx/vpx_integer.h"

+#include "vpx_ports/mem.h"

+#define VP9_FILTER_WEIGHT 128

+#define VP9_FILTER_SHIFT  7

+/* Assume a bank of 16 filters to choose from. There are two implementations

+ * for filter wrapping behavior, since we want to be able to pick which filter

+ * to start with. We could either:

+ *

+ * 1) make filter_ a pointer to the base of the filter array, and then add an

+ *    additional offset parameter, to choose the starting filter.

+ * 2) use a pointer to 2 periods worth of filters, so that even if the original

+ *    phase offset is at 15/16, we'll have valid data to read. The filter

+ *    tables become [32][8], and the second half is duplicated.

+ * 3) fix the alignment of the filter tables, so that we know the 0/16 is

+ *    always 256 byte aligned.

+ *

+ * Implementations 2 and 3 are likely preferable, as they avoid an extra 2

+ * parameters, and switching between them is trivial, with the

+ * ALIGN_FILTERS_256 macro, below.

+ */

+ #define ALIGN_FILTERS_256 1

+static void convolve_horiz_c(const uint8_t *src, int src_stride,

+                             uint8_t *dst, int dst_stride,

+                             const int16_t *filter_x0, int x_step_q4,

+                             const int16_t *filter_y, int y_step_q4,

+                             int w, int h, int taps) {

+  int x, y, k, sum;

+  const int16_t *filter_x_base = filter_x0;

+#if ALIGN_FILTERS_256

+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

+#endif

+  /* Adjust base pointer address for this source line */

+  src -= taps / 2 - 1;

+  for (y = 0; y < h; ++y) {

+    /* Pointer to filter to use */

+    const int16_t *filter_x = filter_x0;

+    /* Initial phase offset */

+    int x0_q4 = (filter_x - filter_x_base) / taps;

+    int x_q4 = x0_q4;

+    for (x = 0; x < w; ++x) {

+      /* Per-pixel src offset */

+      int src_x = (x_q4 - x0_q4) >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[src_x + k] * filter_x[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);

+      dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);

+      /* Adjust source and filter to use for the next pixel */

+      x_q4 += x_step_q4;

+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,

+                                 uint8_t *dst, int dst_stride,

+                                 const int16_t *filter_x0, int x_step_q4,

+                                 const int16_t *filter_y, int y_step_q4,

+                                 int w, int h, int taps) {

+  int x, y, k, sum;

+  const int16_t *filter_x_base = filter_x0;

+#if ALIGN_FILTERS_256

+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

+#endif

+  /* Adjust base pointer address for this source line */

+  src -= taps / 2 - 1;

+  for (y = 0; y < h; ++y) {

+    /* Pointer to filter to use */

+    const int16_t *filter_x = filter_x0;

+    /* Initial phase offset */

+    int x0_q4 = (filter_x - filter_x_base) / taps;

+    int x_q4 = x0_q4;

+    for (x = 0; x < w; ++x) {

+      /* Per-pixel src offset */

+      int src_x = (x_q4 - x0_q4) >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[src_x + k] * filter_x[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);

+      dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

+      /* Adjust source and filter to use for the next pixel */

+      x_q4 += x_step_q4;

+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+static inline uint8_t combine_qtr(uint8_t a, uint8_t b) {

+  return (((a) + (b) * 3 + 2) >> 2);

+}

+static inline uint8_t combine_3qtr(uint8_t a, uint8_t b) {

+  return (((a) * 3 + (b) + 2) >> 2);

+}

+static inline uint8_t combine_1by8(uint8_t a, uint8_t b) {

+  return (((a) * 1 + (b) * 7 + 4) >> 3);

+}

+static inline uint8_t combine_3by8(uint8_t a, uint8_t b) {

+  return (((a) * 3 + (b) * 5 + 4) >> 3);

+}

+static inline uint8_t combine_5by8(uint8_t a, uint8_t b) {

+  return (((a) * 5 + (b) * 3 + 4) >> 3);

+}

+static inline uint8_t combine_7by8(uint8_t a, uint8_t b) {

+  return (((a) * 7 + (b) * 1 + 4) >> 3);

+}

+// TODO(debargha): Implment with a separate weight parameter

+static void convolve_wtd_horiz_c(const uint8_t *src, int src_stride,

+                                 uint8_t *dst, int dst_stride,

+                                 const int16_t *filter_x0, int x_step_q4,

+                                 const int16_t *filter_y, int y_step_q4,

+                                 int w, int h, int taps,

+                                 uint8_t (*combine)(uint8_t a, uint8_t b)) {

+  int x, y, k, sum;

+  const int16_t *filter_x_base = filter_x0;

+#if ALIGN_FILTERS_256

+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

+#endif

+  /* Adjust base pointer address for this source line */

+  src -= taps / 2 - 1;

+  for (y = 0; y < h; ++y) {

+    /* Pointer to filter to use */

+    const int16_t *filter_x = filter_x0;

+    /* Initial phase offset */

+    int x0_q4 = (filter_x - filter_x_base) / taps;

+    int x_q4 = x0_q4;

+    for (x = 0; x < w; ++x) {

+      /* Per-pixel src offset */

+      int src_x = (x_q4 - x0_q4) >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[src_x + k] * filter_x[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);

+      dst[x] = combine(dst[x], clip_pixel(sum >> VP9_FILTER_SHIFT));

+      /* Adjust source and filter to use for the next pixel */

+      x_q4 += x_step_q4;

+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+#endif

+static void convolve_vert_c(const uint8_t *src, int src_stride,

+                            uint8_t *dst, int dst_stride,

+                            const int16_t *filter_x, int x_step_q4,

+                            const int16_t *filter_y0, int y_step_q4,

+                            int w, int h, int taps) {

+  int x, y, k, sum;

+  const int16_t *filter_y_base = filter_y0;

+#if ALIGN_FILTERS_256

+  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);

+#endif

+  /* Adjust base pointer address for this source column */

+  src -= src_stride * (taps / 2 - 1);

+  for (x = 0; x < w; ++x) {

+    /* Pointer to filter to use */

+    const int16_t *filter_y = filter_y0;

+    /* Initial phase offset */

+    int y0_q4 = (filter_y - filter_y_base) / taps;

+    int y_q4 = y0_q4;

+    for (y = 0; y < h; ++y) {

+      /* Per-pixel src offset */

+      int src_y = (y_q4 - y0_q4) >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[(src_y + k) * src_stride] * filter_y[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);

+      dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);

+      /* Adjust source and filter to use for the next pixel */

+      y_q4 += y_step_q4;

+      filter_y = filter_y_base + (y_q4 & 0xf) * taps;

+    }

+    ++src;

+    ++dst;

+  }

+}

+static void convolve_avg_vert_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y0, int y_step_q4,

+                                int w, int h, int taps) {

+  int x, y, k, sum;

+  const int16_t *filter_y_base = filter_y0;

+#if ALIGN_FILTERS_256

+  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);

+#endif

+  /* Adjust base pointer address for this source column */

+  src -= src_stride * (taps / 2 - 1);

+  for (x = 0; x < w; ++x) {

+    /* Pointer to filter to use */

+    const int16_t *filter_y = filter_y0;

+    /* Initial phase offset */

+    int y0_q4 = (filter_y - filter_y_base) / taps;

+    int y_q4 = y0_q4;

+    for (y = 0; y < h; ++y) {

+      /* Per-pixel src offset */

+      int src_y = (y_q4 - y0_q4) >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[(src_y + k) * src_stride] * filter_y[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);

+      dst[y * dst_stride] =

+          (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

+      /* Adjust source and filter to use for the next pixel */

+      y_q4 += y_step_q4;

+      filter_y = filter_y_base + (y_q4 & 0xf) * taps;

+    }

+    ++src;

+    ++dst;

+  }

+}

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+static void convolve_wtd_vert_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y0, int y_step_q4,

+                                int w, int h, int taps,

+                                uint8_t (*combine)(uint8_t a, uint8_t b)) {

+  int x, y, k, sum;

+  const int16_t *filter_y_base = filter_y0;

+#if ALIGN_FILTERS_256

+  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);

+#endif

+  /* Adjust base pointer address for this source column */

+  src -= src_stride * (taps / 2 - 1);

+  for (x = 0; x < w; ++x) {

+    /* Pointer to filter to use */

+    const int16_t *filter_y = filter_y0;

+    /* Initial phase offset */

+    int y0_q4 = (filter_y - filter_y_base) / taps;

+    int y_q4 = y0_q4;

+    for (y = 0; y < h; ++y) {

+      /* Per-pixel src offset */

+      int src_y = (y_q4 - y0_q4) >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[(src_y + k) * src_stride] * filter_y[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);

+      dst[y * dst_stride] = combine(dst[y * dst_stride],

+                                    clip_pixel(sum >> VP9_FILTER_SHIFT));

+      /* Adjust source and filter to use for the next pixel */

+      y_q4 += y_step_q4;

+      filter_y = filter_y_base + (y_q4 & 0xf) * taps;

+    }

+    ++src;

+    ++dst;

+  }

+}

+#endif

+static void convolve_c(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h, int taps) {

+  /* Fixed size intermediate buffer places limits on parameters.

+   * Maximum intermediate_height is 39, for y_step_q4 == 32,

+   * h == 16, taps == 8.

+   */

+  uint8_t temp[16 * 39];

+  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;

+  assert(w <= 16);

+  assert(h <= 16);

+  assert(taps <= 8);

+  assert(y_step_q4 <= 32);

+  if (intermediate_height < h)

+    intermediate_height = h;

+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,

+                   temp, 16,

+                   filter_x, x_step_q4, filter_y, y_step_q4,

+                   w, intermediate_height, taps);

+  convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,

+                  filter_x, x_step_q4, filter_y, y_step_q4,

+                  w, h, taps);

+}

+static void convolve_avg_c(const uint8_t *src, int src_stride,

+                           uint8_t *dst, int dst_stride,

+                           const int16_t *filter_x, int x_step_q4,

+                           const int16_t *filter_y, int y_step_q4,

+                           int w, int h, int taps) {

+  /* Fixed size intermediate buffer places limits on parameters.

+   * Maximum intermediate_height is 39, for y_step_q4 == 32,

+   * h == 16, taps == 8.

+   */

+  uint8_t temp[16 * 39];

+  int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1;

+  assert(w <= 16);

+  assert(h <= 16);

+  assert(taps <= 8);

+  assert(y_step_q4 <= 32);

+  if (intermediate_height < h)

+    intermediate_height = h;

+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,

+                   temp, 16,

+                   filter_x, x_step_q4, filter_y, y_step_q4,

+                   w, intermediate_height, taps);

+  convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, taps);

+}

+void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,

+                           uint8_t *dst, int dst_stride,

+                           const int16_t *filter_x, int x_step_q4,

+                           const int16_t *filter_y, int y_step_q4,

+                           int w, int h) {

+  convolve_horiz_c(src, src_stride, dst, dst_stride,

+                   filter_x, x_step_q4, filter_y, y_step_q4,

+                   w, h, 8);

+}

+void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_avg_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8);

+}

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+void vp9_convolve8_1by8_horiz_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y, int y_step_q4,

+                                int w, int h) {

+  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8, combine_1by8);

+}

+void vp9_convolve8_qtr_horiz_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8, combine_qtr);

+}

+void vp9_convolve8_3by8_horiz_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y, int y_step_q4,

+                                int w, int h) {

+  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8, combine_3by8);

+}

+void vp9_convolve8_5by8_horiz_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y, int y_step_q4,

+                                int w, int h) {

+  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8, combine_5by8);

+}

+void vp9_convolve8_3qtr_horiz_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y, int y_step_q4,

+                                int w, int h) {

+  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8, combine_3qtr);

+}

+void vp9_convolve8_7by8_horiz_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y, int y_step_q4,

+                                int w, int h) {

+  convolve_wtd_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8, combine_7by8);

+}

+#endif

+void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,

+                          uint8_t *dst, int dst_stride,

+                          const int16_t *filter_x, int x_step_q4,

+                          const int16_t *filter_y, int y_step_q4,

+                          int w, int h) {

+  convolve_vert_c(src, src_stride, dst, dst_stride,

+                  filter_x, x_step_q4, filter_y, y_step_q4,

+                  w, h, 8);

+}

+void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h) {

+  convolve_avg_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8);

+}

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+void vp9_convolve8_1by8_vert_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8, combine_1by8);

+}

+void vp9_convolve8_qtr_vert_c(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h) {

+  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8, combine_qtr);

+}

+void vp9_convolve8_3by8_vert_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8, combine_3by8);

+}

+void vp9_convolve8_5by8_vert_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8, combine_5by8);

+}

+void vp9_convolve8_3qtr_vert_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8, combine_3qtr);

+}

+void vp9_convolve8_7by8_vert_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_wtd_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8, combine_7by8);

+}

+#endif

+void vp9_convolve8_c(const uint8_t *src, int src_stride,

+                     uint8_t *dst, int dst_stride,

+                     const int16_t *filter_x, int x_step_q4,

+                     const int16_t *filter_y, int y_step_q4,

+                     int w, int h) {

+  convolve_c(src, src_stride, dst, dst_stride,

+             filter_x, x_step_q4, filter_y, y_step_q4,

+             w, h, 8);

+}

+void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);

+  assert(w <= 16);

+  assert(h <= 16);

+  vp9_convolve8(src, src_stride,

+                temp, 16,

+                filter_x, x_step_q4,

+                filter_y, y_step_q4,

+                w, h);

+  vp9_convolve_avg(temp, 16,

+                   dst, dst_stride,

+                   NULL, 0, /* These unused parameter should be removed! */

+                   NULL, 0, /* These unused parameter should be removed! */

+                   w, h);

+}

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+void vp9_convolve8_1by8_c(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);

+  assert(w <= 16);

+  assert(h <= 16);

+  vp9_convolve8(src, src_stride,

+                temp, 16,

+                filter_x, x_step_q4,

+                filter_y, y_step_q4,

+                w, h);

+  vp9_convolve_1by8(temp, 16,

+                    dst, dst_stride,

+                    NULL, 0, /* These unused parameter should be removed! */

+                    NULL, 0, /* These unused parameter should be removed! */

+                    w, h);

+}

+void vp9_convolve8_qtr_c(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);

+  assert(w <= 16);

+  assert(h <= 16);

+  vp9_convolve8(src, src_stride,

+                temp, 16,

+                filter_x, x_step_q4,

+                filter_y, y_step_q4,

+                w, h);

+  vp9_convolve_qtr(temp, 16,

+                   dst, dst_stride,

+                   NULL, 0, /* These unused parameter should be removed! */

+                   NULL, 0, /* These unused parameter should be removed! */

+                   w, h);

+}

+void vp9_convolve8_3by8_c(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);

+  assert(w <= 16);

+  assert(h <= 16);

+  vp9_convolve8(src, src_stride,

+                temp, 16,

+                filter_x, x_step_q4,

+                filter_y, y_step_q4,

+                w, h);

+  vp9_convolve_3by8(temp, 16,

+                    dst, dst_stride,

+                    NULL, 0, /* These unused parameter should be removed! */

+                    NULL, 0, /* These unused parameter should be removed! */

+                    w, h);

+}

+void vp9_convolve8_5by8_c(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);

+  assert(w <= 16);

+  assert(h <= 16);

+  vp9_convolve8(src, src_stride,

+                temp, 16,

+                filter_x, x_step_q4,

+                filter_y, y_step_q4,

+                w, h);

+  vp9_convolve_5by8(temp, 16,

+                    dst, dst_stride,

+                    NULL, 0, /* These unused parameter should be removed! */

+                    NULL, 0, /* These unused parameter should be removed! */

+                    w, h);

+}

+void vp9_convolve8_3qtr_c(const uint8_t *src, int src_stride,

+                          uint8_t *dst, int dst_stride,

+                          const int16_t *filter_x, int x_step_q4,

+                          const int16_t *filter_y, int y_step_q4,

+                          int w, int h) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);

+  assert(w <= 16);

+  assert(h <= 16);

+  vp9_convolve8(src, src_stride,

+                temp, 16,

+                filter_x, x_step_q4,

+                filter_y, y_step_q4,

+                w, h);

+  vp9_convolve_3qtr(temp, 16,

+                    dst, dst_stride,

+                    NULL, 0, /* These unused parameter should be removed! */

+                    NULL, 0, /* These unused parameter should be removed! */

+                    w, h);

+}

+void vp9_convolve8_7by8_c(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16);

+  assert(w <= 16);

+  assert(h <= 16);

+  vp9_convolve8(src, src_stride,

+                temp, 16,

+                filter_x, x_step_q4,

+                filter_y, y_step_q4,

+                w, h);

+  vp9_convolve_7by8(temp, 16,

+                    dst, dst_stride,

+                    NULL, 0, /* These unused parameter should be removed! */

+                    NULL, 0, /* These unused parameter should be removed! */

+                    w, h);

+}

+#endif

+void vp9_convolve_copy(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int filter_x_stride,

+                       const int16_t *filter_y, int filter_y_stride,

+                       int w, int h) {

+  if (w == 16 && h == 16) {

+    vp9_copy_mem16x16(src, src_stride, dst, dst_stride);

+  } else if (w == 8 && h == 8) {

+    vp9_copy_mem8x8(src, src_stride, dst, dst_stride);

+  } else if (w == 8 && h == 4) {

+    vp9_copy_mem8x4(src, src_stride, dst, dst_stride);

+  } else {

+    int r;

+    for (r = h; r > 0; --r) {

+      memcpy(dst, src, w);

+      src += src_stride;

+      dst += dst_stride;

+    }

+  }

+}

+void vp9_convolve_avg(const uint8_t *src, int src_stride,

+                      uint8_t *dst, int dst_stride,

+                      const int16_t *filter_x, int filter_x_stride,

+                      const int16_t *filter_y, int filter_y_stride,

+                      int w, int h) {

+  int x, y;

+  for (y = 0; y < h; ++y) {

+    for (x = 0; x < w; ++x) {

+      dst[x] = (dst[x] + src[x] + 1) >> 1;

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+void vp9_convolve_1by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int filter_x_stride,

+                       const int16_t *filter_y, int filter_y_stride,

+                       int w, int h) {

+  int x, y;

+  for (y = 0; y < h; ++y) {

+    for (x = 0; x < w; ++x) {

+      dst[x] = combine_1by8(dst[x], src[x]);

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+void vp9_convolve_qtr(const uint8_t *src, int src_stride,

+                      uint8_t *dst, int dst_stride,

+                      const int16_t *filter_x, int filter_x_stride,

+                      const int16_t *filter_y, int filter_y_stride,

+                      int w, int h) {

+  int x, y;

+  for (y = 0; y < h; ++y) {

+    for (x = 0; x < w; ++x) {

+      dst[x] = combine_qtr(dst[x], src[x]);

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+void vp9_convolve_3by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int filter_x_stride,

+                       const int16_t *filter_y, int filter_y_stride,

+                       int w, int h) {

+  int x, y;

+  for (y = 0; y < h; ++y) {

+    for (x = 0; x < w; ++x) {

+      dst[x] = combine_3by8(dst[x], src[x]);

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+void vp9_convolve_5by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int filter_x_stride,

+                       const int16_t *filter_y, int filter_y_stride,

+                       int w, int h) {

+  int x, y;

+  for (y = 0; y < h; ++y) {

+    for (x = 0; x < w; ++x) {

+      dst[x] = combine_5by8(dst[x], src[x]);

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+void vp9_convolve_3qtr(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int filter_x_stride,

+                       const int16_t *filter_y, int filter_y_stride,

+                       int w, int h) {

+  int x, y;

+  for (y = 0; y < h; ++y) {

+    for (x = 0; x < w; ++x) {

+      dst[x] = combine_3qtr(dst[x], src[x]);

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+void vp9_convolve_7by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int filter_x_stride,

+                       const int16_t *filter_y, int filter_y_stride,

+                       int w, int h) {

+  int x, y;

+  for (y = 0; y < h; ++y) {

+    for (x = 0; x < w; ++x) {

+      dst[x] = combine_7by8(dst[x], src[x]);

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+#endif

--- /dev/null

+++ b/vp9/common/vp9_convolve.h

@@ -1,0 +1,85 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_COMMON_CONVOLVE_H_

+#define VP9_COMMON_CONVOLVE_H_

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h);

+// Not a convolution, a block copy conforming to the convolution prototype

+void vp9_convolve_copy(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h);

+// Not a convolution, a block average conforming to the convolution prototype

+void vp9_convolve_avg(const uint8_t *src, int src_stride,

+                      uint8_t *dst, int dst_stride,

+                      const int16_t *filter_x, int x_step_q4,

+                      const int16_t *filter_y, int y_step_q4,

+                      int w, int h);

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+// Not a convolution, a block wtd (1/8, 7/8) average for (dst, src)

+void vp9_convolve_1by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h);

+// Not a convolution, a block wtd (1/4, 3/4) average for (dst, src)

+void vp9_convolve_qtr(const uint8_t *src, int src_stride,

+                      uint8_t *dst, int dst_stride,

+                      const int16_t *filter_x, int x_step_q4,

+                      const int16_t *filter_y, int y_step_q4,

+                      int w, int h);

+// Not a convolution, a block wtd (3/8, 5/8) average for (dst, src)

+void vp9_convolve_3by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h);

+// Not a convolution, a block wtd (5/8, 3/8) average for (dst, src)

+void vp9_convolve_5by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h);

+// Not a convolution, a block wtd (3/4, 1/4) average for (dst, src)

+void vp9_convolve_3qtr(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h);

+// Not a convolution, a block wtd (7/8, 1/8) average for (dst, src)

+void vp9_convolve_7by8(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h);

+#endif

+struct subpix_fn_table {

+  const int16_t (*filter_x)[8];

+  const int16_t (*filter_y)[8];

+};

+#endif  // VP9_COMMON_CONVOLVE_H_

--- a/vp9/common/vp9_debugmodes.c

+++ b/vp9/common/vp9_debugmodes.c

@@ -9,6 +9,7 @@

*/

 #include <stdio.h>

 #include "vp9/common/vp9_blockd.h"

 void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,

@@ -18,8 +19,7 @@

   int mb_index = 0;

   FILE *mvs = fopen("mvs.stt", "a");

-  /* print out the macroblock Y modes */

-  mb_index = 0;

+  // Print out the macroblock Y modes

   fprintf(mvs, "Mb Modes for Frame %d\n", frame);

   for (mb_row = 0; mb_row < rows; mb_row++) {

@@ -129,8 +129,8 @@

         mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);

         bindex = (b_row & 3) * 4 + (b_col & 3);

         fprintf(mvs, "%3d:%-3d ",

-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,

-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);

+                mi[mb_index].bmi[bindex].as_mv[0].as_mv.row,

+                mi[mb_index].bmi[bindex].as_mv[0].as_mv.col);

--- a/vp9/common/vp9_default_coef_probs.h

+++ b/vp9/common/vp9_default_coef_probs.h

@@ -11,1201 +11,987 @@

 /*Generated file, included by vp9_entropy.c*/

-static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES_4X4] = {

+// NOTE: When the CONFIG_MODELCOEFPROB experiment is on, only the first

+// 2 or 3 from each row is actually used depending on whether

+// UNCONSTRAINDED_NODES is 2 or 3. If this experiment is merged

+// the tables below should be shortened accordingly.

+static const vp9_coeff_probs default_coef_probs_4x4[BLOCK_TYPES] = {

   { /* block Type 0 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 224, 180, 254, 255, 234, 224, 255, 227, 128, 128, 128 },

-      { 187, 178, 250, 255, 226, 218, 255, 229, 255, 255, 128 },

-      { 145, 171, 243, 253, 219, 211, 254, 226, 255, 224, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      {   1, 187, 252, 255, 231, 220, 255, 229, 255, 255, 128 },

-      { 129, 174, 244, 254, 225, 216, 253, 219, 255, 255, 128 },

-      {  16, 131, 193, 251, 205, 205, 254, 222, 255, 255, 128 },

-      {   2,  93, 136, 236, 159, 179, 255, 197, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      {   1, 188, 254, 255, 241, 236, 254, 220, 255, 255, 128 },

-      { 133, 165, 249, 255, 236, 220, 252, 220, 255, 255, 128 },

-      {  20, 112, 203, 254, 217, 214, 255, 224, 255, 255, 128 },

-      {   4,  61, 106, 240, 155, 189, 252, 202, 255, 255, 128 }

-    }, { /* Coeff Band 4 */

-      {   1, 168, 252, 255, 239, 228, 253, 217, 255, 255, 128 },

-      { 158, 163, 247, 255, 231, 221, 255, 242, 128, 128, 128 },

-      {  23, 127, 205, 253, 212, 224, 255, 234, 255, 255, 128 },

-      {   2,  83, 141, 237, 176, 210, 245, 207, 255, 255, 128 }

-    }, { /* Coeff Band 5 */

-      {   1, 233, 254, 255, 243, 241, 255, 213, 128, 128, 128 },

-      { 155, 213, 253, 255, 240, 221, 216, 112, 255, 255, 128 },

-      {  41, 159, 237, 254, 229, 216, 255, 161, 128, 128, 128 },

-      {  11,  95, 176, 244, 194, 191, 255, 167, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      {   1, 160, 253, 255, 238, 231, 255, 230, 255, 255, 128 },

-      { 174, 152, 248, 255, 230, 223, 255, 223, 255, 255, 128 },

-      {  86, 125, 213, 253, 207, 207, 254, 224, 255, 171, 128 },

-      {  39,  89, 156, 240, 168, 190, 251, 181, 255, 255, 128 }

-    }, { /* Coeff Band 7 */

-      {   1, 101, 255, 255, 243, 244, 255, 255, 128, 128, 128 },

-      { 230,  66, 255, 255, 238, 238, 128, 128, 128, 128, 128 },

-      { 151,  92, 229, 255, 224, 197, 128, 128, 128, 128, 128 },

-      { 109,  57, 171, 255,  73, 255, 128, 128, 128, 128, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        { 208,  32, 178, 198, 161, 167, 196, 147, 244, 194, 210 },

+        { 102,  43, 132, 185, 148, 162, 185, 141, 237, 181, 215 },

+        {  15,  36,  68, 143, 119, 151, 169, 133, 230, 173, 214 }

+      }, { /* Coeff Band 1 */

+        {  71,  91, 178, 226, 169, 176, 232, 170, 252, 219, 231 },

+        {  72,  88, 174, 226, 168, 176, 232, 170, 252, 219, 234 },

+        {  40,  79, 154, 222, 161, 174, 231, 169, 251, 219, 238 },

+        {  21,  68, 126, 211, 144, 167, 230, 167, 252, 219, 236 },

+        {   7,  49,  84, 175, 121, 152, 223, 151, 251, 218, 237 },

+        {   1,  20,  32, 100,  97, 140, 163, 116, 237, 186, 222 }

+      }, { /* Coeff Band 2 */

+        { 108, 110, 206, 237, 182, 183, 239, 181, 252, 221, 245 },

+        {  72,  98, 191, 236, 180, 182, 240, 183, 252, 223, 239 },

+        {  26,  77, 152, 230, 166, 179, 239, 181, 252, 222, 241 },

+        {   7,  57, 106, 212, 141, 167, 236, 173, 252, 223, 243 },

+        {   1,  35,  60, 171, 110, 149, 225, 155, 251, 218, 240 },

+        {   1,  14,  22,  90,  86, 134, 163, 116, 238, 181, 233 }

+      }, { /* Coeff Band 3 */

+        { 105, 139, 222, 245, 196, 192, 245, 195, 253, 229, 255 },

+        {  76, 118, 205, 245, 192, 192, 247, 198, 254, 230, 255 },

+        {  21,  88, 164, 240, 175, 186, 246, 197, 255, 232, 255 },

+        {   5,  63, 118, 222, 149, 172, 242, 185, 255, 230, 254 },

+        {   1,  42,  74, 186, 120, 157, 227, 161, 253, 220, 250 },

+        {   1,  18,  30,  97,  92, 136, 163, 118, 244, 184, 244 }

+      }, { /* Coeff Band 4 */

+        { 143, 117, 233, 251, 207, 201, 250, 210, 255, 239, 128 },

+        {  99, 104, 214, 249, 200, 199, 251, 211, 255, 238, 255 },

+        {  26,  81, 170, 245, 183, 192, 250, 206, 255, 242, 255 },

+        {   6,  60, 116, 226, 151, 176, 242, 187, 255, 235, 255 },

+        {   1,  38,  65, 178, 114, 153, 224, 157, 254, 224, 255 },

+        {   1,  15,  26,  86,  88, 133, 163, 110, 251, 197, 252 }

+      }, { /* Coeff Band 5 */

+        { 155,  74, 238, 252, 215, 206, 252, 223, 255, 255, 128 },

+        { 152,  64, 223, 250, 205, 201, 254, 219, 255, 255, 128 },

+        {  67,  55, 182, 246, 187, 192, 251, 210, 255, 240, 128 },

+        {  27,  44, 127, 227, 155, 176, 244, 186, 255, 240, 255 },

+        {   9,  27,  69, 176, 115, 152, 227, 154, 255, 229, 255 },

+        {   2,  11,  28,  91,  84, 133, 177, 115, 254, 210, 255 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        { 207, 112, 234, 244, 192, 193, 246, 194, 255, 237, 255 },

+        { 145, 120, 212, 233, 178, 183, 232, 177, 252, 216, 228 },

+        {  77, 114, 177, 214, 164, 174, 210, 159, 245, 199, 230 }

+      }, { /* Coeff Band 1 */

+        {  93, 174, 243, 248, 205, 200, 245, 195, 255, 232, 255 },

+        { 100, 144, 231, 248, 204, 200, 244, 193, 255, 232, 255 },

+        {  28, 101, 186, 247, 194, 199, 244, 194, 255, 232, 255 },

+        {   9,  73, 132, 238, 155, 186, 245, 197, 255, 232, 250 },

+        {   2,  44,  76, 187, 112, 151, 240, 172, 255, 235, 249 },

+        {   1,  19,  33,  98,  92, 138, 176, 113, 252, 208, 249 }

+      }, { /* Coeff Band 2 */

+        { 116, 175, 246, 250, 212, 202, 248, 198, 255, 238, 255 },

+        {  78, 142, 231, 250, 208, 203, 249, 200, 255, 241, 255 },

+        {  14,  93, 177, 245, 186, 196, 248, 198, 255, 241, 255 },

+        {   4,  65, 122, 227, 148, 177, 244, 186, 255, 241, 243 },

+        {   1,  38,  69, 180, 111, 152, 235, 162, 255, 237, 247 },

+        {   1,  18,  30, 101,  89, 133, 190, 116, 255, 219, 246 }

+      }, { /* Coeff Band 3 */

+        { 138, 183, 249, 253, 220, 209, 252, 210, 255, 251, 128 },

+        {  93, 147, 237, 252, 213, 209, 253, 213, 255, 251, 128 },

+        {  21, 104, 187, 247, 185, 196, 252, 210, 255, 249, 128 },

+        {   6,  73, 131, 225, 147, 174, 248, 190, 255, 248, 128 },

+        {   1,  47,  83, 189, 119, 155, 239, 167, 255, 246, 128 },

+        {   1,  26,  44, 130,  96, 139, 209, 129, 255, 235, 255 }

+      }, { /* Coeff Band 4 */

+        { 188, 143, 252, 255, 228, 218, 253, 218, 255, 209, 128 },

+        { 137, 124, 241, 253, 215, 211, 254, 221, 255, 255, 128 },

+        {  32,  89, 188, 248, 186, 198, 254, 216, 255, 253, 128 },

+        {   7,  61, 122, 231, 146, 176, 252, 201, 255, 250, 128 },

+        {   1,  34,  66, 186, 103, 149, 246, 176, 255, 249, 128 },

+        {   1,  18,  34, 115,  91, 134, 217, 124, 255, 233, 255 }

+      }, { /* Coeff Band 5 */

+        { 198,  92, 253, 255, 231, 222, 255, 230, 128, 128, 128 },

+        { 189,  79, 244, 254, 220, 217, 255, 237, 255, 255, 128 },

+        {  78,  61, 200, 252, 196, 207, 255, 231, 255, 255, 128 },

+        {  34,  50, 146, 242, 161, 187, 255, 222, 255, 255, 128 },

+        {  11,  38,  93, 215, 122, 159, 253, 202, 255, 255, 128 },

+        {   1,  31,  55, 143, 102, 143, 227, 148, 255, 238, 128 }

+      }

   }, { /* block Type 1 */

-    { /* Coeff Band 0 */

-      { 148, 109, 219, 239, 203, 184, 222, 172, 238, 203, 192 },

-      { 101, 110, 206, 229, 181, 178, 224, 171, 250, 206, 180 },

-      {  67, 108, 186, 222, 172, 174, 216, 167, 246, 195, 221 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 184, 249, 254, 226, 220, 253, 241, 255, 255, 128 },

-      {  84, 182, 244, 254, 222, 218, 254, 217, 255, 255, 128 },

-      {  56, 147, 210, 252, 208, 210, 253, 218, 255, 255, 128 },

-      {  32, 124, 170, 233, 165, 178, 249, 196, 255, 253, 128 }

-    }, { /* Coeff Band 2 */

-      {   1, 182, 242, 245, 208, 194, 239, 179, 255, 238, 128 },

-      {  28, 170, 230, 241, 202, 192, 243, 171, 255, 243, 128 },

-      {  16, 109, 165, 231, 182, 184, 237, 168, 255, 249, 255 },

-      {   2,  76, 113, 202, 141, 172, 221, 160, 252, 227, 255 }

-    }, { /* Coeff Band 3 */

-      {   1, 195, 249, 254, 230, 239, 251, 211, 255, 255, 128 },

-      {  39, 164, 242, 254, 224, 222, 255, 235, 255, 255, 128 },

-      {  16, 111, 179, 251, 204, 197, 251, 234, 255, 209, 128 },

-      {   3,  84, 130, 225, 155, 176, 226, 196, 255, 238, 128 }

-    }, { /* Coeff Band 4 */

-      {   1, 180, 248, 254, 227, 219, 254, 211, 255, 255, 128 },

-      {  38, 170, 242, 253, 222, 214, 254, 242, 255, 255, 128 },

-      {   5, 111, 176, 250, 204, 197, 255, 208, 128, 128, 128 },

-      {   1,  75, 120, 233, 146, 186, 250, 203, 255, 255, 128 }

-    }, { /* Coeff Band 5 */

-      {   1, 183, 251, 255, 232, 223, 252, 229, 255, 255, 128 },

-      {  51, 158, 245, 255, 230, 224, 255, 239, 128, 128, 128 },

-      {  13,  80, 158, 253, 206, 216, 255, 233, 128, 128, 128 },

-      {   4,  39,  76, 212, 107, 153, 252, 206, 255, 255, 128 }

-    }, { /* Coeff Band 6 */

-      {   1, 181, 252, 254, 231, 214, 242, 225, 255, 236, 128 },

-      {  81, 167, 247, 254, 229, 217, 252, 226, 255, 255, 128 },

-      {  20, 122, 195, 253, 213, 212, 249, 211, 255, 238, 128 },

-      {  18, 100, 153, 231, 158, 182, 244, 203, 255, 219, 128 }

-    }, { /* Coeff Band 7 */

-      {   1, 100, 254, 255, 242, 246, 255, 230, 128, 128, 128 },

-      { 177,  62, 250, 255, 246, 210, 255, 255, 128, 128, 128 },

-      {  65,  58, 186, 255, 227, 241, 255, 219, 128, 128, 128 },

-      {  45,  23, 118, 244, 162, 208, 255, 228, 128, 128, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        { 207,  35, 219, 243, 195, 192, 243, 188, 251, 232, 238 },

+        { 126,  46, 182, 230, 177, 182, 228, 171, 248, 214, 232 },

+        {  51,  47, 125, 196, 147, 166, 206, 151, 245, 199, 229 }

+      }, { /* Coeff Band 1 */

+        { 114, 124, 220, 244, 197, 192, 242, 189, 253, 226, 255 },

+        { 142, 116, 213, 243, 194, 191, 241, 188, 252, 226, 255 },

+        {  81, 101, 190, 242, 188, 190, 242, 190, 253, 229, 255 },

+        {  42,  83, 155, 235, 166, 183, 241, 190, 253, 227, 246 },

+        {  16,  62, 104, 205, 133, 161, 238, 176, 254, 227, 250 },

+        {   6,  40,  60, 132, 109, 145, 190, 128, 248, 202, 239 }

+      }, { /* Coeff Band 2 */

+        { 139, 149, 228, 248, 205, 198, 244, 196, 255, 223, 255 },

+        { 115, 127, 221, 248, 202, 198, 245, 198, 255, 228, 255 },

+        {  43, 100, 189, 246, 195, 195, 244, 196, 254, 234, 228 },

+        {  13,  77, 141, 238, 168, 187, 243, 191, 255, 232, 255 },

+        {   3,  49,  88, 203, 125, 160, 237, 178, 253, 227, 251 },

+        {   1,  23,  41, 118,  97, 136, 191, 127, 250, 207, 247 }

+      }, { /* Coeff Band 3 */

+        { 119, 185, 236, 251, 216, 205, 249, 202, 253, 237, 255 },

+        {  89, 140, 224, 251, 211, 205, 250, 208, 255, 241, 255 },

+        {  34, 105, 189, 248, 195, 197, 250, 208, 255, 245, 255 },

+        {  14,  78, 142, 235, 166, 182, 246, 194, 255, 242, 255 },

+        {   5,  49,  90, 196, 128, 160, 235, 165, 255, 237, 255 },

+        {   1,  22,  41, 114,  97, 139, 180, 124, 252, 201, 249 }

+      }, { /* Coeff Band 4 */

+        { 162, 142, 244, 254, 228, 215, 255, 230, 128, 128, 128 },

+        { 129, 120, 231, 253, 216, 210, 255, 228, 255, 255, 128 },

+        {  44,  90, 189, 249, 195, 199, 253, 217, 255, 240, 128 },

+        {  14,  65, 132, 234, 158, 181, 249, 203, 255, 248, 128 },

+        {   3,  38,  72, 188, 112, 154, 239, 171, 255, 243, 128 },

+        {   1,  17,  39, 110,  86, 141, 201, 123, 255, 240, 128 }

+      }, { /* Coeff Band 5 */

+        { 167,  96, 247, 255, 230, 218, 249, 231, 255, 255, 128 },

+        { 163,  84, 234, 253, 214, 209, 255, 231, 255, 255, 128 },

+        {  70,  63, 185, 249, 189, 197, 255, 230, 255, 255, 128 },

+        {  30,  44, 132, 238, 157, 180, 251, 210, 255, 220, 128 },

+        {  13,  30,  80, 195, 121, 153, 243, 179, 255, 224, 128 },

+        {   5,  13,  38, 103, 109, 128, 196, 147, 255, 255, 128 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        { 242,  90, 246, 244, 200, 192, 242, 189, 255, 234, 255 },

+        { 186, 102, 228, 233, 187, 182, 231, 172, 254, 225, 252 },

+        { 102, 108, 203, 228, 181, 180, 218, 167, 243, 201, 223 }

+      }, { /* Coeff Band 1 */

+        { 152, 169, 250, 253, 223, 209, 251, 208, 255, 250, 128 },

+        { 164, 149, 242, 253, 222, 209, 249, 207, 253, 238, 255 },

+        {  63, 108, 204, 252, 215, 211, 251, 211, 255, 242, 128 },

+        {  39,  83, 153, 248, 175, 199, 250, 214, 255, 245, 128 },

+        {  31,  66, 108, 214, 130, 161, 251, 196, 255, 237, 128 },

+        {  27,  65,  71, 150, 112, 149, 213, 133, 255, 230, 255 }

+      }, { /* Coeff Band 2 */

+        { 161, 174, 250, 254, 226, 215, 254, 226, 255, 230, 128 },

+        { 133, 150, 239, 254, 222, 213, 254, 225, 255, 255, 128 },

+        {  32, 105, 197, 252, 206, 207, 253, 220, 255, 255, 128 },

+        {  10,  78, 147, 245, 173, 193, 253, 212, 255, 255, 128 },

+        {   2,  49,  99, 221, 133, 164, 250, 198, 255, 252, 128 },

+        {   1,  26,  53, 154,  96, 135, 234, 142, 255, 240, 128 }

+      }, { /* Coeff Band 3 */

+        { 160, 187, 251, 255, 234, 223, 255, 233, 128, 128, 128 },

+        { 131, 155, 241, 255, 228, 222, 255, 232, 255, 255, 128 },

+        {  42, 108, 198, 253, 207, 212, 255, 234, 255, 255, 128 },

+        {  18,  81, 151, 246, 176, 194, 254, 222, 255, 255, 128 },

+        {   9,  60, 112, 225, 144, 167, 252, 199, 255, 255, 128 },

+        {   5,  35,  49, 163, 113, 150, 237, 118, 255, 255, 128 }

+      }, { /* Coeff Band 4 */

+        { 195, 141, 253, 255, 242, 232, 255, 255, 128, 128, 128 },

+        { 169, 128, 245, 255, 235, 227, 255, 248, 128, 128, 128 },

+        {  62,  91, 204, 255, 216, 220, 255, 233, 128, 128, 128 },

+        {  23,  70, 150, 248, 178, 202, 255, 223, 128, 128, 128 },

+        {   2,  44,  78, 220, 110, 164, 255, 209, 128, 128, 128 },

+        {   1,   1, 128, 255, 255, 128, 128, 128, 128, 128, 128 }

+      }, { /* Coeff Band 5 */

+        { 195, 104, 253, 255, 246, 246, 255, 171, 128, 128, 128 },

+        { 197,  92, 248, 255, 239, 228, 255, 239, 128, 128, 128 },

+        {  88,  71, 214, 255, 219, 220, 255, 244, 128, 128, 128 },

+        {  39,  56, 160, 250, 187, 204, 255, 255, 128, 128, 128 },

+        {  18,  28,  90, 217,  81, 137, 255, 128, 128, 128, 128 },

+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+      }

-  }, { /* block Type 2 */

-    { /* Coeff Band 0 */

-      { 242,  73, 238, 244, 198, 192, 241, 189, 253, 226, 247 },

-      { 171,  70, 204, 231, 180, 183, 228, 172, 247, 215, 221 },

-      {  73,  62, 144, 202, 153, 169, 207, 153, 245, 199, 230 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 163, 241, 245, 201, 192, 243, 191, 255, 229, 255 },

-      { 165, 147, 230, 245, 201, 193, 244, 193, 255, 231, 255 },

-      {  76, 109, 191, 243, 190, 193, 243, 192, 255, 231, 255 },

-      {  22,  63, 111, 202, 138, 164, 225, 164, 252, 218, 248 }

-    }, { /* Coeff Band 2 */

-      {   1, 113, 225, 245, 201, 195, 238, 185, 254, 225, 255 },

-      { 122, 105, 195, 236, 183, 186, 235, 180, 254, 227, 252 },

-      {  38,  79, 135, 217, 154, 172, 229, 171, 253, 220, 250 },

-      {   9,  53,  78, 161, 121, 151, 202, 141, 251, 207, 244 }

-    }, { /* Coeff Band 3 */

-      {   1, 150, 238, 250, 213, 202, 244, 194, 255, 236, 255 },

-      { 140, 132, 223, 247, 204, 199, 243, 193, 255, 234, 255 },

-      {  51, 101, 182, 240, 188, 189, 240, 186, 255, 232, 255 },

-      {   6,  59, 100, 201, 137, 165, 225, 161, 252, 221, 249 }

-    }, { /* Coeff Band 4 */

-      {   1, 151, 233, 248, 205, 199, 248, 196, 255, 243, 255 },

-      { 133, 140, 214, 244, 193, 193, 245, 194, 255, 236, 255 },

-      {  27, 104, 168, 235, 172, 183, 243, 187, 254, 235, 255 },

-      {   2,  61, 101, 202, 135, 164, 229, 167, 254, 223, 255 }

-    }, { /* Coeff Band 5 */

-      {   1, 227, 246, 254, 225, 215, 254, 217, 255, 255, 128 },

-      { 132, 195, 239, 253, 219, 210, 252, 212, 255, 255, 128 },

-      {  49, 143, 214, 251, 207, 204, 253, 212, 255, 238, 128 },

-      {  11,  93, 151, 235, 169, 185, 247, 190, 255, 238, 128 }

-    }, { /* Coeff Band 6 */

-      {   1, 143, 237, 251, 213, 203, 249, 203, 255, 243, 128 },

-      { 137, 120, 216, 246, 198, 196, 248, 199, 255, 240, 255 },

-      {  50,  94, 166, 233, 169, 181, 245, 189, 255, 240, 255 },

-      {   9,  56,  97, 190, 129, 158, 228, 159, 255, 226, 255 }

-    }, { /* Coeff Band 7 */

-      {   1,  96, 245, 254, 229, 216, 255, 212, 255, 255, 128 },

-      { 179,  81, 234, 253, 217, 209, 255, 230, 255, 255, 128 },

-      { 105,  56, 192, 248, 192, 197, 252, 212, 255, 205, 128 },

-      {  53,  32, 133, 228, 151, 177, 250, 192, 255, 255, 128 }

-    }

-  }, { /* block Type 3 */

-    { /* Coeff Band 0 */

-      { 209,  89, 216, 242, 191, 190, 245, 191, 240, 235, 168 },

-      { 142,  96, 196, 229, 173, 180, 233, 175, 247, 220, 174 },

-      {  66,  89, 157, 205, 155, 171, 209, 156, 243, 200, 197 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 159, 235, 246, 202, 197, 237, 186, 248, 223, 223 },

-      {  96, 137, 223, 247, 203, 198, 242, 188, 241, 202, 209 },

-      {  22,  95, 167, 243, 184, 196, 237, 187, 247, 221, 221 },

-      {   3,  51,  81, 192, 125, 158, 220, 164, 242, 211, 197 }

-    }, { /* Coeff Band 2 */

-      {   1, 145, 226, 244, 196, 194, 240, 191, 247, 225, 233 },

-      {  66, 127, 203, 240, 188, 189, 239, 188, 248, 225, 220 },

-      {   9,  83, 136, 224, 159, 176, 235, 177, 247, 223, 207 },

-      {   2,  46,  71, 169, 121, 152, 210, 149, 241, 212, 199 }

-    }, { /* Coeff Band 3 */

-      {   1, 174, 238, 249, 209, 201, 245, 198, 241, 196, 241 },

-      {  76, 151, 223, 247, 203, 197, 245, 194, 243, 202, 198 },

-      {  12, 102, 170, 240, 183, 187, 242, 191, 247, 225, 209 },

-      {   1,  52,  85, 202, 135, 162, 225, 168, 240, 209, 221 }

-    }, { /* Coeff Band 4 */

-      {   1, 140, 230, 247, 204, 198, 242, 190, 249, 209, 248 },

-      {  94, 126, 213, 244, 195, 194, 240, 190, 247, 210, 237 },

-      {  13,  95, 159, 232, 171, 181, 237, 179, 245, 205, 237 },

-      {   1,  51,  83, 186, 128, 158, 216, 154, 240, 193, 229 }

-    }, { /* Coeff Band 5 */

-      {   1, 218, 244, 251, 214, 202, 243, 199, 253, 214, 255 },

-      {  91, 194, 238, 249, 210, 200, 247, 203, 251, 223, 255 },

-      {  18, 140, 207, 247, 198, 194, 246, 203, 252, 213, 255 },

-      {   3,  76, 126, 223, 156, 172, 233, 185, 251, 206, 255 }

-    }, { /* Coeff Band 6 */

-      {   1, 135, 235, 250, 210, 203, 246, 206, 251, 219, 241 },

-      { 105, 120, 214, 246, 196, 196, 245, 195, 250, 216, 243 },

-      {  24,  91, 154, 231, 166, 180, 241, 183, 250, 214, 242 },

-      {   3,  53,  84, 183, 127, 157, 218, 153, 244, 195, 237 }

-    }, { /* Coeff Band 7 */

-      {   1,  83, 246, 252, 215, 208, 246, 206, 255, 237, 128 },

-      { 184,  61, 233, 250, 208, 204, 245, 198, 254, 227, 255 },

-      {  83,  58, 190, 246, 189, 195, 244, 198, 255, 229, 128 },

-      {  41,  38, 125, 214, 144, 169, 229, 171, 251, 216, 255 }

-    }

};

-static const vp9_coeff_probs default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4] = {

+static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES] = {

   { /* block Type 0 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        { 196,  40, 199, 180, 158, 161, 172, 135, 226, 183, 140 },

+        {  83,  38, 128, 153, 142, 157, 155, 128, 222, 164, 202 },

+        {  10,  29,  55, 116, 113, 146, 150, 122, 223, 169, 200 }

+      }, { /* Coeff Band 1 */

+        {  33, 114, 160, 211, 155, 169, 223, 162, 248, 212, 215 },

+        {  69, 107, 155, 210, 154, 169, 224, 163, 248, 212, 216 },

+        {  30,  91, 138, 207, 150, 168, 223, 162, 248, 212, 216 },

+        {  12,  74, 115, 200, 140, 164, 222, 160, 249, 212, 219 },

+        {   4,  52,  80, 172, 121, 153, 216, 149, 249, 212, 226 },

+        {   1,  27,  40, 105, 101, 141, 157, 120, 231, 177, 210 }

+      }, { /* Coeff Band 2 */

+        {  38, 159, 190, 227, 171, 177, 229, 172, 250, 214, 237 },

+        {  34, 130, 182, 229, 173, 180, 231, 174, 249, 215, 234 },

+        {  10,  97, 153, 226, 164, 178, 232, 175, 250, 215, 241 },

+        {   3,  71, 115, 213, 145, 170, 230, 171, 251, 217, 235 },

+        {   1,  41,  68, 172, 114, 152, 219, 154, 250, 212, 235 },

+        {   1,  16,  27,  88,  90, 135, 155, 113, 235, 180, 216 }

+      }, { /* Coeff Band 3 */

+        {  41, 184, 214, 238, 187, 186, 235, 180, 252, 217, 236 },

+        {  24, 142, 199, 241, 188, 189, 237, 184, 252, 220, 235 },

+        {   6,  97, 159, 235, 172, 184, 239, 185, 252, 221, 243 },

+        {   1,  63, 110, 214, 144, 170, 234, 174, 253, 223, 243 },

+        {   1,  32,  58, 166, 109, 149, 218, 152, 251, 215, 238 },

+        {   1,  12,  21,  78,  85, 131, 152, 109, 236, 180, 224 }

+      }, { /* Coeff Band 4 */

+        {  54, 207, 231, 245, 201, 193, 238, 186, 252, 221, 220 },

+        {  32, 156, 213, 246, 198, 195, 242, 192, 252, 224, 245 },

+        {   7,  98, 164, 240, 177, 187, 243, 193, 252, 227, 244 },

+        {   2,  62, 108, 216, 143, 170, 237, 177, 254, 227, 248 },

+        {   1,  32,  57, 165, 108, 148, 219, 152, 252, 217, 243 },

+        {   1,  13,  22,  79,  87, 132, 153, 109, 240, 182, 232 }

+      }, { /* Coeff Band 5 */

+        {  89, 208, 239, 250, 216, 200, 240, 190, 255, 222, 219 },

+        {  53, 155, 223, 250, 209, 202, 245, 199, 253, 225, 246 },

+        {  12, 102, 170, 243, 183, 192, 246, 198, 254, 230, 255 },

+        {   3,  67, 111, 218, 144, 171, 239, 180, 254, 231, 248 },

+        {   1,  38,  60, 164, 108, 148, 221, 152, 253, 220, 246 },

+        {   1,  18,  26,  81,  88, 132, 157, 108, 245, 188, 241 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        { 205, 121, 244, 237, 187, 188, 229, 174, 248, 215, 228 },

+        { 140, 120, 211, 219, 174, 177, 207, 158, 241, 195, 214 },

+        {  51, 100, 152, 198, 155, 168, 199, 148, 240, 193, 207 }

+      }, { /* Coeff Band 1 */

+        {  66, 196, 236, 247, 202, 197, 243, 193, 254, 228, 246 },

+        {  99, 164, 223, 246, 199, 196, 243, 193, 254, 226, 255 },

+        {  29, 122, 187, 244, 187, 194, 244, 193, 255, 227, 239 },

+        {  14,  95, 145, 234, 156, 181, 244, 194, 254, 229, 246 },

+        {   6,  68,  97, 190, 123, 155, 240, 168, 254, 232, 245 },

+        {   3,  43,  50, 112, 105, 143, 170, 118, 245, 195, 230 }

+      }, { /* Coeff Band 2 */

+        {  66, 202, 238, 248, 206, 199, 245, 196, 254, 233, 244 },

+        {  45, 155, 218, 248, 200, 199, 245, 197, 254, 229, 208 },

+        {   6,  96, 163, 242, 178, 191, 245, 196, 254, 233, 228 },

+        {   2,  64, 110, 224, 142, 175, 242, 185, 254, 232, 247 },

+        {   1,  34,  61, 172, 103, 147, 232, 164, 254, 226, 244 },

+        {   1,  13,  24,  82,  85, 133, 165, 105, 248, 199, 242 }

+      }, { /* Coeff Band 3 */

+        {  66, 204, 242, 251, 213, 204, 248, 204, 255, 236, 255 },

+        {  38, 158, 222, 251, 206, 205, 249, 206, 255, 238, 255 },

+        {   6,  95, 166, 244, 178, 194, 249, 205, 255, 236, 255 },

+        {   2,  61, 111, 223, 141, 173, 244, 187, 255, 237, 255 },

+        {   1,  31,  59, 171, 104, 149, 230, 158, 255, 230, 252 },

+        {   1,  12,  22,  82,  79, 128, 171, 111, 251, 203, 249 }

+      }, { /* Coeff Band 4 */

+        {  63, 214, 245, 252, 219, 208, 249, 206, 255, 241, 128 },

+        {  38, 164, 228, 252, 210, 208, 251, 212, 255, 245, 255 },

+        {   5, 101, 174, 246, 182, 196, 251, 207, 255, 244, 255 },

+        {   1,  64, 116, 224, 142, 174, 246, 190, 255, 241, 228 },

+        {   1,  34,  63, 172, 105, 148, 233, 160, 255, 235, 237 },

+        {   1,  14,  26,  88,  85, 130, 177, 110, 252, 210, 250 }

+      }, { /* Coeff Band 5 */

+        {  91, 214, 246, 254, 226, 213, 251, 210, 255, 239, 255 },

+        {  55, 162, 233, 253, 215, 210, 253, 216, 255, 244, 128 },

+        {  10, 104, 179, 247, 184, 196, 252, 212, 255, 247, 255 },

+        {   2,  67, 119, 226, 143, 173, 249, 195, 255, 245, 255 },

+        {   1,  37,  66, 175, 106, 149, 237, 164, 255, 240, 255 },

+        {   1,  16,  30,  96,  87, 132, 188, 113, 255, 222, 255 }

+      }

   }, { /* block Type 1 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        { 211,  32, 212, 235, 185, 184, 223, 167, 239, 210, 182 },

+        { 121,  47, 171, 224, 171, 180, 211, 162, 238, 195, 221 },

+        {  40,  51, 118, 203, 145, 168, 211, 160, 246, 200, 236 }

+      }, { /* Coeff Band 1 */

+        {  71, 129, 209, 244, 192, 194, 242, 188, 255, 230, 255 },

+        { 118, 122, 206, 244, 192, 192, 241, 187, 254, 227, 255 },

+        {  53, 104, 184, 241, 186, 190, 241, 184, 254, 232, 255 },

+        {  20,  81, 148, 234, 168, 183, 240, 183, 254, 231, 240 },

+        {   3,  47,  82, 197, 127, 160, 234, 166, 254, 228, 251 },

+        {   1,  18,  28,  96,  88, 134, 174, 116, 247, 194, 247 }

+      }, { /* Coeff Band 2 */

+        {  86, 162, 220, 247, 203, 198, 245, 193, 255, 237, 255 },

+        {  84, 134, 216, 247, 201, 197, 244, 192, 255, 233, 255 },

+        {  26, 102, 186, 243, 190, 192, 244, 192, 255, 232, 255 },

+        {   7,  75, 135, 231, 163, 181, 240, 183, 255, 234, 255 },

+        {   1,  46,  79, 193, 121, 157, 233, 168, 255, 225, 242 },

+        {   1,  20,  35, 113,  94, 136, 191, 123, 252, 209, 250 }

+      }, { /* Coeff Band 3 */

+        {  89, 191, 232, 250, 211, 203, 248, 202, 255, 230, 128 },

+        {  67, 148, 223, 250, 207, 201, 250, 207, 255, 247, 255 },

+        {  19, 105, 183, 245, 189, 193, 249, 202, 255, 244, 255 },

+        {   5,  72, 127, 228, 156, 177, 245, 186, 255, 238, 255 },

+        {   1,  44,  76, 190, 119, 156, 234, 167, 255, 231, 255 },

+        {   1,  21,  36, 116,  92, 138, 195, 128, 250, 208, 241 }

+      }, { /* Coeff Band 4 */

+        {  94, 210, 236, 252, 215, 206, 253, 209, 255, 247, 128 },

+        {  68, 153, 224, 251, 209, 204, 251, 213, 255, 240, 128 },

+        {  14, 103, 178, 246, 188, 195, 251, 209, 255, 239, 128 },

+        {   2,  70, 122, 230, 154, 177, 247, 194, 255, 239, 128 },

+        {   1,  42,  72, 189, 115, 153, 234, 166, 255, 229, 255 },

+        {   1,  19,  34, 104,  98, 143, 180, 124, 252, 200, 255 }

+      }, { /* Coeff Band 5 */

+        {  87, 200, 238, 254, 226, 214, 250, 212, 255, 226, 128 },

+        {  55, 151, 225, 253, 217, 212, 253, 217, 255, 233, 128 },

+        {  11, 106, 179, 249, 193, 200, 252, 213, 255, 247, 128 },

+        {   2,  72, 124, 232, 155, 180, 246, 195, 255, 230, 128 },

+        {   1,  42,  70, 182, 114, 153, 232, 163, 255, 236, 255 },

+        {   1,  17,  28,  95,  92, 137, 170, 115, 252, 208, 228 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        { 238,  66, 250, 245, 205, 193, 232, 180, 254, 228, 255 },

+        { 178,  84, 226, 237, 192, 185, 230, 176, 253, 217, 251 },

+        {  76,  83, 168, 218, 166, 173, 225, 162, 252, 220, 243 }

+      }, { /* Coeff Band 1 */

+        { 137, 176, 246, 252, 218, 207, 251, 208, 255, 238, 128 },

+        { 176, 160, 237, 252, 217, 206, 249, 209, 255, 247, 128 },

+        {  68, 128, 205, 251, 209, 207, 251, 207, 255, 248, 128 },

+        {  40, 105, 167, 246, 172, 192, 252, 215, 255, 247, 128 },

+        {  22,  84, 131, 214, 144, 164, 249, 185, 255, 250, 255 },

+        {  11,  60,  91, 161, 130, 155, 194, 133, 253, 214, 255 }

+      }, { /* Coeff Band 2 */

+        { 124, 192, 247, 253, 223, 210, 254, 215, 255, 255, 128 },

+        { 103, 161, 234, 253, 218, 209, 253, 214, 255, 255, 128 },

+        {  19, 108, 190, 250, 202, 202, 251, 213, 255, 241, 128 },

+        {   6,  74, 131, 242, 165, 191, 251, 207, 255, 244, 128 },

+        {   1,  41,  72, 198, 111, 151, 249, 185, 255, 248, 128 },

+        {   1,  14,  24,  82,  90, 140, 185,  96, 254, 224, 255 }

+      }, { /* Coeff Band 3 */

+        { 118, 200, 248, 254, 228, 216, 254, 222, 255, 213, 128 },

+        {  91, 166, 235, 254, 220, 212, 254, 223, 255, 233, 128 },

+        {  16, 110, 186, 251, 197, 201, 255, 225, 255, 255, 128 },

+        {   3,  72, 124, 239, 160, 186, 253, 209, 255, 239, 128 },

+        {   1,  39,  66, 198, 106, 151, 248, 191, 255, 247, 128 },

+        {   1,  14,  19,  94,  74, 124, 209, 109, 255, 245, 128 }

+      }, { /* Coeff Band 4 */

+        { 112, 213, 248, 255, 231, 218, 255, 234, 255, 255, 128 },

+        {  80, 172, 234, 254, 220, 216, 255, 233, 255, 255, 128 },

+        {  11, 112, 182, 251, 195, 204, 255, 231, 255, 224, 128 },

+        {   2,  73, 126, 241, 159, 186, 254, 219, 255, 255, 128 },

+        {   1,  40,  69, 207, 111, 159, 249, 191, 255, 255, 128 },

+        {   1,  16,  24,  83,  78, 138, 230, 134, 255, 239, 128 }

+      }, { /* Coeff Band 5 */

+        { 100, 209, 245, 255, 236, 225, 248, 231, 255, 192, 128 },

+        {  65, 164, 232, 255, 226, 221, 255, 240, 255, 255, 128 },

+        {  11, 117, 186, 253, 203, 209, 255, 240, 255, 255, 128 },

+        {   2,  83, 136, 245, 167, 191, 253, 222, 255, 255, 128 },

+        {   1,  55,  88, 213, 122, 157, 248, 182, 255, 255, 128 },

+        {   1,  10,  38,  58,  85,  43, 198, 107, 255, 255, 128 }

+      }

-  }, { /* block Type 2 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }

-  }, { /* block Type 3 */

-    { /* Coeff Band 0 */

-      { 191,  34, 178, 193, 160, 173, 196, 142, 247, 191, 244 },

-      {  84,  45, 129, 187, 145, 170, 189, 145, 240, 186, 212 },

-      {  14,  36,  69, 149, 120, 154, 177, 136, 231, 177, 196 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1,  76, 169, 226, 167, 180, 227, 171, 247, 218, 226 },

-      {  72,  75, 162, 226, 166, 181, 231, 172, 242, 200, 219 },

-      {  30,  63, 130, 218, 153, 175, 226, 170, 247, 216, 219 },

-      {   5,  39,  67, 156, 119, 151, 194, 140, 239, 202, 216 }

-    }, { /* Coeff Band 2 */

-      {   1,  79, 182, 228, 175, 183, 224, 170, 247, 215, 220 },

-      {  69,  77, 168, 224, 170, 180, 223, 168, 246, 215, 223 },

-      {  24,  63, 126, 209, 153, 171, 219, 160, 247, 215, 225 },

-      {   3,  35,  58, 151, 115, 151, 191, 138, 240, 199, 220 }

-    }, { /* Coeff Band 3 */

-      {   1, 139, 213, 238, 194, 192, 234, 180, 244, 193, 236 },

-      {  82, 127, 204, 238, 190, 186, 234, 175, 244, 191, 235 },

-      {  26,  93, 161, 230, 173, 179, 233, 178, 249, 217, 241 },

-      {   3,  48,  78, 186, 132, 158, 212, 157, 244, 205, 233 }

-    }, { /* Coeff Band 4 */

-      {   1, 100, 208, 233, 180, 182, 238, 175, 250, 206, 225 },

-      {  84,  87, 184, 230, 175, 180, 236, 179, 250, 209, 243 },

-      {  14,  61, 111, 217, 146, 171, 236, 174, 249, 207, 245 },

-      {   1,  32,  49, 150, 106, 142, 212, 145, 242, 191, 237 }

-    }, { /* Coeff Band 5 */

-      {   1, 130, 223, 241, 192, 189, 231, 176, 250, 209, 246 },

-      { 101, 120, 207, 239, 188, 187, 240, 196, 250, 202, 255 },

-      {  19,  90, 155, 232, 169, 181, 238, 190, 250, 207, 249 },

-      {   1,  54,  86, 197, 130, 161, 220, 170, 248, 196, 248 }

-    }, { /* Coeff Band 6 */

-      {   1, 103, 208, 236, 183, 185, 235, 190, 243, 202, 219 },

-      {  95,  92, 185, 230, 175, 181, 233, 174, 242, 203, 225 },

-      {  24,  72, 131, 213, 152, 171, 226, 164, 241, 202, 220 },

-      {   3,  45,  74, 169, 123, 154, 204, 145, 238, 188, 222 }

-    }, { /* Coeff Band 7 */

-      {   1,  63, 236, 247, 205, 194, 241, 189, 252, 222, 255 },

-      { 151,  48, 224, 245, 200, 193, 240, 187, 255, 234, 255 },

-      {  76,  45, 178, 240, 180, 189, 239, 182, 253, 231, 255 },

-      {  38,  31, 111, 187, 125, 154, 217, 155, 253, 214, 255 }

-    }

};

-static const vp9_coeff_probs default_coef_probs_8x8[BLOCK_TYPES_8X8] = {

+static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES] = {

   { /* block Type 0 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 179, 203, 246, 252, 217, 208, 249, 197, 238, 237, 255 },

-      { 136, 193, 232, 247, 202, 199, 245, 194, 255, 235, 255 },

-      {  66, 170, 209, 244, 190, 191, 250, 199, 255, 242, 192 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      {   1, 191, 232, 250, 204, 201, 248, 199, 254, 243, 213 },

-      {  50, 161, 209, 247, 196, 197, 250, 206, 253, 240, 213 },

-      {   6, 118, 160, 239, 173, 186, 249, 203, 254, 235, 255 },

-      {   2,  90, 110, 211, 141, 166, 242, 181, 254, 235, 255 }

-    }, { /* Coeff Band 3 */

-      {   1, 209, 242, 254, 223, 215, 253, 218, 255, 253, 128 },

-      {  58, 168, 227, 253, 216, 211, 254, 226, 255, 251, 128 },

-      {   7, 111, 178, 249, 195, 202, 253, 222, 254, 240, 255 },

-      {   2,  63, 103, 226, 142, 175, 250, 202, 255, 246, 128 }

-    }, { /* Coeff Band 4 */

-      {   1, 207, 241, 252, 213, 205, 252, 215, 255, 228, 255 },

-      {  55, 171, 225, 251, 209, 205, 251, 212, 254, 234, 255 },

-      {   5, 108, 173, 247, 187, 195, 251, 211, 255, 231, 128 },

-      {   2,  56,  97, 220, 138, 169, 248, 191, 253, 237, 255 }

-    }, { /* Coeff Band 5 */

-      {   1, 211, 245, 255, 227, 219, 255, 233, 255, 255, 128 },

-      {  58, 175, 228, 254, 217, 215, 255, 231, 255, 255, 128 },

-      {   6, 124, 181, 249, 191, 199, 255, 222, 255, 251, 128 },

-      {   2,  85, 122, 227, 149, 172, 250, 195, 255, 245, 128 }

-    }, { /* Coeff Band 6 */

-      {   1, 216, 246, 255, 231, 217, 254, 220, 255, 250, 128 },

-      {  74, 177, 236, 254, 222, 214, 254, 221, 255, 255, 128 },

-      {  13, 125, 192, 250, 200, 203, 254, 217, 255, 245, 128 },

-      {   2,  70, 114, 227, 147, 175, 251, 198, 255, 240, 128 }

-    }, { /* Coeff Band 7 */

-      {   1, 199, 246, 255, 238, 229, 255, 226, 255, 255, 128 },

-      { 132, 162, 240, 255, 229, 222, 255, 239, 255, 255, 128 },

-      {  79, 125, 207, 253, 213, 214, 255, 232, 255, 255, 128 },

-      {  41,  89, 149, 240, 161, 187, 250, 216, 255, 255, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        {   8,  26, 101, 170, 141, 159, 166, 138, 205, 164, 158 },

+        {   2,  25,  67, 119, 124, 152, 121, 123, 189, 145, 175 },

+        {   1,  15,  28,  67, 102, 139,  95, 107, 191, 136, 187 }

+      }, { /* Coeff Band 1 */

+        {  22,  73, 118, 160, 137, 157, 175, 132, 242, 184, 229 },

+        {  43,  73, 116, 160, 137, 157, 177, 132, 242, 185, 231 },

+        {  24,  66, 105, 158, 134, 156, 175, 133, 242, 185, 232 },

+        {   9,  54,  85, 150, 126, 153, 175, 132, 242, 185, 231 },

+        {   2,  34,  54, 123, 109, 145, 168, 124, 242, 183, 231 },

+        {   1,  14,  22,  63,  93, 134, 108, 103, 214, 149, 206 }

+      }, { /* Coeff Band 2 */

+        {  34, 123, 149, 186, 148, 163, 195, 143, 245, 195, 233 },

+        {  34, 106, 147, 189, 149, 164, 198, 146, 246, 197, 234 },

+        {  10,  81, 123, 186, 143, 162, 200, 147, 246, 198, 235 },

+        {   2,  56,  87, 170, 127, 156, 201, 143, 248, 202, 234 },

+        {   1,  35,  56, 138, 109, 146, 187, 133, 246, 196, 233 },

+        {   1,  17,  27,  80,  93, 135, 136, 109, 229, 168, 215 }

+      }, { /* Coeff Band 3 */

+        {  27, 159, 171, 208, 161, 171, 211, 155, 249, 205, 239 },

+        {  17, 119, 162, 213, 160, 172, 218, 160, 250, 210, 238 },

+        {   3,  81, 128, 207, 149, 168, 220, 161, 250, 213, 238 },

+        {   1,  53,  87, 183, 128, 158, 217, 153, 251, 214, 239 },

+        {   1,  31,  52, 143, 106, 145, 199, 137, 249, 205, 235 },

+        {   1,  14,  24,  77,  89, 133, 142, 109, 234, 174, 215 }

+      }, { /* Coeff Band 4 */

+        {  24, 189, 200, 224, 177, 178, 221, 164, 250, 212, 234 },

+        {  14, 136, 184, 230, 176, 181, 228, 172, 252, 215, 231 },

+        {   2,  87, 140, 222, 159, 176, 230, 172, 252, 218, 238 },

+        {   1,  54,  90, 193, 130, 161, 223, 160, 252, 217, 241 },

+        {   1,  28,  49, 142, 103, 144, 202, 139, 250, 208, 233 },

+        {   1,  12,  21,  73,  87, 132, 141, 106, 234, 176, 209 }

+      }, { /* Coeff Band 5 */

+        {  32, 220, 227, 242, 199, 190, 234, 180, 251, 220, 232 },

+        {  12, 155, 200, 242, 190, 191, 240, 187, 252, 225, 230 },

+        {   1,  90, 144, 231, 164, 180, 240, 184, 253, 229, 239 },

+        {   1,  53,  90, 198, 130, 162, 230, 165, 253, 226, 238 },

+        {   1,  28,  50, 145, 103, 144, 207, 140, 251, 213, 236 },

+        {   1,  13,  22,  74,  88, 132, 142, 107, 233, 176, 216 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        {   5,  61, 234, 230, 183, 183, 212, 164, 241, 199, 205 },

+        {   3,  65, 184, 199, 164, 170, 182, 145, 232, 175, 223 },

+        {   1,  56, 104, 154, 137, 158, 156, 131, 221, 165, 210 }

+      }, { /* Coeff Band 1 */

+        {  46, 183, 210, 229, 181, 182, 222, 165, 252, 214, 251 },

+        { 122, 166, 202, 228, 179, 181, 223, 164, 252, 217, 250 },

+        {  49, 125, 177, 225, 172, 179, 223, 163, 252, 215, 253 },

+        {  22,  99, 142, 216, 155, 173, 222, 164, 252, 215, 250 },

+        {   8,  69,  95, 180, 127, 156, 220, 153, 252, 214, 250 },

+        {   2,  38,  51, 112, 109, 144, 159, 118, 243, 184, 232 }

+      }, { /* Coeff Band 2 */

+        {  56, 196, 218, 236, 187, 185, 231, 172, 254, 223, 239 },

+        {  38, 141, 195, 235, 182, 185, 233, 174, 254, 225, 232 },

+        {   7,  93, 147, 225, 164, 178, 233, 173, 255, 226, 248 },

+        {   2,  63, 101, 201, 137, 165, 227, 162, 254, 225, 248 },

+        {   1,  39,  61, 159, 110, 148, 213, 146, 254, 218, 247 },

+        {   1,  20,  33,  98,  95, 136, 166, 115, 247, 192, 231 }

+      }, { /* Coeff Band 3 */

+        {  44, 206, 223, 240, 193, 189, 235, 177, 255, 231, 224 },

+        {  27, 147, 200, 240, 188, 189, 238, 181, 255, 229, 239 },

+        {   4,  93, 147, 230, 165, 180, 238, 180, 255, 231, 237 },

+        {   1,  58,  95, 201, 134, 164, 229, 164, 255, 228, 254 },

+        {   1,  32,  52, 152, 105, 146, 212, 142, 254, 221, 255 },

+        {   1,  14,  23,  81,  87, 133, 156, 109, 248, 191, 236 }

+      }, { /* Coeff Band 4 */

+        {  39, 216, 227, 244, 200, 194, 237, 179, 255, 231, 255 },

+        {  22, 152, 204, 243, 192, 193, 240, 186, 255, 231, 240 },

+        {   2,  92, 148, 232, 167, 183, 239, 182, 255, 232, 255 },

+        {   1,  55,  91, 200, 132, 164, 229, 164, 255, 230, 255 },

+        {   1,  28,  47, 144,  99, 142, 211, 141, 255, 222, 251 },

+        {   1,  13,  21,  75,  86, 131, 152, 103, 249, 193, 242 }

+      }, { /* Coeff Band 5 */

+        {  34, 228, 234, 249, 213, 201, 246, 194, 255, 239, 255 },

+        {  13, 161, 208, 247, 198, 197, 248, 197, 255, 243, 255 },

+        {   1,  95, 148, 234, 166, 183, 246, 190, 255, 243, 236 },

+        {   1,  55,  90, 199, 128, 161, 237, 168, 255, 239, 255 },

+        {   1,  30,  51, 147, 102, 144, 218, 142, 255, 232, 254 },

+        {   1,  16,  25,  86,  88, 131, 168, 109, 252, 207, 245 }

+      }

   }, { /* block Type 1 */

-    { /* Coeff Band 0 */

-      { 138,  65, 189, 212, 172, 169, 200, 153, 233, 182, 214 },

-      {  93,  60, 162, 203, 160, 169, 200, 153, 239, 190, 213 },

-      {  66,  55, 141, 195, 152, 166, 199, 152, 238, 190, 212 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 102, 221, 247, 205, 198, 248, 201, 255, 235, 128 },

-      { 122,  95, 215, 247, 200, 197, 248, 200, 254, 227, 255 },

-      {  60,  81, 166, 241, 177, 190, 245, 193, 255, 246, 255 },

-      {  32,  61, 108, 195, 133, 159, 230, 163, 254, 230, 238 }

-    }, { /* Coeff Band 2 */

-      {   1,  58, 203, 242, 194, 193, 229, 177, 253, 225, 249 },

-      { 113,  62, 192, 237, 184, 187, 231, 181, 253, 220, 249 },

-      {  50,  50, 135, 225, 159, 177, 229, 172, 254, 222, 241 },

-      {  24,  34,  82, 185, 125, 152, 223, 158, 253, 212, 219 }

-    }, { /* Coeff Band 3 */

-      {   1,   1, 220, 253, 218, 209, 251, 213, 255, 255, 128 },

-      { 154,   1, 216, 252, 211, 206, 252, 212, 255, 252, 128 },

-      { 102,   1, 157, 249, 184, 200, 253, 214, 255, 247, 128 },

-      {  68,   1, 101, 213, 129, 161, 247, 186, 255, 237, 255 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        { 204,  33, 217, 233, 185, 184, 199, 165, 204, 163, 162 },

+        {  93,  48, 151, 209, 157, 171, 193, 161, 203, 167, 189 },

+        {  18,  43,  86, 173, 126, 156, 203, 149, 231, 193, 200 }

+      }, { /* Coeff Band 1 */

+        {  43, 121, 184, 233, 173, 182, 235, 187, 248, 211, 237 },

+        {  93, 117, 177, 232, 170, 180, 235, 182, 246, 204, 224 },

+        {  33, 101, 158, 229, 165, 179, 235, 182, 245, 207, 236 },

+        {  11,  81, 129, 221, 153, 173, 233, 179, 246, 203, 229 },

+        {   2,  51,  82, 188, 124, 158, 224, 162, 248, 206, 228 },

+        {   1,  18,  29,  88,  93, 137, 141, 116, 222, 161, 217 }

+      }, { /* Coeff Band 2 */

+        {  63, 154, 199, 239, 184, 187, 236, 187, 248, 209, 221 },

+        {  53, 128, 191, 239, 182, 188, 236, 188, 251, 209, 255 },

+        {  14,  99, 160, 235, 172, 184, 235, 187, 249, 207, 240 },

+        {   4,  75, 122, 219, 150, 173, 226, 177, 250, 204, 240 },

+        {   1,  47,  77, 176, 121, 154, 207, 153, 245, 197, 237 },

+        {   1,  18,  30,  84,  95, 136, 138, 112, 229, 167, 228 }

+      }, { /* Coeff Band 3 */

+        {  48, 193, 210, 245, 194, 194, 241, 196, 252, 213, 255 },

+        {  26, 145, 201, 245, 194, 196, 240, 195, 251, 215, 240 },

+        {   6, 104, 165, 241, 179, 190, 239, 191, 253, 222, 255 },

+        {   1,  73, 120, 218, 151, 174, 227, 172, 251, 219, 248 },

+        {   1,  42,  69, 167, 118, 153, 205, 146, 251, 206, 245 },

+        {   1,  16,  27,  84,  89, 133, 148, 112, 240, 179, 238 }

+      }, { /* Coeff Band 4 */

+        {  47, 213, 225, 248, 203, 199, 240, 194, 254, 211, 255 },

+        {  32, 153, 212, 248, 201, 199, 241, 196, 251, 226, 255 },

+        {   6, 102, 168, 240, 181, 190, 240, 187, 251, 225, 238 },

+        {   1,  66, 111, 211, 146, 169, 229, 167, 255, 224, 244 },

+        {   1,  36,  60, 157, 110, 148, 209, 143, 252, 215, 255 },

+        {   1,  16,  27,  83,  90, 133, 152, 111, 244, 184, 250 }

+      }, { /* Coeff Band 5 */

+        {  46, 225, 232, 252, 219, 208, 247, 204, 254, 233, 255 },

+        {  24, 162, 214, 250, 208, 204, 247, 201, 254, 236, 255 },

+        {   3, 106, 165, 242, 182, 191, 245, 196, 255, 231, 255 },

+        {   1,  66, 108, 213, 142, 169, 235, 175, 255, 226, 247 },

+        {   1,  35,  59, 158, 108, 147, 216, 146, 254, 220, 255 },

+        {   1,  16,  27,  85,  90, 131, 159, 110, 248, 191, 252 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        { 229,  28, 245, 227, 195, 182, 200, 145, 253, 186, 255 },

+        { 151,  44, 210, 214, 180, 175, 193, 146, 247, 185, 254 },

+        {  55,  48, 131, 183, 148, 163, 194, 138, 249, 201, 246 }

+      }, { /* Coeff Band 1 */

+        { 126, 165, 239, 250, 206, 204, 248, 193, 255, 255, 128 },

+        { 199, 158, 231, 248, 206, 198, 247, 200, 243, 255, 255 },

+        { 102, 136, 209, 248, 203, 197, 247, 201, 255, 244, 128 },

+        {  64, 116, 181, 245, 185, 196, 248, 201, 255, 233, 128 },

+        {  44,  98, 151, 233, 162, 179, 248, 195, 255, 242, 128 },

+        {  44,  81, 119, 204, 140, 165, 222, 163, 252, 217, 255 }

+      }, { /* Coeff Band 2 */

+        { 108, 185, 239, 252, 216, 209, 248, 205, 255, 230, 128 },

+        {  91, 155, 224, 252, 211, 205, 251, 211, 255, 230, 128 },

+        {  20, 116, 185, 248, 194, 196, 252, 206, 255, 255, 128 },

+        {   8,  86, 141, 239, 168, 185, 248, 196, 255, 247, 128 },

+        {   3,  50,  92, 206, 125, 164, 242, 176, 255, 246, 128 },

+        {   1,  21,  40, 131,  85, 141, 200, 131, 247, 236, 255 }

+      }, { /* Coeff Band 3 */

+        {  94, 198, 243, 254, 226, 215, 254, 220, 255, 255, 128 },

+        {  67, 164, 228, 253, 217, 208, 250, 216, 255, 213, 128 },

+        {  14, 120, 185, 250, 196, 205, 248, 205, 255, 255, 128 },

+        {   4,  83, 134, 238, 161, 181, 250, 202, 255, 233, 128 },

+        {   1,  48,  82, 196, 119, 157, 248, 178, 255, 255, 128 },

+        {   1,  26,  38,  96,  84, 132, 221, 110, 255, 209, 128 }

+      }, { /* Coeff Band 4 */

+        {  82, 210, 245, 255, 230, 215, 246, 221, 255, 255, 128 },

+        {  55, 170, 231, 254, 222, 213, 255, 220, 255, 255, 128 },

+        {   8, 118, 184, 251, 200, 207, 255, 219, 255, 255, 128 },

+        {   2,  78, 126, 239, 156, 185, 251, 216, 255, 255, 128 },

+        {   1,  43,  68, 189, 108, 151, 247, 187, 255, 228, 128 },

+        {   1,  34,  40, 121, 114, 102, 205,  96, 255, 255, 128 }

+      }, { /* Coeff Band 5 */

+        {  65, 228, 241, 255, 231, 214, 253, 222, 255, 255, 128 },

+        {  33, 173, 226, 254, 222, 216, 255, 231, 255, 255, 128 },

+        {   5, 120, 180, 251, 197, 205, 251, 226, 255, 233, 128 },

+        {   1,  81, 130, 240, 159, 187, 251, 206, 255, 205, 128 },

+        {   1,  51,  78, 198, 119, 168, 238, 181, 255, 171, 128 },

+        {   1,  18,  49, 183, 119, 160, 255, 171, 128, 128, 128 }

+      }

-  }, { /* block Type 2 */

-    { /* Coeff Band 0 */

-      { 229,  64, 235, 236, 189, 190, 227, 179, 247, 203, 226 },

-      { 148,  70, 194, 228, 175, 182, 216, 170, 238, 192, 224 },

-      {  53,  63, 134, 207, 150, 169, 213, 161, 247, 204, 232 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 173, 234, 244, 201, 193, 239, 180, 252, 214, 255 },

-      { 160, 156, 222, 243, 200, 193, 237, 179, 253, 216, 255 },

-      {  55, 119, 187, 240, 189, 192, 236, 180, 253, 226, 255 },

-      {  14,  65, 105, 193, 142, 165, 205, 151, 249, 200, 250 }

-    }, { /* Coeff Band 2 */

-      {   1, 124, 218, 246, 195, 196, 242, 198, 254, 229, 255 },

-      {  85, 114, 180, 240, 179, 187, 239, 191, 253, 223, 239 },

-      {  18,  81, 128, 220, 152, 173, 232, 176, 252, 221, 254 },

-      {   2,  42,  64, 150, 115, 149, 192, 137, 247, 197, 247 }

-    }, { /* Coeff Band 3 */

-      {   1, 164, 230, 251, 210, 204, 245, 201, 255, 238, 255 },

-      {  96, 137, 210, 248, 199, 199, 244, 198, 254, 218, 255 },

-      {  20,  97, 169, 240, 179, 188, 242, 190, 254, 228, 255 },

-      {   2,  58,  95, 197, 137, 164, 220, 158, 252, 217, 248 }

-    }, { /* Coeff Band 4 */

-      {   1, 193, 236, 245, 203, 194, 243, 191, 254, 223, 255 },

-      {  86, 163, 217, 241, 190, 188, 242, 189, 253, 220, 255 },

-      {  14, 108, 161, 228, 167, 178, 238, 180, 253, 224, 255 },

-      {   1,  51,  84, 186, 127, 159, 216, 155, 251, 208, 243 }

-    }, { /* Coeff Band 5 */

-      {   1, 183, 235, 248, 209, 197, 244, 195, 253, 236, 239 },

-      {  79, 144, 208, 243, 193, 190, 244, 191, 254, 231, 255 },

-      {  13, 100, 151, 227, 163, 176, 240, 180, 255, 233, 244 },

-      {   1,  48,  77, 171, 121, 153, 214, 150, 252, 214, 245 }

-    }, { /* Coeff Band 6 */

-      {   1, 202, 234, 252, 215, 207, 248, 207, 254, 242, 255 },

-      {  75, 153, 216, 249, 203, 201, 248, 203, 255, 239, 255 },

-      {  11, 104, 168, 241, 179, 189, 245, 194, 255, 237, 128 },

-      {   1,  57,  95, 201, 134, 163, 229, 165, 254, 223, 246 }

-    }, { /* Coeff Band 7 */

-      {   1, 184, 236, 254, 222, 212, 254, 225, 255, 255, 128 },

-      {  74, 149, 220, 252, 210, 208, 253, 223, 255, 249, 128 },

-      {  18, 109, 175, 247, 184, 195, 253, 211, 255, 250, 128 },

-      {   3,  64, 113, 219, 144, 171, 246, 187, 255, 250, 128 }

-    }

-  }, { /* block Type 3 */

-    { /* Coeff Band 0 */

-      { 140, 101, 214, 227, 176, 182, 218, 167, 233, 205, 164 },

-      {  96, 101, 176, 204, 161, 173, 193, 152, 223, 182, 182 },

-      {  27,  84, 123, 176, 140, 162, 190, 142, 238, 189, 210 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 178, 218, 240, 189, 189, 238, 184, 250, 232, 189 },

-      {  69, 146, 204, 239, 187, 189, 238, 183, 251, 226, 221 },

-      {  16,  98, 157, 234, 170, 185, 237, 183, 252, 220, 218 },

-      {   3,  49,  78, 172, 122, 154, 204, 150, 242, 198, 207 }

-    }, { /* Coeff Band 2 */

-      {   1, 165, 207, 230, 179, 181, 234, 172, 252, 228, 218 },

-      {  25, 130, 175, 224, 169, 177, 232, 169, 252, 230, 207 },

-      {   4,  81, 118, 205, 144, 167, 227, 162, 252, 225, 219 },

-      {   2,  51,  63, 150, 114, 148, 197, 138, 244, 202, 204 }

-    }, { /* Coeff Band 3 */

-      {   1, 181, 222, 247, 200, 197, 246, 199, 252, 232, 228 },

-      {  25, 142, 200, 244, 190, 193, 245, 195, 253, 233, 204 },

-      {   3,  90, 146, 233, 166, 181, 242, 188, 252, 229, 216 },

-      {   1,  47,  79, 188, 124, 157, 222, 162, 245, 213, 203 }

-    }, { /* Coeff Band 4 */

-      {   1, 179, 220, 242, 195, 191, 237, 182, 251, 217, 231 },

-      {  27, 144, 200, 241, 188, 190, 238, 185, 250, 224, 235 },

-      {   3,  93, 149, 230, 166, 180, 235, 180, 249, 222, 221 },

-      {   1,  47,  79, 181, 125, 157, 211, 154, 241, 205, 198 }

-    }, { /* Coeff Band 5 */

-      {   1, 176, 222, 247, 202, 198, 247, 199, 252, 234, 219 },

-      {  24, 139, 197, 244, 190, 192, 246, 196, 253, 232, 220 },

-      {   2,  89, 140, 229, 161, 178, 243, 185, 253, 233, 234 },

-      {   1,  49,  76, 176, 121, 154, 214, 153, 243, 209, 208 }

-    }, { /* Coeff Band 6 */

-      {   1, 197, 233, 251, 213, 205, 247, 206, 249, 222, 247 },

-      {  35, 159, 216, 249, 203, 201, 246, 203, 250, 222, 223 },

-      {   4, 108, 167, 240, 178, 188, 244, 195, 248, 220, 235 },

-      {   1,  58,  93, 198, 133, 161, 220, 167, 233, 195, 221 }

-    }, { /* Coeff Band 7 */

-      {   1, 188, 240, 253, 221, 209, 248, 207, 252, 223, 255 },

-      {  84, 153, 227, 251, 212, 205, 247, 205, 254, 215, 255 },

-      {  25, 117, 182, 244, 186, 192, 243, 198, 250, 209, 255 },

-      {   7,  72, 108, 197, 138, 162, 203, 161, 240, 178, 247 }

-    }

};

-static const vp9_coeff_probs default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] = {

+static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES] = {

   { /* block Type 0 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        {  37,  34, 137, 205, 154, 170, 151, 159, 109, 172,  44 },

+        {   3,  26,  60, 113, 123, 154, 100, 124, 152, 131, 144 },

+        {   1,  13,  23,  54, 102, 139,  71, 106, 146, 123, 148 }

+      }, { /* Coeff Band 1 */

+        {  26,  77, 122, 152, 144, 160, 143, 129, 216, 158, 201 },

+        {  43,  76, 123, 152, 142, 159, 145, 129, 218, 160, 204 },

+        {  25,  67, 112, 150, 141, 159, 144, 128, 218, 159, 204 },

+        {   9,  54,  90, 143, 134, 156, 144, 127, 218, 159, 204 },

+        {   2,  32,  52, 116, 114, 148, 138, 123, 217, 158, 207 },

+        {   1,  10,  15,  44,  91, 133,  75,  99, 172, 128, 169 }

+      }, { /* Coeff Band 2 */

+        {  32, 122, 143, 163, 145, 161, 162, 131, 226, 171, 206 },

+        {  46, 105, 143, 168, 148, 161, 165, 133, 228, 174, 204 },

+        {  17,  79, 116, 164, 142, 161, 166, 134, 229, 174, 206 },

+        {   4,  53,  78, 143, 125, 153, 163, 129, 232, 175, 213 },

+        {   1,  29,  44, 105, 105, 142, 147, 120, 228, 168, 211 },

+        {   1,  12,  18,  52,  91, 133,  92, 100, 193, 140, 183 }

+      }, { /* Coeff Band 3 */

+        {  33, 157, 160, 182, 149, 163, 185, 141, 236, 185, 218 },

+        {  20, 116, 152, 188, 152, 165, 191, 144, 238, 188, 217 },

+        {   4,  74, 114, 180, 141, 162, 192, 143, 240, 191, 219 },

+        {   1,  44,  69, 148, 119, 151, 183, 134, 243, 192, 227 },

+        {   1,  25,  40, 110, 101, 141, 162, 121, 238, 181, 223 },

+        {   1,  12,  18,  56,  89, 132, 103, 101, 206, 148, 196 }

+      }, { /* Coeff Band 4 */

+        {  25, 183, 174, 207, 159, 171, 205, 156, 243, 194, 228 },

+        {  13, 124, 159, 209, 157, 171, 213, 160, 243, 200, 228 },

+        {   2,  75, 117, 199, 143, 166, 215, 158, 246, 205, 230 },

+        {   1,  45,  73, 165, 119, 153, 204, 144, 248, 205, 231 },

+        {   1,  26,  43, 120, 101, 141, 178, 127, 242, 192, 226 },

+        {   1,  12,  19,  59,  89, 132, 112, 102, 215, 154, 201 }

+      }, { /* Coeff Band 5 */

+        {  13, 232, 223, 239, 196, 188, 225, 172, 248, 209, 226 },

+        {   4, 155, 187, 237, 184, 187, 233, 180, 250, 216, 232 },

+        {   1,  86, 131, 222, 156, 175, 233, 176, 251, 218, 237 },

+        {   1,  49,  79, 181, 123, 157, 218, 155, 251, 214, 237 },

+        {   1,  26,  43, 125, 100, 141, 188, 130, 246, 199, 231 },

+        {   1,  12,  20,  62,  88, 131, 119, 102, 222, 161, 209 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        {  51,  37, 227, 237, 205, 184, 200, 162, 231, 187, 207 },

+        {   9,  36, 172, 204, 176, 173, 171, 145, 217, 167, 197 },

+        {  21,  26, 112, 162, 145, 162, 155, 133, 215, 165, 191 }

+      }, { /* Coeff Band 1 */

+        {  79, 169, 219, 223, 176, 177, 222, 161, 248, 213, 244 },

+        { 177, 166, 216, 222, 175, 178, 222, 161, 246, 212, 226 },

+        { 119, 141, 196, 222, 174, 176, 220, 163, 250, 212, 236 },

+        {  63, 117, 165, 217, 163, 175, 218, 161, 248, 209, 231 },

+        {  30,  87, 117, 192, 138, 162, 216, 157, 247, 211, 224 },

+        {  14,  56,  60, 119, 111, 146, 156, 123, 227, 171, 220 }

+      }, { /* Coeff Band 2 */

+        {  88, 195, 225, 229, 181, 181, 229, 171, 252, 212, 221 },

+        {  66, 145, 202, 229, 177, 180, 230, 172, 253, 220, 255 },

+        {  12,  97, 152, 221, 162, 174, 230, 169, 253, 218, 249 },

+        {   3,  66, 103, 198, 138, 165, 223, 159, 253, 219, 251 },

+        {   1,  38,  61, 158, 110, 148, 209, 146, 252, 212, 238 },

+        {   1,  19,  30,  94,  94, 136, 160, 114, 244, 185, 236 }

+      }, { /* Coeff Band 3 */

+        {  79, 211, 228, 235, 186, 184, 233, 176, 255, 225, 255 },

+        {  50, 151, 205, 235, 182, 185, 237, 177, 254, 223, 255 },

+        {   7,  95, 149, 225, 162, 176, 236, 177, 254, 229, 219 },

+        {   1,  62,  98, 198, 134, 164, 228, 162, 254, 224, 238 },

+        {   1,  35,  57, 156, 108, 148, 211, 143, 253, 215, 238 },

+        {   1,  17,  26,  87,  89, 135, 161, 113, 246, 189, 237 }

+      }, { /* Coeff Band 4 */

+        {  68, 225, 230, 239, 190, 187, 238, 180, 252, 234, 255 },

+        {  39, 156, 206, 239, 185, 187, 241, 187, 254, 231, 255 },

+        {   4,  94, 147, 229, 163, 178, 242, 183, 255, 236, 224 },

+        {   1,  58,  94, 200, 132, 163, 232, 166, 254, 230, 255 },

+        {   1,  32,  52, 153, 104, 146, 214, 144, 253, 222, 236 },

+        {   1,  15,  24,  84,  89, 131, 159, 109, 247, 192, 240 }

+      }, { /* Coeff Band 5 */

+        {  45, 248, 234, 248, 208, 198, 244, 193, 255, 233, 255 },

+        {  19, 169, 204, 246, 195, 195, 246, 199, 255, 233, 255 },

+        {   2,  98, 145, 235, 166, 183, 245, 192, 255, 235, 255 },

+        {   1,  59,  92, 205, 131, 164, 236, 172, 254, 231, 250 },

+        {   1,  33,  52, 152, 103, 145, 216, 144, 253, 221, 240 },

+        {   1,  15,  24,  83,  87, 133, 156, 110, 246, 191, 242 }

+      }

   }, { /* block Type 1 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+    { /* Intra */

+      { /* Coeff Band 0 */

+        { 179,  23, 200, 222, 180, 182, 150, 152, 148, 135, 125 },

+        {  60,  33, 113, 185, 143, 166, 168, 144, 189, 168, 152 },

+        {   8,  31,  59, 137, 114, 150, 163, 132, 206, 171, 169 }

+      }, { /* Coeff Band 1 */

+        {  27, 103, 158, 215, 157, 174, 209, 165, 239, 191, 233 },

+        {  90, 101, 159, 213, 156, 173, 212, 164, 230, 185, 237 },

+        {  39,  91, 146, 212, 155, 169, 212, 165, 232, 186, 207 },

+        {  16,  75, 120, 203, 144, 169, 210, 161, 233, 189, 227 },

+        {   3,  48,  76, 167, 120, 154, 199, 146, 236, 190, 218 },

+        {   1,  18,  26,  72,  95, 137, 113, 109, 197, 146, 186 }

+      }, { /* Coeff Band 2 */

+        {  45, 137, 177, 218, 166, 174, 206, 163, 234, 184, 214 },

+        {  47, 117, 167, 218, 166, 176, 206, 164, 234, 182, 229 },

+        {  16,  90, 136, 211, 153, 172, 205, 162, 236, 192, 231 },

+        {   6,  65, 100, 188, 136, 162, 193, 155, 237, 177, 228 },

+        {   1,  37,  58, 137, 113, 150, 166, 134, 229, 167, 234 },

+        {   1,  13,  19,  55,  90, 132,  93, 103, 196, 137, 202 }

+      }, { /* Coeff Band 3 */

+        {  36, 171, 194, 227, 177, 179, 208, 165, 244, 196, 245 },

+        {  19, 129, 178, 227, 175, 184, 214, 165, 246, 188, 255 },

+        {   5,  90, 139, 217, 158, 174, 213, 166, 246, 198, 255 },

+        {   1,  59,  93, 182, 134, 162, 193, 150, 242, 188, 241 },

+        {   1,  31,  49, 122, 108, 145, 160, 127, 235, 172, 229 },

+        {   1,  10,  18,  54,  89, 132, 101,  99, 213, 144, 217 }

+      }, { /* Coeff Band 4 */

+        {  37, 197, 210, 233, 187, 186, 216, 172, 250, 202, 255 },

+        {  20, 142, 191, 234, 183, 186, 219, 170, 249, 207, 246 },

+        {   3,  93, 144, 222, 163, 176, 219, 170, 249, 204, 224 },

+        {   1,  56,  88, 179, 130, 159, 199, 148, 246, 197, 243 },

+        {   1,  29,  47, 123, 104, 144, 172, 127, 244, 185, 234 },

+        {   1,  14,  22,  66,  91, 130, 120, 103, 225, 158, 221 }

+      }, { /* Coeff Band 5 */

+        {  19, 227, 223, 245, 203, 194, 238, 187, 251, 225, 217 },

+        {   6, 152, 192, 242, 189, 190, 241, 190, 253, 225, 255 },

+        {   1,  89, 138, 228, 161, 177, 239, 181, 254, 224, 248 },

+        {   1,  52,  84, 188, 127, 157, 224, 159, 253, 222, 247 },

+        {   1,  29,  47, 132, 102, 140, 196, 132, 251, 208, 244 },

+        {   1,  14,  23,  71,  90, 133, 134, 103, 239, 174, 233 }

+      }

+    }, { /* Inter */

+      { /* Coeff Band 0 */

+        { 205,  14, 245, 235, 216, 189, 190, 146, 249, 201, 255 },

+        {  97,  19, 213, 210, 194, 174, 176, 139, 241, 183, 250 },

+        {  31,  20, 144, 183, 160, 167, 171, 132, 240, 184, 253 }

+      }, { /* Coeff Band 1 */

+        { 137, 182, 245, 254, 221, 216, 255, 160, 128, 128, 128 },

+        { 231, 185, 242, 251, 218, 205, 255, 233, 128, 128, 128 },

+        { 170, 175, 229, 252, 205, 209, 255, 211, 128, 128, 128 },

+        { 107, 157, 213, 250, 199, 205, 251, 207, 255, 255, 128 },

+        {  77, 126, 183, 243, 182, 183, 252, 206, 255, 255, 128 },

+        {  69,  96, 149, 229, 157, 170, 247, 169, 255, 255, 128 }

+      }, { /* Coeff Band 2 */

+        { 107, 196, 241, 252, 211, 208, 255, 210, 128, 128, 128 },

+        {  92, 162, 221, 249, 203, 195, 255, 199, 128, 128, 128 },

+        {  20, 108, 181, 244, 190, 191, 250, 200, 255, 255, 128 },

+        {   7,  80, 132, 241, 172, 197, 253, 191, 255, 255, 128 },

+        {   2,  43,  75, 219, 122, 150, 255, 203, 128, 128, 128 },

+        {   1,  15,  48,  98,  51, 192, 255, 160, 128, 128, 128 }

+      }, { /* Coeff Band 3 */

+        { 107, 202, 244, 254, 226, 215, 255, 192, 128, 128, 128 },

+        {  77, 167, 224, 252, 215, 212, 255, 235, 128, 128, 128 },

+        {  14, 117, 179, 249, 191, 196, 255, 212, 128, 128, 128 },

+        {   3,  84, 134, 237, 160, 194, 248, 216, 255, 255, 128 },

+        {   1,  57,  84, 216, 145, 136, 255, 161, 128, 128, 128 },

+        {   1,   1,   1, 255, 128, 255, 128, 128, 128, 128, 128 }

+      }, { /* Coeff Band 4 */

+        {  88, 219, 248, 255, 239, 225, 255, 255, 128, 128, 128 },

+        {  61, 178, 234, 255, 227, 227, 255, 217, 128, 128, 128 },

+        {   6, 127, 188, 252, 201, 211, 255, 244, 128, 128, 128 },

+        {   1,  83, 130, 248, 173, 197, 255, 175, 128, 128, 128 },

+        {   1,  61,  66, 211, 121, 188, 255, 213, 128, 128, 128 },

+        { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+      }, { /* Coeff Band 5 */

+        {  73, 243, 250, 255, 244, 220, 255, 205, 128, 128, 128 },

+        {  42, 197, 242, 255, 237, 227, 242, 166, 255, 255, 128 },

+        {  10, 137, 197, 252, 214, 199, 255, 238, 128, 128, 128 },

+        {   2,  85, 134, 242, 163, 185, 224, 238, 255, 255, 128 },

+        {   1,  70,  69, 199, 110,  64, 255, 213, 128, 128, 128 },

+        {   1,   1,   1,   1, 128, 128, 255,   1, 128, 128, 128 }

+      }

-  }, { /* block Type 2 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+  }

+};

+#if CONFIG_CODE_NONZEROCOUNT

+// TODO(debargha): Remove the macro and count tables after experimentation

+#define NZC_DEFAULT_COUNTS  /* Uncomment to use counts as defaults */

+#ifdef NZC_DEFAULT_COUNTS

+static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]

+                                                [REF_TYPES]

+                                                [BLOCK_TYPES]

+                                                [NZC4X4_TOKENS] = {

+  {

+    {

+      { 967652, 29023, 15039, 6952, 1568, 116 },

+      { 289116, 22938, 4522, 1935, 520, 47 }

+    }, {

+      { 967652, 29023, 15039, 6952, 1568, 116 },

+      { 689116, 22938, 4522, 1935, 520, 47 }

+    },

+  }, {

+    {

+      { 124684, 37167, 15270, 8483, 1777, 102 },

+      { 10405, 12395, 3401, 3574, 2461, 771 }

+    }, {

+      { 124684, 37167, 15270, 8483, 1777, 102 },

+      { 20405, 12395, 3401, 3574, 2461, 771 }

-  }, { /* block Type 3 */

-    { /* Coeff Band 0 */

-      { 118,  27, 105, 170, 137, 166, 183, 137, 243, 189, 241 },

-      {  44,  34,  85, 142, 127, 158, 161, 128, 232, 174, 213 },

-      {   8,  26,  47, 104, 108, 145, 143, 117, 226, 168, 207 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 134, 172, 217, 163, 175, 226, 167, 251, 220, 204 },

-      {  56, 129, 168, 217, 161, 174, 223, 164, 249, 218, 223 },

-      {  20, 110, 151, 215, 158, 174, 221, 165, 249, 209, 221 },

-      {   2,  59,  88, 169, 128, 157, 192, 143, 239, 189, 214 }

-    }, { /* Coeff Band 2 */

-      {   1,  65, 126, 191, 140, 163, 218, 153, 252, 218, 229 },

-      {  21,  57,  92, 175, 126, 156, 214, 148, 252, 218, 229 },

-      {   4,  44,  66, 148, 114, 148, 200, 136, 251, 211, 228 },

-      {   1,  28,  42, 108, 104, 141, 158, 119, 235, 180, 210 }

-    }, { /* Coeff Band 3 */

-      {   1, 114, 172, 227, 166, 177, 236, 178, 252, 226, 233 },

-      {  41,  94, 152, 218, 156, 172, 233, 172, 251, 223, 231 },

-      {   9,  69, 116, 202, 142, 165, 226, 162, 251, 221, 227 },

-      {   1,  36,  60, 151, 113, 148, 195, 140, 241, 198, 211 }

-    }, { /* Coeff Band 4 */

-      {   1, 186, 200, 227, 174, 178, 230, 169, 248, 210, 238 },

-      {  27, 148, 181, 221, 167, 176, 226, 166, 250, 218, 228 },

-      {   3,  96, 139, 208, 154, 170, 219, 161, 249, 214, 229 },

-      {   1,  44,  70, 156, 120, 152, 188, 139, 239, 193, 200 }

-    }, { /* Coeff Band 5 */

-      {   1, 169, 203, 238, 186, 186, 238, 184, 252, 224, 230 },

-      {  32, 119, 173, 232, 172, 181, 236, 182, 252, 222, 237 },

-      {   6,  84, 128, 215, 150, 170, 232, 172, 251, 221, 235 },

-      {   1,  49,  78, 167, 124, 154, 200, 145, 243, 198, 217 }

-    }, { /* Coeff Band 6 */

-      {   1, 193, 215, 244, 197, 195, 239, 192, 249, 213, 240 },

-      {  52, 136, 193, 239, 184, 189, 237, 189, 248, 211, 226 },

-      {  13,  90, 146, 227, 162, 178, 233, 182, 248, 211, 231 },

-      {   1,  49,  79, 177, 124, 156, 201, 154, 234, 188, 212 }

-    }, { /* Coeff Band 7 */

-      {   1, 189, 238, 248, 219, 196, 232, 180, 253, 211, 255 },

-      { 104, 148, 224, 245, 211, 194, 225, 171, 251, 206, 255 },

-      {  43, 116, 190, 231, 179, 183, 217, 168, 249, 199, 255 },

-      {  13,  65,  92, 154, 131, 152, 167, 132, 238, 174, 243 }

+  }, {

+    {

+      { 4100, 22976, 15627, 16137, 7982, 1793 },

+      { 4249, 3084, 2131, 4081, 6439, 1653 }

+    }, {

+      { 21100, 22976, 15627, 16137, 7982, 1793 },

+      { 4249, 3084, 2131, 4081, 2439, 1653 }

};

-static const vp9_coeff_probs default_coef_probs_16x16[BLOCK_TYPES_16X16] = {

-  { /* block Type 0 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]

+                                                [REF_TYPES]

+                                                [BLOCK_TYPES]

+                                                [NZC8X8_TOKENS] = {

+  {

+    {

+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },

+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 5 },

+    }, {

+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10 },

+      { 192052, 30468, 6973, 3250, 1500, 750, 375, 5 },

-  }, { /* block Type 1 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+  }, {

+    {

+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },

+      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },

+    }, {

+      { 121533, 33527, 15655, 11920, 5723, 2009, 315, 7 },

+      { 23772, 23120, 13127, 8115, 4000, 2000, 200, 6 },

-  }, { /* block Type 2 */

-    { /* Coeff Band 0 */

-      { 223,  34, 236, 234, 193, 185, 216, 169, 239, 189, 229 },

-      { 125,  40, 195, 221, 173, 175, 209, 165, 220, 181, 196 },

-      {  41,  37, 127, 185, 145, 162, 191, 150, 227, 180, 219 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 160, 224, 239, 193, 190, 213, 178, 244, 174, 255 },

-      { 199, 154, 212, 238, 190, 190, 210, 173, 246, 183, 249 },

-      {  88, 122, 178, 234, 180, 187, 213, 174, 244, 182, 247 },

-      {  27,  69, 100, 174, 139, 165, 159, 142, 225, 157, 240 }

-    }, { /* Coeff Band 2 */

-      {   1, 118, 207, 237, 179, 185, 234, 189, 241, 194, 237 },

-      {  86, 103, 161, 227, 163, 176, 231, 183, 241, 196, 234 },

-      {  19,  69, 113, 205, 140, 166, 220, 169, 240, 188, 242 },

-      {   3,  32,  49, 106, 111, 144, 132, 121, 225, 151, 237 }

-    }, { /* Coeff Band 3 */

-      {   1, 160, 218, 245, 197, 195, 235, 189, 254, 218, 255 },

-      {  90, 127, 193, 240, 186, 189, 235, 187, 251, 217, 230 },

-      {  18,  92, 148, 229, 164, 179, 228, 180, 254, 212, 229 },

-      {   2,  50,  79, 163, 126, 156, 186, 140, 247, 191, 236 }

-    }, { /* Coeff Band 4 */

-      {   1, 196, 231, 240, 203, 191, 225, 171, 253, 214, 255 },

-      {  71, 167, 210, 234, 194, 188, 218, 165, 253, 215, 236 },

-      {  11, 119, 165, 217, 171, 177, 213, 155, 252, 209, 255 },

-      {   1,  46,  70, 145, 121, 153, 180, 131, 249, 192, 246 }

-    }, { /* Coeff Band 5 */

-      {   1, 176, 223, 242, 202, 194, 222, 169, 253, 211, 244 },

-      {  62, 131, 191, 233, 185, 186, 219, 164, 251, 211, 252 },

-      {   7,  89, 133, 207, 156, 173, 211, 157, 251, 206, 247 },

-      {   1,  36,  56, 127, 113, 147, 166, 125, 243, 183, 242 }

-    }, { /* Coeff Band 6 */

-      {   1, 203, 232, 249, 213, 202, 245, 193, 254, 237, 255 },

-      {  51, 155, 212, 245, 199, 195, 244, 192, 254, 234, 255 },

-      {   7, 101, 158, 233, 170, 181, 244, 185, 253, 242, 255 },

-      {   1,  49,  82, 185, 123, 157, 226, 156, 252, 225, 240 }

-    }, { /* Coeff Band 7 */

-      {   1, 222, 233, 252, 220, 207, 247, 206, 255, 240, 128 },

-      {  40, 159, 216, 250, 205, 201, 248, 207, 249, 219, 255 },

-      {   6, 106, 163, 240, 176, 188, 247, 198, 251, 222, 255 },

-      {   1,  51,  88, 196, 127, 159, 232, 169, 252, 214, 255 }

+  }, {

+    {

+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },

+      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },

+    }, {

+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17 },

+      { 11612, 13874, 13329, 13022, 6500, 3250, 300, 12 },

-  }, { /* block Type 3 */

-    { /* Coeff Band 0 */

-      {  14,  78, 225, 217, 173, 181, 198, 153, 228, 185, 176 },

-      {   9,  74, 179, 191, 157, 171, 178, 143, 229, 175, 209 },

-      {   3,  48,  92, 128, 130, 155, 135, 123, 220, 155, 219 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 178, 209, 214, 173, 175, 208, 152, 252, 210, 237 },

-      { 142, 151, 193, 212, 170, 175, 209, 151, 251, 208, 237 },

-      {  38, 105, 150, 206, 159, 173, 208, 151, 250, 209, 238 },

-      {   5,  44,  61, 128, 114, 147, 167, 125, 239, 184, 217 }

-    }, { /* Coeff Band 2 */

-      {   1, 154, 195, 202, 166, 173, 184, 144, 245, 184, 236 },

-      {  49, 110, 150, 188, 155, 168, 180, 141, 244, 183, 239 },

-      {   4,  63,  90, 158, 132, 157, 171, 134, 243, 179, 239 },

-      {   1,  25,  37,  93, 104, 141, 133, 114, 231, 161, 226 }

-    }, { /* Coeff Band 3 */

-      {   1, 184, 201, 223, 173, 177, 224, 164, 253, 220, 238 },

-      {  42, 127, 170, 215, 164, 173, 223, 162, 253, 219, 233 },

-      {   4,  75, 114, 195, 142, 164, 218, 155, 253, 217, 235 },

-      {   1,  32,  50, 128, 108, 144, 180, 127, 247, 197, 219 }

-    }, { /* Coeff Band 4 */

-      {   1, 190, 207, 232, 181, 184, 228, 172, 251, 216, 212 },

-      {  35, 136, 180, 227, 173, 180, 227, 171, 251, 216, 218 },

-      {   2,  85, 131, 214, 154, 173, 224, 166, 250, 214, 225 },

-      {   1,  44,  71, 162, 120, 153, 195, 143, 240, 195, 197 }

-    }, { /* Coeff Band 5 */

-      {   1, 185, 201, 230, 177, 180, 232, 172, 253, 225, 235 },

-      {  27, 122, 165, 221, 164, 175, 230, 169, 253, 224, 220 },

-      {   1,  72, 108, 197, 139, 163, 224, 159, 253, 224, 226 },

-      {   1,  33,  51, 132, 107, 144, 186, 130, 245, 201, 206 }

-    }, { /* Coeff Band 6 */

-      {   1, 203, 214, 240, 193, 191, 235, 178, 252, 225, 224 },

-      {  20, 140, 188, 235, 182, 186, 234, 177, 252, 226, 226 },

-      {   1,  85, 132, 218, 155, 174, 230, 170, 251, 224, 227 },

-      {   1,  39,  62, 154, 114, 150, 199, 141, 241, 203, 214 }

-    }, { /* Coeff Band 7 */

-      {   1, 217, 224, 244, 202, 193, 241, 187, 252, 227, 239 },

-      {  22, 151, 200, 239, 187, 188, 240, 184, 252, 226, 237 },

-      {   2,  90, 138, 222, 158, 174, 237, 176, 252, 226, 239 },

-      {   1,  41,  66, 163, 116, 151, 206, 146, 243, 201, 230 }

-    }

};

-static const vp9_coeff_probs default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] = {

-  { /* block Type 0 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]

+                                                  [REF_TYPES]

+                                                  [BLOCK_TYPES]

+                                                  [NZC16X16_TOKENS] = {

+  {

+    {

+      { 372988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },

+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },

+    }, {

+      { 32988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2 },

+      { 92052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1 },

-  }, { /* block Type 1 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+  }, {

+    {

+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },

+      { 47772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },

+    }, {

+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2 },

+      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2 },

-  }, { /* block Type 2 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+  }, {

+    {

+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },

+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },

+    }, {

+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5 },

+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3 },

-  }, { /* block Type 3 */

-    { /* Coeff Band 0 */

-      {   3,  29,  86, 140, 130, 163, 135, 131, 190, 148, 186 },

-      {   1,  26,  61, 105, 124, 156, 105, 119, 178, 138, 173 },

-      {   1,  15,  28,  60, 105, 142,  80, 105, 173, 128, 178 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 130, 142, 172, 141, 161, 191, 140, 244, 193, 216 },

-      {  61, 124, 141, 173, 141, 161, 190, 139, 244, 194, 215 },

-      {  28, 103, 124, 171, 138, 160, 190, 140, 243, 194, 225 },

-      {   1,  36,  51, 111, 109, 144, 152, 120, 227, 173, 205 }

-    }, { /* Coeff Band 2 */

-      {   1,  60, 125, 153, 143, 159, 156, 127, 234, 170, 233 },

-      {  22,  48,  78, 129, 124, 152, 151, 123, 234, 170, 233 },

-      {   3,  32,  46,  98, 107, 142, 138, 114, 232, 165, 232 },

-      {   1,  15,  23,  61,  96, 135, 101, 103, 210, 144, 213 }

-    }, { /* Coeff Band 3 */

-      {   1, 102, 144, 182, 146, 162, 194, 143, 246, 196, 239 },

-      {  34,  76, 116, 171, 136, 159, 192, 140, 246, 195, 239 },

-      {   4,  51,  81, 153, 124, 153, 184, 135, 246, 192, 239 },

-      {   1,  23,  37,  98, 102, 140, 142, 116, 230, 167, 227 }

-    }, { /* Coeff Band 4 */

-      {   1, 165, 171, 214, 163, 174, 214, 160, 245, 203, 219 },

-      {  16, 120, 154, 210, 158, 172, 212, 159, 245, 201, 219 },

-      {   1,  80, 122, 199, 147, 167, 208, 154, 244, 200, 223 },

-      {   1,  40,  65, 145, 118, 151, 171, 135, 226, 175, 202 }

-    }, { /* Coeff Band 5 */

-      {   1, 146, 162, 215, 159, 172, 226, 165, 251, 218, 231 },

-      {  16,  92, 131, 205, 147, 167, 224, 162, 252, 217, 228 },

-      {   2,  60,  92, 182, 129, 158, 216, 152, 251, 214, 234 },

-      {   1,  32,  50, 126, 107, 144, 176, 128, 240, 189, 216 }

-    }, { /* Coeff Band 6 */

-      {   1, 178, 186, 224, 172, 178, 224, 167, 251, 214, 232 },

-      {  14, 118, 158, 215, 160, 173, 223, 164, 250, 214, 228 },

-      {   2,  70, 109, 194, 139, 164, 217, 156, 250, 213, 227 },

-      {   1,  32,  51, 129, 108, 146, 175, 128, 240, 187, 218 }

-    }, { /* Coeff Band 7 */

-      {   1, 210, 214, 240, 192, 188, 235, 182, 251, 221, 228 },

-      {  22, 140, 187, 233, 177, 183, 234, 178, 251, 219, 233 },

-      {   3,  82, 130, 215, 152, 171, 229, 171, 250, 217, 232 },

-      {   1,  38,  63, 154, 115, 149, 195, 141, 240, 196, 219 }

-    }

};

-static const vp9_coeff_probs default_coef_probs_32x32[BLOCK_TYPES_32X32] = {

-  { /* block Type 0 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]

+                                                  [REF_TYPES]

+                                                  [BLOCK_TYPES]

+                                                  [NZC32X32_TOKENS] = {

+  {

+    {

+      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },

+      { 52052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },

+    }, {

+      { 72988, 62777, 19440, 11812, 5145, 1917, 439, 10, 5, 2, 1, 0 },

+      { 72052, 30468, 6973, 3250, 1500, 750, 375, 50, 8, 1, 0, 0 },

-  }, { /* block Type 1 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+  }, {

+    {

+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },

+      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },

+    }, {

+      { 21533, 33527, 15655, 11920, 5723, 2009, 315, 7, 4, 2, 1, 0 },

+      { 27772, 23120, 13127, 8115, 4000, 2000, 200, 6, 4, 2, 1, 0 },

-  }, { /* block Type 2 */

-    { /* Coeff Band 0 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 2 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 3 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 4 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 5 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 6 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 7 */

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

+  }, {

+    {

+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },

+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },

+    }, {

+      { 29408, 11758, 8023, 10123, 6705, 2468, 369, 17, 10, 5, 2, 1 },

+      { 9612, 13874, 13329, 13022, 6500, 3250, 300, 12, 6, 3, 2, 1 },

-  }, { /* block Type 3 */

-    { /* Coeff Band 0 */

-      {   8,  40, 224, 217, 183, 181, 180, 148, 200, 180, 123 },

-      {   6,  37, 178, 193, 173, 171, 160, 139, 205, 166, 173 },

-      {   3,  27,  93, 133, 143, 159, 115, 125, 183, 141, 178 },

-      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }

-    }, { /* Coeff Band 1 */

-      {   1, 170, 209, 202, 172, 175, 179, 143, 238, 181, 214 },

-      { 184, 164, 199, 199, 169, 173, 180, 143, 238, 184, 217 },

-      {  99, 128, 165, 194, 161, 171, 180, 142, 239, 182, 219 },

-      {  17,  49,  59, 102, 117, 148, 122, 116, 208, 152, 191 }

-    }, { /* Coeff Band 2 */

-      {   1, 136, 200, 197, 172, 172, 168, 142, 226, 170, 216 },

-      {  66, 104, 146, 175, 152, 165, 163, 139, 225, 170, 219 },

-      {  11,  52,  83, 144, 130, 156, 151, 130, 222, 165, 216 },

-      {   1,  16,  25,  65,  99, 137,  96, 106, 190, 138, 184 }

-    }, { /* Coeff Band 3 */

-      {   1, 180, 203, 198, 166, 170, 190, 143, 241, 190, 227 },

-      {  74, 125, 161, 187, 154, 165, 187, 142, 241, 189, 224 },

-      {  15,  70,  98, 163, 133, 157, 182, 137, 241, 187, 226 },

-      {   1,  25,  37,  89, 104, 140, 128, 113, 218, 158, 206 }

-    }, { /* Coeff Band 4 */

-      {   1, 191, 208, 213, 169, 173, 212, 156, 246, 206, 217 },

-      {  53, 136, 170, 205, 159, 170, 211, 156, 246, 205, 208 },

-      {   3,  75, 112, 189, 140, 163, 209, 151, 246, 205, 215 },

-      {   1,  32,  51, 127, 108, 145, 171, 128, 231, 183, 197 }

-    }, { /* Coeff Band 5 */

-      {   1, 183, 195, 202, 161, 168, 206, 150, 247, 202, 229 },

-      {  42, 113, 144, 190, 147, 163, 203, 148, 247, 202, 229 },

-      {   2,  56,  82, 160, 124, 153, 195, 140, 246, 200, 229 },

-      {   1,  22,  34,  93,  99, 138, 143, 115, 227, 170, 206 }

-    }, { /* Coeff Band 6 */

-      {   1, 202, 193, 221, 168, 175, 227, 167, 251, 217, 236 },

-      {  26, 122, 158, 213, 157, 171, 225, 165, 251, 216, 242 },

-      {   1,  68, 105, 194, 136, 162, 221, 158, 251, 215, 239 },

-      {   1,  32,  51, 131, 107, 145, 179, 130, 240, 188, 231 }

-    }, { /* Coeff Band 7 */

-      {   1, 234, 212, 243, 195, 192, 240, 187, 253, 226, 227 },

-      {  14, 141, 186, 237, 181, 186, 239, 184, 253, 226, 233 },

-      {   1,  85, 132, 221, 155, 174, 235, 176, 253, 224, 226 },

-      {   1,  39,  65, 159, 115, 150, 202, 144, 245, 202, 214 }

-    }

};

+#else

+static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]

+                                           [REF_TYPES]

+                                           [BLOCK_TYPES]

+                                           [NZC4X4_TOKENS] = {

+  {

+    {

+      { 219, 162, 179, 142, 242, },

+      { 214, 253, 228, 246, 255, },

+    }, {

+      { 225, 236, 190, 229, 253, },

+      { 251, 253, 240, 248, 255, },

+    },

+  }, {

+    {

+      { 106, 126, 158, 126, 244, },

+      { 118, 241, 201, 240, 255, },

+    }, {

+      { 165, 179, 143, 189, 242, },

+      { 173, 239, 192, 255, 128, },

+    },

+  }, {

+    {

+      { 42 , 78 , 153, 92 , 223, },

+      { 128, 128, 128, 128, 128, },

+    }, {

+      { 76 , 68 , 126, 110, 216, },

+      { 128, 128, 128, 128, 128, },

+    },

+  },

+};

+static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]

+                                           [REF_TYPES]

+                                           [BLOCK_TYPES]

+                                           [NZC8X8_TOKENS] = {

+  {

+    {

+      { 134, 139, 170, 178, 142, 197, 255, },

+      { 167, 224, 199, 252, 205, 255, 128, },

+    }, {

+      { 181, 210, 180, 241, 190, 235, 255, },

+      { 234, 251, 235, 252, 219, 255, 128, },

+    },

+  }, {

+    {

+      { 33 , 64 , 155, 143, 86 , 216, 255, },

+      { 73 , 160, 167, 251, 153, 255, 128, },

+    }, {

+      { 79 , 104, 153, 195, 119, 246, 255, },

+      { 149, 183, 186, 249, 203, 255, 128, },

+    },

+  }, {

+    {

+      { 10 , 25 , 156, 61 , 69 , 156, 254, },

+      { 32 , 1  , 128, 146, 64 , 255, 128, },

+    }, {

+      { 37 , 48 , 143, 113, 81 , 202, 255, },

+      { 1  , 255, 128, 128, 128, 128, 128, },

+    },

+  },

+};

+static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]

+                                             [REF_TYPES]

+                                             [BLOCK_TYPES]

+                                             [NZC16X16_TOKENS] = {

+  {

+    {

+      { 11 , 188, 210, 167, 141, 143, 152, 255, 128, },

+      { 171, 201, 203, 244, 207, 255, 255, 128, 128, },

+    }, {

+      { 23 , 217, 207, 251, 198, 255, 219, 128, 128, },

+      { 235, 249, 229, 255, 199, 128, 128, 128, 128, },

+    },

+  }, {

+    {

+      { 9  , 45 , 168, 85 , 66 , 221, 139, 246, 255, },

+      { 51 , 110, 163, 238, 94 , 255, 255, 128, 128, },

+    }, {

+      { 4  , 149, 175, 240, 149, 255, 205, 128, 128, },

+      { 141, 217, 186, 255, 128, 128, 128, 128, 128, },

+    },

+  }, {

+    {

+      { 1  , 12 , 173, 6  , 68 , 145, 41 , 204, 255, },

+      { 39 , 47 , 128, 199, 110, 255, 128, 128, 128, },

+    }, {

+      { 1  , 121, 171, 149, 115, 242, 159, 255, 128, },

+      { 1  , 255, 255, 128, 128, 128, 128, 128, 128, },

+    },

+  },

+};

+static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]

+                                             [REF_TYPES]

+                                             [BLOCK_TYPES]

+                                             [NZC32X32_TOKENS] = {

+  {

+    {

+      { 11 , 216, 195, 201, 160, 247, 217, 255, 255, 128, 128, },

+      { 177, 240, 239, 255, 192, 128, 128, 128, 128, 128, 128, },

+    }, {

+      { 48 , 235, 213, 235, 199, 255, 255, 128, 128, 128, 128, },

+      { 205, 255, 248, 128, 128, 128, 128, 128, 128, 128, 128, },

+    },

+  }, {

+    {

+      { 6  , 96 , 138, 99 , 125, 248, 188, 255, 128, 128, 128, },

+      { 17 , 53 , 43 , 189, 1  , 255, 171, 128, 128, 128, 128, },

+    }, {

+      { 5  , 187, 235, 232, 117, 255, 219, 128, 128, 128, 128, },

+      { 146, 255, 255, 128, 128, 128, 128, 128, 128, 128, 128, },

+    },

+  }, {

+    {

+      { 1  , 7  , 93 , 14 , 100, 30 , 85 , 65 , 81 , 210, 255, },

+      { 1  , 1  , 128, 26 , 1  , 218, 78 , 255, 255, 128, 128, },

+    }, {

+      { 4  , 148, 206, 137, 160, 255, 255, 128, 128, 128, 128, },

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, },

+    },

+  },

+};

+#endif

+static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]

+                                            [NZC_TOKENS_EXTRA]

+                                            [NZC_BITS_EXTRA] = {

+  // Bit probabilities are in least to most significance order

+  {

+    {176, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4

+    {164, 192, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8

+    {154, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16

+    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32

+    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64

+    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128

+    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256

+    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512

+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024

+  }, {

+    {168, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4

+    {152, 184, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8

+    {152, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16

+    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32

+    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64

+    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128

+    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256

+    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512

+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024

+  }, {

+    {160, 128, 128, 128, 128, 128, 128, 128, 128},   // 3 - 4

+    {152, 176, 128, 128, 128, 128, 128, 128, 128},   // 5 - 8

+    {150, 184, 208, 128, 128, 128, 128, 128, 128},   // 9 - 16

+    {144, 176, 200, 216, 128, 128, 128, 128, 128},   // 17 - 32

+    {140, 172, 192, 208, 224, 128, 128, 128, 128},   // 33 - 64

+    {136, 168, 188, 200, 220, 232, 128, 128, 128},   // 65 - 128

+    {132, 164, 184, 196, 216, 228, 240, 128, 128},   // 129 - 256

+    {130, 162, 178, 194, 212, 226, 240, 248, 128},   // 257 - 512

+    {128, 160, 176, 192, 208, 224, 240, 248, 254},   // 513 - 1024

+  },

+};

+#endif  // CONFIG_CODE_NONZEROCOUNT

--- a/vp9/common/vp9_entropy.c

+++ b/vp9/common/vp9_entropy.c

@@ -41,15 +41,176 @@

   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

};

-DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]) = {

-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7

+// Unified coefficient band structure used by all block sizes

+DECLARE_ALIGNED(16, const int, vp9_coef_bands8x8[64]) = {

+  0, 1, 2, 3, 4, 4, 5, 5,

+  1, 2, 3, 4, 4, 5, 5, 5,

+  2, 3, 4, 4, 5, 5, 5, 5,

+  3, 4, 4, 5, 5, 5, 5, 5,

+  4, 4, 5, 5, 5, 5, 5, 5,

+  4, 5, 5, 5, 5, 5, 5, 5,

+  5, 5, 5, 5, 5, 5, 5, 5,

+  5, 5, 5, 5, 5, 5, 5, 5

};

+DECLARE_ALIGNED(16, const int, vp9_coef_bands4x4[16]) = {

+  0, 1, 2, 3,

+  1, 2, 3, 4,

+  2, 3, 4, 5,

+  3, 4, 5, 5

+};

-DECLARE_ALIGNED(16, const uint8_t, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {

-  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0

+DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {

+  0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5

};

+#if CONFIG_SCATTERSCAN

 DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {

+  0,  4,  1,  5,

+  8,  2, 12,  9,

+  3,  6, 13, 10,

+  7, 14, 11, 15,

+};

+DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = {

+  0,  4,  8,  1,

+  12,  5,  9,  2,

+  13,  6, 10,  3,

+  7, 14, 11, 15,

+};

+DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = {

+  0,  1,  4,  2,

+  5,  3,  6,  8,

+  9,  7, 12, 10,

+  13, 11, 14, 15,

+};

+DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {

+  0,  8,  1, 16,  9,  2, 17, 24,

+  10,  3, 18, 25, 32, 11,  4, 26,

+  33, 19, 40, 12, 34, 27,  5, 41,

+  20, 48, 13, 35, 42, 28, 21,  6,

+  49, 56, 36, 43, 29,  7, 14, 50,

+  57, 44, 22, 37, 15, 51, 58, 30,

+  45, 23, 52, 59, 38, 31, 60, 53,

+  46, 39, 61, 54, 47, 62, 55, 63,

+};

+DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {

+  0,  8, 16,  1, 24,  9, 32, 17,

+  2, 40, 25, 10, 33, 18, 48,  3,

+  26, 41, 11, 56, 19, 34,  4, 49,

+  27, 42, 12, 35, 20, 57, 50, 28,

+  5, 43, 13, 36, 58, 51, 21, 44,

+  6, 29, 59, 37, 14, 52, 22,  7,

+  45, 60, 30, 15, 38, 53, 23, 46,

+  31, 61, 39, 54, 47, 62, 55, 63,

+};

+DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {

+  0,  1,  2,  8,  9,  3, 16, 10,

+  4, 17, 11, 24,  5, 18, 25, 12,

+  19, 26, 32,  6, 13, 20, 33, 27,

+  7, 34, 40, 21, 28, 41, 14, 35,

+  48, 42, 29, 36, 49, 22, 43, 15,

+  56, 37, 50, 44, 30, 57, 23, 51,

+  58, 45, 38, 52, 31, 59, 53, 46,

+  60, 39, 61, 47, 54, 55, 62, 63,

+};

+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {

+  0,  16,   1,  32,  17,   2,  48,  33,  18,   3,  64,  34,  49,  19,  65,  80,

+  50,   4,  35,  66,  20,  81,  96,  51,   5,  36,  82,  97,  67, 112,  21,  52,

+  98,  37,  83, 113,   6,  68, 128,  53,  22,  99, 114,  84,   7, 129,  38,  69,

+  100, 115, 144, 130,  85,  54,  23,   8, 145,  39,  70, 116, 101, 131, 160, 146,

+  55,  86,  24,  71, 132, 117, 161,  40,   9, 102, 147, 176, 162,  87,  56,  25,

+  133, 118, 177, 148,  72, 103,  41, 163,  10, 192, 178,  88,  57, 134, 149, 119,

+  26, 164,  73, 104, 193,  42, 179, 208,  11, 135,  89, 165, 120, 150,  58, 194,

+  180,  27,  74, 209, 105, 151, 136,  43,  90, 224, 166, 195, 181, 121, 210,  59,

+  12, 152, 106, 167, 196,  75, 137, 225, 211, 240, 182, 122,  91,  28, 197,  13,

+  226, 168, 183, 153,  44, 212, 138, 107, 241,  60,  29, 123, 198, 184, 227, 169,

+  242,  76, 213, 154,  45,  92,  14, 199, 139,  61, 228, 214, 170, 185, 243, 108,

+  77, 155,  30,  15, 200, 229, 124, 215, 244,  93,  46, 186, 171, 201, 109, 140,

+  230,  62, 216, 245,  31, 125,  78, 156, 231,  47, 187, 202, 217,  94, 246, 141,

+  63, 232, 172, 110, 247, 157,  79, 218, 203, 126, 233, 188, 248,  95, 173, 142,

+  219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,

+  190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255,

+};

+DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {

+  0,  16,  32,  48,   1,  64,  17,  80,  33,  96,  49,   2,  65, 112,  18,  81,

+  34, 128,  50,  97,   3,  66, 144,  19, 113,  35,  82, 160,  98,  51, 129,   4,

+  67, 176,  20, 114, 145,  83,  36,  99, 130,  52, 192,   5, 161,  68, 115,  21,

+  146,  84, 208, 177,  37, 131, 100,  53, 162, 224,  69,   6, 116, 193, 147,  85,

+  22, 240, 132,  38, 178, 101, 163,  54, 209, 117,  70,   7, 148, 194,  86, 179,

+  225,  23, 133,  39, 164,   8, 102, 210, 241,  55, 195, 118, 149,  71, 180,  24,

+  87, 226, 134, 165, 211,  40, 103,  56,  72, 150, 196, 242, 119,   9, 181, 227,

+  88, 166,  25, 135,  41, 104, 212,  57, 151, 197, 120,  73, 243, 182, 136, 167,

+  213,  89,  10, 228, 105, 152, 198,  26,  42, 121, 183, 244, 168,  58, 137, 229,

+  74, 214,  90, 153, 199, 184,  11, 106, 245,  27, 122, 230, 169,  43, 215,  59,

+  200, 138, 185, 246,  75,  12,  91, 154, 216, 231, 107,  28,  44, 201, 123, 170,

+  60, 247, 232,  76, 139,  13,  92, 217, 186, 248, 155, 108,  29, 124,  45, 202,

+  233, 171,  61,  14,  77, 140,  15, 249,  93,  30, 187, 156, 218,  46, 109, 125,

+  62, 172,  78, 203,  31, 141, 234,  94,  47, 188,  63, 157, 110, 250, 219,  79,

+  126, 204, 173, 142,  95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,

+  159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255,

+};

+DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {

+  0,   1,   2,  16,   3,  17,   4,  18,  32,   5,  33,  19,   6,  34,  48,  20,

+  49,   7,  35,  21,  50,  64,   8,  36,  65,  22,  51,  37,  80,   9,  66,  52,

+  23,  38,  81,  67,  10,  53,  24,  82,  68,  96,  39,  11,  54,  83,  97,  69,

+  25,  98,  84,  40, 112,  55,  12,  70,  99, 113,  85,  26,  41,  56, 114, 100,

+  13,  71, 128,  86,  27, 115, 101, 129,  42,  57,  72, 116,  14,  87, 130, 102,

+  144,  73, 131, 117,  28,  58,  15,  88,  43, 145, 103, 132, 146, 118,  74, 160,

+  89, 133, 104,  29,  59, 147, 119,  44, 161, 148,  90, 105, 134, 162, 120, 176,

+  75, 135, 149,  30,  60, 163, 177,  45, 121,  91, 106, 164, 178, 150, 192, 136,

+  165, 179,  31, 151, 193,  76, 122,  61, 137, 194, 107, 152, 180, 208,  46, 166,

+  167, 195,  92, 181, 138, 209, 123, 153, 224, 196,  77, 168, 210, 182, 240, 108,

+  197,  62, 154, 225, 183, 169, 211,  47, 139,  93, 184, 226, 212, 241, 198, 170,

+  124, 155, 199,  78, 213, 185, 109, 227, 200,  63, 228, 242, 140, 214, 171, 186,

+  156, 229, 243, 125,  94, 201, 244, 215, 216, 230, 141, 187, 202,  79, 172, 110,

+  157, 245, 217, 231,  95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158,

+  188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,

+  190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255,

+};

+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {

+  0,   32,    1,   64,   33,    2,   96,   65,   34,  128,    3,   97,   66,  160,  129,   35,   98,    4,   67,  130,  161,  192,   36,   99,  224,    5,  162,  193,   68,  131,   37,  100,

+  225,  194,  256,  163,   69,  132,    6,  226,  257,  288,  195,  101,  164,   38,  258,    7,  227,  289,  133,  320,   70,  196,  165,  290,  259,  228,   39,  321,  102,  352,    8,  197,

+  71,  134,  322,  291,  260,  353,  384,  229,  166,  103,   40,  354,  323,  292,  135,  385,  198,  261,   72,    9,  416,  167,  386,  355,  230,  324,  104,  293,   41,  417,  199,  136,

+  262,  387,  448,  325,  356,   10,   73,  418,  231,  168,  449,  294,  388,  105,  419,  263,   42,  200,  357,  450,  137,  480,   74,  326,  232,   11,  389,  169,  295,  420,  106,  451,

+  481,  358,  264,  327,  201,   43,  138,  512,  482,  390,  296,  233,  170,  421,   75,  452,  359,   12,  513,  265,  483,  328,  107,  202,  514,  544,  422,  391,  453,  139,   44,  234,

+  484,  297,  360,  171,   76,  515,  545,  266,  329,  454,   13,  423,  392,  203,  108,  546,  485,  576,  298,  235,  140,  361,  516,  330,  172,  547,   45,  424,  455,  267,  393,  577,

+  486,   77,  204,  517,  362,  548,  608,   14,  456,  299,  578,  109,  236,  425,  394,  487,  609,  331,  141,  579,  518,   46,  268,   15,  173,  549,  610,  640,  363,   78,  519,  488,

+  300,  205,   16,  457,  580,  426,  550,  395,  110,  237,  611,  641,  332,  672,  142,  642,  269,  458,   47,  581,  427,  489,  174,  364,  520,  612,  551,  673,   79,  206,  301,  643,

+  704,   17,  111,  490,  674,  238,  582,   48,  521,  613,  333,  396,  459,  143,  270,  552,  644,  705,  736,  365,   80,  675,  583,  175,  428,  706,  112,  302,  207,  614,  553,   49,

+  645,  522,  737,  397,  768,  144,  334,   18,  676,  491,  239,  615,  707,  584,   81,  460,  176,  271,  738,  429,  113,  800,  366,  208,  523,  708,  646,  554,  677,  769,   19,  145,

+  585,  739,  240,  303,   50,  461,  616,  398,  647,  335,  492,  177,   82,  770,  832,  555,  272,  430,  678,  209,  709,  114,  740,  801,  617,   51,  304,  679,  524,  367,  586,  241,

+  20,  146,  771,  864,   83,  802,  648,  493,  399,  273,  336,  710,  178,  462,  833,  587,  741,  115,  305,  711,  368,  525,  618,  803,  210,  896,  680,  834,  772,   52,  649,  147,

+  431,  494,  556,  242,  400,  865,  337,   21,  928,  179,  742,   84,  463,  274,  369,  804,  650,  557,  743,  960,  835,  619,  773,  306,  211,  526,  432,  992,  588,  712,  116,  243,

+  866,  495,  681,  558,  805,  589,  401,  897,   53,  338,  148,  682,  867,  464,  275,   22,  370,  433,  307,  620,  527,  836,  774,  651,  713,  744,   85,  180,  621,  465,  929,  775,

+  496,  898,  212,  339,  244,  402,  590,  117,  559,  714,  434,   23,  868,  930,  806,  683,  528,  652,  371,  961,  149,  837,   54,  899,  745,  276,  993,  497,  403,  622,  181,  776,

+  746,  529,  560,  435,   86,  684,  466,  308,  591,  653,  715,  807,  340,  869,  213,  962,  245,  838,  561,  931,  808,  592,  118,  498,  372,  623,  685,  994,  467,  654,  747,  900,

+  716,  277,  150,   55,   24,  404,  530,  839,  777,  655,  182,  963,  840,  686,  778,  309,  870,  341,   87,  499,  809,  624,  593,  436,  717,  932,  214,  246,  995,  718,  625,  373,

+  562,   25,  119,  901,  531,  468,  964,  748,  810,  278,  779,  500,  563,  656,  405,  687,  871,  872,  594,  151,  933,  749,  841,  310,  657,  626,  595,  437,  688,  183,  996,  965,

+  902,  811,  342,  750,  689,  719,  532,   56,  215,  469,  934,  374,  247,  720,  780,  564,  781,  842,  406,   26,  751,  903,  873,   57,  279,  627,  501,  658,  843,  997,  812,  904,

+  88,  813,  438,  752,  935,  936,  311,  596,  533,  690,  343,  966,  874,   89,  120,  470,  721,  875,  659,  782,  565,  998,  375,  844,  845,   27,  628,  967,  121,  905,  968,  152,

+  937,  814,  753,  502,  691,  783,  184,  153,  722,  407,   58,  815,  999,  660,  597,  723,  534,  906,  216,  439,  907,  248,  185,  876,  846,  692,  784,  629,   90,  969,  280,  754,

+  938,  939,  217,  847,  566,  471,  785,  816,  877, 1000,  249,  878,  661,  503,  312,  970,  755,  122,  817,  281,  344,  786,  598,  724,   28,   59,   29,  154,  535,  630,  376, 1001,

+  313,  908,  186,   91,  848,  849,  345,  909,  940,  879,  408,  818,  693, 1002,  971,  941,  567,  377,  218,  756,  910,  787,  440,  123,  880,  725,  662,  250,  819, 1003,  282,  972,

+  850,  599,  472,  409,  155,  441,  942,  757,  788,  694,  911,  881,  314,  631,  973,  504,  187, 1004,  346,  473,  851,  943,  820,  726,   60,  505,  219,  378,  912,  974,   30,   31,

+  536,  882, 1005,   92,  251,  663,  944,  913,  283,  695,  883,  568, 1006,  975,  410,  442,  945,  789,  852,  537, 1007,  124,  315,   61,  758,  821,  600,  914,  976,  569,  474,  347,

+  156, 1008,  915,   93,  977,  506,  946,  727,  379,  884,  188,  632,  601, 1009,  790,  853,  978,  947,  220,  411,  125,  633,  664,  759,  252,  443,  916,  538,  157,  822,   62,  570,

+  979,  284, 1010,  885,  948,  189,  475,   94,  316,  665,  696, 1011,  854,  791,  980,  221,  348,   63,  917,  602,  380,  507,  253,  126,  697,  823,  634,  285,  728,  949,  886,   95,

+  158,  539, 1012,  317,  412,  444,  760,  571,  190,  981,  729,  918,  127,  666,  349,  381,  476,  855,  761, 1013,  603,  222,  159,  698,  950,  508,  254,  792,  286,  635,  887,  793,

+  413,  191,  982,  445,  540,  318,  730,  667,  223,  824,  919, 1014,  350,  477,  572,  255,  825,  951,  762,  509,  604,  856,  382,  699,  287,  319,  636,  983,  794,  414,  541,  731,

+  857,  888,  351,  446,  573, 1015,  668,  889,  478,  826,  383,  763,  605,  920,  510,  637,  415,  700,  921,  858,  447,  952,  542,  795,  479,  953,  732,  890,  669,  574,  511,  984,

+  827,  985,  922, 1016,  764,  606,  543,  701,  859,  638, 1017,  575,  796,  954,  733,  891,  670,  607,  828,  986,  765,  923,  639, 1018,  702,  860,  955,  671,  892,  734,  797,  703,

+  987,  829, 1019,  766,  924,  735,  861,  956,  988,  893,  767,  798,  830, 1020,  925,  957,  799,  862,  831,  989,  894, 1021,  863,  926,  895,  958,  990, 1022,  927,  959,  991, 1023,

+};

+#else  // CONFIG_SCATTERSCAN

+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]) = {

   0,  1,  4,  8,

   5,  2,  3,  6,

   9, 12, 13, 10,

@@ -70,17 +231,6 @@

   12, 13, 14, 15

};

-DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {

-  0, 1, 2, 3, 5, 4, 4, 5,

-  5, 3, 6, 3, 5, 4, 6, 6,

-  6, 5, 5, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7

-};

 DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {

   0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,

   12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,

@@ -88,26 +238,28 @@

   58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,

};

-// Table can be optimized.

-DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {

-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,

-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = {

+   0,  8, 16, 24, 32, 40, 48, 56,

+   1,  9, 17, 25, 33, 41, 49, 57,

+   2, 10, 18, 26, 34, 42, 50, 58,

+   3, 11, 19, 27, 35, 43, 51, 59,

+   4, 12, 20, 28, 36, 44, 52, 60,

+   5, 13, 21, 29, 37, 45, 53, 61,

+   6, 14, 22, 30, 38, 46, 54, 62,

+   7, 15, 23, 31, 39, 47, 55, 63,

};

+DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = {

+   0,  1,  2,  3,  4,  5,  6,  7,

+   8,  9, 10, 11, 12, 13, 14, 15,

+  16, 17, 18, 19, 20, 21, 22, 23,

+  24, 25, 26, 27, 28, 29, 30, 31,

+  32, 33, 34, 35, 36, 37, 38, 39,

+  40, 41, 42, 43, 44, 45, 46, 47,

+  48, 49, 50, 51, 52, 53, 54, 55,

+  56, 57, 58, 59, 60, 61, 62, 63,

+};

 DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {

   0,   1,  16,  32,  17,   2,   3,  18,

   33,  48,  64,  49,  34,  19,   4,   5,

@@ -143,692 +295,42 @@

   237, 252, 253, 238, 223, 239, 254, 255,

};

-#if CONFIG_DWTDCTHYBRID

-#if DWTDCT_TYPE == DWTDCT16X16_LEAN

-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {

-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,

-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-};

-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {

-  0,    1,   32,   64,   33,    2,    3,   34,

-  65,   96, 128,   97,   66,   35,    4,  5,

-  36,   67,   98,  129,  160,  192,  161,  130,

-  99,   68,   37,    6,    7,   38,   69,  100,

-  131,  162,  193,  224, 256,  225,  194,  163,

-  132,  101,   70,   39,    8,    9,   40,   71,

-  102,  133,  164,  195,  226,  257,  288,  320,

-  289,  258,  227,  196,  165,  134,  103,   72,

-  41,   10,   11,   42,   73,  104,  135,  166,

-  197,  228,  259,  290,  321,  352,  384,  353,

-  322,  291,  260,  229,  198,  167,  136,  105,

-  74,   43,   12,   13,   44,   75,  106,  137,

-  168,  199,  230,  261,  292,  323,  354,  385,

-  416,  448,  417,  386,  355,  324,  293,  262,

-  231,  200,  169,  138,  107,   76,   45,   14,

-  15,   46,   77,  108,  139,  170,  201,  232,

-  263,  294,  325,  356,  387,  418,  449,  480,

-  481,  450,  419,  388,  357,  326,  295,  264,

-  233,  202,  171,  140,  109,   78,   47,   79,

-  110,  141,  172,  203,  234,  265,  296,  327,

-  358,  389,  420,  451,  482,  483,  452,  421,

-  390,  359,  328,  297,  266,  235,  204,  173,

-  142,  111,  143,  174,  205,  236,  267,  298,

-  329,  360,  391,  422,  453,  484,  485,  454,

-  423,  392,  361,  330,  299,  268,  237,  206,

-  175,  207,  238,  269,  300,  331,  362,  393,

-  424,  455,  486,  487,  456,  425,  394,  363,

-  332,  301,  270,  239,  271,  302,  333,  364,

-  395,  426,  457,  488,  489,  458,  427,  396,

-  365,  334,  303,  335,  366,  397,  428,  459,

-  490,  491,  460,  429,  398,  367,  399,  430,

-  461,  492,  493,  462,  431,  463,  494,  495,

-  16,   512,  528, 17,  513,  529,   48,  544,

-  560, 80,  576,  592,   49,  545,  561,   18,

-  514,  530,   19,  515,  531,   50,  546,  562,

-  81,  577,  593,  112,  608,  624,  144,  640,

-  656,  113,  609,  625,   82,  578,  594,   51,

-  547,  563,   20,  516,  532,   21,  517,  533,

-  52,  548,  564,   83,  579,  595,  114,  610,

-  626,  145,  641,  657,  176,  672,  688,  208,

-  704,  720,  177,  673,  689,  146,  642,  658,

-  115,  611,  627,   84,  580,  596,   53,  549,

-  565,   22,  518,  534,   23,  519,  535,   54,

-  550,  566,   85,  581,  597,  116,  612,  628,

-  147,  643,  659,  178,  674,  690,  209,  705,

-  721,  240,  736,  752,  272,  768,  784,  241,

-  737,  753,  210,  706,  722,  179,  675,  691,

-  148,  644,  660,  117,  613,  629,   86,  582,

-  598,   55,  551,  567,   24,  520,  536,   25,

-  521,  537,   56,  552,  568,   87,  583,  599,

-  118,  614,  630,  149,  645,  661,  180,  676,

-  692,  211,  707,  723,  242,  738,  754,  273,

-  769,  785,  304,  800,  816,  336,  832,  848,

-  305,  801,  817,  274,  770,  786,  243,  739,

-  755,  212,  708,  724,  181,  677,  693,  150,

-  646,  662,  119,  615,  631,   88,  584,  600,

-  57,  553,  569,   26,  522,  538,   27,  523,

-  539,   58,  554,  570,   89,  585,  601,  120,

-  616,  632,  151,  647,  663,  182,  678,  694,

-  213,  709,  725,  244,  740,  756,  275,  771,

-  787,  306,  802,  818,  337,  833,  849,  368,

-  864,  880,  400,  896,  912,  369,  865,  881,

-  338,  834,  850,  307,  803,  819,  276,  772,

-  788,  245,  741,  757,  214,  710,  726,  183,

-  679,  695,  152,  648,  664,  121,  617,  633,

-  90,  586,  602,   59,  555,  571,   28,  524,

-  540,   29,  525,  541,   60,  556,  572,   91,

-  587,  603,  122,  618,  634,  153,  649,  665,

-  184,  680,  696,  215,  711,  727,  246,  742,

-  758,  277,  773,  789,  308,  804,  820,  339,

-  835,  851,  370,  866,  882,  401,  897,  913,

-  432,  928,  944,  464,  960,  976,  433,  929,

-  945,  402,  898,  914,  371,  867,  883,  340,

-  836,  852,  309,  805,  821,  278,  774,  790,

-  247,  743,  759,  216,  712,  728,  185,  681,

-  697,  154,  650,  666,  123,  619,  635,   92,

-  588,  604,   61,  557,  573,   30,  526,  542,

-  31,  527,  543,   62,  558,  574,   93,  589,

-  605,  124,  620,  636,  155,  651,  667,  186,

-  682,  698,  217,  713,  729,  248,  744,  760,

-  279,  775,  791,  310,  806,  822,  341,  837,

-  853,  372,  868,  884,  403,  899,  915,  434,

-  930,  946,  465,  961,  977,  496,  992, 1008,

-  497,  993, 1009,  466,  962,  978,  435,  931,

-  947,  404,  900,  916,  373,  869,  885,  342,

-  838,  854,  311,  807,  823,  280,  776,  792,

-  249,  745,  761,  218,  714,  730,  187,  683,

-  699,  156,  652,  668,  125,  621,  637,   94,

-  590,  606,   63,  559,  575,   95,  591,  607,

-  126,  622,  638,  157,  653,  669,  188,  684,

-  700,  219,  715,  731,  250,  746,  762,  281,

-  777,  793,  312,  808,  824,  343,  839,  855,

-  374,  870,  886,  405,  901,  917,  436,  932,

-  948,  467,  963,  979,  498,  994, 1010,  499,

-  995, 1011,  468,  964,  980,  437,  933,  949,

-  406,  902,  918,  375,  871,  887,  344,  840,

-  856,  313,  809,  825,  282,  778,  794,  251,

-  747,  763,  220,  716,  732,  189,  685,  701,

-  158,  654,  670,  127,  623,  639,  159,  655,

-  671,  190,  686,  702,  221,  717,  733,  252,

-  748,  764,  283,  779,  795,  314,  810,  826,

-  345,  841,  857,  376,  872,  888,  407,  903,

-  919,  438,  934,  950,  469,  965,  981,  500,

-  996, 1012,  501,  997, 1013,  470,  966,  982,

-  439,  935,  951,  408,  904,  920,  377,  873,

-  889,  346,  842,  858,  315,  811,  827,  284,

-  780,  796,  253,  749,  765,  222,  718,  734,

-  191,  687,  703,  223,  719,  735,  254,  750,

-  766,  285,  781,  797,  316,  812,  828,  347,

-  843,  859,  378,  874,  890,  409,  905,  921,

-  440,  936,  952,  471,  967,  983,  502,  998,

-  1014,  503,  999, 1015,  472,  968,  984,  441,

-  937,  953,  410,  906,  922,  379,  875,  891,

-  348,  844,  860,  317,  813,  829,  286,  782,

-  798,  255,  751,  767,  287,  783,  799,  318,

-  814,  830,  349,  845,  861,  380,  876,  892,

-  411,  907,  923,  442,  938,  954,  473,  969,

-  985,  504, 1000, 1016,  505, 1001, 1017,  474,

-  970,  986,  443,  939,  955,  412,  908,  924,

-  381,  877,  893,  350,  846,  862,  319,  815,

-  831,  351,  847,  863,  382,  878,  894,  413,

-  909,  925,  444,  940,  956,  475,  971,  987,

-  506, 1002, 1018,  507, 1003, 1019,  476,  972,

-  988,  445,  941,  957,  414,  910,  926,  383,

-  879,  895,  415,  911,  927,  446,  942,  958,

-  477,  973,  989,  508, 1004, 1020,  509, 1005,

-  1021,  478,  974,  990,  447,  943,  959,  479,

-  975,  991,  510, 1006, 1022,  511, 1007, 1023,

-};

-#elif DWTDCT_TYPE == DWTDCT16X16

-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {

-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6,

-  6, 6, 6,

-  6,

-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-};

-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {

-  0,    1,   32,   64,   33,    2,    3,   34,

-  65,   96, 128,   97,   66,   35,    4,

-  16,   512,  528,

-  5,

-  36,   67,   98,  129,  160,  192,  161,  130,

-  99,   68,   37,    6,    7,   38,   69,  100,

-  131,  162,  193,  224, 256,  225,  194,  163,

-  132,  101,   70,   39,    8,    9,   40,   71,

-  102,  133,  164,  195,  226,  257,  288,  320,

-  289,  258,  227,  196,  165,  134,  103,   72,

-  41,   10,   11,   42,   73,  104,  135,  166,

-  197,  228,  259,  290,  321,  352,  384,  353,

-  322,  291,  260,  229,  198,  167,  136,  105,

-  74,   43,   12,   13,   44,   75,  106,  137,

-  168,  199,  230,  261,  292,  323,  354,  385,

-  416,  448,  417,  386,  355,  324,  293,  262,

-  231,  200,  169,  138,  107,   76,   45,   14,

-  15,   46,   77,  108,  139,  170,  201,  232,

-  263,  294,  325,  356,  387,  418,  449,  480,

-  481,  450,  419,  388,  357,  326,  295,  264,

-  233,  202,  171,  140,  109,   78,   47,   79,

-  110,  141,  172,  203,  234,  265,  296,  327,

-  358,  389,  420,  451,  482,  483,  452,  421,

-  390,  359,  328,  297,  266,  235,  204,  173,

-  142,  111,  143,  174,  205,  236,  267,  298,

-  329,  360,  391,  422,  453,  484,  485,  454,

-  423,  392,  361,  330,  299,  268,  237,  206,

-  175,  207,  238,  269,  300,  331,  362,  393,

-  424,  455,  486,  487,  456,  425,  394,  363,

-  332,  301,  270,  239,  271,  302,  333,  364,

-  395,  426,  457,  488,  489,  458,  427,  396,

-  365,  334,  303,  335,  366,  397,  428,  459,

-  490,  491,  460,  429,  398,  367,  399,  430,

-  461,  492,  493,  462,  431,  463,  494,  495,

-  17,  513,  529,   48,  544,

-  560, 80,  576,  592,   49,  545,  561,   18,

-  514,  530,   19,  515,  531,   50,  546,  562,

-  81,  577,  593,  112,  608,  624,  144,  640,

-  656,  113,  609,  625,   82,  578,  594,   51,

-  547,  563,   20,  516,  532,   21,  517,  533,

-  52,  548,  564,   83,  579,  595,  114,  610,

-  626,  145,  641,  657,  176,  672,  688,  208,

-  704,  720,  177,  673,  689,  146,  642,  658,

-  115,  611,  627,   84,  580,  596,   53,  549,

-  565,   22,  518,  534,   23,  519,  535,   54,

-  550,  566,   85,  581,  597,  116,  612,  628,

-  147,  643,  659,  178,  674,  690,  209,  705,

-  721,  240,  736,  752,  272,  768,  784,  241,

-  737,  753,  210,  706,  722,  179,  675,  691,

-  148,  644,  660,  117,  613,  629,   86,  582,

-  598,   55,  551,  567,   24,  520,  536,   25,

-  521,  537,   56,  552,  568,   87,  583,  599,

-  118,  614,  630,  149,  645,  661,  180,  676,

-  692,  211,  707,  723,  242,  738,  754,  273,

-  769,  785,  304,  800,  816,  336,  832,  848,

-  305,  801,  817,  274,  770,  786,  243,  739,

-  755,  212,  708,  724,  181,  677,  693,  150,

-  646,  662,  119,  615,  631,   88,  584,  600,

-  57,  553,  569,   26,  522,  538,   27,  523,

-  539,   58,  554,  570,   89,  585,  601,  120,

-  616,  632,  151,  647,  663,  182,  678,  694,

-  213,  709,  725,  244,  740,  756,  275,  771,

-  787,  306,  802,  818,  337,  833,  849,  368,

-  864,  880,  400,  896,  912,  369,  865,  881,

-  338,  834,  850,  307,  803,  819,  276,  772,

-  788,  245,  741,  757,  214,  710,  726,  183,

-  679,  695,  152,  648,  664,  121,  617,  633,

-  90,  586,  602,   59,  555,  571,   28,  524,

-  540,   29,  525,  541,   60,  556,  572,   91,

-  587,  603,  122,  618,  634,  153,  649,  665,

-  184,  680,  696,  215,  711,  727,  246,  742,

-  758,  277,  773,  789,  308,  804,  820,  339,

-  835,  851,  370,  866,  882,  401,  897,  913,

-  432,  928,  944,  464,  960,  976,  433,  929,

-  945,  402,  898,  914,  371,  867,  883,  340,

-  836,  852,  309,  805,  821,  278,  774,  790,

-  247,  743,  759,  216,  712,  728,  185,  681,

-  697,  154,  650,  666,  123,  619,  635,   92,

-  588,  604,   61,  557,  573,   30,  526,  542,

-  31,  527,  543,   62,  558,  574,   93,  589,

-  605,  124,  620,  636,  155,  651,  667,  186,

-  682,  698,  217,  713,  729,  248,  744,  760,

-  279,  775,  791,  310,  806,  822,  341,  837,

-  853,  372,  868,  884,  403,  899,  915,  434,

-  930,  946,  465,  961,  977,  496,  992, 1008,

-  497,  993, 1009,  466,  962,  978,  435,  931,

-  947,  404,  900,  916,  373,  869,  885,  342,

-  838,  854,  311,  807,  823,  280,  776,  792,

-  249,  745,  761,  218,  714,  730,  187,  683,

-  699,  156,  652,  668,  125,  621,  637,   94,

-  590,  606,   63,  559,  575,   95,  591,  607,

-  126,  622,  638,  157,  653,  669,  188,  684,

-  700,  219,  715,  731,  250,  746,  762,  281,

-  777,  793,  312,  808,  824,  343,  839,  855,

-  374,  870,  886,  405,  901,  917,  436,  932,

-  948,  467,  963,  979,  498,  994, 1010,  499,

-  995, 1011,  468,  964,  980,  437,  933,  949,

-  406,  902,  918,  375,  871,  887,  344,  840,

-  856,  313,  809,  825,  282,  778,  794,  251,

-  747,  763,  220,  716,  732,  189,  685,  701,

-  158,  654,  670,  127,  623,  639,  159,  655,

-  671,  190,  686,  702,  221,  717,  733,  252,

-  748,  764,  283,  779,  795,  314,  810,  826,

-  345,  841,  857,  376,  872,  888,  407,  903,

-  919,  438,  934,  950,  469,  965,  981,  500,

-  996, 1012,  501,  997, 1013,  470,  966,  982,

-  439,  935,  951,  408,  904,  920,  377,  873,

-  889,  346,  842,  858,  315,  811,  827,  284,

-  780,  796,  253,  749,  765,  222,  718,  734,

-  191,  687,  703,  223,  719,  735,  254,  750,

-  766,  285,  781,  797,  316,  812,  828,  347,

-  843,  859,  378,  874,  890,  409,  905,  921,

-  440,  936,  952,  471,  967,  983,  502,  998,

-  1014,  503,  999, 1015,  472,  968,  984,  441,

-  937,  953,  410,  906,  922,  379,  875,  891,

-  348,  844,  860,  317,  813,  829,  286,  782,

-  798,  255,  751,  767,  287,  783,  799,  318,

-  814,  830,  349,  845,  861,  380,  876,  892,

-  411,  907,  923,  442,  938,  954,  473,  969,

-  985,  504, 1000, 1016,  505, 1001, 1017,  474,

-  970,  986,  443,  939,  955,  412,  908,  924,

-  381,  877,  893,  350,  846,  862,  319,  815,

-  831,  351,  847,  863,  382,  878,  894,  413,

-  909,  925,  444,  940,  956,  475,  971,  987,

-  506, 1002, 1018,  507, 1003, 1019,  476,  972,

-  988,  445,  941,  957,  414,  910,  926,  383,

-  879,  895,  415,  911,  927,  446,  942,  958,

-  477,  973,  989,  508, 1004, 1020,  509, 1005,

-  1021,  478,  974,  990,  447,  943,  959,  479,

-  975,  991,  510, 1006, 1022,  511, 1007, 1023,

-};

-#elif DWTDCT_TYPE == DWTDCT8X8

-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {

-  0, 1, 2, 3, 5, 4, 4, 5,

-  5, 3, 6, 3, 5, 4, 6, 6,

-  6, 5, 5, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7,

-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-};

-DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {

-  0,    1,   32,   64,   33,    2,    3,   34,

-  65,   96,  128,   97,   66,   35,    4,    5,

-  36,   67,   98,  129,  160,  192,  161,  130,

-  99,   68,   37,    6,    7,   38,   69,  100,

-  131,  162,  193,  224,  225,  194,  163,  132,

-  101,   70,   39,   71,  102,  133,  164,  195,

-  226,  227,  196,  165,  134,  103,  135,  166,

-  197,  228,  229,  198,  167,  199,  230,  231,

-  8,  256,  264,    9,  257,  265,   40,  288, 296, 72,  320,  328,

-  41,  289,  297,   10, 258,  266, 11,  259,  267,   42,  290,  298,

-  73,  321,  329,  104,  352,  360,  136,  384, 392,  105,  353,  361,

-  74,  322,  330,   43, 291,  299,   12,  260,  268,   13,  261,  269,

-  44,  292,  300,   75,  323,  331,  106,  354, 362,  137,  385,  393,

-  168,  416,  424,  200, 448,  456,  169,  417,  425,  138,  386,  394,

-  107,  355,  363,   76,  324,  332,   45,  293, 301,   14,  262,  270,

-  15,  263,  271,   46, 294,  302,   77,  325,  333,  108,  356,  364,

-  139,  387,  395,  170, 418,  426,  201,  449, 457,  232,  480,  488,

-  233,  481,  489,  202, 450,  458,  171,  419,  427,  140,  388,  396,

-  109,  357,  365,   78,  326,  334,   47,  295, 303,   79,  327,  335,

-  110,  358,  366,  141, 389,  397,  172,  420,  428,  203,  451,  459,

-  234,  482,  490,  235,  483,  491,  204,  452, 460,  173,  421,  429,

-  142,  390,  398,  111, 359,  367,  143,  391,  399,  174,  422,  430,

-  205,  453,  461,  236,  484,  492,  237,  485, 493,  206,  454,  462,

-  175,  423,  431,  207, 455,  463,  238,  486,  494,  239,  487,  495,

-  16,  512,  528,   17,  513,  529,   18,  514,

-  530,   19,  515,  531,   20,  516,  532,   21,

-  517,  533,   22,  518,  534,   23,  519,  535,

-  24,  520,  536,   25,  521,  537,   26,  522,

-  538,   27,  523,  539,   28,  524,  540,   29,

-  525,  541,   30,  526,  542,   31,  527,  543,

-  48,  544,  560,   49,  545,  561,   50,  546,

-  562,   51,  547,  563,   52,  548,  564,   53,

-  549,  565,   54,  550,  566,   55,  551,  567,

-  56,  552,  568,   57,  553,  569,   58,  554,

-  570,   59,  555,  571,   60,  556,  572,   61,

-  557,  573,   62,  558,  574,   63,  559,  575,

-  80,  576,  592,   81,  577,  593,   82,  578,

-  594,   83,  579,  595,   84,  580,  596,   85,

-  581,  597,   86,  582,  598,   87,  583,  599,

-  88,  584,  600,   89,  585,  601,   90,  586,

-  602,   91,  587,  603,   92,  588,  604,   93,

-  589,  605,   94,  590,  606,   95,  591,  607,

-  112,  608,  624,  113,  609,  625,  114,  610,

-  626,  115,  611,  627,  116,  612,  628,  117,

-  613,  629,  118,  614,  630,  119,  615,  631,

-  120,  616,  632,  121,  617,  633,  122,  618,

-  634,  123,  619,  635,  124,  620,  636,  125,

-  621,  637,  126,  622,  638,  127,  623,  639,

-  144,  640,  656,  145,  641,  657,  146,  642,

-  658,  147,  643,  659,  148,  644,  660,  149,

-  645,  661,  150,  646,  662,  151,  647,  663,

-  152,  648,  664,  153,  649,  665,  154,  650,

-  666,  155,  651,  667,  156,  652,  668,  157,

-  653,  669,  158,  654,  670,  159,  655,  671,

-  176,  672,  688,  177,  673,  689,  178,  674,

-  690,  179,  675,  691,  180,  676,  692,  181,

-  677,  693,  182,  678,  694,  183,  679,  695,

-  184,  680,  696,  185,  681,  697,  186,  682,

-  698,  187,  683,  699,  188,  684,  700,  189,

-  685,  701,  190,  686,  702,  191,  687,  703,

-  208,  704,  720,  209,  705,  721,  210,  706,

-  722,  211,  707,  723,  212,  708,  724,  213,

-  709,  725,  214,  710,  726,  215,  711,  727,

-  216,  712,  728,  217,  713,  729,  218,  714,

-  730,  219,  715,  731,  220,  716,  732,  221,

-  717,  733,  222,  718,  734,  223,  719,  735,

-  240,  736,  752,  241,  737,  753,  242,  738,

-  754,  243,  739,  755,  244,  740,  756,  245,

-  741,  757,  246,  742,  758,  247,  743,  759,

-  248,  744,  760,  249,  745,  761,  250,  746,

-  762,  251,  747,  763,  252,  748,  764,  253,

-  749,  765,  254,  750,  766,  255,  751,  767,

-  272,  768,  784,  273,  769,  785,  274,  770,

-  786,  275,  771,  787,  276,  772,  788,  277,

-  773,  789,  278,  774,  790,  279,  775,  791,

-  280,  776,  792,  281,  777,  793,  282,  778,

-  794,  283,  779,  795,  284,  780,  796,  285,

-  781,  797,  286,  782,  798,  287,  783,  799,

-  304,  800,  816,  305,  801,  817,  306,  802,

-  818,  307,  803,  819,  308,  804,  820,  309,

-  805,  821,  310,  806,  822,  311,  807,  823,

-  312,  808,  824,  313,  809,  825,  314,  810,

-  826,  315,  811,  827,  316,  812,  828,  317,

-  813,  829,  318,  814,  830,  319,  815,  831,

-  336,  832,  848,  337,  833,  849,  338,  834,

-  850,  339,  835,  851,  340,  836,  852,  341,

-  837,  853,  342,  838,  854,  343,  839,  855,

-  344,  840,  856,  345,  841,  857,  346,  842,

-  858,  347,  843,  859,  348,  844,  860,  349,

-  845,  861,  350,  846,  862,  351,  847,  863,

-  368,  864,  880,  369,  865,  881,  370,  866,

-  882,  371,  867,  883,  372,  868,  884,  373,

-  869,  885,  374,  870,  886,  375,  871,  887,

-  376,  872,  888,  377,  873,  889,  378,  874,

-  890,  379,  875,  891,  380,  876,  892,  381,

-  877,  893,  382,  878,  894,  383,  879,  895,

-  400,  896,  912,  401,  897,  913,  402,  898,

-  914,  403,  899,  915,  404,  900,  916,  405,

-  901,  917,  406,  902,  918,  407,  903,  919,

-  408,  904,  920,  409,  905,  921,  410,  906,

-  922,  411,  907,  923,  412,  908,  924,  413,

-  909,  925,  414,  910,  926,  415,  911,  927,

-  432,  928,  944,  433,  929,  945,  434,  930,

-  946,  435,  931,  947,  436,  932,  948,  437,

-  933,  949,  438,  934,  950,  439,  935,  951,

-  440,  936,  952,  441,  937,  953,  442,  938,

-  954,  443,  939,  955,  444,  940,  956,  445,

-  941,  957,  446,  942,  958,  447,  943,  959,

-  464,  960,  976,  465,  961,  977,  466,  962,

-  978,  467,  963,  979,  468,  964,  980,  469,

-  965,  981,  470,  966,  982,  471,  967,  983,

-  472,  968,  984,  473,  969,  985,  474,  970,

-  986,  475,  971,  987,  476,  972,  988,  477,

-  973,  989,  478,  974,  990,  479,  975,  991,

-  496,  992, 1008,  497,  993, 1009,  498,  994,

-  1010,  499,  995, 1011,  500,  996, 1012,  501,

-  997, 1013,  502,  998, 1014,  503,  999, 1015,

-  504, 1000, 1016,  505, 1001, 1017,  506, 1002,

-  1018,  507, 1003, 1019,  508, 1004, 1020,  509,

-  1005, 1021,  510, 1006, 1022,  511, 1007, 1023,

+DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = {

+    0,  16,  32,  48,  64,  80,  96, 112, 128, 144, 160, 176, 192, 208, 224, 240,

+    1,  17,  33,  49,  65,  81,  97, 113, 129, 145, 161, 177, 193, 209, 225, 241,

+    2,  18,  34,  50,  66,  82,  98, 114, 130, 146, 162, 178, 194, 210, 226, 242,

+    3,  19,  35,  51,  67,  83,  99, 115, 131, 147, 163, 179, 195, 211, 227, 243,

+    4,  20,  36,  52,  68,  84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,

+    5,  21,  37,  53,  69,  85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,

+    6,  22,  38,  54,  70,  86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,

+    7,  23,  39,  55,  71,  87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,

+    8,  24,  40,  56,  72,  88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,

+    9,  25,  41,  57,  73,  89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,

+   10,  26,  42,  58,  74,  90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,

+   11,  27,  43,  59,  75,  91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,

+   12,  28,  44,  60,  76,  92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,

+   13,  29,  45,  61,  77,  93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,

+   14,  30,  46,  62,  78,  94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,

+   15,  31,  47,  63,  79,  95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,

};

-#endif

-#else

-DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = {

-  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,

-  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,

-  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

-  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = {

+    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,

+   16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,

+   32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,

+   48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,

+   64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,

+   80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,

+   96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,

+  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,

+  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,

+  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,

+  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,

+  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,

+  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,

+  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,

+  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,

+  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,

};

 DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = {

@@ -865,7 +367,7 @@

   951,  920,  889,  858,  827,  796,  765,  734,  703,  735,  766,  797,  828,  859,  890,  921,  952,  983, 1014, 1015,  984,  953,  922,  891,  860,  829,  798,  767,  799,  830,  861,  892,

   923,  954,  985, 1016, 1017,  986,  955,  924,  893,  862,  831,  863,  894,  925,  956,  987, 1018, 1019,  988,  957,  926,  895,  927,  958,  989, 1020, 1021,  990,  959,  991, 1022, 1023,

};

-#endif  // CONFIG_DWTDCTHYBRID

+#endif  // CONFIG_SCATTERSCAN

 /* Array indices are identical to previously-existing CONTEXT_NODE indices */

@@ -898,6 +400,1661 @@

   254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129

};

+#if CONFIG_CODE_NONZEROCOUNT

+const vp9_tree_index vp9_nzc4x4_tree[2 * NZC4X4_NODES] = {

+  -NZC_0, 2,

+  4, 6,

+  -NZC_1, -NZC_2,

+  -NZC_3TO4, 8,

+  -NZC_5TO8, -NZC_9TO16,

+};

+struct vp9_token_struct vp9_nzc4x4_encodings[NZC4X4_TOKENS];

+const vp9_tree_index vp9_nzc8x8_tree[2 * NZC8X8_NODES] = {

+  -NZC_0, 2,

+  4, 6,

+  -NZC_1, -NZC_2,

+  8, 10,

+  -NZC_3TO4, -NZC_5TO8,

+  -NZC_9TO16, 12,

+  -NZC_17TO32, -NZC_33TO64,

+};

+struct vp9_token_struct vp9_nzc8x8_encodings[NZC8X8_TOKENS];

+const vp9_tree_index vp9_nzc16x16_tree[2 * NZC16X16_NODES] = {

+  -NZC_0, 2,

+  4, 6,

+  -NZC_1, -NZC_2,

+  8, 10,

+  -NZC_3TO4, -NZC_5TO8,

+  12, 14,

+  -NZC_9TO16, -NZC_17TO32,

+  -NZC_33TO64, 16,

+  -NZC_65TO128, -NZC_129TO256,

+};

+struct vp9_token_struct vp9_nzc16x16_encodings[NZC16X16_TOKENS];

+const vp9_tree_index vp9_nzc32x32_tree[2 * NZC32X32_NODES] = {

+  -NZC_0, 2,

+  4, 6,

+  -NZC_1, -NZC_2,

+  8, 10,

+  -NZC_3TO4, -NZC_5TO8,

+  12, 14,

+  -NZC_9TO16, -NZC_17TO32,

+  16, 18,

+  -NZC_33TO64, -NZC_65TO128,

+  -NZC_129TO256, 20,

+  -NZC_257TO512, -NZC_513TO1024,

+};

+struct vp9_token_struct vp9_nzc32x32_encodings[NZC32X32_TOKENS];

+const int vp9_extranzcbits[NZC32X32_TOKENS] = {

+  0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9

+};

+const int vp9_basenzcvalue[NZC32X32_TOKENS] = {

+  0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513

+};

+#endif  // CONFIG_CODE_NONZEROCOUNT

+#if CONFIG_MODELCOEFPROB

+const vp9_prob vp9_modelcoefprobs_gg875[COEFPROB_MODELS][ENTROPY_NODES - 1] = {

+  // Probs generated with a Generalized Gaussian (with shape parameter 0.875)

+  // source model with varying quantizer step size for a uniform quantizer

+  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use

+  {1,   2,   6,  86, 129,  11,  87,  42,  92,  52,},

+  {2,   4,  12,  87, 129,  22,  89,  75,  97,  91,},

+  {3,   6,  17,  88, 130,  32,  90, 102, 102, 121,},

+  {4,   8,  22,  89, 131,  41,  91, 125, 107, 145,},

+  {5,  10,  28,  90, 131,  50,  93, 144, 112, 164,},

+  {6,  12,  33,  90, 132,  59,  94, 160, 117, 180,},

+  {7,  14,  38,  91, 132,  67,  95, 173, 122, 193,},

+  {8,  15,  42,  92, 133,  75,  97, 185, 126, 204,},

+  {9,  17,  47,  92, 133,  82,  98, 195, 131, 212,},

+  {10,  19,  52,  93, 134,  89,  99, 203, 135, 220,},

+  {11,  21,  56,  94, 134,  96, 101, 211, 140, 226,},

+  {12,  23,  60,  95, 135, 102, 102, 217, 144, 231,},

+  {13,  25,  65,  95, 135, 109, 103, 222, 148, 235,},

+  {14,  26,  69,  96, 136, 115, 105, 227, 153, 238,},

+  {15,  28,  73,  97, 136, 120, 106, 231, 157, 241,},

+  {16,  30,  77,  97, 137, 126, 107, 234, 161, 244,},

+  {17,  32,  81,  98, 138, 131, 108, 237, 164, 246,},

+  {18,  34,  85,  99, 138, 136, 110, 240, 168, 247,},

+  {19,  35,  89, 100, 139, 141, 111, 242, 172, 249,},

+  {20,  37,  92, 100, 139, 145, 112, 244, 175, 250,},

+  {21,  39,  96, 101, 140, 150, 113, 246, 179, 251,},

+  {22,  41,  99, 102, 140, 154, 115, 247, 182, 252,},

+  {23,  42, 103, 102, 141, 158, 116, 248, 185, 252,},

+  {24,  44, 106, 103, 141, 162, 117, 249, 188, 253,},

+  {25,  46, 110, 104, 142, 166, 118, 250, 191, 253,},

+  {26,  48, 113, 104, 142, 170, 120, 251, 194, 254,},

+  {27,  49, 116, 105, 143, 173, 121, 252, 197, 254,},

+  {28,  51, 119, 106, 143, 176, 122, 252, 200, 254,},

+  {29,  53, 122, 107, 144, 180, 123, 253, 202, 255,},

+  {30,  54, 125, 107, 144, 183, 125, 253, 205, 255,},

+  {31,  56, 128, 108, 145, 186, 126, 254, 207, 255,},

+  {32,  58, 131, 109, 145, 189, 127, 254, 209, 255,},

+  {33,  59, 134, 109, 146, 191, 128, 254, 212, 255,},

+  {34,  61, 137, 110, 146, 194, 130, 254, 214, 255,},

+  {35,  62, 139, 111, 147, 196, 131, 255, 216, 255,},

+  {36,  64, 142, 112, 147, 199, 132, 255, 218, 255,},

+  {37,  66, 145, 112, 148, 201, 134, 255, 220, 255,},

+  {38,  67, 147, 113, 148, 203, 135, 255, 221, 255,},

+  {39,  69, 150, 114, 149, 206, 136, 255, 223, 255,},

+  {40,  70, 152, 114, 149, 208, 137, 255, 225, 255,},

+  {41,  72, 155, 115, 150, 210, 138, 255, 226, 255,},

+  {42,  74, 157, 116, 150, 212, 140, 255, 228, 255,},

+  {43,  75, 159, 117, 151, 213, 141, 255, 229, 255,},

+  {44,  77, 161, 117, 151, 215, 142, 255, 230, 255,},

+  {45,  78, 164, 118, 152, 217, 143, 255, 232, 255,},

+  {46,  80, 166, 119, 152, 219, 145, 255, 233, 255,},

+  {47,  81, 168, 120, 153, 220, 146, 255, 234, 255,},

+  {48,  83, 170, 120, 153, 222, 147, 255, 235, 255,},

+  {49,  84, 172, 121, 154, 223, 148, 255, 236, 255,},

+  {50,  86, 174, 122, 154, 225, 150, 255, 237, 255,},

+  {51,  87, 176, 123, 155, 226, 151, 255, 238, 255,},

+  {52,  89, 178, 123, 155, 227, 152, 255, 239, 255,},

+  {53,  90, 180, 124, 156, 228, 153, 255, 240, 255,},

+  {54,  92, 182, 125, 156, 230, 154, 255, 241, 255,},

+  {55,  93, 183, 126, 157, 231, 156, 255, 242, 255,},

+  {56,  95, 185, 126, 157, 232, 157, 255, 242, 255,},

+  {57,  96, 187, 127, 158, 233, 158, 255, 243, 255,},

+  {58,  98, 189, 128, 158, 234, 159, 255, 244, 255,},

+  {59,  99, 190, 129, 159, 235, 160, 255, 244, 255,},

+  {60, 101, 192, 129, 159, 236, 162, 255, 245, 255,},

+  {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,},

+  {62, 104, 195, 131, 160, 238, 164, 255, 246, 255,},

+  {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,},

+  {64, 106, 198, 132, 162, 239, 166, 255, 247, 255,},

+  {65, 108, 199, 133, 162, 240, 167, 255, 248, 255,},

+  {66, 109, 201, 134, 163, 241, 169, 255, 248, 255,},

+  {67, 111, 202, 135, 163, 241, 170, 255, 249, 255,},

+  {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,},

+  {69, 113, 205, 136, 164, 243, 172, 255, 249, 255,},

+  {70, 115, 206, 137, 165, 243, 173, 255, 250, 255,},

+  {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,},

+  {72, 117, 209, 138, 166, 244, 175, 255, 250, 255,},

+  {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,},

+  {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,},

+  {75, 121, 212, 141, 167, 246, 179, 255, 251, 255,},

+  {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,},

+  {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,},

+  {78, 125, 216, 143, 169, 247, 182, 255, 252, 255,},

+  {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,},

+  {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,},

+  {81, 129, 219, 146, 171, 248, 185, 255, 253, 255,},

+  {82, 131, 220, 146, 171, 249, 186, 255, 253, 255,},

+  {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,},

+  {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,},

+  {85, 134, 223, 149, 173, 250, 189, 255, 253, 255,},

+  {86, 136, 224, 149, 173, 250, 190, 255, 254, 255,},

+  {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,},

+  {88, 138, 226, 151, 174, 251, 192, 255, 254, 255,},

+  {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,},

+  {90, 141, 227, 153, 175, 251, 194, 255, 254, 255,},

+  {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,},

+  {92, 143, 229, 154, 177, 252, 196, 255, 254, 255,},

+  {93, 144, 230, 155, 177, 252, 197, 255, 254, 255,},

+  {94, 146, 230, 156, 178, 252, 198, 255, 255, 255,},

+  {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,},

+  {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,},

+  {97, 149, 233, 158, 179, 253, 201, 255, 255, 255,},

+  {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,},

+  {99, 152, 234, 160, 180, 253, 203, 255, 255, 255,},

+  {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,},

+  {101, 154, 235, 161, 182, 253, 205, 255, 255, 255,},

+  {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,},

+  {103, 156, 236, 163, 183, 254, 207, 255, 255, 255,},

+  {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,},

+  {105, 159, 238, 165, 184, 254, 208, 255, 255, 255,},

+  {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,},

+  {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,},

+  {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,},

+  {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,},

+  {110, 164, 240, 169, 187, 254, 212, 255, 255, 255,},

+  {111, 165, 241, 170, 187, 254, 213, 255, 255, 255,},

+  {112, 166, 241, 170, 188, 255, 214, 255, 255, 255,},

+  {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,},

+  {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,},

+  {115, 170, 243, 173, 189, 255, 216, 255, 255, 255,},

+  {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,},

+  {117, 172, 244, 174, 190, 255, 218, 255, 255, 255,},

+  {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,},

+  {119, 174, 244, 176, 192, 255, 219, 255, 255, 255,},

+  {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,},

+  {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,},

+  {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,},

+  {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,},

+  {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,},

+  {125, 180, 247, 181, 195, 255, 224, 255, 255, 255,},

+  {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,},

+  {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,},

+  {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,},

+  {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,},

+  {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,},

+  {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,},

+  {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,},

+  {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,},

+  {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,},

+  {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,},

+  {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,},

+  {137, 192, 250, 190, 202, 255, 231, 255, 255, 255,},

+  {138, 193, 250, 191, 202, 255, 232, 255, 255, 255,},

+  {139, 194, 250, 192, 203, 255, 232, 255, 255, 255,},

+  {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,},

+  {141, 195, 251, 194, 204, 255, 234, 255, 255, 255,},

+  {142, 196, 251, 194, 205, 255, 234, 255, 255, 255,},

+  {143, 197, 251, 195, 205, 255, 235, 255, 255, 255,},

+  {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,},

+  {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,},

+  {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,},

+  {147, 201, 252, 198, 208, 255, 237, 255, 255, 255,},

+  {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,},

+  {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,},

+  {150, 203, 252, 201, 209, 255, 238, 255, 255, 255,},

+  {151, 204, 253, 201, 210, 255, 239, 255, 255, 255,},

+  {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,},

+  {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,},

+  {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,},

+  {155, 208, 253, 204, 212, 255, 240, 255, 255, 255,},

+  {156, 209, 253, 205, 213, 255, 241, 255, 255, 255,},

+  {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,},

+  {158, 210, 254, 207, 214, 255, 242, 255, 255, 255,},

+  {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,},

+  {160, 212, 254, 208, 215, 255, 242, 255, 255, 255,},

+  {161, 213, 254, 209, 215, 255, 243, 255, 255, 255,},

+  {162, 213, 254, 210, 216, 255, 243, 255, 255, 255,},

+  {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,},

+  {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,},

+  {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,},

+  {166, 216, 254, 212, 218, 255, 245, 255, 255, 255,},

+  {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,},

+  {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,},

+  {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,},

+  {170, 219, 255, 215, 221, 255, 246, 255, 255, 255,},

+  {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,},

+  {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,},

+  {173, 222, 255, 217, 222, 255, 247, 255, 255, 255,},

+  {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,},

+  {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,},

+  {176, 224, 255, 220, 224, 255, 248, 255, 255, 255,},

+  {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,},

+  {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,},

+  {179, 226, 255, 222, 225, 255, 249, 255, 255, 255,},

+  {180, 226, 255, 222, 226, 255, 249, 255, 255, 255,},

+  {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,},

+  {182, 228, 255, 224, 227, 255, 249, 255, 255, 255,},

+  {183, 228, 255, 224, 228, 255, 250, 255, 255, 255,},

+  {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,},

+  {185, 230, 255, 226, 229, 255, 250, 255, 255, 255,},

+  {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,},

+  {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,},

+  {188, 232, 255, 228, 230, 255, 251, 255, 255, 255,},

+  {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,},

+  {190, 233, 255, 229, 231, 255, 251, 255, 255, 255,},

+  {191, 233, 255, 229, 232, 255, 251, 255, 255, 255,},

+  {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,},

+  {193, 234, 255, 231, 233, 255, 252, 255, 255, 255,},

+  {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,},

+  {195, 236, 255, 232, 234, 255, 252, 255, 255, 255,},

+  {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,},

+  {197, 237, 255, 233, 235, 255, 252, 255, 255, 255,},

+  {198, 237, 255, 234, 235, 255, 253, 255, 255, 255,},

+  {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,},

+  {200, 238, 255, 235, 236, 255, 253, 255, 255, 255,},

+  {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,},

+  {202, 239, 255, 236, 237, 255, 253, 255, 255, 255,},

+  {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,},

+  {204, 240, 255, 237, 238, 255, 254, 255, 255, 255,},

+  {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,},

+  {206, 241, 255, 238, 239, 255, 254, 255, 255, 255,},

+  {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,},

+  {208, 242, 255, 239, 240, 255, 254, 255, 255, 255,},

+  {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,},

+  {210, 243, 255, 240, 241, 255, 254, 255, 255, 255,},

+  {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,},

+  {212, 244, 255, 241, 242, 255, 254, 255, 255, 255,},

+  {213, 245, 255, 242, 243, 255, 255, 255, 255, 255,},

+  {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,},

+  {215, 246, 255, 243, 244, 255, 255, 255, 255, 255,},

+  {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,},

+  {217, 246, 255, 244, 244, 255, 255, 255, 255, 255,},

+  {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,},

+  {219, 247, 255, 245, 245, 255, 255, 255, 255, 255,},

+  {220, 248, 255, 245, 246, 255, 255, 255, 255, 255,},

+  {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,},

+  {222, 248, 255, 246, 247, 255, 255, 255, 255, 255,},

+  {223, 249, 255, 247, 247, 255, 255, 255, 255, 255,},

+  {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,},

+  {225, 250, 255, 247, 248, 255, 255, 255, 255, 255,},

+  {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,},

+  {227, 250, 255, 248, 249, 255, 255, 255, 255, 255,},

+  {228, 251, 255, 249, 249, 255, 255, 255, 255, 255,},

+  {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,},

+  {230, 251, 255, 249, 250, 255, 255, 255, 255, 255,},

+  {231, 251, 255, 250, 250, 255, 255, 255, 255, 255,},

+  {232, 252, 255, 250, 250, 255, 255, 255, 255, 255,},

+  {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,},

+  {234, 252, 255, 251, 251, 255, 255, 255, 255, 255,},

+  {235, 253, 255, 251, 251, 255, 255, 255, 255, 255,},

+  {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {237, 253, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {238, 253, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {239, 254, 255, 253, 253, 255, 255, 255, 255, 255,},

+  {240, 254, 255, 253, 253, 255, 255, 255, 255, 255,},

+  {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,},

+  {242, 254, 255, 253, 254, 255, 255, 255, 255, 255,},

+  {243, 254, 255, 254, 254, 255, 255, 255, 255, 255,},

+  {244, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+  {245, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+  {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+  {247, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {248, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {249, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {250, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {251, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {252, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {254, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+};

+const vp9_prob vp9_modelcoefprobs_gg75[COEFPROB_MODELS][ENTROPY_NODES - 1] = {

+  // Probs generated with a Generalized Gaussian (with shape parameter 0.75)

+  // source model with varying quantizer step size for a uniform quantizer

+  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use

+  {1,   2,   6,  87, 129,  11,  88,  39,  93,  47,},

+  {2,   4,  11,  88, 130,  21,  89,  68,  98,  79,},

+  {3,   6,  16,  89, 131,  30,  91,  92, 103, 105,},

+  {4,   8,  21,  90, 131,  38,  92, 112, 107, 126,},

+  {5,  10,  26,  90, 132,  46,  94, 129, 111, 143,},

+  {6,  11,  31,  91, 133,  54,  95, 143, 115, 157,},

+  {7,  13,  35,  92, 133,  61,  96, 156, 119, 170,},

+  {8,  15,  40,  93, 134,  68,  97, 167, 123, 180,},

+  {9,  17,  44,  94, 134,  74,  98, 177, 126, 189,},

+  {10,  19,  48,  94, 135,  80, 100, 185, 130, 197,},

+  {11,  20,  52,  95, 135,  86, 101, 192, 133, 204,},

+  {12,  22,  56,  96, 136,  92, 102, 199, 137, 210,},

+  {13,  24,  60,  96, 136,  97, 103, 205, 140, 215,},

+  {14,  26,  64,  97, 137, 103, 104, 210, 143, 219,},

+  {15,  27,  68,  98, 137, 108, 105, 215, 146, 223,},

+  {16,  29,  71,  98, 138, 112, 106, 219, 149, 227,},

+  {17,  31,  75,  99, 138, 117, 107, 223, 152, 230,},

+  {18,  32,  78, 100, 139, 121, 108, 226, 155, 233,},

+  {19,  34,  82, 100, 139, 126, 109, 229, 158, 235,},

+  {20,  36,  85, 101, 140, 130, 110, 231, 161, 238,},

+  {21,  37,  88, 102, 140, 134, 111, 234, 164, 239,},

+  {22,  39,  91, 102, 141, 138, 112, 236, 167, 241,},

+  {23,  40,  94, 103, 141, 141, 113, 238, 169, 243,},

+  {24,  42,  97, 104, 142, 145, 114, 240, 172, 244,},

+  {25,  44, 100, 104, 142, 149, 115, 241, 174, 245,},

+  {26,  45, 103, 105, 143, 152, 116, 243, 177, 246,},

+  {27,  47, 106, 105, 143, 155, 117, 244, 179, 247,},

+  {28,  48, 109, 106, 143, 158, 118, 245, 182, 248,},

+  {29,  50, 112, 107, 144, 161, 119, 246, 184, 249,},

+  {30,  52, 115, 107, 144, 164, 120, 247, 186, 250,},

+  {31,  53, 117, 108, 145, 167, 121, 248, 188, 250,},

+  {32,  55, 120, 109, 145, 170, 122, 249, 190, 251,},

+  {33,  56, 122, 109, 146, 173, 123, 249, 192, 252,},

+  {34,  58, 125, 110, 146, 175, 124, 250, 194, 252,},

+  {35,  59, 127, 110, 147, 178, 125, 251, 196, 252,},

+  {36,  61, 130, 111, 147, 180, 126, 251, 198, 253,},

+  {37,  62, 132, 112, 147, 183, 127, 251, 200, 253,},

+  {38,  64, 135, 112, 148, 185, 128, 252, 202, 253,},

+  {39,  65, 137, 113, 148, 187, 129, 252, 204, 254,},

+  {40,  67, 139, 114, 149, 189, 130, 253, 205, 254,},

+  {41,  68, 141, 114, 149, 191, 131, 253, 207, 254,},

+  {42,  70, 144, 115, 150, 193, 132, 253, 209, 254,},

+  {43,  71, 146, 115, 150, 195, 133, 254, 210, 254,},

+  {44,  72, 148, 116, 151, 197, 134, 254, 212, 255,},

+  {45,  74, 150, 117, 151, 199, 135, 254, 213, 255,},

+  {46,  75, 152, 117, 151, 201, 136, 254, 215, 255,},

+  {47,  77, 154, 118, 152, 202, 137, 254, 216, 255,},

+  {48,  78, 156, 119, 152, 204, 138, 254, 217, 255,},

+  {49,  80, 158, 119, 153, 206, 139, 255, 219, 255,},

+  {50,  81, 160, 120, 153, 207, 140, 255, 220, 255,},

+  {51,  82, 162, 120, 154, 209, 141, 255, 221, 255,},

+  {52,  84, 164, 121, 154, 210, 142, 255, 222, 255,},

+  {53,  85, 165, 122, 155, 212, 143, 255, 224, 255,},

+  {54,  87, 167, 122, 155, 213, 144, 255, 225, 255,},

+  {55,  88, 169, 123, 155, 215, 145, 255, 226, 255,},

+  {56,  89, 171, 124, 156, 216, 146, 255, 227, 255,},

+  {57,  91, 172, 124, 156, 217, 146, 255, 228, 255,},

+  {58,  92, 174, 125, 157, 218, 147, 255, 229, 255,},

+  {59,  93, 176, 126, 157, 220, 148, 255, 230, 255,},

+  {60,  95, 177, 126, 158, 221, 149, 255, 231, 255,},

+  {61,  96, 179, 127, 158, 222, 150, 255, 232, 255,},

+  {62,  97, 180, 127, 159, 223, 151, 255, 232, 255,},

+  {63,  99, 182, 128, 159, 224, 152, 255, 233, 255,},

+  {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,},

+  {65, 101, 185, 129, 160, 226, 154, 255, 235, 255,},

+  {66, 103, 186, 130, 160, 227, 155, 255, 236, 255,},

+  {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,},

+  {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,},

+  {69, 106, 190, 132, 162, 230, 158, 255, 238, 255,},

+  {70, 108, 192, 133, 162, 231, 159, 255, 238, 255,},

+  {71, 109, 193, 133, 162, 231, 159, 255, 239, 255,},

+  {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,},

+  {73, 111, 196, 134, 163, 233, 161, 255, 240, 255,},

+  {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,},

+  {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,},

+  {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,},

+  {77, 116, 200, 137, 165, 236, 165, 255, 243, 255,},

+  {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,},

+  {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,},

+  {80, 120, 204, 139, 167, 238, 168, 255, 244, 255,},

+  {81, 121, 205, 140, 167, 239, 168, 255, 244, 255,},

+  {82, 123, 206, 140, 167, 239, 169, 255, 245, 255,},

+  {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,},

+  {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,},

+  {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,},

+  {86, 127, 210, 143, 169, 241, 173, 255, 247, 255,},

+  {87, 129, 211, 144, 170, 242, 174, 255, 247, 255,},

+  {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,},

+  {89, 131, 213, 145, 171, 243, 175, 255, 248, 255,},

+  {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,},

+  {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,},

+  {92, 134, 216, 147, 172, 244, 178, 255, 249, 255,},

+  {93, 136, 217, 148, 172, 245, 179, 255, 249, 255,},

+  {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,},

+  {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,},

+  {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,},

+  {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,},

+  {98, 141, 221, 151, 175, 247, 183, 255, 250, 255,},

+  {99, 142, 222, 152, 175, 247, 184, 255, 250, 255,},

+  {100, 144, 223, 152, 176, 247, 185, 255, 251, 255,},

+  {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,},

+  {102, 146, 224, 154, 177, 248, 186, 255, 251, 255,},

+  {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,},

+  {104, 148, 226, 155, 177, 248, 188, 255, 252, 255,},

+  {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,},

+  {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,},

+  {107, 151, 228, 157, 179, 249, 191, 255, 252, 255,},

+  {108, 152, 229, 158, 179, 250, 191, 255, 252, 255,},

+  {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,},

+  {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,},

+  {111, 155, 231, 160, 181, 250, 194, 255, 253, 255,},

+  {112, 157, 231, 160, 181, 251, 195, 255, 253, 255,},

+  {113, 158, 232, 161, 182, 251, 195, 255, 253, 255,},

+  {114, 159, 232, 162, 182, 251, 196, 255, 253, 255,},

+  {115, 160, 233, 162, 183, 251, 197, 255, 253, 255,},

+  {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,},

+  {117, 162, 234, 164, 184, 252, 198, 255, 254, 255,},

+  {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,},

+  {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,},

+  {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,},

+  {121, 166, 236, 167, 186, 252, 201, 255, 254, 255,},

+  {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,},

+  {123, 168, 237, 168, 186, 253, 203, 255, 254, 255,},

+  {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,},

+  {125, 170, 238, 169, 187, 253, 204, 255, 254, 255,},

+  {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,},

+  {127, 172, 239, 171, 188, 253, 206, 255, 254, 255,},

+  {128, 173, 240, 171, 189, 253, 207, 255, 255, 255,},

+  {129, 174, 240, 172, 189, 253, 207, 255, 255, 255,},

+  {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,},

+  {131, 176, 241, 174, 190, 254, 209, 255, 255, 255,},

+  {132, 177, 241, 174, 191, 254, 209, 255, 255, 255,},

+  {133, 178, 242, 175, 191, 254, 210, 255, 255, 255,},

+  {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,},

+  {135, 180, 243, 176, 192, 254, 212, 255, 255, 255,},

+  {136, 180, 243, 177, 193, 254, 212, 255, 255, 255,},

+  {137, 181, 243, 178, 193, 254, 213, 255, 255, 255,},

+  {138, 182, 244, 179, 194, 254, 214, 255, 255, 255,},

+  {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,},

+  {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,},

+  {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,},

+  {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,},

+  {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,},

+  {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,},

+  {145, 189, 246, 183, 197, 255, 218, 255, 255, 255,},

+  {146, 190, 246, 184, 198, 255, 219, 255, 255, 255,},

+  {147, 191, 247, 185, 198, 255, 220, 255, 255, 255,},

+  {148, 191, 247, 186, 199, 255, 220, 255, 255, 255,},

+  {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,},

+  {150, 193, 248, 187, 200, 255, 221, 255, 255, 255,},

+  {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,},

+  {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,},

+  {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,},

+  {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,},

+  {155, 198, 249, 191, 202, 255, 224, 255, 255, 255,},

+  {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,},

+  {157, 199, 249, 192, 203, 255, 226, 255, 255, 255,},

+  {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,},

+  {159, 201, 250, 193, 204, 255, 227, 255, 255, 255,},

+  {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,},

+  {161, 203, 250, 195, 206, 255, 228, 255, 255, 255,},

+  {162, 203, 250, 196, 206, 255, 228, 255, 255, 255,},

+  {163, 204, 251, 196, 207, 255, 229, 255, 255, 255,},

+  {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,},

+  {165, 206, 251, 198, 208, 255, 230, 255, 255, 255,},

+  {166, 207, 251, 198, 208, 255, 231, 255, 255, 255,},

+  {167, 207, 251, 199, 209, 255, 231, 255, 255, 255,},

+  {168, 208, 252, 200, 209, 255, 232, 255, 255, 255,},

+  {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,},

+  {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,},

+  {171, 211, 252, 202, 211, 255, 233, 255, 255, 255,},

+  {172, 211, 252, 203, 211, 255, 234, 255, 255, 255,},

+  {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,},

+  {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,},

+  {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,},

+  {176, 214, 253, 206, 213, 255, 236, 255, 255, 255,},

+  {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,},

+  {178, 216, 253, 207, 214, 255, 237, 255, 255, 255,},

+  {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,},

+  {180, 217, 253, 208, 216, 255, 237, 255, 255, 255,},

+  {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,},

+  {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,},

+  {183, 220, 254, 211, 217, 255, 239, 255, 255, 255,},

+  {184, 220, 254, 211, 218, 255, 239, 255, 255, 255,},

+  {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,},

+  {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,},

+  {187, 222, 254, 213, 219, 255, 241, 255, 255, 255,},

+  {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,},

+  {189, 224, 254, 215, 220, 255, 241, 255, 255, 255,},

+  {190, 225, 254, 215, 221, 255, 242, 255, 255, 255,},

+  {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,},

+  {192, 226, 254, 217, 222, 255, 243, 255, 255, 255,},

+  {193, 227, 255, 218, 223, 255, 243, 255, 255, 255,},

+  {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,},

+  {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,},

+  {196, 229, 255, 220, 224, 255, 244, 255, 255, 255,},

+  {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,},

+  {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,},

+  {199, 230, 255, 222, 226, 255, 245, 255, 255, 255,},

+  {200, 231, 255, 222, 226, 255, 246, 255, 255, 255,},

+  {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,},

+  {202, 232, 255, 224, 228, 255, 246, 255, 255, 255,},

+  {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,},

+  {204, 234, 255, 225, 229, 255, 247, 255, 255, 255,},

+  {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,},

+  {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,},

+  {207, 235, 255, 227, 230, 255, 248, 255, 255, 255,},

+  {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,},

+  {209, 237, 255, 229, 231, 255, 248, 255, 255, 255,},

+  {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,},

+  {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,},

+  {212, 238, 255, 231, 233, 255, 249, 255, 255, 255,},

+  {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,},

+  {214, 239, 255, 232, 234, 255, 250, 255, 255, 255,},

+  {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,},

+  {216, 241, 255, 233, 235, 255, 250, 255, 255, 255,},

+  {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,},

+  {218, 242, 255, 235, 236, 255, 251, 255, 255, 255,},

+  {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,},

+  {220, 243, 255, 236, 237, 255, 251, 255, 255, 255,},

+  {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,},

+  {222, 244, 255, 237, 239, 255, 252, 255, 255, 255,},

+  {223, 244, 255, 238, 239, 255, 252, 255, 255, 255,},

+  {224, 245, 255, 238, 240, 255, 252, 255, 255, 255,},

+  {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,},

+  {226, 246, 255, 240, 241, 255, 253, 255, 255, 255,},

+  {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,},

+  {228, 247, 255, 241, 242, 255, 253, 255, 255, 255,},

+  {229, 247, 255, 242, 242, 255, 253, 255, 255, 255,},

+  {230, 248, 255, 242, 243, 255, 253, 255, 255, 255,},

+  {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,},

+  {232, 248, 255, 243, 244, 255, 254, 255, 255, 255,},

+  {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,},

+  {234, 249, 255, 245, 245, 255, 254, 255, 255, 255,},

+  {235, 250, 255, 245, 246, 255, 254, 255, 255, 255,},

+  {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,},

+  {237, 251, 255, 246, 247, 255, 255, 255, 255, 255,},

+  {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,},

+  {239, 251, 255, 248, 248, 255, 255, 255, 255, 255,},

+  {240, 252, 255, 248, 248, 255, 255, 255, 255, 255,},

+  {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,},

+  {242, 252, 255, 249, 249, 255, 255, 255, 255, 255,},

+  {243, 253, 255, 250, 250, 255, 255, 255, 255, 255,},

+  {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,},

+  {245, 253, 255, 251, 251, 255, 255, 255, 255, 255,},

+  {246, 254, 255, 251, 251, 255, 255, 255, 255, 255,},

+  {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {248, 254, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {249, 255, 255, 253, 253, 255, 255, 255, 255, 255,},

+  {250, 255, 255, 253, 253, 255, 255, 255, 255, 255,},

+  {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+  {252, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+  {253, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {254, 255, 255, 255, 255, 255, 255, 255, 255, 255,},

+  {255, 255, 255, 255, 255, 255, 255, 255, 255, 255,}

+};

+const vp9_prob vp9_modelcoefprobs_gg625[COEFPROB_MODELS][ENTROPY_NODES - 1] = {

+  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)

+  // source model with varying quantizer step size for a uniform quantizer

+  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use

+  {1,   2,   6,  88, 130,  10,  88,  35,  94,  40,},

+  {2,   4,  11,  89, 131,  19,  90,  60,  99,  67,},

+  {3,   6,  15,  90, 132,  27,  92,  80, 103,  88,},

+  {4,   7,  20,  91, 132,  34,  93,  97, 107, 105,},

+  {5,   9,  24,  92, 133,  41,  94, 112, 110, 120,},

+  {6,  11,  28,  93, 134,  48,  95, 125, 113, 132,},

+  {7,  13,  33,  93, 134,  54,  97, 136, 116, 143,},

+  {8,  14,  36,  94, 135,  60,  98, 146, 119, 152,},

+  {9,  16,  40,  95, 135,  65,  99, 155, 122, 161,},

+  {10,  18,  44,  95, 136,  70, 100, 163, 125, 168,},

+  {11,  19,  48,  96, 136,  75, 101, 170, 127, 175,},

+  {12,  21,  51,  97, 137,  80, 102, 176, 130, 181,},

+  {13,  23,  55,  97, 137,  85, 102, 182, 132, 187,},

+  {14,  24,  58,  98, 138,  89, 103, 188, 135, 192,},

+  {15,  26,  61,  99, 138,  94, 104, 193, 137, 196,},

+  {16,  27,  64,  99, 139,  98, 105, 197, 140, 201,},

+  {17,  29,  67, 100, 139, 102, 106, 201, 142, 205,},

+  {18,  30,  70, 101, 140, 106, 107, 205, 144, 208,},

+  {19,  32,  73, 101, 140, 109, 108, 209, 146, 211,},

+  {20,  34,  76, 102, 140, 113, 109, 212, 148, 214,},

+  {21,  35,  79, 102, 141, 116, 109, 215, 151, 217,},

+  {22,  37,  82, 103, 141, 120, 110, 218, 153, 220,},

+  {23,  38,  85, 103, 142, 123, 111, 220, 155, 222,},

+  {24,  40,  87, 104, 142, 126, 112, 223, 157, 224,},

+  {25,  41,  90, 105, 143, 129, 113, 225, 159, 226,},

+  {26,  42,  93, 105, 143, 132, 113, 227, 161, 228,},

+  {27,  44,  95, 106, 143, 135, 114, 229, 162, 230,},

+  {28,  45,  98, 106, 144, 138, 115, 230, 164, 232,},

+  {29,  47, 100, 107, 144, 141, 116, 232, 166, 233,},

+  {30,  48, 103, 107, 145, 144, 117, 234, 168, 235,},

+  {31,  50, 105, 108, 145, 146, 117, 235, 170, 236,},

+  {32,  51, 107, 108, 145, 149, 118, 236, 171, 237,},

+  {33,  52, 110, 109, 146, 151, 119, 238, 173, 238,},

+  {34,  54, 112, 110, 146, 154, 120, 239, 175, 239,},

+  {35,  55, 114, 110, 147, 156, 120, 240, 176, 240,},

+  {36,  57, 116, 111, 147, 158, 121, 241, 178, 241,},

+  {37,  58, 119, 111, 147, 161, 122, 242, 180, 242,},

+  {38,  59, 121, 112, 148, 163, 123, 243, 181, 243,},

+  {39,  61, 123, 112, 148, 165, 123, 244, 183, 244,},

+  {40,  62, 125, 113, 148, 167, 124, 244, 184, 245,},

+  {41,  63, 127, 113, 149, 169, 125, 245, 186, 245,},

+  {42,  65, 129, 114, 149, 171, 126, 246, 187, 246,},

+  {43,  66, 131, 114, 150, 173, 126, 246, 188, 247,},

+  {44,  67, 133, 115, 150, 175, 127, 247, 190, 247,},

+  {45,  69, 135, 115, 150, 177, 128, 247, 191, 248,},

+  {46,  70, 136, 116, 151, 178, 129, 248, 193, 248,},

+  {47,  71, 138, 116, 151, 180, 129, 248, 194, 249,},

+  {48,  73, 140, 117, 151, 182, 130, 249, 195, 249,},

+  {49,  74, 142, 118, 152, 184, 131, 249, 197, 250,},

+  {50,  75, 144, 118, 152, 185, 131, 250, 198, 250,},

+  {51,  76, 145, 119, 153, 187, 132, 250, 199, 250,},

+  {52,  78, 147, 119, 153, 188, 133, 251, 200, 251,},

+  {53,  79, 149, 120, 153, 190, 134, 251, 201, 251,},

+  {54,  80, 151, 120, 154, 192, 134, 251, 203, 251,},

+  {55,  82, 152, 121, 154, 193, 135, 251, 204, 252,},

+  {56,  83, 154, 121, 154, 194, 136, 252, 205, 252,},

+  {57,  84, 155, 122, 155, 196, 136, 252, 206, 252,},

+  {58,  85, 157, 122, 155, 197, 137, 252, 207, 252,},

+  {59,  86, 158, 123, 156, 199, 138, 252, 208, 252,},

+  {60,  88, 160, 123, 156, 200, 139, 253, 209, 253,},

+  {61,  89, 162, 124, 156, 201, 139, 253, 210, 253,},

+  {62,  90, 163, 124, 157, 202, 140, 253, 211, 253,},

+  {63,  91, 164, 125, 157, 204, 141, 253, 212, 253,},

+  {64,  93, 166, 125, 157, 205, 141, 253, 213, 253,},

+  {65,  94, 167, 126, 158, 206, 142, 254, 214, 254,},

+  {66,  95, 169, 126, 158, 207, 143, 254, 215, 254,},

+  {67,  96, 170, 127, 158, 208, 143, 254, 216, 254,},

+  {68,  97, 172, 127, 159, 209, 144, 254, 217, 254,},

+  {69,  98, 173, 128, 159, 210, 145, 254, 218, 254,},

+  {70, 100, 174, 128, 160, 212, 146, 254, 219, 254,},

+  {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,},

+  {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,},

+  {73, 103, 178, 130, 161, 215, 148, 255, 221, 255,},

+  {74, 104, 179, 131, 161, 216, 148, 255, 222, 255,},

+  {75, 105, 181, 131, 161, 217, 149, 255, 223, 255,},

+  {76, 107, 182, 132, 162, 217, 150, 255, 224, 255,},

+  {77, 108, 183, 132, 162, 218, 150, 255, 224, 255,},

+  {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,},

+  {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,},

+  {80, 111, 187, 134, 163, 221, 153, 255, 227, 255,},

+  {81, 112, 188, 134, 164, 222, 153, 255, 227, 255,},

+  {82, 113, 189, 135, 164, 223, 154, 255, 228, 255,},

+  {83, 115, 190, 135, 164, 223, 155, 255, 229, 255,},

+  {84, 116, 191, 136, 165, 224, 155, 255, 229, 255,},

+  {85, 117, 192, 136, 165, 225, 156, 255, 230, 255,},

+  {86, 118, 193, 137, 165, 226, 157, 255, 231, 255,},

+  {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,},

+  {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,},

+  {89, 121, 196, 139, 167, 228, 159, 255, 232, 255,},

+  {90, 122, 197, 139, 167, 229, 159, 255, 233, 255,},

+  {91, 123, 198, 140, 167, 229, 160, 255, 234, 255,},

+  {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,},

+  {93, 125, 200, 141, 168, 231, 162, 255, 235, 255,},

+  {94, 127, 201, 141, 168, 231, 162, 255, 235, 255,},

+  {95, 128, 202, 142, 169, 232, 163, 255, 236, 255,},

+  {96, 129, 203, 142, 169, 232, 164, 255, 236, 255,},

+  {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,},

+  {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,},

+  {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,},

+  {100, 133, 207, 144, 171, 235, 166, 255, 238, 255,},

+  {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,},

+  {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,},

+  {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,},

+  {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,},

+  {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,},

+  {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,},

+  {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,},

+  {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,},

+  {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,},

+  {110, 143, 215, 150, 175, 240, 173, 255, 242, 255,},

+  {111, 144, 216, 151, 175, 240, 174, 255, 243, 255,},

+  {112, 145, 217, 151, 175, 240, 174, 255, 243, 255,},

+  {113, 146, 217, 152, 176, 241, 175, 255, 244, 255,},

+  {114, 147, 218, 152, 176, 241, 176, 255, 244, 255,},

+  {115, 148, 219, 153, 176, 242, 177, 255, 244, 255,},

+  {116, 149, 219, 153, 177, 242, 177, 255, 245, 255,},

+  {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,},

+  {118, 151, 221, 155, 178, 243, 179, 255, 245, 255,},

+  {119, 152, 222, 155, 178, 243, 179, 255, 245, 255,},

+  {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,},

+  {121, 154, 223, 156, 179, 244, 181, 255, 246, 255,},

+  {122, 155, 224, 157, 179, 244, 181, 255, 246, 255,},

+  {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,},

+  {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,},

+  {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,},

+  {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,},

+  {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,},

+  {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,},

+  {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,},

+  {130, 163, 228, 161, 182, 247, 187, 255, 248, 255,},

+  {131, 164, 229, 162, 183, 247, 187, 255, 249, 255,},

+  {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,},

+  {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,},

+  {134, 166, 231, 164, 184, 248, 189, 255, 249, 255,},

+  {135, 167, 231, 164, 184, 248, 190, 255, 250, 255,},

+  {136, 168, 232, 165, 185, 248, 191, 255, 250, 255,},

+  {137, 169, 232, 166, 185, 248, 191, 255, 250, 255,},

+  {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,},

+  {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,},

+  {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,},

+  {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,},

+  {142, 174, 235, 169, 187, 250, 194, 255, 251, 255,},

+  {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,},

+  {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,},

+  {145, 177, 236, 170, 189, 250, 196, 255, 251, 255,},

+  {146, 177, 237, 171, 189, 250, 197, 255, 252, 255,},

+  {147, 178, 237, 172, 189, 251, 198, 255, 252, 255,},

+  {148, 179, 238, 172, 190, 251, 198, 255, 252, 255,},

+  {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,},

+  {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,},

+  {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,},

+  {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,},

+  {153, 184, 240, 175, 192, 252, 202, 255, 252, 255,},

+  {154, 184, 240, 176, 193, 252, 202, 255, 253, 255,},

+  {155, 185, 240, 177, 193, 252, 203, 255, 253, 255,},

+  {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,},

+  {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,},

+  {158, 188, 242, 178, 194, 252, 205, 255, 253, 255,},

+  {159, 189, 242, 179, 195, 252, 205, 255, 253, 255,},

+  {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,},

+  {161, 190, 243, 180, 196, 253, 207, 255, 253, 255,},

+  {162, 191, 243, 181, 196, 253, 207, 255, 254, 255,},

+  {163, 192, 243, 182, 197, 253, 208, 255, 254, 255,},

+  {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,},

+  {165, 194, 244, 183, 197, 253, 209, 255, 254, 255,},

+  {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,},

+  {167, 196, 245, 184, 198, 253, 210, 255, 254, 255,},

+  {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,},

+  {169, 197, 245, 186, 199, 254, 212, 255, 254, 255,},

+  {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,},

+  {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,},

+  {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,},

+  {173, 200, 246, 188, 201, 254, 214, 255, 254, 255,},

+  {174, 201, 247, 189, 202, 254, 215, 255, 254, 255,},

+  {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,},

+  {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,},

+  {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,},

+  {178, 204, 248, 191, 204, 254, 217, 255, 255, 255,},

+  {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,},

+  {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,},

+  {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,},

+  {182, 208, 249, 194, 205, 255, 220, 255, 255, 255,},

+  {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,},

+  {184, 209, 249, 196, 206, 255, 221, 255, 255, 255,},

+  {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,},

+  {186, 211, 250, 197, 207, 255, 222, 255, 255, 255,},

+  {187, 211, 250, 198, 208, 255, 223, 255, 255, 255,},

+  {188, 212, 250, 198, 208, 255, 223, 255, 255, 255,},

+  {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,},

+  {190, 214, 251, 200, 209, 255, 224, 255, 255, 255,},

+  {191, 215, 251, 200, 210, 255, 225, 255, 255, 255,},

+  {192, 215, 251, 201, 211, 255, 225, 255, 255, 255,},

+  {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,},

+  {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,},

+  {195, 218, 252, 203, 212, 255, 227, 255, 255, 255,},

+  {196, 218, 252, 204, 213, 255, 228, 255, 255, 255,},

+  {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,},

+  {198, 220, 252, 205, 214, 255, 229, 255, 255, 255,},

+  {199, 221, 252, 206, 214, 255, 229, 255, 255, 255,},

+  {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,},

+  {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,},

+  {202, 223, 253, 208, 216, 255, 231, 255, 255, 255,},

+  {203, 223, 253, 209, 216, 255, 232, 255, 255, 255,},

+  {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,},

+  {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,},

+  {206, 226, 253, 211, 218, 255, 233, 255, 255, 255,},

+  {207, 226, 253, 212, 219, 255, 234, 255, 255, 255,},

+  {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,},

+  {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,},

+  {210, 228, 254, 214, 220, 255, 236, 255, 255, 255,},

+  {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,},

+  {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,},

+  {213, 230, 254, 217, 222, 255, 237, 255, 255, 255,},

+  {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,},

+  {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,},

+  {216, 233, 254, 219, 224, 255, 239, 255, 255, 255,},

+  {217, 233, 254, 220, 225, 255, 239, 255, 255, 255,},

+  {218, 234, 255, 220, 225, 255, 240, 255, 255, 255,},

+  {219, 235, 255, 221, 226, 255, 240, 255, 255, 255,},

+  {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,},

+  {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,},

+  {222, 237, 255, 224, 228, 255, 242, 255, 255, 255,},

+  {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,},

+  {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,},

+  {225, 238, 255, 226, 230, 255, 243, 255, 255, 255,},

+  {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,},

+  {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,},

+  {228, 240, 255, 228, 232, 255, 245, 255, 255, 255,},

+  {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,},

+  {230, 242, 255, 230, 233, 255, 246, 255, 255, 255,},

+  {231, 242, 255, 231, 234, 255, 246, 255, 255, 255,},

+  {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,},

+  {233, 243, 255, 233, 235, 255, 247, 255, 255, 255,},

+  {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,},

+  {235, 245, 255, 234, 236, 255, 248, 255, 255, 255,},

+  {236, 245, 255, 235, 237, 255, 248, 255, 255, 255,},

+  {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,},

+  {238, 247, 255, 237, 239, 255, 249, 255, 255, 255,},

+  {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,},

+  {240, 248, 255, 239, 240, 255, 250, 255, 255, 255,},

+  {241, 248, 255, 240, 241, 255, 251, 255, 255, 255,},

+  {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,},

+  {243, 249, 255, 241, 243, 255, 251, 255, 255, 255,},

+  {244, 250, 255, 242, 243, 255, 252, 255, 255, 255,},

+  {245, 251, 255, 243, 244, 255, 252, 255, 255, 255,},

+  {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,},

+  {247, 252, 255, 245, 246, 255, 253, 255, 255, 255,},

+  {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,},

+  {249, 253, 255, 247, 248, 255, 254, 255, 255, 255,},

+  {250, 253, 255, 248, 249, 255, 254, 255, 255, 255,},

+  {251, 254, 255, 249, 250, 255, 254, 255, 255, 255,},

+  {252, 254, 255, 251, 251, 255, 255, 255, 255, 255,},

+  {253, 255, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,},

+  {255, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+};

+const vp9_prob vp9_modelcoefprobs_gg875p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {

+  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)

+  // source model with varying quantizer step size for a uniform quantizer

+  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use

+  {1,   1,   3,  86, 128,   6,  86,  22,  89,  28,},

+  {1,   2,   6,  86, 129,  11,  87,  42,  92,  52,},

+  {2,   3,   9,  87, 129,  17,  88,  59,  94,  73,},

+  {2,   4,  12,  87, 129,  22,  89,  75,  97,  92,},

+  {3,   5,  14,  88, 130,  27,  89,  90, 100, 108,},

+  {3,   6,  17,  88, 130,  33,  90, 103, 102, 122,},

+  {4,   7,  20,  88, 130,  37,  91, 115, 105, 135,},

+  {4,   8,  23,  89, 131,  42,  92, 126, 108, 147,},

+  {5,   9,  25,  89, 131,  47,  92, 137, 110, 157,},

+  {5,  10,  28,  90, 131,  52,  93, 146, 113, 167,},

+  {6,  11,  31,  90, 132,  56,  94, 154, 115, 175,},

+  {6,  12,  33,  90, 132,  60,  94, 162, 118, 183,},

+  {7,  13,  36,  91, 132,  65,  95, 170, 120, 190,},

+  {7,  14,  39,  91, 132,  69,  96, 176, 123, 196,},

+  {8,  15,  41,  92, 133,  73,  96, 182, 125, 201,},

+  {8,  16,  44,  92, 133,  77,  97, 188, 128, 206,},

+  {9,  17,  46,  92, 133,  81,  98, 193, 130, 211,},

+  {9,  18,  49,  93, 134,  85,  99, 198, 133, 215,},

+  {10,  19,  51,  93, 134,  89,  99, 203, 135, 219,},

+  {10,  20,  54,  93, 134,  92, 100, 207, 137, 222,},

+  {11,  21,  56,  94, 134,  96, 101, 211, 140, 226,},

+  {12,  22,  58,  94, 135, 100, 101, 214, 142, 228,},

+  {12,  23,  61,  95, 135, 103, 102, 217, 145, 231,},

+  {13,  24,  63,  95, 135, 106, 103, 220, 147, 233,},

+  {13,  25,  66,  95, 136, 110, 103, 223, 149, 235,},

+  {14,  26,  68,  96, 136, 113, 104, 226, 151, 237,},

+  {14,  27,  70,  96, 136, 116, 105, 228, 154, 239,},

+  {15,  28,  72,  97, 136, 119, 106, 230, 156, 241,},

+  {15,  29,  75,  97, 137, 122, 106, 232, 158, 242,},

+  {16,  30,  77,  97, 137, 125, 107, 234, 160, 243,},

+  {17,  31,  79,  98, 137, 128, 108, 236, 163, 245,},

+  {17,  32,  81,  98, 138, 131, 108, 237, 165, 246,},

+  {18,  33,  83,  99, 138, 134, 109, 239, 167, 247,},

+  {18,  34,  86,  99, 138, 137, 110, 240, 169, 248,},

+  {19,  35,  88,  99, 138, 140, 111, 242, 171, 248,},

+  {19,  36,  90, 100, 139, 142, 111, 243, 173, 249,},

+  {20,  37,  92, 100, 139, 145, 112, 244, 175, 250,},

+  {20,  38,  94, 101, 139, 148, 113, 245, 177, 250,},

+  {21,  39,  96, 101, 140, 150, 113, 246, 179, 251,},

+  {22,  40,  98, 101, 140, 153, 114, 246, 181, 251,},

+  {22,  41, 100, 102, 140, 155, 115, 247, 183, 252,},

+  {23,  42, 102, 102, 140, 157, 116, 248, 185, 252,},

+  {23,  43, 104, 103, 141, 160, 116, 249, 186, 253,},

+  {24,  44, 106, 103, 141, 162, 117, 249, 188, 253,},

+  {25,  45, 108, 103, 141, 164, 118, 250, 190, 253,},

+  {25,  46, 110, 104, 142, 166, 119, 250, 192, 253,},

+  {26,  47, 112, 104, 142, 168, 119, 251, 193, 254,},

+  {26,  48, 114, 105, 142, 171, 120, 251, 195, 254,},

+  {27,  49, 116, 105, 143, 173, 121, 252, 197, 254,},

+  {27,  50, 118, 105, 143, 175, 122, 252, 198, 254,},

+  {28,  51, 119, 106, 143, 177, 122, 252, 200, 254,},

+  {29,  52, 121, 106, 143, 179, 123, 253, 201, 255,},

+  {29,  53, 123, 107, 144, 180, 124, 253, 203, 255,},

+  {30,  54, 125, 107, 144, 182, 125, 253, 204, 255,},

+  {30,  55, 127, 108, 144, 184, 125, 253, 206, 255,},

+  {31,  56, 128, 108, 145, 186, 126, 254, 207, 255,},

+  {32,  57, 130, 108, 145, 188, 127, 254, 209, 255,},

+  {32,  58, 132, 109, 145, 189, 128, 254, 210, 255,},

+  {33,  59, 134, 109, 146, 191, 128, 254, 211, 255,},

+  {33,  60, 135, 110, 146, 193, 129, 254, 213, 255,},

+  {34,  61, 137, 110, 146, 194, 130, 254, 214, 255,},

+  {35,  62, 139, 111, 146, 196, 131, 255, 215, 255,},

+  {35,  63, 140, 111, 147, 197, 131, 255, 216, 255,},

+  {36,  64, 142, 112, 147, 199, 132, 255, 218, 255,},

+  {37,  65, 144, 112, 147, 200, 133, 255, 219, 255,},

+  {37,  66, 145, 112, 148, 202, 134, 255, 220, 255,},

+  {38,  67, 147, 113, 148, 203, 135, 255, 221, 255,},

+  {38,  68, 148, 113, 148, 204, 135, 255, 222, 255,},

+  {39,  69, 150, 114, 149, 206, 136, 255, 223, 255,},

+  {40,  70, 151, 114, 149, 207, 137, 255, 224, 255,},

+  {40,  71, 153, 115, 149, 208, 138, 255, 225, 255,},

+  {41,  72, 154, 115, 150, 210, 138, 255, 226, 255,},

+  {42,  73, 156, 116, 150, 211, 139, 255, 227, 255,},

+  {42,  74, 157, 116, 150, 212, 140, 255, 228, 255,},

+  {43,  75, 159, 117, 151, 213, 141, 255, 229, 255,},

+  {44,  76, 160, 117, 151, 214, 142, 255, 230, 255,},

+  {44,  77, 162, 117, 151, 216, 142, 255, 231, 255,},

+  {45,  78, 163, 118, 152, 217, 143, 255, 231, 255,},

+  {45,  79, 165, 118, 152, 218, 144, 255, 232, 255,},

+  {46,  80, 166, 119, 152, 219, 145, 255, 233, 255,},

+  {47,  81, 167, 119, 153, 220, 146, 255, 234, 255,},

+  {47,  82, 169, 120, 153, 221, 146, 255, 235, 255,},

+  {48,  83, 170, 120, 153, 222, 147, 255, 235, 255,},

+  {49,  84, 171, 121, 154, 223, 148, 255, 236, 255,},

+  {49,  85, 173, 121, 154, 224, 149, 255, 237, 255,},

+  {50,  86, 174, 122, 154, 225, 150, 255, 237, 255,},

+  {51,  87, 175, 122, 155, 225, 150, 255, 238, 255,},

+  {51,  88, 177, 123, 155, 226, 151, 255, 239, 255,},

+  {52,  89, 178, 123, 155, 227, 152, 255, 239, 255,},

+  {53,  90, 179, 124, 156, 228, 153, 255, 240, 255,},

+  {53,  91, 180, 124, 156, 229, 154, 255, 240, 255,},

+  {54,  92, 182, 125, 156, 230, 154, 255, 241, 255,},

+  {55,  93, 183, 125, 157, 230, 155, 255, 241, 255,},

+  {55,  94, 184, 126, 157, 231, 156, 255, 242, 255,},

+  {56,  95, 185, 126, 157, 232, 157, 255, 242, 255,},

+  {57,  96, 187, 127, 158, 233, 158, 255, 243, 255,},

+  {57,  97, 188, 127, 158, 233, 159, 255, 243, 255,},

+  {58,  98, 189, 128, 158, 234, 159, 255, 244, 255,},

+  {59,  99, 190, 128, 159, 235, 160, 255, 244, 255,},

+  {60, 100, 191, 129, 159, 235, 161, 255, 245, 255,},

+  {60, 101, 192, 129, 160, 236, 162, 255, 245, 255,},

+  {61, 102, 193, 130, 160, 237, 163, 255, 246, 255,},

+  {62, 103, 194, 131, 160, 237, 164, 255, 246, 255,},

+  {62, 104, 196, 131, 161, 238, 164, 255, 246, 255,},

+  {63, 105, 197, 132, 161, 238, 165, 255, 247, 255,},

+  {64, 106, 198, 132, 161, 239, 166, 255, 247, 255,},

+  {64, 107, 199, 133, 162, 239, 167, 255, 247, 255,},

+  {65, 108, 200, 133, 162, 240, 168, 255, 248, 255,},

+  {66, 109, 201, 134, 163, 241, 168, 255, 248, 255,},

+  {67, 110, 202, 134, 163, 241, 169, 255, 248, 255,},

+  {67, 111, 203, 135, 163, 242, 170, 255, 249, 255,},

+  {68, 112, 204, 135, 164, 242, 171, 255, 249, 255,},

+  {69, 113, 205, 136, 164, 242, 172, 255, 249, 255,},

+  {69, 114, 206, 137, 164, 243, 173, 255, 250, 255,},

+  {70, 115, 207, 137, 165, 243, 173, 255, 250, 255,},

+  {71, 116, 208, 138, 165, 244, 174, 255, 250, 255,},

+  {72, 117, 208, 138, 166, 244, 175, 255, 250, 255,},

+  {72, 118, 209, 139, 166, 245, 176, 255, 251, 255,},

+  {73, 119, 210, 139, 166, 245, 177, 255, 251, 255,},

+  {74, 120, 211, 140, 167, 245, 178, 255, 251, 255,},

+  {75, 121, 212, 141, 167, 246, 178, 255, 251, 255,},

+  {75, 122, 213, 141, 168, 246, 179, 255, 251, 255,},

+  {76, 123, 214, 142, 168, 246, 180, 255, 252, 255,},

+  {77, 124, 215, 142, 168, 247, 181, 255, 252, 255,},

+  {78, 125, 215, 143, 169, 247, 182, 255, 252, 255,},

+  {78, 126, 216, 144, 169, 247, 182, 255, 252, 255,},

+  {79, 127, 217, 144, 170, 248, 183, 255, 252, 255,},

+  {80, 128, 218, 145, 170, 248, 184, 255, 253, 255,},

+  {81, 129, 219, 145, 170, 248, 185, 255, 253, 255,},

+  {82, 130, 219, 146, 171, 249, 186, 255, 253, 255,},

+  {82, 131, 220, 147, 171, 249, 187, 255, 253, 255,},

+  {83, 132, 221, 147, 172, 249, 187, 255, 253, 255,},

+  {84, 133, 222, 148, 172, 249, 188, 255, 253, 255,},

+  {85, 134, 222, 148, 173, 250, 189, 255, 253, 255,},

+  {85, 135, 223, 149, 173, 250, 190, 255, 254, 255,},

+  {86, 136, 224, 150, 173, 250, 191, 255, 254, 255,},

+  {87, 137, 225, 150, 174, 250, 191, 255, 254, 255,},

+  {88, 138, 225, 151, 174, 251, 192, 255, 254, 255,},

+  {89, 139, 226, 152, 175, 251, 193, 255, 254, 255,},

+  {89, 140, 227, 152, 175, 251, 194, 255, 254, 255,},

+  {90, 141, 227, 153, 176, 251, 195, 255, 254, 255,},

+  {91, 142, 228, 153, 176, 251, 195, 255, 254, 255,},

+  {92, 143, 229, 154, 176, 252, 196, 255, 254, 255,},

+  {93, 144, 229, 155, 177, 252, 197, 255, 254, 255,},

+  {93, 145, 230, 155, 177, 252, 198, 255, 255, 255,},

+  {94, 146, 231, 156, 178, 252, 199, 255, 255, 255,},

+  {95, 147, 231, 157, 178, 252, 199, 255, 255, 255,},

+  {96, 148, 232, 157, 179, 252, 200, 255, 255, 255,},

+  {97, 149, 232, 158, 179, 253, 201, 255, 255, 255,},

+  {98, 150, 233, 159, 180, 253, 202, 255, 255, 255,},

+  {99, 151, 234, 159, 180, 253, 202, 255, 255, 255,},

+  {99, 152, 234, 160, 181, 253, 203, 255, 255, 255,},

+  {100, 153, 235, 161, 181, 253, 204, 255, 255, 255,},

+  {101, 154, 235, 162, 182, 253, 205, 255, 255, 255,},

+  {102, 155, 236, 162, 182, 253, 206, 255, 255, 255,},

+  {103, 156, 236, 163, 183, 254, 206, 255, 255, 255,},

+  {104, 157, 237, 164, 183, 254, 207, 255, 255, 255,},

+  {105, 158, 237, 164, 183, 254, 208, 255, 255, 255,},

+  {105, 159, 238, 165, 184, 254, 209, 255, 255, 255,},

+  {106, 160, 238, 166, 184, 254, 209, 255, 255, 255,},

+  {107, 161, 239, 166, 185, 254, 210, 255, 255, 255,},

+  {108, 162, 239, 167, 185, 254, 211, 255, 255, 255,},

+  {109, 163, 240, 168, 186, 254, 212, 255, 255, 255,},

+  {110, 164, 240, 169, 186, 254, 212, 255, 255, 255,},

+  {111, 165, 241, 169, 187, 254, 213, 255, 255, 255,},

+  {112, 166, 241, 170, 187, 255, 214, 255, 255, 255,},

+  {113, 167, 242, 171, 188, 255, 215, 255, 255, 255,},

+  {114, 168, 242, 172, 189, 255, 215, 255, 255, 255,},

+  {114, 169, 242, 172, 189, 255, 216, 255, 255, 255,},

+  {115, 170, 243, 173, 190, 255, 217, 255, 255, 255,},

+  {116, 171, 243, 174, 190, 255, 217, 255, 255, 255,},

+  {117, 172, 244, 175, 191, 255, 218, 255, 255, 255,},

+  {118, 173, 244, 175, 191, 255, 219, 255, 255, 255,},

+  {119, 174, 244, 176, 192, 255, 220, 255, 255, 255,},

+  {120, 175, 245, 177, 192, 255, 220, 255, 255, 255,},

+  {121, 176, 245, 178, 193, 255, 221, 255, 255, 255,},

+  {122, 177, 245, 178, 193, 255, 222, 255, 255, 255,},

+  {123, 178, 246, 179, 194, 255, 222, 255, 255, 255,},

+  {124, 179, 246, 180, 194, 255, 223, 255, 255, 255,},

+  {125, 180, 247, 181, 195, 255, 224, 255, 255, 255,},

+  {126, 181, 247, 182, 196, 255, 224, 255, 255, 255,},

+  {127, 182, 247, 182, 196, 255, 225, 255, 255, 255,},

+  {128, 183, 247, 183, 197, 255, 226, 255, 255, 255,},

+  {129, 184, 248, 184, 197, 255, 226, 255, 255, 255,},

+  {130, 185, 248, 185, 198, 255, 227, 255, 255, 255,},

+  {131, 186, 248, 186, 198, 255, 228, 255, 255, 255,},

+  {132, 187, 249, 186, 199, 255, 228, 255, 255, 255,},

+  {133, 188, 249, 187, 200, 255, 229, 255, 255, 255,},

+  {134, 189, 249, 188, 200, 255, 230, 255, 255, 255,},

+  {135, 190, 249, 189, 201, 255, 230, 255, 255, 255,},

+  {136, 191, 250, 190, 201, 255, 231, 255, 255, 255,},

+  {137, 192, 250, 191, 202, 255, 231, 255, 255, 255,},

+  {138, 193, 250, 191, 203, 255, 232, 255, 255, 255,},

+  {139, 194, 250, 192, 203, 255, 233, 255, 255, 255,},

+  {140, 195, 251, 193, 204, 255, 233, 255, 255, 255,},

+  {142, 196, 251, 194, 204, 255, 234, 255, 255, 255,},

+  {143, 197, 251, 195, 205, 255, 234, 255, 255, 255,},

+  {144, 198, 251, 196, 206, 255, 235, 255, 255, 255,},

+  {145, 199, 252, 197, 206, 255, 236, 255, 255, 255,},

+  {146, 200, 252, 197, 207, 255, 236, 255, 255, 255,},

+  {147, 201, 252, 198, 208, 255, 237, 255, 255, 255,},

+  {148, 202, 252, 199, 208, 255, 237, 255, 255, 255,},

+  {149, 203, 252, 200, 209, 255, 238, 255, 255, 255,},

+  {151, 204, 253, 201, 210, 255, 238, 255, 255, 255,},

+  {152, 205, 253, 202, 210, 255, 239, 255, 255, 255,},

+  {153, 206, 253, 203, 211, 255, 239, 255, 255, 255,},

+  {154, 207, 253, 204, 212, 255, 240, 255, 255, 255,},

+  {155, 208, 253, 205, 212, 255, 241, 255, 255, 255,},

+  {157, 209, 253, 206, 213, 255, 241, 255, 255, 255,},

+  {158, 210, 253, 206, 214, 255, 242, 255, 255, 255,},

+  {159, 211, 254, 207, 214, 255, 242, 255, 255, 255,},

+  {160, 212, 254, 208, 215, 255, 243, 255, 255, 255,},

+  {162, 213, 254, 209, 216, 255, 243, 255, 255, 255,},

+  {163, 214, 254, 210, 217, 255, 244, 255, 255, 255,},

+  {164, 215, 254, 211, 217, 255, 244, 255, 255, 255,},

+  {165, 216, 254, 212, 218, 255, 244, 255, 255, 255,},

+  {167, 217, 254, 213, 219, 255, 245, 255, 255, 255,},

+  {168, 218, 254, 214, 219, 255, 245, 255, 255, 255,},

+  {169, 219, 255, 215, 220, 255, 246, 255, 255, 255,},

+  {171, 220, 255, 216, 221, 255, 246, 255, 255, 255,},

+  {172, 221, 255, 217, 222, 255, 247, 255, 255, 255,},

+  {174, 222, 255, 218, 223, 255, 247, 255, 255, 255,},

+  {175, 223, 255, 219, 223, 255, 248, 255, 255, 255,},

+  {177, 224, 255, 220, 224, 255, 248, 255, 255, 255,},

+  {178, 225, 255, 221, 225, 255, 248, 255, 255, 255,},

+  {179, 226, 255, 222, 226, 255, 249, 255, 255, 255,},

+  {181, 227, 255, 223, 227, 255, 249, 255, 255, 255,},

+  {182, 228, 255, 224, 227, 255, 250, 255, 255, 255,},

+  {184, 229, 255, 225, 228, 255, 250, 255, 255, 255,},

+  {186, 230, 255, 226, 229, 255, 250, 255, 255, 255,},

+  {187, 231, 255, 227, 230, 255, 251, 255, 255, 255,},

+  {189, 232, 255, 228, 231, 255, 251, 255, 255, 255,},

+  {190, 233, 255, 229, 232, 255, 251, 255, 255, 255,},

+  {192, 234, 255, 230, 232, 255, 252, 255, 255, 255,},

+  {194, 235, 255, 231, 233, 255, 252, 255, 255, 255,},

+  {196, 236, 255, 232, 234, 255, 252, 255, 255, 255,},

+  {197, 237, 255, 233, 235, 255, 253, 255, 255, 255,},

+  {199, 238, 255, 234, 236, 255, 253, 255, 255, 255,},

+  {201, 239, 255, 235, 237, 255, 253, 255, 255, 255,},

+  {203, 240, 255, 237, 238, 255, 253, 255, 255, 255,},

+  {205, 241, 255, 238, 239, 255, 254, 255, 255, 255,},

+  {207, 242, 255, 239, 240, 255, 254, 255, 255, 255,},

+  {209, 243, 255, 240, 241, 255, 254, 255, 255, 255,},

+  {211, 244, 255, 241, 242, 255, 254, 255, 255, 255,},

+  {214, 245, 255, 242, 243, 255, 255, 255, 255, 255,},

+  {216, 246, 255, 243, 244, 255, 255, 255, 255, 255,},

+  {218, 247, 255, 244, 245, 255, 255, 255, 255, 255,},

+  {221, 248, 255, 246, 246, 255, 255, 255, 255, 255,},

+  {224, 249, 255, 247, 247, 255, 255, 255, 255, 255,},

+  {226, 250, 255, 248, 248, 255, 255, 255, 255, 255,},

+  {229, 251, 255, 249, 249, 255, 255, 255, 255, 255,},

+  {233, 252, 255, 251, 251, 255, 255, 255, 255, 255,},

+  {236, 253, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {241, 254, 255, 253, 253, 255, 255, 255, 255, 255,},

+  {246, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+};

+const vp9_prob vp9_modelcoefprobs_gg75p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {

+  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)

+  // source model with varying quantizer step size for a uniform quantizer

+  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use

+  {1,   1,   3,  86, 129,   6,  87,  21,  90,  26,},

+  {1,   2,   6,  87, 129,  11,  88,  39,  93,  47,},

+  {2,   3,   9,  87, 130,  16,  89,  55,  96,  65,},

+  {2,   4,  11,  88, 130,  21,  89,  69,  98,  81,},

+  {3,   5,  14,  88, 130,  26,  90,  82, 101,  95,},

+  {3,   6,  17,  89, 131,  31,  91,  94, 103, 107,},

+  {4,   7,  20,  89, 131,  35,  92, 105, 105, 119,},

+  {4,   8,  22,  90, 131,  40,  92, 115, 108, 129,},

+  {5,   9,  25,  90, 132,  44,  93, 124, 110, 138,},

+  {5,  10,  27,  91, 132,  48,  94, 133, 112, 147,},

+  {6,  11,  30,  91, 132,  52,  95, 141, 114, 155,},

+  {6,  12,  32,  92, 133,  56,  95, 148, 116, 162,},

+  {7,  13,  35,  92, 133,  60,  96, 155, 118, 168,},

+  {7,  14,  37,  92, 133,  64,  97, 161, 121, 174,},

+  {8,  15,  40,  93, 134,  68,  97, 167, 123, 180,},

+  {9,  16,  42,  93, 134,  71,  98, 173, 125, 185,},

+  {9,  17,  44,  94, 134,  75,  99, 178, 127, 190,},

+  {10,  18,  47,  94, 135,  78,  99, 182, 129, 195,},

+  {10,  19,  49,  94, 135,  82, 100, 187, 131, 199,},

+  {11,  20,  51,  95, 135,  85, 100, 191, 133, 202,},

+  {11,  21,  54,  95, 135,  88, 101, 195, 135, 206,},

+  {12,  22,  56,  96, 136,  92, 102, 199, 137, 209,},

+  {13,  23,  58,  96, 136,  95, 102, 202, 138, 213,},

+  {13,  24,  61,  96, 136,  98, 103, 206, 140, 215,},

+  {14,  25,  63,  97, 137, 101, 104, 209, 142, 218,},

+  {14,  26,  65,  97, 137, 104, 104, 211, 144, 221,},

+  {15,  27,  67,  98, 137, 107, 105, 214, 146, 223,},

+  {15,  28,  69,  98, 138, 110, 106, 217, 148, 225,},

+  {16,  29,  71,  98, 138, 113, 106, 219, 150, 227,},

+  {17,  30,  73,  99, 138, 115, 107, 221, 151, 229,},

+  {17,  31,  76,  99, 138, 118, 107, 223, 153, 231,},

+  {18,  32,  78, 100, 139, 121, 108, 225, 155, 232,},

+  {18,  33,  80, 100, 139, 123, 109, 227, 157, 234,},

+  {19,  34,  82, 100, 139, 126, 109, 229, 158, 235,},

+  {20,  35,  84, 101, 140, 128, 110, 231, 160, 237,},

+  {20,  36,  86, 101, 140, 131, 111, 232, 162, 238,},

+  {21,  37,  88, 102, 140, 133, 111, 234, 164, 239,},

+  {21,  38,  90, 102, 140, 136, 112, 235, 165, 240,},

+  {22,  39,  92, 102, 141, 138, 112, 236, 167, 241,},

+  {23,  40,  94, 103, 141, 140, 113, 237, 169, 242,},

+  {23,  41,  95, 103, 141, 143, 114, 238, 170, 243,},

+  {24,  42,  97, 103, 142, 145, 114, 240, 172, 244,},

+  {25,  43,  99, 104, 142, 147, 115, 241, 173, 245,},

+  {25,  44, 101, 104, 142, 149, 116, 242, 175, 246,},

+  {26,  45, 103, 105, 142, 151, 116, 242, 176, 246,},

+  {26,  46, 105, 105, 143, 153, 117, 243, 178, 247,},

+  {27,  47, 107, 105, 143, 156, 117, 244, 180, 248,},

+  {28,  48, 108, 106, 143, 158, 118, 245, 181, 248,},

+  {28,  49, 110, 106, 144, 159, 119, 245, 182, 249,},

+  {29,  50, 112, 107, 144, 161, 119, 246, 184, 249,},

+  {30,  51, 114, 107, 144, 163, 120, 247, 185, 250,},

+  {30,  52, 115, 108, 144, 165, 121, 247, 187, 250,},

+  {31,  53, 117, 108, 145, 167, 121, 248, 188, 250,},

+  {32,  54, 119, 108, 145, 169, 122, 248, 190, 251,},

+  {32,  55, 121, 109, 145, 171, 123, 249, 191, 251,},

+  {33,  56, 122, 109, 146, 172, 123, 249, 192, 251,},

+  {34,  57, 124, 110, 146, 174, 124, 250, 194, 252,},

+  {34,  58, 126, 110, 146, 176, 125, 250, 195, 252,},

+  {35,  59, 127, 110, 147, 177, 125, 250, 196, 252,},

+  {36,  60, 129, 111, 147, 179, 126, 251, 197, 253,},

+  {36,  61, 130, 111, 147, 181, 127, 251, 199, 253,},

+  {37,  62, 132, 112, 147, 182, 127, 251, 200, 253,},

+  {38,  63, 134, 112, 148, 184, 128, 252, 201, 253,},

+  {38,  64, 135, 112, 148, 185, 128, 252, 202, 253,},

+  {39,  65, 137, 113, 148, 187, 129, 252, 204, 254,},

+  {40,  66, 138, 113, 149, 188, 130, 253, 205, 254,},

+  {40,  67, 140, 114, 149, 190, 130, 253, 206, 254,},

+  {41,  68, 141, 114, 149, 191, 131, 253, 207, 254,},

+  {42,  69, 143, 115, 150, 192, 132, 253, 208, 254,},

+  {42,  70, 144, 115, 150, 194, 132, 253, 209, 254,},

+  {43,  71, 146, 115, 150, 195, 133, 254, 210, 254,},

+  {44,  72, 147, 116, 150, 197, 134, 254, 211, 255,},

+  {44,  73, 149, 116, 151, 198, 134, 254, 212, 255,},

+  {45,  74, 150, 117, 151, 199, 135, 254, 213, 255,},

+  {46,  75, 152, 117, 151, 200, 136, 254, 214, 255,},

+  {46,  76, 153, 118, 152, 202, 136, 254, 215, 255,},

+  {47,  77, 154, 118, 152, 203, 137, 254, 216, 255,},

+  {48,  78, 156, 119, 152, 204, 138, 254, 217, 255,},

+  {49,  79, 157, 119, 153, 205, 139, 255, 218, 255,},

+  {49,  80, 159, 119, 153, 206, 139, 255, 219, 255,},

+  {50,  81, 160, 120, 153, 207, 140, 255, 220, 255,},

+  {51,  82, 161, 120, 154, 208, 141, 255, 221, 255,},

+  {51,  83, 163, 121, 154, 210, 141, 255, 222, 255,},

+  {52,  84, 164, 121, 154, 211, 142, 255, 223, 255,},

+  {53,  85, 165, 122, 154, 212, 143, 255, 223, 255,},

+  {54,  86, 166, 122, 155, 213, 143, 255, 224, 255,},

+  {54,  87, 168, 123, 155, 214, 144, 255, 225, 255,},

+  {55,  88, 169, 123, 155, 215, 145, 255, 226, 255,},

+  {56,  89, 170, 123, 156, 216, 145, 255, 227, 255,},

+  {57,  90, 172, 124, 156, 217, 146, 255, 227, 255,},

+  {57,  91, 173, 124, 156, 218, 147, 255, 228, 255,},

+  {58,  92, 174, 125, 157, 218, 147, 255, 229, 255,},

+  {59,  93, 175, 125, 157, 219, 148, 255, 230, 255,},

+  {60,  94, 176, 126, 157, 220, 149, 255, 230, 255,},

+  {60,  95, 178, 126, 158, 221, 150, 255, 231, 255,},

+  {61,  96, 179, 127, 158, 222, 150, 255, 232, 255,},

+  {62,  97, 180, 127, 158, 223, 151, 255, 232, 255,},

+  {63,  98, 181, 128, 159, 224, 152, 255, 233, 255,},

+  {63,  99, 182, 128, 159, 224, 152, 255, 234, 255,},

+  {64, 100, 183, 129, 159, 225, 153, 255, 234, 255,},

+  {65, 101, 184, 129, 160, 226, 154, 255, 235, 255,},

+  {66, 102, 186, 130, 160, 227, 154, 255, 235, 255,},

+  {66, 103, 187, 130, 160, 227, 155, 255, 236, 255,},

+  {67, 104, 188, 131, 161, 228, 156, 255, 236, 255,},

+  {68, 105, 189, 131, 161, 229, 157, 255, 237, 255,},

+  {69, 106, 190, 132, 161, 230, 157, 255, 238, 255,},

+  {69, 107, 191, 132, 162, 230, 158, 255, 238, 255,},

+  {70, 108, 192, 133, 162, 231, 159, 255, 239, 255,},

+  {71, 109, 193, 133, 163, 232, 159, 255, 239, 255,},

+  {72, 110, 194, 134, 163, 232, 160, 255, 240, 255,},

+  {73, 111, 195, 134, 163, 233, 161, 255, 240, 255,},

+  {73, 112, 196, 135, 164, 233, 162, 255, 241, 255,},

+  {74, 113, 197, 135, 164, 234, 162, 255, 241, 255,},

+  {75, 114, 198, 136, 164, 235, 163, 255, 241, 255,},

+  {76, 115, 199, 136, 165, 235, 164, 255, 242, 255,},

+  {77, 116, 200, 137, 165, 236, 165, 255, 242, 255,},

+  {77, 117, 201, 137, 165, 236, 165, 255, 243, 255,},

+  {78, 118, 202, 138, 166, 237, 166, 255, 243, 255,},

+  {79, 119, 203, 138, 166, 237, 167, 255, 244, 255,},

+  {80, 120, 204, 139, 166, 238, 167, 255, 244, 255,},

+  {81, 121, 205, 139, 167, 238, 168, 255, 244, 255,},

+  {82, 122, 206, 140, 167, 239, 169, 255, 245, 255,},

+  {82, 123, 206, 141, 168, 239, 170, 255, 245, 255,},

+  {83, 124, 207, 141, 168, 240, 170, 255, 245, 255,},

+  {84, 125, 208, 142, 168, 240, 171, 255, 246, 255,},

+  {85, 126, 209, 142, 169, 241, 172, 255, 246, 255,},

+  {86, 127, 210, 143, 169, 241, 173, 255, 246, 255,},

+  {87, 128, 211, 143, 169, 242, 173, 255, 247, 255,},

+  {87, 129, 212, 144, 170, 242, 174, 255, 247, 255,},

+  {88, 130, 212, 144, 170, 242, 175, 255, 247, 255,},

+  {89, 131, 213, 145, 171, 243, 176, 255, 248, 255,},

+  {90, 132, 214, 146, 171, 243, 176, 255, 248, 255,},

+  {91, 133, 215, 146, 171, 244, 177, 255, 248, 255,},

+  {92, 134, 216, 147, 172, 244, 178, 255, 248, 255,},

+  {93, 135, 216, 147, 172, 244, 179, 255, 249, 255,},

+  {93, 136, 217, 148, 173, 245, 179, 255, 249, 255,},

+  {94, 137, 218, 148, 173, 245, 180, 255, 249, 255,},

+  {95, 138, 219, 149, 173, 245, 181, 255, 249, 255,},

+  {96, 139, 220, 150, 174, 246, 181, 255, 250, 255,},

+  {97, 140, 220, 150, 174, 246, 182, 255, 250, 255,},

+  {98, 141, 221, 151, 175, 246, 183, 255, 250, 255,},

+  {99, 142, 222, 151, 175, 247, 184, 255, 250, 255,},

+  {100, 143, 222, 152, 175, 247, 184, 255, 251, 255,},

+  {100, 144, 223, 153, 176, 247, 185, 255, 251, 255,},

+  {101, 145, 224, 153, 176, 248, 186, 255, 251, 255,},

+  {102, 146, 224, 154, 177, 248, 187, 255, 251, 255,},

+  {103, 147, 225, 154, 177, 248, 187, 255, 251, 255,},

+  {104, 148, 226, 155, 178, 248, 188, 255, 252, 255,},

+  {105, 149, 226, 156, 178, 249, 189, 255, 252, 255,},

+  {106, 150, 227, 156, 178, 249, 190, 255, 252, 255,},

+  {107, 151, 228, 157, 179, 249, 190, 255, 252, 255,},

+  {108, 152, 228, 158, 179, 249, 191, 255, 252, 255,},

+  {109, 153, 229, 158, 180, 250, 192, 255, 252, 255,},

+  {110, 154, 230, 159, 180, 250, 193, 255, 253, 255,},

+  {111, 155, 230, 159, 181, 250, 193, 255, 253, 255,},

+  {111, 156, 231, 160, 181, 250, 194, 255, 253, 255,},

+  {112, 157, 231, 161, 181, 251, 195, 255, 253, 255,},

+  {113, 158, 232, 161, 182, 251, 196, 255, 253, 255,},

+  {114, 159, 233, 162, 182, 251, 196, 255, 253, 255,},

+  {115, 160, 233, 163, 183, 251, 197, 255, 253, 255,},

+  {116, 161, 234, 163, 183, 251, 198, 255, 253, 255,},

+  {117, 162, 234, 164, 184, 252, 199, 255, 254, 255,},

+  {118, 163, 235, 165, 184, 252, 199, 255, 254, 255,},

+  {119, 164, 235, 165, 185, 252, 200, 255, 254, 255,},

+  {120, 165, 236, 166, 185, 252, 201, 255, 254, 255,},

+  {121, 166, 236, 167, 186, 252, 202, 255, 254, 255,},

+  {122, 167, 237, 167, 186, 252, 202, 255, 254, 255,},

+  {123, 168, 237, 168, 187, 253, 203, 255, 254, 255,},

+  {124, 169, 238, 169, 187, 253, 204, 255, 254, 255,},

+  {125, 170, 238, 169, 188, 253, 205, 255, 254, 255,},

+  {126, 171, 239, 170, 188, 253, 205, 255, 254, 255,},

+  {127, 172, 239, 171, 189, 253, 206, 255, 254, 255,},

+  {128, 173, 240, 172, 189, 253, 207, 255, 255, 255,},

+  {129, 174, 240, 172, 190, 253, 208, 255, 255, 255,},

+  {130, 175, 241, 173, 190, 253, 208, 255, 255, 255,},

+  {131, 176, 241, 174, 191, 254, 209, 255, 255, 255,},

+  {132, 177, 242, 175, 191, 254, 210, 255, 255, 255,},

+  {133, 178, 242, 175, 192, 254, 210, 255, 255, 255,},

+  {134, 179, 242, 176, 192, 254, 211, 255, 255, 255,},

+  {135, 180, 243, 177, 193, 254, 212, 255, 255, 255,},

+  {137, 181, 243, 177, 193, 254, 213, 255, 255, 255,},

+  {138, 182, 244, 178, 194, 254, 213, 255, 255, 255,},

+  {139, 183, 244, 179, 194, 254, 214, 255, 255, 255,},

+  {140, 184, 244, 180, 195, 254, 215, 255, 255, 255,},

+  {141, 185, 245, 181, 195, 254, 216, 255, 255, 255,},

+  {142, 186, 245, 181, 196, 255, 216, 255, 255, 255,},

+  {143, 187, 245, 182, 196, 255, 217, 255, 255, 255,},

+  {144, 188, 246, 183, 197, 255, 218, 255, 255, 255,},

+  {145, 189, 246, 184, 197, 255, 218, 255, 255, 255,},

+  {146, 190, 247, 184, 198, 255, 219, 255, 255, 255,},

+  {147, 191, 247, 185, 199, 255, 220, 255, 255, 255,},

+  {149, 192, 247, 186, 199, 255, 221, 255, 255, 255,},

+  {150, 193, 247, 187, 200, 255, 221, 255, 255, 255,},

+  {151, 194, 248, 188, 200, 255, 222, 255, 255, 255,},

+  {152, 195, 248, 188, 201, 255, 223, 255, 255, 255,},

+  {153, 196, 248, 189, 201, 255, 223, 255, 255, 255,},

+  {154, 197, 249, 190, 202, 255, 224, 255, 255, 255,},

+  {156, 198, 249, 191, 203, 255, 225, 255, 255, 255,},

+  {157, 199, 249, 192, 203, 255, 225, 255, 255, 255,},

+  {158, 200, 250, 193, 204, 255, 226, 255, 255, 255,},

+  {159, 201, 250, 193, 205, 255, 227, 255, 255, 255,},

+  {160, 202, 250, 194, 205, 255, 227, 255, 255, 255,},

+  {162, 203, 250, 195, 206, 255, 228, 255, 255, 255,},

+  {163, 204, 251, 196, 206, 255, 229, 255, 255, 255,},

+  {164, 205, 251, 197, 207, 255, 229, 255, 255, 255,},

+  {165, 206, 251, 198, 208, 255, 230, 255, 255, 255,},

+  {166, 207, 251, 199, 208, 255, 231, 255, 255, 255,},

+  {168, 208, 251, 200, 209, 255, 231, 255, 255, 255,},

+  {169, 209, 252, 201, 210, 255, 232, 255, 255, 255,},

+  {170, 210, 252, 201, 210, 255, 233, 255, 255, 255,},

+  {172, 211, 252, 202, 211, 255, 233, 255, 255, 255,},

+  {173, 212, 252, 203, 212, 255, 234, 255, 255, 255,},

+  {174, 213, 252, 204, 212, 255, 235, 255, 255, 255,},

+  {175, 214, 253, 205, 213, 255, 235, 255, 255, 255,},

+  {177, 215, 253, 206, 214, 255, 236, 255, 255, 255,},

+  {178, 216, 253, 207, 215, 255, 237, 255, 255, 255,},

+  {179, 217, 253, 208, 215, 255, 237, 255, 255, 255,},

+  {181, 218, 253, 209, 216, 255, 238, 255, 255, 255,},

+  {182, 219, 254, 210, 217, 255, 238, 255, 255, 255,},

+  {184, 220, 254, 211, 217, 255, 239, 255, 255, 255,},

+  {185, 221, 254, 212, 218, 255, 240, 255, 255, 255,},

+  {186, 222, 254, 213, 219, 255, 240, 255, 255, 255,},

+  {188, 223, 254, 214, 220, 255, 241, 255, 255, 255,},

+  {189, 224, 254, 215, 221, 255, 241, 255, 255, 255,},

+  {191, 225, 254, 216, 221, 255, 242, 255, 255, 255,},

+  {192, 226, 254, 217, 222, 255, 243, 255, 255, 255,},

+  {194, 227, 255, 218, 223, 255, 243, 255, 255, 255,},

+  {195, 228, 255, 219, 224, 255, 244, 255, 255, 255,},

+  {197, 229, 255, 220, 225, 255, 244, 255, 255, 255,},

+  {198, 230, 255, 221, 225, 255, 245, 255, 255, 255,},

+  {200, 231, 255, 222, 226, 255, 245, 255, 255, 255,},

+  {201, 232, 255, 223, 227, 255, 246, 255, 255, 255,},

+  {203, 233, 255, 224, 228, 255, 247, 255, 255, 255,},

+  {205, 234, 255, 226, 229, 255, 247, 255, 255, 255,},

+  {206, 235, 255, 227, 230, 255, 248, 255, 255, 255,},

+  {208, 236, 255, 228, 231, 255, 248, 255, 255, 255,},

+  {210, 237, 255, 229, 232, 255, 249, 255, 255, 255,},

+  {211, 238, 255, 230, 233, 255, 249, 255, 255, 255,},

+  {213, 239, 255, 231, 234, 255, 250, 255, 255, 255,},

+  {215, 240, 255, 233, 235, 255, 250, 255, 255, 255,},

+  {217, 241, 255, 234, 236, 255, 251, 255, 255, 255,},

+  {219, 242, 255, 235, 237, 255, 251, 255, 255, 255,},

+  {221, 243, 255, 236, 238, 255, 252, 255, 255, 255,},

+  {223, 244, 255, 237, 239, 255, 252, 255, 255, 255,},

+  {225, 245, 255, 239, 240, 255, 252, 255, 255, 255,},

+  {227, 246, 255, 240, 241, 255, 253, 255, 255, 255,},

+  {229, 247, 255, 241, 242, 255, 253, 255, 255, 255,},

+  {231, 248, 255, 243, 244, 255, 254, 255, 255, 255,},

+  {233, 249, 255, 244, 245, 255, 254, 255, 255, 255,},

+  {236, 250, 255, 246, 246, 255, 254, 255, 255, 255,},

+  {238, 251, 255, 247, 247, 255, 255, 255, 255, 255,},

+  {241, 252, 255, 249, 249, 255, 255, 255, 255, 255,},

+  {244, 253, 255, 250, 250, 255, 255, 255, 255, 255,},

+  {247, 254, 255, 252, 252, 255, 255, 255, 255, 255,},

+  {251, 255, 255, 254, 254, 255, 255, 255, 255, 255,},

+};

+const vp9_prob vp9_modelcoefprobs_gg625p1[COEFPROB_MODELS][ENTROPY_NODES - 1] = {

+  // Probs generated with a Generalized Gaussian (with shape parameter 0.625)

+  // source model with varying quantizer step size for a uniform quantizer

+  {0,   0,   0,   0,   0,   0,   0,   0,   0,   0,},  // do not use

+  {1,   1,   3,  87, 129,   6,  87,  20,  91,  24,},

+  {1,   2,   6,  88, 130,  11,  89,  36,  94,  41,},

+  {2,   3,   8,  88, 130,  15,  90,  50,  97,  56,},

+  {2,   4,  11,  89, 131,  20,  90,  62,  99,  70,},

+  {3,   5,  14,  90, 131,  24,  91,  74, 102,  81,},

+  {3,   6,  16,  90, 132,  29,  92,  84, 104,  92,},

+  {4,   7,  19,  91, 132,  33,  93,  93, 106, 101,},

+  {4,   8,  21,  91, 132,  37,  93, 102, 108, 110,},

+  {5,   9,  24,  92, 133,  40,  94, 110, 110, 118,},

+  {5,  10,  26,  92, 133,  44,  95, 118, 111, 125,},

+  {6,  11,  29,  93, 134,  48,  96, 125, 113, 132,},

+  {7,  12,  31,  93, 134,  51,  96, 132, 115, 139,},

+  {7,  13,  33,  93, 134,  55,  97, 138, 117, 145,},

+  {8,  14,  36,  94, 135,  58,  97, 144, 119, 150,},

+  {8,  15,  38,  94, 135,  62,  98, 149, 120, 155,},

+  {9,  16,  40,  95, 135,  65,  99, 154, 122, 160,},

+  {10,  17,  42,  95, 136,  68,  99, 159, 124, 165,},

+  {10,  18,  45,  96, 136,  71, 100, 164, 125, 169,},

+  {11,  19,  47,  96, 136,  74, 100, 168, 127, 174,},

+  {11,  20,  49,  96, 136,  77, 101, 173, 128, 177,},

+  {12,  21,  51,  97, 137,  80, 102, 176, 130, 181,},

+  {13,  22,  53,  97, 137,  83, 102, 180, 131, 185,},

+  {13,  23,  55,  98, 137,  86, 103, 184, 133, 188,},

+  {14,  24,  57,  98, 138,  89, 103, 187, 135, 191,},

+  {14,  25,  59,  98, 138,  91, 104, 190, 136, 194,},

+  {15,  26,  61,  99, 138,  94, 104, 193, 138, 197,},

+  {16,  27,  64,  99, 139,  97, 105, 196, 139, 200,},

+  {16,  28,  66, 100, 139,  99, 106, 199, 141, 202,},

+  {17,  29,  68, 100, 139, 102, 106, 201, 142, 205,},

+  {18,  30,  69, 100, 139, 104, 107, 204, 143, 207,},

+  {18,  31,  71, 101, 140, 107, 107, 206, 145, 209,},

+  {19,  32,  73, 101, 140, 109, 108, 209, 146, 211,},

+  {20,  33,  75, 102, 140, 112, 108, 211, 148, 213,},

+  {20,  34,  77, 102, 141, 114, 109, 213, 149, 215,},

+  {21,  35,  79, 102, 141, 116, 109, 215, 150, 217,},

+  {22,  36,  81, 103, 141, 119, 110, 217, 152, 219,},

+  {22,  37,  83, 103, 141, 121, 110, 218, 153, 220,},

+  {23,  38,  85, 103, 142, 123, 111, 220, 155, 222,},

+  {24,  39,  87, 104, 142, 125, 112, 222, 156, 224,},

+  {24,  40,  88, 104, 142, 127, 112, 223, 157, 225,},

+  {25,  41,  90, 105, 143, 129, 113, 225, 159, 226,},

+  {26,  42,  92, 105, 143, 131, 113, 226, 160, 228,},

+  {26,  43,  94, 105, 143, 133, 114, 227, 161, 229,},

+  {27,  44,  95, 106, 143, 135, 114, 229, 162, 230,},

+  {28,  45,  97, 106, 144, 137, 115, 230, 164, 231,},

+  {28,  46,  99, 107, 144, 139, 115, 231, 165, 232,},

+  {29,  47, 101, 107, 144, 141, 116, 232, 166, 233,},

+  {30,  48, 102, 107, 145, 143, 116, 233, 168, 234,},

+  {31,  49, 104, 108, 145, 145, 117, 234, 169, 235,},

+  {31,  50, 106, 108, 145, 147, 118, 235, 170, 236,},

+  {32,  51, 107, 108, 145, 149, 118, 236, 171, 237,},

+  {33,  52, 109, 109, 146, 150, 119, 237, 172, 238,},

+  {33,  53, 111, 109, 146, 152, 119, 238, 174, 239,},

+  {34,  54, 112, 110, 146, 154, 120, 239, 175, 240,},

+  {35,  55, 114, 110, 146, 156, 120, 240, 176, 240,},

+  {36,  56, 115, 110, 147, 157, 121, 240, 177, 241,},

+  {36,  57, 117, 111, 147, 159, 121, 241, 178, 242,},

+  {37,  58, 119, 111, 147, 161, 122, 242, 180, 242,},

+  {38,  59, 120, 112, 148, 162, 122, 242, 181, 243,},

+  {38,  60, 122, 112, 148, 164, 123, 243, 182, 244,},

+  {39,  61, 123, 112, 148, 165, 124, 244, 183, 244,},

+  {40,  62, 125, 113, 148, 167, 124, 244, 184, 245,},

+  {41,  63, 126, 113, 149, 168, 125, 245, 185, 245,},

+  {41,  64, 128, 114, 149, 170, 125, 245, 186, 246,},

+  {42,  65, 129, 114, 149, 171, 126, 246, 187, 246,},

+  {43,  66, 131, 114, 150, 173, 126, 246, 188, 247,},

+  {44,  67, 132, 115, 150, 174, 127, 247, 189, 247,},

+  {44,  68, 134, 115, 150, 176, 127, 247, 191, 247,},

+  {45,  69, 135, 116, 150, 177, 128, 248, 192, 248,},

+  {46,  70, 136, 116, 151, 178, 129, 248, 193, 248,},

+  {47,  71, 138, 116, 151, 180, 129, 248, 194, 249,},

+  {48,  72, 139, 117, 151, 181, 130, 249, 195, 249,},

+  {48,  73, 141, 117, 152, 183, 130, 249, 196, 249,},

+  {49,  74, 142, 118, 152, 184, 131, 249, 197, 250,},

+  {50,  75, 143, 118, 152, 185, 131, 250, 198, 250,},

+  {51,  76, 145, 118, 152, 186, 132, 250, 199, 250,},

+  {51,  77, 146, 119, 153, 188, 132, 250, 200, 250,},

+  {52,  78, 148, 119, 153, 189, 133, 251, 201, 251,},

+  {53,  79, 149, 120, 153, 190, 134, 251, 201, 251,},

+  {54,  80, 150, 120, 154, 191, 134, 251, 202, 251,},

+  {55,  81, 151, 120, 154, 192, 135, 251, 203, 251,},

+  {55,  82, 153, 121, 154, 194, 135, 252, 204, 252,},

+  {56,  83, 154, 121, 155, 195, 136, 252, 205, 252,},

+  {57,  84, 155, 122, 155, 196, 136, 252, 206, 252,},

+  {58,  85, 157, 122, 155, 197, 137, 252, 207, 252,},

+  {59,  86, 158, 123, 155, 198, 138, 252, 208, 252,},

+  {59,  87, 159, 123, 156, 199, 138, 253, 209, 253,},

+  {60,  88, 160, 123, 156, 200, 139, 253, 210, 253,},

+  {61,  89, 162, 124, 156, 201, 139, 253, 210, 253,},

+  {62,  90, 163, 124, 157, 202, 140, 253, 211, 253,},

+  {63,  91, 164, 125, 157, 203, 140, 253, 212, 253,},

+  {64,  92, 165, 125, 157, 204, 141, 253, 213, 253,},

+  {64,  93, 166, 126, 158, 205, 142, 254, 214, 253,},

+  {65,  94, 168, 126, 158, 206, 142, 254, 214, 254,},

+  {66,  95, 169, 126, 158, 207, 143, 254, 215, 254,},

+  {67,  96, 170, 127, 158, 208, 143, 254, 216, 254,},

+  {68,  97, 171, 127, 159, 209, 144, 254, 217, 254,},

+  {69,  98, 172, 128, 159, 210, 145, 254, 218, 254,},

+  {69,  99, 173, 128, 159, 211, 145, 254, 218, 254,},

+  {70, 100, 175, 129, 160, 212, 146, 254, 219, 254,},

+  {71, 101, 176, 129, 160, 213, 146, 254, 220, 254,},

+  {72, 102, 177, 130, 160, 214, 147, 254, 220, 254,},

+  {73, 103, 178, 130, 161, 214, 148, 255, 221, 255,},

+  {74, 104, 179, 130, 161, 215, 148, 255, 222, 255,},

+  {75, 105, 180, 131, 161, 216, 149, 255, 223, 255,},

+  {75, 106, 181, 131, 162, 217, 149, 255, 223, 255,},

+  {76, 107, 182, 132, 162, 218, 150, 255, 224, 255,},

+  {77, 108, 183, 132, 162, 219, 151, 255, 225, 255,},

+  {78, 109, 184, 133, 163, 219, 151, 255, 225, 255,},

+  {79, 110, 185, 133, 163, 220, 152, 255, 226, 255,},

+  {80, 111, 186, 134, 163, 221, 152, 255, 226, 255,},

+  {81, 112, 187, 134, 164, 222, 153, 255, 227, 255,},

+  {82, 113, 188, 135, 164, 222, 154, 255, 228, 255,},

+  {83, 114, 189, 135, 164, 223, 154, 255, 228, 255,},

+  {83, 115, 190, 136, 165, 224, 155, 255, 229, 255,},

+  {84, 116, 191, 136, 165, 224, 156, 255, 230, 255,},

+  {85, 117, 192, 137, 165, 225, 156, 255, 230, 255,},

+  {86, 118, 193, 137, 166, 226, 157, 255, 231, 255,},

+  {87, 119, 194, 137, 166, 226, 157, 255, 231, 255,},

+  {88, 120, 195, 138, 166, 227, 158, 255, 232, 255,},

+  {89, 121, 196, 138, 167, 228, 159, 255, 232, 255,},

+  {90, 122, 197, 139, 167, 228, 159, 255, 233, 255,},

+  {91, 123, 198, 139, 167, 229, 160, 255, 233, 255,},

+  {92, 124, 199, 140, 168, 230, 161, 255, 234, 255,},

+  {93, 125, 200, 140, 168, 230, 161, 255, 234, 255,},

+  {93, 126, 201, 141, 168, 231, 162, 255, 235, 255,},

+  {94, 127, 202, 141, 169, 231, 163, 255, 235, 255,},

+  {95, 128, 203, 142, 169, 232, 163, 255, 236, 255,},

+  {96, 129, 203, 142, 169, 233, 164, 255, 236, 255,},

+  {97, 130, 204, 143, 170, 233, 164, 255, 237, 255,},

+  {98, 131, 205, 143, 170, 234, 165, 255, 237, 255,},

+  {99, 132, 206, 144, 170, 234, 166, 255, 238, 255,},

+  {100, 133, 207, 145, 171, 235, 166, 255, 238, 255,},

+  {101, 134, 208, 145, 171, 235, 167, 255, 239, 255,},

+  {102, 135, 209, 146, 171, 236, 168, 255, 239, 255,},

+  {103, 136, 209, 146, 172, 236, 168, 255, 240, 255,},

+  {104, 137, 210, 147, 172, 237, 169, 255, 240, 255,},

+  {105, 138, 211, 147, 173, 237, 170, 255, 240, 255,},

+  {106, 139, 212, 148, 173, 238, 170, 255, 241, 255,},

+  {107, 140, 213, 148, 173, 238, 171, 255, 241, 255,},

+  {108, 141, 213, 149, 174, 239, 172, 255, 242, 255,},

+  {109, 142, 214, 149, 174, 239, 172, 255, 242, 255,},

+  {110, 143, 215, 150, 174, 240, 173, 255, 242, 255,},

+  {111, 144, 216, 150, 175, 240, 174, 255, 243, 255,},

+  {112, 145, 216, 151, 175, 240, 174, 255, 243, 255,},

+  {113, 146, 217, 152, 176, 241, 175, 255, 243, 255,},

+  {114, 147, 218, 152, 176, 241, 176, 255, 244, 255,},

+  {115, 148, 219, 153, 176, 242, 176, 255, 244, 255,},

+  {116, 149, 219, 153, 177, 242, 177, 255, 244, 255,},

+  {117, 150, 220, 154, 177, 242, 178, 255, 245, 255,},

+  {118, 151, 221, 154, 178, 243, 178, 255, 245, 255,},

+  {119, 152, 221, 155, 178, 243, 179, 255, 245, 255,},

+  {120, 153, 222, 156, 178, 244, 180, 255, 246, 255,},

+  {121, 154, 223, 156, 179, 244, 180, 255, 246, 255,},

+  {122, 155, 223, 157, 179, 244, 181, 255, 246, 255,},

+  {123, 156, 224, 157, 180, 245, 182, 255, 247, 255,},

+  {124, 157, 225, 158, 180, 245, 183, 255, 247, 255,},

+  {125, 158, 225, 159, 180, 245, 183, 255, 247, 255,},

+  {126, 159, 226, 159, 181, 246, 184, 255, 247, 255,},

+  {127, 160, 227, 160, 181, 246, 185, 255, 248, 255,},

+  {128, 161, 227, 160, 182, 246, 185, 255, 248, 255,},

+  {129, 162, 228, 161, 182, 246, 186, 255, 248, 255,},

+  {130, 163, 229, 162, 183, 247, 187, 255, 248, 255,},

+  {131, 164, 229, 162, 183, 247, 187, 255, 249, 255,},

+  {132, 165, 230, 163, 183, 247, 188, 255, 249, 255,},

+  {133, 166, 230, 163, 184, 248, 189, 255, 249, 255,},

+  {135, 167, 231, 164, 184, 248, 190, 255, 249, 255,},

+  {136, 168, 232, 165, 185, 248, 190, 255, 250, 255,},

+  {137, 169, 232, 165, 185, 248, 191, 255, 250, 255,},

+  {138, 170, 233, 166, 186, 249, 192, 255, 250, 255,},

+  {139, 171, 233, 167, 186, 249, 192, 255, 250, 255,},

+  {140, 172, 234, 167, 187, 249, 193, 255, 251, 255,},

+  {141, 173, 234, 168, 187, 249, 194, 255, 251, 255,},

+  {142, 174, 235, 169, 187, 250, 195, 255, 251, 255,},

+  {143, 175, 235, 169, 188, 250, 195, 255, 251, 255,},

+  {144, 176, 236, 170, 188, 250, 196, 255, 251, 255,},

+  {146, 177, 236, 171, 189, 250, 197, 255, 251, 255,},

+  {147, 178, 237, 171, 189, 251, 197, 255, 252, 255,},

+  {148, 179, 237, 172, 190, 251, 198, 255, 252, 255,},

+  {149, 180, 238, 173, 190, 251, 199, 255, 252, 255,},

+  {150, 181, 238, 173, 191, 251, 200, 255, 252, 255,},

+  {151, 182, 239, 174, 191, 251, 200, 255, 252, 255,},

+  {152, 183, 239, 175, 192, 251, 201, 255, 252, 255,},

+  {153, 184, 240, 176, 192, 252, 202, 255, 253, 255,},

+  {155, 185, 240, 176, 193, 252, 203, 255, 253, 255,},

+  {156, 186, 241, 177, 193, 252, 203, 255, 253, 255,},

+  {157, 187, 241, 178, 194, 252, 204, 255, 253, 255,},

+  {158, 188, 242, 179, 194, 252, 205, 255, 253, 255,},

+  {159, 189, 242, 179, 195, 252, 206, 255, 253, 255,},

+  {160, 190, 242, 180, 195, 253, 206, 255, 253, 255,},

+  {162, 191, 243, 181, 196, 253, 207, 255, 253, 255,},

+  {163, 192, 243, 182, 196, 253, 208, 255, 254, 255,},

+  {164, 193, 244, 182, 197, 253, 209, 255, 254, 255,},

+  {165, 194, 244, 183, 198, 253, 209, 255, 254, 255,},

+  {166, 195, 244, 184, 198, 253, 210, 255, 254, 255,},

+  {168, 196, 245, 185, 199, 253, 211, 255, 254, 255,},

+  {169, 197, 245, 185, 199, 254, 212, 255, 254, 255,},

+  {170, 198, 246, 186, 200, 254, 212, 255, 254, 255,},

+  {171, 199, 246, 187, 200, 254, 213, 255, 254, 255,},

+  {172, 200, 246, 188, 201, 254, 214, 255, 254, 255,},

+  {174, 201, 247, 189, 201, 254, 215, 255, 254, 255,},

+  {175, 202, 247, 189, 202, 254, 215, 255, 255, 255,},

+  {176, 203, 247, 190, 203, 254, 216, 255, 255, 255,},

+  {177, 204, 248, 191, 203, 254, 217, 255, 255, 255,},

+  {179, 205, 248, 192, 204, 254, 218, 255, 255, 255,},

+  {180, 206, 248, 193, 204, 254, 218, 255, 255, 255,},

+  {181, 207, 249, 194, 205, 255, 219, 255, 255, 255,},

+  {183, 208, 249, 195, 206, 255, 220, 255, 255, 255,},

+  {184, 209, 249, 195, 206, 255, 221, 255, 255, 255,},

+  {185, 210, 250, 196, 207, 255, 221, 255, 255, 255,},

+  {186, 211, 250, 197, 208, 255, 222, 255, 255, 255,},

+  {188, 212, 250, 198, 208, 255, 223, 255, 255, 255,},

+  {189, 213, 250, 199, 209, 255, 224, 255, 255, 255,},

+  {190, 214, 251, 200, 210, 255, 224, 255, 255, 255,},

+  {192, 215, 251, 201, 210, 255, 225, 255, 255, 255,},

+  {193, 216, 251, 202, 211, 255, 226, 255, 255, 255,},

+  {194, 217, 251, 203, 212, 255, 227, 255, 255, 255,},

+  {196, 218, 252, 204, 212, 255, 228, 255, 255, 255,},

+  {197, 219, 252, 205, 213, 255, 228, 255, 255, 255,},

+  {198, 220, 252, 206, 214, 255, 229, 255, 255, 255,},

+  {200, 221, 252, 207, 215, 255, 230, 255, 255, 255,},

+  {201, 222, 252, 208, 215, 255, 231, 255, 255, 255,},

+  {202, 223, 253, 209, 216, 255, 231, 255, 255, 255,},

+  {204, 224, 253, 210, 217, 255, 232, 255, 255, 255,},

+  {205, 225, 253, 211, 218, 255, 233, 255, 255, 255,},

+  {207, 226, 253, 212, 218, 255, 234, 255, 255, 255,},

+  {208, 227, 253, 213, 219, 255, 234, 255, 255, 255,},

+  {209, 228, 254, 214, 220, 255, 235, 255, 255, 255,},

+  {211, 229, 254, 215, 221, 255, 236, 255, 255, 255,},

+  {212, 230, 254, 216, 222, 255, 237, 255, 255, 255,},

+  {214, 231, 254, 217, 223, 255, 238, 255, 255, 255,},

+  {215, 232, 254, 218, 223, 255, 238, 255, 255, 255,},

+  {217, 233, 254, 219, 224, 255, 239, 255, 255, 255,},

+  {218, 234, 255, 221, 225, 255, 240, 255, 255, 255,},

+  {220, 235, 255, 222, 226, 255, 241, 255, 255, 255,},

+  {221, 236, 255, 223, 227, 255, 241, 255, 255, 255,},

+  {223, 237, 255, 224, 228, 255, 242, 255, 255, 255,},

+  {224, 238, 255, 225, 229, 255, 243, 255, 255, 255,},

+  {226, 239, 255, 227, 230, 255, 244, 255, 255, 255,},

+  {227, 240, 255, 228, 231, 255, 244, 255, 255, 255,},

+  {229, 241, 255, 229, 232, 255, 245, 255, 255, 255,},

+  {231, 242, 255, 231, 233, 255, 246, 255, 255, 255,},

+  {232, 243, 255, 232, 234, 255, 247, 255, 255, 255,},

+  {234, 244, 255, 233, 236, 255, 247, 255, 255, 255,},

+  {235, 245, 255, 235, 237, 255, 248, 255, 255, 255,},

+  {237, 246, 255, 236, 238, 255, 249, 255, 255, 255,},

+  {239, 247, 255, 238, 239, 255, 250, 255, 255, 255,},

+  {241, 248, 255, 239, 241, 255, 250, 255, 255, 255,},

+  {242, 249, 255, 241, 242, 255, 251, 255, 255, 255,},

+  {244, 250, 255, 243, 243, 255, 252, 255, 255, 255,},

+  {246, 251, 255, 244, 245, 255, 253, 255, 255, 255,},

+  {248, 252, 255, 246, 247, 255, 253, 255, 255, 255,},

+  {250, 253, 255, 248, 248, 255, 254, 255, 255, 255,},

+  {252, 254, 255, 250, 250, 255, 255, 255, 255, 255,},

+  {254, 255, 255, 253, 253, 255, 255, 255, 255, 255,},

+};

+void vp9_get_model_distribution(vp9_prob p, vp9_prob *tree_probs,

+                                int b, int r) {

+  const vp9_prob (*model)[ENTROPY_NODES - 1];

+#if UNCONSTRAINED_NODES == 2

+  if (r != INTRA_FRAME && b == PLANE_TYPE_UV)

+    model = vp9_modelcoefprobs_gg75;

+  else if (r == INTRA_FRAME && b == PLANE_TYPE_UV)

+    model = vp9_modelcoefprobs_gg75;

+  else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC)

+    model = vp9_modelcoefprobs_gg75;

+  else

+    model = vp9_modelcoefprobs_gg75;

+#else

+  if (r != INTRA_FRAME && b == PLANE_TYPE_UV)

+    model = vp9_modelcoefprobs_gg75p1;

+  else if (r == INTRA_FRAME && b == PLANE_TYPE_UV)

+    model = vp9_modelcoefprobs_gg75p1;

+  else if (r != INTRA_FRAME && b == PLANE_TYPE_Y_WITH_DC)

+    model = vp9_modelcoefprobs_gg75p1;

+  else

+    model = vp9_modelcoefprobs_gg75p1;

+#endif

+  vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,

+             model[p] + UNCONSTRAINED_NODES - 1,

+             (ENTROPY_NODES - UNCONSTRAINED_NODES) * sizeof(vp9_prob));

+}

+#endif

 static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];

 static void init_bit_tree(vp9_tree_index *p, int n) {

@@ -937,8 +2094,188 @@

 #include "vp9/common/vp9_default_coef_probs.h"

-#if CONFIG_NEWCOEFCONTEXT

+// This function updates and then returns n AC coefficient context

+// This is currently a placeholder function to allow experimentation

+// using various context models based on the energy earlier tokens

+// within the current block.

+//

+// For now it just returns the previously used context.

+#define MAX_NEIGHBORS 2

+int vp9_get_coef_context(const int *scan, const int *neighbors,

+                         int nb_pad, uint8_t *token_cache, int c, int l) {

+  int eob = l;

+  assert(nb_pad == MAX_NEIGHBORS);

+  if (c == eob) {

+    return 0;

+  } else {

+    int ctx;

+    assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0);

+    if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) {

+      ctx = (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +

+             token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;

+    } else {

+      ctx = token_cache[neighbors[MAX_NEIGHBORS * c + 0]];

+    }

+    return vp9_pt_energy_class[ctx];

+  }

+};

+void vp9_default_coef_probs(VP9_COMMON *pc) {

+#if CONFIG_MODELCOEFPROB

+  int b, r, c, p;

+#endif

+#if CONFIG_CODE_NONZEROCOUNT

+#ifdef NZC_DEFAULT_COUNTS

+  int h, g;

+  for (h = 0; h < MAX_NZC_CONTEXTS; ++h) {

+    for (g = 0; g < REF_TYPES; ++g) {

+      int i;

+      unsigned int branch_ct4x4[NZC4X4_NODES][2];

+      unsigned int branch_ct8x8[NZC8X8_NODES][2];

+      unsigned int branch_ct16x16[NZC16X16_NODES][2];

+      unsigned int branch_ct32x32[NZC32X32_NODES][2];

+      for (i = 0; i < BLOCK_TYPES; ++i) {

+        vp9_tree_probs_from_distribution(

+          vp9_nzc4x4_tree,

+          pc->fc.nzc_probs_4x4[h][g][i], branch_ct4x4,

+          default_nzc_counts_4x4[h][g][i], 0);

+      }

+      for (i = 0; i < BLOCK_TYPES; ++i) {

+        vp9_tree_probs_from_distribution(

+          vp9_nzc8x8_tree,

+          pc->fc.nzc_probs_8x8[h][g][i], branch_ct8x8,

+          default_nzc_counts_8x8[h][g][i], 0);

+      }

+      for (i = 0; i < BLOCK_TYPES; ++i) {

+        vp9_tree_probs_from_distribution(

+          vp9_nzc16x16_tree,

+          pc->fc.nzc_probs_16x16[h][g][i], branch_ct16x16,

+          default_nzc_counts_16x16[h][g][i], 0);

+      }

+      for (i = 0; i < BLOCK_TYPES; ++i) {

+        vp9_tree_probs_from_distribution(

+          vp9_nzc32x32_tree,

+          pc->fc.nzc_probs_32x32[h][g][i], branch_ct32x32,

+          default_nzc_counts_32x32[h][g][i], 0);

+      }

+    }

+  }

+#else

+  vpx_memcpy(pc->fc.nzc_probs_4x4, default_nzc_probs_4x4,

+             sizeof(pc->fc.nzc_probs_4x4));

+  vpx_memcpy(pc->fc.nzc_probs_8x8, default_nzc_probs_8x8,

+             sizeof(pc->fc.nzc_probs_8x8));

+  vpx_memcpy(pc->fc.nzc_probs_16x16, default_nzc_probs_16x16,

+             sizeof(pc->fc.nzc_probs_16x16));

+  vpx_memcpy(pc->fc.nzc_probs_32x32, default_nzc_probs_32x32,

+             sizeof(pc->fc.nzc_probs_32x32));

+#endif

+  vpx_memcpy(pc->fc.nzc_pcat_probs, default_nzc_pcat_probs,

+             sizeof(pc->fc.nzc_pcat_probs));

+#endif  // CONFIG_CODE_NONZEROCOUNT

+#if CONFIG_MODELCOEFPROB

+  for (b = 0; b < BLOCK_TYPES; ++b)

+    for (r = 0; r < REF_TYPES; ++r)

+      for (c = 0; c < COEF_BANDS; ++c)

+        for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {

+          int t;

+          for (t = 0; t < UNCONSTRAINED_NODES; t++)

+            pc->fc.coef_probs_4x4[b][r][c][p][t] =

+                default_coef_probs_4x4[b][r][c][p][t];

+          vp9_get_model_distribution(

+              default_coef_probs_4x4[b][r][c][p][UNCONSTRAINED_NODES - 1],

+              pc->fc.coef_probs_4x4[b][r][c][p], b, r);

+          for (t = 0; t < UNCONSTRAINED_NODES; t++)

+            pc->fc.coef_probs_8x8[b][r][c][p][t] =

+                default_coef_probs_8x8[b][r][c][p][t];

+          vp9_get_model_distribution(

+              default_coef_probs_8x8[b][r][c][p][UNCONSTRAINED_NODES - 1],

+              pc->fc.coef_probs_8x8[b][r][c][p], b, r);

+          for (t = 0; t < UNCONSTRAINED_NODES; t++)

+            pc->fc.coef_probs_16x16[b][r][c][p][t] =

+                default_coef_probs_16x16[b][r][c][p][t];

+          vp9_get_model_distribution(

+              default_coef_probs_16x16[b][r][c][p][UNCONSTRAINED_NODES - 1],

+              pc->fc.coef_probs_16x16[b][r][c][p], b, r);

+          for (t = 0; t < UNCONSTRAINED_NODES; t++)

+            pc->fc.coef_probs_32x32[b][r][c][p][t] =

+                default_coef_probs_32x32[b][r][c][p][t];

+          vp9_get_model_distribution(

+              default_coef_probs_32x32[b][r][c][p][UNCONSTRAINED_NODES - 1],

+              pc->fc.coef_probs_32x32[b][r][c][p], b, r);

+        }

+#else

+  vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,

+             sizeof(pc->fc.coef_probs_4x4));

+  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,

+             sizeof(pc->fc.coef_probs_8x8));

+  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,

+             sizeof(pc->fc.coef_probs_16x16));

+  vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,

+             sizeof(pc->fc.coef_probs_32x32));

+#endif

+}

+#if CONFIG_MODELCOEFPROB

+// This is a placeholder function that will enable the default coef probs to

+// change for key frames based on the base_qindex. If base_qindex is large,

+// we can expect probabilities of zeros to be bigger, and vice versa. The rest

+// of the probabilities are derived from the nodel.

+void vp9_adjust_default_coef_probs(VP9_COMMON *cm) {

+  static const int factor_bits = 4;

+  static const int factor_rnd = 8;   // (1 << (factor_bits - 1))

+  int b, r, c, p;

+  int factor = (1 << factor_bits);

+  /*

+  if (cm->base_qindex < 32)

+    factor -= ((32 - cm->base_qindex) >> 4);

+    */

+  if (cm->base_qindex > 128)

+    factor += ((cm->base_qindex - 128) >> 4);

+  // printf(" Q %d factor %d\n", cm->base_qindex, factor);

+  for (b = 0; b < BLOCK_TYPES; ++b)

+    for (r = 0; r < REF_TYPES; ++r)

+      for (c = 0; c < COEF_BANDS; ++c)

+        for (p = 0; p < PREV_COEF_CONTEXTS; ++p) {

+          int t, x;

+          vp9_prob prob;

+          for (t = 0; t < UNCONSTRAINED_NODES; t++) {

+            x = (default_coef_probs_4x4[b][r][c][p][t] * factor + factor_rnd)

+                >> factor_bits;

+            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));

+            cm->fc.coef_probs_4x4[b][r][c][p][t] = prob;

+          }

+          vp9_get_model_distribution(

+              prob, cm->fc.coef_probs_4x4[b][r][c][p], b, r);

+          for (t = 0; t < UNCONSTRAINED_NODES; t++) {

+            x = (default_coef_probs_8x8[b][r][c][p][t] * factor + factor_rnd)

+                >> factor_bits;

+            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));

+            cm->fc.coef_probs_8x8[b][r][c][p][t] = prob;

+          }

+          vp9_get_model_distribution(

+              prob, cm->fc.coef_probs_8x8[b][r][c][p], b, r);

+          for (t = 0; t < UNCONSTRAINED_NODES; t++) {

+            x = (default_coef_probs_16x16[b][r][c][p][t] * factor + factor_rnd)

+                >> factor_bits;

+            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));

+            cm->fc.coef_probs_16x16[b][r][c][p][t] = prob;

+          }

+          vp9_get_model_distribution(

+              prob, cm->fc.coef_probs_16x16[b][r][c][p], b, r);

+          for (t = 0; t < UNCONSTRAINED_NODES; t++) {

+            x = (default_coef_probs_32x32[b][r][c][p][t] * factor + factor_rnd)

+                >> factor_bits;

+            prob = (x > 255 ? 255 : (x < 1 ? 1 : x));

+            cm->fc.coef_probs_32x32[b][r][c][p][t] = prob;

+          }

+          vp9_get_model_distribution(

+              prob, cm->fc.coef_probs_32x32[b][r][c][p], b, r);

+        }

+}

+#endif

 // Neighborhood 5-tuples for various scans and blocksizes,

 // in {top, left, topleft, topright, bottomleft} order

 // for each position in raster scan order.

@@ -950,158 +2287,1235 @@

 DECLARE_ALIGNED(16, int,

                 vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]);

 DECLARE_ALIGNED(16, int,

+                vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);

+DECLARE_ALIGNED(16, int,

+                vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]);

+DECLARE_ALIGNED(16, int,

                 vp9_default_zig_zag1d_8x8_neighbors[64 * MAX_NEIGHBORS]);

 DECLARE_ALIGNED(16, int,

+                vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);

+DECLARE_ALIGNED(16, int,

+                vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]);

+DECLARE_ALIGNED(16, int,

                 vp9_default_zig_zag1d_16x16_neighbors[256 * MAX_NEIGHBORS]);

 DECLARE_ALIGNED(16, int,

                 vp9_default_zig_zag1d_32x32_neighbors[1024 * MAX_NEIGHBORS]);

-static int find_in_scan(const int *scan, int l, int m) {

-  int i, l2 = l * l;

-  for (i = 0; i < l2; ++i) {

-    if (scan[i] == m)

-      return i;

+static int find_in_scan(const int *scan, int l, int idx) {

+  int n, l2 = l * l;

+  for (n = 0; n < l2; n++) {

+    int rc = scan[n];

+    if (rc == idx)

+      return  n;

+  assert(0);

   return -1;

-static void init_scan_neighbors(const int *scan, int l, int *neighbors) {

+static void init_scan_neighbors(const int *scan, int l, int *neighbors,

+                                int max_neighbors) {

   int l2 = l * l;

-  int m, n, i, j, k;

-  for (n = 0; n < l2; ++n) {

-    int locn = find_in_scan(scan, l, n);

-    int z = -1;

-    i = n / l;

-    j = n % l;

-    for (k = 0; k < MAX_NEIGHBORS; ++k)

-      neighbors[MAX_NEIGHBORS * n + k] = -1;

-    if (i - 1 >= 0) {

-      m = (i - 1) * l + j;

-      if (find_in_scan(scan, l, m) < locn) {

-        neighbors[MAX_NEIGHBORS * n] = m;

-        if (m == 0) z = 0;

+  int n, i, j;

+  for (n = 0; n < l2; n++) {

+    int rc = scan[n];

+    assert(max_neighbors == MAX_NEIGHBORS);

+    i = rc / l;

+    j = rc % l;

+    if (i > 0 && j > 0) {

+      // col/row scan is used for adst/dct, and generally means that

+      // energy decreases to zero much faster in the dimension in

+      // which ADST is used compared to the direction in which DCT

+      // is used. Likewise, we find much higher correlation between

+      // coefficients within the direction in which DCT is used.

+      // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff

+      // as a context. If ADST or DCT is used in both directions, we

+      // use the combination of the two as a context.

+      int a = find_in_scan(scan, l, (i - 1) * l + j);

+      int b = find_in_scan(scan, l,  i      * l + j - 1);

+      if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||

+          scan == vp9_col_scan_16x16) {

+        neighbors[max_neighbors * n + 0] = a;

+        neighbors[max_neighbors * n + 1] = -1;

+      } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||

+                 scan == vp9_row_scan_16x16) {

+        neighbors[max_neighbors * n + 0] = b;

+        neighbors[max_neighbors * n + 1] = -1;

+      } else {

+        neighbors[max_neighbors * n + 0] = a;

+        neighbors[max_neighbors * n + 1] = b;

+    } else if (i > 0) {

+      neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j);

+      neighbors[max_neighbors * n + 1] = -1;

+    } else if (j > 0) {

+      neighbors[max_neighbors * n + 0] =

+          find_in_scan(scan, l,  i      * l + j - 1);

+      neighbors[max_neighbors * n + 1] = -1;

+    } else {

+      assert(n == 0);

+      // dc predictor doesn't use previous tokens

+      neighbors[max_neighbors * n + 0] = -1;

-    if (j - 1 >= 0) {

-      m = i * l + j - 1;

-      if (find_in_scan(scan, l, m) < locn) {

-        neighbors[MAX_NEIGHBORS * n + 1] = m;

-        if (m == 0) z = 1;

-      }

-    }

-    if (i - 1 >= 0 && j - 1 >= 0) {

-      m = (i - 1) * l + j - 1;

-      if (find_in_scan(scan, l, m) < locn) {

-        neighbors[MAX_NEIGHBORS * n + 2] = m;

-        if (m == 0) z = 2;

-      }

-    }

-    if (i - 1 >= 0 && j + 1 < l) {

-      m = (i - 1) * l + j + 1;

-      if (find_in_scan(scan, l, m) < locn) {

-        neighbors[MAX_NEIGHBORS * n + 3] = m;

-        if (m == 0) z = 3;

-      }

-    }

-    if (i + 1 < l && j - 1 >= 0) {

-       m = (i + 1) * l + j - 1;

-      if (find_in_scan(scan, l, m) < locn) {

-        neighbors[MAX_NEIGHBORS * n + 4] = m;

-        if (m == 0) z = 4;

-      }

-    }

-    if (z != -1) {  // zero exists

-      int v = 0;

-      for (k = 0; k < MAX_NEIGHBORS; ++k)

-        v += (neighbors[MAX_NEIGHBORS * n + k] > 0);

-      if (v) {

-        neighbors[MAX_NEIGHBORS * n + z] = -1;

-      }

-    }

+    assert(neighbors[max_neighbors * n + 0] < n);

 void vp9_init_neighbors() {

   init_scan_neighbors(vp9_default_zig_zag1d_4x4, 4,

-                      vp9_default_zig_zag1d_4x4_neighbors);

+                      vp9_default_zig_zag1d_4x4_neighbors, MAX_NEIGHBORS);

   init_scan_neighbors(vp9_row_scan_4x4, 4,

-                      vp9_row_scan_4x4_neighbors);

+                      vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS);

   init_scan_neighbors(vp9_col_scan_4x4, 4,

-                      vp9_col_scan_4x4_neighbors);

+                      vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS);

   init_scan_neighbors(vp9_default_zig_zag1d_8x8, 8,

-                      vp9_default_zig_zag1d_8x8_neighbors);

+                      vp9_default_zig_zag1d_8x8_neighbors, MAX_NEIGHBORS);

+  init_scan_neighbors(vp9_row_scan_8x8, 8,

+                      vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS);

+  init_scan_neighbors(vp9_col_scan_8x8, 8,

+                      vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS);

   init_scan_neighbors(vp9_default_zig_zag1d_16x16, 16,

-                      vp9_default_zig_zag1d_16x16_neighbors);

+                      vp9_default_zig_zag1d_16x16_neighbors, MAX_NEIGHBORS);

+  init_scan_neighbors(vp9_row_scan_16x16, 16,

+                      vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS);

+  init_scan_neighbors(vp9_col_scan_16x16, 16,

+                      vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS);

   init_scan_neighbors(vp9_default_zig_zag1d_32x32, 32,

-                      vp9_default_zig_zag1d_32x32_neighbors);

+                      vp9_default_zig_zag1d_32x32_neighbors, MAX_NEIGHBORS);

-const int *vp9_get_coef_neighbors_handle(const int *scan) {

+const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad) {

   if (scan == vp9_default_zig_zag1d_4x4) {

+    *pad = MAX_NEIGHBORS;

     return vp9_default_zig_zag1d_4x4_neighbors;

   } else if (scan == vp9_row_scan_4x4) {

+    *pad = MAX_NEIGHBORS;

     return vp9_row_scan_4x4_neighbors;

   } else if (scan == vp9_col_scan_4x4) {

+    *pad = MAX_NEIGHBORS;

     return vp9_col_scan_4x4_neighbors;

   } else if (scan == vp9_default_zig_zag1d_8x8) {

+    *pad = MAX_NEIGHBORS;

     return vp9_default_zig_zag1d_8x8_neighbors;

+  } else if (scan == vp9_row_scan_8x8) {

+    *pad = 2;

+    return vp9_row_scan_8x8_neighbors;

+  } else if (scan == vp9_col_scan_8x8) {

+    *pad = 2;

+    return vp9_col_scan_8x8_neighbors;

   } else if (scan == vp9_default_zig_zag1d_16x16) {

+    *pad = MAX_NEIGHBORS;

     return vp9_default_zig_zag1d_16x16_neighbors;

+  } else if (scan == vp9_row_scan_16x16) {

+    *pad = 2;

+    return vp9_row_scan_16x16_neighbors;

+  } else if (scan == vp9_col_scan_16x16) {

+    *pad = 2;

+    return vp9_col_scan_16x16_neighbors;

   } else if (scan == vp9_default_zig_zag1d_32x32) {

+    *pad = MAX_NEIGHBORS;

     return vp9_default_zig_zag1d_32x32_neighbors;

+  } else {

+    assert(0);

+    return NULL;

-  return vp9_default_zig_zag1d_4x4_neighbors;

-int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,

-                                  const int *neigbor_handle, int rc) {

-  static int neighbors_used = MAX_NEIGHBORS;   // maximum is MAX_NEIGHBORS

-  const int *nb = neigbor_handle + rc * MAX_NEIGHBORS;

-  int i, v, val = 0, n = 0;

-  for (i = 0; i < neighbors_used; ++i) {

-    if (nb[i] == -1 || (nb[i] == 0 && nodc)) {

-      continue;

-    }

-    v = abs(qcoeff_ptr[nb[i]]);

-    val = (v > val ? v : val);

-    n++;

+void vp9_coef_tree_initialize() {

+  vp9_init_neighbors();

+  init_bit_trees();

+  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_tokens_from_tree(vp9_nzc4x4_encodings, vp9_nzc4x4_tree);

+  vp9_tokens_from_tree(vp9_nzc8x8_encodings, vp9_nzc8x8_tree);

+  vp9_tokens_from_tree(vp9_nzc16x16_encodings, vp9_nzc16x16_tree);

+  vp9_tokens_from_tree(vp9_nzc32x32_encodings, vp9_nzc32x32_tree);

+#endif

+}

+#if CONFIG_CODE_NONZEROCOUNT

+#define mb_in_cur_tile(cm, mb_row, mb_col)      \

+    ((mb_col) >= (cm)->cur_tile_mb_col_start && \

+     (mb_col) <= (cm)->cur_tile_mb_col_end   && \

+     (mb_row) >= 0)

+#define choose_nzc_context(nzc_exp, t2, t1)     \

+    ((nzc_exp) >= (t2) ? 2 : (nzc_exp) >= (t1) ? 1 : 0)

+#define NZC_T2_32X32    (16 << 6)

+#define NZC_T1_32X32     (4 << 6)

+#define NZC_T2_16X16    (12 << 6)

+#define NZC_T1_16X16     (3 << 6)

+#define NZC_T2_8X8       (8 << 6)

+#define NZC_T1_8X8       (2 << 6)

+#define NZC_T2_4X4       (4 << 6)

+#define NZC_T1_4X4       (1 << 6)

+// Transforms a mb16 block index to a sb64 block index

+static inline int mb16_to_sb64_index(int mb_row, int mb_col, int block) {

+  int r = (mb_row & 3);

+  int c = (mb_col & 3);

+  int b;

+  if (block < 16) {  // Y

+    int ib = block >> 2;

+    int jb = block & 3;

+    ib += r * 4;

+    jb += c * 4;

+    b = ib * 16 + jb;

+    assert(b < 256);

+    return b;

+  } else {  // UV

+    int base = block - (block & 3);

+    int ib = (block - base) >> 1;

+    int jb = (block - base) & 1;

+    ib += r * 2;

+    jb += c * 2;

+    b = base * 16 + ib * 8 + jb;

+    assert(b >= 256 && b < 384);

+    return b;

-  if (n == 0)

+}

+// Transforms a mb16 block index to a sb32 block index

+static inline int mb16_to_sb32_index(int mb_row, int mb_col, int block) {

+  int r = (mb_row & 1);

+  int c = (mb_col & 1);

+  int b;

+  if (block < 16) {  // Y

+    int ib = block >> 2;

+    int jb = block & 3;

+    ib += r * 4;

+    jb += c * 4;

+    b = ib * 8 + jb;

+    assert(b < 64);

+    return b;

+  } else {  // UV

+    int base = block - (block & 3);

+    int ib = (block - base) >> 1;

+    int jb = (block - base) & 1;

+    ib += r * 2;

+    jb += c * 2;

+    b = base * 4 + ib * 4 + jb;

+    assert(b >= 64 && b < 96);

+    return b;

+  }

+}

+static inline int block_to_txfm_index(int block, TX_SIZE tx_size, int s) {

+  // s is the log of the number of 4x4 blocks in each row/col of larger block

+  int b, ib, jb, nb;

+  ib = block >> s;

+  jb = block - (ib << s);

+  ib >>= tx_size;

+  jb >>= tx_size;

+  nb = 1 << (s - tx_size);

+  b = (ib * nb + jb) << (2 * tx_size);

+  return b;

+}

+/* BEGIN - Helper functions to get the y nzcs */

+static unsigned int get_nzc_4x4_y_sb64(MB_MODE_INFO *mi, int block) {

+  int b;

+  assert(block < 256);

+  b = block_to_txfm_index(block, mi->txfm_size, 4);

+  assert(b < 256);

+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);

+}

+static unsigned int get_nzc_4x4_y_sb32(MB_MODE_INFO *mi, int block) {

+  int b;

+  assert(block < 64);

+  b = block_to_txfm_index(block, mi->txfm_size, 3);

+  assert(b < 64);

+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);

+}

+static unsigned int get_nzc_4x4_y_mb16(MB_MODE_INFO *mi, int block) {

+  int b;

+  assert(block < 16);

+  b = block_to_txfm_index(block, mi->txfm_size, 2);

+  assert(b < 16);

+  return mi->nzcs[b] << (6 - 2 * mi->txfm_size);

+}

+/* END - Helper functions to get the y nzcs */

+/* Function to get y nzc where block index is in mb16 terms */

+static unsigned int get_nzc_4x4_y(VP9_COMMON *cm, MODE_INFO *m,

+                                  int mb_row, int mb_col, int block) {

+  // NOTE: All values returned are at 64 times the true value at 4x4 scale

+  MB_MODE_INFO *const mi = &m->mbmi;

+  const int mis = cm->mode_info_stride;

+  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))

     return 0;

-  else if (val <= 1)

-    return val;

-  else if (val < 4)

-    return 2;

+  if (mi->sb_type == BLOCK_SIZE_SB64X64) {

+    int r = mb_row & 3;

+    int c = mb_col & 3;

+    m -= c + r * mis;

+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))

+      return 0;

+    else

+      return get_nzc_4x4_y_sb64(

+          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));

+  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {

+    int r = mb_row & 1;

+    int c = mb_col & 1;

+    m -= c + r * mis;

+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))

+      return 0;

+    else

+      return get_nzc_4x4_y_sb32(

+          &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));

+  } else {

+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))

+      return 0;

+    return get_nzc_4x4_y_mb16(mi, block);

+  }

+}

+/* BEGIN - Helper functions to get the uv nzcs */

+static unsigned int get_nzc_4x4_uv_sb64(MB_MODE_INFO *mi, int block) {

+  int b;

+  int base, uvtxfm_size;

+  assert(block >= 256 && block < 384);

+  uvtxfm_size = mi->txfm_size;

+  base = 256 + (block & 64);

+  block -= base;

+  b = base + block_to_txfm_index(block, uvtxfm_size, 3);

+  assert(b >= 256 && b < 384);

+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);

+}

+static unsigned int get_nzc_4x4_uv_sb32(MB_MODE_INFO *mi, int block) {

+  int b;

+  int base, uvtxfm_size;

+  assert(block >= 64 && block < 96);

+  if (mi->txfm_size == TX_32X32)

+    uvtxfm_size = TX_16X16;

   else

-    return 3;

+    uvtxfm_size = mi->txfm_size;

+  base = 64 + (block & 16);

+  block -= base;

+  b = base + block_to_txfm_index(block, uvtxfm_size, 2);

+  assert(b >= 64 && b < 96);

+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);

-#endif  /* CONFIG_NEWCOEFCONTEXT */

-void vp9_default_coef_probs(VP9_COMMON *pc) {

-  vpx_memcpy(pc->fc.coef_probs_4x4, default_coef_probs_4x4,

-             sizeof(pc->fc.coef_probs_4x4));

-  vpx_memcpy(pc->fc.hybrid_coef_probs_4x4, default_hybrid_coef_probs_4x4,

-             sizeof(pc->fc.hybrid_coef_probs_4x4));

+static unsigned int get_nzc_4x4_uv_mb16(MB_MODE_INFO *mi, int block) {

+  int b;

+  int base, uvtxfm_size;

+  assert(block >= 16 && block < 24);

+  if (mi->txfm_size == TX_8X8 &&

+      (mi->mode == SPLITMV || mi->mode == I8X8_PRED))

+    uvtxfm_size = TX_4X4;

+  else if (mi->txfm_size == TX_16X16)

+    uvtxfm_size = TX_8X8;

+  else

+    uvtxfm_size = mi->txfm_size;

+  base = 16 + (block & 4);

+  block -= base;

+  b = base + block_to_txfm_index(block, uvtxfm_size, 1);

+  assert(b >= 16 && b < 24);

+  return mi->nzcs[b] << (6 - 2 * uvtxfm_size);

+}

+/* END - Helper functions to get the uv nzcs */

-  vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8,

-             sizeof(pc->fc.coef_probs_8x8));

-  vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8,

-             sizeof(pc->fc.hybrid_coef_probs_8x8));

+/* Function to get uv nzc where block index is in mb16 terms */

+static unsigned int get_nzc_4x4_uv(VP9_COMMON *cm, MODE_INFO *m,

+                                   int mb_row, int mb_col, int block) {

+  // NOTE: All values returned are at 64 times the true value at 4x4 scale

+  MB_MODE_INFO *const mi = &m->mbmi;

+  const int mis = cm->mode_info_stride;

+  if (mi->mb_skip_coeff || !mb_in_cur_tile(cm, mb_row, mb_col))

+    return 0;

+  if (mi->sb_type == BLOCK_SIZE_SB64X64) {

+    int r = mb_row & 3;

+    int c = mb_col & 3;

+    m -= c + r * mis;

+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))

+      return 0;

+    else

+      return get_nzc_4x4_uv_sb64(

+          &m->mbmi, mb16_to_sb64_index(mb_row, mb_col, block));

+  } else if (mi->sb_type == BLOCK_SIZE_SB32X32) {

+    int r = mb_row & 1;

+    int c = mb_col & 1;

+    m -= c + r * mis;

+    if (m->mbmi.mb_skip_coeff || !mb_in_cur_tile(cm, mb_row - r, mb_col - c))

+      return 0;

+    else

+    return get_nzc_4x4_uv_sb32(

+        &m->mbmi, mb16_to_sb32_index(mb_row, mb_col, block));

+  } else {

+    return get_nzc_4x4_uv_mb16(mi, block);

+  }

+}

-  vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16,

-             sizeof(pc->fc.coef_probs_16x16));

-  vpx_memcpy(pc->fc.hybrid_coef_probs_16x16,

-             default_hybrid_coef_probs_16x16,

-             sizeof(pc->fc.hybrid_coef_probs_16x16));

-  vpx_memcpy(pc->fc.coef_probs_32x32, default_coef_probs_32x32,

-             sizeof(pc->fc.coef_probs_32x32));

+int vp9_get_nzc_context_y_sb64(VP9_COMMON *cm, MODE_INFO *cur,

+                               int mb_row, int mb_col, int block) {

+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy

+  // neighboring blocks are

+  int mis = cm->mode_info_stride;

+  int nzc_exp = 0;

+  TX_SIZE txfm_size = cur->mbmi.txfm_size;

+  assert(block < 256);

+  switch (txfm_size) {

+    case TX_32X32:

+      assert((block & 63) == 0);

+      if (block < 128) {

+        int o = (block >> 6) * 2;

+        nzc_exp =

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15) +

+            get_nzc_4x4_y(cm, cur - mis + o + 1,

+                          mb_row - 1, mb_col + o + 1, 12) +

+            get_nzc_4x4_y(cm, cur - mis + o + 1,

+                          mb_row - 1, mb_col + o + 1, 13) +

+            get_nzc_4x4_y(cm, cur - mis + o + 1,

+                          mb_row - 1, mb_col + o + 1, 14) +

+            get_nzc_4x4_y(cm, cur - mis + o + 1,

+                          mb_row - 1, mb_col + o + 1, 15);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 128] << 3;

+      }

+      if ((block & 127) == 0) {

+        int o = (block >> 7) * 2;

+        nzc_exp +=

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,

+                          mb_row + o + 1, mb_col - 1, 3) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,

+                          mb_row + o + 1, mb_col - 1, 7) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,

+                          mb_row + o + 1, mb_col - 1, 11) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis + mis,

+                          mb_row + o + 1, mb_col - 1, 15);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 64] << 3;

+      }

+      nzc_exp <<= 2;

+      // Note nzc_exp is 64 times the average value expected at 32x32 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);

+      break;

+    case TX_16X16:

+      assert((block & 15) == 0);

+      if (block < 64) {

+        int o = block >> 4;

+        nzc_exp =

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 64] << 4;

+      }

+      if ((block & 63) == 0) {

+        int o = block >> 6;

+        nzc_exp +=

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;

+      }

+      nzc_exp <<= 1;

+      // Note nzc_exp is 64 times the average value expected at 16x16 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);

+      break;

+    case TX_8X8:

+      assert((block & 3) == 0);

+      if (block < 32) {

+        int o = block >> 3;

+        int p = ((block >> 2) & 1) ? 14 : 12;

+        nzc_exp =

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 32] << 5;

+      }

+      if ((block & 31) == 0) {

+        int o = block >> 6;

+        int p = ((block >> 5) & 1) ? 11 : 3;

+        nzc_exp +=

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;

+      }

+      // Note nzc_exp is 64 times the average value expected at 8x8 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);

+      break;

+    case TX_4X4:

+      if (block < 16) {

+        int o = block >> 2;

+        int p = block & 3;

+        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                                12 + p);

+      } else {

+        nzc_exp = (cur->mbmi.nzcs[block - 16] << 6);

+      }

+      if ((block & 15) == 0) {

+        int o = block >> 6;

+        int p = (block >> 4) & 3;

+        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                                 3 + 4 * p);

+      } else {

+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);

+      }

+      nzc_exp >>= 1;

+      // Note nzc_exp is 64 times the average value expected at 4x4 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);

+      break;

+    default:

+      return 0;

+  }

-void vp9_coef_tree_initialize() {

-  init_bit_trees();

-  vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);

+int vp9_get_nzc_context_y_sb32(VP9_COMMON *cm, MODE_INFO *cur,

+                               int mb_row, int mb_col, int block) {

+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy

+  // neighboring blocks are

+  int mis = cm->mode_info_stride;

+  int nzc_exp = 0;

+  TX_SIZE txfm_size = cur->mbmi.txfm_size;

+  assert(block < 64);

+  switch (txfm_size) {

+    case TX_32X32:

+      assert(block == 0);

+      nzc_exp =

+          (get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +

+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +

+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +

+           get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +

+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 12) +

+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 13) +

+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 14) +

+           get_nzc_4x4_y(cm, cur - mis + 1, mb_row - 1, mb_col + 1, 15) +

+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +

+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +

+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +

+           get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15) +

+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 3) +

+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 7) +

+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 11) +

+           get_nzc_4x4_y(cm, cur - 1 + mis, mb_row + 1, mb_col - 1, 15)) << 2;

+      // Note nzc_exp is 64 times the average value expected at 32x32 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);

+      break;

+    case TX_16X16:

+      assert((block & 15) == 0);

+      if (block < 32) {

+        int o = (block >> 4) & 1;

+        nzc_exp =

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 12) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 13) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 14) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, 15);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;

+      }

+      if ((block & 31) == 0) {

+        int o = block >> 5;

+        nzc_exp +=

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 3) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 7) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 11) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, 15);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;

+      }

+      nzc_exp <<= 1;

+      // Note nzc_exp is 64 times the average value expected at 16x16 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);

+      break;

+    case TX_8X8:

+      assert((block & 3) == 0);

+      if (block < 16) {

+        int o = block >> 3;

+        int p = ((block >> 2) & 1) ? 14 : 12;

+        nzc_exp =

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p) +

+            get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o, p + 1);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;

+      }

+      if ((block & 15) == 0) {

+        int o = block >> 5;

+        int p = ((block >> 4) & 1) ? 11 : 3;

+        nzc_exp +=

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p) +

+            get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1, p + 4);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;

+      }

+      // Note nzc_exp is 64 times the average value expected at 8x8 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);

+      break;

+    case TX_4X4:

+      if (block < 8) {

+        int o = block >> 2;

+        int p = block & 3;

+        nzc_exp = get_nzc_4x4_y(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                                12 + p);

+      } else {

+        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);

+      }

+      if ((block & 7) == 0) {

+        int o = block >> 5;

+        int p = (block >> 3) & 3;

+        nzc_exp += get_nzc_4x4_y(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                                 3 + 4 * p);

+      } else {

+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);

+      }

+      nzc_exp >>= 1;

+      // Note nzc_exp is 64 times the average value expected at 4x4 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);

+      break;

+    default:

+      return 0;

+      break;

+  }

+int vp9_get_nzc_context_y_mb16(VP9_COMMON *cm, MODE_INFO *cur,

+                               int mb_row, int mb_col, int block) {

+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy

+  // neighboring blocks are

+  int mis = cm->mode_info_stride;

+  int nzc_exp = 0;

+  TX_SIZE txfm_size = cur->mbmi.txfm_size;

+  assert(block < 16);

+  switch (txfm_size) {

+    case TX_16X16:

+      assert(block == 0);

+      nzc_exp =

+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 12) +

+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 13) +

+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 14) +

+          get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, 15) +

+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 3) +

+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 7) +

+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 11) +

+          get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, 15);

+      nzc_exp <<= 1;

+      // Note nzc_exp is 64 times the average value expected at 16x16 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);

+    case TX_8X8:

+      assert((block & 3) == 0);

+      if (block < 8) {

+        int p = ((block >> 2) & 1) ? 14 : 12;

+        nzc_exp =

+            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p) +

+            get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col, p + 1);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;

+      }

+      if ((block & 7) == 0) {

+        int p = ((block >> 3) & 1) ? 11 : 3;

+        nzc_exp +=

+            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p) +

+            get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1, p + 4);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;

+      }

+      // Note nzc_exp is 64 times the average value expected at 8x8 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);

+    case TX_4X4:

+      if (block < 4) {

+        int p = block & 3;

+        nzc_exp = get_nzc_4x4_y(cm, cur - mis, mb_row - 1, mb_col,

+                                12 + p);

+      } else {

+        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);

+      }

+      if ((block & 3) == 0) {

+        int p = (block >> 2) & 3;

+        nzc_exp += get_nzc_4x4_y(cm, cur - 1, mb_row, mb_col - 1,

+                                 3 + 4 * p);

+      } else {

+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);

+      }

+      nzc_exp >>= 1;

+      // Note nzc_exp is 64 times the average value expected at 4x4 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);

+    default:

+      return 0;

+      break;

+  }

+}

+int vp9_get_nzc_context_uv_sb64(VP9_COMMON *cm, MODE_INFO *cur,

+                                int mb_row, int mb_col, int block) {

+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy

+  // neighboring blocks are

+  int mis = cm->mode_info_stride;

+  int nzc_exp = 0;

+  const int base = block - (block & 63);

+  const int boff = (block & 63);

+  const int base_mb16 = base >> 4;

+  TX_SIZE txfm_size = cur->mbmi.txfm_size;

+  TX_SIZE txfm_size_uv;

+  assert(block >= 256 && block < 384);

+  txfm_size_uv = txfm_size;

+  switch (txfm_size_uv) {

+    case TX_32X32:

+      assert(block == 256 || block == 320);

+      nzc_exp =

+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,

+                         base_mb16 + 2) +

+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,

+                         base_mb16 + 2) +

+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,

+                         base_mb16 + 2) +

+          get_nzc_4x4_uv(cm, cur - mis + 2, mb_row - 1, mb_col + 2,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,

+                         base_mb16 + 2) +

+          get_nzc_4x4_uv(cm, cur - mis + 3, mb_row - 1, mb_col + 3,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,

+                         base_mb16 + 1) +

+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,

+                         base_mb16 + 1) +

+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,

+                         base_mb16 + 1) +

+          get_nzc_4x4_uv(cm, cur - 1 + 2 * mis, mb_row + 2, mb_col - 1,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,

+                         base_mb16 + 1) +

+          get_nzc_4x4_uv(cm, cur - 1 + 3 * mis, mb_row + 3, mb_col - 1,

+                         base_mb16 + 3);

+      nzc_exp <<= 2;

+      // Note nzc_exp is 64 times the average value expected at 32x32 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_32X32, NZC_T1_32X32);

+    case TX_16X16:

+      // uv txfm_size 16x16

+      assert((block & 15) == 0);

+      if (boff < 32) {

+        int o = (boff >> 4) & 1;

+        nzc_exp =

+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                           base_mb16 + 2) +

+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                           base_mb16 + 3) +

+            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,

+                           base_mb16 + 2) +

+            get_nzc_4x4_uv(cm, cur - mis + o + 1, mb_row - 1, mb_col + o + 1,

+                           base_mb16 + 3);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 32] << 4;

+      }

+      if ((boff & 31) == 0) {

+        int o = boff >> 5;

+        nzc_exp +=

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis,

+                           mb_row + o, mb_col - 1, base_mb16 + 1) +

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis,

+                           mb_row + o, mb_col - 1, base_mb16 + 3) +

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,

+                           mb_row + o + 1, mb_col - 1, base_mb16 + 1) +

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis + mis,

+                           mb_row + o + 1, mb_col - 1, base_mb16 + 3);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 16] << 4;

+      }

+      nzc_exp <<= 1;

+      // Note nzc_exp is 64 times the average value expected at 16x16 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);

+    case TX_8X8:

+      assert((block & 3) == 0);

+      if (boff < 16) {

+        int o = boff >> 2;

+        nzc_exp =

+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                           base_mb16 + 2) +

+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                           base_mb16 + 3);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 16] << 5;

+      }

+      if ((boff & 15) == 0) {

+        int o = boff >> 4;

+        nzc_exp +=

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                           base_mb16 + 1) +

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                           base_mb16 + 3);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;

+      }

+      // Note nzc_exp is 64 times the average value expected at 8x8 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);

+    case TX_4X4:

+      if (boff < 8) {

+        int o = boff >> 1;

+        int p = boff & 1;

+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                                 base_mb16 + 2 + p);

+      } else {

+        nzc_exp = (cur->mbmi.nzcs[block - 8] << 6);

+      }

+      if ((boff & 7) == 0) {

+        int o = boff >> 4;

+        int p = (boff >> 3) & 1;

+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                                  base_mb16 + 1 + 2 * p);

+      } else {

+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);

+      }

+      nzc_exp >>= 1;

+      // Note nzc_exp is 64 times the average value expected at 4x4 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);

+    default:

+      return 0;

+  }

+}

+int vp9_get_nzc_context_uv_sb32(VP9_COMMON *cm, MODE_INFO *cur,

+                                int mb_row, int mb_col, int block) {

+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy

+  // neighboring blocks are

+  int mis = cm->mode_info_stride;

+  int nzc_exp = 0;

+  const int base = block - (block & 15);

+  const int boff = (block & 15);

+  const int base_mb16 = base >> 2;

+  TX_SIZE txfm_size = cur->mbmi.txfm_size;

+  TX_SIZE txfm_size_uv;

+  assert(block >= 64 && block < 96);

+  if (txfm_size == TX_32X32)

+    txfm_size_uv = TX_16X16;

+  else

+    txfm_size_uv = txfm_size;

+  switch (txfm_size_uv) {

+    case TX_16X16:

+      // uv txfm_size 16x16

+      assert(block == 64 || block == 80);

+      nzc_exp =

+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,

+                         base_mb16 + 2) +

+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,

+                         base_mb16 + 2) +

+          get_nzc_4x4_uv(cm, cur - mis + 1, mb_row - 1, mb_col + 1,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,

+                         base_mb16 + 1) +

+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row, mb_col - 1,

+                         base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,

+                         base_mb16 + 1) +

+          get_nzc_4x4_uv(cm, cur - 1 + mis, mb_row + 1, mb_col - 1,

+                         base_mb16 + 3);

+      nzc_exp <<= 1;

+      // Note nzc_exp is 64 times the average value expected at 16x16 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_16X16, NZC_T1_16X16);

+      break;

+    case TX_8X8:

+      assert((block & 3) == 0);

+      if (boff < 8) {

+        int o = boff >> 2;

+        nzc_exp =

+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                           base_mb16 + 2) +

+            get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                           base_mb16 + 3);

+      } else {

+        nzc_exp = cur->mbmi.nzcs[block - 8] << 5;

+      }

+      if ((boff & 7) == 0) {

+        int o = boff >> 3;

+        nzc_exp +=

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                           base_mb16 + 1) +

+            get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                           base_mb16 + 3);

+      } else {

+        nzc_exp += cur->mbmi.nzcs[block - 4] << 5;

+      }

+      // Note nzc_exp is 64 times the average value expected at 8x8 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);

+    case TX_4X4:

+      if (boff < 4) {

+        int o = boff >> 1;

+        int p = boff & 1;

+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis + o, mb_row - 1, mb_col + o,

+                                 base_mb16 + 2 + p);

+      } else {

+        nzc_exp = (cur->mbmi.nzcs[block - 4] << 6);

+      }

+      if ((boff & 3) == 0) {

+        int o = boff >> 3;

+        int p = (boff >> 2) & 1;

+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1 + o * mis, mb_row + o, mb_col - 1,

+                                  base_mb16 + 1 + 2 * p);

+      } else {

+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);

+      }

+      nzc_exp >>= 1;

+      // Note nzc_exp is 64 times the average value expected at 4x4 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);

+    default:

+      return 0;

+  }

+}

+int vp9_get_nzc_context_uv_mb16(VP9_COMMON *cm, MODE_INFO *cur,

+                                int mb_row, int mb_col, int block) {

+  // returns an index in [0, MAX_NZC_CONTEXTS - 1] to reflect how busy

+  // neighboring blocks are

+  int mis = cm->mode_info_stride;

+  int nzc_exp = 0;

+  const int base = block - (block & 3);

+  const int boff = (block & 3);

+  const int base_mb16 = base;

+  TX_SIZE txfm_size = cur->mbmi.txfm_size;

+  TX_SIZE txfm_size_uv;

+  assert(block >= 16 && block < 24);

+  if (txfm_size == TX_16X16)

+    txfm_size_uv = TX_8X8;

+  else if (txfm_size == TX_8X8 &&

+           (cur->mbmi.mode == I8X8_PRED || cur->mbmi.mode == SPLITMV))

+    txfm_size_uv = TX_4X4;

+  else

+    txfm_size_uv = txfm_size;

+  switch (txfm_size_uv) {

+    case TX_8X8:

+      assert((block & 3) == 0);

+      nzc_exp =

+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 2) +

+          get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col, base_mb16 + 3) +

+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 1) +

+          get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1, base_mb16 + 3);

+      // Note nzc_exp is 64 times the average value expected at 8x8 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_8X8, NZC_T1_8X8);

+    case TX_4X4:

+      if (boff < 2) {

+        int p = boff & 1;

+        nzc_exp = get_nzc_4x4_uv(cm, cur - mis, mb_row - 1, mb_col,

+                                 base_mb16 + 2 + p);

+      } else {

+        nzc_exp = (cur->mbmi.nzcs[block - 2] << 6);

+      }

+      if ((boff & 1) == 0) {

+        int p = (boff >> 1) & 1;

+        nzc_exp += get_nzc_4x4_uv(cm, cur - 1, mb_row, mb_col - 1,

+                                  base_mb16 + 1 + 2 * p);

+      } else {

+        nzc_exp += (cur->mbmi.nzcs[block - 1] << 6);

+      }

+      nzc_exp >>= 1;

+      // Note nzc_exp is 64 times the average value expected at 4x4 scale

+      return choose_nzc_context(nzc_exp, NZC_T2_4X4, NZC_T1_4X4);

+    default:

+      return 0;

+  }

+}

+int vp9_get_nzc_context(VP9_COMMON *cm, MACROBLOCKD *xd, int block) {

+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

+    assert(block < 384);

+    if (block < 256)

+      return vp9_get_nzc_context_y_sb64(cm, xd->mode_info_context,

+                                        get_mb_row(xd), get_mb_col(xd), block);

+    else

+      return vp9_get_nzc_context_uv_sb64(cm, xd->mode_info_context,

+                                         get_mb_row(xd), get_mb_col(xd), block);

+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {

+    assert(block < 96);

+    if (block < 64)

+      return vp9_get_nzc_context_y_sb32(cm, xd->mode_info_context,

+                                        get_mb_row(xd), get_mb_col(xd), block);

+    else

+      return vp9_get_nzc_context_uv_sb32(cm, xd->mode_info_context,

+                                         get_mb_row(xd), get_mb_col(xd), block);

+  } else {

+    assert(block < 64);

+    if (block < 16)

+      return vp9_get_nzc_context_y_mb16(cm, xd->mode_info_context,

+                                        get_mb_row(xd), get_mb_col(xd), block);

+    else

+      return vp9_get_nzc_context_uv_mb16(cm, xd->mode_info_context,

+                                         get_mb_row(xd), get_mb_col(xd), block);

+  }

+}

+static void update_nzc(VP9_COMMON *cm,

+                       uint16_t nzc,

+                       int nzc_context,

+                       TX_SIZE tx_size,

+                       int ref,

+                       int type) {

+  int e, c;

+  c = codenzc(nzc);

+  if (tx_size == TX_32X32)

+    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;

+  else if (tx_size == TX_16X16)

+    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;

+  else if (tx_size == TX_8X8)

+    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;

+  else if (tx_size == TX_4X4)

+    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;

+  else

+    assert(0);

+  if ((e = vp9_extranzcbits[c])) {

+    int x = nzc - vp9_basenzcvalue[c];

+    while (e--) {

+      int b = (x >> e) & 1;

+      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;

+    }

+  }

+}

+static void update_nzcs_sb64(VP9_COMMON *cm,

+                             MACROBLOCKD *xd,

+                             int mb_row,

+                             int mb_col) {

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_32X32:

+      for (j = 0; j < 256; j += 64) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);

+      }

+      for (j = 256; j < 384; j += 64) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1);

+      }

+      break;

+    case TX_16X16:

+      for (j = 0; j < 256; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);

+      }

+      for (j = 256; j < 384; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 256; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);

+      }

+      for (j = 256; j < 384; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 256; ++j) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);

+      }

+      for (j = 256; j < 384; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);

+      }

+      break;

+    default:

+      break;

+  }

+}

+static void update_nzcs_sb32(VP9_COMMON *cm,

+                            MACROBLOCKD *xd,

+                            int mb_row,

+                            int mb_col) {

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_32X32:

+      for (j = 0; j < 64; j += 64) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0);

+      }

+      for (j = 64; j < 96; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);

+      }

+      break;

+    case TX_16X16:

+      for (j = 0; j < 64; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);

+      }

+      for (j = 64; j < 96; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 64; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);

+      }

+      for (j = 64; j < 96; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 64; ++j) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);

+      }

+      for (j = 64; j < 96; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);

+      }

+      break;

+    default:

+      break;

+  }

+}

+static void update_nzcs_mb16(VP9_COMMON *cm,

+                             MACROBLOCKD *xd,

+                             int mb_row,

+                             int mb_col) {

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_16X16:

+      for (j = 0; j < 16; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0);

+      }

+      for (j = 16; j < 24; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 16; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0);

+      }

+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {

+        for (j = 16; j < 24; ++j) {

+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);

+        }

+      } else {

+        for (j = 16; j < 24; j += 4) {

+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+          update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1);

+        }

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 16; ++j) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0);

+      }

+      for (j = 16; j < 24; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+        update_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1);

+      }

+      break;

+    default:

+      break;

+  }

+}

+void vp9_update_nzc_counts(VP9_COMMON *cm,

+                           MACROBLOCKD *xd,

+                           int mb_row,

+                           int mb_col) {

+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64)

+    update_nzcs_sb64(cm, xd, mb_row, mb_col);

+  else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32)

+    update_nzcs_sb32(cm, xd, mb_row, mb_col);

+  else

+    update_nzcs_mb16(cm, xd, mb_row, mb_col);

+}

+#endif  // CONFIG_CODE_NONZEROCOUNT

 // #define COEF_COUNT_TESTING

 #define COEF_COUNT_SAT 24

@@ -1111,42 +3525,55 @@

 #define COEF_COUNT_SAT_AFTER_KEY 24

 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128

-static void update_coef_probs(vp9_coeff_probs *dst_coef_probs,

-                              vp9_coeff_probs *pre_coef_probs,

-                              int block_types, vp9_coeff_count *coef_counts,

-                              int count_sat, int update_factor) {

-  int t, i, j, k, count;

+static void adapt_coef_probs(vp9_coeff_probs *dst_coef_probs,

+                             vp9_coeff_probs *pre_coef_probs,

+                             int block_types, vp9_coeff_count *coef_counts,

+                             unsigned int (*eob_branch_count)[REF_TYPES]

+                                                             [COEF_BANDS]

+                                                      [PREV_COEF_CONTEXTS],

+                             int count_sat, int update_factor) {

+  int t, i, j, k, l, count;

   unsigned int branch_ct[ENTROPY_NODES][2];

   vp9_prob coef_probs[ENTROPY_NODES];

   int factor;

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT

+  int entropy_nodes_adapt = UNCONSTRAINED_ADAPT_NODES;

+#else

+  int entropy_nodes_adapt = ENTROPY_NODES;

+#endif

   for (i = 0; i < block_types; ++i)

-    for (j = 0; j < COEF_BANDS; ++j)

-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

-          continue;

-        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,

-                                         vp9_coef_encodings, vp9_coef_tree,

-                                         coef_probs, branch_ct,

-                                         coef_counts[i][j][k]);

-        for (t = 0; t < ENTROPY_NODES; ++t) {

-          count = branch_ct[t][0] + branch_ct[t][1];

-          count = count > count_sat ? count_sat : count;

-          factor = (update_factor * count / count_sat);

-          dst_coef_probs[i][j][k][t] = weighted_prob(pre_coef_probs[i][j][k][t],

-                                                     coef_probs[t], factor);

+    for (j = 0; j < REF_TYPES; ++j)

+      for (k = 0; k < COEF_BANDS; ++k)

+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {

+          if (l >= 3 && k == 0)

+            continue;

+          vp9_tree_probs_from_distribution(vp9_coef_tree,

+                                           coef_probs, branch_ct,

+                                           coef_counts[i][j][k][l], 0);

+          branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];

+          coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);

+          for (t = 0; t < entropy_nodes_adapt; ++t) {

+            count = branch_ct[t][0] + branch_ct[t][1];

+            count = count > count_sat ? count_sat : count;

+            factor = (update_factor * count / count_sat);

+            dst_coef_probs[i][j][k][l][t] =

+                weighted_prob(pre_coef_probs[i][j][k][l][t],

+                              coef_probs[t], factor);

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_ADAPT

+            if (t == UNCONSTRAINED_NODES - 1)

+              vp9_get_model_distribution(

+                  dst_coef_probs[i][j][k][l][UNCONSTRAINED_NODES - 1],

+                  dst_coef_probs[i][j][k][l], i, j);

+#endif

+          }

-      }

 void vp9_adapt_coef_probs(VP9_COMMON *cm) {

-#ifdef COEF_COUNT_TESTING

-  int t, i, j, k;

-#endif

   int count_sat;

   int update_factor; /* denominator 256 */

-  // printf("Frame type: %d\n", cm->frame_type);

   if (cm->frame_type == KEY_FRAME) {

     update_factor = COEF_MAX_UPDATE_FACTOR_KEY;

     count_sat = COEF_COUNT_SAT_KEY;

@@ -1158,87 +3585,141 @@

     count_sat = COEF_COUNT_SAT;

-#ifdef COEF_COUNT_TESTING

-  {

-    printf("static const unsigned int\ncoef_counts"

-           "[BLOCK_TYPES] [COEF_BANDS]"

-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");

-    for (i = 0; i < BLOCK_TYPES; ++i) {

-      printf("  {\n");

-      for (j = 0; j < COEF_BANDS; ++j) {

-        printf("    {\n");

-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

-          printf("      {");

-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

-            printf("%d, ", cm->fc.coef_counts[i][j][k][t]);

-          printf("},\n");

+  adapt_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,

+                   BLOCK_TYPES, cm->fc.coef_counts_4x4,

+                   cm->fc.eob_branch_counts[TX_4X4],

+                   count_sat, update_factor);

+  adapt_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,

+                   BLOCK_TYPES, cm->fc.coef_counts_8x8,

+                   cm->fc.eob_branch_counts[TX_8X8],

+                   count_sat, update_factor);

+  adapt_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,

+                   BLOCK_TYPES, cm->fc.coef_counts_16x16,

+                   cm->fc.eob_branch_counts[TX_16X16],

+                   count_sat, update_factor);

+  adapt_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,

+                   BLOCK_TYPES, cm->fc.coef_counts_32x32,

+                   cm->fc.eob_branch_counts[TX_32X32],

+                   count_sat, update_factor);

+}

+#if CONFIG_CODE_NONZEROCOUNT

+static void adapt_nzc_probs(VP9_COMMON *cm,

+                            int block_size,

+                            int count_sat,

+                            int update_factor) {

+  int c, r, b, n;

+  int count, factor;

+  unsigned int nzc_branch_ct[NZC32X32_NODES][2];

+  vp9_prob nzc_probs[NZC32X32_NODES];

+  int tokens, nodes;

+  const vp9_tree_index *nzc_tree;

+  vp9_prob *dst_nzc_probs;

+  vp9_prob *pre_nzc_probs;

+  unsigned int *nzc_counts;

+  if (block_size == 32) {

+    tokens = NZC32X32_TOKENS;

+    nzc_tree = vp9_nzc32x32_tree;

+    dst_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];

+    pre_nzc_probs = cm->fc.pre_nzc_probs_32x32[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];

+  } else if (block_size == 16) {

+    tokens = NZC16X16_TOKENS;

+    nzc_tree = vp9_nzc16x16_tree;

+    dst_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];

+    pre_nzc_probs = cm->fc.pre_nzc_probs_16x16[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];

+  } else if (block_size == 8) {

+    tokens = NZC8X8_TOKENS;

+    nzc_tree = vp9_nzc8x8_tree;

+    dst_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];

+    pre_nzc_probs = cm->fc.pre_nzc_probs_8x8[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];

+  } else {

+    nzc_tree = vp9_nzc4x4_tree;

+    tokens = NZC4X4_TOKENS;

+    dst_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];

+    pre_nzc_probs = cm->fc.pre_nzc_probs_4x4[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];

+  }

+  nodes = tokens - 1;

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)

+    for (r = 0; r < REF_TYPES; ++r)

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;

+        int offset_nodes = offset * nodes;

+        int offset_tokens = offset * tokens;

+        vp9_tree_probs_from_distribution(nzc_tree,

+                                         nzc_probs, nzc_branch_ct,

+                                         nzc_counts + offset_tokens, 0);

+        for (n = 0; n < nodes; ++n) {

+          count = nzc_branch_ct[n][0] + nzc_branch_ct[n][1];

+          count = count > count_sat ? count_sat : count;

+          factor = (update_factor * count / count_sat);

+          dst_nzc_probs[offset_nodes + n] =

+              weighted_prob(pre_nzc_probs[offset_nodes + n],

+                            nzc_probs[n], factor);

-        printf("    },\n");

-      printf("  },\n");

-    }

-    printf("};\n");

-    printf("static const unsigned int\ncoef_counts_8x8"

-           "[BLOCK_TYPES_8X8] [COEF_BANDS]"

-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");

-    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {

-      printf("  {\n");

-      for (j = 0; j < COEF_BANDS; ++j) {

-        printf("    {\n");

-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

-          printf("      {");

-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

-            printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]);

-          printf("},\n");

-        }

-        printf("    },\n");

+}

+static void adapt_nzc_pcat(VP9_COMMON *cm, int count_sat, int update_factor) {

+  int c, t;

+  int count, factor;

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {

+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];

+      int b;

+      for (b = 0; b < bits; ++b) {

+        vp9_prob prob = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],

+                                        cm->fc.nzc_pcat_counts[c][t][b][1]);

+        count = cm->fc.nzc_pcat_counts[c][t][b][0] +

+                cm->fc.nzc_pcat_counts[c][t][b][1];

+        count = count > count_sat ? count_sat : count;

+        factor = (update_factor * count / count_sat);

+        cm->fc.nzc_pcat_probs[c][t][b] = weighted_prob(

+            cm->fc.pre_nzc_pcat_probs[c][t][b], prob, factor);

-      printf("  },\n");

-    printf("};\n");

-    printf("static const unsigned int\nhybrid_coef_counts"

-           "[BLOCK_TYPES] [COEF_BANDS]"

-           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n");

-    for (i = 0; i < BLOCK_TYPES; ++i) {

-      printf("  {\n");

-      for (j = 0; j < COEF_BANDS; ++j) {

-        printf("    {\n");

-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

-          printf("      {");

-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

-            printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]);

-          printf("},\n");

+  }

+}

+// #define NZC_COUNT_TESTING

+void vp9_adapt_nzc_probs(VP9_COMMON *cm) {

+  int count_sat;

+  int update_factor; /* denominator 256 */

+#ifdef NZC_COUNT_TESTING

+  int c, r, b, t;

+  printf("\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c)

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        printf("    {");

+        for (t = 0; t < NZC4X4_TOKENS; ++t) {

+          printf(" %d,", cm->fc.nzc_counts_4x4[c][r][b][t]);

-        printf("    },\n");

+        printf("}\n");

-      printf("  },\n");

+      printf("\n");

-    printf("};\n");

-  }

 #endif

-  update_coef_probs(cm->fc.coef_probs_4x4, cm->fc.pre_coef_probs_4x4,

-                    BLOCK_TYPES_4X4, cm->fc.coef_counts_4x4,

-                    count_sat, update_factor);

-  update_coef_probs(cm->fc.hybrid_coef_probs_4x4,

-                    cm->fc.pre_hybrid_coef_probs_4x4,

-                    BLOCK_TYPES_4X4, cm->fc.hybrid_coef_counts_4x4,

-                    count_sat, update_factor);

-  update_coef_probs(cm->fc.coef_probs_8x8, cm->fc.pre_coef_probs_8x8,

-                    BLOCK_TYPES_8X8, cm->fc.coef_counts_8x8,

-                    count_sat, update_factor);

-  update_coef_probs(cm->fc.hybrid_coef_probs_8x8,

-                    cm->fc.pre_hybrid_coef_probs_8x8,

-                    BLOCK_TYPES_8X8, cm->fc.hybrid_coef_counts_8x8,

-                    count_sat, update_factor);

-  update_coef_probs(cm->fc.coef_probs_16x16, cm->fc.pre_coef_probs_16x16,

-                    BLOCK_TYPES_16X16, cm->fc.coef_counts_16x16,

-                    count_sat, update_factor);

-  update_coef_probs(cm->fc.hybrid_coef_probs_16x16,

-                    cm->fc.pre_hybrid_coef_probs_16x16,

-                    BLOCK_TYPES_16X16, cm->fc.hybrid_coef_counts_16x16,

-                    count_sat, update_factor);

-  update_coef_probs(cm->fc.coef_probs_32x32, cm->fc.pre_coef_probs_32x32,

-                    BLOCK_TYPES_32X32, cm->fc.coef_counts_32x32,

-                    count_sat, update_factor);

+  if (cm->frame_type == KEY_FRAME) {

+    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;

+    count_sat = COEF_COUNT_SAT_KEY;

+  } else if (cm->last_frame_type == KEY_FRAME) {

+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */

+    count_sat = COEF_COUNT_SAT_AFTER_KEY;

+  } else {

+    update_factor = COEF_MAX_UPDATE_FACTOR;

+    count_sat = COEF_COUNT_SAT;

+  }

+  adapt_nzc_probs(cm, 4, count_sat, update_factor);

+  adapt_nzc_probs(cm, 8, count_sat, update_factor);

+  adapt_nzc_probs(cm, 16, count_sat, update_factor);

+  adapt_nzc_probs(cm, 32, count_sat, update_factor);

+  adapt_nzc_pcat(cm, count_sat, update_factor);

+#endif  // CONFIG_CODE_NONZEROCOUNT

--- a/vp9/common/vp9_entropy.h

+++ b/vp9/common/vp9_entropy.h

@@ -15,7 +15,6 @@

 #include "vp9/common/vp9_treecoder.h"

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_common.h"

-#include "vp9/common/vp9_coefupdateprobs.h"

 extern const int vp9_i8x8_block[4];

@@ -31,10 +30,10 @@

 #define DCT_VAL_CATEGORY3       7       /* 11-18     Extra Bits 3+1 */

 #define DCT_VAL_CATEGORY4       8       /* 19-34     Extra Bits 4+1 */

 #define DCT_VAL_CATEGORY5       9       /* 35-66     Extra Bits 5+1 */

-#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 13+1 */

+#define DCT_VAL_CATEGORY6       10      /* 67+       Extra Bits 14+1 */

 #define DCT_EOB_TOKEN           11      /* EOB       Extra Bits 0+0 */

-#define MAX_ENTROPY_TOKENS 12

-#define ENTROPY_NODES 11

+#define MAX_ENTROPY_TOKENS      12

+#define ENTROPY_NODES           11

 #define EOSB_TOKEN              127     /* Not signalled, encoder only */

 #define INTER_MODE_CONTEXTS     7

@@ -59,31 +58,20 @@

 /* Coefficients are predicted via a 3-dimensional probability table. */

-/* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */

-#define BLOCK_TYPES_4X4 4

+/* Outside dimension.  0 = Y with DC, 1 = UV */

+#define BLOCK_TYPES 2

+#define REF_TYPES 2  // intra=0, inter=1

-#define BLOCK_TYPES_8X8 4

+/* Middle dimension reflects the coefficient position within the transform. */

+#define COEF_BANDS 6

-#define BLOCK_TYPES_16X16 4

-#define BLOCK_TYPES_32X32 4

-/* Middle dimension is a coarsening of the coefficient's

-   position within the 4x4 DCT. */

-#define COEF_BANDS 8

-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_4x4[16]);

-extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]);

-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]);

-extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]);

-/* Inside dimension is 3-valued measure of nearby complexity, that is,

-   the extent to which nearby coefficients are nonzero.  For the first

-   coefficient (DC, unless block type is 0), we look at the (already encoded)

-   blocks above and to the left of the current block.  The context index is

-   then the number (0,1,or 2) of these blocks having nonzero coefficients.

-   After decoding a coefficient, the measure is roughly the size of the

-   most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1).

+/* Inside dimension is measure of nearby complexity, that reflects the energy

+   of nearby coefficients are nonzero.  For the first coefficient (DC, unless

+   block type is 0), we look at the (already encoded) blocks above and to the

+   left of the current block.  The context index is then the number (0,1,or 2)

+   of these blocks having nonzero coefficients.

+   After decoding a coefficient, the measure is determined by the size of the

+   most recently decoded coefficient.

    Note that the intuitive meaning of this measure changes as coefficients

    are decoded, e.g., prior to the first token, a zero means that my neighbors

    are empty while, after the first token, because of the use of end-of-block,

@@ -94,21 +82,18 @@

    distinct bands). */

 /*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */

-#define PREV_COEF_CONTEXTS          4

+#define PREV_COEF_CONTEXTS          6

-typedef unsigned int vp9_coeff_count[COEF_BANDS][PREV_COEF_CONTEXTS]

+typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]

                                     [MAX_ENTROPY_TOKENS];

-typedef unsigned int vp9_coeff_stats[COEF_BANDS][PREV_COEF_CONTEXTS]

+typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]

                                     [ENTROPY_NODES][2];

-typedef vp9_prob vp9_coeff_probs[COEF_BANDS][PREV_COEF_CONTEXTS]

+typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]

                                 [ENTROPY_NODES];

 #define SUBEXP_PARAM                4   /* Subexponential code parameter */

 #define MODULUS_PARAM               13  /* Modulus parameter */

-extern DECLARE_ALIGNED(16, const uint8_t,

-                       vp9_prev_token_class[MAX_ENTROPY_TOKENS]);

 struct VP9Common;

 void vp9_default_coef_probs(struct VP9Common *);

 extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_4x4[16]);

@@ -117,38 +102,168 @@

 extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]);

 extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]);

+extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]);

+extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]);

 extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]);

+extern DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]);

+extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]);

 extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]);

 void vp9_coef_tree_initialize(void);

 void vp9_adapt_coef_probs(struct VP9Common *);

-static void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {

+static INLINE void vp9_reset_mb_tokens_context(MACROBLOCKD* const xd) {

   /* Clear entropy contexts */

   vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));

   vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));

-#if CONFIG_NEWCOEFCONTEXT

+static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd) {

+  /* Clear entropy contexts */

+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);

+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);

+}

-#define MAX_NEIGHBORS 5

-#define NEWCOEFCONTEXT_BAND_COND(b)   ((b) >= 1)

-void vp9_init_neighbors(void);

+static INLINE void vp9_reset_sb64_tokens_context(MACROBLOCKD* const xd) {

+  /* Clear entropy contexts */

+  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);

+  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 4);

+}

-const int *vp9_get_coef_neighbors_handle(const int *scan);

-int vp9_get_coef_neighbor_context(const short int *qcoeff_ptr, int nodc,

-                                  const int *neigbor_handle, int rc);

-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_4x4_neighbors[

-                       16 * MAX_NEIGHBORS]);

-extern DECLARE_ALIGNED(16, int, vp9_row_scan_4x4_neighbors[

-                       16 * MAX_NEIGHBORS]);

-extern DECLARE_ALIGNED(16, int, vp9_col_scan_4x4_neighbors[

-                       16 * MAX_NEIGHBORS]);

-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_8x8_neighbors[

-                       64 * MAX_NEIGHBORS]);

-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_16x16_neighbors[

-                       256 * MAX_NEIGHBORS]);

-extern DECLARE_ALIGNED(16, int, vp9_default_zig_zag1d_32x32_neighbors[

-                       1024 * MAX_NEIGHBORS]);

-#endif  // CONFIG_NEWCOEFCONTEXT

+extern const int vp9_coef_bands8x8[64];

+extern const int vp9_coef_bands4x4[16];

+static int get_coef_band(const int *scan, TX_SIZE tx_size, int coef_index) {

+  if (tx_size == TX_4X4) {

+    return vp9_coef_bands4x4[scan[coef_index]];

+  } else {

+    const int pos = scan[coef_index];

+    const int sz = 1 << (2 + tx_size);

+    const int x = pos & (sz - 1), y = pos >> (2 + tx_size);

+    if (x >= 8 || y >= 8)

+      return 5;

+    else

+      return vp9_coef_bands8x8[y * 8 + x];

+  }

+}

+extern int vp9_get_coef_context(const int *scan, const int *neighbors,

+                                int nb_pad, uint8_t *token_cache, int c, int l);

+const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);

+#if CONFIG_MODELCOEFPROB

+#define COEFPROB_BITS               8

+#define COEFPROB_MODELS             (1 << COEFPROB_BITS)

+// 2 => EOB and Zero nodes are unconstrained, rest are modeled

+// 3 => EOB, Zero and One nodes are unconstrained, rest are modeled

+#define UNCONSTRAINED_NODES         3   // Choose one of 2 or 3

+// whether forward updates are model-based

+#define MODEL_BASED_UPDATE          0

+// if model-based how many nodes are unconstrained

+#define UNCONSTRAINED_UPDATE_NODES  3

+// whether backward updates are model-based

+#define MODEL_BASED_ADAPT           0

+#define UNCONSTRAINED_ADAPT_NODES   3

+// whether to adjust the coef probs for key frames based on qindex

+#define ADJUST_KF_COEF_PROBS        0

+typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]

+                                      [PREV_COEF_CONTEXTS][2];

+extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1];

+void vp9_get_model_distribution(vp9_prob model, vp9_prob *tree_probs,

+                                int b, int r);

+void vp9_adjust_default_coef_probs(struct VP9Common *cm);

+#endif  // CONFIG_MODELCOEFPROB

+#if CONFIG_CODE_NONZEROCOUNT

+/* Alphabet for number of non-zero symbols in block */

+#define NZC_0                   0       /* Used for all blocks */

+#define NZC_1                   1       /* Used for all blocks */

+#define NZC_2                   2       /* Used for all blocks */

+#define NZC_3TO4                3       /* Used for all blocks */

+#define NZC_5TO8                4       /* Used for all blocks */

+#define NZC_9TO16               5       /* Used for all blocks */

+#define NZC_17TO32              6       /* Used for 8x8 and larger blocks */

+#define NZC_33TO64              7       /* Used for 8x8 and larger blocks */

+#define NZC_65TO128             8       /* Used for 16x16 and larger blocks */

+#define NZC_129TO256            9       /* Used for 16x16 and larger blocks */

+#define NZC_257TO512           10       /* Used for 32x32 and larger blocks */

+#define NZC_513TO1024          11       /* Used for 32x32 and larger blocks */

+/* Number of tokens for each block size */

+#define NZC4X4_TOKENS           6

+#define NZC8X8_TOKENS           8

+#define NZC16X16_TOKENS        10

+#define NZC32X32_TOKENS        12

+/* Number of nodes for each block size */

+#define NZC4X4_NODES            5

+#define NZC8X8_NODES            7

+#define NZC16X16_NODES          9

+#define NZC32X32_NODES         11

+/* Max number of tokens with extra bits */

+#define NZC_TOKENS_EXTRA        9

+/* Max number of extra bits */

+#define NZC_BITS_EXTRA          9

+/* Tokens without extra bits */

+#define NZC_TOKENS_NOEXTRA      (NZC32X32_TOKENS - NZC_TOKENS_EXTRA)

+#define MAX_NZC_CONTEXTS        3

+/* whether to update extra bit probabilities */

+#define NZC_PCAT_UPDATE

+/* nzc trees */

+extern const vp9_tree_index    vp9_nzc4x4_tree[];

+extern const vp9_tree_index    vp9_nzc8x8_tree[];

+extern const vp9_tree_index    vp9_nzc16x16_tree[];

+extern const vp9_tree_index    vp9_nzc32x32_tree[];

+/* nzc encodings */

+extern struct vp9_token_struct  vp9_nzc4x4_encodings[NZC4X4_TOKENS];

+extern struct vp9_token_struct  vp9_nzc8x8_encodings[NZC8X8_TOKENS];

+extern struct vp9_token_struct  vp9_nzc16x16_encodings[NZC16X16_TOKENS];

+extern struct vp9_token_struct  vp9_nzc32x32_encodings[NZC32X32_TOKENS];

+#define codenzc(x) (\

+  (x) <= 3 ? (x) : (x) <= 4 ? 3 : (x) <= 8 ? 4 : \

+  (x) <= 16 ? 5 : (x) <= 32 ? 6 : (x) <= 64 ? 7 :\

+  (x) <= 128 ? 8 : (x) <= 256 ? 9 : (x) <= 512 ? 10 : 11)

+int vp9_get_nzc_context_y_sb64(struct VP9Common *cm, MODE_INFO *cur,

+                               int mb_row, int mb_col, int block);

+int vp9_get_nzc_context_y_sb32(struct VP9Common *cm, MODE_INFO *cur,

+                               int mb_row, int mb_col, int block);

+int vp9_get_nzc_context_y_mb16(struct VP9Common *cm, MODE_INFO *cur,

+                               int mb_row, int mb_col, int block);

+int vp9_get_nzc_context_uv_sb64(struct VP9Common *cm, MODE_INFO *cur,

+                                int mb_row, int mb_col, int block);

+int vp9_get_nzc_context_uv_sb32(struct VP9Common *cm, MODE_INFO *cur,

+                                int mb_row, int mb_col, int block);

+int vp9_get_nzc_context_uv_mb16(struct VP9Common *cm, MODE_INFO *cur,

+                                int mb_row, int mb_col, int block);

+int vp9_get_nzc_context(struct VP9Common *cm, MACROBLOCKD *xd, int block);

+void vp9_update_nzc_counts(struct VP9Common *cm, MACROBLOCKD *xd,

+                           int mb_row, int mb_col);

+void vp9_adapt_nzc_probs(struct VP9Common *cm);

+/* Extra bits array */

+extern const int vp9_extranzcbits[NZC32X32_TOKENS];

+/* Base nzc values */

+extern const int vp9_basenzcvalue[NZC32X32_TOKENS];

+#endif  // CONFIG_CODE_NONZEROCOUNT

+#include "vp9/common/vp9_coefupdateprobs.h"

 #endif  // VP9_COMMON_VP9_ENTROPY_H_

--- a/vp9/common/vp9_entropymode.c

+++ b/vp9/common/vp9_entropymode.c

@@ -11,9 +11,10 @@

 #include "vp9/common/vp9_onyxc_int.h"

 #include "vp9/common/vp9_modecont.h"

+#include "vp9/common/vp9_seg_common.h"

+#include "vp9/common/vp9_alloccommon.h"

 #include "vpx_mem/vpx_mem.h"

 static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = {

   /* DC V   H  D45 135 117 153 D27 D63 TM i8x8 BPRED */

   {12,  6,  5,  5,  5,  5,  5,  5,  5,  2, 22, 200},

@@ -114,8 +115,6 @@

   return SUBMVREF_NORMAL;

-const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25};

 const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = {

   { 147, 136, 18 },

   { 106, 145, 1  },

@@ -301,40 +300,32 @@

 void vp9_init_mbmode_probs(VP9_COMMON *x) {

   unsigned int bct [VP9_YMODES] [2];      /* num Ymodes > num UV modes */

-  vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings,

-                                   vp9_ymode_tree, x->fc.ymode_prob,

-                                   bct, y_mode_cts);

-  vp9_tree_probs_from_distribution(VP9_I32X32_MODES, vp9_sb_ymode_encodings,

-                                   vp9_sb_ymode_tree, x->fc.sb_ymode_prob,

-                                   bct, y_mode_cts);

+  vp9_tree_probs_from_distribution(vp9_ymode_tree, x->fc.ymode_prob,

+                                   bct, y_mode_cts, 0);

+  vp9_tree_probs_from_distribution(vp9_sb_ymode_tree, x->fc.sb_ymode_prob,

+                                   bct, y_mode_cts, 0);

     int i;

     for (i = 0; i < 8; i++) {

-      vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings,

-                                       vp9_kf_ymode_tree, x->kf_ymode_prob[i],

-                                       bct, kf_y_mode_cts[i]);

-      vp9_tree_probs_from_distribution(VP9_I32X32_MODES,

-                                       vp9_sb_kf_ymode_encodings,

-                                       vp9_sb_kf_ymode_tree,

+      vp9_tree_probs_from_distribution(vp9_kf_ymode_tree, x->kf_ymode_prob[i],

+                                       bct, kf_y_mode_cts[i], 0);

+      vp9_tree_probs_from_distribution(vp9_sb_kf_ymode_tree,

                                        x->sb_kf_ymode_prob[i], bct,

-                                       kf_y_mode_cts[i]);

+                                       kf_y_mode_cts[i], 0);

     int i;

     for (i = 0; i < VP9_YMODES; i++) {

-      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,

-                                       vp9_uv_mode_tree, x->kf_uv_mode_prob[i],

-                                       bct, kf_uv_mode_cts[i]);

-      vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings,

-                                       vp9_uv_mode_tree, x->fc.uv_mode_prob[i],

-                                       bct, uv_mode_cts[i]);

+      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->kf_uv_mode_prob[i],

+                                       bct, kf_uv_mode_cts[i], 0);

+      vp9_tree_probs_from_distribution(vp9_uv_mode_tree, x->fc.uv_mode_prob[i],

+                                       bct, uv_mode_cts[i], 0);

-  vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,

-                                   vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,

-                                   bct, i8x8_mode_cts);

+  vp9_tree_probs_from_distribution(vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob,

+                                   bct, i8x8_mode_cts, 0);

   vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2,

              sizeof(vp9_sub_mv_ref_prob2));

@@ -344,6 +335,9 @@

 #if CONFIG_COMP_INTERINTRA_PRED

   x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB;

 #endif

+  x->ref_pred_probs[0] = 120;

+  x->ref_pred_probs[1] = 80;

+  x->ref_pred_probs[2] = 40;

@@ -351,8 +345,7 @@

   vp9_prob p[VP9_NKF_BINTRAMODES - 1],

   unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2],

   const unsigned int events[VP9_NKF_BINTRAMODES]) {

-  vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings,

-                                   vp9_bmode_tree, p, branch_ct, events);

+  vp9_tree_probs_from_distribution(vp9_bmode_tree, p, branch_ct, events, 0);

 void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) {

@@ -364,8 +357,7 @@

   vp9_prob p[VP9_KF_BINTRAMODES - 1],

   unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2],

   const unsigned int events[VP9_KF_BINTRAMODES]) {

-  vp9_tree_probs_from_distribution(VP9_KF_BINTRAMODES, vp9_kf_bmode_encodings,

-                                   vp9_kf_bmode_tree, p, branch_ct, events);

+  vp9_tree_probs_from_distribution(vp9_kf_bmode_tree, p, branch_ct, events, 0);

 void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES]

@@ -419,6 +411,14 @@

 #else

 const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, 0, 1, -1, -1};

 #endif

+#endif  // VP9_SWITCHABLE_FILTERS

+// Indicates if the filter is interpolating or non-interpolating

+// Note currently only the EIGHTTAP_SMOOTH is non-interpolating

+#if CONFIG_ENABLE_6TAP

+const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 0, 1, 1, 1, -1};

+#else

+const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {0, 1, 1, 1, -1};

 #endif

 void vp9_entropy_mode_init() {

@@ -480,7 +480,7 @@

 #define MVREF_COUNT_SAT 20

 #define MVREF_MAX_UPDATE_FACTOR 128

-void vp9_update_mode_context(VP9_COMMON *pc) {

+void vp9_adapt_mode_context(VP9_COMMON *pc) {

   int i, j;

   unsigned int (*mv_ref_ct)[4][2];

   int (*mode_context)[4];

@@ -526,9 +526,10 @@

 #define MODE_COUNT_SAT 20

 #define MODE_MAX_UPDATE_FACTOR 144

-static void update_mode_probs(int n_modes, struct vp9_token_struct *encoding,

+static void update_mode_probs(int n_modes,

                               const vp9_tree_index *tree, unsigned int *cnt,

-                              vp9_prob *pre_probs, vp9_prob *dst_probs) {

+                              vp9_prob *pre_probs, vp9_prob *dst_probs,

+                              unsigned int tok0_offset) {

 #define MAX_PROBS 32

   vp9_prob probs[MAX_PROBS];

   unsigned int branch_ct[MAX_PROBS][2];

@@ -535,8 +536,7 @@

   int t, count, factor;

   assert(n_modes - 1 < MAX_PROBS);

-  vp9_tree_probs_from_distribution(n_modes, encoding, tree, probs,

-                                   branch_ct, cnt);

+  vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);

   for (t = 0; t < n_modes - 1; ++t) {

     count = branch_ct[t][0] + branch_ct[t][1];

     count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count;

@@ -592,31 +592,32 @@

 #endif

 #endif

-  update_mode_probs(VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree,

+  update_mode_probs(VP9_YMODES, vp9_ymode_tree,

                     cm->fc.ymode_counts, cm->fc.pre_ymode_prob,

-                    cm->fc.ymode_prob);

-  update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_encodings, vp9_sb_ymode_tree,

+                    cm->fc.ymode_prob, 0);

+  update_mode_probs(VP9_I32X32_MODES, vp9_sb_ymode_tree,

                     cm->fc.sb_ymode_counts, cm->fc.pre_sb_ymode_prob,

-                    cm->fc.sb_ymode_prob);

+                    cm->fc.sb_ymode_prob, 0);

   for (i = 0; i < VP9_YMODES; ++i) {

-    update_mode_probs(VP9_UV_MODES, vp9_uv_mode_encodings, vp9_uv_mode_tree,

+    update_mode_probs(VP9_UV_MODES, vp9_uv_mode_tree,

                       cm->fc.uv_mode_counts[i], cm->fc.pre_uv_mode_prob[i],

-                      cm->fc.uv_mode_prob[i]);

+                      cm->fc.uv_mode_prob[i], 0);

-  update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, vp9_bmode_tree,

+  update_mode_probs(VP9_NKF_BINTRAMODES, vp9_bmode_tree,

                     cm->fc.bmode_counts, cm->fc.pre_bmode_prob,

-                    cm->fc.bmode_prob);

-  update_mode_probs(VP9_I8X8_MODES, vp9_i8x8_mode_encodings,

+                    cm->fc.bmode_prob, 0);

+  update_mode_probs(VP9_I8X8_MODES,

                     vp9_i8x8_mode_tree, cm->fc.i8x8_mode_counts,

-                    cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob);

+                    cm->fc.pre_i8x8_mode_prob, cm->fc.i8x8_mode_prob, 0);

   for (i = 0; i < SUBMVREF_COUNT; ++i) {

-    update_mode_probs(VP9_SUBMVREFS, vp9_sub_mv_ref_encoding_array,

+    update_mode_probs(VP9_SUBMVREFS,

                       vp9_sub_mv_ref_tree, cm->fc.sub_mv_ref_counts[i],

-                      cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i]);

+                      cm->fc.pre_sub_mv_ref_prob[i], cm->fc.sub_mv_ref_prob[i],

+                      LEFT4X4);

-  update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_encodings, vp9_mbsplit_tree,

+  update_mode_probs(VP9_NUMMBSPLITS, vp9_mbsplit_tree,

                     cm->fc.mbsplit_counts, cm->fc.pre_mbsplit_prob,

-                    cm->fc.mbsplit_prob);

+                    cm->fc.mbsplit_prob, 0);

 #if CONFIG_COMP_INTERINTRA_PRED

   if (cm->use_interintra) {

     int factor, interintra_prob, count;

@@ -630,4 +631,66 @@

                                            interintra_prob, factor);

 #endif

+}

+static void set_default_lf_deltas(MACROBLOCKD *xd) {

+  xd->mode_ref_lf_delta_enabled = 1;

+  xd->mode_ref_lf_delta_update = 1;

+  xd->ref_lf_deltas[INTRA_FRAME] = 2;

+  xd->ref_lf_deltas[LAST_FRAME] = 0;

+  xd->ref_lf_deltas[GOLDEN_FRAME] = -2;

+  xd->ref_lf_deltas[ALTREF_FRAME] = -2;

+  xd->mode_lf_deltas[0] = 4;               // BPRED

+  xd->mode_lf_deltas[1] = -2;              // Zero

+  xd->mode_lf_deltas[2] = 2;               // New mv

+  xd->mode_lf_deltas[3] = 4;               // Split mv

+}

+void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {

+  // Reset the segment feature data to the default stats:

+  // Features disabled, 0, with delta coding (Default state).

+  int i;

+  vp9_clearall_segfeatures(xd);

+  xd->mb_segment_abs_delta = SEGMENT_DELTADATA;

+  if (cm->last_frame_seg_map)

+    vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols));

+  /* reset the mode ref deltas for loop filter */

+  vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas));

+  vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas));

+  set_default_lf_deltas(xd);

+  vp9_default_coef_probs(cm);

+  vp9_init_mbmode_probs(cm);

+  vp9_default_bmode_probs(cm->fc.bmode_prob);

+  vp9_kf_default_bmode_probs(cm->kf_bmode_prob);

+  vp9_init_mv_probs(cm);

+  // To force update of the sharpness

+  cm->last_sharpness_level = -1;

+  vp9_init_mode_contexts(cm);

+  for (i = 0; i < NUM_FRAME_CONTEXTS; i++) {

+    vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc));

+  }

+  vpx_memset(cm->prev_mip, 0,

+             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));

+  vpx_memset(cm->mip, 0,

+             (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));

+  vp9_update_mode_info_border(cm, cm->mip);

+  vp9_update_mode_info_in_image(cm, cm->mi);

+#if CONFIG_NEW_MVREF

+  // Defaults probabilities for encoding the MV ref id signal

+  vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,

+             sizeof(xd->mb_mv_ref_probs));

+#endif

+  cm->ref_frame_sign_bias[GOLDEN_FRAME] = 0;

+  cm->ref_frame_sign_bias[ALTREF_FRAME] = 0;

+  cm->frame_context_idx = 0;

--- a/vp9/common/vp9_entropymode.h

+++ b/vp9/common/vp9_entropymode.h

@@ -34,8 +34,6 @@

 extern int vp9_mv_cont(const int_mv *l, const int_mv *a);

-extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1];

 extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];

 extern const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES]

@@ -76,11 +74,14 @@

 struct VP9Common;

+/* sets up common features to forget past dependence */

+void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);

 void vp9_init_mbmode_probs(struct VP9Common *x);

 extern void vp9_init_mode_contexts(struct VP9Common *pc);

-extern void vp9_update_mode_context(struct VP9Common *pc);

+extern void vp9_adapt_mode_context(struct VP9Common *pc);

 extern void vp9_accum_mv_refs(struct VP9Common *pc,

                               MB_PREDICTION_MODE m,

@@ -100,6 +101,8 @@

                   [VP9_SWITCHABLE_FILTERS];

 extern const  int vp9_switchable_interp_map[SWITCHABLE + 1];

+extern const  int vp9_is_interpolating_filter[SWITCHABLE + 1];

 extern const  vp9_tree_index vp9_switchable_interp_tree

                   [2 * (VP9_SWITCHABLE_FILTERS - 1)];

--- a/vp9/common/vp9_entropymv.c

+++ b/vp9/common/vp9_entropymv.c

@@ -42,7 +42,10 @@

   -MV_CLASS_2, -MV_CLASS_3,

   10, 12,

   -MV_CLASS_4, -MV_CLASS_5,

-  -MV_CLASS_6, -MV_CLASS_7,

+  -MV_CLASS_6, 14,

+  16, 18,

+  -MV_CLASS_7, -MV_CLASS_8,

+  -MV_CLASS_9, -MV_CLASS_10,

};

 struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES];

@@ -62,24 +65,24 @@

   {32, 64, 96},

     { /* vert component */

-      128,                                             /* sign */

-      {224, 144, 192, 168, 192, 176, 192},             /* class */

-      {216},                                           /* class0 */

-      {136, 140, 148, 160, 176, 192, 224},             /* bits */

-      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */

-      {64, 96, 64},                                    /* fp */

-      160,                                             /* class0_hp bit */

-      128,                                             /* hp */

+      128,                                                  /* sign */

+      {224, 144, 192, 168, 192, 176, 192, 198, 198, 245},   /* class */

+      {216},                                                /* class0 */

+      {136, 140, 148, 160, 176, 192, 224, 234, 234, 240},   /* bits */

+      {{128, 128, 64}, {96, 112, 64}},                      /* class0_fp */

+      {64, 96, 64},                                         /* fp */

+      160,                                                  /* class0_hp bit */

+      128,                                                  /* hp */

},

     { /* hor component */

-      128,                                             /* sign */

-      {216, 128, 176, 160, 176, 176, 192},             /* class */

-      {208},                                           /* class0 */

-      {136, 140, 148, 160, 176, 192, 224},             /* bits */

-      {{128, 128, 64}, {96, 112, 64}},                 /* class0_fp */

-      {64, 96, 64},                                    /* fp */

-      160,                                             /* class0_hp bit */

-      128,                                             /* hp */

+      128,                                                  /* sign */

+      {216, 128, 176, 160, 176, 176, 192, 198, 198, 208},   /* class */

+      {208},                                                /* class0 */

+      {136, 140, 148, 160, 176, 192, 224, 234, 234, 240},   /* bits */

+      {{128, 128, 64}, {96, 112, 64}},                      /* class0_fp */

+      {64, 96, 64},                                         /* fp */

+      160,                                                  /* class0_hp bit */

+      128,                                                  /* hp */

},

};

@@ -103,6 +106,9 @@

   else if (z < CLASS0_SIZE * 256)  c = MV_CLASS_5;

   else if (z < CLASS0_SIZE * 512)  c = MV_CLASS_6;

   else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7;

+  else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8;

+  else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9;

+  else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10;

   else assert(0);

   if (offset)

     *offset = z - mv_class_base(c);

@@ -110,11 +116,8 @@

 int vp9_use_nmv_hp(const MV *ref) {

-  if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&

-      (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH)

-    return 1;

-  else

-    return 0;

+  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&

+         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;

 int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {

@@ -134,6 +137,7 @@

                                     int incr,

                                     int usehp) {

   int s, z, c, o, d, e, f;

+  if (!incr) return;

   assert (v != 0);            /* should not be zero */

   s = v < 0;

   mvcomp->sign[s] += incr;

@@ -211,24 +215,26 @@

-static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp,

+static void adapt_prob(vp9_prob *dest, vp9_prob prep,

                        unsigned int ct[2]) {

   int count = ct[0] + ct[1];

   if (count) {

+    vp9_prob newp = get_binary_prob(ct[0], ct[1]);

     count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count;

     *dest = weighted_prob(prep, newp,

                           MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT);

+  } else {

+    *dest = prep;

-void vp9_counts_process(nmv_context_counts *NMVcount, int usehp) {

-  counts_to_context(&NMVcount->comps[0], usehp);

-  counts_to_context(&NMVcount->comps[1], usehp);

+void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {

+  counts_to_context(&nmv_count->comps[0], usehp);

+  counts_to_context(&nmv_count->comps[1], usehp);

 void vp9_counts_to_nmv_context(

-    nmv_context_counts *NMVcount,

+    nmv_context_counts *nmv_count,

     nmv_context *prob,

     int usehp,

     unsigned int (*branch_ct_joint)[2],

@@ -241,81 +247,90 @@

     unsigned int (*branch_ct_class0_hp)[2],

     unsigned int (*branch_ct_hp)[2]) {

   int i, j, k;

-  vp9_counts_process(NMVcount, usehp);

-  vp9_tree_probs_from_distribution(MV_JOINTS,

-                                   vp9_mv_joint_encodings,

-                                   vp9_mv_joint_tree,

+  vp9_counts_process(nmv_count, usehp);

+  vp9_tree_probs_from_distribution(vp9_mv_joint_tree,

                                    prob->joints,

                                    branch_ct_joint,

-                                   NMVcount->joints);

+                                   nmv_count->joints, 0);

   for (i = 0; i < 2; ++i) {

-    prob->comps[i].sign = get_binary_prob(NMVcount->comps[i].sign[0],

-                                          NMVcount->comps[i].sign[1]);

-    branch_ct_sign[i][0] = NMVcount->comps[i].sign[0];

-    branch_ct_sign[i][1] = NMVcount->comps[i].sign[1];

-    vp9_tree_probs_from_distribution(MV_CLASSES,

-                                     vp9_mv_class_encodings,

-                                     vp9_mv_class_tree,

+    prob->comps[i].sign = get_binary_prob(nmv_count->comps[i].sign[0],

+                                          nmv_count->comps[i].sign[1]);

+    branch_ct_sign[i][0] = nmv_count->comps[i].sign[0];

+    branch_ct_sign[i][1] = nmv_count->comps[i].sign[1];

+    vp9_tree_probs_from_distribution(vp9_mv_class_tree,

                                      prob->comps[i].classes,

                                      branch_ct_classes[i],

-                                     NMVcount->comps[i].classes);

-    vp9_tree_probs_from_distribution(CLASS0_SIZE,

-                                     vp9_mv_class0_encodings,

-                                     vp9_mv_class0_tree,

+                                     nmv_count->comps[i].classes, 0);

+    vp9_tree_probs_from_distribution(vp9_mv_class0_tree,

                                      prob->comps[i].class0,

                                      branch_ct_class0[i],

-                                     NMVcount->comps[i].class0);

+                                     nmv_count->comps[i].class0, 0);

     for (j = 0; j < MV_OFFSET_BITS; ++j) {

-      prob->comps[i].bits[j] = get_binary_prob(NMVcount->comps[i].bits[j][0],

-                                               NMVcount->comps[i].bits[j][1]);

-      branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0];

-      branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1];

+      prob->comps[i].bits[j] = get_binary_prob(nmv_count->comps[i].bits[j][0],

+                                               nmv_count->comps[i].bits[j][1]);

+      branch_ct_bits[i][j][0] = nmv_count->comps[i].bits[j][0];

+      branch_ct_bits[i][j][1] = nmv_count->comps[i].bits[j][1];

   for (i = 0; i < 2; ++i) {

     for (k = 0; k < CLASS0_SIZE; ++k) {

-      vp9_tree_probs_from_distribution(4,

-                                       vp9_mv_fp_encodings,

-                                       vp9_mv_fp_tree,

+      vp9_tree_probs_from_distribution(vp9_mv_fp_tree,

                                        prob->comps[i].class0_fp[k],

                                        branch_ct_class0_fp[i][k],

-                                       NMVcount->comps[i].class0_fp[k]);

+                                       nmv_count->comps[i].class0_fp[k], 0);

-    vp9_tree_probs_from_distribution(4,

-                                     vp9_mv_fp_encodings,

-                                     vp9_mv_fp_tree,

+    vp9_tree_probs_from_distribution(vp9_mv_fp_tree,

                                      prob->comps[i].fp,

                                      branch_ct_fp[i],

-                                     NMVcount->comps[i].fp);

+                                     nmv_count->comps[i].fp, 0);

   if (usehp) {

     for (i = 0; i < 2; ++i) {

       prob->comps[i].class0_hp =

-          get_binary_prob(NMVcount->comps[i].class0_hp[0],

-                          NMVcount->comps[i].class0_hp[1]);

-      branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0];

-      branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1];

+          get_binary_prob(nmv_count->comps[i].class0_hp[0],

+                          nmv_count->comps[i].class0_hp[1]);

+      branch_ct_class0_hp[i][0] = nmv_count->comps[i].class0_hp[0];

+      branch_ct_class0_hp[i][1] = nmv_count->comps[i].class0_hp[1];

-      prob->comps[i].hp = get_binary_prob(NMVcount->comps[i].hp[0],

-                                          NMVcount->comps[i].hp[1]);

-      branch_ct_hp[i][0] = NMVcount->comps[i].hp[0];

-      branch_ct_hp[i][1] = NMVcount->comps[i].hp[1];

+      prob->comps[i].hp = get_binary_prob(nmv_count->comps[i].hp[0],

+                                          nmv_count->comps[i].hp[1]);

+      branch_ct_hp[i][0] = nmv_count->comps[i].hp[0];

+      branch_ct_hp[i][1] = nmv_count->comps[i].hp[1];

+static unsigned int adapt_probs(unsigned int i,

+                                vp9_tree tree,

+                                vp9_prob this_probs[],

+                                const vp9_prob last_probs[],

+                                const unsigned int num_events[]) {

+  vp9_prob this_prob;

+  const uint32_t left = tree[i] <= 0

+          ? num_events[-tree[i]]

+          : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);

+  const uint32_t right = tree[i + 1] <= 0

+          ? num_events[-tree[i + 1]]

+          : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);

+  uint32_t weight = left + right;

+  if (weight) {

+    this_prob = get_binary_prob(left, right);

+    weight = weight > MV_COUNT_SAT ? MV_COUNT_SAT : weight;

+    this_prob = weighted_prob(last_probs[i >> 1], this_prob,

+                              MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT);

+  } else {

+    this_prob = last_probs[i >> 1];

+  }

+  this_probs[i >> 1] = this_prob;

+  return left + right;

+}

 void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {

-  int i, j, k;

-  nmv_context prob;

-  unsigned int branch_ct_joint[MV_JOINTS - 1][2];

-  unsigned int branch_ct_sign[2][2];

-  unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];

-  unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];

-  unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];

-  unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];

-  unsigned int branch_ct_fp[2][4 - 1][2];

-  unsigned int branch_ct_class0_hp[2][2];

-  unsigned int branch_ct_hp[2][2];

+  int i, j;

 #ifdef MV_COUNT_TESTING

   printf("joints count: ");

   for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]);

@@ -376,75 +391,48 @@

   smooth_counts(&cm->fc.NMVcount.comps[0]);

   smooth_counts(&cm->fc.NMVcount.comps[1]);

 #endif

-  vp9_counts_to_nmv_context(&cm->fc.NMVcount,

-                            &prob,

-                            usehp,

-                            branch_ct_joint,

-                            branch_ct_sign,

-                            branch_ct_classes,

-                            branch_ct_class0,

-                            branch_ct_bits,

-                            branch_ct_class0_fp,

-                            branch_ct_fp,

-                            branch_ct_class0_hp,

-                            branch_ct_hp);

+  vp9_counts_process(&cm->fc.NMVcount, usehp);

-  for (j = 0; j < MV_JOINTS - 1; ++j) {

-    adapt_prob(&cm->fc.nmvc.joints[j],

-               cm->fc.pre_nmvc.joints[j],

-               prob.joints[j],

-               branch_ct_joint[j]);

-  }

+  adapt_probs(0, vp9_mv_joint_tree,

+              cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints,

+              cm->fc.NMVcount.joints);

   for (i = 0; i < 2; ++i) {

     adapt_prob(&cm->fc.nmvc.comps[i].sign,

                cm->fc.pre_nmvc.comps[i].sign,

-               prob.comps[i].sign,

-               branch_ct_sign[i]);

-    for (j = 0; j < MV_CLASSES - 1; ++j) {

-      adapt_prob(&cm->fc.nmvc.comps[i].classes[j],

-                 cm->fc.pre_nmvc.comps[i].classes[j],

-                 prob.comps[i].classes[j],

-                 branch_ct_classes[i][j]);

-    }

-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {

-      adapt_prob(&cm->fc.nmvc.comps[i].class0[j],

-                 cm->fc.pre_nmvc.comps[i].class0[j],

-                 prob.comps[i].class0[j],

-                 branch_ct_class0[i][j]);

-    }

+               cm->fc.NMVcount.comps[i].sign);

+    adapt_probs(0, vp9_mv_class_tree,

+                cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes,

+                cm->fc.NMVcount.comps[i].classes);

+    adapt_probs(0, vp9_mv_class0_tree,

+                cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0,

+                cm->fc.NMVcount.comps[i].class0);

     for (j = 0; j < MV_OFFSET_BITS; ++j) {

       adapt_prob(&cm->fc.nmvc.comps[i].bits[j],

                  cm->fc.pre_nmvc.comps[i].bits[j],

-                 prob.comps[i].bits[j],

-                 branch_ct_bits[i][j]);

+                 cm->fc.NMVcount.comps[i].bits[j]);

   for (i = 0; i < 2; ++i) {

     for (j = 0; j < CLASS0_SIZE; ++j) {

-      for (k = 0; k < 3; ++k) {

-        adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k],

-                   cm->fc.pre_nmvc.comps[i].class0_fp[j][k],

-                   prob.comps[i].class0_fp[j][k],

-                   branch_ct_class0_fp[i][j][k]);

-      }

+      adapt_probs(0, vp9_mv_fp_tree,

+                  cm->fc.nmvc.comps[i].class0_fp[j],

+                  cm->fc.pre_nmvc.comps[i].class0_fp[j],

+                  cm->fc.NMVcount.comps[i].class0_fp[j]);

-    for (j = 0; j < 3; ++j) {

-      adapt_prob(&cm->fc.nmvc.comps[i].fp[j],

-                 cm->fc.pre_nmvc.comps[i].fp[j],

-                 prob.comps[i].fp[j],

-                 branch_ct_fp[i][j]);

-    }

+    adapt_probs(0, vp9_mv_fp_tree,

+                cm->fc.nmvc.comps[i].fp,

+                cm->fc.pre_nmvc.comps[i].fp,

+                cm->fc.NMVcount.comps[i].fp);

   if (usehp) {

     for (i = 0; i < 2; ++i) {

       adapt_prob(&cm->fc.nmvc.comps[i].class0_hp,

                  cm->fc.pre_nmvc.comps[i].class0_hp,

-                 prob.comps[i].class0_hp,

-                 branch_ct_class0_hp[i]);

+                 cm->fc.NMVcount.comps[i].class0_hp);

       adapt_prob(&cm->fc.nmvc.comps[i].hp,

                  cm->fc.pre_nmvc.comps[i].hp,

-                 prob.comps[i].hp,

-                 branch_ct_hp[i]);

+                 cm->fc.NMVcount.comps[i].hp);

--- a/vp9/common/vp9_entropymv.h

+++ b/vp9/common/vp9_entropymv.h

@@ -49,7 +49,7 @@

 extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS];

 /* Symbols for coding magnitude class of nonzero components */

-#define MV_CLASSES     8

+#define MV_CLASSES     11

 typedef enum {

   MV_CLASS_0 = 0,      /* (0, 2]     integer pel */

   MV_CLASS_1 = 1,      /* (2, 4]     integer pel */

@@ -59,6 +59,9 @@

   MV_CLASS_5 = 5,      /* (32, 64]   integer pel */

   MV_CLASS_6 = 6,      /* (64, 128]  integer pel */

   MV_CLASS_7 = 7,      /* (128, 256] integer pel */

+  MV_CLASS_8 = 8,      /* (256, 512] integer pel */

+  MV_CLASS_9 = 9,      /* (512, 1024] integer pel */

+  MV_CLASS_10 = 10,    /* (1024,2048] integer pel */

 } MV_CLASS_TYPE;

 extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2];

--- a/vp9/common/vp9_extend.c

+++ b/vp9/common/vp9_extend.c

@@ -11,159 +11,137 @@

 #include "vp9/common/vp9_extend.h"

 #include "vpx_mem/vpx_mem.h"

-static void copy_and_extend_plane(uint8_t *s,       /* source */

-                                  int sp,           /* source pitch */

-                                  uint8_t *d,       /* destination */

-                                  int dp,           /* destination pitch */

-                                  int h,            /* height */

-                                  int w,            /* width */

-                                  int et,           /* extend top border */

-                                  int el,           /* extend left border */

-                                  int eb,           /* extend bottom border */

-                                  int er) {         /* extend right border */

-  int i;

-  uint8_t *src_ptr1, *src_ptr2;

-  uint8_t *dest_ptr1, *dest_ptr2;

-  int linesize;

+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,

+                                  uint8_t *dst, int dst_pitch,

+                                  int w, int h,

+                                  int extend_top, int extend_left,

+                                  int extend_bottom, int extend_right) {

+  int i, linesize;

-  /* copy the left and right most columns out */

-  src_ptr1 = s;

-  src_ptr2 = s + w - 1;

-  dest_ptr1 = d - el;

-  dest_ptr2 = d + w;

+  // copy the left and right most columns out

+  const uint8_t *src_ptr1 = src;

+  const uint8_t *src_ptr2 = src + w - 1;

+  uint8_t *dst_ptr1 = dst - extend_left;

+  uint8_t *dst_ptr2 = dst + w;

   for (i = 0; i < h; i++) {

-    vpx_memset(dest_ptr1, src_ptr1[0], el);

-    vpx_memcpy(dest_ptr1 + el, src_ptr1, w);

-    vpx_memset(dest_ptr2, src_ptr2[0], er);

-    src_ptr1  += sp;

-    src_ptr2  += sp;

-    dest_ptr1 += dp;

-    dest_ptr2 += dp;

+    vpx_memset(dst_ptr1, src_ptr1[0], extend_left);

+    vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w);

+    vpx_memset(dst_ptr2, src_ptr2[0], extend_right);

+    src_ptr1 += src_pitch;

+    src_ptr2 += src_pitch;

+    dst_ptr1 += dst_pitch;

+    dst_ptr2 += dst_pitch;

-  /* Now copy the top and bottom lines into each line of the respective

-   * borders

-   */

-  src_ptr1 = d - el;

-  src_ptr2 = d + dp * (h - 1) - el;

-  dest_ptr1 = d + dp * (-et) - el;

-  dest_ptr2 = d + dp * (h) - el;

-  linesize = el + er + w;

+  // Now copy the top and bottom lines into each line of the respective

+  // borders

+  src_ptr1 = dst - extend_left;

+  src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;

+  dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;

+  dst_ptr2 = dst + dst_pitch * (h) - extend_left;

+  linesize = extend_left + extend_right + w;

-  for (i = 0; i < et; i++) {

-    vpx_memcpy(dest_ptr1, src_ptr1, linesize);

-    dest_ptr1 += dp;

+  for (i = 0; i < extend_top; i++) {

+    vpx_memcpy(dst_ptr1, src_ptr1, linesize);

+    dst_ptr1 += dst_pitch;

-  for (i = 0; i < eb; i++) {

-    vpx_memcpy(dest_ptr2, src_ptr2, linesize);

-    dest_ptr2 += dp;

+  for (i = 0; i < extend_bottom; i++) {

+    vpx_memcpy(dst_ptr2, src_ptr2, linesize);

+    dst_ptr2 += dst_pitch;

-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,

+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,

                                YV12_BUFFER_CONFIG *dst) {

-  int et = dst->border;

-  int el = dst->border;

-  int eb = dst->border + dst->y_height - src->y_height;

-  int er = dst->border + dst->y_width - src->y_width;

+  const int et_y = dst->border;

+  const int el_y = dst->border;

+  const int eb_y = dst->border + dst->y_height - src->y_height;

+  const int er_y = dst->border + dst->y_width - src->y_width;

+  const int et_uv = dst->border >> 1;

+  const int el_uv = dst->border >> 1;

+  const int eb_uv = (dst->border >> 1) + dst->uv_height - src->uv_height;

+  const int er_uv = (dst->border >> 1) + dst->uv_width - src->uv_width;

   copy_and_extend_plane(src->y_buffer, src->y_stride,

                         dst->y_buffer, dst->y_stride,

-                        src->y_height, src->y_width,

-                        et, el, eb, er);

+                        src->y_width, src->y_height,

+                        et_y, el_y, eb_y, er_y);

-  et = dst->border >> 1;

-  el = dst->border >> 1;

-  eb = (dst->border >> 1) + dst->uv_height - src->uv_height;

-  er = (dst->border >> 1) + dst->uv_width - src->uv_width;

   copy_and_extend_plane(src->u_buffer, src->uv_stride,

                         dst->u_buffer, dst->uv_stride,

-                        src->uv_height, src->uv_width,

-                        et, el, eb, er);

+                        src->uv_width, src->uv_height,

+                        et_uv, el_uv, eb_uv, er_uv);

   copy_and_extend_plane(src->v_buffer, src->uv_stride,

                         dst->v_buffer, dst->uv_stride,

-                        src->uv_height, src->uv_width,

-                        et, el, eb, er);

+                        src->uv_width, src->uv_height,

+                        et_y, el_y, eb_uv, er_uv);

-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,

+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,

                                          YV12_BUFFER_CONFIG *dst,

                                          int srcy, int srcx,

                                          int srch, int srcw) {

-  int et = dst->border;

-  int el = dst->border;

-  int eb = dst->border + dst->y_height - src->y_height;

-  int er = dst->border + dst->y_width - src->y_width;

-  int src_y_offset = srcy * src->y_stride + srcx;

-  int dst_y_offset = srcy * dst->y_stride + srcx;

-  int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);

-  int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);

   // If the side is not touching the bounder then don't extend.

-  if (srcy)

-    et = 0;

-  if (srcx)

-    el = 0;

-  if (srcy + srch != src->y_height)

-    eb = 0;

-  if (srcx + srcw != src->y_width)

-    er = 0;

+  const int et_y = srcy ? 0 : dst->border;

+  const int el_y = srcx ? 0 : dst->border;

+  const int eb_y = srcy + srch != src->y_height ? 0 :

+                      dst->border + dst->y_height - src->y_height;

+  const int er_y = srcx + srcw != src->y_width ? 0 :

+                      dst->border + dst->y_width - src->y_width;

+  const int src_y_offset = srcy * src->y_stride + srcx;

+  const int dst_y_offset = srcy * dst->y_stride + srcx;

-  copy_and_extend_plane(src->y_buffer + src_y_offset,

-                        src->y_stride,

-                        dst->y_buffer + dst_y_offset,

-                        dst->y_stride,

-                        srch, srcw,

-                        et, el, eb, er);

+  const int et_uv = (et_y + 1) >> 1;

+  const int el_uv = (el_y + 1) >> 1;

+  const int eb_uv = (eb_y + 1) >> 1;

+  const int er_uv = (er_y + 1) >> 1;

+  const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);

+  const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);

+  const int srch_uv = (srch + 1) >> 1;

+  const int srcw_uv = (srcw + 1) >> 1;

-  et = (et + 1) >> 1;

-  el = (el + 1) >> 1;

-  eb = (eb + 1) >> 1;

-  er = (er + 1) >> 1;

-  srch = (srch + 1) >> 1;

-  srcw = (srcw + 1) >> 1;

+  copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,

+                        dst->y_buffer + dst_y_offset, dst->y_stride,

+                        srcw, srch,

+                        et_y, el_y, eb_y, er_y);

-  copy_and_extend_plane(src->u_buffer + src_uv_offset,

-                        src->uv_stride,

-                        dst->u_buffer + dst_uv_offset,

-                        dst->uv_stride,

-                        srch, srcw,

-                        et, el, eb, er);

+  copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,

+                        dst->u_buffer + dst_uv_offset, dst->uv_stride,

+                        srcw_uv, srch_uv,

+                        et_uv, el_uv, eb_uv, er_uv);

-  copy_and_extend_plane(src->v_buffer + src_uv_offset,

-                        src->uv_stride,

-                        dst->v_buffer + dst_uv_offset,

-                        dst->uv_stride,

-                        srch, srcw,

-                        et, el, eb, er);

+  copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,

+                        dst->v_buffer + dst_uv_offset, dst->uv_stride,

+                        srcw_uv, srch_uv,

+                        et_uv, el_uv, eb_uv, er_uv);

-/* note the extension is only for the last row, for intra prediction purpose */

-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr,

-                       uint8_t *UPtr, uint8_t *VPtr) {

+// note the extension is only for the last row, for intra prediction purpose

+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf,

+                       uint8_t *y, uint8_t *u, uint8_t *v) {

   int i;

-  YPtr += ybf->y_stride * 14;

-  UPtr += ybf->uv_stride * 6;

-  VPtr += ybf->uv_stride * 6;

+  y += buf->y_stride * 14;

+  u += buf->uv_stride * 6;

+  v += buf->uv_stride * 6;

   for (i = 0; i < 4; i++) {

-    YPtr[i] = YPtr[-1];

-    UPtr[i] = UPtr[-1];

-    VPtr[i] = VPtr[-1];

+    y[i] = y[-1];

+    u[i] = u[-1];

+    v[i] = v[-1];

-  YPtr += ybf->y_stride;

-  UPtr += ybf->uv_stride;

-  VPtr += ybf->uv_stride;

+  y += buf->y_stride;

+  u += buf->uv_stride;

+  v += buf->uv_stride;

   for (i = 0; i < 4; i++) {

-    YPtr[i] = YPtr[-1];

-    UPtr[i] = UPtr[-1];

-    VPtr[i] = VPtr[-1];

+    y[i] = y[-1];

+    u[i] = u[-1];

+    v[i] = v[-1];

--- a/vp9/common/vp9_extend.h

+++ b/vp9/common/vp9_extend.h

@@ -14,15 +14,17 @@

 #include "vpx_scale/yv12config.h"

 #include "vpx/vpx_integer.h"

-void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, uint8_t *YPtr,

-                       uint8_t *UPtr, uint8_t *VPtr);

-void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,

+void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,

                                YV12_BUFFER_CONFIG *dst);

-void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src,

+void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,

                                          YV12_BUFFER_CONFIG *dst,

                                          int srcy, int srcx,

                                          int srch, int srcw);

+void vp9_extend_mb_row(YV12_BUFFER_CONFIG *buf,

+                       uint8_t *y, uint8_t *u, uint8_t *v);

 #endif  // VP9_COMMON_VP9_EXTEND_H_

--- a/vp9/common/vp9_filter.c

+++ b/vp9/common/vp9_filter.c

@@ -15,28 +15,30 @@

 #include "vp9_rtcd.h"

 #include "vp9/common/vp9_common.h"

-DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {

-  { 128,   0 },

-  { 120,   8 },

-  { 112,  16 },

-  { 104,  24 },

-  {  96,  32 },

-  {  88,  40 },

-  {  80,  48 },

-  {  72,  56 },

-  {  64,  64 },

-  {  56,  72 },

-  {  48,  80 },

-  {  40,  88 },

-  {  32,  96 },

-  {  24, 104 },

-  {  16, 112 },

-  {   8, 120 }

+DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {

+  { 0, 0, 0, 128,   0, 0, 0, 0 },

+  { 0, 0, 0, 120,   8, 0, 0, 0 },

+  { 0, 0, 0, 112,  16, 0, 0, 0 },

+  { 0, 0, 0, 104,  24, 0, 0, 0 },

+  { 0, 0, 0,  96,  32, 0, 0, 0 },

+  { 0, 0, 0,  88,  40, 0, 0, 0 },

+  { 0, 0, 0,  80,  48, 0, 0, 0 },

+  { 0, 0, 0,  72,  56, 0, 0, 0 },

+  { 0, 0, 0,  64,  64, 0, 0, 0 },

+  { 0, 0, 0,  56,  72, 0, 0, 0 },

+  { 0, 0, 0,  48,  80, 0, 0, 0 },

+  { 0, 0, 0,  40,  88, 0, 0, 0 },

+  { 0, 0, 0,  32,  96, 0, 0, 0 },

+  { 0, 0, 0,  24, 104, 0, 0, 0 },

+  { 0, 0, 0,  16, 112, 0, 0, 0 },

+  { 0, 0, 0,   8, 120, 0, 0, 0 }

};

-#define FILTER_ALPHA       0

-#define FILTER_ALPHA_SHARP 1

-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {

+#define FILTER_ALPHA        0

+#define FILTER_ALPHA_SHARP  0

+#define FILTER_ALPHA_SMOOTH 50

+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8])

+    = {

 #if FILTER_ALPHA == 0

   /* Lagrangian interpolation filter */

   { 0,   0,   0, 128,   0,   0,   0,  0},

@@ -55,6 +57,7 @@

   { -1,   3,  -9,  27, 118, -13,   4, -1},

   { 0,   2,  -6,  18, 122, -10,   3, -1},

   { 0,   1,  -3,   8, 126,  -5,   1,  0}

 #elif FILTER_ALPHA == 50

   /* Generated using MATLAB:

    * alpha = 0.5;

@@ -79,11 +82,13 @@

   { 0,   3,  -9,  27, 118, -13,   3, -1},

   { 0,   2,  -6,  18, 122, -10,   2,  0},

   { 0,   1,  -3,   8, 126,  -5,   1,  0}

 #endif  /* FILTER_ALPHA */

};

-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = {

-#if FILTER_ALPHA_SHARP == 1

+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8])

+    = {

+#if FILTER_ALPHA_SHARP == 0

   /* dct based filter */

   {0,   0,   0, 128,   0,   0,   0, 0},

   {-1,   3,  -7, 127,   8,  -3,   1, 0},

@@ -101,31 +106,34 @@

   {-2,   5, -10,  27, 121, -17,   7, -3},

   {-1,   3,  -6,  17, 125, -13,   5, -2},

   {0,   1,  -3,   8, 127,  -7,   3, -1}

-#elif FILTER_ALPHA_SHARP == 75

-  /* alpha = 0.75 */

-  {0,   0,   0, 128,   0,   0,   0, 0},

-  {-1,   2,  -6, 126,   9,  -3,   2, -1},

-  {-1,   4, -11, 123,  18,  -7,   3, -1},

-  {-2,   6, -16, 119,  28, -10,   5, -2},

-  {-2,   7, -19, 113,  38, -13,   6, -2},

-  {-3,   8, -21, 106,  49, -16,   7, -2},

-  {-3,   9, -22,  99,  59, -19,   8, -3},

-  {-3,   9, -23,  90,  70, -21,   9, -3},

-  {-3,   9, -22,  80,  80, -22,   9, -3},

-  {-3,   9, -21,  70,  90, -23,   9, -3},

-  {-3,   8, -19,  59,  99, -22,   9, -3},

-  {-2,   7, -16,  49, 106, -21,   8, -3},

-  {-2,   6, -13,  38, 113, -19,   7, -2},

-  {-2,   5, -10,  28, 119, -16,   6, -2},

-  {-1,   3,  -7,  18, 123, -11,   4, -1},

-  {-1,   2,  -3,   9, 126,  -6,   2, -1}

+#elif FILTER_ALPHA_SHARP == 80

+  /* alpha = 0.80 */

+  { 0,   0,   0, 128,   0,   0,   0,  0},

+  {-1,   2,  -6, 127,   9,  -4,   2, -1},

+  {-2,   5, -12, 124,  18,  -7,   4, -2},

+  {-2,   7, -16, 119,  28, -11,   5, -2},

+  {-3,   8, -19, 114,  38, -14,   7, -3},

+  {-3,   9, -22, 107,  49, -17,   8, -3},

+  {-4,  10, -23,  99,  60, -20,  10, -4},

+  {-4,  11, -23,  90,  70, -22,  10, -4},

+  {-4,  11, -23,  80,  80, -23,  11, -4},

+  {-4,  10, -22,  70,  90, -23,  11, -4},

+  {-4,  10, -20,  60,  99, -23,  10, -4},

+  {-3,   8, -17,  49, 107, -22,   9, -3},

+  {-3,   7, -14,  38, 114, -19,   8, -3},

+  {-2,   5, -11,  28, 119, -16,   7, -2},

+  {-2,   4,  -7,  18, 124, -12,   5, -2},

+  {-1,   2,  -4,   9, 127,  -6,   2, -1}

 #endif  /* FILTER_ALPHA_SHARP */

};

-DECLARE_ALIGNED(16, const int16_t,

+DECLARE_ALIGNED(256, const int16_t,

                 vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = {

   /* 8-tap lowpass filter */

   /* Hamming window */

+  /* freqmultiplier = 0.625 */

+#if FILTER_ALPHA_SMOOTH == 625

   {-1, -7, 32, 80, 32, -7, -1,  0},

   {-1, -8, 28, 80, 37, -7, -2,  1},

   { 0, -8, 24, 79, 41, -7, -2,  1},

@@ -142,1074 +150,44 @@

   { 1, -3, -5, 45, 78, 20, -8,  0},

   { 1, -2, -7, 41, 79, 24, -8,  0},

   { 1, -2, -7, 37, 80, 28, -8, -1}

-};

-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {

-  {0,   0, 128,   0,   0, 0},

-  {1,  -5, 125,   8,  -2, 1},

-  {1,  -8, 122,  17,  -5, 1},

-  {2, -11, 116,  27,  -8, 2},

-  {3, -14, 110,  37, -10, 2},

-  {3, -15, 103,  47, -12, 2},

-  {3, -16,  95,  57, -14, 3},

-  {3, -16,  86,  67, -15, 3},

-  {3, -16,  77,  77, -16, 3},

-  {3, -15,  67,  86, -16, 3},

-  {3, -14,  57,  95, -16, 3},

-  {2, -12,  47, 103, -15, 3},

-  {2, -10,  37, 110, -14, 3},

-  {2,  -8,  27, 116, -11, 2},

-  {1,  -5,  17, 122,  -8, 1},

-  {1,  -2,   8, 125,  -5, 1}

+#elif FILTER_ALPHA_SMOOTH == 50

+  /* freqmultiplier = 0.5 */

+  {-3,  0, 35, 64, 35,  0, -3, 0},

+  {-3, -1, 32, 64, 38,  1, -3, 0},

+  {-2, -2, 29, 63, 41,  2, -3, 0},

+  {-2, -2, 26, 63, 43,  4, -4, 0},

+  {-2, -3, 24, 62, 46,  5, -4, 0},

+  {-2, -3, 21, 60, 49,  7, -4, 0},

+  {-1, -4, 18, 59, 51,  9, -4, 0},

+  {-1, -4, 16, 57, 53, 12, -4, -1},

+  {-1, -4, 14, 55, 55, 14, -4, -1},

+  {-1, -4, 12, 53, 57, 16, -4, -1},

+  {0, -4,  9, 51, 59, 18, -4, -1},

+  {0, -4,  7, 49, 60, 21, -3, -2},

+  {0, -4,  5, 46, 62, 24, -3, -2},

+  {0, -4,  4, 43, 63, 26, -2, -2},

+  {0, -3,  2, 41, 63, 29, -2, -2},

+  {0, -3,  1, 38, 64, 32, -1, -3}

+#endif

};

-static void filter_block2d_first_pass_6(uint8_t *src_ptr,

-                                        int *output_ptr,

-                                        unsigned int src_pixels_per_line,

-                                        unsigned int pixel_step,

-                                        unsigned int output_height,

-                                        unsigned int output_width,

-                                        const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +

-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +

-             ((int)src_ptr[0]                    * vp9_filter[2]) +

-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +

-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +

-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +

-             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */

-      /* Normalize back to 0-255 */

-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);

-      src_ptr++;

-    }

-    /* Next row... */

-    src_ptr    += src_pixels_per_line - output_width;

-    output_ptr += output_width;

-  }

-}

-static void filter_block2d_second_pass_6(int *src_ptr,

-                                         uint8_t *output_ptr,

-                                         int output_pitch,

-                                         unsigned int src_pixels_per_line,

-                                         unsigned int pixel_step,

-                                         unsigned int output_height,

-                                         unsigned int output_width,

-                                         const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      /* Apply filter */

-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +

-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +

-             ((int)src_ptr[0]                    * vp9_filter[2]) +

-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +

-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +

-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +

-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */

-      /* Normalize back to 0-255 */

-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);

-      src_ptr++;

-    }

-    /* Start next row */

-    src_ptr    += src_pixels_per_line - output_width;

-    output_ptr += output_pitch;

-  }

-}

-/*

- * The only functional difference between filter_block2d_second_pass()

- * and this function is that filter_block2d_second_pass() does a sixtap

- * filter on the input and stores it in the output. This function

- * (filter_block2d_second_pass_avg()) does a sixtap filter on the input,

- * and then averages that with the content already present in the output

- * ((filter_result + dest + 1) >> 1) and stores that in the output.

- */

-static void filter_block2d_second_pass_avg_6(int *src_ptr,

-                                             uint8_t *output_ptr,

-                                             int output_pitch,

-                                             unsigned int src_pixels_per_line,

-                                             unsigned int pixel_step,

-                                             unsigned int output_height,

-                                             unsigned int output_width,

-                                             const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      /* Apply filter */

-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +

-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +

-             ((int)src_ptr[0]                    * vp9_filter[2]) +

-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +

-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +

-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +

-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */

-      /* Normalize back to 0-255 */

-      output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) +

-                       output_ptr[j] + 1) >> 1;

-      src_ptr++;

-    }

-    /* Start next row */

-    src_ptr    += src_pixels_per_line - output_width;

-    output_ptr += output_pitch;

-  }

-}

-#define Interp_Extend 3

-static void filter_block2d_6(uint8_t *src_ptr,

-                             uint8_t *output_ptr,

-                             unsigned int src_pixels_per_line,

-                             int output_pitch,

-                             const int16_t *HFilter,

-                             const int16_t *VFilter) {

-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr,

-                               output_pitch, 4, 4, 4, 4, VFilter);

-}

-void vp9_sixtap_predict4x4_c(uint8_t *src_ptr,

-                             int src_pixels_per_line,

-                             int xoffset,

-                             int yoffset,

-                             uint8_t *dst_ptr,

-                             int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,

-                   VFilter);

-}

-/*

- * The difference between filter_block2d_6() and filter_block2d_avg_6 is

- * that filter_block2d_6() does a 6-tap filter and stores it in the output

- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and

- * then averages that with the content already present in the output

- * ((filter_result + dest + 1) >> 1) and stores that in the output.

- */

-static void filter_block2d_avg_6(uint8_t *src_ptr,

-                                 uint8_t *output_ptr,

-                                 unsigned int src_pixels_per_line,

-                                 int output_pitch,

-                                 const int16_t *HFilter,

-                                 const int16_t *VFilter) {

-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,

-                                   output_pitch, 4, 4, 4, 4, VFilter);

-}

-void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,

-                       HFilter, VFilter);

-}

-void vp9_sixtap_predict8x8_c(uint8_t *src_ptr,

-                             int src_pixels_per_line,

-                             int xoffset,

-                             int yoffset,

-                             uint8_t *dst_ptr,

-                             int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,

-                               dst_pitch, 8, 8, 8, 8, VFilter);

-}

-void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr,

-                                   dst_pitch, 8, 8, 8, 8, VFilter);

-}

-void vp9_sixtap_predict8x4_c(uint8_t *src_ptr,

-                             int src_pixels_per_line,

-                             int xoffset,

-                             int yoffset,

-                             uint8_t *dst_ptr,

-                             int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,

-                               dst_pitch, 8, 8, 4, 8, VFilter);

-}

-void vp9_sixtap_predict16x16_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr,

-                               dst_pitch, 16, 16, 16, 16, VFilter);

-}

-void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr,

-                                   dst_pitch, 16, 16, 16, 16, VFilter);

-}

-typedef enum {

-  VPX_FILTER_4x4 = 0,

-  VPX_FILTER_8x8 = 1,

-  VPX_FILTER_8x4 = 2,

-  VPX_FILTER_16x16 = 3,

-} filter_size_t;

-static const unsigned int filter_size_to_wh[][2] = {

-  {4, 4},

-  {8, 8},

-  {8, 4},

-  {16,16},

+DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8])

+    = {

+  {0, 0,   0, 128,   0,   0, 0,  0},

+  {0, 1,  -5, 125,   8,  -2, 1,  0},

+  {0, 1,  -8, 122,  17,  -5, 1,  0},

+  {0, 2, -11, 116,  27,  -8, 2,  0},

+  {0, 3, -14, 110,  37, -10, 2,  0},

+  {0, 3, -15, 103,  47, -12, 2,  0},

+  {0, 3, -16,  95,  57, -14, 3,  0},

+  {0, 3, -16,  86,  67, -15, 3,  0},

+  {0, 3, -16,  77,  77, -16, 3,  0},

+  {0, 3, -15,  67,  86, -16, 3,  0},

+  {0, 3, -14,  57,  95, -16, 3,  0},

+  {0, 2, -12,  47, 103, -15, 3,  0},

+  {0, 2, -10,  37, 110, -14, 3,  0},

+  {0, 2,  -8,  27, 116, -11, 2,  0},

+  {0, 1,  -5,  17, 122,  -8, 1,  0},

+  {0, 1,  -2,   8, 125,  -5, 1,  0}

};

-static void filter_block2d_8_c(const uint8_t *src_ptr,

-                               const unsigned int src_stride,

-                               const int16_t *HFilter,

-                               const int16_t *VFilter,

-                               const filter_size_t filter_size,

-                               uint8_t *dst_ptr,

-                               unsigned int dst_stride) {

-  const unsigned int output_width = filter_size_to_wh[filter_size][0];

-  const unsigned int output_height = filter_size_to_wh[filter_size][1];

-  // Between passes, we use an intermediate buffer whose height is extended to

-  // have enough horizontally filtered values as input for the vertical pass.

-  // This buffer is allocated to be big enough for the largest block type we

-  // support.

-  const int kInterp_Extend = 4;

-  const unsigned int intermediate_height =

-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;

-  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,

-   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height

-   *                                 + kInterp_Extend

-   *                               = 3 + 16 + 4

-   *                               = 23

-   * and filter_max_width = 16

-   */

-  uint8_t intermediate_buffer[23 * 16];

-  const int intermediate_next_stride = 1 - intermediate_height * output_width;

-  // Horizontal pass (src -> transposed intermediate).

-  {

-    uint8_t *output_ptr = intermediate_buffer;

-    const int src_next_row_stride = src_stride - output_width;

-    unsigned int i, j;

-    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

-    for (i = 0; i < intermediate_height; i++) {

-      for (j = 0; j < output_width; j++) {

-        // Apply filter...

-        int temp = ((int)src_ptr[0] * HFilter[0]) +

-                   ((int)src_ptr[1] * HFilter[1]) +

-                   ((int)src_ptr[2] * HFilter[2]) +

-                   ((int)src_ptr[3] * HFilter[3]) +

-                   ((int)src_ptr[4] * HFilter[4]) +

-                   ((int)src_ptr[5] * HFilter[5]) +

-                   ((int)src_ptr[6] * HFilter[6]) +

-                   ((int)src_ptr[7] * HFilter[7]) +

-                   (VP9_FILTER_WEIGHT >> 1); // Rounding

-        // Normalize back to 0-255...

-        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);

-        src_ptr++;

-        output_ptr += intermediate_height;

-      }

-      src_ptr += src_next_row_stride;

-      output_ptr += intermediate_next_stride;

-    }

-  }

-  // Vertical pass (transposed intermediate -> dst).

-  {

-    uint8_t *src_ptr = intermediate_buffer;

-    const int dst_next_row_stride = dst_stride - output_width;

-    unsigned int i, j;

-    for (i = 0; i < output_height; i++) {

-      for (j = 0; j < output_width; j++) {

-        // Apply filter...

-        int temp = ((int)src_ptr[0] * VFilter[0]) +

-                   ((int)src_ptr[1] * VFilter[1]) +

-                   ((int)src_ptr[2] * VFilter[2]) +

-                   ((int)src_ptr[3] * VFilter[3]) +

-                   ((int)src_ptr[4] * VFilter[4]) +

-                   ((int)src_ptr[5] * VFilter[5]) +

-                   ((int)src_ptr[6] * VFilter[6]) +

-                   ((int)src_ptr[7] * VFilter[7]) +

-                   (VP9_FILTER_WEIGHT >> 1); // Rounding

-        // Normalize back to 0-255...

-        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);

-        src_ptr += intermediate_height;

-      }

-      src_ptr += intermediate_next_stride;

-      dst_ptr += dst_next_row_stride;

-    }

-  }

-}

-void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr,

-                                const unsigned int src_stride,

-                                const int16_t *HFilter_aligned16,

-                                const int16_t *VFilter_aligned16,

-                                uint8_t *dst_ptr,

-                                unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_4x4, dst_ptr, dst_stride);

-}

-void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr,

-                                const unsigned int src_stride,

-                                const int16_t *HFilter_aligned16,

-                                const int16_t *VFilter_aligned16,

-                                uint8_t *dst_ptr,

-                                unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_8x4, dst_ptr, dst_stride);

-}

-void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr,

-                                const unsigned int src_stride,

-                                const int16_t *HFilter_aligned16,

-                                const int16_t *VFilter_aligned16,

-                                uint8_t *dst_ptr,

-                                unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_8x8, dst_ptr, dst_stride);

-}

-void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr,

-                                  const unsigned int src_stride,

-                                  const int16_t *HFilter_aligned16,

-                                  const int16_t *VFilter_aligned16,

-                                  uint8_t *dst_ptr,

-                                  unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_16x16, dst_ptr, dst_stride);

-}

-static void block2d_average_c(uint8_t *src,

-                              unsigned int src_stride,

-                              uint8_t *output_ptr,

-                              unsigned int output_stride,

-                              const filter_size_t filter_size) {

-  const unsigned int output_width = filter_size_to_wh[filter_size][0];

-  const unsigned int output_height = filter_size_to_wh[filter_size][1];

-  unsigned int i, j;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;

-    }

-    output_ptr += output_stride;

-  }

-}

-#define block2d_average block2d_average_c

-void vp9_eighttap_predict4x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_8[xoffset];

-  VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  uint8_t tmp[4 * 4];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           4);

-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);

-}

-void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_8s[xoffset];

-  VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr,

-                                      int src_pixels_per_line,

-                                      int xoffset,

-                                      int yoffset,

-                                      uint8_t *dst_ptr,

-                                      int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,

-                           HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr,

-                                         int src_pixels_per_line,

-                                         int xoffset,

-                                         int yoffset,

-                                         uint8_t *dst_ptr,

-                                         int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  uint8_t tmp[4 * 4];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           4);

-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);

-}

-void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr,

-                                          int src_pixels_per_line,

-                                          int xoffset,

-                                          int yoffset,

-                                          uint8_t *dst_ptr,

-                                          int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  uint8_t tmp[4 * 4];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           4);

-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);

-}

-void vp9_eighttap_predict8x8_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr,

-                                      int src_pixels_per_line,

-                                      int xoffset,

-                                      int yoffset,

-                                      uint8_t *dst_ptr,

-                                      int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  uint8_t tmp[8 * 8];

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           8);

-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);

-}

-void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr,

-                                         int src_pixels_per_line,

-                                         int xoffset,

-                                         int yoffset,

-                                         uint8_t *dst_ptr,

-                                         int dst_pitch) {

-  uint8_t tmp[8 * 8];

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           8);

-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);

-}

-void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr,

-                                          int src_pixels_per_line,

-                                          int xoffset,

-                                          int yoffset,

-                                          uint8_t *dst_ptr,

-                                          int dst_pitch) {

-  uint8_t tmp[8 * 8];

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           8);

-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);

-}

-void vp9_eighttap_predict8x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr,

-                                      int src_pixels_per_line,

-                                      int xoffset,

-                                      int yoffset,

-                                      uint8_t *dst_ptr,

-                                      int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict16x16_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr,

-                                       int src_pixels_per_line,

-                                       int xoffset,

-                                       int yoffset,

-                                       uint8_t *dst_ptr,

-                                       int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr,

-                                        int src_pixels_per_line,

-                                        int xoffset,

-                                        int yoffset,

-                                        uint8_t *dst_ptr,

-                                        int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             tmp, 16);

-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);

-}

-void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr,

-                                           int src_pixels_per_line,

-                                           int xoffset,

-                                           int yoffset,

-                                           uint8_t *dst_ptr,

-                                           int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             tmp, 16);

-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);

-}

-void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr,

-                                            int src_pixels_per_line,

-                                            int xoffset,

-                                            int yoffset,

-                                            uint8_t *dst_ptr,

-                                            int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             tmp, 16);

-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil_first_pass

- *

- *  INPUTS        : uint8_t  *src_ptr    : Pointer to source block.

- *                  uint32_t  src_stride : Stride of source block.

- *                  uint32_t  height     : Block height.

- *                  uint32_t  width      : Block width.

- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.

- *

- *  OUTPUTS       : int32_t  *dst_ptr    : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block

- *                  in the horizontal direction to produce the filtered output

- *                  block. Used to implement first-pass of 2-D separable filter.

- *

- *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.

- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.

- *

- ****************************************************************************/

-static void filter_block2d_bil_first_pass(uint8_t *src_ptr,

-                                          uint16_t *dst_ptr,

-                                          unsigned int src_stride,

-                                          unsigned int height,

-                                          unsigned int width,

-                                          const int16_t *vp9_filter) {

-  unsigned int i, j;

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      /* Apply bilinear filter */

-      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +

-                    ((int)src_ptr[1] * vp9_filter[1]) +

-                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;

-      src_ptr++;

-    }

-    /* Next row... */

-    src_ptr += src_stride - width;

-    dst_ptr += width;

-  }

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil_second_pass

- *

- *  INPUTS        : int32_t  *src_ptr    : Pointer to source block.

- *                  uint32_t  dst_pitch  : Destination block pitch.

- *                  uint32_t  height     : Block height.

- *                  uint32_t  width      : Block width.

- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.

- *

- *  OUTPUTS       : uint16_t *dst_ptr    : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block

- *                  in the vertical direction to produce the filtered output

- *                  block. Used to implement second-pass of 2-D separable filter.

- *

- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.

- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.

- *

- ****************************************************************************/

-static void filter_block2d_bil_second_pass(uint16_t *src_ptr,

-                                           uint8_t *dst_ptr,

-                                           int dst_pitch,

-                                           unsigned int height,

-                                           unsigned int width,

-                                           const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      /* Apply filter */

-      temp = ((int)src_ptr[0]     * vp9_filter[0]) +

-             ((int)src_ptr[width] * vp9_filter[1]) +

-             (VP9_FILTER_WEIGHT / 2);

-      dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT);

-      src_ptr++;

-    }

-    /* Next row... */

-    dst_ptr += dst_pitch;

-  }

-}

-/*

- * As before for filter_block2d_second_pass_avg(), the functional difference

- * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg()

- * is that filter_block2d_bil_second_pass() does a bilinear filter on input

- * and stores the result in output; filter_block2d_bil_second_pass_avg(),

- * instead, does a bilinear filter on input, averages the resulting value

- * with the values already present in the output and stores the result of

- * that back into the output ((filter_result + dest + 1) >> 1).

- */

-static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr,

-                                               uint8_t *dst_ptr,

-                                               int dst_pitch,

-                                               unsigned int height,

-                                               unsigned int width,

-                                               const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      /* Apply filter */

-      temp = (((int)src_ptr[0]     * vp9_filter[0]) +

-              ((int)src_ptr[width] * vp9_filter[1]) +

-              (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;

-      dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1);

-      src_ptr++;

-    }

-    /* Next row... */

-    dst_ptr += dst_pitch;

-  }

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil

- *

- *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.

- *                  uint32_t  src_pitch        : Stride of source block.

- *                  uint32_t  dst_pitch        : Stride of destination block.

- *                  int32_t  *HFilter          : Array of 2 horizontal filter taps.

- *                  int32_t  *VFilter          : Array of 2 vertical filter taps.

- *                  int32_t  Width             : Block width

- *                  int32_t  Height            : Block height

- *

- *  OUTPUTS       : uint16_t *dst_ptr       : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : 2-D filters an input block by applying a 2-tap

- *                  bi-linear filter horizontally followed by a 2-tap

- *                  bi-linear filter vertically on the result.

- *

- *  SPECIAL NOTES : The largest block size can be handled here is 16x16

- *

- ****************************************************************************/

-static void filter_block2d_bil(uint8_t *src_ptr,

-                               uint8_t *dst_ptr,

-                               unsigned int src_pitch,

-                               unsigned int dst_pitch,

-                               const int16_t *HFilter,

-                               const int16_t *VFilter,

-                               int Width,

-                               int Height) {

-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */

-  /* First filter 1-D horizontally... */

-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

-  /* then 1-D vertically... */

-  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);

-}

-static void filter_block2d_bil_avg(uint8_t *src_ptr,

-                                   uint8_t *dst_ptr,

-                                   unsigned int src_pitch,

-                                   unsigned int dst_pitch,

-                                   const int16_t *HFilter,

-                                   const int16_t *VFilter,

-                                   int Width,

-                                   int Height) {

-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */

-  /* First filter 1-D horizontally... */

-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

-  /* then 1-D vertically... */

-  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);

-}

-void vp9_bilinear_predict4x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);

-}

-void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,

-                         dst_pitch, HFilter, VFilter, 4, 4);

-}

-void vp9_bilinear_predict8x8_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

-}

-void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,

-                         dst_pitch, HFilter, VFilter, 8, 8);

-}

-void vp9_bilinear_predict8x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

-}

-void vp9_bilinear_predict16x16_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);

-}

-void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,

-                         dst_pitch, HFilter, VFilter, 16, 16);

-}

--- a/vp9/common/vp9_filter.h

+++ b/vp9/common/vp9_filter.h

@@ -21,10 +21,17 @@

 #define SUBPEL_SHIFTS 16

-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];

-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];

+extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];

+extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];

 extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];

 extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];

 extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];

+// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear

+// filter kernel as a 2 tap filter.

+#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \

+                   sizeof(vp9_bilinear_filters[0][0]))

+#define BF_OFFSET (BF_LENGTH / 2 - 1)

+#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)

 #endif  // VP9_COMMON_VP9_FILTER_H_

--- a/vp9/common/vp9_findnearmv.c

+++ b/vp9/common/vp9_findnearmv.c

@@ -9,10 +9,11 @@

*/

+#include <limits.h>

 #include "vp9/common/vp9_findnearmv.h"

 #include "vp9/common/vp9_sadmxn.h"

 #include "vp9/common/vp9_subpelvar.h"

-#include <limits.h>

 const uint8_t vp9_mbsplit_offset[4][16] = {

   { 0,  8,  0,  0,  0,  0,  0,  0,  0,  0,   0,  0,  0,  0,  0,  0},

@@ -32,8 +33,7 @@

 vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,

-                           vp9_prob p[4], const int context

-                          ) {

+                           vp9_prob p[4], const int context) {

   p[0] = pc->fc.vp9_mode_contexts[context][0];

   p[1] = pc->fc.vp9_mode_contexts[context][1];

   p[2] = pc->fc.vp9_mode_contexts[context][2];

@@ -87,8 +87,8 @@

   uint8_t temp2[2 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3,

                                     src_pixels_per_line, 1, 3, 16, HFilter);

@@ -108,8 +108,8 @@

   uint8_t temp2[2 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3,

                                     src_pixels_per_line, 1, 17, 2, HFilter);

@@ -118,10 +118,12 @@

   return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);

+#if CONFIG_USESELECTREFMV

 /* check a list of motion vectors by sad score using a number rows of pixels

  * above and a number cols of pixels in the left to select the one with best

  * score to use as ref motion vector

*/

 void vp9_find_best_ref_mvs(MACROBLOCKD *xd,

                            uint8_t *ref_y_buffer,

                            int ref_y_stride,

@@ -141,130 +143,140 @@

   int_mv sorted_mvs[MAX_MV_REF_CANDIDATES];

   int zero_seen = FALSE;

-  // Default all to 0,0 if nothing else available

-  nearest->as_int = near->as_int = 0;

-  vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));

+  if (ref_y_buffer) {

-  above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;

-  above_ref = ref_y_buffer - ref_y_stride * 2;

+    // Default all to 0,0 if nothing else available

+    nearest->as_int = near->as_int = 0;

+    vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));

+    above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;

+    above_ref = ref_y_buffer - ref_y_stride * 2;

 #if CONFIG_ABOVESPREFMV

-  above_src -= 4;

-  above_ref -= 4;

+    above_src -= 4;

+    above_ref -= 4;

 #else

-  left_src  = xd->dst.y_buffer - 2;

-  left_ref  = ref_y_buffer - 2;

+    left_src  = xd->dst.y_buffer - 2;

+    left_ref  = ref_y_buffer - 2;

 #endif

-  // Limit search to the predicted best few candidates

-  for(i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {

-    int_mv this_mv;

-    int offset = 0;

-    int row_offset, col_offset;

+    // Limit search to the predicted best few candidates

+    for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {

+      int_mv this_mv;

+      int offset = 0;

+      int row_offset, col_offset;

-    this_mv.as_int = mvlist[i].as_int;

+      this_mv.as_int = mvlist[i].as_int;

-    // If we see a 0,0 vector for a second time we have reached the end of

-    // the list of valid candidate vectors.

-    if (!this_mv.as_int && zero_seen)

-      break;

+      // If we see a 0,0 vector for a second time we have reached the end of

+      // the list of valid candidate vectors.

+      if (!this_mv.as_int && zero_seen)

+        break;

-    zero_seen = zero_seen || !this_mv.as_int;

+      zero_seen = zero_seen || !this_mv.as_int;

 #if !CONFIG_ABOVESPREFMV

-    clamp_mv(&this_mv,

-             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,

-             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,

-             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,

-             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);

+      clamp_mv(&this_mv,

+               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24,

+               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,

+               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,

+               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);

 #else

-    clamp_mv(&this_mv,

-             xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,

-             xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,

-             xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,

-             xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);

+      clamp_mv(&this_mv,

+               xd->mb_to_left_edge - LEFT_TOP_MARGIN + 32,

+               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,

+               xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24,

+               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);

 #endif

-    row_offset = this_mv.as_mv.row >> 3;

-    col_offset = this_mv.as_mv.col >> 3;

-    offset = ref_y_stride * row_offset + col_offset;

-    score = 0;

-    if (xd->up_available) {

-      vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,

-                                 SP(this_mv.as_mv.col),

-                                 SP(this_mv.as_mv.row),

-                                 above_src, xd->dst.y_stride, &sse);

-      score += sse;

-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {

-        vp9_sub_pixel_variance16x2(above_ref + offset + 16,

-                                   ref_y_stride,

-                                   SP(this_mv.as_mv.col),

-                                   SP(this_mv.as_mv.row),

-                                   above_src + 16, xd->dst.y_stride, &sse);

-        score += sse;

-      }

-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {

-        vp9_sub_pixel_variance16x2(above_ref + offset + 32,

-                                   ref_y_stride,

-                                   SP(this_mv.as_mv.col),

-                                   SP(this_mv.as_mv.row),

-                                   above_src + 32, xd->dst.y_stride, &sse);

-        score += sse;

-        vp9_sub_pixel_variance16x2(above_ref + offset + 48,

-                                   ref_y_stride,

-                                   SP(this_mv.as_mv.col),

-                                   SP(this_mv.as_mv.row),

-                                   above_src + 48, xd->dst.y_stride, &sse);

-        score += sse;

-      }

-    }

+      row_offset = this_mv.as_mv.row >> 3;

+      col_offset = this_mv.as_mv.col >> 3;

+      offset = ref_y_stride * row_offset + col_offset;

+      score = 0;

 #if !CONFIG_ABOVESPREFMV

-    if (xd->left_available) {

-      vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,

+      if (xd->up_available) {

+#else

+      if (xd->up_available && xd->left_available) {

+#endif

+        vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,

                                    SP(this_mv.as_mv.col),

                                    SP(this_mv.as_mv.row),

-                                   left_src, xd->dst.y_stride, &sse);

-      score += sse;

-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {

-        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,

+                                   above_src, xd->dst.y_stride, &sse);

+        score += sse;

+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {

+          vp9_sub_pixel_variance16x2(above_ref + offset + 16,

                                      ref_y_stride,

                                      SP(this_mv.as_mv.col),

                                      SP(this_mv.as_mv.row),

-                                     left_src + xd->dst.y_stride * 16,

-                                     xd->dst.y_stride, &sse);

-        score += sse;

-      }

-      if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {

-        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,

+                                     above_src + 16, xd->dst.y_stride, &sse);

+          score += sse;

+        }

+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {

+          vp9_sub_pixel_variance16x2(above_ref + offset + 32,

                                      ref_y_stride,

                                      SP(this_mv.as_mv.col),

                                      SP(this_mv.as_mv.row),

-                                     left_src + xd->dst.y_stride * 32,

-                                     xd->dst.y_stride, &sse);

-        score += sse;

-        vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,

+                                     above_src + 32, xd->dst.y_stride, &sse);

+          score += sse;

+          vp9_sub_pixel_variance16x2(above_ref + offset + 48,

                                      ref_y_stride,

                                      SP(this_mv.as_mv.col),

                                      SP(this_mv.as_mv.row),

-                                     left_src + xd->dst.y_stride * 48,

-                                     xd->dst.y_stride, &sse);

+                                     above_src + 48, xd->dst.y_stride, &sse);

+          score += sse;

+        }

+      }

+#if !CONFIG_ABOVESPREFMV

+      if (xd->left_available) {

+        vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride,

+                                     SP(this_mv.as_mv.col),

+                                     SP(this_mv.as_mv.row),

+                                     left_src, xd->dst.y_stride, &sse);

         score += sse;

+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {

+          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16,

+                                       ref_y_stride,

+                                       SP(this_mv.as_mv.col),

+                                       SP(this_mv.as_mv.row),

+                                       left_src + xd->dst.y_stride * 16,

+                                       xd->dst.y_stride, &sse);

+          score += sse;

+        }

+        if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {

+          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 32,

+                                     ref_y_stride,

+                                       SP(this_mv.as_mv.col),

+                                       SP(this_mv.as_mv.row),

+                                       left_src + xd->dst.y_stride * 32,

+                                       xd->dst.y_stride, &sse);

+          score += sse;

+          vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 48,

+                                       ref_y_stride,

+                                       SP(this_mv.as_mv.col),

+                                       SP(this_mv.as_mv.row),

+                                       left_src + xd->dst.y_stride * 48,

+                                       xd->dst.y_stride, &sse);

+          score += sse;

+        }

-    }

 #endif

-    // Add the entry to our list and then resort the list on score.

-    ref_scores[i] = score;

-    sorted_mvs[i].as_int = this_mv.as_int;

-    j = i;

-    while (j > 0) {

-      if (ref_scores[j] < ref_scores[j-1]) {

-        ref_scores[j] = ref_scores[j-1];

-        sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;

-        ref_scores[j-1] = score;

-        sorted_mvs[j-1].as_int = this_mv.as_int;

-        j--;

-      } else

-        break;

+      // Add the entry to our list and then resort the list on score.

+      ref_scores[i] = score;

+      sorted_mvs[i].as_int = this_mv.as_int;

+      j = i;

+      while (j > 0) {

+        if (ref_scores[j] < ref_scores[j-1]) {

+          ref_scores[j] = ref_scores[j-1];

+          sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;

+          ref_scores[j-1] = score;

+          sorted_mvs[j-1].as_int = this_mv.as_int;

+          j--;

+        } else {

+          break;

+        }

+      }

+  } else {

+    vpx_memcpy(sorted_mvs, mvlist, sizeof(sorted_mvs));

   // Make sure all the candidates are properly clamped etc

@@ -273,23 +285,35 @@

     clamp_mv2(&sorted_mvs[i], xd);

-  // Provided that there are non zero vectors available there will not

-  // be more than one 0,0 entry in the sorted list.

-  // The best ref mv is always set to the first entry (which gave the best

-  // results. The nearest is set to the first non zero vector if available and

-  // near to the second non zero vector if available.

-  // We do not use 0,0 as a nearest or near as 0,0 has its own mode.

-  if ( sorted_mvs[0].as_int ) {

-    nearest->as_int = sorted_mvs[0].as_int;

-    if ( sorted_mvs[1].as_int )

-      near->as_int = sorted_mvs[1].as_int;

-    else

-      near->as_int = sorted_mvs[2].as_int;

+  // Nearest may be a 0,0 or non zero vector and now matches the chosen

+  // "best reference". This has advantages when it is used as part of a

+  // compound predictor as it means a non zero vector can be paired using

+  // this mode with a 0 vector. The Near vector is still forced to be a

+  // non zero candidate if one is avaialble.

+  nearest->as_int = sorted_mvs[0].as_int;

+  if ( sorted_mvs[1].as_int ) {

+    near->as_int = sorted_mvs[1].as_int;

   } else {

-      nearest->as_int = sorted_mvs[1].as_int;

-      near->as_int = sorted_mvs[2].as_int;

+    near->as_int = sorted_mvs[2].as_int;

   // Copy back the re-ordered mv list

   vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));

+#else

+void vp9_find_best_ref_mvs(MACROBLOCKD *xd,

+                           uint8_t *ref_y_buffer,

+                           int ref_y_stride,

+                           int_mv *mvlist,

+                           int_mv *nearest,

+                           int_mv *near) {

+  int i;

+  // Make sure all the candidates are properly clamped etc

+  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {

+    lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);

+    clamp_mv2(&mvlist[i], xd);

+  }

+  *nearest = mvlist[0];

+  *near = mvlist[1];

+}

+#endif

--- a/vp9/common/vp9_findnearmv.h

+++ b/vp9/common/vp9_findnearmv.h

@@ -17,6 +17,9 @@

 #include "vp9/common/vp9_treecoder.h"

 #include "vp9/common/vp9_onyxc_int.h"

+#define LEFT_TOP_MARGIN (16 << 3)

+#define RIGHT_BOTTOM_MARGIN (16 << 3)

 /* check a list of motion vectors by sad score using a number rows of pixels

  * above and a number cols of pixels in the left to select the one with best

  * score to use as ref motion vector

@@ -28,9 +31,9 @@

                            int_mv *nearest,

                            int_mv *near);

-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) {

-  MV xmv;

-  xmv = mvp->as_mv;

+static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,

+                    int_mv *mvp, const int *ref_frame_sign_bias) {

+  MV xmv = mvp->as_mv;

   if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {

     xmv.row *= -1;

@@ -40,8 +43,6 @@

   mvp->as_mv = xmv;

-#define LEFT_TOP_MARGIN (16 << 3)

-#define RIGHT_BOTTOM_MARGIN (16 << 3)

 static void clamp_mv(int_mv *mv,

                      int mb_to_left_edge,

@@ -71,10 +72,10 @@

                                     int mb_to_right_edge,

                                     int mb_to_top_edge,

                                     int mb_to_bottom_edge) {

-  return (mv->as_mv.col < mb_to_left_edge) ||

-         (mv->as_mv.col > mb_to_right_edge) ||

-         (mv->as_mv.row < mb_to_top_edge) ||

-         (mv->as_mv.row > mb_to_bottom_edge);

+  return mv->as_mv.col < mb_to_left_edge ||

+         mv->as_mv.col > mb_to_right_edge ||

+         mv->as_mv.row < mb_to_top_edge ||

+         mv->as_mv.row > mb_to_bottom_edge;

 vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,

@@ -83,21 +84,30 @@

 extern const uint8_t vp9_mbsplit_offset[4][16];

-static int left_block_mv(const MODE_INFO *cur_mb, int b) {

+static int left_block_mv(const MACROBLOCKD *xd,

+                         const MODE_INFO *cur_mb, int b) {

   if (!(b & 3)) {

-    /* On L edge, get from MB to left of us */

+    if (!xd->left_available)

+      return 0;

+    // On L edge, get from MB to left of us

     --cur_mb;

     if (cur_mb->mbmi.mode != SPLITMV)

       return cur_mb->mbmi.mv[0].as_int;

     b += 4;

-  return (cur_mb->bmi + b - 1)->as_mv.first.as_int;

+  return (cur_mb->bmi + b - 1)->as_mv[0].as_int;

-static int left_block_second_mv(const MODE_INFO *cur_mb, int b) {

+static int left_block_second_mv(const MACROBLOCKD *xd,

+                                const MODE_INFO *cur_mb, int b) {

   if (!(b & 3)) {

+    if (!xd->left_available)

+      return 0;

     /* On L edge, get from MB to left of us */

     --cur_mb;

@@ -108,8 +118,8 @@

   return cur_mb->mbmi.second_ref_frame > 0 ?

-      (cur_mb->bmi + b - 1)->as_mv.second.as_int :

-      (cur_mb->bmi + b - 1)->as_mv.first.as_int;

+      (cur_mb->bmi + b - 1)->as_mv[1].as_int :

+      (cur_mb->bmi + b - 1)->as_mv[0].as_int;

 static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {

@@ -122,7 +132,7 @@

     b += 16;

-  return (cur_mb->bmi + b - 4)->as_mv.first.as_int;

+  return (cur_mb->bmi + b - 4)->as_mv[0].as_int;

 static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) {

@@ -137,8 +147,8 @@

   return cur_mb->mbmi.second_ref_frame > 0 ?

-      (cur_mb->bmi + b - 4)->as_mv.second.as_int :

-      (cur_mb->bmi + b - 4)->as_mv.first.as_int;

+      (cur_mb->bmi + b - 4)->as_mv[1].as_int :

+      (cur_mb->bmi + b - 4)->as_mv[0].as_int;

 static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {

--- /dev/null

+++ b/vp9/common/vp9_idct.c

@@ -1,0 +1,1307 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <assert.h>

+#include <math.h>

+#include "./vpx_config.h"

+#include "./vp9_rtcd.h"

+#include "vp9/common/vp9_systemdependent.h"

+#include "vp9/common/vp9_blockd.h"

+#include "vp9/common/vp9_common.h"

+#include "vp9/common/vp9_idct.h"

+void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {

+  int i;

+  int a1, b1, c1, d1;

+  int16_t *ip = input;

+  int16_t *op = output;

+  const int half_pitch = pitch >> 1;

+  for (i = 0; i < 4; i++) {

+    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;

+    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;

+    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;

+    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;

+    op[0] = (a1 + b1 + 1) >> 1;

+    op[1] = (c1 + d1) >> 1;

+    op[2] = (a1 - b1) >> 1;

+    op[3] = (d1 - c1) >> 1;

+    ip += 4;

+    op += half_pitch;

+  }

+  ip = output;

+  op = output;

+  for (i = 0; i < 4; i++) {

+    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];

+    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];

+    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];

+    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];

+    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;

+    op[half_pitch * 1] = (c1 + d1) >> 1;

+    op[half_pitch * 2] = (a1 - b1) >> 1;

+    op[half_pitch * 3] = (d1 - c1) >> 1;

+    ip++;

+    op++;

+  }

+}

+void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {

+  int i;

+  int16_t tmp[4];

+  int16_t *ip = in;

+  int16_t *op = tmp;

+  const int half_pitch = pitch >> 1;

+  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;

+  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;

+  ip = tmp;

+  op = out;

+  for (i = 0; i < 4; i++) {

+    op[half_pitch * 0] = (ip[0] + 1) >> 1;

+    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;

+    ip++;

+    op++;

+  }

+}

+void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,

+                                 uint8_t *dst_ptr,

+                                 int pitch, int stride) {

+  int r, c;

+  int16_t dc = input_dc;

+  int16_t tmp[4 * 4];

+  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);

+  for (r = 0; r < 4; r++) {

+    for (c = 0; c < 4; c++)

+      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);

+    dst_ptr += stride;

+    pred_ptr += pitch;

+  }

+}

+void vp9_idct4_1d_c(int16_t *input, int16_t *output) {

+  int16_t step[4];

+  int temp1, temp2;

+  // stage 1

+  temp1 = (input[0] + input[2]) * cospi_16_64;

+  temp2 = (input[0] - input[2]) * cospi_16_64;

+  step[0] = dct_const_round_shift(temp1);

+  step[1] = dct_const_round_shift(temp2);

+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;

+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;

+  step[2] = dct_const_round_shift(temp1);

+  step[3] = dct_const_round_shift(temp2);

+  // stage 2

+  output[0] = step[0] + step[3];

+  output[1] = step[1] + step[2];

+  output[2] = step[1] - step[2];

+  output[3] = step[0] - step[3];

+}

+void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {

+  int16_t out[4 * 4];

+  int16_t *outptr = out;

+  const int half_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[4], temp_out[4];

+  // Rows

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = input[j];

+    vp9_idct4_1d(temp_in, outptr);

+    input += 4;

+    outptr += 4;

+  }

+  // Columns

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = out[j * 4 + i];

+    vp9_idct4_1d(temp_in, temp_out);

+    for (j = 0; j < 4; ++j)

+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);

+  }

+}

+void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {

+  int i;

+  int a1;

+  int16_t *op = output;

+  const int half_pitch = pitch >> 1;

+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  out = dct_const_round_shift(out * cospi_16_64);

+  a1 = ROUND_POWER_OF_TWO(out, 4);

+  for (i = 0; i < 4; i++) {

+    op[0] = op[1] = op[2] = op[3] = a1;

+    op += half_pitch;

+  }

+}

+void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,

+                            uint8_t *dst_ptr, int pitch, int stride) {

+  int a1;

+  int r, c;

+  int16_t out = dct_const_round_shift(input_dc * cospi_16_64);

+  out = dct_const_round_shift(out * cospi_16_64);

+  a1 = ROUND_POWER_OF_TWO(out, 4);

+  for (r = 0; r < 4; r++) {

+    for (c = 0; c < 4; c++)

+      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);

+    dst_ptr += stride;

+    pred_ptr += pitch;

+  }

+}

+static void idct8_1d(int16_t *input, int16_t *output) {

+  int16_t step1[8], step2[8];

+  int temp1, temp2;

+  // stage 1

+  step1[0] = input[0];

+  step1[2] = input[4];

+  step1[1] = input[2];

+  step1[3] = input[6];

+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;

+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;

+  step1[4] = dct_const_round_shift(temp1);

+  step1[7] = dct_const_round_shift(temp2);

+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;

+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;

+  step1[5] = dct_const_round_shift(temp1);

+  step1[6] = dct_const_round_shift(temp2);

+  // stage 2 & stage 3 - even half

+  vp9_idct4_1d(step1, step1);

+  // stage 2 - odd half

+  step2[4] = step1[4] + step1[5];

+  step2[5] = step1[4] - step1[5];

+  step2[6] = -step1[6] + step1[7];

+  step2[7] = step1[6] + step1[7];

+  // stage 3 -odd half

+  step1[4] = step2[4];

+  temp1 = (step2[6] - step2[5]) * cospi_16_64;

+  temp2 = (step2[5] + step2[6]) * cospi_16_64;

+  step1[5] = dct_const_round_shift(temp1);

+  step1[6] = dct_const_round_shift(temp2);

+  step1[7] = step2[7];

+  // stage 4

+  output[0] = step1[0] + step1[7];

+  output[1] = step1[1] + step1[6];

+  output[2] = step1[2] + step1[5];

+  output[3] = step1[3] + step1[4];

+  output[4] = step1[3] - step1[4];

+  output[5] = step1[2] - step1[5];

+  output[6] = step1[1] - step1[6];

+  output[7] = step1[0] - step1[7];

+}

+void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {

+  int16_t out[8 * 8];

+  int16_t *outptr = out;

+  const int half_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[8], temp_out[8];

+  // Rows

+  for (i = 0; i < 8; ++i) {

+    idct8_1d(input, outptr);

+    input += 8;

+    outptr += 8;

+  }

+  // Columns

+  for (i = 0; i < 8; ++i) {

+    for (j = 0; j < 8; ++j)

+      temp_in[j] = out[j * 8 + i];

+    idct8_1d(temp_in, temp_out);

+    for (j = 0; j < 8; ++j)

+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);

+  }

+}

+static void iadst4_1d(int16_t *input, int16_t *output) {

+  int s0, s1, s2, s3, s4, s5, s6, s7;

+  int x0 = input[0];

+  int x1 = input[1];

+  int x2 = input[2];

+  int x3 = input[3];

+  if (!(x0 | x1 | x2 | x3)) {

+    output[0] = output[1] = output[2] = output[3] = 0;

+    return;

+  }

+  s0 = sinpi_1_9 * x0;

+  s1 = sinpi_2_9 * x0;

+  s2 = sinpi_3_9 * x1;

+  s3 = sinpi_4_9 * x2;

+  s4 = sinpi_1_9 * x2;

+  s5 = sinpi_2_9 * x3;

+  s6 = sinpi_4_9 * x3;

+  s7 = x0 - x2 + x3;

+  x0 = s0 + s3 + s5;

+  x1 = s1 - s4 - s6;

+  x2 = sinpi_3_9 * s7;

+  x3 = s2;

+  s0 = x0 + x3;

+  s1 = x1 + x3;

+  s2 = x2;

+  s3 = x0 + x1 - x3;

+  // 1-D transform scaling factor is sqrt(2).

+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)

+  // + 1b (addition) = 29b.

+  // Hence the output bit depth is 15b.

+  output[0] = dct_const_round_shift(s0);

+  output[1] = dct_const_round_shift(s1);

+  output[2] = dct_const_round_shift(s2);

+  output[3] = dct_const_round_shift(s3);

+}

+void vp9_short_iht4x4_c(int16_t *input, int16_t *output,

+                        int pitch, int tx_type) {

+  const transform_2d IHT_4[] = {

+    { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0

+    { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1

+    { vp9_idct4_1d,  iadst4_1d },      // DCT_ADST = 2

+    { iadst4_1d, iadst4_1d }           // ADST_ADST = 3

+  };

+  int i, j;

+  int16_t out[4 * 4];

+  int16_t *outptr = out;

+  int16_t temp_in[4], temp_out[4];

+  // inverse transform row vectors

+  for (i = 0; i < 4; ++i) {

+    IHT_4[tx_type].rows(input, outptr);

+    input  += 4;

+    outptr += 4;

+  }

+  // inverse transform column vectors

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = out[j * 4 + i];

+    IHT_4[tx_type].cols(temp_in, temp_out);

+    for (j = 0; j < 4; ++j)

+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);

+  }

+}

+static void iadst8_1d(int16_t *input, int16_t *output) {

+  int s0, s1, s2, s3, s4, s5, s6, s7;

+  int x0 = input[7];

+  int x1 = input[0];

+  int x2 = input[5];

+  int x3 = input[2];

+  int x4 = input[3];

+  int x5 = input[4];

+  int x6 = input[1];

+  int x7 = input[6];

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {

+    output[0] = output[1] = output[2] = output[3] = output[4]

+              = output[5] = output[6] = output[7] = 0;

+    return;

+  }

+  // stage 1

+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;

+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;

+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;

+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

+  x0 = dct_const_round_shift(s0 + s4);

+  x1 = dct_const_round_shift(s1 + s5);

+  x2 = dct_const_round_shift(s2 + s6);

+  x3 = dct_const_round_shift(s3 + s7);

+  x4 = dct_const_round_shift(s0 - s4);

+  x5 = dct_const_round_shift(s1 - s5);

+  x6 = dct_const_round_shift(s2 - s6);

+  x7 = dct_const_round_shift(s3 - s7);

+  // stage 2

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;

+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;

+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;

+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

+  x0 = s0 + s2;

+  x1 = s1 + s3;

+  x2 = s0 - s2;

+  x3 = s1 - s3;

+  x4 = dct_const_round_shift(s4 + s6);

+  x5 = dct_const_round_shift(s5 + s7);

+  x6 = dct_const_round_shift(s4 - s6);

+  x7 = dct_const_round_shift(s5 - s7);

+  // stage 3

+  s2 = cospi_16_64 * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (x6 - x7);

+  x2 = dct_const_round_shift(s2);

+  x3 = dct_const_round_shift(s3);

+  x6 = dct_const_round_shift(s6);

+  x7 = dct_const_round_shift(s7);

+  output[0] =  x0;

+  output[1] = -x4;

+  output[2] =  x6;

+  output[3] = -x2;

+  output[4] =  x3;

+  output[5] = -x7;

+  output[6] =  x5;

+  output[7] = -x1;

+}

+static const transform_2d IHT_8[] = {

+  { idct8_1d,  idct8_1d  },  // DCT_DCT  = 0

+  { iadst8_1d, idct8_1d  },  // ADST_DCT = 1

+  { idct8_1d,  iadst8_1d },  // DCT_ADST = 2

+  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3

+};

+void vp9_short_iht8x8_c(int16_t *input, int16_t *output,

+                        int pitch, int tx_type) {

+  int i, j;

+  int16_t out[8 * 8];

+  int16_t *outptr = out;

+  int16_t temp_in[8], temp_out[8];

+  const transform_2d ht = IHT_8[tx_type];

+  // inverse transform row vectors

+  for (i = 0; i < 8; ++i) {

+    ht.rows(input, outptr);

+    input += 8;

+    outptr += 8;

+  }

+  // inverse transform column vectors

+  for (i = 0; i < 8; ++i) {

+    for (j = 0; j < 8; ++j)

+      temp_in[j] = out[j * 8 + i];

+    ht.cols(temp_in, temp_out);

+    for (j = 0; j < 8; ++j)

+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);

+  }

+}

+void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {

+  int16_t out[8 * 8];

+  int16_t *outptr = out;

+  const int half_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[8], temp_out[8];

+  vpx_memset(out, 0, sizeof(out));

+  // First transform rows

+  // only first 4 row has non-zero coefs

+  for (i = 0; i < 4; ++i) {

+    idct8_1d(input, outptr);

+    input += 8;

+    outptr += 8;

+  }

+  // Then transform columns

+  for (i = 0; i < 8; ++i) {

+    for (j = 0; j < 8; ++j)

+      temp_in[j] = out[j * 8 + i];

+    idct8_1d(temp_in, temp_out);

+    for (j = 0; j < 8; ++j)

+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);

+  }

+}

+void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {

+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  out = dct_const_round_shift(out * cospi_16_64);

+  output[0] = ROUND_POWER_OF_TWO(out, 5);

+}

+static void idct16_1d(int16_t *input, int16_t *output) {

+  int16_t step1[16], step2[16];

+  int temp1, temp2;

+  // stage 1

+  step1[0] = input[0/2];

+  step1[1] = input[16/2];

+  step1[2] = input[8/2];

+  step1[3] = input[24/2];

+  step1[4] = input[4/2];

+  step1[5] = input[20/2];

+  step1[6] = input[12/2];

+  step1[7] = input[28/2];

+  step1[8] = input[2/2];

+  step1[9] = input[18/2];

+  step1[10] = input[10/2];

+  step1[11] = input[26/2];

+  step1[12] = input[6/2];

+  step1[13] = input[22/2];

+  step1[14] = input[14/2];

+  step1[15] = input[30/2];

+  // stage 2

+  step2[0] = step1[0];

+  step2[1] = step1[1];

+  step2[2] = step1[2];

+  step2[3] = step1[3];

+  step2[4] = step1[4];

+  step2[5] = step1[5];

+  step2[6] = step1[6];

+  step2[7] = step1[7];

+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;

+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;

+  step2[8] = dct_const_round_shift(temp1);

+  step2[15] = dct_const_round_shift(temp2);

+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;

+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;

+  step2[9] = dct_const_round_shift(temp1);

+  step2[14] = dct_const_round_shift(temp2);

+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;

+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;

+  step2[10] = dct_const_round_shift(temp1);

+  step2[13] = dct_const_round_shift(temp2);

+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;

+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;

+  step2[11] = dct_const_round_shift(temp1);

+  step2[12] = dct_const_round_shift(temp2);

+  // stage 3

+  step1[0] = step2[0];

+  step1[1] = step2[1];

+  step1[2] = step2[2];

+  step1[3] = step2[3];

+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;

+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;

+  step1[4] = dct_const_round_shift(temp1);

+  step1[7] = dct_const_round_shift(temp2);

+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;

+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;

+  step1[5] = dct_const_round_shift(temp1);

+  step1[6] = dct_const_round_shift(temp2);

+  step1[8] = step2[8] + step2[9];

+  step1[9] = step2[8] - step2[9];

+  step1[10] = -step2[10] + step2[11];

+  step1[11] = step2[10] + step2[11];

+  step1[12] = step2[12] + step2[13];

+  step1[13] = step2[12] - step2[13];

+  step1[14] = -step2[14] + step2[15];

+  step1[15] = step2[14] + step2[15];

+  temp1 = (step1[0] + step1[1]) * cospi_16_64;

+  temp2 = (step1[0] - step1[1]) * cospi_16_64;

+  step2[0] = dct_const_round_shift(temp1);

+  step2[1] = dct_const_round_shift(temp2);

+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;

+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;

+  step2[2] = dct_const_round_shift(temp1);

+  step2[3] = dct_const_round_shift(temp2);

+  step2[4] = step1[4] + step1[5];

+  step2[5] = step1[4] - step1[5];

+  step2[6] = -step1[6] + step1[7];

+  step2[7] = step1[6] + step1[7];

+  step2[8] = step1[8];

+  step2[15] = step1[15];

+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;

+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;

+  step2[9] = dct_const_round_shift(temp1);

+  step2[14] = dct_const_round_shift(temp2);

+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;

+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;

+  step2[10] = dct_const_round_shift(temp1);

+  step2[13] = dct_const_round_shift(temp2);

+  step2[11] = step1[11];

+  step2[12] = step1[12];

+  // stage 5

+  step1[0] = step2[0] + step2[3];

+  step1[1] = step2[1] + step2[2];

+  step1[2] = step2[1] - step2[2];

+  step1[3] = step2[0] - step2[3];

+  step1[4] = step2[4];

+  temp1 = (step2[6] - step2[5]) * cospi_16_64;

+  temp2 = (step2[5] + step2[6]) * cospi_16_64;

+  step1[5] = dct_const_round_shift(temp1);

+  step1[6] = dct_const_round_shift(temp2);

+  step1[7] = step2[7];

+  step1[8] = step2[8] + step2[11];

+  step1[9] = step2[9] + step2[10];

+  step1[10] = step2[9] - step2[10];

+  step1[11] = step2[8] - step2[11];

+  step1[12] = -step2[12] + step2[15];

+  step1[13] = -step2[13] + step2[14];

+  step1[14] = step2[13] + step2[14];

+  step1[15] = step2[12] + step2[15];

+  // stage 6

+  step2[0] = step1[0] + step1[7];

+  step2[1] = step1[1] + step1[6];

+  step2[2] = step1[2] + step1[5];

+  step2[3] = step1[3] + step1[4];

+  step2[4] = step1[3] - step1[4];

+  step2[5] = step1[2] - step1[5];

+  step2[6] = step1[1] - step1[6];

+  step2[7] = step1[0] - step1[7];

+  step2[8] = step1[8];

+  step2[9] = step1[9];

+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;

+  temp2 = (step1[10] + step1[13]) * cospi_16_64;

+  step2[10] = dct_const_round_shift(temp1);

+  step2[13] = dct_const_round_shift(temp2);

+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;

+  temp2 = (step1[11] + step1[12]) * cospi_16_64;

+  step2[11] = dct_const_round_shift(temp1);

+  step2[12] = dct_const_round_shift(temp2);

+  step2[14] = step1[14];

+  step2[15] = step1[15];

+  // stage 7

+  output[0] = step2[0] + step2[15];

+  output[1] = step2[1] + step2[14];

+  output[2] = step2[2] + step2[13];

+  output[3] = step2[3] + step2[12];

+  output[4] = step2[4] + step2[11];

+  output[5] = step2[5] + step2[10];

+  output[6] = step2[6] + step2[9];

+  output[7] = step2[7] + step2[8];

+  output[8] = step2[7] - step2[8];

+  output[9] = step2[6] - step2[9];

+  output[10] = step2[5] - step2[10];

+  output[11] = step2[4] - step2[11];

+  output[12] = step2[3] - step2[12];

+  output[13] = step2[2] - step2[13];

+  output[14] = step2[1] - step2[14];

+  output[15] = step2[0] - step2[15];

+}

+void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {

+  int16_t out[16 * 16];

+  int16_t *outptr = out;

+  const int half_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[16], temp_out[16];

+  // First transform rows

+  for (i = 0; i < 16; ++i) {

+    idct16_1d(input, outptr);

+    input += 16;

+    outptr += 16;

+  }

+  // Then transform columns

+  for (i = 0; i < 16; ++i) {

+    for (j = 0; j < 16; ++j)

+      temp_in[j] = out[j * 16 + i];

+    idct16_1d(temp_in, temp_out);

+    for (j = 0; j < 16; ++j)

+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+  }

+}

+void iadst16_1d(int16_t *input, int16_t *output) {

+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

+  int x0 = input[15];

+  int x1 = input[0];

+  int x2 = input[13];

+  int x3 = input[2];

+  int x4 = input[11];

+  int x5 = input[4];

+  int x6 = input[9];

+  int x7 = input[6];

+  int x8 = input[7];

+  int x9 = input[8];

+  int x10 = input[5];

+  int x11 = input[10];

+  int x12 = input[3];

+  int x13 = input[12];

+  int x14 = input[1];

+  int x15 = input[14];

+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8

+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {

+    output[0] = output[1] = output[2] = output[3] = output[4]

+              = output[5] = output[6] = output[7] = output[8]

+              = output[9] = output[10] = output[11] = output[12]

+              = output[13] = output[14] = output[15] = 0;

+    return;

+  }

+  // stage 1

+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;

+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;

+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;

+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;

+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

+  x0 = dct_const_round_shift(s0 + s8);

+  x1 = dct_const_round_shift(s1 + s9);

+  x2 = dct_const_round_shift(s2 + s10);

+  x3 = dct_const_round_shift(s3 + s11);

+  x4 = dct_const_round_shift(s4 + s12);

+  x5 = dct_const_round_shift(s5 + s13);

+  x6 = dct_const_round_shift(s6 + s14);

+  x7 = dct_const_round_shift(s7 + s15);

+  x8  = dct_const_round_shift(s0 - s8);

+  x9  = dct_const_round_shift(s1 - s9);

+  x10 = dct_const_round_shift(s2 - s10);

+  x11 = dct_const_round_shift(s3 - s11);

+  x12 = dct_const_round_shift(s4 - s12);

+  x13 = dct_const_round_shift(s5 - s13);

+  x14 = dct_const_round_shift(s6 - s14);

+  x15 = dct_const_round_shift(s7 - s15);

+  // stage 2

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4;

+  s5 = x5;

+  s6 = x6;

+  s7 = x7;

+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;

+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;

+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;

+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;

+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;

+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;

+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;

+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

+  x0 = s0 + s4;

+  x1 = s1 + s5;

+  x2 = s2 + s6;

+  x3 = s3 + s7;

+  x4 = s0 - s4;

+  x5 = s1 - s5;

+  x6 = s2 - s6;

+  x7 = s3 - s7;

+  x8 = dct_const_round_shift(s8 + s12);

+  x9 = dct_const_round_shift(s9 + s13);

+  x10 = dct_const_round_shift(s10 + s14);

+  x11 = dct_const_round_shift(s11 + s15);

+  x12 = dct_const_round_shift(s8 - s12);

+  x13 = dct_const_round_shift(s9 - s13);

+  x14 = dct_const_round_shift(s10 - s14);

+  x15 = dct_const_round_shift(s11 - s15);

+  // stage 3

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;

+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;

+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;

+  s8 = x8;

+  s9 = x9;

+  s10 = x10;

+  s11 = x11;

+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;

+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;

+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

+  x0 = s0 + s2;

+  x1 = s1 + s3;

+  x2 = s0 - s2;

+  x3 = s1 - s3;

+  x4 = dct_const_round_shift(s4 + s6);

+  x5 = dct_const_round_shift(s5 + s7);

+  x6 = dct_const_round_shift(s4 - s6);

+  x7 = dct_const_round_shift(s5 - s7);

+  x8 = s8 + s10;

+  x9 = s9 + s11;

+  x10 = s8 - s10;

+  x11 = s9 - s11;

+  x12 = dct_const_round_shift(s12 + s14);

+  x13 = dct_const_round_shift(s13 + s15);

+  x14 = dct_const_round_shift(s12 - s14);

+  x15 = dct_const_round_shift(s13 - s15);

+  // stage 4

+  s2 = (- cospi_16_64) * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (- x6 + x7);

+  s10 = cospi_16_64 * (x10 + x11);

+  s11 = cospi_16_64 * (- x10 + x11);

+  s14 = (- cospi_16_64) * (x14 + x15);

+  s15 = cospi_16_64 * (x14 - x15);

+  x2 = dct_const_round_shift(s2);

+  x3 = dct_const_round_shift(s3);

+  x6 = dct_const_round_shift(s6);

+  x7 = dct_const_round_shift(s7);

+  x10 = dct_const_round_shift(s10);

+  x11 = dct_const_round_shift(s11);

+  x14 = dct_const_round_shift(s14);

+  x15 = dct_const_round_shift(s15);

+  output[0] =  x0;

+  output[1] = -x8;

+  output[2] =  x12;

+  output[3] = -x4;

+  output[4] =  x6;

+  output[5] =  x14;

+  output[6] =  x10;

+  output[7] =  x2;

+  output[8] =  x3;

+  output[9] =  x11;

+  output[10] =  x15;

+  output[11] =  x7;

+  output[12] =  x5;

+  output[13] = -x13;

+  output[14] =  x9;

+  output[15] = -x1;

+}

+static const transform_2d IHT_16[] = {

+  { idct16_1d,  idct16_1d  },  // DCT_DCT  = 0

+  { iadst16_1d, idct16_1d  },  // ADST_DCT = 1

+  { idct16_1d,  iadst16_1d },  // DCT_ADST = 2

+  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3

+};

+void vp9_short_iht16x16_c(int16_t *input, int16_t *output,

+                          int pitch, int tx_type) {

+  int i, j;

+  int16_t out[16 * 16];

+  int16_t *outptr = out;

+  int16_t temp_in[16], temp_out[16];

+  const transform_2d ht = IHT_16[tx_type];

+  // Rows

+  for (i = 0; i < 16; ++i) {

+    ht.rows(input, outptr);

+    input += 16;

+    outptr += 16;

+  }

+  // Columns

+  for (i = 0; i < 16; ++i) {

+    for (j = 0; j < 16; ++j)

+      temp_in[j] = out[j * 16 + i];

+    ht.cols(temp_in, temp_out);

+    for (j = 0; j < 16; ++j)

+      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+  }

+}

+void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {

+    int16_t out[16 * 16];

+    int16_t *outptr = out;

+    const int half_pitch = pitch >> 1;

+    int i, j;

+    int16_t temp_in[16], temp_out[16];

+    /* First transform rows. Since all non-zero dct coefficients are in

+     * upper-left 4x4 area, we only need to calculate first 4 rows here.

+     */

+    vpx_memset(out, 0, sizeof(out));

+    for (i = 0; i < 4; ++i) {

+      idct16_1d(input, outptr);

+      input += 16;

+      outptr += 16;

+    }

+    // Then transform columns

+    for (i = 0; i < 16; ++i) {

+      for (j = 0; j < 16; ++j)

+        temp_in[j] = out[j*16 + i];

+      idct16_1d(temp_in, temp_out);

+      for (j = 0; j < 16; ++j)

+        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+    }

+}

+void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {

+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  out = dct_const_round_shift(out * cospi_16_64);

+  output[0] = ROUND_POWER_OF_TWO(out, 6);

+}

+static void idct32_1d(int16_t *input, int16_t *output) {

+  int16_t step1[32], step2[32];

+  int temp1, temp2;

+  // stage 1

+  step1[0] = input[0];

+  step1[1] = input[16];

+  step1[2] = input[8];

+  step1[3] = input[24];

+  step1[4] = input[4];

+  step1[5] = input[20];

+  step1[6] = input[12];

+  step1[7] = input[28];

+  step1[8] = input[2];

+  step1[9] = input[18];

+  step1[10] = input[10];

+  step1[11] = input[26];

+  step1[12] = input[6];

+  step1[13] = input[22];

+  step1[14] = input[14];

+  step1[15] = input[30];

+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;

+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;

+  step1[16] = dct_const_round_shift(temp1);

+  step1[31] = dct_const_round_shift(temp2);

+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;

+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;

+  step1[17] = dct_const_round_shift(temp1);

+  step1[30] = dct_const_round_shift(temp2);

+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;

+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;

+  step1[18] = dct_const_round_shift(temp1);

+  step1[29] = dct_const_round_shift(temp2);

+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;

+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;

+  step1[19] = dct_const_round_shift(temp1);

+  step1[28] = dct_const_round_shift(temp2);

+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;

+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;

+  step1[20] = dct_const_round_shift(temp1);

+  step1[27] = dct_const_round_shift(temp2);

+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;

+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;

+  step1[21] = dct_const_round_shift(temp1);

+  step1[26] = dct_const_round_shift(temp2);

+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;

+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;

+  step1[22] = dct_const_round_shift(temp1);

+  step1[25] = dct_const_round_shift(temp2);

+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;

+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;

+  step1[23] = dct_const_round_shift(temp1);

+  step1[24] = dct_const_round_shift(temp2);

+  // stage 2

+  step2[0] = step1[0];

+  step2[1] = step1[1];

+  step2[2] = step1[2];

+  step2[3] = step1[3];

+  step2[4] = step1[4];

+  step2[5] = step1[5];

+  step2[6] = step1[6];

+  step2[7] = step1[7];

+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;

+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;

+  step2[8] = dct_const_round_shift(temp1);

+  step2[15] = dct_const_round_shift(temp2);

+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;

+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;

+  step2[9] = dct_const_round_shift(temp1);

+  step2[14] = dct_const_round_shift(temp2);

+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;

+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;

+  step2[10] = dct_const_round_shift(temp1);

+  step2[13] = dct_const_round_shift(temp2);

+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;

+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;

+  step2[11] = dct_const_round_shift(temp1);

+  step2[12] = dct_const_round_shift(temp2);

+  step2[16] = step1[16] + step1[17];

+  step2[17] = step1[16] - step1[17];

+  step2[18] = -step1[18] + step1[19];

+  step2[19] = step1[18] + step1[19];

+  step2[20] = step1[20] + step1[21];

+  step2[21] = step1[20] - step1[21];

+  step2[22] = -step1[22] + step1[23];

+  step2[23] = step1[22] + step1[23];

+  step2[24] = step1[24] + step1[25];

+  step2[25] = step1[24] - step1[25];

+  step2[26] = -step1[26] + step1[27];

+  step2[27] = step1[26] + step1[27];

+  step2[28] = step1[28] + step1[29];

+  step2[29] = step1[28] - step1[29];

+  step2[30] = -step1[30] + step1[31];

+  step2[31] = step1[30] + step1[31];

+  // stage 3

+  step1[0] = step2[0];

+  step1[1] = step2[1];

+  step1[2] = step2[2];

+  step1[3] = step2[3];

+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;

+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;

+  step1[4] = dct_const_round_shift(temp1);

+  step1[7] = dct_const_round_shift(temp2);

+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;

+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;

+  step1[5] = dct_const_round_shift(temp1);

+  step1[6] = dct_const_round_shift(temp2);

+  step1[8] = step2[8] + step2[9];

+  step1[9] = step2[8] - step2[9];

+  step1[10] = -step2[10] + step2[11];

+  step1[11] = step2[10] + step2[11];

+  step1[12] = step2[12] + step2[13];

+  step1[13] = step2[12] - step2[13];

+  step1[14] = -step2[14] + step2[15];

+  step1[15] = step2[14] + step2[15];

+  step1[16] = step2[16];

+  step1[31] = step2[31];

+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;

+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;

+  step1[17] = dct_const_round_shift(temp1);

+  step1[30] = dct_const_round_shift(temp2);

+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;

+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;

+  step1[18] = dct_const_round_shift(temp1);

+  step1[29] = dct_const_round_shift(temp2);

+  step1[19] = step2[19];

+  step1[20] = step2[20];

+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;

+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;

+  step1[21] = dct_const_round_shift(temp1);

+  step1[26] = dct_const_round_shift(temp2);

+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;

+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;

+  step1[22] = dct_const_round_shift(temp1);

+  step1[25] = dct_const_round_shift(temp2);

+  step1[23] = step2[23];

+  step1[24] = step2[24];

+  step1[27] = step2[27];

+  step1[28] = step2[28];

+  // stage 4

+  temp1 = (step1[0] + step1[1]) * cospi_16_64;

+  temp2 = (step1[0] - step1[1]) * cospi_16_64;

+  step2[0] = dct_const_round_shift(temp1);

+  step2[1] = dct_const_round_shift(temp2);

+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;

+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;

+  step2[2] = dct_const_round_shift(temp1);

+  step2[3] = dct_const_round_shift(temp2);

+  step2[4] = step1[4] + step1[5];

+  step2[5] = step1[4] - step1[5];

+  step2[6] = -step1[6] + step1[7];

+  step2[7] = step1[6] + step1[7];

+  step2[8] = step1[8];

+  step2[15] = step1[15];

+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;

+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;

+  step2[9] = dct_const_round_shift(temp1);

+  step2[14] = dct_const_round_shift(temp2);

+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;

+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;

+  step2[10] = dct_const_round_shift(temp1);

+  step2[13] = dct_const_round_shift(temp2);

+  step2[11] = step1[11];

+  step2[12] = step1[12];

+  step2[16] = step1[16] + step1[19];

+  step2[17] = step1[17] + step1[18];

+  step2[18] = step1[17] - step1[18];

+  step2[19] = step1[16] - step1[19];

+  step2[20] = -step1[20] + step1[23];

+  step2[21] = -step1[21] + step1[22];

+  step2[22] = step1[21] + step1[22];

+  step2[23] = step1[20] + step1[23];

+  step2[24] = step1[24] + step1[27];

+  step2[25] = step1[25] + step1[26];

+  step2[26] = step1[25] - step1[26];

+  step2[27] = step1[24] - step1[27];

+  step2[28] = -step1[28] + step1[31];

+  step2[29] = -step1[29] + step1[30];

+  step2[30] = step1[29] + step1[30];

+  step2[31] = step1[28] + step1[31];

+  // stage 5

+  step1[0] = step2[0] + step2[3];

+  step1[1] = step2[1] + step2[2];

+  step1[2] = step2[1] - step2[2];

+  step1[3] = step2[0] - step2[3];

+  step1[4] = step2[4];

+  temp1 = (step2[6] - step2[5]) * cospi_16_64;

+  temp2 = (step2[5] + step2[6]) * cospi_16_64;

+  step1[5] = dct_const_round_shift(temp1);

+  step1[6] = dct_const_round_shift(temp2);

+  step1[7] = step2[7];

+  step1[8] = step2[8] + step2[11];

+  step1[9] = step2[9] + step2[10];

+  step1[10] = step2[9] - step2[10];

+  step1[11] = step2[8] - step2[11];

+  step1[12] = -step2[12] + step2[15];

+  step1[13] = -step2[13] + step2[14];

+  step1[14] = step2[13] + step2[14];

+  step1[15] = step2[12] + step2[15];

+  step1[16] = step2[16];

+  step1[17] = step2[17];

+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;

+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;

+  step1[18] = dct_const_round_shift(temp1);

+  step1[29] = dct_const_round_shift(temp2);

+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;

+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;

+  step1[19] = dct_const_round_shift(temp1);

+  step1[28] = dct_const_round_shift(temp2);

+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;

+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;

+  step1[20] = dct_const_round_shift(temp1);

+  step1[27] = dct_const_round_shift(temp2);

+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;

+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;

+  step1[21] = dct_const_round_shift(temp1);

+  step1[26] = dct_const_round_shift(temp2);

+  step1[22] = step2[22];

+  step1[23] = step2[23];

+  step1[24] = step2[24];

+  step1[25] = step2[25];

+  step1[30] = step2[30];

+  step1[31] = step2[31];

+  // stage 6

+  step2[0] = step1[0] + step1[7];

+  step2[1] = step1[1] + step1[6];

+  step2[2] = step1[2] + step1[5];

+  step2[3] = step1[3] + step1[4];

+  step2[4] = step1[3] - step1[4];

+  step2[5] = step1[2] - step1[5];

+  step2[6] = step1[1] - step1[6];

+  step2[7] = step1[0] - step1[7];

+  step2[8] = step1[8];

+  step2[9] = step1[9];

+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;

+  temp2 = (step1[10] + step1[13]) * cospi_16_64;

+  step2[10] = dct_const_round_shift(temp1);

+  step2[13] = dct_const_round_shift(temp2);

+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;

+  temp2 = (step1[11] + step1[12]) * cospi_16_64;

+  step2[11] = dct_const_round_shift(temp1);

+  step2[12] = dct_const_round_shift(temp2);

+  step2[14] = step1[14];

+  step2[15] = step1[15];

+  step2[16] = step1[16] + step1[23];

+  step2[17] = step1[17] + step1[22];

+  step2[18] = step1[18] + step1[21];

+  step2[19] = step1[19] + step1[20];

+  step2[20] = step1[19] - step1[20];

+  step2[21] = step1[18] - step1[21];

+  step2[22] = step1[17] - step1[22];

+  step2[23] = step1[16] - step1[23];

+  step2[24] = -step1[24] + step1[31];

+  step2[25] = -step1[25] + step1[30];

+  step2[26] = -step1[26] + step1[29];

+  step2[27] = -step1[27] + step1[28];

+  step2[28] = step1[27] + step1[28];

+  step2[29] = step1[26] + step1[29];

+  step2[30] = step1[25] + step1[30];

+  step2[31] = step1[24] + step1[31];

+  // stage 7

+  step1[0] = step2[0] + step2[15];

+  step1[1] = step2[1] + step2[14];

+  step1[2] = step2[2] + step2[13];

+  step1[3] = step2[3] + step2[12];

+  step1[4] = step2[4] + step2[11];

+  step1[5] = step2[5] + step2[10];

+  step1[6] = step2[6] + step2[9];

+  step1[7] = step2[7] + step2[8];

+  step1[8] = step2[7] - step2[8];

+  step1[9] = step2[6] - step2[9];

+  step1[10] = step2[5] - step2[10];

+  step1[11] = step2[4] - step2[11];

+  step1[12] = step2[3] - step2[12];

+  step1[13] = step2[2] - step2[13];

+  step1[14] = step2[1] - step2[14];

+  step1[15] = step2[0] - step2[15];

+  step1[16] = step2[16];

+  step1[17] = step2[17];

+  step1[18] = step2[18];

+  step1[19] = step2[19];

+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;

+  temp2 = (step2[20] + step2[27]) * cospi_16_64;

+  step1[20] = dct_const_round_shift(temp1);

+  step1[27] = dct_const_round_shift(temp2);

+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;

+  temp2 = (step2[21] + step2[26]) * cospi_16_64;

+  step1[21] = dct_const_round_shift(temp1);

+  step1[26] = dct_const_round_shift(temp2);

+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;

+  temp2 = (step2[22] + step2[25]) * cospi_16_64;

+  step1[22] = dct_const_round_shift(temp1);

+  step1[25] = dct_const_round_shift(temp2);

+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;

+  temp2 = (step2[23] + step2[24]) * cospi_16_64;

+  step1[23] = dct_const_round_shift(temp1);

+  step1[24] = dct_const_round_shift(temp2);

+  step1[28] = step2[28];

+  step1[29] = step2[29];

+  step1[30] = step2[30];

+  step1[31] = step2[31];

+  // final stage

+  output[0] = step1[0] + step1[31];

+  output[1] = step1[1] + step1[30];

+  output[2] = step1[2] + step1[29];

+  output[3] = step1[3] + step1[28];

+  output[4] = step1[4] + step1[27];

+  output[5] = step1[5] + step1[26];

+  output[6] = step1[6] + step1[25];

+  output[7] = step1[7] + step1[24];

+  output[8] = step1[8] + step1[23];

+  output[9] = step1[9] + step1[22];

+  output[10] = step1[10] + step1[21];

+  output[11] = step1[11] + step1[20];

+  output[12] = step1[12] + step1[19];

+  output[13] = step1[13] + step1[18];

+  output[14] = step1[14] + step1[17];

+  output[15] = step1[15] + step1[16];

+  output[16] = step1[15] - step1[16];

+  output[17] = step1[14] - step1[17];

+  output[18] = step1[13] - step1[18];

+  output[19] = step1[12] - step1[19];

+  output[20] = step1[11] - step1[20];

+  output[21] = step1[10] - step1[21];

+  output[22] = step1[9] - step1[22];

+  output[23] = step1[8] - step1[23];

+  output[24] = step1[7] - step1[24];

+  output[25] = step1[6] - step1[25];

+  output[26] = step1[5] - step1[26];

+  output[27] = step1[4] - step1[27];

+  output[28] = step1[3] - step1[28];

+  output[29] = step1[2] - step1[29];

+  output[30] = step1[1] - step1[30];

+  output[31] = step1[0] - step1[31];

+}

+void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {

+  int16_t out[32 * 32];

+  int16_t *outptr = out;

+  const int half_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[32], temp_out[32];

+  // Rows

+  for (i = 0; i < 32; ++i) {

+    idct32_1d(input, outptr);

+    input += 32;

+    outptr += 32;

+  }

+  // Columns

+  for (i = 0; i < 32; ++i) {

+    for (j = 0; j < 32; ++j)

+      temp_in[j] = out[j * 32 + i];

+    idct32_1d(temp_in, temp_out);

+    for (j = 0; j < 32; ++j)

+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+  }

+}

+void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {

+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

+  out = dct_const_round_shift(out * cospi_16_64);

+  output[0] = ROUND_POWER_OF_TWO(out, 6);

+}

+void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {

+  int16_t out[32 * 32];

+  int16_t *outptr = out;

+  const int half_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[32], temp_out[32];

+  /* First transform rows. Since all non-zero dct coefficients are in

+   * upper-left 4x4 area, we only need to calculate first 4 rows here.

+   */

+  vpx_memset(out, 0, sizeof(out));

+  for (i = 0; i < 4; ++i) {

+    idct32_1d(input, outptr);

+    input += 32;

+    outptr += 32;

+  }

+  // Columns

+  for (i = 0; i < 32; ++i) {

+    for (j = 0; j < 32; ++j)

+      temp_in[j] = out[j * 32 + i];

+    idct32_1d(temp_in, temp_out);

+    for (j = 0; j < 32; ++j)

+      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);

+  }

+}

--- /dev/null

+++ b/vp9/common/vp9_idct.h

@@ -1,0 +1,85 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_COMMON_VP9_IDCT_H_

+#define VP9_COMMON_VP9_IDCT_H_

+#include <assert.h>

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vp9/common/vp9_common.h"

+// Constants and Macros used by all idct/dct functions

+#define DCT_CONST_BITS 14

+#define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))

+#define pair_set_epi16(a, b) \

+  _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16))

+// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31.

+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)

+static const int cospi_1_64  = 16364;

+static const int cospi_2_64  = 16305;

+static const int cospi_3_64  = 16207;

+static const int cospi_4_64  = 16069;

+static const int cospi_5_64  = 15893;

+static const int cospi_6_64  = 15679;

+static const int cospi_7_64  = 15426;

+static const int cospi_8_64  = 15137;

+static const int cospi_9_64  = 14811;

+static const int cospi_10_64 = 14449;

+static const int cospi_11_64 = 14053;

+static const int cospi_12_64 = 13623;

+static const int cospi_13_64 = 13160;

+static const int cospi_14_64 = 12665;

+static const int cospi_15_64 = 12140;

+static const int cospi_16_64 = 11585;

+static const int cospi_17_64 = 11003;

+static const int cospi_18_64 = 10394;

+static const int cospi_19_64 = 9760;

+static const int cospi_20_64 = 9102;

+static const int cospi_21_64 = 8423;

+static const int cospi_22_64 = 7723;

+static const int cospi_23_64 = 7005;

+static const int cospi_24_64 = 6270;

+static const int cospi_25_64 = 5520;

+static const int cospi_26_64 = 4756;

+static const int cospi_27_64 = 3981;

+static const int cospi_28_64 = 3196;

+static const int cospi_29_64 = 2404;

+static const int cospi_30_64 = 1606;

+static const int cospi_31_64 = 804;

+//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3

+static const int sinpi_1_9 = 5283;

+static const int sinpi_2_9 = 9929;

+static const int sinpi_3_9 = 13377;

+static const int sinpi_4_9 = 15212;

+static INLINE int dct_const_round_shift(int input) {

+  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

+  assert(INT16_MIN <= rv && rv <= INT16_MAX);

+  return rv;

+}

+static INLINE int dct_32_round(int input) {

+  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

+  assert(-131072 <= rv && rv <= 131071);

+  return rv;

+}

+typedef void (*transform_1d)(int16_t*, int16_t*);

+typedef struct {

+  transform_1d cols, rows;  // vertical and horizontal

+} transform_2d;

+#endif  // VP9_COMMON_VP9_IDCT_H_

--- a/vp9/common/vp9_idctllm.c

+++ /dev/null

@@ -1,2670 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-/****************************************************************************

- * Notes:

- *

- * This implementation makes use of 16 bit fixed point verio of two multiply

- * constants:

- *         1.   sqrt(2) * cos (pi/8)

- *         2.   sqrt(2) * sin (pi/8)

- * Becuase the first constant is bigger than 1, to maintain the same 16 bit

- * fixed point precision as the second one, we use a trick of

- *         x * a = x + x*(a-1)

- * so

- *         x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).

- **************************************************************************/

-#include <assert.h>

-#include <math.h>

-#include "./vpx_config.h"

-#include "vp9/common/vp9_systemdependent.h"

-#include "vp9/common/vp9_blockd.h"

-#include "vp9/common/vp9_common.h"

-static const int cospi8sqrt2minus1 = 20091;

-static const int sinpi8sqrt2      = 35468;

-static const int rounding = 0;

-static const int16_t idct_i4[16] = {

-  8192,  10703,  8192,   4433,

-  8192,   4433, -8192, -10703,

-  8192,  -4433, -8192,  10703,

-  8192, -10703,  8192,  -4433

-};

-static const int16_t iadst_i4[16] = {

-   3736,  9459, 10757,   7021,

-   7021,  9459, -3736, -10757,

-   9459,     0, -9459,   9459,

-  10757, -9459,  7021,  -3736

-};

-static const int16_t idct_i8[64] = {

-   5793,  8035,  7568,  6811,

-   5793,  4551,  3135,  1598,

-   5793,  6811,  3135, -1598,

-  -5793, -8035, -7568, -4551,

-   5793,  4551, -3135, -8035,

-  -5793,  1598,  7568,  6811,

-   5793,  1598, -7568, -4551,

-   5793,  6811, -3135, -8035,

-   5793, -1598, -7568,  4551,

-   5793, -6811, -3135,  8035,

-   5793, -4551, -3135,  8035,

-  -5793, -1598,  7568, -6811,

-   5793, -6811,  3135,  1598,

-  -5793,  8035, -7568,  4551,

-   5793, -8035,  7568, -6811,

-   5793, -4551,  3135, -1598

-};

-static const int16_t iadst_i8[64] = {

-   1460,  4184,  6342,  7644,

-   7914,  7114,  5354,  2871,

-   2871,  7114,  7644,  4184,

-  -1460, -6342, -7914, -5354,

-   4184,  7914,  2871, -5354,

-  -7644, -1460,  6342,  7114,

-   5354,  6342, -4184, -7114,

-   2871,  7644, -1460, -7914,

-   6342,  2871, -7914,  1460,

-   7114, -5354, -4184,  7644,

-   7114, -1460, -5354,  7914,

-  -4184, -2871,  7644, -6342,

-   7644, -5354,  1460,  2871,

-  -6342,  7914, -7114,  4184,

-   7914, -7644,  7114, -6342,

-   5354, -4184,  2871, -1460

-};

-static const int16_t idct_i16[256] = {

-   4096,  5765,  5681,  5543,  5352,  5109,  4816,  4478,

-   4096,  3675,  3218,  2731,  2217,  1682,  1130,   568,

-   4096,  5543,  4816,  3675,  2217,   568, -1130, -2731,

-  -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682,

-   4096,  5109,  3218,   568, -2217, -4478, -5681, -5543,

-  -4096, -1682,  1130,  3675,  5352,  5765,  4816,  2731,

-   4096,  4478,  1130, -2731, -5352, -5543, -3218,   568,

-   4096,  5765,  4816,  1682, -2217, -5109, -5681, -3675,

-   4096,  3675, -1130, -5109, -5352, -1682,  3218,  5765,

-   4096,  -568, -4816, -5543, -2217,  2731,  5681,  4478,

-   4096,  2731, -3218, -5765, -2217,  3675,  5681,  1682,

-  -4096, -5543, -1130,  4478,  5352,   568, -4816, -5109,

-   4096,  1682, -4816, -4478,  2217,  5765,  1130, -5109,

-  -4096,  2731,  5681,   568, -5352, -3675,  3218,  5543,

-   4096,   568, -5681, -1682,  5352,  2731, -4816, -3675,

-   4096,  4478, -3218, -5109,  2217,  5543, -1130, -5765,

-   4096,  -568, -5681,  1682,  5352, -2731, -4816,  3675,

-   4096, -4478, -3218,  5109,  2217, -5543, -1130,  5765,

-   4096, -1682, -4816,  4478,  2217, -5765,  1130,  5109,

-  -4096, -2731,  5681,  -568, -5352,  3675,  3218, -5543,

-   4096, -2731, -3218,  5765, -2217, -3675,  5681, -1682,

-  -4096,  5543, -1130, -4478,  5352,  -568, -4816,  5109,

-   4096, -3675, -1130,  5109, -5352,  1682,  3218, -5765,

-   4096,   568, -4816,  5543, -2217, -2731,  5681, -4478,

-   4096, -4478,  1130,  2731, -5352,  5543, -3218,  -568,

-   4096, -5765,  4816, -1682, -2217,  5109, -5681,  3675,

-   4096, -5109,  3218,  -568, -2217,  4478, -5681,  5543,

-  -4096,  1682,  1130, -3675,  5352, -5765,  4816, -2731,

-   4096, -5543,  4816, -3675,  2217,  -568, -1130,  2731,

-  -4096,  5109, -5681,  5765, -5352,  4478, -3218,  1682,

-   4096, -5765,  5681, -5543,  5352, -5109,  4816, -4478,

-   4096, -3675,  3218, -2731,  2217, -1682,  1130,  -568

-};

-static const int16_t iadst_i16[256] = {

-    542,  1607,  2614,  3526,  4311,  4940,  5390,  5646,

-   5698,  5543,  5189,  4646,  3936,  3084,  2120,  1080,

-   1080,  3084,  4646,  5543,  5646,  4940,  3526,  1607,

-   -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120,

-   1607,  4311,  5646,  5189,  3084,     0, -3084, -5189,

-  -5646, -4311, -1607,  1607,  4311,  5646,  5189,  3084,

-   2120,  5189,  5390,  2614, -1607, -4940, -5543, -3084,

-   1080,  4646,  5646,  3526, -542,  -4311, -5698, -3936,

-   2614,  5646,  3936, -1080, -5189, -4940,  -542,  4311,

-   5543,  2120, -3084, -5698, -3526,  1607,  5390,  4646,

-   3084,  5646,  1607, -4311, -5189,     0,  5189,  4311,

-  -1607, -5646, -3084,  3084,  5646,  1607, -4311, -5189,

-   3526,  5189, -1080, -5698, -1607,  4940,  3936, -3084,

-  -5390,   542,  5646,  2120, -4646, -4311,  2614,  5543,

-   3936,  4311, -3526, -4646,  3084,  4940, -2614, -5189,

-   2120,  5390, -1607, -5543,  1080,  5646,  -542, -5698,

-   4311,  3084, -5189, -1607,  5646,     0, -5646,  1607,

-   5189, -3084, -4311,  4311,  3084, -5189, -1607,  5646,

-   4646,  1607, -5698,  2120,  4311, -4940, -1080,  5646,

-  -2614, -3936,  5189,   542, -5543,  3084,  3526, -5390,

-   4940,     0, -4940,  4940,     0, -4940,  4940,     0,

-  -4940,  4940,     0, -4940,  4940,     0, -4940,  4940,

-   5189, -1607, -3084,  5646, -4311,     0,  4311, -5646,

-   3084,  1607, -5189,  5189, -1607, -3084,  5646, -4311,

-   5390, -3084,  -542,  3936, -5646,  4940, -2120, -1607,

-   4646, -5698,  4311, -1080, -2614,  5189, -5543,  3526,

-   5543, -4311,  2120,   542, -3084,  4940, -5698,  5189,

-  -3526,  1080,  1607, -3936,  5390, -5646,  4646, -2614,

-   5646, -5189,  4311, -3084,  1607,     0, -1607,  3084,

-  -4311,  5189, -5646,  5646, -5189,  4311, -3084,  1607,

-   5698, -5646,  5543, -5390,  5189, -4940,  4646, -4311,

-   3936, -3526,  3084, -2614,  2120, -1607,  1080,  -542

-};

-/* Converted the transforms to integer form. */

-#define HORIZONTAL_SHIFT 14  // 16

-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)

-#define VERTICAL_SHIFT 17  // 15

-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)

-void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,

-                      TX_TYPE tx_type, int tx_dim, uint16_t eobs) {

-  int i, j, k;

-  int nz_dim;

-  int16_t imbuf[256];

-  const int16_t *ip = input;

-  int16_t *op = output;

-  int16_t *im = &imbuf[0];

-  /* pointers to vertical and horizontal transforms. */

-  const int16_t *ptv = NULL, *pth = NULL;

-  int shortpitch = pitch >> 1;

-  switch (tx_type) {

-    case ADST_ADST :

-      ptv = pth = (tx_dim == 4) ? &iadst_i4[0]

-                                  : ((tx_dim == 8) ? &iadst_i8[0]

-                                                     : &iadst_i16[0]);

-      break;

-    case ADST_DCT  :

-      ptv = (tx_dim == 4) ? &iadst_i4[0]

-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);

-      pth = (tx_dim == 4) ? &idct_i4[0]

-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);

-      break;

-    case  DCT_ADST :

-      ptv = (tx_dim == 4) ? &idct_i4[0]

-                            : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]);

-      pth = (tx_dim == 4) ? &iadst_i4[0]

-                            : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]);

-      break;

-    case  DCT_DCT :

-      ptv = pth = (tx_dim == 4) ? &idct_i4[0]

-                                  : ((tx_dim == 8) ? &idct_i8[0]

-                                                     : &idct_i16[0]);

-      break;

-    default:

-      assert(0);

-      break;

-  }

-  nz_dim = tx_dim;

-  if(tx_dim > 4) {

-    if(eobs < 36) {

-      vpx_memset(im, 0, 512);

-      nz_dim = 8;

-      if(eobs < 3) {

-        nz_dim = 2;

-      } else if(eobs < 10) {

-        nz_dim = 4;

-      }

-    }

-  }

-  /* 2-D inverse transform X = M1*Z*Transposed_M2 is calculated in 2 steps

-   * from right to left:

-   * 1. horizontal transform: Y= Z*Transposed_M2

-   * 2. vertical transform: X = M1*Y

-   * In SIMD, doing this way could eliminate the transpose needed if it is

-   * calculated from left to right.

-   */

-  /* Horizontal transformation */

-  for (j = 0; j < tx_dim; j++) {

-    for (i = 0; i < nz_dim; i++) {

-      int temp = 0;

-      for (k = 0; k < nz_dim; k++) {

-        temp += ip[k] * pth[k];

-      }

-      /* Calculate im and store it in its transposed position. */

-      im[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);

-      ip += tx_dim;

-    }

-    im += tx_dim;

-    pth += tx_dim;

-    ip = input;

-  }

-  /* Vertical transformation */

-  im = &imbuf[0];

-  for (i = 0; i < tx_dim; i++) {

-    for (j = 0; j < tx_dim; j++) {

-      int temp = 0;

-      for (k = 0; k < nz_dim; k++) {

-        temp += ptv[k] * im[k];

-      }

-      op[j] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);

-      im += tx_dim;

-    }

-    im = &imbuf[0];

-    ptv += tx_dim;

-    op += shortpitch;

-  }

-}

-void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) {

-  int i;

-  int a1, b1, c1, d1;

-  int16_t *ip = input;

-  int16_t *op = output;

-  int temp1, temp2;

-  int shortpitch = pitch >> 1;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[8];

-    b1 = ip[0] - ip[8];

-    temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;

-    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);

-    c1 = temp1 - temp2;

-    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);

-    temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;

-    d1 = temp1 + temp2;

-    op[shortpitch * 0] = a1 + d1;

-    op[shortpitch * 3] = a1 - d1;

-    op[shortpitch * 1] = b1 + c1;

-    op[shortpitch * 2] = b1 - c1;

-    ip++;

-    op++;

-  }

-  ip = output;

-  op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[2];

-    b1 = ip[0] - ip[2];

-    temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;

-    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);

-    c1 = temp1 - temp2;

-    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);

-    temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;

-    d1 = temp1 + temp2;

-    op[0] = (a1 + d1 + 16) >> 5;

-    op[3] = (a1 - d1 + 16) >> 5;

-    op[1] = (b1 + c1 + 16) >> 5;

-    op[2] = (b1 - c1 + 16) >> 5;

-    ip += shortpitch;

-    op += shortpitch;

-  }

-}

-void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) {

-  int i;

-  int a1;

-  int16_t *op = output;

-  int shortpitch = pitch >> 1;

-  a1 = ((input[0] + 16) >> 5);

-  for (i = 0; i < 4; i++) {

-    op[0] = a1;

-    op[1] = a1;

-    op[2] = a1;

-    op[3] = a1;

-    op += shortpitch;

-  }

-}

-void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr,

-                            uint8_t *dst_ptr, int pitch, int stride) {

-  int a1 = ((input_dc + 16) >> 5);

-  int r, c;

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

-      dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);

-    }

-    dst_ptr += stride;

-    pred_ptr += pitch;

-  }

-}

-void vp9_short_inv_walsh4x4_c(int16_t *input, int16_t *output) {

-  int i;

-  int a1, b1, c1, d1;

-  int16_t *ip = input;

-  int16_t *op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ((ip[0] + ip[3]));

-    b1 = ((ip[1] + ip[2]));

-    c1 = ((ip[1] - ip[2]));

-    d1 = ((ip[0] - ip[3]));

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[1] = (c1 + d1) >> 1;

-    op[2] = (a1 - b1) >> 1;

-    op[3] = (d1 - c1) >> 1;

-    ip += 4;

-    op += 4;

-  }

-  ip = output;

-  op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[12];

-    b1 = ip[4] + ip[8];

-    c1 = ip[4] - ip[8];

-    d1 = ip[0] - ip[12];

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[4] = (c1 + d1) >> 1;

-    op[8] = (a1 - b1) >> 1;

-    op[12] = (d1 - c1) >> 1;

-    ip++;

-    op++;

-  }

-}

-void vp9_short_inv_walsh4x4_1_c(int16_t *in, int16_t *out) {

-  int i;

-  int16_t tmp[4];

-  int16_t *ip = in;

-  int16_t *op = tmp;

-  op[0] = (ip[0] + 1) >> 1;

-  op[1] = op[2] = op[3] = (ip[0] >> 1);

-  ip = tmp;

-  op = out;

-  for (i = 0; i < 4; i++) {

-    op[0] = (ip[0] + 1) >> 1;

-    op[4] = op[8] = op[12] = (ip[0] >> 1);

-    ip++;

-    op++;

-  }

-}

-#if CONFIG_LOSSLESS

-void vp9_short_inv_walsh4x4_lossless_c(int16_t *input, int16_t *output) {

-  int i;

-  int a1, b1, c1, d1;

-  int16_t *ip = input;

-  int16_t *op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR;

-    b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR;

-    c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR;

-    d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR;

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[1] = (c1 + d1) >> 1;

-    op[2] = (a1 - b1) >> 1;

-    op[3] = (d1 - c1) >> 1;

-    ip += 4;

-    op += 4;

-  }

-  ip = output;

-  op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[12];

-    b1 = ip[4] + ip[8];

-    c1 = ip[4] - ip[8];

-    d1 = ip[0] - ip[12];

-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    ip++;

-    op++;

-  }

-}

-void vp9_short_inv_walsh4x4_1_lossless_c(int16_t *in, int16_t *out) {

-  int i;

-  int16_t tmp[4];

-  int16_t *ip = in;

-  int16_t *op = tmp;

-  op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1;

-  op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1);

-  ip = tmp;

-  op = out;

-  for (i = 0; i < 4; i++) {

-    op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR;

-    ip++;

-    op++;

-  }

-}

-void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) {

-  int i;

-  int a1, b1, c1, d1;

-  int16_t *ip = input;

-  int16_t *op = output;

-  int shortpitch = pitch >> 1;

-  for (i = 0; i < 4; i++) {

-    a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR;

-    b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR;

-    c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR;

-    d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR;

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[1] = (c1 + d1) >> 1;

-    op[2] = (a1 - b1) >> 1;

-    op[3] = (d1 - c1) >> 1;

-    ip += 4;

-    op += shortpitch;

-  }

-  ip = output;

-  op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[shortpitch * 0] + ip[shortpitch * 3];

-    b1 = ip[shortpitch * 1] + ip[shortpitch * 2];

-    c1 = ip[shortpitch * 1] - ip[shortpitch * 2];

-    d1 = ip[shortpitch * 0] - ip[shortpitch * 3];

-    op[shortpitch * 0] = (a1 + b1 + 1) >> 1;

-    op[shortpitch * 1] = (c1 + d1) >> 1;

-    op[shortpitch * 2] = (a1 - b1) >> 1;

-    op[shortpitch * 3] = (d1 - c1) >> 1;

-    ip++;

-    op++;

-  }

-}

-void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) {

-  int i;

-  int16_t tmp[4];

-  int16_t *ip = in;

-  int16_t *op = tmp;

-  int shortpitch = pitch >> 1;

-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;

-  op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1);

-  ip = tmp;

-  op = out;

-  for (i = 0; i < 4; i++) {

-    op[shortpitch * 0] = (ip[0] + 1) >> 1;

-    op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1;

-    ip++;

-    op++;

-  }

-}

-void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr,

-                                 uint8_t *dst_ptr,

-                                 int pitch, int stride) {

-  int r, c;

-  short tmp[16];

-  vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1);

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);

-    }

-    dst_ptr += stride;

-    pred_ptr += pitch;

-  }

-}

-#endif

-void vp9_dc_only_idct_add_8x8_c(short input_dc,

-                                uint8_t *pred_ptr,

-                                uint8_t *dst_ptr,

-                                int pitch, int stride) {

-  int a1 = ((input_dc + 16) >> 5);

-  int r, c, b;

-  uint8_t *orig_pred = pred_ptr;

-  uint8_t *orig_dst = dst_ptr;

-  for (b = 0; b < 4; b++) {

-    for (r = 0; r < 4; r++) {

-      for (c = 0; c < 4; c++) {

-        dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]);

-      }

-      dst_ptr += stride;

-      pred_ptr += pitch;

-    }

-    dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;

-    pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;

-  }

-}

-#define W1 2841                 /* 2048*sqrt(2)*cos(1*pi/16) */

-#define W2 2676                 /* 2048*sqrt(2)*cos(2*pi/16) */

-#define W3 2408                 /* 2048*sqrt(2)*cos(3*pi/16) */

-#define W5 1609                 /* 2048*sqrt(2)*cos(5*pi/16) */

-#define W6 1108                 /* 2048*sqrt(2)*cos(6*pi/16) */

-#define W7 565                  /* 2048*sqrt(2)*cos(7*pi/16) */

-/* row (horizontal) IDCT

- *

- * 7                       pi         1 dst[k] = sum c[l] * src[l] * cos( -- *

- * ( k + - ) * l ) l=0                      8          2

- *

- * where: c[0]    = 128 c[1..7] = 128*sqrt(2) */

-static void idctrow(int *blk) {

-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

-  /* shortcut */

-  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |

-        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {

-    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]

-                                        = blk[5] = blk[6] = blk[7] = blk[0] << 3;

-    return;

-  }

-  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */

-  /* first stage */

-  x8 = W7 * (x4 + x5);

-  x4 = x8 + (W1 - W7) * x4;

-  x5 = x8 - (W1 + W7) * x5;

-  x8 = W3 * (x6 + x7);

-  x6 = x8 - (W3 - W5) * x6;

-  x7 = x8 - (W3 + W5) * x7;

-  /* second stage */

-  x8 = x0 + x1;

-  x0 -= x1;

-  x1 = W6 * (x3 + x2);

-  x2 = x1 - (W2 + W6) * x2;

-  x3 = x1 + (W2 - W6) * x3;

-  x1 = x4 + x6;

-  x4 -= x6;

-  x6 = x5 + x7;

-  x5 -= x7;

-  /* third stage */

-  x7 = x8 + x3;

-  x8 -= x3;

-  x3 = x0 + x2;

-  x0 -= x2;

-  x2 = (181 * (x4 + x5) + 128) >> 8;

-  x4 = (181 * (x4 - x5) + 128) >> 8;

-  /* fourth stage */

-  blk[0] = (x7 + x1) >> 8;

-  blk[1] = (x3 + x2) >> 8;

-  blk[2] = (x0 + x4) >> 8;

-  blk[3] = (x8 + x6) >> 8;

-  blk[4] = (x8 - x6) >> 8;

-  blk[5] = (x0 - x4) >> 8;

-  blk[6] = (x3 - x2) >> 8;

-  blk[7] = (x7 - x1) >> 8;

-}

-/* column (vertical) IDCT

- *

- * 7                         pi         1 dst[8*k] = sum c[l] * src[8*l] *

- * cos( -- * ( k + - ) * l ) l=0                        8          2

- *

- * where: c[0]    = 1/1024 c[1..7] = (1/1024)*sqrt(2) */

-static void idctcol(int *blk) {

-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

-  /* shortcut */

-  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |

-        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |

-        (x7 = blk[8 * 3]))) {

-    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]

-        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]

-        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);

-    return;

-  }

-  x0 = (blk[8 * 0] << 8) + 16384;

-  /* first stage */

-  x8 = W7 * (x4 + x5) + 4;

-  x4 = (x8 + (W1 - W7) * x4) >> 3;

-  x5 = (x8 - (W1 + W7) * x5) >> 3;

-  x8 = W3 * (x6 + x7) + 4;

-  x6 = (x8 - (W3 - W5) * x6) >> 3;

-  x7 = (x8 - (W3 + W5) * x7) >> 3;

-  /* second stage */

-  x8 = x0 + x1;

-  x0 -= x1;

-  x1 = W6 * (x3 + x2) + 4;

-  x2 = (x1 - (W2 + W6) * x2) >> 3;

-  x3 = (x1 + (W2 - W6) * x3) >> 3;

-  x1 = x4 + x6;

-  x4 -= x6;

-  x6 = x5 + x7;

-  x5 -= x7;

-  /* third stage */

-  x7 = x8 + x3;

-  x8 -= x3;

-  x3 = x0 + x2;

-  x0 -= x2;

-  x2 = (181 * (x4 + x5) + 128) >> 8;

-  x4 = (181 * (x4 - x5) + 128) >> 8;

-  /* fourth stage */

-  blk[8 * 0] = (x7 + x1) >> 14;

-  blk[8 * 1] = (x3 + x2) >> 14;

-  blk[8 * 2] = (x0 + x4) >> 14;

-  blk[8 * 3] = (x8 + x6) >> 14;

-  blk[8 * 4] = (x8 - x6) >> 14;

-  blk[8 * 5] = (x0 - x4) >> 14;

-  blk[8 * 6] = (x3 - x2) >> 14;

-  blk[8 * 7] = (x7 - x1) >> 14;

-}

-#define TX_DIM 8

-void vp9_short_idct8x8_c(int16_t *coefs, int16_t *block, int pitch) {

-  int X[TX_DIM * TX_DIM];

-  int i, j;

-  int shortpitch = pitch >> 1;

-  for (i = 0; i < TX_DIM; i++) {

-    for (j = 0; j < TX_DIM; j++) {

-      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1

-                                + (coefs[i * TX_DIM + j] < 0)) >> 2;

-    }

-  }

-  for (i = 0; i < 8; i++)

-    idctrow(X + 8 * i);

-  for (i = 0; i < 8; i++)

-    idctcol(X + i);

-  for (i = 0; i < TX_DIM; i++) {

-    for (j = 0; j < TX_DIM; j++) {

-      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;

-    }

-  }

-}

-/* Row IDCT when only first 4 coefficients are non-zero. */

-static void idctrow10(int *blk) {

-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

-  /* shortcut */

-  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |

-        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {

-    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]

-           = blk[5] = blk[6] = blk[7] = blk[0] << 3;

-    return;

-  }

-  x0 = (blk[0] << 11) + 128;    /* for proper rounding in the fourth stage */

-  /* first stage */

-  x5 = W7 * x4;

-  x4 = W1 * x4;

-  x6 = W3 * x7;

-  x7 = -W5 * x7;

-  /* second stage */

-  x2 = W6 * x3;

-  x3 = W2 * x3;

-  x1 = x4 + x6;

-  x4 -= x6;

-  x6 = x5 + x7;

-  x5 -= x7;

-  /* third stage */

-  x7 = x0 + x3;

-  x8 = x0 - x3;

-  x3 = x0 + x2;

-  x0 -= x2;

-  x2 = (181 * (x4 + x5) + 128) >> 8;

-  x4 = (181 * (x4 - x5) + 128) >> 8;

-  /* fourth stage */

-  blk[0] = (x7 + x1) >> 8;

-  blk[1] = (x3 + x2) >> 8;

-  blk[2] = (x0 + x4) >> 8;

-  blk[3] = (x8 + x6) >> 8;

-  blk[4] = (x8 - x6) >> 8;

-  blk[5] = (x0 - x4) >> 8;

-  blk[6] = (x3 - x2) >> 8;

-  blk[7] = (x7 - x1) >> 8;

-}

-/* Column (vertical) IDCT when only first 4 coefficients are non-zero. */

-static void idctcol10(int *blk) {

-  int x0, x1, x2, x3, x4, x5, x6, x7, x8;

-  /* shortcut */

-  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |

-        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |

-        (x7 = blk[8 * 3]))) {

-    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]

-        = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]

-        = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);

-    return;

-  }

-  x0 = (blk[8 * 0] << 8) + 16384;

-  /* first stage */

-  x5 = (W7 * x4 + 4) >> 3;

-  x4 = (W1 * x4 + 4) >> 3;

-  x6 = (W3 * x7 + 4) >> 3;

-  x7 = (-W5 * x7 + 4) >> 3;

-  /* second stage */

-  x2 = (W6 * x3 + 4) >> 3;

-  x3 = (W2 * x3 + 4) >> 3;

-  x1 = x4 + x6;

-  x4 -= x6;

-  x6 = x5 + x7;

-  x5 -= x7;

-  /* third stage */

-  x7 = x0 + x3;

-  x8 = x0 - x3;

-  x3 = x0 + x2;

-  x0 -= x2;

-  x2 = (181 * (x4 + x5) + 128) >> 8;

-  x4 = (181 * (x4 - x5) + 128) >> 8;

-  /* fourth stage */

-  blk[8 * 0] = (x7 + x1) >> 14;

-  blk[8 * 1] = (x3 + x2) >> 14;

-  blk[8 * 2] = (x0 + x4) >> 14;

-  blk[8 * 3] = (x8 + x6) >> 14;

-  blk[8 * 4] = (x8 - x6) >> 14;

-  blk[8 * 5] = (x0 - x4) >> 14;

-  blk[8 * 6] = (x3 - x2) >> 14;

-  blk[8 * 7] = (x7 - x1) >> 14;

-}

-void vp9_short_idct10_8x8_c(int16_t *coefs, int16_t *block, int pitch) {

-  int X[TX_DIM * TX_DIM];

-  int i, j;

-  int shortpitch = pitch >> 1;

-  for (i = 0; i < TX_DIM; i++) {

-    for (j = 0; j < TX_DIM; j++) {

-      X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1

-                                + (coefs[i * TX_DIM + j] < 0)) >> 2;

-    }

-  }

-  /* Do first 4 row idct only since non-zero dct coefficients are all in

-   * upper-left 4x4 area. */

-  for (i = 0; i < 4; i++)

-    idctrow10(X + 8 * i);

-  for (i = 0; i < 8; i++)

-    idctcol10(X + i);

-  for (i = 0; i < TX_DIM; i++) {

-    for (j = 0; j < TX_DIM; j++) {

-      block[i * shortpitch + j]  = X[i * TX_DIM + j] >> 1;

-    }

-  }

-}

-void vp9_short_ihaar2x2_c(int16_t *input, int16_t *output, int pitch) {

-  int i;

-  int16_t *ip = input;  // 0, 1, 4, 8

-  int16_t *op = output;

-  for (i = 0; i < 16; i++) {

-    op[i] = 0;

-  }

-  op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1;

-  op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1;

-  op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1;

-  op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;

-}

-#if 0

-// Keep a really bad float version as reference for now.

-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-  {

-    double x;

-    const int short_pitch = pitch >> 1;

-    int i, j, k, l;

-    for (l = 0; l < 16; ++l) {

-      for (k = 0; k < 16; ++k) {

-        double s = 0;

-        for (i = 0; i < 16; ++i) {

-          for (j = 0; j < 16; ++j) {

-            x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;

-            if (i != 0)

-              x *= sqrt(2.0);

-            if (j != 0)

-              x *= sqrt(2.0);

-            s += x;

-          }

-        }

-        output[k*short_pitch+l] = (short)round(s);

-      }

-    }

-  }

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-}

-#endif

-#define TEST_INT_16x16_IDCT 1

-#if !TEST_INT_16x16_IDCT

-static void butterfly_16x16_idct_1d(double input[16], double output[16]) {

-  static const double C1 = 0.995184726672197;

-  static const double C2 = 0.98078528040323;

-  static const double C3 = 0.956940335732209;

-  static const double C4 = 0.923879532511287;

-  static const double C5 = 0.881921264348355;

-  static const double C6 = 0.831469612302545;

-  static const double C7 = 0.773010453362737;

-  static const double C8 = 0.707106781186548;

-  static const double C9 = 0.634393284163646;

-  static const double C10 = 0.555570233019602;

-  static const double C11 = 0.471396736825998;

-  static const double C12 = 0.38268343236509;

-  static const double C13 = 0.290284677254462;

-  static const double C14 = 0.195090322016128;

-  static const double C15 = 0.098017140329561;

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-  {

-    double step[16];

-    double intermediate[16];

-    double temp1, temp2;

-    // step 1 and 2

-    step[ 0] = input[0] + input[8];

-    step[ 1] = input[0] - input[8];

-    temp1 = input[4]*C12;

-    temp2 = input[12]*C4;

-    temp1 -= temp2;

-    temp1 *= C8;

-    step[ 2] = 2*(temp1);

-    temp1 = input[4]*C4;

-    temp2 = input[12]*C12;

-    temp1 += temp2;

-    temp1 = (temp1);

-    temp1 *= C8;

-    step[ 3] = 2*(temp1);

-    temp1 = input[2]*C8;

-    temp1 = 2*(temp1);

-    temp2 = input[6] + input[10];

-    step[ 4] = temp1 + temp2;

-    step[ 5] = temp1 - temp2;

-    temp1 = input[14]*C8;

-    temp1 = 2*(temp1);

-    temp2 = input[6] - input[10];

-    step[ 6] = temp2 - temp1;

-    step[ 7] = temp2 + temp1;

-    // for odd input

-    temp1 = input[3]*C12;

-    temp2 = input[13]*C4;

-    temp1 += temp2;

-    temp1 = (temp1);

-    temp1 *= C8;

-    intermediate[ 8] = 2*(temp1);

-    temp1 = input[3]*C4;

-    temp2 = input[13]*C12;

-    temp2 -= temp1;

-    temp2 = (temp2);

-    temp2 *= C8;

-    intermediate[ 9] = 2*(temp2);

-    intermediate[10] = 2*(input[9]*C8);

-    intermediate[11] = input[15] - input[1];

-    intermediate[12] = input[15] + input[1];

-    intermediate[13] = 2*((input[7]*C8));

-    temp1 = input[11]*C12;

-    temp2 = input[5]*C4;

-    temp2 -= temp1;

-    temp2 = (temp2);

-    temp2 *= C8;

-    intermediate[14] = 2*(temp2);

-    temp1 = input[11]*C4;

-    temp2 = input[5]*C12;

-    temp1 += temp2;

-    temp1 = (temp1);

-    temp1 *= C8;

-    intermediate[15] = 2*(temp1);

-    step[ 8] = intermediate[ 8] + intermediate[14];

-    step[ 9] = intermediate[ 9] + intermediate[15];

-    step[10] = intermediate[10] + intermediate[11];

-    step[11] = intermediate[10] - intermediate[11];

-    step[12] = intermediate[12] + intermediate[13];

-    step[13] = intermediate[12] - intermediate[13];

-    step[14] = intermediate[ 8] - intermediate[14];

-    step[15] = intermediate[ 9] - intermediate[15];

-    // step 3

-    output[0] = step[ 0] + step[ 3];

-    output[1] = step[ 1] + step[ 2];

-    output[2] = step[ 1] - step[ 2];

-    output[3] = step[ 0] - step[ 3];

-    temp1 = step[ 4]*C14;

-    temp2 = step[ 7]*C2;

-    temp1 -= temp2;

-    output[4] =  (temp1);

-    temp1 = step[ 4]*C2;

-    temp2 = step[ 7]*C14;

-    temp1 += temp2;

-    output[7] =  (temp1);

-    temp1 = step[ 5]*C10;

-    temp2 = step[ 6]*C6;

-    temp1 -= temp2;

-    output[5] =  (temp1);

-    temp1 = step[ 5]*C6;

-    temp2 = step[ 6]*C10;

-    temp1 += temp2;

-    output[6] =  (temp1);

-    output[8] = step[ 8] + step[11];

-    output[9] = step[ 9] + step[10];

-    output[10] = step[ 9] - step[10];

-    output[11] = step[ 8] - step[11];

-    output[12] = step[12] + step[15];

-    output[13] = step[13] + step[14];

-    output[14] = step[13] - step[14];

-    output[15] = step[12] - step[15];

-    // output 4

-    step[ 0] = output[0] + output[7];

-    step[ 1] = output[1] + output[6];

-    step[ 2] = output[2] + output[5];

-    step[ 3] = output[3] + output[4];

-    step[ 4] = output[3] - output[4];

-    step[ 5] = output[2] - output[5];

-    step[ 6] = output[1] - output[6];

-    step[ 7] = output[0] - output[7];

-    temp1 = output[8]*C7;

-    temp2 = output[15]*C9;

-    temp1 -= temp2;

-    step[ 8] = (temp1);

-    temp1 = output[9]*C11;

-    temp2 = output[14]*C5;

-    temp1 += temp2;

-    step[ 9] = (temp1);

-    temp1 = output[10]*C3;

-    temp2 = output[13]*C13;

-    temp1 -= temp2;

-    step[10] = (temp1);

-    temp1 = output[11]*C15;

-    temp2 = output[12]*C1;

-    temp1 += temp2;

-    step[11] = (temp1);

-    temp1 = output[11]*C1;

-    temp2 = output[12]*C15;

-    temp2 -= temp1;

-    step[12] = (temp2);

-    temp1 = output[10]*C13;

-    temp2 = output[13]*C3;

-    temp1 += temp2;

-    step[13] = (temp1);

-    temp1 = output[9]*C5;

-    temp2 = output[14]*C11;

-    temp2 -= temp1;

-    step[14] = (temp2);

-    temp1 = output[8]*C9;

-    temp2 = output[15]*C7;

-    temp1 += temp2;

-    step[15] = (temp1);

-    // step 5

-    output[0] = (step[0] + step[15]);

-    output[1] = (step[1] + step[14]);

-    output[2] = (step[2] + step[13]);

-    output[3] = (step[3] + step[12]);

-    output[4] = (step[4] + step[11]);

-    output[5] = (step[5] + step[10]);

-    output[6] = (step[6] + step[ 9]);

-    output[7] = (step[7] + step[ 8]);

-    output[15] = (step[0] - step[15]);

-    output[14] = (step[1] - step[14]);

-    output[13] = (step[2] - step[13]);

-    output[12] = (step[3] - step[12]);

-    output[11] = (step[4] - step[11]);

-    output[10] = (step[5] - step[10]);

-    output[9] = (step[6] - step[ 9]);

-    output[8] = (step[7] - step[ 8]);

-  }

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-}

-// Remove once an int version of iDCT is written

-#if 0

-void reference_16x16_idct_1d(double input[16], double output[16]) {

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-  {

-    const double kPi = 3.141592653589793238462643383279502884;

-    const double kSqrt2 = 1.414213562373095048801688724209698;

-    for (int k = 0; k < 16; k++) {

-      output[k] = 0.0;

-      for (int n = 0; n < 16; n++) {

-        output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);

-        if (n == 0)

-          output[k] = output[k]/kSqrt2;

-      }

-    }

-  }

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-}

-#endif

-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-  {

-    double out[16*16], out2[16*16];

-    const int short_pitch = pitch >> 1;

-    int i, j;

-      // First transform rows

-    for (i = 0; i < 16; ++i) {

-      double temp_in[16], temp_out[16];

-      for (j = 0; j < 16; ++j)

-        temp_in[j] = input[j + i*short_pitch];

-      butterfly_16x16_idct_1d(temp_in, temp_out);

-      for (j = 0; j < 16; ++j)

-        out[j + i*16] = temp_out[j];

-    }

-    // Then transform columns

-    for (i = 0; i < 16; ++i) {

-      double temp_in[16], temp_out[16];

-      for (j = 0; j < 16; ++j)

-        temp_in[j] = out[j*16 + i];

-      butterfly_16x16_idct_1d(temp_in, temp_out);

-      for (j = 0; j < 16; ++j)

-        out2[j*16 + i] = temp_out[j];

-    }

-    for (i = 0; i < 16*16; ++i)

-      output[i] = round(out2[i]/128);

-  }

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-}

-#else

-#define INITIAL_SHIFT 2

-#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1))

-#define RIGHT_SHIFT 14

-#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1))

-static const int16_t C1 = 16305;

-static const int16_t C2 = 16069;

-static const int16_t C3 = 15679;

-static const int16_t C4 = 15137;

-static const int16_t C5 = 14449;

-static const int16_t C6 = 13623;

-static const int16_t C7 = 12665;

-static const int16_t C8 = 11585;

-static const int16_t C9 = 10394;

-static const int16_t C10 = 9102;

-static const int16_t C11 = 7723;

-static const int16_t C12 = 6270;

-static const int16_t C13 = 4756;

-static const int16_t C14 = 3196;

-static const int16_t C15 = 1606;

-static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16],

-                                    int last_shift_bits) {

-  int16_t step[16];

-  int intermediate[16];

-  int temp1, temp2;

-  int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT;

-  int step1_rounding = 1 << (step1_shift - 1);

-  int last_rounding = 0;

-  if (last_shift_bits > 0)

-    last_rounding = 1 << (last_shift_bits - 1);

-  // step 1 and 2

-  step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-  step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-  temp1 = input[4] * C12;

-  temp2 = input[12] * C4;

-  temp1 = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1  *= C8;

-  step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift;

-  temp1 = input[4] * C4;

-  temp2 = input[12] * C12;

-  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 *= C8;

-  step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift;

-  temp1 = input[2] * C8;

-  temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp2 = input[6] + input[10];

-  step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-  step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-  temp1 = input[14] * C8;

-  temp1 = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp2 = input[6] - input[10];

-  step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-  step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-  // for odd input

-  temp1 = input[3] * C12;

-  temp2 = input[13] * C4;

-  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 *= C8;

-  intermediate[ 8] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = input[3] * C4;

-  temp2 = input[13] * C12;

-  temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp2 *= C8;

-  intermediate[ 9] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  intermediate[11] = input[15] - input[1];

-  intermediate[12] = input[15] + input[1];

-  intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = input[11] * C12;

-  temp2 = input[5] * C4;

-  temp2 = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp2 *= C8;

-  intermediate[14] = (2 * (temp2) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = input[11] * C4;

-  temp2 = input[5] * C12;

-  temp1 = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 *= C8;

-  intermediate[15] = (2 * (temp1) +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING)

-      >> INITIAL_SHIFT;

-  // step 3

-  output[0] = step[ 0] + step[ 3];

-  output[1] = step[ 1] + step[ 2];

-  output[2] = step[ 1] - step[ 2];

-  output[3] = step[ 0] - step[ 3];

-  temp1 = step[ 4] * C14;

-  temp2 = step[ 7] * C2;

-  output[4] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = step[ 4] * C2;

-  temp2 = step[ 7] * C14;

-  output[7] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = step[ 5] * C10;

-  temp2 = step[ 6] * C6;

-  output[5] =  (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = step[ 5] * C6;

-  temp2 = step[ 6] * C10;

-  output[6] =  (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  output[8] = step[ 8] + step[11];

-  output[9] = step[ 9] + step[10];

-  output[10] = step[ 9] - step[10];

-  output[11] = step[ 8] - step[11];

-  output[12] = step[12] + step[15];

-  output[13] = step[13] + step[14];

-  output[14] = step[13] - step[14];

-  output[15] = step[12] - step[15];

-  // output 4

-  step[ 0] = output[0] + output[7];

-  step[ 1] = output[1] + output[6];

-  step[ 2] = output[2] + output[5];

-  step[ 3] = output[3] + output[4];

-  step[ 4] = output[3] - output[4];

-  step[ 5] = output[2] - output[5];

-  step[ 6] = output[1] - output[6];

-  step[ 7] = output[0] - output[7];

-  temp1 = output[8] * C7;

-  temp2 = output[15] * C9;

-  step[ 8] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = output[9] * C11;

-  temp2 = output[14] * C5;

-  step[ 9] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = output[10] * C3;

-  temp2 = output[13] * C13;

-  step[10] = (temp1 - temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = output[11] * C15;

-  temp2 = output[12] * C1;

-  step[11] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = output[11] * C1;

-  temp2 = output[12] * C15;

-  step[12] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = output[10] * C13;

-  temp2 = output[13] * C3;

-  step[13] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = output[9] * C5;

-  temp2 = output[14] * C11;

-  step[14] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  temp1 = output[8] * C9;

-  temp2 = output[15] * C7;

-  step[15] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-  // step 5

-  output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;

-  output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;

-  output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;

-  output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;

-  output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;

-  output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;

-  output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;

-  output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;

-  output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;

-  output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;

-  output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;

-  output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;

-  output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;

-  output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;

-  output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;

-  output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;

-}

-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {

-  int16_t out[16 * 16];

-  int16_t *outptr = &out[0];

-  const int short_pitch = pitch >> 1;

-  int i, j;

-  int16_t temp_in[16], temp_out[16];

-  // First transform rows

-  for (i = 0; i < 16; ++i) {

-    butterfly_16x16_idct_1d(input, outptr, 0);

-    input += short_pitch;

-    outptr += 16;

-  }

-  // Then transform columns

-  for (i = 0; i < 16; ++i) {

-    for (j = 0; j < 16; ++j)

-      temp_in[j] = out[j * 16 + i];

-    butterfly_16x16_idct_1d(temp_in, temp_out, 3);

-    for (j = 0; j < 16; ++j)

-        output[j * 16 + i] = temp_out[j];

-    }

-}

-/* The following function is called when we know the maximum number of non-zero

- * dct coefficients is less or equal 10.

- */

-static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16],

-                                      int last_shift_bits) {

-    int16_t step[16] = {0};

-    int intermediate[16] = {0};

-    int temp1, temp2;

-    int last_rounding = 0;

-    if (last_shift_bits > 0)

-      last_rounding = 1 << (last_shift_bits - 1);

-    // step 1 and 2

-    step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    // for odd input

-    temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 *= C8;

-    intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 *= C8;

-    intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

-    // step 3

-    output[0] = step[ 0];

-    output[1] = step[ 1];

-    output[2] = step[ 1];

-    output[3] = step[ 0];

-    temp1 = step[ 4] * C14;

-    output[4] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[ 4] * C2;

-    output[7] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[ 5] * C10;

-    output[5] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[ 5] * C6;

-    output[6] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    output[8] = step[ 8] + step[11];

-    output[9] = step[ 9] + step[10];

-    output[10] = step[ 9] - step[10];

-    output[11] = step[ 8] - step[11];

-    output[12] = step[12] + step[15];

-    output[13] = step[13] + step[14];

-    output[14] = step[13] - step[14];

-    output[15] = step[12] - step[15];

-    // output 4

-    step[ 0] = output[0] + output[7];

-    step[ 1] = output[1] + output[6];

-    step[ 2] = output[2] + output[5];

-    step[ 3] = output[3] + output[4];

-    step[ 4] = output[3] - output[4];

-    step[ 5] = output[2] - output[5];

-    step[ 6] = output[1] - output[6];

-    step[ 7] = output[0] - output[7];

-    temp1 = output[8] * C7;

-    temp2 = output[15] * C9;

-    step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[9] * C11;

-    temp2 = output[14] * C5;

-    step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[10] * C3;

-    temp2 = output[13] * C13;

-    step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[11] * C15;

-    temp2 = output[12] * C1;

-    step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[11] * C1;

-    temp2 = output[12] * C15;

-    step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[10] * C13;

-    temp2 = output[13] * C3;

-    step[13] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[9] * C5;

-    temp2 = output[14] * C11;

-    step[14] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[8] * C9;

-    temp2 = output[15] * C7;

-    step[15] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

-    // step 5

-    output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;

-    output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;

-    output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;

-    output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;

-    output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;

-    output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;

-    output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;

-    output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;

-    output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;

-    output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;

-    output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;

-    output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;

-    output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;

-    output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;

-    output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;

-    output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;

-}

-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {

-    int16_t out[16 * 16];

-    int16_t *outptr = &out[0];

-    const int short_pitch = pitch >> 1;

-    int i, j;

-    int16_t temp_in[16], temp_out[16];

-    /* First transform rows. Since all non-zero dct coefficients are in

-     * upper-left 4x4 area, we only need to calculate first 4 rows here.

-     */

-    vpx_memset(out, 0, sizeof(out));

-    for (i = 0; i < 4; ++i) {

-      butterfly_16x16_idct10_1d(input, outptr, 0);

-      input += short_pitch;

-      outptr += 16;

-    }

-    // Then transform columns

-    for (i = 0; i < 16; ++i) {

-      for (j = 0; j < 16; ++j)

-        temp_in[j] = out[j*16 + i];

-      butterfly_16x16_idct10_1d(temp_in, temp_out, 3);

-      for (j = 0; j < 16; ++j)

-        output[j*16 + i] = temp_out[j];

-    }

-}

-#undef INITIAL_SHIFT

-#undef INITIAL_ROUNDING

-#undef RIGHT_SHIFT

-#undef RIGHT_ROUNDING

-#endif

-#if !CONFIG_DWTDCTHYBRID

-#define DownshiftMultiplyBy2(x) x * 2

-#define DownshiftMultiply(x) x

-static void idct16(double *input, double *output, int stride) {

-  static const double C1 = 0.995184726672197;

-  static const double C2 = 0.98078528040323;

-  static const double C3 = 0.956940335732209;

-  static const double C4 = 0.923879532511287;

-  static const double C5 = 0.881921264348355;

-  static const double C6 = 0.831469612302545;

-  static const double C7 = 0.773010453362737;

-  static const double C8 = 0.707106781186548;

-  static const double C9 = 0.634393284163646;

-  static const double C10 = 0.555570233019602;

-  static const double C11 = 0.471396736825998;

-  static const double C12 = 0.38268343236509;

-  static const double C13 = 0.290284677254462;

-  static const double C14 = 0.195090322016128;

-  static const double C15 = 0.098017140329561;

-  double step[16];

-  double intermediate[16];

-  double temp1, temp2;

-  // step 1 and 2

-  step[ 0] = input[stride*0] + input[stride*8];

-  step[ 1] = input[stride*0] - input[stride*8];

-  temp1 = input[stride*4]*C12;

-  temp2 = input[stride*12]*C4;

-  temp1 -= temp2;

-  temp1 = DownshiftMultiply(temp1);

-  temp1 *= C8;

-  step[ 2] = DownshiftMultiplyBy2(temp1);

-  temp1 = input[stride*4]*C4;

-  temp2 = input[stride*12]*C12;

-  temp1 += temp2;

-  temp1 = DownshiftMultiply(temp1);

-  temp1 *= C8;

-  step[ 3] = DownshiftMultiplyBy2(temp1);

-  temp1 = input[stride*2]*C8;

-  temp1 = DownshiftMultiplyBy2(temp1);

-  temp2 = input[stride*6] + input[stride*10];

-  step[ 4] = temp1 + temp2;

-  step[ 5] = temp1 - temp2;

-  temp1 = input[stride*14]*C8;

-  temp1 = DownshiftMultiplyBy2(temp1);

-  temp2 = input[stride*6] - input[stride*10];

-  step[ 6] = temp2 - temp1;

-  step[ 7] = temp2 + temp1;

-  // for odd input

-  temp1 = input[stride*3]*C12;

-  temp2 = input[stride*13]*C4;

-  temp1 += temp2;

-  temp1 = DownshiftMultiply(temp1);

-  temp1 *= C8;

-  intermediate[ 8] = DownshiftMultiplyBy2(temp1);

-  temp1 = input[stride*3]*C4;

-  temp2 = input[stride*13]*C12;

-  temp2 -= temp1;

-  temp2 = DownshiftMultiply(temp2);

-  temp2 *= C8;

-  intermediate[ 9] = DownshiftMultiplyBy2(temp2);

-  intermediate[10] = DownshiftMultiplyBy2(input[stride*9]*C8);

-  intermediate[11] = input[stride*15] - input[stride*1];

-  intermediate[12] = input[stride*15] + input[stride*1];

-  intermediate[13] = DownshiftMultiplyBy2((input[stride*7]*C8));

-  temp1 = input[stride*11]*C12;

-  temp2 = input[stride*5]*C4;

-  temp2 -= temp1;

-  temp2 = DownshiftMultiply(temp2);

-  temp2 *= C8;

-  intermediate[14] = DownshiftMultiplyBy2(temp2);

-  temp1 = input[stride*11]*C4;

-  temp2 = input[stride*5]*C12;

-  temp1 += temp2;

-  temp1 = DownshiftMultiply(temp1);

-  temp1 *= C8;

-  intermediate[15] = DownshiftMultiplyBy2(temp1);

-  step[ 8] = intermediate[ 8] + intermediate[14];

-  step[ 9] = intermediate[ 9] + intermediate[15];

-  step[10] = intermediate[10] + intermediate[11];

-  step[11] = intermediate[10] - intermediate[11];

-  step[12] = intermediate[12] + intermediate[13];

-  step[13] = intermediate[12] - intermediate[13];

-  step[14] = intermediate[ 8] - intermediate[14];

-  step[15] = intermediate[ 9] - intermediate[15];

-  // step 3

-  output[stride*0] = step[ 0] + step[ 3];

-  output[stride*1] = step[ 1] + step[ 2];

-  output[stride*2] = step[ 1] - step[ 2];

-  output[stride*3] = step[ 0] - step[ 3];

-  temp1 = step[ 4]*C14;

-  temp2 = step[ 7]*C2;

-  temp1 -= temp2;

-  output[stride*4] =  DownshiftMultiply(temp1);

-  temp1 = step[ 4]*C2;

-  temp2 = step[ 7]*C14;

-  temp1 += temp2;

-  output[stride*7] =  DownshiftMultiply(temp1);

-  temp1 = step[ 5]*C10;

-  temp2 = step[ 6]*C6;

-  temp1 -= temp2;

-  output[stride*5] =  DownshiftMultiply(temp1);

-  temp1 = step[ 5]*C6;

-  temp2 = step[ 6]*C10;

-  temp1 += temp2;

-  output[stride*6] =  DownshiftMultiply(temp1);

-  output[stride*8] = step[ 8] + step[11];

-  output[stride*9] = step[ 9] + step[10];

-  output[stride*10] = step[ 9] - step[10];

-  output[stride*11] = step[ 8] - step[11];

-  output[stride*12] = step[12] + step[15];

-  output[stride*13] = step[13] + step[14];

-  output[stride*14] = step[13] - step[14];

-  output[stride*15] = step[12] - step[15];

-  // output 4

-  step[ 0] = output[stride*0] + output[stride*7];

-  step[ 1] = output[stride*1] + output[stride*6];

-  step[ 2] = output[stride*2] + output[stride*5];

-  step[ 3] = output[stride*3] + output[stride*4];

-  step[ 4] = output[stride*3] - output[stride*4];

-  step[ 5] = output[stride*2] - output[stride*5];

-  step[ 6] = output[stride*1] - output[stride*6];

-  step[ 7] = output[stride*0] - output[stride*7];

-  temp1 = output[stride*8]*C7;

-  temp2 = output[stride*15]*C9;

-  temp1 -= temp2;

-  step[ 8] = DownshiftMultiply(temp1);

-  temp1 = output[stride*9]*C11;

-  temp2 = output[stride*14]*C5;

-  temp1 += temp2;

-  step[ 9] = DownshiftMultiply(temp1);

-  temp1 = output[stride*10]*C3;

-  temp2 = output[stride*13]*C13;

-  temp1 -= temp2;

-  step[10] = DownshiftMultiply(temp1);

-  temp1 = output[stride*11]*C15;

-  temp2 = output[stride*12]*C1;

-  temp1 += temp2;

-  step[11] = DownshiftMultiply(temp1);

-  temp1 = output[stride*11]*C1;

-  temp2 = output[stride*12]*C15;

-  temp2 -= temp1;

-  step[12] = DownshiftMultiply(temp2);

-  temp1 = output[stride*10]*C13;

-  temp2 = output[stride*13]*C3;

-  temp1 += temp2;

-  step[13] = DownshiftMultiply(temp1);

-  temp1 = output[stride*9]*C5;

-  temp2 = output[stride*14]*C11;

-  temp2 -= temp1;

-  step[14] = DownshiftMultiply(temp2);

-  temp1 = output[stride*8]*C9;

-  temp2 = output[stride*15]*C7;

-  temp1 += temp2;

-  step[15] = DownshiftMultiply(temp1);

-  // step 5

-  output[stride*0] = step[0] + step[15];

-  output[stride*1] = step[1] + step[14];

-  output[stride*2] = step[2] + step[13];

-  output[stride*3] = step[3] + step[12];

-  output[stride*4] = step[4] + step[11];

-  output[stride*5] = step[5] + step[10];

-  output[stride*6] = step[6] + step[ 9];

-  output[stride*7] = step[7] + step[ 8];

-  output[stride*15] = step[0] - step[15];

-  output[stride*14] = step[1] - step[14];

-  output[stride*13] = step[2] - step[13];

-  output[stride*12] = step[3] - step[12];

-  output[stride*11] = step[4] - step[11];

-  output[stride*10] = step[5] - step[10];

-  output[stride*9] = step[6] - step[ 9];

-  output[stride*8] = step[7] - step[ 8];

-}

-static void butterfly_32_idct_1d(double *input, double *output, int stride) {

-  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)

-  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)

-  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)

-  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)

-  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)

-  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)

-  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)

-  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)

-  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)

-  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)

-  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)

-  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)

-  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)

-  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)

-  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)

-  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)

-  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)

-  double step1[32];

-  double step2[32];

-  step1[ 0] = input[stride*0];

-  step1[ 1] = input[stride*2];

-  step1[ 2] = input[stride*4];

-  step1[ 3] = input[stride*6];

-  step1[ 4] = input[stride*8];

-  step1[ 5] = input[stride*10];

-  step1[ 6] = input[stride*12];

-  step1[ 7] = input[stride*14];

-  step1[ 8] = input[stride*16];

-  step1[ 9] = input[stride*18];

-  step1[10] = input[stride*20];

-  step1[11] = input[stride*22];

-  step1[12] = input[stride*24];

-  step1[13] = input[stride*26];

-  step1[14] = input[stride*28];

-  step1[15] = input[stride*30];

-  step1[16] = DownshiftMultiplyBy2(input[stride*1]*C16);

-  step1[17] = (input[stride*3] + input[stride*1]);

-  step1[18] = (input[stride*5] + input[stride*3]);

-  step1[19] = (input[stride*7] + input[stride*5]);

-  step1[20] = (input[stride*9] + input[stride*7]);

-  step1[21] = (input[stride*11] + input[stride*9]);

-  step1[22] = (input[stride*13] + input[stride*11]);

-  step1[23] = (input[stride*15] + input[stride*13]);

-  step1[24] = (input[stride*17] + input[stride*15]);

-  step1[25] = (input[stride*19] + input[stride*17]);

-  step1[26] = (input[stride*21] + input[stride*19]);

-  step1[27] = (input[stride*23] + input[stride*21]);

-  step1[28] = (input[stride*25] + input[stride*23]);

-  step1[29] = (input[stride*27] + input[stride*25]);

-  step1[30] = (input[stride*29] + input[stride*27]);

-  step1[31] = (input[stride*31] + input[stride*29]);

-  idct16(step1, step2, 1);

-  idct16(step1 + 16, step2 + 16, 1);

-  step2[16] = DownshiftMultiply(step2[16] / (2*C1));

-  step2[17] = DownshiftMultiply(step2[17] / (2*C3));

-  step2[18] = DownshiftMultiply(step2[18] / (2*C5));

-  step2[19] = DownshiftMultiply(step2[19] / (2*C7));

-  step2[20] = DownshiftMultiply(step2[20] / (2*C9));

-  step2[21] = DownshiftMultiply(step2[21] / (2*C11));

-  step2[22] = DownshiftMultiply(step2[22] / (2*C13));

-  step2[23] = DownshiftMultiply(step2[23] / (2*C15));

-  step2[24] = DownshiftMultiply(step2[24] / (2*C17));

-  step2[25] = DownshiftMultiply(step2[25] / (2*C19));

-  step2[26] = DownshiftMultiply(step2[26] / (2*C21));

-  step2[27] = DownshiftMultiply(step2[27] / (2*C23));

-  step2[28] = DownshiftMultiply(step2[28] / (2*C25));

-  step2[29] = DownshiftMultiply(step2[29] / (2*C27));

-  step2[30] = DownshiftMultiply(step2[30] / (2*C29));

-  step2[31] = DownshiftMultiply(step2[31] / (2*C31));

-  output[stride* 0] = step2[ 0] + step2[16];

-  output[stride* 1] = step2[ 1] + step2[17];

-  output[stride* 2] = step2[ 2] + step2[18];

-  output[stride* 3] = step2[ 3] + step2[19];

-  output[stride* 4] = step2[ 4] + step2[20];

-  output[stride* 5] = step2[ 5] + step2[21];

-  output[stride* 6] = step2[ 6] + step2[22];

-  output[stride* 7] = step2[ 7] + step2[23];

-  output[stride* 8] = step2[ 8] + step2[24];

-  output[stride* 9] = step2[ 9] + step2[25];

-  output[stride*10] = step2[10] + step2[26];

-  output[stride*11] = step2[11] + step2[27];

-  output[stride*12] = step2[12] + step2[28];

-  output[stride*13] = step2[13] + step2[29];

-  output[stride*14] = step2[14] + step2[30];

-  output[stride*15] = step2[15] + step2[31];

-  output[stride*16] = step2[15] - step2[(31 - 0)];

-  output[stride*17] = step2[14] - step2[(31 - 1)];

-  output[stride*18] = step2[13] - step2[(31 - 2)];

-  output[stride*19] = step2[12] - step2[(31 - 3)];

-  output[stride*20] = step2[11] - step2[(31 - 4)];

-  output[stride*21] = step2[10] - step2[(31 - 5)];

-  output[stride*22] = step2[ 9] - step2[(31 - 6)];

-  output[stride*23] = step2[ 8] - step2[(31 - 7)];

-  output[stride*24] = step2[ 7] - step2[(31 - 8)];

-  output[stride*25] = step2[ 6] - step2[(31 - 9)];

-  output[stride*26] = step2[ 5] - step2[(31 - 10)];

-  output[stride*27] = step2[ 4] - step2[(31 - 11)];

-  output[stride*28] = step2[ 3] - step2[(31 - 12)];

-  output[stride*29] = step2[ 2] - step2[(31 - 13)];

-  output[stride*30] = step2[ 1] - step2[(31 - 14)];

-  output[stride*31] = step2[ 0] - step2[(31 - 15)];

-}

-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    double out[32*32], out2[32*32];

-    const int short_pitch = pitch >> 1;

-    int i, j;

-    // First transform rows

-    for (i = 0; i < 32; ++i) {

-      double temp_in[32], temp_out[32];

-      for (j = 0; j < 32; ++j)

-        temp_in[j] = input[j + i*short_pitch];

-      butterfly_32_idct_1d(temp_in, temp_out, 1);

-      for (j = 0; j < 32; ++j)

-        out[j + i*32] = temp_out[j];

-    }

-    // Then transform columns

-    for (i = 0; i < 32; ++i) {

-      double temp_in[32], temp_out[32];

-      for (j = 0; j < 32; ++j)

-        temp_in[j] = out[j*32 + i];

-      butterfly_32_idct_1d(temp_in, temp_out, 1);

-      for (j = 0; j < 32; ++j)

-        out2[j*32 + i] = temp_out[j];

-    }

-    for (i = 0; i < 32*32; ++i)

-      output[i] = round(out2[i]/128);

-  }

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-}

-#else  // !CONFIG_DWTDCTHYBRID

-#if DWT_TYPE == 53

-// Note: block length must be even for this implementation

-static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass,

-                             int16_t *x) {

-  int16_t r, *a, *b;

-  int n;

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  r = *highpass;

-  while (n--) {

-    *a++ -= (r + (*b) + 1) >> 1;

-    r = *b++;

-  }

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  while (--n) {

-    *x++ = ((r = *a++) + 1) >> 1;

-    *x++ = *b++ + ((r + (*a) + 2) >> 2);

-  }

-  *x++ = ((r = *a) + 1) >> 1;

-  *x++ = *b + ((r + 1) >> 1);

-}

-static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass,

-                             int16_t *x) {

-  int16_t r, *a, *b;

-  int n;

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  r = *highpass;

-  while (n--) {

-    *a++ -= (r + (*b) + 1) >> 1;

-    r = *b++;

-  }

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  while (--n) {

-    r = *a++;

-    *x++ = r;

-    *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1);

-  }

-  *x++ = *a;

-  *x++ = ((*b) << 1) + *a;

-}

-static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c,

-                                 int pitch_c, int16_t *x, int pitch_x) {

-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;

-  short buffer[2 * DWT_MAX_LENGTH];

-  th[0] = hh;

-  tw[0] = hw;

-  for (i = 1; i <= levels; i++) {

-    th[i] = (th[i - 1] + 1) >> 1;

-    tw[i] = (tw[i - 1] + 1) >> 1;

-  }

-  for (lv = levels - 1; lv >= 0; lv--) {

-    nh = th[lv];

-    nw = tw[lv];

-    hh = th[lv + 1];

-    hw = tw[lv + 1];

-    if ((nh < 2) || (nw < 2)) continue;

-    for (j = 0; j < nw; j++) {

-      for (i = 0; i < nh; i++)

-        buffer[i] = c[i * pitch_c + j];

-      synthesis_53_col(nh, buffer, buffer + hh, buffer + nh);

-      for (i = 0; i < nh; i++)

-        c[i * pitch_c + j] = buffer[i + nh];

-    }

-    for (i = 0; i < nh; i++) {

-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));

-      synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]);

-    }

-  }

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?

-          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :

-          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);

-    }

-  }

-}

-#elif DWT_TYPE == 26

-// Note: block length must be even for this implementation

-static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass,

-                             int16_t *x) {

-  int16_t r, s, *a, *b;

-  int i, n = length >> 1;

-  if (n >= 4) {

-    a = lowpass;

-    b = highpass;

-    r = *lowpass;

-    while (--n) {

-      *b++ += (r - a[1] + 4) >> 3;

-      r = *a++;

-    }

-    *b += (r - *a + 4) >> 3;

-  }

-  a = lowpass;

-  b = highpass;

-  for (i = length >> 1; i; i--) {

-    s = *b++;

-    r = *a++;

-    *x++ = (r + s + 1) >> 1;

-    *x++ = (r - s + 1) >> 1;

-  }

-}

-static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass,

-                             int16_t *x) {

-  int16_t r, s, *a, *b;

-  int i, n = length >> 1;

-  if (n >= 4) {

-    a = lowpass;

-    b = highpass;

-    r = *lowpass;

-    while (--n) {

-      *b++ += (r - a[1] + 4) >> 3;

-      r = *a++;

-    }

-    *b += (r - *a + 4) >> 3;

-  }

-  a = lowpass;

-  b = highpass;

-  for (i = length >> 1; i; i--) {

-    s = *b++;

-    r = *a++;

-    *x++ = r + s;

-    *x++ = r - s;

-  }

-}

-static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c,

-                                 int pitch_c, int16_t *x, int pitch_x) {

-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;

-  int16_t buffer[2 * DWT_MAX_LENGTH];

-  th[0] = hh;

-  tw[0] = hw;

-  for (i = 1; i <= levels; i++) {

-    th[i] = (th[i - 1] + 1) >> 1;

-    tw[i] = (tw[i - 1] + 1) >> 1;

-  }

-  for (lv = levels - 1; lv >= 0; lv--) {

-    nh = th[lv];

-    nw = tw[lv];

-    hh = th[lv + 1];

-    hw = tw[lv + 1];

-    if ((nh < 2) || (nw < 2)) continue;

-    for (j = 0; j < nw; j++) {

-      for (i = 0; i < nh; i++)

-        buffer[i] = c[i * pitch_c + j];

-      synthesis_26_col(nh, buffer, buffer + hh, buffer + nh);

-      for (i = 0; i < nh; i++)

-        c[i * pitch_c + j] = buffer[i + nh];

-    }

-    for (i = 0; i < nh; i++) {

-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer));

-      synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]);

-    }

-  }

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ?

-          ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) :

-          -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS);

-    }

-  }

-}

-#elif DWT_TYPE == 97

-static void synthesis_97(int length, double *lowpass, double *highpass,

-                         double *x) {

-  static const double a_predict1 = -1.586134342;

-  static const double a_update1 = -0.05298011854;

-  static const double a_predict2 = 0.8829110762;

-  static const double a_update2 = 0.4435068522;

-  static const double s_low = 1.149604398;

-  static const double s_high = 1/1.149604398;

-  static const double inv_s_low = 1 / s_low;

-  static const double inv_s_high = 1 / s_high;

-  int i;

-  double y[DWT_MAX_LENGTH];

-  // Undo pack and scale

-  for (i = 0; i < length / 2; i++) {

-    y[i * 2] = lowpass[i] * inv_s_low;

-    y[i * 2 + 1] = highpass[i] * inv_s_high;

-  }

-  memcpy(x, y, sizeof(*y) * length);

-  // Undo update 2

-  for (i = 2; i < length; i += 2) {

-    x[i] -= a_update2 * (x[i-1] + x[i+1]);

-  }

-  x[0] -= 2 * a_update2 * x[1];

-  // Undo predict 2

-  for (i = 1; i < length - 2; i += 2) {

-    x[i] -= a_predict2 * (x[i - 1] + x[i + 1]);

-  }

-  x[length - 1] -= 2 * a_predict2 * x[length - 2];

-  // Undo update 1

-  for (i = 2; i < length; i += 2) {

-    x[i] -= a_update1 * (x[i - 1] + x[i + 1]);

-  }

-  x[0] -= 2 * a_update1 * x[1];

-  // Undo predict 1

-  for (i = 1; i < length - 2; i += 2) {

-    x[i] -= a_predict1 * (x[i - 1] + x[i + 1]);

-  }

-  x[length - 1] -= 2 * a_predict1 * x[length - 2];

-}

-static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c,

-                                 int pitch_c, int16_t *x, int pitch_x) {

-  int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width;

-  double buffer[2 * DWT_MAX_LENGTH];

-  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];

-  th[0] = hh;

-  tw[0] = hw;

-  for (i = 1; i <= levels; i++) {

-    th[i] = (th[i - 1] + 1) >> 1;

-    tw[i] = (tw[i - 1] + 1) >> 1;

-  }

-  for (lv = levels - 1; lv >= 0; lv--) {

-    nh = th[lv];

-    nw = tw[lv];

-    hh = th[lv + 1];

-    hw = tw[lv + 1];

-    if ((nh < 2) || (nw < 2)) continue;

-    for (j = 0; j < nw; j++) {

-      for (i = 0; i < nh; i++)

-        buffer[i] = c[i * pitch_c + j];

-      synthesis_97(nh, buffer, buffer + hh, buffer + nh);

-      for (i = 0; i < nh; i++)

-        y[i * DWT_MAX_LENGTH + j] = buffer[i + nh];

-    }

-    for (i = 0; i < nh; i++) {

-      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));

-      synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]);

-    }

-  }

-  for (i = 0; i < height; i++)

-    for (j = 0; j < width; j++)

-      x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] /

-                                 (1 << DWT_PRECISION_BITS));

-}

-#endif  // DWT_TYPE

-// TODO(debargha): Implement scaling differently so as not to have to use the

-// floating point 16x16 dct

-static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) {

-  static const double C1 = 0.995184726672197;

-  static const double C2 = 0.98078528040323;

-  static const double C3 = 0.956940335732209;

-  static const double C4 = 0.923879532511287;

-  static const double C5 = 0.881921264348355;

-  static const double C6 = 0.831469612302545;

-  static const double C7 = 0.773010453362737;

-  static const double C8 = 0.707106781186548;

-  static const double C9 = 0.634393284163646;

-  static const double C10 = 0.555570233019602;

-  static const double C11 = 0.471396736825998;

-  static const double C12 = 0.38268343236509;

-  static const double C13 = 0.290284677254462;

-  static const double C14 = 0.195090322016128;

-  static const double C15 = 0.098017140329561;

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    double step[16];

-    double intermediate[16];

-    double temp1, temp2;

-    // step 1 and 2

-    step[ 0] = input[0] + input[8];

-    step[ 1] = input[0] - input[8];

-    temp1 = input[4]*C12;

-    temp2 = input[12]*C4;

-    temp1 -= temp2;

-    temp1 *= C8;

-    step[ 2] = 2*(temp1);

-    temp1 = input[4]*C4;

-    temp2 = input[12]*C12;

-    temp1 += temp2;

-    temp1 = (temp1);

-    temp1 *= C8;

-    step[ 3] = 2*(temp1);

-    temp1 = input[2]*C8;

-    temp1 = 2*(temp1);

-    temp2 = input[6] + input[10];

-    step[ 4] = temp1 + temp2;

-    step[ 5] = temp1 - temp2;

-    temp1 = input[14]*C8;

-    temp1 = 2*(temp1);

-    temp2 = input[6] - input[10];

-    step[ 6] = temp2 - temp1;

-    step[ 7] = temp2 + temp1;

-    // for odd input

-    temp1 = input[3]*C12;

-    temp2 = input[13]*C4;

-    temp1 += temp2;

-    temp1 = (temp1);

-    temp1 *= C8;

-    intermediate[ 8] = 2*(temp1);

-    temp1 = input[3]*C4;

-    temp2 = input[13]*C12;

-    temp2 -= temp1;

-    temp2 = (temp2);

-    temp2 *= C8;

-    intermediate[ 9] = 2*(temp2);

-    intermediate[10] = 2*(input[9]*C8);

-    intermediate[11] = input[15] - input[1];

-    intermediate[12] = input[15] + input[1];

-    intermediate[13] = 2*((input[7]*C8));

-    temp1 = input[11]*C12;

-    temp2 = input[5]*C4;

-    temp2 -= temp1;

-    temp2 = (temp2);

-    temp2 *= C8;

-    intermediate[14] = 2*(temp2);

-    temp1 = input[11]*C4;

-    temp2 = input[5]*C12;

-    temp1 += temp2;

-    temp1 = (temp1);

-    temp1 *= C8;

-    intermediate[15] = 2*(temp1);

-    step[ 8] = intermediate[ 8] + intermediate[14];

-    step[ 9] = intermediate[ 9] + intermediate[15];

-    step[10] = intermediate[10] + intermediate[11];

-    step[11] = intermediate[10] - intermediate[11];

-    step[12] = intermediate[12] + intermediate[13];

-    step[13] = intermediate[12] - intermediate[13];

-    step[14] = intermediate[ 8] - intermediate[14];

-    step[15] = intermediate[ 9] - intermediate[15];

-    // step 3

-    output[0] = step[ 0] + step[ 3];

-    output[1] = step[ 1] + step[ 2];

-    output[2] = step[ 1] - step[ 2];

-    output[3] = step[ 0] - step[ 3];

-    temp1 = step[ 4]*C14;

-    temp2 = step[ 7]*C2;

-    temp1 -= temp2;

-    output[4] =  (temp1);

-    temp1 = step[ 4]*C2;

-    temp2 = step[ 7]*C14;

-    temp1 += temp2;

-    output[7] =  (temp1);

-    temp1 = step[ 5]*C10;

-    temp2 = step[ 6]*C6;

-    temp1 -= temp2;

-    output[5] =  (temp1);

-    temp1 = step[ 5]*C6;

-    temp2 = step[ 6]*C10;

-    temp1 += temp2;

-    output[6] =  (temp1);

-    output[8] = step[ 8] + step[11];

-    output[9] = step[ 9] + step[10];

-    output[10] = step[ 9] - step[10];

-    output[11] = step[ 8] - step[11];

-    output[12] = step[12] + step[15];

-    output[13] = step[13] + step[14];

-    output[14] = step[13] - step[14];

-    output[15] = step[12] - step[15];

-    // output 4

-    step[ 0] = output[0] + output[7];

-    step[ 1] = output[1] + output[6];

-    step[ 2] = output[2] + output[5];

-    step[ 3] = output[3] + output[4];

-    step[ 4] = output[3] - output[4];

-    step[ 5] = output[2] - output[5];

-    step[ 6] = output[1] - output[6];

-    step[ 7] = output[0] - output[7];

-    temp1 = output[8]*C7;

-    temp2 = output[15]*C9;

-    temp1 -= temp2;

-    step[ 8] = (temp1);

-    temp1 = output[9]*C11;

-    temp2 = output[14]*C5;

-    temp1 += temp2;

-    step[ 9] = (temp1);

-    temp1 = output[10]*C3;

-    temp2 = output[13]*C13;

-    temp1 -= temp2;

-    step[10] = (temp1);

-    temp1 = output[11]*C15;

-    temp2 = output[12]*C1;

-    temp1 += temp2;

-    step[11] = (temp1);

-    temp1 = output[11]*C1;

-    temp2 = output[12]*C15;

-    temp2 -= temp1;

-    step[12] = (temp2);

-    temp1 = output[10]*C13;

-    temp2 = output[13]*C3;

-    temp1 += temp2;

-    step[13] = (temp1);

-    temp1 = output[9]*C5;

-    temp2 = output[14]*C11;

-    temp2 -= temp1;

-    step[14] = (temp2);

-    temp1 = output[8]*C9;

-    temp2 = output[15]*C7;

-    temp1 += temp2;

-    step[15] = (temp1);

-    // step 5

-    output[0] = (step[0] + step[15]);

-    output[1] = (step[1] + step[14]);

-    output[2] = (step[2] + step[13]);

-    output[3] = (step[3] + step[12]);

-    output[4] = (step[4] + step[11]);

-    output[5] = (step[5] + step[10]);

-    output[6] = (step[6] + step[ 9]);

-    output[7] = (step[7] + step[ 8]);

-    output[15] = (step[0] - step[15]);

-    output[14] = (step[1] - step[14]);

-    output[13] = (step[2] - step[13]);

-    output[12] = (step[3] - step[12]);

-    output[11] = (step[4] - step[11]);

-    output[10] = (step[5] - step[10]);

-    output[9] = (step[6] - step[ 9]);

-    output[8] = (step[7] - step[ 8]);

-  }

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-}

-static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch,

-                                    int scale) {

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    double out[16*16], out2[16*16];

-    const int short_pitch = pitch >> 1;

-    int i, j;

-      // First transform rows

-    for (i = 0; i < 16; ++i) {

-      double temp_in[16], temp_out[16];

-      for (j = 0; j < 16; ++j)

-        temp_in[j] = input[j + i*short_pitch];

-      butterfly_16x16_idct_1d_f(temp_in, temp_out);

-      for (j = 0; j < 16; ++j)

-        out[j + i*16] = temp_out[j];

-    }

-    // Then transform columns

-    for (i = 0; i < 16; ++i) {

-      double temp_in[16], temp_out[16];

-      for (j = 0; j < 16; ++j)

-        temp_in[j] = out[j*16 + i];

-      butterfly_16x16_idct_1d_f(temp_in, temp_out);

-      for (j = 0; j < 16; ++j)

-        out2[j*16 + i] = temp_out[j];

-    }

-    for (i = 0; i < 16*16; ++i)

-      output[i] = round(out2[i] / (128 >> scale));

-  }

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-}

-static void idct8_1d(double *x) {

-  int i, j;

-  double t[8];

-  static const double idctmat[64] = {

-    0.35355339059327,  0.49039264020162,  0.46193976625564,  0.41573480615127,

-    0.35355339059327,   0.2777851165098,  0.19134171618254, 0.097545161008064,

-    0.35355339059327,  0.41573480615127,  0.19134171618254, -0.097545161008064,

-    -0.35355339059327, -0.49039264020161, -0.46193976625564,  -0.2777851165098,

-    0.35355339059327,   0.2777851165098, -0.19134171618254, -0.49039264020162,

-    -0.35355339059327, 0.097545161008064,  0.46193976625564,  0.41573480615127,

-    0.35355339059327, 0.097545161008063, -0.46193976625564,  -0.2777851165098,

-    0.35355339059327,  0.41573480615127, -0.19134171618254, -0.49039264020162,

-    0.35355339059327, -0.097545161008063, -0.46193976625564,   0.2777851165098,

-    0.35355339059327, -0.41573480615127, -0.19134171618255,  0.49039264020162,

-    0.35355339059327,  -0.2777851165098, -0.19134171618254,  0.49039264020161,

-    -0.35355339059327, -0.097545161008064,  0.46193976625564, -0.41573480615127,

-    0.35355339059327, -0.41573480615127,  0.19134171618254, 0.097545161008065,

-    -0.35355339059327,  0.49039264020162, -0.46193976625564,   0.2777851165098,

-    0.35355339059327, -0.49039264020162,  0.46193976625564, -0.41573480615127,

-    0.35355339059327,  -0.2777851165098,  0.19134171618255, -0.097545161008064

-  };

-  for (i = 0; i < 8; ++i) {

-    t[i] = 0;

-    for (j = 0; j < 8; ++j)

-      t[i] += idctmat[i * 8 + j] * x[j];

-  }

-  for (i = 0; i < 8; ++i) {

-    x[i] = t[i];

-  }

-}

-static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch,

-                                  int scale) {

-  double X[8 * 8], Y[8];

-  int i, j;

-  int shortpitch = pitch >> 1;

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    for (i = 0; i < 8; i++) {

-      for (j = 0; j < 8; j++) {

-        X[i * 8 + j] = (double)coefs[i * shortpitch + j];

-      }

-    }

-    for (i = 0; i < 8; i++)

-      idct8_1d(X + 8 * i);

-    for (i = 0; i < 8; i++) {

-      for (j = 0; j < 8; ++j)

-        Y[j] = X[i + 8 * j];

-      idct8_1d(Y);

-      for (j = 0; j < 8; ++j)

-        X[i + 8 * j] = Y[j];

-    }

-    for (i = 0; i < 8; i++) {

-      for (j = 0; j < 8; j++) {

-        block[i * 8 + j] = (int16_t)round(X[i * 8 + j] / (8 >> scale));

-      }

-    }

-  }

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-}

-#define multiply_bits(d, n) ((n) < 0 ? (d) >> (n) : (d) << (n))

-#if DWTDCT_TYPE == DWTDCT16X16_LEAN

-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {

-  // assume output is a 32x32 buffer

-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct

-  int16_t buffer[16 * 16];

-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt

-  int16_t buffer2[32 * 32];

-  // Note: pitch is in bytes, short_pitch is in short units

-  const int short_pitch = pitch >> 1;

-  int i, j;

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the idct16x16 function

-  vp9_short_idct16x16_c_f(input, buffer, pitch,

-                          1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);

-  }

-  for (i = 0; i < 16; ++i) {

-    for (j = 16; j < 32; ++j) {

-      buffer2[i * 32 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-  for (i = 16; i < 32; ++i) {

-    for (j = 0; j < 32; ++j) {

-      buffer2[i * 32 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-#if DWT_TYPE == 26

-  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);

-#elif DWT_TYPE == 97

-  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);

-#elif DWT_TYPE == 53

-  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);

-#endif

-}

-#elif DWTDCT_TYPE == DWTDCT16X16

-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {

-  // assume output is a 32x32 buffer

-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct

-  int16_t buffer[16 * 16];

-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt

-  int16_t buffer2[32 * 32];

-  // Note: pitch is in bytes, short_pitch is in short units

-  const int short_pitch = pitch >> 1;

-  int i, j;

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the idct16x16 function

-  vp9_short_idct16x16_c_f(input, buffer, pitch,

-                          1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16);

-  }

-  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,

-                          1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16);

-  }

-  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,

-                          1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16,

-               sizeof(*buffer2) * 16);

-  }

-  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,

-                          1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16,

-               sizeof(*buffer2) * 16);

-  }

-#if DWT_TYPE == 26

-  dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32);

-#elif DWT_TYPE == 97

-  dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32);

-#elif DWT_TYPE == 53

-  dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32);

-#endif

-}

-#elif DWTDCT_TYPE == DWTDCT8X8

-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {

-  // assume output is a 32x32 buffer

-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct

-  int16_t buffer[8 * 8];

-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt

-  int16_t buffer2[32 * 32];

-  // Note: pitch is in bytes, short_pitch is in short units

-  const int short_pitch = pitch >> 1;

-  int i, j;

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the idct16x16 function

-  vp9_short_idct8x8_c_f(input, buffer, pitch,

-                        1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i) {

-    vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8);

-  }

-  vp9_short_idct8x8_c_f(input + 8, buffer, pitch,

-                        1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i) {

-    vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8);

-  }

-  vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch,

-                        1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i) {

-    vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8,

-               sizeof(*buffer2) * 8);

-  }

-  vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch,

-                        1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i) {

-    vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8,

-               sizeof(*buffer2) * 8);

-  }

-  for (i = 0; i < 16; ++i) {

-    for (j = 16; j < 32; ++j) {

-      buffer2[i * 32 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-  for (i = 16; i < 32; ++i) {

-    for (j = 0; j < 32; ++j) {

-      buffer2[i * 32 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-#if DWT_TYPE == 26

-  dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32);

-#elif DWT_TYPE == 97

-  dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32);

-#elif DWT_TYPE == 53

-  dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32);

-#endif

-}

-#endif

-#if CONFIG_TX64X64

-void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) {

-  // assume output is a 64x64 buffer

-  // Temporary buffer to hold a 16x16 block for 16x16 inverse dct

-  int16_t buffer[16 * 16];

-  // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt

-  int16_t buffer2[64 * 64];

-  // Note: pitch is in bytes, short_pitch is in short units

-  const int short_pitch = pitch >> 1;

-  int i, j;

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the idct16x16 function

-  vp9_short_idct16x16_c_f(input, buffer, pitch,

-                          2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16);

-  }

-#if DWTDCT_TYPE == DWTDCT16X16_LEAN

-  for (i = 0; i < 16; ++i) {

-    for (j = 16; j < 64; ++j) {

-      buffer2[i * 64 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);

-    }

-  }

-  for (i = 16; i < 64; ++i) {

-    for (j = 0; j < 64; ++j) {

-      buffer2[i * 64 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);

-    }

-  }

-#elif DWTDCT_TYPE == DWTDCT16X16

-  vp9_short_idct16x16_c_f(input + 16, buffer, pitch,

-                          2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16);

-  }

-  vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch,

-                          2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16,

-               sizeof(*buffer2) * 16);

-  }

-  vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch,

-                          2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i) {

-    vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16,

-               sizeof(*buffer2) * 16);

-  }

-  // Copying and scaling highest bands into buffer2

-  for (i = 0; i < 32; ++i) {

-    for (j = 32; j < 64; ++j) {

-      buffer2[i * 64 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);

-    }

-  }

-  for (i = 32; i < 64; ++i) {

-    for (j = 0; j < 64; ++j) {

-      buffer2[i * 64 + j] =

-          multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1);

-    }

-  }

-#endif  // DWTDCT_TYPE

-#if DWT_TYPE == 26

-  dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64);

-#elif DWT_TYPE == 97

-  dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64);

-#elif DWT_TYPE == 53

-  dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64);

-#endif

-}

-#endif  // CONFIG_TX64X64

-#endif  // !CONFIG_DWTDCTHYBRID

--- a/vp9/common/vp9_invtrans.c

+++ b/vp9/common/vp9_invtrans.c

@@ -11,50 +11,25 @@

 #include "vp9/common/vp9_invtrans.h"

 #include "./vp9_rtcd.h"

-static void recon_dcblock(MACROBLOCKD *xd) {

-  BLOCKD *b = &xd->block[24];

-  int i;

-  for (i = 0; i < 16; i++) {

-    xd->block[i].dqcoeff[0] = b->diff[i];

-  }

-}

-static void recon_dcblock_8x8(MACROBLOCKD *xd) {

-  BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10

-  xd->block[0].dqcoeff[0] = b->diff[0];

-  xd->block[4].dqcoeff[0] = b->diff[1];

-  xd->block[8].dqcoeff[0] = b->diff[4];

-  xd->block[12].dqcoeff[0] = b->diff[8];

-}

-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) {

-  BLOCKD *b = &xd->block[block];

-  if (b->eob <= 1)

-    xd->inv_xform4x4_1_x8(b->dqcoeff, b->diff, pitch);

+void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,

+                                 int16_t *dqcoeff, int16_t *diff,

+                                 int pitch) {

+  if (eob <= 1)

+    xd->inv_txm4x4_1(dqcoeff, diff, pitch);

   else

-    xd->inv_xform4x4_x8(b->dqcoeff, b->diff, pitch);

+    xd->inv_txm4x4(dqcoeff, diff, pitch);

 void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {

   int i;

-  BLOCKD *blockd = xd->block;

-  int has_2nd_order = get_2nd_order_usage(xd);

-  if (has_2nd_order) {

-    /* do 2nd order transform on the dc block */

-    vp9_short_inv_walsh4x4(blockd[24].dqcoeff, blockd[24].diff);

-    recon_dcblock(xd);

-  }

   for (i = 0; i < 16; i++) {

-    TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);

+    TX_TYPE tx_type = get_tx_type_4x4(xd, i);

     if (tx_type != DCT_DCT) {

-      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32,

-                   tx_type, 4, xd->block[i].eob);

+      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);

     } else {

-      vp9_inverse_transform_b_4x4(xd, i, 32);

+      vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,

+                                  xd->block[i].diff, 32);

@@ -63,7 +38,8 @@

   int i;

   for (i = 16; i < 24; i++) {

-    vp9_inverse_transform_b_4x4(xd, i, 16);

+    vp9_inverse_transform_b_4x4(xd, xd->eobs[i], xd->block[i].dqcoeff,

+                                xd->block[i].diff, 16);

@@ -80,19 +56,11 @@

 void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {

   int i;

   BLOCKD *blockd = xd->block;

-  int has_2nd_order = get_2nd_order_usage(xd);

-  if (has_2nd_order) {

-    // do 2nd order transform on the dc block

-    vp9_short_ihaar2x2(blockd[24].dqcoeff, blockd[24].diff, 8);

-    recon_dcblock_8x8(xd); // need to change for 8x8

-  }

   for (i = 0; i < 9; i += 8) {

-    TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);

+    TX_TYPE tx_type = get_tx_type_8x8(xd, i);

     if (tx_type != DCT_DCT) {

-      vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8,

-                 xd->block[i].eob);

+      vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type);

     } else {

       vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],

                                   &blockd[i].diff[0], 32);

@@ -99,10 +67,10 @@

   for (i = 2; i < 11; i += 8) {

-    TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]);

+    TX_TYPE tx_type = get_tx_type_8x8(xd, i);

     if (tx_type != DCT_DCT) {

-      vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8,

-                 xd->block[i + 2].eob);

+      vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,

+                           16, tx_type);

     } else {

       vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],

                                   &blockd[i].diff[0], 32);

@@ -132,9 +100,9 @@

 void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {

   BLOCKD *bd = &xd->block[0];

-  TX_TYPE tx_type = get_tx_type_16x16(xd, bd);

+  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);

   if (tx_type != DCT_DCT) {

-    vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16, bd->eob);

+    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type);

   } else {

     vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],

                                   &xd->block[0].diff[0], 32);

@@ -146,13 +114,208 @@

   vp9_inverse_transform_mbuv_8x8(xd);

-void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb) {

-  vp9_short_idct32x32(xd_sb->dqcoeff, xd_sb->diff, 64);

+void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd) {

+  vp9_short_idct32x32(xd->dqcoeff, xd->diff, 64);

-void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb) {

-  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1024,

-                                xd_sb->diff + 1024, 32);

-  vp9_inverse_transform_b_16x16(xd_sb->dqcoeff + 1280,

-                                xd_sb->diff + 1280, 32);

+void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);

+    if (tx_type == DCT_DCT) {

+      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,

+                                    xd->diff + x_idx * 16 + y_idx * 32 * 16,

+                                    64);

+    } else {

+      vp9_short_iht16x16(xd->dqcoeff + n * 256,

+                         xd->diff + x_idx * 16 + y_idx * 32 * 16, 32, tx_type);

+    }

+  }

+}

+void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);

+    if (tx_type == DCT_DCT) {

+      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,

+                                  xd->diff + x_idx * 8 + y_idx * 32 * 8, 64);

+    } else {

+      vp9_short_iht8x8(xd->dqcoeff + n * 64,

+                       xd->diff + x_idx * 8 + y_idx * 32 * 8, 32, tx_type);

+    }

+  }

+}

+void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3;

+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);

+    if (tx_type == DCT_DCT) {

+      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,

+                                  xd->diff + x_idx * 4 + y_idx * 4 * 32, 64);

+    } else {

+      vp9_short_iht4x4(xd->dqcoeff + n * 16,

+                       xd->diff + x_idx * 4 + y_idx * 4 * 32, 32, tx_type);

+    }

+  }

+}

+void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd) {

+  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1024,

+                                xd->diff + 1024, 32);

+  vp9_inverse_transform_b_16x16(xd->dqcoeff + 1280,

+                                xd->diff + 1280, 32);

+}

+void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1024 + n * 64,

+                                xd->diff + 1024 + x_idx * 8 + y_idx * 16 * 8,

+                                32);

+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 1280 + n * 64,

+                                xd->diff + 1280 + x_idx * 8 + y_idx * 16 * 8,

+                                32);

+  }

+}

+void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + n],

+                                xd->dqcoeff + 1024 + n * 16,

+                                xd->diff + 1024 + x_idx * 4 + y_idx * 16 * 4,

+                                32);

+    vp9_inverse_transform_b_4x4(xd, xd->eobs[64 + 16 + n],

+                                xd->dqcoeff + 1280 + n * 16,

+                                xd->diff + 1280 + x_idx * 4 + y_idx * 16 * 4,

+                                32);

+  }

+}

+void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    vp9_short_idct32x32(xd->dqcoeff + n * 1024,

+                        xd->diff + x_idx * 32 + y_idx * 32 * 64, 128);

+  }

+}

+void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);

+    if (tx_type == DCT_DCT) {

+      vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256,

+                                    xd->diff + x_idx * 16 + y_idx * 64 * 16,

+                                    128);

+    } else {

+      vp9_short_iht16x16(xd->dqcoeff + n * 256,

+                         xd->diff + x_idx * 16 + y_idx * 64 * 16, 64, tx_type);

+    }

+  }

+}

+void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3;

+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);

+    if (tx_type == DCT_DCT) {

+      vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64,

+                                  xd->diff + x_idx * 8 + y_idx * 64 * 8, 128);

+    } else {

+      vp9_short_iht8x8(xd->dqcoeff + n * 64,

+                       xd->diff + x_idx * 8 + y_idx * 64 * 8, 64, tx_type);

+    }

+  }

+}

+void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 256; n++) {

+    const int x_idx = n & 15, y_idx = n >> 4;

+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);

+    if (tx_type == DCT_DCT) {

+      vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16,

+                                  xd->diff + x_idx * 4 + y_idx * 4 * 64, 128);

+    } else {

+      vp9_short_iht4x4(xd->dqcoeff + n * 16,

+                       xd->diff + x_idx * 4 + y_idx * 4 * 64, 64, tx_type);

+    }

+  }

+}

+void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd) {

+  vp9_short_idct32x32(xd->dqcoeff + 4096,

+                      xd->diff + 4096, 64);

+  vp9_short_idct32x32(xd->dqcoeff + 4096 + 1024,

+                      xd->diff + 4096 + 1024, 64);

+}

+void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1, off = x_idx * 16 + y_idx * 32 * 16;

+    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + n * 256,

+                                  xd->diff + 4096 + off, 64);

+    vp9_inverse_transform_b_16x16(xd->dqcoeff + 4096 + 1024 + n * 256,

+                                  xd->diff + 4096 + 1024 + off, 64);

+  }

+}

+void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2, off = x_idx * 8 + y_idx * 32 * 8;

+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + n * 64,

+                                xd->diff + 4096 + off, 64);

+    vp9_inverse_transform_b_8x8(xd->dqcoeff + 4096 + 1024 + n * 64,

+                                xd->diff + 4096 + 1024 + off, 64);

+  }

+}

+void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd) {

+  int n;

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3, off = x_idx * 4 + y_idx * 32 * 4;

+    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + n],

+                                xd->dqcoeff + 4096 + n * 16,

+                                xd->diff + 4096 + off, 64);

+    vp9_inverse_transform_b_4x4(xd, xd->eobs[256 + 64 + n],

+                                xd->dqcoeff + 4096 + 1024 + n * 16,

+                                xd->diff + 4096 + 1024 + off, 64);

+  }

--- a/vp9/common/vp9_invtrans.h

+++ b/vp9/common/vp9_invtrans.h

@@ -15,31 +15,47 @@

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_blockd.h"

-extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);

+void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,

+                                 int16_t *dqcoeff, int16_t *diff,

+                                 int pitch);

-extern void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);

+void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);

+void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);

+void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,

+void vp9_inverse_transform_b_8x8(int16_t *input_dqcoeff,

                                         int16_t *output_coeff, int pitch);

-extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);

+void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);

+void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);

+void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,

+void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff,

                                           int16_t *output_coeff, int pitch);

-extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);

+void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);

+void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);

-extern void vp9_inverse_transform_sby_32x32(SUPERBLOCKD *xd_sb);

-extern void vp9_inverse_transform_sbuv_16x16(SUPERBLOCKD *xd_sb);

+void vp9_inverse_transform_sby_32x32(MACROBLOCKD *xd);

+void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd);

+void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd);

+void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd);

+void vp9_inverse_transform_sbuv_16x16(MACROBLOCKD *xd);

+void vp9_inverse_transform_sbuv_8x8(MACROBLOCKD *xd);

+void vp9_inverse_transform_sbuv_4x4(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64y_32x32(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64uv_32x32(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64uv_16x16(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64uv_8x8(MACROBLOCKD *xd);

+void vp9_inverse_transform_sb64uv_4x4(MACROBLOCKD *xd);

 #endif  // VP9_COMMON_VP9_INVTRANS_H_

--- a/vp9/common/vp9_loopfilter.c

+++ b/vp9/common/vp9_loopfilter.c

@@ -109,6 +109,9 @@

   loop_filter_info_n *lfi = &cm->lf_info;

   /* update limits if sharpness has changed */

+  // printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl);

+  // printf("sharpness level: %d [%d]\n",

+  //        cm->sharpness_level, cm->last_sharpness_level);

   if (cm->last_sharpness_level != cm->sharpness_level) {

     vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);

     cm->last_sharpness_level = cm->sharpness_level;

@@ -126,7 +129,7 @@

         lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);

       } else { /* Delta Value */

         lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF);

-        lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0;

+        lvl_seg = clamp(lvl_seg, 0, 63);

@@ -149,13 +152,12 @@

     /* Apply delta for Intra modes */

     mode = 0; /* B_PRED */

     /* Only the split mode BPRED has a further special case */

-    lvl_mode = lvl_ref +  xd->mode_lf_deltas[mode];

-    lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */

+    lvl_mode = clamp(lvl_ref +  xd->mode_lf_deltas[mode], 0, 63);

     lfi->lvl[seg][ref][mode] = lvl_mode;

     mode = 1; /* all the rest of Intra modes */

-    lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref)  : 0; /* clamp */

+    lvl_mode = clamp(lvl_ref, 0, 63);

     lfi->lvl[seg][ref][mode] = lvl_mode;

     /* LAST, GOLDEN, ALT */

@@ -167,9 +169,7 @@

       /* Apply delta for Inter modes */

       for (mode = 1; mode < 4; mode++) {

-        lvl_mode = lvl_ref + xd->mode_lf_deltas[mode];

-        lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */

+        lvl_mode = clamp(lvl_ref + xd->mode_lf_deltas[mode], 0, 63);

         lfi->lvl[seg][ref][mode] = lvl_mode;

@@ -202,10 +202,12 @@

           mbmi1->mv[mbmi1->ref_frame].as_int) &&

          mbmi0->ref_frame != INTRA_FRAME;

 void vp9_loop_filter_frame(VP9_COMMON *cm,

                            MACROBLOCKD *xd,

                            int frame_filter_level,

-                           int y_only) {

+                           int y_only,

+                           int dering) {

   YV12_BUFFER_CONFIG *post = cm->frame_to_show;

   loop_filter_info_n *lfi_n = &cm->lf_info;

   struct loop_filter_info lfi;

@@ -271,7 +273,6 @@

               vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride,

                                  post->uv_stride, &lfi);

           /* don't apply across umv border */

           if (mb_row > 0 &&

@@ -299,6 +300,62 @@

                                  post->uv_stride, &lfi);

+#if CONFIG_LOOP_DERING

+          if (dering) {

+            if (mb_row && mb_row < cm->mb_rows - 1 &&

+                mb_col && mb_col < cm->mb_cols - 1) {

+              vp9_post_proc_down_and_across(y_ptr, y_ptr,

+                                            post->y_stride, post->y_stride,

+                                            16, 16, dering);

+              if (!y_only) {

+                vp9_post_proc_down_and_across(u_ptr, u_ptr,

+                                              post->uv_stride, post->uv_stride,

+                                              8, 8, dering);

+                vp9_post_proc_down_and_across(v_ptr, v_ptr,

+                                              post->uv_stride, post->uv_stride,

+                                              8, 8, dering);

+              }

+            } else {

+              // Adjust the filter so that no out-of-frame data is used.

+              uint8_t *dr_y = y_ptr, *dr_u = u_ptr, *dr_v = v_ptr;

+              int w_adjust = 0;

+              int h_adjust = 0;

+              if (mb_col == 0) {

+                dr_y += 2;

+                dr_u += 2;

+                dr_v += 2;

+                w_adjust += 2;

+              }

+              if (mb_col == cm->mb_cols - 1)

+                w_adjust += 2;

+              if (mb_row == 0) {

+                dr_y += 2 * post->y_stride;

+                dr_u += 2 * post->uv_stride;

+                dr_v += 2 * post->uv_stride;

+                h_adjust += 2;

+              }

+              if (mb_row == cm->mb_rows - 1)

+                h_adjust += 2;

+              vp9_post_proc_down_and_across_c(dr_y, dr_y,

+                                              post->y_stride, post->y_stride,

+                                              16 - w_adjust, 16 - h_adjust,

+                                              dering);

+              if (!y_only) {

+                vp9_post_proc_down_and_across_c(dr_u, dr_u,

+                                                post->uv_stride,

+                                                post->uv_stride,

+                                                8 - w_adjust, 8 - h_adjust,

+                                                dering);

+                vp9_post_proc_down_and_across_c(dr_v, dr_v,

+                                                post->uv_stride,

+                                                post->uv_stride,

+                                                8 - w_adjust, 8 - h_adjust,

+                                                dering);

+              }

+            }

+          }

+#endif

         } else {

           // FIXME: Not 8x8 aware

           if (mb_col > 0 &&

@@ -376,16 +433,13 @@

*/

   if (alt_flt_enabled) {

     for (i = 0; i < MAX_MB_SEGMENTS; i++) {

-      /* Abs value */

       if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) {

+        // Abs value

         lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);

-      }

-      /* Delta Value */

-      else {

-        lvl_seg[i] = default_filt_lvl +

-                     vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);

-        lvl_seg[i] = (lvl_seg[i] > 0) ?

-                     ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0;

+      } else {

+        // Delta Value

+        lvl_seg[i] = default_filt_lvl + vp9_get_segdata(xd, i, SEG_LVL_ALT_LF);

+        lvl_seg[i] = clamp(lvl_seg[i], 0, 63);

--- a/vp9/common/vp9_loopfilter.h

+++ b/vp9/common/vp9_loopfilter.h

@@ -83,7 +83,8 @@

 void vp9_loop_filter_frame(struct VP9Common *cm,

                            struct macroblockd *mbd,

                            int filter_level,

-                           int y_only);

+                           int y_only,

+                           int dering);

 void vp9_loop_filter_partial_frame(struct VP9Common *cm,

                                    struct macroblockd *mbd,

--- a/vp9/common/vp9_loopfilter_filters.c

+++ b/vp9/common/vp9_loopfilter_filters.c

@@ -13,7 +13,7 @@

 #include "vp9/common/vp9_loopfilter.h"

 #include "vp9/common/vp9_onyxc_int.h"

-static __inline int8_t signed_char_clamp(int t) {

+static INLINE int8_t signed_char_clamp(int t) {

   t = (t < -128 ? -128 : t);

   t = (t > 127 ? 127 : t);

   return (int8_t) t;

@@ -21,11 +21,11 @@

 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */

-static __inline int8_t filter_mask(uint8_t limit, uint8_t blimit,

-                                   uint8_t p3, uint8_t p2,

-                                   uint8_t p1, uint8_t p0,

-                                   uint8_t q0, uint8_t q1,

-                                   uint8_t q2, uint8_t q3) {

+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit,

+                                 uint8_t p3, uint8_t p2,

+                                 uint8_t p1, uint8_t p0,

+                                 uint8_t q0, uint8_t q1,

+                                 uint8_t q2, uint8_t q3) {

   int8_t mask = 0;

   mask |= (abs(p3 - p2) > limit) * -1;

   mask |= (abs(p2 - p1) > limit) * -1;

@@ -39,8 +39,8 @@

 /* is there high variance internal edge ( 11111111 yes, 00000000 no) */

-static __inline int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,

-                               uint8_t q0, uint8_t q1) {

+static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0,

+                             uint8_t q0, uint8_t q1) {

   int8_t hev = 0;

   hev  |= (abs(p1 - p0) > thresh) * -1;

   hev  |= (abs(q1 - q0) > thresh) * -1;

@@ -47,49 +47,38 @@

   return hev;

-static __inline void filter(int8_t mask, uint8_t hev, uint8_t *op1,

-                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {

-  int8_t ps0, qs0;

-  int8_t ps1, qs1;

-  int8_t filter, Filter1, Filter2;

-  int8_t u;

+static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1,

+                          uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {

+  int8_t filter1, filter2;

-  ps1 = (int8_t) *op1 ^ 0x80;

-  ps0 = (int8_t) *op0 ^ 0x80;

-  qs0 = (int8_t) *oq0 ^ 0x80;

-  qs1 = (int8_t) *oq1 ^ 0x80;

+  const int8_t ps1 = (int8_t) *op1 ^ 0x80;

+  const int8_t ps0 = (int8_t) *op0 ^ 0x80;

+  const int8_t qs0 = (int8_t) *oq0 ^ 0x80;

+  const int8_t qs1 = (int8_t) *oq1 ^ 0x80;

-  /* add outer taps if we have high edge variance */

-  filter = signed_char_clamp(ps1 - qs1);

-  filter &= hev;

+  // add outer taps if we have high edge variance

+  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;

-  /* inner taps */

-  filter = signed_char_clamp(filter + 3 * (qs0 - ps0));

-  filter &= mask;

+  // inner taps

+  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;

-  /* save bottom 3 bits so that we round one side +4 and the other +3

-   * if it equals 4 we'll set to adjust by -1 to account for the fact

-   * we'd round 3 the other way

-   */

-  Filter1 = signed_char_clamp(filter + 4);

-  Filter2 = signed_char_clamp(filter + 3);

-  Filter1 >>= 3;

-  Filter2 >>= 3;

-  u = signed_char_clamp(qs0 - Filter1);

-  *oq0 = u ^ 0x80;

-  u = signed_char_clamp(ps0 + Filter2);

-  *op0 = u ^ 0x80;

-  filter = Filter1;

+  // save bottom 3 bits so that we round one side +4 and the other +3

+  // if it equals 4 we'll set to adjust by -1 to account for the fact

+  // we'd round 3 the other way

+  filter1 = signed_char_clamp(filter + 4) >> 3;

+  filter2 = signed_char_clamp(filter + 3) >> 3;

-  /* outer tap adjustments */

+  *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;

+  *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;

+  filter = filter1;

+  // outer tap adjustments

   filter += 1;

   filter >>= 1;

   filter &= ~hev;

-  u = signed_char_clamp(qs1 - filter);

-  *oq1 = u ^ 0x80;

-  u = signed_char_clamp(ps1 + filter);

-  *op1 = u ^ 0x80;

+  *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;

+  *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;

 void vp9_loop_filter_horizontal_edge_c(uint8_t *s,

@@ -143,11 +132,11 @@

     s += p;

   } while (++i < count * 8);

-static __inline signed char flatmask(uint8_t thresh,

-                                     uint8_t p4, uint8_t p3, uint8_t p2,

-                                     uint8_t p1, uint8_t p0,

-                                     uint8_t q0, uint8_t q1, uint8_t q2,

-                                     uint8_t q3, uint8_t q4) {

+static INLINE signed char flatmask4(uint8_t thresh,

+                                    uint8_t p3, uint8_t p2,

+                                    uint8_t p1, uint8_t p0,

+                                    uint8_t q0, uint8_t q1,

+                                    uint8_t q2, uint8_t q3) {

   int8_t flat = 0;

   flat |= (abs(p1 - p0) > thresh) * -1;

   flat |= (abs(q1 - q0) > thresh) * -1;

@@ -155,81 +144,72 @@

   flat |= (abs(q0 - q2) > thresh) * -1;

   flat |= (abs(p3 - p0) > thresh) * -1;

   flat |= (abs(q3 - q0) > thresh) * -1;

+  flat = ~flat;

+  return flat;

+}

+static INLINE signed char flatmask5(uint8_t thresh,

+                                    uint8_t p4, uint8_t p3, uint8_t p2,

+                                    uint8_t p1, uint8_t p0,

+                                    uint8_t q0, uint8_t q1, uint8_t q2,

+                                    uint8_t q3, uint8_t q4) {

+  int8_t flat = 0;

   flat |= (abs(p4 - p0) > thresh) * -1;

   flat |= (abs(q4 - q0) > thresh) * -1;

   flat = ~flat;

-  return flat;

+  return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);

-static __inline void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,

-                              uint8_t *op4, uint8_t *op3, uint8_t *op2,

-                              uint8_t *op1, uint8_t *op0,

-                              uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,

-                              uint8_t *oq3, uint8_t *oq4) {

-  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */

+static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat,

+                            uint8_t *op3, uint8_t *op2,

+                            uint8_t *op1, uint8_t *op0,

+                            uint8_t *oq0, uint8_t *oq1,

+                            uint8_t *oq2, uint8_t *oq3) {

+  // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line

   if (flat && mask) {

-    uint8_t p0, q0;

-    uint8_t p1, q1;

-    uint8_t p2, q2;

-    uint8_t p3, q3;

-    uint8_t p4, q4;

+    const uint8_t p3 = *op3;

+    const uint8_t p2 = *op2;

+    const uint8_t p1 = *op1;

+    const uint8_t p0 = *op0;

+    const uint8_t q0 = *oq0;

+    const uint8_t q1 = *oq1;

+    const uint8_t q2 = *oq2;

+    const uint8_t q3 = *oq3;

-    p4 = *op4;

-    p3 = *op3;

-    p2 = *op2;

-    p1 = *op1;

-    p0 = *op0;

-    q0 = *oq0;

-    q1 = *oq1;

-    q2 = *oq2;

-    q3 = *oq3;

-    q4 = *oq4;

-    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;

-    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;

+    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;

+    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;

     *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;

     *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;

-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;

-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;

+    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;

+    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;

   } else {

-    int8_t ps0, qs0;

-    int8_t ps1, qs1;

-    int8_t filter, Filter1, Filter2;

-    int8_t u;

+    int8_t filter1, filter2;

-    ps1 = (int8_t) *op1 ^ 0x80;

-    ps0 = (int8_t) *op0 ^ 0x80;

-    qs0 = (int8_t) *oq0 ^ 0x80;

-    qs1 = (int8_t) *oq1 ^ 0x80;

+    const int8_t ps1 = (int8_t) *op1 ^ 0x80;

+    const int8_t ps0 = (int8_t) *op0 ^ 0x80;

+    const int8_t qs0 = (int8_t) *oq0 ^ 0x80;

+    const int8_t qs1 = (int8_t) *oq1 ^ 0x80;

-    /* add outer taps if we have high edge variance */

-    filter = signed_char_clamp(ps1 - qs1);

-    filter &= hev;

+    // add outer taps if we have high edge variance

+    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;

-    /* inner taps */

-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0));

-    filter &= mask;

+    // inner taps

+    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;

-    Filter1 = signed_char_clamp(filter + 4);

-    Filter2 = signed_char_clamp(filter + 3);

-    Filter1 >>= 3;

-    Filter2 >>= 3;

+    filter1 = signed_char_clamp(filter + 4) >> 3;

+    filter2 = signed_char_clamp(filter + 3) >> 3;

-    u = signed_char_clamp(qs0 - Filter1);

-    *oq0 = u ^ 0x80;

-    u = signed_char_clamp(ps0 + Filter2);

-    *op0 = u ^ 0x80;

-    filter = Filter1;

+    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;

+    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;

+    filter = filter1;

-    /* outer tap adjustments */

+    // outer tap adjustments

     filter += 1;

     filter >>= 1;

     filter &= ~hev;

-    u = signed_char_clamp(qs1 - filter);

-    *oq1 = u ^ 0x80;

-    u = signed_char_clamp(ps1 + filter);

-    *op1 = u ^ 0x80;

+    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;

+    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;

@@ -254,12 +234,11 @@

     hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);

-    flat = flatmask(1,

-                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],

-                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);

+    flat = flatmask4(1, s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],

+                        s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);

     mbfilter(mask, hev, flat,

-             s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,

-             s,       s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p);

+             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,

+             s,         s + 1 * p, s + 2 * p, s + 3 * p);

     ++s;

   } while (++i < count * 8);

@@ -283,12 +262,12 @@

                        s[0], s[1], s[2], s[3]);

     hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);

-    flat = flatmask(1,

-                    s[-5], s[-4], s[-3], s[-2], s[-1],

-                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);

+    flat = flatmask4(1,

+                    s[-4], s[-3], s[-2], s[-1],

+                    s[ 0], s[ 1], s[ 2], s[ 3]);

     mbfilter(mask, hev, flat,

-             s - 5, s - 4, s - 3, s - 2, s - 1,

-             s,     s + 1, s + 2, s + 3, s + 4);

+             s - 4, s - 3, s - 2, s - 1,

+             s,     s + 1, s + 2, s + 3);

     s += p;

   } while (++i < count * 8);

@@ -295,41 +274,31 @@

 /* should we apply any filter at all ( 11111111 yes, 00000000 no) */

-static __inline int8_t simple_filter_mask(uint8_t blimit,

-                                          uint8_t p1, uint8_t p0,

-                                          uint8_t q0, uint8_t q1) {

-  /* Why does this cause problems for win32?

-   * error C2143: syntax error : missing ';' before 'type'

-   *  (void) limit;

-   */

-  int8_t mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;

-  return mask;

+static INLINE int8_t simple_filter_mask(uint8_t blimit,

+                                        uint8_t p1, uint8_t p0,

+                                        uint8_t q0, uint8_t q1) {

+  return (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  <= blimit) * -1;

-static __inline void simple_filter(int8_t mask,

-                                   uint8_t *op1, uint8_t *op0,

-                                   uint8_t *oq0, uint8_t *oq1) {

-  int8_t filter, Filter1, Filter2;

-  int8_t p1 = (int8_t) *op1 ^ 0x80;

-  int8_t p0 = (int8_t) *op0 ^ 0x80;

-  int8_t q0 = (int8_t) *oq0 ^ 0x80;

-  int8_t q1 = (int8_t) *oq1 ^ 0x80;

-  int8_t u;

+static INLINE void simple_filter(int8_t mask,

+                                 uint8_t *op1, uint8_t *op0,

+                                 uint8_t *oq0, uint8_t *oq1) {

+  int8_t filter1, filter2;

+  const int8_t p1 = (int8_t) *op1 ^ 0x80;

+  const int8_t p0 = (int8_t) *op0 ^ 0x80;

+  const int8_t q0 = (int8_t) *oq0 ^ 0x80;

+  const int8_t q1 = (int8_t) *oq1 ^ 0x80;

-  filter = signed_char_clamp(p1 - q1);

+  int8_t filter = signed_char_clamp(p1 - q1);

   filter = signed_char_clamp(filter + 3 * (q0 - p0));

   filter &= mask;

-  /* save bottom 3 bits so that we round one side +4 and the other +3 */

-  Filter1 = signed_char_clamp(filter + 4);

-  Filter1 >>= 3;

-  u = signed_char_clamp(q0 - Filter1);

-  *oq0  = u ^ 0x80;

+  // save bottom 3 bits so that we round one side +4 and the other +3

+  filter1 = signed_char_clamp(filter + 4) >> 3;

+  *oq0  = signed_char_clamp(q0 - filter1) ^ 0x80;

-  Filter2 = signed_char_clamp(filter + 3);

-  Filter2 >>= 3;

-  u = signed_char_clamp(p0 + Filter2);

-  *op0 = u ^ 0x80;

+  filter2 = signed_char_clamp(filter + 3) >> 3;

+  *op0 = signed_char_clamp(p0 + filter2) ^ 0x80;

 void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s,

@@ -481,42 +450,33 @@

   vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);

-static __inline void wide_mbfilter(int8_t mask, uint8_t hev,

-                                   uint8_t flat, uint8_t flat2,

-                                   uint8_t *op7, uint8_t *op6, uint8_t *op5,

-                                   uint8_t *op4, uint8_t *op3, uint8_t *op2,

-                                   uint8_t *op1, uint8_t *op0, uint8_t *oq0,

-                                   uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,

-                                   uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,

-                                   uint8_t *oq7) {

-  /* use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line */

+static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,

+                                 uint8_t flat, uint8_t flat2,

+                                 uint8_t *op7, uint8_t *op6, uint8_t *op5,

+                                 uint8_t *op4, uint8_t *op3, uint8_t *op2,

+                                 uint8_t *op1, uint8_t *op0, uint8_t *oq0,

+                                 uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,

+                                 uint8_t *oq4, uint8_t *oq5, uint8_t *oq6,

+                                 uint8_t *oq7) {

+  // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line

   if (flat2 && flat && mask) {

-    uint8_t p0, q0;

-    uint8_t p1, q1;

-    uint8_t p2, q2;

-    uint8_t p3, q3;

-    uint8_t p4, q4;

-    uint8_t p5, q5;

-    uint8_t p6, q6;

-    uint8_t p7, q7;

+    const uint8_t p7 = *op7;

+    const uint8_t p6 = *op6;

+    const uint8_t p5 = *op5;

+    const uint8_t p4 = *op4;

+    const uint8_t p3 = *op3;

+    const uint8_t p2 = *op2;

+    const uint8_t p1 = *op1;

+    const uint8_t p0 = *op0;

+    const uint8_t q0 = *oq0;

+    const uint8_t q1 = *oq1;

+    const uint8_t q2 = *oq2;

+    const uint8_t q3 = *oq3;

+    const uint8_t q4 = *oq4;

+    const uint8_t q5 = *oq5;

+    const uint8_t q6 = *oq6;

+    const uint8_t q7 = *oq7;

-    p7 = *op7;

-    p6 = *op6;

-    p5 = *op5;

-    p4 = *op4;

-    p3 = *op3;

-    p2 = *op2;

-    p1 = *op1;

-    p0 = *op0;

-    q0 = *oq0;

-    q1 = *oq1;

-    q2 = *oq2;

-    q3 = *oq3;

-    q4 = *oq4;

-    q5 = *oq5;

-    q6 = *oq6;

-    q7 = *oq7;

     *op6 = (p7 * 7 + p6 * 2 +

             p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;

     *op5 = (p7 * 6 + p6 + p5 * 2 +

@@ -546,68 +506,48 @@

     *oq6 = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +

             q7 * 7 + 8) >> 4;

   } else if (flat && mask) {

-    unsigned char p0, q0;

-    unsigned char p1, q1;

-    unsigned char p2, q2;

-    unsigned char p3, q3;

-    unsigned char p4, q4;

+    const uint8_t p3 = *op3;

+    const uint8_t p2 = *op2;

+    const uint8_t p1 = *op1;

+    const uint8_t p0 = *op0;

+    const uint8_t q0 = *oq0;

+    const uint8_t q1 = *oq1;

+    const uint8_t q2 = *oq2;

+    const uint8_t q3 = *oq3;

-    p4 = *op4;

-    p3 = *op3;

-    p2 = *op2;

-    p1 = *op1;

-    p0 = *op0;

-    q0 = *oq0;

-    q1 = *oq1;

-    q2 = *oq2;

-    q3 = *oq3;

-    q4 = *oq4;

-    *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;

-    *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;

+    *op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;

+    *op1 = (p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3;

     *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3;

     *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3;

-    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3;

-    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3;

+    *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3 + 4) >> 3;

+    *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3 + 4) >> 3;

   } else {

-    signed char ps0, qs0;

-    signed char ps1, qs1;

-    signed char filter, Filter1, Filter2;

-    signed char u;

+    int8_t filter1, filter2;

-    ps1 = (signed char) * op1 ^ 0x80;

-    ps0 = (signed char) * op0 ^ 0x80;

-    qs0 = (signed char) * oq0 ^ 0x80;

-    qs1 = (signed char) * oq1 ^ 0x80;

+    const int8_t ps1 = (int8_t) * op1 ^ 0x80;

+    const int8_t ps0 = (int8_t) * op0 ^ 0x80;

+    const int8_t qs0 = (int8_t) * oq0 ^ 0x80;

+    const int8_t qs1 = (int8_t) * oq1 ^ 0x80;

-    /* add outer taps if we have high edge variance */

-    filter = signed_char_clamp(ps1 - qs1);

-    filter &= hev;

+    // add outer taps if we have high edge variance

+    int8_t filter = signed_char_clamp(ps1 - qs1) & hev;

-    /* inner taps */

-    filter = signed_char_clamp(filter + 3 * (qs0 - ps0));

-    filter &= mask;

+    // inner taps

+    filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;

+    filter1 = signed_char_clamp(filter + 4) >> 3;

+    filter2 = signed_char_clamp(filter + 3) >> 3;

-    Filter1 = signed_char_clamp(filter + 4);

-    Filter2 = signed_char_clamp(filter + 3);

-    Filter1 >>= 3;

-    Filter2 >>= 3;

+    *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;

+    *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;

+    filter = filter1;

-    u = signed_char_clamp(qs0 - Filter1);

-    *oq0 = u ^ 0x80;

-    u = signed_char_clamp(ps0 + Filter2);

-    *op0 = u ^ 0x80;

-    filter = Filter1;

-    /* outer tap adjustments */

+    // outer tap adjustments

     filter += 1;

     filter >>= 1;

     filter &= ~hev;

-    u = signed_char_clamp(qs1 - filter);

-    *oq1 = u ^ 0x80;

-    u = signed_char_clamp(ps1 + filter);

-    *op1 = u ^ 0x80;

+    *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;

+    *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;

@@ -636,19 +576,19 @@

     hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]);

-    flat = flatmask(1,

-                    s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],

-                    s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]);

+    flat = flatmask4(1,

+                     s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p],

+                     s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]);

-    flat2 = flatmask(1,

-                    s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],

-                    s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);

+    flat2 = flatmask5(1,

+                      s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], s[-1 * p],

+                      s[ 0 * p], s[ 4 * p], s[ 5 * p], s[ 6 * p], s[ 7 * p]);

     wide_mbfilter(mask, hev, flat, flat2,

-             s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,

-             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,

-             s,         s + 1 * p, s + 2 * p, s + 3 * p,

-             s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);

+                  s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p,

+                  s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,

+                  s,         s + 1 * p, s + 2 * p, s + 3 * p,

+                  s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p);

     ++s;

   } while (++i < count * 8);

@@ -674,18 +614,18 @@

                        s[0], s[1], s[2], s[3]);

     hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);

-    flat = flatmask(1,

-                    s[-5], s[-4], s[-3], s[-2], s[-1],

-                    s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]);

-    flat2 = flatmask(1,

-                    s[-8], s[-7], s[-6], s[-5], s[-1],

-                    s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);

+    flat = flatmask4(1,

+                     s[-4], s[-3], s[-2], s[-1],

+                     s[ 0], s[ 1], s[ 2], s[ 3]);

+    flat2 = flatmask5(1,

+                     s[-8], s[-7], s[-6], s[-5], s[-1],

+                     s[ 0], s[ 4], s[ 5], s[ 6], s[ 7]);

     wide_mbfilter(mask, hev, flat, flat2,

-             s - 8, s - 7, s - 6, s - 5,

-             s - 4, s - 3, s - 2, s - 1,

-             s,     s + 1, s + 2, s + 3,

-             s + 4, s + 5, s + 6, s + 7);

+                  s - 8, s - 7, s - 6, s - 5,

+                  s - 4, s - 3, s - 2, s - 1,

+                  s,     s + 1, s + 2, s + 3,

+                  s + 4, s + 5, s + 6, s + 7);

     s += p;

   } while (++i < count * 8);

--- a/vp9/common/vp9_maskingmv.c

+++ b/vp9/common/vp9_maskingmv.c

@@ -11,7 +11,8 @@

 #include <stdio.h>

 #include <stdlib.h>

 #include <string.h>

-extern unsigned int vp9_sad16x16_sse3(

+unsigned int vp9_sad16x16_sse3(

   unsigned char *src_ptr,

   int  src_stride,

   unsigned char *ref_ptr,

@@ -18,18 +19,11 @@

   int  ref_stride,

   int  max_err);

-extern void vp9_sad16x16x3_sse3(

-  unsigned char *src_ptr,

-  int  src_stride,

-  unsigned char *ref_ptr,

-  int  ref_stride,

-  int  *results);

-extern int vp8_growmaskmb_sse3(

+int vp8_growmaskmb_sse3(

   unsigned char *om,

   unsigned char *nm);

-extern void vp8_makemask_sse3(

+void vp8_makemask_sse3(

   unsigned char *y,

   unsigned char *u,

   unsigned char *v,

@@ -238,6 +232,7 @@

   for (i = 0; i < 256; i++)

     ym[i] = nym[i];

 void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,

                   unsigned char *ym, unsigned char *uvm,

                   int yp, int uvp,

@@ -283,6 +278,7 @@

   return sad;

 int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp,

                  unsigned char *ym) {

   int i, j;

@@ -294,6 +290,7 @@

   return sad;

 int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v,

                          int yp, int uvp,

                          unsigned char *dy, unsigned char *du, unsigned char *dv,

@@ -802,5 +799,5 @@

   fclose(f);

   fclose(g);

-  return;

+  return 0;

--- a/vp9/common/vp9_mbpitch.c

+++ b/vp9/common/vp9_mbpitch.c

@@ -20,15 +20,15 @@

                         int mv_stride,

                         uint8_t **base,

                         uint8_t **base2,

-                        int Stride,

+                        int stride,

                         int offset,

                         BLOCKSET bs) {

   if (bs == DEST) {

-    b->dst_stride = Stride;

+    b->dst_stride = stride;

     b->dst = offset;

     b->base_dst = base;

   } else {

-    b->pre_stride = Stride;

+    b->pre_stride = stride;

     b->pre = offset;

     b->base_pre = base;

     b->base_second_pre = base2;

@@ -102,9 +102,7 @@

-  blockd[24].diff = &xd->diff[384];

-  for (r = 0; r < 25; r++) {

+  for (r = 0; r < 24; r++) {

     blockd[r].qcoeff  = xd->qcoeff  + r * 16;

     blockd[r].dqcoeff = xd->dqcoeff + r * 16;

--- a/vp9/common/vp9_modecont.c

+++ b/vp9/common/vp9_modecont.c

@@ -12,7 +12,7 @@

 #include "vp9/common/vp9_entropy.h"

 const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4] = {

-  {223,     1,     1,    237},  // 0,0 best: Only candidate

+  {1,       223,   1,    237},  // 0,0 best: Only candidate

   {87,      166,   26,   219},  // 0,0 best: non zero candidates

   {89,      67,    18,   125},  // 0,0 best: non zero candidates, split

   {16,      141,   69,   226},  // strong nz candidate(s), no split

--- a/vp9/common/vp9_mv.h

+++ b/vp9/common/vp9_mv.h

@@ -23,4 +23,14 @@

   MV as_mv;

 } int_mv; /* facilitates faster equality tests and copies */

+struct mv32 {

+  int32_t row;

+  int32_t col;

+};

+typedef union int_mv32 {

+  uint64_t    as_int;

+  struct mv32 as_mv;

+} int_mv32; /* facilitates faster equality tests and copies */

 #endif  // VP9_COMMON_VP9_MV_H_

--- a/vp9/common/vp9_mvref_common.c

+++ b/vp9/common/vp9_mvref_common.c

@@ -11,64 +11,69 @@

 #include "vp9/common/vp9_mvref_common.h"

 #define MVREF_NEIGHBOURS 8

 static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {

     {0, -1}, {-1, 0}, {-1, -1}, {0, -2},

     {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}

};

 static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =

   { 3, 3, 2, 1, 1, 1, 1, 1 };

 static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {

     {0, -1}, {-1, 0}, {1, -1}, {-1, 1},

     {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}

};

 static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =

   { 3, 3, 2, 2, 2, 1, 1, 1 };

-// clamp_mv

-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units

-static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {

-  if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))

-    mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;

-  else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)

-    mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;

-  if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))

-    mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;

-  else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)

-    mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;

+static int sb64_mv_ref_search[MVREF_NEIGHBOURS][2] = {

+    {0, -1}, {-1, 0}, {1, -1}, {-1, 1},

+    {2, -1}, {-1, 2}, {3, -1}, {-1,-1}

+};

+static int sb64_ref_distance_weight[MVREF_NEIGHBOURS] =

+  { 1, 1, 1, 1, 1, 1, 1, 1 };

+// clamp_mv_ref

+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units

+static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) {

+  mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER,

+                                       xd->mb_to_right_edge + MV_BORDER);

+  mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER,

+                                       xd->mb_to_bottom_edge + MV_BORDER);

 // Gets a candidate refenence motion vector from the given mode info

 // structure if one exists that matches the given reference frame.

-static int get_matching_candidate(

-  const MODE_INFO *candidate_mi,

-  MV_REFERENCE_FRAME ref_frame,

-  int_mv *c_mv

-) {

-  int ret_val = TRUE;

+static int get_matching_candidate(const MODE_INFO *candidate_mi,

+                                  MV_REFERENCE_FRAME ref_frame,

+                                  int_mv *c_mv) {

   if (ref_frame == candidate_mi->mbmi.ref_frame) {

     c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;

   } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {

     c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;

   } else {

-    ret_val = FALSE;

+    return 0;

-  return ret_val;

+  return 1;

 // Gets candidate refenence motion vector(s) from the given mode info

 // structure if they exists and do NOT match the given reference frame.

-static void get_non_matching_candidates(

-  const MODE_INFO *candidate_mi,

-  MV_REFERENCE_FRAME ref_frame,

-  MV_REFERENCE_FRAME *c_ref_frame,

-  int_mv *c_mv,

-  MV_REFERENCE_FRAME *c2_ref_frame,

-  int_mv *c2_mv

-) {

+static void get_non_matching_candidates(const MODE_INFO *candidate_mi,

+                                        MV_REFERENCE_FRAME ref_frame,

+                                        MV_REFERENCE_FRAME *c_ref_frame,

+                                        int_mv *c_mv,

+                                        MV_REFERENCE_FRAME *c2_ref_frame,

+                                        int_mv *c2_mv) {

   c_mv->as_int = 0;

   c2_mv->as_int = 0;

@@ -85,10 +90,8 @@

     // Second candidate

     if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&

-        (candidate_mi->mbmi.second_ref_frame != ref_frame)) {  // &&

-        // (candidate_mi->mbmi.mv[1].as_int != 0) &&

-        // (candidate_mi->mbmi.mv[1].as_int !=

-        // candidate_mi->mbmi.mv[0].as_int)) {

+        (candidate_mi->mbmi.second_ref_frame != ref_frame) &&

+        (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) {

       *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;

       c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;

@@ -95,63 +98,60 @@

-// Performs mv adjustment based on reference frame and clamps the MV

-// if it goes off the edge of the buffer.

-static void scale_mv(

-  MACROBLOCKD *xd,

-  MV_REFERENCE_FRAME this_ref_frame,

-  MV_REFERENCE_FRAME candidate_ref_frame,

-  int_mv *candidate_mv,

-  int *ref_sign_bias

-) {

-  if (candidate_ref_frame != this_ref_frame) {

+// Performs mv sign inversion if indicated by the reference frame combination.

+static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame,

+                     MV_REFERENCE_FRAME candidate_ref_frame,

+                     int_mv *candidate_mv, int *ref_sign_bias) {

+  // int frame_distances[MAX_REF_FRAMES];

+  // int last_distance = 1;

+  // int gf_distance = xd->frames_since_golden;

+  // int arf_distance = xd->frames_till_alt_ref_frame;

-    //int frame_distances[MAX_REF_FRAMES];

-    //int last_distance = 1;

-    //int gf_distance = xd->frames_since_golden;

-    //int arf_distance = xd->frames_till_alt_ref_frame;

+  // Sign inversion where appropriate.

+  if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {

+    candidate_mv->as_mv.row = -candidate_mv->as_mv.row;

+    candidate_mv->as_mv.col = -candidate_mv->as_mv.col;

+  }

-    // Sign inversion where appropriate.

-    if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {

-      candidate_mv->as_mv.row = -candidate_mv->as_mv.row;

-      candidate_mv->as_mv.col = -candidate_mv->as_mv.col;

-    }

+  /*

+  // Scale based on frame distance if the reference frames not the same.

+  frame_distances[INTRA_FRAME] = 1;   // should never be used

+  frame_distances[LAST_FRAME] = 1;

+  frame_distances[GOLDEN_FRAME] =

+    (xd->frames_since_golden) ? xd->frames_si nce_golden : 1;

+  frame_distances[ALTREF_FRAME] =

+    (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;

-    // Scale based on frame distance if the reference frames not the same.

-    /*frame_distances[INTRA_FRAME] = 1;   // should never be used

-    frame_distances[LAST_FRAME] = 1;

-    frame_distances[GOLDEN_FRAME] =

-      (xd->frames_since_golden) ? xd->frames_since_golden : 1;

-    frame_distances[ALTREF_FRAME] =

-      (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;

+  if (frame_distances[this_ref_frame] &&

+      frame_distances[candidate_ref_frame]) {

+    candidate_mv->as_mv.row =

+      (short)(((int)(candidate_mv->as_mv.row) *

+               frame_distances[this_ref_frame]) /

+              frame_distances[candidate_ref_frame]);

-    if (frame_distances[this_ref_frame] &&

-        frame_distances[candidate_ref_frame]) {

-      candidate_mv->as_mv.row =

-        (short)(((int)(candidate_mv->as_mv.row) *

-                 frame_distances[this_ref_frame]) /

-                frame_distances[candidate_ref_frame]);

-      candidate_mv->as_mv.col =

-        (short)(((int)(candidate_mv->as_mv.col) *

-                 frame_distances[this_ref_frame]) /

-                frame_distances[candidate_ref_frame]);

-    }

-    */

+    candidate_mv->as_mv.col =

+      (short)(((int)(candidate_mv->as_mv.col) *

+               frame_distances[this_ref_frame]) /

+              frame_distances[candidate_ref_frame]);

-  // Clamp the MV so it does not point out of the frame buffer

-  clamp_mv(xd, candidate_mv);

+  */

-// Adds a new candidate reference vector to the list if indeed it is new.

-// If it is not new then the score of the existing candidate that it matches

-// is increased and the list is resorted.

+/*

+// Adds a new candidate reference vector to the sorted list.

+// If it is a repeat the weight of the existing entry is increased

+// and the order of the list is resorted.

+// This method of add plus sort has been deprecated for now as there is a

+// further sort of the best candidates in vp9_find_best_ref_mvs() and the

+// incremental benefit of both is small. If the decision is made to remove

+// the sort in vp9_find_best_ref_mvs() for performance reasons then it may be

+// worth re-instating some sort of list reordering by weight here.

+//

 static void addmv_and_shuffle(

   int_mv *mv_list,

   int *mv_scores,

-  int *index,

+  int *refmv_count,

   int_mv candidate_mv,

   int weight

) {

@@ -162,11 +162,11 @@

   // Check for duplicates. If there is one increase its score.

   // We only compare vs the current top candidates.

-  insert_point = (*index < (MAX_MV_REF_CANDIDATES - 1))

-                 ? *index : (MAX_MV_REF_CANDIDATES - 1);

+  insert_point = (*refmv_count < (MAX_MV_REF_CANDIDATES - 1))

+                 ? *refmv_count : (MAX_MV_REF_CANDIDATES - 1);

   i = insert_point;

-  if (*index > i)

+  if (*refmv_count > i)

     i++;

   while (i > 0) {

     i--;

@@ -184,7 +184,7 @@

       mv_scores[insert_point] = weight;

       i = insert_point;

-    (*index)++;

+    (*refmv_count)++;

   // Reshuffle the list so that highest scoring mvs at the top.

@@ -202,19 +202,42 @@

       break;

+*/

+// Adds a new candidate reference vector to the list.

+// The mv is thrown out if it is already in the list.

+// Unlike the addmv_and_shuffle() this does not reorder the list

+// but assumes that candidates are added in the order most likely to

+// match distance and reference frame bias.

+static void add_candidate_mv(int_mv *mv_list,  int *mv_scores,

+                             int *candidate_count, int_mv candidate_mv,

+                             int weight) {

+  int i;

+  // Make sure we dont insert off the end of the list

+  const int insert_point = MIN(*candidate_count, MAX_MV_REF_CANDIDATES - 1);

+  // Look for duplicates

+  for (i = 0; i <= insert_point; ++i) {

+    if (candidate_mv.as_int == mv_list[i].as_int)

+      break;

+  }

+  // Add the candidate. If the list is already full it is only desirable that

+  // it should overwrite if it has a higher weight than the last entry.

+  if (i >= insert_point && weight > mv_scores[insert_point]) {

+    mv_list[insert_point].as_int = candidate_mv.as_int;

+    mv_scores[insert_point] = weight;

+    *candidate_count += (*candidate_count < MAX_MV_REF_CANDIDATES);

+  }

+}

 // This function searches the neighbourhood of a given MB/SB and populates a

 // list of candidate reference vectors.

//

-void vp9_find_mv_refs(

-  MACROBLOCKD *xd,

-  MODE_INFO *here,

-  MODE_INFO *lf_here,

-  MV_REFERENCE_FRAME ref_frame,

-  int_mv *mv_ref_list,

-  int *ref_sign_bias

-) {

+void vp9_find_mv_refs(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,

+                      MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,

+                      int_mv *mv_ref_list, int *ref_sign_bias) {

   int i;

   MODE_INFO *candidate_mi;

   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;

@@ -224,10 +247,12 @@

   MV_REFERENCE_FRAME c_ref_frame;

   MV_REFERENCE_FRAME c2_ref_frame;

   int candidate_scores[MAX_MV_REF_CANDIDATES];

-  int index = 0;

+  int refmv_count = 0;

   int split_count = 0;

   int (*mv_ref_search)[2];

   int *ref_distance_weight;

+  int zero_seen = FALSE;

+  const int mb_col = (-xd->mb_to_left_edge) >> 7;

   // Blank the reference vector lists and other local structures.

   vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);

@@ -234,7 +259,10 @@

   vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);

   vpx_memset(candidate_scores, 0, sizeof(candidate_scores));

-  if (mbmi->sb_type) {

+  if (mbmi->sb_type == BLOCK_SIZE_SB64X64) {

+    mv_ref_search = sb64_mv_ref_search;

+    ref_distance_weight = sb64_ref_distance_weight;

+  } else if (mbmi->sb_type == BLOCK_SIZE_SB32X32) {

     mv_ref_search = sb_mv_ref_search;

     ref_distance_weight = sb_ref_distance_weight;

   } else {

@@ -245,7 +273,10 @@

   // We first scan for candidate vectors that match the current reference frame

   // Look at nearest neigbours

   for (i = 0; i < 2; ++i) {

-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&

+    const int mb_search_col = mb_col + mv_ref_search[i][0];

+    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&

+        (mb_search_col < cm->cur_tile_mb_col_end) &&

         ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {

       candidate_mi = here + mv_ref_search[i][0] +

@@ -252,32 +283,34 @@

                      (mv_ref_search[i][1] * xd->mode_info_stride);

       if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {

-        clamp_mv(xd, &c_refmv);

-        addmv_and_shuffle(candidate_mvs, candidate_scores,

-                          &index, c_refmv, ref_distance_weight[i] + 16);

+        add_candidate_mv(candidate_mvs, candidate_scores,

+                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);

       split_count += (candidate_mi->mbmi.mode == SPLITMV);

-  // Look in the last frame

-  candidate_mi = lf_here;

-  if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {

-    clamp_mv(xd, &c_refmv);

-    addmv_and_shuffle(candidate_mvs, candidate_scores,

-                      &index, c_refmv, 18);

+  // Look in the last frame if it exists

+  if (lf_here) {

+    candidate_mi = lf_here;

+    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {

+      add_candidate_mv(candidate_mvs, candidate_scores,

+                       &refmv_count, c_refmv, 18);

+    }

   // More distant neigbours

   for (i = 2; (i < MVREF_NEIGHBOURS) &&

-              (index < (MAX_MV_REF_CANDIDATES - 1)); ++i) {

-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&

+              (refmv_count < (MAX_MV_REF_CANDIDATES - 1)); ++i) {

+    const int mb_search_col = mb_col + mv_ref_search[i][0];

+    if ((mb_search_col >= cm->cur_tile_mb_col_start) &&

+        (mb_search_col < cm->cur_tile_mb_col_end) &&

         ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {

       candidate_mi = here + mv_ref_search[i][0] +

                      (mv_ref_search[i][1] * xd->mode_info_stride);

       if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv)) {

-        clamp_mv(xd, &c_refmv);

-        addmv_and_shuffle(candidate_mvs, candidate_scores,

-                          &index, c_refmv, ref_distance_weight[i] + 16);

+        add_candidate_mv(candidate_mvs, candidate_scores,

+                         &refmv_count, c_refmv, ref_distance_weight[i] + 16);

@@ -286,9 +319,12 @@

   // reference frame does not match. Break out when we have

   // MAX_MV_REF_CANDIDATES candidates.

   // Look first at spatial neighbours

-  if (index < (MAX_MV_REF_CANDIDATES - 1)) {

+  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1)) {

     for (i = 0; i < MVREF_NEIGHBOURS; ++i) {

-      if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&

+      const int mb_search_col = mb_col + mv_ref_search[i][0];

+      if ((mb_search_col >= cm->cur_tile_mb_col_start) &&

+          (mb_search_col < cm->cur_tile_mb_col_end) &&

           ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {

         candidate_mi = here + mv_ref_search[i][0] +

@@ -300,24 +336,24 @@

         if (c_ref_frame != INTRA_FRAME) {

           scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);

-          addmv_and_shuffle(candidate_mvs, candidate_scores,

-                            &index, c_refmv, ref_distance_weight[i]);

+          add_candidate_mv(candidate_mvs, candidate_scores,

+                           &refmv_count, c_refmv, ref_distance_weight[i]);

         if (c2_ref_frame != INTRA_FRAME) {

           scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);

-          addmv_and_shuffle(candidate_mvs, candidate_scores,

-                            &index, c2_refmv, ref_distance_weight[i]);

+          add_candidate_mv(candidate_mvs, candidate_scores,

+                           &refmv_count, c2_refmv, ref_distance_weight[i]);

-      if (index >= (MAX_MV_REF_CANDIDATES - 1)) {

+      if (refmv_count >= (MAX_MV_REF_CANDIDATES - 1)) {

         break;

-  // Look at the last frame

-  if (index < (MAX_MV_REF_CANDIDATES - 1)) {

+  // Look at the last frame if it exists

+  if (refmv_count < (MAX_MV_REF_CANDIDATES - 1) && lf_here) {

     candidate_mi = lf_here;

     get_non_matching_candidates(candidate_mi, ref_frame,

                                 &c_ref_frame, &c_refmv,

@@ -325,14 +361,14 @@

     if (c_ref_frame != INTRA_FRAME) {

       scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);

-      addmv_and_shuffle(candidate_mvs, candidate_scores,

-                        &index, c_refmv, 2);

+      add_candidate_mv(candidate_mvs, candidate_scores,

+                       &refmv_count, c_refmv, 2);

     if (c2_ref_frame != INTRA_FRAME) {

       scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);

-      addmv_and_shuffle(candidate_mvs, candidate_scores,

-                        &index, c2_refmv, 2);

+      add_candidate_mv(candidate_mvs, candidate_scores,

+                       &refmv_count, c2_refmv, 2);

@@ -340,7 +376,7 @@

   // 0,0 was best

   if (candidate_mvs[0].as_int == 0) {

     // 0,0 is only candidate

-    if (index <= 1) {

+    if (refmv_count <= 1) {

       mbmi->mb_mode_context[ref_frame] = 0;

     // non zero candidates candidates available

     } else if (split_count == 0) {

@@ -348,30 +384,25 @@

     } else {

       mbmi->mb_mode_context[ref_frame] = 2;

-  // Non zero best, No Split MV cases

   } else if (split_count == 0) {

-    if (candidate_scores[0] >= 32) {

-      mbmi->mb_mode_context[ref_frame] = 3;

-    } else {

-      mbmi->mb_mode_context[ref_frame] = 4;

-    }

-  // Non zero best, some split mv

+    // Non zero best, No Split MV cases

+    mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 3 : 4;

   } else {

-    if (candidate_scores[0] >= 32) {

-      mbmi->mb_mode_context[ref_frame] = 5;

-    } else {

-      mbmi->mb_mode_context[ref_frame] = 6;

-    }

+    // Non zero best, some split mv

+    mbmi->mb_mode_context[ref_frame] = candidate_scores[0] >= 16 ? 5 : 6;

-  // 0,0 is always a valid reference.

+  // Scan for 0,0 case and clamp non zero choices

   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {

-    if (candidate_mvs[i].as_int == 0)

-      break;

+    if (candidate_mvs[i].as_int == 0) {

+      zero_seen = TRUE;

+    } else {

+      clamp_mv_ref(xd, &candidate_mvs[i]);

+    }

-  if (i == MAX_MV_REF_CANDIDATES) {

+  // 0,0 is always a valid reference. Add it if not already seen.

+  if (!zero_seen)

     candidate_mvs[MAX_MV_REF_CANDIDATES-1].as_int = 0;

-  }

   // Copy over the candidate list.

   vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));

--- a/vp9/common/vp9_mvref_common.h

+++ b/vp9/common/vp9_mvref_common.h

@@ -14,7 +14,8 @@

 #ifndef VP9_COMMON_VP9_MVREF_COMMON_H_

 #define VP9_COMMON_VP9_MVREF_COMMON_H_

-void vp9_find_mv_refs(MACROBLOCKD *xd,

+void vp9_find_mv_refs(VP9_COMMON *cm,

+                      MACROBLOCKD *xd,

                       MODE_INFO *here,

                       MODE_INFO *lf_here,

                       MV_REFERENCE_FRAME ref_frame,

--- a/vp9/common/vp9_onyx.h

+++ b/vp9/common/vp9_onyx.h

@@ -16,6 +16,7 @@

 #endif

+#include "./vpx_config.h"

 #include "vpx/internal/vpx_codec_internal.h"

 #include "vpx/vp8cx.h"

 #include "vpx_scale/yv12config.h"

@@ -62,7 +63,7 @@

 #include <assert.h>

-  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {

+  static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {

     switch (mode) {

       case    NORMAL:

         *hr = 1;

@@ -89,11 +90,13 @@

   typedef struct {

-    int Version;            // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode

-    int Width;              // width of data passed to the compressor

-    int Height;             // height of data passed to the compressor

+    int version;  // 4 versions of bitstream defined:

+                  //   0 - best quality/slowest decode,

+                  //   3 - lowest quality/fastest decode

+    int width;  // width of data passed to the compressor

+    int height;  // height of data passed to the compressor

     double frame_rate;       // set to passed in framerate

-    int target_bandwidth;    // bandwidth to be used in kilobits per second

+    int64_t target_bandwidth;    // bandwidth to be used in kilobits per second

     int noise_sensitivity;   // parameter used for applying pre processing blur: recommendation 0

     int Sharpness;          // parameter used for sharpening output: recommendation 0:

@@ -134,9 +137,9 @@

     int over_shoot_pct;

     // buffering parameters

-    int starting_buffer_level;  // in seconds

-    int optimal_buffer_level;

-    int maximum_buffer_size;

+    int64_t starting_buffer_level;  // in seconds

+    int64_t optimal_buffer_level;

+    int64_t maximum_buffer_size;

     // controlling quality

     int fixed_q;

@@ -159,10 +162,25 @@

     int encode_breakout;  // early breakout encode threshold : for video conf recommend 800

+    /* Bitfield defining the error resiliency features to enable.

+     * Can provide decodable frames after losses in previous

+     * frames and decodable partitions after losses in the same frame.

+     */

+    unsigned int error_resilient_mode;

+    /* Bitfield defining the parallel decoding mode where the

+     * decoding in successive frames may be conducted in parallel

+     * just by decoding the frame headers.

+     */

+    unsigned int frame_parallel_decoding_mode;

     int arnr_max_frames;

     int arnr_strength;

     int arnr_type;

+    int tile_columns;

+    int tile_rows;

     struct vpx_fixed_buf         two_pass_stats_in;

     struct vpx_codec_pkt_list  *output_pkt_list;

@@ -195,8 +213,10 @@

   int vp9_update_reference(VP9_PTR comp, int ref_frame_flags);

-  int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,

-                            YV12_BUFFER_CONFIG *sd);

+  int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,

+                             YV12_BUFFER_CONFIG *sd);

+  int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);

   int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag,

                             YV12_BUFFER_CONFIG *sd);

--- a/vp9/common/vp9_onyxc_int.h

+++ b/vp9/common/vp9_onyxc_int.h

@@ -37,8 +37,17 @@

 #define QINDEX_RANGE (MAXQ + 1)

-#define NUM_YV12_BUFFERS 4

+#define NUM_REF_FRAMES 3

+#define NUM_REF_FRAMES_LG2 2

+// 1 scratch frame for the new frame, 3 for scaled references on the encoder

+// TODO(jkoleszar): These 3 extra references could probably come from the

+// normal reference pool.

+#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4)

+#define NUM_FRAME_CONTEXTS_LG2 2

+#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2)

 #define COMP_PRED_CONTEXTS   2

 typedef struct frame_contexts {

@@ -49,14 +58,24 @@

   vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1];

   vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1];

   vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1];

-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];

+  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];

+  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];

+  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];

+  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_prob nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                        [NZC4X4_NODES];

+  vp9_prob nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                        [NZC8X8_NODES];

+  vp9_prob nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                          [NZC16X16_NODES];

+  vp9_prob nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                          [NZC32X32_NODES];

+  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]

+                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];

+#endif

   nmv_context nmvc;

   nmv_context pre_nmvc;

   vp9_prob pre_bmode_prob[VP9_NKF_BINTRAMODES - 1];

@@ -74,22 +93,43 @@

   unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS];

   unsigned int mbsplit_counts[VP9_NUMMBSPLITS];

-  vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs pre_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs pre_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs pre_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES_32X32];

+  vp9_coeff_probs pre_coef_probs_4x4[BLOCK_TYPES];

+  vp9_coeff_probs pre_coef_probs_8x8[BLOCK_TYPES];

+  vp9_coeff_probs pre_coef_probs_16x16[BLOCK_TYPES];

+  vp9_coeff_probs pre_coef_probs_32x32[BLOCK_TYPES];

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_prob pre_nzc_probs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                            [NZC4X4_NODES];

+  vp9_prob pre_nzc_probs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                            [NZC8X8_NODES];

+  vp9_prob pre_nzc_probs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                              [NZC16X16_NODES];

+  vp9_prob pre_nzc_probs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                              [NZC32X32_NODES];

+  vp9_prob pre_nzc_pcat_probs[MAX_NZC_CONTEXTS]

+                             [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];

+#endif

-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];

+  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];

+  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];

+  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];

+  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];

+  unsigned int eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]

+                                [COEF_BANDS][PREV_COEF_CONTEXTS];

+#if CONFIG_CODE_NONZEROCOUNT

+  unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                             [NZC4X4_TOKENS];

+  unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                             [NZC8X8_TOKENS];

+  unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                               [NZC16X16_TOKENS];

+  unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                               [NZC32X32_TOKENS];

+  unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS]

+                              [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA][2];

+#endif

   nmv_context_counts NMVcount;

   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]

                                  [VP9_SWITCHABLE_FILTERS - 1];

@@ -128,13 +168,14 @@

   struct vpx_internal_error_info  error;

   DECLARE_ALIGNED(16, int16_t, Y1dequant[QINDEX_RANGE][16]);

-  DECLARE_ALIGNED(16, int16_t, Y2dequant[QINDEX_RANGE][16]);

   DECLARE_ALIGNED(16, int16_t, UVdequant[QINDEX_RANGE][16]);

-  int Width;

-  int Height;

-  int horiz_scale;

-  int vert_scale;

+  int width;

+  int height;

+  int display_width;

+  int display_height;

+  int last_width;

+  int last_height;

   YUV_TYPE clr_type;

   CLAMP_TYPE  clamp_type;

@@ -142,9 +183,16 @@

   YV12_BUFFER_CONFIG *frame_to_show;

   YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];

-  int fb_idx_ref_cnt[NUM_YV12_BUFFERS];

-  int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx;

+  int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */

+  int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */

+  /* TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and

+   * roll new_fb_idx into it.

+   */

+  int active_ref_idx[3]; /* each frame can reference 3 buffers */

+  int new_fb_idx;

+  struct scale_factors active_ref_scale[3];

   YV12_BUFFER_CONFIG post_proc_buffer;

   YV12_BUFFER_CONFIG temp_scale_frame;

@@ -173,8 +221,6 @@

   int last_kf_gf_q;  /* Q used on the last GF or KF */

   int y1dc_delta_q;

-  int y2dc_delta_q;

-  int y2ac_delta_q;

   int uvdc_delta_q;

   int uvac_delta_q;

@@ -201,19 +247,13 @@

   int filter_level;

   int last_sharpness_level;

   int sharpness_level;

+  int dering_enabled;

-  int refresh_last_frame;       /* Two state 0 = NO, 1 = YES */

-  int refresh_golden_frame;     /* Two state 0 = NO, 1 = YES */

-  int refresh_alt_ref_frame;     /* Two state 0 = NO, 1 = YES */

-  int copy_buffer_to_gf;         /* 0 none, 1 Last to GF, 2 ARF to GF */

-  int copy_buffer_to_arf;        /* 0 none, 1 Last to ARF, 2 GF to ARF */

   int refresh_entropy_probs;    /* Two state 0 = NO, 1 = YES */

   int ref_frame_sign_bias[MAX_REF_FRAMES];    /* Two state 0, 1 */

-  /* Y,U,V,Y2 */

+  /* Y,U,V */

   ENTROPY_CONTEXT_PLANES *above_context;   /* row of context for each plane */

   ENTROPY_CONTEXT_PLANES left_context[4];  /* (up to) 4 contexts "" */

@@ -250,9 +290,9 @@

   vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS];

-  FRAME_CONTEXT lfc_a; /* last alt ref entropy */

-  FRAME_CONTEXT lfc; /* last frame entropy */

   FRAME_CONTEXT fc;  /* this frame entropy */

+  FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];

+  unsigned int  frame_context_idx; /* Context to use/update */

   unsigned int current_video_frame;

   int near_boffset[3];

@@ -272,6 +312,60 @@

   int use_interintra;

 #endif

+  int error_resilient_mode;

+  int frame_parallel_decoding_mode;

+  int tile_columns, log2_tile_columns;

+  int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_col_idx;

+  int tile_rows, log2_tile_rows;

+  int cur_tile_mb_row_start, cur_tile_mb_row_end, cur_tile_row_idx;

 } VP9_COMMON;

+static int get_free_fb(VP9_COMMON *cm) {

+  int i;

+  for (i = 0; i < NUM_YV12_BUFFERS; i++)

+    if (cm->fb_idx_ref_cnt[i] == 0)

+      break;

+  assert(i < NUM_YV12_BUFFERS);

+  cm->fb_idx_ref_cnt[i] = 1;

+  return i;

+}

+static void ref_cnt_fb(int *buf, int *idx, int new_idx) {

+  if (buf[*idx] > 0)

+    buf[*idx]--;

+  *idx = new_idx;

+  buf[new_idx]++;

+}

+// TODO(debargha): merge the two functions

+static void set_mb_row(VP9_COMMON *cm, MACROBLOCKD *xd,

+                       int mb_row, int block_size) {

+  xd->mb_to_top_edge    = -((mb_row * 16) << 3);

+  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;

+  // Are edges available for intra prediction?

+  xd->up_available    = (mb_row != 0);

+}

+static void set_mb_col(VP9_COMMON *cm, MACROBLOCKD *xd,

+                       int mb_col, int block_size) {

+  xd->mb_to_left_edge   = -((mb_col * 16) << 3);

+  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;

+  // Are edges available for intra prediction?

+  xd->left_available  = (mb_col > cm->cur_tile_mb_col_start);

+  xd->right_available = (mb_col + block_size < cm->cur_tile_mb_col_end);

+}

+static int get_mb_row(const MACROBLOCKD *xd) {

+  return ((-xd->mb_to_top_edge) >> 7);

+}

+static int get_mb_col(const MACROBLOCKD *xd) {

+  return ((-xd->mb_to_left_edge) >> 7);

+}

 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_

--- a/vp9/common/vp9_postproc.c

+++ b/vp9/common/vp9_postproc.c

@@ -336,11 +336,8 @@

                                 source->uv_height, source->uv_width, ppl);

-void vp9_de_noise(YV12_BUFFER_CONFIG         *src,

-                  YV12_BUFFER_CONFIG         *post,

-                  int                         q,

-                  int                         low_var_thresh,

-                  int                         flag) {

+void vp9_denoise(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *post,

+                 int q, int low_var_thresh, int flag) {

   double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;

   int ppl = (int)(level + .5);

   (void) post;

@@ -424,9 +421,9 @@

  *  INPUTS        : unsigned char *Start  starting address of buffer to

  *                                        add gaussian noise to

- *                  unsigned int Width    width of plane

- *                  unsigned int Height   height of plane

- *                  int  Pitch    distance between subsequent lines of frame

+ *                  unsigned int width    width of plane

+ *                  unsigned int height   height of plane

+ *                  int  pitch    distance between subsequent lines of frame

  *                  int  q        quantizer used to determine amount of noise

  *                                  to add

@@ -439,25 +436,25 @@

  *  SPECIAL NOTES : None.

  ****************************************************************************/

-void vp9_plane_add_noise_c(uint8_t *Start, char *noise,

+void vp9_plane_add_noise_c(uint8_t *start, char *noise,

                            char blackclamp[16],

                            char whiteclamp[16],

                            char bothclamp[16],

-                           unsigned int Width, unsigned int Height, int Pitch) {

+                           unsigned int width, unsigned int height, int pitch) {

   unsigned int i, j;

-  for (i = 0; i < Height; i++) {

-    uint8_t *Pos = Start + i * Pitch;

-    char  *Ref = (char *)(noise + (rand() & 0xff));

+  for (i = 0; i < height; i++) {

+    uint8_t *pos = start + i * pitch;

+    char  *ref = (char *)(noise + (rand() & 0xff));  // NOLINT

-    for (j = 0; j < Width; j++) {

-      if (Pos[j] < blackclamp[0])

-        Pos[j] = blackclamp[0];

+    for (j = 0; j < width; j++) {

+      if (pos[j] < blackclamp[0])

+        pos[j] = blackclamp[0];

-      if (Pos[j] > 255 + whiteclamp[0])

-        Pos[j] = 255 + whiteclamp[0];

+      if (pos[j] > 255 + whiteclamp[0])

+        pos[j] = 255 + whiteclamp[0];

-      Pos[j] += Ref[j];

+      pos[j] += ref[j];

@@ -636,8 +633,8 @@

     *dest = *oci->frame_to_show;

     /* handle problem with extending borders */

-    dest->y_width = oci->Width;

-    dest->y_height = oci->Height;

+    dest->y_width = oci->width;

+    dest->y_height = oci->height;

     dest->uv_height = dest->y_height / 2;

     return 0;

@@ -1004,8 +1001,8 @@

   *dest = oci->post_proc_buffer;

   /* handle problem with extending borders */

-  dest->y_width = oci->Width;

-  dest->y_height = oci->Height;

+  dest->y_width = oci->width;

+  dest->y_height = oci->height;

   dest->uv_height = dest->y_height / 2;

   return 0;

--- a/vp9/common/vp9_postproc.h

+++ b/vp9/common/vp9_postproc.h

@@ -13,30 +13,26 @@

 #define VP9_COMMON_VP9_POSTPROC_H_

 #include "vpx_ports/mem.h"

 struct postproc_state {

-  int           last_q;

-  int           last_noise;

-  char          noise[3072];

+  int last_q;

+  int last_noise;

+  char noise[3072];

   DECLARE_ALIGNED(16, char, blackclamp[16]);

   DECLARE_ALIGNED(16, char, whiteclamp[16]);

   DECLARE_ALIGNED(16, char, bothclamp[16]);

};

 #include "vp9/common/vp9_onyxc_int.h"

 #include "vp9/common/vp9_ppflags.h"

 int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest,

                         vp9_ppflags_t *flags);

+void vp9_denoise(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,

+                 int q, int low_var_thresh, int flag);

-void vp9_de_noise(YV12_BUFFER_CONFIG         *source,

-                  YV12_BUFFER_CONFIG         *post,

-                  int                         q,

-                  int                         low_var_thresh,

-                  int                         flag);

-void vp9_deblock(YV12_BUFFER_CONFIG         *source,

-                 YV12_BUFFER_CONFIG         *post,

-                 int                         q,

-                 int                         low_var_thresh,

-                 int                         flag);

+void vp9_deblock(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post,

+                 int q, int low_var_thresh, int flag);

 #endif  // VP9_COMMON_VP9_POSTPROC_H_

--- a/vp9/common/vp9_pragmas.h

+++ b/vp9/common/vp9_pragmas.h

@@ -14,6 +14,7 @@

 #ifdef __INTEL_COMPILER

 #pragma warning(disable:997 1011 170)

 #endif

 #ifdef _MSC_VER

 #pragma warning(disable:4799)

 #endif

--- a/vp9/common/vp9_pred_common.c

+++ b/vp9/common/vp9_pred_common.c

@@ -29,14 +29,15 @@

   // The prediction flags in these dummy entries are initialised to 0.

   switch (pred_id) {

     case PRED_SEG_ID:

-      pred_context = (m - 1)->mbmi.seg_id_predicted +

-                     (m - cm->mode_info_stride)->mbmi.seg_id_predicted;

+      pred_context = (m - cm->mode_info_stride)->mbmi.seg_id_predicted;

+      if (xd->left_available)

+        pred_context += (m - 1)->mbmi.seg_id_predicted;

       break;

     case PRED_REF:

-      pred_context = (m - 1)->mbmi.ref_predicted +

-                     (m - cm->mode_info_stride)->mbmi.ref_predicted;

+      pred_context = (m - cm->mode_info_stride)->mbmi.ref_predicted;

+      if (xd->left_available)

+        pred_context += (m - 1)->mbmi.ref_predicted;

       break;

     case PRED_COMP:

@@ -61,13 +62,14 @@

       break;

     case PRED_MBSKIP:

-      pred_context = (m - 1)->mbmi.mb_skip_coeff +

-                     (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;

+      pred_context = (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;

+      if (xd->left_available)

+        pred_context += (m - 1)->mbmi.mb_skip_coeff;

       break;

     case PRED_SWITCHABLE_INTERP:

-        int left_in_image = (m - 1)->mbmi.mb_in_image;

+        int left_in_image = xd->left_available && (m - 1)->mbmi.mb_in_image;

         int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;

         int left_mode = (m - 1)->mbmi.mode;

         int above_mode = (m - cm->mode_info_stride)->mbmi.mode;

@@ -98,8 +100,7 @@

       break;

     default:

-      // TODO *** add error trap code.

-      pred_context = 0;

+      pred_context = 0;  // *** add error trap code.

       break;

@@ -111,39 +112,23 @@

 vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,

                           const MACROBLOCKD *const xd,

                           PRED_ID pred_id) {

-  vp9_prob pred_probability;

-  int pred_context;

+  const int pred_context = vp9_get_pred_context(cm, xd, pred_id);

-  // Get the appropriate prediction context

-  pred_context = vp9_get_pred_context(cm, xd, pred_id);

   switch (pred_id) {

     case PRED_SEG_ID:

-      pred_probability = cm->segment_pred_probs[pred_context];

-      break;

+      return cm->segment_pred_probs[pred_context];

     case PRED_REF:

-      pred_probability = cm->ref_pred_probs[pred_context];

-      break;

+      return cm->ref_pred_probs[pred_context];

     case PRED_COMP:

       // In keeping with convention elsewhre the probability returned is

       // the probability of a "0" outcome which in this case means the

       // probability of comp pred off.

-      pred_probability = cm->prob_comppred[pred_context];

-      break;

+      return cm->prob_comppred[pred_context];

     case PRED_MBSKIP:

-      pred_probability = cm->mbskip_pred_probs[pred_context];

-      break;

+      return cm->mbskip_pred_probs[pred_context];

     default:

-      // TODO *** add error trap code.

-      pred_probability = 128;

-      break;

+      return 128;  // *** add error trap code.

-  return pred_probability;

 // This function returns a context probability ptr for coding a given

@@ -151,43 +136,25 @@

 const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,

                                    const MACROBLOCKD *const xd,

                                    PRED_ID pred_id) {

-  const vp9_prob *pred_probability;

-  int pred_context;

+  const int pred_context = vp9_get_pred_context(cm, xd, pred_id);

-  // Get the appropriate prediction context

-  pred_context = vp9_get_pred_context(cm, xd, pred_id);

   switch (pred_id) {

     case PRED_SEG_ID:

-      pred_probability = &cm->segment_pred_probs[pred_context];

-      break;

+      return &cm->segment_pred_probs[pred_context];

     case PRED_REF:

-      pred_probability = &cm->ref_pred_probs[pred_context];

-      break;

+      return &cm->ref_pred_probs[pred_context];

     case PRED_COMP:

       // In keeping with convention elsewhre the probability returned is

       // the probability of a "0" outcome which in this case means the

       // probability of comp pred off.

-      pred_probability = &cm->prob_comppred[pred_context];

-      break;

+      return &cm->prob_comppred[pred_context];

     case PRED_MBSKIP:

-      pred_probability = &cm->mbskip_pred_probs[pred_context];

-      break;

+      return &cm->mbskip_pred_probs[pred_context];

     case PRED_SWITCHABLE_INTERP:

-      pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];

-      break;

+      return &cm->fc.switchable_interp_prob[pred_context][0];

     default:

-      // TODO *** add error trap code.

-      pred_probability = NULL;

-      break;

+      return NULL;  // *** add error trap code.

-  return pred_probability;

 // This function returns the status of the given prediction signal.

@@ -194,28 +161,16 @@

 // I.e. is the predicted value for the given signal correct.

 unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,

                                 PRED_ID pred_id) {

-  unsigned char pred_flag = 0;

   switch (pred_id) {

     case PRED_SEG_ID:

-      pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;

-      break;

+      return xd->mode_info_context->mbmi.seg_id_predicted;

     case PRED_REF:

-      pred_flag = xd->mode_info_context->mbmi.ref_predicted;

-      break;

+      return  xd->mode_info_context->mbmi.ref_predicted;

     case PRED_MBSKIP:

-      pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;

-      break;

+      return xd->mode_info_context->mbmi.mb_skip_coeff;

     default:

-      // TODO *** add error trap code.

-      pred_flag = 0;

-      break;

+      return 0;  // *** add error trap code.

-  return pred_flag;

 // This function sets the status of the given prediction signal.

@@ -277,7 +232,7 @@

       break;

     default:

-      // TODO *** add error trap code.

+      // *** add error trap code.

       break;

@@ -322,7 +277,6 @@

   MV_REFERENCE_FRAME pred_ref = LAST_FRAME;

   int segment_id = xd->mode_info_context->mbmi.segment_id;

-  int seg_ref_active;

   int i;

   unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};

@@ -333,7 +287,7 @@

   unsigned char above_left_in_image;

   // Is segment coding ennabled

-  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);

+  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);

   // Special case treatment if segment coding is enabled.

   // Dont allow prediction of a reference frame that the segment

@@ -355,9 +309,10 @@

   above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;

   // Are neighbours in image

-  left_in_image = (m - 1)->mbmi.mb_in_image;

+  left_in_image = (m - 1)->mbmi.mb_in_image && xd->left_available;

   above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;

-  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;

+  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image &&

+                        xd->left_available;

   // Adjust scores for candidate reference frames based on neigbours

   if (frame_allowed[left] && left_in_image) {

@@ -385,9 +340,7 @@

 // Functions to computes a set of modified reference frame probabilities

 // to use when the prediction of the reference frame value fails

 void vp9_calc_ref_probs(int *count, vp9_prob *probs) {

-  int tot_count;

-  tot_count = count[0] + count[1] + count[2] + count[3];

+  int tot_count = count[0] + count[1] + count[2] + count[3];

   probs[0] = get_prob(count[0], tot_count);

   tot_count -= count[0];

@@ -403,19 +356,12 @@

 // they are not allowed for a given segment.

 void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {

   int norm_cnt[MAX_REF_FRAMES];

-  int intra_count;

-  int inter_count;

-  int last_count;

-  int gfarf_count;

-  int gf_count;

-  int arf_count;

-  intra_count = cm->prob_intra_coded;

-  inter_count = (255 - intra_count);

-  last_count = (inter_count * cm->prob_last_coded) / 255;

-  gfarf_count = inter_count - last_count;

-  gf_count = (gfarf_count * cm->prob_gf_coded) / 255;

-  arf_count = gfarf_count - gf_count;

+  const int intra_count = cm->prob_intra_coded;

+  const int inter_count = (255 - intra_count);

+  const int last_count = (inter_count * cm->prob_last_coded) / 255;

+  const int gfarf_count = inter_count - last_count;

+  const int gf_count = (gfarf_count * cm->prob_gf_coded) / 255;

+  const int arf_count = gfarf_count - gf_count;

   // Work out modified reference frame probabilities to use where prediction

   // of the reference frame fails

--- a/vp9/common/vp9_pred_common.h

+++ b/vp9/common/vp9_pred_common.h

@@ -8,16 +8,15 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

-#include "vp9/common/vp9_onyxc_int.h"

-#include "vp9/common/vp9_blockd.h"

 #ifndef VP9_COMMON_VP9_PRED_COMMON_H_

 #define VP9_COMMON_VP9_PRED_COMMON_H_

+#include "vp9/common/vp9_blockd.h"

+#include "vp9/common/vp9_onyxc_int.h"

 // Predicted items

 typedef enum {

-  PRED_SEG_ID = 0,               // Segment identifier

+  PRED_SEG_ID = 0,  // Segment identifier

   PRED_REF = 1,

   PRED_COMP = 2,

   PRED_MBSKIP = 3,

@@ -24,32 +23,33 @@

   PRED_SWITCHABLE_INTERP = 4

 } PRED_ID;

-extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,

-                                          const MACROBLOCKD *const xd,

-                                          PRED_ID pred_id);

+unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,

+                                   const MACROBLOCKD *const xd,

+                                   PRED_ID pred_id);

-extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,

-                                  const MACROBLOCKD *const xd,

-                                  PRED_ID pred_id);

+vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,

+                           const MACROBLOCKD *const xd,

+                           PRED_ID pred_id);

-extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,

-                                          const MACROBLOCKD *const xd,

-                                          PRED_ID pred_id);

+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,

+                                   const MACROBLOCKD *const xd,

+                                   PRED_ID pred_id);

-extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,

-                                       PRED_ID pred_id);

+unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,

+                                PRED_ID pred_id);

-extern void vp9_set_pred_flag(MACROBLOCKD *const xd,

-                              PRED_ID pred_id,

-                              unsigned char pred_flag);

+void vp9_set_pred_flag(MACROBLOCKD *const xd,

+                       PRED_ID pred_id,

+                       unsigned char pred_flag);

-extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,

-                                           const MACROBLOCKD *const xd,

-                                           int MbIndex);

+unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,

+                                    const MACROBLOCKD *const xd,

+                                    int MbIndex);

-extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,

-                                       const MACROBLOCKD *const xd);

-extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm);

+MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,

+                                    const MACROBLOCKD *const xd);

+void vp9_compute_mod_refprobs(VP9_COMMON *const cm);

 #endif  // VP9_COMMON_VP9_PRED_COMMON_H_

--- a/vp9/common/vp9_quant_common.c

+++ b/vp9/common/vp9_quant_common.c

@@ -8,7 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_quant_common.h"

 static int dc_qlookup[QINDEX_RANGE];

@@ -24,7 +24,7 @@

   for (i = 0; i < QINDEX_RANGE; i++) {

     ac_qlookup[i] = current_val;

-    current_val = (int)((double)current_val * 1.02);

+    current_val = (int)(current_val * 1.02);

     if (current_val == last_val)

       current_val++;

     last_val = current_val;

@@ -38,88 +38,18 @@

-int vp9_dc_quant(int QIndex, int Delta) {

-  int retval;

-  QIndex = QIndex + Delta;

-  if (QIndex > MAXQ)

-    QIndex = MAXQ;

-  else if (QIndex < 0)

-    QIndex = 0;

-  retval = dc_qlookup[ QIndex ];

-  return retval;

+int vp9_dc_quant(int qindex, int delta) {

+  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];

-int vp9_dc2quant(int QIndex, int Delta) {

-  int retval;

-  QIndex = QIndex + Delta;

-  if (QIndex > MAXQ)

-    QIndex = MAXQ;

-  else if (QIndex < 0)

-    QIndex = 0;

-  retval = dc_qlookup[ QIndex ];

-  return retval;

+int vp9_dc_uv_quant(int qindex, int delta) {

+  return dc_qlookup[clamp(qindex + delta, 0, MAXQ)];

-int vp9_dc_uv_quant(int QIndex, int Delta) {

-  int retval;

-  QIndex = QIndex + Delta;

-  if (QIndex > MAXQ)

-    QIndex = MAXQ;

-  else if (QIndex < 0)

-    QIndex = 0;

-  retval = dc_qlookup[ QIndex ];

-  return retval;

+int vp9_ac_yquant(int qindex) {

+  return ac_qlookup[clamp(qindex, 0, MAXQ)];

-int vp9_ac_yquant(int QIndex) {

-  int retval;

-  if (QIndex > MAXQ)

-    QIndex = MAXQ;

-  else if (QIndex < 0)

-    QIndex = 0;

-  retval = ac_qlookup[ QIndex ];

-  return retval;

-}

-int vp9_ac2quant(int QIndex, int Delta) {

-  int retval;

-  QIndex = QIndex + Delta;

-  if (QIndex > MAXQ)

-    QIndex = MAXQ;

-  else if (QIndex < 0)

-    QIndex = 0;

-  retval = (ac_qlookup[ QIndex ] * 775) / 1000;

-  if (retval < 4)

-    retval = 4;

-  return retval;

-}

-int vp9_ac_uv_quant(int QIndex, int Delta) {

-  int retval;

-  QIndex = QIndex + Delta;

-  if (QIndex > MAXQ)

-    QIndex = MAXQ;

-  else if (QIndex < 0)

-    QIndex = 0;

-  retval = ac_qlookup[ QIndex ];

-  return retval;

+int vp9_ac_uv_quant(int qindex, int delta) {

+  return ac_qlookup[clamp(qindex + delta, 0, MAXQ)];

--- a/vp9/common/vp9_quant_common.h

+++ b/vp9/common/vp9_quant_common.h

@@ -11,16 +11,15 @@

 #ifndef VP9_COMMON_VP9_QUANT_COMMON_H_

 #define VP9_COMMON_VP9_QUANT_COMMON_H_

-#include "string.h"

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_onyxc_int.h"

-extern void vp9_init_quant_tables(void);

-extern int vp9_ac_yquant(int QIndex);

-extern int vp9_dc_quant(int QIndex, int Delta);

-extern int vp9_dc2quant(int QIndex, int Delta);

-extern int vp9_ac2quant(int QIndex, int Delta);

-extern int vp9_dc_uv_quant(int QIndex, int Delta);

-extern int vp9_ac_uv_quant(int QIndex, int Delta);

+void vp9_init_quant_tables();

+int vp9_ac_yquant(int qindex);

+int vp9_dc_quant(int qindex, int delta);

+int vp9_dc2quant(int qindex, int delta);

+int vp9_ac2quant(int qindex, int delta);

+int vp9_dc_uv_quant(int qindex, int delta);

+int vp9_ac_uv_quant(int qindex, int delta);

 #endif  // VP9_COMMON_VP9_QUANT_COMMON_H_

--- a/vp9/common/vp9_recon.c

+++ b/vp9/common/vp9_recon.c

@@ -117,7 +117,7 @@

 void vp9_recon_sby_s_c(MACROBLOCKD *xd, uint8_t *dst) {

   int x, y, stride = xd->block[0].dst_stride;

-  int16_t *diff = xd->sb_coeff_data.diff;

+  int16_t *diff = xd->diff;

   for (y = 0; y < 32; y++) {

     for (x = 0; x < 32; x++) {

@@ -130,8 +130,8 @@

 void vp9_recon_sbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {

   int x, y, stride = xd->block[16].dst_stride;

-  int16_t *udiff = xd->sb_coeff_data.diff + 1024;

-  int16_t *vdiff = xd->sb_coeff_data.diff + 1280;

+  int16_t *udiff = xd->diff + 1024;

+  int16_t *vdiff = xd->diff + 1280;

   for (y = 0; y < 16; y++) {

     for (x = 0; x < 16; x++) {

@@ -142,6 +142,36 @@

     vdst += stride;

     udiff += 16;

     vdiff += 16;

+  }

+}

+void vp9_recon_sb64y_s_c(MACROBLOCKD *xd, uint8_t *dst) {

+  int x, y, stride = xd->block[0].dst_stride;

+  int16_t *diff = xd->diff;

+  for (y = 0; y < 64; y++) {

+    for (x = 0; x < 64; x++) {

+      dst[x] = clip_pixel(dst[x] + diff[x]);

+    }

+    dst += stride;

+    diff += 64;

+  }

+}

+void vp9_recon_sb64uv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {

+  int x, y, stride = xd->block[16].dst_stride;

+  int16_t *udiff = xd->diff + 4096;

+  int16_t *vdiff = xd->diff + 4096 + 1024;

+  for (y = 0; y < 32; y++) {

+    for (x = 0; x < 32; x++) {

+      udst[x] = clip_pixel(udst[x] + udiff[x]);

+      vdst[x] = clip_pixel(vdst[x] + vdiff[x]);

+    }

+    udst += stride;

+    vdst += stride;

+    udiff += 32;

+    vdiff += 32;

--- a/vp9/common/vp9_reconinter.c

+++ b/vp9/common/vp9_reconinter.c

@@ -8,66 +8,252 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

 #include "./vpx_config.h"

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_blockd.h"

+#include "vp9/common/vp9_filter.h"

 #include "vp9/common/vp9_reconinter.h"

 #include "vp9/common/vp9_reconintra.h"

+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,

+                                       YV12_BUFFER_CONFIG *other,

+                                       int this_w, int this_h) {

+  int other_h = other->y_crop_height;

+  int other_w = other->y_crop_width;

+  scale->x_num = other_w;

+  scale->x_den = this_w;

+  scale->x_offset_q4 = 0;  // calculated per-mb

+  scale->x_step_q4 = 16 * other_w / this_w;

+  scale->y_num = other_h;

+  scale->y_den = this_h;

+  scale->y_offset_q4 = 0;  // calculated per-mb

+  scale->y_step_q4 = 16 * other_h / this_h;

+  // TODO(agrange): Investigate the best choice of functions to use here

+  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what

+  // to do at full-pel offsets. The current selection, where the filter is

+  // applied in one direction only, and not at all for 0,0, seems to give the

+  // best quality, but it may be worth trying an additional mode that does

+  // do the filtering on full-pel.

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+  if (scale->x_step_q4 == 16) {

+    if (scale->y_step_q4 == 16) {

+      // No scaling in either direction.

+      scale->predict[0][0][0] = vp9_convolve_copy;

+      scale->predict[0][0][1] = vp9_convolve_1by8;

+      scale->predict[0][0][2] = vp9_convolve_qtr;

+      scale->predict[0][0][3] = vp9_convolve_3by8;

+      scale->predict[0][0][4] = vp9_convolve_avg;

+      scale->predict[0][0][5] = vp9_convolve_5by8;

+      scale->predict[0][0][6] = vp9_convolve_3qtr;

+      scale->predict[0][0][7] = vp9_convolve_7by8;

+      scale->predict[0][1][0] = vp9_convolve8_vert;

+      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;

+      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;

+      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;

+      scale->predict[0][1][4] = vp9_convolve8_avg_vert;

+      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;

+      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;

+      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;

+      scale->predict[1][0][0] = vp9_convolve8_horiz;

+      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;

+      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;

+      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;

+      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;

+      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;

+      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;

+      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;

+    } else {

+      // No scaling in x direction. Must always scale in the y direction.

+      scale->predict[0][0][0] = vp9_convolve8_vert;

+      scale->predict[0][0][1] = vp9_convolve8_1by8_vert;

+      scale->predict[0][0][2] = vp9_convolve8_qtr_vert;

+      scale->predict[0][0][3] = vp9_convolve8_3by8_vert;

+      scale->predict[0][0][4] = vp9_convolve8_avg_vert;

+      scale->predict[0][0][5] = vp9_convolve8_5by8_vert;

+      scale->predict[0][0][6] = vp9_convolve8_3qtr_vert;

+      scale->predict[0][0][7] = vp9_convolve8_7by8_vert;

+      scale->predict[0][1][0] = vp9_convolve8_vert;

+      scale->predict[0][1][1] = vp9_convolve8_1by8_vert;

+      scale->predict[0][1][2] = vp9_convolve8_qtr_vert;

+      scale->predict[0][1][3] = vp9_convolve8_3by8_vert;

+      scale->predict[0][1][4] = vp9_convolve8_avg_vert;

+      scale->predict[0][1][5] = vp9_convolve8_5by8_vert;

+      scale->predict[0][1][6] = vp9_convolve8_3qtr_vert;

+      scale->predict[0][1][7] = vp9_convolve8_7by8_vert;

+      scale->predict[1][0][0] = vp9_convolve8;

+      scale->predict[1][0][1] = vp9_convolve8_1by8;

+      scale->predict[1][0][2] = vp9_convolve8_qtr;

+      scale->predict[1][0][3] = vp9_convolve8_3by8;

+      scale->predict[1][0][4] = vp9_convolve8_avg;

+      scale->predict[1][0][5] = vp9_convolve8_5by8;

+      scale->predict[1][0][6] = vp9_convolve8_3qtr;

+      scale->predict[1][0][7] = vp9_convolve8_7by8;

+    }

+  } else {

+    if (scale->y_step_q4 == 16) {

+      // No scaling in the y direction. Must always scale in the x direction.

+      scale->predict[0][0][0] = vp9_convolve8_horiz;

+      scale->predict[0][0][1] = vp9_convolve8_1by8_horiz;

+      scale->predict[0][0][2] = vp9_convolve8_qtr_horiz;

+      scale->predict[0][0][3] = vp9_convolve8_3by8_horiz;

+      scale->predict[0][0][4] = vp9_convolve8_avg_horiz;

+      scale->predict[0][0][5] = vp9_convolve8_5by8_horiz;

+      scale->predict[0][0][6] = vp9_convolve8_3qtr_horiz;

+      scale->predict[0][0][7] = vp9_convolve8_7by8_horiz;

+      scale->predict[0][1][0] = vp9_convolve8;

+      scale->predict[0][1][1] = vp9_convolve8_1by8;

+      scale->predict[0][1][2] = vp9_convolve8_qtr;

+      scale->predict[0][1][3] = vp9_convolve8_3by8;

+      scale->predict[0][1][4] = vp9_convolve8_avg;

+      scale->predict[0][1][5] = vp9_convolve8_5by8;

+      scale->predict[0][1][6] = vp9_convolve8_3qtr;

+      scale->predict[0][1][7] = vp9_convolve8_7by8;

+      scale->predict[1][0][0] = vp9_convolve8_horiz;

+      scale->predict[1][0][1] = vp9_convolve8_1by8_horiz;

+      scale->predict[1][0][2] = vp9_convolve8_qtr_horiz;

+      scale->predict[1][0][3] = vp9_convolve8_3by8_horiz;

+      scale->predict[1][0][4] = vp9_convolve8_avg_horiz;

+      scale->predict[1][0][5] = vp9_convolve8_5by8_horiz;

+      scale->predict[1][0][6] = vp9_convolve8_3qtr_horiz;

+      scale->predict[1][0][7] = vp9_convolve8_7by8_horiz;

+    } else {

+      // Must always scale in both directions.

+      scale->predict[0][0][0] = vp9_convolve8;

+      scale->predict[0][0][1] = vp9_convolve8_1by8;

+      scale->predict[0][0][2] = vp9_convolve8_qtr;

+      scale->predict[0][0][3] = vp9_convolve8_3by8;

+      scale->predict[0][0][4] = vp9_convolve8_avg;

+      scale->predict[0][0][5] = vp9_convolve8_5by8;

+      scale->predict[0][0][6] = vp9_convolve8_3qtr;

+      scale->predict[0][0][7] = vp9_convolve8_7by8;

+      scale->predict[0][1][0] = vp9_convolve8;

+      scale->predict[0][1][1] = vp9_convolve8_1by8;

+      scale->predict[0][1][2] = vp9_convolve8_qtr;

+      scale->predict[0][1][3] = vp9_convolve8_3by8;

+      scale->predict[0][1][4] = vp9_convolve8_avg;

+      scale->predict[0][1][5] = vp9_convolve8_5by8;

+      scale->predict[0][1][6] = vp9_convolve8_3qtr;

+      scale->predict[0][1][7] = vp9_convolve8_7by8;

+      scale->predict[1][0][0] = vp9_convolve8;

+      scale->predict[1][0][1] = vp9_convolve8_1by8;

+      scale->predict[1][0][2] = vp9_convolve8_qtr;

+      scale->predict[1][0][3] = vp9_convolve8_3by8;

+      scale->predict[1][0][4] = vp9_convolve8_avg;

+      scale->predict[1][0][5] = vp9_convolve8_5by8;

+      scale->predict[1][0][6] = vp9_convolve8_3qtr;

+      scale->predict[1][0][7] = vp9_convolve8_7by8;

+    }

+  }

+  // 2D subpel motion always gets filtered in both directions

+  scale->predict[1][1][0] = vp9_convolve8;

+  scale->predict[1][1][1] = vp9_convolve8_1by8;

+  scale->predict[1][1][2] = vp9_convolve8_qtr;

+  scale->predict[1][1][3] = vp9_convolve8_3by8;

+  scale->predict[1][1][4] = vp9_convolve8_avg;

+  scale->predict[1][1][5] = vp9_convolve8_5by8;

+  scale->predict[1][1][6] = vp9_convolve8_3qtr;

+  scale->predict[1][1][7] = vp9_convolve8_7by8;

+}

+#else

+  if (scale->x_step_q4 == 16) {

+    if (scale->y_step_q4 == 16) {

+      // No scaling in either direction.

+      scale->predict[0][0][0] = vp9_convolve_copy;

+      scale->predict[0][0][1] = vp9_convolve_avg;

+      scale->predict[0][1][0] = vp9_convolve8_vert;

+      scale->predict[0][1][1] = vp9_convolve8_avg_vert;

+      scale->predict[1][0][0] = vp9_convolve8_horiz;

+      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;

+    } else {

+      // No scaling in x direction. Must always scale in the y direction.

+      scale->predict[0][0][0] = vp9_convolve8_vert;

+      scale->predict[0][0][1] = vp9_convolve8_avg_vert;

+      scale->predict[0][1][0] = vp9_convolve8_vert;

+      scale->predict[0][1][1] = vp9_convolve8_avg_vert;

+      scale->predict[1][0][0] = vp9_convolve8;

+      scale->predict[1][0][1] = vp9_convolve8_avg;

+    }

+  } else {

+    if (scale->y_step_q4 == 16) {

+      // No scaling in the y direction. Must always scale in the x direction.

+      scale->predict[0][0][0] = vp9_convolve8_horiz;

+      scale->predict[0][0][1] = vp9_convolve8_avg_horiz;

+      scale->predict[0][1][0] = vp9_convolve8;

+      scale->predict[0][1][1] = vp9_convolve8_avg;

+      scale->predict[1][0][0] = vp9_convolve8_horiz;

+      scale->predict[1][0][1] = vp9_convolve8_avg_horiz;

+    } else {

+      // Must always scale in both directions.

+      scale->predict[0][0][0] = vp9_convolve8;

+      scale->predict[0][0][1] = vp9_convolve8_avg;

+      scale->predict[0][1][0] = vp9_convolve8;

+      scale->predict[0][1][1] = vp9_convolve8_avg;

+      scale->predict[1][0][0] = vp9_convolve8;

+      scale->predict[1][0][1] = vp9_convolve8_avg;

+    }

+  }

+  // 2D subpel motion always gets filtered in both directions

+  scale->predict[1][1][0] = vp9_convolve8;

+  scale->predict[1][1][1] = vp9_convolve8_avg;

+}

+#endif

 void vp9_setup_interp_filters(MACROBLOCKD *xd,

                               INTERPOLATIONFILTERTYPE mcomp_filter_type,

                               VP9_COMMON *cm) {

-#if CONFIG_ENABLE_6TAP

-  if (mcomp_filter_type == SIXTAP) {

-    xd->subpixel_predict4x4     = vp9_sixtap_predict4x4;

-    xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;

-    xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;

-    xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;

-    xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;

-    xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;

-    xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;

-  } else {

-#endif

-  if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {

-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4;

-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;

-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;

-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;

-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;

-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;

-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;

-  } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {

-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_smooth;

-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_smooth;

-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_smooth;

-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_smooth;

-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;

-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;

-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;

-  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {

-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_sharp;

-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;

-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;

-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;

-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;

-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;

-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;

-  } else {

-    xd->subpixel_predict4x4     = vp9_bilinear_predict4x4;

-    xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;

-    xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;

-    xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;

-    xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;

-    xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;

-    xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;

+  int i;

+  /* Calculate scaling factors for each of the 3 available references */

+  for (i = 0; i < 3; ++i) {

+    if (cm->active_ref_idx[i] >= NUM_YV12_BUFFERS) {

+      memset(&cm->active_ref_scale[i], 0, sizeof(cm->active_ref_scale[i]));

+      continue;

+    }

+    vp9_setup_scale_factors_for_frame(&cm->active_ref_scale[i],

+                                      &cm->yv12_fb[cm->active_ref_idx[i]],

+                                      cm->width, cm->height);

-#if CONFIG_ENABLE_6TAP

+  if (xd->mode_info_context) {

+    MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

+    set_scale_factors(xd,

+                      mbmi->ref_frame - 1,

+                      mbmi->second_ref_frame - 1,

+                      cm->active_ref_scale);

+  switch (mcomp_filter_type) {

+    case EIGHTTAP:

+    case SWITCHABLE:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;

+      break;

+    case EIGHTTAP_SMOOTH:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;

+      break;

+    case EIGHTTAP_SHARP:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;

+      break;

+    case BILINEAR:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;

+      break;

+#if CONFIG_ENABLE_6TAP

+    case SIXTAP:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;

+      break;

 #endif

+  }

+  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);

-void vp9_copy_mem16x16_c(uint8_t *src,

+void vp9_copy_mem16x16_c(const uint8_t *src,

                          int src_stride,

                          uint8_t *dst,

                          int dst_stride) {

@@ -93,10 +279,10 @@

     dst[15] = src[15];

 #else

-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];

-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];

-    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];

-    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];

+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];

+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];

+    ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];

+    ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];

 #endif

     src += src_stride;

@@ -104,25 +290,7 @@

-void vp9_avg_mem16x16_c(uint8_t *src,

-                        int src_stride,

-                        uint8_t *dst,

-                        int dst_stride) {

-  int r;

-  for (r = 0; r < 16; r++) {

-    int n;

-    for (n = 0; n < 16; n++) {

-      dst[n] = (dst[n] + src[n] + 1) >> 1;

-    }

-    src += src_stride;

-    dst += dst_stride;

-  }

-}

-void vp9_copy_mem8x8_c(uint8_t *src,

+void vp9_copy_mem8x8_c(const uint8_t *src,

                        int src_stride,

                        uint8_t *dst,

                        int dst_stride) {

@@ -139,8 +307,8 @@

     dst[6] = src[6];

     dst[7] = src[7];

 #else

-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];

-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];

+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];

+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];

 #endif

     src += src_stride;

     dst += dst_stride;

@@ -147,25 +315,7 @@

-void vp9_avg_mem8x8_c(uint8_t *src,

-                      int src_stride,

-                      uint8_t *dst,

-                      int dst_stride) {

-  int r;

-  for (r = 0; r < 8; r++) {

-    int n;

-    for (n = 0; n < 8; n++) {

-      dst[n] = (dst[n] + src[n] + 1) >> 1;

-    }

-    src += src_stride;

-    dst += dst_stride;

-  }

-}

-void vp9_copy_mem8x4_c(uint8_t *src,

+void vp9_copy_mem8x4_c(const uint8_t *src,

                        int src_stride,

                        uint8_t *dst,

                        int dst_stride) {

@@ -182,8 +332,8 @@

     dst[6] = src[6];

     dst[7] = src[7];

 #else

-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];

-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];

+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];

+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];

 #endif

     src += src_stride;

     dst += dst_stride;

@@ -190,236 +340,193 @@

-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {

-  int r;

-  uint8_t *ptr_base;

-  uint8_t *ptr;

-  uint8_t *pred_ptr = d->predictor;

-  int_mv mv;

+static void set_scaled_offsets(struct scale_factors *scale,

+                               int row, int col) {

+  const int x_q4 = 16 * col;

+  const int y_q4 = 16 * row;

-  ptr_base = *(d->base_pre);

-  mv.as_int = d->bmi.as_mv.first.as_int;

+  scale->x_offset_q4 = (x_q4 * scale->x_num / scale->x_den) & 0xf;

+  scale->y_offset_q4 = (y_q4 * scale->y_num / scale->y_den) & 0xf;

+}

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-          (mv.as_mv.col >> 3);

-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,

-         pred_ptr, pitch);

-  } else {

-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-                (mv.as_mv.col >> 3);

-    ptr = ptr_base;

+static int32_t scale_motion_vector_component_q3(int mv_q3,

+                                                int num,

+                                                int den,

+                                                int offset_q4) {

+  // returns the scaled and offset value of the mv component.

+  const int32_t mv_q4 = mv_q3 << 1;

-    for (r = 0; r < 4; r++) {

-#if !(CONFIG_FAST_UNALIGNED)

-      pred_ptr[0]  = ptr[0];

-      pred_ptr[1]  = ptr[1];

-      pred_ptr[2]  = ptr[2];

-      pred_ptr[3]  = ptr[3];

-#else

-      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;

-#endif

-      pred_ptr     += pitch;

-      ptr         += d->pre_stride;

-    }

-  }

+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */

+  return mv_q4 * num / den + offset_q4;

-/*

- * Similar to vp9_build_inter_predictors_b(), but instead of storing the

- * results in d->predictor, we average the contents of d->predictor (which

- * come from an earlier call to vp9_build_inter_predictors_b()) with the

- * predictor of the second reference frame / motion vector.

- */

-void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,

-                                      vp9_subpix_fn_t sppf) {

-  int r;

-  uint8_t *ptr_base;

-  uint8_t *ptr;

-  uint8_t *pred_ptr = d->predictor;

-  int_mv mv;

+static int32_t scale_motion_vector_component_q4(int mv_q4,

+                                                int num,

+                                                int den,

+                                                int offset_q4) {

+  // returns the scaled and offset value of the mv component.

-  ptr_base = *(d->base_second_pre);

-  mv.as_int = d->bmi.as_mv.second.as_int;

+  /* TODO(jkoleszar): make fixed point, or as a second multiply? */

+  return mv_q4 * num / den + offset_q4;

+}

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-          (mv.as_mv.col >> 3);

-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,

-         pred_ptr, pitch);

-  } else {

-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-                (mv.as_mv.col >> 3);

-    ptr = ptr_base;

+static int_mv32 scale_motion_vector_q3_to_q4(

+    const int_mv *src_mv,

+    const struct scale_factors *scale) {

+  // returns mv * scale + offset

+  int_mv32 result;

-    for (r = 0; r < 4; r++) {

-      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;

-      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;

-      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;

-      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;

-      pred_ptr    += pitch;

-      ptr         += d->pre_stride;

-    }

-  }

+  result.as_mv.row = scale_motion_vector_component_q3(src_mv->as_mv.row,

+                                                      scale->y_num,

+                                                      scale->y_den,

+                                                      scale->y_offset_q4);

+  result.as_mv.col = scale_motion_vector_component_q3(src_mv->as_mv.col,

+                                                      scale->x_num,

+                                                      scale->x_den,

+                                                      scale->x_offset_q4);

+  return result;

-void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {

-  uint8_t *ptr_base;

-  uint8_t *ptr;

-  uint8_t *pred_ptr = d->predictor;

-  int_mv mv;

-  ptr_base = *(d->base_pre);

-  mv.as_int = d->bmi.as_mv.first.as_int;

-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-        (mv.as_mv.col >> 3);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,

-                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);

-  } else {

-    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);

-  }

+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int_mv *mv_q3,

+                               const struct scale_factors *scale,

+                               int w, int h, int weight,

+                               const struct subpix_fn_table *subpix) {

+  int_mv32 mv = scale_motion_vector_q3_to_q4(mv_q3, scale);

+  src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4);

+  scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight](

+      src, src_stride, dst, dst_stride,

+      subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4,

+      subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4,

+      w, h);

-/*

- * Similar to build_inter_predictors_4b(), but instead of storing the

- * results in d->predictor, we average the contents of d->predictor (which

- * come from an earlier call to build_inter_predictors_4b()) with the

- * predictor of the second reference frame / motion vector.

+/* Like vp9_build_inter_predictor, but takes the full-pel part of the

+ * mv separately, and the fractional part as a q4.

*/

-void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,

-                                      BLOCKD *d, int pitch) {

-  uint8_t *ptr_base;

-  uint8_t *ptr;

-  uint8_t *pred_ptr = d->predictor;

-  int_mv mv;

+void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,

+                                  uint8_t *dst, int dst_stride,

+                                  const int_mv *fullpel_mv_q3,

+                                  const int_mv *frac_mv_q4,

+                                  const struct scale_factors *scale,

+                                  int w, int h, int weight,

+                                  const struct subpix_fn_table *subpix) {

+  const int mv_row_q4 = ((fullpel_mv_q3->as_mv.row >> 3) << 4)

+                        + (frac_mv_q4->as_mv.row & 0xf);

+  const int mv_col_q4 = ((fullpel_mv_q3->as_mv.col >> 3) << 4)

+                        + (frac_mv_q4->as_mv.col & 0xf);

+  const int scaled_mv_row_q4 =

+      scale_motion_vector_component_q4(mv_row_q4, scale->y_num, scale->y_den,

+                                       scale->y_offset_q4);

+  const int scaled_mv_col_q4 =

+      scale_motion_vector_component_q4(mv_col_q4, scale->x_num, scale->x_den,

+                                       scale->x_offset_q4);

+  const int subpel_x = scaled_mv_col_q4 & 15;

+  const int subpel_y = scaled_mv_row_q4 & 15;

-  ptr_base = *(d->base_second_pre);

-  mv.as_int = d->bmi.as_mv.second.as_int;

-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-        (mv.as_mv.col >> 3);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,

-                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);

-  } else {

-    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);

-  }

+  src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4);

+  scale->predict[!!subpel_x][!!subpel_y][weight](

+      src, src_stride, dst, dst_stride,

+      subpix->filter_x[subpel_x], scale->x_step_q4,

+      subpix->filter_y[subpel_y], scale->y_step_q4,

+      w, h);

-static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {

-  uint8_t *ptr_base;

-  uint8_t *ptr;

-  uint8_t *pred_ptr = d->predictor;

-  int_mv mv;

+static void build_2x1_inter_predictor_wh(const BLOCKD *d0, const BLOCKD *d1,

+                                         struct scale_factors *scale,

+                                         uint8_t *predictor,

+                                         int block_size, int stride,

+                                         int which_mv, int weight,

+                                         int width, int height,

+                                         const struct subpix_fn_table *subpix,

+                                         int row, int col) {

+  assert(d1->predictor - d0->predictor == block_size);

+  assert(d1->pre == d0->pre + block_size);

-  ptr_base = *(d->base_pre);

-  mv.as_int = d->bmi.as_mv.first.as_int;

-  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-        (mv.as_mv.col >> 3);

+  set_scaled_offsets(&scale[which_mv], row, col);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,

-                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);

-  } else {

-    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);

-  }

-}

+  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {

+    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;

-/*encoder only*/

-void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {

-  int i, j;

-  BLOCKD *blockd = xd->block;

+    vp9_build_inter_predictor(*base_pre + d0->pre,

+                              d0->pre_stride,

+                              predictor, stride,

+                              &d0->bmi.as_mv[which_mv],

+                              &scale[which_mv],

+                              width, height,

+                              weight, subpix);

-  /* build uv mvs */

-  for (i = 0; i < 2; i++) {

-    for (j = 0; j < 2; j++) {

-      int yoffset = i * 8 + j * 2;

-      int uoffset = 16 + i * 2 + j;

-      int voffset = 20 + i * 2 + j;

-      int temp;

+  } else {

+    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;

+    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;

-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.row

-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row

-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row

-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row;

+    vp9_build_inter_predictor(*base_pre0 + d0->pre,

+                              d0->pre_stride,

+                              predictor, stride,

+                              &d0->bmi.as_mv[which_mv],

+                              &scale[which_mv],

+                              width > block_size ? block_size : width, height,

+                              weight, subpix);

-      if (temp < 0) temp -= 4;

-      else temp += 4;

+    if (width <= block_size) return;

-      xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &

-        xd->fullpixel_mask;

+    set_scaled_offsets(&scale[which_mv], row, col + block_size);

-      temp = blockd[yoffset  ].bmi.as_mv.first.as_mv.col

-             + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col

-             + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col

-             + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col;

+    vp9_build_inter_predictor(*base_pre1 + d1->pre,

+                              d1->pre_stride,

+                              predictor + block_size, stride,

+                              &d1->bmi.as_mv[which_mv],

+                              &scale[which_mv],

+                              width - block_size, height,

+                              weight, subpix);

+  }

+}

-      if (temp < 0) temp -= 4;

-      else temp += 4;

+static void build_2x1_inter_predictor(const BLOCKD *d0, const BLOCKD *d1,

+                                      struct scale_factors *scale,

+                                      int block_size, int stride,

+                                      int which_mv, int weight,

+                                      const struct subpix_fn_table *subpix,

+                                      int row, int col) {

+  assert(d1->predictor - d0->predictor == block_size);

+  assert(d1->pre == d0->pre + block_size);

-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &

-        xd->fullpixel_mask;

+  set_scaled_offsets(&scale[which_mv], row, col);

-      blockd[voffset].bmi.as_mv.first.as_mv.row =

-        blockd[uoffset].bmi.as_mv.first.as_mv.row;

-      blockd[voffset].bmi.as_mv.first.as_mv.col =

-        blockd[uoffset].bmi.as_mv.first.as_mv.col;

+  if (d0->bmi.as_mv[which_mv].as_int == d1->bmi.as_mv[which_mv].as_int) {

+    uint8_t **base_pre = which_mv ? d0->base_second_pre : d0->base_pre;

-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.row

-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row

-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row

-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row;

+    vp9_build_inter_predictor(*base_pre + d0->pre,

+                              d0->pre_stride,

+                              d0->predictor, stride,

+                              &d0->bmi.as_mv[which_mv],

+                              &scale[which_mv],

+                              2 * block_size, block_size,

+                              weight, subpix);

-        if (temp < 0) {

-          temp -= 4;

-        } else {

-          temp += 4;

-        }

+  } else {

+    uint8_t **base_pre0 = which_mv ? d0->base_second_pre : d0->base_pre;

+    uint8_t **base_pre1 = which_mv ? d1->base_second_pre : d1->base_pre;

-        blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &

-          xd->fullpixel_mask;

+    vp9_build_inter_predictor(*base_pre0 + d0->pre,

+                              d0->pre_stride,

+                              d0->predictor, stride,

+                              &d0->bmi.as_mv[which_mv],

+                              &scale[which_mv],

+                              block_size, block_size,

+                              weight, subpix);

-        temp = blockd[yoffset  ].bmi.as_mv.second.as_mv.col

-               + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col

-               + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col

-               + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col;

+    set_scaled_offsets(&scale[which_mv], row, col + block_size);

-        if (temp < 0) {

-          temp -= 4;

-        } else {

-          temp += 4;

-        }

-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &

-          xd->fullpixel_mask;

-        blockd[voffset].bmi.as_mv.second.as_mv.row =

-          blockd[uoffset].bmi.as_mv.second.as_mv.row;

-        blockd[voffset].bmi.as_mv.second.as_mv.col =

-          blockd[uoffset].bmi.as_mv.second.as_mv.col;

-      }

-    }

+    vp9_build_inter_predictor(*base_pre1 + d1->pre,

+                              d1->pre_stride,

+                              d1->predictor, stride,

+                              &d1->bmi.as_mv[which_mv],

+                              &scale[which_mv],

+                              block_size, block_size,

+                              weight, subpix);

-  for (i = 16; i < 24; i += 2) {

-    BLOCKD *d0 = &blockd[i];

-    BLOCKD *d1 = &blockd[i + 1];

-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)

-      build_inter_predictors2b(xd, d0, 8);

-    else {

-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);

-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);

-    }

-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);

-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);

-    }

-  }

 static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) {

@@ -458,102 +565,653 @@

             (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row;

-/*encoder only*/

-void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,

-                                             uint8_t *dst_y,

-                                             int dst_ystride,

-                                             int clamp_mvs) {

-  uint8_t *ptr_base = xd->pre.y_buffer;

-  uint8_t *ptr;

-  int pre_stride = xd->block[0].pre_stride;

+#define AVERAGE_WEIGHT  (1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT))

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+// Whether to use implicit weighting for UV

+#define USE_IMPLICIT_WEIGHT_UV

+// Whether to use implicit weighting for SplitMV

+// #define USE_IMPLICIT_WEIGHT_SPLITMV

+// #define SEARCH_MIN3

+static int64_t get_consistency_metric(MACROBLOCKD *xd,

+                                      uint8_t *tmp_y, int tmp_ystride) {

+  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;

+  uint8_t *rec_y = xd->dst.y_buffer;

+  int rec_ystride = xd->dst.y_stride;

+  int64_t metric = 0;

+  int i;

+  if (xd->up_available) {

+    for (i = 0; i < block_size; ++i) {

+      int diff = abs(*(rec_y - rec_ystride + i) -

+                     *(tmp_y + i));

+#ifdef SEARCH_MIN3

+      // Searches for the min abs diff among 3 pixel neighbors in the border

+      int diff1 = xd->left_available ?

+          abs(*(rec_y - rec_ystride + i - 1) - *(tmp_y + i)) : diff;

+      int diff2 = i < block_size - 1 ?

+          abs(*(rec_y - rec_ystride + i + 1) - *(tmp_y + i)) : diff;

+      diff = diff <= diff1 ? diff : diff1;

+      diff = diff <= diff2 ? diff : diff2;

+#endif

+      metric += diff;

+    }

+  }

+  if (xd->left_available) {

+    for (i = 0; i < block_size; ++i) {

+      int diff = abs(*(rec_y - 1 + i * rec_ystride) -

+                     *(tmp_y + i * tmp_ystride));

+#ifdef SEARCH_MIN3

+      // Searches for the min abs diff among 3 pixel neighbors in the border

+      int diff1 = xd->up_available ?

+          abs(*(rec_y - 1 + (i - 1) * rec_ystride) -

+                      *(tmp_y + i * tmp_ystride)) : diff;

+      int diff2 = i < block_size - 1 ?

+          abs(*(rec_y - 1 + (i + 1) * rec_ystride) -

+              *(tmp_y + i * tmp_ystride)) : diff;

+      diff = diff <= diff1 ? diff : diff1;

+      diff = diff <= diff2 ? diff : diff2;

+#endif

+      metric += diff;

+    }

+  }

+  return metric;

+}

+static int get_weight(MACROBLOCKD *xd, int64_t metric_1, int64_t metric_2) {

+  int weight = AVERAGE_WEIGHT;

+  if (2 * metric_1 < metric_2)

+    weight = 6;

+  else if (4 * metric_1 < 3 * metric_2)

+    weight = 5;

+  else if (2 * metric_2 < metric_1)

+    weight = 2;

+  else if (4 * metric_2 < 3 * metric_1)

+    weight = 3;

+  return weight;

+}

+#ifdef USE_IMPLICIT_WEIGHT_SPLITMV

+static int get_implicit_compoundinter_weight_splitmv(

+    MACROBLOCKD *xd, int mb_row, int mb_col) {

+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

+  BLOCKD *blockd = xd->block;

+  const int use_second_ref = mbmi->second_ref_frame > 0;

+  int64_t metric_2 = 0, metric_1 = 0;

+  int i, which_mv, weight;

+  uint8_t tmp_y[256];

+  const int tmp_ystride = 16;

+  if (!use_second_ref) return 0;

+  if (!(xd->up_available || xd->left_available))

+    return AVERAGE_WEIGHT;

+  assert(xd->mode_info_context->mbmi.mode == SPLITMV);

+  which_mv = 1;  // second predictor

+  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {

+    for (i = 0; i < 16; i += 8) {

+      BLOCKD *d0 = &blockd[i];

+      BLOCKD *d1 = &blockd[i + 2];

+      const int y = i & 8;

+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];

+      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];

+      if (mbmi->need_to_clamp_mvs) {

+        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);

+        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);

+      }

+      if (i == 0) {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,

+                                     which_mv, 0, 16, 1,

+                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,

+                                     which_mv, 0, 1, 8,

+                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);

+      } else {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,

+                                     8, 16, which_mv, 0, 1, 8,

+                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);

+      }

+    }

+  } else {

+    for (i = 0; i < 16; i += 2) {

+      BLOCKD *d0 = &blockd[i];

+      BLOCKD *d1 = &blockd[i + 1];

+      const int x = (i & 3) * 4;

+      const int y = (i >> 2) * 4;

+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];

+      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];

+      if (i >= 4 && (i & 3) != 0) continue;

+      if (i == 0) {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,

+                                     which_mv, 0, 8, 1, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,

+                                     which_mv, 0, 1, 4, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+      } else if (i < 4) {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,

+                                     which_mv, 0, 8, 1, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+      } else {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,

+                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+      }

+    }

+  }

+  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);

+  which_mv = 0;  // first predictor

+  if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {

+    for (i = 0; i < 16; i += 8) {

+      BLOCKD *d0 = &blockd[i];

+      BLOCKD *d1 = &blockd[i + 2];

+      const int y = i & 8;

+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];

+      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];

+      if (mbmi->need_to_clamp_mvs) {

+        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);

+        clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);

+      }

+      if (i == 0) {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,

+                                     which_mv, 0, 16, 1,

+                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 8, 16,

+                                     which_mv, 0, 1, 8,

+                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);

+      } else {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + 8 * 16,

+                                     8, 16, which_mv, 0, 1, 8,

+                                     &xd->subpix, mb_row * 16 + y, mb_col * 16);

+      }

+    }

+  } else {

+    for (i = 0; i < 16; i += 2) {

+      BLOCKD *d0 = &blockd[i];

+      BLOCKD *d1 = &blockd[i + 1];

+      const int x = (i & 3) * 4;

+      const int y = (i >> 2) * 4;

+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];

+      blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];

+      if (i >= 4 && (i & 3) != 0) continue;

+      if (i == 0) {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,

+                                     which_mv, 0, 8, 1, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y, 4, 16,

+                                     which_mv, 0, 1, 4, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+      } else if (i < 4) {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + x, 4, 16,

+                                     which_mv, 0, 8, 1, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+      } else {

+        build_2x1_inter_predictor_wh(d0, d1, xd->scale_factor, tmp_y + y * 16,

+                                     4, 16, which_mv, 0, 1, 4, &xd->subpix,

+                                     mb_row * 16 + y, mb_col * 16 + x);

+      }

+    }

+  }

+  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);

+  // Choose final weight for averaging

+  weight = get_weight(xd, metric_1, metric_2);

+  return weight;

+}

+#endif

+static int get_implicit_compoundinter_weight(MACROBLOCKD *xd,

+                                             int mb_row,

+                                             int mb_col) {

+  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;

+  int64_t metric_2 = 0, metric_1 = 0;

+  int n, clamp_mvs, pre_stride;

+  uint8_t *base_pre;

   int_mv ymv;

+  uint8_t tmp_y[4096];

+  const int tmp_ystride = 64;

+  int weight;

+  int edge[4];

+  int block_size = 16 <<  xd->mode_info_context->mbmi.sb_type;

+  if (!use_second_ref) return 0;

+  if (!(xd->up_available || xd->left_available))

+    return AVERAGE_WEIGHT;

+  edge[0] = xd->mb_to_top_edge;

+  edge[1] = xd->mb_to_bottom_edge;

+  edge[2] = xd->mb_to_left_edge;

+  edge[3] = xd->mb_to_right_edge;

+  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_secondmv;

+  base_pre = xd->second_pre.y_buffer;

+  pre_stride = xd->second_pre.y_stride;

+  ymv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;

+  // First generate the second predictor

+  for (n = 0; n < block_size; n += 16) {

+    xd->mb_to_left_edge   = edge[2] - (n << 3);

+    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&ymv.as_mv, xd);

+    set_scaled_offsets(&xd->scale_factor[1], mb_row * 16, mb_col * 16 + n);

+    // predict a single row of pixels

+    vp9_build_inter_predictor(

+        base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[1]),

+        pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[1],

+        16, 1, 0, &xd->subpix);

+  }

+  xd->mb_to_left_edge = edge[2];

+  xd->mb_to_right_edge = edge[3];

+  for (n = 0; n < block_size; n += 16) {

+    xd->mb_to_top_edge    = edge[0] - (n << 3);

+    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&ymv.as_mv, xd);

+    set_scaled_offsets(&xd->scale_factor[1], mb_row * 16 + n, mb_col * 16);

+    // predict a single col of pixels

+    vp9_build_inter_predictor(

+        base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[1]),

+        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,

+        &xd->scale_factor[1], 1, 16, 0, &xd->subpix);

+  }

+  xd->mb_to_top_edge = edge[0];

+  xd->mb_to_bottom_edge = edge[1];

+  // Compute consistency metric

+  metric_2 = get_consistency_metric(xd, tmp_y, tmp_ystride);

+  clamp_mvs = xd->mode_info_context->mbmi.need_to_clamp_mvs;

+  base_pre = xd->pre.y_buffer;

+  pre_stride = xd->pre.y_stride;

   ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;

+  // Now generate the first predictor

+  for (n = 0; n < block_size; n += 16) {

+    xd->mb_to_left_edge   = edge[2] - (n << 3);

+    xd->mb_to_right_edge  = edge[3] + ((16 - n) << 3);

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&ymv.as_mv, xd);

+    set_scaled_offsets(&xd->scale_factor[0], mb_row * 16, mb_col * 16 + n);

+    // predict a single row of pixels

+    vp9_build_inter_predictor(

+        base_pre + scaled_buffer_offset(n, 0, pre_stride, &xd->scale_factor[0]),

+        pre_stride, tmp_y + n, tmp_ystride, &ymv, &xd->scale_factor[0],

+        16, 1, 0, &xd->subpix);

+  }

+  xd->mb_to_left_edge = edge[2];

+  xd->mb_to_right_edge = edge[3];

+  for (n = 0; n < block_size; n += 16) {

+    xd->mb_to_top_edge    = edge[0] - (n << 3);

+    xd->mb_to_bottom_edge = edge[1] + ((16 - n) << 3);

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&ymv.as_mv, xd);

+    set_scaled_offsets(&xd->scale_factor[0], mb_row * 16 + n, mb_col * 16);

+    // predict a single col of pixels

+    vp9_build_inter_predictor(

+        base_pre + scaled_buffer_offset(0, n, pre_stride, &xd->scale_factor[0]),

+        pre_stride, tmp_y + n * tmp_ystride, tmp_ystride, &ymv,

+        &xd->scale_factor[0], 1, 16, 0, &xd->subpix);

+  }

+  xd->mb_to_top_edge = edge[0];

+  xd->mb_to_bottom_edge = edge[1];

+  metric_1 = get_consistency_metric(xd, tmp_y, tmp_ystride);

-  if (clamp_mvs)

-    clamp_mv_to_umv_border(&ymv.as_mv, xd);

+  // Choose final weight for averaging

+  weight = get_weight(xd, metric_1, metric_2);

+  return weight;

+}

-  ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);

+static void build_inter16x16_predictors_mby_w(MACROBLOCKD *xd,

+                                              uint8_t *dst_y,

+                                              int dst_ystride,

+                                              int weight,

+                                              int mb_row,

+                                              int mb_col) {

+  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;

+  int which_mv;

-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {

-      xd->subpixel_predict16x16(ptr, pre_stride,

-                                (ymv.as_mv.col & 7) << 1,

-                                (ymv.as_mv.row & 7) << 1,

-                                dst_y, dst_ystride);

-    } else {

-      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);

-    }

+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+    const int clamp_mvs = which_mv ?

+        xd->mode_info_context->mbmi.need_to_clamp_secondmv :

+         xd->mode_info_context->mbmi.need_to_clamp_mvs;

+    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;

+    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;

+    int_mv ymv;

+    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&ymv.as_mv, xd);

+    set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16);

+    vp9_build_inter_predictor(base_pre, pre_stride,

+                              dst_y, dst_ystride,

+                              &ymv, &xd->scale_factor[which_mv],

+                              16, 16, which_mv ? weight : 0, &xd->subpix);

+  }

-void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

-                                              uint8_t *dst_u,

-                                              uint8_t *dst_v,

-                                              int dst_uvstride) {

-  int offset;

-  uint8_t *uptr, *vptr;

-  int pre_stride = xd->block[0].pre_stride;

-  int_mv _o16x16mv;

-  int_mv _16x16mv;

+void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,

+                                         uint8_t *dst_y,

+                                         int dst_ystride,

+                                         int mb_row,

+                                         int mb_col) {

+  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);

-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int;

+  build_inter16x16_predictors_mby_w(xd, dst_y, dst_ystride, weight,

+                                    mb_row, mb_col);

+}

-  if (xd->mode_info_context->mbmi.need_to_clamp_mvs)

-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);

+#else

-  _o16x16mv = _16x16mv;

-  /* calc uv motion vectors */

-  if (_16x16mv.as_mv.row < 0)

-    _16x16mv.as_mv.row -= 1;

-  else

-    _16x16mv.as_mv.row += 1;

+void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,

+                                         uint8_t *dst_y,

+                                         int dst_ystride,

+                                         int mb_row,

+                                         int mb_col) {

+  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;

+  int which_mv;

-  if (_16x16mv.as_mv.col < 0)

-    _16x16mv.as_mv.col -= 1;

-  else

-    _16x16mv.as_mv.col += 1;

+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+    const int clamp_mvs = which_mv ?

+         xd->mode_info_context->mbmi.need_to_clamp_secondmv :

+         xd->mode_info_context->mbmi.need_to_clamp_mvs;

-  _16x16mv.as_mv.row /= 2;

-  _16x16mv.as_mv.col /= 2;

+    uint8_t *base_pre = which_mv ? xd->second_pre.y_buffer : xd->pre.y_buffer;

+    int pre_stride = which_mv ? xd->second_pre.y_stride : xd->pre.y_stride;

+    int_mv ymv;

+    ymv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;

-  _16x16mv.as_mv.row &= xd->fullpixel_mask;

-  _16x16mv.as_mv.col &= xd->fullpixel_mask;

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&ymv.as_mv, xd);

-  pre_stride >>= 1;

-  offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3);

-  uptr = xd->pre.u_buffer + offset;

-  vptr = xd->pre.v_buffer + offset;

+    set_scaled_offsets(&xd->scale_factor[which_mv], mb_row * 16, mb_col * 16);

-    if (_o16x16mv.as_int & 0x000f000f) {

-      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,

-                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);

-      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,

-                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);

-    } else {

-      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);

-      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);

+    vp9_build_inter_predictor(base_pre, pre_stride,

+                              dst_y, dst_ystride,

+                              &ymv, &xd->scale_factor[which_mv],

+                              16, 16, which_mv, &xd->subpix);

+  }

+}

+#endif

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+static void build_inter16x16_predictors_mbuv_w(MACROBLOCKD *xd,

+                                               uint8_t *dst_u,

+                                               uint8_t *dst_v,

+                                               int dst_uvstride,

+                                               int weight,

+                                               int mb_row,

+                                               int mb_col) {

+  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;

+  int which_mv;

+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+    const int clamp_mvs =

+        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv

+                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;

+    uint8_t *uptr, *vptr;

+    int pre_stride = which_mv ? xd->second_pre.uv_stride

+                              : xd->pre.uv_stride;

+    int_mv _o16x16mv;

+    int_mv _16x16mv;

+    _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);

+    _o16x16mv = _16x16mv;

+    /* calc uv motion vectors */

+    if (_16x16mv.as_mv.row < 0)

+      _16x16mv.as_mv.row -= 1;

+    else

+      _16x16mv.as_mv.row += 1;

+    if (_16x16mv.as_mv.col < 0)

+      _16x16mv.as_mv.col -= 1;

+    else

+      _16x16mv.as_mv.col += 1;

+    _16x16mv.as_mv.row /= 2;

+    _16x16mv.as_mv.col /= 2;

+    _16x16mv.as_mv.row &= xd->fullpixel_mask;

+    _16x16mv.as_mv.col &= xd->fullpixel_mask;

+    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);

+    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);

+    set_scaled_offsets(&xd->scale_factor_uv[which_mv],

+                       mb_row * 16, mb_col * 16);

+    vp9_build_inter_predictor_q4(

+        uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv,

+        &xd->scale_factor_uv[which_mv], 8, 8,

+        which_mv ? weight : 0, &xd->subpix);

+    vp9_build_inter_predictor_q4(

+        vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv,

+        &xd->scale_factor_uv[which_mv], 8, 8,

+        which_mv ? weight : 0, &xd->subpix);

+  }

+}

+void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

+                                          uint8_t *dst_u,

+                                          uint8_t *dst_v,

+                                          int dst_uvstride,

+                                          int mb_row,

+                                          int mb_col) {

+#ifdef USE_IMPLICIT_WEIGHT_UV

+  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);

+#else

+  int weight = AVERAGE_WEIGHT;

+#endif

+  build_inter16x16_predictors_mbuv_w(xd, dst_u, dst_v, dst_uvstride,

+                                     weight, mb_row, mb_col);

+}

+#else

+void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

+                                          uint8_t *dst_u,

+                                          uint8_t *dst_v,

+                                          int dst_uvstride,

+                                          int mb_row,

+                                          int mb_col) {

+  const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;

+  int which_mv;

+  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+    const int clamp_mvs =

+        which_mv ? xd->mode_info_context->mbmi.need_to_clamp_secondmv

+                 : xd->mode_info_context->mbmi.need_to_clamp_mvs;

+    uint8_t *uptr, *vptr;

+    int pre_stride = which_mv ? xd->second_pre.uv_stride

+                              : xd->pre.uv_stride;

+    int_mv _o16x16mv;

+    int_mv _16x16mv;

+    _16x16mv.as_int = xd->mode_info_context->mbmi.mv[which_mv].as_int;

+    if (clamp_mvs)

+      clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);

+    _o16x16mv = _16x16mv;

+    /* calc uv motion vectors */

+    if (_16x16mv.as_mv.row < 0)

+      _16x16mv.as_mv.row -= 1;

+    else

+      _16x16mv.as_mv.row += 1;

+    if (_16x16mv.as_mv.col < 0)

+      _16x16mv.as_mv.col -= 1;

+    else

+      _16x16mv.as_mv.col += 1;

+    _16x16mv.as_mv.row /= 2;

+    _16x16mv.as_mv.col /= 2;

+    _16x16mv.as_mv.row &= xd->fullpixel_mask;

+    _16x16mv.as_mv.col &= xd->fullpixel_mask;

+    uptr = (which_mv ? xd->second_pre.u_buffer : xd->pre.u_buffer);

+    vptr = (which_mv ? xd->second_pre.v_buffer : xd->pre.v_buffer);

+    set_scaled_offsets(&xd->scale_factor_uv[which_mv],

+                       mb_row * 16, mb_col * 16);

+    vp9_build_inter_predictor_q4(

+        uptr, pre_stride, dst_u, dst_uvstride, &_16x16mv, &_o16x16mv,

+        &xd->scale_factor_uv[which_mv], 8, 8,

+        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);

+    vp9_build_inter_predictor_q4(

+        vptr, pre_stride, dst_v, dst_uvstride, &_16x16mv, &_o16x16mv,

+        &xd->scale_factor_uv[which_mv], 8, 8,

+        which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT), &xd->subpix);

+  }

+}

+#endif

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+static void build_inter32x32_predictors_sby_w(MACROBLOCKD *x,

+                                              uint8_t *dst_y,

+                                              int dst_ystride,

+                                              int weight,

+                                              int mb_row,

+                                              int mb_col) {

+  uint8_t *y1 = x->pre.y_buffer;

+  uint8_t *y2 = x->second_pre.y_buffer;

+  int edge[4], n;

+  edge[0] = x->mb_to_top_edge;

+  edge[1] = x->mb_to_bottom_edge;

+  edge[2] = x->mb_to_left_edge;

+  edge[3] = x->mb_to_right_edge;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);

+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);

+    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);

+    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);

+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,

+                                                y_idx * 16,

+                                                x->pre.y_stride,

+                                                &x->scale_factor[0]);

+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+      x->second_pre.y_buffer = y2 +

+          scaled_buffer_offset(x_idx * 16,

+                               y_idx * 16,

+                               x->second_pre.y_stride,

+                               &x->scale_factor[1]);

+    build_inter16x16_predictors_mby_w(x,

+        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,

+        dst_ystride, weight, mb_row + y_idx, mb_col + x_idx);

+  }

+  x->mb_to_top_edge    = edge[0];

+  x->mb_to_bottom_edge = edge[1];

+  x->mb_to_left_edge   = edge[2];

+  x->mb_to_right_edge  = edge[3];

+  x->pre.y_buffer = y1;

+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+    x->second_pre.y_buffer = y2;

+  }

+void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x,

+                                         uint8_t *dst_y,

+                                         int dst_ystride,

+                                         int mb_row,

+                                         int mb_col) {

+  int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col);

+  build_inter32x32_predictors_sby_w(x, dst_y, dst_ystride, weight,

+                                    mb_row, mb_col);

+}

-void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,

-                                            uint8_t *dst_y,

-                                            uint8_t *dst_u,

-                                            uint8_t *dst_v,

-                                            int dst_ystride, int dst_uvstride) {

-  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,

-      xd->mode_info_context->mbmi.need_to_clamp_mvs);

-  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);

+#else

+// TODO(all): Can we use 32x32 specific implementations of this rather than

+// using 16x16 implementations ?

+void vp9_build_inter32x32_predictors_sby(MACROBLOCKD *x,

+                                         uint8_t *dst_y,

+                                         int dst_ystride,

+                                         int mb_row,

+                                         int mb_col) {

+  uint8_t *y1 = x->pre.y_buffer;

+  uint8_t *y2 = x->second_pre.y_buffer;

+  int edge[4], n;

+  edge[0] = x->mb_to_top_edge;

+  edge[1] = x->mb_to_bottom_edge;

+  edge[2] = x->mb_to_left_edge;

+  edge[3] = x->mb_to_right_edge;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);

+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);

+    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);

+    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);

+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 16,

+                                                y_idx * 16,

+                                                x->pre.y_stride,

+                                                &x->scale_factor[0]);

+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+      x->second_pre.y_buffer = y2 +

+          scaled_buffer_offset(x_idx * 16,

+                               y_idx * 16,

+                               x->second_pre.y_stride,

+                               &x->scale_factor[1]);

+    }

+    vp9_build_inter16x16_predictors_mby(x,

+        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,

+        dst_ystride, mb_row + y_idx, mb_col + x_idx);

+  }

+  x->mb_to_top_edge    = edge[0];

+  x->mb_to_bottom_edge = edge[1];

+  x->mb_to_left_edge   = edge[2];

+  x->mb_to_right_edge  = edge[3];

+  x->pre.y_buffer = y1;

+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+    x->second_pre.y_buffer = y2;

+  }

-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,

-                                        uint8_t *dst_y,

-                                        uint8_t *dst_u,

-                                        uint8_t *dst_v,

-                                        int dst_ystride,

-                                        int dst_uvstride) {

-  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;

-  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,

-          *v2 = x->second_pre.v_buffer;

+#endif

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+static void build_inter32x32_predictors_sbuv_w(MACROBLOCKD *x,

+                                               uint8_t *dst_u,

+                                               uint8_t *dst_v,

+                                               int dst_uvstride,

+                                               int weight,

+                                               int mb_row,

+                                               int mb_col) {

+  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;

+  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;

   int edge[4], n;

   edge[0] = x->mb_to_top_edge;

@@ -562,6 +1220,7 @@

   edge[3] = x->mb_to_right_edge;

   for (n = 0; n < 4; n++) {

+    int scaled_uv_offset;

     const int x_idx = n & 1, y_idx = n >> 1;

     x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);

@@ -569,43 +1228,130 @@

     x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);

     x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);

-    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;

-    x->pre.u_buffer = u1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;

-    x->pre.v_buffer = v1 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;

+    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,

+                                            y_idx * 8,

+                                            x->pre.uv_stride,

+                                            &x->scale_factor_uv[0]);

+    x->pre.u_buffer = u1 + scaled_uv_offset;

+    x->pre.v_buffer = v1 + scaled_uv_offset;

-    vp9_build_1st_inter16x16_predictors_mb(x,

-      dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,

-      dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,

-      dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,

-      dst_ystride, dst_uvstride);

     if (x->mode_info_context->mbmi.second_ref_frame > 0) {

-      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride  + x_idx * 16;

-      x->second_pre.u_buffer = u2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;

-      x->second_pre.v_buffer = v2 + y_idx *  8 * x->pre.uv_stride + x_idx *  8;

+      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,

+                                              y_idx * 8,

+                                              x->second_pre.uv_stride,

+                                              &x->scale_factor_uv[1]);

+      x->second_pre.u_buffer = u2 + scaled_uv_offset;

+      x->second_pre.v_buffer = v2 + scaled_uv_offset;

+    }

-      vp9_build_2nd_inter16x16_predictors_mb(x,

-        dst_y + y_idx * 16 * dst_ystride  + x_idx * 16,

+    build_inter16x16_predictors_mbuv_w(x,

         dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,

         dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,

-        dst_ystride, dst_uvstride);

-    }

+        dst_uvstride, weight, mb_row + y_idx, mb_col + x_idx);

+  x->mb_to_top_edge    = edge[0];

+  x->mb_to_bottom_edge = edge[1];

+  x->mb_to_left_edge   = edge[2];

+  x->mb_to_right_edge  = edge[3];

+  x->pre.u_buffer = u1;

+  x->pre.v_buffer = v1;

+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+    x->second_pre.u_buffer = u2;

+    x->second_pre.v_buffer = v2;

+  }

+}

+void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *xd,

+                                          uint8_t *dst_u,

+                                          uint8_t *dst_v,

+                                          int dst_uvstride,

+                                          int mb_row,

+                                          int mb_col) {

+#ifdef USE_IMPLICIT_WEIGHT_UV

+  int weight = get_implicit_compoundinter_weight(xd, mb_row, mb_col);

+#else

+  int weight = AVERAGE_WEIGHT;

+#endif

+  build_inter32x32_predictors_sbuv_w(xd, dst_u, dst_v, dst_uvstride,

+                                     weight, mb_row, mb_col);

+}

+#else

+void vp9_build_inter32x32_predictors_sbuv(MACROBLOCKD *x,

+                                          uint8_t *dst_u,

+                                          uint8_t *dst_v,

+                                          int dst_uvstride,

+                                          int mb_row,

+                                          int mb_col) {

+  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;

+  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;

+  int edge[4], n;

+  edge[0] = x->mb_to_top_edge;

+  edge[1] = x->mb_to_bottom_edge;

+  edge[2] = x->mb_to_left_edge;

+  edge[3] = x->mb_to_right_edge;

+  for (n = 0; n < 4; n++) {

+    int scaled_uv_offset;

+    const int x_idx = n & 1, y_idx = n >> 1;

+    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 16) << 3);

+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);

+    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 16) << 3);

+    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 16) << 3);

+    scaled_uv_offset = scaled_buffer_offset(x_idx * 8,

+                                            y_idx * 8,

+                                            x->pre.uv_stride,

+                                            &x->scale_factor_uv[0]);

+    x->pre.u_buffer = u1 + scaled_uv_offset;

+    x->pre.v_buffer = v1 + scaled_uv_offset;

+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+      scaled_uv_offset = scaled_buffer_offset(x_idx * 8,

+                                              y_idx * 8,

+                                              x->second_pre.uv_stride,

+                                              &x->scale_factor_uv[1]);

+      x->second_pre.u_buffer = u2 + scaled_uv_offset;

+      x->second_pre.v_buffer = v2 + scaled_uv_offset;

+    }

+    vp9_build_inter16x16_predictors_mbuv(x,

+        dst_u + y_idx *  8 * dst_uvstride + x_idx *  8,

+        dst_v + y_idx *  8 * dst_uvstride + x_idx *  8,

+        dst_uvstride, mb_row + y_idx, mb_col + x_idx);

+  }

   x->mb_to_top_edge    = edge[0];

   x->mb_to_bottom_edge = edge[1];

   x->mb_to_left_edge   = edge[2];

   x->mb_to_right_edge  = edge[3];

-  x->pre.y_buffer = y1;

   x->pre.u_buffer = u1;

   x->pre.v_buffer = v1;

   if (x->mode_info_context->mbmi.second_ref_frame > 0) {

-    x->second_pre.y_buffer = y2;

     x->second_pre.u_buffer = u2;

     x->second_pre.v_buffer = v2;

+}

+#endif

+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,

+                                        uint8_t *dst_y,

+                                        uint8_t *dst_u,

+                                        uint8_t *dst_v,

+                                        int dst_ystride,

+                                        int dst_uvstride,

+                                        int mb_row,

+                                        int mb_col) {

+  vp9_build_inter32x32_predictors_sby(x, dst_y, dst_ystride,

+                                      mb_row, mb_col);

+  vp9_build_inter32x32_predictors_sbuv(x, dst_u, dst_v, dst_uvstride,

+                                      mb_row, mb_col);

 #if CONFIG_COMP_INTERINTRA_PRED

   if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

     vp9_build_interintra_32x32_predictors_sb(

@@ -614,15 +1360,15 @@

 #endif

-void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,

-                                        uint8_t *dst_y,

-                                        uint8_t *dst_u,

-                                        uint8_t *dst_v,

-                                        int dst_ystride,

-                                        int dst_uvstride) {

-  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;

-  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,

-          *v2 = x->second_pre.v_buffer;

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+static void build_inter64x64_predictors_sby_w(MACROBLOCKD *x,

+                                              uint8_t *dst_y,

+                                              int dst_ystride,

+                                              int weight,

+                                              int mb_row,

+                                              int mb_col) {

+  uint8_t *y1 = x->pre.y_buffer;

+  uint8_t *y2 = x->second_pre.y_buffer;

   int edge[4], n;

   edge[0] = x->mb_to_top_edge;

@@ -638,21 +1384,22 @@

     x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);

     x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);

-    x->pre.y_buffer = y1 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;

-    x->pre.u_buffer = u1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;

-    x->pre.v_buffer = v1 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;

+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,

+                                                y_idx * 32,

+                                                x->pre.y_stride,

+                                                &x->scale_factor[0]);

     if (x->mode_info_context->mbmi.second_ref_frame > 0) {

-      x->second_pre.y_buffer = y2 + y_idx * 32 * x->pre.y_stride  + x_idx * 32;

-      x->second_pre.u_buffer = u2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;

-      x->second_pre.v_buffer = v2 + y_idx * 16 * x->pre.uv_stride + x_idx * 16;

+      x->second_pre.y_buffer = y2 +

+          scaled_buffer_offset(x_idx * 32,

+                               y_idx * 32,

+                               x->second_pre.y_stride,

+                               &x->scale_factor[1]);

-    vp9_build_inter32x32_predictors_sb(x,

+    build_inter32x32_predictors_sby_w(x,

         dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,

-        dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,

-        dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,

-        dst_ystride, dst_uvstride);

+        dst_ystride, weight, mb_row + y_idx * 2, mb_col + x_idx * 2);

   x->mb_to_top_edge    = edge[0];

@@ -661,324 +1408,392 @@

   x->mb_to_right_edge  = edge[3];

   x->pre.y_buffer = y1;

-  x->pre.u_buffer = u1;

-  x->pre.v_buffer = v1;

   if (x->mode_info_context->mbmi.second_ref_frame > 0) {

     x->second_pre.y_buffer = y2;

-    x->second_pre.u_buffer = u2;

-    x->second_pre.v_buffer = v2;

+}

-#if CONFIG_COMP_INTERINTRA_PRED

-  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

-    vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v,

-                                             dst_ystride, dst_uvstride);

-  }

-#endif

+void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x,

+                                         uint8_t *dst_y,

+                                         int dst_ystride,

+                                         int mb_row,

+                                         int mb_col) {

+  int weight = get_implicit_compoundinter_weight(x, mb_row, mb_col);

+  build_inter64x64_predictors_sby_w(x, dst_y, dst_ystride, weight,

+                                    mb_row, mb_col);

-/*

- * The following functions should be called after an initial

- * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().

- * It will run a second filter on a (different) ref

- * frame and average the result with the output of the

- * first filter. The second reference frame is stored

- * in x->second_pre (the reference frame index is in

- * x->mode_info_context->mbmi.second_ref_frame). The second

- * motion vector is x->mode_info_context->mbmi.second_mv.

- *

- * This allows blending prediction from two reference frames

- * which sometimes leads to better prediction than from a

- * single reference framer.

- */

-void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,

-                                             uint8_t *dst_y,

-                                             int dst_ystride) {

-  uint8_t *ptr;

+#else

-  int_mv _16x16mv;

-  int mv_row;

-  int mv_col;

+void vp9_build_inter64x64_predictors_sby(MACROBLOCKD *x,

+                                         uint8_t *dst_y,

+                                         int dst_ystride,

+                                         int mb_row,

+                                         int mb_col) {

+  uint8_t *y1 = x->pre.y_buffer;

+  uint8_t *y2 = x->second_pre.y_buffer;

+  int edge[4], n;

-  uint8_t *ptr_base = xd->second_pre.y_buffer;

-  int pre_stride = xd->block[0].pre_stride;

+  edge[0] = x->mb_to_top_edge;

+  edge[1] = x->mb_to_bottom_edge;

+  edge[2] = x->mb_to_left_edge;

+  edge[3] = x->mb_to_right_edge;

-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)

-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);

+    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);

+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);

+    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);

+    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);

-  mv_row = _16x16mv.as_mv.row;

-  mv_col = _16x16mv.as_mv.col;

+    x->pre.y_buffer = y1 + scaled_buffer_offset(x_idx * 32,

+                                                y_idx * 32,

+                                                x->pre.y_stride,

+                                                &x->scale_factor[0]);

-  ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);

+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+      x->second_pre.y_buffer = y2 +

+          scaled_buffer_offset(x_idx * 32,

+                               y_idx * 32,

+                               x->second_pre.y_stride,

+                               &x->scale_factor[1]);

+    }

-  if ((mv_row | mv_col) & 7) {

-    xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,

-                                  (mv_row & 7) << 1, dst_y, dst_ystride);

-  } else {

-    vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);

+    vp9_build_inter32x32_predictors_sby(x,

+        dst_y + y_idx * 32 * dst_ystride  + x_idx * 32,

+        dst_ystride, mb_row + y_idx * 2, mb_col + x_idx * 2);

+  x->mb_to_top_edge    = edge[0];

+  x->mb_to_bottom_edge = edge[1];

+  x->mb_to_left_edge   = edge[2];

+  x->mb_to_right_edge  = edge[3];

+  x->pre.y_buffer = y1;

+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+    x->second_pre.y_buffer = y2;

+  }

+#endif

-void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

-                                              uint8_t *dst_u,

-                                              uint8_t *dst_v,

-                                              int dst_uvstride) {

-  int offset;

-  uint8_t *uptr, *vptr;

+void vp9_build_inter64x64_predictors_sbuv(MACROBLOCKD *x,

+                                          uint8_t *dst_u,

+                                          uint8_t *dst_v,

+                                          int dst_uvstride,

+                                          int mb_row,

+                                          int mb_col) {

+  uint8_t *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;

+  uint8_t *u2 = x->second_pre.u_buffer, *v2 = x->second_pre.v_buffer;

+  int edge[4], n;

-  int_mv _16x16mv;

-  int mv_row;

-  int mv_col;

-  int omv_row, omv_col;

+  edge[0] = x->mb_to_top_edge;

+  edge[1] = x->mb_to_bottom_edge;

+  edge[2] = x->mb_to_left_edge;

+  edge[3] = x->mb_to_right_edge;

-  int pre_stride = xd->block[0].pre_stride;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    int scaled_uv_offset;

-  _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int;

+    x->mb_to_top_edge    = edge[0] -      ((y_idx  * 32) << 3);

+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 32) << 3);

+    x->mb_to_left_edge   = edge[2] -      ((x_idx  * 32) << 3);

+    x->mb_to_right_edge  = edge[3] + (((1 - x_idx) * 32) << 3);

-  if (xd->mode_info_context->mbmi.need_to_clamp_secondmv)

-    clamp_mv_to_umv_border(&_16x16mv.as_mv, xd);

+    scaled_uv_offset = scaled_buffer_offset(x_idx * 16,

+                                            y_idx * 16,

+                                            x->pre.uv_stride,

+                                            &x->scale_factor_uv[0]);

+    x->pre.u_buffer = u1 + scaled_uv_offset;

+    x->pre.v_buffer = v1 + scaled_uv_offset;

-  mv_row = _16x16mv.as_mv.row;

-  mv_col = _16x16mv.as_mv.col;

+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+      scaled_uv_offset = scaled_buffer_offset(x_idx * 16,

+                                              y_idx * 16,

+                                              x->second_pre.uv_stride,

+                                              &x->scale_factor_uv[1]);

+      x->second_pre.u_buffer = u2 + scaled_uv_offset;

+      x->second_pre.v_buffer = v2 + scaled_uv_offset;

+    }

-  /* calc uv motion vectors */

-  omv_row = mv_row;

-  omv_col = mv_col;

-  mv_row = (mv_row + (mv_row > 0)) >> 1;

-  mv_col = (mv_col + (mv_col > 0)) >> 1;

+    vp9_build_inter32x32_predictors_sbuv(x,

+        dst_u + y_idx * 16 * dst_uvstride + x_idx * 16,

+        dst_v + y_idx * 16 * dst_uvstride + x_idx * 16,

+        dst_uvstride, mb_row + y_idx * 2, mb_col + x_idx * 2);

+  }

-  mv_row &= xd->fullpixel_mask;

-  mv_col &= xd->fullpixel_mask;

+  x->mb_to_top_edge    = edge[0];

+  x->mb_to_bottom_edge = edge[1];

+  x->mb_to_left_edge   = edge[2];

+  x->mb_to_right_edge  = edge[3];

-  pre_stride >>= 1;

-  offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);

-  uptr = xd->second_pre.u_buffer + offset;

-  vptr = xd->second_pre.v_buffer + offset;

+  x->pre.u_buffer = u1;

+  x->pre.v_buffer = v1;

-    if ((omv_row | omv_col) & 15) {

-      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,

-                                  omv_row & 15, dst_u, dst_uvstride);

-      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,

-                                  omv_row & 15, dst_v, dst_uvstride);

-    } else {

-      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);

-      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);

-    }

+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {

+    x->second_pre.u_buffer = u2;

+    x->second_pre.v_buffer = v2;

+  }

-void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,

-                                            uint8_t *dst_y,

-                                            uint8_t *dst_u,

-                                            uint8_t *dst_v,

-                                            int dst_ystride,

-                                            int dst_uvstride) {

-  vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride);

-  vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);

+void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,

+                                        uint8_t *dst_y,

+                                        uint8_t *dst_u,

+                                        uint8_t *dst_v,

+                                        int dst_ystride,

+                                        int dst_uvstride,

+                                        int mb_row,

+                                        int mb_col) {

+  vp9_build_inter64x64_predictors_sby(x, dst_y, dst_ystride,

+                                      mb_row, mb_col);

+  vp9_build_inter64x64_predictors_sbuv(x, dst_u, dst_v, dst_uvstride,

+                                       mb_row, mb_col);

+#if CONFIG_COMP_INTERINTRA_PRED

+  if (x->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

+    vp9_build_interintra_64x64_predictors_sb(x, dst_y, dst_u, dst_v,

+                                             dst_ystride, dst_uvstride);

+  }

+#endif

-static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {

+static void build_inter4x4_predictors_mb(MACROBLOCKD *xd,

+                                         int mb_row, int mb_col) {

   int i;

   MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;

   BLOCKD *blockd = xd->block;

+  int which_mv = 0;

+  const int use_second_ref = mbmi->second_ref_frame > 0;

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && defined(USE_IMPLICIT_WEIGHT_SPLITMV)

+  int weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);

+#else

+  int weight = AVERAGE_WEIGHT;

+#endif

   if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) {

-    blockd[ 0].bmi = xd->mode_info_context->bmi[ 0];

-    blockd[ 2].bmi = xd->mode_info_context->bmi[ 2];

-    blockd[ 8].bmi = xd->mode_info_context->bmi[ 8];

-    blockd[10].bmi = xd->mode_info_context->bmi[10];

+    for (i = 0; i < 16; i += 8) {

+      BLOCKD *d0 = &blockd[i];

+      BLOCKD *d1 = &blockd[i + 2];

+      const int y = i & 8;

-    if (mbmi->need_to_clamp_mvs) {

-      clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd);

-      clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd);

-      clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd);

-      clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd);

-      if (mbmi->second_ref_frame > 0) {

-        clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd);

-        clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd);

-        clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd);

-        clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd);

-      }

-    }

+      blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];

+      blockd[i + 2].bmi = xd->mode_info_context->bmi[i + 2];

+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+        if (mbmi->need_to_clamp_mvs) {

+          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[which_mv].as_mv, xd);

+          clamp_mv_to_umv_border(&blockd[i + 2].bmi.as_mv[which_mv].as_mv, xd);

+        }

-    vp9_build_inter_predictors4b(xd, &blockd[ 0], 16);

-    vp9_build_inter_predictors4b(xd, &blockd[ 2], 16);

-    vp9_build_inter_predictors4b(xd, &blockd[ 8], 16);

-    vp9_build_inter_predictors4b(xd, &blockd[10], 16);

-    if (mbmi->second_ref_frame > 0) {

-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16);

-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16);

-      vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16);

-      vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16);

+        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 8, 16, which_mv,

+                                  which_mv ? weight : 0,

+                                  &xd->subpix, mb_row * 16 + y, mb_col * 16);

+      }

   } else {

     for (i = 0; i < 16; i += 2) {

       BLOCKD *d0 = &blockd[i];

       BLOCKD *d1 = &blockd[i + 1];

+      const int x = (i & 3) * 4;

+      const int y = (i >> 2) * 4;

       blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0];

       blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1];

-      if (mbmi->need_to_clamp_mvs) {

-        clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd);

-        clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd);

-        if (mbmi->second_ref_frame > 0) {

-          clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd);

-          clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd);

-        }

+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+        build_2x1_inter_predictor(d0, d1, xd->scale_factor, 4, 16, which_mv,

+                                  which_mv ? weight : 0,

+                                  &xd->subpix,

+                                  mb_row * 16 + y, mb_col * 16 + x);

-      if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)

-        build_inter_predictors2b(xd, d0, 16);

-      else {

-        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);

-        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);

-      }

-      if (mbmi->second_ref_frame > 0) {

-        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);

-        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);

-      }

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+#if !defined(USE_IMPLICIT_WEIGHT_UV)

+  weight = AVERAGE_WEIGHT;

+#endif

+#endif

   for (i = 16; i < 24; i += 2) {

     BLOCKD *d0 = &blockd[i];

     BLOCKD *d1 = &blockd[i + 1];

+    const int x = 4 * (i & 1);

+    const int y = ((i - 16) >> 1) * 4;

-    if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)

-      build_inter_predictors2b(xd, d0, 8);

-    else {

-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);

-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);

+    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,

+                                which_mv ? weight : 0, &xd->subpix,

+                                mb_row * 8 + y, mb_col * 8 + x);

-    if (mbmi->second_ref_frame > 0) {

-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);

-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);

-    }

-static

-void build_4x4uvmvs(MACROBLOCKD *xd) {

-  int i, j;

-  BLOCKD *blockd = xd->block;

+static INLINE int round_mv_comp(int value) {

+  return (value < 0 ? value - 4 : value + 4) / 8;

+}

-  for (i = 0; i < 2; i++) {

-    for (j = 0; j < 2; j++) {

-      int yoffset = i * 8 + j * 2;

-      int uoffset = 16 + i * 2 + j;

-      int voffset = 20 + i * 2 + j;

+static int mi_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {

+  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.row +

+                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.row +

+                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.row +

+                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.row;

+  return round_mv_comp(temp) & mb->fullpixel_mask;

+}

-      int temp;

+static int mi_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {

+  const int temp = mb->mode_info_context->bmi[off + 0].as_mv[idx].as_mv.col +

+                   mb->mode_info_context->bmi[off + 1].as_mv[idx].as_mv.col +

+                   mb->mode_info_context->bmi[off + 4].as_mv[idx].as_mv.col +

+                   mb->mode_info_context->bmi[off + 5].as_mv[idx].as_mv.col;

+  return round_mv_comp(temp) & mb->fullpixel_mask;

+}

-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row

-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row

-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row

-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row;

+static int b_mv_pred_row(MACROBLOCKD *mb, int off, int idx) {

+  BLOCKD *const blockd = mb->block;

+  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.row +

+                   blockd[off + 1].bmi.as_mv[idx].as_mv.row +

+                   blockd[off + 4].bmi.as_mv[idx].as_mv.row +

+                   blockd[off + 5].bmi.as_mv[idx].as_mv.row;

+  return round_mv_comp(temp) & mb->fullpixel_mask;

+}

-      if (temp < 0) temp -= 4;

-      else temp += 4;

+static int b_mv_pred_col(MACROBLOCKD *mb, int off, int idx) {

+  BLOCKD *const blockd = mb->block;

+  const int temp = blockd[off + 0].bmi.as_mv[idx].as_mv.col +

+                   blockd[off + 1].bmi.as_mv[idx].as_mv.col +

+                   blockd[off + 4].bmi.as_mv[idx].as_mv.col +

+                   blockd[off + 5].bmi.as_mv[idx].as_mv.col;

+  return round_mv_comp(temp) & mb->fullpixel_mask;

+}

-      blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) &

-                                                  xd->fullpixel_mask;

-      temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col

-             + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col

-             + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col

-             + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col;

+static void build_4x4uvmvs(MACROBLOCKD *xd) {

+  int i, j;

+  BLOCKD *blockd = xd->block;

-      if (temp < 0) temp -= 4;

-      else temp += 4;

+  for (i = 0; i < 2; i++) {

+    for (j = 0; j < 2; j++) {

+      const int yoffset = i * 8 + j * 2;

+      const int uoffset = 16 + i * 2 + j;

+      const int voffset = 20 + i * 2 + j;

-      blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) &

-        xd->fullpixel_mask;

+      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;

+      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;

+      u->row = mi_mv_pred_row(xd, yoffset, 0);

+      u->col = mi_mv_pred_col(xd, yoffset, 0);

       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)

-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);

+      clamp_uvmv_to_umv_border(u, xd);

       // if (x->mode_info_context->mbmi.need_to_clamp_mvs)

-      clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd);

+      clamp_uvmv_to_umv_border(u, xd);

-      blockd[voffset].bmi.as_mv.first.as_mv.row =

-        blockd[uoffset].bmi.as_mv.first.as_mv.row;

-      blockd[voffset].bmi.as_mv.first.as_mv.col =

-        blockd[uoffset].bmi.as_mv.first.as_mv.col;

+      v->row = u->row;

+      v->col = u->col;

       if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row

-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row

-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row

-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row;

+        u = &blockd[uoffset].bmi.as_mv[1].as_mv;

+        v = &blockd[voffset].bmi.as_mv[1].as_mv;

+        u->row = mi_mv_pred_row(xd, yoffset, 1);

+        u->col = mi_mv_pred_col(xd, yoffset, 1);

-        if (temp < 0) {

-          temp -= 4;

-        } else {

-          temp += 4;

-        }

-       blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) &

-                                                    xd->fullpixel_mask;

-        temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col

-               + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col

-               + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col

-               + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col;

-        if (temp < 0) {

-          temp -= 4;

-        } else {

-          temp += 4;

-        }

-        blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) &

-                                                        xd->fullpixel_mask;

         // if (mbmi->need_to_clamp_mvs)

-        clamp_uvmv_to_umv_border(

-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);

+        clamp_uvmv_to_umv_border(u, xd);

         // if (mbmi->need_to_clamp_mvs)

-        clamp_uvmv_to_umv_border(

-          &blockd[uoffset].bmi.as_mv.second.as_mv, xd);

+        clamp_uvmv_to_umv_border(u, xd);

-        blockd[voffset].bmi.as_mv.second.as_mv.row =

-          blockd[uoffset].bmi.as_mv.second.as_mv.row;

-        blockd[voffset].bmi.as_mv.second.as_mv.col =

-          blockd[uoffset].bmi.as_mv.second.as_mv.col;

+        v->row = u->row;

+        v->col = u->col;

-void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) {

-  if (xd->mode_info_context->mbmi.mode != SPLITMV) {

-    vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor,

-                                           &xd->predictor[256],

-                                           &xd->predictor[320], 16, 8);

-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

-      /* 256 = offset of U plane in Y+U+V buffer;

-       * 320 = offset of V plane in Y+U+V buffer.

-       * (256=16x16, 320=16x16+8x8). */

-      vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor,

-                                             &xd->predictor[256],

-                                             &xd->predictor[320], 16, 8);

-    }

+void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,

+                                        uint8_t *dst_y,

+                                        uint8_t *dst_u,

+                                        uint8_t *dst_v,

+                                        int dst_ystride,

+                                        int dst_uvstride,

+                                        int mb_row,

+                                        int mb_col) {

+  vp9_build_inter16x16_predictors_mby(xd, dst_y, dst_ystride, mb_row, mb_col);

+  vp9_build_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride,

+                                       mb_row, mb_col);

 #if CONFIG_COMP_INTERINTRA_PRED

-    else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

-      vp9_build_interintra_16x16_predictors_mb(xd, xd->predictor,

-                                               &xd->predictor[256],

-                                               &xd->predictor[320], 16, 8);

-    }

+  if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

+    vp9_build_interintra_16x16_predictors_mb(xd, dst_y, dst_u, dst_v,

+                                             dst_ystride, dst_uvstride);

+  }

 #endif

+}

+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,

+                                   int mb_row,

+                                   int mb_col) {

+  if (xd->mode_info_context->mbmi.mode != SPLITMV) {

+    vp9_build_inter16x16_predictors_mb(xd, xd->predictor,

+                                       &xd->predictor[256],

+                                       &xd->predictor[320], 16, 8,

+                                       mb_row, mb_col);

   } else {

     build_4x4uvmvs(xd);

-    build_inter4x4_predictors_mb(xd);

+    build_inter4x4_predictors_mb(xd, mb_row, mb_col);

+  }

+}

+/*encoder only*/

+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,

+                                        int mb_row, int mb_col) {

+  int i, j, weight;

+  BLOCKD *const blockd = xd->block;

+  /* build uv mvs */

+  for (i = 0; i < 2; i++) {

+    for (j = 0; j < 2; j++) {

+      const int yoffset = i * 8 + j * 2;

+      const int uoffset = 16 + i * 2 + j;

+      const int voffset = 20 + i * 2 + j;

+      MV *u = &blockd[uoffset].bmi.as_mv[0].as_mv;

+      MV *v = &blockd[voffset].bmi.as_mv[0].as_mv;

+      v->row = u->row = b_mv_pred_row(xd, yoffset, 0);

+      v->col = u->col = b_mv_pred_col(xd, yoffset, 0);

+      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

+        u = &blockd[uoffset].bmi.as_mv[1].as_mv;

+        v = &blockd[voffset].bmi.as_mv[1].as_mv;

+        v->row = u->row = b_mv_pred_row(xd, yoffset, 1);

+        v->row = u->col = b_mv_pred_row(xd, yoffset, 1);

+      }

+    }

+  }

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT && \

+  defined(USE_IMPLICIT_WEIGHT_SPLITMV) && \

+  defined(USE_IMPLICIT_WEIGHT_UV)

+  weight = get_implicit_compoundinter_weight_splitmv(xd, mb_row, mb_col);

+#else

+  weight = AVERAGE_WEIGHT;

+#endif

+  for (i = 16; i < 24; i += 2) {

+    const int use_second_ref = xd->mode_info_context->mbmi.second_ref_frame > 0;

+    const int x = 4 * (i & 1);

+    const int y = ((i - 16) >> 1) * 4;

+    int which_mv;

+    BLOCKD *d0 = &blockd[i];

+    BLOCKD *d1 = &blockd[i + 1];

+    for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+      build_2x1_inter_predictor(d0, d1, xd->scale_factor_uv, 4, 8, which_mv,

+                                which_mv ? weight : 0,

+                                &xd->subpix, mb_row * 8 + y, mb_col * 8 + x);

+    }

--- a/vp9/common/vp9_reconinter.h

+++ b/vp9/common/vp9_reconinter.h

@@ -14,71 +14,128 @@

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_onyxc_int.h"

-extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,

-                                                    uint8_t *dst_y,

-                                                    int dst_ystride,

-                                                    int clamp_mvs);

+struct subpix_fn_table;

-extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

-                                                     uint8_t *dst_u,

-                                                     uint8_t *dst_v,

-                                                     int dst_uvstride);

+void vp9_build_inter16x16_predictors_mby(MACROBLOCKD *xd,

+                                         uint8_t *dst_y,

+                                         int dst_ystride,

+                                         int mb_row,

+                                         int mb_col);

-extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,

-                                                   uint8_t *dst_y,

-                                                   uint8_t *dst_u,

-                                                   uint8_t *dst_v,

-                                                   int dst_ystride,

-                                                   int dst_uvstride);

+void vp9_build_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

+                                          uint8_t *dst_u,

+                                          uint8_t *dst_v,

+                                          int dst_uvstride,

+                                          int mb_row,

+                                          int mb_col);

-extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,

-                                                    uint8_t *dst_y,

-                                                    int dst_ystride);

+void vp9_build_inter16x16_predictors_mb(MACROBLOCKD *xd,

+                                        uint8_t *dst_y,

+                                        uint8_t *dst_u,

+                                        uint8_t *dst_v,

+                                        int dst_ystride,

+                                        int dst_uvstride,

+                                        int mb_row,

+                                        int mb_col);

-extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

-                                                     uint8_t *dst_u,

-                                                     uint8_t *dst_v,

-                                                     int dst_uvstride);

+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,

+                                        uint8_t *dst_y,

+                                        uint8_t *dst_u,

+                                        uint8_t *dst_v,

+                                        int dst_ystride,

+                                        int dst_uvstride,

+                                        int mb_row,

+                                        int mb_col);

-extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,

-                                                   uint8_t *dst_y,

-                                                   uint8_t *dst_u,

-                                                   uint8_t *dst_v,

-                                                   int dst_ystride,

-                                                   int dst_uvstride);

+void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,

+                                        uint8_t *dst_y,

+                                        uint8_t *dst_u,

+                                        uint8_t *dst_v,

+                                        int dst_ystride,

+                                        int dst_uvstride,

+                                        int mb_row,

+                                        int mb_col);

-extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,

-                                               uint8_t *dst_y,

-                                               uint8_t *dst_u,

-                                               uint8_t *dst_v,

-                                               int dst_ystride,

-                                               int dst_uvstride);

+void vp9_build_inter_predictors_mb(MACROBLOCKD *xd,

+                                   int mb_row,

+                                   int mb_col);

-extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,

-                                               uint8_t *dst_y,

-                                               uint8_t *dst_u,

-                                               uint8_t *dst_v,

-                                               int dst_ystride,

-                                               int dst_uvstride);

+void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,

+                                        int mb_row,

+                                        int mb_col);

-extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);

+void vp9_setup_interp_filters(MACROBLOCKD *xd,

+                              INTERPOLATIONFILTERTYPE filter,

+                              VP9_COMMON *cm);

-extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,

-                                         vp9_subpix_fn_t sppf);

+void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,

+                                       YV12_BUFFER_CONFIG *other,

+                                       int this_w, int this_h);

-extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,

-                                             vp9_subpix_fn_t sppf);

+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int_mv *mv_q3,

+                               const struct scale_factors *scale,

+                               int w, int h, int do_avg,

+                               const struct subpix_fn_table *subpix);

-extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,

-                                         int pitch);

+void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride,

+                                  uint8_t *dst, int dst_stride,

+                                  const int_mv *fullpel_mv_q3,

+                                  const int_mv *frac_mv_q4,

+                                  const struct scale_factors *scale,

+                                  int w, int h, int do_avg,

+                                  const struct subpix_fn_table *subpix);

-extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,

-                                             BLOCKD *d, int pitch);

+static int scale_value_x(int val, const struct scale_factors *scale) {

+  return val * scale->x_num / scale->x_den;

+}

-extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd);

+static int scale_value_y(int val, const struct scale_factors *scale) {

+  return val * scale->y_num / scale->y_den;

+}

-extern void vp9_setup_interp_filters(MACROBLOCKD *xd,

-                                     INTERPOLATIONFILTERTYPE filter,

-                                     VP9_COMMON *cm);

+static int scaled_buffer_offset(int x_offset,

+                                int y_offset,

+                                int stride,

+                                const struct scale_factors *scale) {

+  return scale_value_y(y_offset, scale) * stride +

+      scale_value_x(x_offset, scale);

+}

+static void setup_pred_block(YV12_BUFFER_CONFIG *dst,

+                             const YV12_BUFFER_CONFIG *src,

+                             int mb_row, int mb_col,

+                             const struct scale_factors *scale,

+                             const struct scale_factors *scale_uv) {

+  const int recon_y_stride = src->y_stride;

+  const int recon_uv_stride = src->uv_stride;

+  int recon_yoffset;

+  int recon_uvoffset;

+  if (scale) {

+    recon_yoffset = scaled_buffer_offset(16 * mb_col, 16 * mb_row,

+                                         recon_y_stride, scale);

+    recon_uvoffset = scaled_buffer_offset(8 * mb_col, 8 * mb_row,

+                                          recon_uv_stride, scale_uv);

+  } else {

+    recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;

+    recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;

+  }

+  *dst = *src;

+  dst->y_buffer += recon_yoffset;

+  dst->u_buffer += recon_uvoffset;

+  dst->v_buffer += recon_uvoffset;

+}

+static void set_scale_factors(MACROBLOCKD *xd,

+    int ref0, int ref1,

+    struct scale_factors scale_factor[MAX_REF_FRAMES]) {

+  xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0];

+  xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0];

+  xd->scale_factor_uv[0] = xd->scale_factor[0];

+  xd->scale_factor_uv[1] = xd->scale_factor[1];

+}

 #endif  // VP9_COMMON_VP9_RECONINTER_H_

--- a/vp9/common/vp9_reconintra.c

+++ b/vp9/common/vp9_reconintra.c

@@ -9,59 +9,81 @@

*/

 #include <stdio.h>

 #include "./vpx_config.h"

 #include "vp9_rtcd.h"

 #include "vp9/common/vp9_reconintra.h"

 #include "vpx_mem/vpx_mem.h"

-/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)

- * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).

- */

+// For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)

+// and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).

+// Using multiplication and shifting instead of division in diagonal prediction.

+// iscale table is calculated from ((1 << 16) + (i + 2) / 2) / (i+2) and used as

+// ((A + B) * iscale[i] + (1 << 15)) >> 16;

+// where A and B are weighted pixel values.

+static const unsigned int iscale[64] = {

+  32768, 21845, 16384, 13107, 10923,  9362,  8192,  7282,

+   6554,  5958,  5461,  5041,  4681,  4369,  4096,  3855,

+   3641,  3449,  3277,  3121,  2979,  2849,  2731,  2621,

+   2521,  2427,  2341,  2260,  2185,  2114,  2048,  1986,

+   1928,  1872,  1820,  1771,  1725,  1680,  1638,  1598,

+   1560,  1524,  1489,  1456,  1425,  1394,  1365,  1337,

+   1311,  1285,  1260,  1237,  1214,  1192,  1170,  1150,

+   1130,  1111,  1092,  1074,  1057,  1040,  1024,  1008,

+};

+static INLINE int iscale_round(int value, int i) {

+    return ROUND_POWER_OF_TWO(value * iscale[i], 16);

+}

 static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,

                           uint8_t *yabove_row, uint8_t *yleft_col) {

-  int r, c, h, w, v;

-  int a, b;

+  int r, c;

   r = 0;

   for (c = 0; c < n - 2; c++) {

-    if (c & 1)

-      a = yleft_col[r + 1];

-    else

-      a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;

-    b = yabove_row[c + 2];

-    ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);

+    int a = c & 1 ? yleft_col[r + 1]

+                  : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);

+    int b = yabove_row[c + 2];

+    ypred_ptr[c] = iscale_round(2 * a + (c + 1) * b, 1 + c);

   for (r = 1; r < n / 2 - 1; r++) {

     for (c = 0; c < n - 2 - 2 * r; c++) {

-      if (c & 1)

-        a = yleft_col[r + 1];

-      else

-        a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;

-      b = ypred_ptr[(r - 1) * y_stride + c + 2];

-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);

+      int a = c & 1 ? yleft_col[r + 1]

+                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);

+      int b = ypred_ptr[(r - 1) * y_stride + c + 2];

+      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);

-  for (; r < n - 1; ++r) {

+  for (; r < n - 1; r++) {

     for (c = 0; c < n; c++) {

-      v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);

-      h = r - c / 2;

+      int v = c & 1 ? yleft_col[r + 1]

+                    : ROUND_POWER_OF_TWO(yleft_col[r] + yleft_col[r + 1], 1);

+      int h = r - c / 2;

       ypred_ptr[h * y_stride + c] = v;

   c = 0;

   r = n - 1;

-  ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +

-                             yleft_col[r] + 1) >> 1;

+  ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride] +

+                                               yleft_col[r], 1);

   for (r = n - 2; r >= n / 2; --r) {

-    w = c + (n - 1 - r) * 2;

-    ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +

-                                   ypred_ptr[r * y_stride + w - 1] + 1) >> 1;

+    int w = c + (n - 1 - r) * 2;

+    ypred_ptr[r * y_stride + w] =

+        ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +

+                           ypred_ptr[r * y_stride + w - 1], 1);

   for (c = 1; c < n; c++) {

     for (r = n - 1; r >= n / 2 + c / 2; --r) {

-      w = c + (n - 1 - r) * 2;

-      ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +

-                                     ypred_ptr[r * y_stride + w - 1] + 1) >> 1;

+      int w = c + (n - 1 - r) * 2;

+      ypred_ptr[r * y_stride + w] =

+          ROUND_POWER_OF_TWO(ypred_ptr[(r - 1) * y_stride + w] +

+                             ypred_ptr[r * y_stride + w - 1], 1);

@@ -68,47 +90,50 @@

 static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,

                           uint8_t *yabove_row, uint8_t *yleft_col) {

-  int r, c, h, w, v;

-  int a, b;

+  int r, c;

   c = 0;

   for (r = 0; r < n - 2; r++) {

-    if (r & 1)

-      a = yabove_row[c + 1];

-    else

-      a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;

-    b = yleft_col[r + 2];

-    ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);

+    int a = r & 1 ? yabove_row[c + 1]

+                  : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);

+    int b = yleft_col[r + 2];

+    ypred_ptr[r * y_stride] = iscale_round(2 * a + (r + 1) * b, 1 + r);

   for (c = 1; c < n / 2 - 1; c++) {

     for (r = 0; r < n - 2 - 2 * c; r++) {

-      if (r & 1)

-        a = yabove_row[c + 1];

-      else

-        a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;

-      b = ypred_ptr[(r + 2) * y_stride + c - 1];

-      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);

+      int a = r & 1 ? yabove_row[c + 1]

+                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);

+      int b = ypred_ptr[(r + 2) * y_stride + c - 1];

+      ypred_ptr[r * y_stride + c] = iscale_round(2 * a + (c + 1) * b, 1 + c);

   for (; c < n - 1; ++c) {

     for (r = 0; r < n; r++) {

-      v = (r & 1 ? yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1);

-      w = c - r / 2;

+      int v = r & 1 ? yabove_row[c + 1]

+                    : ROUND_POWER_OF_TWO(yabove_row[c] + yabove_row[c + 1], 1);

+      int w = c - r / 2;

       ypred_ptr[r * y_stride + w] = v;

   r = 0;

   c = n - 1;

-  ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1;

+  ypred_ptr[c] = ROUND_POWER_OF_TWO(ypred_ptr[(c - 1)] + yabove_row[c], 1);

   for (c = n - 2; c >= n / 2; --c) {

-    h = r + (n - 1 - c) * 2;

-    ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +

-                                   ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;

+    int h = r + (n - 1 - c) * 2;

+    ypred_ptr[h * y_stride + c] =

+         ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +

+                            ypred_ptr[(h - 1) * y_stride + c], 1);

   for (r = 1; r < n; r++) {

     for (c = n - 1; c >= n / 2 + r / 2; --c) {

-      h = r + (n - 1 - c) * 2;

-      ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] +

-                                     ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1;

+      int h = r + (n - 1 - c) * 2;

+      ypred_ptr[h * y_stride + c] =

+          ROUND_POWER_OF_TWO(ypred_ptr[h * y_stride + c - 1] +

+                             ypred_ptr[(h - 1) * y_stride + c], 1);

@@ -116,13 +141,14 @@

 static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n,

                           uint8_t *yabove_row, uint8_t *yleft_col) {

   int r, c;

   for (r = 0; r < n - 1; ++r) {

     for (c = 0; c <= r; ++c) {

-      ypred_ptr[(r - c) * y_stride + c] =

-        (yabove_row[r + 1] * (c + 1) +

-         yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2);

+      ypred_ptr[(r - c) * y_stride + c] = iscale_round(

+          yabove_row[r + 1] * (c + 1) + yleft_col[r + 1] * (r - c + 1), r);

   for (c = 0; c <= r; ++c) {

     int yabove_ext = yabove_row[r];  // clip_pixel(2 * yabove_row[r] -

                                      //            yabove_row[r - 1]);

@@ -129,14 +155,14 @@

     int yleft_ext = yleft_col[r];  // clip_pixel(2 * yleft_col[r] -

                                    //            yleft_col[r-1]);

     ypred_ptr[(r - c) * y_stride + c] =

-      (yabove_ext * (c + 1) +

-       yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2);

+         iscale_round(yabove_ext * (c + 1) + yleft_ext * (r - c + 1), r);

   for (r = 1; r < n; ++r) {

     for (c = n - r; c < n; ++c) {

       const int yabove_ext = ypred_ptr[(r - 1) * y_stride + c];

       const int yleft_ext = ypred_ptr[r * y_stride + c - 1];

-      ypred_ptr[r * y_stride + c] = (yabove_ext + yleft_ext + 1) >> 1;

+      ypred_ptr[r * y_stride + c] =

+          ROUND_POWER_OF_TWO(yabove_ext + yleft_ext, 1);

@@ -145,7 +171,7 @@

                            uint8_t *yabove_row, uint8_t *yleft_col) {

   int r, c;

   for (c = 0; c < n; c++)

-    ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1;

+    ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1);

   ypred_ptr += y_stride;

   for (c = 0; c < n; c++)

     ypred_ptr[c] = yabove_row[c - 1];

@@ -179,9 +205,10 @@

 static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n,

                            uint8_t *yabove_row, uint8_t *yleft_col) {

   int r, c;

-  ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1;

+  ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1);

   for (r = 1; r < n; r++)

-    ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1;

+    ypred_ptr[r * y_stride] =

+        ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1);

   ypred_ptr++;

   ypred_ptr[0] = yabove_row[-1];

   for (r = 1; r < n; r++)

@@ -248,20 +275,58 @@

+static INLINE int log2_minus_1(int n) {

+  switch (n) {

+    case 4: return 1;

+    case 8: return 2;

+    case 16: return 3;

+    case 32: return 4;

+    case 64: return 5;

+    default:

+      assert(0);

+      return 0;

+  }

+}

 void vp9_build_intra_predictors_internal(uint8_t *src, int src_stride,

                                          uint8_t *ypred_ptr,

                                          int y_stride, int mode, int bsize,

-                                         int up_available, int left_available) {

-  uint8_t *yabove_row = src - src_stride;

-  uint8_t yleft_col[64];

-  uint8_t ytop_left = yabove_row[-1];

+                                         int up_available, int left_available,

+                                         int right_available) {

   int r, c, i;

+  uint8_t yleft_col[64], yabove_data[65], ytop_left;

+  uint8_t *yabove_row = yabove_data + 1;

+  /*

+   * 127 127 127 .. 127 127 127 127 127 127

+   * 129  A   B  ..  Y   Z

+   * 129  C   D  ..  W   X

+   * 129  E   F  ..  U   V

+   * 129  G   H  ..  S   T   T   T   T   T

+   *  ..

+   */

-  for (i = 0; i < bsize; i++) {

-    yleft_col[i] = src[i * src_stride - 1];

+  if (left_available) {

+    for (i = 0; i < bsize; i++)

+      yleft_col[i] = src[i * src_stride - 1];

+  } else {

+    vpx_memset(yleft_col, 129, bsize);

+  if (up_available) {

+    uint8_t *yabove_ptr = src - src_stride;

+    vpx_memcpy(yabove_row, yabove_ptr, bsize);

+    if (left_available) {

+      ytop_left = yabove_ptr[-1];

+    } else {

+      ytop_left = 127;

+    }

+  } else {

+    vpx_memset(yabove_row, 127, bsize);

+    ytop_left = 127;

+  }

+  yabove_row[-1] = ytop_left;

   /* for Y */

   switch (mode) {

     case DC_PRED: {

@@ -269,23 +334,8 @@

       int i;

       int shift;

       int average = 0;

-      int log2_bsize_minus_1;

+      int log2_bsize_minus_1 = log2_minus_1(bsize);

-      assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32 ||

-             bsize == 64);

-      if (bsize == 4) {

-        log2_bsize_minus_1 = 1;

-      } else if (bsize == 8) {

-        log2_bsize_minus_1 = 2;

-      } else if (bsize == 16) {

-        log2_bsize_minus_1 = 3;

-      } else if (bsize == 32) {

-        log2_bsize_minus_1 = 4;

-      } else {

-        assert(bsize == 64);

-        log2_bsize_minus_1 = 5;

-      }

       if (up_available || left_available) {

         if (up_available) {

           for (i = 0; i < bsize; i++) {

@@ -299,7 +349,7 @@

         shift = log2_bsize_minus_1 + up_available + left_available;

-        expected_dc = (average + (1 << (shift - 1))) >> shift;

+        expected_dc = ROUND_POWER_OF_TWO(average, shift);

       } else {

         expected_dc = 128;

@@ -310,21 +360,19 @@

     break;

-    case V_PRED: {

+    case V_PRED:

       for (r = 0; r < bsize; r++) {

         memcpy(ypred_ptr, yabove_row, bsize);

         ypred_ptr += y_stride;

-    }

-    break;

-    case H_PRED: {

+      break;

+    case H_PRED:

       for (r = 0; r < bsize; r++) {

         vpx_memset(ypred_ptr, yleft_col[r], bsize);

         ypred_ptr += y_stride;

-    }

-    break;

-    case TM_PRED: {

+      break;

+    case TM_PRED:

       for (r = 0; r < bsize; r++) {

         for (c = 0; c < bsize; c++) {

           ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left);

@@ -332,32 +380,25 @@

         ypred_ptr += y_stride;

-    }

-    break;

-    case D45_PRED: {

+      break;

+    case D45_PRED:

       d45_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);

-    }

-    break;

-    case D135_PRED: {

+      break;

+    case D135_PRED:

       d135_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);

-    }

-    break;

-    case D117_PRED: {

+      break;

+    case D117_PRED:

       d117_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);

-    }

-    break;

-    case D153_PRED: {

+      break;

+    case D153_PRED:

       d153_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);

-    }

-    break;

-    case D27_PRED: {

+      break;

+    case D27_PRED:

       d27_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);

-    }

-    break;

-    case D63_PRED: {

+      break;

+    case D63_PRED:

       d63_predictor(ypred_ptr, y_stride, bsize,  yabove_row, yleft_col);

-    }

-    break;

+      break;

     case I8X8_PRED:

     case B_PRED:

     case NEARESTMV:

@@ -383,148 +424,21 @@

   static const int scale_max = 256;     // 1 << scale_bits;

   static const int scale_round = 127;   // (1 << (scale_bits - 1));

   // This table is a function A + B*exp(-kx), where x is hor. index

-  static const int weights1d[32] = {

-    128, 122, 116, 111, 107, 103,  99,  96,

-    93, 90, 88, 85, 83, 81, 80, 78,

-    77, 76, 75, 74, 73, 72, 71, 70,

-    70, 69, 69, 68, 68, 68, 67, 67,

+  static const int weights1d[64] = {

+    128, 125, 122, 119, 116, 114, 111, 109,

+    107, 105, 103, 101,  99,  97,  96,  94,

+     93,  91,  90,  89,  88,  86,  85,  84,

+     83,  82,  81,  81,  80,  79,  78,  78,

+     77,  76,  76,  75,  75,  74,  74,  73,

+     73,  72,  72,  71,  71,  71,  70,  70,

+     70,  70,  69,  69,  69,  69,  68,  68,

+     68,  68,  68,  67,  67,  67,  67,  67,

};

-  // This table is a function A + B*exp(-k.sqrt(xy)), where x, y are

-  // hor. and vert. indices

-  static const int weights2d[1024] = {

-    128, 128, 128, 128, 128, 128, 128, 128,

-    128, 128, 128, 128, 128, 128, 128, 128,

-    128, 128, 128, 128, 128, 128, 128, 128,

-    128, 128, 128, 128, 128, 128, 128, 128,

-    128, 122, 120, 118, 116, 115, 114, 113,

-    112, 111, 111, 110, 109, 109, 108, 107,

-    107, 106, 106, 105, 105, 104, 104, 104,

-    103, 103, 102, 102, 102, 101, 101, 101,

-    128, 120, 116, 114, 112, 111, 109, 108,

-    107, 106, 105, 104, 103, 102, 102, 101,

-    100, 100,  99,  99,  98,  97,  97,  96,

-    96,  96,  95,  95,  94,  94,  93,  93,

-    128, 118, 114, 111, 109, 107, 106, 104,

-    103, 102, 101, 100,  99,  98,  97,  97,

-    96,  95,  95,  94,  93,  93,  92,  92,

-    91,  91,  90,  90,  90,  89,  89,  88,

-    128, 116, 112, 109, 107, 105, 103, 102,

-    100,  99,  98,  97,  96,  95,  94,  93,

-    93,  92,  91,  91,  90,  90,  89,  89,

-    88,  88,  87,  87,  86,  86,  85,  85,

-    128, 115, 111, 107, 105, 103, 101,  99,

-    98,  97,  96,  94,  93,  93,  92,  91,

-    90,  89,  89,  88,  88,  87,  86,  86,

-    85,  85,  84,  84,  84,  83,  83,  82,

-    128, 114, 109, 106, 103, 101,  99,  97,

-    96,  95,  93,  92,  91,  90,  90,  89,

-    88,  87,  87,  86,  85,  85,  84,  84,

-    83,  83,  82,  82,  82,  81,  81,  80,

-    128, 113, 108, 104, 102,  99,  97,  96,

-    94,  93,  92,  91,  90,  89,  88,  87,

-    86,  85,  85,  84,  84,  83,  83,  82,

-    82,  81,  81,  80,  80,  79,  79,  79,

-    128, 112, 107, 103, 100,  98,  96,  94,

-    93,  91,  90,  89,  88,  87,  86,  85,

-    85,  84,  83,  83,  82,  82,  81,  80,

-    80,  80,  79,  79,  78,  78,  78,  77,

-    128, 111, 106, 102,  99,  97,  95,  93,

-    91,  90,  89,  88,  87,  86,  85,  84,

-    83,  83,  82,  81,  81,  80,  80,  79,

-    79,  78,  78,  77,  77,  77,  76,  76,

-    128, 111, 105, 101,  98,  96,  93,  92,

-    90,  89,  88,  86,  85,  84,  84,  83,

-    82,  81,  81,  80,  80,  79,  79,  78,

-    78,  77,  77,  76,  76,  76,  75,  75,

-    128, 110, 104, 100,  97,  94,  92,  91,

-    89,  88,  86,  85,  84,  83,  83,  82,

-    81,  80,  80,  79,  79,  78,  78,  77,

-    77,  76,  76,  75,  75,  75,  74,  74,

-    128, 109, 103,  99,  96,  93,  91,  90,

-    88,  87,  85,  84,  83,  82,  82,  81,

-    80,  79,  79,  78,  78,  77,  77,  76,

-    76,  75,  75,  75,  74,  74,  74,  73,

-    128, 109, 102,  98,  95,  93,  90,  89,

-    87,  86,  84,  83,  82,  81,  81,  80,

-    79,  78,  78,  77,  77,  76,  76,  75,

-    75,  75,  74,  74,  73,  73,  73,  73,

-    128, 108, 102,  97,  94,  92,  90,  88,

-    86,  85,  84,  83,  82,  81,  80,  79,

-    78,  78,  77,  77,  76,  76,  75,  75,

-    74,  74,  73,  73,  73,  73,  72,  72,

-    128, 107, 101,  97,  93,  91,  89,  87,

-    85,  84,  83,  82,  81,  80,  79,  78,

-    78,  77,  76,  76,  75,  75,  74,  74,

-    74,  73,  73,  73,  72,  72,  72,  71,

-    128, 107, 100,  96,  93,  90,  88,  86,

-    85,  83,  82,  81,  80,  79,  78,  78,

-    77,  76,  76,  75,  75,  74,  74,  73,

-    73,  73,  72,  72,  72,  71,  71,  71,

-    128, 106, 100,  95,  92,  89,  87,  85,

-    84,  83,  81,  80,  79,  78,  78,  77,

-    76,  76,  75,  75,  74,  74,  73,  73,

-    72,  72,  72,  72,  71,  71,  71,  70,

-    128, 106,  99,  95,  91,  89,  87,  85,

-    83,  82,  81,  80,  79,  78,  77,  76,

-    76,  75,  75,  74,  74,  73,  73,  72,

-    72,  72,  71,  71,  71,  71,  70,  70,

-    128, 105,  99,  94,  91,  88,  86,  84,

-    83,  81,  80,  79,  78,  77,  77,  76,

-    75,  75,  74,  74,  73,  73,  72,  72,

-    72,  71,  71,  71,  70,  70,  70,  70,

-    128, 105,  98,  93,  90,  88,  85,  84,

-    82,  81,  80,  79,  78,  77,  76,  75,

-    75,  74,  74,  73,  73,  72,  72,  71,

-    71,  71,  71,  70,  70,  70,  70,  69,

-    128, 104,  97,  93,  90,  87,  85,  83,

-    82,  80,  79,  78,  77,  76,  76,  75,

-    74,  74,  73,  73,  72,  72,  71,  71,

-    71,  70,  70,  70,  70,  69,  69,  69,

-    128, 104,  97,  92,  89,  86,  84,  83,

-    81,  80,  79,  78,  77,  76,  75,  74,

-    74,  73,  73,  72,  72,  71,  71,  71,

-    70,  70,  70,  70,  69,  69,  69,  69,

-    128, 104,  96,  92,  89,  86,  84,  82,

-    80,  79,  78,  77,  76,  75,  75,  74,

-    73,  73,  72,  72,  71,  71,  71,  70,

-    70,  70,  70,  69,  69,  69,  69,  68,

-    128, 103,  96,  91,  88,  85,  83,  82,

-    80,  79,  78,  77,  76,  75,  74,  74,

-    73,  72,  72,  72,  71,  71,  70,  70,

-    70,  70,  69,  69,  69,  69,  68,  68,

-    128, 103,  96,  91,  88,  85,  83,  81,

-    80,  78,  77,  76,  75,  75,  74,  73,

-    73,  72,  72,  71,  71,  70,  70,  70,

-    70,  69,  69,  69,  69,  68,  68,  68,

-    128, 102,  95,  90,  87,  84,  82,  81,

-    79,  78,  77,  76,  75,  74,  73,  73,

-    72,  72,  71,  71,  71,  70,  70,  70,

-    69,  69,  69,  69,  68,  68,  68,  68,

-    128, 102,  95,  90,  87,  84,  82,  80,

-    79,  77,  76,  75,  75,  74,  73,  73,

-    72,  72,  71,  71,  70,  70,  70,  69,

-    69,  69,  69,  68,  68,  68,  68,  68,

-    128, 102,  94,  90,  86,  84,  82,  80,

-    78,  77,  76,  75,  74,  73,  73,  72,

-    72,  71,  71,  70,  70,  70,  69,  69,

-    69,  69,  68,  68,  68,  68,  68,  67,

-    128, 101,  94,  89,  86,  83,  81,  79,

-    78,  77,  76,  75,  74,  73,  73,  72,

-    71,  71,  71,  70,  70,  69,  69,  69,

-    69,  68,  68,  68,  68,  68,  67,  67,

-    128, 101,  93,  89,  85,  83,  81,  79,

-    78,  76,  75,  74,  74,  73,  72,  72,

-    71,  71,  70,  70,  70,  69,  69,  69,

-    68,  68,  68,  68,  68,  67,  67,  67,

-    128, 101,  93,  88,  85,  82,  80,  79,

-    77,  76,  75,  74,  73,  73,  72,  71,

-    71,  70,  70,  70,  69,  69,  69,  68,

-    68,  68,  68,  68,  67,  67,  67,  67,

-  };

-  int size_scale = (size >= 32 ? 1 :

-                    size == 16 ? 2 :

-                    size == 8  ? 4 : 8);

-  int size_shift = size == 64 ? 1 : 0;

+  int size_scale = (size >= 64 ? 1:

+                    size == 32 ? 2 :

+                    size == 16 ? 4 :

+                    size == 8  ? 8 : 16);

   int i, j;

   switch (mode) {

     case V_PRED:

@@ -531,7 +445,7 @@

       for (i = 0; i < size; ++i) {

         for (j = 0; j < size; ++j) {

           int k = i * interstride + j;

-          int scale = weights1d[i * size_scale >> size_shift];

+          int scale = weights1d[i * size_scale];

           interpred[k] =

               ((scale_max - scale) * interpred[k] +

                scale * intrapred[i * intrastride + j] + scale_round)

@@ -544,7 +458,7 @@

       for (i = 0; i < size; ++i) {

         for (j = 0; j < size; ++j) {

           int k = i * interstride + j;

-          int scale = weights1d[j * size_scale >> size_shift];

+          int scale = weights1d[j * size_scale];

           interpred[k] =

               ((scale_max - scale) * interpred[k] +

                scale * intrapred[i * intrastride + j] + scale_round)

@@ -558,9 +472,8 @@

       for (i = 0; i < size; ++i) {

         for (j = 0; j < size; ++j) {

           int k = i * interstride + j;

-          int scale = (weights2d[(i * size_scale * 32 +

-                                  j * size_scale) >> size_shift] +

-                       weights1d[i * size_scale >> size_shift]) >> 1;

+          int scale = (weights1d[i * size_scale] * 3 +

+                       weights1d[j * size_scale]) >> 2;

           interpred[k] =

               ((scale_max - scale) * interpred[k] +

                scale * intrapred[i * intrastride + j] + scale_round)

@@ -574,9 +487,8 @@

       for (i = 0; i < size; ++i) {

         for (j = 0; j < size; ++j) {

           int k = i * interstride + j;

-          int scale = (weights2d[(i * size_scale * 32 +

-                                  j * size_scale) >> size_shift] +

-                       weights1d[j * size_scale >> size_shift]) >> 1;

+          int scale = (weights1d[j * size_scale] * 3 +

+                       weights1d[i * size_scale]) >> 2;

           interpred[k] =

               ((scale_max - scale) * interpred[k] +

                scale * intrapred[i * intrastride + j] + scale_round)

@@ -589,8 +501,7 @@

       for (i = 0; i < size; ++i) {

         for (j = 0; j < size; ++j) {

           int k = i * interstride + j;

-          int scale = weights2d[(i * size_scale * 32 +

-                                 j * size_scale) >> size_shift];

+          int scale = weights1d[(i < j ? i : j) * size_scale];

           interpred[k] =

               ((scale_max - scale) * interpred[k] +

                scale * intrapred[i * intrastride + j] + scale_round)

@@ -600,8 +511,21 @@

       break;

     case D45_PRED:

-    case DC_PRED:

+      for (i = 0; i < size; ++i) {

+        for (j = 0; j < size; ++j) {

+          int k = i * interstride + j;

+          int scale = (weights1d[i * size_scale] +

+                       weights1d[j * size_scale]) >> 1;

+          interpred[k] =

+              ((scale_max - scale) * interpred[k] +

+               scale * intrapred[i * intrastride + j] + scale_round)

+              >> scale_bits;

+        }

+      }

+      break;

     case TM_PRED:

+    case DC_PRED:

     default:

       // simple average

       for (i = 0; i < size; ++i) {

@@ -631,7 +555,7 @@

       xd->dst.y_buffer, xd->dst.y_stride,

       intrapredictor, 16,

       xd->mode_info_context->mbmi.interintra_mode, 16,

-      xd->up_available, xd->left_available);

+      xd->up_available, xd->left_available, xd->right_available);

   combine_interintra(xd->mode_info_context->mbmi.interintra_mode,

                      ypred, ystride, intrapredictor, 16, 16);

@@ -646,12 +570,12 @@

       xd->dst.u_buffer, xd->dst.uv_stride,

       uintrapredictor, 8,

       xd->mode_info_context->mbmi.interintra_uv_mode, 8,

-      xd->up_available, xd->left_available);

+      xd->up_available, xd->left_available, xd->right_available);

   vp9_build_intra_predictors_internal(

       xd->dst.v_buffer, xd->dst.uv_stride,

       vintrapredictor, 8,

       xd->mode_info_context->mbmi.interintra_uv_mode, 8,

-      xd->up_available, xd->left_available);

+      xd->up_available, xd->left_available, xd->right_available);

   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,

                      upred, uvstride, uintrapredictor, 8, 8);

   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,

@@ -666,7 +590,7 @@

       xd->dst.y_buffer, xd->dst.y_stride,

       intrapredictor, 32,

       xd->mode_info_context->mbmi.interintra_mode, 32,

-      xd->up_available, xd->left_available);

+      xd->up_available, xd->left_available, xd->right_available);

   combine_interintra(xd->mode_info_context->mbmi.interintra_mode,

                      ypred, ystride, intrapredictor, 32, 32);

@@ -681,12 +605,12 @@

       xd->dst.u_buffer, xd->dst.uv_stride,

       uintrapredictor, 16,

       xd->mode_info_context->mbmi.interintra_uv_mode, 16,

-      xd->up_available, xd->left_available);

+      xd->up_available, xd->left_available, xd->right_available);

   vp9_build_intra_predictors_internal(

       xd->dst.v_buffer, xd->dst.uv_stride,

       vintrapredictor, 16,

       xd->mode_info_context->mbmi.interintra_uv_mode, 16,

-      xd->up_available, xd->left_available);

+      xd->up_available, xd->left_available, xd->right_available);

   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,

                      upred, uvstride, uintrapredictor, 16, 16);

   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,

@@ -710,7 +634,8 @@

   const int mode = xd->mode_info_context->mbmi.interintra_mode;

   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,

                                       intrapredictor, 64, mode, 64,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

   combine_interintra(xd->mode_info_context->mbmi.interintra_mode,

                      ypred, ystride, intrapredictor, 64, 64);

@@ -724,10 +649,12 @@

   const int mode = xd->mode_info_context->mbmi.interintra_uv_mode;

   vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,

                                       uintrapredictor, 32, mode, 32,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

   vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,

                                       vintrapredictor, 32, mode, 32,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,

                      upred, uvstride, uintrapredictor, 32, 32);

   combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode,

@@ -749,7 +676,8 @@

   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,

                                       xd->predictor, 16,

                                       xd->mode_info_context->mbmi.mode, 16,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

 void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) {

@@ -756,7 +684,8 @@

   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,

                                       xd->dst.y_buffer, xd->dst.y_stride,

                                       xd->mode_info_context->mbmi.mode, 16,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

 void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) {

@@ -763,7 +692,8 @@

   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,

                                       xd->dst.y_buffer, xd->dst.y_stride,

                                       xd->mode_info_context->mbmi.mode, 32,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

 void vp9_build_intra_predictors_sb64y_s(MACROBLOCKD *xd) {

@@ -770,7 +700,8 @@

   vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,

                                       xd->dst.y_buffer, xd->dst.y_stride,

                                       xd->mode_info_context->mbmi.mode, 64,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

 void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,

@@ -780,10 +711,12 @@

                                               int mode, int bsize) {

   vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,

                                       upred_ptr, uv_stride, mode, bsize,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

   vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,

                                       vpred_ptr, uv_stride, mode, bsize,

-                                      xd->up_available, xd->left_available);

+                                      xd->up_available, xd->left_available,

+                                      xd->right_available);

 void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) {

@@ -815,20 +748,35 @@

                                            32);

-void vp9_intra8x8_predict(BLOCKD *xd,

+void vp9_intra8x8_predict(MACROBLOCKD *xd,

+                          BLOCKD *b,

                           int mode,

                           uint8_t *predictor) {

-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,

-                                      xd->dst_stride, predictor, 16,

-                                      mode, 8, 1, 1);

+  const int block4x4_idx = (b - xd->block);

+  const int block_idx = (block4x4_idx >> 2) | !!(block4x4_idx & 2);

+  const int have_top = (block_idx >> 1) || xd->up_available;

+  const int have_left = (block_idx & 1)  || xd->left_available;

+  const int have_right = !(block_idx & 1) || xd->right_available;

+  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,

+                                      b->dst_stride, predictor, 16,

+                                      mode, 8, have_top, have_left,

+                                      have_right);

-void vp9_intra_uv4x4_predict(BLOCKD *xd,

+void vp9_intra_uv4x4_predict(MACROBLOCKD *xd,

+                             BLOCKD *b,

                              int mode,

                              uint8_t *predictor) {

-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,

-                                      xd->dst_stride, predictor, 8,

-                                      mode, 4, 1, 1);

+  const int block_idx = (b - xd->block) & 3;

+  const int have_top = (block_idx >> 1) || xd->up_available;

+  const int have_left = (block_idx & 1)  || xd->left_available;

+  const int have_right = !(block_idx & 1) || xd->right_available;

+  vp9_build_intra_predictors_internal(*(b->base_dst) + b->dst,

+                                      b->dst_stride, predictor, 8,

+                                      mode, 4, have_top, have_left,

+                                      have_right);

 /* TODO: try different ways of use Y-UV mode correlation

--- a/vp9/common/vp9_reconintra.h

+++ b/vp9/common/vp9_reconintra.h

@@ -14,37 +14,44 @@

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_blockd.h"

-extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd);

-extern B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,

-                                                     int stride, int n);

-extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);

+void vp9_recon_intra_mbuv(MACROBLOCKD *xd);

+B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,

+                                              int stride, int n,

+                                              int tx, int ty);

+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x);

 #if CONFIG_COMP_INTERINTRA_PRED

-extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,

-                                                     uint8_t *ypred,

-                                                     uint8_t *upred,

-                                                     uint8_t *vpred,

-                                                     int ystride,

-                                                     int uvstride);

-extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,

-                                                      uint8_t *ypred,

-                                                      int ystride);

-extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,

-                                                       uint8_t *upred,

-                                                       uint8_t *vpred,

-                                                       int uvstride);

+void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,

+                                              uint8_t *ypred,

+                                              uint8_t *upred,

+                                              uint8_t *vpred,

+                                              int ystride,

+                                              int uvstride);

+void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,

+                                               uint8_t *ypred,

+                                               int ystride);

+void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,

+                                                uint8_t *upred,

+                                                uint8_t *vpred,

+                                                int uvstride);

 #endif  // CONFIG_COMP_INTERINTRA_PRED

-extern void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,

-                                                     uint8_t *ypred,

-                                                     uint8_t *upred,

-                                                     uint8_t *vpred,

-                                                     int ystride,

-                                                     int uvstride);

-extern void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,

-                                                     uint8_t *ypred,

-                                                     uint8_t *upred,

-                                                     uint8_t *vpred,

-                                                     int ystride,

-                                                     int uvstride);

+void vp9_build_interintra_32x32_predictors_sb(MACROBLOCKD *xd,

+                                              uint8_t *ypred,

+                                              uint8_t *upred,

+                                              uint8_t *vpred,

+                                              int ystride,

+                                              int uvstride);

+void vp9_build_interintra_64x64_predictors_sb(MACROBLOCKD *xd,

+                                              uint8_t *ypred,

+                                              uint8_t *upred,

+                                              uint8_t *vpred,

+                                              int ystride,

+                                              int uvstride);

 #endif  // VP9_COMMON_VP9_RECONINTRA_H_

--- a/vp9/common/vp9_reconintra4x4.c

+++ b/vp9/common/vp9_reconintra4x4.c

@@ -15,17 +15,17 @@

 #include "vp9_rtcd.h"

 #if CONFIG_NEWBINTRAMODES

-static int find_grad_measure(uint8_t *x, int stride, int n, int t,

+static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty,

                              int dx, int dy) {

   int i, j;

   int count = 0, gsum = 0, gdiv;

   /* TODO: Make this code more efficient by breaking up into two loops */

-  for (i = -t; i < n; ++i)

-    for (j = -t; j < n; ++j) {

+  for (i = -ty; i < n; ++i)

+    for (j = -tx; j < n; ++j) {

       int g;

       if (i >= 0 && j >= 0) continue;

       if (i + dy >= 0 && j + dx >= 0) continue;

-      if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue;

+      if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue;

       g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]);

       gsum += g * g;

       count++;

@@ -36,14 +36,15 @@

 #if CONTEXT_PRED_REPLACEMENTS == 6

 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,

-                                              int stride, int n) {

+                                              int stride, int n,

+                                              int tx, int ty) {

   int g[8], i, imin, imax;

-  g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);

-  g[2] = find_grad_measure(ptr, stride, n, 4,  1, 1);

-  g[3] = find_grad_measure(ptr, stride, n, 4,  1, 2);

-  g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);

-  g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);

-  g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);

+  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);

+  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);

+  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);

+  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);

+  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);

+  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);

   imin = 1;

   for (i = 2; i < 8; i += 1 + (i == 3))

     imin = (g[i] < g[imin] ? i : imin);

@@ -73,12 +74,13 @@

 #elif CONTEXT_PRED_REPLACEMENTS == 4

 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,

-                                              int stride, int n) {

+                                              int stride, int n,

+                                              int tx, int ty) {

   int g[8], i, imin, imax;

-  g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);

-  g[3] = find_grad_measure(ptr, stride, n, 4,  1, 2);

-  g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);

-  g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);

+  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);

+  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);

+  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);

+  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);

   imin = 1;

   for (i = 3; i < 8; i+=2)

     imin = (g[i] < g[imin] ? i : imin);

@@ -104,16 +106,17 @@

 #elif CONTEXT_PRED_REPLACEMENTS == 0

 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr,

-                                              int stride, int n) {

+                                              int stride, int n,

+                                              int tx, int ty) {

   int g[8], i, imin, imax;

-  g[0] = find_grad_measure(ptr, stride, n, 4,  1, 0);

-  g[1] = find_grad_measure(ptr, stride, n, 4,  2, 1);

-  g[2] = find_grad_measure(ptr, stride, n, 4,  1, 1);

-  g[3] = find_grad_measure(ptr, stride, n, 4,  1, 2);

-  g[4] = find_grad_measure(ptr, stride, n, 4,  0, 1);

-  g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2);

-  g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1);

-  g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1);

+  g[0] = find_grad_measure(ptr, stride, n, tx, ty,  1, 0);

+  g[1] = find_grad_measure(ptr, stride, n, tx, ty,  2, 1);

+  g[2] = find_grad_measure(ptr, stride, n, tx, ty,  1, 1);

+  g[3] = find_grad_measure(ptr, stride, n, tx, ty,  1, 2);

+  g[4] = find_grad_measure(ptr, stride, n, tx, ty,  0, 1);

+  g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2);

+  g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1);

+  g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1);

   imax = 0;

   for (i = 1; i < 8; i++)

     imax = (g[i] > g[imax] ? i : imax);

@@ -144,27 +147,114 @@

 #endif

-B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) {

+B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) {

+  const int block_idx = x - xd->block;

+  const int have_top = (block_idx >> 2) || xd->up_available;

+  const int have_left = (block_idx & 3)  || xd->left_available;

   uint8_t *ptr = *(x->base_dst) + x->dst;

   int stride = x->dst_stride;

-  return vp9_find_dominant_direction(ptr, stride, 4);

+  int tx = have_left ? 4 : 0;

+  int ty = have_top ? 4 : 0;

+  if (!have_left && !have_top)

+    return B_DC_PRED;

+  return vp9_find_dominant_direction(ptr, stride, 4, tx, ty);

 #endif

-void vp9_intra4x4_predict(BLOCKD *x,

+void vp9_intra4x4_predict(MACROBLOCKD *xd,

+                          BLOCKD *x,

                           int b_mode,

                           uint8_t *predictor) {

   int i, r, c;

+  const int block_idx = x - xd->block;

+  const int have_top = (block_idx >> 2) || xd->up_available;

+  const int have_left = (block_idx & 3)  || xd->left_available;

+  const int have_right = (block_idx & 3) != 3 || xd->right_available;

+  uint8_t left[4], above[8], top_left;

+  /*

+   * 127 127 127 .. 127 127 127 127 127 127

+   * 129  A   B  ..  Y   Z

+   * 129  C   D  ..  W   X

+   * 129  E   F  ..  U   V

+   * 129  G   H  ..  S   T   T   T   T   T

+   *  ..

+   */

-  uint8_t *above = *(x->base_dst) + x->dst - x->dst_stride;

-  uint8_t left[4];

-  uint8_t top_left = above[-1];

+  if (have_left) {

+    uint8_t *left_ptr = *(x->base_dst) + x->dst - 1;

+    const int stride = x->dst_stride;

-  left[0] = (*(x->base_dst))[x->dst - 1];

-  left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];

-  left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];

-  left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];

+    left[0] = left_ptr[0 * stride];

+    left[1] = left_ptr[1 * stride];

+    left[2] = left_ptr[2 * stride];

+    left[3] = left_ptr[3 * stride];

+  } else {

+    left[0] = left[1] = left[2] = left[3] = 129;

+  }

+  if (have_top) {

+    uint8_t *above_ptr = *(x->base_dst) + x->dst - x->dst_stride;

+    if (have_left) {

+      top_left = above_ptr[-1];

+    } else {

+      top_left = 127;

+    }

+    above[0] = above_ptr[0];

+    above[1] = above_ptr[1];

+    above[2] = above_ptr[2];

+    above[3] = above_ptr[3];

+    if (((block_idx & 3) != 3) ||

+        (have_right && block_idx == 3 &&

+         ((xd->mb_index != 3 && xd->sb_index != 3) ||

+          ((xd->mb_index & 1) == 0 && xd->sb_index == 3)))) {

+      above[4] = above_ptr[4];

+      above[5] = above_ptr[5];

+      above[6] = above_ptr[6];

+      above[7] = above_ptr[7];

+    } else if (have_right) {

+      uint8_t *above_right = above_ptr + 4;

+      if (xd->sb_index == 3 && (xd->mb_index & 1))

+        above_right -= 32 * x->dst_stride;

+      if (xd->mb_index == 3)

+        above_right -= 16 * x->dst_stride;

+      above_right -= (block_idx & ~3) * x->dst_stride;

+      /* use a more distant above-right (from closest available top-right

+       * corner), but with a "localized DC" (similar'ish to TM-pred):

+       *

+       *  A   B   C   D   E   F   G   H

+       *  I   J   K   L

+       *  M   N   O   P

+       *  Q   R   S   T

+       *  U   V   W   X   x1  x2  x3  x4

+       *

+       * Where:

+       * x1 = clip_pixel(E + X - D)

+       * x2 = clip_pixel(F + X - D)

+       * x3 = clip_pixel(G + X - D)

+       * x4 = clip_pixel(H + X - D)

+       *

+       * This is applied anytime when we use a "distant" above-right edge

+       * that is not immediately top-right to the block that we're going

+       * to do intra prediction for.

+       */

+      above[4] = clip_pixel(above_right[0] + above_ptr[3] - above_right[-1]);

+      above[5] = clip_pixel(above_right[1] + above_ptr[3] - above_right[-1]);

+      above[6] = clip_pixel(above_right[2] + above_ptr[3] - above_right[-1]);

+      above[7] = clip_pixel(above_right[3] + above_ptr[3] - above_right[-1]);

+    } else {

+      // extend edge

+      above[4] = above[5] = above[6] = above[7] = above[3];

+    }

+  } else {

+    above[0] = above[1] = above[2] = above[3] = 127;

+    above[4] = above[5] = above[6] = above[7] = 127;

+    top_left = 127;

+  }

 #if CONFIG_NEWBINTRAMODES

   if (b_mode == B_CONTEXT_PRED)

     b_mode = x->bmi.as_mode.context;

@@ -410,40 +500,4 @@

*/

 #endif

-}

-/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and

- * to the right prediction have filled in pixels to use.

- */

-void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) {

-  int extend_edge = xd->mb_to_right_edge == 0 && xd->mb_index < 2;

-  uint8_t *above_right = *(xd->block[0].base_dst) + xd->block[0].dst -

-                               xd->block[0].dst_stride + 16;

-  uint32_t *dst_ptr0 = (uint32_t *)above_right;

-  uint32_t *dst_ptr1 =

-    (uint32_t *)(above_right + 4 * xd->block[0].dst_stride);

-  uint32_t *dst_ptr2 =

-    (uint32_t *)(above_right + 8 * xd->block[0].dst_stride);

-  uint32_t *dst_ptr3 =

-    (uint32_t *)(above_right + 12 * xd->block[0].dst_stride);

-  uint32_t *src_ptr = (uint32_t *) above_right;

-  if ((xd->sb_index >= 2 && xd->mb_to_right_edge == 0) ||

-      (xd->sb_index == 3 && xd->mb_index & 1))

-    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 32 *

-                                                    xd->block[0].dst_stride);

-  if (xd->mb_index == 3 ||

-      (xd->mb_to_right_edge == 0 && xd->mb_index == 2))

-    src_ptr = (uint32_t *) (((uint8_t *) src_ptr) - 16 *

-                                                    xd->block[0].dst_stride);

-  if (extend_edge) {

-    *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U;

-  }

-  *dst_ptr0 = *src_ptr;

-  *dst_ptr1 = *src_ptr;

-  *dst_ptr2 = *src_ptr;

-  *dst_ptr3 = *src_ptr;

--- a/vp9/common/vp9_reconintra4x4.h

+++ /dev/null

@@ -1,17 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_COMMON_VP9_RECONINTRA4X4_H_

-#define VP9_COMMON_VP9_RECONINTRA4X4_H_

-extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd);

-#endif  // VP9_COMMON_VP9_RECONINTRA4X4_H_

--- a/vp9/common/vp9_rtcd.c

+++ b/vp9/common/vp9_rtcd.c

@@ -12,10 +12,9 @@

 #include "vp9_rtcd.h"

 #include "vpx_ports/vpx_once.h"

-extern void vpx_scale_rtcd(void);

+void vpx_scale_rtcd(void);

-void vp9_rtcd()

-{

+void vp9_rtcd() {

     vpx_scale_rtcd();

     once(setup_rtcd_internal);

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -23,90 +23,50 @@

 forward_decls vp9_common_forward_decls

-prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-# At the very least, MSVC 2008 has compiler bug exhibited by this code; code

-# compiles warning free but a dissassembly of generated code show bugs. To be

-# on the safe side, only enabled when compiled with 'gcc'.

-if [ "$CONFIG_GCC" = "yes" ]; then

-    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2

-fi

-    specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2

-    specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2

-    specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2

 # Dequant

-prototype void vp9_dequantize_b "struct blockd *x"

-specialize vp9_dequantize_b

-prototype void vp9_dequantize_b_2x2 "struct blockd *x"

-specialize vp9_dequantize_b_2x2

-prototype void vp9_dequant_dc_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dc, struct macroblockd *xd"

-specialize vp9_dequant_dc_idct_add_y_block_8x8

-prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, struct macroblockd *xd"

+prototype void vp9_dequant_idct_add_y_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"

 specialize vp9_dequant_idct_add_y_block_8x8

-prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs, struct macroblockd *xd"

+prototype void vp9_dequant_idct_add_uv_block_8x8 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"

 specialize vp9_dequant_idct_add_uv_block_8x8

 prototype void vp9_dequant_idct_add_16x16 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"

 specialize vp9_dequant_idct_add_16x16

-prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc, int eob"

+prototype void vp9_dequant_idct_add_8x8 "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"

 specialize vp9_dequant_idct_add_8x8

-prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride"

+prototype void vp9_dequant_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob"

 specialize vp9_dequant_idct_add

-prototype void vp9_dequant_dc_idct_add "int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc"

-specialize vp9_dequant_dc_idct_add

-prototype void vp9_dequant_dc_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs, const int16_t *dcs"

-specialize vp9_dequant_dc_idct_add_y_block

-prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, uint16_t *eobs"

+prototype void vp9_dequant_idct_add_y_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int stride, struct macroblockd *xd"

 specialize vp9_dequant_idct_add_y_block

-prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"

+prototype void vp9_dequant_idct_add_uv_block "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"

 specialize vp9_dequant_idct_add_uv_block

 prototype void vp9_dequant_idct_add_32x32 "int16_t *q, const int16_t *dq, uint8_t *pre, uint8_t *dst, int pitch, int stride, int eob"

 specialize vp9_dequant_idct_add_32x32

-prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, uint16_t *eobs"

+prototype void vp9_dequant_idct_add_uv_block_16x16 "int16_t *q, const int16_t *dq, uint8_t *dstu, uint8_t *dstv, int stride, struct macroblockd *xd"

 specialize vp9_dequant_idct_add_uv_block_16x16

 # RECON

-prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

+prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

 specialize vp9_copy_mem16x16 mmx sse2 dspr2

 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2

-prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

+prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

 specialize vp9_copy_mem8x8 mmx dspr2

 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2

-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

+prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

 specialize vp9_copy_mem8x4 mmx

-prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

-specialize vp9_avg_mem16x16

-prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

-specialize vp9_avg_mem8x8

-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

-specialize vp9_copy_mem8x4 mmx dspr2

-vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2

 prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"

 specialize vp9_recon_b

@@ -137,6 +97,12 @@

 prototype void vp9_recon_sbuv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"

 specialize void vp9_recon_sbuv_s

+prototype void vp9_recon_sb64y_s "struct macroblockd *x, uint8_t *dst"

+specialize vp9_recon_sb64y_s

+prototype void vp9_recon_sb64uv_s "struct macroblockd *x, uint8_t *udst, uint8_t *vdst"

+specialize void vp9_recon_sb64uv_s

 prototype void vp9_build_intra_predictors_mby_s "struct macroblockd *x"

 specialize vp9_build_intra_predictors_mby_s

@@ -164,15 +130,38 @@

 prototype void vp9_build_intra_predictors_sb64uv_s "struct macroblockd *x"

 specialize vp9_build_intra_predictors_sb64uv_s;

-prototype void vp9_intra4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"

+prototype void vp9_intra4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"

 specialize vp9_intra4x4_predict;

-prototype void vp9_intra8x8_predict "struct blockd *x, int b_mode, uint8_t *predictor"

+prototype void vp9_intra8x8_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"

 specialize vp9_intra8x8_predict;

-prototype void vp9_intra_uv4x4_predict "struct blockd *x, int b_mode, uint8_t *predictor"

+prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"

 specialize vp9_intra_uv4x4_predict;

+if [ "$CONFIG_VP9_DECODER" = "yes" ]; then

+prototype void vp9_add_residual_4x4 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"

+specialize vp9_add_residual_4x4 sse2

+prototype void vp9_add_residual_8x8 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"

+specialize vp9_add_residual_8x8 sse2

+prototype void vp9_add_residual_16x16 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"

+specialize vp9_add_residual_16x16 sse2

+prototype void vp9_add_residual_32x32 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"

+specialize vp9_add_residual_32x32 sse2

+prototype void vp9_add_constant_residual_8x8 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"

+specialize vp9_add_constant_residual_8x8 sse2

+prototype void vp9_add_constant_residual_16x16 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"

+specialize vp9_add_constant_residual_16x16 sse2

+prototype void vp9_add_constant_residual_32x32 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"

+specialize vp9_add_constant_residual_32x32 sse2

+fi

 # Loopfilter

@@ -263,171 +252,146 @@

 prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride"

 specialize vp9_sad3x16 sse2

-prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, const int source_stride, const int xoffset, const int  yoffset, const uint8_t *ref_ptr, const int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_variance16x2 sse2

 # Sub Pixel Filters

-prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict16x16

+prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8 ssse3

-prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x8

+prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_horiz ssse3

-prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg16x16

+prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_vert ssse3

-prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg8x8

+prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_avg ssse3

-prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg4x4

+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_avg_horiz ssse3

-prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x4

+prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_avg_vert ssse3

-prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict4x4

+#if CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT

+prototype void vp9_convolve8_1by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_1by8

-prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict16x16_sharp

+prototype void vp9_convolve8_qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_qtr

-prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x8_sharp

+prototype void vp9_convolve8_3by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_3by8

-prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg16x16_sharp

+prototype void vp9_convolve8_5by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_5by8

-prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg8x8_sharp

+prototype void vp9_convolve8_3qtr "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_3qtr

-prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg4x4_sharp

+prototype void vp9_convolve8_7by8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_7by8

-prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x4_sharp

+prototype void vp9_convolve8_1by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_1by8_horiz

-prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict4x4_sharp

+prototype void vp9_convolve8_qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_qtr_horiz

-prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict16x16_smooth

+prototype void vp9_convolve8_3by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_3by8_horiz

-prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x8_smooth

+prototype void vp9_convolve8_5by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_5by8_horiz

-prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg16x16_smooth

+prototype void vp9_convolve8_3qtr_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_3qtr_horiz

-prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg8x8_smooth

+prototype void vp9_convolve8_7by8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_7by8_horiz

-prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg4x4_smooth

+prototype void vp9_convolve8_1by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_1by8_vert

-prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x4_smooth

+prototype void vp9_convolve8_qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_qtr_vert

-prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict4x4_smooth

+prototype void vp9_convolve8_3by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_3by8_vert

-prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict16x16

+prototype void vp9_convolve8_5by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_5by8_vert

-prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict8x8

+prototype void vp9_convolve8_3qtr_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_3qtr_vert

-prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict_avg16x16

+prototype void vp9_convolve8_7by8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_7by8_vert

+#endif

-prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict_avg8x8

-prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict8x4

-prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict4x4

-prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict_avg4x4

-prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict16x16 sse2

-prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict8x8 sse2

-prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict_avg16x16

-prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict_avg8x8

-prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict8x4

-prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict4x4

-prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict_avg4x4

 # dct

-prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct4x4llm_1

+prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"

+specialize vp9_short_idct4x4_1

-prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct4x4llm

+prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"

+specialize vp9_short_idct4x4 sse2

 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct8x8

+specialize vp9_short_idct8x8 sse2

 prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct10_8x8

+specialize vp9_short_idct10_8x8 sse2

-prototype void vp9_short_ihaar2x2 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_ihaar2x2

+prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"

+specialize vp9_short_idct1_8x8

 prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct16x16

+specialize vp9_short_idct16x16 sse2

 prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct10_16x16

+specialize vp9_short_idct10_16x16 sse2

+prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"

+specialize vp9_short_idct1_16x16

 prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"

-specialize vp9_short_idct32x32

+specialize vp9_short_idct32x32 sse2

-prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"

-specialize vp9_ihtllm

+prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"

+specialize vp9_short_idct1_32x32

-#

-# 2nd order

-#

-prototype void vp9_short_inv_walsh4x4_1 "int16_t *in, int16_t *out"

-specialize vp9_short_inv_walsh4x4_1

+prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"

+specialize vp9_short_idct10_32x32

-prototype void vp9_short_inv_walsh4x4 "int16_t *in, int16_t *out"

-specialize vp9_short_inv_walsh4x4_

+prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"

+specialize vp9_short_iht8x8

+prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"

+specialize vp9_short_iht4x4

+prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"

+specialize vp9_short_iht16x16

+prototype void vp9_idct4_1d "int16_t *input, int16_t *output"

+specialize vp9_idct4_1d sse2

 # dct and add

-prototype void vp9_dc_only_idct_add_8x8 "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"

-specialize vp9_dc_only_idct_add_8x8

 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"

-specialize vp9_dc_only_idct_add

+specialize vp9_dc_only_idct_add sse2

-if [ "$CONFIG_LOSSLESS" = "yes" ]; then

-prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"

-prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch"

+prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"

+specialize vp9_short_iwalsh4x4_1

+prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"

+specialize vp9_short_iwalsh4x4

 prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"

-prototype void vp9_short_inv_walsh4x4_1_lossless "int16_t *in, int16_t *out"

-prototype void vp9_short_inv_walsh4x4_lossless "int16_t *in, int16_t *out"

-fi

+specialize vp9_dc_only_inv_walsh_add

 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"

 specialize vp9_sad32x3

@@ -475,58 +439,52 @@

 vp9_variance4x4_sse2=vp9_variance4x4_wmt

 vp9_variance4x4_mmx=vp9_variance4x4_mmx

-prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

-specialize vp9_sub_pixel_variance64x64

+prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

+specialize vp9_sub_pixel_variance64x64 sse2

-prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

-specialize vp9_sub_pixel_variance32x32

+prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

+specialize vp9_sub_pixel_variance32x32 sse2

-prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3

-vp9_sub_pixel_variance16x16_sse2=vp9_sub_pixel_variance16x16_wmt

-prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_variance8x16 sse2 mmx

 vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt

-prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3

 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;

 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt

-prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_variance8x8 sse2 mmx

 vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt

-prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_variance4x4 sse2 mmx

 vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt

 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"

-specialize vp9_sad64x64

+specialize vp9_sad64x64 sse2

 prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"

-specialize vp9_sad32x32

+specialize vp9_sad32x32 sse2

 prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"

-specialize vp9_sad16x16 mmx sse2 sse3

-vp9_sad16x16_sse2=vp9_sad16x16_wmt

+specialize vp9_sad16x16 mmx sse2

 prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"

 specialize vp9_sad16x8 mmx sse2

-vp9_sad16x8_sse2=vp9_sad16x8_wmt

 prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"

 specialize vp9_sad8x16 mmx sse2

-vp9_sad8x16_sse2=vp9_sad8x16_wmt

 prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"

 specialize vp9_sad8x8 mmx sse2

-vp9_sad8x8_sse2=vp9_sad8x8_wmt

 prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int max_sad"

-specialize vp9_sad4x4 mmx sse2

-vp9_sad4x4_sse2=vp9_sad4x4_wmt

+specialize vp9_sad4x4 mmx sse

 prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_variance_halfpixvar16x16_h mmx sse2

@@ -579,76 +537,64 @@

 prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array"

 specialize vp9_sad4x4x3 sse3

-prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"

+prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"

 specialize vp9_sad64x64x8

-prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"

+prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"

 specialize vp9_sad32x32x8

-prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"

+prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"

 specialize vp9_sad16x16x8 sse4

-prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"

+prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"

 specialize vp9_sad16x8x8 sse4

-prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"

+prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"

 specialize vp9_sad8x16x8 sse4

-prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"

+prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"

 specialize vp9_sad8x8x8 sse4

-prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint16_t *sad_array"

+prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array"

 specialize vp9_sad4x4x8 sse4

-prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"

-specialize vp9_sad64x64x4d

+prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"

+specialize vp9_sad64x64x4d sse2

-prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"

-specialize vp9_sad32x32x4d

+prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"

+specialize vp9_sad32x32x4d sse2

-prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"

-specialize vp9_sad16x16x4d sse3

+prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"

+specialize vp9_sad16x16x4d sse2

-prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"

-specialize vp9_sad16x8x4d sse3

+prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"

+specialize vp9_sad16x8x4d sse2

-prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"

-specialize vp9_sad8x16x4d sse3

+prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"

+specialize vp9_sad8x16x4d sse2

-prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"

-specialize vp9_sad8x8x4d sse3

+prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"

+specialize vp9_sad8x8x4d sse2

-prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t **ref_ptr, int  ref_stride, unsigned int *sad_array"

-specialize vp9_sad4x4x4d sse3

-#

-# Block copy

-#

-case $arch in

-    x86*)

-    prototype void vp9_copy32xn "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, int n"

-    specialize vp9_copy32xn sse2 sse3

-    ;;

-esac

+prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array"

+specialize vp9_sad4x4x4d sse

 prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"

 specialize vp9_sub_pixel_mse16x16 sse2 mmx

-vp9_sub_pixel_mse16x16_sse2=vp9_sub_pixel_mse16x16_wmt

 prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"

 specialize vp9_mse16x16 mmx sse2

 vp9_mse16x16_sse2=vp9_mse16x16_wmt

-prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_mse64x64

-prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"

+prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int  source_stride, int  xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"

 specialize vp9_sub_pixel_mse32x32

 prototype unsigned int vp9_get_mb_ss "const int16_t *"

 specialize vp9_get_mb_ss mmx sse2

 # ENCODEMB INVOKE

-prototype int vp9_mbblock_error "struct macroblock *mb, int dc"

+prototype int vp9_mbblock_error "struct macroblock *mb"

 specialize vp9_mbblock_error mmx sse2

 vp9_mbblock_error_sse2=vp9_mbblock_error_xmm

@@ -686,15 +632,18 @@

fi

 # fdct functions

-prototype void vp9_fht "const int16_t *input, int pitch, int16_t *output, int tx_type, int tx_dim"

-specialize vp9_fht

+prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"

+specialize vp9_short_fht4x4

-prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"

-specialize vp9_short_fdct8x8

+prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"

+specialize vp9_short_fht8x8

-prototype void vp9_short_fhaar2x2 "int16_t *InputData, int16_t *OutputData, int pitch"

-specialize vp9_short_fhaar2x2

+prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"

+specialize vp9_short_fht16x16

+prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch"

+specialize vp9_short_fdct8x8 sse2

 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"

 specialize vp9_short_fdct4x4

@@ -701,23 +650,17 @@

 prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"

 specialize vp9_short_fdct8x4

-prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"

-specialize vp9_short_walsh4x4

 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"

 specialize vp9_short_fdct32x32

 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"

-specialize vp9_short_fdct16x16

+specialize vp9_short_fdct16x16 sse2

-prototype void vp9_short_walsh4x4_lossless "int16_t *InputData, int16_t *OutputData, int pitch"

-specialize vp9_short_walsh4x4_lossless

+prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"

+specialize vp9_short_walsh4x4

-prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"

-specialize vp9_short_walsh4x4_x8

-prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch"

-specialize vp9_short_walsh8x4_x8

+prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch"

+specialize vp9_short_walsh8x4

 # Motion search

--- a/vp9/common/vp9_sadmxn.h

+++ b/vp9/common/vp9_sadmxn.h

@@ -11,14 +11,15 @@

 #ifndef VP9_COMMON_VP9_SADMXN_H_

 #define VP9_COMMON_VP9_SADMXN_H_

+#include "./vpx_config.h"

 #include "vpx/vpx_integer.h"

-static __inline unsigned int sad_mx_n_c(const uint8_t *src_ptr,

-                                        int src_stride,

-                                        const uint8_t *ref_ptr,

-                                        int ref_stride,

-                                        int m,

-                                        int n) {

+static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr,

+                                      int src_stride,

+                                      const uint8_t *ref_ptr,

+                                      int ref_stride,

+                                      int m,

+                                      int n) {

   int r, c;

   unsigned int sad = 0;

--- a/vp9/common/vp9_seg_common.c

+++ b/vp9/common/vp9_seg_common.c

@@ -12,9 +12,8 @@

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_seg_common.h"

-static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0, 0, 0 };

-static const int seg_feature_data_max[SEG_LVL_MAX] =

-                 { MAXQ, 63, 0xf, MB_MODE_COUNT - 1, 255, TX_SIZE_MAX_SB - 1};

+static const int segfeaturedata_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 };

+static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 0xf, 0xf };

 // These functions provide access to new segment level features.

 // Eventually these function may be "optimized out" but for the moment,

@@ -52,7 +51,7 @@

 int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {

-  return (segfeaturedata_signed[feature_id]);

+  return segfeaturedata_signed[feature_id];

 void vp9_clear_segdata(MACROBLOCKD *xd,

@@ -103,10 +102,4 @@

           ~(1 << INTRA_FRAME)) ? 1 : 0;

-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) {

-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM))

-    return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);

-  else

-    return TX_4X4;

-}

 // TBD? Functions to read and write segment data with range / validity checking

--- a/vp9/common/vp9_seg_common.h

+++ b/vp9/common/vp9_seg_common.h

@@ -57,7 +57,5 @@

 int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id);

-int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id);

 #endif  // VP9_COMMON_VP9_SEG_COMMON_H_

--- a/vp9/common/vp9_setupintrarecon.h

+++ b/vp9/common/vp9_setupintrarecon.h

@@ -13,6 +13,6 @@

 #include "vpx_scale/yv12config.h"

-extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);

+void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);

 #endif  // VP9_COMMON_VP9_SETUPINTRARECON_H_

--- a/vp9/common/vp9_subpixel.h

+++ /dev/null

@@ -1,20 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_COMMON_VP9_SUBPIXEL_H_

-#define VP9_COMMON_VP9_SUBPIXEL_H_

-#define prototype_subpixel_predict(sym) \

-  void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \

-           uint8_t *dst, int dst_pitch)

-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));

-#endif  // VP9_COMMON_VP9_SUBPIXEL_H_

--- a/vp9/common/vp9_textblit.c

+++ b/vp9/common/vp9_textblit.c

@@ -12,22 +12,26 @@

 #include "vp9/common/vp9_textblit.h"

+static const int font[] = {

+  0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,

+  0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,

+  0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,

+  0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,

+  0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,

+  0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,

+  0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,

+  0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,

+  0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820

+};

+static void plot(int x, int y, unsigned char *image, int pitch) {

+  image[x + y * pitch] ^= 255;

+}

 void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {

   int letter_bitmap;

   unsigned char *output_pos = address;

-  int colpos;

-  const int font[] = {

-    0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,

-    0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,

-    0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,

-    0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,

-    0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,

-    0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,

-    0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,

-    0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,

-    0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820

-  };

-  colpos = 0;

+  int colpos = 0;

   while (msg[colpos] != 0) {

     char letter = msg[colpos];

@@ -50,12 +54,11 @@

-static void plot(const int x, const int y, unsigned char *image, const int pitch) {

-  image [x + y * pitch] ^= 255;

-}

 /* Bresenham line algorithm */

-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {

+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,

+                   int pitch) {

   int steep = abs(y1 - y0) > abs(x1 - x0);

   int deltax, deltay;

   int error, ystep, y, x;

--- a/vp9/common/vp9_textblit.h

+++ b/vp9/common/vp9_textblit.h

@@ -11,9 +11,9 @@

 #ifndef VP9_COMMON_VP9_TEXTBLIT_H_

 #define VP9_COMMON_VP9_TEXTBLIT_H_

-extern void vp9_blit_text(const char *msg, unsigned char *address,

-                          const int pitch);

-extern void vp9_blit_line(int x0, int x1, int y0, int y1,

-                          unsigned char *image, const int pitch);

+void vp9_blit_text(const char *msg, unsigned char *address, int pitch);

+void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image,

+                   int pitch);

 #endif  // VP9_COMMON_VP9_TEXTBLIT_H_

--- /dev/null

+++ b/vp9/common/vp9_tile_common.c

@@ -1,0 +1,58 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "vp9/common/vp9_tile_common.h"

+#define MIN_TILE_WIDTH 256

+#define MAX_TILE_WIDTH 4096

+#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6)

+#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6)

+static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off,

+                                 int *max_tile_off, int tile_idx,

+                                 int log2_n_tiles, int n_mbs) {

+  const int n_sbs = (n_mbs + 3) >> 2;

+  const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;

+  const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;

+  *min_tile_off = MIN(sb_off1 << 2, n_mbs);

+  *max_tile_off = MIN(sb_off2 << 2, n_mbs);

+}

+void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {

+  cm->cur_tile_col_idx = tile_col_idx;

+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_col_start,

+                       &cm->cur_tile_mb_col_end, tile_col_idx,

+                       cm->log2_tile_columns, cm->mb_cols);

+}

+void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {

+  cm->cur_tile_row_idx = tile_row_idx;

+  vp9_get_tile_offsets(cm, &cm->cur_tile_mb_row_start,

+                       &cm->cur_tile_mb_row_end, tile_row_idx,

+                       cm->log2_tile_rows, cm->mb_rows);

+}

+void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr,

+                         int *delta_log2_n_tiles) {

+  const int sb_cols = (cm->mb_cols + 3) >> 2;

+  int min_log2_n_tiles, max_log2_n_tiles;

+  for (max_log2_n_tiles = 0;

+       (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS;

+       max_log2_n_tiles++) {}

+  for (min_log2_n_tiles = 0;

+       (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols;

+       min_log2_n_tiles++) {}

+  *min_log2_n_tiles_ptr = min_log2_n_tiles;

+  *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles;

+}

--- /dev/null

+++ b/vp9/common/vp9_tile_common.h

@@ -1,0 +1,23 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_COMMON_VP9_TILE_COMMON_H_

+#define VP9_COMMON_VP9_TILE_COMMON_H_

+#include "vp9/common/vp9_onyxc_int.h"

+void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);

+void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);

+void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles,

+                         int *delta_log2_n_tiles);

+#endif  // VP9_COMMON_VP9_TILE_COMMON_H_

--- a/vp9/common/vp9_treecoder.c

+++ b/vp9/common/vp9_treecoder.c

@@ -48,66 +48,37 @@

   tree2tok(p - offset, t, 0, 0, 0);

-static void branch_counts(

-  int n,                      /* n = size of alphabet */

-  vp9_token tok               [ /* n */ ],

-  vp9_tree tree,

-  unsigned int branch_ct       [ /* n-1 */ ] [2],

-  const unsigned int num_events[ /* n */ ]

-) {

-  const int tree_len = n - 1;

-  int t = 0;

-#if CONFIG_DEBUG

-  assert(tree_len);

-#endif

-  do {

-    branch_ct[t][0] = branch_ct[t][1] = 0;

-  } while (++t < tree_len);

-  t = 0;

-  do {

-    int L = tok[t].Len;

-    const int enc = tok[t].value;

-    const unsigned int ct = num_events[t];

-    vp9_tree_index i = 0;

-    do {

-      const int b = (enc >> --L) & 1;

-      const int j = i >> 1;

-#if CONFIG_DEBUG

-      assert(j < tree_len  &&  0 <= L);

-#endif

+static unsigned int convert_distribution(unsigned int i,

+                                         vp9_tree tree,

+                                         vp9_prob probs[],

+                                         unsigned int branch_ct[][2],

+                                         const unsigned int num_events[],

+                                         unsigned int tok0_offset) {

+  unsigned int left, right;

-      branch_ct [j] [b] += ct;

-      i = tree[ i + b];

-    } while (i > 0);

-#if CONFIG_DEBUG

-    assert(!L);

-#endif

-  } while (++t < n);

+  if (tree[i] <= 0) {

+    left = num_events[-tree[i] - tok0_offset];

+  } else {

+    left = convert_distribution(tree[i], tree, probs, branch_ct,

+                                num_events, tok0_offset);

+  }

+  if (tree[i + 1] <= 0) {

+    right = num_events[-tree[i + 1] - tok0_offset];

+  } else {

+    right = convert_distribution(tree[i + 1], tree, probs, branch_ct,

+                                num_events, tok0_offset);

+  }

+  probs[i>>1] = get_binary_prob(left, right);

+  branch_ct[i>>1][0] = left;

+  branch_ct[i>>1][1] = right;

+  return left + right;

 void vp9_tree_probs_from_distribution(

-  int n,                      /* n = size of alphabet */

-  vp9_token tok               [ /* n */ ],

   vp9_tree tree,

   vp9_prob probs          [ /* n-1 */ ],

   unsigned int branch_ct       [ /* n-1 */ ] [2],

-  const unsigned int num_events[ /* n */ ]

-) {

-  const int tree_len = n - 1;

-  int t = 0;

-  branch_counts(n, tok, tree, branch_ct, num_events);

-  do {

-    probs[t] = get_binary_prob(branch_ct[t][0], branch_ct[t][1]);

-  } while (++t < tree_len);

+  const unsigned int num_events[ /* n */ ],

+  unsigned int tok0_offset) {

+  convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset);

--- a/vp9/common/vp9_treecoder.h

+++ b/vp9/common/vp9_treecoder.h

@@ -11,6 +11,7 @@

 #ifndef VP9_COMMON_VP9_TREECODER_H_

 #define VP9_COMMON_VP9_TREECODER_H_

+#include "./vpx_config.h"

 #include "vpx/vpx_integer.h"

 typedef uint8_t vp9_prob;

@@ -46,27 +47,35 @@

    taken for each node on the tree; this facilitiates decisions as to

    probability updates. */

-void vp9_tree_probs_from_distribution(int n,  /* n = size of alphabet */

-                                      vp9_token tok[ /* n */ ],

-                                      vp9_tree tree,

+void vp9_tree_probs_from_distribution(vp9_tree tree,

                                       vp9_prob probs[ /* n - 1 */ ],

                                       unsigned int branch_ct[ /* n - 1 */ ][2],

-                                      const unsigned int num_events[ /* n */ ]);

+                                      const unsigned int num_events[ /* n */ ],

+                                      unsigned int tok0_offset);

-static __inline vp9_prob clip_prob(int p) {

+static INLINE vp9_prob clip_prob(int p) {

   return (p > 255) ? 255u : (p < 1) ? 1u : p;

-static __inline vp9_prob get_prob(int num, int den) {

+// int64 is not needed for normal frame level calculations.

+// However when outputing entropy stats accumulated over many frames

+// or even clips we can overflow int math.

+#ifdef ENTROPY_STATS

+static INLINE vp9_prob get_prob(int num, int den) {

+  return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den);

+}

+#else

+static INLINE vp9_prob get_prob(int num, int den) {

   return (den == 0) ? 128u : clip_prob((num * 256 + (den >> 1)) / den);

+#endif

-static __inline vp9_prob get_binary_prob(int n0, int n1) {

+static INLINE vp9_prob get_binary_prob(int n0, int n1) {

   return get_prob(n0, n0 + n1);

 /* this function assumes prob1 and prob2 are already within [1,255] range */

-static __inline vp9_prob weighted_prob(int prob1, int prob2, int factor) {

+static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {

   return (prob1 * (256 - factor) + prob2 * factor + 128) >> 8;

--- a/vp9/common/x86/vp9_asm_stubs.c

+++ b/vp9/common/x86/vp9_asm_stubs.c

@@ -8,91 +8,11 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

 #include "./vpx_config.h"

+#include "./vp9_rtcd.h"

 #include "vpx_ports/mem.h"

-#include "vp9/common/vp9_subpixel.h"

-extern const short vp9_six_tap_mmx[8][6 * 8];

-extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,

-                                      unsigned short  *output_ptr,

-                                      unsigned int     src_pixels_per_line,

-                                      unsigned int     pixel_step,

-                                      unsigned int     output_height,

-                                      unsigned int     output_width,

-                                      const short     *vp9_filter);

-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,

-                                       unsigned char  *output_ptr,

-                                       int             output_pitch,

-                                       unsigned int    pixels_per_line,

-                                       unsigned int    pixel_step,

-                                       unsigned int    output_height,

-                                       unsigned int    output_width,

-                                       const short    *vp9_filter);

-extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,

-                                        unsigned short *output_ptr,

-                                        unsigned int    src_pixels_per_line,

-                                        unsigned int    pixel_step,

-                                        unsigned int    output_height,

-                                        unsigned int    output_width,

-                                        const short    *vp9_filter);

-extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,

-                                         unsigned short *output_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned int    pixel_step,

-                                         unsigned int    output_height,

-                                         unsigned int    output_width,

-                                         const short    *vp9_filter);

-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,

-                                        unsigned char *output_ptr,

-                                        int dst_ptich,

-                                        unsigned int pixels_per_line,

-                                        unsigned int pixel_step,

-                                        unsigned int output_height,

-                                        unsigned int output_width,

-                                        const short    *vp9_filter);

-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,

-                                         unsigned char *output_ptr,

-                                         int dst_ptich,

-                                         unsigned int pixels_per_line,

-                                         unsigned int pixel_step,

-                                         unsigned int output_height,

-                                         unsigned int output_width,

-                                         const short    *vp9_filter);

-extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,

-                                         unsigned short *output_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned int    output_height,

-                                         unsigned int    output_width);

-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,

-                                             unsigned int   src_pixels_per_line,

-                                             unsigned char *output_ptr,

-                                             int            dst_pitch,

-                                             unsigned int   output_height,

-                                             const short   *vp9_filter);

-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,

-                                              unsigned int   src_pixels_per_lin,

-                                              unsigned char *output_ptr,

-                                              int            dst_pitch,

-                                              unsigned int   output_height,

-                                              const short   *vp9_filter);

-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,

-                                             unsigned int   src_pixels_per_line,

-                                             unsigned char *output_ptr,

-                                             int            dst_pitch,

-                                             unsigned int   output_height,

-                                             const short   *vp9_filter);

 ///////////////////////////////////////////////////////////////////////////

 // the mmx function that does the bilinear filtering and var calculation //

 // int one pass                                                          //

@@ -116,486 +36,332 @@

   {   8,  8,  8,  8, 120, 120, 120, 120 }

};

-#if HAVE_MMX

-void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,

-                               int  src_pixels_per_line,

-                               int  xoffset,

-                               int  yoffset,

-                               unsigned char *dst_ptr,

-                               int  dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict4x4_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);

-  const short *hfilter, *vfilter;

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,

-                            src_pixels_per_line, 1, 9, 8, hfilter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,

-                             8, 4, 4, 4, vfilter);

-}

+#if HAVE_SSSE3

+void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,

+                                   const unsigned int src_pitch,

+                                   unsigned char *output_ptr,

+                                   unsigned int out_pitch,

+                                   unsigned int output_height,

+                                   const short *filter);

-void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,

-                                 int  src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict16x16_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);

-  const short *hfilter, *vfilter;

+void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,

+                                   const unsigned int src_pitch,

+                                   unsigned char *output_ptr,

+                                   unsigned int out_pitch,

+                                   unsigned int output_height,

+                                   const short *filter);

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),

-                            fdata2,   src_pixels_per_line, 1, 21, 32,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,

-                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,

-                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,

-                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,

-                            hfilter);

+void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,

+                                   const unsigned int src_pitch,

+                                   unsigned char *output_ptr,

+                                   unsigned int out_pitch,

+                                   unsigned int output_height,

+                                   const short *filter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,

-                             32, 16, 16, 16, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,

-                             32, 16, 16, 16, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,

-                             32, 16, 16, 16, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,

-                             32, 16, 16, 16, vfilter);

-}

+void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,

+                                   const unsigned int src_pitch,

+                                   unsigned char *output_ptr,

+                                   unsigned int out_pitch,

+                                   unsigned int output_height,

+                                   const short *filter);

-void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,

-                               int  src_pixels_per_line,

-                               int  xoffset,

-                               int  yoffset,

-                               unsigned char *dst_ptr,

-                               int  dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x8_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

+void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,

+                                   const unsigned int src_pitch,

+                                   unsigned char *output_ptr,

+                                   unsigned int out_pitch,

+                                   unsigned int output_height,

+                                   const short *filter);

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),

-                            fdata2,   src_pixels_per_line, 1, 13, 16,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,

-                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,

-                            hfilter);

+void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,

+                                   const unsigned int src_pitch,

+                                   unsigned char *output_ptr,

+                                   unsigned int out_pitch,

+                                   unsigned int output_height,

+                                   const short *filter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,

-                             16, 8, 8, 8, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,

-                             16, 8, 8, 8, vfilter);

-}

+void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,

+                                       const unsigned int src_pitch,

+                                       unsigned char *output_ptr,

+                                       unsigned int out_pitch,

+                                       unsigned int output_height,

+                                       const short *filter);

-void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,

-                               int  src_pixels_per_line,

-                               int  xoffset,

-                               int  yoffset,

-                               unsigned char *dst_ptr,

-                               int  dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x4_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

+void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,

+                                       const unsigned int src_pitch,

+                                       unsigned char *output_ptr,

+                                       unsigned int out_pitch,

+                                       unsigned int output_height,

+                                       const short *filter);

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),

-                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,

-                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);

+void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,

+                                     const unsigned int src_pitch,

+                                     unsigned char *output_ptr,

+                                     unsigned int out_pitch,

+                                     unsigned int output_height,

+                                     const short *filter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,

-                             16, 8, 4, 8, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,

-                             16, 8, 4, 8, vfilter);

-}

-#endif

+void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,

+                                     const unsigned int src_pitch,

+                                     unsigned char *output_ptr,

+                                     unsigned int out_pitch,

+                                     unsigned int output_height,

+                                     const short *filter);

-#if HAVE_SSE2

-void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,

-                                  int  src_pixels_per_line,

-                                  int  xoffset,

-                                  int  yoffset,

-                                  unsigned char *dst_ptr,

-                                  int  dst_pitch) {

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);

-  const short *hfilter, *vfilter;

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict16x16_sse2\n");

-#endif

+void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,

+                                     const unsigned int src_pitch,

+                                     unsigned char *output_ptr,

+                                     unsigned int out_pitch,

+                                     unsigned int output_height,

+                                     const short *filter);

-  if (xoffset) {

-    if (yoffset) {

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                   src_pixels_per_line, 1, 21, 32, hfilter);

-      vfilter = vp9_six_tap_mmx[yoffset];

-      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,

-                                   32, 16, 16, dst_pitch, vfilter);

-    } else {

-      /* First-pass only */

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,

-                                        dst_ptr, dst_pitch, 16, hfilter);

-    }

-  } else {

-    /* Second-pass only */

-    vfilter = vp9_six_tap_mmx[yoffset];

-    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                 src_pixels_per_line, 21, 32);

-    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,

-                                 32, 16, 16, dst_pitch, vfilter);

-  }

-}

+void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,

+                                     const unsigned int src_pitch,

+                                     unsigned char *output_ptr,

+                                     unsigned int out_pitch,

+                                     unsigned int output_height,

+                                     const short *filter);

-void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,

-                                int  src_pixels_per_line,

-                                int  xoffset,

-                                int  yoffset,

-                                unsigned char *dst_ptr,

-                                int  dst_pitch) {

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x8_sse2\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                  src_pixels_per_line, 1, 13, 16, hfilter);

-      vfilter = vp9_six_tap_mmx[yoffset];

-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,

-                                  16, 8, 8, dst_pitch, vfilter);

-    } else {

-      /* First-pass only */

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,

-                                       dst_ptr, dst_pitch, 8, hfilter);

+void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  if (x_step_q4 == 16 && filter_x[3] != 128) {

+    while (w >= 16) {

+      vp9_filter_block1d16_h8_ssse3(src, src_stride,

+                                    dst, dst_stride,

+                                    h, filter_x);

+      src += 16;

+      dst += 16;

+      w -= 16;

-  } else {

-    /* Second-pass only */

-    vfilter = vp9_six_tap_mmx[yoffset];

-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),

-                                     src_pixels_per_line,

-                                     dst_ptr, dst_pitch, 8, vfilter);

-  }

-}

-void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,

-                                int  src_pixels_per_line,

-                                int  xoffset,

-                                int  yoffset,

-                                unsigned char *dst_ptr,

-                                int  dst_pitch) {

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x4_sse2\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                  src_pixels_per_line, 1, 9, 16, hfilter);

-      vfilter = vp9_six_tap_mmx[yoffset];

-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,

-                                  16, 8, 4, dst_pitch, vfilter);

-    } else {

-      /* First-pass only */

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,

-                                       dst_ptr, dst_pitch, 4, hfilter);

+    while (w >= 8) {

+      vp9_filter_block1d8_h8_ssse3(src, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_x);

+      src += 8;

+      dst += 8;

+      w -= 8;

-  } else {

-    /* Second-pass only */

-    vfilter = vp9_six_tap_mmx[yoffset];

-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),

-                                     src_pixels_per_line,

-                                     dst_ptr, dst_pitch, 4, vfilter);

-  }

-}

-#endif

-#if HAVE_SSSE3

-extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned char  *output_ptr,

-                                         unsigned int    output_pitch,

-                                         unsigned int    output_height,

-                                         unsigned int    vp9_filter_index);

-extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,

-                                          unsigned int    src_pixels_per_line,

-                                          unsigned char  *output_ptr,

-                                          unsigned int    output_pitch,

-                                          unsigned int    output_height,

-                                          unsigned int    vp9_filter_index);

-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,

-                                          unsigned int   src_pitch,

-                                          unsigned char *output_ptr,

-                                          unsigned int   out_pitch,

-                                          unsigned int   output_height,

-                                          unsigned int   vp9_filter_index);

-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,

-                                         unsigned int   src_pitch,

-                                         unsigned char *output_ptr,

-                                         unsigned int   out_pitch,

-                                         unsigned int   output_height,

-                                         unsigned int   vp9_filter_index);

-extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned char  *output_ptr,

-                                         unsigned int    output_pitch,

-                                         unsigned int    output_height,

-                                         unsigned int    vp9_filter_index);

-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,

-                                         unsigned int   src_pitch,

-                                         unsigned char *output_ptr,

-                                         unsigned int   out_pitch,

-                                         unsigned int   output_height,

-                                         unsigned int   vp9_filter_index);

-void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,

-                                   int  src_pixels_per_line,

-                                   int  xoffset,

-                                   int  yoffset,

-                                   unsigned char *dst_ptr,

-                                   int  dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict16x16_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                    src_pixels_per_line,

-                                    fdata2, 16, 21, xoffset);

-      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,

-                                    16, yoffset);

-    } else {

-      /* First-pass only */

-      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,

-                                    dst_ptr, dst_pitch, 16, xoffset);

+    while (w >= 4) {

+      vp9_filter_block1d4_h8_ssse3(src, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_x);

+      src += 4;

+      dst += 4;

+      w -= 4;

-  } else {

-    /* Second-pass only */

-    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                  src_pixels_per_line,

-                                  dst_ptr, dst_pitch, 16, yoffset);

+  if (w) {

+    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,

+                          filter_x, x_step_q4, filter_y, y_step_q4,

+                          w, h);

+  }

-void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,

-                                 int  src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int  dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x8_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                   src_pixels_per_line, fdata2, 8, 13, xoffset);

-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);

-    } else {

-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,

-                                   dst_ptr, dst_pitch, 8, xoffset);

+void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h) {

+  if (y_step_q4 == 16 && filter_y[3] != 128) {

+    while (w >= 16) {

+      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,

+                                    dst, dst_stride,

+                                    h, filter_y);

+      src += 16;

+      dst += 16;

+      w -= 16;

-  } else {

-    /* Second-pass only */

-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                 src_pixels_per_line,

-                                 dst_ptr, dst_pitch, 8, yoffset);

+    while (w >= 8) {

+      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_y);

+      src += 8;

+      dst += 8;

+      w -= 8;

+    }

+    while (w >= 4) {

+      vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_y);

+      src += 4;

+      dst += 4;

+      w -= 4;

+    }

+  if (w) {

+    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,

+                         filter_x, x_step_q4, filter_y, y_step_q4,

+                         w, h);

+  }

-void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,

-                                 int  src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int  dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x4_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                   src_pixels_per_line, fdata2, 8, 9, xoffset);

-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);

-    } else {

-      /* First-pass only */

-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,

-                                   dst_ptr, dst_pitch, 4, xoffset);

+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  if (x_step_q4 == 16 && filter_x[3] != 128) {

+    while (w >= 16) {

+      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,

+                                    dst, dst_stride,

+                                    h, filter_x);

+      src += 16;

+      dst += 16;

+      w -= 16;

-  } else {

-    /* Second-pass only */

-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                 src_pixels_per_line,

-                                 dst_ptr, dst_pitch, 4, yoffset);

+    while (w >= 8) {

+      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_x);

+      src += 8;

+      dst += 8;

+      w -= 8;

+    }

+    while (w >= 4) {

+      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_x);

+      src += 4;

+      dst += 4;

+      w -= 4;

+    }

+  if (w) {

+    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,

+                              filter_x, x_step_q4, filter_y, y_step_q4,

+                              w, h);

+  }

-void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,

-                                 int   src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict4x4_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                   src_pixels_per_line, fdata2, 4, 9, xoffset);

-      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);

-    } else {

-      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,

-                                   dst_ptr, dst_pitch, 4, xoffset);

+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h) {

+  if (y_step_q4 == 16 && filter_y[3] != 128) {

+    while (w >= 16) {

+      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,

+                                    dst, dst_stride,

+                                    h, filter_y);

+      src += 16;

+      dst += 16;

+      w -= 16;

-  } else {

-    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                 src_pixels_per_line,

-                                 dst_ptr, dst_pitch, 4, yoffset);

+    while (w >= 8) {

+      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_y);

+      src += 8;

+      dst += 8;

+      w -= 8;

+    }

+    while (w >= 4) {

+      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_y);

+      src += 4;

+      dst += 4;

+      w -= 4;

+    }

+  if (w) {

+    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,

+                             filter_x, x_step_q4, filter_y, y_step_q4,

+                             w, h);

+  }

-void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,

-                                   const unsigned int src_pitch,

-                                   unsigned char *output_ptr,

-                                   unsigned int out_pitch,

-                                   unsigned int output_height,

-                                   const short *filter);

+void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);

-void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,

-                                   const unsigned int src_pitch,

-                                   unsigned char *output_ptr,

-                                   unsigned int out_pitch,

-                                   unsigned int output_height,

-                                   const short *filter);

+  // check w/h due to fixed size fdata2 array

+  assert(w <= 16);

+  assert(h <= 16);

-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,

-                                      const unsigned int src_stride,

-                                      const short *hfilter_aligned16,

-                                      const short *vfilter_aligned16,

-                                      unsigned char *dst_ptr,

-                                      unsigned int dst_stride) {

-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {

-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);

-    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                  fdata2, 16, 23, hfilter_aligned16);

-    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,

-                                  vfilter_aligned16);

-  } else {

-    if (hfilter_aligned16[3] != 128) {

-      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,

-                                    16, hfilter_aligned16);

-    } else {

-      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                    dst_ptr, dst_stride, 16, vfilter_aligned16);

+  if (x_step_q4 == 16 && y_step_q4 == 16 &&

+      filter_x[3] != 128 && filter_y[3] != 128) {

+    if (w == 16) {

+      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,

+                                    fdata2, 16,

+                                    h + 7, filter_x);

+      vp9_filter_block1d16_v8_ssse3(fdata2, 16,

+                                    dst, dst_stride,

+                                    h, filter_y);

+      return;

+    if (w == 8) {

+      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,

+                                   fdata2, 16,

+                                   h + 7, filter_x);

+      vp9_filter_block1d8_v8_ssse3(fdata2, 16,

+                                   dst, dst_stride,

+                                   h, filter_y);

+      return;

+    }

+    if (w == 4) {

+      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,

+                                   fdata2, 16,

+                                   h + 7, filter_x);

+      vp9_filter_block1d4_v8_ssse3(fdata2, 16,

+                                   dst, dst_stride,

+                                   h, filter_y);

+      return;

+    }

+  vp9_convolve8_c(src, src_stride, dst, dst_stride,

+                  filter_x, x_step_q4, filter_y, y_step_q4,

+                  w, h);

-void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,

-                                   const unsigned int src_pitch,

-                                   unsigned char *output_ptr,

-                                   unsigned int out_pitch,

-                                   unsigned int output_height,

-                                   const short *filter);

+void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);

-void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,

-                                   const unsigned int src_pitch,

-                                   unsigned char *output_ptr,

-                                   unsigned int out_pitch,

-                                   unsigned int output_height,

-                                   const short *filter);

+  // check w/h due to fixed size fdata2 array

+  assert(w <= 16);

+  assert(h <= 16);

-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,

-                                    const unsigned int src_stride,

-                                    const short *hfilter_aligned16,

-                                    const short *vfilter_aligned16,

-                                    unsigned char *dst_ptr,

-                                    unsigned int dst_stride) {

-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {

-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);

-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                 fdata2, 16, 15, hfilter_aligned16);

-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,

-                                 vfilter_aligned16);

-  } else {

-    if (hfilter_aligned16[3] != 128) {

-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,

-                                   hfilter_aligned16);

-    } else {

-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                   dst_ptr, dst_stride, 8, vfilter_aligned16);

+  if (x_step_q4 == 16 && y_step_q4 == 16 &&

+      filter_x[3] != 128 && filter_y[3] != 128) {

+    if (w == 16) {

+      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,

+                                    fdata2, 16,

+                                    h + 7, filter_x);

+      vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,

+                                        dst, dst_stride,

+                                        h, filter_y);

+      return;

-  }

-}

-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,

-                                    const unsigned int src_stride,

-                                    const short *hfilter_aligned16,

-                                    const short *vfilter_aligned16,

-                                    unsigned char *dst_ptr,

-                                    unsigned int dst_stride) {

-  if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {

-      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);

-      vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                   fdata2, 16, 11, hfilter_aligned16);

-      vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,

-                                   vfilter_aligned16);

-  } else {

-    if (hfilter_aligned16[3] != 128) {

-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,

-                                   hfilter_aligned16);

-    } else {

-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                   dst_ptr, dst_stride, 4, vfilter_aligned16);

+    if (w == 8) {

+      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,

+                                   fdata2, 16,

+                                   h + 7, filter_x);

+      vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,

+                                       dst, dst_stride,

+                                       h, filter_y);

+      return;

+    if (w == 4) {

+      vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,

+                                   fdata2, 16,

+                                   h + 7, filter_x);

+      vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,

+                                       dst, dst_stride,

+                                       h, filter_y);

+      return;

+    }

+  vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h);

 #endif

--- a/vp9/common/x86/vp9_filter_sse2.c

+++ /dev/null

@@ -1,290 +1,0 @@

-/*

- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <assert.h> // for alignment checks

-#include <emmintrin.h> // SSE2

-#include "vp9/common/vp9_filter.h"

-#include "vpx_ports/emmintrin_compat.h"

-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED

-#include "vp9_rtcd.h"

-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is

-//           just a quick partial snapshot so that other can already use some

-//           speedup.

-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap

-//           filtering.

-// TODO(cd): Add some comments, better variable naming.

-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum

-//           of positive above 128), or have higher precision filter

-//           coefficients.

-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-};

-// Creating a macro to do more than four pixels at once to hide instruction

-// latency is actually slower :-(

-#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \

-  {                                                                            \

-  /* Do shifted load to achieve require shuffles through unpacking */          \

-  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \

-  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \

-  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \

-  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \

-  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \

-  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \

-  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \

-  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \

-  /* Shit by 4 bytes through suffle to get additional shifted loads */         \

-  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \

-  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \

-  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \

-  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \

-  /* multiply accumulate them */                                               \

-  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \

-  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \

-  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \

-  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \

-  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \

-  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \

-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \

-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \

-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \

-  }

-void vp9_filter_block2d_4x4_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  __m128i intermediateA, intermediateB, intermediateC;

-  const int kInterp_Extend = 4;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);

-  // check alignment

-  assert(0 == ((long)HFilter_aligned16)%16);

-  assert(0 == ((long)VFilter_aligned16)%16);

-  {

-    __m128i transpose3_0;

-    __m128i transpose3_1;

-    __m128i transpose3_2;

-    __m128i transpose3_3;

-    // Horizontal pass (src -> intermediate).

-    {

-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

-      {

-        __m128i mad_all0;

-        __m128i mad_all1;

-        __m128i mad_all2;

-        __m128i mad_all3;

-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);

-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);

-      }

-    }

-    // Transpose result (intermediate -> transpose3_x)

-    {

-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33

-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73

-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx

-      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);

-      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);

-      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);

-      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);

-      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53

-      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73

-      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx

-      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx

-      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);

-      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);

-      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);

-      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);

-      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63

-      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73

-      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx

-      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx

-      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);

-      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);

-      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);

-      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);

-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71

-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73

-      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx

-      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx

-      transpose3_0 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose2_2),

-                                           _MM_SHUFFLE(1, 0, 1, 0)));

-      transpose3_1 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose2_2),

-                                           _MM_SHUFFLE(3, 2, 3, 2)));

-      transpose3_2 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose2_3),

-                                           _MM_SHUFFLE(1, 0, 1, 0)));

-      transpose3_3 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose2_3),

-                                           _MM_SHUFFLE(3, 2, 3, 2)));

-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx

-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx

-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx

-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx

-    }

-    // Vertical pass (transpose3_x -> dst).

-    {

-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      __m128i col0, col1, col2, col3;

-        DECLARE_ALIGNED(16, unsigned char, temp[32]);

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_0);

-        DO_FOUR_PIXELS(col0, temp, 0);

-      }

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_1);

-        DO_FOUR_PIXELS(col1, temp, 0);

-      }

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_2);

-        DO_FOUR_PIXELS(col2, temp, 0);

-      }

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_3);

-        DO_FOUR_PIXELS(col3, temp, 0);

-      }

-      // transpose

-      {

-        __m128i T0 = _mm_unpacklo_epi32(col0, col1);

-        __m128i T1 = _mm_unpacklo_epi32(col2, col3);

-        __m128i T2 = _mm_unpackhi_epi32(col0, col1);

-        __m128i T3 = _mm_unpackhi_epi32(col2, col3);

-        col0 = _mm_unpacklo_epi64(T0, T1);

-        col1 = _mm_unpackhi_epi64(T0, T1);

-        col2 = _mm_unpacklo_epi64(T2, T3);

-        col3 = _mm_unpackhi_epi64(T2, T3);

-      }

-      // saturate to 8 bit

-      {

-        col0 = _mm_packs_epi32(col0, col0);

-        col0 = _mm_packus_epi16(col0, col0);

-        col1 = _mm_packs_epi32(col1, col1);

-        col1 = _mm_packus_epi16(col1, col1);

-        col2 = _mm_packs_epi32 (col2, col2);

-        col2 = _mm_packus_epi16(col2, col2);

-        col3 = _mm_packs_epi32 (col3, col3);

-        col3 = _mm_packus_epi16(col3, col3);

-      }

-      // store

-      {

-        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);

-        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);

-        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);

-        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);

-      }

-    }

-  }

-}

-void vp9_filter_block2d_8x4_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int j;

-  for (j=0; j<8; j+=4) {

-    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,

-                                  HFilter_aligned16, VFilter_aligned16,

-                                  dst_ptr + j, dst_stride);

-  }

-}

-void vp9_filter_block2d_8x8_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<8; i+=4) {

-    for (j=0; j<8; j+=4) {

-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,

-                                    HFilter_aligned16, VFilter_aligned16,

-                                    dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

-void vp9_filter_block2d_16x16_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<16; i+=4) {

-    for (j=0; j<16; j+=4) {

-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,

-                                    HFilter_aligned16, VFilter_aligned16,

-                                    dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

--- a/vp9/common/x86/vp9_filter_sse4.c

+++ /dev/null

@@ -1,362 +1,0 @@

-/*

- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <assert.h> // for alignment checks

-#include <smmintrin.h> // SSE4.1

-#include "vp9/common/vp9_filter.h"

-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED

-#include "vp9_rtcd.h"

-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is

-//           just a quick partial snapshot so that other can already use some

-//           speedup.

-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap

-//           filtering.

-// TODO(cd): Reduce source size by using macros instead of current code

-//           duplication.

-// TODO(cd): Add some comments, better variable naming.

-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum

-//           of positive above 128), or have higher precision filter

-//           coefficients.

-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {

-  0x00, 0x01,

-  0x01, 0x02,

-  0x02, 0x03,

-  0x03, 0x04,

-  0x02, 0x03,

-  0x03, 0x04,

-  0x04, 0x05,

-  0x05, 0x06,

-};

-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {

-  0x04, 0x05,

-  0x05, 0x06,

-  0x06, 0x07,

-  0x07, 0x08,

-  0x06, 0x07,

-  0x07, 0x08,

-  0x08, 0x09,

-  0x09, 0x0A,

-};

-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-};

-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {

-  0, 4,  8, 12,

-  1, 5,  9, 13,

-  2, 6, 10, 14,

-  3, 7, 11, 15

-};

-// Creating a macro to do more than four pixels at once to hide instruction

-// latency is actually slower :-(

-#define DO_FOUR_PIXELS(result, offset)                                         \

-  {                                                                            \

-  /*load pixels*/                                                              \

-  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \

-  /* extract the ones used for first column */                                 \

-  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \

-  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \

-  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \

-  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \

-  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \

-  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \

-  /* multiply accumulate them */                                               \

-  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \

-  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \

-  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \

-  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \

-  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \

-  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \

-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \

-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \

-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \

-  }

-void vp9_filter_block2d_4x4_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  __m128i intermediateA, intermediateB, intermediateC;

-  const int kInterp_Extend = 4;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);

-  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);

-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);

-  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);

-  // check alignment

-  assert(0 == ((long)HFilter_aligned16)%16);

-  assert(0 == ((long)VFilter_aligned16)%16);

-  {

-    __m128i transpose3_0;

-    __m128i transpose3_1;

-    __m128i transpose3_2;

-    __m128i transpose3_3;

-    // Horizontal pass (src -> intermediate).

-    {

-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

-      {

-        __m128i mad_all0;

-        __m128i mad_all1;

-        __m128i mad_all2;

-        __m128i mad_all3;

-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);

-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);

-      }

-    }

-    // Transpose result (intermediate -> transpose3_x)

-    {

-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33

-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73

-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx

-      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);

-      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);

-      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);

-      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33

-      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73

-      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx

-      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);

-      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);

-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71

-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73

-      transpose3_0 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(0, 0, 1, 0)));

-      transpose3_1 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(1, 1, 3, 2)));

-      transpose3_2 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(2, 2, 1, 0)));

-      transpose3_3 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(3, 3, 3, 2)));

-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx

-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx

-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx

-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx

-    }

-    // Vertical pass (transpose3_x -> dst).

-    {

-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      __m128i col0, col1, col2, col3;

-      {

-        //load pixels

-        __m128i src  = transpose3_0;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col0 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        //load pixels

-        __m128i src  = transpose3_1;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col1 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        //load pixels

-        __m128i src  = transpose3_2;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col2 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        //load pixels

-        __m128i src  = transpose3_3;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col3 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        __m128i col01 = _mm_unpacklo_epi8(col0, col1);

-        __m128i col23 = _mm_unpacklo_epi8(col2, col3);

-        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);

-        //TODO(cd): look into Ronald's comment:

-        //    Future suggestion: I believe here, too, you can merge the

-        //    packs_epi32() and pacus_epi16() for the 4 cols above, so that

-        //    you get the data in a single register, and then use pshufb

-        //    (shuffle_epi8()) instead of the unpacks here. Should be

-        //    2+3+2 instructions faster.

-        *((unsigned int *)&dst_ptr[dst_stride * 0]) =

-            _mm_extract_epi32(col0123, 0);

-        *((unsigned int *)&dst_ptr[dst_stride * 1]) =

-            _mm_extract_epi32(col0123, 1);

-        *((unsigned int *)&dst_ptr[dst_stride * 2]) =

-            _mm_extract_epi32(col0123, 2);

-        *((unsigned int *)&dst_ptr[dst_stride * 3]) =

-            _mm_extract_epi32(col0123, 3);

-      }

-    }

-  }

-}

-void vp9_filter_block2d_8x4_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int j;

-  for (j=0; j<8; j+=4) {

-    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,

-                                    HFilter_aligned16, VFilter_aligned16,

-                                    dst_ptr + j, dst_stride);

-  }

-}

-void vp9_filter_block2d_8x8_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<8; i+=4) {

-    for (j=0; j<8; j+=4) {

-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,

-                                      HFilter_aligned16, VFilter_aligned16,

-                                      dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

-void vp9_filter_block2d_16x16_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<16; i+=4) {

-    for (j=0; j<16; j+=4) {

-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,

-                                      HFilter_aligned16, VFilter_aligned16,

-                                      dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

--- /dev/null

+++ b/vp9/common/x86/vp9_idct_sse2.asm

@@ -1,0 +1,712 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+;void vp9_idct_dequant_0_2x_sse2

+; (

+;   short *qcoeff       - 0

+;   short *dequant      - 1

+;   unsigned char *pre  - 2

+;   unsigned char *dst  - 3

+;   int dst_stride      - 4

+;   int blk_stride      - 5

+; )

+global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE

+sym(vp9_idct_dequant_0_2x_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    GET_GOT     rbx

+    ; end prolog

+        mov         rdx,            arg(1) ; dequant

+        mov         rax,            arg(0) ; qcoeff

+        movd        xmm4,           [rax]

+        movd        xmm5,           [rdx]

+        pinsrw      xmm4,           [rax+32],   4

+        pinsrw      xmm5,           [rdx],      4

+        pmullw      xmm4,           xmm5

+    ; Zero out xmm5, for use unpacking

+        pxor        xmm5,           xmm5

+    ; clear coeffs

+        movd        [rax],          xmm5

+        movd        [rax+32],       xmm5

+;pshufb

+        pshuflw     xmm4,           xmm4,       00000000b

+        pshufhw     xmm4,           xmm4,       00000000b

+        mov         rax,            arg(2) ; pre

+        paddw       xmm4,           [GLOBAL(fours)]

+        movsxd      rcx,            dword ptr arg(5) ; blk_stride

+        psraw       xmm4,           3

+        movq        xmm0,           [rax]

+        movq        xmm1,           [rax+rcx]

+        movq        xmm2,           [rax+2*rcx]

+        lea         rcx,            [3*rcx]

+        movq        xmm3,           [rax+rcx]

+        punpcklbw   xmm0,           xmm5

+        punpcklbw   xmm1,           xmm5

+        punpcklbw   xmm2,           xmm5

+        punpcklbw   xmm3,           xmm5

+        mov         rax,            arg(3) ; dst

+        movsxd      rdx,            dword ptr arg(4) ; dst_stride

+    ; Add to predict buffer

+        paddw       xmm0,           xmm4

+        paddw       xmm1,           xmm4

+        paddw       xmm2,           xmm4

+        paddw       xmm3,           xmm4

+    ; pack up before storing

+        packuswb    xmm0,           xmm5

+        packuswb    xmm1,           xmm5

+        packuswb    xmm2,           xmm5

+        packuswb    xmm3,           xmm5

+    ; store blocks back out

+        movq        [rax],          xmm0

+        movq        [rax + rdx],    xmm1

+        lea         rax,            [rax + 2*rdx]

+        movq        [rax],          xmm2

+        movq        [rax + rdx],    xmm3

+    ; begin epilog

+    RESTORE_GOT

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE

+sym(vp9_idct_dequant_full_2x_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 7

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ; special case when 2 blocks have 0 or 1 coeffs

+    ; dc is set as first coeff, so no need to load qcoeff

+        mov         rax,            arg(0) ; qcoeff

+        mov         rsi,            arg(2) ; pre

+        mov         rdi,            arg(3) ; dst

+        movsxd      rcx,            dword ptr arg(5) ; blk_stride

+    ; Zero out xmm7, for use unpacking

+        pxor        xmm7,           xmm7

+        mov         rdx,            arg(1)  ; dequant

+    ; note the transpose of xmm1 and xmm2, necessary for shuffle

+    ;   to spit out sensicle data

+        movdqa      xmm0,           [rax]

+        movdqa      xmm2,           [rax+16]

+        movdqa      xmm1,           [rax+32]

+        movdqa      xmm3,           [rax+48]

+    ; Clear out coeffs

+        movdqa      [rax],          xmm7

+        movdqa      [rax+16],       xmm7

+        movdqa      [rax+32],       xmm7

+        movdqa      [rax+48],       xmm7

+    ; dequantize qcoeff buffer

+        pmullw      xmm0,           [rdx]

+        pmullw      xmm2,           [rdx+16]

+        pmullw      xmm1,           [rdx]

+        pmullw      xmm3,           [rdx+16]

+    ; repack so block 0 row x and block 1 row x are together

+        movdqa      xmm4,           xmm0

+        punpckldq   xmm0,           xmm1

+        punpckhdq   xmm4,           xmm1

+        pshufd      xmm0,           xmm0,       11011000b

+        pshufd      xmm1,           xmm4,       11011000b

+        movdqa      xmm4,           xmm2

+        punpckldq   xmm2,           xmm3

+        punpckhdq   xmm4,           xmm3

+        pshufd      xmm2,           xmm2,       11011000b

+        pshufd      xmm3,           xmm4,       11011000b

+    ; first pass

+        psubw       xmm0,           xmm2        ; b1 = 0-2

+        paddw       xmm2,           xmm2        ;

+        movdqa      xmm5,           xmm1

+        paddw       xmm2,           xmm0        ; a1 = 0+2

+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

+        movdqa      xmm7,           xmm3

+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)

+        psubw       xmm7,           xmm5        ; c1

+        movdqa      xmm5,           xmm1

+        movdqa      xmm4,           xmm3

+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm5,           xmm1

+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm3,           xmm4

+        paddw       xmm3,           xmm5        ; d1

+        movdqa      xmm6,           xmm2        ; a1

+        movdqa      xmm4,           xmm0        ; b1

+        paddw       xmm2,           xmm3        ;0

+        paddw       xmm4,           xmm7        ;1

+        psubw       xmm0,           xmm7        ;2

+        psubw       xmm6,           xmm3        ;3

+    ; transpose for the second pass

+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

+        pshufd      xmm0,           xmm2,       11011000b

+        pshufd      xmm2,           xmm1,       11011000b

+        pshufd      xmm1,           xmm5,       11011000b

+        pshufd      xmm3,           xmm7,       11011000b

+    ; second pass

+        psubw       xmm0,           xmm2            ; b1 = 0-2

+        paddw       xmm2,           xmm2

+        movdqa      xmm5,           xmm1

+        paddw       xmm2,           xmm0            ; a1 = 0+2

+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

+        movdqa      xmm7,           xmm3

+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)

+        psubw       xmm7,           xmm5            ; c1

+        movdqa      xmm5,           xmm1

+        movdqa      xmm4,           xmm3

+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm5,           xmm1

+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm3,           xmm4

+        paddw       xmm3,           xmm5            ; d1

+        paddw       xmm0,           [GLOBAL(fours)]

+        paddw       xmm2,           [GLOBAL(fours)]

+        movdqa      xmm6,           xmm2            ; a1

+        movdqa      xmm4,           xmm0            ; b1

+        paddw       xmm2,           xmm3            ;0

+        paddw       xmm4,           xmm7            ;1

+        psubw       xmm0,           xmm7            ;2

+        psubw       xmm6,           xmm3            ;3

+        psraw       xmm2,           3

+        psraw       xmm0,           3

+        psraw       xmm4,           3

+        psraw       xmm6,           3

+    ; transpose to save

+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

+        pshufd      xmm0,           xmm2,       11011000b

+        pshufd      xmm2,           xmm1,       11011000b

+        pshufd      xmm1,           xmm5,       11011000b

+        pshufd      xmm3,           xmm7,       11011000b

+        pxor        xmm7,           xmm7

+    ; Load up predict blocks

+        movq        xmm4,           [rsi]

+        movq        xmm5,           [rsi+rcx]

+        punpcklbw   xmm4,           xmm7

+        punpcklbw   xmm5,           xmm7

+        paddw       xmm0,           xmm4

+        paddw       xmm1,           xmm5

+        movq        xmm4,           [rsi+2*rcx]

+        lea         rcx,            [3*rcx]

+        movq        xmm5,           [rsi+rcx]

+        punpcklbw   xmm4,           xmm7

+        punpcklbw   xmm5,           xmm7

+        paddw       xmm2,           xmm4

+        paddw       xmm3,           xmm5

+.finish:

+    ; pack up before storing

+        packuswb    xmm0,           xmm7

+        packuswb    xmm1,           xmm7

+        packuswb    xmm2,           xmm7

+        packuswb    xmm3,           xmm7

+    ; Load destination stride before writing out,

+    ;   doesn't need to persist

+        movsxd      rdx,            dword ptr arg(4) ; dst_stride

+    ; store blocks back out

+        movq        [rdi],          xmm0

+        movq        [rdi + rdx],    xmm1

+        lea         rdi,            [rdi + 2*rdx]

+        movq        [rdi],          xmm2

+        movq        [rdi + rdx],    xmm3

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;void vp9_idct_dequant_dc_0_2x_sse2

+; (

+;   short *qcoeff       - 0

+;   short *dequant      - 1

+;   unsigned char *pre  - 2

+;   unsigned char *dst  - 3

+;   int dst_stride      - 4

+;   short *dc           - 5

+; )

+global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE

+sym(vp9_idct_dequant_dc_0_2x_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ; special case when 2 blocks have 0 or 1 coeffs

+    ; dc is set as first coeff, so no need to load qcoeff

+        mov         rax,            arg(0) ; qcoeff

+        mov         rsi,            arg(2) ; pre

+        mov         rdi,            arg(3) ; dst

+        mov         rdx,            arg(5) ; dc

+    ; Zero out xmm5, for use unpacking

+        pxor        xmm5,           xmm5

+    ; load up 2 dc words here == 2*16 = doubleword

+        movd        xmm4,           [rdx]

+    ; Load up predict blocks

+        movq        xmm0,           [rsi]

+        movq        xmm1,           [rsi+16]

+        movq        xmm2,           [rsi+32]

+        movq        xmm3,           [rsi+48]

+    ; Duplicate and expand dc across

+        punpcklwd   xmm4,           xmm4

+        punpckldq   xmm4,           xmm4

+    ; Rounding to dequant and downshift

+        paddw       xmm4,           [GLOBAL(fours)]

+        psraw       xmm4,           3

+    ; Predict buffer needs to be expanded from bytes to words

+        punpcklbw   xmm0,           xmm5

+        punpcklbw   xmm1,           xmm5

+        punpcklbw   xmm2,           xmm5

+        punpcklbw   xmm3,           xmm5

+    ; Add to predict buffer

+        paddw       xmm0,           xmm4

+        paddw       xmm1,           xmm4

+        paddw       xmm2,           xmm4

+        paddw       xmm3,           xmm4

+    ; pack up before storing

+        packuswb    xmm0,           xmm5

+        packuswb    xmm1,           xmm5

+        packuswb    xmm2,           xmm5

+        packuswb    xmm3,           xmm5

+    ; Load destination stride before writing out,

+    ;   doesn't need to persist

+        movsxd      rdx,            dword ptr arg(4) ; dst_stride

+    ; store blocks back out

+        movq        [rdi],          xmm0

+        movq        [rdi + rdx],    xmm1

+        lea         rdi,            [rdi + 2*rdx]

+        movq        [rdi],          xmm2

+        movq        [rdi + rdx],    xmm3

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    RESTORE_GOT

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE

+sym(vp9_idct_dequant_dc_full_2x_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 7

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ; special case when 2 blocks have 0 or 1 coeffs

+    ; dc is set as first coeff, so no need to load qcoeff

+        mov         rax,            arg(0) ; qcoeff

+        mov         rsi,            arg(2) ; pre

+        mov         rdi,            arg(3) ; dst

+    ; Zero out xmm7, for use unpacking

+        pxor        xmm7,           xmm7

+        mov         rdx,            arg(1)  ; dequant

+    ; note the transpose of xmm1 and xmm2, necessary for shuffle

+    ;   to spit out sensicle data

+        movdqa      xmm0,           [rax]

+        movdqa      xmm2,           [rax+16]

+        movdqa      xmm1,           [rax+32]

+        movdqa      xmm3,           [rax+48]

+    ; Clear out coeffs

+        movdqa      [rax],          xmm7

+        movdqa      [rax+16],       xmm7

+        movdqa      [rax+32],       xmm7

+        movdqa      [rax+48],       xmm7

+    ; dequantize qcoeff buffer

+        pmullw      xmm0,           [rdx]

+        pmullw      xmm2,           [rdx+16]

+        pmullw      xmm1,           [rdx]

+        pmullw      xmm3,           [rdx+16]

+    ; DC component

+        mov         rdx,            arg(5)

+    ; repack so block 0 row x and block 1 row x are together

+        movdqa      xmm4,           xmm0

+        punpckldq   xmm0,           xmm1

+        punpckhdq   xmm4,           xmm1

+        pshufd      xmm0,           xmm0,       11011000b

+        pshufd      xmm1,           xmm4,       11011000b

+        movdqa      xmm4,           xmm2

+        punpckldq   xmm2,           xmm3

+        punpckhdq   xmm4,           xmm3

+        pshufd      xmm2,           xmm2,       11011000b

+        pshufd      xmm3,           xmm4,       11011000b

+    ; insert DC component

+        pinsrw      xmm0,           [rdx],      0

+        pinsrw      xmm0,           [rdx+2],    4

+    ; first pass

+        psubw       xmm0,           xmm2        ; b1 = 0-2

+        paddw       xmm2,           xmm2        ;

+        movdqa      xmm5,           xmm1

+        paddw       xmm2,           xmm0        ; a1 = 0+2

+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

+        movdqa      xmm7,           xmm3

+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)

+        psubw       xmm7,           xmm5        ; c1

+        movdqa      xmm5,           xmm1

+        movdqa      xmm4,           xmm3

+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm5,           xmm1

+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm3,           xmm4

+        paddw       xmm3,           xmm5        ; d1

+        movdqa      xmm6,           xmm2        ; a1

+        movdqa      xmm4,           xmm0        ; b1

+        paddw       xmm2,           xmm3        ;0

+        paddw       xmm4,           xmm7        ;1

+        psubw       xmm0,           xmm7        ;2

+        psubw       xmm6,           xmm3        ;3

+    ; transpose for the second pass

+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

+        pshufd      xmm0,           xmm2,       11011000b

+        pshufd      xmm2,           xmm1,       11011000b

+        pshufd      xmm1,           xmm5,       11011000b

+        pshufd      xmm3,           xmm7,       11011000b

+    ; second pass

+        psubw       xmm0,           xmm2            ; b1 = 0-2

+        paddw       xmm2,           xmm2

+        movdqa      xmm5,           xmm1

+        paddw       xmm2,           xmm0            ; a1 = 0+2

+        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

+        movdqa      xmm7,           xmm3

+        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)

+        psubw       xmm7,           xmm5            ; c1

+        movdqa      xmm5,           xmm1

+        movdqa      xmm4,           xmm3

+        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

+        paddw       xmm5,           xmm1

+        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

+        paddw       xmm3,           xmm4

+        paddw       xmm3,           xmm5            ; d1

+        paddw       xmm0,           [GLOBAL(fours)]

+        paddw       xmm2,           [GLOBAL(fours)]

+        movdqa      xmm6,           xmm2            ; a1

+        movdqa      xmm4,           xmm0            ; b1

+        paddw       xmm2,           xmm3            ;0

+        paddw       xmm4,           xmm7            ;1

+        psubw       xmm0,           xmm7            ;2

+        psubw       xmm6,           xmm3            ;3

+        psraw       xmm2,           3

+        psraw       xmm0,           3

+        psraw       xmm4,           3

+        psraw       xmm6,           3

+    ; transpose to save

+        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

+        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

+        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

+        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

+        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

+        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

+        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

+        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

+        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

+        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

+        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

+        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

+        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

+        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

+        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

+        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

+        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

+        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

+        pshufd      xmm0,           xmm2,       11011000b

+        pshufd      xmm2,           xmm1,       11011000b

+        pshufd      xmm1,           xmm5,       11011000b

+        pshufd      xmm3,           xmm7,       11011000b

+        pxor        xmm7,           xmm7

+    ; Load up predict blocks

+        movq        xmm4,           [rsi]

+        movq        xmm5,           [rsi+16]

+        punpcklbw   xmm4,           xmm7

+        punpcklbw   xmm5,           xmm7

+        paddw       xmm0,           xmm4

+        paddw       xmm1,           xmm5

+        movq        xmm4,           [rsi+32]

+        movq        xmm5,           [rsi+48]

+        punpcklbw   xmm4,           xmm7

+        punpcklbw   xmm5,           xmm7

+        paddw       xmm2,           xmm4

+        paddw       xmm3,           xmm5

+.finish:

+    ; pack up before storing

+        packuswb    xmm0,           xmm7

+        packuswb    xmm1,           xmm7

+        packuswb    xmm2,           xmm7

+        packuswb    xmm3,           xmm7

+    ; Load destination stride before writing out,

+    ;   doesn't need to persist

+        movsxd      rdx,            dword ptr arg(4) ; dst_stride

+    ; store blocks back out

+        movq        [rdi],          xmm0

+        movq        [rdi + rdx],    xmm1

+        lea         rdi,            [rdi + 2*rdx]

+        movq        [rdi],          xmm2

+        movq        [rdi + rdx],    xmm3

+    ; begin epilog

+    pop         rdi

+    pop         rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+SECTION_RODATA

+align 16

+fours:

+    times 8 dw 0x0004

+align 16

+x_s1sqr2:

+    times 8 dw 0x8A8C

+align 16

+x_c1sqr2less1:

+    times 8 dw 0x4E7B

--- /dev/null

+++ b/vp9/common/x86/vp9_idct_x86.c

@@ -1,0 +1,1975 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <assert.h>

+#include <emmintrin.h>  // SSE2

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vp9/common/vp9_common.h"

+#include "vp9/common/vp9_idct.h"

+#if HAVE_SSE2

+// In order to improve performance, clip absolute diff values to [0, 255],

+// which allows to keep the additions/subtractions in 8 bits.

+void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,

+                               uint8_t *dst_ptr, int pitch, int stride) {

+  int a1;

+  int16_t out;

+  uint8_t abs_diff;

+  __m128i p0, p1, p2, p3;

+  unsigned int extended_diff;

+  __m128i diff;

+  out = dct_const_round_shift(input_dc * cospi_16_64);

+  out = dct_const_round_shift(out * cospi_16_64);

+  a1 = ROUND_POWER_OF_TWO(out, 4);

+  // Read prediction data.

+  p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch));

+  p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch));

+  p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch));

+  p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch));

+  // Unpack prediction data, and store 4x4 array in 1 XMM register.

+  p0 = _mm_unpacklo_epi32(p0, p1);

+  p2 = _mm_unpacklo_epi32(p2, p3);

+  p0 = _mm_unpacklo_epi64(p0, p2);

+  // Clip dc value to [0, 255] range. Then, do addition or subtraction

+  // according to its sign.

+  if (a1 >= 0) {

+    abs_diff = (a1 > 255) ? 255 : a1;

+    extended_diff = abs_diff * 0x01010101u;

+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);

+    p1 = _mm_adds_epu8(p0, diff);

+  } else {

+    abs_diff = (a1 < -255) ? 255 : -a1;

+    extended_diff = abs_diff * 0x01010101u;

+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);

+    p1 = _mm_subs_epu8(p0, diff);

+  }

+  // Store results to dst.

+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);

+  dst_ptr += stride;

+  p1 = _mm_srli_si128(p1, 4);

+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);

+  dst_ptr += stride;

+  p1 = _mm_srli_si128(p1, 4);

+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);

+  dst_ptr += stride;

+  p1 = _mm_srli_si128(p1, 4);

+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);

+}

+void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i eight = _mm_set1_epi16(8);

+  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,

+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,

+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,

+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  const int half_pitch = pitch >> 1;

+  __m128i input0, input1, input2, input3;

+  // Rows

+  input0 = _mm_loadl_epi64((__m128i *)input);

+  input1 = _mm_loadl_epi64((__m128i *)(input + 4));

+  input2 = _mm_loadl_epi64((__m128i *)(input + 8));

+  input3 = _mm_loadl_epi64((__m128i *)(input + 12));

+  // Construct i3, i1, i3, i1, i2, i0, i2, i0

+  input0 = _mm_shufflelo_epi16(input0, 0xd8);

+  input1 = _mm_shufflelo_epi16(input1, 0xd8);

+  input2 = _mm_shufflelo_epi16(input2, 0xd8);

+  input3 = _mm_shufflelo_epi16(input3, 0xd8);

+  input0 = _mm_unpacklo_epi32(input0, input0);

+  input1 = _mm_unpacklo_epi32(input1, input1);

+  input2 = _mm_unpacklo_epi32(input2, input2);

+  input3 = _mm_unpacklo_epi32(input3, input3);

+  // Stage 1

+  input0 = _mm_madd_epi16(input0, cst);

+  input1 = _mm_madd_epi16(input1, cst);

+  input2 = _mm_madd_epi16(input2, cst);

+  input3 = _mm_madd_epi16(input3, cst);

+  input0 = _mm_add_epi32(input0, rounding);

+  input1 = _mm_add_epi32(input1, rounding);

+  input2 = _mm_add_epi32(input2, rounding);

+  input3 = _mm_add_epi32(input3, rounding);

+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);

+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);

+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);

+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

+  // Stage 2

+  input0 = _mm_packs_epi32(input0, zero);

+  input1 = _mm_packs_epi32(input1, zero);

+  input2 = _mm_packs_epi32(input2, zero);

+  input3 = _mm_packs_epi32(input3, zero);

+  // Transpose

+  input1 = _mm_unpacklo_epi16(input0, input1);

+  input3 = _mm_unpacklo_epi16(input2, input3);

+  input0 = _mm_unpacklo_epi32(input1, input3);

+  input1 = _mm_unpackhi_epi32(input1, input3);

+  // Switch column2, column 3, and then, we got:

+  // input2: column1, column 0;  input3: column2, column 3.

+  input1 = _mm_shuffle_epi32(input1, 0x4e);

+  input2 = _mm_add_epi16(input0, input1);

+  input3 = _mm_sub_epi16(input0, input1);

+  // Columns

+  // Construct i3, i1, i3, i1, i2, i0, i2, i0

+  input0 = _mm_shufflelo_epi16(input2, 0xd8);

+  input1 = _mm_shufflehi_epi16(input2, 0xd8);

+  input2 = _mm_shufflehi_epi16(input3, 0xd8);

+  input3 = _mm_shufflelo_epi16(input3, 0xd8);

+  input0 = _mm_unpacklo_epi32(input0, input0);

+  input1 = _mm_unpackhi_epi32(input1, input1);

+  input2 = _mm_unpackhi_epi32(input2, input2);

+  input3 = _mm_unpacklo_epi32(input3, input3);

+  // Stage 1

+  input0 = _mm_madd_epi16(input0, cst);

+  input1 = _mm_madd_epi16(input1, cst);

+  input2 = _mm_madd_epi16(input2, cst);

+  input3 = _mm_madd_epi16(input3, cst);

+  input0 = _mm_add_epi32(input0, rounding);

+  input1 = _mm_add_epi32(input1, rounding);

+  input2 = _mm_add_epi32(input2, rounding);

+  input3 = _mm_add_epi32(input3, rounding);

+  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);

+  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);

+  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);

+  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

+  // Stage 2

+  input0 = _mm_packs_epi32(input0, zero);

+  input1 = _mm_packs_epi32(input1, zero);

+  input2 = _mm_packs_epi32(input2, zero);

+  input3 = _mm_packs_epi32(input3, zero);

+  // Transpose

+  input1 = _mm_unpacklo_epi16(input0, input1);

+  input3 = _mm_unpacklo_epi16(input2, input3);

+  input0 = _mm_unpacklo_epi32(input1, input3);

+  input1 = _mm_unpackhi_epi32(input1, input3);

+  // Switch column2, column 3, and then, we got:

+  // input2: column1, column 0;  input3: column2, column 3.

+  input1 = _mm_shuffle_epi32(input1, 0x4e);

+  input2 = _mm_add_epi16(input0, input1);

+  input3 = _mm_sub_epi16(input0, input1);

+  // Final round and shift

+  input2 = _mm_add_epi16(input2, eight);

+  input3 = _mm_add_epi16(input3, eight);

+  input2 = _mm_srai_epi16(input2, 4);

+  input3 = _mm_srai_epi16(input3, 4);

+  // Store results

+  _mm_storel_epi64((__m128i *)output, input2);

+  input2 = _mm_srli_si128(input2, 8);

+  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);

+  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);

+  input3 = _mm_srli_si128(input3, 8);

+  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);

+}

+void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,

+                                    (int16_t)cospi_16_64, (int16_t)-cospi_16_64,

+                                    (int16_t)cospi_24_64, (int16_t)-cospi_8_64,

+                                    (int16_t)cospi_8_64, (int16_t)cospi_24_64);

+  const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  __m128i in, temp;

+  // Load input data.

+  in = _mm_loadl_epi64((__m128i *)input);

+  // Construct i3, i1, i3, i1, i2, i0, i2, i0

+  in = _mm_shufflelo_epi16(in, 0xd8);

+  in = _mm_unpacklo_epi32(in, in);

+  // Stage 1

+  in = _mm_madd_epi16(in, c1);

+  in = _mm_add_epi32(in, rounding);

+  in = _mm_srai_epi32(in, DCT_CONST_BITS);

+  in = _mm_packs_epi32(in, zero);

+  // Stage 2

+  temp = _mm_shufflelo_epi16(in, 0x9c);

+  in = _mm_shufflelo_epi16(in, 0xc9);

+  in = _mm_unpacklo_epi64(temp, in);

+  in = _mm_madd_epi16(in, c2);

+  in = _mm_packs_epi32(in, zero);

+  // Store results

+  _mm_storel_epi64((__m128i *)output, in);

+}

+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \

+                      out0, out1, out2, out3, out4, out5, out6, out7) \

+  {                                                     \

+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \

+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \

+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \

+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \

+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \

+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \

+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \

+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \

+                                                        \

+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \

+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \

+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \

+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \

+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \

+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \

+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \

+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \

+                                                            \

+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \

+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \

+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \

+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \

+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \

+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \

+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \

+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \

+  }

+#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \

+                      out0, out1, out2, out3, out4, out5, out6, out7) \

+  {                                                     \

+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \

+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \

+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \

+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \

+                                                        \

+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \

+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \

+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \

+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \

+                                                            \

+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \

+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \

+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \

+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \

+    out4 = out5 = out6 = out7 = zero; \

+  }

+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \

+  {                                                     \

+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \

+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \

+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \

+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \

+                                                        \

+    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */  \

+    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */  \

+    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */  \

+    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */  \

+  }

+// Define Macro for multiplying elements by constants and adding them together.

+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \

+                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \

+  {   \

+      tmp0 = _mm_madd_epi16(lo_0, cst0); \

+      tmp1 = _mm_madd_epi16(hi_0, cst0); \

+      tmp2 = _mm_madd_epi16(lo_0, cst1); \

+      tmp3 = _mm_madd_epi16(hi_0, cst1); \

+      tmp4 = _mm_madd_epi16(lo_1, cst2); \

+      tmp5 = _mm_madd_epi16(hi_1, cst2); \

+      tmp6 = _mm_madd_epi16(lo_1, cst3); \

+      tmp7 = _mm_madd_epi16(hi_1, cst3); \

+      \

+      tmp0 = _mm_add_epi32(tmp0, rounding); \

+      tmp1 = _mm_add_epi32(tmp1, rounding); \

+      tmp2 = _mm_add_epi32(tmp2, rounding); \

+      tmp3 = _mm_add_epi32(tmp3, rounding); \

+      tmp4 = _mm_add_epi32(tmp4, rounding); \

+      tmp5 = _mm_add_epi32(tmp5, rounding); \

+      tmp6 = _mm_add_epi32(tmp6, rounding); \

+      tmp7 = _mm_add_epi32(tmp7, rounding); \

+      \

+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \

+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \

+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \

+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \

+      tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \

+      tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \

+      tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \

+      tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \

+      \

+      res0 = _mm_packs_epi32(tmp0, tmp1); \

+      res1 = _mm_packs_epi32(tmp2, tmp3); \

+      res2 = _mm_packs_epi32(tmp4, tmp5); \

+      res3 = _mm_packs_epi32(tmp6, tmp7); \

+  }

+#define IDCT8x8_1D  \

+  /* Stage1 */      \

+  { \

+    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \

+    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \

+    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \

+    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \

+    \

+    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \

+                          stg1_1, stg1_2, stg1_3, stp1_4,      \

+                          stp1_7, stp1_5, stp1_6)              \

+  } \

+    \

+  /* Stage2 */ \

+  { \

+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \

+    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \

+    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \

+    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \

+    \

+    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \

+                           stg2_1, stg2_2, stg2_3, stp2_0,     \

+                           stp2_1, stp2_2, stp2_3)             \

+    \

+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \

+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \

+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \

+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \

+  } \

+    \

+  /* Stage3 */ \

+  { \

+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \

+    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \

+    \

+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \

+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \

+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \

+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \

+    \

+    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \

+    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \

+    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \

+    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \

+    \

+    tmp0 = _mm_add_epi32(tmp0, rounding); \

+    tmp1 = _mm_add_epi32(tmp1, rounding); \

+    tmp2 = _mm_add_epi32(tmp2, rounding); \

+    tmp3 = _mm_add_epi32(tmp3, rounding); \

+    \

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \

+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \

+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \

+    \

+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \

+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \

+  } \

+  \

+  /* Stage4  */ \

+  in0 = _mm_adds_epi16(stp1_0, stp2_7); \

+  in1 = _mm_adds_epi16(stp1_1, stp1_6); \

+  in2 = _mm_adds_epi16(stp1_2, stp1_5); \

+  in3 = _mm_adds_epi16(stp1_3, stp2_4); \

+  in4 = _mm_subs_epi16(stp1_3, stp2_4); \

+  in5 = _mm_subs_epi16(stp1_2, stp1_5); \

+  in6 = _mm_subs_epi16(stp1_1, stp1_6); \

+  in7 = _mm_subs_epi16(stp1_0, stp2_7);

+void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {

+  const int half_pitch = pitch >> 1;

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  const __m128i final_rounding = _mm_set1_epi16(1<<4);

+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);

+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);

+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);

+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);

+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);

+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);

+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;

+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;

+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

+  int i;

+  // Load input data.

+  in0 = _mm_load_si128((__m128i *)input);

+  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));

+  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));

+  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));

+  in4 = _mm_load_si128((__m128i *)(input + 8 * 4));

+  in5 = _mm_load_si128((__m128i *)(input + 8 * 5));

+  in6 = _mm_load_si128((__m128i *)(input + 8 * 6));

+  in7 = _mm_load_si128((__m128i *)(input + 8 * 7));

+  // 2-D

+  for (i = 0; i < 2; i++) {

+    // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()

+    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,

+                  in4, in5, in6, in7);

+    // 4-stage 1D idct8x8

+    IDCT8x8_1D

+  }

+  // Final rounding and shift

+  in0 = _mm_adds_epi16(in0, final_rounding);

+  in1 = _mm_adds_epi16(in1, final_rounding);

+  in2 = _mm_adds_epi16(in2, final_rounding);

+  in3 = _mm_adds_epi16(in3, final_rounding);

+  in4 = _mm_adds_epi16(in4, final_rounding);

+  in5 = _mm_adds_epi16(in5, final_rounding);

+  in6 = _mm_adds_epi16(in6, final_rounding);

+  in7 = _mm_adds_epi16(in7, final_rounding);

+  in0 = _mm_srai_epi16(in0, 5);

+  in1 = _mm_srai_epi16(in1, 5);

+  in2 = _mm_srai_epi16(in2, 5);

+  in3 = _mm_srai_epi16(in3, 5);

+  in4 = _mm_srai_epi16(in4, 5);

+  in5 = _mm_srai_epi16(in5, 5);

+  in6 = _mm_srai_epi16(in6, 5);

+  in7 = _mm_srai_epi16(in7, 5);

+  // Store results

+  _mm_store_si128((__m128i *)output, in0);

+  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

+  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

+  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

+  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

+  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

+  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

+  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

+}

+void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {

+  const int half_pitch = pitch >> 1;

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  const __m128i final_rounding = _mm_set1_epi16(1<<4);

+  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);

+  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

+  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);

+  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);

+  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);

+  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);

+  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);

+  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

+  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;

+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;

+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

+  // Rows. Load 4-row input data.

+  in0 = _mm_load_si128((__m128i *)input);

+  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));

+  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));

+  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));

+  // 8x4 Transpose

+  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

+  // Stage1

+  {

+    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);

+    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

+    tmp0 = _mm_madd_epi16(lo_17, stg1_0);

+    tmp2 = _mm_madd_epi16(lo_17, stg1_1);

+    tmp4 = _mm_madd_epi16(lo_35, stg1_2);

+    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

+    tmp0 = _mm_add_epi32(tmp0, rounding);

+    tmp2 = _mm_add_epi32(tmp2, rounding);

+    tmp4 = _mm_add_epi32(tmp4, rounding);

+    tmp6 = _mm_add_epi32(tmp6, rounding);

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);

+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

+    stp1_4 = _mm_packs_epi32(tmp0, zero);

+    stp1_7 = _mm_packs_epi32(tmp2, zero);

+    stp1_5 = _mm_packs_epi32(tmp4, zero);

+    stp1_6 = _mm_packs_epi32(tmp6, zero);

+  }

+  // Stage2

+  {

+    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);

+    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

+    tmp0 = _mm_madd_epi16(lo_04, stg2_0);

+    tmp2 = _mm_madd_epi16(lo_04, stg2_1);

+    tmp4 = _mm_madd_epi16(lo_26, stg2_2);

+    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

+    tmp0 = _mm_add_epi32(tmp0, rounding);

+    tmp2 = _mm_add_epi32(tmp2, rounding);

+    tmp4 = _mm_add_epi32(tmp4, rounding);

+    tmp6 = _mm_add_epi32(tmp6, rounding);

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);

+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

+    stp2_0 = _mm_packs_epi32(tmp0, zero);

+    stp2_1 = _mm_packs_epi32(tmp2, zero);

+    stp2_2 = _mm_packs_epi32(tmp4, zero);

+    stp2_3 = _mm_packs_epi32(tmp6, zero);

+    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);

+    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);

+    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);

+    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);

+  }

+  // Stage3

+  {

+    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);

+    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);

+    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);

+    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);

+    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

+    tmp0 = _mm_madd_epi16(lo_56, stg3_0);

+    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

+    tmp0 = _mm_add_epi32(tmp0, rounding);

+    tmp2 = _mm_add_epi32(tmp2, rounding);

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+    stp1_5 = _mm_packs_epi32(tmp0, zero);

+    stp1_6 = _mm_packs_epi32(tmp2, zero);

+  }

+  // Stage4

+  in0 = _mm_adds_epi16(stp1_0, stp2_7);

+  in1 = _mm_adds_epi16(stp1_1, stp1_6);

+  in2 = _mm_adds_epi16(stp1_2, stp1_5);

+  in3 = _mm_adds_epi16(stp1_3, stp2_4);

+  in4 = _mm_subs_epi16(stp1_3, stp2_4);

+  in5 = _mm_subs_epi16(stp1_2, stp1_5);

+  in6 = _mm_subs_epi16(stp1_1, stp1_6);

+  in7 = _mm_subs_epi16(stp1_0, stp2_7);

+  // Columns. 4x8 Transpose

+  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,

+                in4, in5, in6, in7)

+  // 1D idct8x8

+  IDCT8x8_1D

+  // Final rounding and shift

+  in0 = _mm_adds_epi16(in0, final_rounding);

+  in1 = _mm_adds_epi16(in1, final_rounding);

+  in2 = _mm_adds_epi16(in2, final_rounding);

+  in3 = _mm_adds_epi16(in3, final_rounding);

+  in4 = _mm_adds_epi16(in4, final_rounding);

+  in5 = _mm_adds_epi16(in5, final_rounding);

+  in6 = _mm_adds_epi16(in6, final_rounding);

+  in7 = _mm_adds_epi16(in7, final_rounding);

+  in0 = _mm_srai_epi16(in0, 5);

+  in1 = _mm_srai_epi16(in1, 5);

+  in2 = _mm_srai_epi16(in2, 5);

+  in3 = _mm_srai_epi16(in3, 5);

+  in4 = _mm_srai_epi16(in4, 5);

+  in5 = _mm_srai_epi16(in5, 5);

+  in6 = _mm_srai_epi16(in6, 5);

+  in7 = _mm_srai_epi16(in7, 5);

+  // Store results

+  _mm_store_si128((__m128i *)output, in0);

+  _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

+  _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

+  _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

+  _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

+  _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

+  _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

+  _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

+}

+#define IDCT16x16_1D \

+  /* Stage2 */ \

+  { \

+    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \

+    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \

+    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7);   \

+    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7);   \

+    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \

+    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \

+    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \

+    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \

+    \

+    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \

+                           stg2_0, stg2_1, stg2_2, stg2_3, \

+                           stp2_8, stp2_15, stp2_9, stp2_14) \

+    \

+    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \

+                           stg2_4, stg2_5, stg2_6, stg2_7, \

+                           stp2_10, stp2_13, stp2_11, stp2_12) \

+  } \

+    \

+  /* Stage3 */ \

+  { \

+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \

+    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \

+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \

+    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \

+    \

+    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \

+                           stg3_0, stg3_1, stg3_2, stg3_3, \

+                           stp1_4, stp1_7, stp1_5, stp1_6) \

+    \

+    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);  \

+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);    \

+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \

+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \

+    \

+    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \

+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \

+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \

+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \

+  } \

+  \

+  /* Stage4 */ \

+  { \

+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \

+    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \

+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \

+    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \

+    \

+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \

+    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \

+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \

+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \

+    \

+    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \

+                           stg4_0, stg4_1, stg4_2, stg4_3, \

+                           stp2_0, stp2_1, stp2_2, stp2_3) \

+    \

+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \

+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \

+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \

+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \

+    \

+    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \

+                           stg4_4, stg4_5, stg4_6, stg4_7, \

+                           stp2_9, stp2_14, stp2_10, stp2_13) \

+  } \

+    \

+  /* Stage5 */ \

+  { \

+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \

+    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \

+    \

+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \

+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \

+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \

+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \

+    \

+    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \

+    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \

+    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \

+    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \

+    \

+    tmp0 = _mm_add_epi32(tmp0, rounding); \

+    tmp1 = _mm_add_epi32(tmp1, rounding); \

+    tmp2 = _mm_add_epi32(tmp2, rounding); \

+    tmp3 = _mm_add_epi32(tmp3, rounding); \

+    \

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \

+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \

+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \

+    \

+    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \

+    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \

+    \

+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);  \

+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);    \

+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);   \

+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \

+    \

+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \

+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);   \

+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);   \

+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \

+  } \

+    \

+  /* Stage6 */ \

+  { \

+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \

+    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \

+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \

+    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \

+    \

+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \

+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \

+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \

+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \

+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \

+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \

+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \

+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \

+    \

+    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \

+                           stg6_0, stg4_0, stg6_0, stg4_0, \

+                           stp2_10, stp2_13, stp2_11, stp2_12) \

+  }

+void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {

+  const int half_pitch = pitch >> 1;

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  const __m128i final_rounding = _mm_set1_epi16(1<<5);

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);

+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);

+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);

+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);

+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);

+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);

+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);

+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);

+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);

+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);

+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);

+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);

+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);

+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);

+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

+  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,

+          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,

+          in10 = zero, in11 = zero, in12 = zero, in13 = zero,

+          in14 = zero, in15 = zero;

+  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,

+          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,

+          l12 = zero, l13 = zero, l14 = zero, l15 = zero;

+  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,

+          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,

+          r12 = zero, r13 = zero, r14 = zero, r15 = zero;

+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,

+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,

+          stp1_8_0, stp1_12_0;

+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,

+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;

+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

+  int i;

+  // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.

+  for (i = 0; i < 4; i++) {

+    // 1-D idct

+    if (i < 2) {

+      if (i == 1) input += 128;

+      // Load input data.

+      in0 = _mm_load_si128((__m128i *)input);

+      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));

+      in1 = _mm_load_si128((__m128i *)(input + 8 * 2));

+      in9 = _mm_load_si128((__m128i *)(input + 8 * 3));

+      in2 = _mm_load_si128((__m128i *)(input + 8 * 4));

+      in10 = _mm_load_si128((__m128i *)(input + 8 * 5));

+      in3 = _mm_load_si128((__m128i *)(input + 8 * 6));

+      in11 = _mm_load_si128((__m128i *)(input + 8 * 7));

+      in4 = _mm_load_si128((__m128i *)(input + 8 * 8));

+      in12 = _mm_load_si128((__m128i *)(input + 8 * 9));

+      in5 = _mm_load_si128((__m128i *)(input + 8 * 10));

+      in13 = _mm_load_si128((__m128i *)(input + 8 * 11));

+      in6 = _mm_load_si128((__m128i *)(input + 8 * 12));

+      in14 = _mm_load_si128((__m128i *)(input + 8 * 13));

+      in7 = _mm_load_si128((__m128i *)(input + 8 * 14));

+      in15 = _mm_load_si128((__m128i *)(input + 8 * 15));

+      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,

+                    in4, in5, in6, in7);

+      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,

+                    in10, in11, in12, in13, in14, in15);

+    }

+    if (i == 2) {

+      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,

+                    in5, in6, in7);

+      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,

+                    in13, in14, in15);

+    }

+    if (i == 3) {

+      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,

+                    in4, in5, in6, in7);

+      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,

+                    in12, in13, in14, in15);

+    }

+    IDCT16x16_1D

+    // Stage7

+    if (i == 0) {

+      // Left 8x16

+      l0 = _mm_add_epi16(stp2_0, stp1_15);

+      l1 = _mm_add_epi16(stp2_1, stp1_14);

+      l2 = _mm_add_epi16(stp2_2, stp2_13);

+      l3 = _mm_add_epi16(stp2_3, stp2_12);

+      l4 = _mm_add_epi16(stp2_4, stp2_11);

+      l5 = _mm_add_epi16(stp2_5, stp2_10);

+      l6 = _mm_add_epi16(stp2_6, stp1_9);

+      l7 = _mm_add_epi16(stp2_7, stp1_8);

+      l8 = _mm_sub_epi16(stp2_7, stp1_8);

+      l9 = _mm_sub_epi16(stp2_6, stp1_9);

+      l10 = _mm_sub_epi16(stp2_5, stp2_10);

+      l11 = _mm_sub_epi16(stp2_4, stp2_11);

+      l12 = _mm_sub_epi16(stp2_3, stp2_12);

+      l13 = _mm_sub_epi16(stp2_2, stp2_13);

+      l14 = _mm_sub_epi16(stp2_1, stp1_14);

+      l15 = _mm_sub_epi16(stp2_0, stp1_15);

+    } else if (i == 1) {

+      // Right 8x16

+      r0 = _mm_add_epi16(stp2_0, stp1_15);

+      r1 = _mm_add_epi16(stp2_1, stp1_14);

+      r2 = _mm_add_epi16(stp2_2, stp2_13);

+      r3 = _mm_add_epi16(stp2_3, stp2_12);

+      r4 = _mm_add_epi16(stp2_4, stp2_11);

+      r5 = _mm_add_epi16(stp2_5, stp2_10);

+      r6 = _mm_add_epi16(stp2_6, stp1_9);

+      r7 = _mm_add_epi16(stp2_7, stp1_8);

+      r8 = _mm_sub_epi16(stp2_7, stp1_8);

+      r9 = _mm_sub_epi16(stp2_6, stp1_9);

+      r10 = _mm_sub_epi16(stp2_5, stp2_10);

+      r11 = _mm_sub_epi16(stp2_4, stp2_11);

+      r12 = _mm_sub_epi16(stp2_3, stp2_12);

+      r13 = _mm_sub_epi16(stp2_2, stp2_13);

+      r14 = _mm_sub_epi16(stp2_1, stp1_14);

+      r15 = _mm_sub_epi16(stp2_0, stp1_15);

+    } else {

+      // 2-D

+      in0 = _mm_add_epi16(stp2_0, stp1_15);

+      in1 = _mm_add_epi16(stp2_1, stp1_14);

+      in2 = _mm_add_epi16(stp2_2, stp2_13);

+      in3 = _mm_add_epi16(stp2_3, stp2_12);

+      in4 = _mm_add_epi16(stp2_4, stp2_11);

+      in5 = _mm_add_epi16(stp2_5, stp2_10);

+      in6 = _mm_add_epi16(stp2_6, stp1_9);

+      in7 = _mm_add_epi16(stp2_7, stp1_8);

+      in8 = _mm_sub_epi16(stp2_7, stp1_8);

+      in9 = _mm_sub_epi16(stp2_6, stp1_9);

+      in10 = _mm_sub_epi16(stp2_5, stp2_10);

+      in11 = _mm_sub_epi16(stp2_4, stp2_11);

+      in12 = _mm_sub_epi16(stp2_3, stp2_12);

+      in13 = _mm_sub_epi16(stp2_2, stp2_13);

+      in14 = _mm_sub_epi16(stp2_1, stp1_14);

+      in15 = _mm_sub_epi16(stp2_0, stp1_15);

+      // Final rounding and shift

+      in0 = _mm_adds_epi16(in0, final_rounding);

+      in1 = _mm_adds_epi16(in1, final_rounding);

+      in2 = _mm_adds_epi16(in2, final_rounding);

+      in3 = _mm_adds_epi16(in3, final_rounding);

+      in4 = _mm_adds_epi16(in4, final_rounding);

+      in5 = _mm_adds_epi16(in5, final_rounding);

+      in6 = _mm_adds_epi16(in6, final_rounding);

+      in7 = _mm_adds_epi16(in7, final_rounding);

+      in8 = _mm_adds_epi16(in8, final_rounding);

+      in9 = _mm_adds_epi16(in9, final_rounding);

+      in10 = _mm_adds_epi16(in10, final_rounding);

+      in11 = _mm_adds_epi16(in11, final_rounding);

+      in12 = _mm_adds_epi16(in12, final_rounding);

+      in13 = _mm_adds_epi16(in13, final_rounding);

+      in14 = _mm_adds_epi16(in14, final_rounding);

+      in15 = _mm_adds_epi16(in15, final_rounding);

+      in0 = _mm_srai_epi16(in0, 6);

+      in1 = _mm_srai_epi16(in1, 6);

+      in2 = _mm_srai_epi16(in2, 6);

+      in3 = _mm_srai_epi16(in3, 6);

+      in4 = _mm_srai_epi16(in4, 6);

+      in5 = _mm_srai_epi16(in5, 6);

+      in6 = _mm_srai_epi16(in6, 6);

+      in7 = _mm_srai_epi16(in7, 6);

+      in8 = _mm_srai_epi16(in8, 6);

+      in9 = _mm_srai_epi16(in9, 6);

+      in10 = _mm_srai_epi16(in10, 6);

+      in11 = _mm_srai_epi16(in11, 6);

+      in12 = _mm_srai_epi16(in12, 6);

+      in13 = _mm_srai_epi16(in13, 6);

+      in14 = _mm_srai_epi16(in14, 6);

+      in15 = _mm_srai_epi16(in15, 6);

+      // Store results

+      _mm_store_si128((__m128i *)output, in0);

+      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

+      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

+      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

+      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

+      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

+      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

+      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

+      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);

+      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);

+      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);

+      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);

+      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);

+      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);

+      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);

+      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);

+      output += 8;

+    }

+  }

+}

+void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {

+  const int half_pitch = pitch >> 1;

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  const __m128i final_rounding = _mm_set1_epi16(1<<5);

+  const __m128i zero = _mm_setzero_si128();

+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);

+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);

+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);

+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);

+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);

+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);

+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);

+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);

+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);

+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);

+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);

+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);

+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);

+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);

+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

+  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

+  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,

+          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,

+          in10 = zero, in11 = zero, in12 = zero, in13 = zero,

+          in14 = zero, in15 = zero;

+  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,

+          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,

+          l12 = zero, l13 = zero, l14 = zero, l15 = zero;

+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,

+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,

+          stp1_8_0, stp1_12_0;

+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,

+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;

+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

+  int i;

+  // 1-D idct. Load input data.

+  in0 = _mm_load_si128((__m128i *)input);

+  in8 = _mm_load_si128((__m128i *)(input + 8 * 1));

+  in1 = _mm_load_si128((__m128i *)(input + 8 * 2));

+  in9 = _mm_load_si128((__m128i *)(input + 8 * 3));

+  in2 = _mm_load_si128((__m128i *)(input + 8 * 4));

+  in10 = _mm_load_si128((__m128i *)(input + 8 * 5));

+  in3 = _mm_load_si128((__m128i *)(input + 8 * 6));

+  in11 = _mm_load_si128((__m128i *)(input + 8 * 7));

+  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);

+  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);

+  // Stage2

+  {

+    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);

+    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);

+    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);

+    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);

+    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);

+    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);

+    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);

+    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);

+    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);

+    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);

+    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);

+    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

+    tmp0 = _mm_add_epi32(tmp0, rounding);

+    tmp2 = _mm_add_epi32(tmp2, rounding);

+    tmp4 = _mm_add_epi32(tmp4, rounding);

+    tmp6 = _mm_add_epi32(tmp6, rounding);

+    tmp1 = _mm_add_epi32(tmp1, rounding);

+    tmp3 = _mm_add_epi32(tmp3, rounding);

+    tmp5 = _mm_add_epi32(tmp5, rounding);

+    tmp7 = _mm_add_epi32(tmp7, rounding);

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);

+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);

+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);

+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);

+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

+    stp2_8 = _mm_packs_epi32(tmp0, zero);

+    stp2_15 = _mm_packs_epi32(tmp2, zero);

+    stp2_9 = _mm_packs_epi32(tmp4, zero);

+    stp2_14 = _mm_packs_epi32(tmp6, zero);

+    stp2_10 = _mm_packs_epi32(tmp1, zero);

+    stp2_13 = _mm_packs_epi32(tmp3, zero);

+    stp2_11 = _mm_packs_epi32(tmp5, zero);

+    stp2_12 = _mm_packs_epi32(tmp7, zero);

+  }

+  // Stage3

+  {

+    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);

+    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);

+    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);

+    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);

+    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);

+    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);

+    tmp0 = _mm_add_epi32(tmp0, rounding);

+    tmp2 = _mm_add_epi32(tmp2, rounding);

+    tmp4 = _mm_add_epi32(tmp4, rounding);

+    tmp6 = _mm_add_epi32(tmp6, rounding);

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);

+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

+    stp1_4 = _mm_packs_epi32(tmp0, zero);

+    stp1_7 = _mm_packs_epi32(tmp2, zero);

+    stp1_5 = _mm_packs_epi32(tmp4, zero);

+    stp1_6 = _mm_packs_epi32(tmp6, zero);

+    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);

+    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);

+    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);

+    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);

+    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);

+    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);

+    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);

+    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);

+  }

+  // Stage4

+  {

+    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);

+    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);

+    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);

+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

+    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);

+    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);

+    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);

+    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);

+    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);

+    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);

+    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);

+    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

+    tmp0 = _mm_add_epi32(tmp0, rounding);

+    tmp2 = _mm_add_epi32(tmp2, rounding);

+    tmp4 = _mm_add_epi32(tmp4, rounding);

+    tmp6 = _mm_add_epi32(tmp6, rounding);

+    tmp1 = _mm_add_epi32(tmp1, rounding);

+    tmp3 = _mm_add_epi32(tmp3, rounding);

+    tmp5 = _mm_add_epi32(tmp5, rounding);

+    tmp7 = _mm_add_epi32(tmp7, rounding);

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);

+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);

+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);

+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);

+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

+    stp2_0 = _mm_packs_epi32(tmp0, zero);

+    stp2_1 = _mm_packs_epi32(tmp2, zero);

+    stp2_2 = _mm_packs_epi32(tmp4, zero);

+    stp2_3 = _mm_packs_epi32(tmp6, zero);

+    stp2_9 = _mm_packs_epi32(tmp1, zero);

+    stp2_14 = _mm_packs_epi32(tmp3, zero);

+    stp2_10 = _mm_packs_epi32(tmp5, zero);

+    stp2_13 = _mm_packs_epi32(tmp7, zero);

+    stp2_4 = _mm_add_epi16(stp1_4, stp1_5);

+    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);

+    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);

+    stp2_7 = _mm_add_epi16(stp1_7, stp1_6);

+  }

+  // Stage5 and Stage6

+  {

+    stp1_0 = _mm_add_epi16(stp2_0, stp2_3);

+    stp1_1 = _mm_add_epi16(stp2_1, stp2_2);

+    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);

+    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

+    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);

+    stp1_9 = _mm_add_epi16(stp2_9, stp2_10);

+    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);

+    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);

+    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);

+    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);

+    stp1_14 = _mm_add_epi16(stp2_14, stp2_13);

+    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);

+  }

+  // Stage6

+  {

+    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);

+    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

+    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

+    tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);

+    tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);

+    tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);

+    tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);

+    tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);

+    tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);

+    tmp1 = _mm_add_epi32(tmp1, rounding);

+    tmp3 = _mm_add_epi32(tmp3, rounding);

+    tmp0 = _mm_add_epi32(tmp0, rounding);

+    tmp2 = _mm_add_epi32(tmp2, rounding);

+    tmp4 = _mm_add_epi32(tmp4, rounding);

+    tmp6 = _mm_add_epi32(tmp6, rounding);

+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);

+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);

+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);

+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

+    stp1_5 = _mm_packs_epi32(tmp1, zero);

+    stp1_6 = _mm_packs_epi32(tmp3, zero);

+    stp2_10 = _mm_packs_epi32(tmp0, zero);

+    stp2_13 = _mm_packs_epi32(tmp2, zero);

+    stp2_11 = _mm_packs_epi32(tmp4, zero);

+    stp2_12 = _mm_packs_epi32(tmp6, zero);

+    stp2_0 = _mm_add_epi16(stp1_0, stp2_7);

+    stp2_1 = _mm_add_epi16(stp1_1, stp1_6);

+    stp2_2 = _mm_add_epi16(stp1_2, stp1_5);

+    stp2_3 = _mm_add_epi16(stp1_3, stp2_4);

+    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);

+    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);

+    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);

+    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);

+  }

+  // Stage7. Left 8x16 only.

+  l0 = _mm_add_epi16(stp2_0, stp1_15);

+  l1 = _mm_add_epi16(stp2_1, stp1_14);

+  l2 = _mm_add_epi16(stp2_2, stp2_13);

+  l3 = _mm_add_epi16(stp2_3, stp2_12);

+  l4 = _mm_add_epi16(stp2_4, stp2_11);

+  l5 = _mm_add_epi16(stp2_5, stp2_10);

+  l6 = _mm_add_epi16(stp2_6, stp1_9);

+  l7 = _mm_add_epi16(stp2_7, stp1_8);

+  l8 = _mm_sub_epi16(stp2_7, stp1_8);

+  l9 = _mm_sub_epi16(stp2_6, stp1_9);

+  l10 = _mm_sub_epi16(stp2_5, stp2_10);

+  l11 = _mm_sub_epi16(stp2_4, stp2_11);

+  l12 = _mm_sub_epi16(stp2_3, stp2_12);

+  l13 = _mm_sub_epi16(stp2_2, stp2_13);

+  l14 = _mm_sub_epi16(stp2_1, stp1_14);

+  l15 = _mm_sub_epi16(stp2_0, stp1_15);

+  // 2-D idct. We do 2 8x16 blocks.

+  for (i = 0; i < 2; i++) {

+    if (i == 0)

+      TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,

+                    in5, in6, in7);

+    if (i == 1)

+      TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,

+                    in4, in5, in6, in7);

+    in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;

+    IDCT16x16_1D

+    // Stage7

+    in0 = _mm_add_epi16(stp2_0, stp1_15);

+    in1 = _mm_add_epi16(stp2_1, stp1_14);

+    in2 = _mm_add_epi16(stp2_2, stp2_13);

+    in3 = _mm_add_epi16(stp2_3, stp2_12);

+    in4 = _mm_add_epi16(stp2_4, stp2_11);

+    in5 = _mm_add_epi16(stp2_5, stp2_10);

+    in6 = _mm_add_epi16(stp2_6, stp1_9);

+    in7 = _mm_add_epi16(stp2_7, stp1_8);

+    in8 = _mm_sub_epi16(stp2_7, stp1_8);

+    in9 = _mm_sub_epi16(stp2_6, stp1_9);

+    in10 = _mm_sub_epi16(stp2_5, stp2_10);

+    in11 = _mm_sub_epi16(stp2_4, stp2_11);

+    in12 = _mm_sub_epi16(stp2_3, stp2_12);

+    in13 = _mm_sub_epi16(stp2_2, stp2_13);

+    in14 = _mm_sub_epi16(stp2_1, stp1_14);

+    in15 = _mm_sub_epi16(stp2_0, stp1_15);

+    // Final rounding and shift

+    in0 = _mm_adds_epi16(in0, final_rounding);

+    in1 = _mm_adds_epi16(in1, final_rounding);

+    in2 = _mm_adds_epi16(in2, final_rounding);

+    in3 = _mm_adds_epi16(in3, final_rounding);

+    in4 = _mm_adds_epi16(in4, final_rounding);

+    in5 = _mm_adds_epi16(in5, final_rounding);

+    in6 = _mm_adds_epi16(in6, final_rounding);

+    in7 = _mm_adds_epi16(in7, final_rounding);

+    in8 = _mm_adds_epi16(in8, final_rounding);

+    in9 = _mm_adds_epi16(in9, final_rounding);

+    in10 = _mm_adds_epi16(in10, final_rounding);

+    in11 = _mm_adds_epi16(in11, final_rounding);

+    in12 = _mm_adds_epi16(in12, final_rounding);

+    in13 = _mm_adds_epi16(in13, final_rounding);

+    in14 = _mm_adds_epi16(in14, final_rounding);

+    in15 = _mm_adds_epi16(in15, final_rounding);

+    in0 = _mm_srai_epi16(in0, 6);

+    in1 = _mm_srai_epi16(in1, 6);

+    in2 = _mm_srai_epi16(in2, 6);

+    in3 = _mm_srai_epi16(in3, 6);

+    in4 = _mm_srai_epi16(in4, 6);

+    in5 = _mm_srai_epi16(in5, 6);

+    in6 = _mm_srai_epi16(in6, 6);

+    in7 = _mm_srai_epi16(in7, 6);

+    in8 = _mm_srai_epi16(in8, 6);

+    in9 = _mm_srai_epi16(in9, 6);

+    in10 = _mm_srai_epi16(in10, 6);

+    in11 = _mm_srai_epi16(in11, 6);

+    in12 = _mm_srai_epi16(in12, 6);

+    in13 = _mm_srai_epi16(in13, 6);

+    in14 = _mm_srai_epi16(in14, 6);

+    in15 = _mm_srai_epi16(in15, 6);

+    // Store results

+    _mm_store_si128((__m128i *)output, in0);

+    _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

+    _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

+    _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

+    _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

+    _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

+    _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

+    _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

+    _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);

+    _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);

+    _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);

+    _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);

+    _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);

+    _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);

+    _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);

+    _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);

+    output += 8;

+  }

+}

+void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {

+  const int half_pitch = pitch >> 1;

+  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  const __m128i final_rounding = _mm_set1_epi16(1<<5);

+  // idct constants for each stage

+  const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);

+  const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);

+  const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);

+  const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);

+  const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);

+  const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);

+  const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);

+  const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);

+  const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);

+  const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);

+  const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);

+  const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);

+  const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);

+  const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);

+  const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);

+  const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);

+  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);

+  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);

+  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);

+  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);

+  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);

+  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);

+  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);

+  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

+  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);

+  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);

+  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);

+  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

+  const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);

+  const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);

+  const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);

+  const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);

+  const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);

+  const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);

+  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);

+  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);

+  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);

+  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

+  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);

+  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);

+  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

+  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

+  __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,

+          in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,

+          in24, in25, in26, in27, in28, in29, in30, in31;

+  __m128i col[128];

+  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,

+          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,

+          stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,

+          stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,

+          stp1_30, stp1_31;

+  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,

+          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,

+          stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,

+          stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,

+          stp2_30, stp2_31;

+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

+  int i, j;

+  // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.

+  for (i = 0; i < 8; i++) {

+    if (i < 4) {

+      // First 1-D idct

+      // Load input data.

+      in0 = _mm_load_si128((__m128i *)input);

+      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));

+      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));

+      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));

+      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));

+      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));

+      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));

+      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));

+      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));

+      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));

+      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));

+      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));

+      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));

+      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));

+      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));

+      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));

+      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));

+      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));

+      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));

+      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));

+      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));

+      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));

+      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));

+      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));

+      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));

+      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));

+      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));

+      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));

+      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));

+      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));

+      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));

+      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));

+      input += 256;

+      // Transpose 32x8 block to 8x32 block

+      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,

+                    in4, in5, in6, in7);

+      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,

+                    in10, in11, in12, in13, in14, in15);

+      TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,

+                    in18, in19, in20, in21, in22, in23);

+      TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,

+                    in26, in27, in28, in29, in30, in31);

+    } else {

+      // Second 1-D idct

+      j = i - 4;

+      // Transpose 32x8 block to 8x32 block

+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],

+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],

+                    col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,

+                    in5, in6, in7);

+      j += 4;

+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],

+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],

+                    col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,

+                    in11, in12, in13, in14, in15);

+      j += 4;

+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],

+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],

+                    col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,

+                    in19, in20, in21, in22, in23);

+      j += 4;

+      TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],

+                    col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],

+                    col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,

+                    in28, in29, in30, in31);

+    }

+    // Stage1

+    {

+      const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);

+      const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);

+      const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);

+      const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);

+      const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);

+      const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);

+      const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);

+      const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);

+      const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);

+      const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);

+      const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);

+      const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);

+      const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);

+      const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);

+      const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);

+      const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);

+      MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,

+                             stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,

+                             stp1_17, stp1_30)

+      MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,

+                             stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,

+                             stp1_19, stp1_28)

+      MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,

+                             stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,

+                             stp1_21, stp1_26)

+      MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,

+                             stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,

+                             stp1_23, stp1_24)

+    }

+    // Stage2

+    {

+      const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);

+      const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);

+      const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);

+      const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);

+      const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);

+      const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);

+      const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);

+      const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);

+      MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,

+                             stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,

+                             stp2_14)

+      MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,

+                             stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,

+                             stp2_11, stp2_12)

+      stp2_16 = _mm_add_epi16(stp1_16, stp1_17);

+      stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);

+      stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);

+      stp2_19 = _mm_add_epi16(stp1_19, stp1_18);

+      stp2_20 = _mm_add_epi16(stp1_20, stp1_21);

+      stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);

+      stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);

+      stp2_23 = _mm_add_epi16(stp1_23, stp1_22);

+      stp2_24 = _mm_add_epi16(stp1_24, stp1_25);

+      stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);

+      stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);

+      stp2_27 = _mm_add_epi16(stp1_27, stp1_26);

+      stp2_28 = _mm_add_epi16(stp1_28, stp1_29);

+      stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);

+      stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);

+      stp2_31 = _mm_add_epi16(stp1_31, stp1_30);

+    }

+    // Stage3

+    {

+      const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);

+      const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);

+      const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);

+      const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);

+      const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);

+      const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);

+      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);

+      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);

+      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);

+      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);

+      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);

+      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);

+      MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,

+                             stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,

+                             stp1_6)

+      stp1_8 = _mm_add_epi16(stp2_8, stp2_9);

+      stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);

+      stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);

+      stp1_11 = _mm_add_epi16(stp2_11, stp2_10);

+      stp1_12 = _mm_add_epi16(stp2_12, stp2_13);

+      stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);

+      stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);

+      stp1_15 = _mm_add_epi16(stp2_15, stp2_14);

+      MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,

+                             stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,

+                             stp1_18, stp1_29)

+      MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,

+                             stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,

+                             stp1_22, stp1_25)

+      stp1_16 = stp2_16;

+      stp1_31 = stp2_31;

+      stp1_19 = stp2_19;

+      stp1_20 = stp2_20;

+      stp1_23 = stp2_23;

+      stp1_24 = stp2_24;

+      stp1_27 = stp2_27;

+      stp1_28 = stp2_28;

+    }

+    // Stage4

+    {

+      const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);

+      const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);

+      const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);

+      const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);

+      const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);

+      const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);

+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);

+      MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,

+                             stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,

+                             stp2_2, stp2_3)

+      stp2_4 = _mm_add_epi16(stp1_4, stp1_5);

+      stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);

+      stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);

+      stp2_7 = _mm_add_epi16(stp1_7, stp1_6);

+      MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,

+                             stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,

+                             stp2_10, stp2_13)

+      stp2_8 = stp1_8;

+      stp2_15 = stp1_15;

+      stp2_11 = stp1_11;

+      stp2_12 = stp1_12;

+      stp2_16 = _mm_add_epi16(stp1_16, stp1_19);

+      stp2_17 = _mm_add_epi16(stp1_17, stp1_18);

+      stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);

+      stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);

+      stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);

+      stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);

+      stp2_22 = _mm_add_epi16(stp1_22, stp1_21);

+      stp2_23 = _mm_add_epi16(stp1_23, stp1_20);

+      stp2_24 = _mm_add_epi16(stp1_24, stp1_27);

+      stp2_25 = _mm_add_epi16(stp1_25, stp1_26);

+      stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);

+      stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);

+      stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);

+      stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);

+      stp2_30 = _mm_add_epi16(stp1_29, stp1_30);

+      stp2_31 = _mm_add_epi16(stp1_28, stp1_31);

+    }

+    // Stage5

+    {

+      const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);

+      const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);

+      const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);

+      const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);

+      const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);

+      const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);

+      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);

+      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);

+      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);

+      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);

+      stp1_0 = _mm_add_epi16(stp2_0, stp2_3);

+      stp1_1 = _mm_add_epi16(stp2_1, stp2_2);

+      stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);

+      stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);

+      tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);

+      tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);

+      tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);

+      tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);

+      tmp0 = _mm_add_epi32(tmp0, rounding);

+      tmp1 = _mm_add_epi32(tmp1, rounding);

+      tmp2 = _mm_add_epi32(tmp2, rounding);

+      tmp3 = _mm_add_epi32(tmp3, rounding);

+      tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);

+      tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);

+      tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

+      tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);

+      stp1_5 = _mm_packs_epi32(tmp0, tmp1);

+      stp1_6 = _mm_packs_epi32(tmp2, tmp3);

+      stp1_4 = stp2_4;

+      stp1_7 = stp2_7;

+      stp1_8 = _mm_add_epi16(stp2_8, stp2_11);

+      stp1_9 = _mm_add_epi16(stp2_9, stp2_10);

+      stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);

+      stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);

+      stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);

+      stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);

+      stp1_14 = _mm_add_epi16(stp2_14, stp2_13);

+      stp1_15 = _mm_add_epi16(stp2_15, stp2_12);

+      stp1_16 = stp2_16;

+      stp1_17 = stp2_17;

+      MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,

+                             stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,

+                             stp1_19, stp1_28)

+      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,

+                             stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,

+                             stp1_21, stp1_26)

+      stp1_22 = stp2_22;

+      stp1_23 = stp2_23;

+      stp1_24 = stp2_24;

+      stp1_25 = stp2_25;

+      stp1_30 = stp2_30;

+      stp1_31 = stp2_31;

+    }

+    // Stage6

+    {

+      const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

+      const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);

+      const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);

+      const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);

+      stp2_0 = _mm_add_epi16(stp1_0, stp1_7);

+      stp2_1 = _mm_add_epi16(stp1_1, stp1_6);

+      stp2_2 = _mm_add_epi16(stp1_2, stp1_5);

+      stp2_3 = _mm_add_epi16(stp1_3, stp1_4);

+      stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);

+      stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);

+      stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);

+      stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);

+      stp2_8 = stp1_8;

+      stp2_9 = stp1_9;

+      stp2_14 = stp1_14;

+      stp2_15 = stp1_15;

+      MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,

+                             stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,

+                             stp2_13, stp2_11, stp2_12)

+      stp2_16 = _mm_add_epi16(stp1_16, stp1_23);

+      stp2_17 = _mm_add_epi16(stp1_17, stp1_22);

+      stp2_18 = _mm_add_epi16(stp1_18, stp1_21);

+      stp2_19 = _mm_add_epi16(stp1_19, stp1_20);

+      stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);

+      stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);

+      stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);

+      stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);

+      stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);

+      stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);

+      stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);

+      stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);

+      stp2_28 = _mm_add_epi16(stp1_27, stp1_28);

+      stp2_29 = _mm_add_epi16(stp1_26, stp1_29);

+      stp2_30 = _mm_add_epi16(stp1_25, stp1_30);

+      stp2_31 = _mm_add_epi16(stp1_24, stp1_31);

+    }

+    // Stage7

+    {

+      const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);

+      const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);

+      const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);

+      const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);

+      const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);

+      const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);

+      const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);

+      const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);

+      stp1_0 = _mm_add_epi16(stp2_0, stp2_15);

+      stp1_1 = _mm_add_epi16(stp2_1, stp2_14);

+      stp1_2 = _mm_add_epi16(stp2_2, stp2_13);

+      stp1_3 = _mm_add_epi16(stp2_3, stp2_12);

+      stp1_4 = _mm_add_epi16(stp2_4, stp2_11);

+      stp1_5 = _mm_add_epi16(stp2_5, stp2_10);

+      stp1_6 = _mm_add_epi16(stp2_6, stp2_9);

+      stp1_7 = _mm_add_epi16(stp2_7, stp2_8);

+      stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);

+      stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);

+      stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);

+      stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);

+      stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);

+      stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);

+      stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);

+      stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);

+      stp1_16 = stp2_16;

+      stp1_17 = stp2_17;

+      stp1_18 = stp2_18;

+      stp1_19 = stp2_19;

+      MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,

+                             stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,

+                             stp1_21, stp1_26)

+      MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,

+                             stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,

+                             stp1_23, stp1_24)

+      stp1_28 = stp2_28;

+      stp1_29 = stp2_29;

+      stp1_30 = stp2_30;

+      stp1_31 = stp2_31;

+    }

+    // final stage

+    if (i < 4) {

+      // 1_D: Store 32 intermediate results for each 8x32 block.

+      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);

+      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);

+      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);

+      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);

+      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);

+      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);

+      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);

+      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);

+      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);

+      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);

+      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);

+      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);

+      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);

+      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);

+      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);

+      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);

+      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);

+      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);

+      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);

+      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);

+      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);

+      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);

+      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);

+      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);

+      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);

+      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);

+      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);

+      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);

+      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);

+      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);

+      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);

+      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);

+    } else {

+      // 2_D: Calculate the results and store them to destination.

+      in0 = _mm_add_epi16(stp1_0, stp1_31);

+      in1 = _mm_add_epi16(stp1_1, stp1_30);

+      in2 = _mm_add_epi16(stp1_2, stp1_29);

+      in3 = _mm_add_epi16(stp1_3, stp1_28);

+      in4 = _mm_add_epi16(stp1_4, stp1_27);

+      in5 = _mm_add_epi16(stp1_5, stp1_26);

+      in6 = _mm_add_epi16(stp1_6, stp1_25);

+      in7 = _mm_add_epi16(stp1_7, stp1_24);

+      in8 = _mm_add_epi16(stp1_8, stp1_23);

+      in9 = _mm_add_epi16(stp1_9, stp1_22);

+      in10 = _mm_add_epi16(stp1_10, stp1_21);

+      in11 = _mm_add_epi16(stp1_11, stp1_20);

+      in12 = _mm_add_epi16(stp1_12, stp1_19);

+      in13 = _mm_add_epi16(stp1_13, stp1_18);

+      in14 = _mm_add_epi16(stp1_14, stp1_17);

+      in15 = _mm_add_epi16(stp1_15, stp1_16);

+      in16 = _mm_sub_epi16(stp1_15, stp1_16);

+      in17 = _mm_sub_epi16(stp1_14, stp1_17);

+      in18 = _mm_sub_epi16(stp1_13, stp1_18);

+      in19 = _mm_sub_epi16(stp1_12, stp1_19);

+      in20 = _mm_sub_epi16(stp1_11, stp1_20);

+      in21 = _mm_sub_epi16(stp1_10, stp1_21);

+      in22 = _mm_sub_epi16(stp1_9, stp1_22);

+      in23 = _mm_sub_epi16(stp1_8, stp1_23);

+      in24 = _mm_sub_epi16(stp1_7, stp1_24);

+      in25 = _mm_sub_epi16(stp1_6, stp1_25);

+      in26 = _mm_sub_epi16(stp1_5, stp1_26);

+      in27 = _mm_sub_epi16(stp1_4, stp1_27);

+      in28 = _mm_sub_epi16(stp1_3, stp1_28);

+      in29 = _mm_sub_epi16(stp1_2, stp1_29);

+      in30 = _mm_sub_epi16(stp1_1, stp1_30);

+      in31 = _mm_sub_epi16(stp1_0, stp1_31);

+      // Final rounding and shift

+      in0 = _mm_adds_epi16(in0, final_rounding);

+      in1 = _mm_adds_epi16(in1, final_rounding);

+      in2 = _mm_adds_epi16(in2, final_rounding);

+      in3 = _mm_adds_epi16(in3, final_rounding);

+      in4 = _mm_adds_epi16(in4, final_rounding);

+      in5 = _mm_adds_epi16(in5, final_rounding);

+      in6 = _mm_adds_epi16(in6, final_rounding);

+      in7 = _mm_adds_epi16(in7, final_rounding);

+      in8 = _mm_adds_epi16(in8, final_rounding);

+      in9 = _mm_adds_epi16(in9, final_rounding);

+      in10 = _mm_adds_epi16(in10, final_rounding);

+      in11 = _mm_adds_epi16(in11, final_rounding);

+      in12 = _mm_adds_epi16(in12, final_rounding);

+      in13 = _mm_adds_epi16(in13, final_rounding);

+      in14 = _mm_adds_epi16(in14, final_rounding);

+      in15 = _mm_adds_epi16(in15, final_rounding);

+      in16 = _mm_adds_epi16(in16, final_rounding);

+      in17 = _mm_adds_epi16(in17, final_rounding);

+      in18 = _mm_adds_epi16(in18, final_rounding);

+      in19 = _mm_adds_epi16(in19, final_rounding);

+      in20 = _mm_adds_epi16(in20, final_rounding);

+      in21 = _mm_adds_epi16(in21, final_rounding);

+      in22 = _mm_adds_epi16(in22, final_rounding);

+      in23 = _mm_adds_epi16(in23, final_rounding);

+      in24 = _mm_adds_epi16(in24, final_rounding);

+      in25 = _mm_adds_epi16(in25, final_rounding);

+      in26 = _mm_adds_epi16(in26, final_rounding);

+      in27 = _mm_adds_epi16(in27, final_rounding);

+      in28 = _mm_adds_epi16(in28, final_rounding);

+      in29 = _mm_adds_epi16(in29, final_rounding);

+      in30 = _mm_adds_epi16(in30, final_rounding);

+      in31 = _mm_adds_epi16(in31, final_rounding);

+      in0 = _mm_srai_epi16(in0, 6);

+      in1 = _mm_srai_epi16(in1, 6);

+      in2 = _mm_srai_epi16(in2, 6);

+      in3 = _mm_srai_epi16(in3, 6);

+      in4 = _mm_srai_epi16(in4, 6);

+      in5 = _mm_srai_epi16(in5, 6);

+      in6 = _mm_srai_epi16(in6, 6);

+      in7 = _mm_srai_epi16(in7, 6);

+      in8 = _mm_srai_epi16(in8, 6);

+      in9 = _mm_srai_epi16(in9, 6);

+      in10 = _mm_srai_epi16(in10, 6);

+      in11 = _mm_srai_epi16(in11, 6);

+      in12 = _mm_srai_epi16(in12, 6);

+      in13 = _mm_srai_epi16(in13, 6);

+      in14 = _mm_srai_epi16(in14, 6);

+      in15 = _mm_srai_epi16(in15, 6);

+      in16 = _mm_srai_epi16(in16, 6);

+      in17 = _mm_srai_epi16(in17, 6);

+      in18 = _mm_srai_epi16(in18, 6);

+      in19 = _mm_srai_epi16(in19, 6);

+      in20 = _mm_srai_epi16(in20, 6);

+      in21 = _mm_srai_epi16(in21, 6);

+      in22 = _mm_srai_epi16(in22, 6);

+      in23 = _mm_srai_epi16(in23, 6);

+      in24 = _mm_srai_epi16(in24, 6);

+      in25 = _mm_srai_epi16(in25, 6);

+      in26 = _mm_srai_epi16(in26, 6);

+      in27 = _mm_srai_epi16(in27, 6);

+      in28 = _mm_srai_epi16(in28, 6);

+      in29 = _mm_srai_epi16(in29, 6);

+      in30 = _mm_srai_epi16(in30, 6);

+      in31 = _mm_srai_epi16(in31, 6);

+      // Store results

+      _mm_store_si128((__m128i *)output, in0);

+      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);

+      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);

+      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);

+      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);

+      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);

+      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);

+      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);

+      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);

+      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);

+      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);

+      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);

+      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);

+      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);

+      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);

+      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);

+      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);

+      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);

+      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);

+      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);

+      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);

+      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);

+      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);

+      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);

+      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);

+      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);

+      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);

+      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);

+      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);

+      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);

+      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);

+      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);

+      output += 8;

+    }

+  }

+}

+#endif

--- a/vp9/common/x86/vp9_idct_x86.h

+++ b/vp9/common/x86/vp9_idct_x86.h

@@ -20,23 +20,10 @@

*/

 #if HAVE_MMX

-extern prototype_idct(vp9_short_idct4x4llm_1_mmx);

-extern prototype_idct(vp9_short_idct4x4llm_mmx);

-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);

 extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);

 extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);

 #if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp9_idct_idct1

-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx

-#undef  vp9_idct_idct16

-#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx

-#undef  vp9_idct_idct1_scalar_add

-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx

 #undef vp9_idct_iwalsh16

 #define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx

--- a/vp9/common/x86/vp9_idctllm_mmx.asm

+++ /dev/null

@@ -1,241 +1,0 @@

-;

-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "third_party/x86inc/x86inc.asm"

-SECTION_RODATA

-align 16

-x_s1sqr2:      times 4 dw 0x8A8C

-align 16

-x_c1sqr2less1: times 4 dw 0x4E7B

-align 16

-pw_16:         times 4 dw 16

-SECTION .text

-; /****************************************************************************

-; * Notes:

-; *

-; * This implementation makes use of 16 bit fixed point version of two multiply

-; * constants:

-; *        1.   sqrt(2) * cos (pi/8)

-; *        2.   sqrt(2) * sin (pi/8)

-; * Because the first constant is bigger than 1, to maintain the same 16 bit

-; * fixed point precision as the second one, we use a trick of

-; *        x * a = x + x*(a-1)

-; * so

-; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).

-; *

-; * For the second constant, because of the 16bit version is 35468, which

-; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative

-; * number.

-; *        (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x

-; *

-; **************************************************************************/

-INIT_MMX

-;void short_idct4x4llm_mmx(short *input, short *output, int pitch)

-cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit

-    mova            m0,     [inpq +0]

-    mova            m1,     [inpq +8]

-    mova            m2,     [inpq+16]

-    mova            m3,     [inpq+24]

-    psubw           m0,      m2             ; b1= 0-2

-    paddw           m2,      m2             ;

-    mova            m5,      m1

-    paddw           m2,      m0             ; a1 =0+2

-    pmulhw          m5,     [x_s1sqr2]       ;

-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)

-    mova            m7,      m3             ;

-    pmulhw          m7,     [x_c1sqr2less1]   ;

-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)

-    psubw           m7,      m5             ; c1

-    mova            m5,      m1

-    mova            m4,      m3

-    pmulhw          m5,     [x_c1sqr2less1]

-    paddw           m5,      m1

-    pmulhw          m3,     [x_s1sqr2]

-    paddw           m3,      m4

-    paddw           m3,      m5             ; d1

-    mova            m6,      m2             ; a1

-    mova            m4,      m0             ; b1

-    paddw           m2,      m3             ;0

-    paddw           m4,      m7             ;1

-    psubw           m0,      m7             ;2

-    psubw           m6,      m3             ;3

-    mova            m1,      m2             ; 03 02 01 00

-    mova            m3,      m4             ; 23 22 21 20

-    punpcklwd       m1,      m0             ; 11 01 10 00

-    punpckhwd       m2,      m0             ; 13 03 12 02

-    punpcklwd       m3,      m6             ; 31 21 30 20

-    punpckhwd       m4,      m6             ; 33 23 32 22

-    mova            m0,      m1             ; 11 01 10 00

-    mova            m5,      m2             ; 13 03 12 02

-    punpckldq       m0,      m3             ; 30 20 10 00

-    punpckhdq       m1,      m3             ; 31 21 11 01

-    punpckldq       m2,      m4             ; 32 22 12 02

-    punpckhdq       m5,      m4             ; 33 23 13 03

-    mova            m3,      m5             ; 33 23 13 03

-    psubw           m0,      m2             ; b1= 0-2

-    paddw           m2,      m2             ;

-    mova            m5,      m1

-    paddw           m2,      m0             ; a1 =0+2

-    pmulhw          m5,     [x_s1sqr2]        ;

-    paddw           m5,      m1             ; ip1 * sin(pi/8) * sqrt(2)

-    mova            m7,      m3             ;

-    pmulhw          m7,     [x_c1sqr2less1]   ;

-    paddw           m7,      m3             ; ip3 * cos(pi/8) * sqrt(2)

-    psubw           m7,      m5             ; c1

-    mova            m5,      m1

-    mova            m4,      m3

-    pmulhw          m5,     [x_c1sqr2less1]

-    paddw           m5,      m1

-    pmulhw          m3,     [x_s1sqr2]

-    paddw           m3,      m4

-    paddw           m3,      m5             ; d1

-    paddw           m0,     [pw_16]

-    paddw           m2,     [pw_16]

-    mova            m6,      m2             ; a1

-    mova            m4,      m0             ; b1

-    paddw           m2,      m3             ;0

-    paddw           m4,      m7             ;1

-    psubw           m0,      m7             ;2

-    psubw           m6,      m3             ;3

-    psraw           m2,      5

-    psraw           m0,      5

-    psraw           m4,      5

-    psraw           m6,      5

-    mova            m1,      m2             ; 03 02 01 00

-    mova            m3,      m4             ; 23 22 21 20

-    punpcklwd       m1,      m0             ; 11 01 10 00

-    punpckhwd       m2,      m0             ; 13 03 12 02

-    punpcklwd       m3,      m6             ; 31 21 30 20

-    punpckhwd       m4,      m6             ; 33 23 32 22

-    mova            m0,      m1             ; 11 01 10 00

-    mova            m5,      m2             ; 13 03 12 02

-    punpckldq       m0,      m3             ; 30 20 10 00

-    punpckhdq       m1,      m3             ; 31 21 11 01

-    punpckldq       m2,      m4             ; 32 22 12 02

-    punpckhdq       m5,      m4             ; 33 23 13 03

-    mova        [outq],      m0

-    mova     [outq+r2],      m1

-    mova [outq+pitq*2],      m2

-    add           outq,      pitq

-    mova [outq+pitq*2],      m5

-    RET

-;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)

-cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit

-    movh            m0,     [inpq]

-    paddw           m0,     [pw_16]

-    psraw           m0,      5

-    punpcklwd       m0,      m0

-    punpckldq       m0,      m0

-    mova        [outq],      m0

-    mova   [outq+pitq],      m0

-    mova [outq+pitq*2],      m0

-    add             r1,      r2

-    mova [outq+pitq*2],      m0

-    RET

-;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)

-cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride

-%if ARCH_X86_64

-    movsxd         strideq,      dword stridem

-%else

-    mov            strideq,      stridem

-%endif

-    pxor                m0,      m0

-    movh                m5,      in_dcq ; dc

-    paddw               m5,     [pw_16]

-    psraw               m5,      5

-    punpcklwd           m5,      m5

-    punpckldq           m5,      m5

-    movh                m1,     [predq]

-    punpcklbw           m1,      m0

-    paddsw              m1,      m5

-    packuswb            m1,      m0              ; pack and unpack to saturate

-    movh            [dstq],      m1

-    movh                m2,     [predq+pitq]

-    punpcklbw           m2,      m0

-    paddsw              m2,      m5

-    packuswb            m2,      m0              ; pack and unpack to saturate

-    movh    [dstq+strideq],      m2

-    movh                m3,     [predq+2*pitq]

-    punpcklbw           m3,      m0

-    paddsw              m3,      m5

-    packuswb            m3,      m0              ; pack and unpack to saturate

-    movh  [dstq+2*strideq],      m3

-    add               dstq,      strideq

-    add              predq,      pitq

-    movh                m4,     [predq+2*pitq]

-    punpcklbw           m4,      m0

-    paddsw              m4,      m5

-    packuswb            m4,      m0              ; pack and unpack to saturate

-    movh  [dstq+2*strideq],      m4

-    RET

--- a/vp9/common/x86/vp9_idctllm_sse2.asm

+++ /dev/null

@@ -1,712 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-;void vp9_idct_dequant_0_2x_sse2

-; (

-;   short *qcoeff       - 0

-;   short *dequant      - 1

-;   unsigned char *pre  - 2

-;   unsigned char *dst  - 3

-;   int dst_stride      - 4

-;   int blk_stride      - 5

-; )

-global sym(vp9_idct_dequant_0_2x_sse2) PRIVATE

-sym(vp9_idct_dequant_0_2x_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    GET_GOT     rbx

-    ; end prolog

-        mov         rdx,            arg(1) ; dequant

-        mov         rax,            arg(0) ; qcoeff

-        movd        xmm4,           [rax]

-        movd        xmm5,           [rdx]

-        pinsrw      xmm4,           [rax+32],   4

-        pinsrw      xmm5,           [rdx],      4

-        pmullw      xmm4,           xmm5

-    ; Zero out xmm5, for use unpacking

-        pxor        xmm5,           xmm5

-    ; clear coeffs

-        movd        [rax],          xmm5

-        movd        [rax+32],       xmm5

-;pshufb

-        pshuflw     xmm4,           xmm4,       00000000b

-        pshufhw     xmm4,           xmm4,       00000000b

-        mov         rax,            arg(2) ; pre

-        paddw       xmm4,           [GLOBAL(fours)]

-        movsxd      rcx,            dword ptr arg(5) ; blk_stride

-        psraw       xmm4,           3

-        movq        xmm0,           [rax]

-        movq        xmm1,           [rax+rcx]

-        movq        xmm2,           [rax+2*rcx]

-        lea         rcx,            [3*rcx]

-        movq        xmm3,           [rax+rcx]

-        punpcklbw   xmm0,           xmm5

-        punpcklbw   xmm1,           xmm5

-        punpcklbw   xmm2,           xmm5

-        punpcklbw   xmm3,           xmm5

-        mov         rax,            arg(3) ; dst

-        movsxd      rdx,            dword ptr arg(4) ; dst_stride

-    ; Add to predict buffer

-        paddw       xmm0,           xmm4

-        paddw       xmm1,           xmm4

-        paddw       xmm2,           xmm4

-        paddw       xmm3,           xmm4

-    ; pack up before storing

-        packuswb    xmm0,           xmm5

-        packuswb    xmm1,           xmm5

-        packuswb    xmm2,           xmm5

-        packuswb    xmm3,           xmm5

-    ; store blocks back out

-        movq        [rax],          xmm0

-        movq        [rax + rdx],    xmm1

-        lea         rax,            [rax + 2*rdx]

-        movq        [rax],          xmm2

-        movq        [rax + rdx],    xmm3

-    ; begin epilog

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-global sym(vp9_idct_dequant_full_2x_sse2) PRIVATE

-sym(vp9_idct_dequant_full_2x_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ; special case when 2 blocks have 0 or 1 coeffs

-    ; dc is set as first coeff, so no need to load qcoeff

-        mov         rax,            arg(0) ; qcoeff

-        mov         rsi,            arg(2) ; pre

-        mov         rdi,            arg(3) ; dst

-        movsxd      rcx,            dword ptr arg(5) ; blk_stride

-    ; Zero out xmm7, for use unpacking

-        pxor        xmm7,           xmm7

-        mov         rdx,            arg(1)  ; dequant

-    ; note the transpose of xmm1 and xmm2, necessary for shuffle

-    ;   to spit out sensicle data

-        movdqa      xmm0,           [rax]

-        movdqa      xmm2,           [rax+16]

-        movdqa      xmm1,           [rax+32]

-        movdqa      xmm3,           [rax+48]

-    ; Clear out coeffs

-        movdqa      [rax],          xmm7

-        movdqa      [rax+16],       xmm7

-        movdqa      [rax+32],       xmm7

-        movdqa      [rax+48],       xmm7

-    ; dequantize qcoeff buffer

-        pmullw      xmm0,           [rdx]

-        pmullw      xmm2,           [rdx+16]

-        pmullw      xmm1,           [rdx]

-        pmullw      xmm3,           [rdx+16]

-    ; repack so block 0 row x and block 1 row x are together

-        movdqa      xmm4,           xmm0

-        punpckldq   xmm0,           xmm1

-        punpckhdq   xmm4,           xmm1

-        pshufd      xmm0,           xmm0,       11011000b

-        pshufd      xmm1,           xmm4,       11011000b

-        movdqa      xmm4,           xmm2

-        punpckldq   xmm2,           xmm3

-        punpckhdq   xmm4,           xmm3

-        pshufd      xmm2,           xmm2,       11011000b

-        pshufd      xmm3,           xmm4,       11011000b

-    ; first pass

-        psubw       xmm0,           xmm2        ; b1 = 0-2

-        paddw       xmm2,           xmm2        ;

-        movdqa      xmm5,           xmm1

-        paddw       xmm2,           xmm0        ; a1 = 0+2

-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

-        movdqa      xmm7,           xmm3

-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       xmm7,           xmm5        ; c1

-        movdqa      xmm5,           xmm1

-        movdqa      xmm4,           xmm3

-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm5,           xmm1

-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm3,           xmm4

-        paddw       xmm3,           xmm5        ; d1

-        movdqa      xmm6,           xmm2        ; a1

-        movdqa      xmm4,           xmm0        ; b1

-        paddw       xmm2,           xmm3        ;0

-        paddw       xmm4,           xmm7        ;1

-        psubw       xmm0,           xmm7        ;2

-        psubw       xmm6,           xmm3        ;3

-    ; transpose for the second pass

-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

-        pshufd      xmm0,           xmm2,       11011000b

-        pshufd      xmm2,           xmm1,       11011000b

-        pshufd      xmm1,           xmm5,       11011000b

-        pshufd      xmm3,           xmm7,       11011000b

-    ; second pass

-        psubw       xmm0,           xmm2            ; b1 = 0-2

-        paddw       xmm2,           xmm2

-        movdqa      xmm5,           xmm1

-        paddw       xmm2,           xmm0            ; a1 = 0+2

-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

-        movdqa      xmm7,           xmm3

-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       xmm7,           xmm5            ; c1

-        movdqa      xmm5,           xmm1

-        movdqa      xmm4,           xmm3

-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm5,           xmm1

-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm3,           xmm4

-        paddw       xmm3,           xmm5            ; d1

-        paddw       xmm0,           [GLOBAL(fours)]

-        paddw       xmm2,           [GLOBAL(fours)]

-        movdqa      xmm6,           xmm2            ; a1

-        movdqa      xmm4,           xmm0            ; b1

-        paddw       xmm2,           xmm3            ;0

-        paddw       xmm4,           xmm7            ;1

-        psubw       xmm0,           xmm7            ;2

-        psubw       xmm6,           xmm3            ;3

-        psraw       xmm2,           3

-        psraw       xmm0,           3

-        psraw       xmm4,           3

-        psraw       xmm6,           3

-    ; transpose to save

-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

-        pshufd      xmm0,           xmm2,       11011000b

-        pshufd      xmm2,           xmm1,       11011000b

-        pshufd      xmm1,           xmm5,       11011000b

-        pshufd      xmm3,           xmm7,       11011000b

-        pxor        xmm7,           xmm7

-    ; Load up predict blocks

-        movq        xmm4,           [rsi]

-        movq        xmm5,           [rsi+rcx]

-        punpcklbw   xmm4,           xmm7

-        punpcklbw   xmm5,           xmm7

-        paddw       xmm0,           xmm4

-        paddw       xmm1,           xmm5

-        movq        xmm4,           [rsi+2*rcx]

-        lea         rcx,            [3*rcx]

-        movq        xmm5,           [rsi+rcx]

-        punpcklbw   xmm4,           xmm7

-        punpcklbw   xmm5,           xmm7

-        paddw       xmm2,           xmm4

-        paddw       xmm3,           xmm5

-.finish:

-    ; pack up before storing

-        packuswb    xmm0,           xmm7

-        packuswb    xmm1,           xmm7

-        packuswb    xmm2,           xmm7

-        packuswb    xmm3,           xmm7

-    ; Load destination stride before writing out,

-    ;   doesn't need to persist

-        movsxd      rdx,            dword ptr arg(4) ; dst_stride

-    ; store blocks back out

-        movq        [rdi],          xmm0

-        movq        [rdi + rdx],    xmm1

-        lea         rdi,            [rdi + 2*rdx]

-        movq        [rdi],          xmm2

-        movq        [rdi + rdx],    xmm3

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_idct_dequant_dc_0_2x_sse2

-; (

-;   short *qcoeff       - 0

-;   short *dequant      - 1

-;   unsigned char *pre  - 2

-;   unsigned char *dst  - 3

-;   int dst_stride      - 4

-;   short *dc           - 5

-; )

-global sym(vp9_idct_dequant_dc_0_2x_sse2) PRIVATE

-sym(vp9_idct_dequant_dc_0_2x_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ; special case when 2 blocks have 0 or 1 coeffs

-    ; dc is set as first coeff, so no need to load qcoeff

-        mov         rax,            arg(0) ; qcoeff

-        mov         rsi,            arg(2) ; pre

-        mov         rdi,            arg(3) ; dst

-        mov         rdx,            arg(5) ; dc

-    ; Zero out xmm5, for use unpacking

-        pxor        xmm5,           xmm5

-    ; load up 2 dc words here == 2*16 = doubleword

-        movd        xmm4,           [rdx]

-    ; Load up predict blocks

-        movq        xmm0,           [rsi]

-        movq        xmm1,           [rsi+16]

-        movq        xmm2,           [rsi+32]

-        movq        xmm3,           [rsi+48]

-    ; Duplicate and expand dc across

-        punpcklwd   xmm4,           xmm4

-        punpckldq   xmm4,           xmm4

-    ; Rounding to dequant and downshift

-        paddw       xmm4,           [GLOBAL(fours)]

-        psraw       xmm4,           3

-    ; Predict buffer needs to be expanded from bytes to words

-        punpcklbw   xmm0,           xmm5

-        punpcklbw   xmm1,           xmm5

-        punpcklbw   xmm2,           xmm5

-        punpcklbw   xmm3,           xmm5

-    ; Add to predict buffer

-        paddw       xmm0,           xmm4

-        paddw       xmm1,           xmm4

-        paddw       xmm2,           xmm4

-        paddw       xmm3,           xmm4

-    ; pack up before storing

-        packuswb    xmm0,           xmm5

-        packuswb    xmm1,           xmm5

-        packuswb    xmm2,           xmm5

-        packuswb    xmm3,           xmm5

-    ; Load destination stride before writing out,

-    ;   doesn't need to persist

-        movsxd      rdx,            dword ptr arg(4) ; dst_stride

-    ; store blocks back out

-        movq        [rdi],          xmm0

-        movq        [rdi + rdx],    xmm1

-        lea         rdi,            [rdi + 2*rdx]

-        movq        [rdi],          xmm2

-        movq        [rdi + rdx],    xmm3

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-global sym(vp9_idct_dequant_dc_full_2x_sse2) PRIVATE

-sym(vp9_idct_dequant_dc_full_2x_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ; special case when 2 blocks have 0 or 1 coeffs

-    ; dc is set as first coeff, so no need to load qcoeff

-        mov         rax,            arg(0) ; qcoeff

-        mov         rsi,            arg(2) ; pre

-        mov         rdi,            arg(3) ; dst

-    ; Zero out xmm7, for use unpacking

-        pxor        xmm7,           xmm7

-        mov         rdx,            arg(1)  ; dequant

-    ; note the transpose of xmm1 and xmm2, necessary for shuffle

-    ;   to spit out sensicle data

-        movdqa      xmm0,           [rax]

-        movdqa      xmm2,           [rax+16]

-        movdqa      xmm1,           [rax+32]

-        movdqa      xmm3,           [rax+48]

-    ; Clear out coeffs

-        movdqa      [rax],          xmm7

-        movdqa      [rax+16],       xmm7

-        movdqa      [rax+32],       xmm7

-        movdqa      [rax+48],       xmm7

-    ; dequantize qcoeff buffer

-        pmullw      xmm0,           [rdx]

-        pmullw      xmm2,           [rdx+16]

-        pmullw      xmm1,           [rdx]

-        pmullw      xmm3,           [rdx+16]

-    ; DC component

-        mov         rdx,            arg(5)

-    ; repack so block 0 row x and block 1 row x are together

-        movdqa      xmm4,           xmm0

-        punpckldq   xmm0,           xmm1

-        punpckhdq   xmm4,           xmm1

-        pshufd      xmm0,           xmm0,       11011000b

-        pshufd      xmm1,           xmm4,       11011000b

-        movdqa      xmm4,           xmm2

-        punpckldq   xmm2,           xmm3

-        punpckhdq   xmm4,           xmm3

-        pshufd      xmm2,           xmm2,       11011000b

-        pshufd      xmm3,           xmm4,       11011000b

-    ; insert DC component

-        pinsrw      xmm0,           [rdx],      0

-        pinsrw      xmm0,           [rdx+2],    4

-    ; first pass

-        psubw       xmm0,           xmm2        ; b1 = 0-2

-        paddw       xmm2,           xmm2        ;

-        movdqa      xmm5,           xmm1

-        paddw       xmm2,           xmm0        ; a1 = 0+2

-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)

-        movdqa      xmm7,           xmm3

-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       xmm7,           xmm5        ; c1

-        movdqa      xmm5,           xmm1

-        movdqa      xmm4,           xmm3

-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm5,           xmm1

-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm3,           xmm4

-        paddw       xmm3,           xmm5        ; d1

-        movdqa      xmm6,           xmm2        ; a1

-        movdqa      xmm4,           xmm0        ; b1

-        paddw       xmm2,           xmm3        ;0

-        paddw       xmm4,           xmm7        ;1

-        psubw       xmm0,           xmm7        ;2

-        psubw       xmm6,           xmm3        ;3

-    ; transpose for the second pass

-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

-        pshufd      xmm0,           xmm2,       11011000b

-        pshufd      xmm2,           xmm1,       11011000b

-        pshufd      xmm1,           xmm5,       11011000b

-        pshufd      xmm3,           xmm7,       11011000b

-    ; second pass

-        psubw       xmm0,           xmm2            ; b1 = 0-2

-        paddw       xmm2,           xmm2

-        movdqa      xmm5,           xmm1

-        paddw       xmm2,           xmm0            ; a1 = 0+2

-        pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)

-        movdqa      xmm7,           xmm3

-        pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)

-        psubw       xmm7,           xmm5            ; c1

-        movdqa      xmm5,           xmm1

-        movdqa      xmm4,           xmm3

-        pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]

-        paddw       xmm5,           xmm1

-        pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]

-        paddw       xmm3,           xmm4

-        paddw       xmm3,           xmm5            ; d1

-        paddw       xmm0,           [GLOBAL(fours)]

-        paddw       xmm2,           [GLOBAL(fours)]

-        movdqa      xmm6,           xmm2            ; a1

-        movdqa      xmm4,           xmm0            ; b1

-        paddw       xmm2,           xmm3            ;0

-        paddw       xmm4,           xmm7            ;1

-        psubw       xmm0,           xmm7            ;2

-        psubw       xmm6,           xmm3            ;3

-        psraw       xmm2,           3

-        psraw       xmm0,           3

-        psraw       xmm4,           3

-        psraw       xmm6,           3

-    ; transpose to save

-        movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000

-        punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000

-        punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100

-        movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008

-        punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008

-        punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108

-        movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000

-        punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000

-        punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002

-        movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100

-        punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100

-        punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102

-        movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000

-        punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000

-        punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001

-        movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002

-        punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002

-        punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003

-        pshufd      xmm0,           xmm2,       11011000b

-        pshufd      xmm2,           xmm1,       11011000b

-        pshufd      xmm1,           xmm5,       11011000b

-        pshufd      xmm3,           xmm7,       11011000b

-        pxor        xmm7,           xmm7

-    ; Load up predict blocks

-        movq        xmm4,           [rsi]

-        movq        xmm5,           [rsi+16]

-        punpcklbw   xmm4,           xmm7

-        punpcklbw   xmm5,           xmm7

-        paddw       xmm0,           xmm4

-        paddw       xmm1,           xmm5

-        movq        xmm4,           [rsi+32]

-        movq        xmm5,           [rsi+48]

-        punpcklbw   xmm4,           xmm7

-        punpcklbw   xmm5,           xmm7

-        paddw       xmm2,           xmm4

-        paddw       xmm3,           xmm5

-.finish:

-    ; pack up before storing

-        packuswb    xmm0,           xmm7

-        packuswb    xmm1,           xmm7

-        packuswb    xmm2,           xmm7

-        packuswb    xmm3,           xmm7

-    ; Load destination stride before writing out,

-    ;   doesn't need to persist

-        movsxd      rdx,            dword ptr arg(4) ; dst_stride

-    ; store blocks back out

-        movq        [rdi],          xmm0

-        movq        [rdi + rdx],    xmm1

-        lea         rdi,            [rdi + 2*rdx]

-        movq        [rdi],          xmm2

-        movq        [rdi + rdx],    xmm3

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-fours:

-    times 8 dw 0x0004

-align 16

-x_s1sqr2:

-    times 8 dw 0x8A8C

-align 16

-x_c1sqr2less1:

-    times 8 dw 0x4E7B

--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c

+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c

@@ -26,14 +26,16 @@

   DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);

   DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);

-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);

-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);

-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);

-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);

-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);

+  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);

+  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);

+  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);

+  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);

   __m128i mask, hev, flat, flat2;

   const __m128i zero = _mm_set1_epi16(0);

+  const __m128i one = _mm_set1_epi8(1);

   __m128i p7, p6, p5;

   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;

   __m128i q5, q6, q7;

@@ -58,12 +60,24 @@

   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));

   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

+  _mm_store_si128((__m128i *)ap[4], p4);

+  _mm_store_si128((__m128i *)ap[3], p3);

+  _mm_store_si128((__m128i *)ap[2], p2);

+  _mm_store_si128((__m128i *)ap[1], p1);

+  _mm_store_si128((__m128i *)ap[0], p0);

+  _mm_store_si128((__m128i *)aq[4], q4);

+  _mm_store_si128((__m128i *)aq[3], q3);

+  _mm_store_si128((__m128i *)aq[2], q2);

+  _mm_store_si128((__m128i *)aq[1], q1);

+  _mm_store_si128((__m128i *)aq[0], q0);

     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),

                                           _mm_subs_epu8(p0, p1));

     const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),

                                           _mm_subs_epu8(q0, q1));

-    const __m128i one = _mm_set1_epi8(1);

     const __m128i fe = _mm_set1_epi8(0xfe);

     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);

     __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),

@@ -95,246 +109,8 @@

     mask = _mm_max_epu8(work, mask);

     mask = _mm_subs_epu8(mask, limit);

     mask = _mm_cmpeq_epi8(mask, zero);

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),

-                                     _mm_subs_epu8(p0, p2)),

-                         _mm_or_si128(_mm_subs_epu8(q2, q0),

-                                      _mm_subs_epu8(q0, q2)));

-    flat = _mm_max_epu8(work, flat);

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),

-                                     _mm_subs_epu8(p0, p3)),

-                         _mm_or_si128(_mm_subs_epu8(q3, q0),

-                                      _mm_subs_epu8(q0, q3)));

-    flat = _mm_max_epu8(work, flat);

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),

-                                     _mm_subs_epu8(p0, p4)),

-                         _mm_or_si128(_mm_subs_epu8(q4, q0),

-                                      _mm_subs_epu8(q0, q4)));

-    flat = _mm_max_epu8(work, flat);

-    flat = _mm_subs_epu8(flat, one);

-    flat = _mm_cmpeq_epi8(flat, zero);

-    flat = _mm_and_si128(flat, mask);

-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  // calculate flat2

-  p4 = _mm_loadu_si128((__m128i *)(s - 8 * p));

-  p3 = _mm_loadu_si128((__m128i *)(s - 7 * p));

-  p2 = _mm_loadu_si128((__m128i *)(s - 6 * p));

-  p1 = _mm_loadu_si128((__m128i *)(s - 5 * p));

-//  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));

-//  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));

-  q1 = _mm_loadu_si128((__m128i *)(s + 4 * p));

-  q2 = _mm_loadu_si128((__m128i *)(s + 5 * p));

-  q3 = _mm_loadu_si128((__m128i *)(s + 6 * p));

-  q4 = _mm_loadu_si128((__m128i *)(s + 7 * p));

-  {

-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),

-                                          _mm_subs_epu8(p0, p1));

-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),

-                                          _mm_subs_epu8(q0, q1));

-    const __m128i one = _mm_set1_epi8(1);

-    __m128i work;

-    flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0);

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),

-                                     _mm_subs_epu8(p0, p2)),

-                         _mm_or_si128(_mm_subs_epu8(q2, q0),

-                                      _mm_subs_epu8(q0, q2)));

-    flat2 = _mm_max_epu8(work, flat2);

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),

-                                     _mm_subs_epu8(p0, p3)),

-                         _mm_or_si128(_mm_subs_epu8(q3, q0),

-                                      _mm_subs_epu8(q0, q3)));

-    flat2 = _mm_max_epu8(work, flat2);

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),

-                                     _mm_subs_epu8(p0, p4)),

-                         _mm_or_si128(_mm_subs_epu8(q4, q0),

-                                      _mm_subs_epu8(q0, q4)));

-    flat2 = _mm_max_epu8(work, flat2);

-    flat2 = _mm_subs_epu8(flat2, one);

-    flat2 = _mm_cmpeq_epi8(flat2, zero);

-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask

-  }

-  // calculate flat2

-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  {

-    const __m128i four = _mm_set1_epi16(4);

-    unsigned char *src = s;

-    i = 0;

-    do {

-      __m128i workp_a, workp_b, workp_shft;

-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);

-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);

-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);

-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);

-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);

-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);

-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);

-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);

-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);

-      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));

-      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);

-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

-      _mm_storel_epi64((__m128i *)&flat_op2[i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

-      _mm_storel_epi64((__m128i *)&flat_op1[i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

-      _mm_storel_epi64((__m128i *)&flat_op0[i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

-      _mm_storel_epi64((__m128i *)&flat_oq0[i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

-      _mm_storel_epi64((__m128i *)&flat_oq1[i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

-      _mm_storel_epi64((__m128i *)&flat_oq2[i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      src += 8;

-    } while (++i < 2);

-  }

-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-  // wide flat

-  // TODO(slavarnway): interleave with the flat pixel calculations (see above)

-  {

-    const __m128i eight = _mm_set1_epi16(8);

-    unsigned char *src = s;

-    int i = 0;

-    do {

-      __m128i workp_a, workp_b, workp_shft;

-      p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);

-      p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);

-      p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);

-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);

-      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);

-      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);

-      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);

-      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);

-      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);

-      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);

-      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);

-      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);

-      q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);

-      q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);

-      q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);

-      workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7

-      workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);

-      workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));

-      workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);

-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);

-      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);

-      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);

-      _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],

-                       _mm_packus_epi16(workp_shft, workp_shft));

-      src += 8;

-    } while (++i < 2);

-  }

-  // wide flat

-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

   // lp filter

     const __m128i t4 = _mm_set1_epi8(4);

@@ -345,14 +121,10 @@

     const __m128i t1 = _mm_set1_epi8(0x1);

     const __m128i t7f = _mm_set1_epi8(0x7f);

-    __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),

-                                      t80);

-    __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),

-                                      t80);

-    __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),

-                                      t80);

-    __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),

-                                      t80);

+    __m128i ps1 = _mm_xor_si128(p1, t80);

+    __m128i ps0 = _mm_xor_si128(p0, t80);

+    __m128i qs0 = _mm_xor_si128(q0, t80);

+    __m128i qs1 = _mm_xor_si128(q1, t80);

     __m128i filt;

     __m128i work_a;

     __m128i filter1, filter2;

@@ -374,6 +146,7 @@

     work_a = _mm_and_si128(work_a, te0);

     filter1 = _mm_and_si128(filter1, t1f);

     filter1 = _mm_or_si128(filter1, work_a);

+    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

     /* Filter2 >> 3 */

     work_a = _mm_cmpgt_epi8(zero, filter2);

@@ -381,6 +154,7 @@

     work_a = _mm_and_si128(work_a, te0);

     filter2 = _mm_and_si128(filter2, t1f);

     filter2 = _mm_or_si128(filter2, work_a);

+    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

     /* filt >> 1 */

     filt = _mm_adds_epi8(filter1, t1);

@@ -389,20 +163,265 @@

     work_a = _mm_and_si128(work_a, t80);

     filt = _mm_and_si128(filt, t7f);

     filt = _mm_or_si128(filt, work_a);

     filt = _mm_andnot_si128(hev, filt);

-    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

     ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

-    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

     qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);

+    // loopfilter done

+    {

+      __m128i work;

+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),

+                                       _mm_subs_epu8(p0, p2)),

+                           _mm_or_si128(_mm_subs_epu8(q2, q0),

+                                        _mm_subs_epu8(q0, q2)));

+      flat = _mm_max_epu8(work, flat);

+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),

+                                       _mm_subs_epu8(p0, p3)),

+                           _mm_or_si128(_mm_subs_epu8(q3, q0),

+                                        _mm_subs_epu8(q0, q3)));

+      flat = _mm_max_epu8(work, flat);

+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),

+                                       _mm_subs_epu8(p0, p4)),

+                           _mm_or_si128(_mm_subs_epu8(q4, q0),

+                                        _mm_subs_epu8(q0, q4)));

+      flat = _mm_subs_epu8(flat, one);

+      flat = _mm_cmpeq_epi8(flat, zero);

+      flat = _mm_and_si128(flat, mask);

+      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));

+      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));

+      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),

+                                       _mm_subs_epu8(p0, p5)),

+                           _mm_or_si128(_mm_subs_epu8(q5, q0),

+                                        _mm_subs_epu8(q0, q5)));

+      _mm_store_si128((__m128i *)ap[5], p5);

+      _mm_store_si128((__m128i *)aq[5], q5);

+      flat2 = _mm_max_epu8(work, flat2);

+      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));

+      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));

+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),

+                                       _mm_subs_epu8(p0, p6)),

+                           _mm_or_si128(_mm_subs_epu8(q6, q0),

+                                        _mm_subs_epu8(q0, q6)));

+      _mm_store_si128((__m128i *)ap[6], p6);

+      _mm_store_si128((__m128i *)aq[6], q6);

+      flat2 = _mm_max_epu8(work, flat2);

+      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));

+      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

+      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),

+                                       _mm_subs_epu8(p0, p7)),

+                           _mm_or_si128(_mm_subs_epu8(q7, q0),

+                                        _mm_subs_epu8(q0, q7)));

+      _mm_store_si128((__m128i *)ap[7], p7);

+      _mm_store_si128((__m128i *)aq[7], q7);

+      flat2 = _mm_max_epu8(work, flat2);

+      flat2 = _mm_subs_epu8(flat2, one);

+      flat2 = _mm_cmpeq_epi8(flat2, zero);

+      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask

+    }

+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+    // flat and wide flat calculations

+    {

+      const __m128i eight = _mm_set1_epi16(8);

+      const __m128i four = _mm_set1_epi16(4);

+      __m128i temp_flat2 = flat2;

+      unsigned char *src = s;

+      int i = 0;

+      do {

+        __m128i workp_shft;

+        __m128i a, b, c;

+        unsigned int off = i * 8;

+        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);

+        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);

+        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);

+        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);

+        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);

+        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);

+        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);

+        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);

+        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);

+        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);

+        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);

+        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);

+        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);

+        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);

+        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);

+        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);

+        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7

+        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));

+        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));

+        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));

+        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);

+        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],

+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)

+                                          , b));

+        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q1, a);

+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);

+        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],

+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)

+                                          , b));

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q2, a);

+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);

+        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],

+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)

+                                          , b));

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q3, a);

+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);

+        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],

+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)

+                                          , b));

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        b = _mm_add_epi16(q3, b);

+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);

+        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],

+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)

+                                          , b));

+        c = _mm_add_epi16(q4, c);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        b = _mm_add_epi16(q3, b);

+        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);

+        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],

+                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)

+                                          , b));

+        a = _mm_add_epi16(q5, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q6, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q7, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q7, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q7, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q7, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q7, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q7, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        a = _mm_add_epi16(q7, a);

+        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);

+        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);

+        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],

+                         _mm_packus_epi16(workp_shft, workp_shft));

+        temp_flat2 = _mm_srli_si128(temp_flat2, 8);

+        src += 8;

+      } while (++i < 2);

+    }

+    // wide flat

+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+    work_a = _mm_load_si128((__m128i *)ap[2]);

+    p2 = _mm_load_si128((__m128i *)flat_op[2]);

+    work_a = _mm_andnot_si128(flat, work_a);

+    p2 = _mm_and_si128(flat, p2);

+    p2 = _mm_or_si128(work_a, p2);

+    _mm_store_si128((__m128i *)flat_op[2], p2);

+    p1 = _mm_load_si128((__m128i *)flat_op[1]);

+    work_a = _mm_andnot_si128(flat, ps1);

+    p1 = _mm_and_si128(flat, p1);

+    p1 = _mm_or_si128(work_a, p1);

+    _mm_store_si128((__m128i *)flat_op[1], p1);

+    p0 = _mm_load_si128((__m128i *)flat_op[0]);

+    work_a = _mm_andnot_si128(flat, ps0);

+    p0 = _mm_and_si128(flat, p0);

+    p0 = _mm_or_si128(work_a, p0);

+    _mm_store_si128((__m128i *)flat_op[0], p0);

+    q0 = _mm_load_si128((__m128i *)flat_oq[0]);

+    work_a = _mm_andnot_si128(flat, qs0);

+    q0 = _mm_and_si128(flat, q0);

+    q0 = _mm_or_si128(work_a, q0);

+    _mm_store_si128((__m128i *)flat_oq[0], q0);

+    q1 = _mm_load_si128((__m128i *)flat_oq[1]);

+    work_a = _mm_andnot_si128(flat, qs1);

+    q1 = _mm_and_si128(flat, q1);

+    q1 = _mm_or_si128(work_a, q1);

+    _mm_store_si128((__m128i *)flat_oq[1], q1);

+    work_a = _mm_load_si128((__m128i *)aq[2]);

+    q2 = _mm_load_si128((__m128i *)flat_oq[2]);

+    work_a = _mm_andnot_si128(flat, work_a);

+    q2 = _mm_and_si128(flat, q2);

+    q2 = _mm_or_si128(work_a, q2);

+    _mm_store_si128((__m128i *)flat_oq[2], q2);

     // write out op6 - op3

       unsigned char *dst = (s - 7 * p);

       for (i = 6; i > 2; i--) {

         __m128i flat2_output;

-        work_a = _mm_loadu_si128((__m128i *)dst);

+        work_a = _mm_load_si128((__m128i *)ap[i]);

         flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);

         work_a = _mm_andnot_si128(flat2, work_a);

         flat2_output = _mm_and_si128(flat2, flat2_output);

@@ -412,11 +431,7 @@

-    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));

-    p2 = _mm_load_si128((__m128i *)flat_op2);

-    work_a = _mm_andnot_si128(flat, work_a);

-    p2 = _mm_and_si128(flat, p2);

-    work_a = _mm_or_si128(work_a, p2);

+    work_a = _mm_load_si128((__m128i *)flat_op[2]);

     p2 = _mm_load_si128((__m128i *)flat2_op[2]);

     work_a = _mm_andnot_si128(flat2, work_a);

     p2 = _mm_and_si128(flat2, p2);

@@ -423,10 +438,7 @@

     p2 = _mm_or_si128(work_a, p2);

     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

-    p1 = _mm_load_si128((__m128i *)flat_op1);

-    work_a = _mm_andnot_si128(flat, ps1);

-    p1 = _mm_and_si128(flat, p1);

-    work_a = _mm_or_si128(work_a, p1);

+    work_a = _mm_load_si128((__m128i *)flat_op[1]);

     p1 = _mm_load_si128((__m128i *)flat2_op[1]);

     work_a = _mm_andnot_si128(flat2, work_a);

     p1 = _mm_and_si128(flat2, p1);

@@ -433,10 +445,7 @@

     p1 = _mm_or_si128(work_a, p1);

     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

-    p0 = _mm_load_si128((__m128i *)flat_op0);

-    work_a = _mm_andnot_si128(flat, ps0);

-    p0 = _mm_and_si128(flat, p0);

-    work_a = _mm_or_si128(work_a, p0);

+    work_a = _mm_load_si128((__m128i *)flat_op[0]);

     p0 = _mm_load_si128((__m128i *)flat2_op[0]);

     work_a = _mm_andnot_si128(flat2, work_a);

     p0 = _mm_and_si128(flat2, p0);

@@ -443,10 +452,7 @@

     p0 = _mm_or_si128(work_a, p0);

     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

-    q0 = _mm_load_si128((__m128i *)flat_oq0);

-    work_a = _mm_andnot_si128(flat, qs0);

-    q0 = _mm_and_si128(flat, q0);

-    work_a = _mm_or_si128(work_a, q0);

+    work_a = _mm_load_si128((__m128i *)flat_oq[0]);

     q0 = _mm_load_si128((__m128i *)flat2_oq[0]);

     work_a = _mm_andnot_si128(flat2, work_a);

     q0 = _mm_and_si128(flat2, q0);

@@ -453,10 +459,7 @@

     q0 = _mm_or_si128(work_a, q0);

     _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

-    q1 = _mm_load_si128((__m128i *)flat_oq1);

-    work_a = _mm_andnot_si128(flat, qs1);

-    q1 = _mm_and_si128(flat, q1);

-    work_a = _mm_or_si128(work_a, q1);

+    work_a = _mm_load_si128((__m128i *)flat_oq[1]);

     q1 = _mm_load_si128((__m128i *)flat2_oq[1]);

     work_a = _mm_andnot_si128(flat2, work_a);

     q1 = _mm_and_si128(flat2, q1);

@@ -463,11 +466,7 @@

     q1 = _mm_or_si128(work_a, q1);

     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

-    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));

-    q2 = _mm_load_si128((__m128i *)flat_oq2);

-    work_a = _mm_andnot_si128(flat, work_a);

-    q2 = _mm_and_si128(flat, q2);

-    work_a = _mm_or_si128(work_a, q2);

+    work_a = _mm_load_si128((__m128i *)flat_oq[2]);

     q2 = _mm_load_si128((__m128i *)flat2_oq[2]);

     work_a = _mm_andnot_si128(flat2, work_a);

     q2 = _mm_and_si128(flat2, q2);

@@ -479,7 +478,7 @@

       unsigned char *dst = (s + 3 * p);

       for (i = 3; i < 7; i++) {

         __m128i flat2_output;

-        work_a = _mm_loadu_si128((__m128i *)dst);

+        work_a = _mm_load_si128((__m128i *)aq[i]);

         flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);

         work_a = _mm_andnot_si128(flat2, work_a);

         flat2_output = _mm_and_si128(flat2, flat2_output);

@@ -504,7 +503,7 @@

   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);

   __m128i mask, hev, flat;

   const __m128i zero = _mm_set1_epi16(0);

-  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;

+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

   const unsigned int extended_thresh = _thresh[0] * 0x01010101u;

   const unsigned int extended_limit  = _limit[0]  * 0x01010101u;

   const unsigned int extended_blimit = _blimit[0] * 0x01010101u;

@@ -515,7 +514,6 @@

   const __m128i blimit =

       _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

-  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));

   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));

   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));

   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));

@@ -524,7 +522,6 @@

   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));

   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));

   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

-  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

     const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),

                                           _mm_subs_epu8(p0, p1));

@@ -573,11 +570,6 @@

                          _mm_or_si128(_mm_subs_epu8(q3, q0),

                                       _mm_subs_epu8(q0, q3)));

     flat = _mm_max_epu8(work, flat);

-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),

-                                     _mm_subs_epu8(p0, p4)),

-                         _mm_or_si128(_mm_subs_epu8(q4, q0),

-                                      _mm_subs_epu8(q0, q4)));

-    flat = _mm_max_epu8(work, flat);

     flat = _mm_subs_epu8(flat, one);

     flat = _mm_cmpeq_epi8(flat, zero);

     flat = _mm_and_si128(flat, mask);

@@ -588,7 +580,6 @@

     int i = 0;

     do {

       __m128i workp_a, workp_b, workp_shft;

-      p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);

       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);

       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);

       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);

@@ -597,11 +588,10 @@

       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);

       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);

       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

-      q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);

-      workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));

+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));

       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);

-      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);

+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);

       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

       _mm_storel_epi64((__m128i *)&flat_op2[i*8],

                        _mm_packus_epi16(workp_shft, workp_shft));

@@ -611,7 +601,7 @@

       _mm_storel_epi64((__m128i *)&flat_op1[i*8],

                        _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);

+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);

       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);

       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

       _mm_storel_epi64((__m128i *)&flat_op0[i*8],

@@ -623,13 +613,13 @@

       _mm_storel_epi64((__m128i *)&flat_oq0[i*8],

                        _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);

+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);

       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);

       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

       _mm_storel_epi64((__m128i *)&flat_oq1[i*8],

                        _mm_packus_epi16(workp_shft, workp_shft));

-      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);

+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);

       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);

       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);

       _mm_storel_epi64((__m128i *)&flat_oq2[i*8],

@@ -813,8 +803,8 @@

                    _mm_loadl_epi64((__m128i *)(src + 120)));

-static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,

-                                   int in_p, unsigned char *out, int out_p) {

+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,

+                                 int in_p, unsigned char *out, int out_p) {

   __m128i x0, x1, x2, x3, x4, x5, x6, x7;

   __m128i x8, x9, x10, x11, x12, x13, x14, x15;

@@ -879,9 +869,9 @@

   _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));

-static __inline void transpose(unsigned char *src[], int in_p,

-                               unsigned char *dst[], int out_p,

-                               int num_8x8_to_transpose) {

+static INLINE void transpose(unsigned char *src[], int in_p,

+                             unsigned char *dst[], int out_p,

+                             int num_8x8_to_transpose) {

   int idx8x8 = 0;

   __m128i x0, x1, x2, x3, x4, x5, x6, x7;

   do {

--- a/vp9/common/x86/vp9_postproc_mmx.asm

+++ b/vp9/common/x86/vp9_postproc_mmx.asm

@@ -459,11 +459,11 @@

 %undef flimit2

-;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,

+;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,

 ;                            unsigned char blackclamp[16],

 ;                            unsigned char whiteclamp[16],

 ;                            unsigned char bothclamp[16],

-;                            unsigned int Width, unsigned int Height, int Pitch)

+;                            unsigned int width, unsigned int height, int pitch)

 extern sym(rand)

 global sym(vp9_plane_add_noise_mmx) PRIVATE

 sym(vp9_plane_add_noise_mmx):

--- a/vp9/common/x86/vp9_postproc_sse2.asm

+++ b/vp9/common/x86/vp9_postproc_sse2.asm

@@ -624,11 +624,11 @@

 %undef flimit4

-;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,

+;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,

 ;                            unsigned char blackclamp[16],

 ;                            unsigned char whiteclamp[16],

 ;                            unsigned char bothclamp[16],

-;                            unsigned int Width, unsigned int Height, int Pitch)

+;                            unsigned int width, unsigned int height, int pitch)

 extern sym(rand)

 global sym(vp9_plane_add_noise_wmt) PRIVATE

 sym(vp9_plane_add_noise_wmt):

--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

@@ -21,34 +21,92 @@

 ;*************************************************************************************/

-;void vp9_filter_block1d8_v8_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    short *filter

-;)

-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE

-sym(vp9_filter_block1d8_v8_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    push        rsi

-    push        rdi

-    push        rbx

-    ; end prolog

-    ALIGN_STACK 16, rax

-    sub         rsp, 16*5

-    %define k0k1 [rsp + 16*0]

-    %define k2k3 [rsp + 16*1]

-    %define k4k5 [rsp + 16*2]

-    %define k6k7 [rsp + 16*3]

-    %define krd [rsp + 16*4]

+%macro VERTx4 1

+    mov         rdx, arg(5)                 ;filter ptr

+    mov         rsi, arg(0)                 ;src_ptr

+    mov         rdi, arg(2)                 ;output_ptr

+    mov         rcx, 0x0400040

+    movdqa      xmm4, [rdx]                 ;load filters

+    movd        xmm5, rcx

+    packsswb    xmm4, xmm4

+    pshuflw     xmm0, xmm4, 0b              ;k0_k1

+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3

+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5

+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

+    punpcklqdq  xmm0, xmm0

+    punpcklqdq  xmm1, xmm1

+    punpcklqdq  xmm2, xmm2

+    punpcklqdq  xmm3, xmm3

+    movdqa      k0k1, xmm0

+    movdqa      k2k3, xmm1

+    pshufd      xmm5, xmm5, 0

+    movdqa      k4k5, xmm2

+    movdqa      k6k7, xmm3

+    movdqa      krd, xmm5

+    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

+%if ABI_IS_32BIT=0

+    movsxd      r8, DWORD PTR arg(3)        ;out_pitch

+%endif

+    mov         rax, rsi

+    movsxd      rcx, DWORD PTR arg(4)       ;output_height

+    add         rax, rdx

+    lea         rbx, [rdx + rdx*4]

+    add         rbx, rdx                    ;pitch * 6

+.loop:

+    movd        xmm0, [rsi]                 ;A

+    movd        xmm1, [rsi + rdx]           ;B

+    movd        xmm2, [rsi + rdx * 2]       ;C

+    movd        xmm3, [rax + rdx * 2]       ;D

+    movd        xmm4, [rsi + rdx * 4]       ;E

+    movd        xmm5, [rax + rdx * 4]       ;F

+    punpcklbw   xmm0, xmm1                  ;A B

+    punpcklbw   xmm2, xmm3                  ;C D

+    punpcklbw   xmm4, xmm5                  ;E F

+    movd        xmm6, [rsi + rbx]           ;G

+    movd        xmm7, [rax + rbx]           ;H

+    pmaddubsw   xmm0, k0k1

+    pmaddubsw   xmm2, k2k3

+    punpcklbw   xmm6, xmm7                  ;G H

+    pmaddubsw   xmm4, k4k5

+    pmaddubsw   xmm6, k6k7

+    paddsw      xmm0, xmm2

+    paddsw      xmm0, krd

+    paddsw      xmm4, xmm6

+    paddsw      xmm0, xmm4

+    psraw       xmm0, 7

+    packuswb    xmm0, xmm0

+    add         rsi,  rdx

+    add         rax,  rdx

+%if %1

+    movd        xmm1, [rdi]

+    pavgb       xmm0, xmm1

+%endif

+    movd        [rdi], xmm0

+%if ABI_IS_32BIT

+    add         rdi, DWORD PTR arg(3)       ;out_pitch

+%else

+    add         rdi, r8

+%endif

+    dec         rcx

+    jnz         .loop

+%endm

+%macro VERTx8 1

     mov         rdx, arg(5)                 ;filter ptr

     mov         rsi, arg(0)                 ;src_ptr

     mov         rdi, arg(2)                 ;output_ptr

@@ -86,7 +144,7 @@

     lea         rbx, [rdx + rdx*4]

     add         rbx, rdx                    ;pitch * 6

-.vp9_filter_block1d8_v8_ssse3_loop:

+.loop:

     movq        xmm0, [rsi]                 ;A

     movq        xmm1, [rsi + rdx]           ;B

     movq        xmm2, [rsi + rdx * 2]       ;C

@@ -117,7 +175,10 @@

     add         rsi,  rdx

     add         rax,  rdx

+%if %1

+    movq        xmm1, [rdi]

+    pavgb       xmm0, xmm1

+%endif

     movq        [rdi], xmm0

 %if ABI_IS_32BIT

@@ -126,47 +187,11 @@

     add         rdi, r8

 %endif

     dec         rcx

-    jnz         .vp9_filter_block1d8_v8_ssse3_loop

+    jnz         .loop

+%endm

-    add rsp, 16*5

-    pop rsp

-    pop rbx

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_v8_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    short *filter

-;)

-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE

-sym(vp9_filter_block1d16_v8_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    push        rsi

-    push        rdi

-    push        rbx

-    ; end prolog

-    ALIGN_STACK 16, rax

-    sub         rsp, 16*5

-    %define k0k1 [rsp + 16*0]

-    %define k2k3 [rsp + 16*1]

-    %define k4k5 [rsp + 16*2]

-    %define k6k7 [rsp + 16*3]

-    %define krd [rsp + 16*4]

+%macro VERTx16 1

     mov         rdx, arg(5)                 ;filter ptr

     mov         rsi, arg(0)                 ;src_ptr

     mov         rdi, arg(2)                 ;output_ptr

@@ -204,7 +229,7 @@

     lea         rbx, [rdx + rdx*4]

     add         rbx, rdx                    ;pitch * 6

-.vp9_filter_block1d16_v8_ssse3_loop:

+.loop:

     movq        xmm0, [rsi]                 ;A

     movq        xmm1, [rsi + rdx]           ;B

     movq        xmm2, [rsi + rdx * 2]       ;C

@@ -232,7 +257,10 @@

     psraw       xmm0, 7

     packuswb    xmm0, xmm0

+%if %1

+    movq        xmm1, [rdi]

+    pavgb       xmm0, xmm1

+%endif

     movq        [rdi], xmm0

     movq        xmm0, [rsi + 8]             ;A

@@ -267,6 +295,10 @@

     add         rsi,  rdx

     add         rax,  rdx

+%if %1

+    movq    xmm1, [rdi+8]

+    pavgb   xmm0, xmm1

+%endif

     movq        [rdi+8], xmm0

@@ -276,8 +308,39 @@

     add         rdi, r8

 %endif

     dec         rcx

-    jnz         .vp9_filter_block1d16_v8_ssse3_loop

+    jnz         .loop

+%endm

+;void vp9_filter_block1d8_v8_ssse3

+;(

+;    unsigned char *src_ptr,

+;    unsigned int   src_pitch,

+;    unsigned char *output_ptr,

+;    unsigned int   out_pitch,

+;    unsigned int   output_height,

+;    short *filter

+;)

+global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE

+sym(vp9_filter_block1d4_v8_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    push        rsi

+    push        rdi

+    push        rbx

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    VERTx4 0

     add rsp, 16*5

     pop rsp

     pop rbx

@@ -289,24 +352,24 @@

     pop         rbp

ret

-;void vp9_filter_block1d8_h8_ssse3

+;void vp9_filter_block1d8_v8_ssse3

;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

+;    unsigned char *src_ptr,

+;    unsigned int   src_pitch,

+;    unsigned char *output_ptr,

+;    unsigned int   out_pitch,

+;    unsigned int   output_height,

 ;    short *filter

;)

-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE

-sym(vp9_filter_block1d8_h8_ssse3):

+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE

+sym(vp9_filter_block1d8_v8_ssse3):

     push        rbp

     mov         rbp, rsp

     SHADOW_ARGS_TO_STACK 6

     SAVE_XMM 7

-    GET_GOT     rbx

     push        rsi

     push        rdi

+    push        rbx

     ; end prolog

     ALIGN_STACK 16, rax

@@ -317,6 +380,162 @@

     %define k6k7 [rsp + 16*3]

     %define krd [rsp + 16*4]

+    VERTx8 0

+    add rsp, 16*5

+    pop rsp

+    pop rbx

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;void vp9_filter_block1d16_v8_ssse3

+;(

+;    unsigned char *src_ptr,

+;    unsigned int   src_pitch,

+;    unsigned char *output_ptr,

+;    unsigned int   out_pitch,

+;    unsigned int   output_height,

+;    short *filter

+;)

+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE

+sym(vp9_filter_block1d16_v8_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    push        rsi

+    push        rdi

+    push        rbx

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    VERTx16 0

+    add rsp, 16*5

+    pop rsp

+    pop rbx

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE

+sym(vp9_filter_block1d4_v8_avg_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    push        rsi

+    push        rdi

+    push        rbx

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    VERTx4 1

+    add rsp, 16*5

+    pop rsp

+    pop rbx

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE

+sym(vp9_filter_block1d8_v8_avg_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    push        rsi

+    push        rdi

+    push        rbx

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    VERTx8 1

+    add rsp, 16*5

+    pop rsp

+    pop rbx

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE

+sym(vp9_filter_block1d16_v8_avg_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    push        rsi

+    push        rdi

+    push        rbx

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    VERTx16 1

+    add rsp, 16*5

+    pop rsp

+    pop rbx

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+%macro HORIZx4 1

     mov         rdx, arg(5)                 ;filter ptr

     mov         rsi, arg(0)                 ;src_ptr

     mov         rdi, arg(2)                 ;output_ptr

@@ -340,19 +559,16 @@

     pshufd      xmm5, xmm5, 0

     movdqa      k4k5, xmm2

     movdqa      k6k7, xmm3

-;    movdqa      krd, xmm5

+    movdqa      krd, xmm5

     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line

     movsxd      rdx, dword ptr arg(3)       ;output_pitch

     movsxd      rcx, dword ptr arg(4)       ;output_height

-.filter_block1d8_h8_rowloop_ssse3:

+.loop:

     movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4

-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11

     movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12

-;note: if we create a k0_k7 filter, we can save a pshufb

-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11

     punpcklqdq  xmm0,   xmm3

     movdqa      xmm1,   xmm0

@@ -371,59 +587,94 @@

     pmaddubsw   xmm4,   k6k7

     paddsw      xmm0,   xmm1

-    paddsw      xmm0,   xmm2

-    paddsw      xmm0,   xmm5

     paddsw      xmm0,   xmm4

+    paddsw      xmm0,   xmm2

+    paddsw      xmm0,   krd

     psraw       xmm0,   7

     packuswb    xmm0,   xmm0

+%if %1

+    movd        xmm1,   [rdi]

+    pavgb       xmm0,   xmm1

+%endif

     lea         rsi,    [rsi + rax]

-    movq        [rdi],  xmm0

+    movd        [rdi],  xmm0

     lea         rdi,    [rdi + rdx]

     dec         rcx

-    jnz         .filter_block1d8_h8_rowloop_ssse3

+    jnz         .loop

+%endm

-    add rsp, 16*5

-    pop rsp

+%macro HORIZx8 1

+    mov         rdx, arg(5)                 ;filter ptr

+    mov         rsi, arg(0)                 ;src_ptr

+    mov         rdi, arg(2)                 ;output_ptr

+    mov         rcx, 0x0400040

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

+    movdqa      xmm4, [rdx]                 ;load filters

+    movd        xmm5, rcx

+    packsswb    xmm4, xmm4

+    pshuflw     xmm0, xmm4, 0b              ;k0_k1

+    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3

+    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5

+    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

-;void vp9_filter_block1d16_h8_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

-;    short *filter

-;)

-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE

-sym(vp9_filter_block1d16_h8_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

+    punpcklqdq  xmm0, xmm0

+    punpcklqdq  xmm1, xmm1

+    punpcklqdq  xmm2, xmm2

+    punpcklqdq  xmm3, xmm3

-    ALIGN_STACK 16, rax

-    sub         rsp, 16*5

-    %define k0k1 [rsp + 16*0]

-    %define k2k3 [rsp + 16*1]

-    %define k4k5 [rsp + 16*2]

-    %define k6k7 [rsp + 16*3]

-    %define krd [rsp + 16*4]

+    movdqa      k0k1, xmm0

+    movdqa      k2k3, xmm1

+    pshufd      xmm5, xmm5, 0

+    movdqa      k4k5, xmm2

+    movdqa      k6k7, xmm3

+    movdqa      krd, xmm5

+    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line

+    movsxd      rdx, dword ptr arg(3)       ;output_pitch

+    movsxd      rcx, dword ptr arg(4)       ;output_height

+.loop:

+    movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4

+    movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12

+    punpcklqdq  xmm0,   xmm3

+    movdqa      xmm1,   xmm0

+    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]

+    pmaddubsw   xmm0,   k0k1

+    movdqa      xmm2,   xmm1

+    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]

+    pmaddubsw   xmm1,   k2k3

+    movdqa      xmm4,   xmm2

+    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]

+    pmaddubsw   xmm2,   k4k5

+    pshufb      xmm4,   [GLOBAL(shuf_t6t7)]

+    pmaddubsw   xmm4,   k6k7

+    paddsw      xmm0,   xmm1

+    paddsw      xmm0,   xmm4

+    paddsw      xmm0,   xmm2

+    paddsw      xmm0,   krd

+    psraw       xmm0,   7

+    packuswb    xmm0,   xmm0

+%if %1

+    movq        xmm1,   [rdi]

+    pavgb       xmm0,   xmm1

+%endif

+    lea         rsi,    [rsi + rax]

+    movq        [rdi],  xmm0

+    lea         rdi,    [rdi + rdx]

+    dec         rcx

+    jnz         .loop

+%endm

+%macro HORIZx16 1

     mov         rdx, arg(5)                 ;filter ptr

     mov         rsi, arg(0)                 ;src_ptr

     mov         rdi, arg(2)                 ;output_ptr

@@ -453,13 +704,10 @@

     movsxd      rdx, dword ptr arg(3)       ;output_pitch

     movsxd      rcx, dword ptr arg(4)       ;output_height

-.filter_block1d16_h8_rowloop_ssse3:

+.loop:

     movq        xmm0,   [rsi - 3]    ; -3 -2 -1  0  1  2  3  4

-;    movq        xmm3,   [rsi + 4]    ; 4  5  6  7  8  9 10 11

     movq        xmm3,   [rsi + 5]    ; 5  6  7  8  9 10 11 12

-;note: if we create a k0_k7 filter, we can save a pshufb

-;    punpcklbw   xmm0,   xmm3         ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11

     punpcklqdq  xmm0,   xmm3

     movdqa      xmm1,   xmm0

@@ -486,10 +734,7 @@

     movq        xmm3,   [rsi +  5]

-;    movq        xmm7,   [rsi + 12]

     movq        xmm7,   [rsi + 13]

-;note: same as above

-;    punpcklbw   xmm3,   xmm7

     punpcklqdq  xmm3,   xmm7

     movdqa      xmm1,   xmm3

@@ -508,12 +753,16 @@

     pmaddubsw   xmm4,   k6k7

     paddsw      xmm3,   xmm1

+    paddsw      xmm3,   xmm4

     paddsw      xmm3,   xmm2

     paddsw      xmm3,   krd

-    paddsw      xmm3,   xmm4

     psraw       xmm3,   7

     packuswb    xmm3,   xmm3

     punpcklqdq  xmm0,   xmm3

+%if %1

+    movdqa      xmm1,   [rdi]

+    pavgb       xmm0,   xmm1

+%endif

     lea         rsi,    [rsi + rax]

     movdqa      [rdi],  xmm0

@@ -520,8 +769,39 @@

     lea         rdi,    [rdi + rdx]

     dec         rcx

-    jnz         .filter_block1d16_h8_rowloop_ssse3

+    jnz         .loop

+%endm

+;void vp9_filter_block1d4_h8_ssse3

+;(

+;    unsigned char  *src_ptr,

+;    unsigned int    src_pixels_per_line,

+;    unsigned char  *output_ptr,

+;    unsigned int    output_pitch,

+;    unsigned int    output_height,

+;    short *filter

+;)

+global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE

+sym(vp9_filter_block1d4_h8_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    HORIZx4 0

     add rsp, 16*5

     pop rsp

@@ -534,7 +814,188 @@

     pop         rbp

ret

+;void vp9_filter_block1d8_h8_ssse3

+;(

+;    unsigned char  *src_ptr,

+;    unsigned int    src_pixels_per_line,

+;    unsigned char  *output_ptr,

+;    unsigned int    output_pitch,

+;    unsigned int    output_height,

+;    short *filter

+;)

+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE

+sym(vp9_filter_block1d8_h8_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    HORIZx8 0

+    add rsp, 16*5

+    pop rsp

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+;void vp9_filter_block1d16_h8_ssse3

+;(

+;    unsigned char  *src_ptr,

+;    unsigned int    src_pixels_per_line,

+;    unsigned char  *output_ptr,

+;    unsigned int    output_pitch,

+;    unsigned int    output_height,

+;    short *filter

+;)

+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE

+sym(vp9_filter_block1d16_h8_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    HORIZx16 0

+    add rsp, 16*5

+    pop rsp

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE

+sym(vp9_filter_block1d4_h8_avg_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    HORIZx4 1

+    add rsp, 16*5

+    pop rsp

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE

+sym(vp9_filter_block1d8_h8_avg_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    HORIZx8 1

+    add rsp, 16*5

+    pop rsp

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

+global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE

+sym(vp9_filter_block1d16_h8_avg_ssse3):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 6

+    SAVE_XMM 7

+    GET_GOT     rbx

+    push        rsi

+    push        rdi

+    ; end prolog

+    ALIGN_STACK 16, rax

+    sub         rsp, 16*5

+    %define k0k1 [rsp + 16*0]

+    %define k2k3 [rsp + 16*1]

+    %define k4k5 [rsp + 16*2]

+    %define k6k7 [rsp + 16*3]

+    %define krd [rsp + 16*4]

+    HORIZx16 1

+    add rsp, 16*5

+    pop rsp

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_GOT

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

 SECTION_RODATA

 align 16

 shuf_t0t1:

--- a/vp9/common/x86/vp9_subpixel_mmx.asm

+++ /dev/null

@@ -1,268 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%define BLOCK_HEIGHT_WIDTH 4

-%define vp9_filter_weight 128

-%define VP9_FILTER_SHIFT  7

-;void vp9_filter_block1d_h6_mmx

-;(

-;    unsigned char   *src_ptr,

-;    unsigned short  *output_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned int    pixel_step,

-;    unsigned int    output_height,

-;    unsigned int    output_width,

-;    short           * vp9_filter

-;)

-global sym(vp9_filter_block1d_h6_mmx) PRIVATE

-sym(vp9_filter_block1d_h6_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,    arg(6) ;vp9_filter

-        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!

-        movq        mm2,    [rdx + 32]         ;

-        movq        mm6,    [rdx + 48]        ;

-        movq        mm7,    [rdx + 64]        ;

-        mov         rdi,    arg(1) ;output_ptr

-        mov         rsi,    arg(0) ;src_ptr

-        movsxd      rcx,    dword ptr arg(4) ;output_height

-        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?

-        pxor        mm0,    mm0              ; mm0 = 00000000

-.nextrow:

-        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5

-        movq        mm4,    mm3              ; mm4 = p-2..p5

-        psrlq       mm3,    8                ; mm3 = p-1..p5

-        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2

-        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.

-        movq        mm5,    mm4              ; mm5 = p-2..p5

-        punpckhbw   mm4,    mm0              ; mm5 = p2..p5

-        pmullw      mm4,    mm7              ; mm5 *= kernel 4 modifiers

-        paddsw      mm3,    mm4              ; mm3 += mm5

-        movq        mm4,    mm5              ; mm4 = p-2..p5;

-        psrlq       mm5,    16               ; mm5 = p0..p5;

-        punpcklbw   mm5,    mm0              ; mm5 = p0..p3

-        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers

-        paddsw      mm3,    mm5              ; mm3 += mm5

-        movq        mm5,    mm4              ; mm5 = p-2..p5

-        psrlq       mm4,    24               ; mm4 = p1..p5

-        punpcklbw   mm4,    mm0              ; mm4 = p1..p4

-        pmullw      mm4,    mm6              ; mm5 *= kernel 3 modifiers

-        paddsw      mm3,    mm4              ; mm3 += mm5

-        ; do outer positive taps

-        movd        mm4,    [rsi+3]

-        punpcklbw   mm4,    mm0              ; mm5 = p3..p6

-        pmullw      mm4,    [rdx+80]         ; mm5 *= kernel 0 modifiers

-        paddsw      mm3,    mm4              ; mm3 += mm5

-        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1

-        pmullw      mm5,    [rdx]            ; mm5 *= kernel 5 modifiers

-        paddsw      mm3,    mm5              ; mm3 += mm5

-        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value

-        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128

-        packuswb    mm3,    mm0              ; pack and unpack to saturate

-        punpcklbw   mm3,    mm0              ;

-        movq        [rdi],  mm3              ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line

-        add         rdi,    rax;

-%else

-        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line

-        add         rdi,    rax;

-        add         rsi,    r8               ; next line

-%endif

-        dec         rcx                      ; decrement count

-        jnz         .nextrow                 ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1dc_v6_mmx

-;(

-;   short *src_ptr,

-;   unsigned char *output_ptr,

-;    int output_pitch,

-;   unsigned int pixels_per_line,

-;   unsigned int pixel_step,

-;   unsigned int output_height,

-;   unsigned int output_width,

-;   short * vp9_filter

-;)

-global sym(vp9_filter_block1dc_v6_mmx) PRIVATE

-sym(vp9_filter_block1dc_v6_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 8

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        movq      mm5, [GLOBAL(rd)]

-        push        rbx

-        mov         rbx, arg(7) ;vp9_filter

-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!

-        movq      mm2, [rbx + 32]         ;

-        movq      mm6, [rbx + 48]        ;

-        movq      mm7, [rbx + 64]        ;

-        movsxd      rdx, dword ptr arg(3) ;pixels_per_line

-        mov         rdi, arg(1) ;output_ptr

-        mov         rsi, arg(0) ;src_ptr

-        sub         rsi, rdx

-        sub         rsi, rdx

-        movsxd      rcx, DWORD PTR arg(5) ;output_height

-        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?

-        pxor        mm0, mm0              ; mm0 = 00000000

-.nextrow_cv:

-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1

-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.

-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2

-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0

-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2

-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch

-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1

-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3

-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        paddsw      mm3, mm5               ; mm3 += round value

-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128

-        packuswb    mm3, mm0              ; pack and saturate

-        movd        [rdi],mm3             ; store the results in the destination

-        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the

-        ; recon block should be in cache this shouldn't cost much.  Its obviously

-        ; avoidable!!!.

-        lea         rdi,  [rdi+rax] ;

-        dec         rcx                   ; decrement count

-        jnz         .nextrow_cv           ; next row

-        pop         rbx

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-rd:

-    times 4 dw 0x40

-align 16

-global HIDDEN_DATA(sym(vp9_six_tap_mmx))

-sym(vp9_six_tap_mmx):

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw 128

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw -6

-    times 8 dw 123

-    times 8 dw 12

-    times 8 dw -1

-    times 8 dw 0

-    times 8 dw 2

-    times 8 dw -11

-    times 8 dw 108

-    times 8 dw 36

-    times 8 dw -8

-    times 8 dw 1

-    times 8 dw 0

-    times 8 dw -9

-    times 8 dw 93

-    times 8 dw 50

-    times 8 dw -6

-    times 8 dw 0

-    times 8 dw 3

-    times 8 dw -16

-    times 8 dw 77

-    times 8 dw 77

-    times 8 dw -16

-    times 8 dw 3

-    times 8 dw 0

-    times 8 dw -6

-    times 8 dw 50

-    times 8 dw 93

-    times 8 dw -9

-    times 8 dw 0

-    times 8 dw 1

-    times 8 dw -8

-    times 8 dw 36

-    times 8 dw 108

-    times 8 dw -11

-    times 8 dw 2

-    times 8 dw 0

-    times 8 dw -1

-    times 8 dw 12

-    times 8 dw 123

-    times 8 dw -6

-    times 8 dw 0

--- a/vp9/common/x86/vp9_subpixel_sse2.asm

+++ /dev/null

@@ -1,1372 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%define BLOCK_HEIGHT_WIDTH 4

-%define VP9_FILTER_WEIGHT 128

-%define VP9_FILTER_SHIFT  7

-;/************************************************************************************

-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

-; input pixel array has output_height rows. This routine assumes that output_height is an

-; even number. This function handles 8 pixels in horizontal direction, calculating ONE

-; rows each iteration to take advantage of the 128 bits operations.

-;*************************************************************************************/

-;void vp9_filter_block1d8_h6_sse2

-;(

-;    unsigned char  *src_ptr,

-;    unsigned short *output_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned int    pixel_step,

-;    unsigned int    output_height,

-;    unsigned int    output_width,

-;    short           *vp9_filter

-;)

-global sym(vp9_filter_block1d8_h6_sse2) PRIVATE

-sym(vp9_filter_block1d8_h6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(6) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(1) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5) ;output_width

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d8_h6_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]

-        movq        xmm1,       MMWORD PTR [rsi + 6]

-        prefetcht2  [rsi+rax-2]

-        pslldq      xmm1,       8

-        por         xmm1,       xmm3

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0

-        punpcklbw   xmm4,       xmm0

-        movdqa      XMMWORD Ptr [rdi],         xmm4

-        lea         rsi,        [rsi + rax]

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(5) ;[output_width]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d8_h6_rowloop                ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_h6_sse2

-;(

-;    unsigned char  *src_ptr,

-;    unsigned short *output_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned int    pixel_step,

-;    unsigned int    output_height,

-;    unsigned int    output_width,

-;    short           *vp9_filter

-;)

-;/************************************************************************************

-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

-; input pixel array has output_height rows. This routine assumes that output_height is an

-; even number. This function handles 8 pixels in horizontal direction, calculating ONE

-; rows each iteration to take advantage of the 128 bits operations.

-;*************************************************************************************/

-global sym(vp9_filter_block1d16_h6_sse2) PRIVATE

-sym(vp9_filter_block1d16_h6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(6) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(1) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5) ;output_width

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d16_h6_sse2_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]

-        movq        xmm1,       MMWORD PTR [rsi + 6]

-        movq        xmm2,       MMWORD PTR [rsi +14]

-        pslldq      xmm2,       8

-        por         xmm2,       xmm1

-        prefetcht2  [rsi+rax-2]

-        pslldq      xmm1,       8

-        por         xmm1,       xmm3

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0

-        punpcklbw   xmm4,       xmm0

-        movdqa      XMMWORD Ptr [rdi],         xmm4

-        movdqa      xmm3,       xmm2

-        movdqa      xmm4,       xmm2

-        movdqa      xmm5,       xmm2

-        movdqa      xmm6,       xmm2

-        movdqa      xmm7,       xmm2

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm2

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0

-        punpcklbw   xmm4,       xmm0

-        movdqa      XMMWORD Ptr [rdi+16],      xmm4

-        lea         rsi,        [rsi + rax]

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(5) ;[output_width]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_v6_sse2

-;(

-;    short *src_ptr,

-;    unsigned char *output_ptr,

-;    int dst_ptich,

-;    unsigned int pixels_per_line,

-;    unsigned int pixel_step,

-;    unsigned int output_height,

-;    unsigned int output_width,

-;    short * vp9_filter

-;)

-;/************************************************************************************

-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The

-; input pixel array has output_height rows.

-;*************************************************************************************/

-global sym(vp9_filter_block1d8_v6_sse2) PRIVATE

-sym(vp9_filter_block1d8_v6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 8

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rax,        arg(7) ;vp9_filter

-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

-        mov         rdi,        arg(1) ;output_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        sub         rsi,        rdx

-        sub         rsi,        rdx

-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]

-        pxor        xmm0,       xmm0                        ; clear xmm0

-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(2) ; dst_ptich

-%endif

-.vp9_filter_block1d8_v6_sse2_loop:

-        movdqa      xmm1,       XMMWORD PTR [rsi]

-        pmullw      xmm1,       [rax]

-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]

-        pmullw      xmm2,       [rax + 16]

-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]

-        pmullw      xmm3,       [rax + 32]

-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]

-        pmullw      xmm5,       [rax + 64]

-        add         rsi,        rdx

-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]

-        pmullw      xmm4,       [rax + 48]

-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]

-        pmullw      xmm6,       [rax + 80]

-        paddsw      xmm2,       xmm5

-        paddsw      xmm2,       xmm3

-        paddsw      xmm2,       xmm1

-        paddsw      xmm2,       xmm4

-        paddsw      xmm2,       xmm6

-        paddsw      xmm2,       xmm7

-        psraw       xmm2,       7

-        packuswb    xmm2,       xmm0              ; pack and saturate

-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(2) ;[dst_ptich]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx         ; decrement count

-        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_v6_sse2

-;(

-;    unsigned short *src_ptr,

-;    unsigned char *output_ptr,

-;    int dst_ptich,

-;    unsigned int pixels_per_line,

-;    unsigned int pixel_step,

-;    unsigned int output_height,

-;    unsigned int output_width,

-;    const short    *vp9_filter

-;)

-;/************************************************************************************

-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The

-; input pixel array has output_height rows.

-;*************************************************************************************/

-global sym(vp9_filter_block1d16_v6_sse2) PRIVATE

-sym(vp9_filter_block1d16_v6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 8

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rax,        arg(7) ;vp9_filter

-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

-        mov         rdi,        arg(1) ;output_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        sub         rsi,        rdx

-        sub         rsi,        rdx

-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(2) ; dst_ptich

-%endif

-.vp9_filter_block1d16_v6_sse2_loop:

-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.

-        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2

-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]

-        pmullw      xmm1,       [rax + 16]

-        pmullw      xmm2,       [rax + 16]

-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5

-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]

-        pmullw      xmm3,       [rax + 64]

-        pmullw      xmm4,       [rax + 64]

-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3

-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]

-        pmullw      xmm5,       [rax + 32]

-        pmullw      xmm6,       [rax + 32]

-        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1

-        movdqa      xmm0,       XMMWORD PTR [rsi + 16]

-        pmullw      xmm7,       [rax]

-        pmullw      xmm0,       [rax]

-        paddsw      xmm1,       xmm3

-        paddsw      xmm2,       xmm4

-        paddsw      xmm1,       xmm5

-        paddsw      xmm2,       xmm6

-        paddsw      xmm1,       xmm7

-        paddsw      xmm2,       xmm0

-        add         rsi,        rdx

-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4

-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]

-        pmullw      xmm3,       [rax + 48]

-        pmullw      xmm4,       [rax + 48]

-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6

-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]

-        pmullw      xmm5,       [rax + 80]

-        pmullw      xmm6,       [rax + 80]

-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]

-        pxor        xmm0,       xmm0                        ; clear xmm0

-        paddsw      xmm1,       xmm3

-        paddsw      xmm2,       xmm4

-        paddsw      xmm1,       xmm5

-        paddsw      xmm2,       xmm6

-        paddsw      xmm1,       xmm7

-        paddsw      xmm2,       xmm7

-        psraw       xmm1,       7

-        psraw       xmm2,       7

-        packuswb    xmm1,       xmm2              ; pack and saturate

-        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(2) ;[dst_ptich]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx         ; decrement count

-        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_h6_only_sse2

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    int dst_ptich,

-;    unsigned int    output_height,

-;    const short    *vp9_filter

-;)

-; First-pass filter only when yoffset==0

-global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE

-sym(vp9_filter_block1d8_h6_only_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(5) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(2) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(3) ;dst_ptich

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d8_h6_only_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]

-        movq        xmm1,       MMWORD PTR [rsi + 6]

-        prefetcht2  [rsi+rax-2]

-        pslldq      xmm1,       8

-        por         xmm1,       xmm3

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0

-        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination

-        lea         rsi,        [rsi + rax]

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(3) ;dst_ptich

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d8_h6_only_rowloop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_h6_only_sse2

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    int dst_ptich,

-;    unsigned int    output_height,

-;    const short    *vp9_filter

-;)

-; First-pass filter only when yoffset==0

-global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE

-sym(vp9_filter_block1d16_h6_only_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(5) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(2) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(3) ;dst_ptich

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d16_h6_only_sse2_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]

-        movq        xmm1,       MMWORD PTR [rsi + 6]

-        movq        xmm2,       MMWORD PTR [rsi +14]

-        pslldq      xmm2,       8

-        por         xmm2,       xmm1

-        prefetcht2  [rsi+rax-2]

-        pslldq      xmm1,       8

-        por         xmm1,       xmm3

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0                        ; lower 8 bytes

-        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination

-        movdqa      xmm3,       xmm2

-        movdqa      xmm4,       xmm2

-        movdqa      xmm5,       xmm2

-        movdqa      xmm6,       xmm2

-        movdqa      xmm7,       xmm2

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm2

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0                        ; higher 8 bytes

-        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination

-        lea         rsi,        [rsi + rax]

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(3) ;dst_ptich

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_v6_only_sse2

-;(

-;    unsigned char *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char *output_ptr,

-;    int dst_ptich,

-;    unsigned int output_height,

-;    const short    *vp9_filter

-;)

-; Second-pass filter only when xoffset==0

-global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE

-sym(vp9_filter_block1d8_v6_only_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(2) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-        mov         rax,        arg(5) ;vp9_filter

-        pxor        xmm0,       xmm0                        ; clear xmm0

-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(3) ; dst_ptich

-%endif

-.vp9_filter_block1d8_v6_only_sse2_loop:

-        movq        xmm1,       MMWORD PTR [rsi]

-        movq        xmm2,       MMWORD PTR [rsi + rdx]

-        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]

-        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]

-        add         rsi,        rdx

-        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]

-        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]

-        punpcklbw   xmm1,       xmm0

-        pmullw      xmm1,       [rax]

-        punpcklbw   xmm2,       xmm0

-        pmullw      xmm2,       [rax + 16]

-        punpcklbw   xmm3,       xmm0

-        pmullw      xmm3,       [rax + 32]

-        punpcklbw   xmm5,       xmm0

-        pmullw      xmm5,       [rax + 64]

-        punpcklbw   xmm4,       xmm0

-        pmullw      xmm4,       [rax + 48]

-        punpcklbw   xmm6,       xmm0

-        pmullw      xmm6,       [rax + 80]

-        paddsw      xmm2,       xmm5

-        paddsw      xmm2,       xmm3

-        paddsw      xmm2,       xmm1

-        paddsw      xmm2,       xmm4

-        paddsw      xmm2,       xmm6

-        paddsw      xmm2,       xmm7

-        psraw       xmm2,       7

-        packuswb    xmm2,       xmm0              ; pack and saturate

-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(3) ;[dst_ptich]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx         ; decrement count

-        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_unpack_block1d16_h6_sse2

-;(

-;    unsigned char  *src_ptr,

-;    unsigned short *output_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned int    output_height,

-;    unsigned int    output_width

-;)

-global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE

-sym(vp9_unpack_block1d16_h6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(1) ;output_ptr

-        movsxd      rcx,        dword ptr arg(3) ;output_height

-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source

-%endif

-.unpack_block1d16_h6_sse2_rowloop:

-        movq        xmm1,       MMWORD PTR [rsi]            ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2

-        movq        xmm3,       MMWORD PTR [rsi+8]          ; make copy of xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        punpcklbw   xmm1,       xmm0

-        movdqa      XMMWORD Ptr [rdi],         xmm1

-        movdqa      XMMWORD Ptr [rdi + 16],    xmm3

-        lea         rsi,        [rsi + rax]

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(4) ;[output_width]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict16x16_sse2

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-extern sym(vp9_bilinear_filters_mmx)

-global sym(vp9_bilinear_predict16x16_sse2) PRIVATE

-sym(vp9_bilinear_predict16x16_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ;const short *HFilter = bilinear_filters_mmx[xoffset]

-    ;const short *VFilter = bilinear_filters_mmx[yoffset]

-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]

-        movsxd      rax,        dword ptr arg(2) ;xoffset

-        cmp         rax,        0      ;skip first_pass filter if xoffset=0

-        je          .b16x16_sp_only

-        shl         rax,        5

-        add         rax,        rcx    ;HFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

-        movdqa      xmm1,       [rax]

-        movdqa      xmm2,       [rax+16]

-        movsxd      rax,        dword ptr arg(3) ;yoffset

-        cmp         rax,        0      ;skip second_pass filter if yoffset=0

-        je          .b16x16_fp_only

-        shl         rax,        5

-        add         rax,        rcx    ;VFilter

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-        pxor        xmm0,       xmm0

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5) ;dst_pitch

-%endif

-        ; get the first horizontal line done

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm1

-        movdqu      xmm5,       [rsi+1]

-        movdqa      xmm6,       xmm5

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       xmm2

-        pmullw      xmm6,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm4

-        add         rsi,        rdx                 ; next line

-.next_row:

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm1

-        movdqu      xmm5,       [rsi+1]

-        movdqa      xmm6,       xmm5

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       xmm2

-        pmullw      xmm6,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        movdqa      xmm5,       xmm7

-        movdqa      xmm6,       xmm7

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       [rax]

-        pmullw      xmm6,       [rax]

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm4

-        pmullw      xmm3,       [rax+16]

-        pmullw      xmm4,       [rax+16]

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm4

-        movdqa      [rdi],      xmm3                 ; store the results in the destination

-        add         rsi,        rdx                 ; next line

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(5) ;dst_pitch

-%else

-        add         rdi,        r8

-%endif

-        cmp         rdi,        rcx

-        jne         .next_row

-        jmp         .done

-.b16x16_sp_only:

-        movsxd      rax,        dword ptr arg(3) ;yoffset

-        shl         rax,        5

-        add         rax,        rcx    ;VFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

-        movdqa      xmm1,       [rax]

-        movdqa      xmm2,       [rax+16]

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

-        pxor        xmm0,       xmm0

-        ; get the first horizontal line done

-        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        add         rsi,        rax                 ; next line

-.next_row_spo:

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm5,       xmm7

-        movdqa      xmm6,       xmm7

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        movdqa      xmm7,       xmm3

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm5,       xmm1

-        pmullw      xmm6,       xmm1

-        pmullw      xmm3,       xmm2

-        pmullw      xmm4,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm4

-        movdqa      [rdi],      xmm3                 ; store the results in the destination

-        add         rsi,        rax                 ; next line

-        add         rdi,        rdx                 ;dst_pitch

-        cmp         rdi,        rcx

-        jne         .next_row_spo

-        jmp         .done

-.b16x16_fp_only:

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

-        pxor        xmm0,       xmm0

-.next_row_fpo:

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm1

-        movdqu      xmm5,       [rsi+1]

-        movdqa      xmm6,       xmm5

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       xmm2

-        pmullw      xmm6,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm4

-        movdqa      [rdi],      xmm3                 ; store the results in the destination

-        add         rsi,        rax                 ; next line

-        add         rdi,        rdx                 ; dst_pitch

-        cmp         rdi,        rcx

-        jne         .next_row_fpo

-.done:

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict8x8_sse2

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-extern sym(vp9_bilinear_filters_mmx)

-global sym(vp9_bilinear_predict8x8_sse2) PRIVATE

-sym(vp9_bilinear_predict8x8_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ALIGN_STACK 16, rax

-    sub         rsp, 144                         ; reserve 144 bytes

-    ;const short *HFilter = bilinear_filters_mmx[xoffset]

-    ;const short *VFilter = bilinear_filters_mmx[yoffset]

-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-    ;Read 9-line unaligned data in and put them on stack. This gives a big

-    ;performance boost.

-        movdqu      xmm0,       [rsi]

-        lea         rax,        [rdx + rdx*2]

-        movdqu      xmm1,       [rsi+rdx]

-        movdqu      xmm2,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm3,       [rsi]

-        movdqu      xmm4,       [rsi+rdx]

-        movdqu      xmm5,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm6,       [rsi]

-        movdqu      xmm7,       [rsi+rdx]

-        movdqa      XMMWORD PTR [rsp],            xmm0

-        movdqu      xmm0,       [rsi+rdx*2]

-        movdqa      XMMWORD PTR [rsp+16],         xmm1

-        movdqa      XMMWORD PTR [rsp+32],         xmm2

-        movdqa      XMMWORD PTR [rsp+48],         xmm3

-        movdqa      XMMWORD PTR [rsp+64],         xmm4

-        movdqa      XMMWORD PTR [rsp+80],         xmm5

-        movdqa      XMMWORD PTR [rsp+96],         xmm6

-        movdqa      XMMWORD PTR [rsp+112],        xmm7

-        movdqa      XMMWORD PTR [rsp+128],        xmm0

-        movsxd      rax,        dword ptr arg(2) ;xoffset

-        shl         rax,        5

-        add         rax,        rcx    ;HFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

-        movdqa      xmm1,       [rax]

-        movdqa      xmm2,       [rax+16]

-        movsxd      rax,        dword ptr arg(3) ;yoffset

-        shl         rax,        5

-        add         rax,        rcx    ;VFilter

-        lea         rcx,        [rdi+rdx*8]

-        movdqa      xmm5,       [rax]

-        movdqa      xmm6,       [rax+16]

-        pxor        xmm0,       xmm0

-        ; get the first horizontal line done

-        movdqa      xmm3,       XMMWORD PTR [rsp]

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        psrldq      xmm4,       1

-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07

-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm2

-        paddw       xmm3,       xmm4

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        movdqa      xmm7,       xmm3

-        add         rsp,        16                 ; next line

-.next_row8x8:

-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        psrldq      xmm4,       1

-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07

-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm2

-        paddw       xmm3,       xmm4

-        pmullw      xmm7,       xmm5

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        movdqa      xmm4,       xmm3

-        pmullw      xmm3,       xmm6

-        paddw       xmm3,       xmm7

-        movdqa      xmm7,       xmm4

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        packuswb    xmm3,       xmm0

-        movq        [rdi],      xmm3                 ; store the results in the destination

-        add         rsp,        16                 ; next line

-        add         rdi,        rdx

-        cmp         rdi,        rcx

-        jne         .next_row8x8

-    ;add rsp, 144

-    pop rsp

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-rd:

-    times 8 dw 0x40

--- a/vp9/common/x86/vp9_subpixel_ssse3.asm

+++ /dev/null

@@ -1,1515 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%define BLOCK_HEIGHT_WIDTH 4

-%define VP9_FILTER_WEIGHT 128

-%define VP9_FILTER_SHIFT  7

-;/************************************************************************************

-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

-; input pixel array has output_height rows. This routine assumes that output_height is an

-; even number. This function handles 8 pixels in horizontal direction, calculating ONE

-; rows each iteration to take advantage of the 128 bits operations.

-;

-; This is an implementation of some of the SSE optimizations first seen in ffvp8

-;

-;*************************************************************************************/

-;void vp9_filter_block1d8_h6_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

-;    unsigned int    vp9_filter_index

-;)

-global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE

-sym(vp9_filter_block1d8_h6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4

-    movdqa      xmm7, [GLOBAL(rd)]

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    mov         rdi, arg(2)             ;output_ptr

-    cmp         esi, DWORD PTR [rax]

-    je          vp9_filter_block1d8_h4_ssse3

-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-    sub         rdi, rdx

-;xmm3 free

-.filter_block1d8_h6_rowloop_ssse3:

-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    movdqa      xmm1,   xmm0

-    pmaddubsw   xmm0,   xmm4

-    movdqa      xmm2,   xmm1

-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

-    pmaddubsw   xmm1,   xmm5

-    lea         rdi,    [rdi + rdx]

-    pmaddubsw   xmm2,   xmm6

-    lea         rsi,    [rsi + rax]

-    dec         rcx

-    paddsw      xmm0,   xmm1

-    paddsw      xmm2,   xmm7

-    paddsw      xmm0,   xmm2

-    psraw       xmm0,   7

-    packuswb    xmm0,   xmm0

-    movq        MMWORD Ptr [rdi], xmm0

-    jnz         .filter_block1d8_h6_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-vp9_filter_block1d8_h4_ssse3:

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]

-    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-    sub         rdi, rdx

-.filter_block1d8_h4_rowloop_ssse3:

-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    movdqa      xmm2,   xmm0

-    pshufb      xmm0,   xmm3

-    pshufb      xmm2,   xmm4

-    pmaddubsw   xmm0,   xmm5

-    lea         rdi,    [rdi + rdx]

-    pmaddubsw   xmm2,   xmm6

-    lea         rsi,    [rsi + rax]

-    dec         rcx

-    paddsw      xmm0,   xmm7

-    paddsw      xmm0,   xmm2

-    psraw       xmm0,   7

-    packuswb    xmm0,   xmm0

-    movq        MMWORD Ptr [rdi], xmm0

-    jnz         .filter_block1d8_h4_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_h6_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

-;    unsigned int    vp9_filter_index

-;)

-global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE

-sym(vp9_filter_block1d16_h6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)           ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    mov         rdi, arg(2)                     ;output_ptr

-    mov         rsi, arg(0)                     ;src_ptr

-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)           ;output_height

-    movsxd      rdx, dword ptr arg(3)           ;output_pitch

-.filter_block1d16_h6_rowloop_ssse3:

-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    movdqa      xmm1,   xmm0

-    pmaddubsw   xmm0,   xmm4

-    movdqa      xmm2,   xmm1

-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

-    movq        xmm3,   MMWORD PTR [rsi +  6]

-    pmaddubsw   xmm1,   xmm5

-    movq        xmm7,   MMWORD PTR [rsi + 11]

-    pmaddubsw   xmm2,   xmm6

-    punpcklbw   xmm3,   xmm7

-    paddsw      xmm0,   xmm1

-    movdqa      xmm1,   xmm3

-    pmaddubsw   xmm3,   xmm4

-    paddsw      xmm0,   xmm2

-    movdqa      xmm2,   xmm1

-    paddsw      xmm0,   [GLOBAL(rd)]

-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

-    psraw       xmm0,   7

-    pmaddubsw   xmm1,   xmm5

-    pmaddubsw   xmm2,   xmm6

-    packuswb    xmm0,   xmm0

-    lea         rsi,    [rsi + rax]

-    paddsw      xmm3,   xmm1

-    paddsw      xmm3,   xmm2

-    paddsw      xmm3,   [GLOBAL(rd)]

-    psraw       xmm3,   7

-    packuswb    xmm3,   xmm3

-    punpcklqdq  xmm0,   xmm3

-    movdqa      XMMWORD Ptr [rdi], xmm0

-    lea         rdi,    [rdi + rdx]

-    dec         rcx

-    jnz         .filter_block1d16_h6_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d4_h6_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

-;    unsigned int    vp9_filter_index

-;)

-global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE

-sym(vp9_filter_block1d4_h6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    movdqa      xmm7, [GLOBAL(rd)]

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d4_h4_ssse3

-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rdi, arg(2)             ;output_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-;xmm3 free

-.filter_block1d4_h6_rowloop_ssse3:

-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

-    movdqa      xmm1, xmm0

-    pshufb      xmm0, [GLOBAL(shuf1b)]

-    movdqa      xmm2, xmm1

-    pshufb      xmm1, [GLOBAL(shuf2b)]

-    pmaddubsw   xmm0, xmm4

-    pshufb      xmm2, [GLOBAL(shuf3b)]

-    pmaddubsw   xmm1, xmm5

-;--

-    pmaddubsw   xmm2, xmm6

-    lea         rsi,    [rsi + rax]

-;--

-    paddsw      xmm0, xmm1

-    paddsw      xmm0, xmm7

-    pxor        xmm1, xmm1

-    paddsw      xmm0, xmm2

-    psraw       xmm0, 7

-    packuswb    xmm0, xmm0

-    movd        DWORD PTR [rdi], xmm0

-    add         rdi, rdx

-    dec         rcx

-    jnz         .filter_block1d4_h6_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d4_h4_ssse3:

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]

-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rdi, arg(2)             ;output_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-.filter_block1d4_h4_rowloop_ssse3:

-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]

-    movdqa      xmm2, xmm1

-    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]

-    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]

-    pmaddubsw   xmm1, xmm5

-;--

-    pmaddubsw   xmm2, xmm6

-    lea         rsi,    [rsi + rax]

-;--

-    paddsw      xmm1, xmm7

-    paddsw      xmm1, xmm2

-    psraw       xmm1, 7

-    packuswb    xmm1, xmm1

-    movd        DWORD PTR [rdi], xmm1

-    add         rdi, rdx

-    dec         rcx

-    jnz         .filter_block1d4_h4_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_v6_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    unsigned int   vp9_filter_index

-;)

-global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE

-sym(vp9_filter_block1d16_v6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d16_v4_ssse3

-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch

-%endif

-    mov         rax, rsi

-    movsxd      rcx, DWORD PTR arg(4)   ;output_height

-    add         rax, rdx

-.vp9_filter_block1d16_v6_ssse3_loop:

-    movq        xmm1, MMWORD PTR [rsi]                  ;A

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

-    pmaddubsw   xmm3, xmm6

-    punpcklbw   xmm1, xmm0                  ;A F

-    pmaddubsw   xmm2, xmm7

-    pmaddubsw   xmm1, xmm5

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm1

-    paddsw      xmm2, [GLOBAL(rd)]

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi], xmm2          ;store the results

-    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A

-    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F

-    pmaddubsw   xmm3, xmm6

-    punpcklbw   xmm1, xmm0                  ;A F

-    pmaddubsw   xmm2, xmm7

-    pmaddubsw   xmm1, xmm5

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm1

-    paddsw      xmm2, [GLOBAL(rd)]

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi+8], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;out_pitch

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d16_v6_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d16_v4_ssse3:

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch

-%endif

-    mov         rax, rsi

-    movsxd      rcx, DWORD PTR arg(4)   ;output_height

-    add         rax, rdx

-.vp9_filter_block1d16_v4_ssse3_loop:

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    pmaddubsw   xmm3, xmm6

-    pmaddubsw   xmm2, xmm7

-    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B

-    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

-    paddsw      xmm2, [GLOBAL(rd)]

-    paddsw      xmm2, xmm3

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    punpcklbw   xmm5, xmm4                  ;B D

-    punpcklbw   xmm1, xmm0                  ;C E

-    pmaddubsw   xmm1, xmm6

-    pmaddubsw   xmm5, xmm7

-    movdqa      xmm4, [GLOBAL(rd)]

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm5, xmm1

-    paddsw      xmm5, xmm4

-    psraw       xmm5, 7

-    packuswb    xmm5, xmm5

-    punpcklqdq  xmm2, xmm5

-    movdqa       XMMWORD PTR [rdi], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;out_pitch

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d16_v4_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_v6_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    unsigned int   vp9_filter_index

-;)

-global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE

-sym(vp9_filter_block1d8_v6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch

-%endif

-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d8_v4_ssse3

-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d8_v6_ssse3_loop:

-    movq        xmm1, MMWORD PTR [rsi]                  ;A

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

-    movdqa      xmm4, [GLOBAL(rd)]

-    pmaddubsw   xmm3, xmm6

-    punpcklbw   xmm1, xmm0                  ;A F

-    pmaddubsw   xmm2, xmm7

-    pmaddubsw   xmm1, xmm5

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm1

-    paddsw      xmm2, xmm4

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d8_v6_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d8_v4_ssse3:

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    movdqa      xmm5, [GLOBAL(rd)]

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d8_v4_ssse3_loop:

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    pmaddubsw   xmm3, xmm6

-    pmaddubsw   xmm2, xmm7

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm5

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d8_v4_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d4_v6_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    unsigned int   vp9_filter_index

-;)

-global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE

-sym(vp9_filter_block1d4_v6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch

-%endif

-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d4_v4_ssse3

-    movq        mm5, MMWORD PTR [rax]         ;k0_k5

-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4

-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d4_v6_ssse3_loop:

-    movd        mm1, DWORD PTR [rsi]                  ;A

-    movd        mm2, DWORD PTR [rsi + rdx]            ;B

-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C

-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D

-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   mm2, mm4                  ;B D

-    punpcklbw   mm3, mm0                  ;C E

-    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F

-    movq        mm4, [GLOBAL(rd)]

-    pmaddubsw   mm3, mm6

-    punpcklbw   mm1, mm0                  ;A F

-    pmaddubsw   mm2, mm7

-    pmaddubsw   mm1, mm5

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      mm2, mm3

-    paddsw      mm2, mm1

-    paddsw      mm2, mm4

-    psraw       mm2, 7

-    packuswb    mm2, mm2

-    movd        DWORD PTR [rdi], mm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d4_v6_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d4_v4_ssse3:

-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4

-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3

-    movq        mm5, MMWORD PTR [GLOBAL(rd)]

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d4_v4_ssse3_loop:

-    movd        mm2, DWORD PTR [rsi + rdx]            ;B

-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C

-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D

-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   mm2, mm4                  ;B D

-    punpcklbw   mm3, mm0                  ;C E

-    pmaddubsw   mm3, mm6

-    pmaddubsw   mm2, mm7

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      mm2, mm3

-    paddsw      mm2, mm5

-    psraw       mm2, 7

-    packuswb    mm2, mm2

-    movd        DWORD PTR [rdi], mm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d4_v4_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict16x16_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE

-sym(vp9_bilinear_predict16x16_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]

-        movsxd      rax,        dword ptr arg(2)    ; xoffset

-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0

-        je          .b16x16_sp_only

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; HFilter

-        mov         rdi,        arg(4)              ; dst_ptr

-        mov         rsi,        arg(0)              ; src_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm1,       [rax]

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0

-        je          .b16x16_fp_only

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

-        movdqa      xmm2,       [rax]

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5)    ; dst_pitch

-%endif

-        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07

-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08

-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16

-        lea         rsi,        [rsi + rdx]         ; next line

-        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14

-        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16

-        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value

-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-.next_row:

-        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07

-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

-        punpcklbw   xmm6,       xmm5

-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16

-        lea         rsi,        [rsi + rdx]         ; next line

-        pmaddubsw   xmm6,       xmm1

-        punpcklbw   xmm4,       xmm5

-        pmaddubsw   xmm4,       xmm1

-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value

-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value

-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128

-        packuswb    xmm6,       xmm4

-        movdqa      xmm5,       xmm7

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm5,       xmm2

-        punpckhbw   xmm7,       xmm6

-        pmaddubsw   xmm7,       xmm2

-        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value

-        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128

-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value

-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128

-        packuswb    xmm5,       xmm7

-        movdqa      xmm7,       xmm6

-        movdqa      [rdi],      xmm5                ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(5)    ; dst_pitch

-%else

-        add         rdi,        r8

-%endif

-        cmp         rdi,        rcx

-        jne         .next_row

-        jmp         .done

-.b16x16_sp_only:

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        mov         rdi,        arg(4)              ; dst_ptr

-        mov         rsi,        arg(0)              ; src_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm1,       [rax]               ; VFilter

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

-        ; get the first horizontal line done

-        movq        xmm4,       [rsi]               ; load row 0

-        movq        xmm2,       [rsi + 8]           ; load row 0

-        lea         rsi,        [rsi + rax]         ; next line

-.next_row_sp:

-        movq        xmm3,       [rsi]               ; load row + 1

-        movq        xmm5,       [rsi + 8]           ; load row + 1

-        punpcklbw   xmm4,       xmm3

-        punpcklbw   xmm2,       xmm5

-        pmaddubsw   xmm4,       xmm1

-        movq        xmm7,       [rsi + rax]         ; load row + 2

-        pmaddubsw   xmm2,       xmm1

-        movq        xmm6,       [rsi + rax + 8]     ; load row + 2

-        punpcklbw   xmm3,       xmm7

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm3,       xmm1

-        paddw       xmm4,       [GLOBAL(rd)]

-        pmaddubsw   xmm5,       xmm1

-        paddw       xmm2,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        psraw       xmm2,       VP9_FILTER_SHIFT

-        packuswb    xmm4,       xmm2

-        paddw       xmm3,       [GLOBAL(rd)]

-        movdqa      [rdi],      xmm4                ; store row 0

-        paddw       xmm5,       [GLOBAL(rd)]

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm5

-        movdqa      xmm4,       xmm7

-        movdqa      [rdi + rdx],xmm3                ; store row 1

-        lea         rsi,        [rsi + 2*rax]

-        movdqa      xmm2,       xmm6

-        lea         rdi,        [rdi + 2*rdx]

-        cmp         rdi,        rcx

-        jne         .next_row_sp

-        jmp         .done

-.b16x16_fp_only:

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

-.next_row_fp:

-        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07

-        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08

-        punpcklbw   xmm2,       xmm4

-        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15

-        pmaddubsw   xmm2,       xmm1

-        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16

-        lea         rsi,        [rsi + rax]         ; next line

-        punpcklbw   xmm3,       xmm4

-        pmaddubsw   xmm3,       xmm1

-        movq        xmm5,       [rsi]

-        paddw       xmm2,       [GLOBAL(rd)]

-        movq        xmm7,       [rsi+1]

-        movq        xmm6,       [rsi+8]

-        psraw       xmm2,       VP9_FILTER_SHIFT

-        punpcklbw   xmm5,       xmm7

-        movq        xmm7,       [rsi+9]

-        paddw       xmm3,       [GLOBAL(rd)]

-        pmaddubsw   xmm5,       xmm1

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        punpcklbw   xmm6,       xmm7

-        packuswb    xmm2,       xmm3

-        pmaddubsw   xmm6,       xmm1

-        movdqa      [rdi],      xmm2                ; store the results in the destination

-        paddw       xmm5,       [GLOBAL(rd)]

-        lea         rdi,        [rdi + rdx]         ; dst_pitch

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        paddw       xmm6,       [GLOBAL(rd)]

-        psraw       xmm6,       VP9_FILTER_SHIFT

-        packuswb    xmm5,       xmm6

-        lea         rsi,        [rsi + rax]         ; next line

-        movdqa      [rdi],      xmm5                ; store the results in the destination

-        lea         rdi,        [rdi + rdx]         ; dst_pitch

-        cmp         rdi,        rcx

-        jne         .next_row_fp

-.done:

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict8x8_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE

-sym(vp9_bilinear_predict8x8_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ALIGN_STACK 16, rax

-    sub         rsp, 144                         ; reserve 144 bytes

-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-    ;Read 9-line unaligned data in and put them on stack. This gives a big

-    ;performance boost.

-        movdqu      xmm0,       [rsi]

-        lea         rax,        [rdx + rdx*2]

-        movdqu      xmm1,       [rsi+rdx]

-        movdqu      xmm2,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm3,       [rsi]

-        movdqu      xmm4,       [rsi+rdx]

-        movdqu      xmm5,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm6,       [rsi]

-        movdqu      xmm7,       [rsi+rdx]

-        movdqa      XMMWORD PTR [rsp],            xmm0

-        movdqu      xmm0,       [rsi+rdx*2]

-        movdqa      XMMWORD PTR [rsp+16],         xmm1

-        movdqa      XMMWORD PTR [rsp+32],         xmm2

-        movdqa      XMMWORD PTR [rsp+48],         xmm3

-        movdqa      XMMWORD PTR [rsp+64],         xmm4

-        movdqa      XMMWORD PTR [rsp+80],         xmm5

-        movdqa      XMMWORD PTR [rsp+96],         xmm6

-        movdqa      XMMWORD PTR [rsp+112],        xmm7

-        movdqa      XMMWORD PTR [rsp+128],        xmm0

-        movsxd      rax,        dword ptr arg(2)    ; xoffset

-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0

-        je          .b8x8_sp_only

-        shl         rax,        4

-        add         rax,        rcx                 ; HFilter

-        mov         rdi,        arg(4)              ; dst_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm0,       [rax]

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0

-        je          .b8x8_fp_only

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        lea         rcx,        [rdi+rdx*8]

-        movdqa      xmm1,       [rax]

-        ; get the first horizontal line done

-        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

-        psrldq      xmm5,       1

-        lea         rsp,        [rsp + 16]          ; next line

-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08

-        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-.next_row:

-        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-        lea         rsp,        [rsp + 16]          ; next line

-        movdqa      xmm5,       xmm6

-        psrldq      xmm5,       1

-        punpcklbw   xmm6,       xmm5

-        pmaddubsw   xmm6,       xmm0

-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value

-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128

-        packuswb    xmm6,       xmm6

-        punpcklbw   xmm7,       xmm6

-        pmaddubsw   xmm7,       xmm1

-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value

-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128

-        packuswb    xmm7,       xmm7

-        movq        [rdi],      xmm7                ; store the results in the destination

-        lea         rdi,        [rdi + rdx]

-        movdqa      xmm7,       xmm6

-        cmp         rdi,        rcx

-        jne         .next_row

-        jmp         .done8x8

-.b8x8_sp_only:

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm0,       [rax]               ; VFilter

-        movq        xmm1,       XMMWORD PTR [rsp]

-        movq        xmm2,       XMMWORD PTR [rsp+16]

-        movq        xmm3,       XMMWORD PTR [rsp+32]

-        punpcklbw   xmm1,       xmm2

-        movq        xmm4,       XMMWORD PTR [rsp+48]

-        punpcklbw   xmm2,       xmm3

-        movq        xmm5,       XMMWORD PTR [rsp+64]

-        punpcklbw   xmm3,       xmm4

-        movq        xmm6,       XMMWORD PTR [rsp+80]

-        punpcklbw   xmm4,       xmm5

-        movq        xmm7,       XMMWORD PTR [rsp+96]

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm1,       xmm0

-        pmaddubsw   xmm2,       xmm0

-        pmaddubsw   xmm3,       xmm0

-        pmaddubsw   xmm4,       xmm0

-        pmaddubsw   xmm5,       xmm0

-        punpcklbw   xmm6,       xmm7

-        pmaddubsw   xmm6,       xmm0

-        paddw       xmm1,       [GLOBAL(rd)]

-        paddw       xmm2,       [GLOBAL(rd)]

-        psraw       xmm1,       VP9_FILTER_SHIFT

-        paddw       xmm3,       [GLOBAL(rd)]

-        psraw       xmm2,       VP9_FILTER_SHIFT

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        paddw       xmm5,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        paddw       xmm6,       [GLOBAL(rd)]

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        psraw       xmm6,       VP9_FILTER_SHIFT

-        packuswb    xmm1,       xmm1

-        packuswb    xmm2,       xmm2

-        movq        [rdi],      xmm1

-        packuswb    xmm3,       xmm3

-        movq        [rdi+rdx],  xmm2

-        packuswb    xmm4,       xmm4

-        movq        xmm1,       XMMWORD PTR [rsp+112]

-        lea         rdi,        [rdi + 2*rdx]

-        movq        xmm2,       XMMWORD PTR [rsp+128]

-        packuswb    xmm5,       xmm5

-        movq        [rdi],      xmm3

-        packuswb    xmm6,       xmm6

-        movq        [rdi+rdx],  xmm4

-        lea         rdi,        [rdi + 2*rdx]

-        punpcklbw   xmm7,       xmm1

-        movq        [rdi],      xmm5

-        pmaddubsw   xmm7,       xmm0

-        movq        [rdi+rdx],  xmm6

-        punpcklbw   xmm1,       xmm2

-        pmaddubsw   xmm1,       xmm0

-        paddw       xmm7,       [GLOBAL(rd)]

-        psraw       xmm7,       VP9_FILTER_SHIFT

-        paddw       xmm1,       [GLOBAL(rd)]

-        psraw       xmm1,       VP9_FILTER_SHIFT

-        packuswb    xmm7,       xmm7

-        packuswb    xmm1,       xmm1

-        lea         rdi,        [rdi + 2*rdx]

-        movq        [rdi],      xmm7

-        movq        [rdi+rdx],  xmm1

-        lea         rsp,        [rsp + 144]

-        jmp         .done8x8

-.b8x8_fp_only:

-        lea         rcx,        [rdi+rdx*8]

-.next_row_fp:

-        movdqa      xmm1,       XMMWORD PTR [rsp]

-        movdqa      xmm3,       XMMWORD PTR [rsp+16]

-        movdqa      xmm2,       xmm1

-        movdqa      xmm5,       XMMWORD PTR [rsp+32]

-        psrldq      xmm2,       1

-        movdqa      xmm7,       XMMWORD PTR [rsp+48]

-        movdqa      xmm4,       xmm3

-        psrldq      xmm4,       1

-        movdqa      xmm6,       xmm5

-        psrldq      xmm6,       1

-        punpcklbw   xmm1,       xmm2

-        pmaddubsw   xmm1,       xmm0

-        punpcklbw   xmm3,       xmm4

-        pmaddubsw   xmm3,       xmm0

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm5,       xmm0

-        movdqa      xmm2,       xmm7

-        psrldq      xmm2,       1

-        punpcklbw   xmm7,       xmm2

-        pmaddubsw   xmm7,       xmm0

-        paddw       xmm1,       [GLOBAL(rd)]

-        psraw       xmm1,       VP9_FILTER_SHIFT

-        paddw       xmm3,       [GLOBAL(rd)]

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        paddw       xmm5,       [GLOBAL(rd)]

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        paddw       xmm7,       [GLOBAL(rd)]

-        psraw       xmm7,       VP9_FILTER_SHIFT

-        packuswb    xmm1,       xmm1

-        packuswb    xmm3,       xmm3

-        packuswb    xmm5,       xmm5

-        movq        [rdi],      xmm1

-        packuswb    xmm7,       xmm7

-        movq        [rdi+rdx],  xmm3

-        lea         rdi,        [rdi + 2*rdx]

-        movq        [rdi],      xmm5

-        lea         rsp,        [rsp + 4*16]

-        movq        [rdi+rdx],  xmm7

-        lea         rdi,        [rdi + 2*rdx]

-        cmp         rdi,        rcx

-        jne         .next_row_fp

-        lea         rsp,        [rsp + 16]

-.done8x8:

-    ;add rsp, 144

-    pop         rsp

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-shuf1b:

-    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12

-shuf2b:

-    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11

-shuf3b:

-    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

-align 16

-shuf2bfrom1:

-    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13

-align 16

-shuf3bfrom1:

-    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

-align 16

-rd:

-    times 8 dw 0x40

-align 16

-k0_k5:

-    times 8 db 0, 0             ;placeholder

-    times 8 db 0, 0

-    times 8 db 2, 1

-    times 8 db 0, 0

-    times 8 db 3, 3

-    times 8 db 0, 0

-    times 8 db 1, 2

-    times 8 db 0, 0

-k1_k3:

-    times 8 db  0,    0         ;placeholder

-    times 8 db  -6,  12

-    times 8 db -11,  36

-    times 8 db  -9,  50

-    times 8 db -16,  77

-    times 8 db  -6,  93

-    times 8 db  -8, 108

-    times 8 db  -1, 123

-k2_k4:

-    times 8 db 128,    0        ;placeholder

-    times 8 db 123,   -1

-    times 8 db 108,   -8

-    times 8 db  93,   -6

-    times 8 db  77,  -16

-    times 8 db  50,   -9

-    times 8 db  36,  -11

-    times 8 db  12,   -6

-align 16

-bilinear_filters_ssse3:

-    times 8 db 128, 0

-    times 8 db 120, 8

-    times 8 db 112, 16

-    times 8 db 104, 24

-    times 8 db 96,  32

-    times 8 db 88,  40

-    times 8 db 80,  48

-    times 8 db 72,  56

-    times 8 db 64,  64

-    times 8 db 56,  72

-    times 8 db 48,  80

-    times 8 db 40,  88

-    times 8 db 32,  96

-    times 8 db 24,  104

-    times 8 db 16,  112

-    times 8 db 8,   120

--- a/vp9/common/x86/vp9_subpixel_x86.h

+++ /dev/null

@@ -1,109 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_

-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_

-/* Note:

- *

- * This platform is commonly built for runtime CPU detection. If you modify

- * any of the function mappings present in this file, be sure to also update

- * them in the function pointer initialization code

- */

-#if HAVE_MMX

-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);

-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);

-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp9_subpix_sixtap16x16

-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx

-#undef  vp9_subpix_sixtap8x8

-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx

-#undef  vp9_subpix_sixtap8x4

-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx

-#undef  vp9_subpix_sixtap4x4

-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx

-#undef  vp9_subpix_bilinear16x16

-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx

-#endif

-#endif

-#if HAVE_SSE2

-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);

-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);

-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp9_subpix_sixtap16x16

-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2

-#undef  vp9_subpix_sixtap8x8

-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2

-#undef  vp9_subpix_sixtap8x4

-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2

-#undef  vp9_subpix_bilinear16x16

-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2

-#undef  vp9_subpix_bilinear8x8

-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2

-#endif

-#endif

-#if HAVE_SSSE3

-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);

-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);

-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);

-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp9_subpix_sixtap16x16

-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3

-#undef  vp9_subpix_sixtap8x8

-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3

-#undef  vp9_subpix_sixtap8x4

-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3

-#undef  vp9_subpix_sixtap4x4

-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3

-#undef  vp9_subpix_bilinear16x16

-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3

-#undef  vp9_subpix_bilinear8x8

-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3

-#endif

-#endif

-#endif

--- a/vp9/decoder/vp9_dboolhuff.c

+++ b/vp9/decoder/vp9_dboolhuff.c

@@ -8,19 +8,19 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

-#include "vp9/decoder/vp9_dboolhuff.h"

 #include "vpx_ports/mem.h"

 #include "vpx_mem/vpx_mem.h"

+#include "vp9/decoder/vp9_dboolhuff.h"

 int vp9_start_decode(BOOL_DECODER *br,

                      const unsigned char *source,

                      unsigned int source_sz) {

   br->user_buffer_end = source + source_sz;

-  br->user_buffer     = source;

-  br->value    = 0;

-  br->count    = -8;

-  br->range    = 255;

+  br->user_buffer = source;

+  br->value = 0;

+  br->count = -8;

+  br->range = 255;

   if (source_sz && !source)

     return 1;

@@ -33,17 +33,28 @@

 void vp9_bool_decoder_fill(BOOL_DECODER *br) {

-  const unsigned char *bufptr;

-  const unsigned char *bufend;

-  VP9_BD_VALUE         value;

-  int                  count;

-  bufend = br->user_buffer_end;

-  bufptr = br->user_buffer;

-  value = br->value;

-  count = br->count;

+  const unsigned char *bufptr = br->user_buffer;

+  const unsigned char *bufend = br->user_buffer_end;

+  VP9_BD_VALUE value = br->value;

+  int count = br->count;

+  int shift = VP9_BD_VALUE_SIZE - 8 - (count + 8);

+  int loop_end = 0;

+  int bits_left = (int)((bufend - bufptr)*CHAR_BIT);

+  int x = shift + CHAR_BIT - bits_left;

-  VP9DX_BOOL_DECODER_FILL(count, value, bufptr, bufend);

+  if (x >= 0) {

+    count += VP9_LOTS_OF_BITS;

+    loop_end = x;

+  }

+  if (x < 0 || bits_left) {

+    while (shift >= loop_end) {

+      count += CHAR_BIT;

+      value |= (VP9_BD_VALUE)*bufptr++ << shift;

+      shift -= CHAR_BIT;

+    }

+  }

   br->user_buffer = bufptr;

   br->value = value;

   br->count = count;

@@ -52,7 +63,9 @@

 static int get_unsigned_bits(unsigned num_values) {

   int cat = 0;

-  if ((num_values--) <= 1) return 0;

+  if (num_values <= 1)

+    return 0;

+  num_values--;

   while (num_values > 0) {

     cat++;

     num_values >>= 1;

@@ -61,9 +74,12 @@

 int vp9_inv_recenter_nonneg(int v, int m) {

-  if (v > (m << 1)) return v;

-  else if ((v & 1) == 0) return (v >> 1) + m;

-  else return m - ((v + 1) >> 1);

+  if (v > (m << 1))

+    return v;

+  else if ((v & 1) == 0)

+    return (v >> 1) + m;

+  else

+    return m - ((v + 1) >> 1);

 int vp9_decode_uniform(BOOL_DECODER *br, int n) {

--- a/vp9/decoder/vp9_dboolhuff.h

+++ b/vp9/decoder/vp9_dboolhuff.h

@@ -13,6 +13,7 @@

 #include <stddef.h>

 #include <limits.h>

 #include "./vpx_config.h"

 #include "vpx_ports/mem.h"

 #include "vpx/vpx_integer.h"

@@ -19,11 +20,11 @@

 typedef size_t VP9_BD_VALUE;

-# define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)

+#define VP9_BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)

 /*This is meant to be a large, positive constant that can still be efficiently

    loaded as an immediate (on platforms like ARM, for example).

   Even relatively modest values like 100 would work fine.*/

-# define VP9_LOTS_OF_BITS (0x40000000)

+#define VP9_LOTS_OF_BITS (0x40000000)

 typedef struct {

   const unsigned char *user_buffer_end;

@@ -45,47 +46,14 @@

 int vp9_decode_term_subexp(BOOL_DECODER *br, int k, int num_syms);

 int vp9_inv_recenter_nonneg(int v, int m);

-/*The refill loop is used in several places, so define it in a macro to make

-   sure they're all consistent.

-  An inline function would be cleaner, but has a significant penalty, because

-   multiple BOOL_DECODER fields must be modified, and the compiler is not smart

-   enough to eliminate the stores to those fields and the subsequent reloads

-   from them when inlining the function.*/

-#define VP9DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \

-  do \

-  { \

-    int shift = VP9_BD_VALUE_SIZE - 8 - ((_count) + 8); \

-    int loop_end, x; \

-    int bits_left = (int)(((_bufend)-(_bufptr))*CHAR_BIT); \

-    \

-    x = shift + CHAR_BIT - bits_left; \

-    loop_end = 0; \

-    if(x >= 0) \

-    { \

-      (_count) += VP9_LOTS_OF_BITS; \

-      loop_end = x; \

-      if(!bits_left) break; \

-    } \

-    while(shift >= loop_end) \

-    { \

-      (_count) += CHAR_BIT; \

-      (_value) |= (VP9_BD_VALUE)*(_bufptr)++ << shift; \

-      shift -= CHAR_BIT; \

-    } \

-  } \

-  while(0) \

 static int decode_bool(BOOL_DECODER *br, int probability) {

   unsigned int bit = 0;

   VP9_BD_VALUE value;

-  unsigned int split;

   VP9_BD_VALUE bigsplit;

   int count;

   unsigned int range;

+  unsigned int split = 1 + (((br->range - 1) * probability) >> 8);

-  split = 1 + (((br->range - 1) * probability) >> 8);

   if (br->count < 0)

     vp9_bool_decoder_fill(br);

@@ -120,7 +88,7 @@

   int bit;

   for (bit = bits - 1; bit >= 0; bit--) {

-    z |= (decode_bool(br, 0x80) << bit);

+    z |= decode_bool(br, 0x80) << bit;

   return z;

@@ -127,29 +95,23 @@

 static int bool_error(BOOL_DECODER *br) {

-  /* Check if we have reached the end of the buffer.

-   *

-   * Variable 'count' stores the number of bits in the 'value' buffer, minus

-   * 8. The top byte is part of the algorithm, and the remainder is buffered

-   * to be shifted into it. So if count == 8, the top 16 bits of 'value' are

-   * occupied, 8 for the algorithm and 8 in the buffer.

-   *

-   * When reading a byte from the user's buffer, count is filled with 8 and

-   * one byte is filled into the value buffer. When we reach the end of the

-   * data, count is additionally filled with VP9_LOTS_OF_BITS. So when

-   * count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.

-   */

-  if ((br->count > VP9_BD_VALUE_SIZE) && (br->count < VP9_LOTS_OF_BITS)) {

-    /* We have tried to decode bits after the end of

-     * stream was encountered.

-     */

-    return 1;

-  }

-  /* No error. */

-  return 0;

+  // Check if we have reached the end of the buffer.

+  //

+  // Variable 'count' stores the number of bits in the 'value' buffer, minus

+  // 8. The top byte is part of the algorithm, and the remainder is buffered

+  // to be shifted into it. So if count == 8, the top 16 bits of 'value' are

+  // occupied, 8 for the algorithm and 8 in the buffer.

+  //

+  // When reading a byte from the user's buffer, count is filled with 8 and

+  // one byte is filled into the value buffer. When we reach the end of the

+  // data, count is additionally filled with VP9_LOTS_OF_BITS. So when

+  // count == VP9_LOTS_OF_BITS - 1, the user's data has been exhausted.

+  //

+  // 1 if we have tried to decode bits after the end of stream was encountered.

+  // 0 No error.

+  return br->count > VP9_BD_VALUE_SIZE && br->count < VP9_LOTS_OF_BITS;

-extern int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);

+int vp9_decode_unsigned_max(BOOL_DECODER *br, int max);

 #endif  // VP9_DECODER_VP9_DBOOLHUFF_H_

--- a/vp9/decoder/vp9_decodemv.c

+++ b/vp9/decoder/vp9_decodemv.c

@@ -12,6 +12,7 @@

 #include "vp9/decoder/vp9_treereader.h"

 #include "vp9/common/vp9_entropymv.h"

 #include "vp9/common/vp9_entropymode.h"

+#include "vp9/common/vp9_reconinter.h"

 #include "vp9/decoder/vp9_onyxd_int.h"

 #include "vp9/common/vp9_findnearmv.h"

 #include "vp9/common/vp9_common.h"

@@ -28,12 +29,13 @@

 #ifdef DEBUG_DEC_MV

 int dec_mvcount = 0;

 #endif

 // #define DEC_DEBUG

 #ifdef DEC_DEBUG

 extern int dec_debug;

 #endif

-static int read_bmode(vp9_reader *bc, const vp9_prob *p) {

+static B_PREDICTION_MODE read_bmode(vp9_reader *bc, const vp9_prob *p) {

   B_PREDICTION_MODE m = treed_read(bc, vp9_bmode_tree, p);

 #if CONFIG_NEWBINTRAMODES

   if (m == B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS)

@@ -43,24 +45,24 @@

   return m;

-static int read_kf_bmode(vp9_reader *bc, const vp9_prob *p) {

-  return treed_read(bc, vp9_kf_bmode_tree, p);

+static B_PREDICTION_MODE read_kf_bmode(vp9_reader *bc, const vp9_prob *p) {

+  return (B_PREDICTION_MODE)treed_read(bc, vp9_kf_bmode_tree, p);

-static int read_ymode(vp9_reader *bc, const vp9_prob *p) {

-  return treed_read(bc, vp9_ymode_tree, p);

+static MB_PREDICTION_MODE read_ymode(vp9_reader *bc, const vp9_prob *p) {

+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_ymode_tree, p);

-static int read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {

-  return treed_read(bc, vp9_sb_ymode_tree, p);

+static MB_PREDICTION_MODE read_sb_ymode(vp9_reader *bc, const vp9_prob *p) {

+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_sb_ymode_tree, p);

-static int read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {

-  return treed_read(bc, vp9_uv_mode_tree, p);

+static MB_PREDICTION_MODE read_kf_sb_ymode(vp9_reader *bc, const vp9_prob *p) {

+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);

-static int read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {

-  return treed_read(bc, vp9_kf_ymode_tree, p);

+static MB_PREDICTION_MODE read_kf_mb_ymode(vp9_reader *bc, const vp9_prob *p) {

+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_kf_ymode_tree, p);

 static int read_i8x8_mode(vp9_reader *bc, const vp9_prob *p) {

@@ -67,29 +69,39 @@

   return treed_read(bc, vp9_i8x8_mode_tree, p);

-static int read_uv_mode(vp9_reader *bc, const vp9_prob *p) {

-  return treed_read(bc, vp9_uv_mode_tree, p);

+static MB_PREDICTION_MODE read_uv_mode(vp9_reader *bc, const vp9_prob *p) {

+  return (MB_PREDICTION_MODE)treed_read(bc, vp9_uv_mode_tree, p);

 // This function reads the current macro block's segnent id from the bitstream

 // It should only be called if a segment map update is indicated.

-static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi,

-                          MACROBLOCKD *xd) {

-  /* Is segmentation enabled */

+static void read_mb_segid(vp9_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *xd) {

   if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {

-    /* If so then read the segment id. */

-    if (vp9_read(r, xd->mb_segment_tree_probs[0]))

-      mi->segment_id =

-        (unsigned char)(2 + vp9_read(r, xd->mb_segment_tree_probs[2]));

-    else

-      mi->segment_id =

-        (unsigned char)(vp9_read(r, xd->mb_segment_tree_probs[1]));

+    const vp9_prob *const p = xd->mb_segment_tree_probs;

+    mi->segment_id = vp9_read(r, p[0]) ? 2 + vp9_read(r, p[2])

+                                       : vp9_read(r, p[1]);

+// This function reads the current macro block's segnent id from the bitstream

+// It should only be called if a segment map update is indicated.

+static void read_mb_segid_except(VP9_COMMON *cm,

+                                 vp9_reader *r, MB_MODE_INFO *mi,

+                                 MACROBLOCKD *xd, int mb_row, int mb_col) {

+  const int mb_index = mb_row * cm->mb_cols + mb_col;

+  const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, mb_index);

+  const vp9_prob *const p = xd->mb_segment_tree_probs;

+  const vp9_prob prob = xd->mb_segment_mispred_tree_probs[pred_seg_id];

+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {

+    mi->segment_id = vp9_read(r, prob)

+        ? 2 + (pred_seg_id  < 2 ? vp9_read(r, p[2]) : (pred_seg_id == 2))

+        :     (pred_seg_id >= 2 ? vp9_read(r, p[1]) : (pred_seg_id == 0));

+  }

+}

 #if CONFIG_NEW_MVREF

-int vp9_read_mv_ref_id(vp9_reader *r,

-                       vp9_prob * ref_id_probs) {

+int vp9_read_mv_ref_id(vp9_reader *r, vp9_prob *ref_id_probs) {

   int ref_index = 0;

   if (vp9_read(r, ref_id_probs[0])) {

@@ -111,10 +123,13 @@

                          int mb_col,

                          BOOL_DECODER* const bc) {

   VP9_COMMON *const cm = &pbi->common;

+  MACROBLOCKD *const xd  = &pbi->mb;

   const int mis = pbi->common.mode_info_stride;

   int map_index = mb_row * pbi->common.mb_cols + mb_col;

   MB_PREDICTION_MODE y_mode;

+  m->mbmi.ref_frame = INTRA_FRAME;

   // Read the Macroblock segmentation map if it is being updated explicitly

   // this frame (reset to 0 by default).

   m->mbmi.segment_id = 0;

@@ -139,30 +154,19 @@

   m->mbmi.mb_skip_coeff = 0;

   if (pbi->common.mb_no_coeff_skip &&

-      (!vp9_segfeature_active(&pbi->mb,

-                              m->mbmi.segment_id, SEG_LVL_EOB) ||

-       (vp9_get_segdata(&pbi->mb,

-                        m->mbmi.segment_id, SEG_LVL_EOB) != 0))) {

-    MACROBLOCKD *const xd  = &pbi->mb;

-    m->mbmi.mb_skip_coeff =

-      vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));

+      (!vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id, SEG_LVL_SKIP))) {

+    m->mbmi.mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, &pbi->mb,

+                                                           PRED_MBSKIP));

   } else {

-    if (vp9_segfeature_active(&pbi->mb,

-                              m->mbmi.segment_id, SEG_LVL_EOB) &&

-        (vp9_get_segdata(&pbi->mb,

-                         m->mbmi.segment_id, SEG_LVL_EOB) == 0)) {

-      m->mbmi.mb_skip_coeff = 1;

-    } else

-      m->mbmi.mb_skip_coeff = 0;

+    m->mbmi.mb_skip_coeff = vp9_segfeature_active(&pbi->mb, m->mbmi.segment_id,

+                                                  SEG_LVL_SKIP);

-  if (m->mbmi.sb_type) {

-    y_mode = (MB_PREDICTION_MODE) read_kf_sb_ymode(bc,

-      pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]);

-  } else {

-    y_mode = (MB_PREDICTION_MODE) read_kf_mb_ymode(bc,

-      pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);

-  }

+  y_mode = m->mbmi.sb_type ?

+      read_kf_sb_ymode(bc,

+          pbi->common.sb_kf_ymode_prob[pbi->common.kf_ymode_probs_index]):

+      read_kf_mb_ymode(bc,

+          pbi->common.kf_ymode_prob[pbi->common.kf_ymode_probs_index]);

   m->mbmi.ref_frame = INTRA_FRAME;

@@ -169,30 +173,33 @@

   if ((m->mbmi.mode = y_mode) == B_PRED) {

     int i = 0;

     do {

-      const B_PREDICTION_MODE A = above_block_mode(m, i, mis);

-      const B_PREDICTION_MODE L = left_block_mode(m, i);

+      const B_PREDICTION_MODE a = above_block_mode(m, i, mis);

+      const B_PREDICTION_MODE l = (xd->left_available || (i & 3)) ?

+                                  left_block_mode(m, i) : B_DC_PRED;

-      m->bmi[i].as_mode.first =

-        (B_PREDICTION_MODE) read_kf_bmode(

-          bc, pbi->common.kf_bmode_prob [A] [L]);

+      m->bmi[i].as_mode.first = read_kf_bmode(bc,

+                                              pbi->common.kf_bmode_prob[a][l]);

     } while (++i < 16);

   if ((m->mbmi.mode = y_mode) == I8X8_PRED) {

     int i;

-    int mode8x8;

     for (i = 0; i < 4; i++) {

-      int ib = vp9_i8x8_block[i];

-      mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);

+      const int ib = vp9_i8x8_block[i];

+      const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);

       m->bmi[ib + 0].as_mode.first = mode8x8;

       m->bmi[ib + 1].as_mode.first = mode8x8;

       m->bmi[ib + 4].as_mode.first = mode8x8;

       m->bmi[ib + 5].as_mode.first = mode8x8;

-  } else

-    m->mbmi.uv_mode = (MB_PREDICTION_MODE)read_uv_mode(bc,

-                                                       pbi->common.kf_uv_mode_prob[m->mbmi.mode]);

+  } else {

+    m->mbmi.uv_mode = read_uv_mode(bc,

+                                   pbi->common.kf_uv_mode_prob[m->mbmi.mode]);

+  }

-  if (cm->txfm_mode == TX_MODE_SELECT && m->mbmi.mb_skip_coeff == 0 &&

+  if (cm->txfm_mode == TX_MODE_SELECT &&

+      m->mbmi.mb_skip_coeff == 0 &&

       m->mbmi.mode <= I8X8_PRED) {

     // FIXME(rbultje) code ternary symbol once all experiments are merged

     m->mbmi.txfm_size = vp9_read(bc, cm->prob_tx[0]);

@@ -215,23 +222,23 @@

 static int read_nmv_component(vp9_reader *r,

                               int rv,

                               const nmv_component *mvcomp) {

-  int v, s, z, c, o, d;

-  s = vp9_read(r, mvcomp->sign);

-  c = treed_read(r, vp9_mv_class_tree, mvcomp->classes);

-  if (c == MV_CLASS_0) {

+  int mag, d;

+  const int sign = vp9_read(r, mvcomp->sign);

+  const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);

+  if (mv_class == MV_CLASS_0) {

     d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);

   } else {

-    int i, b;

+    int i;

+    int n = mv_class + CLASS0_BITS - 1;  // number of bits

     d = 0;

-    b = c + CLASS0_BITS - 1;  /* number of bits */

-    for (i = 0; i < b; ++i)

-      d |= (vp9_read(r, mvcomp->bits[i]) << i);

+    for (i = 0; i < n; ++i)

+      d |= vp9_read(r, mvcomp->bits[i]) << i;

-  o = d << 3;

-  z = vp9_get_mv_mag(c, o);

-  v = (s ? -(z + 8) : (z + 8));

-  return v;

+  mag = vp9_get_mv_mag(mv_class, d << 3);

+  return sign ? -(mag + 8) : (mag + 8);

 static int read_nmv_component_fp(vp9_reader *r,

@@ -239,43 +246,34 @@

                                  int rv,

                                  const nmv_component *mvcomp,

                                  int usehp) {

-  int s, z, c, o, d, e, f;

-  s = v < 0;

-  z = (s ? -v : v) - 1;       /* magnitude - 1 */

-  z &= ~7;

+  const int sign = v < 0;

+  int mag = ((sign ? -v : v) - 1) & ~7;  // magnitude - 1

+  int offset;

+  const int mv_class = vp9_get_mv_class(mag, &offset);

+  const int f = mv_class == MV_CLASS_0 ?

+      treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[offset >> 3]):

+      treed_read(r, vp9_mv_fp_tree, mvcomp->fp);

-  c = vp9_get_mv_class(z, &o);

-  d = o >> 3;

+  offset += f << 1;

-  if (c == MV_CLASS_0) {

-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->class0_fp[d]);

-  } else {

-    f = treed_read(r, vp9_mv_fp_tree, mvcomp->fp);

-  }

-  o += (f << 1);

   if (usehp) {

-    if (c == MV_CLASS_0) {

-      e = vp9_read(r, mvcomp->class0_hp);

-    } else {

-      e = vp9_read(r, mvcomp->hp);

-    }

-    o += e;

+    const vp9_prob p = mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp;

+    offset += vp9_read(r, p);

   } else {

-    ++o;  /* Note if hp is not used, the default value of the hp bit is 1 */

+    offset += 1;  // If hp is not used, the default value of the hp bit is 1

-  z = vp9_get_mv_mag(c, o);

-  v = (s ? -(z + 1) : (z + 1));

-  return v;

+  mag = vp9_get_mv_mag(mv_class, offset);

+  return sign ? -(mag + 1) : (mag + 1);

 static void read_nmv(vp9_reader *r, MV *mv, const MV *ref,

                      const nmv_context *mvctx) {

-  MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);

+  const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, mvctx->joints);

   mv->row = mv-> col = 0;

   if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {

     mv->row = read_nmv_component(r, ref->row, &mvctx->comps[0]);

   if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) {

     mv->col = read_nmv_component(r, ref->col, &mvctx->comps[1]);

@@ -283,7 +281,7 @@

 static void read_nmv_fp(vp9_reader *r, MV *mv, const MV *ref,

                         const nmv_context *mvctx, int usehp) {

-  MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);

+  const MV_JOINT_TYPE j = vp9_get_mv_joint(*mv);

   usehp = usehp && vp9_use_nmv_hp(ref);

   if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) {

     mv->row = read_nmv_component_fp(r, mv->row, ref->row, &mvctx->comps[0],

@@ -293,7 +291,10 @@

     mv->col = read_nmv_component_fp(r, mv->col, ref->col, &mvctx->comps[1],

                                     usehp);

-  //printf("  %d: %d %d ref: %d %d\n", usehp, mv->row, mv-> col, ref->row, ref->col);

+  /*

+  printf("MV: %d %d REF: %d %d\n", mv->row + ref->row, mv->col + ref->col,

+	 ref->row, ref->col);

+	 */

 static void update_nmv(vp9_reader *bc, vp9_prob *const p,

@@ -310,48 +311,40 @@

 static void read_nmvprobs(vp9_reader *bc, nmv_context *mvctx,

                           int usehp) {

   int i, j, k;

 #ifdef MV_GROUP_UPDATE

-  if (!vp9_read_bit(bc)) return;

+  if (!vp9_read_bit(bc))

+    return;

 #endif

-  for (j = 0; j < MV_JOINTS - 1; ++j) {

-    update_nmv(bc, &mvctx->joints[j],

-               VP9_NMV_UPDATE_PROB);

-  }

+  for (j = 0; j < MV_JOINTS - 1; ++j)

+    update_nmv(bc, &mvctx->joints[j], VP9_NMV_UPDATE_PROB);

   for (i = 0; i < 2; ++i) {

-    update_nmv(bc, &mvctx->comps[i].sign,

-               VP9_NMV_UPDATE_PROB);

-    for (j = 0; j < MV_CLASSES - 1; ++j) {

-      update_nmv(bc, &mvctx->comps[i].classes[j],

-                 VP9_NMV_UPDATE_PROB);

-    }

-    for (j = 0; j < CLASS0_SIZE - 1; ++j) {

-      update_nmv(bc, &mvctx->comps[i].class0[j],

-                 VP9_NMV_UPDATE_PROB);

-    }

-    for (j = 0; j < MV_OFFSET_BITS; ++j) {

-      update_nmv(bc, &mvctx->comps[i].bits[j],

-                 VP9_NMV_UPDATE_PROB);

-    }

+    update_nmv(bc, &mvctx->comps[i].sign, VP9_NMV_UPDATE_PROB);

+    for (j = 0; j < MV_CLASSES - 1; ++j)

+      update_nmv(bc, &mvctx->comps[i].classes[j], VP9_NMV_UPDATE_PROB);

+    for (j = 0; j < CLASS0_SIZE - 1; ++j)

+      update_nmv(bc, &mvctx->comps[i].class0[j], VP9_NMV_UPDATE_PROB);

+    for (j = 0; j < MV_OFFSET_BITS; ++j)

+      update_nmv(bc, &mvctx->comps[i].bits[j], VP9_NMV_UPDATE_PROB);

   for (i = 0; i < 2; ++i) {

     for (j = 0; j < CLASS0_SIZE; ++j) {

       for (k = 0; k < 3; ++k)

-        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k],

-                   VP9_NMV_UPDATE_PROB);

+        update_nmv(bc, &mvctx->comps[i].class0_fp[j][k], VP9_NMV_UPDATE_PROB);

-    for (j = 0; j < 3; ++j) {

-      update_nmv(bc, &mvctx->comps[i].fp[j],

-                 VP9_NMV_UPDATE_PROB);

-    }

+    for (j = 0; j < 3; ++j)

+      update_nmv(bc, &mvctx->comps[i].fp[j], VP9_NMV_UPDATE_PROB);

   if (usehp) {

     for (i = 0; i < 2; ++i) {

-      update_nmv(bc, &mvctx->comps[i].class0_hp,

-                 VP9_NMV_UPDATE_PROB);

-      update_nmv(bc, &mvctx->comps[i].hp,

-                 VP9_NMV_UPDATE_PROB);

+      update_nmv(bc, &mvctx->comps[i].class0_hp, VP9_NMV_UPDATE_PROB);

+      update_nmv(bc, &mvctx->comps[i].hp, VP9_NMV_UPDATE_PROB);

@@ -361,15 +354,11 @@

                                          vp9_reader *const bc,

                                          unsigned char segment_id) {

   MV_REFERENCE_FRAME ref_frame;

-  int seg_ref_active;

-  int seg_ref_count = 0;

   VP9_COMMON *const cm = &pbi->common;

   MACROBLOCKD *const xd = &pbi->mb;

-  seg_ref_active = vp9_segfeature_active(xd,

-                                         segment_id,

-                                         SEG_LVL_REF_FRAME);

+  int seg_ref_count = 0;

+  int seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);

   // If segment coding enabled does the segment allow for more than one

   // possible reference frame

@@ -384,15 +373,13 @@

   // multiple reference frame options

   if (!seg_ref_active || (seg_ref_count > 1)) {

     // Values used in prediction model coding

-    unsigned char prediction_flag;

-    vp9_prob pred_prob;

     MV_REFERENCE_FRAME pred_ref;

     // Get the context probability the prediction flag

-    pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);

+    vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_REF);

     // Read the prediction status flag

-    prediction_flag = (unsigned char)vp9_read(bc, pred_prob);

+    unsigned char prediction_flag = vp9_read(bc, pred_prob);

     // Store the prediction flag.

     vp9_set_pred_flag(xd, PRED_REF, prediction_flag);

@@ -403,9 +390,8 @@

     // If correctly predicted then use the predicted value

     if (prediction_flag) {

       ref_frame = pred_ref;

-    }

-    // else decode the explicitly coded value

-    else {

+    } else {

+      // decode the explicitly coded value

       vp9_prob mod_refprobs[PREDICTION_PROBS];

       vpx_memcpy(mod_refprobs,

                  cm->mod_refprobs[pred_ref], sizeof(mod_refprobs));

@@ -456,10 +442,8 @@

-  }

-  // Segment reference frame features are enabled

-  else {

+  } else {

+    // Segment reference frame features are enabled

     // The reference frame for the mb is considered as correclty predicted

     // if it is signaled at the segment level for the purposes of the

     // common prediction model

@@ -492,12 +476,12 @@

};

 #endif

-static const unsigned char mbsplit_fill_count[4] = {8, 8, 4, 1};

+static const unsigned char mbsplit_fill_count[4] = { 8, 8, 4, 1 };

 static const unsigned char mbsplit_fill_offset[4][16] = {

-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15},

-  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15},

-  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15},

-  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15}

+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 },

+  { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,   6,  7, 10, 11, 14, 15 },

+  { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9,  12, 13, 10, 11, 14, 15 },

+  { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15 }

};

 static void read_switchable_interp_probs(VP9D_COMP* const pbi,

@@ -506,7 +490,7 @@

   int i, j;

   for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {

     for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {

-      cm->fc.switchable_interp_prob[j][i] = vp9_read_literal(bc, 8);

+      cm->fc.switchable_interp_prob[j][i] = vp9_read_prob(bc);

   //printf("DECODER: %d %d\n", cm->fc.switchable_interp_prob[0],

@@ -527,13 +511,13 @@

 #if CONFIG_COMP_INTERINTRA_PRED

     if (cm->use_interintra) {

       if (vp9_read(bc, VP9_UPD_INTERINTRA_PROB))

-        cm->fc.interintra_prob  = (vp9_prob)vp9_read_literal(bc, 8);

+        cm->fc.interintra_prob = vp9_read_prob(bc);

 #endif

     // Decode the baseline probabilities for decoding reference frame

-    cm->prob_intra_coded = (vp9_prob)vp9_read_literal(bc, 8);

-    cm->prob_last_coded  = (vp9_prob)vp9_read_literal(bc, 8);

-    cm->prob_gf_coded    = (vp9_prob)vp9_read_literal(bc, 8);

+    cm->prob_intra_coded = vp9_read_prob(bc);

+    cm->prob_last_coded  = vp9_read_prob(bc);

+    cm->prob_gf_coded    = vp9_read_prob(bc);

     // Computes a modified set of probabilities for use when reference

     // frame prediction fails.

@@ -545,7 +529,7 @@

     if (cm->comp_pred_mode == HYBRID_PREDICTION) {

       int i;

       for (i = 0; i < COMP_PRED_CONTEXTS; i++)

-        cm->prob_comppred[i] = (vp9_prob)vp9_read_literal(bc, 8);

+        cm->prob_comppred[i] = vp9_read_prob(bc);

     if (vp9_read_bit(bc)) {

@@ -552,7 +536,7 @@

       int i = 0;

       do {

-        cm->fc.ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);

+        cm->fc.ymode_prob[i] = vp9_read_prob(bc);

       } while (++i < VP9_YMODES - 1);

@@ -560,7 +544,7 @@

       int i = 0;

       do {

-        cm->fc.sb_ymode_prob[i] = (vp9_prob) vp9_read_literal(bc, 8);

+        cm->fc.sb_ymode_prob[i] = vp9_read_prob(bc);

       } while (++i < VP9_I32X32_MODES - 1);

@@ -575,10 +559,10 @@

                                int mb_row, int mb_col,

                                BOOL_DECODER* const bc) {

   VP9_COMMON *const cm = &pbi->common;

-  MACROBLOCKD *const xd  = &pbi->mb;

+  MACROBLOCKD *const xd = &pbi->mb;

   MODE_INFO *mi = xd->mode_info_context;

   MB_MODE_INFO *mbmi = &mi->mbmi;

-  int index = mb_row * pbi->common.mb_cols + mb_col;

+  int mb_index = mb_row * pbi->common.mb_cols + mb_col;

   if (xd->segmentation_enabled) {

     if (xd->update_mb_segmentation_map) {

@@ -586,12 +570,10 @@

       if (cm->temporal_update) {

         // Get the context based probability for reading the

         // prediction status flag

-        vp9_prob pred_prob =

-          vp9_get_pred_prob(cm, xd, PRED_SEG_ID);

+        vp9_prob pred_prob = vp9_get_pred_prob(cm, xd, PRED_SEG_ID);

         // Read the prediction status flag

-        unsigned char seg_pred_flag =

-          (unsigned char)vp9_read(bc, pred_prob);

+        unsigned char seg_pred_flag = vp9_read(bc, pred_prob);

         // Store the prediction flag.

         vp9_set_pred_flag(xd, PRED_SEG_ID, seg_pred_flag);

@@ -599,17 +581,16 @@

         // If the value is flagged as correctly predicted

         // then use the predicted value

         if (seg_pred_flag) {

-          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, index);

+          mbmi->segment_id = vp9_get_pred_mb_segid(cm, xd, mb_index);

+        } else {

+          // Decode it explicitly

+          read_mb_segid_except(cm, bc, mbmi, xd, mb_row, mb_col);

-        // Else .... decode it explicitly

-        else {

-          read_mb_segid(bc, mbmi, xd);

-        }

-      }

-      // Normal unpredicted coding mode

-      else {

+      } else {

+        // Normal unpredicted coding mode

         read_mb_segid(bc, mbmi, xd);

       if (mbmi->sb_type) {

         const int nmbs = 1 << mbmi->sb_type;

         const int ymbs = MIN(cm->mb_rows - mb_row, nmbs);

@@ -618,12 +599,12 @@

         for (y = 0; y < ymbs; y++) {

           for (x = 0; x < xmbs; x++) {

-            cm->last_frame_seg_map[index + x + y * cm->mb_cols] =

+            cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols] =

                 mbmi->segment_id;

       } else {

-        cm->last_frame_seg_map[index] = mbmi->segment_id;

+        cm->last_frame_seg_map[mb_index] = mbmi->segment_id;

     } else {

       if (mbmi->sb_type) {

@@ -636,13 +617,12 @@

         for (y = 0; y < ymbs; y++) {

           for (x = 0; x < xmbs; x++) {

             segment_id = MIN(segment_id,

-                             cm->last_frame_seg_map[index + x +

-                                                    y * cm->mb_cols]);

+                cm->last_frame_seg_map[mb_index + x + y * cm->mb_cols]);

         mbmi->segment_id = segment_id;

       } else {

-        mbmi->segment_id = cm->last_frame_seg_map[index];

+        mbmi->segment_id = cm->last_frame_seg_map[mb_index];

   } else {

@@ -652,6 +632,27 @@

+static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src,

+                                       int mb_to_left_edge,

+                                       int mb_to_right_edge,

+                                       int mb_to_top_edge,

+                                       int mb_to_bottom_edge) {

+  dst->as_int = src->as_int;

+  clamp_mv(dst, mb_to_left_edge, mb_to_right_edge, mb_to_top_edge,

+           mb_to_bottom_edge);

+}

+static INLINE void process_mv(BOOL_DECODER* bc, MV *mv, MV *ref,

+                              nmv_context *nmvc, nmv_context_counts *mvctx,

+                              int usehp) {

+  read_nmv(bc, mv, ref, nmvc);

+  read_nmv_fp(bc, mv, ref, nmvc, usehp);

+  vp9_increment_nmv(mv, ref, mvctx, usehp);

+  mv->row += ref->row;

+  mv->col += ref->col;

+}

 static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,

                              MODE_INFO *prev_mi,

                              int mb_row, int mb_col,

@@ -659,31 +660,20 @@

   VP9_COMMON *const cm = &pbi->common;

   nmv_context *const nmvc = &pbi->common.fc.nmvc;

   const int mis = pbi->common.mode_info_stride;

-  MACROBLOCKD *const xd  = &pbi->mb;

+  MACROBLOCKD *const xd = &pbi->mb;

   int_mv *const mv = &mbmi->mv[0];

-  int mb_to_left_edge;

-  int mb_to_right_edge;

-  int mb_to_top_edge;

-  int mb_to_bottom_edge;

   const int mb_size = 1 << mi->mbmi.sb_type;

-  mb_to_top_edge = xd->mb_to_top_edge;

-  mb_to_bottom_edge = xd->mb_to_bottom_edge;

-  mb_to_top_edge -= LEFT_TOP_MARGIN;

-  mb_to_bottom_edge += RIGHT_BOTTOM_MARGIN;

+  const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&

+                                       cm->height == cm->last_height &&

+                                       !cm->error_resilient_mode;

+  int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;

   mbmi->need_to_clamp_mvs = 0;

   mbmi->need_to_clamp_secondmv = 0;

   mbmi->second_ref_frame = NONE;

-  /* Distance of Mb to the various image edges.

-   * These specified to 8th pel as they are always compared to MV values that are in 1/8th pel units

-   */

-  xd->mb_to_left_edge =

-    mb_to_left_edge = -((mb_col * 16) << 3);

-  mb_to_left_edge -= LEFT_TOP_MARGIN;

-  xd->mb_to_right_edge =

-      mb_to_right_edge = ((pbi->common.mb_cols - mb_size - mb_col) * 16) << 3;

-  mb_to_right_edge += RIGHT_BOTTOM_MARGIN;

   // Make sure the MACROBLOCKD mode info pointer is pointed at the

   // correct entry for the current macroblock.

@@ -690,69 +680,68 @@

   xd->mode_info_context = mi;

   xd->prev_mode_info_context = prev_mi;

+  // Distance of Mb to the various image edges.

+  // These specified to 8th pel as they are always compared to MV values

+  // that are in 1/8th pel units

+  set_mb_row(cm, xd, mb_row, mb_size);

+  set_mb_col(cm, xd, mb_col, mb_size);

+  mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN;

+  mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;

+  mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN;

+  mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;

   // Read the macroblock segment id.

   read_mb_segment_id(pbi, mb_row, mb_col, bc);

   if (pbi->common.mb_no_coeff_skip &&

-      (!vp9_segfeature_active(xd,

-                              mbmi->segment_id, SEG_LVL_EOB) ||

-       (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) != 0))) {

+      (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP))) {

     // Read the macroblock coeff skip flag if this feature is in use,

     // else default to 0

     mbmi->mb_skip_coeff = vp9_read(bc, vp9_get_pred_prob(cm, xd, PRED_MBSKIP));

   } else {

-    if (vp9_segfeature_active(xd,

-                              mbmi->segment_id, SEG_LVL_EOB) &&

-        (vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_EOB) == 0)) {

-      mbmi->mb_skip_coeff = 1;

-    } else

-      mbmi->mb_skip_coeff = 0;

+    mbmi->mb_skip_coeff = vp9_segfeature_active(xd, mbmi->segment_id,

+                                                SEG_LVL_SKIP);

   // Read the reference frame

-  if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)

-      && vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE) < NEARESTMV)

-    mbmi->ref_frame = INTRA_FRAME;

-  else

-    mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);

+  mbmi->ref_frame = read_ref_frame(pbi, bc, mbmi->segment_id);

+  /*

+  if (pbi->common.current_video_frame == 1)

+    printf("ref frame: %d [%d %d]\n", mbmi->ref_frame, mb_row, mb_col);

+    */

   // If reference frame is an Inter frame

   if (mbmi->ref_frame) {

     int_mv nearest, nearby, best_mv;

     int_mv nearest_second, nearby_second, best_mv_second;

-    vp9_prob mv_ref_p [VP9_MVREFS - 1];

+    vp9_prob mv_ref_p[VP9_MVREFS - 1];

-    int recon_y_stride, recon_yoffset;

-    int recon_uv_stride, recon_uvoffset;

     MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;

+    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];

-      int ref_fb_idx;

+      const int use_prev_in_find_best_ref =

+          xd->scale_factor[0].x_num == xd->scale_factor[0].x_den &&

+          xd->scale_factor[0].y_num == xd->scale_factor[0].y_den &&

+          !cm->error_resilient_mode &&

+          !cm->frame_parallel_decoding_mode;

       /* Select the appropriate reference frame for this MB */

-      if (ref_frame == LAST_FRAME)

-        ref_fb_idx = cm->lst_fb_idx;

-      else if (ref_frame == GOLDEN_FRAME)

-        ref_fb_idx = cm->gld_fb_idx;

-      else

-        ref_fb_idx = cm->alt_fb_idx;

+      const int ref_fb_idx = cm->active_ref_idx[ref_frame - 1];

-      recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride  ;

-      recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;

+      setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx],

+          mb_row, mb_col, &xd->scale_factor[0], &xd->scale_factor_uv[0]);

-      recon_yoffset = (mb_row * recon_y_stride * 16) + (mb_col * 16);

-      recon_uvoffset = (mb_row * recon_uv_stride * 8) + (mb_col * 8);

-      xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;

-      xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;

-      xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

 #ifdef DEC_DEBUG

       if (dec_debug)

         printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,

                xd->mode_info_context->mbmi.mv[0].as_mv.col);

 #endif

-      vp9_find_mv_refs(xd, mi, prev_mi,

+      // if (cm->current_video_frame == 1 && mb_row == 4 && mb_col == 5)

+      //  printf("Dello\n");

+      vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,

                        ref_frame, mbmi->ref_mvs[ref_frame],

                        cm->ref_frame_sign_bias);

@@ -759,16 +748,12 @@

       vp9_mv_ref_probs(&pbi->common, mv_ref_p,

                        mbmi->mb_mode_context[ref_frame]);

-      // Is the segment level mode feature enabled for this segment

-      if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {

-        mbmi->mode =

-          vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);

+      // If the segment level skip mode enabled

+      if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_SKIP)) {

+        mbmi->mode = ZEROMV;

       } else {

-        if (mbmi->sb_type)

-          mbmi->mode = read_sb_mv_ref(bc, mv_ref_p);

-        else

-          mbmi->mode = read_mv_ref(bc, mv_ref_p);

+        mbmi->mode = mbmi->sb_type ? read_sb_mv_ref(bc, mv_ref_p)

+                                   : read_mv_ref(bc, mv_ref_p);

         vp9_accum_mv_refs(&pbi->common, mbmi->mode,

                           mbmi->mb_mode_context[ref_frame]);

@@ -775,8 +760,9 @@

       if (mbmi->mode != ZEROMV) {

         vp9_find_best_ref_mvs(xd,

-                              xd->pre.y_buffer,

-                              recon_y_stride,

+                              use_prev_in_find_best_ref ?

+                                  xd->pre.y_buffer : NULL,

+                              xd->pre.y_stride,

                               mbmi->ref_mvs[ref_frame],

                               &nearest, &nearby);

@@ -791,8 +777,7 @@

 #endif

-    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV)

-    {

+    if (mbmi->mode >= NEARESTMV && mbmi->mode <= SPLITMV) {

       if (cm->mcomp_filter_type == SWITCHABLE) {

         mbmi->interp_filter = vp9_switchable_interp[

             treed_read(bc, vp9_switchable_interp_tree,

@@ -817,23 +802,22 @@

         mbmi->second_ref_frame = 1;

       if (mbmi->second_ref_frame > 0) {

         int second_ref_fb_idx;

+        int use_prev_in_find_best_ref;

+        xd->scale_factor[1] = cm->active_ref_scale[mbmi->second_ref_frame - 1];

+        use_prev_in_find_best_ref =

+            xd->scale_factor[1].x_num == xd->scale_factor[1].x_den &&

+            xd->scale_factor[1].y_num == xd->scale_factor[1].y_den &&

+            !cm->error_resilient_mode &&

+            !cm->frame_parallel_decoding_mode;

         /* Select the appropriate reference frame for this MB */

-        if (mbmi->second_ref_frame == LAST_FRAME)

-          second_ref_fb_idx = cm->lst_fb_idx;

-        else if (mbmi->second_ref_frame ==

-          GOLDEN_FRAME)

-          second_ref_fb_idx = cm->gld_fb_idx;

-        else

-          second_ref_fb_idx = cm->alt_fb_idx;

+        second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];

-        xd->second_pre.y_buffer =

-          cm->yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;

-        xd->second_pre.u_buffer =

-          cm->yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;

-        xd->second_pre.v_buffer =

-          cm->yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;

+        setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],

+             mb_row, mb_col, &xd->scale_factor[1], &xd->scale_factor_uv[1]);

-        vp9_find_mv_refs(xd, mi, prev_mi,

+        vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,

                          mbmi->second_ref_frame,

                          mbmi->ref_mvs[mbmi->second_ref_frame],

                          cm->ref_frame_sign_bias);

@@ -840,8 +824,9 @@

         if (mbmi->mode != ZEROMV) {

           vp9_find_best_ref_mvs(xd,

-                                xd->second_pre.y_buffer,

-                                recon_y_stride,

+                                use_prev_in_find_best_ref ?

+                                    xd->second_pre.y_buffer : NULL,

+                                xd->second_pre.y_stride,

                                 mbmi->ref_mvs[mbmi->second_ref_frame],

                                 &nearest_second,

                                 &nearby_second);

@@ -861,12 +846,11 @@

         pbi->common.fc.interintra_counts[

             mbmi->second_ref_frame == INTRA_FRAME]++;

         if (mbmi->second_ref_frame == INTRA_FRAME) {

-          mbmi->interintra_mode = (MB_PREDICTION_MODE)read_ymode(

-              bc, pbi->common.fc.ymode_prob);

+          mbmi->interintra_mode = read_ymode(bc, pbi->common.fc.ymode_prob);

           pbi->common.fc.ymode_counts[mbmi->interintra_mode]++;

 #if SEPARATE_INTERINTRA_UV

-          mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)read_uv_mode(

-              bc, pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]);

+          mbmi->interintra_uv_mode = read_uv_mode(bc,

+              pbi->common.fc.uv_mode_prob[mbmi->interintra_mode]);

           pbi->common.fc.uv_mode_counts[mbmi->interintra_mode]

                                        [mbmi->interintra_uv_mode]++;

 #else

@@ -905,28 +889,26 @@

     mbmi->uv_mode = DC_PRED;

     switch (mbmi->mode) {

       case SPLITMV: {

-        const int s = mbmi->partitioning =

-                        treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);

-        const int num_p = vp9_mbsplit_count [s];

+        const int s = treed_read(bc, vp9_mbsplit_tree, cm->fc.mbsplit_prob);

+        const int num_p = vp9_mbsplit_count[s];

         int j = 0;

-        cm->fc.mbsplit_counts[s]++;

+        cm->fc.mbsplit_counts[s]++;

         mbmi->need_to_clamp_mvs = 0;

-        do { /* for each subset j */

+        mbmi->partitioning = s;

+        do {  // for each subset j

           int_mv leftmv, abovemv, second_leftmv, second_abovemv;

           int_mv blockmv, secondmv;

-          int k;  /* first block in subset j */

           int mv_contz;

           int blockmode;

+          int k = vp9_mbsplit_offset[s][j];  // first block in subset j

-          k = vp9_mbsplit_offset[s][j];

-          leftmv.as_int = left_block_mv(mi, k);

+          leftmv.as_int = left_block_mv(xd, mi, k);

           abovemv.as_int = above_block_mv(mi, k, mis);

           second_leftmv.as_int = 0;

           second_abovemv.as_int = 0;

           if (mbmi->second_ref_frame > 0) {

-            second_leftmv.as_int = left_block_second_mv(mi, k);

+            second_leftmv.as_int = left_block_second_mv(xd, mi, k);

             second_abovemv.as_int = above_block_second_mv(mi, k, mis);

           mv_contz = vp9_mv_cont(&leftmv, &abovemv);

@@ -935,23 +917,13 @@

           switch (blockmode) {

             case NEW4X4:

-              read_nmv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc);

-              read_nmv_fp(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,

-                          xd->allow_high_precision_mv);

-              vp9_increment_nmv(&blockmv.as_mv, &best_mv.as_mv,

-                                &cm->fc.NMVcount, xd->allow_high_precision_mv);

-              blockmv.as_mv.row += best_mv.as_mv.row;

-              blockmv.as_mv.col += best_mv.as_mv.col;

+              process_mv(bc, &blockmv.as_mv, &best_mv.as_mv, nmvc,

+                         &cm->fc.NMVcount, xd->allow_high_precision_mv);

-              if (mbmi->second_ref_frame > 0) {

-                read_nmv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc);

-                read_nmv_fp(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,

-                            xd->allow_high_precision_mv);

-                vp9_increment_nmv(&secondmv.as_mv, &best_mv_second.as_mv,

-                                  &cm->fc.NMVcount, xd->allow_high_precision_mv);

-                secondmv.as_mv.row += best_mv_second.as_mv.row;

-                secondmv.as_mv.col += best_mv_second.as_mv.col;

-              }

+              if (mbmi->second_ref_frame > 0)

+                process_mv(bc, &secondmv.as_mv, &best_mv_second.as_mv, nmvc,

+                           &cm->fc.NMVcount, xd->allow_high_precision_mv);

 #ifdef VPX_MODE_COUNT

               vp9_mv_cont_count[mv_contz][3]++;

 #endif

@@ -1005,15 +977,14 @@

             /* Fill (uniform) modes, mvs of jth subset.

              Must do it here because ensuing subsets can

              refer back to us via "left" or "above". */

-            const unsigned char *fill_offset;

             unsigned int fill_count = mbsplit_fill_count[s];

+            const unsigned char *fill_offset =

+                &mbsplit_fill_offset[s][j * fill_count];

-            fill_offset = &mbsplit_fill_offset[s][(unsigned char)j * mbsplit_fill_count[s]];

             do {

-              mi->bmi[ *fill_offset].as_mv.first.as_int = blockmv.as_int;

+              mi->bmi[*fill_offset].as_mv[0].as_int = blockmv.as_int;

               if (mbmi->second_ref_frame > 0)

-                mi->bmi[ *fill_offset].as_mv.second.as_int = secondmv.as_int;

+                mi->bmi[*fill_offset].as_mv[1].as_int = secondmv.as_int;

               fill_offset++;

             } while (--fill_count);

@@ -1021,33 +992,35 @@

         } while (++j < num_p);

-      mv->as_int = mi->bmi[15].as_mv.first.as_int;

-      mbmi->mv[1].as_int = mi->bmi[15].as_mv.second.as_int;

+      mv->as_int = mi->bmi[15].as_mv[0].as_int;

+      mbmi->mv[1].as_int = mi->bmi[15].as_mv[1].as_int;

       break;  /* done with SPLITMV */

       case NEARMV:

-        mv->as_int = nearby.as_int;

-        /* Clip "next_nearest" so that it does not extend to far out of image */

-        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,

-                 mb_to_top_edge, mb_to_bottom_edge);

-        if (mbmi->second_ref_frame > 0) {

-          mbmi->mv[1].as_int = nearby_second.as_int;

-          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,

-                   mb_to_top_edge, mb_to_bottom_edge);

-        }

+        // Clip "next_nearest" so that it does not extend to far out of image

+        assign_and_clamp_mv(mv, &nearby, mb_to_left_edge,

+                                         mb_to_right_edge,

+                                         mb_to_top_edge,

+                                         mb_to_bottom_edge);

+        if (mbmi->second_ref_frame > 0)

+          assign_and_clamp_mv(&mbmi->mv[1], &nearby_second, mb_to_left_edge,

+                                                            mb_to_right_edge,

+                                                            mb_to_top_edge,

+                                                            mb_to_bottom_edge);

         break;

       case NEARESTMV:

-        mv->as_int = nearest.as_int;

-        /* Clip "next_nearest" so that it does not extend to far out of image */

-        clamp_mv(mv, mb_to_left_edge, mb_to_right_edge,

-                 mb_to_top_edge, mb_to_bottom_edge);

-        if (mbmi->second_ref_frame > 0) {

-          mbmi->mv[1].as_int = nearest_second.as_int;

-          clamp_mv(&mbmi->mv[1], mb_to_left_edge, mb_to_right_edge,

-                   mb_to_top_edge, mb_to_bottom_edge);

-        }

+        // Clip "next_nearest" so that it does not extend to far out of image

+        assign_and_clamp_mv(mv, &nearest, mb_to_left_edge,

+                                          mb_to_right_edge,

+                                          mb_to_top_edge,

+                                          mb_to_bottom_edge);

+        if (mbmi->second_ref_frame > 0)

+          assign_and_clamp_mv(&mbmi->mv[1], &nearest_second, mb_to_left_edge,

+                                                             mb_to_right_edge,

+                                                             mb_to_top_edge,

+                                                             mb_to_bottom_edge);

         break;

       case ZEROMV:

@@ -1057,21 +1030,13 @@

         break;

       case NEWMV:

+        process_mv(bc, &mv->as_mv, &best_mv.as_mv, nmvc, &cm->fc.NMVcount,

+                   xd->allow_high_precision_mv);

-        read_nmv(bc, &mv->as_mv, &best_mv.as_mv, nmvc);

-        read_nmv_fp(bc, &mv->as_mv, &best_mv.as_mv, nmvc,

-                    xd->allow_high_precision_mv);

-        vp9_increment_nmv(&mv->as_mv, &best_mv.as_mv, &cm->fc.NMVcount,

-                          xd->allow_high_precision_mv);

-        mv->as_mv.row += best_mv.as_mv.row;

-        mv->as_mv.col += best_mv.as_mv.col;

-        /* Don't need to check this on NEARMV and NEARESTMV modes

-         * since those modes clamp the MV. The NEWMV mode does not,

-         * so signal to the prediction stage whether special

-         * handling may be required.

-         */

+        // Don't need to check this on NEARMV and NEARESTMV modes

+        // since those modes clamp the MV. The NEWMV mode does not,

+        // so signal to the prediction stage whether special

+        // handling may be required.

         mbmi->need_to_clamp_mvs = check_mv_bounds(mv,

                                                   mb_to_left_edge,

                                                   mb_to_right_edge,

@@ -1079,17 +1044,13 @@

                                                   mb_to_bottom_edge);

         if (mbmi->second_ref_frame > 0) {

-          read_nmv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc);

-          read_nmv_fp(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,

-                      xd->allow_high_precision_mv);

-          vp9_increment_nmv(&mbmi->mv[1].as_mv, &best_mv_second.as_mv,

-                            &cm->fc.NMVcount, xd->allow_high_precision_mv);

-          mbmi->mv[1].as_mv.row += best_mv_second.as_mv.row;

-          mbmi->mv[1].as_mv.col += best_mv_second.as_mv.col;

-          mbmi->need_to_clamp_secondmv |=

-            check_mv_bounds(&mbmi->mv[1],

-                            mb_to_left_edge, mb_to_right_edge,

-                            mb_to_top_edge, mb_to_bottom_edge);

+          process_mv(bc, &mbmi->mv[1].as_mv, &best_mv_second.as_mv, nmvc,

+                     &cm->fc.NMVcount, xd->allow_high_precision_mv);

+          mbmi->need_to_clamp_secondmv |= check_mv_bounds(&mbmi->mv[1],

+                                                          mb_to_left_edge,

+                                                          mb_to_right_edge,

+                                                          mb_to_top_edge,

+                                                          mb_to_bottom_edge);

         break;

       default:

@@ -1102,16 +1063,11 @@

     /* required for left and above block mv */

     mbmi->mv[0].as_int = 0;

-    if (vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_MODE)) {

-      mbmi->mode = (MB_PREDICTION_MODE)

-                   vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_MODE);

-    } else if (mbmi->sb_type) {

-      mbmi->mode = (MB_PREDICTION_MODE)

-                   read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);

+    if (mbmi->sb_type) {

+      mbmi->mode = read_sb_ymode(bc, pbi->common.fc.sb_ymode_prob);

       pbi->common.fc.sb_ymode_counts[mbmi->mode]++;

     } else {

-      mbmi->mode = (MB_PREDICTION_MODE)

-                   read_ymode(bc, pbi->common.fc.ymode_prob);

+      mbmi->mode = read_ymode(bc, pbi->common.fc.ymode_prob);

       pbi->common.fc.ymode_counts[mbmi->mode]++;

@@ -1119,9 +1075,8 @@

     if (mbmi->mode == B_PRED) {

       int j = 0;

       do {

-        int m;

-        m = mi->bmi[j].as_mode.first = (B_PREDICTION_MODE)

-            read_bmode(bc, pbi->common.fc.bmode_prob);

+        int m = read_bmode(bc, pbi->common.fc.bmode_prob);

+        mi->bmi[j].as_mode.first = m;

 #if CONFIG_NEWBINTRAMODES

         if (m == B_CONTEXT_PRED) m -= CONTEXT_PRED_REPLACEMENTS;

 #endif

@@ -1131,10 +1086,10 @@

     if (mbmi->mode == I8X8_PRED) {

       int i;

-      int mode8x8;

       for (i = 0; i < 4; i++) {

-        int ib = vp9_i8x8_block[i];

-        mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);

+        const int ib = vp9_i8x8_block[i];

+        const int mode8x8 = read_i8x8_mode(bc, pbi->common.fc.i8x8_mode_prob);

         mi->bmi[ib + 0].as_mode.first = mode8x8;

         mi->bmi[ib + 1].as_mode.first = mode8x8;

         mi->bmi[ib + 4].as_mode.first = mode8x8;

@@ -1142,11 +1097,14 @@

         pbi->common.fc.i8x8_mode_counts[mode8x8]++;

     } else {

-      mbmi->uv_mode = (MB_PREDICTION_MODE)read_uv_mode(

-        bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);

+      mbmi->uv_mode = read_uv_mode(bc, pbi->common.fc.uv_mode_prob[mbmi->mode]);

       pbi->common.fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;

+  /*

+  if (pbi->common.current_video_frame == 1)

+    printf("mode: %d skip: %d\n", mbmi->mode, mbmi->mb_skip_coeff);

+    */

   if (cm->txfm_mode == TX_MODE_SELECT && mbmi->mb_skip_coeff == 0 &&

       ((mbmi->ref_frame == INTRA_FRAME && mbmi->mode <= I8X8_PRED) ||

@@ -1182,22 +1140,305 @@

   vpx_memset(cm->mbskip_pred_probs, 0, sizeof(cm->mbskip_pred_probs));

   if (pbi->common.mb_no_coeff_skip) {

     int k;

-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)

-      cm->mbskip_pred_probs[k] = (vp9_prob)vp9_read_literal(bc, 8);

+    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {

+      cm->mbskip_pred_probs[k] = vp9_read_prob(bc);

+    }

   mb_mode_mv_init(pbi, bc);

+#if CONFIG_CODE_NONZEROCOUNT

+static uint16_t read_nzc(VP9_COMMON *const cm,

+                         int nzc_context,

+                         TX_SIZE tx_size,

+                         int ref,

+                         int type,

+                         BOOL_DECODER* const bc) {

+  int c, e;

+  uint16_t nzc;

+  if (tx_size == TX_32X32) {

+    c = treed_read(bc, vp9_nzc32x32_tree,

+                   cm->fc.nzc_probs_32x32[nzc_context][ref][type]);

+    cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;

+  } else if (tx_size == TX_16X16) {

+    c = treed_read(bc, vp9_nzc16x16_tree,

+                   cm->fc.nzc_probs_16x16[nzc_context][ref][type]);

+    cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;

+  } else if (tx_size == TX_8X8) {

+    c = treed_read(bc, vp9_nzc8x8_tree,

+                   cm->fc.nzc_probs_8x8[nzc_context][ref][type]);

+    cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;

+  } else if (tx_size == TX_4X4) {

+    c = treed_read(bc, vp9_nzc4x4_tree,

+                   cm->fc.nzc_probs_4x4[nzc_context][ref][type]);

+    cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;

+  } else {

+    assert(0);

+  }

+  nzc = vp9_basenzcvalue[c];

+  if ((e = vp9_extranzcbits[c])) {

+    int x = 0;

+    while (e--) {

+      int b = vp9_read(

+          bc, cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);

+      x |= (b << e);

+      cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;

+    }

+    nzc += x;

+  }

+  if (tx_size == TX_32X32)

+    assert(nzc <= 1024);

+  else if (tx_size == TX_16X16)

+    assert(nzc <= 256);

+  else if (tx_size == TX_8X8)

+    assert(nzc <= 64);

+  else if (tx_size == TX_4X4)

+    assert(nzc <= 16);

+  return nzc;

+}

+static void read_nzcs_sb64(VP9_COMMON *const cm,

+                           MACROBLOCKD* xd,

+                           int mb_row,

+                           int mb_col,

+                           BOOL_DECODER* const bc) {

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_32X32:

+      for (j = 0; j < 256; j += 64) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);

+      }

+      for (j = 256; j < 384; j += 64) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 1, bc);

+      }

+      break;

+    case TX_16X16:

+      for (j = 0; j < 256; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);

+      }

+      for (j = 256; j < 384; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 256; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);

+      }

+      for (j = 256; j < 384; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 256; ++j) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);

+      }

+      for (j = 256; j < 384; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);

+      }

+      break;

+    default:

+      break;

+  }

+}

+static void read_nzcs_sb32(VP9_COMMON *const cm,

+                           MACROBLOCKD* xd,

+                           int mb_row,

+                           int mb_col,

+                           BOOL_DECODER* const bc) {

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_32X32:

+      for (j = 0; j < 64; j += 64) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_32X32, ref, 0, bc);

+      }

+      for (j = 64; j < 96; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);

+      }

+      break;

+    case TX_16X16:

+      for (j = 0; j < 64; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);

+      }

+      for (j = 64; j < 96; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 1, bc);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 64; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);

+      }

+      for (j = 64; j < 96; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 64; ++j) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);

+      }

+      for (j = 64; j < 96; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);

+      }

+      break;

+    default:

+      break;

+  }

+}

+static void read_nzcs_mb16(VP9_COMMON *const cm,

+                           MACROBLOCKD* xd,

+                           int mb_row,

+                           int mb_col,

+                           BOOL_DECODER* const bc) {

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  vpx_memset(m->mbmi.nzcs, 0, 384 * sizeof(m->mbmi.nzcs[0]));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_16X16:

+      for (j = 0; j < 16; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_16X16, ref, 0, bc);

+      }

+      for (j = 16; j < 24; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 16; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 0, bc);

+      }

+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {

+        for (j = 16; j < 24; ++j) {

+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);

+        }

+      } else {

+        for (j = 16; j < 24; j += 4) {

+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+          m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_8X8, ref, 1, bc);

+        }

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 16; ++j) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 0, bc);

+      }

+      for (j = 16; j < 24; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+        m->mbmi.nzcs[j] = read_nzc(cm, nzc_context, TX_4X4, ref, 1, bc);

+      }

+      break;

+    default:

+      break;

+  }

+}

+#endif  // CONFIG_CODE_NONZEROCOUNT

 void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,

                            MACROBLOCKD* const xd,

                            int mb_row,

                            int mb_col,

                            BOOL_DECODER* const bc) {

+  VP9_COMMON *const cm = &pbi->common;

   MODE_INFO *mi = xd->mode_info_context;

   MODE_INFO *prev_mi = xd->prev_mode_info_context;

+  MB_MODE_INFO *const mbmi = &mi->mbmi;

-  if (pbi->common.frame_type == KEY_FRAME)

+  if (pbi->common.frame_type == KEY_FRAME) {

     kfread_modes(pbi, mi, mb_row, mb_col, bc);

-  else

+  } else {

     read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mb_row, mb_col, bc);

+    set_scale_factors(xd,

+                      mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,

+                      pbi->common.active_ref_scale);

+  }

+#if CONFIG_CODE_NONZEROCOUNT

+  if (mbmi->sb_type == BLOCK_SIZE_SB64X64)

+    read_nzcs_sb64(cm, xd, mb_row, mb_col, bc);

+  else if (mbmi->sb_type == BLOCK_SIZE_SB32X32)

+    read_nzcs_sb32(cm, xd, mb_row, mb_col, bc);

+  else

+    read_nzcs_mb16(cm, xd, mb_row, mb_col, bc);

+#endif  // CONFIG_CODE_NONZEROCOUNT

+  if (mbmi->sb_type) {

+    const int n_mbs = 1 << mbmi->sb_type;

+    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);

+    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);

+    const int mis = cm->mode_info_stride;

+    int x, y;

+    for (y = 0; y < y_mbs; y++) {

+      for (x = !y; x < x_mbs; x++) {

+        mi[y * mis + x] = *mi;

+      }

+    }

+  } else {

+    update_blockd_bmi(xd);

+  }

--- a/vp9/decoder/vp9_decodframe.c

+++ b/vp9/decoder/vp9_decodframe.c

@@ -13,7 +13,6 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_header.h"

 #include "vp9/common/vp9_reconintra.h"

-#include "vp9/common/vp9_reconintra4x4.h"

 #include "vp9/common/vp9_reconinter.h"

 #include "vp9/common/vp9_entropy.h"

 #include "vp9/decoder/vp9_decodframe.h"

@@ -32,7 +31,7 @@

 #include "vp9/decoder/vp9_dboolhuff.h"

 #include "vp9/common/vp9_seg_common.h"

-#include "vp9/common/vp9_entropy.h"

+#include "vp9/common/vp9_tile_common.h"

 #include "vp9_rtcd.h"

 #include <assert.h>

@@ -40,11 +39,25 @@

 #define COEFCOUNT_TESTING

-//#define DEC_DEBUG

+// #define DEC_DEBUG

 #ifdef DEC_DEBUG

 int dec_debug = 0;

 #endif

+static int read_le16(const uint8_t *p) {

+  return (p[1] << 8) | p[0];

+}

+static int read_le32(const uint8_t *p) {

+  return (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];

+}

+// len == 0 is not allowed

+static int read_is_valid(const unsigned char *start, size_t len,

+                         const unsigned char *end) {

+  return start + len > start && start + len <= end;

+}

 static int merge_index(int v, int n, int modulus) {

   int max1 = (n - 1 - modulus / 2) / modulus + 1;

   if (v < max1) v = v * modulus + modulus / 2;

@@ -62,14 +75,13 @@

 static int inv_remap_prob(int v, int m) {

   const int n = 256;

   const int modulus = MODULUS_PARAM;

-  int i;

   v = merge_index(v, n - 1, modulus);

   if ((m << 1) <= n) {

-    i = vp9_inv_recenter_nonneg(v + 1, m);

+    return vp9_inv_recenter_nonneg(v + 1, m);

   } else {

-    i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);

+    return n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m);

-  return i;

 static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) {

@@ -79,103 +91,78 @@

 void vp9_init_de_quantizer(VP9D_COMP *pbi) {

   int i;

-  int Q;

+  int q;

   VP9_COMMON *const pc = &pbi->common;

-  for (Q = 0; Q < QINDEX_RANGE; Q++) {

-    pc->Y1dequant[Q][0] = (int16_t)vp9_dc_quant(Q, pc->y1dc_delta_q);

-    pc->Y2dequant[Q][0] = (int16_t)vp9_dc2quant(Q, pc->y2dc_delta_q);

-    pc->UVdequant[Q][0] = (int16_t)vp9_dc_uv_quant(Q, pc->uvdc_delta_q);

+  for (q = 0; q < QINDEX_RANGE; q++) {

+    pc->Y1dequant[q][0] = (int16_t)vp9_dc_quant(q, pc->y1dc_delta_q);

+    pc->UVdequant[q][0] = (int16_t)vp9_dc_uv_quant(q, pc->uvdc_delta_q);

     /* all the ac values =; */

     for (i = 1; i < 16; i++) {

       int rc = vp9_default_zig_zag1d_4x4[i];

-      pc->Y1dequant[Q][rc] = (int16_t)vp9_ac_yquant(Q);

-      pc->Y2dequant[Q][rc] = (int16_t)vp9_ac2quant(Q, pc->y2ac_delta_q);

-      pc->UVdequant[Q][rc] = (int16_t)vp9_ac_uv_quant(Q, pc->uvac_delta_q);

+      pc->Y1dequant[q][rc] = (int16_t)vp9_ac_yquant(q);

+      pc->UVdequant[q][rc] = (int16_t)vp9_ac_uv_quant(q, pc->uvac_delta_q);

-static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *xd) {

+static int get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex) {

+  // Set the Q baseline allowing for any segment level adjustment

+  if (vp9_segfeature_active(mb, segment_id, SEG_LVL_ALT_Q)) {

+    if (mb->mb_segment_abs_delta == SEGMENT_ABSDATA)

+      return vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q);  // Abs Value

+    else

+      return clamp(base_qindex + vp9_get_segdata(mb, segment_id, SEG_LVL_ALT_Q),

+                   0, MAXQ);  // Delta Value

+  } else {

+    return base_qindex;

+  }

+}

+static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) {

   int i;

-  int QIndex;

   VP9_COMMON *const pc = &pbi->common;

-  int segment_id = xd->mode_info_context->mbmi.segment_id;

+  const int segment_id = mb->mode_info_context->mbmi.segment_id;

+  const int qindex = get_qindex(mb, segment_id, pc->base_qindex);

+  mb->q_index = qindex;

-  // Set the Q baseline allowing for any segment level adjustment

-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) {

-    /* Abs Value */

-    if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA)

-      QIndex = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);

+  for (i = 0; i < 16; i++)

+    mb->block[i].dequant = pc->Y1dequant[qindex];

-    /* Delta Value */

-    else {

-      QIndex = pc->base_qindex +

-               vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q);

-      QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    /* Clamp to valid range */

-    }

-  } else

-    QIndex = pc->base_qindex;

-  xd->q_index = QIndex;

+  for (i = 16; i < 24; i++)

+    mb->block[i].dequant = pc->UVdequant[qindex];

-  /* Set up the block level dequant pointers */

-  for (i = 0; i < 16; i++) {

-    xd->block[i].dequant = pc->Y1dequant[QIndex];

-  }

-#if CONFIG_LOSSLESS

-  if (!QIndex) {

-    pbi->mb.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;

-    pbi->mb.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;

-    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;

-    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;

-    pbi->idct_add            = vp9_dequant_idct_add_lossless_c;

-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;

-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;

-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;

-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;

+  if (mb->lossless) {

+    assert(qindex == 0);

+    mb->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;

+    mb->inv_txm4x4        = vp9_short_iwalsh4x4;

+    mb->itxm_add          = vp9_dequant_idct_add_lossless_c;

+    mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block_lossless_c;

+    mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c;

   } else {

-    pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;

-    pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;

-    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;

-    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;

-    pbi->idct_add            = vp9_dequant_idct_add;

-    pbi->dc_idct_add         = vp9_dequant_dc_idct_add;

-    pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;

-    pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;

-    pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;

+    mb->inv_txm4x4_1      = vp9_short_idct4x4_1;

+    mb->inv_txm4x4        = vp9_short_idct4x4;

+    mb->itxm_add          = vp9_dequant_idct_add;

+    mb->itxm_add_y_block  = vp9_dequant_idct_add_y_block;

+    mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block;

-#else

-  pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;

-  pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;

-  pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;

-  pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;

-  pbi->idct_add            = vp9_dequant_idct_add;

-  pbi->dc_idct_add         = vp9_dequant_dc_idct_add;

-  pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;

-  pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block;

-  pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;

-#endif

-  for (i = 16; i < 24; i++) {

-    xd->block[i].dequant = pc->UVdequant[QIndex];

-  }

-  xd->block[24].dequant = pc->Y2dequant[QIndex];

 /* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it

  *  to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.

*/

-static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd) {

+static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd,

+                          int mb_row, int mb_col) {

+  BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {

-    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

+    if (sb_type == BLOCK_SIZE_SB64X64) {

       vp9_build_intra_predictors_sb64uv_s(xd);

       vp9_build_intra_predictors_sb64y_s(xd);

-    } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {

+    } else if (sb_type == BLOCK_SIZE_SB32X32) {

       vp9_build_intra_predictors_sbuv_s(xd);

       vp9_build_intra_predictors_sby_s(xd);

     } else {

@@ -183,46 +170,30 @@

       vp9_build_intra_predictors_mby_s(xd);

   } else {

-    if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

+    if (sb_type == BLOCK_SIZE_SB64X64) {

       vp9_build_inter64x64_predictors_sb(xd,

                                          xd->dst.y_buffer,

                                          xd->dst.u_buffer,

                                          xd->dst.v_buffer,

                                          xd->dst.y_stride,

-                                         xd->dst.uv_stride);

-    } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {

+                                         xd->dst.uv_stride,

+                                         mb_row, mb_col);

+    } else if (sb_type == BLOCK_SIZE_SB32X32) {

       vp9_build_inter32x32_predictors_sb(xd,

                                          xd->dst.y_buffer,

                                          xd->dst.u_buffer,

                                          xd->dst.v_buffer,

                                          xd->dst.y_stride,

-                                         xd->dst.uv_stride);

+                                         xd->dst.uv_stride,

+                                         mb_row, mb_col);

     } else {

-      vp9_build_1st_inter16x16_predictors_mb(xd,

-                                             xd->dst.y_buffer,

-                                             xd->dst.u_buffer,

-                                             xd->dst.v_buffer,

-                                             xd->dst.y_stride,

-                                             xd->dst.uv_stride);

-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

-        vp9_build_2nd_inter16x16_predictors_mb(xd,

-                                               xd->dst.y_buffer,

-                                               xd->dst.u_buffer,

-                                               xd->dst.v_buffer,

-                                               xd->dst.y_stride,

-                                               xd->dst.uv_stride);

-      }

-#if CONFIG_COMP_INTERINTRA_PRED

-      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

-        vp9_build_interintra_16x16_predictors_mb(xd,

-                                                 xd->dst.y_buffer,

-                                                 xd->dst.u_buffer,

-                                                 xd->dst.v_buffer,

-                                                 xd->dst.y_stride,

-                                                 xd->dst.uv_stride);

-      }

-#endif

+      vp9_build_inter16x16_predictors_mb(xd,

+                                         xd->dst.y_buffer,

+                                         xd->dst.u_buffer,

+                                         xd->dst.v_buffer,

+                                         xd->dst.y_stride,

+                                         xd->dst.uv_stride,

+                                         mb_row, mb_col);

@@ -229,10 +200,8 @@

 static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd,

                          BOOL_DECODER* const bc) {

-  BLOCKD *bd = &xd->block[0];

-  TX_TYPE tx_type = get_tx_type_16x16(xd, bd);

-  assert(get_2nd_order_usage(xd) == 0);

-#ifdef DEC_DEBUG

+  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);

+#if 0  // def DEC_DEBUG

   if (dec_debug) {

     int i;

     printf("\n");

@@ -262,7 +231,7 @@

   vp9_dequant_idct_add_uv_block_8x8(

       xd->qcoeff + 16 * 16, xd->block[16].dequant,

       xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,

-      xd->dst.uv_stride, xd->eobs + 16, xd);

+      xd->dst.uv_stride, xd);

 static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd,

@@ -269,13 +238,13 @@

                        BOOL_DECODER* const bc) {

   // First do Y

   // if the first one is DCT_DCT assume all the rest are as well

-  TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);

-#ifdef DEC_DEBUG

+  TX_TYPE tx_type = get_tx_type_8x8(xd, 0);

+#if 0  // def DEC_DEBUG

   if (dec_debug) {

     int i;

     printf("\n");

     printf("qcoeff 8x8\n");

-    for (i = 0; i < 400; i++) {

+    for (i = 0; i < 384; i++) {

       printf("%3d ", xd->qcoeff[i]);

       if (i % 16 == 15) printf("\n");

@@ -283,7 +252,6 @@

 #endif

   if (tx_type != DCT_DCT || xd->mode_info_context->mbmi.mode == I8X8_PRED) {

     int i;

-    assert(get_2nd_order_usage(xd) == 0);

     for (i = 0; i < 4; i++) {

       int ib = vp9_i8x8_block[i];

       int idx = (ib & 0x02) ? (ib + 2) : ib;

@@ -295,46 +263,24 @@

       BLOCKD *b = &xd->block[ib];

       if (xd->mode_info_context->mbmi.mode == I8X8_PRED) {

         int i8x8mode = b->bmi.as_mode.first;

-        vp9_intra8x8_predict(b, i8x8mode, b->predictor);

+        vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);

-      tx_type = get_tx_type_8x8(xd, &xd->block[ib]);

+      tx_type = get_tx_type_8x8(xd, ib);

       if (tx_type != DCT_DCT) {

         vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride,

                                       xd->eobs[idx]);

       } else {

         vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride,

-                                   0, xd->eobs[idx]);

+                                   xd->eobs[idx]);

-  } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {

-    assert(get_2nd_order_usage(xd) == 0);

+  } else {

     vp9_dequant_idct_add_y_block_8x8(xd->qcoeff,

                                      xd->block[0].dequant,

                                      xd->predictor,

                                      xd->dst.y_buffer,

                                      xd->dst.y_stride,

-                                     xd->eobs, xd);

-  } else {

-    BLOCKD *b = &xd->block[24];

-    assert(get_2nd_order_usage(xd) == 1);

-    vp9_dequantize_b_2x2(b);

-    vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8);

-    ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct

-    ((int *)b->qcoeff)[1] = 0;

-    ((int *)b->qcoeff)[2] = 0;

-    ((int *)b->qcoeff)[3] = 0;

-    ((int *)b->qcoeff)[4] = 0;

-    ((int *)b->qcoeff)[5] = 0;

-    ((int *)b->qcoeff)[6] = 0;

-    ((int *)b->qcoeff)[7] = 0;

-    vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,

-                                        xd->block[0].dequant,

-                                        xd->predictor,

-                                        xd->dst.y_buffer,

-                                        xd->dst.y_stride,

-                                        xd->eobs,

-                                        xd->block[24].diff,

-                                        xd);

+                                     xd);

   // Now do UV

@@ -344,26 +290,28 @@

       int ib = vp9_i8x8_block[i];

       BLOCKD *b = &xd->block[ib];

       int i8x8mode = b->bmi.as_mode.first;

       b = &xd->block[16 + i];

-      vp9_intra_uv4x4_predict(&xd->block[16 + i], i8x8mode, b->predictor);

-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,

-                    *(b->base_dst) + b->dst, 8, b->dst_stride);

+      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);

+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,

+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);

       b = &xd->block[20 + i];

-      vp9_intra_uv4x4_predict(&xd->block[20 + i], i8x8mode, b->predictor);

-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,

-                    *(b->base_dst) + b->dst, 8, b->dst_stride);

+      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);

+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,

+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);

   } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {

-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,

+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16, xd->block[16].dequant,

          xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,

-         xd->dst.uv_stride, xd->eobs + 16);

+         xd->dst.uv_stride, xd);

   } else {

     vp9_dequant_idct_add_uv_block_8x8

         (xd->qcoeff + 16 * 16, xd->block[16].dequant,

          xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,

-         xd->dst.uv_stride, xd->eobs + 16, xd);

+         xd->dst.uv_stride, xd);

-#ifdef DEC_DEBUG

+#if 0  // def DEC_DEBUG

   if (dec_debug) {

     int i;

     printf("\n");

@@ -381,94 +329,98 @@

   TX_TYPE tx_type;

   int i, eobtotal = 0;

   MB_PREDICTION_MODE mode = xd->mode_info_context->mbmi.mode;

+#if 0  // def DEC_DEBUG

+  if (dec_debug) {

+    int i;

+    printf("\n");

+    printf("predictor\n");

+    for (i = 0; i < 384; i++) {

+      printf("%3d ", xd->predictor[i]);

+      if (i % 16 == 15) printf("\n");

+    }

+  }

+#endif

   if (mode == I8X8_PRED) {

-    assert(get_2nd_order_usage(xd) == 0);

     for (i = 0; i < 4; i++) {

       int ib = vp9_i8x8_block[i];

       const int iblock[4] = {0, 1, 4, 5};

       int j;

-      int i8x8mode;

-      BLOCKD *b;

-      b = &xd->block[ib];

-      i8x8mode = b->bmi.as_mode.first;

-      vp9_intra8x8_predict(b, i8x8mode, b->predictor);

+      BLOCKD *b = &xd->block[ib];

+      int i8x8mode = b->bmi.as_mode.first;

+      vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor);

       for (j = 0; j < 4; j++) {

         b = &xd->block[ib + iblock[j]];

-        tx_type = get_tx_type_4x4(xd, b);

+        tx_type = get_tx_type_4x4(xd, ib + iblock[j]);

         if (tx_type != DCT_DCT) {

           vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,

                                     b->dequant, b->predictor,

                                     *(b->base_dst) + b->dst, 16,

-                                    b->dst_stride, b->eob);

+                                    b->dst_stride, xd->eobs[ib + iblock[j]]);

         } else {

-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,

-                               *(b->base_dst) + b->dst, 16, b->dst_stride);

+          xd->itxm_add(b->qcoeff, b->dequant, b->predictor,

+                       *(b->base_dst) + b->dst, 16, b->dst_stride,

+                       xd->eobs[ib + iblock[j]]);

       b = &xd->block[16 + i];

-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);

-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,

-                    *(b->base_dst) + b->dst, 8, b->dst_stride);

+      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);

+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,

+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]);

       b = &xd->block[20 + i];

-      vp9_intra_uv4x4_predict(b, i8x8mode, b->predictor);

-      pbi->idct_add(b->qcoeff, b->dequant, b->predictor,

-                    *(b->base_dst) + b->dst, 8, b->dst_stride);

+      vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor);

+      xd->itxm_add(b->qcoeff, b->dequant, b->predictor,

+                   *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]);

   } else if (mode == B_PRED) {

-    assert(get_2nd_order_usage(xd) == 0);

     for (i = 0; i < 16; i++) {

-      int b_mode;

       BLOCKD *b = &xd->block[i];

-      b_mode = xd->mode_info_context->bmi[i].as_mode.first;

+      int b_mode = xd->mode_info_context->bmi[i].as_mode.first;

 #if CONFIG_NEWBINTRAMODES

       xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =

-          vp9_find_bpred_context(b);

+          vp9_find_bpred_context(xd, b);

 #endif

       if (!xd->mode_info_context->mbmi.mb_skip_coeff)

         eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);

-      vp9_intra4x4_predict(b, b_mode, b->predictor);

-      tx_type = get_tx_type_4x4(xd, b);

+      vp9_intra4x4_predict(xd, b, b_mode, b->predictor);

+      tx_type = get_tx_type_4x4(xd, i);

       if (tx_type != DCT_DCT) {

         vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,

                                   b->dequant, b->predictor,

                                   *(b->base_dst) + b->dst, 16, b->dst_stride,

-                                  b->eob);

+                                  xd->eobs[i]);

       } else {

-        vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,

-                             *(b->base_dst) + b->dst, 16, b->dst_stride);

+        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,

+                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);

     if (!xd->mode_info_context->mbmi.mb_skip_coeff) {

       vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc);

-    xd->above_context->y2 = 0;

-    xd->left_context->y2 = 0;

     vp9_build_intra_predictors_mbuv(xd);

-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,

+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,

                            xd->block[16].dequant,

                            xd->predictor + 16 * 16,

                            xd->dst.u_buffer,

                            xd->dst.v_buffer,

                            xd->dst.uv_stride,

-                           xd->eobs + 16);

-  } else if (mode == SPLITMV) {

-    assert(get_2nd_order_usage(xd) == 0);

-    pbi->idct_add_y_block(xd->qcoeff,

+                           xd);

+  } else if (mode == SPLITMV || get_tx_type_4x4(xd, 0) == DCT_DCT) {

+    xd->itxm_add_y_block(xd->qcoeff,

                           xd->block[0].dequant,

                           xd->predictor,

                           xd->dst.y_buffer,

                           xd->dst.y_stride,

-                          xd->eobs);

-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,

+                          xd);

+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,

                            xd->block[16].dequant,

                            xd->predictor + 16 * 16,

                            xd->dst.u_buffer,

                            xd->dst.v_buffer,

                            xd->dst.uv_stride,

-                           xd->eobs + 16);

+                           xd);

   } else {

-#ifdef DEC_DEBUG

+#if 0  // def DEC_DEBUG

     if (dec_debug) {

       int i;

       printf("\n");

@@ -485,211 +437,35 @@

 #endif

-    tx_type = get_tx_type_4x4(xd, &xd->block[0]);

-    if (tx_type != DCT_DCT) {

-      assert(get_2nd_order_usage(xd) == 0);

-      for (i = 0; i < 16; i++) {

-        BLOCKD *b = &xd->block[i];

-        tx_type = get_tx_type_4x4(xd, b);

-        if (tx_type != DCT_DCT) {

-          vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,

-                                    b->dequant, b->predictor,

-                                    *(b->base_dst) + b->dst, 16,

-                                    b->dst_stride, b->eob);

-        } else {

-          vp9_dequant_idct_add(b->qcoeff, b->dequant, b->predictor,

-                               *(b->base_dst) + b->dst, 16, b->dst_stride);

-        }

-      }

-    } else {

-      BLOCKD *b = &xd->block[24];

-      assert(get_2nd_order_usage(xd) == 1);

-      vp9_dequantize_b(b);

-      if (xd->eobs[24] > 1) {

-        vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff);

-        ((int *)b->qcoeff)[0] = 0;

-        ((int *)b->qcoeff)[1] = 0;

-        ((int *)b->qcoeff)[2] = 0;

-        ((int *)b->qcoeff)[3] = 0;

-        ((int *)b->qcoeff)[4] = 0;

-        ((int *)b->qcoeff)[5] = 0;

-        ((int *)b->qcoeff)[6] = 0;

-        ((int *)b->qcoeff)[7] = 0;

+    for (i = 0; i < 16; i++) {

+      BLOCKD *b = &xd->block[i];

+      tx_type = get_tx_type_4x4(xd, i);

+      if (tx_type != DCT_DCT) {

+        vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff,

+                                  b->dequant, b->predictor,

+                                  *(b->base_dst) + b->dst, 16,

+                                  b->dst_stride, xd->eobs[i]);

       } else {

-        xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);

-        ((int *)b->qcoeff)[0] = 0;

+        xd->itxm_add(b->qcoeff, b->dequant, b->predictor,

+                      *(b->base_dst) + b->dst, 16, b->dst_stride, xd->eobs[i]);

-      vp9_dequantize_b(b);

-      pbi->dc_idct_add_y_block(xd->qcoeff,

-                               xd->block[0].dequant,

-                               xd->predictor,

-                               xd->dst.y_buffer,

-                               xd->dst.y_stride,

-                               xd->eobs,

-                               xd->block[24].diff);

-    pbi->idct_add_uv_block(xd->qcoeff + 16 * 16,

+    xd->itxm_add_uv_block(xd->qcoeff + 16 * 16,

                            xd->block[16].dequant,

                            xd->predictor + 16 * 16,

                            xd->dst.u_buffer,

                            xd->dst.v_buffer,

                            xd->dst.uv_stride,

-                           xd->eobs + 16);

+                           xd);

-static void decode_16x16_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,

-                            BOOL_DECODER* const bc, int n,

-                            int maska, int shiftb) {

-  int x_idx = n & maska, y_idx = n >> shiftb;

-  TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);

-  if (tx_type != DCT_DCT) {

-    vp9_ht_dequant_idct_add_16x16_c(

-        tx_type, xd->qcoeff, xd->block[0].dequant,

-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,

-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,

-        xd->dst.y_stride, xd->dst.y_stride, xd->block[0].eob);

-  } else {

-    vp9_dequant_idct_add_16x16(

-        xd->qcoeff, xd->block[0].dequant,

-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,

-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,

-        xd->dst.y_stride, xd->dst.y_stride, xd->eobs[0]);

-  }

-  vp9_dequant_idct_add_uv_block_8x8_inplace_c(

-      xd->qcoeff + 16 * 16,

-      xd->block[16].dequant,

-      xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-      xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-      xd->dst.uv_stride, xd->eobs + 16, xd);

-};

-static void decode_8x8_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,

-                          BOOL_DECODER* const bc, int n,

-                          int maska, int shiftb) {

-  int x_idx = n & maska, y_idx = n >> shiftb;

-  BLOCKD *b = &xd->block[24];

-  TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]);

-  if (tx_type != DCT_DCT) {

-    int i;

-    for (i = 0; i < 4; i++) {

-      int ib = vp9_i8x8_block[i];

-      int idx = (ib & 0x02) ? (ib + 2) : ib;

-      int16_t *q  = xd->block[idx].qcoeff;

-      int16_t *dq = xd->block[0].dequant;

-      int stride = xd->dst.y_stride;

-      BLOCKD *b = &xd->block[ib];

-      tx_type = get_tx_type_8x8(xd, &xd->block[ib]);

-      if (tx_type != DCT_DCT) {

-        vp9_ht_dequant_idct_add_8x8_c(

-            tx_type, q, dq,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride

-            + x_idx * 16 + (i & 1) * 8,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride

-            + x_idx * 16 + (i & 1) * 8,

-            stride, stride, b->eob);

-      } else {

-        vp9_dequant_idct_add_8x8_c(

-            q, dq,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride

-            + x_idx * 16 + (i & 1) * 8,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 2) * 8) * xd->dst.y_stride

-            + x_idx * 16 + (i & 1) * 8,

-            stride, stride, 0, b->eob);

-      }

-      vp9_dequant_idct_add_uv_block_8x8_inplace_c(

-          xd->qcoeff + 16 * 16, xd->block[16].dequant,

-          xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-          xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-          xd->dst.uv_stride, xd->eobs + 16, xd);

-    }

-  } else {

-    vp9_dequantize_b_2x2(b);

-    vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8);

-    ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct

-    ((int *)b->qcoeff)[1] = 0;

-    ((int *)b->qcoeff)[2] = 0;

-    ((int *)b->qcoeff)[3] = 0;

-    ((int *)b->qcoeff)[4] = 0;

-    ((int *)b->qcoeff)[5] = 0;

-    ((int *)b->qcoeff)[6] = 0;

-    ((int *)b->qcoeff)[7] = 0;

-    vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(

-        xd->qcoeff, xd->block[0].dequant,

-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,

-        xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);

-    vp9_dequant_idct_add_uv_block_8x8_inplace_c(

-        xd->qcoeff + 16 * 16, xd->block[16].dequant,

-        xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-        xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-        xd->dst.uv_stride, xd->eobs + 16, xd);

-  }

-};

-static void decode_4x4_sb(VP9D_COMP *pbi, MACROBLOCKD *xd,

-                          BOOL_DECODER* const bc, int n,

-                          int maska, int shiftb) {

-  int x_idx = n & maska, y_idx = n >> shiftb;

-  BLOCKD *b = &xd->block[24];

-  TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[0]);

-  if (tx_type != DCT_DCT) {

-    int i;

-    for (i = 0; i < 16; i++) {

-      BLOCKD *b = &xd->block[i];

-      tx_type = get_tx_type_4x4(xd, b);

-      if (tx_type != DCT_DCT) {

-        vp9_ht_dequant_idct_add_c(

-            tx_type, b->qcoeff, b->dequant,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride

-            + x_idx * 16 + (i & 3) * 4,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride

-            + x_idx * 16 + (i & 3) * 4,

-            xd->dst.y_stride, xd->dst.y_stride, b->eob);

-      } else {

-        vp9_dequant_idct_add_c(

-            b->qcoeff, b->dequant,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride

-            + x_idx * 16 + (i & 3) * 4,

-            xd->dst.y_buffer + (y_idx * 16 + (i / 4) * 4) * xd->dst.y_stride

-            + x_idx * 16 + (i & 3) * 4,

-            xd->dst.y_stride, xd->dst.y_stride);

-      }

-    }

-  } else {

-    vp9_dequantize_b(b);

-    if (xd->eobs[24] > 1) {

-      vp9_short_inv_walsh4x4(&b->dqcoeff[0], b->diff);

-      ((int *)b->qcoeff)[0] = 0;

-      ((int *)b->qcoeff)[1] = 0;

-      ((int *)b->qcoeff)[2] = 0;

-      ((int *)b->qcoeff)[3] = 0;

-      ((int *)b->qcoeff)[4] = 0;

-      ((int *)b->qcoeff)[5] = 0;

-      ((int *)b->qcoeff)[6] = 0;

-      ((int *)b->qcoeff)[7] = 0;

-    } else {

-      xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);

-      ((int *)b->qcoeff)[0] = 0;

-    }

-    vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(

-        xd->qcoeff, xd->block[0].dequant,

-        xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16,

-        xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);

-  }

-  vp9_dequant_idct_add_uv_block_4x4_inplace_c(

-      xd->qcoeff + 16 * 16, xd->block[16].dequant,

-      xd->dst.u_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-      xd->dst.v_buffer + y_idx * 8 * xd->dst.uv_stride + x_idx * 8,

-      xd->dst.uv_stride, xd->eobs + 16, xd);

-};

 static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd,

                                 int mb_row, int mb_col,

                                 BOOL_DECODER* const bc) {

-  int i, n, eobtotal;

-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

+  int n, eobtotal;

   VP9_COMMON *const pc = &pbi->common;

-  MODE_INFO *orig_mi = xd->mode_info_context;

+  MODE_INFO *mi = xd->mode_info_context;

   const int mis = pc->mode_info_stride;

   assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64);

@@ -702,25 +478,12 @@

     mb_init_dequantizer(pbi, xd);

   if (xd->mode_info_context->mbmi.mb_skip_coeff) {

-    int n;

+    vp9_reset_sb64_tokens_context(xd);

-    vp9_reset_mb_tokens_context(xd);

-    for (n = 1; n <= 3; n++) {

-      if (mb_col < pc->mb_cols - n)

-        xd->above_context += n;

-      if (mb_row < pc->mb_rows - n)

-        xd->left_context += n;

-      vp9_reset_mb_tokens_context(xd);

-      if (mb_col < pc->mb_cols - n)

-        xd->above_context -= n;

-      if (mb_row < pc->mb_rows - n)

-        xd->left_context -= n;

-    }

     /* Special case:  Force the loopfilter to skip when eobtotal and

      * mb_skip_coeff are zero.

*/

-    skip_recon_mb(pbi, xd);

+    skip_recon_mb(pbi, xd, mb_row, mb_col);

     return;

@@ -731,91 +494,151 @@

   } else {

     vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,

                                        xd->dst.u_buffer, xd->dst.v_buffer,

-                                       xd->dst.y_stride, xd->dst.uv_stride);

+                                       xd->dst.y_stride, xd->dst.uv_stride,

+                                       mb_row, mb_col);

   /* dequantization and idct */

-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {

-    for (n = 0; n < 4; n++) {

-      const int x_idx = n & 1, y_idx = n >> 1;

+  eobtotal = vp9_decode_sb64_tokens(pbi, xd, bc);

+  if (eobtotal == 0) {  // skip loopfilter

+    for (n = 0; n < 16; n++) {

+      const int x_idx = n & 3, y_idx = n >> 2;

-      if (mb_col + x_idx * 2 >= pc->mb_cols ||

-          mb_row + y_idx * 2 >= pc->mb_rows)

-        continue;

+      if (mb_col + x_idx < pc->mb_cols && mb_row + y_idx < pc->mb_rows)

+        mi[y_idx * mis + x_idx].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

+    }

+  } else {

+    switch (xd->mode_info_context->mbmi.txfm_size) {

+      case TX_32X32:

+        for (n = 0; n < 4; n++) {

+          const int x_idx = n & 1, y_idx = n >> 1;

+          const int y_offset = x_idx * 32 + y_idx * xd->dst.y_stride * 32;

+          vp9_dequant_idct_add_32x32(xd->qcoeff + n * 1024,

+              xd->block[0].dequant,

+              xd->dst.y_buffer + y_offset,

+              xd->dst.y_buffer + y_offset,

+              xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 64]);

+        }

+        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096,

+            xd->block[16].dequant, xd->dst.u_buffer, xd->dst.u_buffer,

+            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256]);

+        vp9_dequant_idct_add_32x32(xd->qcoeff + 4096 + 1024,

+            xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer,

+            xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320]);

+        break;

+      case TX_16X16:

+        for (n = 0; n < 16; n++) {

+          const int x_idx = n & 3, y_idx = n >> 2;

+          const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16;

+          const TX_TYPE tx_type = get_tx_type_16x16(xd,

+                                                    (y_idx * 16 + x_idx) * 4);

-      xd->left_context = pc->left_context + (y_idx << 1);

-      xd->above_context = pc->above_context + mb_col + (x_idx << 1);

-      xd->mode_info_context = orig_mi + x_idx * 2 + y_idx * 2 * mis;

-      eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);

-      if (eobtotal == 0) {  // skip loopfilter

-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;

-        if (mb_col + 1 < pc->mb_cols)

-          xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;

-        if (mb_row + 1 < pc->mb_rows) {

-          xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;

-          if (mb_col + 1 < pc->mb_cols)

-            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;

+          if (tx_type == DCT_DCT) {

+            vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);

+          } else {

+            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);

+          }

-      } else {

-        vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,

-                                   xd->dst.y_buffer + x_idx * 32 +

-                                       xd->dst.y_stride * y_idx * 32,

-                                   xd->dst.y_buffer + x_idx * 32 +

-                                       xd->dst.y_stride * y_idx * 32,

-                                   xd->dst.y_stride, xd->dst.y_stride,

-                                   xd->eobs[0]);

-        vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,

-                                              xd->block[16].dequant,

-                                              xd->dst.u_buffer + x_idx * 16 +

-                                                xd->dst.uv_stride * y_idx * 16,

-                                              xd->dst.v_buffer + x_idx * 16 +

-                                                xd->dst.uv_stride * y_idx * 16,

-                                              xd->dst.uv_stride, xd->eobs + 16);

-      }

+        for (n = 0; n < 4; n++) {

+          const int x_idx = n & 1, y_idx = n >> 1;

+          const int uv_offset = y_idx * 16 * xd->dst.uv_stride + x_idx * 16;

+          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + n * 256,

+              xd->block[16].dequant,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 16]);

+          vp9_dequant_idct_add_16x16(xd->qcoeff + 4096 + 1024 + n * 256,

+              xd->block[20].dequant,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 16]);

+        }

+        break;

+      case TX_8X8:

+        for (n = 0; n < 64; n++) {

+          const int x_idx = n & 7, y_idx = n >> 3;

+          const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8;

+          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);

+          if (tx_type == DCT_DCT) {

+            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);

+          } else {

+            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);

+          }

+        }

+        for (n = 0; n < 16; n++) {

+          const int x_idx = n & 3, y_idx = n >> 2;

+          const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8;

+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096,

+              xd->block[16].dequant,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n * 4]);

+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 4096 + 1024,

+              xd->block[20].dequant,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 4]);

+        }

+        break;

+      case TX_4X4:

+        for (n = 0; n < 256; n++) {

+          const int x_idx = n & 15, y_idx = n >> 4;

+          const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4;

+          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);

+          if (tx_type == DCT_DCT) {

+            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);

+          } else {

+            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);

+          }

+        }

+        for (n = 0; n < 64; n++) {

+          const int x_idx = n & 7, y_idx = n >> 3;

+          const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4;

+          xd->itxm_add(xd->qcoeff + 4096 + n * 16,

+              xd->block[16].dequant,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[256 + n]);

+          xd->itxm_add(xd->qcoeff + 4096 + 1024 + n * 16,

+              xd->block[20].dequant,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n]);

+        }

+        break;

+      default: assert(0);

-  } else {

-    for (n = 0; n < 16; n++) {

-      int x_idx = n & 3, y_idx = n >> 2;

-      if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)

-        continue;

-      xd->above_context = pc->above_context + mb_col + x_idx;

-      xd->left_context = pc->left_context + y_idx;

-      xd->mode_info_context = orig_mi + x_idx + y_idx * mis;

-      for (i = 0; i < 25; i++) {

-        xd->block[i].eob = 0;

-        xd->eobs[i] = 0;

-      }

-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);

-      if (eobtotal == 0) {  // skip loopfilter

-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;

-        continue;

-      }

-      if (tx_size == TX_16X16) {

-        decode_16x16_sb(pbi, xd, bc, n, 3, 2);

-      } else if (tx_size == TX_8X8) {

-        decode_8x8_sb(pbi, xd, bc, n, 3, 2);

-      } else {

-        decode_4x4_sb(pbi, xd, bc, n, 3, 2);

-      }

-    }

-  xd->above_context = pc->above_context + mb_col;

-  xd->left_context = pc->left_context;

-  xd->mode_info_context = orig_mi;

 static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd,

                                 int mb_row, int mb_col,

                                 BOOL_DECODER* const bc) {

-  int i, n, eobtotal;

-  TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

+  int n, eobtotal;

   VP9_COMMON *const pc = &pbi->common;

-  MODE_INFO *orig_mi = xd->mode_info_context;

   const int mis = pc->mode_info_stride;

   assert(xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32);

@@ -828,21 +651,12 @@

     mb_init_dequantizer(pbi, xd);

   if (xd->mode_info_context->mbmi.mb_skip_coeff) {

-    vp9_reset_mb_tokens_context(xd);

-    if (mb_col < pc->mb_cols - 1)

-      xd->above_context++;

-    if (mb_row < pc->mb_rows - 1)

-      xd->left_context++;

-    vp9_reset_mb_tokens_context(xd);

-    if (mb_col < pc->mb_cols - 1)

-      xd->above_context--;

-    if (mb_row < pc->mb_rows - 1)

-      xd->left_context--;

+    vp9_reset_sb_tokens_context(xd);

     /* Special case:  Force the loopfilter to skip when eobtotal and

      * mb_skip_coeff are zero.

*/

-    skip_recon_mb(pbi, xd);

+    skip_recon_mb(pbi, xd, mb_row, mb_col);

     return;

@@ -853,64 +667,131 @@

   } else {

     vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,

                                        xd->dst.u_buffer, xd->dst.v_buffer,

-                                       xd->dst.y_stride, xd->dst.uv_stride);

+                                       xd->dst.y_stride, xd->dst.uv_stride,

+                                       mb_row, mb_col);

   /* dequantization and idct */

-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {

-    eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);

-    if (eobtotal == 0) {  // skip loopfilter

-      xd->mode_info_context->mbmi.mb_skip_coeff = 1;

+  eobtotal = vp9_decode_sb_tokens(pbi, xd, bc);

+  if (eobtotal == 0) {  // skip loopfilter

+    xd->mode_info_context->mbmi.mb_skip_coeff = 1;

+    if (mb_col + 1 < pc->mb_cols)

+      xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;

+    if (mb_row + 1 < pc->mb_rows) {

+      xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;

       if (mb_col + 1 < pc->mb_cols)

-        xd->mode_info_context[1].mbmi.mb_skip_coeff = 1;

-      if (mb_row + 1 < pc->mb_rows) {

-        xd->mode_info_context[mis].mbmi.mb_skip_coeff = 1;

-        if (mb_col + 1 < pc->mb_cols)

-          xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;

-      }

-    } else {

-      vp9_dequant_idct_add_32x32(xd->sb_coeff_data.qcoeff, xd->block[0].dequant,

-                                 xd->dst.y_buffer, xd->dst.y_buffer,

-                                 xd->dst.y_stride, xd->dst.y_stride,

-                                 xd->eobs[0]);

-      vp9_dequant_idct_add_uv_block_16x16_c(xd->sb_coeff_data.qcoeff + 1024,

-                                            xd->block[16].dequant,

-                                            xd->dst.u_buffer, xd->dst.v_buffer,

-                                            xd->dst.uv_stride, xd->eobs + 16);

+        xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = 1;

   } else {

-    for (n = 0; n < 4; n++) {

-      int x_idx = n & 1, y_idx = n >> 1;

+    switch (xd->mode_info_context->mbmi.txfm_size) {

+      case TX_32X32:

+        vp9_dequant_idct_add_32x32(xd->qcoeff, xd->block[0].dequant,

+                                   xd->dst.y_buffer, xd->dst.y_buffer,

+                                   xd->dst.y_stride, xd->dst.y_stride,

+                                   xd->eobs[0]);

+        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,

+                                              xd->block[16].dequant,

+                                              xd->dst.u_buffer,

+                                              xd->dst.v_buffer,

+                                              xd->dst.uv_stride, xd);

+        break;

+      case TX_16X16:

+        for (n = 0; n < 4; n++) {

+          const int x_idx = n & 1, y_idx = n >> 1;

+          const int y_offset = y_idx * 16 * xd->dst.y_stride + x_idx * 16;

+          const TX_TYPE tx_type = get_tx_type_16x16(xd,

+                                                    (y_idx * 8 + x_idx) * 4);

+          if (tx_type == DCT_DCT) {

+            vp9_dequant_idct_add_16x16(

+                xd->qcoeff + n * 256, xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);

+          } else {

+            vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]);

+          }

+        }

+        vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024,

+                                              xd->block[16].dequant,

+                                              xd->dst.u_buffer,

+                                              xd->dst.v_buffer,

+                                              xd->dst.uv_stride, xd);

+        break;

+      case TX_8X8:

+        for (n = 0; n < 16; n++) {

+          const int x_idx = n & 3, y_idx = n >> 2;

+          const int y_offset = y_idx * 8 * xd->dst.y_stride + x_idx * 8;

+          const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);

+          if (tx_type == DCT_DCT) {

+            vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);

+          } else {

+            vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]);

+          }

+        }

+        for (n = 0; n < 4; n++) {

+          const int x_idx = n & 1, y_idx = n >> 1;

+          const int uv_offset = y_idx * 8 * xd->dst.uv_stride + x_idx * 8;

+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1024,

+              xd->block[16].dequant,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n * 4]);

+          vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64 + 1280,

+              xd->block[20].dequant,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n * 4]);

+        }

+        break;

+      case TX_4X4:

+        for (n = 0; n < 64; n++) {

+          const int x_idx = n & 7, y_idx = n >> 3;

+          const int y_offset = y_idx * 4 * xd->dst.y_stride + x_idx * 4;

-      if (mb_col + x_idx >= pc->mb_cols || mb_row + y_idx >= pc->mb_rows)

-        continue;

+          const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);

+          if (tx_type == DCT_DCT) {

+            xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);

+          } else {

+            vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16,

+                xd->block[0].dequant,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_buffer + y_offset,

+                xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]);

+          }

+        }

-      xd->above_context = pc->above_context + mb_col + x_idx;

-      xd->left_context = pc->left_context + y_idx + (mb_row & 2);

-      xd->mode_info_context = orig_mi + x_idx + y_idx * mis;

-      for (i = 0; i < 25; i++) {

-        xd->block[i].eob = 0;

-        xd->eobs[i] = 0;

-      }

-      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);

-      if (eobtotal == 0) {  // skip loopfilter

-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;

-        continue;

-      }

-      if (tx_size == TX_16X16) {

-        decode_16x16_sb(pbi, xd, bc, n, 1, 1);

-      } else if (tx_size == TX_8X8) {

-        decode_8x8_sb(pbi, xd, bc, n, 1, 1);

-      } else {

-        decode_4x4_sb(pbi, xd, bc, n, 1, 1);

-      }

+        for (n = 0; n < 16; n++) {

+          const int x_idx = n & 3, y_idx = n >> 2;

+          const int uv_offset = y_idx * 4 * xd->dst.uv_stride + x_idx * 4;

+          xd->itxm_add(xd->qcoeff + 1024 + n * 16,

+              xd->block[16].dequant,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.u_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[64 + n]);

+          xd->itxm_add(xd->qcoeff + 1280 + n * 16,

+              xd->block[20].dequant,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.v_buffer + uv_offset,

+              xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n]);

+        }

+        break;

+      default: assert(0);

-    xd->above_context = pc->above_context + mb_col;

-    xd->left_context = pc->left_context + (mb_row & 2);

-    xd->mode_info_context = orig_mi;

@@ -919,7 +800,6 @@

                               BOOL_DECODER* const bc) {

   int eobtotal = 0;

   MB_PREDICTION_MODE mode;

-  int i;

   int tx_size;

   assert(!xd->mode_info_context->mbmi.sb_type);

@@ -934,13 +814,8 @@

   if (xd->mode_info_context->mbmi.mb_skip_coeff) {

     vp9_reset_mb_tokens_context(xd);

   } else if (!bool_error(bc)) {

-    for (i = 0; i < 25; i++) {

-      xd->block[i].eob = 0;

-      xd->eobs[i] = 0;

-    }

-    if (mode != B_PRED) {

+    if (mode != B_PRED)

       eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);

-    }

   //mode = xd->mode_info_context->mbmi.mode;

@@ -948,24 +823,25 @@

     vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter,

                              &pbi->common);

-  if (eobtotal == 0 && mode != B_PRED && mode != SPLITMV

-      && mode != I8X8_PRED

-      && !bool_error(bc)) {

+  if (eobtotal == 0 &&

+      mode != B_PRED &&

+      mode != SPLITMV &&

+      mode != I8X8_PRED &&

+      !bool_error(bc)) {

     /* Special case:  Force the loopfilter to skip when eobtotal and

-     * mb_skip_coeff are zero.

-     * */

+       mb_skip_coeff are zero. */

     xd->mode_info_context->mbmi.mb_skip_coeff = 1;

-    skip_recon_mb(pbi, xd);

+    skip_recon_mb(pbi, xd, mb_row, mb_col);

     return;

-#ifdef DEC_DEBUG

+#if 0  // def DEC_DEBUG

   if (dec_debug)

     printf("Decoding mb:  %d %d\n", xd->mode_info_context->mbmi.mode, tx_size);

 #endif

   // moved to be performed before detokenization

-//  if (xd->segmentation_enabled)

-//    mb_init_dequantizer(pbi, xd);

+  //  if (xd->segmentation_enabled)

+  //    mb_init_dequantizer(pbi, xd);

   /* do prediction */

   if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {

@@ -976,13 +852,13 @@

   } else {

-#ifdef DEC_DEBUG

+#if 0  // def DEC_DEBUG

   if (dec_debug)

     printf("Decoding mb:  %d %d interp %d\n",

            xd->mode_info_context->mbmi.mode, tx_size,

            xd->mode_info_context->mbmi.interp_filter);

 #endif

-    vp9_build_inter_predictors_mb(xd);

+    vp9_build_inter_predictors_mb(xd, mb_row, mb_col);

   if (tx_size == TX_16X16) {

@@ -996,6 +872,13 @@

   if (dec_debug) {

     int i, j;

     printf("\n");

+    printf("predictor y\n");

+    for (i = 0; i < 16; i++) {

+      for (j = 0; j < 16; j++)

+        printf("%3d ", xd->predictor[i * 16 + j]);

+      printf("\n");

+    }

+    printf("\n");

     printf("final y\n");

     for (i = 0; i < 16; i++) {

       for (j = 0; j < 16; j++)

@@ -1062,18 +945,13 @@

   xd->above_context = cm->above_context + mb_col;

   xd->left_context = cm->left_context + (mb_row & 3);

-  /* Distance of Mb to the various image edges.

-   * These are specified to 8th pel as they are always compared to

-   * values that are in 1/8th pel units

-   */

+  // Distance of Mb to the various image edges.

+  // These are specified to 8th pel as they are always compared to

+  // values that are in 1/8th pel units

   block_size >>= 4;  // in mb units

-  xd->mb_to_top_edge = -((mb_row * 16)) << 3;

-  xd->mb_to_left_edge = -((mb_col * 16) << 3);

-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;

-  xd->mb_to_right_edge = ((cm->mb_cols - block_size - mb_col) * 16) << 3;

-  xd->up_available = (mb_row != 0);

-  xd->left_available = (mb_col != 0);

+  set_mb_row(cm, xd, mb_row, block_size);

+  set_mb_col(cm, xd, mb_col, block_size);

   xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;

   xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;

@@ -1080,71 +958,34 @@

   xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;

-static void set_refs(VP9D_COMP *pbi, int block_size,

-                     int mb_row, int mb_col) {

+static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) {

   VP9_COMMON *const cm = &pbi->common;

   MACROBLOCKD *const xd = &pbi->mb;

-  MODE_INFO *mi = xd->mode_info_context;

-  MB_MODE_INFO *const mbmi = &mi->mbmi;

+  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

   if (mbmi->ref_frame > INTRA_FRAME) {

-    int ref_fb_idx, ref_yoffset, ref_uvoffset, ref_y_stride, ref_uv_stride;

+    // Select the appropriate reference frame for this MB

+    int ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1];

+    xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1];

+    xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1];

+    setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col,

+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);

-    /* Select the appropriate reference frame for this MB */

-    if (mbmi->ref_frame == LAST_FRAME)

-      ref_fb_idx = cm->lst_fb_idx;

-    else if (mbmi->ref_frame == GOLDEN_FRAME)

-      ref_fb_idx = cm->gld_fb_idx;

-    else

-      ref_fb_idx = cm->alt_fb_idx;

-    ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;

-    ref_yoffset = mb_row * 16 * ref_y_stride + 16 * mb_col;

-    xd->pre.y_buffer = cm->yv12_fb[ref_fb_idx].y_buffer + ref_yoffset;

-    ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;

-    ref_uvoffset = mb_row * 8 * ref_uv_stride + 8 * mb_col;

-    xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + ref_uvoffset;

-    xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + ref_uvoffset;

-    /* propagate errors from reference frames */

+    // propagate errors from reference frames

     xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted;

     if (mbmi->second_ref_frame > INTRA_FRAME) {

-      int second_ref_fb_idx;

+      // Select the appropriate reference frame for this MB

+      int second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1];

-      /* Select the appropriate reference frame for this MB */

-      if (mbmi->second_ref_frame == LAST_FRAME)

-        second_ref_fb_idx = cm->lst_fb_idx;

-      else if (mbmi->second_ref_frame == GOLDEN_FRAME)

-        second_ref_fb_idx = cm->gld_fb_idx;

-      else

-        second_ref_fb_idx = cm->alt_fb_idx;

+      setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx],

+                       mb_row, mb_col,

+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);

-      xd->second_pre.y_buffer =

-          cm->yv12_fb[second_ref_fb_idx].y_buffer + ref_yoffset;

-      xd->second_pre.u_buffer =

-          cm->yv12_fb[second_ref_fb_idx].u_buffer + ref_uvoffset;

-      xd->second_pre.v_buffer =

-          cm->yv12_fb[second_ref_fb_idx].v_buffer + ref_uvoffset;

-      /* propagate errors from reference frames */

+      // propagate errors from reference frames

       xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted;

-  if (mbmi->sb_type) {

-    const int n_mbs = 1 << mbmi->sb_type;

-    const int y_mbs = MIN(n_mbs, cm->mb_rows - mb_row);

-    const int x_mbs = MIN(n_mbs, cm->mb_cols - mb_col);

-    const int mis = cm->mode_info_stride;

-    int x, y;

-    for (y = 0; y < y_mbs; y++) {

-      for (x = !y; x < x_mbs; x++) {

-        mi[y * mis + x] = *mi;

-      }

-    }

-  }

 /* Decode a row of Superblocks (2x2 region of MBs) */

@@ -1156,8 +997,15 @@

   // For a SB there are 2 left contexts, each pertaining to a MB row within

   vpx_memset(pc->left_context, 0, sizeof(pc->left_context));

-  for (mb_col = 0; mb_col < pc->mb_cols; mb_col += 4) {

+  for (mb_col = pc->cur_tile_mb_col_start;

+       mb_col < pc->cur_tile_mb_col_end; mb_col += 4) {

     if (vp9_read(bc, pc->sb64_coded)) {

+#ifdef DEC_DEBUG

+      dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&

+                   mb_row == 8 && mb_col == 0);

+      if (dec_debug)

+        printf("Debug Decode SB64\n");

+#endif

       set_offsets(pbi, 64, mb_row, mb_col);

       vp9_decode_mb_mode_mv(pbi, xd, mb_row, mb_col, bc);

       set_refs(pbi, 64, mb_row, mb_col);

@@ -1178,6 +1026,12 @@

         xd->sb_index = j;

         if (vp9_read(bc, pc->sb32_coded)) {

+#ifdef DEC_DEBUG

+          dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&

+                       mb_row + y_idx_sb == 8 && mb_col + x_idx_sb == 0);

+          if (dec_debug)

+            printf("Debug Decode SB32\n");

+#endif

           set_offsets(pbi, 32, mb_row + y_idx_sb, mb_col + x_idx_sb);

           vp9_decode_mb_mode_mv(pbi,

                                 xd, mb_row + y_idx_sb, mb_col + x_idx_sb, bc);

@@ -1198,14 +1052,18 @@

               // MB lies outside frame, skip on to next

               continue;

+#ifdef DEC_DEBUG

+            dec_debug = (pc->current_video_frame == 11 && pc->show_frame &&

+                         mb_row + y_idx == 8 && mb_col + x_idx == 0);

+            if (dec_debug)

+              printf("Debug Decode MB\n");

+#endif

             set_offsets(pbi, 16, mb_row + y_idx, mb_col + x_idx);

             xd->mb_index = i;

             vp9_decode_mb_mode_mv(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);

-            update_blockd_bmi(xd);

             set_refs(pbi, 16, mb_row + y_idx, mb_col + x_idx);

-            vp9_intra_prediction_down_copy(xd);

-            decode_macroblock(pbi, xd, mb_row, mb_col, bc);

+            decode_macroblock(pbi, xd, mb_row + y_idx, mb_col + x_idx, bc);

             /* check if the boolean decoder has suffered an error */

             xd->corrupted |= bool_error(bc);

@@ -1216,38 +1074,19 @@

-static unsigned int read_partition_size(const unsigned char *cx_size) {

-  const unsigned int size =

-    cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16);

-  return size;

-}

-static int read_is_valid(const unsigned char *start,

-                         size_t               len,

-                         const unsigned char *end) {

-  return (start + len > start && start + len <= end);

-}

 static void setup_token_decoder(VP9D_COMP *pbi,

                                 const unsigned char *cx_data,

                                 BOOL_DECODER* const bool_decoder) {

-  VP9_COMMON          *pc = &pbi->common;

+  VP9_COMMON *pc = &pbi->common;

   const unsigned char *user_data_end = pbi->Source + pbi->source_sz;

-  const unsigned char *partition;

+  const unsigned char *partition = cx_data;

+  ptrdiff_t bytes_left = user_data_end - partition;

+  ptrdiff_t partition_size = bytes_left;

-  ptrdiff_t            partition_size;

-  ptrdiff_t            bytes_left;

-  // Set up pointers to token partition

-  partition = cx_data;

-  bytes_left = user_data_end - partition;

-  partition_size = bytes_left;

-  /* Validate the calculated partition length. If the buffer

-   * described by the partition can't be fully read, then restrict

-   * it to the portion that can be (for EC mode) or throw an error.

-   */

+  // Validate the calculated partition length. If the buffer

+  // described by the partition can't be fully read, then restrict

+  // it to the portion that can be (for EC mode) or throw an error.

   if (!read_is_valid(partition, partition_size, user_data_end)) {

     vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

                        "Truncated packet or corrupt partition "

@@ -1262,100 +1101,251 @@

 static void init_frame(VP9D_COMP *pbi) {

   VP9_COMMON *const pc = &pbi->common;

-  MACROBLOCKD *const xd  = &pbi->mb;

+  MACROBLOCKD *const xd = &pbi->mb;

   if (pc->frame_type == KEY_FRAME) {

+    vp9_setup_past_independence(pc, xd);

+    // All buffers are implicitly updated on key frames.

+    pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1;

+  } else if (pc->error_resilient_mode) {

+    vp9_setup_past_independence(pc, xd);

+  }

-    if (pc->last_frame_seg_map)

-      vpx_memset(pc->last_frame_seg_map, 0, (pc->mb_rows * pc->mb_cols));

+  if (pc->frame_type != KEY_FRAME) {

+    pc->mcomp_filter_type = pc->use_bilinear_mc_filter ? BILINEAR : EIGHTTAP;

-    vp9_init_mv_probs(pc);

+    // To enable choice of different interpolation filters

+    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);

+  }

-    vp9_init_mbmode_probs(pc);

-    vp9_default_bmode_probs(pc->fc.bmode_prob);

+  xd->mode_info_context = pc->mi;

+  xd->prev_mode_info_context = pc->prev_mi;

+  xd->frame_type = pc->frame_type;

+  xd->mode_info_context->mbmi.mode = DC_PRED;

+  xd->mode_info_stride = pc->mode_info_stride;

+  xd->corrupted = 0;

+  xd->fullpixel_mask = pc->full_pixel ? 0xfffffff8 : 0xffffffff;

+}

-    vp9_default_coef_probs(pc);

-    vp9_kf_default_bmode_probs(pc->kf_bmode_prob);

+#if CONFIG_CODE_NONZEROCOUNT

+static void read_nzc_probs_common(VP9_COMMON *cm,

+                                  BOOL_DECODER* const bc,

+                                  int block_size) {

+  int c, r, b, t;

+  int tokens, nodes;

+  vp9_prob *nzc_probs;

+  vp9_prob upd;

-    // Reset the segment feature data to the default stats:

-    // Features disabled, 0, with delta coding (Default state).

-    vp9_clearall_segfeatures(xd);

+  if (!vp9_read_bit(bc)) return;

-    xd->mb_segment_abs_delta = SEGMENT_DELTADATA;

+  if (block_size == 32) {

+    tokens = NZC32X32_TOKENS;

+    nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];

+    upd = NZC_UPDATE_PROB_32X32;

+  } else if (block_size == 16) {

+    tokens = NZC16X16_TOKENS;

+    nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];

+    upd = NZC_UPDATE_PROB_16X16;

+  } else if (block_size == 8) {

+    tokens = NZC8X8_TOKENS;

+    nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];

+    upd = NZC_UPDATE_PROB_8X8;

+  } else {

+    tokens = NZC4X4_TOKENS;

+    nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];

+    upd = NZC_UPDATE_PROB_4X4;

+  }

+  nodes = tokens - 1;

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;

+        int offset_nodes = offset * nodes;

+        for (t = 0; t < nodes; ++t) {

+          vp9_prob *p = &nzc_probs[offset_nodes + t];

+          if (vp9_read(bc, upd)) {

+            *p = read_prob_diff_update(bc, *p);

+          }

+        }

+      }

+    }

+  }

+}

-    /* reset the mode ref deltasa for loop filter */

-    vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));

-    vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));

+static void read_nzc_pcat_probs(VP9_COMMON *cm, BOOL_DECODER* const bc) {

+  int c, t, b;

+  vp9_prob upd = NZC_UPDATE_PROB_PCAT;

+  if (!vp9_read_bit(bc)) {

+    return;

+  }

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {

+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];

+      for (b = 0; b < bits; ++b) {

+        vp9_prob *p = &cm->fc.nzc_pcat_probs[c][t][b];

+        if (vp9_read(bc, upd)) {

+          *p = read_prob_diff_update(bc, *p);

+        }

+      }

+    }

+  }

+}

-    /* All buffers are implicitly updated on key frames. */

-    pc->refresh_golden_frame = 1;

-    pc->refresh_alt_ref_frame = 1;

-    pc->copy_buffer_to_gf = 0;

-    pc->copy_buffer_to_arf = 0;

+static void read_nzc_probs(VP9_COMMON *cm,

+                           BOOL_DECODER* const bc) {

+  read_nzc_probs_common(cm, bc, 4);

+  if (cm->txfm_mode != ONLY_4X4)

+    read_nzc_probs_common(cm, bc, 8);

+  if (cm->txfm_mode > ALLOW_8X8)

+    read_nzc_probs_common(cm, bc, 16);

+  if (cm->txfm_mode > ALLOW_16X16)

+    read_nzc_probs_common(cm, bc, 32);

+#ifdef NZC_PCAT_UPDATE

+  read_nzc_pcat_probs(cm, bc);

+#endif

+}

+#endif  // CONFIG_CODE_NONZEROCOUNT

-    /* Note that Golden and Altref modes cannot be used on a key frame so

-     * ref_frame_sign_bias[] is undefined and meaningless

-     */

-    pc->ref_frame_sign_bias[GOLDEN_FRAME] = 0;

-    pc->ref_frame_sign_bias[ALTREF_FRAME] = 0;

+static void read_coef_probs_common(BOOL_DECODER* const bc,

+                                   vp9_coeff_probs *coef_probs,

+                                   int block_types) {

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE

+  const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES;

+#else

+  const int entropy_nodes_update = ENTROPY_NODES;

+#endif

-    vp9_init_mode_contexts(&pbi->common);

-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));

-    vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));

+  int i, j, k, l, m;

-    vpx_memset(pc->prev_mip, 0,

-               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));

-    vpx_memset(pc->mip, 0,

-               (pc->mb_cols + 1) * (pc->mb_rows + 1)* sizeof(MODE_INFO));

+  if (vp9_read_bit(bc)) {

+    for (i = 0; i < block_types; i++) {

+      for (j = 0; j < REF_TYPES; j++) {

+        for (k = 0; k < COEF_BANDS; k++) {

+          for (l = 0; l < PREV_COEF_CONTEXTS; l++) {

+            if (l >= 3 && k == 0)

+              continue;

+            for (m = CONFIG_CODE_NONZEROCOUNT; m < entropy_nodes_update; m++) {

+              vp9_prob *const p = coef_probs[i][j][k][l] + m;

-    vp9_update_mode_info_border(pc, pc->mip);

-    vp9_update_mode_info_in_image(pc, pc->mi);

+              if (vp9_read(bc, vp9_coef_update_prob[m])) {

+                *p = read_prob_diff_update(bc, *p);

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE

+                if (m == UNCONSTRAINED_NODES - 1)

+                  vp9_get_model_distribution(*p, coef_probs[i][j][k][l], i, j);

+#endif

+              }

+            }

+          }

+        }

+      }

+    }

+  }

+}

+static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {

+  VP9_COMMON *const pc = &pbi->common;

-  } else {

+  read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES);

-    if (!pc->use_bilinear_mc_filter)

-      pc->mcomp_filter_type = EIGHTTAP;

-    else

-      pc->mcomp_filter_type = BILINEAR;

+  if (pbi->common.txfm_mode != ONLY_4X4)

+    read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES);

-    /* To enable choice of different interpolation filters */

-    vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);

-  }

+  if (pbi->common.txfm_mode > ALLOW_8X8)

+    read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES);

-  xd->mode_info_context = pc->mi;

-  xd->prev_mode_info_context = pc->prev_mi;

-  xd->frame_type = pc->frame_type;

-  xd->mode_info_context->mbmi.mode = DC_PRED;

-  xd->mode_info_stride = pc->mode_info_stride;

-  xd->corrupted = 0; /* init without corruption */

+  if (pbi->common.txfm_mode > ALLOW_16X16)

+    read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES);

+}

-  xd->fullpixel_mask = 0xffffffff;

-  if (pc->full_pixel)

-    xd->fullpixel_mask = 0xfffffff8;

+static void update_frame_size(VP9D_COMP *pbi) {

+  VP9_COMMON *cm = &pbi->common;

+  /* our internal buffers are always multiples of 16 */

+  const int width = (cm->width + 15) & ~15;

+  const int height = (cm->height + 15) & ~15;

+  cm->mb_rows = height >> 4;

+  cm->mb_cols = width >> 4;

+  cm->MBs = cm->mb_rows * cm->mb_cols;

+  cm->mode_info_stride = cm->mb_cols + 1;

+  memset(cm->mip, 0,

+        (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));

+  vp9_update_mode_info_border(cm, cm->mip);

+  cm->mi = cm->mip + cm->mode_info_stride + 1;

+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;

+  vp9_update_mode_info_in_image(cm, cm->mi);

-static void read_coef_probs_common(BOOL_DECODER* const bc,

-                                   vp9_coeff_probs *coef_probs,

-                                   int block_types) {

-  int i, j, k, l;

+static void setup_segmentation(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {

+  int i, j;

-  if (vp9_read_bit(bc)) {

-    for (i = 0; i < block_types; i++) {

-      for (j = !i; j < COEF_BANDS; j++) {

-        /* NB: This j loop starts from 1 on block type i == 0 */

-        for (k = 0; k < PREV_COEF_CONTEXTS; k++) {

-          if (k >= 3 && ((i == 0 && j == 1) ||

-                         (i > 0 && j == 0)))

-            continue;

-          for (l = 0; l < ENTROPY_NODES; l++) {

-            vp9_prob *const p = coef_probs[i][j][k] + l;

+  xd->segmentation_enabled = vp9_read_bit(r);

+  if (xd->segmentation_enabled) {

+    // Read whether or not the segmentation map is being explicitly updated

+    // this frame.

+    xd->update_mb_segmentation_map = vp9_read_bit(r);

-            if (vp9_read(bc, COEF_UPDATE_PROB)) {

-              *p = read_prob_diff_update(bc, *p);

+    // If so what method will be used.

+    if (xd->update_mb_segmentation_map) {

+      // Which macro block level features are enabled. Read the probs used to

+      // decode the segment id for each macro block.

+      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {

+        xd->mb_segment_tree_probs[i] = vp9_read_bit(r) ? vp9_read_prob(r) : 255;

+      }

+      // Read the prediction probs needed to decode the segment id

+      pc->temporal_update = vp9_read_bit(r);

+      for (i = 0; i < PREDICTION_PROBS; i++) {

+        pc->segment_pred_probs[i] = pc->temporal_update

+            ? (vp9_read_bit(r) ? vp9_read_prob(r) : 255)

+            : 255;

+      }

+      if (pc->temporal_update) {

+        const vp9_prob *p = xd->mb_segment_tree_probs;

+        vp9_prob *p_mod = xd->mb_segment_mispred_tree_probs;

+        const int c0 =        p[0]  *        p[1];

+        const int c1 =        p[0]  * (256 - p[1]);

+        const int c2 = (256 - p[0]) *        p[2];

+        const int c3 = (256 - p[0]) * (256 - p[2]);

+        p_mod[0] = get_binary_prob(c1, c2 + c3);

+        p_mod[1] = get_binary_prob(c0, c2 + c3);

+        p_mod[2] = get_binary_prob(c0 + c1, c3);

+        p_mod[3] = get_binary_prob(c0 + c1, c2);

+      }

+    }

+    xd->update_mb_segmentation_data = vp9_read_bit(r);

+    if (xd->update_mb_segmentation_data) {

+      int data;

+      xd->mb_segment_abs_delta = vp9_read_bit(r);

+      vp9_clearall_segfeatures(xd);

+      // For each segmentation...

+      for (i = 0; i < MAX_MB_SEGMENTS; i++) {

+        // For each of the segments features...

+        for (j = 0; j < SEG_LVL_MAX; j++) {

+          // Is the feature enabled

+          if (vp9_read_bit(r)) {

+            // Update the feature data and mask

+            vp9_enable_segfeature(xd, i, j);

+            data = vp9_decode_unsigned_max(r, vp9_seg_feature_data_max(j));

+            // Is the segment data signed..

+            if (vp9_is_segfeature_signed(j)) {

+              if (vp9_read_bit(r))

+                data = -data;

+          } else {

+            data = 0;

+          vp9_set_segdata(xd, i, j, data);

@@ -1362,23 +1352,266 @@

-static void read_coef_probs(VP9D_COMP *pbi, BOOL_DECODER* const bc) {

+static void setup_loopfilter(VP9_COMMON *pc, MACROBLOCKD *xd, vp9_reader *r) {

+  int i;

+  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(r);

+  pc->filter_level = vp9_read_literal(r, 6);

+  pc->sharpness_level = vp9_read_literal(r, 3);

+#if CONFIG_LOOP_DERING

+  if (vp9_read_bit(r))

+    pc->dering_enabled = 1 + vp9_read_literal(r, 4);

+  else

+    pc->dering_enabled = 0;

+#endif

+  // Read in loop filter deltas applied at the MB level based on mode or ref

+  // frame.

+  xd->mode_ref_lf_delta_update = 0;

+  xd->mode_ref_lf_delta_enabled = vp9_read_bit(r);

+  if (xd->mode_ref_lf_delta_enabled) {

+    // Do the deltas need to be updated

+    xd->mode_ref_lf_delta_update = vp9_read_bit(r);

+    if (xd->mode_ref_lf_delta_update) {

+      // Send update

+      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {

+        if (vp9_read_bit(r)) {

+          // sign = vp9_read_bit(r);

+          xd->ref_lf_deltas[i] = vp9_read_literal(r, 6);

+          if (vp9_read_bit(r))

+            xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i];  // Apply sign

+        }

+      }

+      // Send update

+      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {

+        if (vp9_read_bit(r)) {

+          // sign = vp9_read_bit(r);

+          xd->mode_lf_deltas[i] = vp9_read_literal(r, 6);

+          if (vp9_read_bit(r))

+            xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i];  // Apply sign

+        }

+      }

+    }

+  }

+}

+static const uint8_t *setup_frame_size(VP9D_COMP *pbi, int scaling_active,

+                                      const uint8_t *data,

+                                      const uint8_t *data_end) {

   VP9_COMMON *const pc = &pbi->common;

+  const int width = pc->width;

+  const int height = pc->height;

-  read_coef_probs_common(bc, pc->fc.coef_probs_4x4, BLOCK_TYPES_4X4);

-  read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);

+  // If error concealment is enabled we should only parse the new size

+  // if we have enough data. Otherwise we will end up with the wrong size.

+  if (scaling_active && data + 4 < data_end) {

+    pc->display_width = read_le16(data + 0);

+    pc->display_height = read_le16(data + 2);

+    data += 4;

+  }

-  if (pbi->common.txfm_mode != ONLY_4X4) {

-    read_coef_probs_common(bc, pc->fc.coef_probs_8x8, BLOCK_TYPES_8X8);

-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);

+  if (data + 4 < data_end) {

+    pc->width = read_le16(data + 0);

+    pc->height = read_le16(data + 2);

+    data += 4;

-  if (pbi->common.txfm_mode > ALLOW_8X8) {

-    read_coef_probs_common(bc, pc->fc.coef_probs_16x16, BLOCK_TYPES_16X16);

-    read_coef_probs_common(bc, pc->fc.hybrid_coef_probs_16x16,

-                           BLOCK_TYPES_16X16);

+  if (!scaling_active) {

+    pc->display_width = pc->width;

+    pc->display_height = pc->height;

+  }

+  if (width != pc->width || height != pc->height) {

+    if (pc->width <= 0) {

+      pc->width = width;

+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

+                         "Invalid frame width");

+    }

+    if (pc->height <= 0) {

+      pc->height = height;

+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

+                         "Invalid frame height");

+    }

+    if (!pbi->initial_width || !pbi->initial_height) {

+      if (vp9_alloc_frame_buffers(pc, pc->width, pc->height))

+        vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,

+                           "Failed to allocate frame buffers");

+      pbi->initial_width = pc->width;

+      pbi->initial_height = pc->height;

+    }

+    if (pc->width > pbi->initial_width) {

+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

+                         "Frame width too large");

+    }

+    if (pc->height > pbi->initial_height) {

+      vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

+                         "Frame height too large");

+    }

+    update_frame_size(pbi);

+  }

+  return data;

+}

+static void update_frame_context(VP9D_COMP *pbi, vp9_reader *r) {

+  FRAME_CONTEXT *const fc = &pbi->common.fc;

+  vp9_copy(fc->pre_coef_probs_4x4, fc->coef_probs_4x4);

+  vp9_copy(fc->pre_coef_probs_8x8, fc->coef_probs_8x8);

+  vp9_copy(fc->pre_coef_probs_16x16, fc->coef_probs_16x16);

+  vp9_copy(fc->pre_coef_probs_32x32, fc->coef_probs_32x32);

+  vp9_copy(fc->pre_ymode_prob, fc->ymode_prob);

+  vp9_copy(fc->pre_sb_ymode_prob, fc->sb_ymode_prob);

+  vp9_copy(fc->pre_uv_mode_prob, fc->uv_mode_prob);

+  vp9_copy(fc->pre_bmode_prob, fc->bmode_prob);

+  vp9_copy(fc->pre_i8x8_mode_prob, fc->i8x8_mode_prob);

+  vp9_copy(fc->pre_sub_mv_ref_prob, fc->sub_mv_ref_prob);

+  vp9_copy(fc->pre_mbsplit_prob, fc->mbsplit_prob);

+  fc->pre_nmvc = fc->nmvc;

+  vp9_zero(fc->coef_counts_4x4);

+  vp9_zero(fc->coef_counts_8x8);

+  vp9_zero(fc->coef_counts_16x16);

+  vp9_zero(fc->coef_counts_32x32);

+  vp9_zero(fc->eob_branch_counts);

+  vp9_zero(fc->ymode_counts);

+  vp9_zero(fc->sb_ymode_counts);

+  vp9_zero(fc->uv_mode_counts);

+  vp9_zero(fc->bmode_counts);

+  vp9_zero(fc->i8x8_mode_counts);

+  vp9_zero(fc->sub_mv_ref_counts);

+  vp9_zero(fc->mbsplit_counts);

+  vp9_zero(fc->NMVcount);

+  vp9_zero(fc->mv_ref_ct);

+#if CONFIG_COMP_INTERINTRA_PRED

+  fc->pre_interintra_prob = fc->interintra_prob;

+  vp9_zero(fc->interintra_counts);

+#endif

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_copy(fc->pre_nzc_probs_4x4, fc->nzc_probs_4x4);

+  vp9_copy(fc->pre_nzc_probs_8x8, fc->nzc_probs_8x8);

+  vp9_copy(fc->pre_nzc_probs_16x16, fc->nzc_probs_16x16);

+  vp9_copy(fc->pre_nzc_probs_32x32, fc->nzc_probs_32x32);

+  vp9_copy(fc->pre_nzc_pcat_probs, fc->nzc_pcat_probs);

+  vp9_zero(fc->nzc_counts_4x4);

+  vp9_zero(fc->nzc_counts_8x8);

+  vp9_zero(fc->nzc_counts_16x16);

+  vp9_zero(fc->nzc_counts_32x32);

+  vp9_zero(fc->nzc_pcat_counts);

+#endif

+  read_coef_probs(pbi, r);

+#if CONFIG_CODE_NONZEROCOUNT

+  read_nzc_probs(&pbi->common, r);

+#endif

+}

+static void decode_tiles(VP9D_COMP *pbi,

+                         const uint8_t *data, int first_partition_size,

+                         BOOL_DECODER *header_bc, BOOL_DECODER *residual_bc) {

+  VP9_COMMON *const pc = &pbi->common;

+  MACROBLOCKD *const xd  = &pbi->mb;

+  const uint8_t *data_ptr = data + first_partition_size;

+  int tile_row, tile_col, delta_log2_tiles;

+  int mb_row;

+  vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles);

+  while (delta_log2_tiles--) {

+    if (vp9_read_bit(header_bc)) {

+      pc->log2_tile_columns++;

+    } else {

+      break;

+    }

-  if (pbi->common.txfm_mode > ALLOW_16X16) {

-    read_coef_probs_common(bc, pc->fc.coef_probs_32x32, BLOCK_TYPES_32X32);

+  pc->log2_tile_rows = vp9_read_bit(header_bc);

+  if (pc->log2_tile_rows)

+    pc->log2_tile_rows += vp9_read_bit(header_bc);

+  pc->tile_columns = 1 << pc->log2_tile_columns;

+  pc->tile_rows    = 1 << pc->log2_tile_rows;

+  vpx_memset(pc->above_context, 0,

+             sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);

+  if (pbi->oxcf.inv_tile_order) {

+    const int n_cols = pc->tile_columns;

+    const uint8_t *data_ptr2[4][1 << 6];

+    BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak);

+    // pre-initialize the offsets, we're going to read in inverse order

+    data_ptr2[0][0] = data_ptr;

+    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {

+      if (tile_row) {

+        const int size = read_le32(data_ptr2[tile_row - 1][n_cols - 1]);

+        data_ptr2[tile_row - 1][n_cols - 1] += 4;

+        data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][n_cols - 1] + size;

+      }

+      for (tile_col = 1; tile_col < n_cols; tile_col++) {

+        const int size = read_le32(data_ptr2[tile_row][tile_col - 1]);

+        data_ptr2[tile_row][tile_col - 1] += 4;

+        data_ptr2[tile_row][tile_col] =

+            data_ptr2[tile_row][tile_col - 1] + size;

+      }

+    }

+    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {

+      vp9_get_tile_row_offsets(pc, tile_row);

+      for (tile_col = n_cols - 1; tile_col >= 0; tile_col--) {

+        vp9_get_tile_col_offsets(pc, tile_col);

+        setup_token_decoder(pbi, data_ptr2[tile_row][tile_col], residual_bc);

+        // Decode a row of superblocks

+        for (mb_row = pc->cur_tile_mb_row_start;

+             mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {

+          decode_sb_row(pbi, pc, mb_row, xd, residual_bc);

+        }

+        if (tile_row == pc->tile_rows - 1 && tile_col == n_cols - 1)

+          bc_bak = *residual_bc;

+      }

+    }

+    *residual_bc = bc_bak;

+  } else {

+    int has_more;

+    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {

+      vp9_get_tile_row_offsets(pc, tile_row);

+      for (tile_col = 0; tile_col < pc->tile_columns; tile_col++) {

+        vp9_get_tile_col_offsets(pc, tile_col);

+        has_more = tile_col < pc->tile_columns - 1 ||

+                   tile_row < pc->tile_rows - 1;

+        // Setup decoder

+        setup_token_decoder(pbi, data_ptr + (has_more ? 4 : 0), residual_bc);

+        // Decode a row of superblocks

+        for (mb_row = pc->cur_tile_mb_row_start;

+             mb_row < pc->cur_tile_mb_row_end; mb_row += 4) {

+          decode_sb_row(pbi, pc, mb_row, xd, residual_bc);

+        }

+        if (has_more) {

+          const int size = read_le32(data_ptr);

+          data_ptr += 4 + size;

+        }

+      }

+    }

@@ -1386,31 +1619,28 @@

   BOOL_DECODER header_bc, residual_bc;

   VP9_COMMON *const pc = &pbi->common;

   MACROBLOCKD *const xd  = &pbi->mb;

-  const unsigned char *data = (const unsigned char *)pbi->Source;

-  const unsigned char *data_end = data + pbi->source_sz;

+  const uint8_t *data = (const uint8_t *)pbi->Source;

+  const uint8_t *data_end = data + pbi->source_sz;

   ptrdiff_t first_partition_length_in_bytes = 0;

+  int i, corrupt_tokens = 0;

-  int mb_row;

-  int i, j;

-  int corrupt_tokens = 0;

+  // printf("Decoding frame %d\n", pc->current_video_frame);

-  /* start with no corruption of current frame */

-  xd->corrupted = 0;

+  xd->corrupted = 0;  // start with no corruption of current frame

   pc->yv12_fb[pc->new_fb_idx].corrupted = 0;

   if (data_end - data < 3) {

-    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

-                       "Truncated packet");

+    vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");

   } else {

+    int scaling_active;

     pc->last_frame_type = pc->frame_type;

     pc->frame_type = (FRAME_TYPE)(data[0] & 1);

     pc->version = (data[0] >> 1) & 7;

     pc->show_frame = (data[0] >> 4) & 1;

-    first_partition_length_in_bytes =

-      (data[0] | (data[1] << 8) | (data[2] << 16)) >> 5;

+    scaling_active = (data[0] >> 5) & 1;

+    first_partition_length_in_bytes = read_le16(data + 1);

-    if ((data + first_partition_length_in_bytes > data_end

-         || data + first_partition_length_in_bytes < data))

+    if (!read_is_valid(data, first_partition_length_in_bytes, data_end))

       vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

                          "Truncated packet or corrupt partition 0 length");

@@ -1419,136 +1649,42 @@

     vp9_setup_version(pc);

     if (pc->frame_type == KEY_FRAME) {

-      const int Width = pc->Width;

-      const int Height = pc->Height;

-      /* vet via sync code */

-      /* When error concealment is enabled we should only check the sync

-       * code if we have enough bits available

-       */

+      // When error concealment is enabled we should only check the sync

+      // code if we have enough bits available

       if (data + 3 < data_end) {

         if (data[0] != 0x9d || data[1] != 0x01 || data[2] != 0x2a)

           vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM,

                              "Invalid frame sync code");

-      /* If error concealment is enabled we should only parse the new size

-       * if we have enough data. Otherwise we will end up with the wrong

-       * size.

-       */

-      if (data + 6 < data_end) {

-        pc->Width = (data[3] | (data[4] << 8)) & 0x3fff;

-        pc->horiz_scale = data[4] >> 6;

-        pc->Height = (data[5] | (data[6] << 8)) & 0x3fff;

-        pc->vert_scale = data[6] >> 6;

-      }

-      data += 7;

-      if (Width != pc->Width  ||  Height != pc->Height) {

-        if (pc->Width <= 0) {

-          pc->Width = Width;

-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

-                             "Invalid frame width");

-        }

-        if (pc->Height <= 0) {

-          pc->Height = Height;

-          vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME,

-                             "Invalid frame height");

-        }

-        if (vp9_alloc_frame_buffers(pc, pc->Width, pc->Height))

-          vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,

-                             "Failed to allocate frame buffers");

-      }

+      data += 3;

+    data = setup_frame_size(pbi, scaling_active, data, data_end);

-#ifdef DEC_DEBUG

-  printf("Decode frame %d\n", pc->current_video_frame);

-#endif

   if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME) ||

-      pc->Width == 0 || pc->Height == 0) {

+      pc->width == 0 || pc->height == 0) {

     return -1;

   init_frame(pbi);

+  // Reset the frame pointers to the current frame size

+  vp8_yv12_realloc_frame_buffer(&pc->yv12_fb[pc->new_fb_idx],

+                                pc->width, pc->height,

+                                VP9BORDERINPIXELS);

   if (vp9_start_decode(&header_bc, data,

                        (unsigned int)first_partition_length_in_bytes))

     vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,

                        "Failed to allocate bool decoder 0");

-  if (pc->frame_type == KEY_FRAME) {

-    pc->clr_type    = (YUV_TYPE)vp9_read_bit(&header_bc);

-    pc->clamp_type  = (CLAMP_TYPE)vp9_read_bit(&header_bc);

-  }

-  /* Is segmentation enabled */

-  xd->segmentation_enabled = (unsigned char)vp9_read_bit(&header_bc);

+  pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);

+  pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);

+  pc->error_resilient_mode = vp9_read_bit(&header_bc);

-  if (xd->segmentation_enabled) {

-    // Read whether or not the segmentation map is being explicitly

-    // updated this frame.

-    xd->update_mb_segmentation_map = (unsigned char)vp9_read_bit(&header_bc);

+  setup_segmentation(pc, xd, &header_bc);

-    // If so what method will be used.

-    if (xd->update_mb_segmentation_map) {

-      // Which macro block level features are enabled

-      // Read the probs used to decode the segment id for each macro

-      // block.

-      for (i = 0; i < MB_FEATURE_TREE_PROBS; i++) {

-          xd->mb_segment_tree_probs[i] = vp9_read_bit(&header_bc) ?

-              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;

-      }

-      // Read the prediction probs needed to decode the segment id

-      pc->temporal_update = (unsigned char)vp9_read_bit(&header_bc);

-      for (i = 0; i < PREDICTION_PROBS; i++) {

-        if (pc->temporal_update) {

-          pc->segment_pred_probs[i] = vp9_read_bit(&header_bc) ?

-              (vp9_prob)vp9_read_literal(&header_bc, 8) : 255;

-        } else {

-          pc->segment_pred_probs[i] = 255;

-        }

-      }

-    }

-    // Is the segment data being updated

-    xd->update_mb_segmentation_data = (unsigned char)vp9_read_bit(&header_bc);

-    if (xd->update_mb_segmentation_data) {

-      int data;

-      xd->mb_segment_abs_delta = (unsigned char)vp9_read_bit(&header_bc);

-      vp9_clearall_segfeatures(xd);

-      // For each segmentation...

-      for (i = 0; i < MAX_MB_SEGMENTS; i++) {

-        // For each of the segments features...

-        for (j = 0; j < SEG_LVL_MAX; j++) {

-          // Is the feature enabled

-          if (vp9_read_bit(&header_bc)) {

-            // Update the feature data and mask

-            vp9_enable_segfeature(xd, i, j);

-            data = vp9_decode_unsigned_max(&header_bc,

-                                           vp9_seg_feature_data_max(j));

-            // Is the segment data signed..

-            if (vp9_is_segfeature_signed(j)) {

-              if (vp9_read_bit(&header_bc))

-                data = -data;

-            }

-          } else

-            data = 0;

-          vp9_set_segdata(xd, i, j, data);

-        }

-      }

-    }

-  }

   // Read common prediction model status flag probability updates for the

   // reference frame

   if (pc->frame_type == KEY_FRAME) {

@@ -1556,81 +1692,43 @@

     pc->ref_pred_probs[0] = 120;

     pc->ref_pred_probs[1] = 80;

     pc->ref_pred_probs[2] = 40;

   } else {

     for (i = 0; i < PREDICTION_PROBS; i++) {

       if (vp9_read_bit(&header_bc))

-        pc->ref_pred_probs[i] = (vp9_prob)vp9_read_literal(&header_bc, 8);

+        pc->ref_pred_probs[i] = vp9_read_prob(&header_bc);

-  pc->sb64_coded = vp9_read_literal(&header_bc, 8);

-  pc->sb32_coded = vp9_read_literal(&header_bc, 8);

+  pc->sb64_coded = vp9_read_prob(&header_bc);

+  pc->sb32_coded = vp9_read_prob(&header_bc);

+  xd->lossless = vp9_read_bit(&header_bc);

+  if (xd->lossless) {

+    pc->txfm_mode = ONLY_4X4;

+  } else {

+    // Read the loop filter level and type

+    pc->txfm_mode = vp9_read_literal(&header_bc, 2);

+    if (pc->txfm_mode == ALLOW_32X32)

+      pc->txfm_mode += vp9_read_bit(&header_bc);

-  /* Read the loop filter level and type */

-  pc->txfm_mode = vp9_read_literal(&header_bc, 2);

-  if (pc->txfm_mode == 3)

-    pc->txfm_mode += vp9_read_bit(&header_bc);

-  if (pc->txfm_mode == TX_MODE_SELECT) {

-    pc->prob_tx[0] = vp9_read_literal(&header_bc, 8);

-    pc->prob_tx[1] = vp9_read_literal(&header_bc, 8);

-    pc->prob_tx[2] = vp9_read_literal(&header_bc, 8);

-  }

-  pc->filter_type = (LOOPFILTERTYPE) vp9_read_bit(&header_bc);

-  pc->filter_level = vp9_read_literal(&header_bc, 6);

-  pc->sharpness_level = vp9_read_literal(&header_bc, 3);

-  /* Read in loop filter deltas applied at the MB level based on mode or ref frame. */

-  xd->mode_ref_lf_delta_update = 0;

-  xd->mode_ref_lf_delta_enabled = (unsigned char)vp9_read_bit(&header_bc);

-  if (xd->mode_ref_lf_delta_enabled) {

-    /* Do the deltas need to be updated */

-    xd->mode_ref_lf_delta_update = (unsigned char)vp9_read_bit(&header_bc);

-    if (xd->mode_ref_lf_delta_update) {

-      /* Send update */

-      for (i = 0; i < MAX_REF_LF_DELTAS; i++) {

-        if (vp9_read_bit(&header_bc)) {

-          /*sign = vp9_read_bit( &header_bc );*/

-          xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);

-          if (vp9_read_bit(&header_bc))        /* Apply sign */

-            xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1;

-        }

-      }

-      /* Send update */

-      for (i = 0; i < MAX_MODE_LF_DELTAS; i++) {

-        if (vp9_read_bit(&header_bc)) {

-          /*sign = vp9_read_bit( &header_bc );*/

-          xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6);

-          if (vp9_read_bit(&header_bc))        /* Apply sign */

-            xd->mode_lf_deltas[i] = xd->mode_lf_deltas[i] * -1;

-        }

-      }

+    if (pc->txfm_mode == TX_MODE_SELECT) {

+      pc->prob_tx[0] = vp9_read_prob(&header_bc);

+      pc->prob_tx[1] = vp9_read_prob(&header_bc);

+      pc->prob_tx[2] = vp9_read_prob(&header_bc);

+  setup_loopfilter(pc, xd, &header_bc);

   // Dummy read for now

   vp9_read_literal(&header_bc, 2);

-  setup_token_decoder(pbi, data + first_partition_length_in_bytes,

-                      &residual_bc);

   /* Read the default quantizers. */

-    int Q, q_update;

+    int q_update = 0;

+    pc->base_qindex = vp9_read_literal(&header_bc, QINDEX_BITS);

-    Q = vp9_read_literal(&header_bc, QINDEX_BITS);

-    pc->base_qindex = Q;

-    q_update = 0;

     /* AC 1st order Q = default */

     pc->y1dc_delta_q = get_delta_q(&header_bc, pc->y1dc_delta_q, &q_update);

-    pc->y2dc_delta_q = get_delta_q(&header_bc, pc->y2dc_delta_q, &q_update);

-    pc->y2ac_delta_q = get_delta_q(&header_bc, pc->y2ac_delta_q, &q_update);

     pc->uvdc_delta_q = get_delta_q(&header_bc, pc->uvdc_delta_q, &q_update);

     pc->uvac_delta_q = get_delta_q(&header_bc, pc->uvac_delta_q, &q_update);

@@ -1641,58 +1739,52 @@

     mb_init_dequantizer(pbi, &pbi->mb);

-  /* Determine if the golden frame or ARF buffer should be updated and how.

-   * For all non key frames the GF and ARF refresh flags and sign bias

-   * flags must be set explicitly.

-   */

-  if (pc->frame_type != KEY_FRAME) {

-    /* Should the GF or ARF be updated from the current frame */

-    pc->refresh_golden_frame = vp9_read_bit(&header_bc);

-    pc->refresh_alt_ref_frame = vp9_read_bit(&header_bc);

+  // Determine if the golden frame or ARF buffer should be updated and how.

+  // For all non key frames the GF and ARF refresh flags and sign bias

+  // flags must be set explicitly.

+  if (pc->frame_type == KEY_FRAME) {

+    pc->active_ref_idx[0] = pc->new_fb_idx;

+    pc->active_ref_idx[1] = pc->new_fb_idx;

+    pc->active_ref_idx[2] = pc->new_fb_idx;

+  } else {

+    // Should the GF or ARF be updated from the current frame

+    pbi->refresh_frame_flags = vp9_read_literal(&header_bc, NUM_REF_FRAMES);

-    if (pc->refresh_alt_ref_frame) {

-      vpx_memcpy(&pc->fc, &pc->lfc_a, sizeof(pc->fc));

-    } else {

-      vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));

+    // Select active reference frames

+    for (i = 0; i < 3; i++) {

+      int ref_frame_num = vp9_read_literal(&header_bc, NUM_REF_FRAMES_LG2);

+      pc->active_ref_idx[i] = pc->ref_frame_map[ref_frame_num];

-    /* Buffer to buffer copy flags. */

-    pc->copy_buffer_to_gf = 0;

-    if (!pc->refresh_golden_frame)

-      pc->copy_buffer_to_gf = vp9_read_literal(&header_bc, 2);

-    pc->copy_buffer_to_arf = 0;

-    if (!pc->refresh_alt_ref_frame)

-      pc->copy_buffer_to_arf = vp9_read_literal(&header_bc, 2);

     pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc);

     pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc);

-    /* Is high precision mv allowed */

-    xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc);

+    // Is high precision mv allowed

+    xd->allow_high_precision_mv = vp9_read_bit(&header_bc);

     // Read the type of subpel filter to use

-    if (vp9_read_bit(&header_bc)) {

-      pc->mcomp_filter_type = SWITCHABLE;

-    } else {

-      pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2);

-    }

+    pc->mcomp_filter_type = vp9_read_bit(&header_bc)

+                                ? SWITCHABLE

+                                : vp9_read_literal(&header_bc, 2);

 #if CONFIG_COMP_INTERINTRA_PRED

     pc->use_interintra = vp9_read_bit(&header_bc);

 #endif

-    /* To enable choice of different interploation filters */

+    // To enable choice of different interploation filters

     vp9_setup_interp_filters(xd, pc->mcomp_filter_type, pc);

-  pc->refresh_entropy_probs = vp9_read_bit(&header_bc);

-  if (pc->refresh_entropy_probs == 0) {

-    vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));

+  if (!pc->error_resilient_mode) {

+    pc->refresh_entropy_probs = vp9_read_bit(&header_bc);

+    pc->frame_parallel_decoding_mode = vp9_read_bit(&header_bc);

+  } else {

+    pc->refresh_entropy_probs = 0;

+    pc->frame_parallel_decoding_mode = 1;

+  pc->frame_context_idx = vp9_read_literal(&header_bc, NUM_FRAME_CONTEXTS_LG2);

+  vpx_memcpy(&pc->fc, &pc->frame_contexts[pc->frame_context_idx],

+             sizeof(pc->fc));

-  pc->refresh_last_frame = (pc->frame_type == KEY_FRAME)

-                           || vp9_read_bit(&header_bc);

   // Read inter mode probability context updates

   if (pc->frame_type != KEY_FRAME) {

     int i, j;

@@ -1699,20 +1791,19 @@

     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {

       for (j = 0; j < 4; j++) {

         if (vp9_read(&header_bc, 252)) {

-          pc->fc.vp9_mode_contexts[i][j] =

-            (vp9_prob)vp9_read_literal(&header_bc, 8);

+          pc->fc.vp9_mode_contexts[i][j] = vp9_read_prob(&header_bc);

+#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS

+  if (pc->frame_type == KEY_FRAME)

+    vp9_adjust_default_coef_probs(pc);

+#endif

 #if CONFIG_NEW_MVREF

   // If Key frame reset mv ref id probabilities to defaults

-  if (pc->frame_type == KEY_FRAME) {

-    // Defaults probabilities for encoding the MV ref id signal

-    vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,

-               sizeof(xd->mb_mv_ref_probs));

-  } else {

+  if (pc->frame_type != KEY_FRAME) {

     // Read any mv_ref index probability updates

     int i, j;

@@ -1725,8 +1816,7 @@

       // Read any updates to probabilities

       for (j = 0; j < MAX_MV_REF_CANDIDATES - 1; ++j) {

         if (vp9_read(&header_bc, VP9_MVREF_UPDATE_PROB)) {

-          xd->mb_mv_ref_probs[i][j] =

-            (vp9_prob)vp9_read_literal(&header_bc, 8);

+          xd->mb_mv_ref_probs[i][j] = vp9_read_prob(&header_bc);

@@ -1735,66 +1825,22 @@

   if (0) {

     FILE *z = fopen("decodestats.stt", "a");

-    fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",

+    fprintf(z, "%6d F:%d,R:%d,Q:%d\n",

             pc->current_video_frame,

             pc->frame_type,

-            pc->refresh_golden_frame,

-            pc->refresh_alt_ref_frame,

-            pc->refresh_last_frame,

+            pbi->refresh_frame_flags,

             pc->base_qindex);

     fclose(z);

-  vp9_copy(pbi->common.fc.pre_coef_probs_4x4,

-           pbi->common.fc.coef_probs_4x4);

-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_4x4,

-           pbi->common.fc.hybrid_coef_probs_4x4);

-  vp9_copy(pbi->common.fc.pre_coef_probs_8x8,

-           pbi->common.fc.coef_probs_8x8);

-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_8x8,

-           pbi->common.fc.hybrid_coef_probs_8x8);

-  vp9_copy(pbi->common.fc.pre_coef_probs_16x16,

-           pbi->common.fc.coef_probs_16x16);

-  vp9_copy(pbi->common.fc.pre_hybrid_coef_probs_16x16,

-           pbi->common.fc.hybrid_coef_probs_16x16);

-  vp9_copy(pbi->common.fc.pre_coef_probs_32x32,

-           pbi->common.fc.coef_probs_32x32);

-  vp9_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);

-  vp9_copy(pbi->common.fc.pre_sb_ymode_prob, pbi->common.fc.sb_ymode_prob);

-  vp9_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);

-  vp9_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);

-  vp9_copy(pbi->common.fc.pre_i8x8_mode_prob, pbi->common.fc.i8x8_mode_prob);

-  vp9_copy(pbi->common.fc.pre_sub_mv_ref_prob, pbi->common.fc.sub_mv_ref_prob);

-  vp9_copy(pbi->common.fc.pre_mbsplit_prob, pbi->common.fc.mbsplit_prob);

-#if CONFIG_COMP_INTERINTRA_PRED

-  pbi->common.fc.pre_interintra_prob = pbi->common.fc.interintra_prob;

-#endif

-  pbi->common.fc.pre_nmvc = pbi->common.fc.nmvc;

-  vp9_zero(pbi->common.fc.coef_counts_4x4);

-  vp9_zero(pbi->common.fc.hybrid_coef_counts_4x4);

-  vp9_zero(pbi->common.fc.coef_counts_8x8);

-  vp9_zero(pbi->common.fc.hybrid_coef_counts_8x8);

-  vp9_zero(pbi->common.fc.coef_counts_16x16);

-  vp9_zero(pbi->common.fc.hybrid_coef_counts_16x16);

-  vp9_zero(pbi->common.fc.coef_counts_32x32);

-  vp9_zero(pbi->common.fc.ymode_counts);

-  vp9_zero(pbi->common.fc.sb_ymode_counts);

-  vp9_zero(pbi->common.fc.uv_mode_counts);

-  vp9_zero(pbi->common.fc.bmode_counts);

-  vp9_zero(pbi->common.fc.i8x8_mode_counts);

-  vp9_zero(pbi->common.fc.sub_mv_ref_counts);

-  vp9_zero(pbi->common.fc.mbsplit_counts);

-  vp9_zero(pbi->common.fc.NMVcount);

-  vp9_zero(pbi->common.fc.mv_ref_ct);

-#if CONFIG_COMP_INTERINTRA_PRED

-  vp9_zero(pbi->common.fc.interintra_counts);

-#endif

+  update_frame_context(pbi, &header_bc);

-  read_coef_probs(pbi, &header_bc);

+  // Initialize xd pointers. Any reference should do for xd->pre, so use 0.

+  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->active_ref_idx[0]],

+             sizeof(YV12_BUFFER_CONFIG));

+  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx],

+             sizeof(YV12_BUFFER_CONFIG));

-  vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));

-  vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));

   // Create the segmentation map structure and set to 0

   if (!pc->last_frame_seg_map)

     CHECK_MEM_ERROR(pc->last_frame_seg_map,

@@ -1815,23 +1861,22 @@

   vp9_decode_mode_mvs_init(pbi, &header_bc);

-  vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);

-  /* Decode a row of superblocks */

-  for (mb_row = 0; mb_row < pc->mb_rows; mb_row += 4) {

-    decode_sb_row(pbi, pc, mb_row, xd, &residual_bc);

-  }

+  decode_tiles(pbi, data, first_partition_length_in_bytes,

+               &header_bc, &residual_bc);

   corrupt_tokens |= xd->corrupted;

-  /* Collect information about decoder corruption. */

-  /* 1. Check first boolean decoder for errors. */

-  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc);

-  /* 2. Check the macroblock information */

-  pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens;

+  // keep track of the last coded dimensions

+  pc->last_width = pc->width;

+  pc->last_height = pc->height;

+  // Collect information about decoder corruption.

+  // 1. Check first boolean decoder for errors.

+  // 2. Check the macroblock information

+  pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc) |

+                                          corrupt_tokens;

   if (!pbi->decoded_key_frame) {

-    if (pc->frame_type == KEY_FRAME &&

-        !pc->yv12_fb[pc->new_fb_idx].corrupted)

+    if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted)

       pbi->decoded_key_frame = 1;

     else

       vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME,

@@ -1838,23 +1883,24 @@

                          "A stream must start with a complete key frame");

-  vp9_adapt_coef_probs(pc);

-  if (pc->frame_type != KEY_FRAME) {

-    vp9_adapt_mode_probs(pc);

-    vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);

-    vp9_update_mode_context(&pbi->common);

+  if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {

+    vp9_adapt_coef_probs(pc);

+#if CONFIG_CODE_NONZEROCOUNT

+    vp9_adapt_nzc_probs(pc);

+#endif

-  /* If this was a kf or Gf note the Q used */

-  if ((pc->frame_type == KEY_FRAME) ||

-      pc->refresh_golden_frame || pc->refresh_alt_ref_frame) {

-    pc->last_kf_gf_q = pc->base_qindex;

+  if (pc->frame_type != KEY_FRAME) {

+    if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) {

+      vp9_adapt_mode_probs(pc);

+      vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);

+      vp9_adapt_mode_context(&pbi->common);

+    }

   if (pc->refresh_entropy_probs) {

-    if (pc->refresh_alt_ref_frame)

-      vpx_memcpy(&pc->lfc_a, &pc->fc, sizeof(pc->fc));

-    else

-      vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));

+    vpx_memcpy(&pc->frame_contexts[pc->frame_context_idx], &pc->fc,

+               sizeof(pc->fc));

 #ifdef PACKET_TESTING

@@ -1866,11 +1912,10 @@

     fclose(f);

 #endif

-  // printf("Frame %d Done\n", frame_count++);

   /* Find the end of the coded buffer */

-  while (residual_bc.count > CHAR_BIT

-         && residual_bc.count < VP9_BD_VALUE_SIZE) {

+  while (residual_bc.count > CHAR_BIT &&

+         residual_bc.count < VP9_BD_VALUE_SIZE) {

     residual_bc.count -= CHAR_BIT;

     residual_bc.user_buffer--;

--- a/vp9/decoder/vp9_decodframe.h

+++ b/vp9/decoder/vp9_decodframe.h

@@ -14,6 +14,6 @@

 struct VP9Decompressor;

-extern void vp9_init_de_quantizer(struct VP9Decompressor *pbi);

+void vp9_init_de_quantizer(struct VP9Decompressor *pbi);

 #endif  // VP9_DECODER_VP9_DECODFRAME_H_

--- a/vp9/decoder/vp9_dequantize.c

+++ b/vp9/decoder/vp9_dequantize.c

@@ -14,14 +14,15 @@

 #include "vpx_mem/vpx_mem.h"

 #include "vp9/decoder/vp9_onyxd_int.h"

 #include "vp9/common/vp9_common.h"

 static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,

                          uint8_t *dest, int stride, int width, int height) {

   int r, c;

   for (r = 0; r < height; r++) {

-    for (c = 0; c < width; c++) {

+    for (c = 0; c < width; c++)

       dest[c] = clip_pixel(diff[c] + pred[c]);

-    }

     dest += stride;

     diff += width;

@@ -29,6 +30,26 @@

+void vp9_add_residual_4x4_c(const int16_t *diff, const uint8_t *pred, int pitch,

+                         uint8_t *dest, int stride) {

+  add_residual(diff, pred, pitch, dest, stride, 4, 4);

+}

+void vp9_add_residual_8x8_c(const int16_t *diff, const uint8_t *pred, int pitch,

+                         uint8_t *dest, int stride) {

+  add_residual(diff, pred, pitch, dest, stride, 8, 8);

+}

+void vp9_add_residual_16x16_c(const int16_t *diff, const uint8_t *pred,

+                              int pitch, uint8_t *dest, int stride) {

+  add_residual(diff, pred, pitch, dest, stride, 16, 16);

+}

+void vp9_add_residual_32x32_c(const int16_t *diff, const uint8_t *pred,

+                              int pitch, uint8_t *dest, int stride) {

+  add_residual(diff, pred, pitch, dest, stride, 32, 32);

+}

 static void add_constant_residual(const int16_t diff, const uint8_t *pred,

                                   int pitch, uint8_t *dest, int stride,

                                   int width, int height) {

@@ -35,9 +56,8 @@

   int r, c;

   for (r = 0; r < height; r++) {

-    for (c = 0; c < width; c++) {

+    for (c = 0; c < width; c++)

       dest[c] = clip_pixel(diff + pred[c]);

-    }

     dest += stride;

     pred += pitch;

@@ -44,117 +64,114 @@

-void vp9_dequantize_b_c(BLOCKD *d) {

+void vp9_add_constant_residual_8x8_c(const int16_t diff, const uint8_t *pred,

+                                     int pitch, uint8_t *dest, int stride) {

+  add_constant_residual(diff, pred, pitch, dest, stride, 8, 8);

+}

-  int i;

-  int16_t *DQ  = d->dqcoeff;

-  const int16_t *Q   = d->qcoeff;

-  const int16_t *DQC = d->dequant;

+void vp9_add_constant_residual_16x16_c(const int16_t diff, const uint8_t *pred,

+                                       int pitch, uint8_t *dest, int stride) {

+  add_constant_residual(diff, pred, pitch, dest, stride, 16, 16);

+}

-  for (i = 0; i < 16; i++) {

-    DQ[i] = Q[i] * DQC[i];

-  }

+void vp9_add_constant_residual_32x32_c(const int16_t diff, const uint8_t *pred,

+                                       int pitch, uint8_t *dest, int stride) {

+  add_constant_residual(diff, pred, pitch, dest, stride, 32, 32);

 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,

                                const int16_t *dq,

                                uint8_t *pred, uint8_t *dest,

-                               int pitch, int stride, uint16_t eobs) {

-  int16_t output[16];

-  int16_t *diff_ptr = output;

+                               int pitch, int stride, int eob) {

   int i;

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

-  for (i = 0; i < 16; i++) {

-    input[i] = dq[i] * input[i];

-  }

+  for (i = 0; i < 16; i++)

+    input[i] *= dq[i];

-  vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs);

+  vp9_short_iht4x4(input, output, 4, tx_type);

   vpx_memset(input, 0, 32);

-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);

 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,

                                    const int16_t *dq,

                                    uint8_t *pred, uint8_t *dest,

-                                   int pitch, int stride, uint16_t eobs) {

-  int16_t output[64];

-  int16_t *diff_ptr = output;

-  int i;

-  if (eobs == 0) {

-    /* All 0 DCT coefficient */

+                                   int pitch, int stride, int eob) {

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);

+  if (eob == 0) {

+    // All 0 DCT coefficients

     vp9_copy_mem8x8(pred, pitch, dest, stride);

-  } else if (eobs > 0) {

-    input[0] = dq[0] * input[0];

-    for (i = 1; i < 64; i++) {

-      input[i] = dq[1] * input[i];

-    }

+  } else if (eob > 0) {

+    int i;

-    vp9_ihtllm(input, output, 16, tx_type, 8, eobs);

+    input[0] *= dq[0];

+    for (i = 1; i < 64; i++)

+      input[i] *= dq[1];

+    vp9_short_iht8x8(input, output, 8, tx_type);

     vpx_memset(input, 0, 128);

-    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);

+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);

 void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,

-                            uint8_t *dest, int pitch, int stride) {

-  int16_t output[16];

-  int16_t *diff_ptr = output;

+                            uint8_t *dest, int pitch, int stride, int eob) {

   int i;

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

-  for (i = 0; i < 16; i++) {

-    input[i] = dq[i] * input[i];

-  }

+  if (eob > 1) {

+    for (i = 0; i < 16; i++)

+      input[i] *= dq[i];

-  /* the idct halves ( >> 1) the pitch */

-  vp9_short_idct4x4llm_c(input, output, 4 << 1);

+    // the idct halves ( >> 1) the pitch

+    vp9_short_idct4x4(input, output, 4 << 1);

-  vpx_memset(input, 0, 32);

+    vpx_memset(input, 0, 32);

-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

+    vp9_add_residual_4x4(output, pred, pitch, dest, stride);

+  } else {

+    vp9_dc_only_idct_add(input[0]*dq[0], pred, dest, pitch, stride);

+    ((int *)input)[0] = 0;

+  }

 void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,

-                               uint8_t *dest, int pitch, int stride, int Dc) {

+                               uint8_t *dest, int pitch, int stride, int dc) {

   int i;

-  int16_t output[16];

-  int16_t *diff_ptr = output;

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

-  input[0] = (int16_t)Dc;

+  input[0] = dc;

-  for (i = 1; i < 16; i++) {

-    input[i] = dq[i] * input[i];

-  }

+  for (i = 1; i < 16; i++)

+    input[i] *= dq[i];

-  /* the idct halves ( >> 1) the pitch */

-  vp9_short_idct4x4llm_c(input, output, 4 << 1);

+  // the idct halves ( >> 1) the pitch

+  vp9_short_idct4x4(input, output, 4 << 1);

   vpx_memset(input, 0, 32);

-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);

-#if CONFIG_LOSSLESS

 void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,

                                      uint8_t *pred, uint8_t *dest,

-                                     int pitch, int stride) {

-  int16_t output[16];

-  int16_t *diff_ptr = output;

+                                     int pitch, int stride, int eob) {

   int i;

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

-  for (i = 0; i < 16; i++) {

-    input[i] = dq[i] * input[i];

-  }

+  if (eob > 1) {

+    for (i = 0; i < 16; i++)

+      input[i] *= dq[i];

-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);

+    vp9_short_iwalsh4x4_c(input, output, 4 << 1);

-  vpx_memset(input, 0, 32);

+    vpx_memset(input, 0, 32);

-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

+    vp9_add_residual_4x4(output, pred, pitch, dest, stride);

+  } else {

+    vp9_dc_only_inv_walsh_add(input[0]*dq[0], pred, dest, pitch, stride);

+    ((int *)input)[0] = 0;

+  }

 void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,

@@ -162,79 +179,58 @@

                                         uint8_t *dest,

                                         int pitch, int stride, int dc) {

   int i;

-  int16_t output[16];

-  int16_t *diff_ptr = output;

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);

-  input[0] = (int16_t)dc;

+  input[0] = dc;

-  for (i = 1; i < 16; i++) {

-    input[i] = dq[i] * input[i];

-  }

+  for (i = 1; i < 16; i++)

+    input[i] *= dq[i];

-  vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);

+  vp9_short_iwalsh4x4_c(input, output, 4 << 1);

   vpx_memset(input, 0, 32);

-  add_residual(diff_ptr, pred, pitch, dest, stride, 4, 4);

+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);

-#endif

-void vp9_dequantize_b_2x2_c(BLOCKD *d) {

-  int i;

-  int16_t *DQ  = d->dqcoeff;

-  const int16_t *Q   = d->qcoeff;

-  const int16_t *DQC = d->dequant;

-  for (i = 0; i < 16; i++) {

-    DQ[i] = (int16_t)((Q[i] * DQC[i]));

-  }

-}

 void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,

                                 uint8_t *pred, uint8_t *dest, int pitch,

-                                int stride, int dc, int eob) {

-  int16_t output[64];

-  int16_t *diff_ptr = output;

-  int i;

+                                int stride, int eob) {

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);

-  /* If dc is 1, then input[0] is the reconstructed value, do not need

-   * dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.

-   */

-  if (!dc)

-    input[0] *= dq[0];

+  // If dc is 1, then input[0] is the reconstructed value, do not need

+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.

+  input[0] *= dq[0];

-  /* The calculation can be simplified if there are not many non-zero dct

-   * coefficients. Use eobs to decide what to do.

-   * TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.

-   * Combine that with code here.

-   */

+  // The calculation can be simplified if there are not many non-zero dct

+  // coefficients. Use eobs to decide what to do.

+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.

+  // Combine that with code here.

   if (eob == 0) {

-    /* All 0 DCT coefficient */

+    // All 0 DCT coefficients

     vp9_copy_mem8x8(pred, pitch, dest, stride);

   } else if (eob == 1) {

-    /* DC only DCT coefficient. */

+    // DC only DCT coefficient

+    int16_t in = input[0];

     int16_t out;

-    /* Note: the idct1 will need to be modified accordingly whenever

-     * vp9_short_idct8x8_c() is modified. */

-    out = (input[0] + 1 + (input[0] < 0)) >> 2;

-    out = out << 3;

-    out = (out + 32) >> 7;

+     // Note: the idct1 will need to be modified accordingly whenever

+     // vp9_short_idct8x8_c() is modified.

+    vp9_short_idct1_8x8_c(&in, &out);

     input[0] = 0;

-    add_constant_residual(out, pred, pitch, dest, stride, 8, 8);

+    vp9_add_constant_residual_8x8(out, pred, pitch, dest, stride);

+#if !CONFIG_SCATTERSCAN

   } else if (eob <= 10) {

-    input[1] = input[1] * dq[1];

-    input[2] = input[2] * dq[1];

-    input[3] = input[3] * dq[1];

-    input[8] = input[8] * dq[1];

-    input[9] = input[9] * dq[1];

-    input[10] = input[10] * dq[1];

-    input[16] = input[16] * dq[1];

-    input[17] = input[17] * dq[1];

-    input[24] = input[24] * dq[1];

+    input[1] *= dq[1];

+    input[2] *= dq[1];

+    input[3] *= dq[1];

+    input[8] *= dq[1];

+    input[9] *= dq[1];

+    input[10] *= dq[1];

+    input[16] *= dq[1];

+    input[17] *= dq[1];

+    input[24] *= dq[1];

-    vp9_short_idct10_8x8_c(input, output, 16);

+    vp9_short_idct10_8x8(input, output, 16);

     input[0] = input[1] = input[2] = input[3] = 0;

     input[8] = input[9] = input[10] = 0;

@@ -241,19 +237,19 @@

     input[16] = input[17] = 0;

     input[24] = 0;

-    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);

+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);

+#endif

   } else {

+    int i;

     // recover quantizer for 4 4x4 blocks

-    for (i = 1; i < 64; i++) {

-      input[i] = input[i] * dq[1];

-    }

-    // the idct halves ( >> 1) the pitch

-    vp9_short_idct8x8_c(input, output, 16);

+    for (i = 1; i < 64; i++)

+      input[i] *= dq[1];

+    // the idct halves ( >> 1) the pitch

+    vp9_short_idct8x8(input, output, 8 << 1);

     vpx_memset(input, 0, 128);

-    add_residual(diff_ptr, pred, pitch, dest, stride, 8, 8);

+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);

@@ -260,29 +256,30 @@

 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,

                                      const int16_t *dq, uint8_t *pred,

                                      uint8_t *dest, int pitch, int stride,

-                                     uint16_t eobs) {

-  int16_t output[256];

-  int16_t *diff_ptr = output;

-  int i;

-  if (eobs == 0) {

-    /* All 0 DCT coefficient */

+                                     int eob) {

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);

+  if (eob == 0) {

+    // All 0 DCT coefficients

     vp9_copy_mem16x16(pred, pitch, dest, stride);

-  } else if (eobs > 0) {

-    input[0]= input[0] * dq[0];

+  } else if (eob > 0) {

+    int i;

+    input[0] *= dq[0];

     // recover quantizer for 4 4x4 blocks

     for (i = 1; i < 256; i++)

-      input[i] = input[i] * dq[1];

+      input[i] *= dq[1];

     // inverse hybrid transform

-    vp9_ihtllm(input, output, 32, tx_type, 16, eobs);

+    vp9_short_iht16x16(input, output, 16, tx_type);

     // the idct halves ( >> 1) the pitch

-    // vp9_short_idct16x16_c(input, output, 32);

+    // vp9_short_idct16x16(input, output, 32);

     vpx_memset(input, 0, 512);

-    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);

+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);

@@ -289,9 +286,7 @@

 void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,

                                   uint8_t *pred, uint8_t *dest, int pitch,

                                   int stride, int eob) {

-  int16_t output[256];

-  int16_t *diff_ptr = output;

-  int i;

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);

   /* The calculation can be simplified if there are not many non-zero dct

    * coefficients. Use eobs to separate different cases. */

@@ -300,31 +295,30 @@

     vp9_copy_mem16x16(pred, pitch, dest, stride);

   } else if (eob == 1) {

     /* DC only DCT coefficient. */

+    int16_t in = input[0] * dq[0];

     int16_t out;

     /* Note: the idct1 will need to be modified accordingly whenever

-     * vp9_short_idct16x16_c() is modified. */

-    out = (input[0] * dq[0] + 2) >> 2;

-    out = (out + 2) >> 2;

-    out = (out + 4) >> 3;

+     * vp9_short_idct16x16() is modified. */

+    vp9_short_idct1_16x16_c(&in, &out);

     input[0] = 0;

-    add_constant_residual(out, pred, pitch, dest, stride, 16, 16);

+    vp9_add_constant_residual_16x16(out, pred, pitch, dest, stride);

+#if !CONFIG_SCATTERSCAN

   } else if (eob <= 10) {

-    input[0]= input[0] * dq[0];

-    input[1] = input[1] * dq[1];

-    input[2] = input[2] * dq[1];

-    input[3] = input[3] * dq[1];

-    input[16] = input[16] * dq[1];

-    input[17] = input[17] * dq[1];

-    input[18] = input[18] * dq[1];

-    input[32] = input[32] * dq[1];

-    input[33] = input[33] * dq[1];

-    input[48] = input[48] * dq[1];

+    input[0] *= dq[0];

+    input[1] *= dq[1];

+    input[2] *= dq[1];

+    input[3] *= dq[1];

+    input[16] *= dq[1];

+    input[17] *= dq[1];

+    input[18] *= dq[1];

+    input[32] *= dq[1];

+    input[33] *= dq[1];

+    input[48] *= dq[1];

     // the idct halves ( >> 1) the pitch

-    vp9_short_idct10_16x16_c(input, output, 32);

+    vp9_short_idct10_16x16(input, output, 32);

     input[0] = input[1] = input[2] = input[3] = 0;

     input[16] = input[17] = input[18] = 0;

@@ -331,20 +325,23 @@

     input[32] = input[33] = 0;

     input[48] = 0;

-    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);

+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);

+#endif

   } else {

-    input[0]= input[0] * dq[0];

+    int i;

+    input[0] *= dq[0];

     // recover quantizer for 4 4x4 blocks

     for (i = 1; i < 256; i++)

-      input[i] = input[i] * dq[1];

+      input[i] *= dq[1];

     // the idct halves ( >> 1) the pitch

-    vp9_short_idct16x16_c(input, output, 32);

+    vp9_short_idct16x16(input, output, 16 << 1);

     vpx_memset(input, 0, 512);

-    add_residual(diff_ptr, pred, pitch, dest, stride, 16, 16);

+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);

@@ -351,16 +348,45 @@

 void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,

                                   uint8_t *pred, uint8_t *dest, int pitch,

                                   int stride, int eob) {

-  int16_t output[1024];

-  int i;

+  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024);

-  input[0]= input[0] * dq[0] / 2;

-  for (i = 1; i < 1024; i++)

-    input[i] = input[i] * dq[1] / 2;

-  vp9_short_idct32x32_c(input, output, 64);

-  vpx_memset(input, 0, 2048);

+  if (eob) {

+    input[0] = input[0] * dq[0] / 2;

+    if (eob == 1) {

+      vp9_short_idct1_32x32(input, output);

+      vp9_add_constant_residual_32x32(output[0], pred, pitch, dest, stride);

+      input[0] = 0;

+#if !CONFIG_SCATTERSCAN

+    } else if (eob <= 10) {

+      input[1] = input[1] * dq[1] / 2;

+      input[2] = input[2] * dq[1] / 2;

+      input[3] = input[3] * dq[1] / 2;

+      input[32] = input[32] * dq[1] / 2;

+      input[33] = input[33] * dq[1] / 2;

+      input[34] = input[34] * dq[1] / 2;

+      input[64] = input[64] * dq[1] / 2;

+      input[65] = input[65] * dq[1] / 2;

+      input[96] = input[96] * dq[1] / 2;

-  add_residual(output, pred, pitch, dest, stride, 32, 32);

+      // the idct halves ( >> 1) the pitch

+      vp9_short_idct10_32x32(input, output, 64);

+      input[0] = input[1] = input[2] = input[3] = 0;

+      input[32] = input[33] = input[34] = 0;

+      input[64] = input[65] = 0;

+      input[96] = 0;

+      vp9_add_residual_32x32(output, pred, pitch, dest, stride);

+#endif

+    } else {

+      int i;

+      for (i = 1; i < 1024; i++)

+        input[i] = input[i] * dq[1] / 2;

+      vp9_short_idct32x32(input, output, 64);

+      vpx_memset(input, 0, 2048);

+      vp9_add_residual_32x32(output, pred, pitch, dest, stride);

+    }

+  }

 void vp9_dequant_idct_add_uv_block_16x16_c(int16_t *q, const int16_t *dq,

@@ -367,8 +393,9 @@

                                            uint8_t *dstu,

                                            uint8_t *dstv,

                                            int stride,

-                                           uint16_t *eobs) {

-  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride, eobs[0]);

-  vp9_dequant_idct_add_16x16_c(q + 256, dq,

-                               dstv, dstv, stride, stride, eobs[4]);

+                                           MACROBLOCKD *xd) {

+  vp9_dequant_idct_add_16x16_c(q, dq, dstu, dstu, stride, stride,

+                               xd->eobs[64]);

+  vp9_dequant_idct_add_16x16_c(q + 256, dq, dstv, dstv, stride, stride,

+                               xd->eobs[80]);

--- a/vp9/decoder/vp9_dequantize.h

+++ b/vp9/decoder/vp9_dequantize.h

@@ -11,84 +11,80 @@

 #ifndef VP9_DECODER_VP9_DEQUANTIZE_H_

 #define VP9_DECODER_VP9_DEQUANTIZE_H_

 #include "vp9/common/vp9_blockd.h"

-#if CONFIG_LOSSLESS

-extern void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,

-                                            unsigned char *pred,

-                                            unsigned char *output,

-                                            int pitch, int stride);

-extern void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,

-                                               unsigned char *pred,

-                                               unsigned char *output,

-                                               int pitch, int stride, int dc);

-extern void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,

-                                                       const int16_t *dq,

-                                                       unsigned char *pre,

-                                                       unsigned char *dst,

-                                                       int stride,

-                                                       uint16_t *eobs,

-                                                       const int16_t *dc);

-extern void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,

-                                                    unsigned char *pre,

-                                                    unsigned char *dst,

-                                                    int stride,

-                                                    uint16_t *eobs);

-extern void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,

-                                                     unsigned char *pre,

-                                                     unsigned char *dst_u,

-                                                     unsigned char *dst_v,

-                                                     int stride,

-                                                     uint16_t *eobs);

-#endif

-typedef void (*vp9_dequant_idct_add_fn_t)(int16_t *input, const int16_t *dq,

-    unsigned char *pred, unsigned char *output, int pitch, int stride);

-typedef void(*vp9_dequant_dc_idct_add_fn_t)(int16_t *input, const int16_t *dq,

-    unsigned char *pred, unsigned char *output, int pitch, int stride, int dc);

+void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,

+                                     unsigned char *pred,

+                                     unsigned char *output,

+                                     int pitch, int stride, int eob);

-typedef void(*vp9_dequant_dc_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,

-    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs,

-    const int16_t *dc);

-typedef void(*vp9_dequant_idct_add_y_block_fn_t)(int16_t *q, const int16_t *dq,

-    unsigned char *pre, unsigned char *dst, int stride, uint16_t *eobs);

-typedef void(*vp9_dequant_idct_add_uv_block_fn_t)(int16_t *q, const int16_t *dq,

-    unsigned char *pre, unsigned char *dst_u, unsigned char *dst_v, int stride,

-    uint16_t *eobs);

+void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,

+                                        unsigned char *pred,

+                                        unsigned char *output,

+                                        int pitch, int stride, int dc);

+void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q,

+                                                const int16_t *dq,

+                                                unsigned char *pre,

+                                                unsigned char *dst,

+                                                int stride,

+                                                const int16_t *dc);

+void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,

+                                             unsigned char *pre,

+                                             unsigned char *dst,

+                                             int stride,

+                                             struct macroblockd *xd);

+void vp9_dequant_idct_add_uv_block_lossless_c(int16_t *q, const int16_t *dq,

+                                              unsigned char *pre,

+                                              unsigned char *dst_u,

+                                              unsigned char *dst_v,

+                                              int stride,

+                                              struct macroblockd *xd);

 void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq,

                                     unsigned char *pred, unsigned char *dest,

-                                    int pitch, int stride, uint16_t eobs);

+                                    int pitch, int stride, int eob);

 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,

                                    const int16_t *dq, unsigned char *pred,

                                    unsigned char *dest, int pitch, int stride,

-                                   uint16_t eobs);

+                                   int eob);

 void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,

                                      const int16_t *dq, unsigned char *pred,

                                      unsigned char *dest,

-                                     int pitch, int stride, uint16_t eobs);

+                                     int pitch, int stride, int eob);

 void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,

                                                    unsigned char *dst,

                                                    int stride,

-                                                   uint16_t *eobs,

                                                    const int16_t *dc,

                                                    MACROBLOCKD *xd);

+void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q, const int16_t *dq,

+                                                unsigned char *dst,

+                                                int stride,

+                                                MACROBLOCKD *xd);

 void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,

                                                    unsigned char *dst,

                                                    int stride,

-                                                   uint16_t *eobs,

                                                    const int16_t *dc,

                                                    MACROBLOCKD *xd);

+void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q, const int16_t *dq,

+                                                unsigned char *dst,

+                                                int stride,

+                                                MACROBLOCKD *xd);

 void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,

                                                  unsigned char *dstu,

                                                  unsigned char *dstv,

                                                  int stride,

-                                                 uint16_t *eobs,

                                                  MACROBLOCKD *xd);

 void vp9_dequant_idct_add_uv_block_4x4_inplace_c(int16_t *q, const int16_t *dq,

@@ -95,7 +91,6 @@

                                                  unsigned char *dstu,

                                                  unsigned char *dstv,

                                                  int stride,

-                                                 uint16_t *eobs,

                                                  MACROBLOCKD *xd);

-#endif

+#endif  // VP9_DECODER_VP9_DEQUANTIZE_H_

--- a/vp9/decoder/vp9_detokenize.c

+++ b/vp9/decoder/vp9_detokenize.c

@@ -59,115 +59,215 @@

 DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);

-static int get_signed(BOOL_DECODER *br, int value_to_sign) {

+static int16_t get_signed(BOOL_DECODER *br, int16_t value_to_sign) {

   return decode_bool(br, 128) ? -value_to_sign : value_to_sign;

-#if CONFIG_NEWCOEFCONTEXT

-#define PT pn

-#define INCREMENT_COUNT(token)                       \

-  do {                                               \

-    coef_counts[type][coef_bands[c]][pn][token]++;   \

-    pn = pt = vp9_prev_token_class[token];           \

-    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(coef_bands[c + 1]))  \

-      pn = vp9_get_coef_neighbor_context(            \

-          qcoeff_ptr, nodc, neighbors, scan[c + 1]); \

-  } while (0)

-#else

-#define PT pt

 #define INCREMENT_COUNT(token)               \

   do {                                       \

-    coef_counts[type][coef_bands[c]][pt][token]++; \

-    pt = vp9_prev_token_class[token];              \

+    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \

+               [pt][token]++;     \

+    token_cache[c] = token; \

+    pt = vp9_get_coef_context(scan, nb, pad, token_cache,     \

+                              c + 1, default_eob); \

   } while (0)

-#endif  /* CONFIG_NEWCOEFCONTEXT */

+#if CONFIG_CODE_NONZEROCOUNT

 #define WRITE_COEF_CONTINUE(val, token)                       \

   {                                                           \

-    qcoeff_ptr[scan[c]] = (int16_t) get_signed(br, val);        \

+    qcoeff_ptr[scan[c]] = get_signed(br, val);                \

     INCREMENT_COUNT(token);                                   \

     c++;                                                      \

+    nzc++;                                                    \

     continue;                                                 \

+#else

+#define WRITE_COEF_CONTINUE(val, token)                  \

+  {                                                      \

+    qcoeff_ptr[scan[c]] = get_signed(br, val);           \

+    INCREMENT_COUNT(token);                              \

+    c++;                                                 \

+    continue;                                            \

+  }

+#endif  // CONFIG_CODE_NONZEROCOUNT

 #define ADJUST_COEF(prob, bits_count)  \

   do {                                 \

     if (vp9_read(br, prob))            \

-      val += (uint16_t)(1 << bits_count);\

+      val += 1 << bits_count;          \

   } while (0);

 static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,

-                        BOOL_DECODER* const br,

-                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,

-                        PLANE_TYPE type,

-                        TX_TYPE tx_type,

-                        int seg_eob, int16_t *qcoeff_ptr,

-                        const int *const scan, TX_SIZE txfm_size,

-                        const int *coef_bands) {

+                        BOOL_DECODER* const br, int block_idx,

+                        PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,

+                        TX_SIZE txfm_size) {

+  ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context;

+  ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context;

+  int aidx, lidx;

+  ENTROPY_CONTEXT above_ec, left_ec;

   FRAME_CONTEXT *const fc = &dx->common.fc;

-#if CONFIG_NEWCOEFCONTEXT

-  const int *neighbors;

-  int pn;

-#endif

-  int nodc = (type == PLANE_TYPE_Y_NO_DC);

-  int pt, c = nodc;

+  int pt, c = 0, pad, default_eob;

   vp9_coeff_probs *coef_probs;

   vp9_prob *prob;

   vp9_coeff_count *coef_counts;

+  const int ref = xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME;

+#if CONFIG_CODE_NONZEROCOUNT

+  uint16_t nzc = 0;

+  uint16_t nzc_expected = xd->mode_info_context->mbmi.nzcs[block_idx];

+#endif

+  const int *scan, *nb;

+  uint8_t token_cache[1024];

+  if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

+    aidx = vp9_block2above_sb64[txfm_size][block_idx];

+    lidx = vp9_block2left_sb64[txfm_size][block_idx];

+  } else if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB32X32) {

+    aidx = vp9_block2above_sb[txfm_size][block_idx];

+    lidx = vp9_block2left_sb[txfm_size][block_idx];

+  } else {

+    aidx = vp9_block2above[txfm_size][block_idx];

+    lidx = vp9_block2left[txfm_size][block_idx];

+  }

   switch (txfm_size) {

     default:

-    case TX_4X4:

-      if (tx_type == DCT_DCT) {

-        coef_probs  = fc->coef_probs_4x4;

-        coef_counts = fc->coef_counts_4x4;

-      } else {

-        coef_probs  = fc->hybrid_coef_probs_4x4;

-        coef_counts = fc->hybrid_coef_counts_4x4;

+    case TX_4X4: {

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_4x4(xd, block_idx) : DCT_DCT;

+      switch (tx_type) {

+        default:

+          scan = vp9_default_zig_zag1d_4x4;

+          break;

+        case ADST_DCT:

+          scan = vp9_row_scan_4x4;

+          break;

+        case DCT_ADST:

+          scan = vp9_col_scan_4x4;

+          break;

+      above_ec = A0[aidx] != 0;

+      left_ec = L0[lidx] != 0;

+      coef_probs  = fc->coef_probs_4x4;

+      coef_counts = fc->coef_counts_4x4;

+      default_eob = 16;

       break;

-    case TX_8X8:

-      if (tx_type == DCT_DCT) {

-        coef_probs  = fc->coef_probs_8x8;

-        coef_counts = fc->coef_counts_8x8;

-      } else {

-        coef_probs  = fc->hybrid_coef_probs_8x8;

-        coef_counts = fc->hybrid_coef_counts_8x8;

+    }

+    case TX_8X8: {

+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+      const int sz = 3 + sb_type, x = block_idx & ((1 << sz) - 1);

+      const int y = block_idx - x;

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;

+      switch (tx_type) {

+        default:

+          scan = vp9_default_zig_zag1d_8x8;

+          break;

+        case ADST_DCT:

+          scan = vp9_row_scan_8x8;

+          break;

+        case DCT_ADST:

+          scan = vp9_col_scan_8x8;

+          break;

+      coef_probs  = fc->coef_probs_8x8;

+      coef_counts = fc->coef_counts_8x8;

+      above_ec = (A0[aidx] + A0[aidx + 1]) != 0;

+      left_ec  = (L0[lidx] + L0[lidx + 1]) != 0;

+      default_eob = 64;

       break;

-    case TX_16X16:

-      if (tx_type == DCT_DCT) {

-        coef_probs  = fc->coef_probs_16x16;

-        coef_counts = fc->coef_counts_16x16;

+    }

+    case TX_16X16: {

+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+      const int sz = 4 + sb_type, x = block_idx & ((1 << sz) - 1);

+      const int y = block_idx - x;

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;

+      switch (tx_type) {

+        default:

+          scan = vp9_default_zig_zag1d_16x16;

+          break;

+        case ADST_DCT:

+          scan = vp9_row_scan_16x16;

+          break;

+        case DCT_ADST:

+          scan = vp9_col_scan_16x16;

+          break;

+      }

+      coef_probs  = fc->coef_probs_16x16;

+      coef_counts = fc->coef_counts_16x16;

+      if (type == PLANE_TYPE_UV) {

+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);

+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);

+        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1]) != 0;

+        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1]) != 0;

       } else {

-        coef_probs  = fc->hybrid_coef_probs_16x16;

-        coef_counts = fc->hybrid_coef_counts_16x16;

+        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3]) != 0;

+        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3]) != 0;

+      default_eob = 256;

       break;

+    }

     case TX_32X32:

+      scan = vp9_default_zig_zag1d_32x32;

       coef_probs = fc->coef_probs_32x32;

       coef_counts = fc->coef_counts_32x32;

+      if (type == PLANE_TYPE_UV) {

+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);

+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);

+        ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);

+        ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);

+        ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);

+        ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);

+        above_ec = (A0[aidx] + A0[aidx + 1] + A1[aidx] + A1[aidx + 1] +

+                    A2[aidx] + A2[aidx + 1] + A3[aidx] + A3[aidx + 1]) != 0;

+        left_ec  = (L0[lidx] + L0[lidx + 1] + L1[lidx] + L1[lidx + 1] +

+                    L2[lidx] + L2[lidx + 1] + L3[lidx] + L3[lidx + 1]) != 0;

+      } else {

+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);

+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);

+        above_ec = (A0[aidx] + A0[aidx + 1] + A0[aidx + 2] + A0[aidx + 3] +

+                    A1[aidx] + A1[aidx + 1] + A1[aidx + 2] + A1[aidx + 3]) != 0;

+        left_ec  = (L0[lidx] + L0[lidx + 1] + L0[lidx + 2] + L0[lidx + 3] +

+                    L1[lidx] + L1[lidx + 1] + L1[lidx + 2] + L1[lidx + 3]) != 0;

+      }

+      default_eob = 1024;

       break;

-  VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);

-#if CONFIG_NEWCOEFCONTEXT

-  pn = pt;

-  neighbors = vp9_get_coef_neighbors_handle(scan);

-#endif

+  VP9_COMBINEENTROPYCONTEXTS(pt, above_ec, left_ec);

+  nb = vp9_get_coef_neighbors_handle(scan, &pad);

   while (1) {

     int val;

     const uint8_t *cat6 = cat6_prob;

-    if (c >= seg_eob) break;

-    prob = coef_probs[type][coef_bands[c]][PT];

+    if (c >= seg_eob)

+      break;

+#if CONFIG_CODE_NONZEROCOUNT

+    if (nzc == nzc_expected)

+      break;

+#endif

+    prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];

+#if CONFIG_CODE_NONZEROCOUNT == 0

+    fc->eob_branch_counts[txfm_size][type][ref]

+                         [get_coef_band(scan, txfm_size, c)][pt]++;

     if (!vp9_read(br, prob[EOB_CONTEXT_NODE]))

       break;

+#endif

 SKIP_START:

-    if (c >= seg_eob) break;

+    if (c >= seg_eob)

+      break;

+#if CONFIG_CODE_NONZEROCOUNT

+    if (nzc == nzc_expected)

+      break;

+    // decode zero node only if there are zeros left

+    if (seg_eob - nzc_expected - c + nzc > 0)

+#endif

     if (!vp9_read(br, prob[ZERO_CONTEXT_NODE])) {

       INCREMENT_COUNT(ZERO_TOKEN);

       ++c;

-      prob = coef_probs[type][coef_bands[c]][PT];

+      prob = coef_probs[type][ref][get_coef_band(scan, txfm_size, c)][pt];

       goto SKIP_START;

     // ONE_CONTEXT_NODE_0_

@@ -230,141 +330,147 @@

     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);

+#if CONFIG_CODE_NONZEROCOUNT == 0

   if (c < seg_eob)

-    coef_counts[type][coef_bands[c]][PT][DCT_EOB_TOKEN]++;

+    coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]

+               [pt][DCT_EOB_TOKEN]++;

+#endif

-  a[0] = l[0] = (c > !type);

+  A0[aidx] = L0[lidx] = c > 0;

+  if (txfm_size >= TX_8X8) {

+    A0[aidx + 1] = L0[lidx + 1] = A0[aidx];

+    if (txfm_size >= TX_16X16) {

+      if (type == PLANE_TYPE_UV) {

+        ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);

+        ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);

+        A1[aidx] = A1[aidx + 1] = L1[lidx] = L1[lidx + 1] = A0[aidx];

+        if (txfm_size >= TX_32X32) {

+          ENTROPY_CONTEXT *A2 = (ENTROPY_CONTEXT *) (xd->above_context + 2);

+          ENTROPY_CONTEXT *L2 = (ENTROPY_CONTEXT *) (xd->left_context + 2);

+          ENTROPY_CONTEXT *A3 = (ENTROPY_CONTEXT *) (xd->above_context + 3);

+          ENTROPY_CONTEXT *L3 = (ENTROPY_CONTEXT *) (xd->left_context + 3);

+          A2[aidx] = A2[aidx + 1] = A3[aidx] = A3[aidx + 1] = A0[aidx];

+          L2[lidx] = L2[lidx + 1] = L3[lidx] = L3[lidx + 1] = A0[aidx];

+        }

+      } else {

+        A0[aidx + 2] = A0[aidx + 3] = L0[lidx + 2] = L0[lidx + 3] = A0[aidx];

+        if (txfm_size >= TX_32X32) {

+          ENTROPY_CONTEXT *A1 = (ENTROPY_CONTEXT *) (xd->above_context + 1);

+          ENTROPY_CONTEXT *L1 = (ENTROPY_CONTEXT *) (xd->left_context + 1);

+          A1[aidx] = A1[aidx + 1] = A1[aidx + 2] = A1[aidx + 3] = A0[aidx];

+          L1[lidx] = L1[lidx + 1] = L1[lidx + 2] = L1[lidx + 3] = A0[aidx];

+        }

+      }

+    }

+  }

   return c;

 static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {

-  int active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);

-  int eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-  if (!active || eob > eob_max)

-    eob = eob_max;

-  return eob;

+  return vp9_get_segdata(xd, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;

-int vp9_decode_sb_tokens(VP9D_COMP* const pbi,

-                         MACROBLOCKD* const xd,

-                         BOOL_DECODER* const bc) {

-  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;

-  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;

-  ENTROPY_CONTEXT* const A1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]);

-  ENTROPY_CONTEXT* const L1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]);

-  uint16_t *const eobs = xd->eobs;

+static INLINE int decode_sb(VP9D_COMP* const pbi,

+                            MACROBLOCKD* const xd,

+                            BOOL_DECODER* const bc,

+                            int offset, int count, int inc,

+                            int eob_max, TX_SIZE tx_size) {

   const int segment_id = xd->mode_info_context->mbmi.segment_id;

-  int c, i, eobtotal = 0, seg_eob;

+  const int seg_eob = get_eob(xd, segment_id, eob_max);

+  int i, eobtotal = 0;

-  // Luma block

-#if CONFIG_CNVCONTEXT

-  ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3] +

-                              A1[0] + A1[1] + A1[2] + A1[3]) != 0;

-  ENTROPY_CONTEXT left_ec =  (L[0] + L[1] + L[2] + L[3] +

-                              L1[0] + L1[1] + L1[2] + L1[3]) != 0;

-#else

-  ENTROPY_CONTEXT above_ec = A[0];

-  ENTROPY_CONTEXT left_ec =  L[0];

-#endif

-  eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,

-                             PLANE_TYPE_Y_WITH_DC,

-                             DCT_DCT, get_eob(xd, segment_id, 1024),

-                             xd->sb_coeff_data.qcoeff,

-                             vp9_default_zig_zag1d_32x32,

-                             TX_32X32, vp9_coef_bands_32x32);

-  A[1] = A[2] = A[3] = A[0] = above_ec;

-  L[1] = L[2] = L[3] = L[0] = left_ec;

-  A1[1] = A1[2] = A1[3] = A1[0] = above_ec;

-  L1[1] = L1[2] = L1[3] = L1[0] = left_ec;

+  // luma blocks

+  for (i = 0; i < offset; i += inc) {

+    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, seg_eob,

+                               xd->qcoeff + i * 16, tx_size);

+    xd->eobs[i] = c;

+    eobtotal += c;

+  }

-  eobtotal += c;

+  // chroma blocks

+  for (i = offset; i < count; i += inc) {

+    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,

+                               xd->qcoeff + i * 16, tx_size);

+    xd->eobs[i] = c;

+    eobtotal += c;

+  }

-  // 16x16 chroma blocks

-  seg_eob = get_eob(xd, segment_id, 256);

+  return eobtotal;

+}

-  for (i = 16; i < 24; i += 4) {

-    ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_16X16][i];

-    ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_16X16][i];

-    ENTROPY_CONTEXT* const a1 = A1 + vp9_block2above[TX_16X16][i];

-    ENTROPY_CONTEXT* const l1 = L1 + vp9_block2left[TX_16X16][i];

-#if CONFIG_CNVCONTEXT

-    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;

-    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;

-#else

-    above_ec = a[0];

-    left_ec = l[0];

-#endif

+int vp9_decode_sb_tokens(VP9D_COMP* const pbi,

+                         MACROBLOCKD* const xd,

+                         BOOL_DECODER* const bc) {

+  switch (xd->mode_info_context->mbmi.txfm_size) {

+    case TX_32X32: {

+      // 32x32 luma block

+      const int segment_id = xd->mode_info_context->mbmi.segment_id;

+      int i, eobtotal = 0, seg_eob;

+      int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,

+                       get_eob(xd, segment_id, 1024), xd->qcoeff, TX_32X32);

+      xd->eobs[0] = c;

+      eobtotal += c;

-    eobs[i] = c = decode_coefs(pbi, xd, bc,

-                               &above_ec, &left_ec,

-                               PLANE_TYPE_UV,

-                               DCT_DCT, seg_eob,

-                               xd->sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,

-                               vp9_default_zig_zag1d_16x16,

-                               TX_16X16, vp9_coef_bands_16x16);

+      // 16x16 chroma blocks

+      seg_eob = get_eob(xd, segment_id, 256);

+      for (i = 64; i < 96; i += 16) {

+        c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob,

+                         xd->qcoeff + i * 16, TX_16X16);

+        xd->eobs[i] = c;

+        eobtotal += c;

+      }

+      return eobtotal;

+    }

+    case TX_16X16:

+      return decode_sb(pbi, xd, bc, 64, 96, 16, 16 * 16, TX_16X16);

+    case TX_8X8:

+      return decode_sb(pbi, xd, bc, 64, 96, 4, 8 * 8, TX_8X8);

+    case TX_4X4:

+      return decode_sb(pbi, xd, bc, 64, 96, 1, 4 * 4, TX_4X4);

+    default:

+      assert(0);

+      return 0;

+  }

+}

-    a1[1] = a1[0] = a[1] = a[0] = above_ec;

-    l1[1] = l1[0] = l[1] = l[0] = left_ec;

-    eobtotal += c;

+int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,

+                           MACROBLOCKD* const xd,

+                           BOOL_DECODER* const bc) {

+  switch (xd->mode_info_context->mbmi.txfm_size) {

+    case TX_32X32:

+      return decode_sb(pbi, xd, bc, 256, 384, 64, 32 * 32, TX_32X32);

+    case TX_16X16:

+      return decode_sb(pbi, xd, bc, 256, 384, 16, 16 * 16, TX_16X16);

+    case TX_8X8:

+      return decode_sb(pbi, xd, bc, 256, 384, 4, 8 * 8, TX_8X8);

+    case TX_4X4:

+      return decode_sb(pbi, xd, bc, 256, 384, 1, 4 * 4, TX_4X4);

+    default:

+      assert(0);

+      return 0;

-  // no Y2 block

-  A[8] = L[8] = A1[8] = L1[8] = 0;

-  return eobtotal;

 static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,

                                       MACROBLOCKD* const xd,

                                       BOOL_DECODER* const bc) {

-  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;

-  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;

-  uint16_t *const eobs = xd->eobs;

   const int segment_id = xd->mode_info_context->mbmi.segment_id;

-  int c, i, eobtotal = 0, seg_eob;

-  // Luma block

+  int i, eobtotal = 0, seg_eob;

-#if CONFIG_CNVCONTEXT

-  ENTROPY_CONTEXT above_ec = (A[0] + A[1] + A[2] + A[3]) != 0;

-  ENTROPY_CONTEXT left_ec = (L[0] + L[1] + L[2] + L[3]) != 0;

-#else

-  ENTROPY_CONTEXT above_ec = A[0];

-  ENTROPY_CONTEXT left_ec = L[0];

-#endif

-  eobs[0] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec,

-                             PLANE_TYPE_Y_WITH_DC,

-                             get_tx_type(xd, &xd->block[0]),

-                             get_eob(xd, segment_id, 256),

-                             xd->qcoeff, vp9_default_zig_zag1d_16x16,

-                             TX_16X16, vp9_coef_bands_16x16);

-  A[1] = A[2] = A[3] = A[0] = above_ec;

-  L[1] = L[2] = L[3] = L[0] = left_ec;

+  // Luma block

+  int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC,

+                       get_eob(xd, segment_id, 256), xd->qcoeff, TX_16X16);

+  xd->eobs[0] = c;

   eobtotal += c;

   // 8x8 chroma blocks

   seg_eob = get_eob(xd, segment_id, 64);

   for (i = 16; i < 24; i += 4) {

-    ENTROPY_CONTEXT* const a = A + vp9_block2above[TX_8X8][i];

-    ENTROPY_CONTEXT* const l = L + vp9_block2left[TX_8X8][i];

-#if CONFIG_CNVCONTEXT

-    above_ec = (a[0] + a[1]) != 0;

-    left_ec = (l[0] + l[1]) != 0;

-#else

-    above_ec = a[0];

-    left_ec = l[0];

-#endif

-    eobs[i] = c = decode_coefs(pbi, xd, bc,

-                               &above_ec, &left_ec,

-                               PLANE_TYPE_UV,

-                               DCT_DCT, seg_eob, xd->block[i].qcoeff,

-                               vp9_default_zig_zag1d_8x8,

-                               TX_8X8, vp9_coef_bands_8x8);

-    a[1] = a[0] = above_ec;

-    l[1] = l[0] = left_ec;

+    c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,

+                     seg_eob, xd->block[i].qcoeff, TX_8X8);

+    xd->eobs[i] = c;

     eobtotal += c;

-  A[8] = 0;

-  L[8] = 0;

   return eobtotal;

@@ -371,53 +477,15 @@

 static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,

                                     MACROBLOCKD* const xd,

                                     BOOL_DECODER* const bc) {

-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;

-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;

-  uint16_t *const eobs = xd->eobs;

-  PLANE_TYPE type;

-  int c, i, eobtotal = 0, seg_eob;

+  int i, eobtotal = 0;

   const int segment_id = xd->mode_info_context->mbmi.segment_id;

-  int has_2nd_order = get_2nd_order_usage(xd);

-  // 2nd order DC block

-  if (has_2nd_order) {

-    ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][24];

-    ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][24];

-    eobs[24] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_Y2,

-                                DCT_DCT, get_eob(xd, segment_id, 4),

-                                xd->block[24].qcoeff,

-                                vp9_default_zig_zag1d_4x4, TX_8X8,

-                                vp9_coef_bands_4x4);

-    eobtotal += c - 4;

-    type = PLANE_TYPE_Y_NO_DC;

-  } else {

-    xd->above_context->y2 = 0;

-    xd->left_context->y2 = 0;

-    eobs[24] = 0;

-    type = PLANE_TYPE_Y_WITH_DC;

-  }

   // luma blocks

-  seg_eob = get_eob(xd, segment_id, 64);

+  int seg_eob = get_eob(xd, segment_id, 64);

   for (i = 0; i < 16; i += 4) {

-    ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];

-    ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];

-#if CONFIG_CNVCONTEXT

-    ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;

-    ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;

-#else

-    ENTROPY_CONTEXT above_ec = a[0];

-    ENTROPY_CONTEXT left_ec = l[0];

-#endif

-    eobs[i] = c = decode_coefs(pbi, xd, bc, &above_ec, &left_ec, type,

-                               type == PLANE_TYPE_Y_WITH_DC ?

-                               get_tx_type(xd, xd->block + i) : DCT_DCT,

-                               seg_eob, xd->block[i].qcoeff,

-                               vp9_default_zig_zag1d_8x8,

-                               TX_8X8, vp9_coef_bands_8x8);

-    a[1] = a[0] = above_ec;

-    l[1] = l[0] = left_ec;

+    const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC,

+                               seg_eob, xd->block[i].qcoeff, TX_8X8);

+    xd->eobs[i] = c;

     eobtotal += c;

@@ -427,34 +495,16 @@

     // use 4x4 transform for U, V components in I8X8/splitmv prediction mode

     seg_eob = get_eob(xd, segment_id, 16);

     for (i = 16; i < 24; i++) {

-      ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];

-      ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];

-      eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,

-                                 DCT_DCT, seg_eob, xd->block[i].qcoeff,

-                                 vp9_default_zig_zag1d_4x4, TX_4X4,

-                                 vp9_coef_bands_4x4);

+      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,

+                                 seg_eob, xd->block[i].qcoeff, TX_4X4);

+      xd->eobs[i] = c;

       eobtotal += c;

   } else {

     for (i = 16; i < 24; i += 4) {

-      ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_8X8][i];

-      ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_8X8][i];

-#if CONFIG_CNVCONTEXT

-      ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;

-      ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;

-#else

-      ENTROPY_CONTEXT above_ec = a[0];

-      ENTROPY_CONTEXT left_ec = l[0];

-#endif

-      eobs[i] = c = decode_coefs(pbi, xd, bc,

-                                 &above_ec, &left_ec,

-                                 PLANE_TYPE_UV,

-                                 DCT_DCT, seg_eob, xd->block[i].qcoeff,

-                                 vp9_default_zig_zag1d_8x8,

-                                 TX_8X8, vp9_coef_bands_8x8);

-      a[1] = a[0] = above_ec;

-      l[1] = l[0] = left_ec;

+      const int c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV,

+                                 seg_eob, xd->block[i].qcoeff, TX_8X8);

+      xd->eobs[i] = c;

       eobtotal += c;

@@ -464,44 +514,13 @@

 static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,

                             BOOL_DECODER* const bc,

-                            PLANE_TYPE type, int i, int seg_eob,

-                            TX_TYPE tx_type, const int *scan) {

-  ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;

-  ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;

-  ENTROPY_CONTEXT *const a = A + vp9_block2above[TX_4X4][i];

-  ENTROPY_CONTEXT *const l = L + vp9_block2left[TX_4X4][i];

-  uint16_t *const eobs = xd->eobs;

-  int c;

-  c = decode_coefs(dx, xd, bc, a, l, type, tx_type, seg_eob,

-                   xd->block[i].qcoeff, scan, TX_4X4, vp9_coef_bands_4x4);

-  eobs[i] = c;

+                            PLANE_TYPE type, int i, int seg_eob) {

+  const int c = decode_coefs(dx, xd, bc, i, type, seg_eob,

+                             xd->block[i].qcoeff, TX_4X4);

+  xd->eobs[i] = c;

   return c;

-static int decode_coefs_4x4_y(VP9D_COMP *dx, MACROBLOCKD *xd,

-                              BOOL_DECODER* const bc,

-                              PLANE_TYPE type, int i, int seg_eob) {

-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

-                          get_tx_type(xd, &xd->block[i]) : DCT_DCT;

-  const int *scan;

-  switch (tx_type) {

-    case ADST_DCT:

-      scan = vp9_row_scan_4x4;

-      break;

-    case DCT_ADST:

-      scan = vp9_col_scan_4x4;

-      break;

-    default:

-      scan = vp9_default_zig_zag1d_4x4;

-      break;

-  }

-  return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob, tx_type, scan);

-}

 int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,

                          BOOL_DECODER* const bc,

                          PLANE_TYPE type, int i) {

@@ -508,7 +527,7 @@

   const int segment_id = xd->mode_info_context->mbmi.segment_id;

   const int seg_eob = get_eob(xd, segment_id, 16);

-  return decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);

+  return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob);

 static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,

@@ -515,13 +534,11 @@

                                    MACROBLOCKD* const xd,

                                    BOOL_DECODER* const bc,

                                    int seg_eob) {

-  int eobtotal = 0, i;

+  int i, eobtotal = 0;

   // chroma blocks

-  for (i = 16; i < 24; i++) {

-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob,

-                                 DCT_DCT, vp9_default_zig_zag1d_4x4);

-  }

+  for (i = 16; i < 24; i++)

+    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob);

   return eobtotal;

@@ -539,27 +556,12 @@

                                     MACROBLOCKD* const xd,

                                     BOOL_DECODER* const bc) {

   int i, eobtotal = 0;

-  PLANE_TYPE type;

   const int segment_id = xd->mode_info_context->mbmi.segment_id;

   const int seg_eob = get_eob(xd, segment_id, 16);

-  const int has_2nd_order = get_2nd_order_usage(xd);

-  // 2nd order DC block

-  if (has_2nd_order) {

-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24, seg_eob,

-                                 DCT_DCT, vp9_default_zig_zag1d_4x4) - 16;

-    type = PLANE_TYPE_Y_NO_DC;

-  } else {

-    xd->above_context->y2 = 0;

-    xd->left_context->y2 = 0;

-    xd->eobs[24] = 0;

-    type = PLANE_TYPE_Y_WITH_DC;

-  }

   // luma blocks

-  for (i = 0; i < 16; ++i) {

-    eobtotal += decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob);

-  }

+  for (i = 0; i < 16; ++i)

+    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y_WITH_DC, i, seg_eob);

   // chroma blocks

   eobtotal += decode_mb_tokens_4x4_uv(dx, xd, bc, seg_eob);

@@ -571,16 +573,13 @@

                          MACROBLOCKD* const xd,

                          BOOL_DECODER* const bc) {

   const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

-  int eobtotal;

-  if (tx_size == TX_16X16) {

-    eobtotal = vp9_decode_mb_tokens_16x16(dx, xd, bc);

-  } else if (tx_size == TX_8X8) {

-    eobtotal = vp9_decode_mb_tokens_8x8(dx, xd, bc);

-  } else {

-    assert(tx_size == TX_4X4);

-    eobtotal = vp9_decode_mb_tokens_4x4(dx, xd, bc);

+  switch (tx_size) {

+    case TX_16X16:

+      return vp9_decode_mb_tokens_16x16(dx, xd, bc);

+    case TX_8X8:

+      return vp9_decode_mb_tokens_8x8(dx, xd, bc);

+    default:

+      assert(tx_size == TX_4X4);

+      return vp9_decode_mb_tokens_4x4(dx, xd, bc);

-  return eobtotal;

--- a/vp9/decoder/vp9_detokenize.h

+++ b/vp9/decoder/vp9_detokenize.h

@@ -14,8 +14,6 @@

 #include "vp9/decoder/vp9_onyxd_int.h"

-void vp9_reset_mb_tokens_context(MACROBLOCKD* const);

 int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,

                          BOOL_DECODER* const bc,

                          PLANE_TYPE type, int i);

@@ -26,6 +24,10 @@

 int vp9_decode_sb_tokens(VP9D_COMP* const pbi,

                          MACROBLOCKD* const xd,

                          BOOL_DECODER* const bc);

+int vp9_decode_sb64_tokens(VP9D_COMP* const pbi,

+                           MACROBLOCKD* const xd,

+                           BOOL_DECODER* const bc);

 int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,

                                 BOOL_DECODER* const bc);

--- a/vp9/decoder/vp9_idct_blk.c

+++ b/vp9/decoder/vp9_idct_blk.c

@@ -10,75 +10,35 @@

 #include "vp9_rtcd.h"

 #include "vp9/common/vp9_blockd.h"

-#if CONFIG_LOSSLESS

 #include "vp9/decoder/vp9_dequantize.h"

-#endif

-void vp9_dequant_dc_idct_add_y_block_c(int16_t *q, const int16_t *dq,

-                                       uint8_t *pre,

-                                       uint8_t *dst,

-                                       int stride, uint16_t *eobs,

-                                       const int16_t *dc) {

+void vp9_dequant_idct_add_y_block_4x4_inplace_c(int16_t *q,

+                                                const int16_t *dq,

+                                                uint8_t *dst,

+                                                int stride,

+                                                MACROBLOCKD *xd) {

   int i, j;

   for (i = 0; i < 4; i++) {

     for (j = 0; j < 4; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_dc_idct_add_c(q, dq, pre, dst, 16, stride, dc[0]);

-      else

-        vp9_dc_only_idct_add_c(dc[0], pre, dst, 16, stride);

+      xd->itxm_add(q, dq, dst, dst, stride, stride, xd->eobs[i * 4 + j]);

       q   += 16;

-      pre += 4;

       dst += 4;

-      dc++;

-    pre += 64 - 16;

     dst += 4 * stride - 16;

-void vp9_dequant_dc_idct_add_y_block_4x4_inplace_c(int16_t *q,

-                                                   const int16_t *dq,

-                                                   uint8_t *dst,

-                                                   int stride,

-                                                   uint16_t *eobs,

-                                                   const int16_t *dc,

-                                                   MACROBLOCKD *xd) {

-  int i, j;

-  for (i = 0; i < 4; i++) {

-    for (j = 0; j < 4; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_dc_idct_add_c(q, dq, dst, dst, stride, stride, dc[0]);

-      else

-        vp9_dc_only_idct_add_c(dc[0], dst, dst, stride, stride);

-      q   += 16;

-      dst += 4;

-      dc++;

-    }

-    dst += 4 * stride - 16;

-  }

-}

 void vp9_dequant_idct_add_y_block_c(int16_t *q, const int16_t *dq,

                                     uint8_t *pre,

                                     uint8_t *dst,

-                                    int stride, uint16_t *eobs) {

+                                    int stride, MACROBLOCKD *xd) {

   int i, j;

   for (i = 0; i < 4; i++) {

     for (j = 0; j < 4; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_idct_add_c(q, dq, pre, dst, 16, stride);

-      else {

-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dst, 16, stride);

-        ((int *)q)[0] = 0;

-      }

+      vp9_dequant_idct_add(q, dq, pre, dst, 16, stride, xd->eobs[i * 4  + j]);

       q   += 16;

       pre += 4;

       dst += 4;

@@ -92,18 +52,13 @@

 void vp9_dequant_idct_add_uv_block_c(int16_t *q, const int16_t *dq,

                                      uint8_t *pre, uint8_t *dstu,

                                      uint8_t *dstv, int stride,

-                                     uint16_t *eobs) {

+                                     MACROBLOCKD *xd) {

   int i, j;

   for (i = 0; i < 2; i++) {

     for (j = 0; j < 2; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_idct_add_c(q, dq, pre, dstu, 8, stride);

-      else {

-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstu, 8, stride);

-        ((int *)q)[0] = 0;

-      }

+      vp9_dequant_idct_add(q, dq, pre, dstu, 8, stride,

+                           xd->eobs[16 + i * 2 + j]);

       q    += 16;

       pre  += 4;

       dstu += 4;

@@ -115,13 +70,8 @@

   for (i = 0; i < 2; i++) {

     for (j = 0; j < 2; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_idct_add_c(q, dq, pre, dstv, 8, stride);

-      else {

-        vp9_dc_only_idct_add_c(q[0]*dq[0], pre, dstv, 8, stride);

-        ((int *)q)[0] = 0;

-      }

+      vp9_dequant_idct_add(q, dq, pre, dstv, 8, stride,

+                           xd->eobs[20 + i * 2 + j]);

       q    += 16;

       pre  += 4;

       dstv += 4;

@@ -136,19 +86,12 @@

                                                  uint8_t *dstu,

                                                  uint8_t *dstv,

                                                  int stride,

-                                                 uint16_t *eobs,

                                                  MACROBLOCKD *xd) {

   int i, j;

   for (i = 0; i < 2; i++) {

     for (j = 0; j < 2; j++) {

-      if (*eobs++ > 1) {

-        vp9_dequant_idct_add_c(q, dq, dstu, dstu, stride, stride);

-      } else {

-        vp9_dc_only_idct_add_c(q[0]*dq[0], dstu, dstu, stride, stride);

-        ((int *)q)[0] = 0;

-      }

+      xd->itxm_add(q, dq, dstu, dstu, stride, stride, xd->eobs[16 + i * 2 + j]);

       q    += 16;

       dstu += 4;

@@ -158,13 +101,7 @@

   for (i = 0; i < 2; i++) {

     for (j = 0; j < 2; j++) {

-      if (*eobs++ > 1) {

-        vp9_dequant_idct_add_c(q, dq, dstv, dstv, stride, stride);

-      } else {

-        vp9_dc_only_idct_add_c(q[0]*dq[0], dstv, dstv, stride, stride);

-        ((int *)q)[0] = 0;

-      }

+      xd->itxm_add(q, dq, dstv, dstv, stride, stride, xd->eobs[20 + i * 2 + j]);

       q    += 16;

       dstv += 4;

@@ -173,69 +110,40 @@

-void vp9_dequant_dc_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,

-                                           uint8_t *pre,

-                                           uint8_t *dst,

-                                           int stride, uint16_t *eobs,

-                                           const int16_t *dc,

-                                           MACROBLOCKD *xd) {

-  q[0] = dc[0];

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 1, xd->eobs[0]);

+void vp9_dequant_idct_add_y_block_8x8_inplace_c(int16_t *q,

+                                                const int16_t *dq,

+                                                uint8_t *dst,

+                                                int stride,

+                                                MACROBLOCKD *xd) {

+  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, xd->eobs[0]);

-  q[64] = dc[1];

-  vp9_dequant_idct_add_8x8_c(&q[64], dq, pre + 8, dst + 8, 16, stride, 1,

-                             xd->eobs[4]);

-  q[128] = dc[4];

-  vp9_dequant_idct_add_8x8_c(&q[128], dq, pre + 8 * 16,

-                                dst + 8 * stride, 16, stride, 1, xd->eobs[8]);

-  q[192] = dc[8];

-  vp9_dequant_idct_add_8x8_c(&q[192], dq, pre + 8 * 16 + 8,

-                                dst + 8 * stride + 8, 16, stride, 1,

-                                xd->eobs[12]);

-}

-void vp9_dequant_dc_idct_add_y_block_8x8_inplace_c(int16_t *q,

-                                                   const int16_t *dq,

-                                                   uint8_t *dst,

-                                                   int stride,

-                                                   uint16_t *eobs,

-                                                   const int16_t *dc,

-                                                   MACROBLOCKD *xd) {

-  q[0] = dc[0];

-  vp9_dequant_idct_add_8x8_c(q, dq, dst, dst, stride, stride, 1, xd->eobs[0]);

-  q[64] = dc[1];

   vp9_dequant_idct_add_8x8_c(&q[64], dq, dst + 8,

-                                dst + 8, stride, stride, 1, xd->eobs[4]);

+                             dst + 8, stride, stride, xd->eobs[4]);

-  q[128] = dc[4];

   vp9_dequant_idct_add_8x8_c(&q[128], dq, dst + 8 * stride,

-                                dst + 8 * stride, stride, stride, 1,

-                                xd->eobs[8]);

+                             dst + 8 * stride, stride, stride,

+                             xd->eobs[8]);

-  q[192] = dc[8];

   vp9_dequant_idct_add_8x8_c(&q[192], dq, dst + 8 * stride + 8,

-                                dst + 8 * stride + 8, stride, stride, 1,

-                                xd->eobs[12]);

+                             dst + 8 * stride + 8, stride, stride,

+                             xd->eobs[12]);

 void vp9_dequant_idct_add_y_block_8x8_c(int16_t *q, const int16_t *dq,

                                         uint8_t *pre,

                                         uint8_t *dst,

-                                        int stride, uint16_t *eobs,

-                                        MACROBLOCKD *xd) {

+                                        int stride, MACROBLOCKD *xd) {

   uint8_t *origdest = dst;

   uint8_t *origpred = pre;

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, 0, xd->eobs[0]);

+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dst, 16, stride, xd->eobs[0]);

   vp9_dequant_idct_add_8x8_c(&q[64], dq, origpred + 8,

-                             origdest + 8, 16, stride, 0, xd->eobs[4]);

+                             origdest + 8, 16, stride, xd->eobs[4]);

   vp9_dequant_idct_add_8x8_c(&q[128], dq, origpred + 8 * 16,

-                             origdest + 8 * stride, 16, stride, 0, xd->eobs[8]);

+                             origdest + 8 * stride, 16, stride,

+                             xd->eobs[8]);

   vp9_dequant_idct_add_8x8_c(&q[192], dq, origpred + 8 * 16 + 8,

-                             origdest + 8 * stride + 8, 16, stride, 0,

+                             origdest + 8 * stride + 8, 16, stride,

                              xd->eobs[12]);

@@ -243,14 +151,13 @@

                                          uint8_t *pre,

                                          uint8_t *dstu,

                                          uint8_t *dstv,

-                                         int stride, uint16_t *eobs,

-                                         MACROBLOCKD *xd) {

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, 0, xd->eobs[16]);

+                                         int stride, MACROBLOCKD *xd) {

+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstu, 8, stride, xd->eobs[16]);

   q    += 64;

   pre  += 64;

-  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, 0, xd->eobs[20]);

+  vp9_dequant_idct_add_8x8_c(q, dq, pre, dstv, 8, stride, xd->eobs[20]);

 void vp9_dequant_idct_add_uv_block_8x8_inplace_c(int16_t *q, const int16_t *dq,

@@ -257,58 +164,26 @@

                                                  uint8_t *dstu,

                                                  uint8_t *dstv,

                                                  int stride,

-                                                 uint16_t *eobs,

                                                  MACROBLOCKD *xd) {

-  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride, 0,

+  vp9_dequant_idct_add_8x8_c(q, dq, dstu, dstu, stride, stride,

                              xd->eobs[16]);

   q += 64;

-  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride, 0,

+  vp9_dequant_idct_add_8x8_c(q, dq, dstv, dstv, stride, stride,

                              xd->eobs[20]);

-#if CONFIG_LOSSLESS

-void vp9_dequant_dc_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,

-                                                uint8_t *pre,

-                                                uint8_t *dst,

-                                                int stride,

-                                                uint16_t *eobs,

-                                                const int16_t *dc) {

-  int i, j;

-  for (i = 0; i < 4; i++) {

-    for (j = 0; j < 4; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_dc_idct_add_lossless_c(q, dq, pre, dst, 16, stride, dc[0]);

-      else

-        vp9_dc_only_inv_walsh_add_c(dc[0], pre, dst, 16, stride);

-      q   += 16;

-      pre += 4;

-      dst += 4;

-      dc++;

-    }

-    pre += 64 - 16;

-    dst += 4 * stride - 16;

-  }

-}

 void vp9_dequant_idct_add_y_block_lossless_c(int16_t *q, const int16_t *dq,

                                              uint8_t *pre,

                                              uint8_t *dst,

-                                             int stride, uint16_t *eobs) {

+                                             int stride, MACROBLOCKD *xd) {

   int i, j;

   for (i = 0; i < 4; i++) {

     for (j = 0; j < 4; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride);

-      else {

-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dst, 16, stride);

-        ((int *)q)[0] = 0;

-      }

+      vp9_dequant_idct_add_lossless_c(q, dq, pre, dst, 16, stride,

+                                      xd->eobs[i * 4 + j]);

       q   += 16;

       pre += 4;

       dst += 4;

@@ -324,18 +199,13 @@

                                               uint8_t *dstu,

                                               uint8_t *dstv,

                                               int stride,

-                                              uint16_t *eobs) {

+                                              MACROBLOCKD *xd) {

   int i, j;

   for (i = 0; i < 2; i++) {

     for (j = 0; j < 2; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride);

-      else {

-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstu, 8, stride);

-        ((int *)q)[0] = 0;

-      }

+      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstu, 8, stride,

+                                      xd->eobs[16 + i * 2 + j]);

       q    += 16;

       pre  += 4;

       dstu += 4;

@@ -347,13 +217,8 @@

   for (i = 0; i < 2; i++) {

     for (j = 0; j < 2; j++) {

-      if (*eobs++ > 1)

-        vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride);

-      else {

-        vp9_dc_only_inv_walsh_add_c(q[0]*dq[0], pre, dstv, 8, stride);

-        ((int *)q)[0] = 0;

-      }

+      vp9_dequant_idct_add_lossless_c(q, dq, pre, dstv, 8, stride,

+                                      xd->eobs[20 + i * 2 + j]);

       q    += 16;

       pre  += 4;

       dstv += 4;

@@ -363,5 +228,4 @@

     dstv += 4 * stride - 8;

-#endif

--- a/vp9/decoder/vp9_onyxd.h

+++ b/vp9/decoder/vp9_onyxd.h

@@ -27,6 +27,7 @@

     int     Version;

     int     postprocess;

     int     max_threads;

+    int     inv_tile_order;

     int     input_partition;

   } VP9D_CONFIG;

   typedef enum {

@@ -45,13 +46,15 @@

                         int64_t *time_stamp, int64_t *time_end_stamp,

                         vp9_ppflags_t *flags);

-  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,

-                                        VP9_REFFRAME ref_frame_flag,

-                                        YV12_BUFFER_CONFIG *sd);

+  vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR comp,

+                                         VP9_REFFRAME ref_frame_flag,

+                                         YV12_BUFFER_CONFIG *sd);

   vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,

                                         VP9_REFFRAME ref_frame_flag,

                                         YV12_BUFFER_CONFIG *sd);

+  int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb);

   VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);

--- a/vp9/decoder/vp9_onyxd_if.c

+++ b/vp9/decoder/vp9_onyxd_if.c

@@ -9,6 +9,9 @@

*/

+#include <stdio.h>

+#include <assert.h>

 #include "vp9/common/vp9_onyxc_int.h"

 #if CONFIG_POSTPROC

 #include "vp9/common/vp9_postproc.h"

@@ -19,8 +22,6 @@

 #include "vp9/common/vp9_alloccommon.h"

 #include "vp9/common/vp9_loopfilter.h"

 #include "vp9/common/vp9_swapyv12buffer.h"

-#include <stdio.h>

-#include <assert.h>

 #include "vp9/common/vp9_quant_common.h"

 #include "vpx_scale/vpx_scale.h"

@@ -30,34 +31,34 @@

 #include "vp9/decoder/vp9_detokenize.h"

 #include "./vpx_scale_rtcd.h"

-static int get_free_fb(VP9_COMMON *cm);

-static void ref_cnt_fb(int *buf, int *idx, int new_idx);

 #define WRITE_RECON_BUFFER 0

 #if WRITE_RECON_BUFFER == 1

-static void recon_write_yuv_frame(char *name, YV12_BUFFER_CONFIG *s) {

+static void recon_write_yuv_frame(const char *name,

+                                  const YV12_BUFFER_CONFIG *s,

+                                  int w, int _h) {

   FILE *yuv_file = fopen((char *)name, "ab");

-  uint8_t *src = s->y_buffer;

-  int h = s->y_height;

+  const uint8_t *src = s->y_buffer;

+  int h = _h;

   do {

-    fwrite(src, s->y_width, 1,  yuv_file);

+    fwrite(src, w, 1,  yuv_file);

     src += s->y_stride;

   } while (--h);

   src = s->u_buffer;

-  h = s->uv_height;

+  h = (_h + 1) >> 1;

+  w = (w + 1) >> 1;

   do {

-    fwrite(src, s->uv_width, 1,  yuv_file);

+    fwrite(src, w, 1,  yuv_file);

     src += s->uv_stride;

   } while (--h);

   src = s->v_buffer;

-  h = s->uv_height;

+  h = (_h + 1) >> 1;

   do {

-    fwrite(src, s->uv_width, 1, yuv_file);

+    fwrite(src, w, 1, yuv_file);

     src += s->uv_stride;

   } while (--h);

@@ -99,7 +100,7 @@

 #endif

-void vp9_initialize_dec(void) {

+void vp9_initialize_dec() {

   static int init_done = 0;

   if (!init_done) {

@@ -127,6 +128,7 @@

   vp9_initialize_dec();

   vp9_create_common(&pbi->common);

+  pbi->oxcf = *oxcf;

   pbi->common.current_video_frame = 0;

   pbi->ready_for_new_data = 1;

@@ -152,8 +154,8 @@

   if (!pbi)

     return;

-  // Delete sementation map

-  if (pbi->common.last_frame_seg_map != 0)

+  // Delete segmentation map

+  if (pbi->common.last_frame_seg_map)

     vpx_free(pbi->common.last_frame_seg_map);

   vp9_remove_common(&pbi->common);

@@ -161,33 +163,37 @@

   vpx_free(pbi);

+static int equal_dimensions(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {

+    return a->y_height == b->y_height && a->y_width == b->y_width &&

+           a->uv_height == b->uv_height && a->uv_width == b->uv_width;

+}

-vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR ptr, VP9_REFFRAME ref_frame_flag,

-                                      YV12_BUFFER_CONFIG *sd) {

+vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,

+                                       VP9_REFFRAME ref_frame_flag,

+                                       YV12_BUFFER_CONFIG *sd) {

   VP9D_COMP *pbi = (VP9D_COMP *) ptr;

   VP9_COMMON *cm = &pbi->common;

   int ref_fb_idx;

-  if (ref_frame_flag == VP9_LAST_FLAG)

-    ref_fb_idx = cm->lst_fb_idx;

-  else if (ref_frame_flag == VP9_GOLD_FLAG)

-    ref_fb_idx = cm->gld_fb_idx;

-  else if (ref_frame_flag == VP9_ALT_FLAG)

-    ref_fb_idx = cm->alt_fb_idx;

-  else {

+  /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the

+   * encoder is using the frame buffers for. This is just a stub to keep the

+   * vpxenc --test-decode functionality working, and will be replaced in a

+   * later commit that adds VP9-specific controls for this functionality.

+   */

+  if (ref_frame_flag == VP9_LAST_FLAG) {

+    ref_fb_idx = pbi->common.ref_frame_map[0];

+  } else {

     vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,

                        "Invalid reference frame");

     return pbi->common.error.error_code;

-  if (cm->yv12_fb[ref_fb_idx].y_height != sd->y_height ||

-      cm->yv12_fb[ref_fb_idx].y_width != sd->y_width ||

-      cm->yv12_fb[ref_fb_idx].uv_height != sd->uv_height ||

-      cm->yv12_fb[ref_fb_idx].uv_width != sd->uv_width) {

+  if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) {

     vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,

                        "Incorrect buffer dimensions");

-  } else

+  } else {

     vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);

+  }

   return pbi->common.error.error_code;

@@ -198,14 +204,18 @@

   VP9D_COMP *pbi = (VP9D_COMP *) ptr;

   VP9_COMMON *cm = &pbi->common;

   int *ref_fb_ptr = NULL;

-  int free_fb;

+  /* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the

+   * encoder is using the frame buffers for. This is just a stub to keep the

+   * vpxenc --test-decode functionality working, and will be replaced in a

+   * later commit that adds VP9-specific controls for this functionality.

+   */

   if (ref_frame_flag == VP9_LAST_FLAG)

-    ref_fb_ptr = &cm->lst_fb_idx;

+    ref_fb_ptr = &pbi->common.active_ref_idx[0];

   else if (ref_frame_flag == VP9_GOLD_FLAG)

-    ref_fb_ptr = &cm->gld_fb_idx;

+    ref_fb_ptr = &pbi->common.active_ref_idx[1];

   else if (ref_frame_flag == VP9_ALT_FLAG)

-    ref_fb_ptr = &cm->alt_fb_idx;

+    ref_fb_ptr = &pbi->common.active_ref_idx[2];

   else {

     vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,

                        "Invalid reference frame");

@@ -212,20 +222,17 @@

     return pbi->common.error.error_code;

-  if (cm->yv12_fb[*ref_fb_ptr].y_height != sd->y_height ||

-      cm->yv12_fb[*ref_fb_ptr].y_width != sd->y_width ||

-      cm->yv12_fb[*ref_fb_ptr].uv_height != sd->uv_height ||

-      cm->yv12_fb[*ref_fb_ptr].uv_width != sd->uv_width) {

+  if (!equal_dimensions(&cm->yv12_fb[*ref_fb_ptr], sd)) {

     vpx_internal_error(&pbi->common.error, VPX_CODEC_ERROR,

                        "Incorrect buffer dimensions");

   } else {

-    /* Find an empty frame buffer. */

-    free_fb = get_free_fb(cm);

-    /* Decrease fb_idx_ref_cnt since it will be increased again in

-     * ref_cnt_fb() below. */

+    // Find an empty frame buffer.

+    const int free_fb = get_free_fb(cm);

+    // Decrease fb_idx_ref_cnt since it will be increased again in

+    // ref_cnt_fb() below.

     cm->fb_idx_ref_cnt[free_fb]--;

-    /* Manage the reference counters and copy image. */

+    // Manage the reference counters and copy image.

     ref_cnt_fb(cm->fb_idx_ref_cnt, ref_fb_ptr, free_fb);

     vp8_yv12_copy_frame(sd, &cm->yv12_fb[*ref_fb_ptr]);

@@ -234,77 +241,36 @@

-static int get_free_fb(VP9_COMMON *cm) {

-  int i;

-  for (i = 0; i < NUM_YV12_BUFFERS; i++)

-    if (cm->fb_idx_ref_cnt[i] == 0)

-      break;

+int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {

+  VP9D_COMP *pbi = (VP9D_COMP *) ptr;

+  VP9_COMMON *cm = &pbi->common;

-  assert(i < NUM_YV12_BUFFERS);

-  cm->fb_idx_ref_cnt[i] = 1;

-  return i;

-}

+  if (index < 0 || index >= NUM_REF_FRAMES)

+    return -1;

-static void ref_cnt_fb(int *buf, int *idx, int new_idx) {

-  if (buf[*idx] > 0)

-    buf[*idx]--;

-  *idx = new_idx;

-  buf[new_idx]++;

+  *fb = &cm->yv12_fb[cm->ref_frame_map[index]];

+  return 0;

-/* If any buffer copy / swapping is signalled it should be done here. */

-static int swap_frame_buffers(VP9_COMMON *cm) {

-  int err = 0;

+/* If any buffer updating is signalled it should be done here. */

+static void swap_frame_buffers(VP9D_COMP *pbi) {

+  int ref_index = 0, mask;

-  /* The alternate reference frame or golden frame can be updated

-   *  using the new, last, or golden/alt ref frame.  If it

-   *  is updated using the newly decoded frame it is a refresh.

-   *  An update using the last or golden/alt ref frame is a copy.

-   */

-  if (cm->copy_buffer_to_arf) {

-    int new_fb = 0;

-    if (cm->copy_buffer_to_arf == 1)

-      new_fb = cm->lst_fb_idx;

-    else if (cm->copy_buffer_to_arf == 2)

-      new_fb = cm->gld_fb_idx;

-    else

-      err = -1;

-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, new_fb);

+  for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {

+    if (mask & 1) {

+      ref_cnt_fb(pbi->common.fb_idx_ref_cnt,

+                 &pbi->common.ref_frame_map[ref_index],

+                 pbi->common.new_fb_idx);

+    }

+    ++ref_index;

-  if (cm->copy_buffer_to_gf) {

-    int new_fb = 0;

+  pbi->common.frame_to_show = &pbi->common.yv12_fb[pbi->common.new_fb_idx];

+  pbi->common.fb_idx_ref_cnt[pbi->common.new_fb_idx]--;

-    if (cm->copy_buffer_to_gf == 1)

-      new_fb = cm->lst_fb_idx;

-    else if (cm->copy_buffer_to_gf == 2)

-      new_fb = cm->alt_fb_idx;

-    else

-      err = -1;

-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, new_fb);

-  }

-  if (cm->refresh_golden_frame)

-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->gld_fb_idx, cm->new_fb_idx);

-  if (cm->refresh_alt_ref_frame)

-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->alt_fb_idx, cm->new_fb_idx);

-  if (cm->refresh_last_frame) {

-    ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->lst_fb_idx, cm->new_fb_idx);

-    cm->frame_to_show = &cm->yv12_fb[cm->lst_fb_idx];

-  } else

-    cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];

-  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;

-  return err;

+  /* Invalidate these references until the next frame starts. */

+  for (ref_index = 0; ref_index < 3; ref_index++)

+    pbi->common.active_ref_idx[ref_index] = INT_MAX;

 int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,

@@ -332,8 +298,13 @@

      * We do not know if the missing frame(s) was supposed to update

      * any of the reference buffers, but we act conservative and

      * mark only the last buffer as corrupted.

+     *

+     * TODO(jkoleszar): Error concealment is undefined and non-normative

+     * at this point, but if it becomes so, [0] may not always be the correct

+     * thing to do here.

*/

-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;

+    if (cm->active_ref_idx[0] != INT_MAX)

+      cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;

   cm->new_fb_idx = get_free_fb(cm);

@@ -344,8 +315,13 @@

     /* We do not know if the missing frame(s) was supposed to update

      * any of the reference buffers, but we act conservative and

      * mark only the last buffer as corrupted.

+     *

+     * TODO(jkoleszar): Error concealment is undefined and non-normative

+     * at this point, but if it becomes so, [0] may not always be the correct

+     * thing to do here.

*/

-    cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;

+    if (cm->active_ref_idx[0] != INT_MAX)

+      cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;

     if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)

       cm->fb_idx_ref_cnt[cm->new_fb_idx]--;

@@ -365,11 +341,7 @@

-    if (swap_frame_buffers(cm)) {

-      pbi->common.error.error_code = VPX_CODEC_ERROR;

-      pbi->common.error.setjmp = 0;

-      return -1;

-    }

+    swap_frame_buffers(pbi);

 #if WRITE_RECON_BUFFER == 2

     if (cm->show_frame)

@@ -382,7 +354,8 @@

     if (cm->filter_level) {

       /* Apply the loop filter if appropriate. */

-      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0);

+      vp9_loop_filter_frame(cm, &pbi->mb, cm->filter_level, 0,

+                            cm->dering_enabled);

     vp8_yv12_extend_frame_borders(cm->frame_to_show);

@@ -389,7 +362,8 @@

 #if WRITE_RECON_BUFFER == 1

   if (cm->show_frame)

-    recon_write_yuv_frame("recon.yuv", cm->frame_to_show);

+    recon_write_yuv_frame("recon.yuv", cm->frame_to_show,

+                          cm->width, cm->height);

 #endif

   vp9_clear_system_state();

@@ -440,9 +414,9 @@

   if (pbi->common.frame_to_show) {

     *sd = *pbi->common.frame_to_show;

-    sd->y_width = pbi->common.Width;

-    sd->y_height = pbi->common.Height;

-    sd->uv_height = pbi->common.Height / 2;

+    sd->y_width = pbi->common.width;

+    sd->y_height = pbi->common.height;

+    sd->uv_height = pbi->common.height / 2;

     ret = 0;

   } else {

     ret = -1;

--- a/vp9/decoder/vp9_onyxd_int.h

+++ b/vp9/decoder/vp9_onyxd_int.h

@@ -18,41 +18,6 @@

 // #define DEC_DEBUG

-typedef struct {

-  int ithread;

-  void *ptr1;

-  void *ptr2;

-} DECODETHREAD_DATA;

-typedef struct {

-  MACROBLOCKD  mbd;

-  int mb_row;

-  int current_mb_col;

-  short *coef_ptr;

-} MB_ROW_DEC;

-typedef struct {

-  int const *scan;

-  int const *scan_8x8;

-  uint8_t const *ptr_block2leftabove;

-  vp9_tree_index const *vp9_coef_tree_ptr;

-  unsigned char *norm_ptr;

-  uint8_t *ptr_coef_bands_x;

-  uint8_t *ptr_coef_bands_x_8x8;

-  ENTROPY_CONTEXT_PLANES *A;

-  ENTROPY_CONTEXT_PLANES *L;

-  int16_t *qcoeff_start_ptr;

-  vp9_prob const *coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];

-  uint8_t eob[25];

-} DETOK;

 typedef struct VP9Decompressor {

   DECLARE_ALIGNED(16, MACROBLOCKD, mb);

@@ -68,18 +33,13 @@

   int64_t last_time_stamp;

   int   ready_for_new_data;

-  DETOK detoken;

-  vp9_dequant_idct_add_fn_t            idct_add;

-  vp9_dequant_dc_idct_add_fn_t         dc_idct_add;

-  vp9_dequant_dc_idct_add_y_block_fn_t dc_idct_add_y_block;

-  vp9_dequant_idct_add_y_block_fn_t    idct_add_y_block;

-  vp9_dequant_idct_add_uv_block_fn_t   idct_add_uv_block;

+  int refresh_frame_flags;

   vp9_prob prob_skip_false;

   int decoded_key_frame;

+  int initial_width;

+  int initial_height;

 } VP9D_COMP;

 int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end);

--- a/vp9/decoder/vp9_treereader.h

+++ b/vp9/decoder/vp9_treereader.h

@@ -13,7 +13,6 @@

 #define VP9_DECODER_VP9_TREEREADER_H_

 #include "vp9/common/vp9_treecoder.h"

 #include "vp9/decoder/vp9_dboolhuff.h"

 typedef BOOL_DECODER vp9_reader;

@@ -20,10 +19,10 @@

 #define vp9_read decode_bool

 #define vp9_read_literal decode_value

-#define vp9_read_bit(R) vp9_read(R, vp9_prob_half)

+#define vp9_read_bit(r) vp9_read(r, vp9_prob_half)

+#define vp9_read_prob(r) ((vp9_prob)vp9_read_literal(r, 8))

-/* Intent of tree data structure is to make decoding trivial. */

+// Intent of tree data structure is to make decoding trivial.

 static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */

                       vp9_tree t,

                       const vp9_prob *const p) {

--- a/vp9/decoder/x86/vp9_dequantize_mmx.asm

+++ /dev/null

@@ -1,406 +1,0 @@

-;

-;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "third_party/x86inc/x86inc.asm"

-SECTION_RODATA

-align 16

-x_s1sqr2:      times 4 dw 0x8A8C

-align 16

-x_c1sqr2less1: times 4 dw 0x4E7B

-align 16

-pw_16:         times 4 dw 16

-SECTION .text

-INIT_MMX

-;void dequantize_b_impl_mmx(short *sq, short *dq, short *q)

-cglobal dequantize_b_impl_mmx, 3,3,0,sq,dq,arg3

-    mova       m1, [sqq]

-    pmullw     m1, [arg3q+0]            ; mm4 *= kernel 0 modifiers.

-    mova [dqq+ 0], m1

-    mova       m1, [sqq+8]

-    pmullw     m1, [arg3q+8]            ; mm4 *= kernel 0 modifiers.

-    mova [dqq+ 8], m1

-    mova       m1, [sqq+16]

-    pmullw     m1, [arg3q+16]            ; mm4 *= kernel 0 modifiers.

-    mova [dqq+16], m1

-    mova       m1, [sqq+24]

-    pmullw     m1, [arg3q+24]            ; mm4 *= kernel 0 modifiers.

-    mova [dqq+24], m1

-    RET

-;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)

-cglobal dequant_idct_add_mmx, 4,6,0,inp,dq,pred,dest,pit,stride

-%if ARCH_X86_64

-    movsxd              strideq,  dword stridem

-    movsxd              pitq,     dword pitm

-%else

-    mov                 strideq,  stridem

-    mov                 pitq,     pitm

-%endif

-    mova                m0,       [inpq+ 0]

-    pmullw              m0,       [dqq]

-    mova                m1,       [inpq+ 8]

-    pmullw              m1,       [dqq+ 8]

-    mova                m2,       [inpq+16]

-    pmullw              m2,       [dqq+16]

-    mova                m3,       [inpq+24]

-    pmullw              m3,       [dqq+24]

-    pxor                m7,        m7

-    mova            [inpq],        m7

-    mova          [inpq+8],        m7

-    mova         [inpq+16],        m7

-    mova         [inpq+24],        m7

-    psubw               m0,        m2             ; b1= 0-2

-    paddw               m2,        m2             ;

-    mova                m5,        m1

-    paddw               m2,        m0             ; a1 =0+2

-    pmulhw              m5,       [x_s1sqr2];

-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

-    mova                m7,        m3             ;

-    pmulhw              m7,       [x_c1sqr2less1];

-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)

-    psubw               m7,        m5             ; c1

-    mova                m5,        m1

-    mova                m4,        m3

-    pmulhw              m5,       [x_c1sqr2less1]

-    paddw               m5,        m1

-    pmulhw              m3,       [x_s1sqr2]

-    paddw               m3,        m4

-    paddw               m3,        m5             ; d1

-    mova                m6,        m2             ; a1

-    mova                m4,        m0             ; b1

-    paddw               m2,        m3             ;0

-    paddw               m4,        m7             ;1

-    psubw               m0,        m7             ;2

-    psubw               m6,        m3             ;3

-    mova                m1,        m2             ; 03 02 01 00

-    mova                m3,        m4             ; 23 22 21 20

-    punpcklwd           m1,        m0             ; 11 01 10 00

-    punpckhwd           m2,        m0             ; 13 03 12 02

-    punpcklwd           m3,        m6             ; 31 21 30 20

-    punpckhwd           m4,        m6             ; 33 23 32 22

-    mova                m0,        m1             ; 11 01 10 00

-    mova                m5,        m2             ; 13 03 12 02

-    punpckldq           m0,        m3             ; 30 20 10 00

-    punpckhdq           m1,        m3             ; 31 21 11 01

-    punpckldq           m2,        m4             ; 32 22 12 02

-    punpckhdq           m5,        m4             ; 33 23 13 03

-    mova                m3,        m5             ; 33 23 13 03

-    psubw               m0,        m2             ; b1= 0-2

-    paddw               m2,        m2             ;

-    mova                m5,        m1

-    paddw               m2,        m0             ; a1 =0+2

-    pmulhw              m5,       [x_s1sqr2];

-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

-    mova                m7,        m3             ;

-    pmulhw              m7,       [x_c1sqr2less1];

-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)

-    psubw               m7,        m5             ; c1

-    mova                m5,        m1

-    mova                m4,        m3

-    pmulhw              m5,       [x_c1sqr2less1]

-    paddw               m5,        m1

-    pmulhw              m3,       [x_s1sqr2]

-    paddw               m3,        m4

-    paddw               m3,        m5             ; d1

-    paddw               m0,       [pw_16]

-    paddw               m2,       [pw_16]

-    mova                m6,        m2             ; a1

-    mova                m4,        m0             ; b1

-    paddw               m2,        m3             ;0

-    paddw               m4,        m7             ;1

-    psubw               m0,        m7             ;2

-    psubw               m6,        m3             ;3

-    psraw               m2,        5

-    psraw               m0,        5

-    psraw               m4,        5

-    psraw               m6,        5

-    mova                m1,        m2             ; 03 02 01 00

-    mova                m3,        m4             ; 23 22 21 20

-    punpcklwd           m1,        m0             ; 11 01 10 00

-    punpckhwd           m2,        m0             ; 13 03 12 02

-    punpcklwd           m3,        m6             ; 31 21 30 20

-    punpckhwd           m4,        m6             ; 33 23 32 22

-    mova                m0,        m1             ; 11 01 10 00

-    mova                m5,        m2             ; 13 03 12 02

-    punpckldq           m0,        m3             ; 30 20 10 00

-    punpckhdq           m1,        m3             ; 31 21 11 01

-    punpckldq           m2,        m4             ; 32 22 12 02

-    punpckhdq           m5,        m4             ; 33 23 13 03

-    pxor                m7,        m7

-    movh                m4,       [predq]

-    punpcklbw           m4,        m7

-    paddsw              m0,        m4

-    packuswb            m0,        m7

-    movh           [destq],      m0

-    movh                m4,       [predq+pitq]

-    punpcklbw           m4,        m7

-    paddsw              m1,        m4

-    packuswb            m1,        m7

-    movh   [destq+strideq],        m1

-    movh                m4,       [predq+2*pitq]

-    punpcklbw           m4,        m7

-    paddsw              m2,        m4

-    packuswb            m2,        m7

-    movh [destq+strideq*2],        m2

-    add              destq,        strideq

-    add              predq,        pitq

-    movh                m4,       [predq+2*pitq]

-    punpcklbw           m4,        m7

-    paddsw              m5,        m4

-    packuswb            m5,        m7

-    movh [destq+strideq*2],        m5

-    RET

-;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)

-cglobal dequant_dc_idct_add_mmx, 4,7,0,inp,dq,pred,dest,pit,stride,Dc

-%if ARCH_X86_64

-    movsxd              strideq,   dword stridem

-    movsxd              pitq,      dword pitm

-%else

-    mov                 strideq,   stridem

-    mov                 pitq,      pitm

-%endif

-    mov                 Dcq, Dcm

-    mova                m0,       [inpq+ 0]

-    pmullw              m0,       [dqq+ 0]

-    mova                m1,       [inpq+ 8]

-    pmullw              m1,       [dqq+ 8]

-    mova                m2,       [inpq+16]

-    pmullw              m2,       [dqq+16]

-    mova                m3,       [inpq+24]

-    pmullw              m3,       [dqq+24]

-    pxor                m7,        m7

-    mova         [inpq+ 0],        m7

-    mova         [inpq+ 8],        m7

-    mova         [inpq+16],        m7

-    mova         [inpq+24],        m7

-    ; move lower word of Dc to lower word of m0

-    psrlq               m0,        16

-    psllq               m0,        16

-    and                Dcq,        0xFFFF         ; If Dc < 0, we don't want the full dword precision.

-    movh                m7,        Dcq

-    por                 m0,        m7

-    psubw               m0,        m2             ; b1= 0-2

-    paddw               m2,        m2             ;

-    mova                m5,        m1

-    paddw               m2,        m0             ; a1 =0+2

-    pmulhw              m5,       [x_s1sqr2];

-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

-    mova                m7,        m3             ;

-    pmulhw              m7,       [x_c1sqr2less1];

-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)

-    psubw               m7,        m5             ; c1

-    mova                m5,        m1

-    mova                m4,        m3

-    pmulhw              m5,       [x_c1sqr2less1]

-    paddw               m5,        m1

-    pmulhw              m3,       [x_s1sqr2]

-    paddw               m3,        m4

-    paddw               m3,        m5             ; d1

-    mova                m6,        m2             ; a1

-    mova                m4,        m0             ; b1

-    paddw               m2,        m3             ;0

-    paddw               m4,        m7             ;1

-    psubw               m0,        m7             ;2

-    psubw               m6,        m3             ;3

-    mova                m1,        m2             ; 03 02 01 00

-    mova                m3,        m4             ; 23 22 21 20

-    punpcklwd           m1,        m0             ; 11 01 10 00

-    punpckhwd           m2,        m0             ; 13 03 12 02

-    punpcklwd           m3,        m6             ; 31 21 30 20

-    punpckhwd           m4,        m6             ; 33 23 32 22

-    mova                m0,        m1             ; 11 01 10 00

-    mova                m5,        m2             ; 13 03 12 02

-    punpckldq           m0,        m3             ; 30 20 10 00

-    punpckhdq           m1,        m3             ; 31 21 11 01

-    punpckldq           m2,        m4             ; 32 22 12 02

-    punpckhdq           m5,        m4             ; 33 23 13 03

-    mova                m3,        m5             ; 33 23 13 03

-    psubw               m0,        m2             ; b1= 0-2

-    paddw               m2,        m2             ;

-    mova                m5,        m1

-    paddw               m2,        m0             ; a1 =0+2

-    pmulhw              m5,       [x_s1sqr2];

-    paddw               m5,        m1             ; ip1 * sin(pi/8) * sqrt(2)

-    mova                m7,        m3             ;

-    pmulhw              m7,       [x_c1sqr2less1];

-    paddw               m7,        m3             ; ip3 * cos(pi/8) * sqrt(2)

-    psubw               m7,        m5             ; c1

-    mova                m5,        m1

-    mova                m4,        m3

-    pmulhw              m5,       [x_c1sqr2less1]

-    paddw               m5,        m1

-    pmulhw              m3,       [x_s1sqr2]

-    paddw               m3,        m4

-    paddw               m3,        m5             ; d1

-    paddw               m0,       [pw_16]

-    paddw               m2,       [pw_16]

-    mova                m6,        m2             ; a1

-    mova                m4,        m0             ; b1

-    paddw               m2,        m3             ;0

-    paddw               m4,        m7             ;1

-    psubw               m0,        m7             ;2

-    psubw               m6,        m3             ;3

-    psraw               m2,        5

-    psraw               m0,        5

-    psraw               m4,        5

-    psraw               m6,        5

-    mova                m1,        m2             ; 03 02 01 00

-    mova                m3,        m4             ; 23 22 21 20

-    punpcklwd           m1,        m0             ; 11 01 10 00

-    punpckhwd           m2,        m0             ; 13 03 12 02

-    punpcklwd           m3,        m6             ; 31 21 30 20

-    punpckhwd           m4,        m6             ; 33 23 32 22

-    mova                m0,        m1             ; 11 01 10 00

-    mova                m5,        m2             ; 13 03 12 02

-    punpckldq           m0,        m3             ; 30 20 10 00

-    punpckhdq           m1,        m3             ; 31 21 11 01

-    punpckldq           m2,        m4             ; 32 22 12 02

-    punpckhdq           m5,        m4             ; 33 23 13 03

-    pxor                m7,        m7

-    movh                m4,       [predq]

-    punpcklbw           m4,        m7

-    paddsw              m0,        m4

-    packuswb            m0,        m7

-    movh           [destq],        m0

-    movh                m4,       [predq+pitq]

-    punpcklbw           m4,        m7

-    paddsw              m1,        m4

-    packuswb            m1,        m7

-    movh   [destq+strideq],        m1

-    movh                m4,       [predq+2*pitq]

-    punpcklbw           m4,        m7

-    paddsw              m2,        m4

-    packuswb            m2,        m7

-    movh [destq+strideq*2],        m2

-    add              destq,        strideq

-    add              predq,        pitq

-    movh                m4,       [predq+2*pitq]

-    punpcklbw           m4,        m7

-    paddsw              m5,        m4

-    packuswb            m5,        m7

-    movh [destq+strideq*2],        m5

-    RET

--- /dev/null

+++ b/vp9/decoder/x86/vp9_dequantize_x86.c

@@ -1,0 +1,455 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <assert.h>

+#include <emmintrin.h>  // SSE2

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vp9/common/vp9_common.h"

+#include "vp9/common/vp9_idct.h"

+#if HAVE_SSE2

+void vp9_add_residual_4x4_sse2(const int16_t *diff, const uint8_t *pred,

+                               int pitch, uint8_t *dest, int stride) {

+  const int width = 4;

+  const __m128i zero = _mm_setzero_si128();

+  // Diff data

+  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));

+  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));

+  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));

+  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));

+  // Prediction data.

+  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(pred + 0 * pitch));

+  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(pred + 1 * pitch));

+  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(pred + 2 * pitch));

+  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(pred + 3 * pitch));

+  p0 = _mm_unpacklo_epi8(p0, zero);

+  p1 = _mm_unpacklo_epi8(p1, zero);

+  p2 = _mm_unpacklo_epi8(p2, zero);

+  p3 = _mm_unpacklo_epi8(p3, zero);

+  p0 = _mm_add_epi16(p0, d0);

+  p1 = _mm_add_epi16(p1, d1);

+  p2 = _mm_add_epi16(p2, d2);

+  p3 = _mm_add_epi16(p3, d3);

+  p0 = _mm_packus_epi16(p0, p1);

+  p2 = _mm_packus_epi16(p2, p3);

+  *(int *)dest = _mm_cvtsi128_si32(p0);

+  dest += stride;

+  p0 = _mm_srli_si128(p0, 8);

+  *(int *)dest = _mm_cvtsi128_si32(p0);

+  dest += stride;

+  *(int *)dest = _mm_cvtsi128_si32(p2);

+  dest += stride;

+  p2 = _mm_srli_si128(p2, 8);

+  *(int *)dest = _mm_cvtsi128_si32(p2);

+}

+void vp9_add_residual_8x8_sse2(const int16_t *diff, const uint8_t *pred,

+                               int pitch, uint8_t *dest, int stride) {

+  const int width = 8;

+  const __m128i zero = _mm_setzero_si128();

+  // Diff data

+  const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));

+  const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));

+  const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));

+  const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));

+  const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));

+  const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));

+  const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));

+  const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));

+  // Prediction data.

+  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));

+  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));

+  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));

+  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));

+  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));

+  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));

+  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));

+  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));

+  p0 = _mm_unpacklo_epi8(p0, zero);

+  p1 = _mm_unpacklo_epi8(p1, zero);

+  p2 = _mm_unpacklo_epi8(p2, zero);

+  p3 = _mm_unpacklo_epi8(p3, zero);

+  p4 = _mm_unpacklo_epi8(p4, zero);

+  p5 = _mm_unpacklo_epi8(p5, zero);

+  p6 = _mm_unpacklo_epi8(p6, zero);

+  p7 = _mm_unpacklo_epi8(p7, zero);

+  p0 = _mm_add_epi16(p0, d0);

+  p1 = _mm_add_epi16(p1, d1);

+  p2 = _mm_add_epi16(p2, d2);

+  p3 = _mm_add_epi16(p3, d3);

+  p4 = _mm_add_epi16(p4, d4);

+  p5 = _mm_add_epi16(p5, d5);

+  p6 = _mm_add_epi16(p6, d6);

+  p7 = _mm_add_epi16(p7, d7);

+  p0 = _mm_packus_epi16(p0, p1);

+  p2 = _mm_packus_epi16(p2, p3);

+  p4 = _mm_packus_epi16(p4, p5);

+  p6 = _mm_packus_epi16(p6, p7);

+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);

+  p0 = _mm_srli_si128(p0, 8);

+  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

+  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);

+  p2 = _mm_srli_si128(p2, 8);

+  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

+  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);

+  p4 = _mm_srli_si128(p4, 8);

+  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

+  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);

+  p6 = _mm_srli_si128(p6, 8);

+  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);

+}

+void vp9_add_residual_16x16_sse2(const int16_t *diff, const uint8_t *pred,

+                             int pitch, uint8_t *dest, int stride) {

+  const int width = 16;

+  int i = 4;

+  const __m128i zero = _mm_setzero_si128();

+  // Diff data

+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;

+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;

+  do {

+    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));

+    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));

+    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));

+    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));

+    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));

+    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));

+    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));

+    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));

+    // Prediction data.

+    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));

+    p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));

+    p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));

+    p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));

+    p0 = _mm_unpacklo_epi8(p1, zero);

+    p1 = _mm_unpackhi_epi8(p1, zero);

+    p2 = _mm_unpacklo_epi8(p3, zero);

+    p3 = _mm_unpackhi_epi8(p3, zero);

+    p4 = _mm_unpacklo_epi8(p5, zero);

+    p5 = _mm_unpackhi_epi8(p5, zero);

+    p6 = _mm_unpacklo_epi8(p7, zero);

+    p7 = _mm_unpackhi_epi8(p7, zero);

+    p0 = _mm_add_epi16(p0, d0);

+    p1 = _mm_add_epi16(p1, d1);

+    p2 = _mm_add_epi16(p2, d2);

+    p3 = _mm_add_epi16(p3, d3);

+    p4 = _mm_add_epi16(p4, d4);

+    p5 = _mm_add_epi16(p5, d5);

+    p6 = _mm_add_epi16(p6, d6);

+    p7 = _mm_add_epi16(p7, d7);

+    p0 = _mm_packus_epi16(p0, p1);

+    p1 = _mm_packus_epi16(p2, p3);

+    p2 = _mm_packus_epi16(p4, p5);

+    p3 = _mm_packus_epi16(p6, p7);

+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);

+    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);

+    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);

+    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);

+    diff += 4 * width;

+    pred += 4 * pitch;

+    dest += 4 * stride;

+  } while (--i);

+}

+void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred,

+                             int pitch, uint8_t *dest, int stride) {

+  const int width = 32;

+  int i = 16;

+  const __m128i zero = _mm_setzero_si128();

+  // Diff data

+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;

+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;

+  do {

+    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));

+    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));

+    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));

+    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));

+    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));

+    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));

+    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));

+    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));

+    // Prediction data.

+    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));

+    p3 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));

+    p5 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));

+    p7 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));

+    p0 = _mm_unpacklo_epi8(p1, zero);

+    p1 = _mm_unpackhi_epi8(p1, zero);

+    p2 = _mm_unpacklo_epi8(p3, zero);

+    p3 = _mm_unpackhi_epi8(p3, zero);

+    p4 = _mm_unpacklo_epi8(p5, zero);

+    p5 = _mm_unpackhi_epi8(p5, zero);

+    p6 = _mm_unpacklo_epi8(p7, zero);

+    p7 = _mm_unpackhi_epi8(p7, zero);

+    p0 = _mm_add_epi16(p0, d0);

+    p1 = _mm_add_epi16(p1, d1);

+    p2 = _mm_add_epi16(p2, d2);

+    p3 = _mm_add_epi16(p3, d3);

+    p4 = _mm_add_epi16(p4, d4);

+    p5 = _mm_add_epi16(p5, d5);

+    p6 = _mm_add_epi16(p6, d6);

+    p7 = _mm_add_epi16(p7, d7);

+    p0 = _mm_packus_epi16(p0, p1);

+    p1 = _mm_packus_epi16(p2, p3);

+    p2 = _mm_packus_epi16(p4, p5);

+    p3 = _mm_packus_epi16(p6, p7);

+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);

+    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);

+    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);

+    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);

+    diff += 2 * width;

+    pred += 2 * pitch;

+    dest += 2 * stride;

+  } while (--i);

+}

+void vp9_add_constant_residual_8x8_sse2(const int16_t diff, const uint8_t *pred,

+                                        int pitch, uint8_t *dest, int stride) {

+  uint8_t abs_diff;

+  __m128i d;

+  // Prediction data.

+  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));

+  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));

+  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));

+  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));

+  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));

+  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));

+  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));

+  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));

+  p0 = _mm_unpacklo_epi64(p0, p1);

+  p2 = _mm_unpacklo_epi64(p2, p3);

+  p4 = _mm_unpacklo_epi64(p4, p5);

+  p6 = _mm_unpacklo_epi64(p6, p7);

+  // Clip diff value to [0, 255] range. Then, do addition or subtraction

+  // according to its sign.

+  if (diff >= 0) {

+    abs_diff = (diff > 255) ? 255 : diff;

+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

+    p0 = _mm_adds_epu8(p0, d);

+    p2 = _mm_adds_epu8(p2, d);

+    p4 = _mm_adds_epu8(p4, d);

+    p6 = _mm_adds_epu8(p6, d);

+  } else {

+    abs_diff = (diff < -255) ? 255 : -diff;

+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

+    p0 = _mm_subs_epu8(p0, d);

+    p2 = _mm_subs_epu8(p2, d);

+    p4 = _mm_subs_epu8(p4, d);

+    p6 = _mm_subs_epu8(p6, d);

+  }

+  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);

+  p0 = _mm_srli_si128(p0, 8);

+  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

+  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);

+  p2 = _mm_srli_si128(p2, 8);

+  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

+  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);

+  p4 = _mm_srli_si128(p4, 8);

+  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

+  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);

+  p6 = _mm_srli_si128(p6, 8);

+  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);

+}

+void vp9_add_constant_residual_16x16_sse2(const int16_t diff,

+                                          const uint8_t *pred, int pitch,

+                                          uint8_t *dest, int stride) {

+  uint8_t abs_diff;

+  __m128i d;

+  // Prediction data.

+  __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));

+  __m128i p1 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));

+  __m128i p2 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));

+  __m128i p3 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));

+  __m128i p4 = _mm_load_si128((const __m128i *)(pred + 4 * pitch));

+  __m128i p5 = _mm_load_si128((const __m128i *)(pred + 5 * pitch));

+  __m128i p6 = _mm_load_si128((const __m128i *)(pred + 6 * pitch));

+  __m128i p7 = _mm_load_si128((const __m128i *)(pred + 7 * pitch));

+  __m128i p8 = _mm_load_si128((const __m128i *)(pred + 8 * pitch));

+  __m128i p9 = _mm_load_si128((const __m128i *)(pred + 9 * pitch));

+  __m128i p10 = _mm_load_si128((const __m128i *)(pred + 10 * pitch));

+  __m128i p11 = _mm_load_si128((const __m128i *)(pred + 11 * pitch));

+  __m128i p12 = _mm_load_si128((const __m128i *)(pred + 12 * pitch));

+  __m128i p13 = _mm_load_si128((const __m128i *)(pred + 13 * pitch));

+  __m128i p14 = _mm_load_si128((const __m128i *)(pred + 14 * pitch));

+  __m128i p15 = _mm_load_si128((const __m128i *)(pred + 15 * pitch));

+  // Clip diff value to [0, 255] range. Then, do addition or subtraction

+  // according to its sign.

+  if (diff >= 0) {

+    abs_diff = (diff > 255) ? 255 : diff;

+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

+    p0 = _mm_adds_epu8(p0, d);

+    p1 = _mm_adds_epu8(p1, d);

+    p2 = _mm_adds_epu8(p2, d);

+    p3 = _mm_adds_epu8(p3, d);

+    p4 = _mm_adds_epu8(p4, d);

+    p5 = _mm_adds_epu8(p5, d);

+    p6 = _mm_adds_epu8(p6, d);

+    p7 = _mm_adds_epu8(p7, d);

+    p8 = _mm_adds_epu8(p8, d);

+    p9 = _mm_adds_epu8(p9, d);

+    p10 = _mm_adds_epu8(p10, d);

+    p11 = _mm_adds_epu8(p11, d);

+    p12 = _mm_adds_epu8(p12, d);

+    p13 = _mm_adds_epu8(p13, d);

+    p14 = _mm_adds_epu8(p14, d);

+    p15 = _mm_adds_epu8(p15, d);

+  } else {

+    abs_diff = (diff < -255) ? 255 : -diff;

+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

+    p0 = _mm_subs_epu8(p0, d);

+    p1 = _mm_subs_epu8(p1, d);

+    p2 = _mm_subs_epu8(p2, d);

+    p3 = _mm_subs_epu8(p3, d);

+    p4 = _mm_subs_epu8(p4, d);

+    p5 = _mm_subs_epu8(p5, d);

+    p6 = _mm_subs_epu8(p6, d);

+    p7 = _mm_subs_epu8(p7, d);

+    p8 = _mm_subs_epu8(p8, d);

+    p9 = _mm_subs_epu8(p9, d);

+    p10 = _mm_subs_epu8(p10, d);

+    p11 = _mm_subs_epu8(p11, d);

+    p12 = _mm_subs_epu8(p12, d);

+    p13 = _mm_subs_epu8(p13, d);

+    p14 = _mm_subs_epu8(p14, d);

+    p15 = _mm_subs_epu8(p15, d);

+  }

+  // Store results

+  _mm_store_si128((__m128i *)(dest + 0 * stride), p0);

+  _mm_store_si128((__m128i *)(dest + 1 * stride), p1);

+  _mm_store_si128((__m128i *)(dest + 2 * stride), p2);

+  _mm_store_si128((__m128i *)(dest + 3 * stride), p3);

+  _mm_store_si128((__m128i *)(dest + 4 * stride), p4);

+  _mm_store_si128((__m128i *)(dest + 5 * stride), p5);

+  _mm_store_si128((__m128i *)(dest + 6 * stride), p6);

+  _mm_store_si128((__m128i *)(dest + 7 * stride), p7);

+  _mm_store_si128((__m128i *)(dest + 8 * stride), p8);

+  _mm_store_si128((__m128i *)(dest + 9 * stride), p9);

+  _mm_store_si128((__m128i *)(dest + 10 * stride), p10);

+  _mm_store_si128((__m128i *)(dest + 11 * stride), p11);

+  _mm_store_si128((__m128i *)(dest + 12 * stride), p12);

+  _mm_store_si128((__m128i *)(dest + 13 * stride), p13);

+  _mm_store_si128((__m128i *)(dest + 14 * stride), p14);

+  _mm_store_si128((__m128i *)(dest + 15 * stride), p15);

+}

+void vp9_add_constant_residual_32x32_sse2(const int16_t diff,

+                                          const uint8_t *pred, int pitch,

+                                          uint8_t *dest, int stride) {

+  uint8_t abs_diff;

+  __m128i d;

+  int i = 8;

+  if (diff >= 0) {

+    abs_diff = (diff > 255) ? 255 : diff;

+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

+  } else {

+    abs_diff = (diff < -255) ? 255 : -diff;

+    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

+  }

+  do {

+    // Prediction data.

+    __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));

+    __m128i p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));

+    __m128i p2 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));

+    __m128i p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));

+    __m128i p4 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));

+    __m128i p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch + 16));

+    __m128i p6 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));

+    __m128i p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch + 16));

+    // Clip diff value to [0, 255] range. Then, do addition or subtraction

+    // according to its sign.

+    if (diff >= 0) {

+      p0 = _mm_adds_epu8(p0, d);

+      p1 = _mm_adds_epu8(p1, d);

+      p2 = _mm_adds_epu8(p2, d);

+      p3 = _mm_adds_epu8(p3, d);

+      p4 = _mm_adds_epu8(p4, d);

+      p5 = _mm_adds_epu8(p5, d);

+      p6 = _mm_adds_epu8(p6, d);

+      p7 = _mm_adds_epu8(p7, d);

+    } else {

+      p0 = _mm_subs_epu8(p0, d);

+      p1 = _mm_subs_epu8(p1, d);

+      p2 = _mm_subs_epu8(p2, d);

+      p3 = _mm_subs_epu8(p3, d);

+      p4 = _mm_subs_epu8(p4, d);

+      p5 = _mm_subs_epu8(p5, d);

+      p6 = _mm_subs_epu8(p6, d);

+      p7 = _mm_subs_epu8(p7, d);

+    }

+    // Store results

+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);

+    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);

+    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);

+    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);

+    _mm_store_si128((__m128i *)(dest + 2 * stride), p4);

+    _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);

+    _mm_store_si128((__m128i *)(dest + 3 * stride), p6);

+    _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);

+    pred += 4 * pitch;

+    dest += 4 * stride;

+  } while (--i);

+}

+#endif

--- a/vp9/decoder/x86/vp9_idct_blk_mmx.c

+++ /dev/null

@@ -1,145 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "./vpx_config.h"

-#include "vp9/common/vp9_blockd.h"

-#include "vp9/decoder/vp9_dequantize.h"

-#include "vp9/decoder/x86/vp9_idct_mmx.h"

-void vp9_dequant_dc_idct_add_y_block_mmx(short *q, const short *dq,

-                                         unsigned char *pre,

-                                         unsigned char *dst,

-                                         int stride, unsigned short *eobs,

-                                         const short *dc) {

-  int i;

-  for (i = 0; i < 4; i++) {

-    if (eobs[0] > 1)

-      vp9_dequant_dc_idct_add_mmx(q, dq, pre, dst, 16, stride, dc[0]);

-    else

-      vp9_dc_only_idct_add_mmx(dc[0], pre, dst, 16, stride);

-    if (eobs[1] > 1)

-      vp9_dequant_dc_idct_add_mmx(q + 16, dq, pre + 4,

-                                  dst + 4, 16, stride, dc[1]);

-    else

-      vp9_dc_only_idct_add_mmx(dc[1], pre + 4, dst + 4, 16, stride);

-    if (eobs[2] > 1)

-      vp9_dequant_dc_idct_add_mmx(q + 32, dq, pre + 8,

-                                  dst + 8, 16, stride, dc[2]);

-    else

-      vp9_dc_only_idct_add_mmx(dc[2], pre + 8, dst + 8, 16, stride);

-    if (eobs[3] > 1)

-      vp9_dequant_dc_idct_add_mmx(q + 48, dq, pre + 12,

-                                  dst + 12, 16, stride, dc[3]);

-    else

-      vp9_dc_only_idct_add_mmx(dc[3], pre + 12, dst + 12, 16, stride);

-    q    += 64;

-    dc   += 4;

-    pre  += 64;

-    dst  += 4 * stride;

-    eobs += 4;

-  }

-}

-void vp9_dequant_idct_add_y_block_mmx(short *q, const short *dq,

-                                      unsigned char *pre,

-                                      unsigned char *dst,

-                                      int stride, unsigned short *eobs) {

-  int i;

-  for (i = 0; i < 4; i++) {

-    if (eobs[0] > 1)

-      vp9_dequant_idct_add_mmx(q, dq, pre, dst, 16, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dst, 16, stride);

-      ((int *)q)[0] = 0;

-    }

-    if (eobs[1] > 1)

-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dst + 4, 16, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dst + 4, 16, stride);

-      ((int *)(q + 16))[0] = 0;

-    }

-    if (eobs[2] > 1)

-      vp9_dequant_idct_add_mmx(q + 32, dq, pre + 8, dst + 8, 16, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[32]*dq[0], pre + 8, dst + 8, 16, stride);

-      ((int *)(q + 32))[0] = 0;

-    }

-    if (eobs[3] > 1)

-      vp9_dequant_idct_add_mmx(q + 48, dq, pre + 12, dst + 12, 16, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[48]*dq[0], pre + 12, dst + 12, 16, stride);

-      ((int *)(q + 48))[0] = 0;

-    }

-    q    += 64;

-    pre  += 64;

-    dst  += 4 * stride;

-    eobs += 4;

-  }

-}

-void vp9_dequant_idct_add_uv_block_mmx(short *q, const short *dq,

-                                       unsigned char *pre,

-                                       unsigned char *dstu,

-                                       unsigned char *dstv,

-                                       int stride, unsigned short *eobs) {

-  int i;

-  for (i = 0; i < 2; i++) {

-    if (eobs[0] > 1)

-      vp9_dequant_idct_add_mmx(q, dq, pre, dstu, 8, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstu, 8, stride);

-      ((int *)q)[0] = 0;

-    }

-    if (eobs[1] > 1)

-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstu + 4, 8, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstu + 4, 8, stride);

-      ((int *)(q + 16))[0] = 0;

-    }

-    q    += 32;

-    pre  += 32;

-    dstu += 4 * stride;

-    eobs += 2;

-  }

-  for (i = 0; i < 2; i++) {

-    if (eobs[0] > 1)

-      vp9_dequant_idct_add_mmx(q, dq, pre, dstv, 8, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[0]*dq[0], pre, dstv, 8, stride);

-      ((int *)q)[0] = 0;

-    }

-    if (eobs[1] > 1)

-      vp9_dequant_idct_add_mmx(q + 16, dq, pre + 4, dstv + 4, 8, stride);

-    else {

-      vp9_dc_only_idct_add_mmx(q[16]*dq[0], pre + 4, dstv + 4, 8, stride);

-      ((int *)(q + 16))[0] = 0;

-    }

-    q    += 32;

-    pre  += 32;

-    dstv += 4 * stride;

-    eobs += 2;

-  }

-}

--- a/vp9/decoder/x86/vp9_idct_mmx.h

+++ b/vp9/decoder/x86/vp9_idct_mmx.h

@@ -16,9 +16,6 @@

                                  unsigned char *pred, unsigned char *dest,

                                  int pitch, int stride, int Dc);

-void vp9_dc_only_idct_add_mmx(short input_dc, const unsigned char *pred_ptr,

-                              unsigned char *dst_ptr, int pitch, int stride);

 void vp9_dequant_idct_add_mmx(short *input, const short *dq, unsigned char *pred,

                               unsigned char *dest, int pitch, int stride);

--- a/vp9/decoder/x86/vp9_x86_dsystemdependent.c

+++ /dev/null

@@ -1,26 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include "./vpx_config.h"

-#include "vpx_ports/x86.h"

-#include "vp9/decoder/vp9_onyxd_int.h"

-#if HAVE_MMX

-void vp9_dequantize_b_impl_mmx(short *sq, short *dq, short *q);

-void vp9_dequantize_b_mmx(BLOCKD *d) {

-  short *sq = (short *) d->qcoeff;

-  short *dq = (short *) d->dqcoeff;

-  short *q = (short *) d->dequant;

-  vp9_dequantize_b_impl_mmx(sq, dq, q);

-}

-#endif

--- a/vp9/encoder/vp9_asm_enc_offsets.c

+++ b/vp9/encoder/vp9_asm_enc_offsets.c

@@ -32,7 +32,6 @@

 DEFINE(vp9_blockd_qcoeff,                       offsetof(BLOCKD, qcoeff));

 DEFINE(vp9_blockd_dequant,                      offsetof(BLOCKD, dequant));

 DEFINE(vp9_blockd_dqcoeff,                      offsetof(BLOCKD, dqcoeff));

-DEFINE(vp9_blockd_eob,                          offsetof(BLOCKD, eob));

END

--- a/vp9/encoder/vp9_bitstream.c

+++ b/vp9/encoder/vp9_bitstream.c

@@ -14,6 +14,7 @@

 #include "vp9/common/vp9_entropymode.h"

 #include "vp9/common/vp9_entropymv.h"

 #include "vp9/common/vp9_findnearmv.h"

+#include "vp9/common/vp9_tile_common.h"

 #include "vp9/encoder/vp9_mcomp.h"

 #include "vp9/common/vp9_systemdependent.h"

 #include <assert.h>

@@ -41,17 +42,32 @@

 int intra_mode_stats[VP9_KF_BINTRAMODES]

                     [VP9_KF_BINTRAMODES]

                     [VP9_KF_BINTRAMODES];

-vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4];

-vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4];

-vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8];

-vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8];

-vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16];

-vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16];

-vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];

+vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];

+vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];

+vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];

+vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];

 extern unsigned int active_section;

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+#ifdef NZC_STATS

+unsigned int nzc_stats_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                          [NZC4X4_TOKENS];

+unsigned int nzc_stats_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                          [NZC8X8_TOKENS];

+unsigned int nzc_stats_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                          [NZC16X16_TOKENS];

+unsigned int nzc_stats_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                          [NZC32X32_TOKENS];

+unsigned int nzc_pcat_stats[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]

+                          [NZC_BITS_EXTRA][2];

+void init_nzcstats();

+void update_nzcstats(VP9_COMMON *const cm);

+void print_nzcstats();

+#endif

+#endif

 #ifdef MODE_STATS

 int count_mb_seg[4] = { 0, 0, 0, 0 };

 #endif

@@ -112,8 +128,8 @@

   unsigned int new_b = 0, old_b = 0;

   int i = 0;

-  vp9_tree_probs_from_distribution(n--, tok, tree,

-                                   Pnew, bct, num_events);

+  vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);

+  n--;

   do {

     new_b += cost_branch(bct[i], Pnew[i]);

@@ -169,10 +185,9 @@

   int i, j;

   for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) {

     vp9_tree_probs_from_distribution(

-        VP9_SWITCHABLE_FILTERS,

-        vp9_switchable_interp_encodings, vp9_switchable_interp_tree,

+        vp9_switchable_interp_tree,

         pc->fc.switchable_interp_prob[j], branch_ct,

-        cpi->switchable_interp_count[j]);

+        cpi->switchable_interp_count[j], 0);

     for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {

       if (pc->fc.switchable_interp_prob[j][i] < 1)

         pc->fc.switchable_interp_prob[j][i] = 1;

@@ -189,15 +204,7 @@

   int old_cost, new_cost;

   // Set the prediction probability structures to defaults

-  if (cm->frame_type == KEY_FRAME) {

-    // Set the prediction probabilities to defaults

-    cm->ref_pred_probs[0] = 120;

-    cm->ref_pred_probs[1] = 80;

-    cm->ref_pred_probs[2] = 40;

-    vpx_memset(cpi->ref_pred_probs_update, 0,

-               sizeof(cpi->ref_pred_probs_update));

-  } else {

+  if (cm->frame_type != KEY_FRAME) {

     // From the prediction counts set the probabilities for each context

     for (i = 0; i < PREDICTION_PROBS; i++) {

       new_pred_probs[i] = get_binary_prob(cpi->ref_pred_count[i][0],

@@ -219,7 +226,6 @@

         cm->ref_pred_probs[i] = new_pred_probs[i];

       } else

         cpi->ref_pred_probs_update[i] = 0;

@@ -230,8 +236,8 @@

//

 // The branch counts table is re-populated during the actual pack stage and in

 // the decoder to facilitate backwards update of the context.

-static void update_mode_probs(VP9_COMMON *cm,

-                              int mode_context[INTER_MODE_CONTEXTS][4]) {

+static void update_inter_mode_probs(VP9_COMMON *cm,

+                                    int mode_context[INTER_MODE_CONTEXTS][4]) {

   int i, j;

   unsigned int (*mv_ref_ct)[4][2];

@@ -393,6 +399,43 @@

   return bestsavings;

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE

+static int prob_diff_update_savings_search_model(const unsigned int *ct,

+                                                 const vp9_prob *oldp,

+                                                 vp9_prob *bestp,

+                                                 const vp9_prob upd,

+                                                 int b, int r) {

+  int i, old_b, new_b, update_b, savings, bestsavings, step;

+  int newp;

+  vp9_prob bestnewp, newplist[ENTROPY_NODES];

+  for (i = UNCONSTRAINED_NODES - 1, old_b = 0; i < ENTROPY_NODES; ++i)

+    old_b += cost_branch256(ct + 2 * i, oldp[i]);

+  bestsavings = 0;

+  bestnewp = oldp[UNCONSTRAINED_NODES - 1];

+  step = (*bestp > oldp[UNCONSTRAINED_NODES - 1] ? -1 : 1);

+  newp = *bestp;

+  // newp = *bestp - step * (abs(*bestp - oldp[UNCONSTRAINED_NODES - 1]) >> 1);

+  for (; newp != oldp[UNCONSTRAINED_NODES - 1]; newp += step) {

+    if (newp < 1 || newp > 255) continue;

+    newplist[UNCONSTRAINED_NODES - 1] = newp;

+    vp9_get_model_distribution(newp, newplist, b, r);

+    for (i = UNCONSTRAINED_NODES - 1, new_b = 0; i < ENTROPY_NODES; ++i)

+      new_b += cost_branch256(ct + 2 * i, newplist[i]);

+    update_b = prob_diff_update_cost(newp, oldp[UNCONSTRAINED_NODES - 1]) +

+        vp9_cost_upd256;

+    savings = old_b - new_b - update_b;

+    if (savings > bestsavings) {

+      bestsavings = savings;

+      bestnewp = newp;

+    }

+  }

+  *bestp = bestnewp;

+  return bestsavings;

+}

+#endif

 static void vp9_cond_prob_update(vp9_writer *bc, vp9_prob *oldp, vp9_prob upd,

                                  unsigned int *ct) {

   vp9_prob newp;

@@ -508,7 +551,8 @@

               vp9_sub_mv_ref_encoding_array - LEFT4X4 + m);

-static void write_nmv(vp9_writer *bc, const MV *mv, const int_mv *ref,

+static void write_nmv(VP9_COMP *cpi, vp9_writer *bc,

+                      const MV *mv, const int_mv *ref,

                       const nmv_context *nmvc, int usehp) {

   MV e;

   e.row = mv->row - ref->as_mv.row;

@@ -585,6 +629,28 @@

+static void write_mb_segid_except(VP9_COMMON *cm,

+                                  vp9_writer *bc,

+                                  const MB_MODE_INFO *mi,

+                                  const MACROBLOCKD *xd,

+                                  int mb_row, int mb_col) {

+  // Encode the MB segment id.

+  int seg_id = mi->segment_id;

+  int pred_seg_id = vp9_get_pred_mb_segid(cm, xd,

+                                          mb_row * cm->mb_cols + mb_col);

+  const vp9_prob *p = xd->mb_segment_tree_probs;

+  const vp9_prob p1 = xd->mb_segment_mispred_tree_probs[pred_seg_id];

+  if (xd->segmentation_enabled && xd->update_mb_segmentation_map) {

+    vp9_write(bc, seg_id >= 2, p1);

+    if (pred_seg_id >= 2 && seg_id < 2) {

+      vp9_write(bc, seg_id == 1, p[1]);

+    } else if (pred_seg_id < 2 && seg_id >= 2) {

+      vp9_write(bc, seg_id == 3, p[2]);

+    }

+  }

+}

 // This function encodes the reference frame

 static void encode_ref_frame(vp9_writer *const bc,

                              VP9_COMMON *const cm,

@@ -708,11 +774,10 @@

   // Distance of Mb to the various image edges.

   // These specified to 8th pel as they are always compared to MV

   // values that are in 1/8th pel units

-  xd->mb_to_left_edge = -((mb_col * 16) << 3);

-  xd->mb_to_top_edge = -((mb_row * 16)) << 3;

-  xd->mb_to_right_edge = ((pc->mb_cols - mb_size - mb_col) * 16) << 3;

-  xd->mb_to_bottom_edge = ((pc->mb_rows - mb_size - mb_row) * 16) << 3;

+  set_mb_row(pc, xd, mb_row, mb_size);

+  set_mb_col(pc, xd, mb_col, mb_size);

 #ifdef ENTROPY_STATS

   active_section = 9;

 #endif

@@ -728,7 +793,7 @@

       // If the mb segment id wasn't predicted code explicitly

       if (!prediction_flag)

-        write_mb_segid(bc, mi, &cpi->mb.e_mbd);

+        write_mb_segid_except(pc, bc, mi, &cpi->mb.e_mbd, mb_row, mb_col);

     } else {

       // Normal unpredicted coding

       write_mb_segid(bc, mi, &cpi->mb.e_mbd);

@@ -737,33 +802,16 @@

   if (!pc->mb_no_coeff_skip) {

     skip_coeff = 0;

-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {

+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {

     skip_coeff = 1;

   } else {

-    const int nmbs = mb_size;

-    const int xmbs = MIN(nmbs, mb_cols_left);

-    const int ymbs = MIN(nmbs, mb_rows_left);

-    int x, y;

-    skip_coeff = 1;

-    for (y = 0; y < ymbs; y++) {

-      for (x = 0; x < xmbs; x++) {

-        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;

-      }

-    }

+    skip_coeff = m->mbmi.mb_skip_coeff;

     vp9_write(bc, skip_coeff,

               vp9_get_pred_prob(pc, xd, PRED_MBSKIP));

   // Encode the reference frame.

-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)

-      || vp9_get_segdata(xd, segment_id, SEG_LVL_MODE) >= NEARESTMV) {

-    encode_ref_frame(bc, pc, xd, segment_id, rf);

-  } else {

-    assert(rf == INTRA_FRAME);

-  }

+  encode_ref_frame(bc, pc, xd, segment_id, rf);

   if (rf == INTRA_FRAME) {

 #ifdef ENTROPY_STATS

@@ -770,12 +818,11 @@

     active_section = 6;

 #endif

-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {

-      if (m->mbmi.sb_type)

-        write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);

-      else

-        write_ymode(bc, mode, pc->fc.ymode_prob);

-    }

+    if (m->mbmi.sb_type)

+      write_sb_ymode(bc, mode, pc->fc.sb_ymode_prob);

+    else

+      write_ymode(bc, mode, pc->fc.ymode_prob);

     if (mode == B_PRED) {

       int j = 0;

       do {

@@ -801,14 +848,12 @@

     vp9_mv_ref_probs(&cpi->common, mv_ref_p, mi->mb_mode_context[rf]);

-    // #ifdef ENTROPY_STATS

 #ifdef ENTROPY_STATS

-    accum_mv_refs(mode, ct);

     active_section = 3;

 #endif

-    // Is the segment coding of mode enabled

-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {

+    // If segment skip is not enabled code the mode.

+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {

       if (mi->sb_type) {

         write_sb_mv_ref(bc, mode, mv_ref_p);

       } else {

@@ -878,12 +923,12 @@

 #ifdef ENTROPY_STATS

         active_section = 5;

 #endif

-        write_nmv(bc, &mi->mv[0].as_mv, &mi->best_mv,

+        write_nmv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv,

                   (const nmv_context*) nmvc,

                   xd->allow_high_precision_mv);

         if (mi->second_ref_frame > 0) {

-          write_nmv(bc, &mi->mv[1].as_mv, &mi->best_second_mv,

+          write_nmv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv,

                     (const nmv_context*) nmvc,

                     xd->allow_high_precision_mv);

@@ -915,7 +960,7 @@

 #else

           while (j != L[++k]);

 #endif

-          leftmv.as_int = left_block_mv(m, k);

+          leftmv.as_int = left_block_mv(xd, m, k);

           abovemv.as_int = above_block_mv(m, k, mis);

           mv_contz = vp9_mv_cont(&leftmv, &abovemv);

@@ -926,12 +971,12 @@

 #ifdef ENTROPY_STATS

             active_section = 11;

 #endif

-            write_nmv(bc, &blockmv.as_mv, &mi->best_mv,

+            write_nmv(cpi, bc, &blockmv.as_mv, &mi->best_mv,

                       (const nmv_context*) nmvc,

                       xd->allow_high_precision_mv);

             if (mi->second_ref_frame > 0) {

-              write_nmv(bc,

+              write_nmv(cpi, bc,

                         &cpi->mb.partition_info->bmi[j].second_mv.as_mv,

                         &mi->best_second_mv,

                         (const nmv_context*) nmvc,

@@ -951,8 +996,7 @@

                                mi->partitioning == PARTITIONING_4X4))) &&

       pc->txfm_mode == TX_MODE_SELECT &&

       !((pc->mb_no_coeff_skip && skip_coeff) ||

-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {

+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {

     TX_SIZE sz = mi->txfm_size;

     // FIXME(rbultje) code ternary symbol once all experiments are merged

     vp9_write(bc, sz != TX_4X4, pc->prob_tx[0]);

@@ -965,7 +1009,7 @@

 static void write_mb_modes_kf(const VP9_COMP *cpi,

-                              const MODE_INFO *m,

+                              MODE_INFO *m,

                               vp9_writer *bc,

                               int mb_rows_left, int mb_cols_left) {

   const VP9_COMMON *const c = &cpi->common;

@@ -981,22 +1025,10 @@

   if (!c->mb_no_coeff_skip) {

     skip_coeff = 0;

-  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-             vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) {

+  } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {

     skip_coeff = 1;

   } else {

-    const int nmbs = 1 << m->mbmi.sb_type;

-    const int xmbs = MIN(nmbs, mb_cols_left);

-    const int ymbs = MIN(nmbs, mb_rows_left);

-    int x, y;

-    skip_coeff = 1;

-    for (y = 0; y < ymbs; y++) {

-      for (x = 0; x < xmbs; x++) {

-        skip_coeff = skip_coeff && m[y * mis + x].mbmi.mb_skip_coeff;

-      }

-    }

+    skip_coeff = m->mbmi.mb_skip_coeff;

     vp9_write(bc, skip_coeff,

               vp9_get_pred_prob(c, xd, PRED_MBSKIP));

@@ -1013,7 +1045,8 @@

     int i = 0;

     do {

       const B_PREDICTION_MODE A = above_block_mode(m, i, mis);

-      const B_PREDICTION_MODE L = left_block_mode(m, i);

+      const B_PREDICTION_MODE L = (xd->left_available || (i & 3)) ?

+                                  left_block_mode(m, i) : B_DC_PRED;

       const int bm = m->bmi[i].as_mode.first;

 #ifdef ENTROPY_STATS

@@ -1041,8 +1074,7 @@

   if (ym <= I8X8_PRED && c->txfm_mode == TX_MODE_SELECT &&

       !((c->mb_no_coeff_skip && skip_coeff) ||

-        (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-         vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {

+        (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {

     TX_SIZE sz = m->mbmi.txfm_size;

     // FIXME(rbultje) code ternary symbol once all experiments are merged

     vp9_write(bc, sz != TX_4X4, c->prob_tx[0]);

@@ -1054,45 +1086,609 @@

+#if CONFIG_CODE_NONZEROCOUNT

+static void write_nzc(VP9_COMMON *const cm,

+                      uint16_t nzc,

+                      int nzc_context,

+                      TX_SIZE tx_size,

+                      int ref,

+                      int type,

+                      vp9_writer* const bc) {

+  int c, e;

+  c = codenzc(nzc);

+  if (tx_size == TX_32X32) {

+    write_token(bc, vp9_nzc32x32_tree,

+                cm->fc.nzc_probs_32x32[nzc_context][ref][type],

+                vp9_nzc32x32_encodings + c);

+    // cm->fc.nzc_counts_32x32[nzc_context][ref][type][c]++;

+  } else if (tx_size == TX_16X16) {

+    write_token(bc, vp9_nzc16x16_tree,

+                cm->fc.nzc_probs_16x16[nzc_context][ref][type],

+                vp9_nzc16x16_encodings + c);

+    // cm->fc.nzc_counts_16x16[nzc_context][ref][type][c]++;

+  } else if (tx_size == TX_8X8) {

+    write_token(bc, vp9_nzc8x8_tree,

+                cm->fc.nzc_probs_8x8[nzc_context][ref][type],

+                vp9_nzc8x8_encodings + c);

+    // cm->fc.nzc_counts_8x8[nzc_context][ref][type][c]++;

+  } else if (tx_size == TX_4X4) {

+    write_token(bc, vp9_nzc4x4_tree,

+                cm->fc.nzc_probs_4x4[nzc_context][ref][type],

+                vp9_nzc4x4_encodings + c);

+    // cm->fc.nzc_counts_4x4[nzc_context][ref][type][c]++;

+  } else {

+    assert(0);

+  }

+  if ((e = vp9_extranzcbits[c])) {

+    int x = nzc - vp9_basenzcvalue[c];

+    while (e--) {

+      int b = (x >> e) & 1;

+      vp9_write(bc, b,

+                cm->fc.nzc_pcat_probs[nzc_context][c - NZC_TOKENS_NOEXTRA][e]);

+      // cm->fc.nzc_pcat_counts[nzc_context][c - NZC_TOKENS_NOEXTRA][e][b]++;

+    }

+  }

+}

+static void write_nzcs_sb64(VP9_COMP *cpi,

+                            MACROBLOCKD *xd,

+                            int mb_row,

+                            int mb_col,

+                            vp9_writer* const bc) {

+  VP9_COMMON *const cm = &cpi->common;

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_32X32:

+      for (j = 0; j < 256; j += 64) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);

+      }

+      for (j = 256; j < 384; j += 64) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 1, bc);

+      }

+      break;

+    case TX_16X16:

+      for (j = 0; j < 256; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);

+      }

+      for (j = 256; j < 384; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 256; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);

+      }

+      for (j = 256; j < 384; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 256; ++j) {

+        nzc_context = vp9_get_nzc_context_y_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);

+      }

+      for (j = 256; j < 384; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_sb64(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);

+      }

+      break;

+    default:

+      break;

+  }

+}

+static void write_nzcs_sb32(VP9_COMP *cpi,

+                            MACROBLOCKD *xd,

+                            int mb_row,

+                            int mb_col,

+                            vp9_writer* const bc) {

+  VP9_COMMON *const cm = &cpi->common;

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_32X32:

+      for (j = 0; j < 64; j += 64) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_32X32, ref, 0, bc);

+      }

+      for (j = 64; j < 96; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);

+      }

+      break;

+    case TX_16X16:

+      for (j = 0; j < 64; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);

+      }

+      for (j = 64; j < 96; j += 16) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 1, bc);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 64; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);

+      }

+      for (j = 64; j < 96; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 64; ++j) {

+        nzc_context = vp9_get_nzc_context_y_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);

+      }

+      for (j = 64; j < 96; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_sb32(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);

+      }

+      break;

+    default:

+      break;

+  }

+}

+static void write_nzcs_mb16(VP9_COMP *cpi,

+                            MACROBLOCKD *xd,

+                            int mb_row,

+                            int mb_col,

+                            vp9_writer* const bc) {

+  VP9_COMMON *const cm = &cpi->common;

+  MODE_INFO *m = xd->mode_info_context;

+  MB_MODE_INFO *const mi = &m->mbmi;

+  int j, nzc_context;

+  const int ref = m->mbmi.ref_frame != INTRA_FRAME;

+  assert(mb_col == get_mb_col(xd));

+  assert(mb_row == get_mb_row(xd));

+  if (mi->mb_skip_coeff)

+    return;

+  switch (mi->txfm_size) {

+    case TX_16X16:

+      for (j = 0; j < 16; j += 16) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_16X16, ref, 0, bc);

+      }

+      for (j = 16; j < 24; j += 4) {

+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);

+      }

+      break;

+    case TX_8X8:

+      for (j = 0; j < 16; j += 4) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 0, bc);

+      }

+      if (mi->mode == I8X8_PRED || mi->mode == SPLITMV) {

+        for (j = 16; j < 24; ++j) {

+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);

+        }

+      } else {

+        for (j = 16; j < 24; j += 4) {

+          nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+          write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_8X8, ref, 1, bc);

+        }

+      }

+      break;

+    case TX_4X4:

+      for (j = 0; j < 16; ++j) {

+        nzc_context = vp9_get_nzc_context_y_mb16(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 0, bc);

+      }

+      for (j = 16; j < 24; ++j) {

+        nzc_context = vp9_get_nzc_context_uv_mb16(cm, m, mb_row, mb_col, j);

+        write_nzc(cm, m->mbmi.nzcs[j], nzc_context, TX_4X4, ref, 1, bc);

+      }

+      break;

+    default:

+      break;

+  }

+}

+#ifdef NZC_STATS

+void init_nzcstats() {

+  vp9_zero(nzc_stats_4x4);

+  vp9_zero(nzc_stats_8x8);

+  vp9_zero(nzc_stats_16x16);

+  vp9_zero(nzc_stats_32x32);

+  vp9_zero(nzc_pcat_stats);

+}

+void update_nzcstats(VP9_COMMON *const cm) {

+  int c, r, b, t;

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        for (t = 0; t < NZC4X4_TOKENS; ++t) {

+          nzc_stats_4x4[c][r][b][t] += cm->fc.nzc_counts_4x4[c][r][b][t];

+        }

+      }

+    }

+  }

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        for (t = 0; t < NZC8X8_TOKENS; ++t) {

+          nzc_stats_8x8[c][r][b][t] += cm->fc.nzc_counts_8x8[c][r][b][t];

+        }

+      }

+    }

+  }

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        for (t = 0; t < NZC16X16_TOKENS; ++t) {

+          nzc_stats_16x16[c][r][b][t] += cm->fc.nzc_counts_16x16[c][r][b][t];

+        }

+      }

+    }

+  }

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        for (t = 0; t < NZC32X32_TOKENS; ++t) {

+          nzc_stats_32x32[c][r][b][t] += cm->fc.nzc_counts_32x32[c][r][b][t];

+        }

+      }

+    }

+  }

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {

+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];

+      for (b = 0; b < bits; ++b) {

+        nzc_pcat_stats[c][t][b][0] += cm->fc.nzc_pcat_counts[c][t][b][0];

+        nzc_pcat_stats[c][t][b][1] += cm->fc.nzc_pcat_counts[c][t][b][1];

+      }

+    }

+  }

+}

+void print_nzcstats() {

+  int c, r, b, t;

+  FILE *f;

+  printf(

+    "static const unsigned int default_nzc_counts_4x4[MAX_NZC_CONTEXTS]\n"

+    "                                                [REF_TYPES]\n"

+    "                                                [BLOCK_TYPES]\n"

+    "                                                [NZC4X4_TOKENS] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        printf("      {");

+        for (t = 0; t < NZC4X4_TOKENS; ++t) {

+          printf(" %-3d,", nzc_stats_4x4[c][r][b][t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const unsigned int default_nzc_counts_8x8[MAX_NZC_CONTEXTS]\n"

+    "                                                [REF_TYPES]\n"

+    "                                                [BLOCK_TYPES]\n"

+    "                                                [NZC8X8_TOKENS] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        printf("      {");

+        for (t = 0; t < NZC8X8_TOKENS; ++t) {

+          printf(" %-3d,", nzc_stats_8x8[c][r][b][t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const unsigned int default_nzc_counts_16x16[MAX_NZC_CONTEXTS]\n"

+    "                                                  [REF_TYPES]\n"

+    "                                                  [BLOCK_TYPES]\n"

+    "                                                  [NZC16X16_TOKENS] = {"

+    "\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        printf("      {");

+        for (t = 0; t < NZC16X16_TOKENS; ++t) {

+          printf(" %-3d,", nzc_stats_16x16[c][r][b][t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const unsigned int default_nzc_counts_32x32[MAX_NZC_CONTEXTS]\n"

+    "                                                  [REF_TYPES]\n"

+    "                                                  [BLOCK_TYPES]\n"

+    "                                                  [NZC32X32_TOKENS] = {"

+    "\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        printf("      {");

+        for (t = 0; t < NZC32X32_TOKENS; ++t) {

+          printf(" %-3d,", nzc_stats_32x32[c][r][b][t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const vp9_prob default_nzc_pcat_counts[MAX_NZC_CONTEXTS]\n"

+    "                                             [NZC_TOKENS_EXTRA]\n"

+    "                                             [NZC_BITS_EXTRA] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {

+      printf("    {");

+      for (b = 0; b < NZC_BITS_EXTRA; ++b) {

+        printf(" %d/%d,",

+               nzc_pcat_stats[c][t][b][0], nzc_pcat_stats[c][t][b][1]);

+      }

+      printf(" },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const vp9_prob default_nzc_probs_4x4[MAX_NZC_CONTEXTS]\n"

+    "                                           [REF_TYPES]\n"

+    "                                           [BLOCK_TYPES]\n"

+    "                                           [NZC4X4_TOKENS] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        vp9_prob probs[NZC4X4_NODES];

+        unsigned int branch_ct[NZC4X4_NODES][2];

+        vp9_tree_probs_from_distribution(vp9_nzc4x4_tree,

+                                         probs, branch_ct,

+                                         nzc_stats_4x4[c][r][b], 0);

+        printf("      {");

+        for (t = 0; t < NZC4X4_NODES; ++t) {

+          printf(" %-3d,", probs[t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const vp9_prob default_nzc_probs_8x8[MAX_NZC_CONTEXTS]\n"

+    "                                           [REF_TYPES]\n"

+    "                                           [BLOCK_TYPES]\n"

+    "                                           [NZC8X8_TOKENS] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        vp9_prob probs[NZC8X8_NODES];

+        unsigned int branch_ct[NZC8X8_NODES][2];

+        vp9_tree_probs_from_distribution(vp9_nzc8x8_tree,

+                                         probs, branch_ct,

+                                         nzc_stats_8x8[c][r][b], 0);

+        printf("      {");

+        for (t = 0; t < NZC8X8_NODES; ++t) {

+          printf(" %-3d,", probs[t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const vp9_prob default_nzc_probs_16x16[MAX_NZC_CONTEXTS]\n"

+    "                                             [REF_TYPES]\n"

+    "                                             [BLOCK_TYPES]\n"

+    "                                             [NZC16X16_TOKENS] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        vp9_prob probs[NZC16X16_NODES];

+        unsigned int branch_ct[NZC16X16_NODES][2];

+        vp9_tree_probs_from_distribution(vp9_nzc16x16_tree,

+                                         probs, branch_ct,

+                                         nzc_stats_16x16[c][r][b], 0);

+        printf("      {");

+        for (t = 0; t < NZC16X16_NODES; ++t) {

+          printf(" %-3d,", probs[t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const vp9_prob default_nzc_probs_32x32[MAX_NZC_CONTEXTS]\n"

+    "                                             [REF_TYPES]\n"

+    "                                             [BLOCK_TYPES]\n"

+    "                                             [NZC32X32_TOKENS] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (r = 0; r < REF_TYPES; ++r) {

+      printf("    {\n");

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        vp9_prob probs[NZC32X32_NODES];

+        unsigned int branch_ct[NZC32X32_NODES][2];

+        vp9_tree_probs_from_distribution(vp9_nzc32x32_tree,

+                                         probs, branch_ct,

+                                         nzc_stats_32x32[c][r][b], 0);

+        printf("      {");

+        for (t = 0; t < NZC32X32_NODES; ++t) {

+          printf(" %-3d,", probs[t]);

+        }

+        printf(" },\n");

+      }

+      printf("    },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  printf(

+    "static const vp9_prob default_nzc_pcat_probs[MAX_NZC_CONTEXTS]\n"

+    "                                            [NZC_TOKENS_EXTRA]\n"

+    "                                            [NZC_BITS_EXTRA] = {\n");

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    printf("  {\n");

+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {

+      printf("    {");

+      for (b = 0; b < NZC_BITS_EXTRA; ++b) {

+        vp9_prob prob = get_binary_prob(nzc_pcat_stats[c][t][b][0],

+                                        nzc_pcat_stats[c][t][b][1]);

+        printf(" %-3d,", prob);

+      }

+      printf(" },\n");

+    }

+    printf("  },\n");

+  }

+  printf("};\n");

+  f = fopen("nzcstats.bin", "wb");

+  fwrite(nzc_stats_4x4, sizeof(nzc_stats_4x4), 1, f);

+  fwrite(nzc_stats_8x8, sizeof(nzc_stats_8x8), 1, f);

+  fwrite(nzc_stats_16x16, sizeof(nzc_stats_16x16), 1, f);

+  fwrite(nzc_stats_32x32, sizeof(nzc_stats_32x32), 1, f);

+  fwrite(nzc_pcat_stats, sizeof(nzc_pcat_stats), 1, f);

+  fclose(f);

+}

+#endif

+#endif  // CONFIG_CODE_NONZEROCOUNT

 static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc,

                           TOKENEXTRA **tok, TOKENEXTRA *tok_end,

                           int mb_row, int mb_col) {

-  VP9_COMMON *const c = &cpi->common;

+  VP9_COMMON *const cm = &cpi->common;

   MACROBLOCKD *const xd = &cpi->mb.e_mbd;

   xd->mode_info_context = m;

-  if (c->frame_type == KEY_FRAME) {

+  set_mb_row(&cpi->common, xd, mb_row, (1 << m->mbmi.sb_type));

+  set_mb_col(&cpi->common, xd, mb_col, (1 << m->mbmi.sb_type));

+  if (cm->frame_type == KEY_FRAME) {

     write_mb_modes_kf(cpi, m, bc,

-                      c->mb_rows - mb_row, c->mb_cols - mb_col);

+                      cm->mb_rows - mb_row, cm->mb_cols - mb_col);

 #ifdef ENTROPY_STATS

     active_section = 8;

 #endif

   } else {

     pack_inter_mode_mvs(cpi, m, bc,

-                        c->mb_rows - mb_row, c->mb_cols - mb_col);

+                        cm->mb_rows - mb_row, cm->mb_cols - mb_col);

 #ifdef ENTROPY_STATS

     active_section = 1;

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+  if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64)

+    write_nzcs_sb64(cpi, xd, mb_row, mb_col, bc);

+  else if (m->mbmi.sb_type == BLOCK_SIZE_SB32X32)

+    write_nzcs_sb32(cpi, xd, mb_row, mb_col, bc);

+  else

+    write_nzcs_mb16(cpi, xd, mb_row, mb_col, bc);

+#endif

   assert(*tok < tok_end);

   pack_mb_tokens(bc, tok, tok_end);

-static void write_modes(VP9_COMP *cpi, vp9_writer* const bc) {

+static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,

+                        TOKENEXTRA **tok, TOKENEXTRA *tok_end) {

   VP9_COMMON *const c = &cpi->common;

   const int mis = c->mode_info_stride;

   MODE_INFO *m, *m_ptr = c->mi;

   int i, mb_row, mb_col;

-  TOKENEXTRA *tok = cpi->tok;

-  TOKENEXTRA *tok_end = tok + cpi->tok_count;

-  for (mb_row = 0; mb_row < c->mb_rows; mb_row += 4, m_ptr += 4 * mis) {

+  m_ptr += c->cur_tile_mb_col_start + c->cur_tile_mb_row_start * mis;

+  for (mb_row = c->cur_tile_mb_row_start;

+       mb_row < c->cur_tile_mb_row_end; mb_row += 4, m_ptr += 4 * mis) {

     m = m_ptr;

-    for (mb_col = 0; mb_col < c->mb_cols; mb_col += 4, m += 4) {

+    for (mb_col = c->cur_tile_mb_col_start;

+         mb_col < c->cur_tile_mb_col_end; mb_col += 4, m += 4) {

       vp9_write(bc, m->mbmi.sb_type == BLOCK_SIZE_SB64X64, c->sb64_coded);

       if (m->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

-        write_modes_b(cpi, m, bc, &tok, tok_end, mb_row, mb_col);

+        write_modes_b(cpi, m, bc, tok, tok_end, mb_row, mb_col);

       } else {

         int j;

@@ -1107,7 +1703,7 @@

           vp9_write(bc, sb_m->mbmi.sb_type, c->sb32_coded);

           if (sb_m->mbmi.sb_type) {

             assert(sb_m->mbmi.sb_type == BLOCK_SIZE_SB32X32);

-            write_modes_b(cpi, sb_m, bc, &tok, tok_end,

+            write_modes_b(cpi, sb_m, bc, tok, tok_end,

                           mb_row + y_idx_sb, mb_col + x_idx_sb);

           } else {

             // Process the 4 MBs in the order:

@@ -1123,7 +1719,7 @@

               assert(mb_m->mbmi.sb_type == BLOCK_SIZE_MB16X16);

-              write_modes_b(cpi, mb_m, bc, &tok, tok_end,

+              write_modes_b(cpi, mb_m, bc, tok, tok_end,

                             mb_row + y_idx, mb_col + x_idx);

@@ -1135,20 +1731,23 @@

 /* This function is used for debugging probability trees. */

-static void print_prob_tree(vp9_coeff_probs *coef_probs) {

+static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) {

   /* print coef probability tree */

-  int i, j, k, l;

+  int i, j, k, l, m;

   FILE *f = fopen("enc_tree_probs.txt", "a");

   fprintf(f, "{\n");

-  for (i = 0; i < BLOCK_TYPES_4X4; i++) {

+  for (i = 0; i < block_types; i++) {

     fprintf(f, "  {\n");

-    for (j = 0; j < COEF_BANDS; j++) {

-      fprintf(f, "    {\n");

-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {

-        fprintf(f, "      {");

-        for (l = 0; l < ENTROPY_NODES; l++) {

-          fprintf(f, "%3u, ",

-                  (unsigned int)(coef_probs [i][j][k][l]));

+    for (j = 0; j < REF_TYPES; ++j) {

+      fprintf(f, "  {\n");

+      for (k = 0; k < COEF_BANDS; k++) {

+        fprintf(f, "    {\n");

+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {

+          fprintf(f, "      {");

+          for (m = 0; m < ENTROPY_NODES; m++) {

+            fprintf(f, "%3u, ",

+                    (unsigned int)(coef_probs[i][j][k][l][m]));

+          }

         fprintf(f, " }\n");

@@ -1162,6 +1761,9 @@

 static void build_tree_distribution(vp9_coeff_probs *coef_probs,

                                     vp9_coeff_count *coef_counts,

+                                    unsigned int (*eob_branch_ct)[REF_TYPES]

+                                                                 [COEF_BANDS]

+                                                          [PREV_COEF_CONTEXTS],

 #ifdef ENTROPY_STATS

                                     VP9_COMP *cpi,

                                     vp9_coeff_accum *context_counters,

@@ -1168,26 +1770,35 @@

 #endif

                                     vp9_coeff_stats *coef_branch_ct,

                                     int block_types) {

-  int i = 0, j, k;

+  int i, j, k, l;

 #ifdef ENTROPY_STATS

   int t = 0;

 #endif

   for (i = 0; i < block_types; ++i) {

-    for (j = 0; j < COEF_BANDS; ++j) {

-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

-        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

-          continue;

-        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,

-                                         vp9_coef_encodings, vp9_coef_tree,

-                                         coef_probs[i][j][k],

-                                         coef_branch_ct[i][j][k],

-                                         coef_counts[i][j][k]);

+    for (j = 0; j < REF_TYPES; ++j) {

+      for (k = 0; k < COEF_BANDS; ++k) {

+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {

+          if (l >= 3 && k == 0)

+            continue;

+          vp9_tree_probs_from_distribution(vp9_coef_tree,

+                                           coef_probs[i][j][k][l],

+                                           coef_branch_ct[i][j][k][l],

+                                           coef_counts[i][j][k][l], 0);

+          coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -

+                                             coef_branch_ct[i][j][k][l][0][0];

+          coef_probs[i][j][k][l][0] =

+              get_binary_prob(coef_branch_ct[i][j][k][l][0][0],

+                              coef_branch_ct[i][j][k][l][0][1]);

 #ifdef ENTROPY_STATS

-        if (!cpi->dummy_packing)

-          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

-            context_counters[i][j][k][t] += coef_counts[i][j][k][t];

+          if (!cpi->dummy_packing) {

+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

+              context_counters[i][j][k][l][t] += coef_counts[i][j][k][l][t];

+            context_counters[i][j][k][l][MAX_ENTROPY_TOKENS] +=

+                eob_branch_ct[i][j][k][l];

+          }

 #endif

+        }

@@ -1196,48 +1807,256 @@

 static void build_coeff_contexts(VP9_COMP *cpi) {

   build_tree_distribution(cpi->frame_coef_probs_4x4,

                           cpi->coef_counts_4x4,

+                          cpi->common.fc.eob_branch_counts[TX_4X4],

 #ifdef ENTROPY_STATS

                           cpi, context_counters_4x4,

 #endif

-                          cpi->frame_branch_ct_4x4, BLOCK_TYPES_4X4);

-  build_tree_distribution(cpi->frame_hybrid_coef_probs_4x4,

-                          cpi->hybrid_coef_counts_4x4,

-#ifdef ENTROPY_STATS

-                          cpi, hybrid_context_counters_4x4,

-#endif

-                          cpi->frame_hybrid_branch_ct_4x4, BLOCK_TYPES_4X4);

+                          cpi->frame_branch_ct_4x4, BLOCK_TYPES);

   build_tree_distribution(cpi->frame_coef_probs_8x8,

                           cpi->coef_counts_8x8,

+                          cpi->common.fc.eob_branch_counts[TX_8X8],

 #ifdef ENTROPY_STATS

                           cpi, context_counters_8x8,

 #endif

-                          cpi->frame_branch_ct_8x8, BLOCK_TYPES_8X8);

-  build_tree_distribution(cpi->frame_hybrid_coef_probs_8x8,

-                          cpi->hybrid_coef_counts_8x8,

-#ifdef ENTROPY_STATS

-                          cpi, hybrid_context_counters_8x8,

-#endif

-                          cpi->frame_hybrid_branch_ct_8x8, BLOCK_TYPES_8X8);

+                          cpi->frame_branch_ct_8x8, BLOCK_TYPES);

   build_tree_distribution(cpi->frame_coef_probs_16x16,

                           cpi->coef_counts_16x16,

+                          cpi->common.fc.eob_branch_counts[TX_16X16],

 #ifdef ENTROPY_STATS

                           cpi, context_counters_16x16,

 #endif

-                          cpi->frame_branch_ct_16x16, BLOCK_TYPES_16X16);

-  build_tree_distribution(cpi->frame_hybrid_coef_probs_16x16,

-                          cpi->hybrid_coef_counts_16x16,

-#ifdef ENTROPY_STATS

-                          cpi, hybrid_context_counters_16x16,

-#endif

-                          cpi->frame_hybrid_branch_ct_16x16, BLOCK_TYPES_16X16);

+                          cpi->frame_branch_ct_16x16, BLOCK_TYPES);

   build_tree_distribution(cpi->frame_coef_probs_32x32,

                           cpi->coef_counts_32x32,

+                          cpi->common.fc.eob_branch_counts[TX_32X32],

 #ifdef ENTROPY_STATS

                           cpi, context_counters_32x32,

 #endif

-                          cpi->frame_branch_ct_32x32, BLOCK_TYPES_32X32);

+                          cpi->frame_branch_ct_32x32, BLOCK_TYPES);

+#if CONFIG_CODE_NONZEROCOUNT

+static void update_nzc_probs_common(VP9_COMP* cpi,

+                                    vp9_writer* const bc,

+                                    int block_size) {

+  VP9_COMMON *cm = &cpi->common;

+  int c, r, b, t;

+  int update[2] = {0, 0};

+  int savings = 0;

+  int tokens, nodes;

+  const vp9_tree_index *nzc_tree;

+  vp9_prob *new_nzc_probs;

+  vp9_prob *old_nzc_probs;

+  unsigned int *nzc_counts;

+  unsigned int (*nzc_branch_ct)[2];

+  vp9_prob upd;

+  if (block_size == 32) {

+    tokens = NZC32X32_TOKENS;

+    nzc_tree = vp9_nzc32x32_tree;

+    old_nzc_probs = cm->fc.nzc_probs_32x32[0][0][0];

+    new_nzc_probs = cpi->frame_nzc_probs_32x32[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_32x32[0][0][0];

+    nzc_branch_ct = cpi->frame_nzc_branch_ct_32x32[0][0][0];

+    upd = NZC_UPDATE_PROB_32X32;

+  } else if (block_size == 16) {

+    tokens = NZC16X16_TOKENS;

+    nzc_tree = vp9_nzc16x16_tree;

+    old_nzc_probs = cm->fc.nzc_probs_16x16[0][0][0];

+    new_nzc_probs = cpi->frame_nzc_probs_16x16[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_16x16[0][0][0];

+    nzc_branch_ct = cpi->frame_nzc_branch_ct_16x16[0][0][0];

+    upd = NZC_UPDATE_PROB_16X16;

+  } else if (block_size == 8) {

+    tokens = NZC8X8_TOKENS;

+    nzc_tree = vp9_nzc8x8_tree;

+    old_nzc_probs = cm->fc.nzc_probs_8x8[0][0][0];

+    new_nzc_probs = cpi->frame_nzc_probs_8x8[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_8x8[0][0][0];

+    nzc_branch_ct = cpi->frame_nzc_branch_ct_8x8[0][0][0];

+    upd = NZC_UPDATE_PROB_8X8;

+  } else {

+    nzc_tree = vp9_nzc4x4_tree;

+    tokens = NZC4X4_TOKENS;

+    old_nzc_probs = cm->fc.nzc_probs_4x4[0][0][0];

+    new_nzc_probs = cpi->frame_nzc_probs_4x4[0][0][0];

+    nzc_counts = cm->fc.nzc_counts_4x4[0][0][0];

+    nzc_branch_ct = cpi->frame_nzc_branch_ct_4x4[0][0][0];

+    upd = NZC_UPDATE_PROB_4X4;

+  }

+  nodes = tokens - 1;

+  // Get the new probabilities and the branch counts

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;

+        int offset_nodes = offset * nodes;

+        int offset_tokens = offset * tokens;

+        vp9_tree_probs_from_distribution(nzc_tree,

+                                         new_nzc_probs + offset_nodes,

+                                         nzc_branch_ct + offset_nodes,

+                                         nzc_counts + offset_tokens, 0);

+      }

+    }

+  }

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;

+        int offset_nodes = offset * nodes;

+        for (t = 0; t < nodes; ++t) {

+          vp9_prob newp = new_nzc_probs[offset_nodes + t];

+          vp9_prob oldp = old_nzc_probs[offset_nodes + t];

+          int s, u = 0;

+#if defined(SEARCH_NEWP)

+            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],

+                                                oldp, &newp, upd);

+            if (s > 0 && newp != oldp)

+              u = 1;

+            if (u)

+              savings += s - (int)(vp9_cost_zero(upd));

+            else

+              savings -= (int)(vp9_cost_zero(upd));

+#else

+          s = prob_update_savings(nzc_branch_ct[offset_nodes],

+                                  oldp, newp, upd);

+          if (s > 0)

+            u = 1;

+          if (u)

+            savings += s;

+#endif

+          update[u]++;

+        }

+      }

+    }

+  }

+  if (update[1] == 0 || savings < 0) {

+    vp9_write_bit(bc, 0);

+  } else {

+    vp9_write_bit(bc, 1);

+    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+      for (r = 0; r < REF_TYPES; ++r) {

+        for (b = 0; b < BLOCK_TYPES; ++b) {

+          int offset = c * REF_TYPES * BLOCK_TYPES + r * BLOCK_TYPES + b;

+          int offset_nodes = offset * nodes;

+          for (t = 0; t < nodes; ++t) {

+            vp9_prob newp = new_nzc_probs[offset_nodes + t];

+            vp9_prob *oldp = &old_nzc_probs[offset_nodes + t];

+            int s, u = 0;

+#if defined(SEARCH_NEWP)

+            s = prob_diff_update_savings_search(nzc_branch_ct[offset_nodes],

+                                                *oldp, &newp, upd);

+            if (s > 0 && newp != *oldp)

+              u = 1;

+#else

+            s = prob_update_savings(nzc_branch_ct[offset_nodes],

+                                    *oldp, newp, upd);

+            if (s > 0)

+              u = 1;

+#endif

+            vp9_write(bc, u, upd);

+            if (u) {

+              /* send/use new probability */

+              write_prob_diff_update(bc, newp, *oldp);

+              *oldp = newp;

+            }

+          }

+        }

+      }

+    }

+  }

+}

+static void update_nzc_pcat_probs(VP9_COMP *cpi, vp9_writer* const bc) {

+  VP9_COMMON *cm = &cpi->common;

+  int c, t, b;

+  int update[2] = {0, 0};

+  int savings = 0;

+  vp9_prob upd = NZC_UPDATE_PROB_PCAT;

+  for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+    for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {

+      int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];

+      for (b = 0; b < bits; ++b) {

+        vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],

+                                        cm->fc.nzc_pcat_counts[c][t][b][1]);

+        vp9_prob oldp = cm->fc.nzc_pcat_probs[c][t][b];

+        int s, u = 0;

+#if defined(SEARCH_NEWP)

+        s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],

+                                            oldp, &newp, upd);

+        if (s > 0 && newp != oldp)

+          u = 1;

+        if (u)

+          savings += s - (int)(vp9_cost_zero(upd));

+        else

+          savings -= (int)(vp9_cost_zero(upd));

+#else

+        s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],

+                                oldp, newp, upd);

+        if (s > 0)

+          u = 1;

+        if (u)

+          savings += s;

+#endif

+        update[u]++;

+      }

+    }

+  }

+  if (update[1] == 0 || savings < 0) {

+    vp9_write_bit(bc, 0);

+  } else {

+    vp9_write_bit(bc, 1);

+    for (c = 0; c < MAX_NZC_CONTEXTS; ++c) {

+      for (t = 0; t < NZC_TOKENS_EXTRA; ++t) {

+        int bits = vp9_extranzcbits[t + NZC_TOKENS_NOEXTRA];

+        for (b = 0; b < bits; ++b) {

+          vp9_prob newp = get_binary_prob(cm->fc.nzc_pcat_counts[c][t][b][0],

+                                          cm->fc.nzc_pcat_counts[c][t][b][1]);

+          vp9_prob *oldp = &cm->fc.nzc_pcat_probs[c][t][b];

+          int s, u = 0;

+#if defined(SEARCH_NEWP)

+          s = prob_diff_update_savings_search(cm->fc.nzc_pcat_counts[c][t][b],

+                                              *oldp, &newp, upd);

+          if (s > 0 && newp != *oldp)

+            u = 1;

+#else

+          s = prob_update_savings(cm->fc.nzc_pcat_counts[c][t][b],

+                                  *oldp, newp, upd);

+          if (s > 0)

+            u = 1;

+#endif

+          vp9_write(bc, u, upd);

+          if (u) {

+            /* send/use new probability */

+            write_prob_diff_update(bc, newp, *oldp);

+            *oldp = newp;

+          }

+        }

+      }

+    }

+  }

+}

+static void update_nzc_probs(VP9_COMP* cpi,

+                             vp9_writer* const bc) {

+  update_nzc_probs_common(cpi, bc, 4);

+  if (cpi->common.txfm_mode != ONLY_4X4)

+    update_nzc_probs_common(cpi, bc, 8);

+  if (cpi->common.txfm_mode > ALLOW_8X8)

+    update_nzc_probs_common(cpi, bc, 16);

+  if (cpi->common.txfm_mode > ALLOW_16X16)

+    update_nzc_probs_common(cpi, bc, 32);

+#ifdef NZC_PCAT_UPDATE

+  update_nzc_pcat_probs(cpi, bc);

+#endif

+#ifdef NZC_STATS

+  if (!cpi->dummy_packing)

+    update_nzcstats(&cpi->common);

+#endif

+}

+#endif  // CONFIG_CODE_NONZEROCOUNT

 static void update_coef_probs_common(vp9_writer* const bc,

 #ifdef ENTROPY_STATS

                                      VP9_COMP *cpi,

@@ -1247,46 +2066,59 @@

                                      vp9_coeff_probs *old_frame_coef_probs,

                                      vp9_coeff_stats *frame_branch_ct,

                                      int block_types) {

-  int i, j, k, t;

+  int i, j, k, l, t;

   int update[2] = {0, 0};

   int savings;

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE

+  const int entropy_nodes_update = UNCONSTRAINED_UPDATE_NODES;

+#else

+  const int entropy_nodes_update = ENTROPY_NODES;

+#endif

   // vp9_prob bestupd = find_coef_update_prob(cpi);

   /* dry run to see if there is any udpate at all needed */

   savings = 0;

   for (i = 0; i < block_types; ++i) {

-    for (j = !i; j < COEF_BANDS; ++j) {

-      int prev_coef_savings[ENTROPY_NODES] = {0};

-      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

-        for (t = 0; t < ENTROPY_NODES; ++t) {

-          vp9_prob newp = new_frame_coef_probs[i][j][k][t];

-          const vp9_prob oldp = old_frame_coef_probs[i][j][k][t];

-          const vp9_prob upd = COEF_UPDATE_PROB;

-          int s = prev_coef_savings[t];

-          int u = 0;

-          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

-            continue;

+    for (j = 0; j < REF_TYPES; ++j) {

+      for (k = 0; k < COEF_BANDS; ++k) {

+        // int prev_coef_savings[ENTROPY_NODES] = {0};

+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {

+          for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) {

+            vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];

+            const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];

+            const vp9_prob upd = vp9_coef_update_prob[t];

+            int s;  // = prev_coef_savings[t];

+            int u = 0;

+            if (l >= 3 && k == 0)

+              continue;

 #if defined(SEARCH_NEWP)

-          s = prob_diff_update_savings_search(

-                frame_branch_ct[i][j][k][t],

-                oldp, &newp, upd);

-          if (s > 0 && newp != oldp)

-            u = 1;

-          if (u)

-            savings += s - (int)(vp9_cost_zero(upd));

-          else

-            savings -= (int)(vp9_cost_zero(upd));

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE

+            if (t == UNCONSTRAINED_NODES - 1)

+              s = prob_diff_update_savings_search_model(

+                  frame_branch_ct[i][j][k][l][0],

+                  old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);

+            else

+#endif

+              s = prob_diff_update_savings_search(

+                  frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);

+            if (s > 0 && newp != oldp)

+              u = 1;

+            if (u)

+              savings += s - (int)(vp9_cost_zero(upd));

+            else

+              savings -= (int)(vp9_cost_zero(upd));

 #else

-          s = prob_update_savings(

-                frame_branch_ct[i][j][k][t],

-                oldp, newp, upd);

-          if (s > 0)

-            u = 1;

-          if (u)

-            savings += s;

+            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],

+                                    oldp, newp, upd);

+            if (s > 0)

+              u = 1;

+            if (u)

+              savings += s;

 #endif

-          update[u]++;

+            update[u]++;

+          }

@@ -1296,32 +2128,40 @@

   /* Is coef updated at all */

   if (update[1] == 0 || savings < 0) {

     vp9_write_bit(bc, 0);

-  } else {

-    vp9_write_bit(bc, 1);

-    for (i = 0; i < block_types; ++i) {

-      for (j = !i; j < COEF_BANDS; ++j) {

-        int prev_coef_savings[ENTROPY_NODES] = {0};

-        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+    return;

+  }

+  vp9_write_bit(bc, 1);

+  for (i = 0; i < block_types; ++i) {

+    for (j = 0; j < REF_TYPES; ++j) {

+      for (k = 0; k < COEF_BANDS; ++k) {

+        // int prev_coef_savings[ENTROPY_NODES] = {0};

+        for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {

           // calc probs and branch cts for this frame only

-          for (t = 0; t < ENTROPY_NODES; ++t) {

-            vp9_prob newp = new_frame_coef_probs[i][j][k][t];

-            vp9_prob *oldp = old_frame_coef_probs[i][j][k] + t;

-            const vp9_prob upd = COEF_UPDATE_PROB;

-            int s = prev_coef_savings[t];

+          for (t = CONFIG_CODE_NONZEROCOUNT; t < entropy_nodes_update; ++t) {

+            vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];

+            vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;

+            const vp9_prob upd = vp9_coef_update_prob[t];

+            int s;  // = prev_coef_savings[t];

             int u = 0;

-            if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

+            if (l >= 3 && k == 0)

               continue;

 #if defined(SEARCH_NEWP)

-            s = prob_diff_update_savings_search(

-                  frame_branch_ct[i][j][k][t],

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE

+            if (t == UNCONSTRAINED_NODES - 1)

+              s = prob_diff_update_savings_search_model(

+                  frame_branch_ct[i][j][k][l][0],

+                  old_frame_coef_probs[i][j][k][l], &newp, upd, i, j);

+            else

+#endif

+              s = prob_diff_update_savings_search(

+                  frame_branch_ct[i][j][k][l][t],

                   *oldp, &newp, upd);

             if (s > 0 && newp != *oldp)

               u = 1;

 #else

-            s = prob_update_savings(

-                  frame_branch_ct[i][j][k][t],

-                  *oldp, newp, upd);

+            s = prob_update_savings(frame_branch_ct[i][j][k][l][t],

+                                    *oldp, newp, upd);

             if (s > 0)

               u = 1;

 #endif

@@ -1328,12 +2168,17 @@

             vp9_write(bc, u, upd);

 #ifdef ENTROPY_STATS

             if (!cpi->dummy_packing)

-              ++tree_update_hist[i][j][k][t][u];

+              ++tree_update_hist[i][j][k][l][t][u];

 #endif

             if (u) {

               /* send/use new probability */

               write_prob_diff_update(bc, newp, *oldp);

               *oldp = newp;

+#if CONFIG_MODELCOEFPROB && MODEL_BASED_UPDATE

+              if (t == UNCONSTRAINED_NODES - 1)

+                vp9_get_model_distribution(

+                    newp, old_frame_coef_probs[i][j][k][l], i, j);

+#endif

@@ -1356,18 +2201,8 @@

                            cpi->frame_coef_probs_4x4,

                            cpi->common.fc.coef_probs_4x4,

                            cpi->frame_branch_ct_4x4,

-                           BLOCK_TYPES_4X4);

+                           BLOCK_TYPES);

-  update_coef_probs_common(bc,

-#ifdef ENTROPY_STATS

-                           cpi,

-                           hybrid_tree_update_hist_4x4,

-#endif

-                           cpi->frame_hybrid_coef_probs_4x4,

-                           cpi->common.fc.hybrid_coef_probs_4x4,

-                           cpi->frame_hybrid_branch_ct_4x4,

-                           BLOCK_TYPES_4X4);

   /* do not do this if not even allowed */

   if (cpi->common.txfm_mode != ONLY_4X4) {

     update_coef_probs_common(bc,

@@ -1378,17 +2213,7 @@

                              cpi->frame_coef_probs_8x8,

                              cpi->common.fc.coef_probs_8x8,

                              cpi->frame_branch_ct_8x8,

-                             BLOCK_TYPES_8X8);

-    update_coef_probs_common(bc,

-#ifdef ENTROPY_STATS

-                             cpi,

-                             hybrid_tree_update_hist_8x8,

-#endif

-                             cpi->frame_hybrid_coef_probs_8x8,

-                             cpi->common.fc.hybrid_coef_probs_8x8,

-                             cpi->frame_hybrid_branch_ct_8x8,

-                             BLOCK_TYPES_8X8);

+                             BLOCK_TYPES);

   if (cpi->common.txfm_mode > ALLOW_8X8) {

@@ -1400,16 +2225,7 @@

                              cpi->frame_coef_probs_16x16,

                              cpi->common.fc.coef_probs_16x16,

                              cpi->frame_branch_ct_16x16,

-                             BLOCK_TYPES_16X16);

-    update_coef_probs_common(bc,

-#ifdef ENTROPY_STATS

-                             cpi,

-                             hybrid_tree_update_hist_16x16,

-#endif

-                             cpi->frame_hybrid_coef_probs_16x16,

-                             cpi->common.fc.hybrid_coef_probs_16x16,

-                             cpi->frame_hybrid_branch_ct_16x16,

-                             BLOCK_TYPES_16X16);

+                             BLOCK_TYPES);

   if (cpi->common.txfm_mode > ALLOW_16X16) {

@@ -1421,7 +2237,7 @@

                              cpi->frame_coef_probs_32x32,

                              cpi->common.fc.coef_probs_32x32,

                              cpi->frame_branch_ct_32x32,

-                             BLOCK_TYPES_32X32);

+                             BLOCK_TYPES);

@@ -1523,34 +2339,49 @@

    * and color type.

*/

   if (oh.type == KEY_FRAME) {

-    int v;

     // Start / synch code

     cx_data[0] = 0x9D;

     cx_data[1] = 0x01;

     cx_data[2] = 0x2a;

+    extra_bytes_packed = 3;

+    cx_data += extra_bytes_packed;

+  }

+  {

+    int v;

-    v = (pc->horiz_scale << 14) | pc->Width;

-    cx_data[3] = v;

-    cx_data[4] = v >> 8;

+    if (pc->width != pc->display_width || pc->height != pc->display_height) {

+      v = pc->display_width;

+      cx_data[0] = v;

+      cx_data[1] = v >> 8;

-    v = (pc->vert_scale << 14) | pc->Height;

-    cx_data[5] = v;

-    cx_data[6] = v >> 8;

+      v = pc->display_height;

+      cx_data[2] = v;

+      cx_data[3] = v >> 8;

+      cx_data += 4;

+      extra_bytes_packed += 4;

+    }

-    extra_bytes_packed = 7;

-    cx_data += extra_bytes_packed;

+    v = pc->width;

+    cx_data[0] = v;

+    cx_data[1] = v >> 8;

-    vp9_start_encode(&header_bc, cx_data);

+    v = pc->height;

+    cx_data[2] = v;

+    cx_data[3] = v >> 8;

-    // signal clr type

-    vp9_write_bit(&header_bc, pc->clr_type);

-    vp9_write_bit(&header_bc, pc->clamp_type);

-  } else {

-    vp9_start_encode(&header_bc, cx_data);

+    extra_bytes_packed += 4;

+    cx_data += 4;

+  vp9_start_encode(&header_bc, cx_data);

+  // TODO(jkoleszar): remove these two unused bits?

+  vp9_write_bit(&header_bc, pc->clr_type);

+  vp9_write_bit(&header_bc, pc->clamp_type);

+  // error resilient mode

+  vp9_write_bit(&header_bc, pc->error_resilient_mode);

   // Signal whether or not Segmentation is enabled

   vp9_write_bit(&header_bc, (xd->segmentation_enabled) ? 1 : 0);

@@ -1655,7 +2486,10 @@

   pc->sb32_coded = get_binary_prob(cpi->sb32_count[0], cpi->sb32_count[1]);

   vp9_write_literal(&header_bc, pc->sb32_coded, 8);

-  {

+  vp9_write_bit(&header_bc, cpi->mb.e_mbd.lossless);

+  if (cpi->mb.e_mbd.lossless) {

+    pc->txfm_mode = ONLY_4X4;

+  } else {

     if (pc->txfm_mode == TX_MODE_SELECT) {

       pc->prob_tx[0] = get_prob(cpi->txfm_count_32x32p[TX_4X4] +

                                 cpi->txfm_count_16x16p[TX_4X4] +

@@ -1699,6 +2533,14 @@

   vp9_write_bit(&header_bc, pc->filter_type);

   vp9_write_literal(&header_bc, pc->filter_level, 6);

   vp9_write_literal(&header_bc, pc->sharpness_level, 3);

+#if CONFIG_LOOP_DERING

+  if (pc->dering_enabled) {

+    vp9_write_bit(&header_bc, 1);

+    vp9_write_literal(&header_bc, pc->dering_enabled - 1, 4);

+  } else {

+    vp9_write_bit(&header_bc, 0);

+  }

+#endif

   // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).

   vp9_write_bit(&header_bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);

@@ -1765,30 +2607,36 @@

   // Transmit Dc, Second order and Uv quantizer delta information

   put_delta_q(&header_bc, pc->y1dc_delta_q);

-  put_delta_q(&header_bc, pc->y2dc_delta_q);

-  put_delta_q(&header_bc, pc->y2ac_delta_q);

   put_delta_q(&header_bc, pc->uvdc_delta_q);

   put_delta_q(&header_bc, pc->uvac_delta_q);

   // When there is a key frame all reference buffers are updated using the new key frame

   if (pc->frame_type != KEY_FRAME) {

+    int refresh_mask;

     // Should the GF or ARF be updated using the transmitted frame or buffer

-    vp9_write_bit(&header_bc, pc->refresh_golden_frame);

-    vp9_write_bit(&header_bc, pc->refresh_alt_ref_frame);

+    if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {

+      /* Preserve the previously existing golden frame and update the frame in

+       * the alt ref slot instead. This is highly specific to the use of

+       * alt-ref as a forward reference, and this needs to be generalized as

+       * other uses are implemented (like RTC/temporal scaling)

+       *

+       * gld_fb_idx and alt_fb_idx need to be swapped for future frames, but

+       * that happens in vp9_onyx_if.c:update_reference_frames() so that it can

+       * be done outside of the recode loop.

+       */

+      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |

+                     (cpi->refresh_golden_frame << cpi->alt_fb_idx);

+    } else {

+      refresh_mask = (cpi->refresh_last_frame << cpi->lst_fb_idx) |

+                     (cpi->refresh_golden_frame << cpi->gld_fb_idx) |

+                     (cpi->refresh_alt_ref_frame << cpi->alt_fb_idx);

+    }

+    vp9_write_literal(&header_bc, refresh_mask, NUM_REF_FRAMES);

+    vp9_write_literal(&header_bc, cpi->lst_fb_idx, NUM_REF_FRAMES_LG2);

+    vp9_write_literal(&header_bc, cpi->gld_fb_idx, NUM_REF_FRAMES_LG2);

+    vp9_write_literal(&header_bc, cpi->alt_fb_idx, NUM_REF_FRAMES_LG2);

-    // For inter frames the current default behavior is that when

-    // cm->refresh_golden_frame is set we copy the old GF over to

-    // the ARF buffer. This is purely an encoder decision at present.

-    if (pc->refresh_golden_frame)

-      pc->copy_buffer_to_arf  = 2;

-    // If not being updated from current frame should either GF or ARF be updated from another buffer

-    if (!pc->refresh_golden_frame)

-      vp9_write_literal(&header_bc, pc->copy_buffer_to_gf, 2);

-    if (!pc->refresh_alt_ref_frame)

-      vp9_write_literal(&header_bc, pc->copy_buffer_to_arf, 2);

     // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)

     vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);

     vp9_write_bit(&header_bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);

@@ -1831,10 +2679,13 @@

 #endif

-  vp9_write_bit(&header_bc, pc->refresh_entropy_probs);

+  if (!pc->error_resilient_mode) {

+    vp9_write_bit(&header_bc, pc->refresh_entropy_probs);

+    vp9_write_bit(&header_bc, pc->frame_parallel_decoding_mode);

+  }

-  if (pc->frame_type != KEY_FRAME)

-    vp9_write_bit(&header_bc, pc->refresh_last_frame);

+  vp9_write_literal(&header_bc, pc->frame_context_idx,

+                    NUM_FRAME_CONTEXTS_LG2);

 #ifdef ENTROPY_STATS

   if (pc->frame_type == INTER_FRAME)

@@ -1848,7 +2699,13 @@

   if (pc->frame_type != KEY_FRAME) {

     int i, j;

     int new_context[INTER_MODE_CONTEXTS][4];

-    update_mode_probs(pc, new_context);

+    if (!cpi->dummy_packing) {

+      update_inter_mode_probs(pc, new_context);

+    } else {

+      // In dummy pack assume context unchanged.

+      vpx_memcpy(new_context, pc->fc.vp9_mode_contexts,

+                 sizeof(pc->fc.vp9_mode_contexts));

+    }

     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {

       for (j = 0; j < 4; j++) {

@@ -1902,18 +2759,33 @@

   vp9_copy(cpi->common.fc.pre_coef_probs_4x4,

            cpi->common.fc.coef_probs_4x4);

-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_4x4,

-           cpi->common.fc.hybrid_coef_probs_4x4);

   vp9_copy(cpi->common.fc.pre_coef_probs_8x8,

            cpi->common.fc.coef_probs_8x8);

-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_8x8,

-           cpi->common.fc.hybrid_coef_probs_8x8);

   vp9_copy(cpi->common.fc.pre_coef_probs_16x16,

            cpi->common.fc.coef_probs_16x16);

-  vp9_copy(cpi->common.fc.pre_hybrid_coef_probs_16x16,

-           cpi->common.fc.hybrid_coef_probs_16x16);

   vp9_copy(cpi->common.fc.pre_coef_probs_32x32,

            cpi->common.fc.coef_probs_32x32);

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_copy(cpi->common.fc.pre_nzc_probs_4x4,

+           cpi->common.fc.nzc_probs_4x4);

+  vp9_copy(cpi->common.fc.pre_nzc_probs_8x8,

+           cpi->common.fc.nzc_probs_8x8);

+  vp9_copy(cpi->common.fc.pre_nzc_probs_16x16,

+           cpi->common.fc.nzc_probs_16x16);

+  vp9_copy(cpi->common.fc.pre_nzc_probs_32x32,

+           cpi->common.fc.nzc_probs_32x32);

+  vp9_copy(cpi->common.fc.pre_nzc_pcat_probs,

+           cpi->common.fc.nzc_pcat_probs);

+  // NOTE that if the counts are reset, we also need to uncomment

+  // the count updates in the write_nzc function

+  /*

+  vp9_zero(cpi->common.fc.nzc_counts_4x4);

+  vp9_zero(cpi->common.fc.nzc_counts_8x8);

+  vp9_zero(cpi->common.fc.nzc_counts_16x16);

+  vp9_zero(cpi->common.fc.nzc_counts_32x32);

+  vp9_zero(cpi->common.fc.nzc_pcat_counts);

+  */

+#endif

   vp9_copy(cpi->common.fc.pre_sb_ymode_prob, cpi->common.fc.sb_ymode_prob);

   vp9_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);

   vp9_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);

@@ -1930,6 +2802,9 @@

   vp9_zero(cpi->common.fc.mv_ref_ct)

   update_coef_probs(cpi, &header_bc);

+#if CONFIG_CODE_NONZEROCOUNT

+  update_nzc_probs(cpi, &header_bc);

+#endif

 #ifdef ENTROPY_STATS

   active_section = 2;

@@ -1941,8 +2816,9 @@

     int k;

     vp9_update_skip_probs(cpi);

-    for (k = 0; k < MBSKIP_CONTEXTS; ++k)

+    for (k = 0; k < MBSKIP_CONTEXTS; ++k) {

       vp9_write_literal(&header_bc, pc->mbskip_pred_probs[k], 8);

+    }

   if (pc->frame_type == KEY_FRAME) {

@@ -1960,7 +2836,7 @@

     if (pc->mcomp_filter_type == SWITCHABLE)

       update_switchable_interp_probs(cpi, &header_bc);

-    #if CONFIG_COMP_INTERINTRA_PRED

+#if CONFIG_COMP_INTERINTRA_PRED

     if (pc->use_interintra) {

       vp9_cond_prob_update(&header_bc,

                            &pc->fc.interintra_prob,

@@ -1995,6 +2871,25 @@

     vp9_write_nmv_probs(cpi, xd->allow_high_precision_mv, &header_bc);

+  /* tiling */

+  {

+    int min_log2_tiles, delta_log2_tiles, n_tile_bits, n;

+    vp9_get_tile_n_bits(pc, &min_log2_tiles, &delta_log2_tiles);

+    n_tile_bits = pc->log2_tile_columns - min_log2_tiles;

+    for (n = 0; n < delta_log2_tiles; n++) {

+      if (n_tile_bits--) {

+        vp9_write_bit(&header_bc, 1);

+      } else {

+        vp9_write_bit(&header_bc, 0);

+        break;

+      }

+    }

+    vp9_write_bit(&header_bc, pc->log2_tile_rows != 0);

+    if (pc->log2_tile_rows != 0)

+      vp9_write_bit(&header_bc, pc->log2_tile_rows != 1);

+  }

   vp9_stop_encode(&header_bc);

   oh.first_partition_length_in_bytes = header_bc.pos;

@@ -2001,11 +2896,15 @@

   /* update frame tag */

-    int v = (oh.first_partition_length_in_bytes << 5) |

+    int scaling = (pc->width != pc->display_width ||

+                   pc->height != pc->display_height);

+    int v = (oh.first_partition_length_in_bytes << 8) |

+            (scaling << 5) |

             (oh.show_frame << 4) |

             (oh.version << 1) |

             oh.type;

+    assert(oh.first_partition_length_in_bytes <= 0xffff);

     dest[0] = v;

     dest[1] = v >> 8;

     dest[2] = v >> 16;

@@ -2012,23 +2911,57 @@

   *size = VP9_HEADER_SIZE + extra_bytes_packed + header_bc.pos;

-  vp9_start_encode(&residual_bc, cx_data + header_bc.pos);

   if (pc->frame_type == KEY_FRAME) {

     decide_kf_ymode_entropy(cpi);

-    write_modes(cpi, &residual_bc);

   } else {

     /* This is not required if the counts in cpi are consistent with the

      * final packing pass */

     // if (!cpi->dummy_packing) vp9_zero(cpi->NMVcount);

-    write_modes(cpi, &residual_bc);

-    vp9_update_mode_context(&cpi->common);

-  vp9_stop_encode(&residual_bc);

+  {

+    int tile_row, tile_col, total_size = 0;

+    unsigned char *data_ptr = cx_data + header_bc.pos;

+    TOKENEXTRA *tok[1 << 6], *tok_end;

-  *size += residual_bc.pos;

+    tok[0] = cpi->tok;

+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)

+      tok[tile_col] = tok[tile_col - 1] + cpi->tok_count[tile_col - 1];

+    for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) {

+      vp9_get_tile_row_offsets(pc, tile_row);

+      tok_end = cpi->tok + cpi->tok_count[0];

+      for (tile_col = 0; tile_col < pc->tile_columns;

+           tile_col++, tok_end += cpi->tok_count[tile_col]) {

+        vp9_get_tile_col_offsets(pc, tile_col);

+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1)

+          vp9_start_encode(&residual_bc, data_ptr + total_size + 4);

+        else

+          vp9_start_encode(&residual_bc, data_ptr + total_size);

+        write_modes(cpi, &residual_bc, &tok[tile_col], tok_end);

+        vp9_stop_encode(&residual_bc);

+        if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) {

+          /* size of this tile */

+          data_ptr[total_size + 0] = residual_bc.pos;

+          data_ptr[total_size + 1] = residual_bc.pos >> 8;

+          data_ptr[total_size + 2] = residual_bc.pos >> 16;

+          data_ptr[total_size + 3] = residual_bc.pos >> 24;

+          total_size += 4;

+        }

+        total_size += residual_bc.pos;

+      }

+    }

+    assert((unsigned int)(tok[0] - cpi->tok) == cpi->tok_count[0]);

+    for (tile_col = 1; tile_col < pc->tile_columns; tile_col++)

+      assert((unsigned int)(tok[tile_col] - tok[tile_col - 1]) ==

+                  cpi->tok_count[tile_col]);

+    *size += total_size;

+  }

 #ifdef ENTROPY_STATS

@@ -2035,19 +2968,23 @@

 static void print_tree_update_for_type(FILE *f,

                                        vp9_coeff_stats *tree_update_hist,

                                        int block_types, const char *header) {

-  int i, j, k, l;

+  int i, j, k, l, m;

   fprintf(f, "const vp9_coeff_prob %s = {\n", header);

   for (i = 0; i < block_types; i++) {

     fprintf(f, "  { \n");

-    for (j = 0; j < COEF_BANDS; j++) {

-      fprintf(f, "    {\n");

-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {

-        fprintf(f, "      {");

-        for (l = 0; l < ENTROPY_NODES; l++) {

-          fprintf(f, "%3d, ",

-                  get_binary_prob(tree_update_hist[i][j][k][l][0],

-                                  tree_update_hist[i][j][k][l][1]));

+    for (j = 0; j < REF_TYPES; j++) {

+      fprintf(f, "  { \n");

+      for (k = 0; k < COEF_BANDS; k++) {

+        fprintf(f, "    {\n");

+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {

+          fprintf(f, "      {");

+          for (m = 0; m < ENTROPY_NODES; m++) {

+            fprintf(f, "%3d, ",

+                    get_binary_prob(tree_update_hist[i][j][k][l][m][0],

+                                    tree_update_hist[i][j][k][l][m][1]));

+          }

+          fprintf(f, "},\n");

         fprintf(f, "},\n");

@@ -2062,21 +2999,14 @@

   FILE *f = fopen("coefupdprob.h", "w");

   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");

-  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES_4X4,

-                             "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");

-  print_tree_update_for_type(f, hybrid_tree_update_hist_4x4, BLOCK_TYPES_4X4,

-                             "vp9_coef_update_probs_4x4[BLOCK_TYPES_4X4]");

-  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES_8X8,

-                             "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");

-  print_tree_update_for_type(f, hybrid_tree_update_hist_8x8, BLOCK_TYPES_8X8,

-                             "vp9_coef_update_probs_8x8[BLOCK_TYPES_8X8]");

-  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES_16X16,

-                             "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");

-  print_tree_update_for_type(f, hybrid_tree_update_hist_16x16,

-                             BLOCK_TYPES_16X16,

-                             "vp9_coef_update_probs_16x16[BLOCK_TYPES_16X16]");

-  print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES_32X32,

-                             "vp9_coef_update_probs_32x32[BLOCK_TYPES_32X32]");

+  print_tree_update_for_type(f, tree_update_hist_4x4, BLOCK_TYPES,

+                             "vp9_coef_update_probs_4x4[BLOCK_TYPES]");

+  print_tree_update_for_type(f, tree_update_hist_8x8, BLOCK_TYPES,

+                             "vp9_coef_update_probs_8x8[BLOCK_TYPES]");

+  print_tree_update_for_type(f, tree_update_hist_16x16, BLOCK_TYPES,

+                             "vp9_coef_update_probs_16x16[BLOCK_TYPES]");

+  print_tree_update_for_type(f, tree_update_hist_32x32, BLOCK_TYPES,

+                             "vp9_coef_update_probs_32x32[BLOCK_TYPES]");

   fclose(f);

   f = fopen("treeupdate.bin", "wb");

@@ -2083,6 +3013,7 @@

   fwrite(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);

   fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);

   fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);

+  fwrite(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);

   fclose(f);

 #endif

--- a/vp9/encoder/vp9_block.h

+++ b/vp9/encoder/vp9_block.h

@@ -50,10 +50,7 @@

   int src;

   int src_stride;

-  int eob_max_offset;

-  int eob_max_offset_8x8;

-  int eob_max_offset_16x16;

-  int eob_max_offset_32x32;

+  int skip_block;

 } BLOCK;

 typedef struct {

@@ -86,20 +83,13 @@

   int64_t txfm_rd_diff[NB_TXFM_MODES];

 } PICK_MODE_CONTEXT;

-typedef struct superblock {

-  DECLARE_ALIGNED(16, int16_t, src_diff[32*32+16*16*2]);

-  DECLARE_ALIGNED(16, int16_t, coeff[32*32+16*16*2]);

-} SUPERBLOCK;

-typedef struct macroblock {

-  DECLARE_ALIGNED(16, int16_t, src_diff[400]);  // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y

-  DECLARE_ALIGNED(16, int16_t, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y

+typedef struct macroblock MACROBLOCK;

+struct macroblock {

+  DECLARE_ALIGNED(16, int16_t, src_diff[64*64+32*32*2]);

+  DECLARE_ALIGNED(16, int16_t, coeff[64*64+32*32*2]);

   // 16 Y blocks, 4 U blocks, 4 V blocks,

-  // 1 DC 2nd order block each with 16 entries

-  BLOCK block[25];

+  BLOCK block[24];

-  SUPERBLOCK sb_coeff_data;

   YV12_BUFFER_CONFIG src;

   MACROBLOCKD e_mbd;

@@ -160,8 +150,13 @@

   unsigned char *active_ptr;

-  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];

-  vp9_coeff_count hybrid_token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES_4X4];

+  vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];

+#if CONFIG_CODE_NONZEROCOUNT

+  unsigned int nzc_costs_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][17];

+  unsigned int nzc_costs_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][65];

+  unsigned int nzc_costs_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][257];

+  unsigned int nzc_costs_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][1025];

+#endif

   int optimize;

@@ -172,17 +167,14 @@

   PICK_MODE_CONTEXT sb32_context[4];

   PICK_MODE_CONTEXT sb64_context;

-  void (*vp9_short_fdct4x4)(int16_t *input, int16_t *output, int pitch);

-  void (*vp9_short_fdct8x4)(int16_t *input, int16_t *output, int pitch);

-  void (*short_walsh4x4)(int16_t *input, int16_t *output, int pitch);

-  void (*quantize_b_4x4)(BLOCK *b, BLOCKD *d);

-  void (*quantize_b_4x4_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);

-  void (*vp9_short_fdct8x8)(int16_t *input, int16_t *output, int pitch);

-  void (*vp9_short_fdct16x16)(int16_t *input, int16_t *output, int pitch);

-  void (*short_fhaar2x2)(int16_t *input, int16_t *output, int pitch);

-  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);

-  void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);

-  void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);

-} MACROBLOCK;

+  void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);

+  void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);

+  void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);

+  void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch);

+  void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx);

+  void (*quantize_b_4x4_pair)(MACROBLOCK *x, int b_idx1, int b_idx2);

+  void (*quantize_b_16x16)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type);

+  void (*quantize_b_8x8)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type);

+};

 #endif  // VP9_ENCODER_VP9_BLOCK_H_

--- a/vp9/encoder/vp9_boolhuff.c

+++ b/vp9/encoder/vp9_boolhuff.c

@@ -40,7 +40,6 @@

};

 void vp9_start_encode(BOOL_CODER *br, unsigned char *source) {

   br->lowvalue = 0;

   br->range    = 255;

   br->value    = 0;

@@ -54,6 +53,10 @@

   for (i = 0; i < 32; i++)

     encode_bool(br, 0, 128);

+  // Ensure there's no ambigous collision with any index marker bytes

+  if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0)

+    br->buffer[br->pos++] = 0;

--- a/vp9/encoder/vp9_dct.c

+++ b/vp9/encoder/vp9_dct.c

@@ -15,806 +15,545 @@

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/common/vp9_blockd.h"

+#include "vp9/common/vp9_idct.h"

-// TODO: these transforms can be converted into integer forms to reduce

-//       the complexity

-static const float dct_4[16] = {

-  0.500000000000000,  0.500000000000000,  0.500000000000000,  0.500000000000000,

-  0.653281482438188,  0.270598050073099, -0.270598050073099, -0.653281482438188,

-  0.500000000000000, -0.500000000000000, -0.500000000000000,  0.500000000000000,

-  0.270598050073099, -0.653281482438188,  0.653281482438188, -0.270598050073099

-};

+static void fdct4_1d(int16_t *input, int16_t *output) {

+  int16_t step[4];

+  int temp1, temp2;

-static const float adst_4[16] = {

-  0.228013428883779,  0.428525073124360,  0.577350269189626,  0.656538502008139,

-  0.577350269189626,  0.577350269189626,  0.000000000000000, -0.577350269189626,

-  0.656538502008139, -0.228013428883779, -0.577350269189626,  0.428525073124359,

-  0.428525073124360, -0.656538502008139,  0.577350269189626, -0.228013428883779

-};

+  step[0] = input[0] + input[3];

+  step[1] = input[1] + input[2];

+  step[2] = input[1] - input[2];

+  step[3] = input[0] - input[3];

-static const float dct_8[64] = {

-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,

-  0.353553390593274,   0.353553390593274,   0.353553390593274,   0.353553390593274,

-  0.490392640201615,   0.415734806151273,   0.277785116509801,   0.097545161008064,

- -0.097545161008064,  -0.277785116509801,  -0.415734806151273,  -0.490392640201615,

-  0.461939766255643,   0.191341716182545,  -0.191341716182545,  -0.461939766255643,

- -0.461939766255643,  -0.191341716182545,   0.191341716182545,   0.461939766255643,

-  0.415734806151273,  -0.097545161008064,  -0.490392640201615,  -0.277785116509801,

-  0.277785116509801,   0.490392640201615,   0.097545161008064,  -0.415734806151273,

-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,

-  0.353553390593274,  -0.353553390593274,  -0.353553390593274,   0.353553390593274,

-  0.277785116509801,  -0.490392640201615,   0.097545161008064,   0.415734806151273,

- -0.415734806151273,  -0.097545161008064,   0.490392640201615,  -0.277785116509801,

-  0.191341716182545,  -0.461939766255643,   0.461939766255643,  -0.191341716182545,

- -0.191341716182545,   0.461939766255643,  -0.461939766255643,   0.191341716182545,

-  0.097545161008064,  -0.277785116509801,   0.415734806151273,  -0.490392640201615,

-  0.490392640201615,  -0.415734806151273,   0.277785116509801,  -0.097545161008064

-};

+  temp1 = (step[0] + step[1]) * cospi_16_64;

+  temp2 = (step[0] - step[1]) * cospi_16_64;

+  output[0] = dct_const_round_shift(temp1);

+  output[2] = dct_const_round_shift(temp2);

+  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;

+  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;

+  output[1] = dct_const_round_shift(temp1);

+  output[3] = dct_const_round_shift(temp2);

+}

-static const float adst_8[64] = {

-  0.089131608307533,   0.175227946595735,   0.255357107325376,   0.326790388032145,

-  0.387095214016349,   0.434217976756762,   0.466553967085785,   0.483002021635509,

-  0.255357107325376,   0.434217976756762,   0.483002021635509,   0.387095214016349,

-  0.175227946595735,  -0.089131608307533,  -0.326790388032145,  -0.466553967085785,

-  0.387095214016349,   0.466553967085785,   0.175227946595735,  -0.255357107325376,

- -0.483002021635509,  -0.326790388032145,   0.089131608307533,   0.434217976756762,

-  0.466553967085785,   0.255357107325376,  -0.326790388032145,  -0.434217976756762,

-  0.089131608307533,   0.483002021635509,   0.175227946595735,  -0.387095214016348,

-  0.483002021635509,  -0.089131608307533,  -0.466553967085785,   0.175227946595735,

-  0.434217976756762,  -0.255357107325376,  -0.387095214016348,   0.326790388032145,

-  0.434217976756762,  -0.387095214016348,  -0.089131608307533,   0.466553967085786,

- -0.326790388032145,  -0.175227946595735,   0.483002021635509,  -0.255357107325375,

-  0.326790388032145,  -0.483002021635509,   0.387095214016349,  -0.089131608307534,

- -0.255357107325377,   0.466553967085785,  -0.434217976756762,   0.175227946595736,

-  0.175227946595735,  -0.326790388032145,   0.434217976756762,  -0.483002021635509,

-  0.466553967085785,  -0.387095214016348,   0.255357107325376,  -0.089131608307532

-};

+void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int pitch) {

+  int16_t out[4 * 4];

+  int16_t *outptr = &out[0];

+  const int short_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[4], temp_out[4];

-/* Converted the transforms to integers. */

-static const int16_t dct_i4[16] = {

-  16384,  16384,  16384,  16384,

-  21407,   8867,  -8867, -21407,

-  16384, -16384, -16384,  16384,

-   8867, -21407,  21407,  -8867

-};

+  // Columns

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = input[j * short_pitch + i] << 4;

+    if (i == 0 && temp_in[0])

+      temp_in[0] += 1;

+    fdct4_1d(temp_in, temp_out);

+    for (j = 0; j < 4; ++j)

+      outptr[j * 4 + i] = temp_out[j];

+  }

-static const int16_t adst_i4[16] = {

-   7472,  14042,  18919,  21513,

-  18919,  18919,      0, -18919,

-  21513,  -7472, -18919,  14042,

-  14042, -21513,  18919,  -7472

-};

+  // Rows

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = out[j + i * 4];

+    fdct4_1d(temp_in, temp_out);

+    for (j = 0; j < 4; ++j)

+        output[j + i * 4] = (temp_out[j] + 1) >> 2;

+  }

+}

-static const int16_t dct_i8[64] = {

-   11585,  11585,  11585,  11585,

-   11585,  11585,  11585,  11585,

-   16069,  13623,   9102,   3196,

-   -3196,  -9102, -13623, -16069,

-   15137,   6270,  -6270, -15137,

-  -15137,  -6270,   6270,  15137,

-   13623,  -3196, -16069,  -9102,

-    9102,  16069,   3196, -13623,

-   11585, -11585, -11585,  11585,

-   11585, -11585, -11585,  11585,

-    9102, -16069,   3196,  13623,

-  -13623,  -3196,  16069,  -9102,

-    6270, -15137,  15137,  -6270,

-   -6270,  15137, -15137,   6270,

-    3196,  -9102,  13623, -16069,

-   16069, -13623,   9102,  -3196

-};

+static void fadst4_1d(int16_t *input, int16_t *output) {

+  int x0, x1, x2, x3;

+  int s0, s1, s2, s3, s4, s5, s6, s7;

-static const int16_t adst_i8[64] = {

-    2921,   5742,   8368,  10708,

-   12684,  14228,  15288,  15827,

-    8368,  14228,  15827,  12684,

-    5742,  -2921, -10708, -15288,

-   12684,  15288,   5742,  -8368,

-  -15827, -10708,   2921,  14228,

-   15288,   8368, -10708, -14228,

-    2921,  15827,   5742, -12684,

-   15827,  -2921, -15288,   5742,

-   14228,  -8368, -12684,  10708,

-   14228, -12684,  -2921,  15288,

-  -10708,  -5742,  15827,  -8368,

-   10708, -15827,  12684,  -2921,

-   -8368,  15288, -14228,   5742,

-    5742, -10708,  14228, -15827,

-   15288, -12684,   8368,  -2921

-};

+  x0 = input[0];

+  x1 = input[1];

+  x2 = input[2];

+  x3 = input[3];

-static const float dct_16[256] = {

-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,

-  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,  0.250000,

-  0.351851,  0.338330,  0.311806,  0.273300,  0.224292,  0.166664,  0.102631,  0.034654,

- -0.034654, -0.102631, -0.166664, -0.224292, -0.273300, -0.311806, -0.338330, -0.351851,

-  0.346760,  0.293969,  0.196424,  0.068975, -0.068975, -0.196424, -0.293969, -0.346760,

- -0.346760, -0.293969, -0.196424, -0.068975,  0.068975,  0.196424,  0.293969,  0.346760,

-  0.338330,  0.224292,  0.034654, -0.166664, -0.311806, -0.351851, -0.273300, -0.102631,

-  0.102631,  0.273300,  0.351851,  0.311806,  0.166664, -0.034654, -0.224292, -0.338330,

-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,

-  0.326641,  0.135299, -0.135299, -0.326641, -0.326641, -0.135299,  0.135299,  0.326641,

-  0.311806,  0.034654, -0.273300, -0.338330, -0.102631,  0.224292,  0.351851,  0.166664,

- -0.166664, -0.351851, -0.224292,  0.102631,  0.338330,  0.273300, -0.034654, -0.311806,

-  0.293969, -0.068975, -0.346760, -0.196424,  0.196424,  0.346760,  0.068975, -0.293969,

- -0.293969,  0.068975,  0.346760,  0.196424, -0.196424, -0.346760, -0.068975,  0.293969,

-  0.273300, -0.166664, -0.338330,  0.034654,  0.351851,  0.102631, -0.311806, -0.224292,

-  0.224292,  0.311806, -0.102631, -0.351851, -0.034654,  0.338330,  0.166664, -0.273300,

-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,

-  0.250000, -0.250000, -0.250000,  0.250000,  0.250000, -0.250000, -0.250000,  0.250000,

-  0.224292, -0.311806, -0.102631,  0.351851, -0.034654, -0.338330,  0.166664,  0.273300,

- -0.273300, -0.166664,  0.338330,  0.034654, -0.351851,  0.102631,  0.311806, -0.224292,

-  0.196424, -0.346760,  0.068975,  0.293969, -0.293969, -0.068975,  0.346760, -0.196424,

- -0.196424,  0.346760, -0.068975, -0.293969,  0.293969,  0.068975, -0.346760,  0.196424,

-  0.166664, -0.351851,  0.224292,  0.102631, -0.338330,  0.273300,  0.034654, -0.311806,

-  0.311806, -0.034654, -0.273300,  0.338330, -0.102631, -0.224292,  0.351851, -0.166664,

-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,

-  0.135299, -0.326641,  0.326641, -0.135299, -0.135299,  0.326641, -0.326641,  0.135299,

-  0.102631, -0.273300,  0.351851, -0.311806,  0.166664,  0.034654, -0.224292,  0.338330,

- -0.338330,  0.224292, -0.034654, -0.166664,  0.311806, -0.351851,  0.273300, -0.102631,

-  0.068975, -0.196424,  0.293969, -0.346760,  0.346760, -0.293969,  0.196424, -0.068975,

- -0.068975,  0.196424, -0.293969,  0.346760, -0.346760,  0.293969, -0.196424,  0.068975,

-  0.034654, -0.102631,  0.166664, -0.224292,  0.273300, -0.311806,  0.338330, -0.351851,

-  0.351851, -0.338330,  0.311806, -0.273300,  0.224292, -0.166664,  0.102631, -0.034654

-};

+  if (!(x0 | x1 | x2 | x3)) {

+    output[0] = output[1] = output[2] = output[3] = 0;

+    return;

+  }

-static const float adst_16[256] = {

-  0.033094,  0.065889,  0.098087,  0.129396,  0.159534,  0.188227,  0.215215,  0.240255,

-  0.263118,  0.283599,  0.301511,  0.316693,  0.329007,  0.338341,  0.344612,  0.347761,

-  0.098087,  0.188227,  0.263118,  0.316693,  0.344612,  0.344612,  0.316693,  0.263118,

-  0.188227,  0.098087,  0.000000, -0.098087, -0.188227, -0.263118, -0.316693, -0.344612,

-  0.159534,  0.283599,  0.344612,  0.329007,  0.240255,  0.098087, -0.065889, -0.215215,

- -0.316693, -0.347761, -0.301511, -0.188227, -0.033094,  0.129396,  0.263118,  0.338341,

-  0.215215,  0.338341,  0.316693,  0.159534, -0.065889, -0.263118, -0.347761, -0.283599,

- -0.098087,  0.129396,  0.301511,  0.344612,  0.240255,  0.033094, -0.188227, -0.329007,

-  0.263118,  0.344612,  0.188227, -0.098087, -0.316693, -0.316693, -0.098087,  0.188227,

-  0.344612,  0.263118,  0.000000, -0.263118, -0.344612, -0.188227,  0.098087,  0.316693,

-  0.301511,  0.301511,  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,

-  0.000000, -0.301511, -0.301511, -0.000000,  0.301511,  0.301511,  0.000000, -0.301511,

-  0.329007,  0.215215, -0.188227, -0.338341, -0.033094,  0.316693,  0.240255, -0.159534,

- -0.344612, -0.065889,  0.301511,  0.263118, -0.129396, -0.347761, -0.098087,  0.283599,

-  0.344612,  0.098087, -0.316693, -0.188227,  0.263118,  0.263118, -0.188227, -0.316693,

-  0.098087,  0.344612,  0.000000, -0.344612, -0.098087,  0.316693,  0.188227, -0.263118,

-  0.347761, -0.033094, -0.344612,  0.065889,  0.338341, -0.098087, -0.329007,  0.129396,

-  0.316693, -0.159534, -0.301511,  0.188227,  0.283599, -0.215215, -0.263118,  0.240255,

-  0.338341, -0.159534, -0.263118,  0.283599,  0.129396, -0.344612,  0.033094,  0.329007,

- -0.188227, -0.240255,  0.301511,  0.098087, -0.347761,  0.065889,  0.316693, -0.215215,

-  0.316693, -0.263118, -0.098087,  0.344612, -0.188227, -0.188227,  0.344612, -0.098087,

- -0.263118,  0.316693,  0.000000, -0.316693,  0.263118,  0.098087, -0.344612,  0.188227,

-  0.283599, -0.329007,  0.098087,  0.215215, -0.347761,  0.188227,  0.129396, -0.338341,

-  0.263118,  0.033094, -0.301511,  0.316693, -0.065889, -0.240255,  0.344612, -0.159534,

-  0.240255, -0.347761,  0.263118, -0.033094, -0.215215,  0.344612, -0.283599,  0.065889,

-  0.188227, -0.338341,  0.301511, -0.098087, -0.159534,  0.329007, -0.316693,  0.129396,

-  0.188227, -0.316693,  0.344612, -0.263118,  0.098087,  0.098087, -0.263118,  0.344612,

- -0.316693,  0.188227,  0.000000, -0.188227,  0.316693, -0.344612,  0.263118, -0.098087,

-  0.129396, -0.240255,  0.316693, -0.347761,  0.329007, -0.263118,  0.159534, -0.033094,

- -0.098087,  0.215215, -0.301511,  0.344612, -0.338341,  0.283599, -0.188227,  0.065889,

-  0.065889, -0.129396,  0.188227, -0.240255,  0.283599, -0.316693,  0.338341, -0.347761,

-  0.344612, -0.329007,  0.301511, -0.263118,  0.215215, -0.159534,  0.098087, -0.033094

-};

+  s0 = sinpi_1_9 * x0;

+  s1 = sinpi_4_9 * x0;

+  s2 = sinpi_2_9 * x1;

+  s3 = sinpi_1_9 * x1;

+  s4 = sinpi_3_9 * x2;

+  s5 = sinpi_4_9 * x3;

+  s6 = sinpi_2_9 * x3;

+  s7 = x0 + x1 - x3;

-/* Converted the transforms to integers. */

-static const int16_t dct_i16[256] = {

-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,

-    8192,   8192,   8192,   8192,   8192,   8192,   8192,   8192,

-   11529,  11086,  10217,   8955,   7350,   5461,   3363,   1136,

-   -1136,  -3363,  -5461,  -7350,  -8955, -10217, -11086, -11529,

-   11363,   9633,   6436,   2260,  -2260,  -6436,  -9633, -11363,

-  -11363,  -9633,  -6436,  -2260,   2260,   6436,   9633,  11363,

-   11086,   7350,   1136,  -5461, -10217, -11529,  -8955,  -3363,

-    3363,   8955,  11529,  10217,   5461,  -1136,  -7350, -11086,

-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,

-   10703,   4433,  -4433, -10703, -10703,  -4433,   4433,  10703,

-   10217,   1136,  -8955, -11086,  -3363,   7350,  11529,   5461,

-   -5461, -11529,  -7350,   3363,  11086,   8955,  -1136, -10217,

-    9633,  -2260, -11363,  -6436,   6436,  11363,   2260,  -9633,

-   -9633,   2260,  11363,   6436,  -6436, -11363,  -2260,   9633,

-    8955,  -5461, -11086,   1136,  11529,   3363, -10217,  -7350,

-    7350,  10217,  -3363, -11529,  -1136,  11086,   5461,  -8955,

-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,

-    8192,  -8192,  -8192,   8192,   8192,  -8192,  -8192,   8192,

-    7350, -10217,  -3363,  11529,  -1136, -11086,   5461,   8955,

-   -8955,  -5461,  11086,   1136, -11529,   3363,  10217,  -7350,

-    6436, -11363,   2260,   9633,  -9633,  -2260,  11363,  -6436,

-   -6436,  11363,  -2260,  -9633,   9633,   2260, -11363,   6436,

-    5461, -11529,   7350,   3363, -11086,   8955,   1136, -10217,

-   10217,  -1136,  -8955,  11086,  -3363,  -7350,  11529,  -5461,

-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,

-    4433, -10703,  10703,  -4433,  -4433,  10703, -10703,   4433,

-    3363,  -8955,  11529, -10217,   5461,   1136,  -7350,  11086,

-  -11086,   7350,  -1136,  -5461,  10217, -11529,   8955,  -3363,

-    2260,  -6436,   9633, -11363,  11363,  -9633,   6436,  -2260,

-   -2260,   6436,  -9633,  11363, -11363,   9633,  -6436,   2260,

-    1136,  -3363,   5461,  -7350,   8955, -10217,  11086, -11529,

-   11529, -11086,  10217,  -8955,   7350,  -5461,   3363,  -1136

-};

+  x0 = s0 + s2 + s5;

+  x1 = sinpi_3_9 * s7;

+  x2 = s1 - s3 + s6;

+  x3 = s4;

-static const int16_t adst_i16[256] = {

-    1084,   2159,   3214,   4240,   5228,   6168,   7052,   7873,

-    8622,   9293,   9880,  10377,  10781,  11087,  11292,  11395,

-    3214,   6168,   8622,  10377,  11292,  11292,  10377,   8622,

-    6168,   3214,      0,  -3214,  -6168,  -8622, -10377, -11292,

-    5228,   9293,  11292,  10781,   7873,   3214,  -2159,  -7052,

-  -10377, -11395,  -9880,  -6168,  -1084,   4240,   8622,  11087,

-    7052,  11087,  10377,   5228,  -2159,  -8622, -11395,  -9293,

-   -3214,   4240,   9880,  11292,   7873,   1084,  -6168, -10781,

-    8622,  11292,   6168,  -3214, -10377, -10377,  -3214,   6168,

-   11292,   8622,      0,  -8622, -11292,  -6168,   3214,  10377,

-    9880,   9880,      0,  -9880,  -9880,      0,   9880,   9880,

-       0,  -9880,  -9880,      0,   9880,   9880,      0,  -9880,

-   10781,   7052,  -6168, -11087,  -1084,  10377,   7873,  -5228,

-  -11292,  -2159,   9880,   8622,  -4240, -11395,  -3214,   9293,

-   11292,   3214, -10377,  -6168,   8622,   8622,  -6168, -10377,

-    3214,  11292,      0, -11292,  -3214,  10377,   6168,  -8622,

-   11395,  -1084, -11292,   2159,  11087,  -3214, -10781,   4240,

-   10377,  -5228,  -9880,   6168,   9293,  -7052,  -8622,   7873,

-   11087,  -5228,  -8622,   9293,   4240, -11292,   1084,  10781,

-   -6168,  -7873,   9880,   3214, -11395,   2159,  10377,  -7052,

-   10377,  -8622,  -3214,  11292,  -6168,  -6168,  11292,  -3214,

-   -8622,  10377,      0, -10377,   8622,   3214, -11292,   6168,

-    9293, -10781,   3214,   7052, -11395,   6168,   4240, -11087,

-    8622,   1084,  -9880,  10377,  -2159,  -7873,  11292,  -5228,

-    7873, -11395,   8622,  -1084,  -7052,  11292,  -9293,   2159,

-    6168, -11087,   9880,  -3214,  -5228,  10781, -10377,   4240,

-    6168, -10377,  11292,  -8622,   3214,   3214,  -8622,  11292,

-  -10377,   6168,      0,  -6168,  10377, -11292,   8622,  -3214,

-    4240,  -7873,  10377, -11395,  10781,  -8622,   5228,  -1084,

-   -3214,   7052,  -9880,  11292, -11087,   9293,  -6168,   2159,

-    2159,  -4240,   6168,  -7873,   9293, -10377,  11087, -11395,

-   11292, -10781,   9880,  -8622,   7052,  -5228,   3214,  -1084

-};

+  s0 = x0 + x3;

+  s1 = x1;

+  s2 = x2 - x3;

+  s3 = x2 - x0 + x3;

-static const int xC1S7 = 16069;

-static const int xC2S6 = 15137;

-static const int xC3S5 = 13623;

-static const int xC4S4 = 11585;

-static const int xC5S3 =  9102;

-static const int xC6S2 =  6270;

-static const int xC7S1 =  3196;

+  // 1-D transform scaling factor is sqrt(2).

+  output[0] = dct_const_round_shift(s0);

+  output[1] = dct_const_round_shift(s1);

+  output[2] = dct_const_round_shift(s2);

+  output[3] = dct_const_round_shift(s3);

+}

-#define SHIFT_BITS 14

-#define DOROUND(X) X += (1<<(SHIFT_BITS-1));

+static const transform_2d FHT_4[] = {

+  { fdct4_1d,  fdct4_1d  },  // DCT_DCT  = 0

+  { fadst4_1d, fdct4_1d  },  // ADST_DCT = 1

+  { fdct4_1d,  fadst4_1d },  // DCT_ADST = 2

+  { fadst4_1d, fadst4_1d }   // ADST_ADST = 3

+};

-#define FINAL_SHIFT 3

-#define FINAL_ROUNDING (1<<(FINAL_SHIFT -1))

-#define IN_SHIFT (FINAL_SHIFT+1)

+void vp9_short_fht4x4_c(int16_t *input, int16_t *output,

+                        int pitch, TX_TYPE tx_type) {

+  int16_t out[4 * 4];

+  int16_t *outptr = &out[0];

+  int i, j;

+  int16_t temp_in[4], temp_out[4];

+  const transform_2d ht = FHT_4[tx_type];

-void vp9_short_fdct8x8_c(short *InputData, short *OutputData, int pitch) {

-  int loop;

-  int short_pitch = pitch >> 1;

-  int is07, is12, is34, is56;

-  int is0734, is1256;

-  int id07, id12, id34, id56;

-  int irot_input_x, irot_input_y;

-  int icommon_product1;      // Re-used product  (c4s4 * (s12 - s56))

-  int icommon_product2;      // Re-used product  (c4s4 * (d12 + d56))

-  int temp1, temp2;          // intermediate variable for computation

-  int  InterData[64];

-  int  *ip = InterData;

-  short *op = OutputData;

-  for (loop = 0; loop < 8; loop++) {

-    // Pre calculate some common sums and differences.

-    is07 = (InputData[0] + InputData[7]) << IN_SHIFT;

-    is12 = (InputData[1] + InputData[2]) << IN_SHIFT;

-    is34 = (InputData[3] + InputData[4]) << IN_SHIFT;

-    is56 = (InputData[5] + InputData[6]) << IN_SHIFT;

-    id07 = (InputData[0] - InputData[7]) << IN_SHIFT;

-    id12 = (InputData[1] - InputData[2]) << IN_SHIFT;

-    id34 = (InputData[3] - InputData[4]) << IN_SHIFT;

-    id56 = (InputData[5] - InputData[6]) << IN_SHIFT;

-    is0734 = is07 + is34;

-    is1256 = is12 + is56;

-    // Pre-Calculate some common product terms.

-    icommon_product1 = xC4S4 * (is12 - is56);

-    DOROUND(icommon_product1)

-    icommon_product1 >>= SHIFT_BITS;

-    icommon_product2 = xC4S4 * (id12 + id56);

-    DOROUND(icommon_product2)

-    icommon_product2 >>= SHIFT_BITS;

-    ip[0] = (xC4S4 * (is0734 + is1256));

-    DOROUND(ip[0]);

-    ip[0] >>= SHIFT_BITS;

-    ip[4] = (xC4S4 * (is0734 - is1256));

-    DOROUND(ip[4]);

-    ip[4] >>= SHIFT_BITS;

-    // Define inputs to rotation for outputs 2 and 6

-    irot_input_x = id12 - id56;

-    irot_input_y = is07 - is34;

-    // Apply rotation for outputs 2 and 6.

-    temp1 = xC6S2 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC2S6 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    ip[2] = temp1 + temp2;

-    temp1 = xC6S2 * irot_input_y;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC2S6 * irot_input_x;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    ip[6] = temp1 - temp2;

-    // Define inputs to rotation for outputs 1 and 7

-    irot_input_x = icommon_product1 + id07;

-    irot_input_y = -(id34 + icommon_product2);

-    // Apply rotation for outputs 1 and 7.

-    temp1 = xC1S7 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC7S1 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    ip[1] = temp1 - temp2;

-    temp1 = xC7S1 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC1S7 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    ip[7] = temp1 + temp2;

-    // Define inputs to rotation for outputs 3 and 5

-    irot_input_x = id07 - icommon_product1;

-    irot_input_y = id34 - icommon_product2;

-    // Apply rotation for outputs 3 and 5.

-    temp1 = xC3S5 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC5S3 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    ip[3] = temp1 - temp2;

-    temp1 = xC5S3 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC3S5 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    ip[5] = temp1 + temp2;

-    // Increment data pointer for next row

-    InputData += short_pitch;

-    ip += 8;

+  // Columns

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = input[j * pitch + i] << 4;

+    if (i == 0 && temp_in[0])

+      temp_in[0] += 1;

+    ht.cols(temp_in, temp_out);

+    for (j = 0; j < 4; ++j)

+      outptr[j * 4 + i] = temp_out[j];

-  // Performed DCT on rows, now transform the columns

-  ip = InterData;

-  for (loop = 0; loop < 8; loop++) {

-    // Pre calculate some common sums and differences.

-    is07 = ip[0 * 8] + ip[7 * 8];

-    is12 = ip[1 * 8] + ip[2 * 8];

-    is34 = ip[3 * 8] + ip[4 * 8];

-    is56 = ip[5 * 8] + ip[6 * 8];

-    id07 = ip[0 * 8] - ip[7 * 8];

-    id12 = ip[1 * 8] - ip[2 * 8];

-    id34 = ip[3 * 8] - ip[4 * 8];

-    id56 = ip[5 * 8] - ip[6 * 8];

-    is0734 = is07 + is34;

-    is1256 = is12 + is56;

-    // Pre-Calculate some common product terms

-    icommon_product1 = xC4S4 * (is12 - is56);

-    icommon_product2 = xC4S4 * (id12 + id56);

-    DOROUND(icommon_product1)

-    DOROUND(icommon_product2)

-    icommon_product1 >>= SHIFT_BITS;

-    icommon_product2 >>= SHIFT_BITS;

-    temp1 = xC4S4 * (is0734 + is1256);

-    temp2 = xC4S4 * (is0734 - is1256);

-    DOROUND(temp1);

-    DOROUND(temp2);

-    temp1 >>= SHIFT_BITS;

-    temp2 >>= SHIFT_BITS;

-    op[0 * 8] = (temp1 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    op[4 * 8] = (temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    // Define inputs to rotation for outputs 2 and 6

-    irot_input_x = id12 - id56;

-    irot_input_y = is07 - is34;

-    // Apply rotation for outputs 2 and 6.

-    temp1 = xC6S2 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC2S6 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    op[2 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    temp1 = xC6S2 * irot_input_y;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC2S6 * irot_input_x;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    op[6 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    // Define inputs to rotation for outputs 1 and 7

-    irot_input_x = icommon_product1 + id07;

-    irot_input_y = -(id34 + icommon_product2);

-    // Apply rotation for outputs 1 and 7.

-    temp1 = xC1S7 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC7S1 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    op[1 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    temp1 = xC7S1 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC1S7 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    op[7 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    // Define inputs to rotation for outputs 3 and 5

-    irot_input_x = id07 - icommon_product1;

-    irot_input_y = id34 - icommon_product2;

-    // Apply rotation for outputs 3 and 5.

-    temp1 = xC3S5 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC5S3 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    op[3 * 8] = (temp1 - temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    temp1 = xC5S3 * irot_input_x;

-    DOROUND(temp1);

-    temp1 >>= SHIFT_BITS;

-    temp2 = xC3S5 * irot_input_y;

-    DOROUND(temp2);

-    temp2 >>= SHIFT_BITS;

-    op[5 * 8] = (temp1 + temp2 + FINAL_ROUNDING) >> FINAL_SHIFT;

-    // Increment data pointer for next column.

-    ip++;

-    op++;

+  // Rows

+  for (i = 0; i < 4; ++i) {

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = out[j + i * 4];

+    ht.rows(temp_in, temp_out);

+    for (j = 0; j < 4; ++j)

+      output[j + i * 4] = (temp_out[j] + 1) >> 2;

-void vp9_short_fhaar2x2_c(short *input, short *output, int pitch) {

-  /* [1 1; 1 -1] orthogonal transform */

-  /* use position: 0,1, 4, 8 */

-  int i;

-  short *ip1 = input;

-  short *op1 = output;

-  for (i = 0; i < 16; i++) {

-    op1[i] = 0;

-  }

-  op1[0] = (ip1[0] + ip1[1] + ip1[4] + ip1[8] + 1) >> 1;

-  op1[1] = (ip1[0] - ip1[1] + ip1[4] - ip1[8]) >> 1;

-  op1[4] = (ip1[0] + ip1[1] - ip1[4] - ip1[8]) >> 1;

-  op1[8] = (ip1[0] - ip1[1] - ip1[4] + ip1[8]) >> 1;

+void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) {

+    vp9_short_fdct4x4_c(input, output, pitch);

+    vp9_short_fdct4x4_c(input + 4, output + 16, pitch);

-/* For test */

-#define TEST_INT 1

-#if TEST_INT

-#define vp9_fht_int_c vp9_fht_c

-#else

-#define vp9_fht_float_c vp9_fht_c

-#endif

+static void fdct8_1d(int16_t *input, int16_t *output) {

+  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

+  /*needs32*/ int t0, t1, t2, t3;

+  /*canbe16*/ int x0, x1, x2, x3;

-void vp9_fht_float_c(const int16_t *input, int pitch, int16_t *output,

-               TX_TYPE tx_type, int tx_dim) {

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    int i, j, k;

-    float bufa[256], bufb[256];  // buffers are for floating-point test purpose

-                                 // the implementation could be simplified in

-                                 // conjunction with integer transform

-    const int16_t *ip = input;

-    int16_t *op = output;

+  // stage 1

+  s0 = input[0] + input[7];

+  s1 = input[1] + input[6];

+  s2 = input[2] + input[5];

+  s3 = input[3] + input[4];

+  s4 = input[3] - input[4];

+  s5 = input[2] - input[5];

+  s6 = input[1] - input[6];

+  s7 = input[0] - input[7];

-    float *pfa = &bufa[0];

-    float *pfb = &bufb[0];

+  // fdct4_1d(step, step);

+  x0 = s0 + s3;

+  x1 = s1 + s2;

+  x2 = s1 - s2;

+  x3 = s0 - s3;

+  t0 = (x0 + x1) * cospi_16_64;

+  t1 = (x0 - x1) * cospi_16_64;

+  t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;

+  t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;

+  output[0] = dct_const_round_shift(t0);

+  output[2] = dct_const_round_shift(t2);

+  output[4] = dct_const_round_shift(t1);

+  output[6] = dct_const_round_shift(t3);

-    // pointers to vertical and horizontal transforms

-    const float *ptv, *pth;

+  // Stage 2

+  t0 = (s6 - s5) * cospi_16_64;

+  t1 = (s6 + s5) * cospi_16_64;

+  t2 = dct_const_round_shift(t0);

+  t3 = dct_const_round_shift(t1);

-    assert(tx_type != DCT_DCT);

-    // load and convert residual array into floating-point

-    for (j = 0; j < tx_dim; j++) {

-      for (i = 0; i < tx_dim; i++) {

-        pfa[i] = (float)ip[i];

-      }

-      pfa += tx_dim;

-      ip  += pitch / 2;

-    }

+  // Stage 3

+  x0 = s4 + t2;

+  x1 = s4 - t2;

+  x2 = s7 - t3;

+  x3 = s7 + t3;

-    // vertical transformation

-    pfa = &bufa[0];

-    pfb = &bufb[0];

+  // Stage 4

+  t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

+  t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

+  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

+  t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

+  output[1] = dct_const_round_shift(t0);

+  output[3] = dct_const_round_shift(t2);

+  output[5] = dct_const_round_shift(t1);

+  output[7] = dct_const_round_shift(t3);

+}

-    switch (tx_type) {

-      case ADST_ADST :

-      case ADST_DCT  :

-        ptv = (tx_dim == 4) ? &adst_4[0] :

-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);

-        break;

+void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) {

+  const int stride = pitch >> 1;

+  int i, j;

+  int16_t intermediate[64];

-      default :

-        ptv = (tx_dim == 4) ? &dct_4[0] :

-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);

-        break;

-    }

+  // Transform columns

+  {

+    int16_t *output = intermediate;

+    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

+    /*needs32*/ int t0, t1, t2, t3;

+    /*canbe16*/ int x0, x1, x2, x3;

-    for (j = 0; j < tx_dim; j++) {

-      for (i = 0; i < tx_dim; i++) {

-        pfb[i] = 0;

-        for (k = 0; k < tx_dim; k++) {

-          pfb[i] += ptv[k] * pfa[(k * tx_dim)];

-        }

-        pfa += 1;

-      }

-      pfb += tx_dim;

-      ptv += tx_dim;

-      pfa = &bufa[0];

-    }

+    int i;

+    for (i = 0; i < 8; i++) {

+      // stage 1

+      s0 = (input[0 * stride] + input[7 * stride]) << 2;

+      s1 = (input[1 * stride] + input[6 * stride]) << 2;

+      s2 = (input[2 * stride] + input[5 * stride]) << 2;

+      s3 = (input[3 * stride] + input[4 * stride]) << 2;

+      s4 = (input[3 * stride] - input[4 * stride]) << 2;

+      s5 = (input[2 * stride] - input[5 * stride]) << 2;

+      s6 = (input[1 * stride] - input[6 * stride]) << 2;

+      s7 = (input[0 * stride] - input[7 * stride]) << 2;

-    // horizontal transformation

-    pfa = &bufa[0];

-    pfb = &bufb[0];

+      // fdct4_1d(step, step);

+      x0 = s0 + s3;

+      x1 = s1 + s2;

+      x2 = s1 - s2;

+      x3 = s0 - s3;

+      t0 = (x0 + x1) * cospi_16_64;

+      t1 = (x0 - x1) * cospi_16_64;

+      t2 =  x2 * cospi_24_64 + x3 *  cospi_8_64;

+      t3 = -x2 * cospi_8_64  + x3 * cospi_24_64;

+      output[0 * 8] = dct_const_round_shift(t0);

+      output[2 * 8] = dct_const_round_shift(t2);

+      output[4 * 8] = dct_const_round_shift(t1);

+      output[6 * 8] = dct_const_round_shift(t3);

-    switch (tx_type) {

-      case ADST_ADST :

-      case  DCT_ADST :

-        pth = (tx_dim == 4) ? &adst_4[0] :

-                              ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);

-        break;

+      // Stage 2

+      t0 = (s6 - s5) * cospi_16_64;

+      t1 = (s6 + s5) * cospi_16_64;

+      t2 = dct_const_round_shift(t0);

+      t3 = dct_const_round_shift(t1);

-      default :

-        pth = (tx_dim == 4) ? &dct_4[0] :

-                              ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);

-        break;

-    }

+      // Stage 3

+      x0 = s4 + t2;

+      x1 = s4 - t2;

+      x2 = s7 - t3;

+      x3 = s7 + t3;

-    for (j = 0; j < tx_dim; j++) {

-      for (i = 0; i < tx_dim; i++) {

-        pfa[i] = 0;

-        for (k = 0; k < tx_dim; k++) {

-          pfa[i] += pfb[k] * pth[k];

-        }

-        pth += tx_dim;

-      }

-      pfa += tx_dim;

-      pfb += tx_dim;

-      // pth -= tx_dim * tx_dim;

-      switch (tx_type) {

-        case ADST_ADST :

-        case  DCT_ADST :

-          pth = (tx_dim == 4) ? &adst_4[0] :

-                                ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);

-          break;

-        default :

-          pth = (tx_dim == 4) ? &dct_4[0] :

-                                ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);

-          break;

-      }

+      // Stage 4

+      t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

+      t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

+      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

+      t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

+      output[1 * 8] = dct_const_round_shift(t0);

+      output[3 * 8] = dct_const_round_shift(t2);

+      output[5 * 8] = dct_const_round_shift(t1);

+      output[7 * 8] = dct_const_round_shift(t3);

+      input++;

+      output++;

+  }

-    // convert to short integer format and load BLOCKD buffer

-    op = output;

-    pfa = &bufa[0];

-    for (j = 0; j < tx_dim; j++) {

-      for (i = 0; i < tx_dim; i++) {

-        op[i] = (pfa[i] > 0 ) ? (int16_t)( 8 * pfa[i] + 0.49) :

-                                     -(int16_t)(- 8 * pfa[i] + 0.49);

-      }

-      op  += tx_dim;

-      pfa += tx_dim;

-    }

+  // Rows

+  for (i = 0; i < 8; ++i) {

+    fdct8_1d(&intermediate[i * 8], &final_output[i * 8]);

+    for (j = 0; j < 8; ++j)

+      final_output[j + i * 8] /= 2;

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-/* Converted the transforms to integer form. */

-#define VERTICAL_SHIFT 11

-#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1)

-#define HORIZONTAL_SHIFT 16

-#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1)

-void vp9_fht_int_c(const int16_t *input, int pitch, int16_t *output,

-                   TX_TYPE tx_type, int tx_dim) {

-  int i, j, k;

-  int16_t imbuf[256];

-  const int16_t *ip = input;

-  int16_t *op = output;

-  int16_t *im = &imbuf[0];

-  /* pointers to vertical and horizontal transforms. */

-  const int16_t *ptv = NULL, *pth = NULL;

-  switch (tx_type) {

-    case ADST_ADST :

-      ptv = pth = (tx_dim == 4) ? &adst_i4[0]

-                                  : ((tx_dim == 8) ? &adst_i8[0]

-                                                     : &adst_i16[0]);

-      break;

-    case ADST_DCT  :

-      ptv = (tx_dim == 4) ? &adst_i4[0]

-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);

-      pth = (tx_dim == 4) ? &dct_i4[0]

-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);

-      break;

-    case  DCT_ADST :

-      ptv = (tx_dim == 4) ? &dct_i4[0]

-                            : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);

-      pth = (tx_dim == 4) ? &adst_i4[0]

-                            : ((tx_dim == 8) ? &adst_i8[0] : &adst_i16[0]);

-      break;

-    case  DCT_DCT :

-      ptv = pth = (tx_dim == 4) ? &dct_i4[0]

-                                  : ((tx_dim == 8) ? &dct_i8[0] : &dct_i16[0]);

-      break;

-    default:

-      assert(0);

-      break;

-  }

-  /* vertical transformation */

-  for (j = 0; j < tx_dim; j++) {

-    for (i = 0; i < tx_dim; i++) {

-      int temp = 0;

-      for (k = 0; k < tx_dim; k++) {

-        temp += ptv[k] * ip[(k * (pitch >> 1))];

+void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) {

+  // The 2D transform is done with two passes which are actually pretty

+  // similar. In the first one, we transform the columns and transpose

+  // the results. In the second one, we transform the rows. To achieve that,

+  // as the first pass results are transposed, we tranpose the columns (that

+  // is the transposed rows) and transpose the results (so that it goes back

+  // in normal/row positions).

+  const int stride = pitch >> 1;

+  int pass;

+  // We need an intermediate buffer between passes.

+  int16_t intermediate[256];

+  int16_t *in = input;

+  int16_t *out = intermediate;

+  // Do the two transform/transpose passes

+  for (pass = 0; pass < 2; ++pass) {

+    /*canbe16*/ int step1[8];

+    /*canbe16*/ int step2[8];

+    /*canbe16*/ int step3[8];

+    /*canbe16*/ int input[8];

+    /*needs32*/ int temp1, temp2;

+    int i;

+    for (i = 0; i < 16; i++) {

+      if (0 == pass) {

+        // Calculate input for the first 8 results.

+        input[0] = (in[0 * stride] + in[15 * stride]) << 2;

+        input[1] = (in[1 * stride] + in[14 * stride]) << 2;

+        input[2] = (in[2 * stride] + in[13 * stride]) << 2;

+        input[3] = (in[3 * stride] + in[12 * stride]) << 2;

+        input[4] = (in[4 * stride] + in[11 * stride]) << 2;

+        input[5] = (in[5 * stride] + in[10 * stride]) << 2;

+        input[6] = (in[6 * stride] + in[ 9 * stride]) << 2;

+        input[7] = (in[7 * stride] + in[ 8 * stride]) << 2;

+        // Calculate input for the next 8 results.

+        step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2;

+        step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2;

+        step1[2] = (in[5 * stride] - in[10 * stride]) << 2;

+        step1[3] = (in[4 * stride] - in[11 * stride]) << 2;

+        step1[4] = (in[3 * stride] - in[12 * stride]) << 2;

+        step1[5] = (in[2 * stride] - in[13 * stride]) << 2;

+        step1[6] = (in[1 * stride] - in[14 * stride]) << 2;

+        step1[7] = (in[0 * stride] - in[15 * stride]) << 2;

+      } else {

+        // Calculate input for the first 8 results.

+        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);

+        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);

+        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);

+        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);

+        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);

+        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);

+        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);

+        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);

+        // Calculate input for the next 8 results.

+        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);

+        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);

+        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);

+        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);

+        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);

+        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);

+        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);

+        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);

+      // Work on the first eight values; fdct8_1d(input, even_results);

+      {

+        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

+        /*needs32*/ int t0, t1, t2, t3;

+        /*canbe16*/ int x0, x1, x2, x3;

-      im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT);

-      ip++;

-    }

-    im += tx_dim;  // 16

-    ptv += tx_dim;

-    ip = input;

-  }

+        // stage 1

+        s0 = input[0] + input[7];

+        s1 = input[1] + input[6];

+        s2 = input[2] + input[5];

+        s3 = input[3] + input[4];

+        s4 = input[3] - input[4];

+        s5 = input[2] - input[5];

+        s6 = input[1] - input[6];

+        s7 = input[0] - input[7];

-  /* horizontal transformation */

-  im = &imbuf[0];

+        // fdct4_1d(step, step);

+        x0 = s0 + s3;

+        x1 = s1 + s2;

+        x2 = s1 - s2;

+        x3 = s0 - s3;

+        t0 = (x0 + x1) * cospi_16_64;

+        t1 = (x0 - x1) * cospi_16_64;

+        t2 = x3 * cospi_8_64  + x2 * cospi_24_64;

+        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;

+        out[0] = dct_const_round_shift(t0);

+        out[4] = dct_const_round_shift(t2);

+        out[8] = dct_const_round_shift(t1);

+        out[12] = dct_const_round_shift(t3);

-  for (j = 0; j < tx_dim; j++) {

-    const int16_t *pthc = pth;

+        // Stage 2

+        t0 = (s6 - s5) * cospi_16_64;

+        t1 = (s6 + s5) * cospi_16_64;

+        t2 = dct_const_round_shift(t0);

+        t3 = dct_const_round_shift(t1);

-    for (i = 0; i < tx_dim; i++) {

-      int temp = 0;

+        // Stage 3

+        x0 = s4 + t2;

+        x1 = s4 - t2;

+        x2 = s7 - t3;

+        x3 = s7 + t3;

-      for (k = 0; k < tx_dim; k++) {

-        temp += im[k] * pthc[k];

+        // Stage 4

+        t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

+        t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

+        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

+        t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

+        out[2] = dct_const_round_shift(t0);

+        out[6] = dct_const_round_shift(t2);

+        out[10] = dct_const_round_shift(t1);

+        out[14] = dct_const_round_shift(t3);

-      op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT);

-      pthc += tx_dim;

+      // Work on the next eight values; step1 -> odd_results

+      {

+        // step 2

+        temp1 = (step1[5] - step1[2]) * cospi_16_64;

+        temp2 = (step1[4] - step1[3]) * cospi_16_64;

+        step2[2] = dct_const_round_shift(temp1);

+        step2[3] = dct_const_round_shift(temp2);

+        temp1 = (step1[4] + step1[3]) * cospi_16_64;

+        temp2 = (step1[5] + step1[2]) * cospi_16_64;

+        step2[4] = dct_const_round_shift(temp1);

+        step2[5] = dct_const_round_shift(temp2);

+        // step 3

+        step3[0] = step1[0] + step2[3];

+        step3[1] = step1[1] + step2[2];

+        step3[2] = step1[1] - step2[2];

+        step3[3] = step1[0] - step2[3];

+        step3[4] = step1[7] - step2[4];

+        step3[5] = step1[6] - step2[5];

+        step3[6] = step1[6] + step2[5];

+        step3[7] = step1[7] + step2[4];

+        // step 4

+        temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;

+        temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;

+        step2[1] = dct_const_round_shift(temp1);

+        step2[2] = dct_const_round_shift(temp2);

+        temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;

+        temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;

+        step2[5] = dct_const_round_shift(temp1);

+        step2[6] = dct_const_round_shift(temp2);

+        // step 5

+        step1[0] = step3[0] + step2[1];

+        step1[1] = step3[0] - step2[1];

+        step1[2] = step3[3] - step2[2];

+        step1[3] = step3[3] + step2[2];

+        step1[4] = step3[4] + step2[5];

+        step1[5] = step3[4] - step2[5];

+        step1[6] = step3[7] - step2[6];

+        step1[7] = step3[7] + step2[6];

+        // step 6

+        temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;

+        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;

+        out[1] = dct_const_round_shift(temp1);

+        out[9] = dct_const_round_shift(temp2);

+        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;

+        temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;

+        out[5] = dct_const_round_shift(temp1);

+        out[13] = dct_const_round_shift(temp2);

+        temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;

+        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;

+        out[3] = dct_const_round_shift(temp1);

+        out[11] = dct_const_round_shift(temp2);

+        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;

+        temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;

+        out[7] = dct_const_round_shift(temp1);

+        out[15] = dct_const_round_shift(temp2);

+      }

+      // Do next column (which is a transposed row in second/horizontal pass)

+      in++;

+      out += 16;

-    im += tx_dim;  // 16

-    op += tx_dim;

+    // Setup in/out for next pass.

+    in = intermediate;

+    out = output;

-void vp9_short_fdct4x4_c(short *input, short *output, int pitch) {

-  int i;

-  int a1, b1, c1, d1;

-  short *ip = input;

-  short *op = output;

+static void fadst8_1d(int16_t *input, int16_t *output) {

+  int s0, s1, s2, s3, s4, s5, s6, s7;

-  for (i = 0; i < 4; i++) {

-    a1 = ((ip[0] + ip[3]) << 5);

-    b1 = ((ip[1] + ip[2]) << 5);

-    c1 = ((ip[1] - ip[2]) << 5);

-    d1 = ((ip[0] - ip[3]) << 5);

+  int x0 = input[7];

+  int x1 = input[0];

+  int x2 = input[5];

+  int x3 = input[2];

+  int x4 = input[3];

+  int x5 = input[4];

+  int x6 = input[1];

+  int x7 = input[6];

-    op[0] = a1 + b1;

-    op[2] = a1 - b1;

+  // stage 1

+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;

+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;

+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;

+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;

+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;

+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;

+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;

+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

-    op[1] = (c1 * 2217 + d1 * 5352 +  14500) >> 12;

-    op[3] = (d1 * 2217 - c1 * 5352 +   7500) >> 12;

+  x0 = dct_const_round_shift(s0 + s4);

+  x1 = dct_const_round_shift(s1 + s5);

+  x2 = dct_const_round_shift(s2 + s6);

+  x3 = dct_const_round_shift(s3 + s7);

+  x4 = dct_const_round_shift(s0 - s4);

+  x5 = dct_const_round_shift(s1 - s5);

+  x6 = dct_const_round_shift(s2 - s6);

+  x7 = dct_const_round_shift(s3 - s7);

-    ip += pitch / 2;

-    op += 4;

+  // stage 2

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = cospi_8_64  * x4 + cospi_24_64 * x5;

+  s5 = cospi_24_64 * x4 - cospi_8_64  * x5;

+  s6 = - cospi_24_64 * x6 + cospi_8_64  * x7;

+  s7 =   cospi_8_64  * x6 + cospi_24_64 * x7;

-  }

-  ip = output;

-  op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[12];

-    b1 = ip[4] + ip[8];

-    c1 = ip[4] - ip[8];

-    d1 = ip[0] - ip[12];

+  x0 = s0 + s2;

+  x1 = s1 + s3;

+  x2 = s0 - s2;

+  x3 = s1 - s3;

+  x4 = dct_const_round_shift(s4 + s6);

+  x5 = dct_const_round_shift(s5 + s7);

+  x6 = dct_const_round_shift(s4 - s6);

+  x7 = dct_const_round_shift(s5 - s7);

-    op[0]  = (a1 + b1 + 7) >> 4;

-    op[8]  = (a1 - b1 + 7) >> 4;

+  // stage 3

+  s2 = cospi_16_64 * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (x6 - x7);

-    op[4]  = ((c1 * 2217 + d1 * 5352 +  12000) >> 16) + (d1 != 0);

-    op[12] = (d1 * 2217 - c1 * 5352 +  51000) >> 16;

+  x2 = dct_const_round_shift(s2);

+  x3 = dct_const_round_shift(s3);

+  x6 = dct_const_round_shift(s6);

+  x7 = dct_const_round_shift(s7);

-    ip++;

-    op++;

-  }

+  output[0] =   x0;

+  output[1] = - x4;

+  output[2] =   x6;

+  output[3] = - x2;

+  output[4] =   x3;

+  output[5] = - x7;

+  output[6] =   x5;

+  output[7] = - x1;

-void vp9_short_fdct8x4_c(short *input, short *output, int pitch)

-{

-    vp9_short_fdct4x4_c(input,   output,    pitch);

-    vp9_short_fdct4x4_c(input + 4, output + 16, pitch);

-}

+static const transform_2d FHT_8[] = {

+  { fdct8_1d,  fdct8_1d  },  // DCT_DCT  = 0

+  { fadst8_1d, fdct8_1d  },  // ADST_DCT = 1

+  { fdct8_1d,  fadst8_1d },  // DCT_ADST = 2

+  { fadst8_1d, fadst8_1d }   // ADST_ADST = 3

+};

-void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {

-  int i;

-  int a1, b1, c1, d1;

-  short *ip = input;

-  short *op = output;

-  int pitch_short = pitch >> 1;

+void vp9_short_fht8x8_c(int16_t *input, int16_t *output,

+                        int pitch, TX_TYPE tx_type) {

+  int16_t out[64];

+  int16_t *outptr = &out[0];

+  int i, j;

+  int16_t temp_in[8], temp_out[8];

+  const transform_2d ht = FHT_8[tx_type];

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];

-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];

-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];

-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[4] = (c1 + d1) >> 1;

-    op[8] = (a1 - b1) >> 1;

-    op[12] = (d1 - c1) >> 1;

-    ip++;

-    op++;

+  // Columns

+  for (i = 0; i < 8; ++i) {

+    for (j = 0; j < 8; ++j)

+      temp_in[j] = input[j * pitch + i] << 2;

+    ht.cols(temp_in, temp_out);

+    for (j = 0; j < 8; ++j)

+      outptr[j * 8 + i] = temp_out[j];

-  ip = output;

-  op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[3];

-    b1 = ip[1] + ip[2];

-    c1 = ip[1] - ip[2];

-    d1 = ip[0] - ip[3];

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[1] = (c1 + d1) >> 1;

-    op[2] = (a1 - b1) >> 1;

-    op[3] = (d1 - c1) >> 1;

-    ip += 4;

-    op += 4;

+  // Rows

+  for (i = 0; i < 8; ++i) {

+    for (j = 0; j < 8; ++j)

+      temp_in[j] = out[j + i * 8];

+    ht.rows(temp_in, temp_out);

+    for (j = 0; j < 8; ++j)

+      output[j + i * 8] = temp_out[j] >> 1;

-#if CONFIG_LOSSLESS

-void vp9_short_walsh4x4_lossless_c(short *input, short *output, int pitch) {

+void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {

   int i;

   int a1, b1, c1, d1;

   short *ip = input;

@@ -822,46 +561,6 @@

   int pitch_short = pitch >> 1;

   for (i = 0; i < 4; i++) {

-    a1 = (ip[0 * pitch_short] + ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

-    b1 = (ip[1 * pitch_short] + ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

-    c1 = (ip[1 * pitch_short] - ip[2 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

-    d1 = (ip[0 * pitch_short] - ip[3 * pitch_short]) >> Y2_WHT_UPSCALE_FACTOR;

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[4] = (c1 + d1) >> 1;

-    op[8] = (a1 - b1) >> 1;

-    op[12] = (d1 - c1) >> 1;

-    ip++;

-    op++;

-  }

-  ip = output;

-  op = output;

-  for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[3];

-    b1 = ip[1] + ip[2];

-    c1 = ip[1] - ip[2];

-    d1 = ip[0] - ip[3];

-    op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    op[1] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    op[2] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    op[3] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR;

-    ip += 4;

-    op += 4;

-  }

-}

-void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) {

-  int i;

-  int a1, b1, c1, d1;

-  short *ip = input;

-  short *op = output;

-  int pitch_short = pitch >> 1;

-  for (i = 0; i < 4; i++) {

     a1 = ip[0 * pitch_short] + ip[3 * pitch_short];

     b1 = ip[1 * pitch_short] + ip[2 * pitch_short];

     c1 = ip[1 * pitch_short] - ip[2 * pitch_short];

@@ -894,1495 +593,658 @@

-void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) {

-  vp9_short_walsh4x4_x8_c(input,   output,    pitch);

-  vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch);

+void vp9_short_walsh8x4_c(short *input, short *output, int pitch) {

+  vp9_short_walsh4x4_c(input,   output,    pitch);

+  vp9_short_walsh4x4_c(input + 4, output + 16, pitch);

-#endif

-#define TEST_INT_16x16_DCT 1

-#if !TEST_INT_16x16_DCT

-static void dct16x16_1d(double input[16], double output[16]) {

-  static const double C1 = 0.995184726672197;

-  static const double C2 = 0.98078528040323;

-  static const double C3 = 0.956940335732209;

-  static const double C4 = 0.923879532511287;

-  static const double C5 = 0.881921264348355;

-  static const double C6 = 0.831469612302545;

-  static const double C7 = 0.773010453362737;

-  static const double C8 = 0.707106781186548;

-  static const double C9 = 0.634393284163646;

-  static const double C10 = 0.555570233019602;

-  static const double C11 = 0.471396736825998;

-  static const double C12 = 0.38268343236509;

-  static const double C13 = 0.290284677254462;

-  static const double C14 = 0.195090322016128;

-  static const double C15 = 0.098017140329561;

+// Rewrote to use same algorithm as others.

+static void fdct16_1d(int16_t in[16], int16_t out[16]) {

+  /*canbe16*/ int step1[8];

+  /*canbe16*/ int step2[8];

+  /*canbe16*/ int step3[8];

+  /*canbe16*/ int input[8];

+  /*needs32*/ int temp1, temp2;

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-  {

-    double step[16];

-    double intermediate[16];

-    double temp1, temp2;

+  // step 1

+  input[0] = in[0] + in[15];

+  input[1] = in[1] + in[14];

+  input[2] = in[2] + in[13];

+  input[3] = in[3] + in[12];

+  input[4] = in[4] + in[11];

+  input[5] = in[5] + in[10];

+  input[6] = in[6] + in[ 9];

+  input[7] = in[7] + in[ 8];

-    // step 1

-    step[ 0] = input[0] + input[15];

-    step[ 1] = input[1] + input[14];

-    step[ 2] = input[2] + input[13];

-    step[ 3] = input[3] + input[12];

-    step[ 4] = input[4] + input[11];

-    step[ 5] = input[5] + input[10];

-    step[ 6] = input[6] + input[ 9];

-    step[ 7] = input[7] + input[ 8];

-    step[ 8] = input[7] - input[ 8];

-    step[ 9] = input[6] - input[ 9];

-    step[10] = input[5] - input[10];

-    step[11] = input[4] - input[11];

-    step[12] = input[3] - input[12];

-    step[13] = input[2] - input[13];

-    step[14] = input[1] - input[14];

-    step[15] = input[0] - input[15];

+  step1[0] = in[7] - in[ 8];

+  step1[1] = in[6] - in[ 9];

+  step1[2] = in[5] - in[10];

+  step1[3] = in[4] - in[11];

+  step1[4] = in[3] - in[12];

+  step1[5] = in[2] - in[13];

+  step1[6] = in[1] - in[14];

+  step1[7] = in[0] - in[15];

-    // step 2

-    output[0] = step[0] + step[7];

-    output[1] = step[1] + step[6];

-    output[2] = step[2] + step[5];

-    output[3] = step[3] + step[4];

-    output[4] = step[3] - step[4];

-    output[5] = step[2] - step[5];

-    output[6] = step[1] - step[6];

-    output[7] = step[0] - step[7];

+  // fdct8_1d(step, step);

+  {

+    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;

+    /*needs32*/ int t0, t1, t2, t3;

+    /*canbe16*/ int x0, x1, x2, x3;

-    temp1 = step[ 8]*C7;

-    temp2 = step[15]*C9;

-    output[ 8] = temp1 + temp2;

+    // stage 1

+    s0 = input[0] + input[7];

+    s1 = input[1] + input[6];

+    s2 = input[2] + input[5];

+    s3 = input[3] + input[4];

+    s4 = input[3] - input[4];

+    s5 = input[2] - input[5];

+    s6 = input[1] - input[6];

+    s7 = input[0] - input[7];

-    temp1 = step[ 9]*C11;

-    temp2 = step[14]*C5;

-    output[ 9] = temp1 - temp2;

+    // fdct4_1d(step, step);

+    x0 = s0 + s3;

+    x1 = s1 + s2;

+    x2 = s1 - s2;

+    x3 = s0 - s3;

+    t0 = (x0 + x1) * cospi_16_64;

+    t1 = (x0 - x1) * cospi_16_64;

+    t2 = x3 * cospi_8_64  + x2 * cospi_24_64;

+    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;

+    out[0] = dct_const_round_shift(t0);

+    out[4] = dct_const_round_shift(t2);

+    out[8] = dct_const_round_shift(t1);

+    out[12] = dct_const_round_shift(t3);

-    temp1 = step[10]*C3;

-    temp2 = step[13]*C13;

-    output[10] = temp1 + temp2;

+    // Stage 2

+    t0 = (s6 - s5) * cospi_16_64;

+    t1 = (s6 + s5) * cospi_16_64;

+    t2 = dct_const_round_shift(t0);

+    t3 = dct_const_round_shift(t1);

-    temp1 = step[11]*C15;

-    temp2 = step[12]*C1;

-    output[11] = temp1 - temp2;

+    // Stage 3

+    x0 = s4 + t2;

+    x1 = s4 - t2;

+    x2 = s7 - t3;

+    x3 = s7 + t3;

-    temp1 = step[11]*C1;

-    temp2 = step[12]*C15;

-    output[12] = temp2 + temp1;

+    // Stage 4

+    t0 = x0 * cospi_28_64 + x3 *   cospi_4_64;

+    t1 = x1 * cospi_12_64 + x2 *  cospi_20_64;

+    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;

+    t3 = x3 * cospi_28_64 + x0 *  -cospi_4_64;

+    out[2] = dct_const_round_shift(t0);

+    out[6] = dct_const_round_shift(t2);

+    out[10] = dct_const_round_shift(t1);

+    out[14] = dct_const_round_shift(t3);

+  }

-    temp1 = step[10]*C13;

-    temp2 = step[13]*C3;

-    output[13] = temp2 - temp1;

+  // step 2

+  temp1 = (step1[5] - step1[2]) * cospi_16_64;

+  temp2 = (step1[4] - step1[3]) * cospi_16_64;

+  step2[2] = dct_const_round_shift(temp1);

+  step2[3] = dct_const_round_shift(temp2);

+  temp1 = (step1[4] + step1[3]) * cospi_16_64;

+  temp2 = (step1[5] + step1[2]) * cospi_16_64;

+  step2[4] = dct_const_round_shift(temp1);

+  step2[5] = dct_const_round_shift(temp2);

-    temp1 = step[ 9]*C5;

-    temp2 = step[14]*C11;

-    output[14] = temp2 + temp1;

+  // step 3

+  step3[0] = step1[0] + step2[3];

+  step3[1] = step1[1] + step2[2];

+  step3[2] = step1[1] - step2[2];

+  step3[3] = step1[0] - step2[3];

+  step3[4] = step1[7] - step2[4];

+  step3[5] = step1[6] - step2[5];

+  step3[6] = step1[6] + step2[5];

+  step3[7] = step1[7] + step2[4];

-    temp1 = step[ 8]*C9;

-    temp2 = step[15]*C7;

-    output[15] = temp2 - temp1;

+  // step 4

+  temp1 = step3[1] *  -cospi_8_64 + step3[6] * cospi_24_64;

+  temp2 = step3[2] * -cospi_24_64 - step3[5] *  cospi_8_64;

+  step2[1] = dct_const_round_shift(temp1);

+  step2[2] = dct_const_round_shift(temp2);

+  temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;

+  temp2 = step3[1] * cospi_24_64 + step3[6] *  cospi_8_64;

+  step2[5] = dct_const_round_shift(temp1);

+  step2[6] = dct_const_round_shift(temp2);

-    // step 3

-    step[ 0] = output[0] + output[3];

-    step[ 1] = output[1] + output[2];

-    step[ 2] = output[1] - output[2];

-    step[ 3] = output[0] - output[3];

+  // step 5

+  step1[0] = step3[0] + step2[1];

+  step1[1] = step3[0] - step2[1];

+  step1[2] = step3[3] - step2[2];

+  step1[3] = step3[3] + step2[2];

+  step1[4] = step3[4] + step2[5];

+  step1[5] = step3[4] - step2[5];

+  step1[6] = step3[7] - step2[6];

+  step1[7] = step3[7] + step2[6];

-    temp1 = output[4]*C14;

-    temp2 = output[7]*C2;

-    step[ 4] = temp1 + temp2;

+  // step 6

+  temp1 = step1[0] * cospi_30_64 + step1[7] *  cospi_2_64;

+  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;

+  out[1] = dct_const_round_shift(temp1);

+  out[9] = dct_const_round_shift(temp2);

-    temp1 = output[5]*C10;

-    temp2 = output[6]*C6;

-    step[ 5] = temp1 + temp2;

+  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;

+  temp2 = step1[3] *  cospi_6_64 + step1[4] * cospi_26_64;

+  out[5] = dct_const_round_shift(temp1);

+  out[13] = dct_const_round_shift(temp2);

-    temp1 = output[5]*C6;

-    temp2 = output[6]*C10;

-    step[ 6] = temp2 - temp1;

+  temp1 = step1[3] * -cospi_26_64 + step1[4] *  cospi_6_64;

+  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;

+  out[3] = dct_const_round_shift(temp1);

+  out[11] = dct_const_round_shift(temp2);

-    temp1 = output[4]*C2;

-    temp2 = output[7]*C14;

-    step[ 7] = temp2 - temp1;

-    step[ 8] = output[ 8] + output[11];

-    step[ 9] = output[ 9] + output[10];

-    step[10] = output[ 9] - output[10];

-    step[11] = output[ 8] - output[11];

-    step[12] = output[12] + output[15];

-    step[13] = output[13] + output[14];

-    step[14] = output[13] - output[14];

-    step[15] = output[12] - output[15];

-    // step 4

-    output[ 0] = (step[ 0] + step[ 1]);

-    output[ 8] = (step[ 0] - step[ 1]);

-    temp1 = step[2]*C12;

-    temp2 = step[3]*C4;

-    temp1 = temp1 + temp2;

-    output[ 4] = 2*(temp1*C8);

-    temp1 = step[2]*C4;

-    temp2 = step[3]*C12;

-    temp1 = temp2 - temp1;

-    output[12] = 2*(temp1*C8);

-    output[ 2] = 2*((step[4] + step[ 5])*C8);

-    output[14] = 2*((step[7] - step[ 6])*C8);

-    temp1 = step[4] - step[5];

-    temp2 = step[6] + step[7];

-    output[ 6] = (temp1 + temp2);

-    output[10] = (temp1 - temp2);

-    intermediate[8] = step[8] + step[14];

-    intermediate[9] = step[9] + step[15];

-    temp1 = intermediate[8]*C12;

-    temp2 = intermediate[9]*C4;

-    temp1 = temp1 - temp2;

-    output[3] = 2*(temp1*C8);

-    temp1 = intermediate[8]*C4;

-    temp2 = intermediate[9]*C12;

-    temp1 = temp2 + temp1;

-    output[13] = 2*(temp1*C8);

-    output[ 9] = 2*((step[10] + step[11])*C8);

-    intermediate[11] = step[10] - step[11];

-    intermediate[12] = step[12] + step[13];

-    intermediate[13] = step[12] - step[13];

-    intermediate[14] = step[ 8] - step[14];

-    intermediate[15] = step[ 9] - step[15];

-    output[15] = (intermediate[11] + intermediate[12]);

-    output[ 1] = -(intermediate[11] - intermediate[12]);

-    output[ 7] = 2*(intermediate[13]*C8);

-    temp1 = intermediate[14]*C12;

-    temp2 = intermediate[15]*C4;

-    temp1 = temp1 - temp2;

-    output[11] = -2*(temp1*C8);

-    temp1 = intermediate[14]*C4;

-    temp2 = intermediate[15]*C12;

-    temp1 = temp2 + temp1;

-    output[ 5] = 2*(temp1*C8);

-  }

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

+  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;

+  temp2 = step1[0] *  -cospi_2_64 + step1[7] * cospi_30_64;

+  out[7] = dct_const_round_shift(temp1);

+  out[15] = dct_const_round_shift(temp2);

-void vp9_short_fdct16x16_c(short *input, short *out, int pitch) {

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-  {

-    int shortpitch = pitch >> 1;

-    int i, j;

-    double output[256];

-    // First transform columns

-    for (i = 0; i < 16; i++) {

-        double temp_in[16], temp_out[16];

-        for (j = 0; j < 16; j++)

-            temp_in[j] = input[j*shortpitch + i];

-        dct16x16_1d(temp_in, temp_out);

-        for (j = 0; j < 16; j++)

-            output[j*16 + i] = temp_out[j];

-    }

-    // Then transform rows

-    for (i = 0; i < 16; ++i) {

-        double temp_in[16], temp_out[16];

-        for (j = 0; j < 16; ++j)

-            temp_in[j] = output[j + i*16];

-        dct16x16_1d(temp_in, temp_out);

-        for (j = 0; j < 16; ++j)

-            output[j + i*16] = temp_out[j];

-    }

-    // Scale by some magic number

-    for (i = 0; i < 256; i++)

-        out[i] = (short)round(output[i]/2);

-  }

-  vp9_clear_system_state(); // Make it simd safe : __asm emms;

-}

+void fadst16_1d(int16_t *input, int16_t *output) {

+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

-#else

-static const int16_t C1 = 16305;

-static const int16_t C2 = 16069;

-static const int16_t C3 = 15679;

-static const int16_t C4 = 15137;

-static const int16_t C5 = 14449;

-static const int16_t C6 = 13623;

-static const int16_t C7 = 12665;

-static const int16_t C8 = 11585;

-static const int16_t C9 = 10394;

-static const int16_t C10 = 9102;

-static const int16_t C11 = 7723;

-static const int16_t C12 = 6270;

-static const int16_t C13 = 4756;

-static const int16_t C14 = 3196;

-static const int16_t C15 = 1606;

-#define RIGHT_SHIFT 14

-#define ROUNDING (1 << (RIGHT_SHIFT - 1))

-static void dct16x16_1d(int16_t input[16], int16_t output[16],

-                        int last_shift_bits) {

-    int16_t step[16];

-    int intermediate[16];

-    int temp1, temp2;

-    int final_shift = RIGHT_SHIFT;

-    int final_rounding = ROUNDING;

-    int output_shift = 0;

-    int output_rounding = 0;

-    final_shift += last_shift_bits;

-    if (final_shift > 0)

-    final_rounding = 1 << (final_shift - 1);

-    output_shift += last_shift_bits;

-    if (output_shift > 0)

-      output_rounding = 1 << (output_shift - 1);

-    // step 1

-    step[ 0] = input[0] + input[15];

-    step[ 1] = input[1] + input[14];

-    step[ 2] = input[2] + input[13];

-    step[ 3] = input[3] + input[12];

-    step[ 4] = input[4] + input[11];

-    step[ 5] = input[5] + input[10];

-    step[ 6] = input[6] + input[ 9];

-    step[ 7] = input[7] + input[ 8];

-    step[ 8] = input[7] - input[ 8];

-    step[ 9] = input[6] - input[ 9];

-    step[10] = input[5] - input[10];

-    step[11] = input[4] - input[11];

-    step[12] = input[3] - input[12];

-    step[13] = input[2] - input[13];

-    step[14] = input[1] - input[14];

-    step[15] = input[0] - input[15];

-    // step 2

-    output[0] = step[0] + step[7];

-    output[1] = step[1] + step[6];

-    output[2] = step[2] + step[5];

-    output[3] = step[3] + step[4];

-    output[4] = step[3] - step[4];

-    output[5] = step[2] - step[5];

-    output[6] = step[1] - step[6];

-    output[7] = step[0] - step[7];

-    temp1 = step[ 8] * C7;

-    temp2 = step[15] * C9;

-    output[ 8] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[ 9] * C11;

-    temp2 = step[14] * C5;

-    output[ 9] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[10] * C3;

-    temp2 = step[13] * C13;

-    output[10] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[11] * C15;

-    temp2 = step[12] * C1;

-    output[11] = (temp1 - temp2 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[11] * C1;

-    temp2 = step[12] * C15;

-    output[12] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[10] * C13;

-    temp2 = step[13] * C3;

-    output[13] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[ 9] * C5;

-    temp2 = step[14] * C11;

-    output[14] = (temp2 + temp1 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = step[ 8] * C9;

-    temp2 = step[15] * C7;

-    output[15] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

-    // step 3

-    step[ 0] = output[0] + output[3];

-    step[ 1] = output[1] + output[2];

-    step[ 2] = output[1] - output[2];

-    step[ 3] = output[0] - output[3];

-    temp1 = output[4] * C14;

-    temp2 = output[7] * C2;

-    step[ 4] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[5] * C10;

-    temp2 = output[6] * C6;

-    step[ 5] = (temp1 + temp2 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[5] * C6;

-    temp2 = output[6] * C10;

-    step[ 6] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = output[4] * C2;

-    temp2 = output[7] * C14;

-    step[ 7] = (temp2 - temp1 + ROUNDING) >> RIGHT_SHIFT;

-    step[ 8] = output[ 8] + output[11];

-    step[ 9] = output[ 9] + output[10];

-    step[10] = output[ 9] - output[10];

-    step[11] = output[ 8] - output[11];

-    step[12] = output[12] + output[15];

-    step[13] = output[13] + output[14];

-    step[14] = output[13] - output[14];

-    step[15] = output[12] - output[15];

-    // step 4

-    output[ 0] = (step[ 0] + step[ 1] + output_rounding) >> output_shift;

-    output[ 8] = (step[ 0] - step[ 1] + output_rounding) >> output_shift;

+  int x0 = input[15];

+  int x1 = input[0];

+  int x2 = input[13];

+  int x3 = input[2];

+  int x4 = input[11];

+  int x5 = input[4];

+  int x6 = input[9];

+  int x7 = input[6];

+  int x8 = input[7];

+  int x9 = input[8];

+  int x10 = input[5];

+  int x11 = input[10];

+  int x12 = input[3];

+  int x13 = input[12];

+  int x14 = input[1];

+  int x15 = input[14];

-    temp1 = step[2] * C12;

-    temp2 = step[3] * C4;

-    temp1 = (temp1 + temp2 + final_rounding) >> final_shift;

-    output[ 4] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

+  // stage 1

+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;

+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;

+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;

+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;

+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;

+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;

+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;

+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;

+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;

+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;

+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;

+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;

+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;

+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;

+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;

+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;

-    temp1 = step[2] * C4;

-    temp2 = step[3] * C12;

-    temp1 = (temp2 - temp1 + final_rounding) >> final_shift;

-    output[12] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

+  x0 = dct_const_round_shift(s0 + s8);

+  x1 = dct_const_round_shift(s1 + s9);

+  x2 = dct_const_round_shift(s2 + s10);

+  x3 = dct_const_round_shift(s3 + s11);

+  x4 = dct_const_round_shift(s4 + s12);

+  x5 = dct_const_round_shift(s5 + s13);

+  x6 = dct_const_round_shift(s6 + s14);

+  x7 = dct_const_round_shift(s7 + s15);

+  x8  = dct_const_round_shift(s0 - s8);

+  x9  = dct_const_round_shift(s1 - s9);

+  x10 = dct_const_round_shift(s2 - s10);

+  x11 = dct_const_round_shift(s3 - s11);

+  x12 = dct_const_round_shift(s4 - s12);

+  x13 = dct_const_round_shift(s5 - s13);

+  x14 = dct_const_round_shift(s6 - s14);

+  x15 = dct_const_round_shift(s7 - s15);

-    output[ 2] = (2 * ((step[4] + step[ 5]) * C8) + final_rounding)

-        >> final_shift;

-    output[14] = (2 * ((step[7] - step[ 6]) * C8) + final_rounding)

-        >> final_shift;

+  // stage 2

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4;

+  s5 = x5;

+  s6 = x6;

+  s7 = x7;

+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;

+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;

+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;

+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;

+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;

+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;

+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;

+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;

-    temp1 = step[4] - step[5];

-    temp2 = step[6] + step[7];

-    output[ 6] = (temp1 + temp2 + output_rounding) >> output_shift;

-    output[10] = (temp1 - temp2 + output_rounding) >> output_shift;

+  x0 = s0 + s4;

+  x1 = s1 + s5;

+  x2 = s2 + s6;

+  x3 = s3 + s7;

+  x4 = s0 - s4;

+  x5 = s1 - s5;

+  x6 = s2 - s6;

+  x7 = s3 - s7;

+  x8 = dct_const_round_shift(s8 + s12);

+  x9 = dct_const_round_shift(s9 + s13);

+  x10 = dct_const_round_shift(s10 + s14);

+  x11 = dct_const_round_shift(s11 + s15);

+  x12 = dct_const_round_shift(s8 - s12);

+  x13 = dct_const_round_shift(s9 - s13);

+  x14 = dct_const_round_shift(s10 - s14);

+  x15 = dct_const_round_shift(s11 - s15);

-    intermediate[8] = step[8] + step[14];

-    intermediate[9] = step[9] + step[15];

+  // stage 3

+  s0 = x0;

+  s1 = x1;

+  s2 = x2;

+  s3 = x3;

+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;

+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;

+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;

+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;

+  s8 = x8;

+  s9 = x9;

+  s10 = x10;

+  s11 = x11;

+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;

+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;

+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;

+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;

-    temp1 = intermediate[8] * C12;

-    temp2 = intermediate[9] * C4;

-    temp1 = (temp1 - temp2 + final_rounding) >> final_shift;

-    output[3] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = intermediate[8] * C4;

-    temp2 = intermediate[9] * C12;

-    temp1 = (temp2 + temp1 + final_rounding) >> final_shift;

-    output[13] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

-    output[ 9] = (2 * ((step[10] + step[11]) * C8) + final_rounding)

-        >> final_shift;

-    intermediate[11] = step[10] - step[11];

-    intermediate[12] = step[12] + step[13];

-    intermediate[13] = step[12] - step[13];

-    intermediate[14] = step[ 8] - step[14];

-    intermediate[15] = step[ 9] - step[15];

+  x0 = s0 + s2;

+  x1 = s1 + s3;

+  x2 = s0 - s2;

+  x3 = s1 - s3;

+  x4 = dct_const_round_shift(s4 + s6);

+  x5 = dct_const_round_shift(s5 + s7);

+  x6 = dct_const_round_shift(s4 - s6);

+  x7 = dct_const_round_shift(s5 - s7);

+  x8 = s8 + s10;

+  x9 = s9 + s11;

+  x10 = s8 - s10;

+  x11 = s9 - s11;

+  x12 = dct_const_round_shift(s12 + s14);

+  x13 = dct_const_round_shift(s13 + s15);

+  x14 = dct_const_round_shift(s12 - s14);

+  x15 = dct_const_round_shift(s13 - s15);

-    output[15] = (intermediate[11] + intermediate[12] + output_rounding)

-        >> output_shift;

-    output[ 1] = -(intermediate[11] - intermediate[12] + output_rounding)

-        >> output_shift;

+  // stage 4

+  s2 = (- cospi_16_64) * (x2 + x3);

+  s3 = cospi_16_64 * (x2 - x3);

+  s6 = cospi_16_64 * (x6 + x7);

+  s7 = cospi_16_64 * (- x6 + x7);

+  s10 = cospi_16_64 * (x10 + x11);

+  s11 = cospi_16_64 * (- x10 + x11);

+  s14 = (- cospi_16_64) * (x14 + x15);

+  s15 = cospi_16_64 * (x14 - x15);

-    output[ 7] = (2 * (intermediate[13] * C8) + final_rounding) >> final_shift;

+  x2 = dct_const_round_shift(s2);

+  x3 = dct_const_round_shift(s3);

+  x6 = dct_const_round_shift(s6);

+  x7 = dct_const_round_shift(s7);

+  x10 = dct_const_round_shift(s10);

+  x11 = dct_const_round_shift(s11);

+  x14 = dct_const_round_shift(s14);

+  x15 = dct_const_round_shift(s15);

-    temp1 = intermediate[14] * C12;

-    temp2 = intermediate[15] * C4;

-    temp1 = (temp1 - temp2 + final_rounding) >> final_shift;

-    output[11] = (-2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

-    temp1 = intermediate[14] * C4;

-    temp2 = intermediate[15] * C12;

-    temp1 = (temp2 + temp1 + final_rounding) >> final_shift;

-    output[ 5] = (2 * (temp1 * C8) + ROUNDING) >> RIGHT_SHIFT;

+  output[0] = x0;

+  output[1] = - x8;

+  output[2] = x12;

+  output[3] = - x4;

+  output[4] = x6;

+  output[5] = x14;

+  output[6] = x10;

+  output[7] = x2;

+  output[8] = x3;

+  output[9] =  x11;

+  output[10] = x15;

+  output[11] = x7;

+  output[12] = x5;

+  output[13] = - x13;

+  output[14] = x9;

+  output[15] = - x1;

-void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) {

-    int shortpitch = pitch >> 1;

-    int i, j;

-    int16_t output[256];

-    int16_t *outptr = &output[0];

+static const transform_2d FHT_16[] = {

+  { fdct16_1d,  fdct16_1d  },  // DCT_DCT  = 0

+  { fadst16_1d, fdct16_1d  },  // ADST_DCT = 1

+  { fdct16_1d,  fadst16_1d },  // DCT_ADST = 2

+  { fadst16_1d, fadst16_1d }   // ADST_ADST = 3

+};

-    // First transform columns

-    for (i = 0; i < 16; i++) {

-        int16_t temp_in[16];

-        int16_t temp_out[16];

-        for (j = 0; j < 16; j++)

-            temp_in[j] = input[j * shortpitch + i];

-        dct16x16_1d(temp_in, temp_out, 0);

-        for (j = 0; j < 16; j++)

-            output[j * 16 + i] = temp_out[j];

-    }

+void vp9_short_fht16x16_c(int16_t *input, int16_t *output,

+                          int pitch, TX_TYPE tx_type) {

+  int16_t out[256];

+  int16_t *outptr = &out[0];

+  int i, j;

+  int16_t temp_in[16], temp_out[16];

+  const transform_2d ht = FHT_16[tx_type];

-    // Then transform rows

-    for (i = 0; i < 16; ++i) {

-        dct16x16_1d(outptr, out, 1);

-        outptr += 16;

-        out += 16;

-    }

+  // Columns

+  for (i = 0; i < 16; ++i) {

+    for (j = 0; j < 16; ++j)

+      temp_in[j] = input[j * pitch + i] << 2;

+    ht.cols(temp_in, temp_out);

+    for (j = 0; j < 16; ++j)

+      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

+  }

+  // Rows

+  for (i = 0; i < 16; ++i) {

+    for (j = 0; j < 16; ++j)

+      temp_in[j] = out[j + i * 16];

+    ht.rows(temp_in, temp_out);

+    for (j = 0; j < 16; ++j)

+      output[j + i * 16] = temp_out[j];

+  }

-#undef RIGHT_SHIFT

-#undef ROUNDING

-#endif

-#if !CONFIG_DWTDCTHYBRID

-static void dct32_1d(double *input, double *output, int stride) {

-  static const double C1 = 0.998795456205;  // cos(pi * 1 / 64)

-  static const double C2 = 0.995184726672;  // cos(pi * 2 / 64)

-  static const double C3 = 0.989176509965;  // cos(pi * 3 / 64)

-  static const double C4 = 0.980785280403;  // cos(pi * 4 / 64)

-  static const double C5 = 0.970031253195;  // cos(pi * 5 / 64)

-  static const double C6 = 0.956940335732;  // cos(pi * 6 / 64)

-  static const double C7 = 0.941544065183;  // cos(pi * 7 / 64)

-  static const double C8 = 0.923879532511;  // cos(pi * 8 / 64)

-  static const double C9 = 0.903989293123;  // cos(pi * 9 / 64)

-  static const double C10 = 0.881921264348;  // cos(pi * 10 / 64)

-  static const double C11 = 0.857728610000;  // cos(pi * 11 / 64)

-  static const double C12 = 0.831469612303;  // cos(pi * 12 / 64)

-  static const double C13 = 0.803207531481;  // cos(pi * 13 / 64)

-  static const double C14 = 0.773010453363;  // cos(pi * 14 / 64)

-  static const double C15 = 0.740951125355;  // cos(pi * 15 / 64)

-  static const double C16 = 0.707106781187;  // cos(pi * 16 / 64)

-  static const double C17 = 0.671558954847;  // cos(pi * 17 / 64)

-  static const double C18 = 0.634393284164;  // cos(pi * 18 / 64)

-  static const double C19 = 0.595699304492;  // cos(pi * 19 / 64)

-  static const double C20 = 0.555570233020;  // cos(pi * 20 / 64)

-  static const double C21 = 0.514102744193;  // cos(pi * 21 / 64)

-  static const double C22 = 0.471396736826;  // cos(pi * 22 / 64)

-  static const double C23 = 0.427555093430;  // cos(pi * 23 / 64)

-  static const double C24 = 0.382683432365;  // cos(pi * 24 / 64)

-  static const double C25 = 0.336889853392;  // cos(pi * 25 / 64)

-  static const double C26 = 0.290284677254;  // cos(pi * 26 / 64)

-  static const double C27 = 0.242980179903;  // cos(pi * 27 / 64)

-  static const double C28 = 0.195090322016;  // cos(pi * 28 / 64)

-  static const double C29 = 0.146730474455;  // cos(pi * 29 / 64)

-  static const double C30 = 0.098017140330;  // cos(pi * 30 / 64)

-  static const double C31 = 0.049067674327;  // cos(pi * 31 / 64)

-  double step[32];

+static void dct32_1d(int *input, int *output) {

+  int step[32];

   // Stage 1

-  step[0] = input[stride*0] + input[stride*(32 - 1)];

-  step[1] = input[stride*1] + input[stride*(32 - 2)];

-  step[2] = input[stride*2] + input[stride*(32 - 3)];

-  step[3] = input[stride*3] + input[stride*(32 - 4)];

-  step[4] = input[stride*4] + input[stride*(32 - 5)];

-  step[5] = input[stride*5] + input[stride*(32 - 6)];

-  step[6] = input[stride*6] + input[stride*(32 - 7)];

-  step[7] = input[stride*7] + input[stride*(32 - 8)];

-  step[8] = input[stride*8] + input[stride*(32 - 9)];

-  step[9] = input[stride*9] + input[stride*(32 - 10)];

-  step[10] = input[stride*10] + input[stride*(32 - 11)];

-  step[11] = input[stride*11] + input[stride*(32 - 12)];

-  step[12] = input[stride*12] + input[stride*(32 - 13)];

-  step[13] = input[stride*13] + input[stride*(32 - 14)];

-  step[14] = input[stride*14] + input[stride*(32 - 15)];

-  step[15] = input[stride*15] + input[stride*(32 - 16)];

-  step[16] = -input[stride*16] + input[stride*(32 - 17)];

-  step[17] = -input[stride*17] + input[stride*(32 - 18)];

-  step[18] = -input[stride*18] + input[stride*(32 - 19)];

-  step[19] = -input[stride*19] + input[stride*(32 - 20)];

-  step[20] = -input[stride*20] + input[stride*(32 - 21)];

-  step[21] = -input[stride*21] + input[stride*(32 - 22)];

-  step[22] = -input[stride*22] + input[stride*(32 - 23)];

-  step[23] = -input[stride*23] + input[stride*(32 - 24)];

-  step[24] = -input[stride*24] + input[stride*(32 - 25)];

-  step[25] = -input[stride*25] + input[stride*(32 - 26)];

-  step[26] = -input[stride*26] + input[stride*(32 - 27)];

-  step[27] = -input[stride*27] + input[stride*(32 - 28)];

-  step[28] = -input[stride*28] + input[stride*(32 - 29)];

-  step[29] = -input[stride*29] + input[stride*(32 - 30)];

-  step[30] = -input[stride*30] + input[stride*(32 - 31)];

-  step[31] = -input[stride*31] + input[stride*(32 - 32)];

+  step[0] = input[0] + input[(32 - 1)];

+  step[1] = input[1] + input[(32 - 2)];

+  step[2] = input[2] + input[(32 - 3)];

+  step[3] = input[3] + input[(32 - 4)];

+  step[4] = input[4] + input[(32 - 5)];

+  step[5] = input[5] + input[(32 - 6)];

+  step[6] = input[6] + input[(32 - 7)];

+  step[7] = input[7] + input[(32 - 8)];

+  step[8] = input[8] + input[(32 - 9)];

+  step[9] = input[9] + input[(32 - 10)];

+  step[10] = input[10] + input[(32 - 11)];

+  step[11] = input[11] + input[(32 - 12)];

+  step[12] = input[12] + input[(32 - 13)];

+  step[13] = input[13] + input[(32 - 14)];

+  step[14] = input[14] + input[(32 - 15)];

+  step[15] = input[15] + input[(32 - 16)];

+  step[16] = -input[16] + input[(32 - 17)];

+  step[17] = -input[17] + input[(32 - 18)];

+  step[18] = -input[18] + input[(32 - 19)];

+  step[19] = -input[19] + input[(32 - 20)];

+  step[20] = -input[20] + input[(32 - 21)];

+  step[21] = -input[21] + input[(32 - 22)];

+  step[22] = -input[22] + input[(32 - 23)];

+  step[23] = -input[23] + input[(32 - 24)];

+  step[24] = -input[24] + input[(32 - 25)];

+  step[25] = -input[25] + input[(32 - 26)];

+  step[26] = -input[26] + input[(32 - 27)];

+  step[27] = -input[27] + input[(32 - 28)];

+  step[28] = -input[28] + input[(32 - 29)];

+  step[29] = -input[29] + input[(32 - 30)];

+  step[30] = -input[30] + input[(32 - 31)];

+  step[31] = -input[31] + input[(32 - 32)];

   // Stage 2

-  output[stride*0] = step[0] + step[16 - 1];

-  output[stride*1] = step[1] + step[16 - 2];

-  output[stride*2] = step[2] + step[16 - 3];

-  output[stride*3] = step[3] + step[16 - 4];

-  output[stride*4] = step[4] + step[16 - 5];

-  output[stride*5] = step[5] + step[16 - 6];

-  output[stride*6] = step[6] + step[16 - 7];

-  output[stride*7] = step[7] + step[16 - 8];

-  output[stride*8] = -step[8] + step[16 - 9];

-  output[stride*9] = -step[9] + step[16 - 10];

-  output[stride*10] = -step[10] + step[16 - 11];

-  output[stride*11] = -step[11] + step[16 - 12];

-  output[stride*12] = -step[12] + step[16 - 13];

-  output[stride*13] = -step[13] + step[16 - 14];

-  output[stride*14] = -step[14] + step[16 - 15];

-  output[stride*15] = -step[15] + step[16 - 16];

+  output[0] = step[0] + step[16 - 1];

+  output[1] = step[1] + step[16 - 2];

+  output[2] = step[2] + step[16 - 3];

+  output[3] = step[3] + step[16 - 4];

+  output[4] = step[4] + step[16 - 5];

+  output[5] = step[5] + step[16 - 6];

+  output[6] = step[6] + step[16 - 7];

+  output[7] = step[7] + step[16 - 8];

+  output[8] = -step[8] + step[16 - 9];

+  output[9] = -step[9] + step[16 - 10];

+  output[10] = -step[10] + step[16 - 11];

+  output[11] = -step[11] + step[16 - 12];

+  output[12] = -step[12] + step[16 - 13];

+  output[13] = -step[13] + step[16 - 14];

+  output[14] = -step[14] + step[16 - 15];

+  output[15] = -step[15] + step[16 - 16];

-  output[stride*16] = step[16];

-  output[stride*17] = step[17];

-  output[stride*18] = step[18];

-  output[stride*19] = step[19];

+  output[16] = step[16];

+  output[17] = step[17];

+  output[18] = step[18];

+  output[19] = step[19];

-  output[stride*20] = (-step[20] + step[27])*C16;

-  output[stride*21] = (-step[21] + step[26])*C16;

-  output[stride*22] = (-step[22] + step[25])*C16;

-  output[stride*23] = (-step[23] + step[24])*C16;

+  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);

+  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);

+  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);

+  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);

-  output[stride*24] = (step[24] + step[23])*C16;

-  output[stride*25] = (step[25] + step[22])*C16;

-  output[stride*26] = (step[26] + step[21])*C16;

-  output[stride*27] = (step[27] + step[20])*C16;

+  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);

+  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);

+  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);

+  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);

-  output[stride*28] = step[28];

-  output[stride*29] = step[29];

-  output[stride*30] = step[30];

-  output[stride*31] = step[31];

+  output[28] = step[28];

+  output[29] = step[29];

+  output[30] = step[30];

+  output[31] = step[31];

   // Stage 3

-  step[0] = output[stride*0] + output[stride*(8 - 1)];

-  step[1] = output[stride*1] + output[stride*(8 - 2)];

-  step[2] = output[stride*2] + output[stride*(8 - 3)];

-  step[3] = output[stride*3] + output[stride*(8 - 4)];

-  step[4] = -output[stride*4] + output[stride*(8 - 5)];

-  step[5] = -output[stride*5] + output[stride*(8 - 6)];

-  step[6] = -output[stride*6] + output[stride*(8 - 7)];

-  step[7] = -output[stride*7] + output[stride*(8 - 8)];

-  step[8] = output[stride*8];

-  step[9] = output[stride*9];

-  step[10] = (-output[stride*10] + output[stride*13])*C16;

-  step[11] = (-output[stride*11] + output[stride*12])*C16;

-  step[12] = (output[stride*12] + output[stride*11])*C16;

-  step[13] = (output[stride*13] + output[stride*10])*C16;

-  step[14] = output[stride*14];

-  step[15] = output[stride*15];

+  step[0] = output[0] + output[(8 - 1)];

+  step[1] = output[1] + output[(8 - 2)];

+  step[2] = output[2] + output[(8 - 3)];

+  step[3] = output[3] + output[(8 - 4)];

+  step[4] = -output[4] + output[(8 - 5)];

+  step[5] = -output[5] + output[(8 - 6)];

+  step[6] = -output[6] + output[(8 - 7)];

+  step[7] = -output[7] + output[(8 - 8)];

+  step[8] = output[8];

+  step[9] = output[9];

+  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);

+  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);

+  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);

+  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);

+  step[14] = output[14];

+  step[15] = output[15];

-  step[16] = output[stride*16] + output[stride*23];

-  step[17] = output[stride*17] + output[stride*22];

-  step[18] = output[stride*18] + output[stride*21];

-  step[19] = output[stride*19] + output[stride*20];

-  step[20] = -output[stride*20] + output[stride*19];

-  step[21] = -output[stride*21] + output[stride*18];

-  step[22] = -output[stride*22] + output[stride*17];

-  step[23] = -output[stride*23] + output[stride*16];

-  step[24] = -output[stride*24] + output[stride*31];

-  step[25] = -output[stride*25] + output[stride*30];

-  step[26] = -output[stride*26] + output[stride*29];

-  step[27] = -output[stride*27] + output[stride*28];

-  step[28] = output[stride*28] + output[stride*27];

-  step[29] = output[stride*29] + output[stride*26];

-  step[30] = output[stride*30] + output[stride*25];

-  step[31] = output[stride*31] + output[stride*24];

+  step[16] = output[16] + output[23];

+  step[17] = output[17] + output[22];

+  step[18] = output[18] + output[21];

+  step[19] = output[19] + output[20];

+  step[20] = -output[20] + output[19];

+  step[21] = -output[21] + output[18];

+  step[22] = -output[22] + output[17];

+  step[23] = -output[23] + output[16];

+  step[24] = -output[24] + output[31];

+  step[25] = -output[25] + output[30];

+  step[26] = -output[26] + output[29];

+  step[27] = -output[27] + output[28];

+  step[28] = output[28] + output[27];

+  step[29] = output[29] + output[26];

+  step[30] = output[30] + output[25];

+  step[31] = output[31] + output[24];

   // Stage 4

-  output[stride*0] = step[0] + step[3];

-  output[stride*1] = step[1] + step[2];

-  output[stride*2] = -step[2] + step[1];

-  output[stride*3] = -step[3] + step[0];

-  output[stride*4] = step[4];

-  output[stride*5] = (-step[5] + step[6])*C16;

-  output[stride*6] = (step[6] + step[5])*C16;

-  output[stride*7] = step[7];

-  output[stride*8] = step[8] + step[11];

-  output[stride*9] = step[9] + step[10];

-  output[stride*10] = -step[10] + step[9];

-  output[stride*11] = -step[11] + step[8];

-  output[stride*12] = -step[12] + step[15];

-  output[stride*13] = -step[13] + step[14];

-  output[stride*14] = step[14] + step[13];

-  output[stride*15] = step[15] + step[12];

+  output[0] = step[0] + step[3];

+  output[1] = step[1] + step[2];

+  output[2] = -step[2] + step[1];

+  output[3] = -step[3] + step[0];

+  output[4] = step[4];

+  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);

+  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);

+  output[7] = step[7];

+  output[8] = step[8] + step[11];

+  output[9] = step[9] + step[10];

+  output[10] = -step[10] + step[9];

+  output[11] = -step[11] + step[8];

+  output[12] = -step[12] + step[15];

+  output[13] = -step[13] + step[14];

+  output[14] = step[14] + step[13];

+  output[15] = step[15] + step[12];

-  output[stride*16] = step[16];

-  output[stride*17] = step[17];

-  output[stride*18] = step[18]*-C8 + step[29]*C24;

-  output[stride*19] = step[19]*-C8 + step[28]*C24;

-  output[stride*20] = step[20]*-C24 + step[27]*-C8;

-  output[stride*21] = step[21]*-C24 + step[26]*-C8;

-  output[stride*22] = step[22];

-  output[stride*23] = step[23];

-  output[stride*24] = step[24];

-  output[stride*25] = step[25];

-  output[stride*26] = step[26]*C24 + step[21]*-C8;

-  output[stride*27] = step[27]*C24 + step[20]*-C8;

-  output[stride*28] = step[28]*C8 + step[19]*C24;

-  output[stride*29] = step[29]*C8 + step[18]*C24;

-  output[stride*30] = step[30];

-  output[stride*31] = step[31];

+  output[16] = step[16];

+  output[17] = step[17];

+  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);

+  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);

+  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);

+  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);

+  output[22] = step[22];

+  output[23] = step[23];

+  output[24] = step[24];

+  output[25] = step[25];

+  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);

+  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);

+  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);

+  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);

+  output[30] = step[30];

+  output[31] = step[31];

   // Stage 5

-  step[0] = (output[stride*0] + output[stride*1]) * C16;

-  step[1] = (-output[stride*1] + output[stride*0]) * C16;

-  step[2] = output[stride*2]*C24 + output[stride*3] * C8;

-  step[3] = output[stride*3]*C24 - output[stride*2] * C8;

-  step[4] = output[stride*4] + output[stride*5];

-  step[5] = -output[stride*5] + output[stride*4];

-  step[6] = -output[stride*6] + output[stride*7];

-  step[7] = output[stride*7] + output[stride*6];

-  step[8] = output[stride*8];

-  step[9] = output[stride*9]*-C8 + output[stride*14]*C24;

-  step[10] = output[stride*10]*-C24 + output[stride*13]*-C8;

-  step[11] = output[stride*11];

-  step[12] = output[stride*12];

-  step[13] = output[stride*13]*C24 + output[stride*10]*-C8;

-  step[14] = output[stride*14]*C8 + output[stride*9]*C24;

-  step[15] = output[stride*15];

+  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);

+  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);

+  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);

+  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);

+  step[4] = output[4] + output[5];

+  step[5] = -output[5] + output[4];

+  step[6] = -output[6] + output[7];

+  step[7] = output[7] + output[6];

+  step[8] = output[8];

+  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);

+  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);

+  step[11] = output[11];

+  step[12] = output[12];

+  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);

+  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);

+  step[15] = output[15];

-  step[16] = output[stride*16] + output[stride*19];

-  step[17] = output[stride*17] + output[stride*18];

-  step[18] = -output[stride*18] + output[stride*17];

-  step[19] = -output[stride*19] + output[stride*16];

-  step[20] = -output[stride*20] + output[stride*23];

-  step[21] = -output[stride*21] + output[stride*22];

-  step[22] = output[stride*22] + output[stride*21];

-  step[23] = output[stride*23] + output[stride*20];

-  step[24] = output[stride*24] + output[stride*27];

-  step[25] = output[stride*25] + output[stride*26];

-  step[26] = -output[stride*26] + output[stride*25];

-  step[27] = -output[stride*27] + output[stride*24];

-  step[28] = -output[stride*28] + output[stride*31];

-  step[29] = -output[stride*29] + output[stride*30];

-  step[30] = output[stride*30] + output[stride*29];

-  step[31] = output[stride*31] + output[stride*28];

+  step[16] = output[16] + output[19];

+  step[17] = output[17] + output[18];

+  step[18] = -output[18] + output[17];

+  step[19] = -output[19] + output[16];

+  step[20] = -output[20] + output[23];

+  step[21] = -output[21] + output[22];

+  step[22] = output[22] + output[21];

+  step[23] = output[23] + output[20];

+  step[24] = output[24] + output[27];

+  step[25] = output[25] + output[26];

+  step[26] = -output[26] + output[25];

+  step[27] = -output[27] + output[24];

+  step[28] = -output[28] + output[31];

+  step[29] = -output[29] + output[30];

+  step[30] = output[30] + output[29];

+  step[31] = output[31] + output[28];

   // Stage 6

-  output[stride*0] = step[0];

-  output[stride*1] = step[1];

-  output[stride*2] = step[2];

-  output[stride*3] = step[3];

-  output[stride*4] = step[4]*C28 + step[7]*C4;

-  output[stride*5] = step[5]*C12 + step[6]*C20;

-  output[stride*6] = step[6]*C12 + step[5]*-C20;

-  output[stride*7] = step[7]*C28 + step[4]*-C4;

-  output[stride*8] = step[8] + step[9];

-  output[stride*9] = -step[9] + step[8];

-  output[stride*10] = -step[10] + step[11];

-  output[stride*11] = step[11] + step[10];

-  output[stride*12] = step[12] + step[13];

-  output[stride*13] = -step[13] + step[12];

-  output[stride*14] = -step[14] + step[15];

-  output[stride*15] = step[15] + step[14];

+  output[0] = step[0];

+  output[1] = step[1];

+  output[2] = step[2];

+  output[3] = step[3];

+  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);

+  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);

+  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);

+  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);

+  output[8] = step[8] + step[9];

+  output[9] = -step[9] + step[8];

+  output[10] = -step[10] + step[11];

+  output[11] = step[11] + step[10];

+  output[12] = step[12] + step[13];

+  output[13] = -step[13] + step[12];

+  output[14] = -step[14] + step[15];

+  output[15] = step[15] + step[14];

-  output[stride*16] = step[16];

-  output[stride*17] = step[17]*-C4 + step[30]*C28;

-  output[stride*18] = step[18]*-C28 + step[29]*-C4;

-  output[stride*19] = step[19];

-  output[stride*20] = step[20];

-  output[stride*21] = step[21]*-C20 + step[26]*C12;

-  output[stride*22] = step[22]*-C12 + step[25]*-C20;

-  output[stride*23] = step[23];

-  output[stride*24] = step[24];

-  output[stride*25] = step[25]*C12 + step[22]*-C20;

-  output[stride*26] = step[26]*C20 + step[21]*C12;

-  output[stride*27] = step[27];

-  output[stride*28] = step[28];

-  output[stride*29] = step[29]*C28 + step[18]*-C4;

-  output[stride*30] = step[30]*C4 + step[17]*C28;

-  output[stride*31] = step[31];

+  output[16] = step[16];

+  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);

+  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);

+  output[19] = step[19];

+  output[20] = step[20];

+  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);

+  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);

+  output[23] = step[23];

+  output[24] = step[24];

+  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);

+  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);

+  output[27] = step[27];

+  output[28] = step[28];

+  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);

+  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);

+  output[31] = step[31];

   // Stage 7

-  step[0] = output[stride*0];

-  step[1] = output[stride*1];

-  step[2] = output[stride*2];

-  step[3] = output[stride*3];

-  step[4] = output[stride*4];

-  step[5] = output[stride*5];

-  step[6] = output[stride*6];

-  step[7] = output[stride*7];

-  step[8] = output[stride*8]*C30 + output[stride*15]*C2;

-  step[9] = output[stride*9]*C14 + output[stride*14]*C18;

-  step[10] = output[stride*10]*C22 + output[stride*13]*C10;

-  step[11] = output[stride*11]*C6 + output[stride*12]*C26;

-  step[12] = output[stride*12]*C6 + output[stride*11]*-C26;

-  step[13] = output[stride*13]*C22 + output[stride*10]*-C10;

-  step[14] = output[stride*14]*C14 + output[stride*9]*-C18;

-  step[15] = output[stride*15]*C30 + output[stride*8]*-C2;

+  step[0] = output[0];

+  step[1] = output[1];

+  step[2] = output[2];

+  step[3] = output[3];

+  step[4] = output[4];

+  step[5] = output[5];

+  step[6] = output[6];

+  step[7] = output[7];

+  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);

+  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);

+  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);

+  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);

+  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);

+  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);

+  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);

+  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);

-  step[16] = output[stride*16] + output[stride*17];

-  step[17] = -output[stride*17] + output[stride*16];

-  step[18] = -output[stride*18] + output[stride*19];

-  step[19] = output[stride*19] + output[stride*18];

-  step[20] = output[stride*20] + output[stride*21];

-  step[21] = -output[stride*21] + output[stride*20];

-  step[22] = -output[stride*22] + output[stride*23];

-  step[23] = output[stride*23] + output[stride*22];

-  step[24] = output[stride*24] + output[stride*25];

-  step[25] = -output[stride*25] + output[stride*24];

-  step[26] = -output[stride*26] + output[stride*27];

-  step[27] = output[stride*27] + output[stride*26];

-  step[28] = output[stride*28] + output[stride*29];

-  step[29] = -output[stride*29] + output[stride*28];

-  step[30] = -output[stride*30] + output[stride*31];

-  step[31] = output[stride*31] + output[stride*30];

+  step[16] = output[16] + output[17];

+  step[17] = -output[17] + output[16];

+  step[18] = -output[18] + output[19];

+  step[19] = output[19] + output[18];

+  step[20] = output[20] + output[21];

+  step[21] = -output[21] + output[20];

+  step[22] = -output[22] + output[23];

+  step[23] = output[23] + output[22];

+  step[24] = output[24] + output[25];

+  step[25] = -output[25] + output[24];

+  step[26] = -output[26] + output[27];

+  step[27] = output[27] + output[26];

+  step[28] = output[28] + output[29];

+  step[29] = -output[29] + output[28];

+  step[30] = -output[30] + output[31];

+  step[31] = output[31] + output[30];

   // Final stage --- outputs indices are bit-reversed.

-  output[stride*0] = step[0];

-  output[stride*16] = step[1];

-  output[stride*8] = step[2];

-  output[stride*24] = step[3];

-  output[stride*4] = step[4];

-  output[stride*20] = step[5];

-  output[stride*12] = step[6];

-  output[stride*28] = step[7];

-  output[stride*2] = step[8];

-  output[stride*18] = step[9];

-  output[stride*10] = step[10];

-  output[stride*26] = step[11];

-  output[stride*6] = step[12];

-  output[stride*22] = step[13];

-  output[stride*14] = step[14];

-  output[stride*30] = step[15];

+  output[0]  = step[0];

+  output[16] = step[1];

+  output[8]  = step[2];

+  output[24] = step[3];

+  output[4]  = step[4];

+  output[20] = step[5];

+  output[12] = step[6];

+  output[28] = step[7];

+  output[2]  = step[8];

+  output[18] = step[9];

+  output[10] = step[10];

+  output[26] = step[11];

+  output[6]  = step[12];

+  output[22] = step[13];

+  output[14] = step[14];

+  output[30] = step[15];

-  output[stride*1] = step[16]*C31 + step[31]*C1;

-  output[stride*17] = step[17]*C15 + step[30]*C17;

-  output[stride*9] = step[18]*C23 + step[29]*C9;

-  output[stride*25] = step[19]*C7 + step[28]*C25;

-  output[stride*5] = step[20]*C27 + step[27]*C5;

-  output[stride*21] = step[21]*C11 + step[26]*C21;

-  output[stride*13] = step[22]*C19 + step[25]*C13;

-  output[stride*29] = step[23]*C3 + step[24]*C29;

-  output[stride*3] = step[24]*C3 + step[23]*-C29;

-  output[stride*19] = step[25]*C19 + step[22]*-C13;

-  output[stride*11] = step[26]*C11 + step[21]*-C21;

-  output[stride*27] = step[27]*C27 + step[20]*-C5;

-  output[stride*7] = step[28]*C7 + step[19]*-C25;

-  output[stride*23] = step[29]*C23 + step[18]*-C9;

-  output[stride*15] = step[30]*C15 + step[17]*-C17;

-  output[stride*31] = step[31]*C31 + step[16]*-C1;

+  output[1]  = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);

+  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);

+  output[9]  = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);

+  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);

+  output[5]  = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);

+  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);

+  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);

+  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);

+  output[3]  = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);

+  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);

+  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);

+  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);

+  output[7]  = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);

+  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);

+  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);

+  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);

 void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    int shortpitch = pitch >> 1;

-    int i, j;

-    double output[1024];

-    // First transform columns

-    for (i = 0; i < 32; i++) {

-      double temp_in[32], temp_out[32];

-      for (j = 0; j < 32; j++)

-        temp_in[j] = input[j*shortpitch + i];

-      dct32_1d(temp_in, temp_out, 1);

-      for (j = 0; j < 32; j++)

-        output[j*32 + i] = temp_out[j];

-    }

-    // Then transform rows

-    for (i = 0; i < 32; ++i) {

-      double temp_in[32], temp_out[32];

-      for (j = 0; j < 32; ++j)

-        temp_in[j] = output[j + i*32];

-      dct32_1d(temp_in, temp_out, 1);

-      for (j = 0; j < 32; ++j)

-        output[j + i*32] = temp_out[j];

-    }

-    // Scale by some magic number

-    for (i = 0; i < 1024; i++) {

-      out[i] = (short)round(output[i]/4);

-    }

-  }

+  int shortpitch = pitch >> 1;

+  int i, j;

+  int output[32 * 32];

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-}

-#else  // CONFIG_DWTDCTHYBRID

-#if DWT_TYPE == 53

-// Note: block length must be even for this implementation

-static void analysis_53_row(int length, short *x,

-                            short *lowpass, short *highpass) {

-  int n;

-  short r, *a, *b;

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  while (--n) {

-    *a++ = (r = *x++) << 1;

-    *b++ = *x - ((r + x[1] + 1) >> 1);

-    x++;

-  }

-  *a = (r = *x++) << 1;

-  *b = *x - r;

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  r = *highpass;

-  while (n--) {

-    *a++ += (r + (*b) + 1) >> 1;

-    r = *b++;

-  }

-}

-static void analysis_53_col(int length, short *x,

-                            short *lowpass, short *highpass) {

-  int n;

-  short r, *a, *b;

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  while (--n) {

-    *a++ = (r = *x++);

-    *b++ = (((*x) << 1) - (r + x[1]) + 2) >> 2;

-    x++;

-  }

-  *a = (r = *x++);

-  *b = (*x - r + 1) >> 1;

-  n = length >> 1;

-  b = highpass;

-  a = lowpass;

-  r = *highpass;

-  while (n--) {

-    *a++ += (r + (*b) + 1) >> 1;

-    r = *b++;

-  }

-}

-static void dyadic_analyze_53(int levels, int width, int height,

-                              short *x, int pitch_x, short *c, int pitch_c) {

-  int lv, i, j, nh, nw, hh = height, hw = width;

-  short buffer[2 * DWT_MAX_LENGTH];

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;

-    }

-  }

-  for (lv = 0; lv < levels; lv++) {

-    nh = hh;

-    hh = (hh + 1) >> 1;

-    nw = hw;

-    hw = (hw + 1) >> 1;

-    if ((nh < 2) || (nw < 2)) return;

-    for (i = 0; i < nh; i++) {

-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));

-      analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);

-    }

-    for (j = 0; j < nw; j++) {

-      for (i = 0; i < nh; i++)

-        buffer[i + nh] = c[i * pitch_c + j];

-      analysis_53_col(nh, buffer + nh, buffer, buffer + hh);

-      for (i = 0; i < nh; i++)

-        c[i * pitch_c + j] = buffer[i];

-    }

-  }

-}

-#elif DWT_TYPE == 26

-static void analysis_26_row(int length, short *x,

-                            short *lowpass, short *highpass) {

-  int i, n;

-  short r, s, *a, *b;

-  a = lowpass;

-  b = highpass;

-  for (i = length >> 1; i; i--) {

-    r = *x++;

-    s = *x++;

-    *a++ = r + s;

-    *b++ = r - s;

-  }

-  n = length >> 1;

-  if (n >= 4) {

-    a = lowpass;

-    b = highpass;

-    r = *lowpass;

-    while (--n) {

-      *b++ -= (r - a[1] + 4) >> 3;

-      r = *a++;

-    }

-    *b -= (r - *a + 4) >> 3;

-  }

-}

-static void analysis_26_col(int length, short *x,

-                            short *lowpass, short *highpass) {

-  int i, n;

-  short r, s, *a, *b;

-  a = lowpass;

-  b = highpass;

-  for (i = length >> 1; i; i--) {

-    r = *x++;

-    s = *x++;

-    *a++ = (r + s + 1) >> 1;

-    *b++ = (r - s + 1) >> 1;

-  }

-  n = length >> 1;

-  if (n >= 4) {

-    a = lowpass;

-    b = highpass;

-    r = *lowpass;

-    while (--n) {

-      *b++ -= (r - a[1] + 4) >> 3;

-      r = *a++;

-    }

-    *b -= (r - *a + 4) >> 3;

-  }

-}

-static void dyadic_analyze_26(int levels, int width, int height,

-                              short *x, int pitch_x, short *c, int pitch_c) {

-  int lv, i, j, nh, nw, hh = height, hw = width;

-  short buffer[2 * DWT_MAX_LENGTH];

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      c[i * pitch_c + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;

-    }

-  }

-  for (lv = 0; lv < levels; lv++) {

-    nh = hh;

-    hh = (hh + 1) >> 1;

-    nw = hw;

-    hw = (hw + 1) >> 1;

-    if ((nh < 2) || (nw < 2)) return;

-    for (i = 0; i < nh; i++) {

-      memcpy(buffer, &c[i * pitch_c], nw * sizeof(short));

-      analysis_26_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);

-    }

-    for (j = 0; j < nw; j++) {

-      for (i = 0; i < nh; i++)

-        buffer[i + nh] = c[i * pitch_c + j];

-      analysis_26_col(nh, buffer + nh, buffer, buffer + hh);

-      for (i = 0; i < nh; i++)

-        c[i * pitch_c + j] = buffer[i];

-    }

-  }

-}

-#elif DWT_TYPE == 97

-static void analysis_97(int length, double *x,

-                        double *lowpass, double *highpass) {

-  static const double a_predict1 = -1.586134342;

-  static const double a_update1 = -0.05298011854;

-  static const double a_predict2 = 0.8829110762;

-  static const double a_update2 = 0.4435068522;

-  static const double s_low = 1.149604398;

-  static const double s_high = 1/1.149604398;

-  int i;

-  double y[DWT_MAX_LENGTH];

-  // Predict 1

-  for (i = 1; i < length - 2; i += 2) {

-    x[i] += a_predict1 * (x[i - 1] + x[i + 1]);

-  }

-  x[length - 1] += 2 * a_predict1 * x[length - 2];

-  // Update 1

-  for (i = 2; i < length; i += 2) {

-    x[i] += a_update1 * (x[i - 1] + x[i + 1]);

-  }

-  x[0] += 2 * a_update1 * x[1];

-  // Predict 2

-  for (i = 1; i < length - 2; i += 2) {

-    x[i] += a_predict2 * (x[i - 1] + x[i + 1]);

-  }

-  x[length - 1] += 2 * a_predict2 * x[length - 2];

-  // Update 2

-  for (i = 2; i < length; i += 2) {

-    x[i] += a_update2 * (x[i - 1] + x[i + 1]);

-  }

-  x[0] += 2 * a_update2 * x[1];

-  memcpy(y, x, sizeof(*y) * length);

-  // Scale and pack

-  for (i = 0; i < length / 2; i++) {

-    lowpass[i] = y[2 * i] * s_low;

-    highpass[i] = y[2 * i + 1] * s_high;

-  }

-}

-static void dyadic_analyze_97(int levels, int width, int height,

-                             short *x, int pitch_x, short *c, int pitch_c) {

-  int lv, i, j, nh, nw, hh = height, hw = width;

-  double buffer[2 * DWT_MAX_LENGTH];

-  double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH];

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      y[i * DWT_MAX_LENGTH + j] = x[i * pitch_x + j] << DWT_PRECISION_BITS;

-    }

-  }

-  for (lv = 0; lv < levels; lv++) {

-    nh = hh;

-    hh = (hh + 1) >> 1;

-    nw = hw;

-    hw = (hw + 1) >> 1;

-    if ((nh < 2) || (nw < 2)) return;

-    for (i = 0; i < nh; i++) {

-      memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer));

-      analysis_97(nw, buffer, &y[i * DWT_MAX_LENGTH],

-                  &y[i * DWT_MAX_LENGTH] + hw);

-    }

-    for (j = 0; j < nw; j++) {

-      for (i = 0; i < nh; i++)

-        buffer[i + nh] = y[i * DWT_MAX_LENGTH + j];

-      analysis_97(nh, buffer + nh, buffer, buffer + hh);

-      for (i = 0; i < nh; i++)

-        c[i * pitch_c + j] = round(buffer[i]);

-    }

-  }

-}

-#endif  // DWT_TYPE

-// TODO(debargha): Implement the scaling differently so as not to have to

-// use the floating point dct

-static void dct16x16_1d_f(double input[16], double output[16]) {

-  static const double C1 = 0.995184726672197;

-  static const double C2 = 0.98078528040323;

-  static const double C3 = 0.956940335732209;

-  static const double C4 = 0.923879532511287;

-  static const double C5 = 0.881921264348355;

-  static const double C6 = 0.831469612302545;

-  static const double C7 = 0.773010453362737;

-  static const double C8 = 0.707106781186548;

-  static const double C9 = 0.634393284163646;

-  static const double C10 = 0.555570233019602;

-  static const double C11 = 0.471396736825998;

-  static const double C12 = 0.38268343236509;

-  static const double C13 = 0.290284677254462;

-  static const double C14 = 0.195090322016128;

-  static const double C15 = 0.098017140329561;

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    double step[16];

-    double intermediate[16];

-    double temp1, temp2;

-    // step 1

-    step[ 0] = input[0] + input[15];

-    step[ 1] = input[1] + input[14];

-    step[ 2] = input[2] + input[13];

-    step[ 3] = input[3] + input[12];

-    step[ 4] = input[4] + input[11];

-    step[ 5] = input[5] + input[10];

-    step[ 6] = input[6] + input[ 9];

-    step[ 7] = input[7] + input[ 8];

-    step[ 8] = input[7] - input[ 8];

-    step[ 9] = input[6] - input[ 9];

-    step[10] = input[5] - input[10];

-    step[11] = input[4] - input[11];

-    step[12] = input[3] - input[12];

-    step[13] = input[2] - input[13];

-    step[14] = input[1] - input[14];

-    step[15] = input[0] - input[15];

-    // step 2

-    output[0] = step[0] + step[7];

-    output[1] = step[1] + step[6];

-    output[2] = step[2] + step[5];

-    output[3] = step[3] + step[4];

-    output[4] = step[3] - step[4];

-    output[5] = step[2] - step[5];

-    output[6] = step[1] - step[6];

-    output[7] = step[0] - step[7];

-    temp1 = step[ 8]*C7;

-    temp2 = step[15]*C9;

-    output[ 8] = temp1 + temp2;

-    temp1 = step[ 9]*C11;

-    temp2 = step[14]*C5;

-    output[ 9] = temp1 - temp2;

-    temp1 = step[10]*C3;

-    temp2 = step[13]*C13;

-    output[10] = temp1 + temp2;

-    temp1 = step[11]*C15;

-    temp2 = step[12]*C1;

-    output[11] = temp1 - temp2;

-    temp1 = step[11]*C1;

-    temp2 = step[12]*C15;

-    output[12] = temp2 + temp1;

-    temp1 = step[10]*C13;

-    temp2 = step[13]*C3;

-    output[13] = temp2 - temp1;

-    temp1 = step[ 9]*C5;

-    temp2 = step[14]*C11;

-    output[14] = temp2 + temp1;

-    temp1 = step[ 8]*C9;

-    temp2 = step[15]*C7;

-    output[15] = temp2 - temp1;

-    // step 3

-    step[ 0] = output[0] + output[3];

-    step[ 1] = output[1] + output[2];

-    step[ 2] = output[1] - output[2];

-    step[ 3] = output[0] - output[3];

-    temp1 = output[4]*C14;

-    temp2 = output[7]*C2;

-    step[ 4] = temp1 + temp2;

-    temp1 = output[5]*C10;

-    temp2 = output[6]*C6;

-    step[ 5] = temp1 + temp2;

-    temp1 = output[5]*C6;

-    temp2 = output[6]*C10;

-    step[ 6] = temp2 - temp1;

-    temp1 = output[4]*C2;

-    temp2 = output[7]*C14;

-    step[ 7] = temp2 - temp1;

-    step[ 8] = output[ 8] + output[11];

-    step[ 9] = output[ 9] + output[10];

-    step[10] = output[ 9] - output[10];

-    step[11] = output[ 8] - output[11];

-    step[12] = output[12] + output[15];

-    step[13] = output[13] + output[14];

-    step[14] = output[13] - output[14];

-    step[15] = output[12] - output[15];

-    // step 4

-    output[ 0] = (step[ 0] + step[ 1]);

-    output[ 8] = (step[ 0] - step[ 1]);

-    temp1 = step[2]*C12;

-    temp2 = step[3]*C4;

-    temp1 = temp1 + temp2;

-    output[ 4] = 2*(temp1*C8);

-    temp1 = step[2]*C4;

-    temp2 = step[3]*C12;

-    temp1 = temp2 - temp1;

-    output[12] = 2*(temp1*C8);

-    output[ 2] = 2*((step[4] + step[ 5])*C8);

-    output[14] = 2*((step[7] - step[ 6])*C8);

-    temp1 = step[4] - step[5];

-    temp2 = step[6] + step[7];

-    output[ 6] = (temp1 + temp2);

-    output[10] = (temp1 - temp2);

-    intermediate[8] = step[8] + step[14];

-    intermediate[9] = step[9] + step[15];

-    temp1 = intermediate[8]*C12;

-    temp2 = intermediate[9]*C4;

-    temp1 = temp1 - temp2;

-    output[3] = 2*(temp1*C8);

-    temp1 = intermediate[8]*C4;

-    temp2 = intermediate[9]*C12;

-    temp1 = temp2 + temp1;

-    output[13] = 2*(temp1*C8);

-    output[ 9] = 2*((step[10] + step[11])*C8);

-    intermediate[11] = step[10] - step[11];

-    intermediate[12] = step[12] + step[13];

-    intermediate[13] = step[12] - step[13];

-    intermediate[14] = step[ 8] - step[14];

-    intermediate[15] = step[ 9] - step[15];

-    output[15] = (intermediate[11] + intermediate[12]);

-    output[ 1] = -(intermediate[11] - intermediate[12]);

-    output[ 7] = 2*(intermediate[13]*C8);

-    temp1 = intermediate[14]*C12;

-    temp2 = intermediate[15]*C4;

-    temp1 = temp1 - temp2;

-    output[11] = -2*(temp1*C8);

-    temp1 = intermediate[14]*C4;

-    temp2 = intermediate[15]*C12;

-    temp1 = temp2 + temp1;

-    output[ 5] = 2*(temp1*C8);

-  }

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-}

-static void vp9_short_fdct16x16_c_f(short *input, short *out, int pitch,

-                                    int scale) {

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-  {

-    int shortpitch = pitch >> 1;

-    int i, j;

-    double output[256];

-    // First transform columns

-    for (i = 0; i < 16; i++) {

-        double temp_in[16], temp_out[16];

-        for (j = 0; j < 16; j++)

-            temp_in[j] = input[j*shortpitch + i];

-        dct16x16_1d_f(temp_in, temp_out);

-        for (j = 0; j < 16; j++)

-            output[j*16 + i] = temp_out[j];

-    }

-    // Then transform rows

-    for (i = 0; i < 16; ++i) {

-        double temp_in[16], temp_out[16];

-        for (j = 0; j < 16; ++j)

-            temp_in[j] = output[j + i*16];

-        dct16x16_1d_f(temp_in, temp_out);

-        for (j = 0; j < 16; ++j)

-            output[j + i*16] = temp_out[j];

-    }

-    // Scale by some magic number

-    for (i = 0; i < 256; i++)

-        out[i] = (short)round(output[i] / (2 << scale));

+  // Columns

+  for (i = 0; i < 32; i++) {

+    int temp_in[32], temp_out[32];

+    for (j = 0; j < 32; j++)

+      temp_in[j] = input[j * shortpitch + i] << 2;

+    dct32_1d(temp_in, temp_out);

+    for (j = 0; j < 32; j++)

+      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

-  vp9_clear_system_state();  // Make it simd safe : __asm emms;

-}

-void vp9_short_fdct8x8_c_f(short *block, short *coefs, int pitch, int scale) {

-  int j1, i, j, k;

-  float b[8];

-  float b1[8];

-  float d[8][8];

-  float f0 = (float) .7071068;

-  float f1 = (float) .4903926;

-  float f2 = (float) .4619398;

-  float f3 = (float) .4157348;

-  float f4 = (float) .3535534;

-  float f5 = (float) .2777851;

-  float f6 = (float) .1913417;

-  float f7 = (float) .0975452;

-  pitch = pitch / 2;

-  for (i = 0, k = 0; i < 8; i++, k += pitch) {

-    for (j = 0; j < 8; j++) {

-      b[j] = (float)(block[k + j] << (3 - scale));

-    }

-    /* Horizontal transform */

-    for (j = 0; j < 4; j++) {

-      j1 = 7 - j;

-      b1[j] = b[j] + b[j1];

-      b1[j1] = b[j] - b[j1];

-    }

-    b[0] = b1[0] + b1[3];

-    b[1] = b1[1] + b1[2];

-    b[2] = b1[1] - b1[2];

-    b[3] = b1[0] - b1[3];

-    b[4] = b1[4];

-    b[5] = (b1[6] - b1[5]) * f0;

-    b[6] = (b1[6] + b1[5]) * f0;

-    b[7] = b1[7];

-    d[i][0] = (b[0] + b[1]) * f4;

-    d[i][4] = (b[0] - b[1]) * f4;

-    d[i][2] = b[2] * f6 + b[3] * f2;

-    d[i][6] = b[3] * f6 - b[2] * f2;

-    b1[4] = b[4] + b[5];

-    b1[7] = b[7] + b[6];

-    b1[5] = b[4] - b[5];

-    b1[6] = b[7] - b[6];

-    d[i][1] = b1[4] * f7 + b1[7] * f1;

-    d[i][5] = b1[5] * f3 + b1[6] * f5;

-    d[i][7] = b1[7] * f7 - b1[4] * f1;

-    d[i][3] = b1[6] * f3 - b1[5] * f5;

-  }

-  /* Vertical transform */

-  for (i = 0; i < 8; i++) {

-    for (j = 0; j < 4; j++) {

-      j1 = 7 - j;

-      b1[j] = d[j][i] + d[j1][i];

-      b1[j1] = d[j][i] - d[j1][i];

-    }

-    b[0] = b1[0] + b1[3];

-    b[1] = b1[1] + b1[2];

-    b[2] = b1[1] - b1[2];

-    b[3] = b1[0] - b1[3];

-    b[4] = b1[4];

-    b[5] = (b1[6] - b1[5]) * f0;

-    b[6] = (b1[6] + b1[5]) * f0;

-    b[7] = b1[7];

-    d[0][i] = (b[0] + b[1]) * f4;

-    d[4][i] = (b[0] - b[1]) * f4;

-    d[2][i] = b[2] * f6 + b[3] * f2;

-    d[6][i] = b[3] * f6 - b[2] * f2;

-    b1[4] = b[4] + b[5];

-    b1[7] = b[7] + b[6];

-    b1[5] = b[4] - b[5];

-    b1[6] = b[7] - b[6];

-    d[1][i] = b1[4] * f7 + b1[7] * f1;

-    d[5][i] = b1[5] * f3 + b1[6] * f5;

-    d[7][i] = b1[7] * f7 - b1[4] * f1;

-    d[3][i] = b1[6] * f3 - b1[5] * f5;

-  }

-  for (i = 0; i < 8; i++) {

-    for (j = 0; j < 8; j++) {

-      *(coefs + j + i * 8) = (short) floor(d[i][j] + 0.5);

-    }

-  }

-  return;

-}

-#define divide_bits(d, n) ((n) < 0 ? (d) << (n) : (d) >> (n))

-#if DWTDCT_TYPE == DWTDCT16X16_LEAN

-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {

-  // assume out is a 32x32 buffer

-  short buffer[16 * 16];

-  int i, j;

-  const int short_pitch = pitch >> 1;

-#if DWT_TYPE == 26

-  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);

-#elif DWT_TYPE == 97

-  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);

-#elif DWT_TYPE == 53

-  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);

-#endif

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the dct16x16 function

-  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);

-  for (i = 0; i < 16; ++i) {

-    for (j = 16; j < 32; ++j) {

-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-  for (i = 16; i < 32; ++i) {

-    for (j = 0; j < 32; ++j) {

-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-}

-#elif DWTDCT_TYPE == DWTDCT16X16

-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {

-  // assume out is a 32x32 buffer

-  short buffer[16 * 16];

-  int i, j;

-  const int short_pitch = pitch >> 1;

-#if DWT_TYPE == 26

-  dyadic_analyze_26(1, 32, 32, input, short_pitch, out, 32);

-#elif DWT_TYPE == 97

-  dyadic_analyze_97(1, 32, 32, input, short_pitch, out, 32);

-#elif DWT_TYPE == 53

-  dyadic_analyze_53(1, 32, 32, input, short_pitch, out, 32);

-#endif

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the dct16x16 function

-  vp9_short_fdct16x16_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 32, buffer + i * 16, sizeof(short) * 16);

-  vp9_short_fdct16x16_c_f(out + 16, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 32 + 16, buffer + i * 16, sizeof(short) * 16);

-  vp9_short_fdct16x16_c_f(out + 32 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 32 + 32 * 16, buffer + i * 16, sizeof(short) * 16);

-  vp9_short_fdct16x16_c_f(out + 33 * 16, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 32 + 33 * 16, buffer + i * 16, sizeof(short) * 16);

-}

-#elif DWTDCT_TYPE == DWTDCT8X8

-void vp9_short_fdct32x32_c(short *input, short *out, int pitch) {

-  // assume out is a 32x32 buffer

-  short buffer[8 * 8];

-  int i, j;

-  const int short_pitch = pitch >> 1;

-#if DWT_TYPE == 26

-  dyadic_analyze_26(2, 32, 32, input, short_pitch, out, 32);

-#elif DWT_TYPE == 97

-  dyadic_analyze_97(2, 32, 32, input, short_pitch, out, 32);

-#elif DWT_TYPE == 53

-  dyadic_analyze_53(2, 32, 32, input, short_pitch, out, 32);

-#endif

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the dct16x16 function

-  vp9_short_fdct8x8_c_f(out, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i)

-    vpx_memcpy(out + i * 32, buffer + i * 8, sizeof(short) * 8);

-  vp9_short_fdct8x8_c_f(out + 8, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i)

-    vpx_memcpy(out + i * 32 + 8, buffer + i * 8, sizeof(short) * 8);

-  vp9_short_fdct8x8_c_f(out + 32 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i)

-    vpx_memcpy(out + i * 32 + 32 * 8, buffer + i * 8, sizeof(short) * 8);

-  vp9_short_fdct8x8_c_f(out + 33 * 8, buffer, 64, 1 + DWT_PRECISION_BITS);

-  for (i = 0; i < 8; ++i)

-    vpx_memcpy(out + i * 32 + 33 * 8, buffer + i * 8, sizeof(short) * 8);

-  for (i = 0; i < 16; ++i) {

-    for (j = 16; j < 32; ++j) {

-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-  for (i = 16; i < 32; ++i) {

-    for (j = 0; j < 32; ++j) {

-      out[i * 32 + j] = divide_bits(out[i * 32 + j], DWT_PRECISION_BITS - 2);

-    }

-  }

-}

-#endif

-#if CONFIG_TX64X64

-void vp9_short_fdct64x64_c(short *input, short *out, int pitch) {

-  // assume out is a 64x64 buffer

-  short buffer[16 * 16];

-  int i, j;

-  const int short_pitch = pitch >> 1;

-#if DWT_TYPE == 26

-  dyadic_analyze_26(2, 64, 64, input, short_pitch, out, 64);

-#elif DWT_TYPE == 97

-  dyadic_analyze_97(2, 64, 64, input, short_pitch, out, 64);

-#elif DWT_TYPE == 53

-  dyadic_analyze_53(2, 64, 64, input, short_pitch, out, 64);

-#endif

-  // TODO(debargha): Implement more efficiently by adding output pitch

-  // argument to the dct16x16 function

-  vp9_short_fdct16x16_c_f(out, buffer, 128, 2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 64, buffer + i * 16, sizeof(short) * 16);

-#if DWTDCT_TYPE == DWTDCT16X16_LEAN

-  for (i = 0; i < 16; ++i) {

-    for (j = 16; j < 48; ++j) {

-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

-    }

-  }

-  for (i = 16; i < 64; ++i) {

-    for (j = 0; j < 64; ++j) {

-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

-    }

-  }

-#elif DWTDCT_TYPE == DWTDCT16X16

-  vp9_short_fdct16x16_c_f(out + 16, buffer, 128, 2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 64 + 16, buffer + i * 16, sizeof(short) * 16);

-  vp9_short_fdct16x16_c_f(out + 64 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 64 + 64 * 16, buffer + i * 16, sizeof(short) * 16);

-  vp9_short_fdct16x16_c_f(out + 65 * 16, buffer, 128, 2 + DWT_PRECISION_BITS);

-  for (i = 0; i < 16; ++i)

-    vpx_memcpy(out + i * 64 + 65 * 16, buffer + i * 16, sizeof(short) * 16);

-  // There is no dct used on the highest bands for now.

-  // Need to scale these coeffs by a factor of 2/2^DWT_PRECISION_BITS

-  // TODO(debargha): experiment with turning these coeffs to 0

+  // Rows

   for (i = 0; i < 32; ++i) {

-    for (j = 32; j < 64; ++j) {

-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

-    }

+    int temp_in[32], temp_out[32];

+    for (j = 0; j < 32; ++j)

+      temp_in[j] = output[j + i * 32];

+    dct32_1d(temp_in, temp_out);

+    for (j = 0; j < 32; ++j)

+      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;

-  for (i = 32; i < 64; ++i) {

-    for (j = 0; j < 64; ++j) {

-      out[i * 64 + j] = divide_bits(out[i * 64 + j], DWT_PRECISION_BITS - 1);

-    }

-  }

-#endif  // DWTDCT_TYPE

-#endif  // CONFIG_TX64X64

-#endif  // CONFIG_DWTDCTHYBRID

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -21,7 +21,6 @@

 #include "vp9/common/vp9_quant_common.h"

 #include "vp9/encoder/vp9_segmentation.h"

 #include "vp9/common/vp9_setupintrarecon.h"

-#include "vp9/common/vp9_reconintra4x4.h"

 #include "vp9/encoder/vp9_encodeintra.h"

 #include "vp9/common/vp9_reconinter.h"

 #include "vp9/common/vp9_invtrans.h"

@@ -29,8 +28,9 @@

 #include "vp9/common/vp9_findnearmv.h"

 #include "vp9/common/vp9_reconintra.h"

 #include "vp9/common/vp9_seg_common.h"

+#include "vp9/common/vp9_tile_common.h"

 #include "vp9/encoder/vp9_tokenize.h"

-#include "vp9_rtcd.h"

+#include "./vp9_rtcd.h"

 #include <stdio.h>

 #include <math.h>

 #include <limits.h>

@@ -45,18 +45,15 @@

 int enc_debug = 0;

 #endif

-extern void select_interp_filter_type(VP9_COMP *cpi);

+void vp9_select_interp_filter_type(VP9_COMP *cpi);

 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,

-                              int recon_yoffset, int recon_uvoffset,

                               int output_enabled, int mb_row, int mb_col);

 static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,

-                                int recon_yoffset, int recon_uvoffset,

                                 int output_enabled, int mb_row, int mb_col);

 static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,

-                                int recon_yoffset, int recon_uvoffset,

                                 int output_enabled, int mb_row, int mb_col);

 static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);

@@ -103,7 +100,7 @@

*/

   act = vp9_variance16x16(x->src.y_buffer, x->src.y_stride, VP9_VAR_OFFS, 0,

                           &sse);

-  act = act << 4;

+  act <<= 4;

   /* If the region is flat, lower the activity some more. */

   if (act < 8 << 12)

@@ -201,7 +198,7 @@

 #define OUTPUT_NORM_ACT_STATS   0

 #if USE_ACT_INDEX

-// Calculate and activity index for each mb

+// Calculate an activity index for each mb

 static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {

   VP9_COMMON *const cm = &cpi->common;

   int mb_row, mb_col;

@@ -271,6 +268,8 @@

   unsigned int mb_activity;

   int64_t activity_sum = 0;

+  x->mb_activity_ptr = cpi->mb_activity_map;

   // for each macroblock row in image

   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {

 #if ALT_ACT_MEASURE

@@ -488,8 +487,7 @@

     int segment_id = mbmi->segment_id;

-    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||

-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB)) {

+    if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {

       for (i = 0; i < NB_TXFM_MODES; i++) {

         cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i];

@@ -598,9 +596,6 @@

           [vp9_switchable_interp_map[mbmi->interp_filter]];

-    cpi->prediction_error += ctx->distortion;

-    cpi->intra_error += ctx->intra_error;

     cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;

     cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY]   += ctx->comp_pred_diff;

     cpi->rd_comp_pred_diff[HYBRID_PREDICTION]      += ctx->hybrid_pred_diff;

@@ -625,24 +620,12 @@

 static void set_offsets(VP9_COMP *cpi,

-                        int mb_row, int mb_col, int block_size,

-                        int *ref_yoffset, int *ref_uvoffset) {

+                        int mb_row, int mb_col, int block_size) {

   MACROBLOCK *const x = &cpi->mb;

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCKD *const xd = &x->e_mbd;

   MB_MODE_INFO *mbmi;

   const int dst_fb_idx = cm->new_fb_idx;

-  const int recon_y_stride = cm->yv12_fb[dst_fb_idx].y_stride;

-  const int recon_uv_stride = cm->yv12_fb[dst_fb_idx].uv_stride;

-  const int recon_yoffset = 16 * mb_row * recon_y_stride + 16 * mb_col;

-  const int recon_uvoffset = 8 * mb_row * recon_uv_stride + 8 * mb_col;

-  const int src_y_stride = x->src.y_stride;

-  const int src_uv_stride = x->src.uv_stride;

-  const int src_yoffset = 16 * mb_row * src_y_stride + 16 * mb_col;

-  const int src_uvoffset = 8 * mb_row * src_uv_stride + 8 * mb_col;

-  const int ref_fb_idx = cm->lst_fb_idx;

-  const int ref_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;

-  const int ref_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;

   const int idx_map = mb_row * cm->mb_cols + mb_col;

   const int idx_str = xd->mode_info_stride * mb_row + mb_col;

@@ -664,9 +647,9 @@

   xd->prev_mode_info_context = cm->prev_mi + idx_str;

   // Set up destination pointers

-  xd->dst.y_buffer = cm->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;

-  xd->dst.u_buffer = cm->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;

-  xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;

+  setup_pred_block(&xd->dst,

+                   &cm->yv12_fb[dst_fb_idx],

+                   mb_row, mb_col, NULL, NULL);

   /* Set up limit values for MV components to prevent them from

    * extending beyond the UMV borders assuming 16x16 block size */

@@ -680,23 +663,11 @@

   // Set up distance of MB to edge of frame in 1/8th pel units

   block_size >>= 4;  // in macroblock units

   assert(!(mb_col & (block_size - 1)) && !(mb_row & (block_size - 1)));

-  xd->mb_to_top_edge    = -((mb_row * 16) << 3);

-  xd->mb_to_left_edge   = -((mb_col * 16) << 3);

-  xd->mb_to_bottom_edge = ((cm->mb_rows - block_size - mb_row) * 16) << 3;

-  xd->mb_to_right_edge  = ((cm->mb_cols - block_size - mb_col) * 16) << 3;

+  set_mb_row(cm, xd, mb_row, block_size);

+  set_mb_col(cm, xd, mb_col, block_size);

-  // Are edges available for intra prediction?

-  xd->up_available   = (mb_row != 0);

-  xd->left_available = (mb_col != 0);

-  /* Reference buffer offsets */

-  *ref_yoffset  = (mb_row * ref_y_stride * 16) + (mb_col * 16);

-  *ref_uvoffset = (mb_row * ref_uv_stride * 8) + (mb_col *  8);

   /* set up source buffers */

-  x->src.y_buffer = cpi->Source->y_buffer + src_yoffset;

-  x->src.u_buffer = cpi->Source->u_buffer + src_uvoffset;

-  x->src.v_buffer = cpi->Source->v_buffer + src_uvoffset;

+  setup_pred_block(&x->src, cpi->Source, mb_row, mb_col, NULL, NULL);

   /* R/D setup */

   x->rddiv = cpi->RDDIV;

@@ -727,9 +698,11 @@

       const int x = mb_col & ~3;

       const int p16 = ((mb_row & 1) << 1) +  (mb_col & 1);

       const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);

+      const int tile_progress = cm->cur_tile_mb_col_start * cm->mb_rows;

+      const int mb_cols = cm->cur_tile_mb_col_end - cm->cur_tile_mb_col_start;

       cpi->seg0_progress =

-          ((y * cm->mb_cols + x * 4 + p32 + p16) << 16) / cm->MBs;

+          ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;

   } else {

     mbmi->segment_id = 0;

@@ -736,25 +709,25 @@

-static void pick_mb_modes(VP9_COMP *cpi,

-                          int mb_row,

-                          int mb_col,

-                          TOKENEXTRA **tp,

-                          int *totalrate,

-                          int *totaldist) {

+static int pick_mb_modes(VP9_COMP *cpi,

+                         int mb_row0,

+                         int mb_col0,

+                         TOKENEXTRA **tp,

+                         int *totalrate,

+                         int *totaldist) {

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

   int i;

-  int recon_yoffset, recon_uvoffset;

+  int splitmodes_used = 0;

   ENTROPY_CONTEXT_PLANES left_context[2];

   ENTROPY_CONTEXT_PLANES above_context[2];

   ENTROPY_CONTEXT_PLANES *initial_above_context_ptr = cm->above_context

-                                                      + mb_col;

+                                                      + mb_col0;

   /* Function should not modify L & A contexts; save and restore on exit */

   vpx_memcpy(left_context,

-             cm->left_context + (mb_row & 2),

+             cm->left_context + (mb_row0 & 2),

              sizeof(left_context));

   vpx_memcpy(above_context,

              initial_above_context_ptr,

@@ -763,9 +736,11 @@

   /* Encode MBs in raster order within the SB */

   for (i = 0; i < 4; i++) {

     const int x_idx = i & 1, y_idx = i >> 1;

+    const int mb_row = mb_row0 + y_idx;

+    const int mb_col = mb_col0 + x_idx;

     MB_MODE_INFO *mbmi;

-    if ((mb_row + y_idx >= cm->mb_rows) || (mb_col + x_idx >= cm->mb_cols)) {

+    if ((mb_row >= cm->mb_rows) || (mb_col >= cm->mb_cols)) {

       // MB lies outside frame, move on

       continue;

@@ -772,8 +747,7 @@

     // Index of the MB in the SB 0..3

     xd->mb_index = i;

-    set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,

-                &recon_yoffset, &recon_uvoffset);

+    set_offsets(cpi, mb_row, mb_col, 16);

     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)

       vp9_activity_masking(cpi, x);

@@ -781,15 +755,11 @@

     mbmi = &xd->mode_info_context->mbmi;

     mbmi->sb_type = BLOCK_SIZE_MB16X16;

-    cpi->update_context = 0;    // TODO Do we need this now??

-    vp9_intra_prediction_down_copy(xd);

     // Find best coding mode & reconstruct the MB so it is available

     // as a predictor for MBs that follow in the SB

     if (cm->frame_type == KEY_FRAME) {

       int r, d;

-#ifdef ENC_DEBUG

+#if 0  // ENC_DEBUG

       if (enc_debug)

         printf("intra pick_mb_modes %d %d\n", mb_row, mb_col);

 #endif

@@ -798,8 +768,8 @@

       *totaldist += d;

       // Dummy encode, do not do the tokenization

-      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,

-                        mb_row + y_idx, mb_col + x_idx);

+      encode_macroblock(cpi, tp, 0, mb_row, mb_col);

       // Note the encoder may have changed the segment_id

       // Save the coding context

@@ -808,18 +778,18 @@

     } else {

       int seg_id, r, d;

-#ifdef ENC_DEBUG

+#if 0  // ENC_DEBUG

       if (enc_debug)

         printf("inter pick_mb_modes %d %d\n", mb_row, mb_col);

 #endif

-      vp9_pick_mode_inter_macroblock(cpi, x, recon_yoffset,

-                                     recon_uvoffset, &r, &d);

+      vp9_pick_mode_inter_macroblock(cpi, x, mb_row, mb_col, &r, &d);

       *totalrate += r;

       *totaldist += d;

+      splitmodes_used += (mbmi->mode == SPLITMV);

       // Dummy encode, do not do the tokenization

-      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset, 0,

-                        mb_row + y_idx, mb_col + x_idx);

+      encode_macroblock(cpi, tp, 0, mb_row, mb_col);

       seg_id = mbmi->segment_id;

       if (cpi->mb.e_mbd.segmentation_enabled && seg_id == 0) {

@@ -842,12 +812,14 @@

   /* Restore L & A coding context to those in place on entry */

-  vpx_memcpy(cm->left_context + (mb_row & 2),

+  vpx_memcpy(cm->left_context + (mb_row0 & 2),

              left_context,

              sizeof(left_context));

   vpx_memcpy(initial_above_context_ptr,

              above_context,

              sizeof(above_context));

+  return splitmodes_used;

 static void pick_sb_modes(VP9_COMP *cpi,

@@ -859,13 +831,11 @@

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

-  int recon_yoffset, recon_uvoffset;

-  set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);

+  set_offsets(cpi, mb_row, mb_col, 32);

   xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB32X32;

   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)

     vp9_activity_masking(cpi, x);

-  cpi->update_context = 0;    // TODO Do we need this now??

   /* Find best coding mode & reconstruct the MB so it is available

    * as a predictor for MBs that follow in the SB */

@@ -878,11 +848,7 @@

     vpx_memcpy(&x->sb32_context[xd->sb_index].mic, xd->mode_info_context,

                sizeof(MODE_INFO));

   } else {

-    vp9_rd_pick_inter_mode_sb32(cpi, x,

-                                recon_yoffset,

-                                recon_uvoffset,

-                                totalrate,

-                                totaldist);

+    vp9_rd_pick_inter_mode_sb32(cpi, x, mb_row, mb_col, totalrate, totaldist);

@@ -895,34 +861,25 @@

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

-  int recon_yoffset, recon_uvoffset;

-  set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);

+  set_offsets(cpi, mb_row, mb_col, 64);

   xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;

   if (cpi->oxcf.tuning == VP8_TUNE_SSIM)

     vp9_activity_masking(cpi, x);

-  cpi->update_context = 0;    // TODO(rbultje) Do we need this now??

   /* Find best coding mode & reconstruct the MB so it is available

    * as a predictor for MBs that follow in the SB */

   if (cm->frame_type == KEY_FRAME) {

-    vp9_rd_pick_intra_mode_sb64(cpi, x,

-                                totalrate,

-                                totaldist);

+    vp9_rd_pick_intra_mode_sb64(cpi, x, totalrate, totaldist);

     /* Save the coding context */

-    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context,

-               sizeof(MODE_INFO));

+    vpx_memcpy(&x->sb64_context.mic, xd->mode_info_context, sizeof(MODE_INFO));

   } else {

-    vp9_rd_pick_inter_mode_sb64(cpi, x,

-                                recon_yoffset,

-                                recon_uvoffset,

-                                totalrate,

-                                totaldist);

+    vp9_rd_pick_inter_mode_sb64(cpi, x, mb_row, mb_col, totalrate, totaldist);

-static void update_stats(VP9_COMP *cpi) {

+static void update_stats(VP9_COMP *cpi, int mb_row, int mb_col) {

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

@@ -976,6 +933,9 @@

     if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))

       cpi->inter_zz_count++;

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_update_nzc_counts(&cpi->common, xd, mb_row, mb_col);

+#endif

 static void encode_sb(VP9_COMP *cpi,

@@ -986,17 +946,17 @@

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

-  int recon_yoffset, recon_uvoffset;

   cpi->sb32_count[is_sb]++;

   if (is_sb) {

-    set_offsets(cpi, mb_row, mb_col, 32, &recon_yoffset, &recon_uvoffset);

+    set_offsets(cpi, mb_row, mb_col, 32);

     update_state(cpi, &x->sb32_context[xd->sb_index], 32, output_enabled);

-    encode_superblock32(cpi, tp, recon_yoffset, recon_uvoffset,

+    encode_superblock32(cpi, tp,

                         output_enabled, mb_row, mb_col);

-    if (output_enabled)

-      update_stats(cpi);

+    if (output_enabled) {

+      update_stats(cpi, mb_row, mb_col);

+    }

     if (output_enabled) {

       (*tp)->Token = EOSB_TOKEN;

@@ -1015,8 +975,7 @@

         continue;

-      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16,

-                  &recon_yoffset, &recon_uvoffset);

+      set_offsets(cpi, mb_row + y_idx, mb_col + x_idx, 16);

       xd->mb_index = i;

       update_state(cpi, &x->mb_context[xd->sb_index][i], 16, output_enabled);

@@ -1023,16 +982,15 @@

       if (cpi->oxcf.tuning == VP8_TUNE_SSIM)

         vp9_activity_masking(cpi, x);

-      vp9_intra_prediction_down_copy(xd);

-      encode_macroblock(cpi, tp, recon_yoffset, recon_uvoffset,

+      encode_macroblock(cpi, tp,

                         output_enabled, mb_row + y_idx, mb_col + x_idx);

-      if (output_enabled)

-        update_stats(cpi);

+      if (output_enabled) {

+        update_stats(cpi, mb_row + y_idx, mb_col + x_idx);

+      }

       if (output_enabled) {

         (*tp)->Token = EOSB_TOKEN;

-        (*tp)++;

+       (*tp)++;

         if (mb_row + y_idx < cm->mb_rows)

           cpi->tplist[mb_row + y_idx].stop = *tp;

@@ -1060,13 +1018,11 @@

   cpi->sb64_count[is_sb[0] == 2]++;

   if (is_sb[0] == 2) {

-    int recon_yoffset, recon_uvoffset;

-    set_offsets(cpi, mb_row, mb_col, 64, &recon_yoffset, &recon_uvoffset);

+    set_offsets(cpi, mb_row, mb_col, 64);

     update_state(cpi, &x->sb64_context, 64, 1);

-    encode_superblock64(cpi, tp, recon_yoffset, recon_uvoffset,

+    encode_superblock64(cpi, tp,

                         1, mb_row, mb_col);

-    update_stats(cpi);

+    update_stats(cpi, mb_row, mb_col);

     (*tp)->Token = EOSB_TOKEN;

     (*tp)++;

@@ -1098,17 +1054,18 @@

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

   int mb_col;

-  int mb_cols = cm->mb_cols;

   // Initialize the left context for the new SB row

   vpx_memset(cm->left_context, 0, sizeof(cm->left_context));

   // Code each SB in the row

-  for (mb_col = 0; mb_col < mb_cols; mb_col += 4) {

+  for (mb_col = cm->cur_tile_mb_col_start;

+       mb_col < cm->cur_tile_mb_col_end; mb_col += 4) {

     int i;

     int sb32_rate = 0, sb32_dist = 0;

     int is_sb[4];

     int sb64_rate = INT_MAX, sb64_dist;

+    int sb64_skip = 0;

     ENTROPY_CONTEXT_PLANES l[4], a[4];

     TOKENEXTRA *tp_orig = *tp;

@@ -1118,6 +1075,8 @@

       const int x_idx = (i & 1) << 1, y_idx = i & 2;

       int mb_rate = 0, mb_dist = 0;

       int sb_rate = INT_MAX, sb_dist;

+      int splitmodes_used = 0;

+      int sb32_skip = 0;

       if (mb_row + y_idx >= cm->mb_rows || mb_col + x_idx >= cm->mb_cols)

         continue;

@@ -1124,12 +1083,19 @@

       xd->sb_index = i;

-      pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,

-                    tp, &mb_rate, &mb_dist);

+      splitmodes_used = pick_mb_modes(cpi, mb_row + y_idx, mb_col + x_idx,

+                                      tp, &mb_rate, &mb_dist);

       mb_rate += vp9_cost_bit(cm->sb32_coded, 0);

-      if (!(((    mb_cols & 1) && mb_col + x_idx ==     mb_cols - 1) ||

-            ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {

+      if (cpi->sf.splitmode_breakout) {

+        sb32_skip = splitmodes_used;

+        sb64_skip += splitmodes_used;

+      }

+      if ( !sb32_skip &&

+           !(((cm->mb_cols & 1) && mb_col + x_idx == cm->mb_cols - 1) ||

+             ((cm->mb_rows & 1) && mb_row + y_idx == cm->mb_rows - 1))) {

         /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */

         pick_sb_modes(cpi, mb_row + y_idx, mb_col + x_idx,

                       tp, &sb_rate, &sb_dist);

@@ -1147,6 +1113,11 @@

         is_sb[i] = 0;

         sb32_rate += mb_rate;

         sb32_dist += mb_dist;

+        // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).

+        if (cpi->sf.mb16_breakout) {

+          ++sb64_skip;

+        }

       /* Encode SB using best computed mode(s) */

@@ -1162,7 +1133,8 @@

     memcpy(cm->left_context, &l, sizeof(l));

     sb32_rate += vp9_cost_bit(cm->sb64_coded, 0);

-    if (!(((    mb_cols & 3) && mb_col + 3 >=     mb_cols) ||

+    if (!sb64_skip &&

+        !(((cm->mb_cols & 3) && mb_col + 3 >= cm->mb_cols) ||

           ((cm->mb_rows & 3) && mb_row + 3 >= cm->mb_rows))) {

       pick_sb64_modes(cpi, mb_row, mb_col, tp, &sb64_rate, &sb64_dist);

       sb64_rate += vp9_cost_bit(cm->sb64_coded, 1);

@@ -1205,7 +1177,7 @@

   // Copy data over into macro block data structures.

   x->src = *cpi->Source;

-  xd->pre = cm->yv12_fb[cm->lst_fb_idx];

+  xd->pre = cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];

   xd->dst = cm->yv12_fb[cm->new_fb_idx];

   // set up frame for intra coded blocks

@@ -1239,22 +1211,38 @@

   vpx_memset(cm->above_context, 0,

              sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);

-  xd->fullpixel_mask = 0xffffffff;

-  if (cm->full_pixel)

-    xd->fullpixel_mask = 0xfffffff8;

+  xd->fullpixel_mask = cm->full_pixel ? 0xfffffff8 : 0xffffffff;

+static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {

+  if (lossless) {

+    cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;

+    cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;

+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;

+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;

+    cpi->mb.optimize              = 0;

+    cpi->common.filter_level      = 0;

+    cpi->zbin_mode_boost_enabled  = FALSE;

+    cpi->common.txfm_mode         = ONLY_4X4;

+  } else {

+    cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;

+    cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;

+    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;

+    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;

+  }

+}

 static void encode_frame_internal(VP9_COMP *cpi) {

   int mb_row;

   MACROBLOCK *const x = &cpi->mb;

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCKD *const xd = &x->e_mbd;

-  TOKENEXTRA *tp = cpi->tok;

   int totalrate;

-  // printf("encode_frame_internal frame %d (%d)\n",

-  //        cpi->common.current_video_frame, cpi->common.show_frame);

+//   fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",

+//            cpi->common.current_video_frame, cpi->common.show_frame,

+//            cm->frame_type);

   // Compute a modified set of reference frame probabilities to use when

   // prediction fails. These are based on the current general estimates for

@@ -1273,14 +1261,9 @@

   totalrate = 0;

-  // Functions setup for all frame types so we can use MC in AltRef

-  vp9_setup_interp_filters(xd, cm->mcomp_filter_type, cm);

   // Reset frame count of inter 0,0 motion vector usage.

   cpi->inter_zz_count = 0;

-  cpi->prediction_error = 0;

-  cpi->intra_error = 0;

   cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0;

   cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0;

@@ -1292,16 +1275,27 @@

   vp9_zero(cpi->NMVcount);

   vp9_zero(cpi->coef_counts_4x4);

-  vp9_zero(cpi->hybrid_coef_counts_4x4);

   vp9_zero(cpi->coef_counts_8x8);

-  vp9_zero(cpi->hybrid_coef_counts_8x8);

   vp9_zero(cpi->coef_counts_16x16);

-  vp9_zero(cpi->hybrid_coef_counts_16x16);

   vp9_zero(cpi->coef_counts_32x32);

+  vp9_zero(cm->fc.eob_branch_counts);

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_zero(cm->fc.nzc_counts_4x4);

+  vp9_zero(cm->fc.nzc_counts_8x8);

+  vp9_zero(cm->fc.nzc_counts_16x16);

+  vp9_zero(cm->fc.nzc_counts_32x32);

+  vp9_zero(cm->fc.nzc_pcat_counts);

+#endif

 #if CONFIG_NEW_MVREF

   vp9_zero(cpi->mb_mv_ref_count);

 #endif

+  cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 &&

+                            cm->y1dc_delta_q == 0 &&

+                            cm->uvdc_delta_q == 0 &&

+                            cm->uvac_delta_q == 0);

+  switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);

   vp9_frame_init_quantizer(cpi);

   vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y1dc_delta_q);

@@ -1330,17 +1324,29 @@

     vpx_usec_timer_start(&emr_timer);

-      // For each row of SBs in the frame

-      for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4) {

-        encode_sb_row(cpi, mb_row, &tp, &totalrate);

-      }

+      // Take tiles into account and give start/end MB

+      int tile_col, tile_row;

+      TOKENEXTRA *tp = cpi->tok;

-      cpi->tok_count = (unsigned int)(tp - cpi->tok);

+      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {

+        vp9_get_tile_row_offsets(cm, tile_row);

+        for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {

+          TOKENEXTRA *tp_old = tp;

+          // For each row of SBs in the frame

+          vp9_get_tile_col_offsets(cm, tile_col);

+          for (mb_row = cm->cur_tile_mb_row_start;

+               mb_row < cm->cur_tile_mb_row_end; mb_row += 4) {

+            encode_sb_row(cpi, mb_row, &tp, &totalrate);

+          }

+          cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);

+        }

+      }

     vpx_usec_timer_mark(&emr_timer);

     cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);

   // 256 rate units to the bit,

@@ -1347,7 +1353,6 @@

   // projected_frame_size in units of BYTES

   cpi->projected_frame_size = totalrate >> 8;

 #if 0

   // Keep record of the total distortion this time around for future use

   cpi->last_frame_distortion = cpi->frame_distortion;

@@ -1388,8 +1393,7 @@

     const int segment_id = mbmi->segment_id;

     xd->mode_info_context = mi;

-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||

+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||

            (cm->mb_no_coeff_skip && mbmi->mb_skip_coeff));

     mbmi->txfm_size = txfm_max;

@@ -1413,9 +1417,8 @@

   int x, y;

   for (y = 0; y < ymbs; y++) {

-    for (x = 0; x < xmbs; x++) {

+    for (x = 0; x < xmbs; x++)

       mi[y * mis + x].mbmi.txfm_size = txfm_size;

-    }

@@ -1433,8 +1436,7 @@

     const int xmbs = MIN(2, mb_cols_left);

     xd->mode_info_context = mi;

-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||

+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||

            (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));

     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);

@@ -1454,8 +1456,7 @@

     const int xmbs = MIN(4, mb_cols_left);

     xd->mode_info_context = mi;

-    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-            vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0) ||

+    assert((vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) ||

            (cm->mb_no_coeff_skip && get_skip_flag(mi, mis, ymbs, xmbs)));

     set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);

@@ -1526,9 +1527,9 @@

*/

     if (cpi->common.frame_type == KEY_FRAME)

       frame_type = 0;

-    else if (cpi->is_src_frame_alt_ref && cpi->common.refresh_golden_frame)

+    else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame)

       frame_type = 3;

-    else if (cpi->common.refresh_golden_frame || cpi->common.refresh_alt_ref_frame)

+    else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)

       frame_type = 1;

     else

       frame_type = 2;

@@ -1549,35 +1550,21 @@

       pred_type = HYBRID_PREDICTION;

     /* transform size (4x4, 8x8, 16x16 or select-per-mb) selection */

-#if CONFIG_LOSSLESS

+    cpi->mb.e_mbd.lossless = 0;

     if (cpi->oxcf.lossless) {

       txfm_type = ONLY_4X4;

+      cpi->mb.e_mbd.lossless = 1;

     } else

-#endif

-    /* FIXME (rbultje)

-     * this is a hack (no really), basically to work around the complete

-     * nonsense coefficient cost prediction for keyframes. The probabilities

-     * are reset to defaults, and thus we basically have no idea how expensive

-     * a 4x4 vs. 8x8 will really be. The result is that any estimate at which

-     * of the two is better is utterly bogus.

-     * I'd like to eventually remove this hack, but in order to do that, we

-     * need to move the frame reset code from the frame encode init to the

-     * bitstream write code, or alternatively keep a backup of the previous

-     * keyframe's probabilities as an estimate of what the current keyframe's

-     * coefficient cost distributions may look like. */

-    if (frame_type == 0) {

-      txfm_type = ALLOW_32X32;

-    } else

 #if 0

-    /* FIXME (rbultje)

-     * this code is disabled for a similar reason as the code above; the

-     * problem is that each time we "revert" to 4x4 only (or even 8x8 only),

-     * the coefficient probabilities for 16x16 (and 8x8) start lagging behind,

-     * thus leading to them lagging further behind and not being chosen for

-     * subsequent frames either. This is essentially a local minimum problem

-     * that we can probably fix by estimating real costs more closely within

-     * a frame, perhaps by re-calculating costs on-the-fly as frame encoding

-     * progresses. */

+    /* FIXME (rbultje): this code is disabled until we support cost updates

+     * while a frame is being encoded; the problem is that each time we

+     * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities

+     * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging

+     * further behind and not being chosen for subsequent frames either. This

+     * is essentially a local minimum problem that we can probably fix by

+     * estimating real costs more closely within a frame, perhaps by re-

+     * calculating costs on-the-fly as frame encoding progresses. */

     if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >

             cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&

         cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >

@@ -1671,7 +1658,7 @@

     // Update interpolation filter strategy for next frame.

     if ((cpi->common.frame_type != KEY_FRAME) && (cpi->sf.search_best_filter))

-      select_interp_filter_type(cpi);

+      vp9_select_interp_filter_type(cpi);

   } else {

     encode_frame_internal(cpi);

@@ -1683,30 +1670,23 @@

   int i;

   for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

+    for (c = 0; c < 4; c++)

       x->block[r * 4 + c].src_diff = x->src_diff + r * 4 * 16 + c * 4;

-    }

   for (r = 0; r < 2; r++) {

-    for (c = 0; c < 2; c++) {

+    for (c = 0; c < 2; c++)

       x->block[16 + r * 2 + c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;

-    }

   for (r = 0; r < 2; r++) {

-    for (c = 0; c < 2; c++) {

+    for (c = 0; c < 2; c++)

       x->block[20 + r * 2 + c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;

-    }

-  x->block[24].src_diff = x->src_diff + 384;

-  for (i = 0; i < 25; i++) {

+  for (i = 0; i < 24; i++)

     x->block[i].coeff = x->coeff + i * 16;

-  }

 void vp9_build_block_offsets(MACROBLOCK *x) {

@@ -1826,63 +1806,6 @@

 #endif

-static void update_sb_skip_coeff_state(VP9_COMP *cpi,

-                                       ENTROPY_CONTEXT_PLANES ta[4],

-                                       ENTROPY_CONTEXT_PLANES tl[4],

-                                       TOKENEXTRA *t[4],

-                                       TOKENEXTRA **tp,

-                                       int skip[4], int output_enabled) {

-  MACROBLOCK *const x = &cpi->mb;

-  TOKENEXTRA tokens[4][16 * 25];

-  int n_tokens[4], n;

-  // if there were no skips, we don't need to do anything

-  if (!skip[0] && !skip[1] && !skip[2] && !skip[3])

-    return;

-  // if we don't do coeff skipping for this frame, we don't

-  // need to do anything here

-  if (!cpi->common.mb_no_coeff_skip)

-    return;

-  // if all 4 MBs skipped coeff coding, nothing to be done

-  if (skip[0] && skip[1] && skip[2] && skip[3])

-    return;

-  // so the situation now is that we want to skip coeffs

-  // for some MBs, but not all, and we didn't code EOB

-  // coefficients for them. However, the skip flag for this

-  // SB will be 0 overall, so we need to insert EOBs in the

-  // middle of the token tree. Do so here.

-  n_tokens[0] = t[1] - t[0];

-  n_tokens[1] = t[2] - t[1];

-  n_tokens[2] = t[3] - t[2];

-  n_tokens[3] = *tp  - t[3];

-  if (n_tokens[0])

-    memcpy(tokens[0], t[0], n_tokens[0] * sizeof(*t[0]));

-  if (n_tokens[1])

-    memcpy(tokens[1], t[1], n_tokens[1] * sizeof(*t[0]));

-  if (n_tokens[2])

-    memcpy(tokens[2], t[2], n_tokens[2] * sizeof(*t[0]));

-  if (n_tokens[3])

-    memcpy(tokens[3], t[3], n_tokens[3] * sizeof(*t[0]));

-  // reset pointer, stuff EOBs where necessary

-  *tp = t[0];

-  for (n = 0; n < 4; n++) {

-    if (skip[n]) {

-      x->e_mbd.above_context = &ta[n];

-      x->e_mbd.left_context  = &tl[n];

-      vp9_stuff_mb(cpi, &x->e_mbd, tp, !output_enabled);

-    } else {

-      if (n_tokens[n]) {

-        memcpy(*tp, tokens[n], sizeof(*t[0]) * n_tokens[n]);

-      }

-      (*tp) += n_tokens[n];

-    }

-  }

-}

 static void update_sb64_skip_coeff_state(VP9_COMP *cpi,

                                          ENTROPY_CONTEXT_PLANES ta[16],

                                          ENTROPY_CONTEXT_PLANES tl[16],

@@ -1994,21 +1917,151 @@

+#if CONFIG_CODE_NONZEROCOUNT

+static void gather_nzcs_mb16(VP9_COMMON *const cm,

+                             MACROBLOCKD *xd) {

+  int i;

+  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,

+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));

+  switch (xd->mode_info_context->mbmi.txfm_size) {

+    case TX_4X4:

+      for (i = 0; i < 24; ++i) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    case TX_8X8:

+      for (i = 0; i < 16; i += 4) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||

+          xd->mode_info_context->mbmi.mode == SPLITMV) {

+        for (i = 16; i < 24; ++i) {

+          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+        }

+      } else {

+        for (i = 16; i < 24; i += 4) {

+          xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+        }

+      }

+      break;

+    case TX_16X16:

+      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];

+      for (i = 16; i < 24; i += 4) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    default:

+      break;

+  }

+}

+static void gather_nzcs_sb32(VP9_COMMON *const cm,

+                             MACROBLOCKD *xd) {

+  int i, j;

+  MODE_INFO *m = xd->mode_info_context;

+  int mis = cm->mode_info_stride;

+  vpx_memset(m->mbmi.nzcs, 0,

+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));

+  switch (xd->mode_info_context->mbmi.txfm_size) {

+    case TX_4X4:

+      for (i = 0; i < 96; ++i) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    case TX_8X8:

+      for (i = 0; i < 96; i += 4) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    case TX_16X16:

+      for (i = 0; i < 96; i += 16) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    case TX_32X32:

+      xd->mode_info_context->mbmi.nzcs[0] = xd->nzcs[0];

+      for (i = 64; i < 96; i += 16) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    default:

+      break;

+  }

+  for (i = 0; i < 2; ++i)

+    for (j = 0; j < 2; ++j) {

+      if (i == 0 && j == 0) continue;

+      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,

+                 384 * sizeof(m->mbmi.nzcs[0]));

+    }

+}

+static void gather_nzcs_sb64(VP9_COMMON *const cm,

+                             MACROBLOCKD *xd) {

+  int i, j;

+  MODE_INFO *m = xd->mode_info_context;

+  int mis = cm->mode_info_stride;

+  vpx_memset(xd->mode_info_context->mbmi.nzcs, 0,

+             384 * sizeof(xd->mode_info_context->mbmi.nzcs[0]));

+  switch (xd->mode_info_context->mbmi.txfm_size) {

+    case TX_4X4:

+      for (i = 0; i < 384; ++i) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    case TX_8X8:

+      for (i = 0; i < 384; i += 4) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    case TX_16X16:

+      for (i = 0; i < 384; i += 16) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    case TX_32X32:

+      for (i = 0; i < 384; i += 64) {

+        xd->mode_info_context->mbmi.nzcs[i] = xd->nzcs[i];

+      }

+      break;

+    default:

+      break;

+  }

+  for (i = 0; i < 4; ++i)

+    for (j = 0; j < 4; ++j) {

+      if (i == 0 && j == 0) continue;

+      vpx_memcpy((m + j + mis * i)->mbmi.nzcs, m->mbmi.nzcs,

+                 384 * sizeof(m->mbmi.nzcs[0]));

+    }

+}

+#endif

 static void encode_macroblock(VP9_COMP *cpi, TOKENEXTRA **t,

-                              int recon_yoffset, int recon_uvoffset,

                               int output_enabled,

                               int mb_row, int mb_col) {

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

-  MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

+  MODE_INFO *mi = xd->mode_info_context;

+  MB_MODE_INFO *const mbmi = &mi->mbmi;

+  const int mis = cm->mode_info_stride;

   unsigned char ref_pred_flag;

   assert(!xd->mode_info_context->mbmi.sb_type);

 #ifdef ENC_DEBUG

-  enc_debug = (cpi->common.current_video_frame == 46 &&

-               mb_row == 5 && mb_col == 2);

+  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&

+               mb_row == 8 && mb_col == 0 && output_enabled);

   if (enc_debug)

     printf("Encode MB %d %d output %d\n", mb_row, mb_col, output_enabled);

 #endif

@@ -2037,9 +2090,11 @@

           else

             cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;

         } else if (mbmi->mode == SPLITMV)

-          cpi->zbin_mode_boost = 0;

+          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;

         else

           cpi->zbin_mode_boost = MV_ZBIN_BOOST;

+      } else {

+        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;

@@ -2053,7 +2108,7 @@

   if (mbmi->ref_frame == INTRA_FRAME) {

-#ifdef ENC_DEBUG

+#if 0  // def ENC_DEBUG

     if (enc_debug) {

       printf("Mode %d skip %d tx_size %d\n", mbmi->mode, x->skip,

              mbmi->txfm_size);

@@ -2060,14 +2115,14 @@

 #endif

     if (mbmi->mode == B_PRED) {

-      vp9_encode_intra16x16mbuv(x);

+      vp9_encode_intra16x16mbuv(cm, x);

       vp9_encode_intra4x4mby(x);

     } else if (mbmi->mode == I8X8_PRED) {

       vp9_encode_intra8x8mby(x);

       vp9_encode_intra8x8mbuv(x);

     } else {

-      vp9_encode_intra16x16mbuv(x);

-      vp9_encode_intra16x16mby(x);

+      vp9_encode_intra16x16mbuv(cm, x);

+      vp9_encode_intra16x16mby(cm, x);

     if (output_enabled)

@@ -2086,36 +2141,35 @@

     assert(cm->frame_type != KEY_FRAME);

     if (mbmi->ref_frame == LAST_FRAME)

-      ref_fb_idx = cpi->common.lst_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];

     else if (mbmi->ref_frame == GOLDEN_FRAME)

-      ref_fb_idx = cpi->common.gld_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];

     else

-      ref_fb_idx = cpi->common.alt_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];

-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;

-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;

-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

+    setup_pred_block(&xd->pre,

+                     &cpi->common.yv12_fb[ref_fb_idx],

+                     mb_row, mb_col,

+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);

     if (mbmi->second_ref_frame > 0) {

       int second_ref_fb_idx;

       if (mbmi->second_ref_frame == LAST_FRAME)

-        second_ref_fb_idx = cpi->common.lst_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];

       else if (mbmi->second_ref_frame == GOLDEN_FRAME)

-        second_ref_fb_idx = cpi->common.gld_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];

       else

-        second_ref_fb_idx = cpi->common.alt_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];

-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +

-                                recon_yoffset;

-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +

-                                recon_uvoffset;

-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +

-                                recon_uvoffset;

+      setup_pred_block(&xd->second_pre,

+                       &cpi->common.yv12_fb[second_ref_fb_idx],

+                       mb_row, mb_col,

+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);

     if (!x->skip) {

-      vp9_encode_inter16x16(x);

+      vp9_encode_inter16x16(cm, x, mb_row, mb_col);

       // Clear mb_skip_coeff if mb_no_coeff_skip is not set

       if (!cpi->common.mb_no_coeff_skip)

@@ -2122,22 +2176,15 @@

         mbmi->mb_skip_coeff = 0;

     } else {

-      vp9_build_1st_inter16x16_predictors_mb(xd,

-                                             xd->dst.y_buffer,

-                                             xd->dst.u_buffer,

-                                             xd->dst.v_buffer,

-                                             xd->dst.y_stride,

-                                             xd->dst.uv_stride);

-      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

-        vp9_build_2nd_inter16x16_predictors_mb(xd,

-                                               xd->dst.y_buffer,

-                                               xd->dst.u_buffer,

-                                               xd->dst.v_buffer,

-                                               xd->dst.y_stride,

-                                               xd->dst.uv_stride);

-      }

+      vp9_build_inter16x16_predictors_mb(xd,

+                                         xd->dst.y_buffer,

+                                         xd->dst.u_buffer,

+                                         xd->dst.v_buffer,

+                                         xd->dst.y_stride,

+                                         xd->dst.uv_stride,

+                                         mb_row, mb_col);

 #if CONFIG_COMP_INTERINTRA_PRED

-      else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

+      if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) {

         vp9_build_interintra_16x16_predictors_mb(xd,

                                                  xd->dst.y_buffer,

                                                  xd->dst.u_buffer,

@@ -2155,7 +2202,7 @@

       int i, j;

       printf("\n");

       printf("qcoeff\n");

-      for (i = 0; i < 400; i++) {

+      for (i = 0; i < 384; i++) {

         printf("%3d ", xd->qcoeff[i]);

         if (i % 16 == 15) printf("\n");

@@ -2202,15 +2249,17 @@

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+    gather_nzcs_mb16(cm, xd);

+#endif

     vp9_tokenize_mb(cpi, xd, t, !output_enabled);

   } else {

-    int mb_skip_context =

-      cpi->common.mb_no_coeff_skip ?

-      (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +

-      (x->e_mbd.mode_info_context - cpi->common.mode_info_stride)->mbmi.mb_skip_coeff :

-      0;

-    if (cpi->common.mb_no_coeff_skip) {

+    // FIXME(rbultje): not tile-aware (mi - 1)

+    int mb_skip_context = cpi->common.mb_no_coeff_skip ?

+      (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;

+    if (cm->mb_no_coeff_skip) {

       mbmi->mb_skip_coeff = 1;

       if (output_enabled)

         cpi->skip_true_count[mb_skip_context]++;

@@ -2227,8 +2276,7 @@

     int segment_id = mbmi->segment_id;

     if (cpi->common.txfm_mode == TX_MODE_SELECT &&

         !((cpi->common.mb_no_coeff_skip && mbmi->mb_skip_coeff) ||

-          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_EOB) &&

-           vp9_get_segdata(&x->e_mbd, segment_id, SEG_LVL_EOB) == 0))) {

+          (vp9_segfeature_active(&x->e_mbd, segment_id, SEG_LVL_SKIP)))) {

       assert(mbmi->txfm_size <= TX_16X16);

       if (mbmi->mode != B_PRED && mbmi->mode != I8X8_PRED &&

           mbmi->mode != SPLITMV) {

@@ -2253,7 +2301,6 @@

 static void encode_superblock32(VP9_COMP *cpi, TOKENEXTRA **t,

-                                int recon_yoffset, int recon_uvoffset,

                                 int output_enabled, int mb_row, int mb_col) {

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

@@ -2267,14 +2314,22 @@

   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;

   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;

   unsigned char ref_pred_flag;

-  int n;

-  TOKENEXTRA *tp[4];

-  int skip[4];

   MODE_INFO *mi = x->e_mbd.mode_info_context;

   unsigned int segment_id = mi->mbmi.segment_id;

-  ENTROPY_CONTEXT_PLANES ta[4], tl[4];

   const int mis = cm->mode_info_stride;

+#ifdef ENC_DEBUG

+  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&

+               mb_row == 8 && mb_col == 0 && output_enabled);

+  if (enc_debug) {

+    printf("Encode SB32 %d %d output %d\n", mb_row, mb_col, output_enabled);

+    printf("Mode %d skip %d tx_size %d ref %d ref2 %d mv %d %d interp %d\n",

+           mi->mbmi.mode, x->skip, mi->mbmi.txfm_size,

+           mi->mbmi.ref_frame, mi->mbmi.second_ref_frame,

+           mi->mbmi.mv[0].as_mv.row, mi->mbmi.mv[0].as_mv.col,

+           mi->mbmi.interp_filter);

+  }

+#endif

   if (cm->frame_type == KEY_FRAME) {

     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {

       adjust_act_zbin(cpi, x);

@@ -2299,9 +2354,11 @@

           else

             cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;

         } else if (xd->mode_info_context->mbmi.mode == SPLITMV)

-          cpi->zbin_mode_boost = 0;

+          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;

         else

           cpi->zbin_mode_boost = MV_ZBIN_BOOST;

+      } else {

+        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;

@@ -2326,152 +2383,137 @@

     assert(cm->frame_type != KEY_FRAME);

     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)

-      ref_fb_idx = cpi->common.lst_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];

     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)

-      ref_fb_idx = cpi->common.gld_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];

     else

-      ref_fb_idx = cpi->common.alt_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];

-    xd->pre.y_buffer = cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;

-    xd->pre.u_buffer = cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;

-    xd->pre.v_buffer = cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

+    setup_pred_block(&xd->pre,

+                     &cpi->common.yv12_fb[ref_fb_idx],

+                     mb_row, mb_col,

+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);

     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

       int second_ref_fb_idx;

       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)

-        second_ref_fb_idx = cpi->common.lst_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];

       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)

-        second_ref_fb_idx = cpi->common.gld_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];

       else

-        second_ref_fb_idx = cpi->common.alt_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];

-      xd->second_pre.y_buffer = cpi->common.yv12_fb[second_ref_fb_idx].y_buffer +

-                                    recon_yoffset;

-      xd->second_pre.u_buffer = cpi->common.yv12_fb[second_ref_fb_idx].u_buffer +

-                                    recon_uvoffset;

-      xd->second_pre.v_buffer = cpi->common.yv12_fb[second_ref_fb_idx].v_buffer +

-                                    recon_uvoffset;

+      setup_pred_block(&xd->second_pre,

+                       &cpi->common.yv12_fb[second_ref_fb_idx],

+                       mb_row, mb_col,

+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);

     vp9_build_inter32x32_predictors_sb(xd, xd->dst.y_buffer,

                                        xd->dst.u_buffer, xd->dst.v_buffer,

-                                       xd->dst.y_stride, xd->dst.uv_stride);

+                                       xd->dst.y_stride, xd->dst.uv_stride,

+                                       mb_row, mb_col);

-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {

-    if (!x->skip) {

-      vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,

-                           dst, dst_y_stride);

-      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,

-                            usrc, vsrc, src_uv_stride,

-                            udst, vdst, dst_uv_stride);

-      vp9_transform_sby_32x32(x);

-      vp9_transform_sbuv_16x16(x);

-      vp9_quantize_sby_32x32(x);

-      vp9_quantize_sbuv_16x16(x);

-      // TODO(rbultje): trellis optimize

-      vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);

-      vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);

-      vp9_recon_sby_s_c(&x->e_mbd, dst);

-      vp9_recon_sbuv_s_c(&x->e_mbd, udst, vdst);

-      vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);

-    } else {

-      int mb_skip_context =

-          cpi->common.mb_no_coeff_skip ?

-          (mi - 1)->mbmi.mb_skip_coeff +

-          (mi - mis)->mbmi.mb_skip_coeff :

-          0;

-      mi->mbmi.mb_skip_coeff = 1;

-      if (cm->mb_no_coeff_skip) {

-        if (output_enabled)

-          cpi->skip_true_count[mb_skip_context]++;

-        vp9_fix_contexts_sb(xd);

-      } else {

-        vp9_stuff_sb(cpi, xd, t, !output_enabled);

-        if (output_enabled)

-          cpi->skip_false_count[mb_skip_context]++;

-      }

+  if (!x->skip) {

+    vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride,

+                         dst, dst_y_stride);

+    vp9_subtract_sbuv_s_c(x->src_diff,

+                          usrc, vsrc, src_uv_stride,

+                          udst, vdst, dst_uv_stride);

+    switch (mi->mbmi.txfm_size) {

+      case TX_32X32:

+        vp9_transform_sby_32x32(x);

+        vp9_transform_sbuv_16x16(x);

+        vp9_quantize_sby_32x32(x);

+        vp9_quantize_sbuv_16x16(x);

+        if (x->optimize) {

+          vp9_optimize_sby_32x32(cm, x);

+          vp9_optimize_sbuv_16x16(cm, x);

+        }

+        vp9_inverse_transform_sby_32x32(xd);

+        vp9_inverse_transform_sbuv_16x16(xd);

+        break;

+      case TX_16X16:

+        vp9_transform_sby_16x16(x);

+        vp9_transform_sbuv_16x16(x);

+        vp9_quantize_sby_16x16(x);

+        vp9_quantize_sbuv_16x16(x);

+        if (x->optimize) {

+          vp9_optimize_sby_16x16(cm, x);

+          vp9_optimize_sbuv_16x16(cm, x);

+        }

+        vp9_inverse_transform_sby_16x16(xd);

+        vp9_inverse_transform_sbuv_16x16(xd);

+        break;

+      case TX_8X8:

+        vp9_transform_sby_8x8(x);

+        vp9_transform_sbuv_8x8(x);

+        vp9_quantize_sby_8x8(x);

+        vp9_quantize_sbuv_8x8(x);

+        if (x->optimize) {

+          vp9_optimize_sby_8x8(cm, x);

+          vp9_optimize_sbuv_8x8(cm, x);

+        }

+        vp9_inverse_transform_sby_8x8(xd);

+        vp9_inverse_transform_sbuv_8x8(xd);

+        break;

+      case TX_4X4:

+        vp9_transform_sby_4x4(x);

+        vp9_transform_sbuv_4x4(x);

+        vp9_quantize_sby_4x4(x);

+        vp9_quantize_sbuv_4x4(x);

+        if (x->optimize) {

+          vp9_optimize_sby_4x4(cm, x);

+          vp9_optimize_sbuv_4x4(cm, x);

+        }

+        vp9_inverse_transform_sby_4x4(xd);

+        vp9_inverse_transform_sbuv_4x4(xd);

+        break;

+      default: assert(0);

+    vp9_recon_sby_s_c(xd, dst);

+    vp9_recon_sbuv_s_c(xd, udst, vdst);

+#if CONFIG_CODE_NONZEROCOUNT

+    gather_nzcs_sb32(cm, xd);

+#endif

-    // copy skip flag on all mb_mode_info contexts in this SB

-    // if this was a skip at this txfm size

-    if (mb_col < cm->mb_cols - 1)

-      mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

-    if (mb_row < cm->mb_rows - 1) {

-      mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

-      if (mb_col < cm->mb_cols - 1)

-        mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

-    }

-    skip[0] = skip[2] = skip[1] = skip[3] = mi->mbmi.mb_skip_coeff;

+    vp9_tokenize_sb(cpi, xd, t, !output_enabled);

   } else {

-    for (n = 0; n < 4; n++) {

-      int x_idx = n & 1, y_idx = n >> 1;

+    // FIXME(rbultje): not tile-aware (mi - 1)

+    int mb_skip_context = cm->mb_no_coeff_skip ?

+          (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;

-      xd->left_context = cm->left_context + y_idx + (mb_row & 2);

-      xd->above_context = cm->above_context + mb_col + x_idx;

-      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));

-      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));

-      tp[n] = *t;

-      xd->mode_info_context = mi + x_idx + y_idx * mis;

-      if (!x->skip) {

-        vp9_subtract_mby_s_c(x->src_diff,

-                             src + x_idx * 16 + y_idx * 16 * src_y_stride,

-                             src_y_stride,

-                             dst + x_idx * 16 + y_idx * 16 * dst_y_stride,

-                             dst_y_stride);

-        vp9_subtract_mbuv_s_c(x->src_diff,

-                              usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,

-                              vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,

-                              src_uv_stride,

-                              udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,

-                              vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,

-                              dst_uv_stride);

-        vp9_fidct_mb(x);

-        vp9_recon_mby_s_c(&x->e_mbd,

-                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride);

-        vp9_recon_mbuv_s_c(&x->e_mbd,

-                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,

-                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);

-        vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);

-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;

-      } else {

-        int mb_skip_context = cpi->common.mb_no_coeff_skip ?

-            (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +

-            (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff :

-            0;

-        xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;

-        if (cpi->common.mb_no_coeff_skip) {

-          // TODO(rbultje) this should be done per-sb instead of per-mb?

-          if (output_enabled)

-            cpi->skip_true_count[mb_skip_context]++;

-          vp9_reset_mb_tokens_context(xd);

-        } else {

-          vp9_stuff_mb(cpi, xd, t, !output_enabled);

-          // TODO(rbultje) this should be done per-sb instead of per-mb?

-          if (output_enabled)

-            cpi->skip_false_count[mb_skip_context]++;

-        }

-      }

+    mi->mbmi.mb_skip_coeff = 1;

+    if (cm->mb_no_coeff_skip) {

+      if (output_enabled)

+        cpi->skip_true_count[mb_skip_context]++;

+      vp9_reset_sb_tokens_context(xd);

+    } else {

+      vp9_stuff_sb(cpi, xd, t, !output_enabled);

+      if (output_enabled)

+        cpi->skip_false_count[mb_skip_context]++;

+  }

-    xd->mode_info_context = mi;

-    update_sb_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled);

+  // copy skip flag on all mb_mode_info contexts in this SB

+  // if this was a skip at this txfm size

+  if (mb_col < cm->mb_cols - 1)

+    mi[1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

+  if (mb_row < cm->mb_rows - 1) {

+    mi[mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

+    if (mb_col < cm->mb_cols - 1)

+      mi[mis + 1].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

   if (output_enabled) {

     if (cm->txfm_mode == TX_MODE_SELECT &&

-        !((cm->mb_no_coeff_skip && skip[0] && skip[1] && skip[2] && skip[3]) ||

-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {

+        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||

+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {

       cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;

     } else {

-      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?

-                      TX_32X32 :

-                      cm->txfm_mode;

+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;

       mi->mbmi.txfm_size = sz;

       if (mb_col < cm->mb_cols - 1)

         mi[1].mbmi.txfm_size = sz;

@@ -2485,7 +2527,6 @@

 static void encode_superblock64(VP9_COMP *cpi, TOKENEXTRA **t,

-                                int recon_yoffset, int recon_uvoffset,

                                 int output_enabled, int mb_row, int mb_col) {

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCK *const x = &cpi->mb;

@@ -2500,13 +2541,16 @@

   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;

   unsigned char ref_pred_flag;

   int n;

-  TOKENEXTRA *tp[16];

-  int skip[16];

   MODE_INFO *mi = x->e_mbd.mode_info_context;

   unsigned int segment_id = mi->mbmi.segment_id;

-  ENTROPY_CONTEXT_PLANES ta[16], tl[16];

   const int mis = cm->mode_info_stride;

+#ifdef ENC_DEBUG

+  enc_debug = (cpi->common.current_video_frame == 11 && cm->show_frame &&

+               mb_row == 8 && mb_col == 0 && output_enabled);

+  if (enc_debug)

+    printf("Encode SB64 %d %d output %d\n", mb_row, mb_col, output_enabled);

+#endif

   if (cm->frame_type == KEY_FRAME) {

     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {

       adjust_act_zbin(cpi, x);

@@ -2531,10 +2575,12 @@

           else

             cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;

         } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {

-          cpi->zbin_mode_boost = 0;

+          cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;

         } else {

           cpi->zbin_mode_boost = MV_ZBIN_BOOST;

+      } else {

+        cpi->zbin_mode_boost = INTRA_ZBIN_BOOST;

@@ -2557,186 +2603,134 @@

     assert(cm->frame_type != KEY_FRAME);

     if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)

-      ref_fb_idx = cpi->common.lst_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];

     else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)

-      ref_fb_idx = cpi->common.gld_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];

     else

-      ref_fb_idx = cpi->common.alt_fb_idx;

+      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];

-    xd->pre.y_buffer =

-        cpi->common.yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;

-    xd->pre.u_buffer =

-        cpi->common.yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;

-    xd->pre.v_buffer =

-        cpi->common.yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;

+    setup_pred_block(&xd->pre,

+                     &cpi->common.yv12_fb[ref_fb_idx],

+                     mb_row, mb_col,

+                     &xd->scale_factor[0], &xd->scale_factor_uv[0]);

     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

       int second_ref_fb_idx;

       if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)

-        second_ref_fb_idx = cpi->common.lst_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];

       else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)

-        second_ref_fb_idx = cpi->common.gld_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];

       else

-        second_ref_fb_idx = cpi->common.alt_fb_idx;

+        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];

-      xd->second_pre.y_buffer =

-          cpi->common.yv12_fb[second_ref_fb_idx].y_buffer + recon_yoffset;

-      xd->second_pre.u_buffer =

-          cpi->common.yv12_fb[second_ref_fb_idx].u_buffer + recon_uvoffset;

-      xd->second_pre.v_buffer =

-          cpi->common.yv12_fb[second_ref_fb_idx].v_buffer + recon_uvoffset;

+      setup_pred_block(&xd->second_pre,

+                       &cpi->common.yv12_fb[second_ref_fb_idx],

+                       mb_row, mb_col,

+                       &xd->scale_factor[1], &xd->scale_factor_uv[1]);

     vp9_build_inter64x64_predictors_sb(xd, xd->dst.y_buffer,

                                        xd->dst.u_buffer, xd->dst.v_buffer,

-                                       xd->dst.y_stride, xd->dst.uv_stride);

+                                       xd->dst.y_stride, xd->dst.uv_stride,

+                                       mb_row, mb_col);

-  if (xd->mode_info_context->mbmi.txfm_size == TX_32X32) {

-    int n;

+  if (!x->skip) {

+    vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);

+    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,

+                            udst, vdst, dst_uv_stride);

-    for (n = 0; n < 4; n++) {

-      int x_idx = n & 1, y_idx = n >> 1;

-      xd->mode_info_context = mi + x_idx * 2 + mis * y_idx * 2;

-      xd->left_context = cm->left_context + (y_idx << 1);

-      xd->above_context = cm->above_context + mb_col + (x_idx << 1);

-      memcpy(&ta[n * 2], xd->above_context, sizeof(*ta) * 2);

-      memcpy(&tl[n * 2], xd->left_context, sizeof(*tl) * 2);

-      tp[n] = *t;

-      xd->mode_info_context = mi + x_idx * 2 + y_idx * mis * 2;

-      if (!x->skip) {

-        vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,

-                             src + x_idx * 32 + y_idx * 32 * src_y_stride,

-                             src_y_stride,

-                             dst + x_idx * 32 + y_idx * 32 * dst_y_stride,

-                             dst_y_stride);

-        vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,

-                              usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,

-                              vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,

-                              src_uv_stride,

-                              udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,

-                              vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride,

-                              dst_uv_stride);

-        vp9_transform_sby_32x32(x);

-        vp9_transform_sbuv_16x16(x);

-        vp9_quantize_sby_32x32(x);

-        vp9_quantize_sbuv_16x16(x);

-        // TODO(rbultje): trellis optimize

-        vp9_inverse_transform_sbuv_16x16(&x->e_mbd.sb_coeff_data);

-        vp9_inverse_transform_sby_32x32(&x->e_mbd.sb_coeff_data);

-        vp9_recon_sby_s_c(&x->e_mbd,

-                          dst + 32 * x_idx + 32 * y_idx * dst_y_stride);

-        vp9_recon_sbuv_s_c(&x->e_mbd,

-                           udst + x_idx * 16 + y_idx * 16 * dst_uv_stride,

-                           vdst + x_idx * 16 + y_idx * 16 * dst_uv_stride);

-        vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled);

-      } else {

-        int mb_skip_context = cpi->common.mb_no_coeff_skip ?

-                              (mi - 1)->mbmi.mb_skip_coeff +

-                                  (mi - mis)->mbmi.mb_skip_coeff : 0;

-        xd->mode_info_context->mbmi.mb_skip_coeff = 1;

-        if (cm->mb_no_coeff_skip) {

-          if (output_enabled)

-            cpi->skip_true_count[mb_skip_context]++;

-          vp9_fix_contexts_sb(xd);

-        } else {

-          vp9_stuff_sb(cpi, xd, t, !output_enabled);

-          if (output_enabled)

-            cpi->skip_false_count[mb_skip_context]++;

+    switch (xd->mode_info_context->mbmi.txfm_size) {

+      case TX_32X32:

+        vp9_transform_sb64y_32x32(x);

+        vp9_transform_sb64uv_32x32(x);

+        vp9_quantize_sb64y_32x32(x);

+        vp9_quantize_sb64uv_32x32(x);

+        if (x->optimize) {

+          vp9_optimize_sb64y_32x32(cm, x);

+          vp9_optimize_sb64uv_32x32(cm, x);

-      }

-      // copy skip flag on all mb_mode_info contexts in this SB

-      // if this was a skip at this txfm size

-      if (mb_col + x_idx * 2 < cm->mb_cols - 1)

-        mi[mis * y_idx * 2 + x_idx * 2 + 1].mbmi.mb_skip_coeff =

-            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;

-      if (mb_row + y_idx * 2 < cm->mb_rows - 1) {

-        mi[mis * y_idx * 2 + x_idx * 2 + mis].mbmi.mb_skip_coeff =

-            mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;

-        if (mb_col + x_idx * 2 < cm->mb_cols - 1)

-          mi[mis * y_idx * 2 + x_idx * 2 + mis + 1].mbmi.mb_skip_coeff =

-              mi[mis * y_idx * 2 + x_idx * 2].mbmi.mb_skip_coeff;

-      }

-      skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;

+        vp9_inverse_transform_sb64y_32x32(xd);

+        vp9_inverse_transform_sb64uv_32x32(xd);

+        break;

+      case TX_16X16:

+        vp9_transform_sb64y_16x16(x);

+        vp9_transform_sb64uv_16x16(x);

+        vp9_quantize_sb64y_16x16(x);

+        vp9_quantize_sb64uv_16x16(x);

+        if (x->optimize) {

+          vp9_optimize_sb64y_16x16(cm, x);

+          vp9_optimize_sb64uv_16x16(cm, x);

+        }

+        vp9_inverse_transform_sb64y_16x16(xd);

+        vp9_inverse_transform_sb64uv_16x16(xd);

+        break;

+      case TX_8X8:

+        vp9_transform_sb64y_8x8(x);

+        vp9_transform_sb64uv_8x8(x);

+        vp9_quantize_sb64y_8x8(x);

+        vp9_quantize_sb64uv_8x8(x);

+        if (x->optimize) {

+          vp9_optimize_sb64y_8x8(cm, x);

+          vp9_optimize_sb64uv_8x8(cm, x);

+        }

+        vp9_inverse_transform_sb64y_8x8(xd);

+        vp9_inverse_transform_sb64uv_8x8(xd);

+        break;

+      case TX_4X4:

+        vp9_transform_sb64y_4x4(x);

+        vp9_transform_sb64uv_4x4(x);

+        vp9_quantize_sb64y_4x4(x);

+        vp9_quantize_sb64uv_4x4(x);

+        if (x->optimize) {

+          vp9_optimize_sb64y_4x4(cm, x);

+          vp9_optimize_sb64uv_4x4(cm, x);

+        }

+        vp9_inverse_transform_sb64y_4x4(xd);

+        vp9_inverse_transform_sb64uv_4x4(xd);

+        break;

+      default: assert(0);

+    vp9_recon_sb64y_s_c(xd, dst);

+    vp9_recon_sb64uv_s_c(&x->e_mbd, udst, vdst);

+#if CONFIG_CODE_NONZEROCOUNT

+    gather_nzcs_sb64(cm, &x->e_mbd);

+#endif

+    vp9_tokenize_sb64(cpi, &x->e_mbd, t, !output_enabled);

   } else {

-    for (n = 0; n < 16; n++) {

-      const int x_idx = n & 3, y_idx = n >> 2;

+    // FIXME(rbultje): not tile-aware (mi - 1)

+    int mb_skip_context = cpi->common.mb_no_coeff_skip ?

+        (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff : 0;

-      xd->left_context = cm->left_context + y_idx;

-      xd->above_context = cm->above_context + mb_col + x_idx;

-      memcpy(&ta[n], xd->above_context, sizeof(ta[n]));

-      memcpy(&tl[n], xd->left_context, sizeof(tl[n]));

-      tp[n] = *t;

-      xd->mode_info_context = mi + x_idx + y_idx * mis;

-      if (!x->skip) {

-        vp9_subtract_mby_s_c(x->src_diff,

-                             src + x_idx * 16 + y_idx * 16 * src_y_stride,

-                             src_y_stride,

-                             dst + x_idx * 16 + y_idx * 16 * dst_y_stride,

-                             dst_y_stride);

-        vp9_subtract_mbuv_s_c(x->src_diff,

-                              usrc + x_idx * 8 + y_idx * 8 * src_uv_stride,

-                              vsrc + x_idx * 8 + y_idx * 8 * src_uv_stride,

-                              src_uv_stride,

-                              udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,

-                              vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,

-                              dst_uv_stride);

-        vp9_fidct_mb(x);

-        vp9_recon_mby_s_c(&x->e_mbd,

-                          dst + x_idx * 16 + y_idx * 16 * dst_y_stride);

-        vp9_recon_mbuv_s_c(&x->e_mbd,

-                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,

-                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride);

-        vp9_tokenize_mb(cpi, &x->e_mbd, t, !output_enabled);

-        skip[n] = xd->mode_info_context->mbmi.mb_skip_coeff;

-      } else {

-        int mb_skip_context = cpi->common.mb_no_coeff_skip ?

-          (x->e_mbd.mode_info_context - 1)->mbmi.mb_skip_coeff +

-          (x->e_mbd.mode_info_context - mis)->mbmi.mb_skip_coeff : 0;

-        xd->mode_info_context->mbmi.mb_skip_coeff = skip[n] = 1;

-        if (cpi->common.mb_no_coeff_skip) {

-          // TODO(rbultje) this should be done per-sb instead of per-mb?

-          if (output_enabled)

-            cpi->skip_true_count[mb_skip_context]++;

-          vp9_reset_mb_tokens_context(xd);

-        } else {

-          vp9_stuff_mb(cpi, xd, t, !output_enabled);

-          // TODO(rbultje) this should be done per-sb instead of per-mb?

-          if (output_enabled)

-            cpi->skip_false_count[mb_skip_context]++;

-        }

-      }

+    xd->mode_info_context->mbmi.mb_skip_coeff = 1;

+    if (cm->mb_no_coeff_skip) {

+      if (output_enabled)

+        cpi->skip_true_count[mb_skip_context]++;

+      vp9_reset_sb64_tokens_context(xd);

+    } else {

+      vp9_stuff_sb64(cpi, xd, t, !output_enabled);

+      if (output_enabled)

+        cpi->skip_false_count[mb_skip_context]++;

-  xd->mode_info_context = mi;

-  update_sb64_skip_coeff_state(cpi, ta, tl, tp, t, skip, output_enabled);

+  // copy skip flag on all mb_mode_info contexts in this SB

+  // if this was a skip at this txfm size

+  for (n = 1; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    if (mb_col + x_idx < cm->mb_cols && mb_row + y_idx < cm->mb_rows)

+      mi[x_idx + y_idx * mis].mbmi.mb_skip_coeff = mi->mbmi.mb_skip_coeff;

+  }

   if (output_enabled) {

     if (cm->txfm_mode == TX_MODE_SELECT &&

-        !((cm->mb_no_coeff_skip &&

-           ((mi->mbmi.txfm_size == TX_32X32 &&

-             skip[0] && skip[1] && skip[2] && skip[3]) ||

-            (mi->mbmi.txfm_size != TX_32X32 &&

-             skip[0] && skip[1] && skip[2] && skip[3] &&

-             skip[4] && skip[5] && skip[6] && skip[7] &&

-             skip[8] && skip[9] && skip[10] && skip[11] &&

-             skip[12] && skip[13] && skip[14] && skip[15]))) ||

-          (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) &&

-           vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) == 0))) {

+        !((cm->mb_no_coeff_skip && mi->mbmi.mb_skip_coeff) ||

+          (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {

       cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;

     } else {

       int x, y;

-      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ?

-                    TX_32X32 :

-                    cm->txfm_mode;

+      TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;

       for (y = 0; y < 4; y++) {

         for (x = 0; x < 4; x++) {

           if (mb_col + x < cm->mb_cols && mb_row + y < cm->mb_rows) {

--- a/vp9/encoder/vp9_encodeframe.h

+++ b/vp9/encoder/vp9_encodeframe.h

@@ -14,8 +14,8 @@

 struct macroblock;

-extern void vp9_build_block_offsets(struct macroblock *x);

+void vp9_build_block_offsets(struct macroblock *x);

-extern void vp9_setup_block_ptrs(struct macroblock *x);

+void vp9_setup_block_ptrs(struct macroblock *x);

 #endif  // VP9_ENCODER_VP9_ENCODEFRAME_H_

--- a/vp9/encoder/vp9_encodeintra.c

+++ b/vp9/encoder/vp9_encodeintra.c

@@ -12,14 +12,11 @@

 #include "vp9_rtcd.h"

 #include "vp9/encoder/vp9_quantize.h"

 #include "vp9/common/vp9_reconintra.h"

-#include "vp9/common/vp9_reconintra4x4.h"

 #include "vp9/encoder/vp9_encodemb.h"

 #include "vp9/common/vp9_invtrans.h"

 #include "vp9/encoder/vp9_encodeintra.h"

 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) {

-  int i;

-  int intra_pred_var = 0;

   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;

   (void) cpi;

@@ -28,8 +25,10 @@

     mbmi->uv_mode = DC_PRED;

     mbmi->ref_frame = INTRA_FRAME;

-    vp9_encode_intra16x16mby(x);

+    vp9_encode_intra16x16mby(&cpi->common, x);

   } else {

+    int i;

     for (i = 0; i < 16; i++) {

       x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;

       vp9_encode_intra4x4block(x, i);

@@ -36,9 +35,7 @@

-  intra_pred_var = vp9_get_mb_ss(x->src_diff);

-  return intra_pred_var;

+  return vp9_get_mb_ss(x->src_diff);

 void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {

@@ -47,21 +44,22 @@

   TX_TYPE tx_type;

 #if CONFIG_NEWBINTRAMODES

-  b->bmi.as_mode.context = vp9_find_bpred_context(b);

+  b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b);

 #endif

-  vp9_intra4x4_predict(b, b->bmi.as_mode.first, b->predictor);

+  vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor);

   vp9_subtract_b(be, b, 16);

-  tx_type = get_tx_type_4x4(&x->e_mbd, b);

+  tx_type = get_tx_type_4x4(&x->e_mbd, ib);

   if (tx_type != DCT_DCT) {

-    vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);

-    vp9_ht_quantize_b_4x4(be, b, tx_type);

-    vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);

+    vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+    vp9_ht_quantize_b_4x4(x, ib, tx_type);

+    vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);

   } else {

-    x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);

-    x->quantize_b_4x4(be, b) ;

-    vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 32);

+    x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+    x->quantize_b_4x4(x, ib);

+    vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],

+                                b->dqcoeff, b->diff, 32);

   vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);

@@ -72,10 +70,9 @@

   for (i = 0; i < 16; i++)

     vp9_encode_intra4x4block(mb, i);

-  return;

-void vp9_encode_intra16x16mby(MACROBLOCK *x) {

+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x) {

   MACROBLOCKD *xd = &x->e_mbd;

   BLOCK *b = &x->block[0];

   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

@@ -84,30 +81,34 @@

   vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);

-  if (tx_size == TX_16X16) {

-    vp9_transform_mby_16x16(x);

-    vp9_quantize_mby_16x16(x);

-    if (x->optimize)

-      vp9_optimize_mby_16x16(x);

-    vp9_inverse_transform_mby_16x16(xd);

-  } else if (tx_size == TX_8X8) {

-    vp9_transform_mby_8x8(x);

-    vp9_quantize_mby_8x8(x);

-    if (x->optimize)

-      vp9_optimize_mby_8x8(x);

-    vp9_inverse_transform_mby_8x8(xd);

-  } else {

-    vp9_transform_mby_4x4(x);

-    vp9_quantize_mby_4x4(x);

-    if (x->optimize)

-      vp9_optimize_mby_4x4(x);

-    vp9_inverse_transform_mby_4x4(xd);

+  switch (tx_size) {

+    case TX_16X16:

+      vp9_transform_mby_16x16(x);

+      vp9_quantize_mby_16x16(x);

+      if (x->optimize)

+        vp9_optimize_mby_16x16(cm, x);

+      vp9_inverse_transform_mby_16x16(xd);

+      break;

+    case TX_8X8:

+      vp9_transform_mby_8x8(x);

+      vp9_quantize_mby_8x8(x);

+      if (x->optimize)

+        vp9_optimize_mby_8x8(cm, x);

+      vp9_inverse_transform_mby_8x8(xd);

+      break;

+    default:

+      vp9_transform_mby_4x4(x);

+      vp9_quantize_mby_4x4(x);

+      if (x->optimize)

+        vp9_optimize_mby_4x4(cm, x);

+      vp9_inverse_transform_mby_4x4(xd);

+      break;

   vp9_recon_mby(xd);

-void vp9_encode_intra16x16mbuv(MACROBLOCK *x) {

+void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x) {

   MACROBLOCKD *xd = &x->e_mbd;

   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

@@ -116,19 +117,22 @@

   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,

                     xd->predictor, x->src.uv_stride);

-  if (tx_size == TX_4X4) {

-    vp9_transform_mbuv_4x4(x);

-    vp9_quantize_mbuv_4x4(x);

-    if (x->optimize)

-      vp9_optimize_mbuv_4x4(x);

-    vp9_inverse_transform_mbuv_4x4(xd);

-  } else /* 16x16 or 8x8 */ {

-    vp9_transform_mbuv_8x8(x);

-    vp9_quantize_mbuv_8x8(x);

-    if (x->optimize)

-      vp9_optimize_mbuv_8x8(x);

-    vp9_inverse_transform_mbuv_8x8(xd);

-  }

+  switch (tx_size) {

+    case TX_4X4:

+      vp9_transform_mbuv_4x4(x);

+      vp9_quantize_mbuv_4x4(x);

+      if (x->optimize)

+        vp9_optimize_mbuv_4x4(cm, x);

+      vp9_inverse_transform_mbuv_4x4(xd);

+      break;

+    default:  // 16x16 or 8x8

+      vp9_transform_mbuv_8x8(x);

+      vp9_quantize_mbuv_8x8(x);

+      if (x->optimize)

+        vp9_optimize_mbuv_8x8(cm, x);

+      vp9_inverse_transform_mbuv_8x8(xd);

+      break;

+    }

   vp9_recon_intra_mbuv(xd);

@@ -141,7 +145,7 @@

   int i;

   TX_TYPE tx_type;

-  vp9_intra8x8_predict(b, b->bmi.as_mode.first, b->predictor);

+  vp9_intra8x8_predict(xd, b, b->bmi.as_mode.first, b->predictor);

   // generate residual blocks

   vp9_subtract_4b_c(be, b, 16);

@@ -148,16 +152,15 @@

   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {

     int idx = (ib & 0x02) ? (ib + 2) : ib;

-    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);

+    tx_type = get_tx_type_8x8(xd, ib);

     if (tx_type != DCT_DCT) {

-      vp9_fht(be->src_diff, 32, (x->block + idx)->coeff,

-                tx_type, 8);

-      x->quantize_b_8x8(x->block + idx, xd->block + idx);

-      vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,

-                   tx_type, 8, xd->block[idx].eob);

+      vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);

+      x->quantize_b_8x8(x, idx, tx_type);

+      vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,

+                            16, tx_type);

     } else {

-      x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);

-      x->quantize_b_8x8(x->block + idx, xd->block + idx);

+      x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);

+      x->quantize_b_8x8(x, idx, DCT_DCT);

       vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);

   } else {

@@ -164,15 +167,25 @@

     for (i = 0; i < 4; i++) {

       b = &xd->block[ib + iblock[i]];

       be = &x->block[ib + iblock[i]];

-      tx_type = get_tx_type_4x4(xd, b);

+      tx_type = get_tx_type_4x4(xd, ib + iblock[i]);

       if (tx_type != DCT_DCT) {

-        vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);

-        vp9_ht_quantize_b_4x4(be, b, tx_type);

-        vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);

+        vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+        vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);

+        vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type);

+      } else if (!(i & 1) &&

+                 get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {

+        x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+        x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);

+        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],

+                                    b->dqcoeff, b->diff, 32);

+        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i] + 1],

+                                    (b + 1)->dqcoeff, (b + 1)->diff, 32);

+        i++;

       } else {

-        x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);

-        x->quantize_b_4x4(be, b);

-        vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);

+        x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+        x->quantize_b_4x4(x, ib + iblock[i]);

+        vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]],

+                                    b->dqcoeff, b->diff, 32);

@@ -186,26 +199,24 @@

 void vp9_encode_intra8x8mby(MACROBLOCK *x) {

-  int i, ib;

+  int i;

-  for (i = 0; i < 4; i++) {

-    ib = vp9_i8x8_block[i];

-    vp9_encode_intra8x8(x, ib);

-  }

+  for (i = 0; i < 4; i++)

+    vp9_encode_intra8x8(x, vp9_i8x8_block[i]);

-static void encode_intra_uv4x4(MACROBLOCK *x, int ib,

-                               int mode) {

+static void encode_intra_uv4x4(MACROBLOCK *x, int ib, int mode) {

   BLOCKD *b = &x->e_mbd.block[ib];

   BLOCK *be = &x->block[ib];

-  vp9_intra_uv4x4_predict(b, mode, b->predictor);

+  vp9_intra_uv4x4_predict(&x->e_mbd, b, mode, b->predictor);

   vp9_subtract_b(be, b, 8);

-  x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);

-  x->quantize_b_4x4(be, b);

-  vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 16);

+  x->fwd_txm4x4(be->src_diff, be->coeff, 16);

+  x->quantize_b_4x4(x, ib);

+  vp9_inverse_transform_b_4x4(&x->e_mbd, x->e_mbd.eobs[ib],

+                              b->dqcoeff, b->diff, 16);

   vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,

                    b->dst_stride);

@@ -212,17 +223,13 @@

 void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {

-  int i, ib, mode;

-  BLOCKD *b;

+  int i;

   for (i = 0; i < 4; i++) {

-    ib = vp9_i8x8_block[i];

-    b = &x->e_mbd.block[ib];

-    mode = b->bmi.as_mode.first;

+    BLOCKD *b = &x->e_mbd.block[vp9_i8x8_block[i]];

+    int mode = b->bmi.as_mode.first;

-    /*u */

-    encode_intra_uv4x4(x, i + 16, mode);

-    /*v */

-    encode_intra_uv4x4(x, i + 20, mode);

+    encode_intra_uv4x4(x, i + 16, mode);  // u

+    encode_intra_uv4x4(x, i + 20, mode);  // v

--- a/vp9/encoder/vp9_encodeintra.h

+++ b/vp9/encoder/vp9_encodeintra.h

@@ -14,8 +14,8 @@

 #include "vp9/encoder/vp9_onyx_int.h"

 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);

-void vp9_encode_intra16x16mby(MACROBLOCK *x);

-void vp9_encode_intra16x16mbuv(MACROBLOCK *x);

+void vp9_encode_intra16x16mby(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_encode_intra16x16mbuv(VP9_COMMON *const cm, MACROBLOCK *x);

 void vp9_encode_intra4x4mby(MACROBLOCK *mb);

 void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);

 void vp9_encode_intra8x8mby(MACROBLOCK *x);

--- a/vp9/encoder/vp9_encodemb.c

+++ b/vp9/encoder/vp9_encodemb.c

@@ -29,9 +29,8 @@

   int r, c;

   for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

+    for (c = 0; c < 4; c++)

       diff_ptr[c] = src_ptr[c] - pred_ptr[c];

-    }

     diff_ptr += pitch;

     pred_ptr += pitch;

@@ -47,9 +46,9 @@

   int r, c;

   for (r = 0; r < 8; r++) {

-    for (c = 0; c < 8; c++) {

+    for (c = 0; c < 8; c++)

       diff_ptr[c] = src_ptr[c] - pred_ptr[c];

-    }

     diff_ptr += pitch;

     pred_ptr += pitch;

     src_ptr  += src_stride;

@@ -65,9 +64,8 @@

   int r, c;

   for (r = 0; r < 8; r++) {

-    for (c = 0; c < 8; c++) {

+    for (c = 0; c < 8; c++)

       udiff[c] = usrc[c] - upred[c];

-    }

     udiff += 8;

     upred += dst_stride;

@@ -98,9 +96,8 @@

   int r, c;

   for (r = 0; r < 16; r++) {

-    for (c = 0; c < 16; c++) {

+    for (c = 0; c < 16; c++)

       diff[c] = src[c] - pred[c];

-    }

     diff += 16;

     pred += dst_stride;

@@ -113,9 +110,8 @@

   int r, c;

   for (r = 0; r < 32; r++) {

-    for (c = 0; c < 32; c++) {

+    for (c = 0; c < 32; c++)

       diff[c] = src[c] - pred[c];

-    }

     diff += 32;

     pred += dst_stride;

@@ -132,9 +128,8 @@

   int r, c;

   for (r = 0; r < 16; r++) {

-    for (c = 0; c < 16; c++) {

+    for (c = 0; c < 16; c++)

       udiff[c] = usrc[c] - upred[c];

-    }

     udiff += 16;

     upred += dst_stride;

@@ -142,9 +137,8 @@

   for (r = 0; r < 16; r++) {

-    for (c = 0; c < 16; c++) {

+    for (c = 0; c < 16; c++)

       vdiff[c] = vsrc[c] - vpred[c];

-    }

     vdiff += 16;

     vpred += dst_stride;

@@ -152,6 +146,50 @@

+void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,

+                            const uint8_t *pred, int dst_stride) {

+  int r, c;

+  for (r = 0; r < 64; r++) {

+    for (c = 0; c < 64; c++) {

+      diff[c] = src[c] - pred[c];

+    }

+    diff += 64;

+    pred += dst_stride;

+    src  += src_stride;

+  }

+}

+void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,

+                             const uint8_t *vsrc, int src_stride,

+                             const uint8_t *upred,

+                             const uint8_t *vpred, int dst_stride) {

+  int16_t *udiff = diff + 4096;

+  int16_t *vdiff = diff + 4096 + 1024;

+  int r, c;

+  for (r = 0; r < 32; r++) {

+    for (c = 0; c < 32; c++) {

+      udiff[c] = usrc[c] - upred[c];

+    }

+    udiff += 32;

+    upred += dst_stride;

+    usrc  += src_stride;

+  }

+  for (r = 0; r < 32; r++) {

+    for (c = 0; c < 32; c++) {

+      vdiff[c] = vsrc[c] - vpred[c];

+    }

+    vdiff += 32;

+    vpred += dst_stride;

+    vsrc  += src_stride;

+  }

+}

 void vp9_subtract_mby_c(int16_t *diff, uint8_t *src,

                         uint8_t *pred, int stride) {

   vp9_subtract_mby_s_c(diff, src, stride, pred, 16);

@@ -166,52 +204,29 @@

                     x->e_mbd.predictor, x->src.uv_stride);

-static void build_dcblock_4x4(MACROBLOCK *x) {

-  int16_t *src_diff_ptr = &x->src_diff[384];

-  int i;

-  for (i = 0; i < 16; i++) {

-    src_diff_ptr[i] = x->coeff[i * 16];

-    x->coeff[i * 16] = 0;

-  }

-}

 void vp9_transform_mby_4x4(MACROBLOCK *x) {

   int i;

   MACROBLOCKD *xd = &x->e_mbd;

-  int has_2nd_order = get_2nd_order_usage(xd);

   for (i = 0; i < 16; i++) {

     BLOCK *b = &x->block[i];

-    TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);

+    TX_TYPE tx_type = get_tx_type_4x4(xd, i);

     if (tx_type != DCT_DCT) {

-      assert(has_2nd_order == 0);

-      vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 4);

+      vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type);

+    } else if (!(i & 1) && get_tx_type_4x4(xd, i + 1) == DCT_DCT) {

+      x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32);

+      i++;

     } else {

-      x->vp9_short_fdct4x4(&x->block[i].src_diff[0],

-                           &x->block[i].coeff[0], 32);

+      x->fwd_txm4x4(x->block[i].src_diff, x->block[i].coeff, 32);

-  if (has_2nd_order) {

-    // build dc block from 16 y dc values

-    build_dcblock_4x4(x);

-    // do 2nd order transform on the dc block

-    x->short_walsh4x4(&x->block[24].src_diff[0],

-                      &x->block[24].coeff[0], 8);

-  } else {

-    vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0]));

-  }

 void vp9_transform_mbuv_4x4(MACROBLOCK *x) {

   int i;

-  for (i = 16; i < 24; i += 2) {

-    x->vp9_short_fdct8x4(&x->block[i].src_diff[0],

-                         &x->block[i].coeff[0], 16);

-  }

+  for (i = 16; i < 24; i += 2)

+    x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 16);

 static void transform_mb_4x4(MACROBLOCK *x) {

@@ -219,71 +234,36 @@

   vp9_transform_mbuv_4x4(x);

-static void build_dcblock_8x8(MACROBLOCK *x) {

-  int16_t *src_diff_ptr = x->block[24].src_diff;

-  int i;

-  for (i = 0; i < 16; i++) {

-    src_diff_ptr[i] = 0;

-  }

-  src_diff_ptr[0] = x->coeff[0 * 16];

-  src_diff_ptr[1] = x->coeff[4 * 16];

-  src_diff_ptr[4] = x->coeff[8 * 16];

-  src_diff_ptr[8] = x->coeff[12 * 16];

-  x->coeff[0 * 16] = 0;

-  x->coeff[4 * 16] = 0;

-  x->coeff[8 * 16] = 0;

-  x->coeff[12 * 16] = 0;

-}

 void vp9_transform_mby_8x8(MACROBLOCK *x) {

   int i;

   MACROBLOCKD *xd = &x->e_mbd;

   TX_TYPE tx_type;

-  int has_2nd_order = get_2nd_order_usage(xd);

   for (i = 0; i < 9; i += 8) {

     BLOCK *b = &x->block[i];

-    tx_type = get_tx_type_8x8(xd, &xd->block[i]);

+    tx_type = get_tx_type_8x8(xd, i);

     if (tx_type != DCT_DCT) {

-      assert(has_2nd_order == 0);

-      vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 8);

+      vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type);

     } else {

-      x->vp9_short_fdct8x8(&x->block[i].src_diff[0],

-                           &x->block[i].coeff[0], 32);

+      x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 32);

   for (i = 2; i < 11; i += 8) {

     BLOCK *b = &x->block[i];

-    tx_type = get_tx_type_8x8(xd, &xd->block[i]);

+    tx_type = get_tx_type_8x8(xd, i);

     if (tx_type != DCT_DCT) {

-      assert(has_2nd_order == 0);

-      vp9_fht_c(b->src_diff, 32, (b + 2)->coeff, tx_type, 8);

+      vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type);

     } else {

-      x->vp9_short_fdct8x8(&x->block[i].src_diff[0],

-                           &x->block[i + 2].coeff[0], 32);

+      x->fwd_txm8x8(x->block[i].src_diff, x->block[i + 2].coeff, 32);

-  if (has_2nd_order) {

-    // build dc block from 2x2 y dc values

-    build_dcblock_8x8(x);

-    // do 2nd order transform on the dc block

-    x->short_fhaar2x2(&x->block[24].src_diff[0],

-                      &x->block[24].coeff[0], 8);

-  } else {

-    vpx_memset(x->block[24].coeff, 0, 16 * sizeof(x->block[24].coeff[0]));

-  }

 void vp9_transform_mbuv_8x8(MACROBLOCK *x) {

   int i;

-  for (i = 16; i < 24; i += 4) {

-    x->vp9_short_fdct8x8(&x->block[i].src_diff[0],

-                         &x->block[i].coeff[0], 16);

-  }

+  for (i = 16; i < 24; i += 4)

+    x->fwd_txm8x8(x->block[i].src_diff, x->block[i].coeff, 16);

 void vp9_transform_mb_8x8(MACROBLOCK *x) {

@@ -294,13 +274,12 @@

 void vp9_transform_mby_16x16(MACROBLOCK *x) {

   MACROBLOCKD *xd = &x->e_mbd;

   BLOCK *b = &x->block[0];

-  TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]);

+  TX_TYPE tx_type = get_tx_type_16x16(xd, 0);

   vp9_clear_system_state();

   if (tx_type != DCT_DCT) {

-    vp9_fht_c(b->src_diff, 32, b->coeff, tx_type, 16);

+    vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type);

   } else {

-    x->vp9_short_fdct16x16(&x->block[0].src_diff[0],

-                           &x->block[0].coeff[0], 32);

+    x->fwd_txm16x16(x->block[0].src_diff, x->block[0].coeff, 32);

@@ -310,19 +289,212 @@

 void vp9_transform_sby_32x32(MACROBLOCK *x) {

-  SUPERBLOCK * const x_sb = &x->sb_coeff_data;

-  vp9_short_fdct32x32(x_sb->src_diff, x_sb->coeff, 64);

+  vp9_short_fdct32x32(x->src_diff, x->coeff, 64);

+void vp9_transform_sby_16x16(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4);

+    if (tx_type != DCT_DCT) {

+      vp9_short_fht16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,

+                         x->coeff + n * 256, 32, tx_type);

+    } else {

+      x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16,

+                      x->coeff + n * 256, 64);

+    }

+  }

+}

+void vp9_transform_sby_8x8(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2);

+    if (tx_type != DCT_DCT) {

+      vp9_short_fht8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,

+                       x->coeff + n * 64, 32, tx_type);

+    } else {

+      x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8,

+                    x->coeff + n * 64, 64);

+    }

+  }

+}

+void vp9_transform_sby_4x4(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3;

+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx);

+    if (tx_type != DCT_DCT) {

+      vp9_short_fht4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,

+                       x->coeff + n * 16, 32, tx_type);

+    } else {

+      x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4,

+                    x->coeff + n * 16, 64);

+    }

+  }

+}

 void vp9_transform_sbuv_16x16(MACROBLOCK *x) {

-  SUPERBLOCK * const x_sb = &x->sb_coeff_data;

   vp9_clear_system_state();

-  x->vp9_short_fdct16x16(x_sb->src_diff + 1024,

-                         x_sb->coeff + 1024, 32);

-  x->vp9_short_fdct16x16(x_sb->src_diff + 1280,

-                         x_sb->coeff + 1280, 32);

+  x->fwd_txm16x16(x->src_diff + 1024, x->coeff + 1024, 32);

+  x->fwd_txm16x16(x->src_diff + 1280, x->coeff + 1280, 32);

+void vp9_transform_sbuv_8x8(MACROBLOCK *x) {

+  int n;

+  vp9_clear_system_state();

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    x->fwd_txm8x8(x->src_diff + 1024 + y_idx * 16 * 8 + x_idx * 8,

+                  x->coeff + 1024 + n * 64, 32);

+    x->fwd_txm8x8(x->src_diff + 1280 + y_idx * 16 * 8 + x_idx * 8,

+                  x->coeff + 1280 + n * 64, 32);

+  }

+}

+void vp9_transform_sbuv_4x4(MACROBLOCK *x) {

+  int n;

+  vp9_clear_system_state();

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    x->fwd_txm4x4(x->src_diff + 1024 + y_idx * 16 * 4 + x_idx * 4,

+                  x->coeff + 1024 + n * 16, 32);

+    x->fwd_txm4x4(x->src_diff + 1280 + y_idx * 16 * 4 + x_idx * 4,

+                  x->coeff + 1280 + n * 16, 32);

+  }

+}

+void vp9_transform_sb64y_32x32(MACROBLOCK *x) {

+  int n;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    vp9_short_fdct32x32(x->src_diff + y_idx * 64 * 32 + x_idx * 32,

+                        x->coeff + n * 1024, 128);

+  }

+}

+void vp9_transform_sb64y_16x16(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4);

+    if (tx_type != DCT_DCT) {

+      vp9_short_fht16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,

+                         x->coeff + n * 256, 64, tx_type);

+    } else {

+      x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16,

+                      x->coeff + n * 256, 128);

+    }

+  }

+}

+void vp9_transform_sb64y_8x8(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3;

+    const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2);

+    if (tx_type != DCT_DCT) {

+      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,

+                         x->coeff + n * 64, 64, tx_type);

+    } else {

+      x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8,

+                    x->coeff + n * 64, 128);

+    }

+  }

+}

+void vp9_transform_sb64y_4x4(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 256; n++) {

+    const int x_idx = n & 15, y_idx = n >> 4;

+    const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx);

+    if (tx_type != DCT_DCT) {

+      vp9_short_fht8x8(x->src_diff + y_idx * 64 * 4 + x_idx * 4,

+                       x->coeff + n * 16, 64, tx_type);

+    } else {

+      x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4,

+                    x->coeff + n * 16, 128);

+    }

+  }

+}

+void vp9_transform_sb64uv_32x32(MACROBLOCK *x) {

+  vp9_clear_system_state();

+  vp9_short_fdct32x32(x->src_diff + 4096,

+                      x->coeff + 4096, 64);

+  vp9_short_fdct32x32(x->src_diff + 4096 + 1024,

+                      x->coeff + 4096 + 1024, 64);

+}

+void vp9_transform_sb64uv_16x16(MACROBLOCK *x) {

+  int n;

+  vp9_clear_system_state();

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    x->fwd_txm16x16(x->src_diff + 4096 + y_idx * 32 * 16 + x_idx * 16,

+                    x->coeff + 4096 + n * 256, 64);

+    x->fwd_txm16x16(x->src_diff + 4096 + 1024 + y_idx * 32 * 16 + x_idx * 16,

+                    x->coeff + 4096 + 1024 + n * 256, 64);

+  }

+}

+void vp9_transform_sb64uv_8x8(MACROBLOCK *x) {

+  int n;

+  vp9_clear_system_state();

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    x->fwd_txm8x8(x->src_diff + 4096 + y_idx * 32 * 8 + x_idx * 8,

+                  x->coeff + 4096 + n * 64, 64);

+    x->fwd_txm8x8(x->src_diff + 4096 + 1024 + y_idx * 32 * 8 + x_idx * 8,

+                  x->coeff + 4096 + 1024 + n * 64, 64);

+  }

+}

+void vp9_transform_sb64uv_4x4(MACROBLOCK *x) {

+  int n;

+  vp9_clear_system_state();

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3;

+    x->fwd_txm4x4(x->src_diff + 4096 + y_idx * 32 * 4 + x_idx * 4,

+                  x->coeff + 4096 + n * 16, 64);

+    x->fwd_txm4x4(x->src_diff + 4096 + 1024 + y_idx * 32 * 4 + x_idx * 4,

+                  x->coeff + 4096 + 1024 + n * 16, 64);

+  }

+}

 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )

 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )

 typedef struct vp9_token_state vp9_token_state;

@@ -338,13 +510,10 @@

 // TODO: experiments to find optimal multiple numbers

 #define Y1_RD_MULT 4

 #define UV_RD_MULT 2

-#define Y2_RD_MULT 4

 static const int plane_rd_mult[4] = {

   Y1_RD_MULT,

-  Y2_RD_MULT,

   UV_RD_MULT,

-  Y1_RD_MULT

};

 #define UPDATE_RD_COST()\

@@ -357,72 +526,120 @@

}\

-static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,

+// This function is a place holder for now but may ultimately need

+// to scan previous tokens to work out the correct context.

+static int trellis_get_coeff_context(const int *scan,

+                                     const int *nb,

+                                     int idx, int token,

+                                     uint8_t *token_cache,

+                                     int pad, int l) {

+  int bak = token_cache[idx], pt;

+  token_cache[idx] = token;

+  pt = vp9_get_coef_context(scan, nb, pad, token_cache, idx + 1, l);

+  token_cache[idx] = bak;

+  return pt;

+}

+static void optimize_b(VP9_COMMON *const cm,

+                       MACROBLOCK *mb, int ib, PLANE_TYPE type,

+                       const int16_t *dequant_ptr,

                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,

                        int tx_size) {

-  BLOCK *b = &mb->block[i];

-  BLOCKD *d = &mb->e_mbd.block[i];

-  vp9_token_state tokens[257][2];

-  unsigned best_index[257][2];

-  const int16_t *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;

-  int16_t *qcoeff_ptr = d->qcoeff;

-  int16_t *dqcoeff_ptr = d->dqcoeff;

-  int eob = d->eob, final_eob, sz = 0;

-  int i0 = (type == PLANE_TYPE_Y_NO_DC);

-  int rc, x, next;

+  const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME;

+  MACROBLOCKD *const xd = &mb->e_mbd;

+  vp9_token_state tokens[1025][2];

+  unsigned best_index[1025][2];

+  const int16_t *coeff_ptr = mb->coeff + ib * 16;

+  int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;

+  int16_t *dqcoeff_ptr = xd->dqcoeff + ib * 16;

+  int eob = xd->eobs[ib], final_eob, sz = 0;

+  const int i0 = 0;

+  int rc, x, next, i;

   int64_t rdmult, rddiv, rd_cost0, rd_cost1;

   int rate0, rate1, error0, error1, t0, t1;

   int best, band, pt;

   int err_mult = plane_rd_mult[type];

-  int default_eob;

-  int const *scan, *bands;

-#if CONFIG_NEWCOEFCONTEXT

-  const int *neighbors;

+  int default_eob, pad;

+  int const *scan, *nb;

+  const int mul = 1 + (tx_size == TX_32X32);

+  uint8_t token_cache[1024];

+#if CONFIG_CODE_NONZEROCOUNT

+  // TODO(debargha): the dynamic programming approach used in this function

+  // is not compatible with the true rate cost when nzcs are used. Note

+  // the total rate is the sum of the nzc rate and the indicvidual token

+  // rates. The latter part can be optimized in this function, but because

+  // the nzc rate is a function of all the other tokens without a Markov

+  // relationship this rate cannot be considered correctly.

+  // The current implementation uses a suboptimal approach to account for

+  // the nzc rates somewhat, but in reality the optimization approach needs

+  // to change substantially.

+  uint16_t nzc = xd->nzcs[ib];

+  uint16_t nzc0, nzc1;

+  uint16_t final_nzc = 0, final_nzc_exp;

+  int nzc_context = vp9_get_nzc_context(cm, xd, ib);

+  unsigned int *nzc_cost;

+  nzc0 = nzc1 = nzc;

 #endif

   switch (tx_size) {

     default:

-    case TX_4X4:

-      scan = vp9_default_zig_zag1d_4x4;

-      bands = vp9_coef_bands_4x4;

+    case TX_4X4: {

+      const TX_TYPE tx_type = get_tx_type_4x4(xd, ib);

       default_eob = 16;

-      // TODO: this isn't called (for intra4x4 modes), but will be left in

-      // since it could be used later

-      {

-        TX_TYPE tx_type = get_tx_type_4x4(&mb->e_mbd, d);

-        if (tx_type != DCT_DCT) {

-          switch (tx_type) {

-            case ADST_DCT:

-              scan = vp9_row_scan_4x4;

-              break;

-            case DCT_ADST:

-              scan = vp9_col_scan_4x4;

-              break;

-            default:

-              scan = vp9_default_zig_zag1d_4x4;

-              break;

-          }

-        } else {

-          scan = vp9_default_zig_zag1d_4x4;

-        }

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];

+#endif

+      if (tx_type == DCT_ADST) {

+        scan = vp9_col_scan_4x4;

+      } else if (tx_type == ADST_DCT) {

+        scan = vp9_row_scan_4x4;

+      } else {

+        scan = vp9_default_zig_zag1d_4x4;

       break;

-    case TX_8X8:

-      scan = vp9_default_zig_zag1d_8x8;

-      bands = vp9_coef_bands_8x8;

+    }

+    case TX_8X8: {

+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;

+      const TX_TYPE tx_type = get_tx_type_8x8(xd, y + (x >> 1));

+      if (tx_type == DCT_ADST) {

+        scan = vp9_col_scan_8x8;

+      } else if (tx_type == ADST_DCT) {

+        scan = vp9_row_scan_8x8;

+      } else {

+        scan = vp9_default_zig_zag1d_8x8;

+      }

       default_eob = 64;

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];

+#endif

       break;

-    case TX_16X16:

-      scan = vp9_default_zig_zag1d_16x16;

-      bands = vp9_coef_bands_16x16;

+    }

+    case TX_16X16: {

+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;

+      const TX_TYPE tx_type = get_tx_type_16x16(xd, y + (x >> 2));

+      if (tx_type == DCT_ADST) {

+        scan = vp9_col_scan_16x16;

+      } else if (tx_type == ADST_DCT) {

+        scan = vp9_row_scan_16x16;

+      } else {

+        scan = vp9_default_zig_zag1d_16x16;

+      }

       default_eob = 256;

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];

+#endif

       break;

-  }

-#if CONFIG_NEWCOEFCONTEXT

-  neighbors = vp9_get_coef_neighbors_handle(scan);

+    }

+    case TX_32X32:

+      scan = vp9_default_zig_zag1d_32x32;

+      default_eob = 1024;

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];

 #endif

+      break;

+  }

   /* Now set up a Viterbi trellis to evaluate alternative roundings. */

   rdmult = mb->rdmult * err_mult;

@@ -431,7 +648,11 @@

   rddiv = mb->rddiv;

   memset(best_index, 0, sizeof(best_index));

   /* Initialize the sentinel node of the trellis. */

+#if CONFIG_CODE_NONZEROCOUNT

+  tokens[eob][0].rate = nzc_cost[nzc];

+#else

   tokens[eob][0].rate = 0;

+#endif

   tokens[eob][0].error = 0;

   tokens[eob][0].next = default_eob;

   tokens[eob][0].token = DCT_EOB_TOKEN;

@@ -438,8 +659,15 @@

   tokens[eob][0].qc = 0;

   *(tokens[eob] + 1) = *(tokens[eob] + 0);

   next = eob;

+  for (i = 0; i < eob; i++)

+    token_cache[i] = vp9_dct_value_tokens_ptr[qcoeff_ptr[scan[i]]].Token;

+  nb = vp9_get_coef_neighbors_handle(scan, &pad);

   for (i = eob; i-- > i0;) {

     int base_bits, d2, dx;

+#if CONFIG_CODE_NONZEROCOUNT

+    int new_nzc0, new_nzc1;

+#endif

     rc = scan[i];

     x = qcoeff_ptr[rc];

@@ -454,23 +682,19 @@

       t0 = (vp9_dct_value_tokens_ptr + x)->Token;

       /* Consider both possible successor states. */

       if (next < default_eob) {

-        band = bands[i + 1];

-        pt = vp9_prev_token_class[t0];

-#if CONFIG_NEWCOEFCONTEXT

-        if (NEWCOEFCONTEXT_BAND_COND(band))

-          pt = vp9_get_coef_neighbor_context(

-              qcoeff_ptr, i0, neighbors, scan[i + 1]);

-#endif

+        band = get_coef_band(scan, tx_size, i + 1);

+        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,

+                                       pad, default_eob);

         rate0 +=

-          mb->token_costs[tx_size][type][band][pt][tokens[next][0].token];

+          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][0].token];

         rate1 +=

-          mb->token_costs[tx_size][type][band][pt][tokens[next][1].token];

+          mb->token_costs[tx_size][type][ref][band][pt][tokens[next][1].token];

       UPDATE_RD_COST();

       /* And pick the best. */

       best = rd_cost1 < rd_cost0;

       base_bits = *(vp9_dct_value_cost_ptr + x);

-      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];

+      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);

       d2 = dx * dx;

       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);

       tokens[i][0].error = d2 + (best ? error1 : error0);

@@ -478,12 +702,17 @@

       tokens[i][0].token = t0;

       tokens[i][0].qc = x;

       best_index[i][0] = best;

+#if CONFIG_CODE_NONZEROCOUNT

+      new_nzc0 = (best ? nzc1 : nzc0);

+#endif

       /* Evaluate the second possibility for this state. */

       rate0 = tokens[next][0].rate;

       rate1 = tokens[next][1].rate;

-      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc])) &&

-          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) + dequant_ptr[rc != 0]))

+      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&

+          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +

+                                         dequant_ptr[rc != 0]))

         shortcut = 1;

       else

         shortcut = 0;

@@ -502,41 +731,27 @@

              DCT_EOB_TOKEN : ZERO_TOKEN;

         t1 = tokens[next][1].token == DCT_EOB_TOKEN ?

              DCT_EOB_TOKEN : ZERO_TOKEN;

+#if CONFIG_CODE_NONZEROCOUNT

+        // Account for rate drop because of the nzc change.

+        // TODO(debargha): Find a better solution

+        rate0 -= nzc_cost[nzc0] - nzc_cost[nzc0 - 1];

+        rate1 -= nzc_cost[nzc1] - nzc_cost[nzc1 - 1];

+#endif

       } else {

         t0 = t1 = (vp9_dct_value_tokens_ptr + x)->Token;

       if (next < default_eob) {

-        band = bands[i + 1];

+        band = get_coef_band(scan, tx_size, i + 1);

         if (t0 != DCT_EOB_TOKEN) {

-#if CONFIG_NEWCOEFCONTEXT

-          int tmp = qcoeff_ptr[scan[i]];

-          qcoeff_ptr[scan[i]] = x;

-          if (NEWCOEFCONTEXT_BAND_COND(band))

-            pt = vp9_get_coef_neighbor_context(

-                qcoeff_ptr, i0, neighbors, scan[i + 1]);

-          else

-            pt = vp9_prev_token_class[t0];

-          qcoeff_ptr[scan[i]] = tmp;

-#else

-          pt = vp9_prev_token_class[t0];

-#endif

-          rate0 += mb->token_costs[tx_size][type][band][pt][

+          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,

+                                         pad, default_eob);

+          rate0 += mb->token_costs[tx_size][type][ref][band][pt][

               tokens[next][0].token];

         if (t1 != DCT_EOB_TOKEN) {

-#if CONFIG_NEWCOEFCONTEXT

-          int tmp = qcoeff_ptr[scan[i]];

-          qcoeff_ptr[scan[i]] = x;

-          if (NEWCOEFCONTEXT_BAND_COND(band))

-            pt = vp9_get_coef_neighbor_context(

-                qcoeff_ptr, i0, neighbors, scan[i + 1]);

-          else

-            pt = vp9_prev_token_class[t1];

-          qcoeff_ptr[scan[i]] = tmp;

-#else

-          pt = vp9_prev_token_class[t1];

-#endif

-          rate1 += mb->token_costs[tx_size][type][band][pt][

+          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,

+                                         pad, default_eob);

+          rate1 += mb->token_costs[tx_size][type][ref][band][pt][

               tokens[next][1].token];

@@ -556,6 +771,11 @@

       tokens[i][1].token = best ? t1 : t0;

       tokens[i][1].qc = x;

       best_index[i][1] = best;

+#if CONFIG_CODE_NONZEROCOUNT

+      new_nzc1 = (best ? nzc1 : nzc0) - (!x);

+      nzc0 = new_nzc0;

+      nzc1 = new_nzc1;

+#endif

       /* Finally, make this the new head of the trellis. */

       next = i;

@@ -563,16 +783,18 @@

      *  add a new trellis node, but we do need to update the costs.

*/

     else {

-      band = bands[i + 1];

+      band = get_coef_band(scan, tx_size, i + 1);

       t0 = tokens[next][0].token;

       t1 = tokens[next][1].token;

       /* Update the cost of each path if we're past the EOB token. */

       if (t0 != DCT_EOB_TOKEN) {

-        tokens[next][0].rate += mb->token_costs[tx_size][type][band][0][t0];

+        tokens[next][0].rate +=

+            mb->token_costs[tx_size][type][ref][band][0][t0];

         tokens[next][0].token = ZERO_TOKEN;

       if (t1 != DCT_EOB_TOKEN) {

-        tokens[next][1].rate += mb->token_costs[tx_size][type][band][0][t1];

+        tokens[next][1].rate +=

+            mb->token_costs[tx_size][type][ref][band][0][t1];

         tokens[next][1].token = ZERO_TOKEN;

       /* Don't update next, because we didn't add a new node. */

@@ -580,7 +802,7 @@

   /* Now pick the best path through the whole trellis. */

-  band = bands[i + 1];

+  band = get_coef_band(scan, tx_size, i + 1);

   VP9_COMBINEENTROPYCONTEXTS(pt, *a, *l);

   rate0 = tokens[next][0].rate;

   rate1 = tokens[next][1].rate;

@@ -588,18 +810,25 @@

   error1 = tokens[next][1].error;

   t0 = tokens[next][0].token;

   t1 = tokens[next][1].token;

-  rate0 += mb->token_costs[tx_size][type][band][pt][t0];

-  rate1 += mb->token_costs[tx_size][type][band][pt][t1];

+  rate0 += mb->token_costs[tx_size][type][ref][band][pt][t0];

+  rate1 += mb->token_costs[tx_size][type][ref][band][pt][t1];

   UPDATE_RD_COST();

   best = rd_cost1 < rd_cost0;

+#if CONFIG_CODE_NONZEROCOUNT

+  final_nzc_exp = (best ? nzc1 : nzc0);

+#endif

   final_eob = i0 - 1;

   for (i = next; i < eob; i = next) {

     x = tokens[i][best].qc;

-    if (x)

+    if (x) {

       final_eob = i;

+#if CONFIG_CODE_NONZEROCOUNT

+      ++final_nzc;

+#endif

+    }

     rc = scan[i];

     qcoeff_ptr[rc] = x;

-    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]);

+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;

     next = tokens[i][best].next;

     best = best_index[i][best];

@@ -606,81 +835,16 @@

   final_eob++;

-  d->eob = final_eob;

-  *a = *l = (d->eob > !type);

+  xd->eobs[ib] = final_eob;

+  *a = *l = (final_eob > 0);

+#if CONFIG_CODE_NONZEROCOUNT

+  assert(final_nzc == final_nzc_exp);

+  xd->nzcs[ib] = final_nzc;

+#endif

-/**************************************************************************

-our inverse hadamard transform effectively is weighted sum of all 16 inputs

-with weight either 1 or -1. It has a last stage scaling of (sum+1)>>2. And

-dc only idct is (dc+16)>>5. So if all the sums are between -65 and 63 the

-output after inverse wht and idct will be all zero. A sum of absolute value

-smaller than 65 guarantees all 16 different (+1/-1) weighted sums in wht

-fall between -65 and +65.

-**************************************************************************/

-#define SUM_2ND_COEFF_THRESH 65

-static void check_reset_2nd_coeffs(MACROBLOCKD *xd,

-                                   ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {

-  int sum = 0;

-  int i;

-  BLOCKD *bd = &xd->block[24];

-  if (bd->dequant[0] >= SUM_2ND_COEFF_THRESH

-      && bd->dequant[1] >= SUM_2ND_COEFF_THRESH)

-    return;

-  for (i = 0; i < bd->eob; i++) {

-    int coef = bd->dqcoeff[vp9_default_zig_zag1d_4x4[i]];

-    sum += (coef >= 0) ? coef : -coef;

-    if (sum >= SUM_2ND_COEFF_THRESH)

-      return;

-  }

-  if (sum < SUM_2ND_COEFF_THRESH) {

-    for (i = 0; i < bd->eob; i++) {

-      int rc = vp9_default_zig_zag1d_4x4[i];

-      bd->qcoeff[rc] = 0;

-      bd->dqcoeff[rc] = 0;

-    }

-    bd->eob = 0;

-    *a = *l = (bd->eob != 0);

-  }

-}

-#define SUM_2ND_COEFF_THRESH_8X8 32

-static void check_reset_8x8_2nd_coeffs(MACROBLOCKD *xd,

-                                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {

-  int sum = 0;

-  BLOCKD *bd = &xd->block[24];

-  int coef;

-  coef = bd->dqcoeff[0];

-  sum += (coef >= 0) ? coef : -coef;

-  coef = bd->dqcoeff[1];

-  sum += (coef >= 0) ? coef : -coef;

-  coef = bd->dqcoeff[4];

-  sum += (coef >= 0) ? coef : -coef;

-  coef = bd->dqcoeff[8];

-  sum += (coef >= 0) ? coef : -coef;

-  if (sum < SUM_2ND_COEFF_THRESH_8X8) {

-    bd->qcoeff[0] = 0;

-    bd->dqcoeff[0] = 0;

-    bd->qcoeff[1] = 0;

-    bd->dqcoeff[1] = 0;

-    bd->qcoeff[4] = 0;

-    bd->dqcoeff[4] = 0;

-    bd->qcoeff[8] = 0;

-    bd->dqcoeff[8] = 0;

-    bd->eob = 0;

-    *a = *l = (bd->eob != 0);

-  }

-}

-void vp9_optimize_mby_4x4(MACROBLOCK *x) {

+void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

   int b;

-  PLANE_TYPE type;

-  int has_2nd_order;

   ENTROPY_CONTEXT_PLANES t_above, t_left;

   ENTROPY_CONTEXT *ta;

   ENTROPY_CONTEXT *tl;

@@ -694,28 +858,14 @@

   ta = (ENTROPY_CONTEXT *)&t_above;

   tl = (ENTROPY_CONTEXT *)&t_left;

-  has_2nd_order = get_2nd_order_usage(&x->e_mbd);

-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;

   for (b = 0; b < 16; b++) {

-    optimize_b(x, b, type,

+    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,

                ta + vp9_block2above[TX_4X4][b],

                tl + vp9_block2left[TX_4X4][b], TX_4X4);

-  if (has_2nd_order) {

-    b = 24;

-    optimize_b(x, b, PLANE_TYPE_Y2,

-               ta + vp9_block2above[TX_4X4][b],

-               tl + vp9_block2left[TX_4X4][b], TX_4X4);

-    check_reset_2nd_coeffs(&x->e_mbd,

-                           ta + vp9_block2above[TX_4X4][b],

-                           tl + vp9_block2left[TX_4X4][b]);

-  }

-void vp9_optimize_mbuv_4x4(MACROBLOCK *x) {

+void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

   int b;

   ENTROPY_CONTEXT_PLANES t_above, t_left;

   ENTROPY_CONTEXT *ta;

@@ -731,24 +881,22 @@

   tl = (ENTROPY_CONTEXT *)&t_left;

   for (b = 16; b < 24; b++) {

-    optimize_b(x, b, PLANE_TYPE_UV,

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,

                ta + vp9_block2above[TX_4X4][b],

                tl + vp9_block2left[TX_4X4][b], TX_4X4);

-static void optimize_mb_4x4(MACROBLOCK *x) {

-  vp9_optimize_mby_4x4(x);

-  vp9_optimize_mbuv_4x4(x);

+static void optimize_mb_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

+  vp9_optimize_mby_4x4(cm, x);

+  vp9_optimize_mbuv_4x4(cm, x);

-void vp9_optimize_mby_8x8(MACROBLOCK *x) {

+void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

   int b;

-  PLANE_TYPE type;

   ENTROPY_CONTEXT_PLANES t_above, t_left;

   ENTROPY_CONTEXT *ta;

   ENTROPY_CONTEXT *tl;

-  int has_2nd_order = get_2nd_order_usage(&x->e_mbd);

   if (!x->e_mbd.above_context || !x->e_mbd.left_context)

     return;

@@ -758,31 +906,19 @@

   ta = (ENTROPY_CONTEXT *)&t_above;

   tl = (ENTROPY_CONTEXT *)&t_left;

-  type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;

   for (b = 0; b < 16; b += 4) {

     ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];

     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];

-#if CONFIG_CNVCONTEXT

     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;

     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;

-#else

-    ENTROPY_CONTEXT above_ec = a[0];

-    ENTROPY_CONTEXT left_ec = l[0];

-#endif

-    optimize_b(x, b, type, &above_ec, &left_ec, TX_8X8);

+    optimize_b(cm, x, b, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[b].dequant,

+               &above_ec, &left_ec, TX_8X8);

     a[1] = a[0] = above_ec;

     l[1] = l[0] = left_ec;

-  // 8x8 always have 2nd order block

-  if (has_2nd_order) {

-    check_reset_8x8_2nd_coeffs(&x->e_mbd,

-                               ta + vp9_block2above[TX_8X8][24],

-                               tl + vp9_block2left[TX_8X8][24]);

-  }

-void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {

+void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

   int b;

   ENTROPY_CONTEXT *const ta = (ENTROPY_CONTEXT *)x->e_mbd.above_context;

   ENTROPY_CONTEXT *const tl = (ENTROPY_CONTEXT *)x->e_mbd.left_context;

@@ -793,23 +929,19 @@

   for (b = 16; b < 24; b += 4) {

     ENTROPY_CONTEXT *const a = ta + vp9_block2above[TX_8X8][b];

     ENTROPY_CONTEXT *const l = tl + vp9_block2left[TX_8X8][b];

-#if CONFIG_CNVCONTEXT

     ENTROPY_CONTEXT above_ec = (a[0] + a[1]) != 0;

     ENTROPY_CONTEXT left_ec = (l[0] + l[1]) != 0;

-#else

-    ENTROPY_CONTEXT above_ec = a[0];

-    ENTROPY_CONTEXT left_ec = l[0];

-#endif

-    optimize_b(x, b, PLANE_TYPE_UV, &above_ec, &left_ec, TX_8X8);

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[b].dequant,

+               &above_ec, &left_ec, TX_8X8);

-static void optimize_mb_8x8(MACROBLOCK *x) {

-  vp9_optimize_mby_8x8(x);

-  vp9_optimize_mbuv_8x8(x);

+static void optimize_mb_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

+  vp9_optimize_mby_8x8(cm, x);

+  vp9_optimize_mbuv_8x8(cm, x);

-void vp9_optimize_mby_16x16(MACROBLOCK *x) {

+void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

   ENTROPY_CONTEXT_PLANES *const t_above = x->e_mbd.above_context;

   ENTROPY_CONTEXT_PLANES *const t_left = x->e_mbd.left_context;

   ENTROPY_CONTEXT ta, tl;

@@ -817,22 +949,345 @@

   if (!t_above || !t_left)

     return;

-#if CONFIG_CNVCONTEXT

   ta = (t_above->y1[0] + t_above->y1[1] + t_above->y1[2] + t_above->y1[3]) != 0;

   tl = (t_left->y1[0] + t_left->y1[1] + t_left->y1[2] + t_left->y1[3]) != 0;

-#else

-  ta = t_above->y1[0];

-  tl = t_left->y1[0];

-#endif

-  optimize_b(x, 0, PLANE_TYPE_Y_WITH_DC, &ta, &tl, TX_16X16);

+  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+             &ta, &tl, TX_16X16);

-static void optimize_mb_16x16(MACROBLOCK *x) {

-  vp9_optimize_mby_16x16(x);

-  vp9_optimize_mbuv_8x8(x);

+static void optimize_mb_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

+  vp9_optimize_mby_16x16(cm, x);

+  vp9_optimize_mbuv_8x8(cm, x);

-void vp9_fidct_mb(MACROBLOCK *x) {

+void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);

+  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);

+  ENTROPY_CONTEXT ta, tl;

+  ta = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;

+  tl = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;

+  optimize_b(cm, x, 0, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+             &ta, &tl, TX_32X32);

+}

+void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);

+  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);

+  ENTROPY_CONTEXT ta[2], tl[2];

+  int n;

+  ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0;

+  ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0;

+  tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0;

+  tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+               ta + x_idx, tl + y_idx, TX_16X16);

+  }

+}

+void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);

+  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);

+  ENTROPY_CONTEXT ta[4], tl[4];

+  int n;

+  ta[0] = (a[0] + a[1]) != 0;

+  ta[1] = (a[2] + a[3]) != 0;

+  ta[2] = (a1[0] + a1[1]) != 0;

+  ta[3] = (a1[2] + a1[3]) != 0;

+  tl[0] = (l[0] + l[1]) != 0;

+  tl[1] = (l[2] + l[3]) != 0;

+  tl[2] = (l1[0] + l1[1]) != 0;

+  tl[3] = (l1[2] + l1[3]) != 0;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+               ta + x_idx, tl + y_idx, TX_8X8);

+  }

+}

+void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT ta[8], tl[8];

+  int n;

+  vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * sizeof(ENTROPY_CONTEXT));

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3;

+    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+               ta + x_idx, tl + y_idx, TX_4X4);

+  }

+}

+void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;

+  int b;

+  for (b = 64; b < 96; b += 16) {

+    const int cidx = b >= 80 ? 20 : 16;

+    a = ta + vp9_block2above_sb[TX_16X16][b];

+    l = tl + vp9_block2left_sb[TX_16X16][b];

+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;

+    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+               &above_ec, &left_ec, TX_16X16);

+  }

+}

+void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;

+  ENTROPY_CONTEXT *a, *l, above_ec, left_ec;

+  int b;

+  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));

+  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));

+  for (b = 64; b < 96; b += 4) {

+    const int cidx = b >= 80 ? 20 : 16;

+    a = ta + vp9_block2above_sb[TX_8X8][b];

+    l = tl + vp9_block2left_sb[TX_8X8][b];

+    above_ec = (a[0] + a[1]) != 0;

+    left_ec = (l[0] + l[1]) != 0;

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+               &above_ec, &left_ec, TX_8X8);

+    a[0] = a[1] = above_ec;

+    l[0] = l[1] = left_ec;

+  }

+}

+void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;

+  ENTROPY_CONTEXT *a, *l;

+  int b;

+  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));

+  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));

+  for (b = 64; b < 96; b++) {

+    const int cidx = b >= 80 ? 20 : 16;

+    a = ta + vp9_block2above_sb[TX_4X4][b];

+    l = tl + vp9_block2left_sb[TX_4X4][b];

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+               a, l, TX_4X4);

+  }

+}

+void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);

+  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);

+  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);

+  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);

+  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);

+  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);

+  ENTROPY_CONTEXT ta[2], tl[2];

+  int n;

+  ta[0] = (a[0] + a[1] + a[2] + a[3] + a1[0] + a1[1] + a1[2] + a1[3]) != 0;

+  ta[1] = (a2[0] + a2[1] + a2[2] + a2[3] + a3[0] + a3[1] + a3[2] + a3[3]) != 0;

+  tl[0] = (l[0] + l[1] + l[2] + l[3] + l1[0] + l1[1] + l1[2] + l1[3]) != 0;

+  tl[1] = (l2[0] + l2[1] + l2[2] + l2[3] + l3[0] + l3[1] + l3[2] + l3[3]) != 0;

+  for (n = 0; n < 4; n++) {

+    const int x_idx = n & 1, y_idx = n >> 1;

+    optimize_b(cm, x, n * 64, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+               ta + x_idx, tl + y_idx, TX_32X32);

+  }

+}

+void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);

+  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);

+  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);

+  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);

+  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);

+  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);

+  ENTROPY_CONTEXT ta[4], tl[4];

+  int n;

+  ta[0] = (a[0] + a[1] + a[2] + a[3]) != 0;

+  ta[1] = (a1[0] + a1[1] + a1[2] + a1[3]) != 0;

+  ta[2] = (a2[0] + a2[1] + a2[2] + a2[3]) != 0;

+  ta[3] = (a3[0] + a3[1] + a3[2] + a3[3]) != 0;

+  tl[0] = (l[0] + l[1] + l[2] + l[3]) != 0;

+  tl[1] = (l1[0] + l1[1] + l1[2] + l1[3]) != 0;

+  tl[2] = (l2[0] + l2[1] + l2[2] + l2[3]) != 0;

+  tl[3] = (l3[0] + l3[1] + l3[2] + l3[3]) != 0;

+  for (n = 0; n < 16; n++) {

+    const int x_idx = n & 3, y_idx = n >> 2;

+    optimize_b(cm, x, n * 16, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+               ta + x_idx, tl + y_idx, TX_16X16);

+  }

+}

+void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *a = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *a1 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 1);

+  ENTROPY_CONTEXT *a2 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 2);

+  ENTROPY_CONTEXT *a3 = (ENTROPY_CONTEXT *) (x->e_mbd.above_context + 3);

+  ENTROPY_CONTEXT *l = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *l1 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 1);

+  ENTROPY_CONTEXT *l2 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 2);

+  ENTROPY_CONTEXT *l3 = (ENTROPY_CONTEXT *) (x->e_mbd.left_context + 3);

+  ENTROPY_CONTEXT ta[8], tl[8];

+  int n;

+  ta[0] = (a[0] + a[1]) != 0;

+  ta[1] = (a[2] + a[3]) != 0;

+  ta[2] = (a1[0] + a1[1]) != 0;

+  ta[3] = (a1[2] + a1[3]) != 0;

+  ta[4] = (a2[0] + a2[1]) != 0;

+  ta[5] = (a2[2] + a2[3]) != 0;

+  ta[6] = (a3[0] + a3[1]) != 0;

+  ta[7] = (a3[2] + a3[3]) != 0;

+  tl[0] = (l[0] + l[1]) != 0;

+  tl[1] = (l[2] + l[3]) != 0;

+  tl[2] = (l1[0] + l1[1]) != 0;

+  tl[3] = (l1[2] + l1[3]) != 0;

+  tl[4] = (l2[0] + l2[1]) != 0;

+  tl[5] = (l2[2] + l2[3]) != 0;

+  tl[6] = (l3[0] + l3[1]) != 0;

+  tl[7] = (l3[2] + l3[3]) != 0;

+  for (n = 0; n < 64; n++) {

+    const int x_idx = n & 7, y_idx = n >> 3;

+    optimize_b(cm, x, n * 4, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+               ta + x_idx, tl + y_idx, TX_8X8);

+  }

+}

+void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT ta[16], tl[16];

+  int n;

+  vpx_memcpy(ta, x->e_mbd.above_context, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(ta + 4, x->e_mbd.above_context + 1, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(ta + 8, x->e_mbd.above_context + 2, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(ta + 12, x->e_mbd.above_context + 3, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(tl, x->e_mbd.left_context, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(tl + 4, x->e_mbd.left_context + 1, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(tl + 8, x->e_mbd.left_context + 2, 4 * sizeof(ENTROPY_CONTEXT));

+  vpx_memcpy(tl + 12, x->e_mbd.left_context + 3, 4 * sizeof(ENTROPY_CONTEXT));

+  for (n = 0; n < 256; n++) {

+    const int x_idx = n & 15, y_idx = n >> 4;

+    optimize_b(cm, x, n, PLANE_TYPE_Y_WITH_DC, x->e_mbd.block[0].dequant,

+               ta + x_idx, tl + y_idx, TX_4X4);

+  }

+}

+void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) x->e_mbd.above_context;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) x->e_mbd.left_context;

+  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;

+  int b;

+  for (b = 256; b < 384; b += 64) {

+    const int cidx = b >= 320 ? 20 : 16;

+    a = ta + vp9_block2above_sb64[TX_32X32][b];

+    l = tl + vp9_block2left_sb64[TX_32X32][b];

+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a2 = a + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l2 = l + 2 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a3 = a + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l3 = l + 3 * sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a_ec = (a[0] + a[1] + a1[0] + a1[1] + a2[0] + a2[1] + a3[0] + a3[1]) != 0;

+    l_ec = (l[0] + l[1] + l1[0] + l1[1] + l2[0] + l2[1] + l3[0] + l3[1]) != 0;

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+               &a_ec, &l_ec, TX_32X32);

+  }

+}

+void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;

+  ENTROPY_CONTEXT *a, *l, *a1, *l1, above_ec, left_ec;

+  int b;

+  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));

+  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));

+  for (b = 256; b < 384; b += 16) {

+    const int cidx = b >= 320 ? 20 : 16;

+    a = ta + vp9_block2above_sb64[TX_16X16][b];

+    l = tl + vp9_block2left_sb64[TX_16X16][b];

+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    above_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;

+    left_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+               &above_ec, &left_ec, TX_16X16);

+    a[0] = a[1] = a1[0] = a1[1] = above_ec;

+    l[0] = l[1] = l1[0] = l1[1] = left_ec;

+  }

+}

+void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;

+  ENTROPY_CONTEXT *a, *l, above_ec, left_ec;

+  int b;

+  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));

+  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));

+  for (b = 256; b < 384; b += 4) {

+    const int cidx = b >= 320 ? 20 : 16;

+    a = ta + vp9_block2above_sb64[TX_8X8][b];

+    l = tl + vp9_block2left_sb64[TX_8X8][b];

+    above_ec = (a[0] + a[1]) != 0;

+    left_ec = (l[0] + l[1]) != 0;

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+               &above_ec, &left_ec, TX_8X8);

+    a[0] = a[1] = above_ec;

+    l[0] = l[1] = left_ec;

+  }

+}

+void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) t_left;

+  ENTROPY_CONTEXT *a, *l;

+  int b;

+  vpx_memcpy(t_above, x->e_mbd.above_context, sizeof(t_above));

+  vpx_memcpy(t_left, x->e_mbd.left_context, sizeof(t_left));

+  for (b = 256; b < 384; b++) {

+    const int cidx = b >= 320 ? 20 : 16;

+    a = ta + vp9_block2above_sb64[TX_4X4][b];

+    l = tl + vp9_block2left_sb64[TX_4X4][b];

+    optimize_b(cm, x, b, PLANE_TYPE_UV, x->e_mbd.block[cidx].dequant,

+               a, l, TX_4X4);

+  }

+}

+void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x) {

   MACROBLOCKD *const xd = &x->e_mbd;

   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;

@@ -840,7 +1295,7 @@

     vp9_transform_mb_16x16(x);

     vp9_quantize_mb_16x16(x);

     if (x->optimize)

-      optimize_mb_16x16(x);

+      optimize_mb_16x16(cm, x);

     vp9_inverse_transform_mb_16x16(xd);

   } else if (tx_size == TX_8X8) {

     if (xd->mode_info_context->mbmi.mode == SPLITMV) {

@@ -850,8 +1305,8 @@

       vp9_quantize_mby_8x8(x);

       vp9_quantize_mbuv_4x4(x);

       if (x->optimize) {

-        vp9_optimize_mby_8x8(x);

-        vp9_optimize_mbuv_4x4(x);

+        vp9_optimize_mby_8x8(cm, x);

+        vp9_optimize_mbuv_4x4(cm, x);

       vp9_inverse_transform_mby_8x8(xd);

       vp9_inverse_transform_mbuv_4x4(xd);

@@ -859,7 +1314,7 @@

       vp9_transform_mb_8x8(x);

       vp9_quantize_mb_8x8(x);

       if (x->optimize)

-        optimize_mb_8x8(x);

+        optimize_mb_8x8(cm, x);

       vp9_inverse_transform_mb_8x8(xd);

   } else {

@@ -866,26 +1321,27 @@

     transform_mb_4x4(x);

     vp9_quantize_mb_4x4(x);

     if (x->optimize)

-      optimize_mb_4x4(x);

+      optimize_mb_4x4(cm, x);

     vp9_inverse_transform_mb_4x4(xd);

-void vp9_encode_inter16x16(MACROBLOCK *x) {

+void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,

+                           int mb_row, int mb_col) {

   MACROBLOCKD *const xd = &x->e_mbd;

-  vp9_build_inter_predictors_mb(xd);

+  vp9_build_inter_predictors_mb(xd, mb_row, mb_col);

   subtract_mb(x);

-  vp9_fidct_mb(x);

+  vp9_fidct_mb(cm, x);

   vp9_recon_mb(xd);

 /* this function is used by first pass only */

-void vp9_encode_inter16x16y(MACROBLOCK *x) {

+void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col) {

   MACROBLOCKD *xd = &x->e_mbd;

   BLOCK *b = &x->block[0];

-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);

+  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);

   vp9_subtract_mby(x->src_diff, *(b->base_src), xd->predictor, b->src_stride);

--- a/vp9/encoder/vp9_encodemb.h

+++ b/vp9/encoder/vp9_encodemb.h

@@ -13,6 +13,8 @@

 #include "./vpx_config.h"

 #include "vp9/encoder/vp9_block.h"

+#include "vp9/encoder/vp9_onyx_int.h"

+#include "vp9/common/vp9_onyxc_int.h"

 typedef struct {

   MB_PREDICTION_MODE mode;

@@ -21,33 +23,61 @@

 } MODE_DEFINITION;

-#include "vp9/encoder/vp9_onyx_int.h"

 struct VP9_ENCODER_RTCD;

-void vp9_encode_inter16x16(MACROBLOCK *x);

+void vp9_encode_inter16x16(VP9_COMMON *const cm, MACROBLOCK *x,

+                           int mb_row, int mb_col);

 void vp9_transform_mbuv_4x4(MACROBLOCK *x);

 void vp9_transform_mby_4x4(MACROBLOCK *x);

-void vp9_optimize_mby_4x4(MACROBLOCK *x);

-void vp9_optimize_mbuv_4x4(MACROBLOCK *x);

-void vp9_encode_inter16x16y(MACROBLOCK *x);

+void vp9_optimize_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_optimize_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_encode_inter16x16y(MACROBLOCK *x, int mb_row, int mb_col);

 void vp9_transform_mb_8x8(MACROBLOCK *mb);

 void vp9_transform_mby_8x8(MACROBLOCK *x);

 void vp9_transform_mbuv_8x8(MACROBLOCK *x);

-void vp9_build_dcblock_8x8(MACROBLOCK *b);

-void vp9_optimize_mby_8x8(MACROBLOCK *x);

-void vp9_optimize_mbuv_8x8(MACROBLOCK *x);

+void vp9_optimize_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_optimize_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);

 void vp9_transform_mb_16x16(MACROBLOCK *mb);

 void vp9_transform_mby_16x16(MACROBLOCK *x);

-void vp9_optimize_mby_16x16(MACROBLOCK *x);

+void vp9_optimize_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);

 void vp9_transform_sby_32x32(MACROBLOCK *x);

+void vp9_optimize_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sby_16x16(MACROBLOCK *x);

+void vp9_optimize_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sby_8x8(MACROBLOCK *x);

+void vp9_optimize_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sby_4x4(MACROBLOCK *x);

+void vp9_optimize_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x);

 void vp9_transform_sbuv_16x16(MACROBLOCK *x);

+void vp9_optimize_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sbuv_8x8(MACROBLOCK *x);

+void vp9_optimize_sbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sbuv_4x4(MACROBLOCK *x);

+void vp9_optimize_sbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);

-void vp9_fidct_mb(MACROBLOCK *x);

+void vp9_transform_sb64y_32x32(MACROBLOCK *x);

+void vp9_optimize_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sb64y_16x16(MACROBLOCK *x);

+void vp9_optimize_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sb64y_8x8(MACROBLOCK *x);

+void vp9_optimize_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sb64y_4x4(MACROBLOCK *x);

+void vp9_optimize_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sb64uv_32x32(MACROBLOCK *x);

+void vp9_optimize_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sb64uv_16x16(MACROBLOCK *x);

+void vp9_optimize_sb64uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sb64uv_8x8(MACROBLOCK *x);

+void vp9_optimize_sb64uv_8x8(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_transform_sb64uv_4x4(MACROBLOCK *x);

+void vp9_optimize_sb64uv_4x4(VP9_COMMON *const cm, MACROBLOCK *x);

+void vp9_fidct_mb(VP9_COMMON *const cm, MACROBLOCK *x);

 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);

 void vp9_subtract_mbuv_s_c(int16_t *diff, const uint8_t *usrc,

@@ -63,5 +93,11 @@

                            const uint8_t *vsrc, int src_stride,

                            const uint8_t *upred,

                            const uint8_t *vpred, int dst_stride);

+void vp9_subtract_sb64y_s_c(int16_t *diff, const uint8_t *src, int src_stride,

+                            const uint8_t *pred, int dst_stride);

+void vp9_subtract_sb64uv_s_c(int16_t *diff, const uint8_t *usrc,

+                             const uint8_t *vsrc, int src_stride,

+                             const uint8_t *upred,

+                             const uint8_t *vpred, int dst_stride);

 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_

--- a/vp9/encoder/vp9_firstpass.c

+++ b/vp9/encoder/vp9_firstpass.c

@@ -378,6 +378,19 @@

   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];

   int new_mv_mode_penalty = 256;

+  int sr = 0;

+  int quart_frm = MIN(cpi->common.width, cpi->common.height);

+  // refine the motion search range accroding to the frame dimension

+  // for first pass test

+  while ((quart_frm << sr) < MAX_FULL_PEL_VAL)

+    sr++;

+  if (sr)

+    sr--;

+  step_param    += sr;

+  further_steps -= sr;

   // override the default variance function to use MSE

   v_fn_ptr.vf = vp9_mse16x16;

@@ -435,9 +448,11 @@

   MACROBLOCKD *const xd = &x->e_mbd;

   int recon_yoffset, recon_uvoffset;

-  YV12_BUFFER_CONFIG *lst_yv12 = &cm->yv12_fb[cm->lst_fb_idx];

+  YV12_BUFFER_CONFIG *lst_yv12 =

+      &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]];

   YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];

-  YV12_BUFFER_CONFIG *gld_yv12 = &cm->yv12_fb[cm->gld_fb_idx];

+  YV12_BUFFER_CONFIG *gld_yv12 =

+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];

   int recon_y_stride = lst_yv12->y_stride;

   int recon_uv_stride = lst_yv12->uv_stride;

   int64_t intra_error = 0;

@@ -611,7 +626,7 @@

           this_error = motion_error;

           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);

           xd->mode_info_context->mbmi.txfm_size = TX_4X4;

-          vp9_encode_inter16x16y(x);

+          vp9_encode_inter16x16y(x, mb_row, mb_col);

           sum_mvr += mv.as_mv.row;

           sum_mvr_abs += abs(mv.as_mv.row);

           sum_mvc += mv.as_mv.col;

@@ -843,16 +858,15 @@

   power_term = (vp9_convert_qindex_to_q(Q) * 0.01) + pt_low;

   power_term = (power_term > pt_high) ? pt_high : power_term;

-  // Adjustments to error term

-  // TBD

   // Calculate correction factor

+  if (power_term < 1.0)

+    assert(error_term >= 0.0);

   correction_factor = pow(error_term, power_term);

   // Clip range

   correction_factor =

     (correction_factor < 0.05)

-    ? 0.05 : (correction_factor > 2.0) ? 2.0 : correction_factor;

+    ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;

   return correction_factor;

@@ -886,8 +900,7 @@

 static int estimate_max_q(VP9_COMP *cpi,

                           FIRSTPASS_STATS *fpstats,

-                          int section_target_bandwitdh,

-                          int overhead_bits) {

+                          int section_target_bandwitdh) {

   int Q;

   int num_mbs = cpi->common.MBs;

   int target_norm_bits_per_mb;

@@ -898,7 +911,6 @@

   double err_per_mb = section_err / num_mbs;

   double err_correction_factor;

   double speed_correction = 1.0;

-  double overhead_bits_per_mb;

   if (section_target_bandwitdh <= 0)

     return cpi->twopass.maxq_max_limit;          // Highest value allowed

@@ -910,15 +922,19 @@

   // Look at the drop in prediction quality between the last frame

   // and the GF buffer (which contained an older frame).

-  sr_err_diff =

-    (fpstats->sr_coded_error - fpstats->coded_error) /

-    (fpstats->count * cpi->common.MBs);

-  sr_correction = (sr_err_diff / 32.0);

-  sr_correction = pow(sr_correction, 0.25);

-  if (sr_correction < 0.75)

+  if (fpstats->sr_coded_error > fpstats->coded_error) {

+    sr_err_diff =

+      (fpstats->sr_coded_error - fpstats->coded_error) /

+      (fpstats->count * cpi->common.MBs);

+    sr_correction = (sr_err_diff / 32.0);

+    sr_correction = pow(sr_correction, 0.25);

+    if (sr_correction < 0.75)

+      sr_correction = 0.75;

+    else if (sr_correction > 1.25)

+      sr_correction = 1.25;

+  } else {

     sr_correction = 0.75;

-  else if (sr_correction > 1.25)

-    sr_correction = 1.25;

+  }

   // Calculate a corrective factor based on a rolling ratio of bits spent

   // vs target bits

@@ -950,13 +966,6 @@

       speed_correction = 1.25;

-  // Estimate of overhead bits per mb

-  // Correction to overhead bits for min allowed Q.

-  // PGW TODO.. This code is broken for the extended Q range

-  //            for now overhead set to 0.

-  overhead_bits_per_mb = overhead_bits / num_mbs;

-  overhead_bits_per_mb *= pow(0.98, (double)cpi->twopass.maxq_min_limit);

   // Try and pick a max Q that will be high enough to encode the

   // content at the given rate.

   for (Q = cpi->twopass.maxq_min_limit; Q < cpi->twopass.maxq_max_limit; Q++) {

@@ -967,24 +976,10 @@

       sr_correction * speed_correction *

       cpi->twopass.est_max_qcorrection_factor;

-    if (err_correction_factor < 0.05)

-      err_correction_factor = 0.05;

-    else if (err_correction_factor > 5.0)

-      err_correction_factor = 5.0;

     bits_per_mb_at_this_q =

-      vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb;

+      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);

-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *

-                                  (double)bits_per_mb_at_this_q);

-    // Mode and motion overhead

-    // As Q rises in real encode loop rd code will force overhead down

-    // We make a crude adjustment for this here as *.98 per Q step.

-    // PGW TODO.. This code is broken for the extended Q range

-    //            for now overhead set to 0.

-    // overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);

     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)

       break;

@@ -1001,7 +996,7 @@

   // PGW TODO.. This code is broken for the extended Q range

   if ((cpi->ni_frames >

        ((int)cpi->twopass.total_stats->count >> 8)) &&

-      (cpi->ni_frames > 150)) {

+      (cpi->ni_frames > 25)) {

     adjust_maxq_qrange(cpi);

@@ -1012,8 +1007,7 @@

 // complexity and data rate.

 static int estimate_cq(VP9_COMP *cpi,

                        FIRSTPASS_STATS *fpstats,

-                       int section_target_bandwitdh,

-                       int overhead_bits) {

+                       int section_target_bandwitdh) {

   int Q;

   int num_mbs = cpi->common.MBs;

   int target_norm_bits_per_mb;

@@ -1026,15 +1020,11 @@

   double speed_correction = 1.0;

   double clip_iiratio;

   double clip_iifactor;

-  double overhead_bits_per_mb;

   target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))

                             ? (512 * section_target_bandwitdh) / num_mbs

                             : 512 * (section_target_bandwitdh / num_mbs);

-  // Estimate of overhead bits per mb

-  overhead_bits_per_mb = overhead_bits / num_mbs;

   // Corrections for higher compression speed settings

   // (reduced compression expected)

@@ -1047,15 +1037,19 @@

   // Look at the drop in prediction quality between the last frame

   // and the GF buffer (which contained an older frame).

-  sr_err_diff =

-    (fpstats->sr_coded_error - fpstats->coded_error) /

-    (fpstats->count * cpi->common.MBs);

-  sr_correction = (sr_err_diff / 32.0);

-  sr_correction = pow(sr_correction, 0.25);

-  if (sr_correction < 0.75)

+  if (fpstats->sr_coded_error > fpstats->coded_error) {

+    sr_err_diff =

+      (fpstats->sr_coded_error - fpstats->coded_error) /

+      (fpstats->count * cpi->common.MBs);

+    sr_correction = (sr_err_diff / 32.0);

+    sr_correction = pow(sr_correction, 0.25);

+    if (sr_correction < 0.75)

+      sr_correction = 0.75;

+    else if (sr_correction > 1.25)

+      sr_correction = 1.25;

+  } else {

     sr_correction = 0.75;

-  else if (sr_correction > 1.25)

-    sr_correction = 1.25;

+  }

   // II ratio correction factor for clip as a whole

   clip_iiratio = cpi->twopass.total_stats->intra_error /

@@ -1073,24 +1067,9 @@

       calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, Q) *

       sr_correction * speed_correction * clip_iifactor;

-    if (err_correction_factor < 0.05)

-      err_correction_factor = 0.05;

-    else if (err_correction_factor > 5.0)

-      err_correction_factor = 5.0;

     bits_per_mb_at_this_q =

-      vp9_bits_per_mb(INTER_FRAME, Q) + (int)overhead_bits_per_mb;

+      vp9_bits_per_mb(INTER_FRAME, Q, err_correction_factor);

-    bits_per_mb_at_this_q = (int)(.5 + err_correction_factor *

-                                  (double)bits_per_mb_at_this_q);

-    // Mode and motion overhead

-    // As Q rises in real encode loop rd code will force overhead down

-    // We make a crude adjustment for this here as *.98 per Q step.

-    // PGW TODO.. This code is broken for the extended Q range

-    //            for now overhead set to 0.

-    overhead_bits_per_mb = (int)((double)overhead_bits_per_mb * 0.98);

     if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)

       break;

@@ -1209,12 +1188,16 @@

   mb_sr_err_diff =

     (next_frame->sr_coded_error - next_frame->coded_error) /

     (cpi->common.MBs);

-  second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);

-  second_ref_decay = pow(second_ref_decay, 0.5);

-  if (second_ref_decay < 0.85)

+  if (mb_sr_err_diff <= 512.0) {

+    second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0);

+    second_ref_decay = pow(second_ref_decay, 0.5);

+    if (second_ref_decay < 0.85)

+      second_ref_decay = 0.85;

+    else if (second_ref_decay > 1.0)

+      second_ref_decay = 1.0;

+  } else {

     second_ref_decay = 0.85;

-  else if (second_ref_decay > 1.0)

-    second_ref_decay = 1.0;

+  }

   if (second_ref_decay < prediction_decay_rate)

     prediction_decay_rate = second_ref_decay;

@@ -1459,11 +1442,14 @@

   return arf_boost;

-static void configure_arnr_filter(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {

+static void configure_arnr_filter(VP9_COMP *cpi,

+                                  FIRSTPASS_STATS *this_frame,

+                                  int group_boost) {

   int half_gf_int;

   int frames_after_arf;

   int frames_bwd = cpi->oxcf.arnr_max_frames - 1;

   int frames_fwd = cpi->oxcf.arnr_max_frames - 1;

+  int q;

   // Define the arnr filter width for this group of frames:

   // We only filter frames that lie within a distance of half

@@ -1508,6 +1494,25 @@

   cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;

+  // Adjust the strength based on active max q

+  q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1);

+  if (q > 8) {

+    cpi->active_arnr_strength = cpi->oxcf.arnr_strength;

+  } else {

+    cpi->active_arnr_strength = cpi->oxcf.arnr_strength - (8 - q);

+    if (cpi->active_arnr_strength < 0)

+      cpi->active_arnr_strength = 0;

+  }

+  // Adjust number of frames in filter and strength based on gf boost level.

+  if (cpi->active_arnr_frames > (group_boost / 150)) {

+    cpi->active_arnr_frames = (group_boost / 150);

+    cpi->active_arnr_frames += !(cpi->active_arnr_frames & 1);

+  }

+  if (cpi->active_arnr_strength > (group_boost / 300)) {

+    cpi->active_arnr_strength = (group_boost / 300);

+  }

 // Analyse and define a gf/arf group .

@@ -1531,7 +1536,7 @@

   double this_frame_mv_in_out = 0.0;

   double mv_in_out_accumulator = 0.0;

   double abs_mv_in_out_accumulator = 0.0;

+  double mv_ratio_accumulator_thresh;

   int max_bits = frame_max_bits(cpi);     // Max for a single frame

   unsigned int allow_alt_ref =

@@ -1540,6 +1545,7 @@

   int f_boost = 0;

   int b_boost = 0;

   int flash_detected;

+  int active_max_gf_interval;

   cpi->twopass.gf_group_bits = 0;

@@ -1562,11 +1568,22 @@

   if (cpi->common.frame_type == KEY_FRAME)

     gf_group_err -= gf_first_frame_err;

-  // Scan forward to try and work out how many frames the next gf group

-  // should contain and what level of boost is appropriate for the GF

-  // or ARF that will be coded with the group

-  i = 0;

+  // Motion breakout threshold for loop below depends on image size.

+  mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 10.0;

+  // Work out a maximum interval for the GF.

+  // If the image appears completely static we can extend beyond this.

+  // The value chosen depends on the active Q range. At low Q we have

+  // bits to spare and are better with a smaller interval and smaller boost.

+  // At high Q when there are few bits to spare we are better with a longer

+  // interval to spread the cost of the GF.

+  active_max_gf_interval =

+    12 + ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 5);

+  if (active_max_gf_interval > cpi->max_gf_interval)

+    active_max_gf_interval = cpi->max_gf_interval;

+  i = 0;

   while (((i < cpi->twopass.static_scene_max_gf_interval) ||

           ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL)) &&

          (i < cpi->twopass.frames_to_key)) {

@@ -1618,7 +1635,7 @@

     // Break out conditions.

     if (

       // Break at cpi->max_gf_interval unless almost totally static

-      (i >= cpi->max_gf_interval && (zero_motion_accumulator < 0.995)) ||

+      (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||

         // Dont break out with a very short interval

         (i > MIN_GF_INTERVAL) &&

@@ -1626,7 +1643,7 @@

         ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) &&

         ((boost_score > 125.0) || (next_frame.pcnt_inter < 0.75)) &&

         (!flash_detected) &&

-        ((mv_ratio_accumulator > 100.0) ||

+        ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||

          (abs_mv_in_out_accumulator > 3.0) ||

          (mv_in_out_accumulator < -2.0) ||

          ((boost_score - old_boost_score) < IIFACTOR))

@@ -1673,7 +1690,7 @@

     cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost);

     cpi->source_alt_ref_pending = TRUE;

-    configure_arnr_filter(cpi, this_frame);

+    configure_arnr_filter(cpi, this_frame, cpi->gfu_boost);

   } else {

     cpi->gfu_boost = (int)boost_score;

     cpi->source_alt_ref_pending = FALSE;

@@ -1945,7 +1962,8 @@

 void vp9_second_pass(VP9_COMP *cpi) {

   int tmp_q;

-  int frames_left = (int)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);

+  int frames_left = (int)(cpi->twopass.total_stats->count -

+                          cpi->common.current_video_frame);

   FIRSTPASS_STATS this_frame;

   FIRSTPASS_STATS this_frame_copy;

@@ -1953,8 +1971,6 @@

   double this_frame_intra_error;

   double this_frame_coded_error;

-  int overhead_bits;

   if (!cpi->twopass.stats_in) {

     return;

@@ -1961,68 +1977,6 @@

   vp9_clear_system_state();

-  vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));

-  if (EOF == input_stats(cpi, &this_frame))

-    return;

-  this_frame_intra_error = this_frame.intra_error;

-  this_frame_coded_error = this_frame.coded_error;

-  // keyframe and section processing !

-  if (cpi->twopass.frames_to_key == 0) {

-    // Define next KF group and assign bits to it

-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

-    find_next_key_frame(cpi, &this_frame_copy);

-  }

-  // Is this a GF / ARF (Note that a KF is always also a GF)

-  if (cpi->frames_till_gf_update_due == 0) {

-    // Define next gf group and assign bits to it

-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

-    define_gf_group(cpi, &this_frame_copy);

-    // If we are going to code an altref frame at the end of the group and the current frame is not a key frame....

-    // If the previous group used an arf this frame has already benefited from that arf boost and it should not be given extra bits

-    // If the previous group was NOT coded using arf we may want to apply some boost to this GF as well

-    if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {

-      // Assign a standard frames worth of bits from those allocated to the GF group

-      int bak = cpi->per_frame_bandwidth;

-      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

-      assign_std_frame_bits(cpi, &this_frame_copy);

-      cpi->per_frame_bandwidth = bak;

-    }

-  }

-  // Otherwise this is an ordinary frame

-  else {

-    // Assign bits from those allocated to the GF group

-    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

-    assign_std_frame_bits(cpi, &this_frame_copy);

-  }

-  // Keep a globally available copy of this and the next frame's iiratio.

-  cpi->twopass.this_iiratio = (int)(this_frame_intra_error /

-                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error));

-  {

-    FIRSTPASS_STATS next_frame;

-    if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {

-      cpi->twopass.next_iiratio = (int)(next_frame.intra_error /

-                                  DOUBLE_DIVIDE_CHECK(next_frame.coded_error));

-    }

-  }

-  // Set nominal per second bandwidth for this frame

-  cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth

-                                * cpi->output_frame_rate);

-  if (cpi->target_bandwidth < 0)

-    cpi->target_bandwidth = 0;

-  // Account for mv, mode and other overheads.

-  overhead_bits = (int)estimate_modemvcost(

-                        cpi, cpi->twopass.total_left_stats);

   // Special case code for first frame.

   if (cpi->common.current_video_frame == 0) {

     cpi->twopass.est_max_qcorrection_factor = 1.0;

@@ -2034,8 +1988,7 @@

       est_cq =

         estimate_cq(cpi,

                     cpi->twopass.total_left_stats,

-                    (int)(cpi->twopass.bits_left / frames_left),

-                    overhead_bits);

+                    (int)(cpi->twopass.bits_left / frames_left));

       cpi->cq_target_quality = cpi->oxcf.cq_level;

       if (est_cq > cpi->cq_target_quality)

@@ -2049,13 +2002,13 @@

     tmp_q = estimate_max_q(

               cpi,

               cpi->twopass.total_left_stats,

-              (int)(cpi->twopass.bits_left / frames_left),

-              overhead_bits);

+              (int)(cpi->twopass.bits_left / frames_left));

     cpi->active_worst_quality         = tmp_q;

     cpi->ni_av_qi                     = tmp_q;

     cpi->avg_q                        = vp9_convert_qindex_to_q(tmp_q);

+#ifndef ONE_SHOT_Q_ESTIMATE

     // Limit the maxq value returned subsequently.

     // This increases the risk of overspend or underspend if the initial

     // estimate for the clip is bad, but helps prevent excessive

@@ -2062,8 +2015,10 @@

     // variation in Q, especially near the end of a clip

     // where for example a small overspend may cause Q to crash

     adjust_maxq_qrange(cpi);

+#endif

+#ifndef ONE_SHOT_Q_ESTIMATE

   // The last few frames of a clip almost always have to few or too many

   // bits and for the sake of over exact rate control we dont want to make

   // radical adjustments to the allowed quantizer range just to use up a

@@ -2078,20 +2033,77 @@

     tmp_q = estimate_max_q(

               cpi,

               cpi->twopass.total_left_stats,

-              (int)(cpi->twopass.bits_left / frames_left),

-              overhead_bits);

+              (int)(cpi->twopass.bits_left / frames_left));

     // Make a damped adjustment to active max Q

     cpi->active_worst_quality =

       adjust_active_maxq(cpi->active_worst_quality, tmp_q);

+#endif

+  vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS));

+  if (EOF == input_stats(cpi, &this_frame))

+    return;

+  this_frame_intra_error = this_frame.intra_error;

+  this_frame_coded_error = this_frame.coded_error;

+  // keyframe and section processing !

+  if (cpi->twopass.frames_to_key == 0) {

+    // Define next KF group and assign bits to it

+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

+    find_next_key_frame(cpi, &this_frame_copy);

+  }

+  // Is this a GF / ARF (Note that a KF is always also a GF)

+  if (cpi->frames_till_gf_update_due == 0) {

+    // Define next gf group and assign bits to it

+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

+    define_gf_group(cpi, &this_frame_copy);

+    // If we are going to code an altref frame at the end of the group

+    // and the current frame is not a key frame....

+    // If the previous group used an arf this frame has already benefited

+    // from that arf boost and it should not be given extra bits

+    // If the previous group was NOT coded using arf we may want to apply

+    // some boost to this GF as well

+    if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {

+      // Assign a standard frames worth of bits from those allocated

+      // to the GF group

+      int bak = cpi->per_frame_bandwidth;

+      vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

+      assign_std_frame_bits(cpi, &this_frame_copy);

+      cpi->per_frame_bandwidth = bak;

+    }

+  } else {

+    // Otherwise this is an ordinary frame

+    // Assign bits from those allocated to the GF group

+    vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));

+    assign_std_frame_bits(cpi, &this_frame_copy);

+  }

+  // Keep a globally available copy of this and the next frame's iiratio.

+  cpi->twopass.this_iiratio = (int)(this_frame_intra_error /

+                              DOUBLE_DIVIDE_CHECK(this_frame_coded_error));

+  {

+    FIRSTPASS_STATS next_frame;

+    if (lookup_next_frame_stats(cpi, &next_frame) != EOF) {

+      cpi->twopass.next_iiratio = (int)(next_frame.intra_error /

+                                  DOUBLE_DIVIDE_CHECK(next_frame.coded_error));

+    }

+  }

+  // Set nominal per second bandwidth for this frame

+  cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth

+                                * cpi->output_frame_rate);

+  if (cpi->target_bandwidth < 0)

+    cpi->target_bandwidth = 0;

   cpi->twopass.frames_to_key--;

   // Update the total stats remaining sturcture

   subtract_stats(cpi->twopass.total_left_stats, &this_frame);

 static int test_candidate_kf(VP9_COMP *cpi,

                              FIRSTPASS_STATS *last_frame,

--- a/vp9/encoder/vp9_firstpass.h

+++ b/vp9/encoder/vp9_firstpass.h

@@ -11,12 +11,12 @@

 #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_

 #define VP9_ENCODER_VP9_FIRSTPASS_H_

-extern void vp9_init_first_pass(VP9_COMP *cpi);

-extern void vp9_first_pass(VP9_COMP *cpi);

-extern void vp9_end_first_pass(VP9_COMP *cpi);

+void vp9_init_first_pass(VP9_COMP *cpi);

+void vp9_first_pass(VP9_COMP *cpi);

+void vp9_end_first_pass(VP9_COMP *cpi);

-extern void vp9_init_second_pass(VP9_COMP *cpi);

-extern void vp9_second_pass(VP9_COMP *cpi);

-extern void vp9_end_second_pass(VP9_COMP *cpi);

+void vp9_init_second_pass(VP9_COMP *cpi);

+void vp9_second_pass(VP9_COMP *cpi);

+void vp9_end_second_pass(VP9_COMP *cpi);

 #endif  // VP9_ENCODER_VP9_FIRSTPASS_H_

--- a/vp9/encoder/vp9_lookahead.c

+++ b/vp9/encoder/vp9_lookahead.c

@@ -9,7 +9,9 @@

*/

 #include <assert.h>

 #include <stdlib.h>

 #include "vpx_config.h"

+#include "vp9/common/vp9_common.h"

 #include "vp9/encoder/vp9_lookahead.h"

 #include "vp9/common/vp9_extend.h"

@@ -25,10 +27,9 @@

 /* Return the buffer at the given absolute index and increment the index */

-static struct lookahead_entry *

-pop(struct lookahead_ctx *ctx,

-    unsigned int         *idx) {

-  unsigned int            index = *idx;

+static struct lookahead_entry * pop(struct lookahead_ctx *ctx,

+                                    unsigned int *idx) {

+  unsigned int index = *idx;

   struct lookahead_entry *buf = ctx->buf + index;

   assert(index < ctx->max_sz);

@@ -39,8 +40,7 @@

-void

-vp9_lookahead_destroy(struct lookahead_ctx *ctx) {

+void vp9_lookahead_destroy(struct lookahead_ctx *ctx) {

   if (ctx) {

     if (ctx->buf) {

       unsigned int i;

@@ -54,23 +54,15 @@

-struct lookahead_ctx *

-vp9_lookahead_init(unsigned int width,

-                   unsigned int height,

-                   unsigned int depth) {

+struct lookahead_ctx * vp9_lookahead_init(unsigned int width,

+                                          unsigned int height,

+                                          unsigned int depth) {

   struct lookahead_ctx *ctx = NULL;

-  /* Clamp the lookahead queue depth */

-  if (depth < 1)

-    depth = 1;

-  else if (depth > MAX_LAG_BUFFERS)

-    depth = MAX_LAG_BUFFERS;

+  // Clamp the lookahead queue depth

+  depth = clamp(depth, 1, MAX_LAG_BUFFERS);

-  /* Align the buffer dimensions */

-  width = (width + 15) &~15;

-  height = (height + 15) &~15;

-  /* Allocate the lookahead structures */

+  // Allocate the lookahead structures

   ctx = calloc(1, sizeof(*ctx));

   if (ctx) {

     unsigned int i;

@@ -90,13 +82,9 @@

-int

-vp9_lookahead_push(struct lookahead_ctx *ctx,

-                   YV12_BUFFER_CONFIG   *src,

-                   int64_t               ts_start,

-                   int64_t               ts_end,

-                   unsigned int          flags,

-                   unsigned char        *active_map) {

+int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG   *src,

+                       int64_t ts_start, int64_t ts_end, unsigned int flags,

+                       unsigned char *active_map) {

   struct lookahead_entry *buf;

   int row, col, active_end;

   int mb_rows = (src->y_height + 15) >> 4;

@@ -156,9 +144,8 @@

-struct lookahead_entry *

-vp9_lookahead_pop(struct lookahead_ctx *ctx,

-                  int                   drain) {

+struct lookahead_entry * vp9_lookahead_pop(struct lookahead_ctx *ctx,

+                                           int drain) {

   struct lookahead_entry *buf = NULL;

   if (ctx->sz && (drain || ctx->sz == ctx->max_sz)) {

@@ -169,9 +156,8 @@

-struct lookahead_entry *

-vp9_lookahead_peek(struct lookahead_ctx *ctx,

-                   int                   index) {

+struct lookahead_entry * vp9_lookahead_peek(struct lookahead_ctx *ctx,

+                                            int index) {

   struct lookahead_entry *buf = NULL;

   assert(index < (int)ctx->max_sz);

@@ -184,8 +170,6 @@

   return buf;

-unsigned int

-vp9_lookahead_depth(struct lookahead_ctx *ctx) {

+unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx) {

   return ctx->sz;

--- a/vp9/encoder/vp9_lookahead.h

+++ b/vp9/encoder/vp9_lookahead.h

@@ -28,17 +28,13 @@

  * The lookahead stage is a queue of frame buffers on which some analysis

  * may be done when buffers are enqueued.

- *

- *

*/

 struct lookahead_ctx *vp9_lookahead_init(unsigned int width,

                                          unsigned int height,

-                                         unsigned int depth

-                                        );

+                                         unsigned int depth);

 /**\brief Destroys the lookahead stage

- *

*/

 void vp9_lookahead_destroy(struct lookahead_ctx *ctx);

@@ -58,13 +54,9 @@

  * \param[in] flags       Flags set on this frame

  * \param[in] active_map  Map that specifies which macroblock is active

*/

-int

-vp9_lookahead_push(struct lookahead_ctx *ctx,

-                   YV12_BUFFER_CONFIG   *src,

-                   int64_t               ts_start,

-                   int64_t               ts_end,

-                   unsigned int          flags,

-                   unsigned char        *active_map);

+int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,

+                       int64_t ts_start, int64_t ts_end, unsigned int flags,

+                       unsigned char *active_map);

 /**\brief Get the next source buffer to encode

@@ -76,11 +68,9 @@

  * \retval NULL, if drain set and queue is empty

  * \retval NULL, if drain not set and queue not of the configured depth

- *

*/

-struct lookahead_entry *

-vp9_lookahead_pop(struct lookahead_ctx *ctx,

-                  int                   drain);

+struct lookahead_entry *vp9_lookahead_pop(struct lookahead_ctx *ctx,

+                                          int drain);

 /**\brief Get a future source buffer to encode

@@ -89,11 +79,9 @@

  * \param[in] index     Index of the frame to be returned, 0 == next frame

  * \retval NULL, if no buffer exists at the specified index

- *

*/

-struct lookahead_entry *

-vp9_lookahead_peek(struct lookahead_ctx *ctx,

-                   int                   index);

+struct lookahead_entry *vp9_lookahead_peek(struct lookahead_ctx *ctx,

+                                           int index);

 /**\brief Get the number of frames currently in the lookahead queue

@@ -100,7 +88,6 @@

  * \param[in] ctx       Pointer to the lookahead context

*/

-unsigned int

-vp9_lookahead_depth(struct lookahead_ctx *ctx);

+unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx);

 #endif  // VP9_ENCODER_VP9_LOOKAHEAD_H_

--- a/vp9/encoder/vp9_mbgraph.c

+++ b/vp9/encoder/vp9_mbgraph.c

@@ -20,7 +20,9 @@

 static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,

                                               int_mv *ref_mv,

-                                              int_mv *dst_mv) {

+                                              int_mv *dst_mv,

+                                              int mb_row,

+                                              int mb_col) {

   MACROBLOCK   *const x  = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

   BLOCK *b  = &x->block[0];

@@ -27,8 +29,8 @@

   BLOCKD *d = &xd->block[0];

   vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];

   unsigned int best_err;

-  int step_param;

   int tmp_col_min = x->mv_col_min;

   int tmp_col_max = x->mv_col_max;

   int tmp_row_min = x->mv_row_min;

@@ -36,11 +38,8 @@

   int_mv ref_full;

   // Further step/diamond searches as necessary

-  if (cpi->Speed < 8) {

-    step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);

-  } else {

-    step_param = cpi->sf.first_step + 2;

-  }

+  int step_param = cpi->sf.first_step +

+      (cpi->Speed < 8 ? (cpi->Speed > 5 ? 1 : 0) : 2);

   vp9_clamp_mv_min_max(x, ref_mv);

@@ -72,7 +71,7 @@

   vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv);

-  vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);

+  vp9_build_inter16x16_predictors_mby(xd, xd->predictor, 16, mb_row, mb_col);

   best_err = vp9_sad16x16(xd->dst.y_buffer, xd->dst.y_stride,

                           xd->predictor, 16, INT_MAX);

@@ -93,8 +92,9 @@

   YV12_BUFFER_CONFIG *buf,

   int buf_mb_y_offset,

   YV12_BUFFER_CONFIG *ref,

-  int mb_y_offset

-) {

+  int mb_y_offset,

+  int mb_row,

+  int mb_col) {

   MACROBLOCK   *const x  = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

   unsigned int err, tmp_err;

@@ -124,7 +124,7 @@

   // Test last reference frame using the previous best mv as the

   // starting point (best reference) for the search

-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv);

+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);

   if (tmp_err < err) {

     err            = tmp_err;

     dst_mv->as_int = tmp_mv.as_int;

@@ -136,7 +136,8 @@

     int_mv zero_ref_mv, tmp_mv;

     zero_ref_mv.as_int = 0;

-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv);

+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv,

+                                        mb_row, mb_col);

     if (tmp_err < err) {

       dst_mv->as_int = tmp_mv.as_int;

       err = tmp_err;

@@ -229,7 +230,9 @@

   int gld_y_offset,

   YV12_BUFFER_CONFIG *alt_ref,

   int_mv *prev_alt_ref_mv,

-  int arf_y_offset

+  int arf_y_offset,

+  int mb_row,

+  int mb_col

) {

   MACROBLOCK   *const x  = &cpi->mb;

   MACROBLOCKD *const xd = &x->e_mbd;

@@ -249,7 +252,8 @@

     int g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv,

                                                 &stats->ref[GOLDEN_FRAME].m.mv,

                                                 buf, mb_y_offset,

-                                                golden_ref, gld_y_offset);

+                                                golden_ref, gld_y_offset,

+                                                mb_row, mb_col);

     stats->ref[GOLDEN_FRAME].err = g_motion_error;

   } else {

     stats->ref[GOLDEN_FRAME].err = INT_MAX;

@@ -292,6 +296,9 @@

   int_mv arf_top_mv, gld_top_mv;

   MODE_INFO mi_local;

+  // Make sure the mi context starts in a consistent state.

+  memset(&mi_local, 0, sizeof(mi_local));

   // Set up limit values for motion vectors to prevent them extending outside the UMV borders

   arf_top_mv.as_int = 0;

   gld_top_mv.as_int = 0;

@@ -323,7 +330,8 @@

       update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset,

                               golden_ref, &gld_left_mv, gld_y_in_offset,

-                              alt_ref,    &arf_left_mv, arf_y_in_offset);

+                              alt_ref,    &arf_left_mv, arf_y_in_offset,

+                              mb_row, mb_col);

       arf_left_mv.as_int = mb_stats->ref[ALTREF_FRAME].m.mv.as_int;

       gld_left_mv.as_int = mb_stats->ref[GOLDEN_FRAME].m.mv.as_int;

       if (mb_col == 0) {

@@ -412,7 +420,7 @@

       cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs;

     // This error case should not be reachable as this function should

-    // never be called with the common data structure unititialized.

+    // never be called with the common data structure uninitialized.

     else

       cpi->static_mb_pct = 0;

@@ -427,13 +435,11 @@

   vpx_free(arf_not_zz);

-void vp9_update_mbgraph_stats

-(

-  VP9_COMP *cpi

-) {

+void vp9_update_mbgraph_stats(VP9_COMP *cpi) {

   VP9_COMMON *const cm = &cpi->common;

   int i, n_frames = vp9_lookahead_depth(cpi->lookahead);

-  YV12_BUFFER_CONFIG *golden_ref = &cm->yv12_fb[cm->gld_fb_idx];

+  YV12_BUFFER_CONFIG *golden_ref =

+      &cm->yv12_fb[cm->ref_frame_map[cpi->gld_fb_idx]];

   // we need to look ahead beyond where the ARF transitions into

   // being a GF - so exit if we don't look ahead beyond that

--- a/vp9/encoder/vp9_mbgraph.h

+++ b/vp9/encoder/vp9_mbgraph.h

@@ -11,6 +11,6 @@

 #ifndef VP9_ENCODER_VP9_MBGRAPH_H_

 #define VP9_ENCODER_VP9_MBGRAPH_H_

-extern void vp9_update_mbgraph_stats(VP9_COMP *cpi);

+void vp9_update_mbgraph_stats(VP9_COMP *cpi);

 #endif  // VP9_ENCODER_VP9_MBGRAPH_H_

--- a/vp9/encoder/vp9_mcomp.c

+++ b/vp9/encoder/vp9_mcomp.c

@@ -8,27 +8,22 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <stdio.h>

+#include <limits.h>

+#include <math.h>

 #include "vp9/encoder/vp9_onyx_int.h"

 #include "vp9/encoder/vp9_mcomp.h"

 #include "vpx_mem/vpx_mem.h"

 #include "./vpx_config.h"

-#include <stdio.h>

-#include <limits.h>

-#include <math.h>

 #include "vp9/common/vp9_findnearmv.h"

 #include "vp9/common/vp9_common.h"

-#ifdef ENTROPY_STATS

-static int mv_ref_ct [31] [4] [2];

-static int mv_mode_cts [4] [2];

-#endif

 void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) {

   int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL +

-      ((ref_mv->as_mv.col & 7) ? 1 : 0);

+                                 ((ref_mv->as_mv.col & 7) ? 1 : 0);

   int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL +

-      ((ref_mv->as_mv.row & 7) ? 1 : 0);

+                                 ((ref_mv->as_mv.row & 7) ? 1 : 0);

   int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL;

   int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL;

@@ -43,14 +38,26 @@

     x->mv_row_max = row_max;

+int vp9_init_search_range(int width, int height) {

+  int sr = 0;

+  int frm = MIN(width, height);

+  while ((frm << sr) < MAX_FULL_PEL_VAL)

+    sr++;

+  if (sr)

+    sr--;

+  return sr;

+}

 int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],

-                    int Weight, int ishp) {

+                    int weight, int ishp) {

   MV v;

-  v.row = (mv->as_mv.row - ref->as_mv.row);

-  v.col = (mv->as_mv.col - ref->as_mv.col);

+  v.row = mv->as_mv.row - ref->as_mv.row;

+  v.col = mv->as_mv.col - ref->as_mv.col;

   return ((mvjcost[vp9_get_mv_joint(v)] +

-           mvcost[0][v.row] + mvcost[1][v.col]) *

-          Weight) >> 7;

+           mvcost[0][v.row] + mvcost[1][v.col]) * weight) >> 7;

 static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2],

@@ -57,11 +64,11 @@

                        int error_per_bit, int ishp) {

   if (mvcost) {

     MV v;

-    v.row = (mv->as_mv.row - ref->as_mv.row);

-    v.col = (mv->as_mv.col - ref->as_mv.col);

+    v.row = mv->as_mv.row - ref->as_mv.row;

+    v.col = mv->as_mv.col - ref->as_mv.col;

     return ((mvjcost[vp9_get_mv_joint(v)] +

              mvcost[0][v.row] + mvcost[1][v.col]) *

-            error_per_bit + 128) >> 8;

+            error_per_bit + 4096) >> 13;

   return 0;

@@ -68,11 +75,10 @@

 static int mvsad_err_cost(int_mv *mv, int_mv *ref, int *mvjsadcost,

                           int *mvsadcost[2], int error_per_bit) {

   if (mvsadcost) {

     MV v;

-    v.row = (mv->as_mv.row - ref->as_mv.row);

-    v.col = (mv->as_mv.col - ref->as_mv.col);

+    v.row = mv->as_mv.row - ref->as_mv.row;

+    v.col = mv->as_mv.col - ref->as_mv.col;

     return ((mvjsadcost[vp9_get_mv_joint(v)] +

              mvsadcost[0][v.row] + mvsadcost[1][v.col]) *

             error_per_bit + 128) >> 8;

@@ -81,45 +87,39 @@

 void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride) {

-  int Len;

+  int len;

   int search_site_count = 0;

   // Generate offsets for 4 search sites per step.

-  Len = MAX_FIRST_STEP;

   x->ss[search_site_count].mv.col = 0;

   x->ss[search_site_count].mv.row = 0;

   x->ss[search_site_count].offset = 0;

   search_site_count++;

-  while (Len > 0) {

+  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {

     // Compute offsets for search sites.

     x->ss[search_site_count].mv.col = 0;

-    x->ss[search_site_count].mv.row = -Len;

-    x->ss[search_site_count].offset = -Len * stride;

+    x->ss[search_site_count].mv.row = -len;

+    x->ss[search_site_count].offset = -len * stride;

     search_site_count++;

     // Compute offsets for search sites.

     x->ss[search_site_count].mv.col = 0;

-    x->ss[search_site_count].mv.row = Len;

-    x->ss[search_site_count].offset = Len * stride;

+    x->ss[search_site_count].mv.row = len;

+    x->ss[search_site_count].offset = len * stride;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = -Len;

+    x->ss[search_site_count].mv.col = -len;

     x->ss[search_site_count].mv.row = 0;

-    x->ss[search_site_count].offset = -Len;

+    x->ss[search_site_count].offset = -len;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = Len;

+    x->ss[search_site_count].mv.col = len;

     x->ss[search_site_count].mv.row = 0;

-    x->ss[search_site_count].offset = Len;

+    x->ss[search_site_count].offset = len;

     search_site_count++;

-    // Contract.

-    Len /= 2;

   x->ss_count = search_site_count;

@@ -127,68 +127,63 @@

 void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) {

-  int Len;

+  int len;

   int search_site_count = 0;

   // Generate offsets for 8 search sites per step.

-  Len = MAX_FIRST_STEP;

   x->ss[search_site_count].mv.col = 0;

   x->ss[search_site_count].mv.row = 0;

   x->ss[search_site_count].offset = 0;

   search_site_count++;

-  while (Len > 0) {

+  for (len = MAX_FIRST_STEP; len > 0; len /= 2) {

     // Compute offsets for search sites.

     x->ss[search_site_count].mv.col = 0;

-    x->ss[search_site_count].mv.row = -Len;

-    x->ss[search_site_count].offset = -Len * stride;

+    x->ss[search_site_count].mv.row = -len;

+    x->ss[search_site_count].offset = -len * stride;

     search_site_count++;

     // Compute offsets for search sites.

     x->ss[search_site_count].mv.col = 0;

-    x->ss[search_site_count].mv.row = Len;

-    x->ss[search_site_count].offset = Len * stride;

+    x->ss[search_site_count].mv.row = len;

+    x->ss[search_site_count].offset = len * stride;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = -Len;

+    x->ss[search_site_count].mv.col = -len;

     x->ss[search_site_count].mv.row = 0;

-    x->ss[search_site_count].offset = -Len;

+    x->ss[search_site_count].offset = -len;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = Len;

+    x->ss[search_site_count].mv.col = len;

     x->ss[search_site_count].mv.row = 0;

-    x->ss[search_site_count].offset = Len;

+    x->ss[search_site_count].offset = len;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = -Len;

-    x->ss[search_site_count].mv.row = -Len;

-    x->ss[search_site_count].offset = -Len * stride - Len;

+    x->ss[search_site_count].mv.col = -len;

+    x->ss[search_site_count].mv.row = -len;

+    x->ss[search_site_count].offset = -len * stride - len;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = Len;

-    x->ss[search_site_count].mv.row = -Len;

-    x->ss[search_site_count].offset = -Len * stride + Len;

+    x->ss[search_site_count].mv.col = len;

+    x->ss[search_site_count].mv.row = -len;

+    x->ss[search_site_count].offset = -len * stride + len;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = -Len;

-    x->ss[search_site_count].mv.row = Len;

-    x->ss[search_site_count].offset = Len * stride - Len;

+    x->ss[search_site_count].mv.col = -len;

+    x->ss[search_site_count].mv.row = len;

+    x->ss[search_site_count].offset = len * stride - len;

     search_site_count++;

     // Compute offsets for search sites.

-    x->ss[search_site_count].mv.col = Len;

-    x->ss[search_site_count].mv.row = Len;

-    x->ss[search_site_count].offset = Len * stride + Len;

+    x->ss[search_site_count].mv.col = len;

+    x->ss[search_site_count].mv.row = len;

+    x->ss[search_site_count].offset = len * stride + len;

     search_site_count++;

-    // Contract.

-    Len /= 2;

   x->ss_count = search_site_count;

@@ -210,8 +205,9 @@

     (mvcost ?                                           \

      ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \

        mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \

-      error_per_bit + 128) >> 8 : 0)

+      error_per_bit + 4096) >> 13 : 0)

 #define SP(x) (((x) & 7) << 1)  // convert motion vector component to offset

                                 // for svf calc

@@ -1546,7 +1542,7 @@

   int in_what_stride = d->pre_stride;

   int mv_stride = d->pre_stride;

   uint8_t *bestaddress;

-  int_mv *best_mv = &d->bmi.as_mv.first;

+  int_mv *best_mv = &d->bmi.as_mv[0];

   int_mv this_mv;

   int bestsad = INT_MAX;

   int r, c;

@@ -1641,7 +1637,7 @@

   int in_what_stride = d->pre_stride;

   int mv_stride = d->pre_stride;

   uint8_t *bestaddress;

-  int_mv *best_mv = &d->bmi.as_mv.first;

+  int_mv *best_mv = &d->bmi.as_mv[0];

   int_mv this_mv;

   unsigned int bestsad = INT_MAX;

   int r, c;

@@ -1770,7 +1766,7 @@

   int in_what_stride = d->pre_stride;

   int mv_stride = d->pre_stride;

   uint8_t *bestaddress;

-  int_mv *best_mv = &d->bmi.as_mv.first;

+  int_mv *best_mv = &d->bmi.as_mv[0];

   int_mv this_mv;

   unsigned int bestsad = INT_MAX;

   int r, c;

@@ -1787,7 +1783,7 @@

   int col_min = ref_col - distance;

   int col_max = ref_col + distance;

-  DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);

+  DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);

   unsigned int sad_array[3];

   int_mv fcenter_mv;

@@ -2023,13 +2019,11 @@

   for (i = 0; i < search_range; i++) {

     int best_site = -1;

-    int all_in = 1;

+    int all_in = ((ref_mv->as_mv.row - 1) > x->mv_row_min) &

+                 ((ref_mv->as_mv.row + 1) < x->mv_row_max) &

+                 ((ref_mv->as_mv.col - 1) > x->mv_col_min) &

+                 ((ref_mv->as_mv.col + 1) < x->mv_col_max);

-    all_in &= ((ref_mv->as_mv.row - 1) > x->mv_row_min);

-    all_in &= ((ref_mv->as_mv.row + 1) < x->mv_row_max);

-    all_in &= ((ref_mv->as_mv.col - 1) > x->mv_col_min);

-    all_in &= ((ref_mv->as_mv.col + 1) < x->mv_col_max);

     if (all_in) {

       unsigned int sad_array[4];

       unsigned char const *block_offset[4];

@@ -2103,14 +2097,14 @@

 #ifdef ENTROPY_STATS

-void print_mode_context(void) {

+void print_mode_context(VP9_COMMON *pc) {

   FILE *f = fopen("vp9_modecont.c", "a");

   int i, j;

   fprintf(f, "#include \"vp9_entropy.h\"\n");

-  fprintf(f, "const int vp9_mode_contexts[6][4] =");

+  fprintf(f, "const int vp9_mode_contexts[INTER_MODE_CONTEXTS][4] =");

   fprintf(f, "{\n");

-  for (j = 0; j < 6; j++) {

+  for (j = 0; j < INTER_MODE_CONTEXTS; j++) {

     fprintf(f, "  {/* %d */ ", j);

     fprintf(f, "    ");

     for (i = 0; i < 4; i++) {

@@ -2117,7 +2111,8 @@

       int this_prob;

       // context probs

-      this_prob = get_binary_prob(mv_ref_ct[j][i][0], mv_ref_ct[j][i][1]);

+      this_prob = get_binary_prob(pc->fc.mv_ref_ct[j][i][0],

+                                  pc->fc.mv_ref_ct[j][i][1]);

       fprintf(f, "%5d, ", this_prob);

@@ -2126,46 +2121,6 @@

   fprintf(f, "};\n");

   fclose(f);

-}

-/* MV ref count ENTROPY_STATS stats code */

-void init_mv_ref_counts() {

-  vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));

-  vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));

-}

-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) {

-  if (m == ZEROMV) {

-    ++mv_ref_ct [ct[0]] [0] [0];

-    ++mv_mode_cts[0][0];

-  } else {

-    ++mv_ref_ct [ct[0]] [0] [1];

-    ++mv_mode_cts[0][1];

-    if (m == NEARESTMV) {

-      ++mv_ref_ct [ct[1]] [1] [0];

-      ++mv_mode_cts[1][0];

-    } else {

-      ++mv_ref_ct [ct[1]] [1] [1];

-      ++mv_mode_cts[1][1];

-      if (m == NEARMV) {

-        ++mv_ref_ct [ct[2]] [2] [0];

-        ++mv_mode_cts[2][0];

-      } else {

-        ++mv_ref_ct [ct[2]] [2] [1];

-        ++mv_mode_cts[2][1];

-        if (m == NEWMV) {

-          ++mv_ref_ct [ct[3]] [3] [0];

-          ++mv_mode_cts[3][0];

-        } else {

-          ++mv_ref_ct [ct[3]] [3] [1];

-          ++mv_mode_cts[3][1];

-        }

-      }

-    }

-  }

 #endif/* END MV ref count ENTROPY_STATS stats code */

--- a/vp9/encoder/vp9_mcomp.h

+++ b/vp9/encoder/vp9_mcomp.h

@@ -16,21 +16,25 @@

 #include "vp9/encoder/vp9_variance.h"

 #ifdef ENTROPY_STATS

-extern void init_mv_ref_counts();

-extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);

-void print_mode_context(void);

+void print_mode_context(VP9_COMMON *pc);

 #endif

+// The maximum number of steps in a step search given the largest

+// allowed initial step

+#define MAX_MVSEARCH_STEPS 11

+// Max full pel mv specified in 1 pel units

+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)

+// Maximum size of the first step in full pel units

+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))

-#define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step

-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)      // Max full pel mv specified in 1 pel units

-#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units

+void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);

+int vp9_init_search_range(int width, int height);

-extern void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv);

-extern int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,

-                           int *mvcost[2], int Weight, int ishp);

-extern void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);

-extern void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);

+int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost,

+                           int *mvcost[2], int weight, int ishp);

+void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);

+void vp9_init3smotion_compensation(MACROBLOCK *x,  int stride);

 // Runs sequence of diamond searches in smaller steps for RD

 struct VP9_COMP;

 int vp9_full_pixel_diamond(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK *b,

@@ -39,20 +43,13 @@

                            vp9_variance_fn_ptr_t *fn_ptr,

                            int_mv *ref_mv, int_mv *dst_mv);

-extern int vp9_hex_search

-(

-  MACROBLOCK *x,

-  BLOCK *b,

-  BLOCKD *d,

-  int_mv *ref_mv,

-  int_mv *best_mv,

-  int search_param,

-  int error_per_bit,

-  const vp9_variance_fn_ptr_t *vf,

-  int *mvjsadcost, int *mvsadcost[2],

-  int *mvjcost, int *mvcost[2],

-  int_mv *center_mv

-);

+int vp9_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d,

+                   int_mv *ref_mv, int_mv *best_mv,

+                   int search_param, int error_per_bit,

+                   const vp9_variance_fn_ptr_t *vf,

+                   int *mvjsadcost, int *mvsadcost[2],

+                   int *mvjcost, int *mvcost[2],

+                   int_mv *center_mv);

 typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv

   *bestmv, int_mv *ref_mv, int error_per_bit, const vp9_variance_fn_ptr_t *vfp,

--- a/vp9/encoder/vp9_onyx_if.c

+++ b/vp9/encoder/vp9_onyx_if.c

@@ -10,7 +10,9 @@

 #include "vpx_config.h"

+#include "vp9/common/vp9_filter.h"

 #include "vp9/common/vp9_onyxc_int.h"

+#include "vp9/common/vp9_reconinter.h"

 #include "vp9/encoder/vp9_onyx_int.h"

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/encoder/vp9_quantize.h"

@@ -22,6 +24,7 @@

 #include "vp9/common/vp9_extend.h"

 #include "vp9/encoder/vp9_ratectrl.h"

 #include "vp9/common/vp9_quant_common.h"

+#include "vp9/common/vp9_tile_common.h"

 #include "vp9/encoder/vp9_segmentation.h"

 #include "./vp9_rtcd.h"

 #include "./vpx_scale_rtcd.h"

@@ -111,6 +114,13 @@

 extern void print_nmvstats();

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+#ifdef NZC_STATS

+extern void init_nzcstats();

+extern void print_nzcstats();

+#endif

+#endif

 #ifdef SPEEDSTATS

 unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

 #endif

@@ -146,31 +156,24 @@

 // The formulae were derived from computing a 3rd order polynomial best

 // fit to the original data (after plotting real maxq vs minq (not q index))

 static int calculate_minq_index(double maxq,

-                                double x3, double x2, double x, double c) {

+                                double x3, double x2, double x1, double c) {

   int i;

-  double minqtarget;

+  const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,

+                                maxq);

-  minqtarget = ((x3 * maxq * maxq * maxq) +

-                (x2 * maxq * maxq) +

-                (x * maxq) +

-                c);

-  if (minqtarget > maxq)

-    minqtarget = maxq;

   for (i = 0; i < QINDEX_RANGE; i++) {

     if (minqtarget <= vp9_convert_qindex_to_q(i))

       return i;

   return QINDEX_RANGE - 1;

 static void init_minq_luts(void) {

   int i;

-  double maxq;

   for (i = 0; i < QINDEX_RANGE; i++) {

-    maxq = vp9_convert_qindex_to_q(i);

+    const double maxq = vp9_convert_qindex_to_q(i);

     kf_low_motion_minq[i] = calculate_minq_index(maxq,

@@ -206,7 +209,6 @@

   if (mb->e_mbd.allow_high_precision_mv) {

     mb->mvcost = mb->nmvcost_hp;

     mb->mvsadcost = mb->nmvsadcost_hp;

   } else {

     mb->mvcost = mb->nmvcost;

     mb->mvsadcost = mb->nmvsadcost;

@@ -214,15 +216,13 @@

 static void init_base_skip_probs(void) {

   int i;

-  double q;

-  int t;

   for (i = 0; i < QINDEX_RANGE; i++) {

-    q = vp9_convert_qindex_to_q(i);

+    const double q = vp9_convert_qindex_to_q(i);

     // Exponential decay caluclation of baseline skip prob with clamping

     // Based on crude best fit of old table.

-    t = (int)(564.25 * pow(2.71828, (-0.012 * q)));

+    const int t = (int)(564.25 * pow(2.71828, (-0.012 * q)));

     base_skip_false_prob[i][1] = clip_prob(t);

     base_skip_false_prob[i][2] = clip_prob(t * 3 / 4);

@@ -236,12 +236,12 @@

   if (cm->frame_type != KEY_FRAME) {

     vp9_update_skip_probs(cpi);

-    if (cm->refresh_alt_ref_frame) {

+    if (cpi->refresh_alt_ref_frame) {

       int k;

       for (k = 0; k < MBSKIP_CONTEXTS; ++k)

         cpi->last_skip_false_probs[2][k] = cm->mbskip_pred_probs[k];

       cpi->last_skip_probs_q[2] = cm->base_qindex;

-    } else if (cpi->common.refresh_golden_frame) {

+    } else if (cpi->refresh_golden_frame) {

       int k;

       for (k = 0; k < MBSKIP_CONTEXTS; ++k)

         cpi->last_skip_false_probs[1][k] = cm->mbskip_pred_probs[k];

@@ -258,7 +258,6 @@

           cm->mbskip_pred_probs[k];

 void vp9_initialize_enc() {

@@ -299,7 +298,6 @@

   vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));

   set_default_lf_deltas(cpi);

@@ -332,9 +330,7 @@

   vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);

   vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);

-#if VP9_TEMPORAL_ALT_REF

   vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer);

-#endif

   vp9_lookahead_destroy(cpi->lookahead);

   vpx_free(cpi->tok);

@@ -388,7 +384,7 @@

   return target_index - start_index;

-static void init_seg_features(VP9_COMP *cpi) {

+static void configure_static_seg_features(VP9_COMP *cpi) {

   VP9_COMMON *cm = &cpi->common;

   MACROBLOCKD *xd = &cpi->mb.e_mbd;

@@ -408,10 +404,8 @@

     // Clear down the segment features.

     vp9_clearall_segfeatures(xd);

-  }

-  // If this is an alt ref frame

-  else if (cm->refresh_alt_ref_frame) {

+  } else if (cpi->refresh_alt_ref_frame) {

+    // If this is an alt ref frame

     // Clear down the global segmentation map

     vpx_memset(cpi->segmentation_map, 0, (cm->mb_rows * cm->mb_cols));

     xd->update_mb_segmentation_map = 0;

@@ -448,7 +442,7 @@

   else if (xd->segmentation_enabled) {

     // First normal frame in a valid gf or alt ref group

     if (cpi->common.frames_since_golden == 0) {

-      // Set up segment features for normal frames in an af group

+      // Set up segment features for normal frames in an arf group

       if (cpi->source_alt_ref_active) {

         xd->update_mb_segmentation_map = 0;

         xd->update_mb_segmentation_data = 1;

@@ -465,16 +459,9 @@

         // Segment coding disabled for compred testing

         if (high_q || (cpi->static_mb_pct == 100)) {

-          // set_segref(xd, 1, LAST_FRAME);

           vp9_set_segref(xd, 1, ALTREF_FRAME);

           vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);

-          vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);

-          vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);

-          // EOB segment coding not fixed for 8x8 yet

-          vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);

-          vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);

+          vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);

       // Disable segmentation and clear down features if alt ref

@@ -493,29 +480,23 @@

     // Special case where we are coding over the top of a previous

-    // alt ref frame

+    // alt ref frame.

     // Segment coding disabled for compred testing

     else if (cpi->is_src_frame_alt_ref) {

-      // Enable mode and ref frame features for segment 0 as well

+      // Enable ref frame features for segment 0 as well

       vp9_enable_segfeature(xd, 0, SEG_LVL_REF_FRAME);

-      vp9_enable_segfeature(xd, 0, SEG_LVL_MODE);

       vp9_enable_segfeature(xd, 1, SEG_LVL_REF_FRAME);

-      vp9_enable_segfeature(xd, 1, SEG_LVL_MODE);

-      // All mbs should use ALTREF_FRAME, ZEROMV exclusively

+      // All mbs should use ALTREF_FRAME

       vp9_clear_segref(xd, 0);

       vp9_set_segref(xd, 0, ALTREF_FRAME);

       vp9_clear_segref(xd, 1);

       vp9_set_segref(xd, 1, ALTREF_FRAME);

-      vp9_set_segdata(xd, 0, SEG_LVL_MODE, ZEROMV);

-      vp9_set_segdata(xd, 1, SEG_LVL_MODE, ZEROMV);

-      // Skip all MBs if high Q

+      // Skip all MBs if high Q (0,0 mv and skip coeffs)

       if (high_q) {

-        vp9_enable_segfeature(xd, 0, SEG_LVL_EOB);

-        vp9_set_segdata(xd, 0, SEG_LVL_EOB, 0);

-        vp9_enable_segfeature(xd, 1, SEG_LVL_EOB);

-        vp9_set_segdata(xd, 1, SEG_LVL_EOB, 0);

+          vp9_enable_segfeature(xd, 0, SEG_LVL_SKIP);

+          vp9_enable_segfeature(xd, 1, SEG_LVL_SKIP);

       // Enable data udpate

       xd->update_mb_segmentation_data = 1;

@@ -534,17 +515,13 @@

   VP9_COMMON *cm = &cpi->common;

   int row, col;

   int map_index = 0;

-  FILE *statsfile;

+  FILE *statsfile = fopen("segmap.stt", "a");

-  statsfile = fopen("segmap.stt", "a");

+  fprintf(statsfile, "%10d\n", cm->current_video_frame);

-  fprintf(statsfile, "%10d\n",

-          cm->current_video_frame);

   for (row = 0; row < cpi->common.mb_rows; row++) {

     for (col = 0; col < cpi->common.mb_cols; col++) {

-      fprintf(statsfile, "%10d",

-              cpi->segmentation_map[map_index]);

+      fprintf(statsfile, "%10d", cpi->segmentation_map[map_index]);

       map_index++;

     fprintf(statsfile, "\n");

@@ -590,368 +567,88 @@

   cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv

-void vp9_set_speed_features(VP9_COMP *cpi) {

+static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {

   SPEED_FEATURES *sf = &cpi->sf;

-  int Mode = cpi->compressor_speed;

-  int Speed = cpi->Speed;

+  int speed_multiplier = speed + 1;

   int i;

-  VP9_COMMON *cm = &cpi->common;

-  // Only modes 0 and 1 supported for now in experimental code basae

-  if (Mode > 1)

-    Mode = 1;

-  // Initialise default mode frequency sampling variables

-  for (i = 0; i < MAX_MODES; i ++) {

-    cpi->mode_check_freq[i] = 0;

-    cpi->mode_test_hit_counts[i] = 0;

-    cpi->mode_chosen_counts[i] = 0;

+  // Set baseline threshold values

+  for (i = 0; i < MAX_MODES; ++i) {

+    sf->thresh_mult[i] = (mode == 0) ? -500 : 0;

-  // best quality defaults

-  sf->RD = 1;

-  sf->search_method = NSTEP;

-  sf->improved_dct = 1;

-  sf->auto_filter = 1;

-  sf->recode_loop = 1;

-  sf->quarter_pixel_search = 1;

-  sf->half_pixel_search = 1;

-  sf->iterative_sub_pixel = 1;

-#if CONFIG_LOSSLESS

-  sf->optimize_coefficients = 0;

-#else

-  sf->optimize_coefficients = 1;

-#endif

-  sf->no_skip_block4x4_search = 1;

+  sf->thresh_mult[THR_ZEROMV   ] = 0;

+  sf->thresh_mult[THR_ZEROG    ] = 0;

+  sf->thresh_mult[THR_ZEROA    ] = 0;

-  sf->first_step = 0;

-  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;

+  sf->thresh_mult[THR_NEARESTMV] = 0;

+  sf->thresh_mult[THR_NEARESTG ] = 0;

+  sf->thresh_mult[THR_NEARESTA ] = 0;

-  // default thresholds to 0

-  for (i = 0; i < MAX_MODES; i++)

-    sf->thresh_mult[i] = 0;

+  sf->thresh_mult[THR_NEARMV   ] += speed_multiplier * 1000;

+  sf->thresh_mult[THR_NEARG    ] += speed_multiplier * 1000;

+  sf->thresh_mult[THR_NEARA    ] += speed_multiplier * 1000;

-  switch (Mode) {

-    case 0: // best quality mode

-      sf->thresh_mult[THR_ZEROMV   ] = 0;

-      sf->thresh_mult[THR_ZEROG    ] = 0;

-      sf->thresh_mult[THR_ZEROA    ] = 0;

-      sf->thresh_mult[THR_NEARESTMV] = 0;

-      sf->thresh_mult[THR_NEARESTG ] = 0;

-      sf->thresh_mult[THR_NEARESTA ] = 0;

-      sf->thresh_mult[THR_NEARMV   ] = 0;

-      sf->thresh_mult[THR_NEARG    ] = 0;

-      sf->thresh_mult[THR_NEARA    ] = 0;

+  sf->thresh_mult[THR_DC       ] = 0;

+  sf->thresh_mult[THR_TM       ] += speed_multiplier * 1000;

+  sf->thresh_mult[THR_V_PRED   ] += speed_multiplier * 1000;

+  sf->thresh_mult[THR_H_PRED   ] += speed_multiplier * 1000;

+  sf->thresh_mult[THR_D45_PRED ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_D135_PRED] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_D117_PRED] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_D153_PRED] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_D27_PRED ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_D63_PRED ] += speed_multiplier * 1500;

-      sf->thresh_mult[THR_DC       ] = 0;

+  sf->thresh_mult[THR_B_PRED   ] += speed_multiplier * 2500;

+  sf->thresh_mult[THR_I8X8_PRED] += speed_multiplier * 2500;

-      sf->thresh_mult[THR_V_PRED   ] = 1000;

-      sf->thresh_mult[THR_H_PRED   ] = 1000;

-      sf->thresh_mult[THR_D45_PRED ] = 1000;

-      sf->thresh_mult[THR_D135_PRED] = 1000;

-      sf->thresh_mult[THR_D117_PRED] = 1000;

-      sf->thresh_mult[THR_D153_PRED] = 1000;

-      sf->thresh_mult[THR_D27_PRED ] = 1000;

-      sf->thresh_mult[THR_D63_PRED ] = 1000;

-      sf->thresh_mult[THR_B_PRED   ] = 2000;

-      sf->thresh_mult[THR_I8X8_PRED] = 2000;

-      sf->thresh_mult[THR_TM       ] = 1000;

+  sf->thresh_mult[THR_NEWMV    ] += speed_multiplier * 1000;

+  sf->thresh_mult[THR_NEWG     ] += speed_multiplier * 1000;

+  sf->thresh_mult[THR_NEWA     ] += speed_multiplier * 1000;

-      sf->thresh_mult[THR_NEWMV    ] = 1000;

-      sf->thresh_mult[THR_NEWG     ] = 1000;

-      sf->thresh_mult[THR_NEWA     ] = 1000;

+  sf->thresh_mult[THR_SPLITMV  ] += speed_multiplier * 2500;

+  sf->thresh_mult[THR_SPLITG   ] += speed_multiplier * 2500;

+  sf->thresh_mult[THR_SPLITA   ] += speed_multiplier * 2500;

-      sf->thresh_mult[THR_SPLITMV  ] = 2500;

-      sf->thresh_mult[THR_SPLITG   ] = 5000;

-      sf->thresh_mult[THR_SPLITA   ] = 5000;

+  sf->thresh_mult[THR_COMP_ZEROLG   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_ZEROLA   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_ZEROGA   ] += speed_multiplier * 1500;

-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;

-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;

-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;

-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;

-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;

-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;

-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;

-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;

-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;

+  sf->thresh_mult[THR_COMP_NEARESTLG] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_NEARESTLA] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_NEARESTGA] += speed_multiplier * 1500;

-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;

-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;

-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;

+  sf->thresh_mult[THR_COMP_NEARLG   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_NEARLA   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_NEARGA   ] += speed_multiplier * 1500;

-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 2500;

-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 5000;

-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 5000;

+  sf->thresh_mult[THR_COMP_NEWLG    ] += speed_multiplier * 2000;

+  sf->thresh_mult[THR_COMP_NEWLA    ] += speed_multiplier * 2000;

+  sf->thresh_mult[THR_COMP_NEWGA    ] += speed_multiplier * 2000;

-#if CONFIG_COMP_INTERINTRA_PRED

-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;

-#endif

+  sf->thresh_mult[THR_COMP_SPLITLA  ] += speed_multiplier * 4500;

+  sf->thresh_mult[THR_COMP_SPLITGA  ] += speed_multiplier * 4500;

+  sf->thresh_mult[THR_COMP_SPLITLG  ] += speed_multiplier * 4500;

-      sf->first_step = 0;

-      sf->max_step_search_steps = MAX_MVSEARCH_STEPS;

-      sf->search_best_filter = SEARCH_BEST_FILTER;

-      break;

-    case 1:

-      sf->thresh_mult[THR_NEARESTMV] = 0;

-      sf->thresh_mult[THR_ZEROMV   ] = 0;

-      sf->thresh_mult[THR_DC       ] = 0;

-      sf->thresh_mult[THR_NEARMV   ] = 0;

-      sf->thresh_mult[THR_V_PRED   ] = 1000;

-      sf->thresh_mult[THR_H_PRED   ] = 1000;

-      sf->thresh_mult[THR_D45_PRED ] = 1000;

-      sf->thresh_mult[THR_D135_PRED] = 1000;

-      sf->thresh_mult[THR_D117_PRED] = 1000;

-      sf->thresh_mult[THR_D153_PRED] = 1000;

-      sf->thresh_mult[THR_D27_PRED ] = 1000;

-      sf->thresh_mult[THR_D63_PRED ] = 1000;

-      sf->thresh_mult[THR_B_PRED   ] = 2500;

-      sf->thresh_mult[THR_I8X8_PRED] = 2500;

-      sf->thresh_mult[THR_TM       ] = 1000;

-      sf->thresh_mult[THR_NEARESTG ] = 1000;

-      sf->thresh_mult[THR_NEARESTA ] = 1000;

-      sf->thresh_mult[THR_ZEROG    ] = 1000;

-      sf->thresh_mult[THR_ZEROA    ] = 1000;

-      sf->thresh_mult[THR_NEARG    ] = 1000;

-      sf->thresh_mult[THR_NEARA    ] = 1000;

-      sf->thresh_mult[THR_ZEROMV   ] = 0;

-      sf->thresh_mult[THR_ZEROG    ] = 0;

-      sf->thresh_mult[THR_ZEROA    ] = 0;

-      sf->thresh_mult[THR_NEARESTMV] = 0;

-      sf->thresh_mult[THR_NEARESTG ] = 0;

-      sf->thresh_mult[THR_NEARESTA ] = 0;

-      sf->thresh_mult[THR_NEARMV   ] = 0;

-      sf->thresh_mult[THR_NEARG    ] = 0;

-      sf->thresh_mult[THR_NEARA    ] = 0;

-      sf->thresh_mult[THR_NEWMV    ] = 1000;

-      sf->thresh_mult[THR_NEWG     ] = 1000;

-      sf->thresh_mult[THR_NEWA     ] = 1000;

-      sf->thresh_mult[THR_SPLITMV  ] = 1700;

-      sf->thresh_mult[THR_SPLITG   ] = 4500;

-      sf->thresh_mult[THR_SPLITA   ] = 4500;

-      sf->thresh_mult[THR_COMP_ZEROLG   ] = 0;

-      sf->thresh_mult[THR_COMP_NEARESTLG] = 0;

-      sf->thresh_mult[THR_COMP_NEARLG   ] = 0;

-      sf->thresh_mult[THR_COMP_ZEROLA   ] = 0;

-      sf->thresh_mult[THR_COMP_NEARESTLA] = 0;

-      sf->thresh_mult[THR_COMP_NEARLA   ] = 0;

-      sf->thresh_mult[THR_COMP_ZEROGA   ] = 0;

-      sf->thresh_mult[THR_COMP_NEARESTGA] = 0;

-      sf->thresh_mult[THR_COMP_NEARGA   ] = 0;

-      sf->thresh_mult[THR_COMP_NEWLG    ] = 1000;

-      sf->thresh_mult[THR_COMP_NEWLA    ] = 1000;

-      sf->thresh_mult[THR_COMP_NEWGA    ] = 1000;

-      sf->thresh_mult[THR_COMP_SPLITLA  ] = 1700;

-      sf->thresh_mult[THR_COMP_SPLITGA  ] = 4500;

-      sf->thresh_mult[THR_COMP_SPLITLG  ] = 4500;

 #if CONFIG_COMP_INTERINTRA_PRED

-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;

-      sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;

-#endif

+  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] += speed_multiplier * 1500;

-      if (Speed > 0) {

-        /* Disable coefficient optimization above speed 0 */

-        sf->optimize_coefficients = 0;

-        sf->no_skip_block4x4_search = 0;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += speed_multiplier * 1500;

-        sf->first_step = 1;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] += speed_multiplier * 1500;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] += speed_multiplier * 1500;

-        cpi->mode_check_freq[THR_SPLITG] = 2;

-        cpi->mode_check_freq[THR_SPLITA] = 2;

-        cpi->mode_check_freq[THR_SPLITMV] = 0;

-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;

-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;

-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;

-      }

-      if (Speed > 1) {

-        cpi->mode_check_freq[THR_SPLITG] = 4;

-        cpi->mode_check_freq[THR_SPLITA] = 4;

-        cpi->mode_check_freq[THR_SPLITMV] = 2;

-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;

-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;

-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;

-        sf->thresh_mult[THR_TM       ] = 1500;

-        sf->thresh_mult[THR_V_PRED   ] = 1500;

-        sf->thresh_mult[THR_H_PRED   ] = 1500;

-        sf->thresh_mult[THR_D45_PRED ] = 1500;

-        sf->thresh_mult[THR_D135_PRED] = 1500;

-        sf->thresh_mult[THR_D117_PRED] = 1500;

-        sf->thresh_mult[THR_D153_PRED] = 1500;

-        sf->thresh_mult[THR_D27_PRED ] = 1500;

-        sf->thresh_mult[THR_D63_PRED ] = 1500;

-        sf->thresh_mult[THR_B_PRED   ] = 5000;

-        sf->thresh_mult[THR_I8X8_PRED] = 5000;

-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {

-          sf->thresh_mult[THR_NEWMV    ] = 2000;

-          sf->thresh_mult[THR_SPLITMV  ] = 10000;

-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 20000;

-        }

-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {

-          sf->thresh_mult[THR_NEARESTG ] = 1500;

-          sf->thresh_mult[THR_ZEROG    ] = 1500;

-          sf->thresh_mult[THR_NEARG    ] = 1500;

-          sf->thresh_mult[THR_NEWG     ] = 2000;

-          sf->thresh_mult[THR_SPLITG   ] = 20000;

-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 20000;

-        }

-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {

-          sf->thresh_mult[THR_NEARESTA ] = 1500;

-          sf->thresh_mult[THR_ZEROA    ] = 1500;

-          sf->thresh_mult[THR_NEARA    ] = 1500;

-          sf->thresh_mult[THR_NEWA     ] = 2000;

-          sf->thresh_mult[THR_SPLITA   ] = 20000;

-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 10000;

-        }

-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 1500;

-        sf->thresh_mult[THR_COMP_NEARESTLG] = 1500;

-        sf->thresh_mult[THR_COMP_NEARLG   ] = 1500;

-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 1500;

-        sf->thresh_mult[THR_COMP_NEARESTLA] = 1500;

-        sf->thresh_mult[THR_COMP_NEARLA   ] = 1500;

-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 1500;

-        sf->thresh_mult[THR_COMP_NEARESTGA] = 1500;

-        sf->thresh_mult[THR_COMP_NEARGA   ] = 1500;

-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2000;

-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2000;

-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2000;

-#if CONFIG_COMP_INTERINTRA_PRED

-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] += speed_multiplier * 2000;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] += speed_multiplier * 2000;

+  sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] += speed_multiplier * 2000;

 #endif

-      }

-      if (Speed > 2) {

-        cpi->mode_check_freq[THR_SPLITG] = 15;

-        cpi->mode_check_freq[THR_SPLITA] = 15;

-        cpi->mode_check_freq[THR_SPLITMV] = 7;

-        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;

-        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;

-        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;

-        sf->thresh_mult[THR_TM       ] = 2000;

-        sf->thresh_mult[THR_V_PRED   ] = 2000;

-        sf->thresh_mult[THR_H_PRED   ] = 2000;

-        sf->thresh_mult[THR_D45_PRED ] = 2000;

-        sf->thresh_mult[THR_D135_PRED] = 2000;

-        sf->thresh_mult[THR_D117_PRED] = 2000;

-        sf->thresh_mult[THR_D153_PRED] = 2000;

-        sf->thresh_mult[THR_D27_PRED ] = 2000;

-        sf->thresh_mult[THR_D63_PRED ] = 2000;

-        sf->thresh_mult[THR_B_PRED   ] = 7500;

-        sf->thresh_mult[THR_I8X8_PRED] = 7500;

-        if (cpi->ref_frame_flags & VP9_LAST_FLAG) {

-          sf->thresh_mult[THR_NEWMV    ] = 2000;

-          sf->thresh_mult[THR_SPLITMV  ] = 25000;

-          sf->thresh_mult[THR_COMP_SPLITLG  ] = 50000;

-        }

-        if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {

-          sf->thresh_mult[THR_NEARESTG ] = 2000;

-          sf->thresh_mult[THR_ZEROG    ] = 2000;

-          sf->thresh_mult[THR_NEARG    ] = 2000;

-          sf->thresh_mult[THR_NEWG     ] = 2500;

-          sf->thresh_mult[THR_SPLITG   ] = 50000;

-          sf->thresh_mult[THR_COMP_SPLITGA  ] = 50000;

-        }

-        if (cpi->ref_frame_flags & VP9_ALT_FLAG) {

-          sf->thresh_mult[THR_NEARESTA ] = 2000;

-          sf->thresh_mult[THR_ZEROA    ] = 2000;

-          sf->thresh_mult[THR_NEARA    ] = 2000;

-          sf->thresh_mult[THR_NEWA     ] = 2500;

-          sf->thresh_mult[THR_SPLITA   ] = 50000;

-          sf->thresh_mult[THR_COMP_SPLITLA  ] = 25000;

-        }

-        sf->thresh_mult[THR_COMP_ZEROLG   ] = 2000;

-        sf->thresh_mult[THR_COMP_NEARESTLG] = 2000;

-        sf->thresh_mult[THR_COMP_NEARLG   ] = 2000;

-        sf->thresh_mult[THR_COMP_ZEROLA   ] = 2000;

-        sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;

-        sf->thresh_mult[THR_COMP_NEARLA   ] = 2000;

-        sf->thresh_mult[THR_COMP_ZEROGA   ] = 2000;

-        sf->thresh_mult[THR_COMP_NEARESTGA] = 2000;

-        sf->thresh_mult[THR_COMP_NEARGA   ] = 2000;

-        sf->thresh_mult[THR_COMP_NEWLG    ] = 2500;

-        sf->thresh_mult[THR_COMP_NEWLA    ] = 2500;

-        sf->thresh_mult[THR_COMP_NEWGA    ] = 2500;

-#if CONFIG_COMP_INTERINTRA_PRED

-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = 0;

-        sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = 0;

-#endif

-        sf->improved_dct = 0;

-        // Only do recode loop on key frames, golden frames and

-        // alt ref frames

-        sf->recode_loop = 2;

-      }

-      break;

-  }; /* switch */

   /* disable frame modes if flags not set */

   if (!(cpi->ref_frame_flags & VP9_LAST_FLAG)) {

     sf->thresh_mult[THR_NEWMV    ] = INT_MAX;

@@ -959,13 +656,19 @@

     sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;

     sf->thresh_mult[THR_NEARMV   ] = INT_MAX;

     sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;

+#if CONFIG_COMP_INTERINTRA_PRED

+    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = INT_MAX;

+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX;

+    sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = INT_MAX;

+    sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = INT_MAX;

+#endif

   if (!(cpi->ref_frame_flags & VP9_GOLD_FLAG)) {

     sf->thresh_mult[THR_NEARESTG ] = INT_MAX;

     sf->thresh_mult[THR_ZEROG    ] = INT_MAX;

     sf->thresh_mult[THR_NEARG    ] = INT_MAX;

     sf->thresh_mult[THR_NEWG     ] = INT_MAX;

+    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;

 #if CONFIG_COMP_INTERINTRA_PRED

     sf->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] = INT_MAX;

     sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] = INT_MAX;

@@ -972,14 +675,13 @@

     sf->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] = INT_MAX;

     sf->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] = INT_MAX;

 #endif

-    sf->thresh_mult[THR_SPLITG   ] = INT_MAX;

   if (!(cpi->ref_frame_flags & VP9_ALT_FLAG)) {

     sf->thresh_mult[THR_NEARESTA ] = INT_MAX;

     sf->thresh_mult[THR_ZEROA    ] = INT_MAX;

     sf->thresh_mult[THR_NEARA    ] = INT_MAX;

     sf->thresh_mult[THR_NEWA     ] = INT_MAX;

+    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;

 #if CONFIG_COMP_INTERINTRA_PRED

     sf->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] = INT_MAX;

     sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] = INT_MAX;

@@ -986,10 +688,10 @@

     sf->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] = INT_MAX;

     sf->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] = INT_MAX;

 #endif

-    sf->thresh_mult[THR_SPLITA   ] = INT_MAX;

-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) != (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {

+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_GOLD_FLAG)) !=

+      (VP9_LAST_FLAG | VP9_GOLD_FLAG)) {

     sf->thresh_mult[THR_COMP_ZEROLG   ] = INT_MAX;

     sf->thresh_mult[THR_COMP_NEARESTLG] = INT_MAX;

     sf->thresh_mult[THR_COMP_NEARLG   ] = INT_MAX;

@@ -996,8 +698,8 @@

     sf->thresh_mult[THR_COMP_NEWLG    ] = INT_MAX;

     sf->thresh_mult[THR_COMP_SPLITLG  ] = INT_MAX;

-  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) != (VP9_LAST_FLAG | VP9_ALT_FLAG)) {

+  if ((cpi->ref_frame_flags & (VP9_LAST_FLAG | VP9_ALT_FLAG)) !=

+      (VP9_LAST_FLAG | VP9_ALT_FLAG)) {

     sf->thresh_mult[THR_COMP_ZEROLA   ] = INT_MAX;

     sf->thresh_mult[THR_COMP_NEARESTLA] = INT_MAX;

     sf->thresh_mult[THR_COMP_NEARLA   ] = INT_MAX;

@@ -1004,8 +706,8 @@

     sf->thresh_mult[THR_COMP_NEWLA    ] = INT_MAX;

     sf->thresh_mult[THR_COMP_SPLITLA  ] = INT_MAX;

-  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) != (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {

+  if ((cpi->ref_frame_flags & (VP9_GOLD_FLAG | VP9_ALT_FLAG)) !=

+      (VP9_GOLD_FLAG | VP9_ALT_FLAG)) {

     sf->thresh_mult[THR_COMP_ZEROGA   ] = INT_MAX;

     sf->thresh_mult[THR_COMP_NEARESTGA] = INT_MAX;

     sf->thresh_mult[THR_COMP_NEARGA   ] = INT_MAX;

@@ -1012,15 +714,105 @@

     sf->thresh_mult[THR_COMP_NEWGA    ] = INT_MAX;

     sf->thresh_mult[THR_COMP_SPLITGA  ] = INT_MAX;

-#if CONFIG_COMP_INTERINTRA_PRED

-  if ((cpi->ref_frame_flags & VP9_LAST_FLAG) != VP9_LAST_FLAG) {

-    sf->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] = INT_MAX;

-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] = INT_MAX;

-    sf->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] = INT_MAX;

-    sf->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] = INT_MAX;

+}

+void vp9_set_speed_features(VP9_COMP *cpi) {

+  SPEED_FEATURES *sf = &cpi->sf;

+  int mode = cpi->compressor_speed;

+  int speed = cpi->Speed;

+  int i;

+  // Only modes 0 and 1 supported for now in experimental code basae

+  if (mode > 1)

+    mode = 1;

+  // Initialise default mode frequency sampling variables

+  for (i = 0; i < MAX_MODES; i ++) {

+    cpi->mode_check_freq[i] = 0;

+    cpi->mode_test_hit_counts[i] = 0;

+    cpi->mode_chosen_counts[i] = 0;

-#endif

+  // best quality defaults

+  sf->RD = 1;

+  sf->search_method = NSTEP;

+  sf->improved_dct = 1;

+  sf->auto_filter = 1;

+  sf->recode_loop = 1;

+  sf->quarter_pixel_search = 1;

+  sf->half_pixel_search = 1;

+  sf->iterative_sub_pixel = 1;

+  sf->no_skip_block4x4_search = 1;

+  if (cpi->oxcf.lossless)

+    sf->optimize_coefficients = 0;

+  else

+    sf->optimize_coefficients = 1;

+  sf->first_step = 0;

+  sf->max_step_search_steps = MAX_MVSEARCH_STEPS;

+  sf->static_segmentation = 1;

+  sf->splitmode_breakout = 0;

+  sf->mb16_breakout = 0;

+  switch (mode) {

+    case 0: // best quality mode

+      sf->search_best_filter = SEARCH_BEST_FILTER;

+      break;

+    case 1:

+      sf->static_segmentation = 1;

+      sf->splitmode_breakout = 1;

+      sf->mb16_breakout = 0;

+      if (speed > 0) {

+        /* Disable coefficient optimization above speed 0 */

+        sf->optimize_coefficients = 0;

+        sf->no_skip_block4x4_search = 0;

+        sf->first_step = 1;

+        cpi->mode_check_freq[THR_SPLITG] = 2;

+        cpi->mode_check_freq[THR_SPLITA] = 2;

+        cpi->mode_check_freq[THR_SPLITMV] = 0;

+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 2;

+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 2;

+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 0;

+      }

+      if (speed > 1) {

+        cpi->mode_check_freq[THR_SPLITG] = 4;

+        cpi->mode_check_freq[THR_SPLITA] = 4;

+        cpi->mode_check_freq[THR_SPLITMV] = 2;

+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 4;

+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 4;

+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 2;

+      }

+      if (speed > 2) {

+        cpi->mode_check_freq[THR_SPLITG] = 15;

+        cpi->mode_check_freq[THR_SPLITA] = 15;

+        cpi->mode_check_freq[THR_SPLITMV] = 7;

+        cpi->mode_check_freq[THR_COMP_SPLITGA] = 15;

+        cpi->mode_check_freq[THR_COMP_SPLITLG] = 15;

+        cpi->mode_check_freq[THR_COMP_SPLITLA] = 7;

+        sf->improved_dct = 0;

+        // Only do recode loop on key frames, golden frames and

+        // alt ref frames

+        sf->recode_loop = 2;

+      }

+      break;

+  }; /* switch */

+  // Set rd thresholds based on mode and speed setting

+  set_rd_speed_thresholds(cpi, mode, speed);

   // Slow quant, dct and trellis not worthwhile for first pass

   // so make sure they are always turned off.

   if (cpi->pass == 1) {

@@ -1028,36 +820,19 @@

     sf->improved_dct = 0;

-  if (cpi->sf.search_method == NSTEP) {

-    vp9_init3smotion_compensation(&cpi->mb,

-                                  cm->yv12_fb[cm->lst_fb_idx].y_stride);

-  } else if (cpi->sf.search_method == DIAMOND) {

-    vp9_init_dsmotion_compensation(&cpi->mb,

-                                   cm->yv12_fb[cm->lst_fb_idx].y_stride);

+  cpi->mb.fwd_txm16x16  = vp9_short_fdct16x16;

+  cpi->mb.fwd_txm8x8    = vp9_short_fdct8x8;

+  cpi->mb.fwd_txm8x4    = vp9_short_fdct8x4;

+  cpi->mb.fwd_txm4x4    = vp9_short_fdct4x4;

+  if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {

+    cpi->mb.fwd_txm8x4    = vp9_short_walsh8x4;

+    cpi->mb.fwd_txm4x4    = vp9_short_walsh4x4;

-  cpi->mb.vp9_short_fdct16x16 = vp9_short_fdct16x16;

-  cpi->mb.vp9_short_fdct8x8 = vp9_short_fdct8x8;

-  cpi->mb.vp9_short_fdct8x4 = vp9_short_fdct8x4;

-  cpi->mb.vp9_short_fdct4x4 = vp9_short_fdct4x4;

-  cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;

-  cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;

-#if CONFIG_LOSSLESS

-  if (cpi->oxcf.lossless) {

-    cpi->mb.vp9_short_fdct8x4 = vp9_short_walsh8x4_x8;

-    cpi->mb.vp9_short_fdct4x4 = vp9_short_walsh4x4_x8;

-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4;

-    cpi->mb.short_fhaar2x2 = vp9_short_fhaar2x2;

-    cpi->mb.short_walsh4x4 = vp9_short_walsh4x4_lossless;

-  }

-#endif

   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;

   cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;

   cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;

   cpi->mb.quantize_b_16x16    = vp9_regular_quantize_b_16x16;

-  cpi->mb.quantize_b_2x2      = vp9_regular_quantize_b_2x2;

   vp9_init_quantizer(cpi);

@@ -1078,24 +853,19 @@

   frames_at_speed[cpi->Speed]++;

 #endif

-static void alloc_raw_frame_buffers(VP9_COMP *cpi) {

-  int width = (cpi->oxcf.Width + 15) & ~15;

-  int height = (cpi->oxcf.Height + 15) & ~15;

-  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height,

+static void alloc_raw_frame_buffers(VP9_COMP *cpi) {

+  cpi->lookahead = vp9_lookahead_init(cpi->oxcf.width, cpi->oxcf.height,

                                       cpi->oxcf.lag_in_frames);

   if (!cpi->lookahead)

     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

                        "Failed to allocate lag buffers");

-#if VP9_TEMPORAL_ALT_REF

   if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer,

-                                  width, height, VP9BORDERINPIXELS))

+                                  cpi->oxcf.width, cpi->oxcf.height,

+                                  VP9BORDERINPIXELS))

     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

                        "Failed to allocate altref buffer");

-#endif

 static int alloc_partition_data(VP9_COMP *cpi) {

@@ -1115,10 +885,7 @@

 void vp9_alloc_compressor_data(VP9_COMP *cpi) {

   VP9_COMMON *cm = &cpi->common;

-  int width = cm->Width;

-  int height = cm->Height;

-  if (vp9_alloc_frame_buffers(cm, width, height))

+  if (vp9_alloc_frame_buffers(cm, cm->width, cm->height))

     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

                        "Failed to allocate frame buffers");

@@ -1126,25 +893,16 @@

     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

                        "Failed to allocate partition data");

-  if ((width & 0xf) != 0)

-    width += 16 - (width & 0xf);

-  if ((height & 0xf) != 0)

-    height += 16 - (height & 0xf);

   if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,

-                                  width, height, VP9BORDERINPIXELS))

+                                  cm->width, cm->height, VP9BORDERINPIXELS))

     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

                        "Failed to allocate last frame buffer");

   if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source,

-                                  width, height, VP9BORDERINPIXELS))

+                                  cm->width, cm->height, VP9BORDERINPIXELS))

     vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

                        "Failed to allocate scaled source buffer");

   vpx_free(cpi->tok);

@@ -1199,6 +957,48 @@

+static void update_frame_size(VP9_COMP *cpi) {

+  VP9_COMMON *cm = &cpi->common;

+  /* our internal buffers are always multiples of 16 */

+  int aligned_width = (cm->width + 15) & ~15;

+  int aligned_height = (cm->height + 15) & ~15;

+  cm->mb_rows = aligned_height >> 4;

+  cm->mb_cols = aligned_width >> 4;

+  cm->MBs = cm->mb_rows * cm->mb_cols;

+  cm->mode_info_stride = cm->mb_cols + 1;

+  memset(cm->mip, 0,

+        (cm->mb_cols + 1) * (cm->mb_rows + 1) * sizeof(MODE_INFO));

+  vp9_update_mode_info_border(cm, cm->mip);

+  cm->mi = cm->mip + cm->mode_info_stride + 1;

+  cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1;

+  vp9_update_mode_info_in_image(cm, cm->mi);

+  /* Update size of buffers local to this frame */

+  if (vp8_yv12_realloc_frame_buffer(&cpi->last_frame_uf,

+                                    cm->width, cm->height, VP9BORDERINPIXELS))

+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

+                       "Failed to reallocate last frame buffer");

+  if (vp8_yv12_realloc_frame_buffer(&cpi->scaled_source,

+                                    cm->width, cm->height, VP9BORDERINPIXELS))

+    vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,

+                       "Failed to reallocate scaled source buffer");

+  {

+    int y_stride = cpi->scaled_source.y_stride;

+    if (cpi->sf.search_method == NSTEP) {

+      vp9_init3smotion_compensation(&cpi->mb, y_stride);

+    } else if (cpi->sf.search_method == DIAMOND) {

+      vp9_init_dsmotion_compensation(&cpi->mb, y_stride);

+    }

+  }

+}

 // TODO perhaps change number of steps expose to outside world when setting

 // max and min limits. Also this will likely want refining for the extended Q

 // range.

@@ -1239,15 +1039,12 @@

     cpi->min_frame_bandwidth = FRAME_OVERHEAD_BITS;

   // Set Maximum gf/arf interval

-  cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);

+  cpi->max_gf_interval = 16;

-  if (cpi->max_gf_interval < 12)

-    cpi->max_gf_interval = 12;

   // Extended interval for genuinely static scenes

   cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;

-  // Special conditions when altr ref frame enabled in lagged compress mode

+  // Special conditions when alt ref frame enabled in lagged compress mode

   if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {

     if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)

       cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;

@@ -1260,28 +1057,45 @@

     cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;

-static int

-rescale(int val, int num, int denom) {

+static int64_t rescale(int val, int64_t num, int denom) {

   int64_t llnum = num;

   int64_t llden = denom;

   int64_t llval = val;

-  return (int)(llval * llnum / llden);

+  return (llval * llnum / llden);

+static void set_tile_limits(VP9_COMP *cpi) {

+  VP9_COMMON *const cm = &cpi->common;

+  int min_log2_tiles, max_log2_tiles;

+  cm->log2_tile_columns = cpi->oxcf.tile_columns;

+  cm->log2_tile_rows = cpi->oxcf.tile_rows;

+  vp9_get_tile_n_bits(cm, &min_log2_tiles, &max_log2_tiles);

+  max_log2_tiles += min_log2_tiles;

+  if (cm->log2_tile_columns < min_log2_tiles)

+    cm->log2_tile_columns = min_log2_tiles;

+  else if (cm->log2_tile_columns > max_log2_tiles)

+    cm->log2_tile_columns = max_log2_tiles;

+  cm->tile_columns = 1 << cm->log2_tile_columns;

+  cm->tile_rows = 1 << cm->log2_tile_rows;

+}

 static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {

   VP9_COMP *cpi = (VP9_COMP *)(ptr);

-  VP9_COMMON *cm = &cpi->common;

+  VP9_COMMON *const cm = &cpi->common;

   cpi->oxcf = *oxcf;

   cpi->goldfreq = 7;

-  cm->version = oxcf->Version;

+  cm->version = oxcf->version;

   vp9_setup_version(cm);

+  cm->width = oxcf->width;

+  cm->height = oxcf->height;

   // change includes all joint functionality

   vp9_change_config(ptr, oxcf);

@@ -1304,31 +1118,30 @@

   cpi->static_mb_pct = 0;

-#if VP9_TEMPORAL_ALT_REF

+  cpi->lst_fb_idx = 0;

+  cpi->gld_fb_idx = 1;

+  cpi->alt_fb_idx = 2;

+  set_tile_limits(cpi);

     int i;

     cpi->fixed_divide[0] = 0;

     for (i = 1; i < 512; i++)

       cpi->fixed_divide[i] = 0x80000 / i;

-#endif

 void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {

   VP9_COMP *cpi = (VP9_COMP *)(ptr);

-  VP9_COMMON *cm = &cpi->common;

+  VP9_COMMON *const cm = &cpi->common;

-  if (!cpi)

+  if (!cpi || !oxcf)

     return;

-  if (!oxcf)

-    return;

-  if (cm->version != oxcf->Version) {

-    cm->version = oxcf->Version;

+  if (cm->version != oxcf->version) {

+    cm->version = oxcf->version;

     vp9_setup_version(cm);

@@ -1351,7 +1164,6 @@

       if (cpi->oxcf.cpu_used > 5)

         cpi->oxcf.cpu_used = 5;

       break;

     case MODE_SECONDPASS_BEST:

@@ -1364,20 +1176,14 @@

   cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];

   cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];

-  cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;

-  cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_idct4x4llm;

-  cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;

-  cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;

-#if CONFIG_LOSSLESS

   cpi->oxcf.lossless = oxcf->lossless;

   if (cpi->oxcf.lossless) {

-    cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;

-    cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;

-    cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;

-    cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;

+    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;

+    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_iwalsh4x4;

+  } else {

+    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;

+    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_idct4x4;

-#endif

   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;

@@ -1385,8 +1191,8 @@

   // cpi->use_golden_frame_only = 0;

   // cpi->use_last_frame_only = 0;

-  cm->refresh_golden_frame = 0;

-  cm->refresh_last_frame = 1;

+  cpi->refresh_golden_frame = 0;

+  cpi->refresh_last_frame = 1;

   cm->refresh_entropy_probs = 1;

   setup_features(cpi);

@@ -1414,31 +1220,28 @@

   // Convert target bandwidth from Kbit/s to Bit/s

   cpi->oxcf.target_bandwidth       *= 1000;

-  cpi->oxcf.starting_buffer_level =

-    rescale(cpi->oxcf.starting_buffer_level,

-            cpi->oxcf.target_bandwidth, 1000);

+  cpi->oxcf.starting_buffer_level = rescale(cpi->oxcf.starting_buffer_level,

+                                            cpi->oxcf.target_bandwidth, 1000);

   // Set or reset optimal and maximum buffer levels.

   if (cpi->oxcf.optimal_buffer_level == 0)

     cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;

   else

-    cpi->oxcf.optimal_buffer_level =

-      rescale(cpi->oxcf.optimal_buffer_level,

-              cpi->oxcf.target_bandwidth, 1000);

+    cpi->oxcf.optimal_buffer_level = rescale(cpi->oxcf.optimal_buffer_level,

+                                             cpi->oxcf.target_bandwidth, 1000);

   if (cpi->oxcf.maximum_buffer_size == 0)

     cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;

   else

-    cpi->oxcf.maximum_buffer_size =

-      rescale(cpi->oxcf.maximum_buffer_size,

-              cpi->oxcf.target_bandwidth, 1000);

+    cpi->oxcf.maximum_buffer_size = rescale(cpi->oxcf.maximum_buffer_size,

+                                            cpi->oxcf.target_bandwidth, 1000);

   // Set up frame rate and related parameters rate control values.

   vp9_new_frame_rate(cpi, cpi->oxcf.frame_rate);

   // Set absolute upper and lower quality limits

-  cpi->worst_quality               = cpi->oxcf.worst_allowed_q;

-  cpi->best_quality                = cpi->oxcf.best_allowed_q;

+  cpi->worst_quality = cpi->oxcf.worst_allowed_q;

+  cpi->best_quality = cpi->oxcf.best_allowed_q;

   // active values should only be modified if out of new range

   if (cpi->active_worst_quality > cpi->oxcf.worst_allowed_q) {

@@ -1467,12 +1270,9 @@

   cpi->target_bandwidth = cpi->oxcf.target_bandwidth;

-  cm->Width       = cpi->oxcf.Width;

-  cm->Height      = cpi->oxcf.Height;

+  cm->display_width = cpi->oxcf.width;

+  cm->display_height = cpi->oxcf.height;

-  cm->horiz_scale  = cpi->horiz_scale;

-  cm->vert_scale   = cpi->vert_scale;

   // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)

   if (cpi->oxcf.Sharpness > 7)

     cpi->oxcf.Sharpness = 7;

@@ -1479,26 +1279,18 @@

   cm->sharpness_level = cpi->oxcf.Sharpness;

-  if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {

-    int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);

-    int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);

-    Scale2Ratio(cm->horiz_scale, &hr, &hs);

-    Scale2Ratio(cm->vert_scale, &vr, &vs);

-    // always go to the next whole number

-    cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;

-    cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;

-  }

-  if (((cm->Width + 15) & 0xfffffff0) !=

-      cm->yv12_fb[cm->lst_fb_idx].y_width ||

-      ((cm->Height + 15) & 0xfffffff0) !=

-      cm->yv12_fb[cm->lst_fb_idx].y_height ||

-      cm->yv12_fb[cm->lst_fb_idx].y_width == 0) {

+  // Increasing the size of the frame beyond the first seen frame, or some

+  // otherwise signalled maximum size, is not supported.

+  // TODO(jkoleszar): exit gracefully.

+  if (!cpi->initial_width) {

     alloc_raw_frame_buffers(cpi);

     vp9_alloc_compressor_data(cpi);

+    cpi->initial_width = cm->width;

+    cpi->initial_height = cm->height;

+  assert(cm->width <= cpi->initial_width);

+  assert(cm->height <= cpi->initial_height);

+  update_frame_size(cpi);

   if (cpi->oxcf.fixed_q >= 0) {

     cpi->last_q[0] = cpi->oxcf.fixed_q;

@@ -1526,6 +1318,7 @@

   cpi->last_frame_distortion = 0;

 #endif

+  set_tile_limits(cpi);

 #define M_LOG2_E 0.693147180559945309417

@@ -1541,15 +1334,15 @@

 static void cal_nmvsadcosts(int *mvsadcost[2]) {

   int i = 1;

-  mvsadcost [0] [0] = 0;

-  mvsadcost [1] [0] = 0;

+  mvsadcost[0][0] = 0;

+  mvsadcost[1][0] = 0;

   do {

     double z = 256 * (2 * (log2f(8 * i) + .6));

-    mvsadcost [0][i] = (int) z;

-    mvsadcost [1][i] = (int) z;

-    mvsadcost [0][-i] = (int) z;

-    mvsadcost [1][-i] = (int) z;

+    mvsadcost[0][i] = (int)z;

+    mvsadcost[1][i] = (int)z;

+    mvsadcost[0][-i] = (int)z;

+    mvsadcost[1][-i] = (int)z;

   } while (++i <= MV_MAX);

@@ -1556,15 +1349,15 @@

 static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {

   int i = 1;

-  mvsadcost [0] [0] = 0;

-  mvsadcost [1] [0] = 0;

+  mvsadcost[0][0] = 0;

+  mvsadcost[1][0] = 0;

   do {

     double z = 256 * (2 * (log2f(8 * i) + .6));

-    mvsadcost [0][i] = (int) z;

-    mvsadcost [1][i] = (int) z;

-    mvsadcost [0][-i] = (int) z;

-    mvsadcost [1][-i] = (int) z;

+    mvsadcost[0][i] = (int)z;

+    mvsadcost[1][i] = (int)z;

+    mvsadcost[0][-i] = (int)z;

+    mvsadcost[1][-i] = (int)z;

   } while (++i <= MV_MAX);

@@ -1682,6 +1475,11 @@

 #ifdef NMV_STATS

   init_nmvstats();

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+#ifdef NZC_STATS

+  init_nzcstats();

+#endif

+#endif

   /*Initialize the feed-forward activity masking.*/

   cpi->activity_avg = 90 << 12;

@@ -1693,7 +1491,7 @@

   cpi->source_alt_ref_pending = FALSE;

   cpi->source_alt_ref_active = FALSE;

-  cpi->common.refresh_alt_ref_frame = 0;

+  cpi->refresh_alt_ref_frame = 0;

   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;

 #if CONFIG_INTERNAL_STATS

@@ -1795,10 +1593,6 @@

     cpi->rd_thresh_mult[i] = 128;

-#ifdef ENTROPY_STATS

-  init_mv_ref_counts();

-#endif

 #define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \

     cpi->fn_ptr[BT].sdf            = SDF; \

     cpi->fn_ptr[BT].vf             = VF; \

@@ -1838,14 +1632,6 @@

   BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,

       NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)

-#if ARCH_X86 || ARCH_X86_64

-  cpi->fn_ptr[BLOCK_16X16].copymem  = vp9_copy32xn;

-  cpi->fn_ptr[BLOCK_16X8].copymem   = vp9_copy32xn;

-  cpi->fn_ptr[BLOCK_8X16].copymem   = vp9_copy32xn;

-  cpi->fn_ptr[BLOCK_8X8].copymem    = vp9_copy32xn;

-  cpi->fn_ptr[BLOCK_4X4].copymem    = vp9_copy32xn;

-#endif

   cpi->full_search_sad = vp9_full_search_sad;

   cpi->diamond_search_sad = vp9_diamond_search_sad;

   cpi->refining_search_sad = vp9_refining_search_sad;

@@ -1865,6 +1651,13 @@

   cpi->common.error.setjmp = 0;

   vp9_zero(cpi->y_uv_mode_count)

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_zero(cm->fc.nzc_counts_4x4);

+  vp9_zero(cm->fc.nzc_counts_8x8);

+  vp9_zero(cm->fc.nzc_counts_16x16);

+  vp9_zero(cm->fc.nzc_counts_32x32);

+  vp9_zero(cm->fc.nzc_pcat_counts);

+#endif

   return (VP9_PTR) cpi;

@@ -1885,7 +1678,7 @@

     if (cpi->pass != 1) {

       print_context_counters();

       print_tree_update_probs();

-      print_mode_context();

+      print_mode_context(&cpi->common);

 #endif

 #ifdef NMV_STATS

@@ -1892,6 +1685,12 @@

     if (cpi->pass != 1)

       print_nmvstats();

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+#ifdef NZC_STATS

+    if (cpi->pass != 1)

+      print_nzcstats();

+#endif

+#endif

 #if CONFIG_INTERNAL_STATS

@@ -1908,7 +1707,8 @@

       print_mode_contexts(&cpi->common);

 #endif

       if (cpi->b_calculate_psnr) {

-        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];

+        YV12_BUFFER_CONFIG *lst_yv12 =

+            &cpi->common.yv12_fb[cpi->common.ref_frame_map[cpi->lst_fb_idx]];

         double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;

         double total_psnr = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error);

         double total_psnr2 = vp9_mse2psnr(samples, 255.0, cpi->total_sq_error2);

@@ -2176,8 +1976,8 @@

   struct vpx_codec_cx_pkt  pkt;

   uint64_t                 sse;

   int                      i;

-  unsigned int             width = cpi->common.Width;

-  unsigned int             height = cpi->common.Height;

+  unsigned int             width = cpi->common.width;

+  unsigned int             height = cpi->common.height;

   pkt.kind = VPX_CODEC_PSNR_PKT;

   sse = calc_plane_error(orig->y_buffer, orig->y_stride,

@@ -2230,34 +2030,34 @@

   if (ref_frame_flags > 7)

     return -1;

-  cpi->common.refresh_golden_frame = 0;

-  cpi->common.refresh_alt_ref_frame = 0;

-  cpi->common.refresh_last_frame   = 0;

+  cpi->refresh_golden_frame = 0;

+  cpi->refresh_alt_ref_frame = 0;

+  cpi->refresh_last_frame   = 0;

   if (ref_frame_flags & VP9_LAST_FLAG)

-    cpi->common.refresh_last_frame = 1;

+    cpi->refresh_last_frame = 1;

   if (ref_frame_flags & VP9_GOLD_FLAG)

-    cpi->common.refresh_golden_frame = 1;

+    cpi->refresh_golden_frame = 1;

   if (ref_frame_flags & VP9_ALT_FLAG)

-    cpi->common.refresh_alt_ref_frame = 1;

+    cpi->refresh_alt_ref_frame = 1;

   return 0;

-int vp9_get_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,

-                          YV12_BUFFER_CONFIG *sd) {

+int vp9_copy_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,

+                           YV12_BUFFER_CONFIG *sd) {

   VP9_COMP *cpi = (VP9_COMP *)(ptr);

   VP9_COMMON *cm = &cpi->common;

   int ref_fb_idx;

   if (ref_frame_flag == VP9_LAST_FLAG)

-    ref_fb_idx = cm->lst_fb_idx;

+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];

   else if (ref_frame_flag == VP9_GOLD_FLAG)

-    ref_fb_idx = cm->gld_fb_idx;

+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];

   else if (ref_frame_flag == VP9_ALT_FLAG)

-    ref_fb_idx = cm->alt_fb_idx;

+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];

   else

     return -1;

@@ -2266,6 +2066,17 @@

   return 0;

+int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {

+  VP9_COMP *cpi = (VP9_COMP *)(ptr);

+  VP9_COMMON *cm = &cpi->common;

+  if (index < 0 || index >= NUM_REF_FRAMES)

+    return -1;

+  *fb = &cm->yv12_fb[cm->ref_frame_map[index]];

+  return 0;

+}

 int vp9_set_reference_enc(VP9_PTR ptr, VP9_REFFRAME ref_frame_flag,

                           YV12_BUFFER_CONFIG *sd) {

   VP9_COMP *cpi = (VP9_COMP *)(ptr);

@@ -2274,11 +2085,11 @@

   int ref_fb_idx;

   if (ref_frame_flag == VP9_LAST_FLAG)

-    ref_fb_idx = cm->lst_fb_idx;

+    ref_fb_idx = cm->ref_frame_map[cpi->lst_fb_idx];

   else if (ref_frame_flag == VP9_GOLD_FLAG)

-    ref_fb_idx = cm->gld_fb_idx;

+    ref_fb_idx = cm->ref_frame_map[cpi->gld_fb_idx];

   else if (ref_frame_flag == VP9_ALT_FLAG)

-    ref_fb_idx = cm->alt_fb_idx;

+    ref_fb_idx = cm->ref_frame_map[cpi->alt_fb_idx];

   else

     return -1;

@@ -2327,7 +2138,7 @@

 void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {

   YV12_BUFFER_CONFIG *s = cm->frame_to_show;

   uint8_t *src = s->y_buffer;

-  int h = cm->Height;

+  int h = cm->height;

   do {

     fwrite(src, s->y_width, 1,  yuv_rec_file);

@@ -2335,7 +2146,7 @@

   } while (--h);

   src = s->u_buffer;

-  h = (cm->Height + 1) / 2;

+  h = (cm->height + 1) / 2;

   do {

     fwrite(src, s->uv_width, 1,  yuv_rec_file);

@@ -2343,15 +2154,79 @@

   } while (--h);

   src = s->v_buffer;

-  h = (cm->Height + 1) / 2;

+  h = (cm->height + 1) / 2;

   do {

     fwrite(src, s->uv_width, 1, yuv_rec_file);

     src += s->uv_stride;

   } while (--h);

+  fflush(yuv_rec_file);

 #endif

+static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,

+                                   YV12_BUFFER_CONFIG *dst_fb) {

+  const int in_w = src_fb->y_crop_width;

+  const int in_h = src_fb->y_crop_height;

+  const int out_w = dst_fb->y_crop_width;

+  const int out_h = dst_fb->y_crop_height;

+  int x, y;

+  for (y = 0; y < out_h; y += 16) {

+    for (x = 0; x < out_w; x += 16) {

+      int x_q4 = x * 16 * in_w / out_w;

+      int y_q4 = y * 16 * in_h / out_h;

+      uint8_t *src, *dst;

+      int src_stride, dst_stride;

+      src = src_fb->y_buffer +

+          y * in_h / out_h * src_fb->y_stride +

+          x * in_w / out_w;

+      dst = dst_fb->y_buffer +

+          y * dst_fb->y_stride +

+          x;

+      src_stride = src_fb->y_stride;

+      dst_stride = dst_fb->y_stride;

+      vp9_convolve8(src, src_stride, dst, dst_stride,

+                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,

+                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,

+                    16, 16);

+      x_q4 >>= 1;

+      y_q4 >>= 1;

+      src_stride = src_fb->uv_stride;

+      dst_stride = dst_fb->uv_stride;

+      src = src_fb->u_buffer +

+          y / 2 * in_h / out_h * src_fb->uv_stride +

+          x / 2 * in_w / out_w;

+      dst = dst_fb->u_buffer +

+          y / 2 * dst_fb->uv_stride +

+          x / 2;

+      vp9_convolve8(src, src_stride, dst, dst_stride,

+                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,

+                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,

+                    8, 8);

+      src = src_fb->v_buffer +

+          y / 2 * in_h / out_h * src_fb->uv_stride +

+          x / 2 * in_w / out_w;

+      dst = dst_fb->v_buffer +

+          y / 2 * dst_fb->uv_stride +

+          x / 2;

+      vp9_convolve8(src, src_stride, dst, dst_stride,

+                    vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,

+                    vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,

+                    8, 8);

+    }

+  }

+  vp8_yv12_extend_frame_borders(dst_fb);

+}

 static void update_alt_ref_frame_stats(VP9_COMP *cpi) {

   VP9_COMMON *cm = &cpi->common;

@@ -2374,13 +2249,13 @@

   VP9_COMMON *cm = &cpi->common;

   // Update the Golden frame usage counts.

-  if (cm->refresh_golden_frame) {

+  if (cpi->refresh_golden_frame) {

     // Update data structure that monitors level of reference to last GF

     vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));

     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;

     // this frame refreshes means next frames don't unless specified by user

-    cm->refresh_golden_frame = 0;

+    cpi->refresh_golden_frame = 0;

     cpi->common.frames_since_golden = 0;

     // if ( cm->frame_type == KEY_FRAME )

@@ -2402,7 +2277,7 @@

     // ******** Fixed Q test code only ************

     // If we are going to use the ALT reference for the next group of frames set a flag to say so.

     if (cpi->oxcf.fixed_q >= 0 &&

-        cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame) {

+        cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {

       cpi->source_alt_ref_pending = TRUE;

       cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;

@@ -2414,7 +2289,7 @@

     if (cpi->frames_till_gf_update_due > 0)

       cpi->frames_till_gf_update_due--;

-  } else if (!cpi->common.refresh_alt_ref_frame) {

+  } else if (!cpi->refresh_alt_ref_frame) {

     // Decrement count down till next gf

     if (cpi->frames_till_gf_update_due > 0)

       cpi->frames_till_gf_update_due--;

@@ -2535,8 +2410,8 @@

   if ((cpi->sf.recode_loop == 1) ||

       ((cpi->sf.recode_loop == 2) &&

        ((cm->frame_type == KEY_FRAME) ||

-        cm->refresh_golden_frame ||

-        cm->refresh_alt_ref_frame))) {

+        cpi->refresh_golden_frame ||

+        cpi->refresh_alt_ref_frame))) {

     // General over and under shoot tests

     if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||

         ((cpi->projected_frame_size < low_limit) && (q > minq))) {

@@ -2563,86 +2438,56 @@

   return force_recode;

-static void update_reference_frames(VP9_COMMON *cm) {

-  YV12_BUFFER_CONFIG *yv12_fb = cm->yv12_fb;

+static void update_reference_frames(VP9_COMP * const cpi) {

+  VP9_COMMON * const cm = &cpi->common;

   // At this point the new frame has been encoded.

   // If any buffer copy / swapping is signaled it should be done here.

   if (cm->frame_type == KEY_FRAME) {

-    yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG | VP9_ALT_FLAG;

+    ref_cnt_fb(cm->fb_idx_ref_cnt,

+               &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);

+    ref_cnt_fb(cm->fb_idx_ref_cnt,

+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);

+  } else if (cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {

+    /* Preserve the previously existing golden frame and update the frame in

+     * the alt ref slot instead. This is highly specific to the current use of

+     * alt-ref as a forward reference, and this needs to be generalized as

+     * other uses are implemented (like RTC/temporal scaling)

+     *

+     * The update to the buffer in the alt ref slot was signalled in

+     * vp9_pack_bitstream(), now swap the buffer pointers so that it's treated

+     * as the golden frame next time.

+     */

+    int tmp;

-    yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;

-    yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;

+    ref_cnt_fb(cm->fb_idx_ref_cnt,

+               &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);

-    cm->alt_fb_idx = cm->gld_fb_idx = cm->new_fb_idx;

-  } else { /* For non key frames */

-    if (cm->refresh_alt_ref_frame) {

-      assert(!cm->copy_buffer_to_arf);

-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_ALT_FLAG;

-      cm->yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;

-      cm->alt_fb_idx = cm->new_fb_idx;

-    } else if (cm->copy_buffer_to_arf) {

-      assert(!(cm->copy_buffer_to_arf & ~0x3));

-      if (cm->copy_buffer_to_arf == 1) {

-        if (cm->alt_fb_idx != cm->lst_fb_idx) {

-          yv12_fb[cm->lst_fb_idx].flags |= VP9_ALT_FLAG;

-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;

-          cm->alt_fb_idx = cm->lst_fb_idx;

-        }

-      } else { /* if (cm->copy_buffer_to_arf == 2) */

-        if (cm->alt_fb_idx != cm->gld_fb_idx) {

-          yv12_fb[cm->gld_fb_idx].flags |= VP9_ALT_FLAG;

-          yv12_fb[cm->alt_fb_idx].flags &= ~VP9_ALT_FLAG;

-          cm->alt_fb_idx = cm->gld_fb_idx;

-        }

-      }

+    tmp = cpi->alt_fb_idx;

+    cpi->alt_fb_idx = cpi->gld_fb_idx;

+    cpi->gld_fb_idx = tmp;

+  } else { /* For non key/golden frames */

+    if (cpi->refresh_alt_ref_frame) {

+      ref_cnt_fb(cm->fb_idx_ref_cnt,

+                 &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);

-    if (cm->refresh_golden_frame) {

-      assert(!cm->copy_buffer_to_gf);

-      cm->yv12_fb[cm->new_fb_idx].flags |= VP9_GOLD_FLAG;

-      cm->yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;

-      cm->gld_fb_idx = cm->new_fb_idx;

-    } else if (cm->copy_buffer_to_gf) {

-      assert(!(cm->copy_buffer_to_arf & ~0x3));

-      if (cm->copy_buffer_to_gf == 1) {

-        if (cm->gld_fb_idx != cm->lst_fb_idx) {

-          yv12_fb[cm->lst_fb_idx].flags |= VP9_GOLD_FLAG;

-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;

-          cm->gld_fb_idx = cm->lst_fb_idx;

-        }

-      } else { /* if (cm->copy_buffer_to_gf == 2) */

-        if (cm->alt_fb_idx != cm->gld_fb_idx) {

-          yv12_fb[cm->alt_fb_idx].flags |= VP9_GOLD_FLAG;

-          yv12_fb[cm->gld_fb_idx].flags &= ~VP9_GOLD_FLAG;

-          cm->gld_fb_idx = cm->alt_fb_idx;

-        }

-      }

+    if (cpi->refresh_golden_frame) {

+      ref_cnt_fb(cm->fb_idx_ref_cnt,

+                 &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);

-  if (cm->refresh_last_frame) {

-    cm->yv12_fb[cm->new_fb_idx].flags |= VP9_LAST_FLAG;

-    cm->yv12_fb[cm->lst_fb_idx].flags &= ~VP9_LAST_FLAG;

-    cm->lst_fb_idx = cm->new_fb_idx;

+  if (cpi->refresh_last_frame) {

+    ref_cnt_fb(cm->fb_idx_ref_cnt,

+               &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);

 static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {

-  if (cm->no_lpf) {

+  if (cm->no_lpf || cpi->mb.e_mbd.lossless) {

     cm->filter_level = 0;

-  }

-#if CONFIG_LOSSLESS

-  else if (cpi->oxcf.lossless) {

-    cm->filter_level = 0;

-  }

-#endif

-  else {

+  } else {

     struct vpx_usec_timer timer;

     vp9_clear_system_state();

@@ -2659,7 +2504,8 @@

   if (cm->filter_level > 0) {

     vp9_set_alt_lf_level(cpi, cm->filter_level);

-    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0);

+    vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level, 0,

+                          cm->dering_enabled);

   vp8_yv12_extend_frame_borders(cm->frame_to_show);

@@ -2666,7 +2512,7 @@

-void select_interp_filter_type(VP9_COMP *cpi) {

+void vp9_select_interp_filter_type(VP9_COMP *cpi) {

   int i;

   int high_filter_index = 0;

   unsigned int thresh;

@@ -2719,6 +2565,38 @@

 #endif

+static void scale_references(VP9_COMP *cpi) {

+  VP9_COMMON *cm = &cpi->common;

+  int i;

+  for (i = 0; i < 3; i++) {

+    YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[i]];

+    if (ref->y_crop_width != cm->width ||

+        ref->y_crop_height != cm->height) {

+      int new_fb = get_free_fb(cm);

+      vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[new_fb],

+                                    cm->width, cm->height,

+                                    VP9BORDERINPIXELS);

+      scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);

+      cpi->scaled_ref_idx[i] = new_fb;

+    } else {

+      cpi->scaled_ref_idx[i] = cm->ref_frame_map[i];

+      cm->fb_idx_ref_cnt[cm->ref_frame_map[i]]++;

+    }

+  }

+}

+static void release_scaled_references(VP9_COMP *cpi) {

+  VP9_COMMON *cm = &cpi->common;

+  int i;

+  for (i = 0; i < 3; i++) {

+    cm->fb_idx_ref_cnt[cpi->scaled_ref_idx[i]]--;

+  }

+}

 static void encode_frame_to_data_rate(VP9_COMP *cpi,

                                       unsigned long *size,

                                       unsigned char *dest,

@@ -2735,8 +2613,6 @@

   int q_low;

   int q_high;

-  int zbin_oq_high;

-  int zbin_oq_low = 0;

   int top_index;

   int bottom_index;

@@ -2749,11 +2625,7 @@

 #if RESET_FOREACH_FILTER

   int q_low0;

   int q_high0;

-  int zbin_oq_high0;

-  int zbin_oq_low0 = 0;

   int Q0;

-  int last_zbin_oq;

-  int last_zbin_oq0;

   int active_best_quality0;

   int active_worst_quality0;

   double rate_correction_factor0;

@@ -2773,6 +2645,17 @@

   int mcomp_filter_index = 0;

   int64_t mcomp_filter_cost[4];

+  /* Scale the source buffer, if required */

+  if (cm->mb_cols * 16 != cpi->un_scaled_source->y_width ||

+      cm->mb_rows * 16 != cpi->un_scaled_source->y_height) {

+    scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source);

+    cpi->Source = &cpi->scaled_source;

+  } else {

+    cpi->Source = cpi->un_scaled_source;

+  }

+  scale_references(cpi);

   // Clear down mmx registers to allow floating point in what follows

   vp9_clear_system_state();

@@ -2779,7 +2662,7 @@

   // For an alt ref frame in 2 pass we skip the call to the second

   // pass function that sets the target bandwidth so must set it here

-  if (cpi->common.refresh_alt_ref_frame) {

+  if (cpi->refresh_alt_ref_frame) {

     cpi->per_frame_bandwidth = cpi->twopass.gf_bits;                           // Per frame bit target for the alt ref frame

     // per second target bitrate

     cpi->target_bandwidth = (int)(cpi->twopass.gf_bits *

@@ -2786,12 +2669,7 @@

                                   cpi->output_frame_rate);

-  // Default turn off buffer to buffer copying

-  cm->copy_buffer_to_gf = 0;

-  cm->copy_buffer_to_arf = 0;

   // Clear zbin over-quant value and mode boost values.

-  cpi->zbin_over_quant = 0;

   cpi->zbin_mode_boost = 0;

   // Enable or disable mode based tweaking of the zbin

@@ -2798,14 +2676,11 @@

   // For 2 Pass Only used where GF/ARF prediction quality

   // is above a threshold

   cpi->zbin_mode_boost = 0;

-#if CONFIG_LOSSLESS

-  cpi->zbin_mode_boost_enabled = FALSE;

-#else

-  cpi->zbin_mode_boost_enabled = TRUE;

-#endif

-  if (cpi->gfu_boost <= 400) {

+  // if (cpi->oxcf.lossless)

     cpi->zbin_mode_boost_enabled = FALSE;

-  }

+  // else

+  //   cpi->zbin_mode_boost_enabled = TRUE;

   // Current default encoder behaviour for the altref sign bias

   if (cpi->source_alt_ref_active)

@@ -2846,10 +2721,22 @@

     for (i = 0; i < MAX_MODES; i++) {

       cpi->rd_thresh_mult[i] = 128;

+    cm->error_resilient_mode = (cpi->oxcf.error_resilient_mode != 0);

+    cm->frame_parallel_decoding_mode =

+      (cpi->oxcf.frame_parallel_decoding_mode != 0);

+    if (cm->error_resilient_mode) {

+      cm->frame_parallel_decoding_mode = 1;

+      cm->refresh_entropy_probs = 0;

+    }

-  // Test code for new segment features

-  init_seg_features(cpi);

+  // Configure use of segmentation for enhanced coding of static regions.

+  // Only allowed for now in second pass of two pass (as requires lagged coding)

+  // and if the relevent speed feature flag is set.

+  if ((cpi->pass == 2) && (cpi->sf.static_segmentation)) {

+    configure_static_seg_features(cpi);

+  }

   // Decide how big to make the frame

   vp9_pick_frame_size(cpi);

@@ -2896,9 +2783,7 @@

       if (cpi->active_best_quality < cpi->best_quality)

         cpi->active_best_quality = cpi->best_quality;

-  }

-  else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) {

+  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {

     int high = 2000;

     int low = 400;

@@ -2935,7 +2820,15 @@

         cpi->active_best_quality * 15 / 16;

   } else {

+#ifdef ONE_SHOT_Q_ESTIMATE

+#ifdef STRICT_ONE_SHOT_Q

+    cpi->active_best_quality = Q;

+#else

     cpi->active_best_quality = inter_minq[Q];

+#endif

+#else

+    cpi->active_best_quality = inter_minq[Q];

+#endif

     // For the constant/constrained quality mode we dont want

     // q to fall below the cq level.

@@ -2971,19 +2864,8 @@

     // Determine initial Q to try

     Q = vp9_regulate_q(cpi, cpi->this_frame_target);

-#if RESET_FOREACH_FILTER

-  last_zbin_oq = cpi->zbin_over_quant;

-#endif

-  // Set highest allowed value for Zbin over quant

-  if (cm->frame_type == KEY_FRAME)

-    zbin_oq_high = 0; // ZBIN_OQ_MAX/16

-  else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))

-    zbin_oq_high = 16;

-  else

-    zbin_oq_high = ZBIN_OQ_MAX;

-  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,

+  vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,

                                 &frame_over_shoot_limit);

   // Limit Q range for the adaptive loop.

@@ -3016,7 +2898,6 @@

 #if CONFIG_POSTPROC

   if (cpi->oxcf.noise_sensitivity > 0) {

-    uint8_t *src;

     int l = 0;

     switch (cpi->oxcf.noise_sensitivity) {

@@ -3030,7 +2911,6 @@

         l = 60;

         break;

       case 4:

       case 5:

         l = 100;

         break;

@@ -3039,18 +2919,7 @@

         break;

-    if (cm->frame_type == KEY_FRAME) {

-      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0);

-    } else {

-      vp9_de_noise(cpi->Source, cpi->Source, l, 1,  0);

-      src = cpi->Source->y_buffer;

-      if (cpi->Source->y_stride < 0) {

-        src += cpi->Source->y_stride * (cpi->Source->y_height - 1);

-      }

-    }

+    vp9_denoise(cpi->Source, cpi->Source, l, 1, 0);

 #endif

@@ -3064,9 +2933,6 @@

     q_low0 = q_low;

     q_high0 = q_high;

     Q0 = Q;

-    zbin_oq_low0 = zbin_oq_low;

-    zbin_oq_high0 = zbin_oq_high;

-    last_zbin_oq0 = last_zbin_oq;

     rate_correction_factor0 = cpi->rate_correction_factor;

     gf_rate_correction_factor0 = cpi->gf_rate_correction_factor;

     active_best_quality0 = cpi->active_best_quality;

@@ -3087,12 +2953,12 @@

           cm->mbskip_pred_probs[k] = cpi->base_skip_false_prob[Q][k];

         if (cm->frame_type != KEY_FRAME) {

-          if (cpi->common.refresh_alt_ref_frame) {

+          if (cpi->refresh_alt_ref_frame) {

             for (k = 0; k < MBSKIP_CONTEXTS; k++) {

               if (cpi->last_skip_false_probs[2][k] != 0)

                 cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[2][k];

-          } else if (cpi->common.refresh_golden_frame) {

+          } else if (cpi->refresh_golden_frame) {

             for (k = 0; k < MBSKIP_CONTEXTS; k++) {

               if (cpi->last_skip_false_probs[1][k] != 0)

                 cm->mbskip_pred_probs[k] = cpi->last_skip_false_probs[1][k];

@@ -3124,13 +2990,28 @@

       // Set up entropy depending on frame type.

-      if (cm->frame_type == KEY_FRAME)

+      if (cm->frame_type == KEY_FRAME) {

+        /* Choose which entropy context to use. When using a forward reference

+	 * frame, it immediately follows the keyframe, and thus benefits from

+	 * using the same entropy context established by the keyframe. Otherwise,

+	 * use the default context 0.

+	 */

+        cm->frame_context_idx = cpi->oxcf.play_alternate;

         vp9_setup_key_frame(cpi);

-      else

+      } else {

+	/* Choose which entropy context to use. Currently there are only two

+	 * contexts used, one for normal frames and one for alt ref frames.

+	 */

+        cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;

         vp9_setup_inter_frame(cpi);

+      }

     // transform / motion compensation build reconstruction frame

+#if CONFIG_MODELCOEFPROB && ADJUST_KF_COEF_PROBS

+    if (cm->frame_type == KEY_FRAME)

+      vp9_adjust_default_coef_probs(cm);

+#endif

     vp9_encode_frame(cpi);

@@ -3214,9 +3095,6 @@

       if (cpi->projected_frame_size > cpi->this_frame_target) {

         q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value

-        if (cpi->zbin_over_quant > 0)            // If we are using over quant do the same for zbin_oq_low

-          zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;

         if (undershoot_seen || (loop_count > 1)) {

           // Update rate_correction_factor unless cpi->active_worst_quality has changed.

           if (!active_worst_qchanged)

@@ -3223,14 +3101,6 @@

             vp9_update_rate_correction_factors(cpi, 1);

           Q = (q_high + q_low + 1) / 2;

-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)

-          if (Q < MAXQ)

-            cpi->zbin_over_quant = 0;

-          else {

-            zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;

-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;

-          }

         } else {

           // Update rate_correction_factor unless cpi->active_worst_quality has changed.

           if (!active_worst_qchanged)

@@ -3238,7 +3108,7 @@

           Q = vp9_regulate_q(cpi, cpi->this_frame_target);

-          while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10)) {

+          while ((Q < q_low) && (Retries < 10)) {

             vp9_update_rate_correction_factors(cpi, 0);

             Q = vp9_regulate_q(cpi, cpi->this_frame_target);

             Retries++;

@@ -3249,10 +3119,7 @@

       // Frame is too small

       else {

-        if (cpi->zbin_over_quant == 0)

-          q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant

-        else                                    // else lower zbin_oq_high

-          zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;

+        q_high = (Q > q_low) ? (Q - 1) : q_low;

         if (overshoot_seen || (loop_count > 1)) {

           // Update rate_correction_factor unless cpi->active_worst_quality has changed.

@@ -3260,12 +3127,6 @@

             vp9_update_rate_correction_factors(cpi, 1);

           Q = (q_high + q_low) / 2;

-          // Adjust cpi->zbin_over_quant (only allowed when Q is max)

-          if (Q < MAXQ)

-            cpi->zbin_over_quant = 0;

-          else

-            cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;

         } else {

           // Update rate_correction_factor unless cpi->active_worst_quality has changed.

           if (!active_worst_qchanged)

@@ -3282,7 +3143,7 @@

             q_low = Q;

-          while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10)) {

+          while ((Q > q_high) && (Retries < 10)) {

             vp9_update_rate_correction_factors(cpi, 0);

             Q = vp9_regulate_q(cpi, cpi->this_frame_target);

             Retries++;

@@ -3293,21 +3154,9 @@

       // Clamp Q to upper and lower limits:

-      if (Q > q_high)

-        Q = q_high;

-      else if (Q < q_low)

-        Q = q_low;

+      Q = clamp(Q, q_low, q_high);

-      // Clamp cpi->zbin_over_quant

-      cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ?

-          zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ?

-          zbin_oq_high : cpi->zbin_over_quant;

-      // Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;

-      Loop = ((Q != last_q)) ? TRUE : FALSE;

-#if RESET_FOREACH_FILTER

-      last_zbin_oq = cpi->zbin_over_quant;

-#endif

+      Loop = Q != last_q;

     } else

       Loop = FALSE;

@@ -3351,12 +3200,9 @@

         if (Loop == TRUE) {

           overshoot_seen = FALSE;

           undershoot_seen = FALSE;

-          zbin_oq_low = zbin_oq_low0;

-          zbin_oq_high = zbin_oq_high0;

           q_low = q_low0;

           q_high = q_high0;

           Q = Q0;

-          cpi->zbin_over_quant = last_zbin_oq = last_zbin_oq0;

           cpi->rate_correction_factor = rate_correction_factor0;

           cpi->gf_rate_correction_factor = gf_rate_correction_factor0;

           cpi->active_best_quality = active_best_quality0;

@@ -3412,12 +3258,18 @@

   vp9_update_gf_useage_maps(cpi, cm, &cpi->mb);

   if (cm->frame_type == KEY_FRAME)

-    cm->refresh_last_frame = 1;

+    cpi->refresh_last_frame = 1;

 #if 0

     FILE *f = fopen("gfactive.stt", "a");

-    fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);

+    fprintf(f, "%8d %8d %8d %8d %8d\n",

+            cm->current_video_frame,

+            (100 * cpi->gf_active_count)

+              / (cpi->common.mb_rows * cpi->common.mb_cols),

+            cpi->this_iiratio,

+            cpi->next_iiratio,

+            cpi->refresh_golden_frame);

     fclose(f);

 #endif

@@ -3444,18 +3296,19 @@

     update_reference_segmentation_map(cpi);

-  update_reference_frames(cm);

+  release_scaled_references(cpi);

+  update_reference_frames(cpi);

   vp9_copy(cpi->common.fc.coef_counts_4x4, cpi->coef_counts_4x4);

-  vp9_copy(cpi->common.fc.hybrid_coef_counts_4x4,

-           cpi->hybrid_coef_counts_4x4);

   vp9_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);

-  vp9_copy(cpi->common.fc.hybrid_coef_counts_8x8,

-           cpi->hybrid_coef_counts_8x8);

   vp9_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);

-  vp9_copy(cpi->common.fc.hybrid_coef_counts_16x16,

-           cpi->hybrid_coef_counts_16x16);

   vp9_copy(cpi->common.fc.coef_counts_32x32, cpi->coef_counts_32x32);

-  vp9_adapt_coef_probs(&cpi->common);

+  if (!cpi->common.error_resilient_mode &&

+      !cpi->common.frame_parallel_decoding_mode) {

+    vp9_adapt_coef_probs(&cpi->common);

+#if CONFIG_CODE_NONZEROCOUNT

+    vp9_adapt_nzc_probs(&cpi->common);

+#endif

+  }

   if (cpi->common.frame_type != KEY_FRAME) {

     vp9_copy(cpi->common.fc.sb_ymode_counts, cpi->sb_ymode_count);

     vp9_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);

@@ -3467,14 +3320,13 @@

 #if CONFIG_COMP_INTERINTRA_PRED

     vp9_copy(cpi->common.fc.interintra_counts, cpi->interintra_count);

 #endif

-    vp9_adapt_mode_probs(&cpi->common);

     cpi->common.fc.NMVcount = cpi->NMVcount;

-    /*

-    printf("2: %d %d %d %d\n", cpi->NMVcount.joints[0], cpi->NMVcount.joints[1],

-                      cpi->NMVcount.joints[2], cpi->NMVcount.joints[3]);

-                      */

-    vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);

+    if (!cpi->common.error_resilient_mode &&

+        !cpi->common.frame_parallel_decoding_mode) {

+      vp9_adapt_mode_probs(&cpi->common);

+      vp9_adapt_mode_context(&cpi->common);

+      vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);

+    }

 #if CONFIG_COMP_INTERINTRA_PRED

   if (cm->frame_type != KEY_FRAME)

@@ -3502,8 +3354,8 @@

   if ((cm->base_qindex < cpi->last_boosted_qindex) ||

       ((cpi->static_mb_pct < 100) &&

        ((cm->frame_type == KEY_FRAME) ||

-        cm->refresh_alt_ref_frame ||

-        (cm->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {

+        cpi->refresh_alt_ref_frame ||

+        (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {

     cpi->last_boosted_qindex = cm->base_qindex;

@@ -3516,7 +3368,8 @@

     cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;

   // Keep a record from which we can calculate the average Q excluding GF updates and key frames

-  if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) {

+  if ((cm->frame_type != KEY_FRAME)

+      && !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {

     cpi->ni_frames++;

     cpi->tot_q += vp9_convert_qindex_to_q(Q);

     cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;

@@ -3538,11 +3391,19 @@

   if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)

     cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;

-  // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass.

-  cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;

-  cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;

-  cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;

-  cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;

+  // Rolling monitors of whether we are over or underspending used to help

+  // regulate min and Max Q in two pass.

+  if (cm->frame_type != KEY_FRAME) {

+    cpi->rolling_target_bits =

+      ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;

+    cpi->rolling_actual_bits =

+      ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;

+    cpi->long_rolling_target_bits =

+      ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;

+    cpi->long_rolling_actual_bits =

+      ((cpi->long_rolling_actual_bits * 31) +

+       cpi->projected_frame_size + 16) / 32;

+  }

   // Actual bits spent

   cpi->total_actual_bits    += cpi->projected_frame_size;

@@ -3558,7 +3419,7 @@

     if (cpi->twopass.kf_group_bits < 0)

       cpi->twopass.kf_group_bits = 0;

-  } else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame) {

+  } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {

     cpi->twopass.gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;

     if (cpi->twopass.gf_group_bits < 0)

@@ -3569,7 +3430,7 @@

   // in this frame.

   update_base_skip_probs(cpi);

-#if 0// 1 && CONFIG_INTERNAL_STATS

+#if 0  // 1 && CONFIG_INTERNAL_STATS

     FILE *f = fopen("tmp.stt", "a");

     int recon_err;

@@ -3582,13 +3443,13 @@

     if (cpi->twopass.total_left_stats->coded_error != 0.0)

       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"

               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"

-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"

+              "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"

               "%10.3f %8d %10d %10d %10d\n",

               cpi->common.current_video_frame, cpi->this_frame_target,

               cpi->projected_frame_size, 0, //loop_size_estimate,

               (cpi->projected_frame_size - cpi->this_frame_target),

               (int)cpi->total_target_vs_actual,

-              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),

+              (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),

               (int)cpi->total_actual_bits,

               vp9_convert_qindex_to_q(cm->base_qindex),

               (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,

@@ -3597,9 +3458,8 @@

               cpi->avg_q,

               vp9_convert_qindex_to_q(cpi->ni_av_qi),

               vp9_convert_qindex_to_q(cpi->cq_target_quality),

-              cpi->zbin_over_quant,

-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,

-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,

+              cpi->refresh_last_frame,

+              cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,

               cm->frame_type, cpi->gfu_boost,

               cpi->twopass.est_max_qcorrection_factor,

               (int)cpi->twopass.bits_left,

@@ -3611,7 +3471,7 @@

     else

       fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d"

               "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"

-              "%6d %5d %5d %5d %8d %8.2f %10d %10.3f"

+              "%5d %5d %5d %8d %8d %8.2f %10d %10.3f"

               "%8d %10d %10d %10d\n",

               cpi->common.current_video_frame,

               cpi->this_frame_target, cpi->projected_frame_size,

@@ -3618,7 +3478,7 @@

               0, //loop_size_estimate,

               (cpi->projected_frame_size - cpi->this_frame_target),

               (int)cpi->total_target_vs_actual,

-              (cpi->oxcf.starting_buffer_level - cpi->bits_off_target),

+              (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),

               (int)cpi->total_actual_bits,

               vp9_convert_qindex_to_q(cm->base_qindex),

               (double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,

@@ -3627,9 +3487,8 @@

               cpi->avg_q,

               vp9_convert_qindex_to_q(cpi->ni_av_qi),

               vp9_convert_qindex_to_q(cpi->cq_target_quality),

-              cpi->zbin_over_quant,

-              // cpi->avg_frame_qindex, cpi->zbin_over_quant,

-              cm->refresh_golden_frame, cm->refresh_alt_ref_frame,

+              cpi->refresh_last_frame,

+              cpi->refresh_golden_frame, cpi->refresh_alt_ref_frame,

               cm->frame_type, cpi->gfu_boost,

               cpi->twopass.est_max_qcorrection_factor,

               (int)cpi->twopass.bits_left,

@@ -3645,8 +3504,8 @@

       fprintf(fmodes, "%6d:%1d:%1d:%1d ",

               cpi->common.current_video_frame,

-              cm->frame_type, cm->refresh_golden_frame,

-              cm->refresh_alt_ref_frame);

+              cm->frame_type, cpi->refresh_golden_frame,

+              cpi->refresh_alt_ref_frame);

       for (i = 0; i < MAX_MODES; i++)

         fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);

@@ -3665,33 +3524,34 @@

 #endif

   // If this was a kf or Gf note the Q

-  if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)

+  if ((cm->frame_type == KEY_FRAME)

+      || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)

     cm->last_kf_gf_q = cm->base_qindex;

-  if (cm->refresh_golden_frame == 1)

+  if (cpi->refresh_golden_frame == 1)

     cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;

   else

     cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;

-  if (cm->refresh_alt_ref_frame == 1)

+  if (cpi->refresh_alt_ref_frame == 1)

     cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;

   else

     cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;

-  if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed

+  if (cpi->refresh_last_frame & cpi->refresh_golden_frame)

     cpi->gold_is_last = 1;

-  else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other

+  else if (cpi->refresh_last_frame ^ cpi->refresh_golden_frame)

     cpi->gold_is_last = 0;

-  if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed

+  if (cpi->refresh_last_frame & cpi->refresh_alt_ref_frame)

     cpi->alt_is_last = 1;

-  else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other

+  else if (cpi->refresh_last_frame ^ cpi->refresh_alt_ref_frame)

     cpi->alt_is_last = 0;

-  if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed

+  if (cpi->refresh_alt_ref_frame & cpi->refresh_golden_frame)

     cpi->gold_is_alt = 1;

-  else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other

+  else if (cpi->refresh_alt_ref_frame ^ cpi->refresh_golden_frame)

     cpi->gold_is_alt = 0;

   cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;

@@ -3705,7 +3565,8 @@

   if (cpi->gold_is_alt)

     cpi->ref_frame_flags &= ~VP9_ALT_FLAG;

-  if (cpi->oxcf.play_alternate && cm->refresh_alt_ref_frame && (cm->frame_type != KEY_FRAME))

+  if (cpi->oxcf.play_alternate && cpi->refresh_alt_ref_frame

+      && (cm->frame_type != KEY_FRAME))

     // Update the alternate reference frame stats as appropriate.

     update_alt_ref_frame_stats(cpi);

   else

@@ -3727,6 +3588,9 @@

   xd->update_mb_segmentation_data = 0;

   xd->mode_ref_lf_delta_update = 0;

+  // keep track of the last coded dimensions

+  cm->last_width = cm->width;

+  cm->last_height = cm->height;

   // Dont increment frame counters if this was an altref buffer update not a real frame

   if (cm->show_frame) {

@@ -3744,8 +3608,9 @@

     FILE *recon_file;

     sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);

     recon_file = fopen(filename, "wb");

-    fwrite(cm->yv12_fb[cm->lst_fb_idx].buffer_alloc,

-           cm->yv12_fb[cm->lst_fb_idx].frame_size, 1, recon_file);

+    fwrite(cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].buffer_alloc,

+           cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]].frame_size,

+           1, recon_file);

     fclose(recon_file);

 #endif

@@ -3765,13 +3630,18 @@

 static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,

                         unsigned char *dest, unsigned int *frame_flags) {

-  if (!cpi->common.refresh_alt_ref_frame)

+  if (!cpi->refresh_alt_ref_frame)

     vp9_second_pass(cpi);

   encode_frame_to_data_rate(cpi, size, dest, frame_flags);

+#ifdef DISABLE_RC_LONG_TERM_MEM

+  cpi->twopass.bits_left -=  cpi->this_frame_target;

+#else

   cpi->twopass.bits_left -= 8 * *size;

+#endif

-  if (!cpi->common.refresh_alt_ref_frame) {

+  if (!cpi->refresh_alt_ref_frame) {

     double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.frame_rate;

     double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth

                                         * cpi->oxcf.two_pass_vbrmin_section / 100);

@@ -3808,9 +3678,8 @@

   const VP9_COMMON *cm = &cpi->common;

   const MACROBLOCKD *xd = &cpi->mb.e_mbd;

-  return cm->frame_type == KEY_FRAME || cm->refresh_last_frame

-         || cm->refresh_golden_frame || cm->refresh_alt_ref_frame

-         || cm->copy_buffer_to_gf || cm->copy_buffer_to_arf

+  return cm->frame_type == KEY_FRAME || cpi->refresh_last_frame

+         || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame

          || cm->refresh_entropy_probs

          || xd->mode_ref_lf_delta_update

          || xd->update_mb_segmentation_map || xd->update_mb_segmentation_data;

@@ -3846,9 +3715,9 @@

         force_src_buffer = &cpi->alt_ref_buffer;

       cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;

-      cm->refresh_alt_ref_frame = 1;

-      cm->refresh_golden_frame = 0;

-      cm->refresh_last_frame = 0;

+      cpi->refresh_alt_ref_frame = 1;

+      cpi->refresh_golden_frame = 0;

+      cpi->refresh_last_frame = 0;

       cm->show_frame = 0;

       cpi->source_alt_ref_pending = FALSE;   // Clear Pending altf Ref flag.

       cpi->is_src_frame_alt_ref = 0;

@@ -3862,8 +3731,10 @@

       cpi->is_src_frame_alt_ref = cpi->alt_ref_source

                                   && (cpi->source == cpi->alt_ref_source);

-      if (cpi->is_src_frame_alt_ref)

+      if (cpi->is_src_frame_alt_ref) {

+        cpi->refresh_last_frame = 0;

         cpi->alt_ref_source = NULL;

+      }

@@ -3889,7 +3760,7 @@

   // adjust frame rates based on timestamps given

-  if (!cm->refresh_alt_ref_frame) {

+  if (!cpi->refresh_alt_ref_frame) {

     int64_t this_duration;

     int step = 0;

@@ -3945,28 +3816,34 @@

 #if 0

-  if (cm->refresh_alt_ref_frame) {

-    // cm->refresh_golden_frame = 1;

-    cm->refresh_golden_frame = 0;

-    cm->refresh_last_frame = 0;

+  if (cpi->refresh_alt_ref_frame) {

+    // cpi->refresh_golden_frame = 1;

+    cpi->refresh_golden_frame = 0;

+    cpi->refresh_last_frame = 0;

   } else {

-    cm->refresh_golden_frame = 0;

-    cm->refresh_last_frame = 1;

+    cpi->refresh_golden_frame = 0;

+    cpi->refresh_last_frame = 1;

 #endif

-  /* find a free buffer for the new frame */

-  {

-    int i = 0;

-    for (; i < NUM_YV12_BUFFERS; i++) {

-      if (!cm->yv12_fb[i].flags) {

-        cm->new_fb_idx = i;

-        break;

-      }

-    }

-    assert(i < NUM_YV12_BUFFERS);

-  }

+  /* find a free buffer for the new frame, releasing the reference previously

+   * held.

+   */

+  cm->fb_idx_ref_cnt[cm->new_fb_idx]--;

+  cm->new_fb_idx = get_free_fb(cm);

+  /* Get the mapping of L/G/A to the reference buffer pool */

+  cm->active_ref_idx[0] = cm->ref_frame_map[cpi->lst_fb_idx];

+  cm->active_ref_idx[1] = cm->ref_frame_map[cpi->gld_fb_idx];

+  cm->active_ref_idx[2] = cm->ref_frame_map[cpi->alt_fb_idx];

+  /* Reset the frame pointers to the current frame size */

+  vp8_yv12_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],

+                                cm->width, cm->height,

+                                VP9BORDERINPIXELS);

+  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);

   if (cpi->pass == 1) {

     Pass1Encode(cpi, size, dest, frame_flags);

   } else if (cpi->pass == 2) {

@@ -3976,21 +3853,19 @@

   if (cm->refresh_entropy_probs) {

-    if (cm->refresh_alt_ref_frame)

-      vpx_memcpy(&cm->lfc_a, &cm->fc, sizeof(cm->fc));

-    else

-      vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));

+    vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc,

+               sizeof(cm->fc));

-  // if its a dropped frame honor the requests on subsequent frames

   if (*size > 0) {

+    // if its a dropped frame honor the requests on subsequent frames

     cpi->droppable = !frame_is_reference(cpi);

     // return to normal state

     cm->refresh_entropy_probs = 1;

-    cm->refresh_alt_ref_frame = 0;

-    cm->refresh_golden_frame = 0;

-    cm->refresh_last_frame = 1;

+    cpi->refresh_alt_ref_frame = 0;

+    cpi->refresh_golden_frame = 0;

+    cpi->refresh_last_frame = 1;

     cm->frame_type = INTER_FRAME;

@@ -4113,7 +3988,7 @@

                               vp9_ppflags_t *flags) {

   VP9_COMP *cpi = (VP9_COMP *) comp;

-  if (cpi->common.refresh_alt_ref_frame)

+  if (!cpi->common.show_frame)

     return -1;

   else {

     int ret;

@@ -4123,9 +3998,9 @@

     if (cpi->common.frame_to_show) {

       *dest = *cpi->common.frame_to_show;

-      dest->y_width = cpi->common.Width;

-      dest->y_height = cpi->common.Height;

-      dest->uv_height = cpi->common.Height / 2;

+      dest->y_width = cpi->common.width;

+      dest->y_height = cpi->common.height;

+      dest->uv_height = cpi->common.height / 2;

       ret = 0;

     } else {

       ret = -1;

@@ -4217,17 +4092,25 @@

 int vp9_set_internal_size(VP9_PTR comp,

                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {

   VP9_COMP *cpi = (VP9_COMP *) comp;

+  VP9_COMMON *cm = &cpi->common;

+  int hr = 0, hs = 0, vr = 0, vs = 0;

-  if (horiz_mode <= ONETWO)

-    cpi->common.horiz_scale = horiz_mode;

-  else

+  if (horiz_mode > ONETWO)

     return -1;

-  if (vert_mode <= ONETWO)

-    cpi->common.vert_scale  = vert_mode;

-  else

+  if (vert_mode > ONETWO)

     return -1;

+  Scale2Ratio(horiz_mode, &hr, &hs);

+  Scale2Ratio(vert_mode, &vr, &vs);

+  // always go to the next whole number

+  cm->width = (hs - 1 + cpi->oxcf.width * hr) / hs;

+  cm->height = (vs - 1 + cpi->oxcf.height * vr) / vs;

+  assert(cm->width <= cpi->initial_width);

+  assert(cm->height <= cpi->initial_height);

+  update_frame_size(cpi);

   return 0;

@@ -4235,16 +4118,17 @@

 int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest) {

   int i, j;

-  int Total = 0;

+  int total = 0;

   uint8_t *src = source->y_buffer;

   uint8_t *dst = dest->y_buffer;

-  // Loop through the Y plane raw and reconstruction data summing (square differences)

+  // Loop through the Y plane raw and reconstruction data summing

+  // (square differences)

   for (i = 0; i < source->y_height; i += 16) {

     for (j = 0; j < source->y_width; j += 16) {

       unsigned int sse;

-      Total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,

+      total += vp9_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride,

                             &sse);

@@ -4252,7 +4136,7 @@

     dst += 16 * dest->y_stride;

-  return Total;

+  return total;

--- a/vp9/encoder/vp9_onyx_int.h

+++ b/vp9/encoder/vp9_onyx_int.h

@@ -29,6 +29,11 @@

 #include "vp9/common/vp9_findnearmv.h"

 #include "vp9/encoder/vp9_lookahead.h"

+// Experimental rate control switches

+// #define ONE_SHOT_Q_ESTIMATE 1

+// #define STRICT_ONE_SHOT_Q 1

+// #define DISABLE_RC_LONG_TERM_MEM 1

 // #define SPEEDSTATS 1

 #define MIN_GF_INTERVAL             4

 #define DEFAULT_GF_INTERVAL         7

@@ -37,10 +42,6 @@

 #define MAX_LAG_BUFFERS 25

-#define AF_THRESH   25

-#define AF_THRESH2  100

-#define ARF_DECAY_THRESH 12

 #if CONFIG_COMP_INTERINTRA_PRED

 #define MAX_MODES 54

 #else

@@ -50,13 +51,12 @@

 #define MIN_THRESHMULT  32

 #define MAX_THRESHMULT  512

-#define GF_ZEROMV_ZBIN_BOOST 12

-#define LF_ZEROMV_ZBIN_BOOST 6

-#define MV_ZBIN_BOOST        4

-#define ZBIN_OQ_MAX 192

+#define GF_ZEROMV_ZBIN_BOOST 0

+#define LF_ZEROMV_ZBIN_BOOST 0

+#define MV_ZBIN_BOOST        0

+#define SPLIT_MV_ZBIN_BOOST  0

+#define INTRA_ZBIN_BOOST     0

-#define VP9_TEMPORAL_ALT_REF 1

 typedef struct {

   nmv_context nmvc;

   int nmvjointcost[MV_JOINTS];

@@ -86,13 +86,10 @@

   // 0 = BPRED, ZERO_MV, MV, SPLIT

   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];

-  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES_32X32];

+  vp9_coeff_probs coef_probs_4x4[BLOCK_TYPES];

+  vp9_coeff_probs coef_probs_8x8[BLOCK_TYPES];

+  vp9_coeff_probs coef_probs_16x16[BLOCK_TYPES];

+  vp9_coeff_probs coef_probs_32x32[BLOCK_TYPES];

   vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1];

   vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */

@@ -111,6 +108,18 @@

   int mv_ref_ct[INTER_MODE_CONTEXTS][4][2];

   int vp9_mode_contexts[INTER_MODE_CONTEXTS][4];

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_prob nzc_probs_4x4

+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];

+  vp9_prob nzc_probs_8x8

+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];

+  vp9_prob nzc_probs_16x16

+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];

+  vp9_prob nzc_probs_32x32

+           [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];

+  vp9_prob nzc_pcat_probs[MAX_NZC_CONTEXTS]

+                         [NZC_TOKENS_EXTRA][NZC_BITS_EXTRA];

+#endif

 } CODING_CONTEXT;

 typedef struct {

@@ -259,7 +268,9 @@

   int optimize_coefficients;

   int no_skip_block4x4_search;

   int search_best_filter;

+  int splitmode_breakout;

+  int mb16_breakout;

+  int static_segmentation;

 } SPEED_FEATURES;

 typedef struct {

@@ -301,11 +312,6 @@

   DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]);

   DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]);

-  DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]);

-  DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]);

-  DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]);

-  DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]);

   DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]);

   DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]);

   DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]);

@@ -312,30 +318,8 @@

   DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]);

   DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);

-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);

   DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);

-  DECLARE_ALIGNED(64, short, Y1zbin_8x8[QINDEX_RANGE][64]);

-  DECLARE_ALIGNED(64, short, Y2zbin_8x8[QINDEX_RANGE][64]);

-  DECLARE_ALIGNED(64, short, UVzbin_8x8[QINDEX_RANGE][64]);

-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y1_8x8[QINDEX_RANGE][64]);

-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);

-  DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);

-  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);

-  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);

-  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);

-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);

-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);

-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);

-  DECLARE_ALIGNED(16, short, Y1zbin_32x32[QINDEX_RANGE][1024]);

-  DECLARE_ALIGNED(16, short, Y2zbin_32x32[QINDEX_RANGE][1024]);

-  DECLARE_ALIGNED(16, short, UVzbin_32x32[QINDEX_RANGE][1024]);

-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_32x32[QINDEX_RANGE][1024]);

-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_32x32[QINDEX_RANGE][1024]);

-  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_32x32[QINDEX_RANGE][1024]);

   MACROBLOCK mb;

   VP9_COMMON common;

   VP9_CONFIG oxcf;

@@ -357,11 +341,17 @@

   int alt_is_last;  // Alt reference frame same as last ( short circuit altref search)

   int gold_is_alt;  // don't do both alt and gold search ( just do gold).

-  // int refresh_alt_ref_frame;

+  int scaled_ref_idx[3];

+  int lst_fb_idx;

+  int gld_fb_idx;

+  int alt_fb_idx;

+  int refresh_last_frame;

+  int refresh_golden_frame;

+  int refresh_alt_ref_frame;

   YV12_BUFFER_CONFIG last_frame_uf;

   TOKENEXTRA *tok;

-  unsigned int tok_count;

+  unsigned int tok_count[1 << 6];

   unsigned int frames_since_key;

@@ -396,11 +386,6 @@

   CODING_CONTEXT coding_context;

   // Rate targetting variables

-  int64_t prediction_error;

-  int64_t last_prediction_error;

-  int64_t intra_error;

-  int64_t last_intra_error;

   int this_frame_target;

   int projected_frame_size;

   int last_q[2];                   // Separate values for Intra/Inter

@@ -422,6 +407,7 @@

   int max_gf_interval;

   int baseline_gf_interval;

   int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames

+  int active_arnr_strength;         // <= cpi->oxcf.arnr_max_strength

   int64_t key_frame_count;

   int prior_key_frame_distance[KEY_FRAME_CONTEXT];

@@ -441,7 +427,6 @@

   double tot_q;

   double avg_q;

-  int zbin_over_quant;

   int zbin_mode_boost;

   int zbin_mode_boost_enabled;

@@ -484,37 +469,47 @@

   nmv_context_counts NMVcount;

-  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_count hybrid_coef_counts_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_probs frame_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4];

-  vp9_coeff_stats frame_hybrid_branch_ct_4x4[BLOCK_TYPES_4X4];

+  vp9_coeff_count coef_counts_4x4[BLOCK_TYPES];

+  vp9_coeff_probs frame_coef_probs_4x4[BLOCK_TYPES];

+  vp9_coeff_stats frame_branch_ct_4x4[BLOCK_TYPES];

-  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_count hybrid_coef_counts_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_probs frame_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8];

-  vp9_coeff_stats frame_hybrid_branch_ct_8x8[BLOCK_TYPES_8X8];

+  vp9_coeff_count coef_counts_8x8[BLOCK_TYPES];

+  vp9_coeff_probs frame_coef_probs_8x8[BLOCK_TYPES];

+  vp9_coeff_stats frame_branch_ct_8x8[BLOCK_TYPES];

-  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_count hybrid_coef_counts_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_probs frame_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16];

-  vp9_coeff_stats frame_hybrid_branch_ct_16x16[BLOCK_TYPES_16X16];

+  vp9_coeff_count coef_counts_16x16[BLOCK_TYPES];

+  vp9_coeff_probs frame_coef_probs_16x16[BLOCK_TYPES];

+  vp9_coeff_stats frame_branch_ct_16x16[BLOCK_TYPES];

-  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES_32X32];

-  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES_32X32];

-  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES_32X32];

+  vp9_coeff_count coef_counts_32x32[BLOCK_TYPES];

+  vp9_coeff_probs frame_coef_probs_32x32[BLOCK_TYPES];

+  vp9_coeff_stats frame_branch_ct_32x32[BLOCK_TYPES];

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_prob frame_nzc_probs_4x4

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES];

+  unsigned int frame_nzc_branch_ct_4x4

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC4X4_NODES][2];

+  vp9_prob frame_nzc_probs_8x8

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES];

+  unsigned int frame_nzc_branch_ct_8x8

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC8X8_NODES][2];

+  vp9_prob frame_nzc_probs_16x16

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES];

+  unsigned int frame_nzc_branch_ct_16x16

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC16X16_NODES][2];

+  vp9_prob frame_nzc_probs_32x32

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES];

+  unsigned int frame_nzc_branch_ct_32x32

+      [MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES][NZC32X32_NODES][2];

+#endif

   int gfu_boost;

   int last_boost;

   int kf_boost;

   int kf_zeromotion_pct;

-  int target_bandwidth;

+  int64_t target_bandwidth;

   struct vpx_codec_pkt_list  *output_pkt_list;

 #if 0

@@ -542,8 +537,6 @@

   int goldfreq;

   int auto_worst_q;

   int cpu_used;

-  int horiz_scale;

-  int vert_scale;

   int pass;

   vp9_prob last_skip_false_probs[3][MBSKIP_CONTEXTS];

@@ -628,11 +621,9 @@

     double est_max_qcorrection_factor;

   } twopass;

-#if VP9_TEMPORAL_ALT_REF

   YV12_BUFFER_CONFIG alt_ref_buffer;

   YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS];

   int fixed_divide[512];

-#endif

 #if CONFIG_INTERNAL_STATS

   int    count;

@@ -683,9 +674,6 @@

   int droppable;

-  // TODO Do we still need this??

-  int update_context;

   int dummy_packing;    /* flag to indicate if packing is dummy */

   unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1]

@@ -696,6 +684,8 @@

   unsigned int mb_mv_ref_count[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];

 #endif

+  int initial_width;

+  int initial_height;

 } VP9_COMP;

 void vp9_encode_frame(VP9_COMP *cpi);

--- a/vp9/encoder/vp9_picklpf.c

+++ b/vp9/encoder/vp9_picklpf.c

@@ -8,7 +8,8 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

+#include <limits.h>

 #include "vp9/common/vp9_onyxc_int.h"

 #include "vp9/encoder/vp9_onyx_int.h"

 #include "vp9/encoder/vp9_picklpf.h"

@@ -27,6 +28,7 @@

   int yoffset;

   int linestocopy;

+  assert(src_ybc->y_stride == dst_ybc->y_stride);

   yheight  = src_ybc->y_height;

   ystride  = src_ybc->y_stride;

@@ -246,7 +248,7 @@

   int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it

   //  Make a copy of the unfiltered / processed recon buffer

-  vp8_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf);

+  vp8_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);

   if (cm->frame_type == KEY_FRAME)

     cm->sharpness_level = 0;

@@ -266,7 +268,7 @@

   // Get baseline error score

   vp9_set_alt_lf_level(cpi, filt_mid);

-  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1);

+  vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, 0);

   best_err = vp9_calc_ss_err(sd, cm->frame_to_show);

   filt_best = filt_mid;

@@ -291,7 +293,7 @@

     if ((filt_direction <= 0) && (filt_low != filt_mid)) {

       // Get Low filter error score

       vp9_set_alt_lf_level(cpi, filt_low);

-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1);

+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, 0);

       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);

@@ -311,7 +313,7 @@

     // Now look at filt_high

     if ((filt_direction >= 0) && (filt_high != filt_mid)) {

       vp9_set_alt_lf_level(cpi, filt_high);

-      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1);

+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, 0);

       filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);

@@ -336,4 +338,30 @@

   cm->filter_level = filt_best;

+#if CONFIG_LOOP_DERING

+  /* Decide whether to turn on deringing filter */

+  {  // NOLINT

+    int best_dering = 0;

+    int this_dering;

+    int last_err_diff = INT_MAX;

+    for (this_dering = 1; this_dering <= 16; this_dering++) {

+      vp9_set_alt_lf_level(cpi, filt_best);

+      vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, this_dering);

+      filt_err = vp9_calc_ss_err(sd, cm->frame_to_show);

+      vp8_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);

+      if (filt_err < best_err) {

+        best_err = filt_err;

+        best_dering = this_dering;

+        last_err_diff = INT_MAX;

+      } else {

+        if (filt_err - best_err > last_err_diff)

+          break;

+        last_err_diff = filt_err - best_err;

+      }

+    }

+    cm->dering_enabled = best_dering;

+  }

+#endif

--- a/vp9/encoder/vp9_picklpf.h

+++ b/vp9/encoder/vp9_picklpf.h

@@ -15,12 +15,12 @@

 struct yv12_buffer_config;

 struct VP9_COMP;

-extern void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd,

-                                       struct VP9_COMP *cpi);

+void vp9_pick_filter_level_fast(struct yv12_buffer_config *sd,

+                                struct VP9_COMP *cpi);

-extern void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);

+void vp9_set_alt_lf_level(struct VP9_COMP *cpi, int filt_val);

-extern void vp9_pick_filter_level(struct yv12_buffer_config *sd,

-                                  struct VP9_COMP *cpi);

+void vp9_pick_filter_level(struct yv12_buffer_config *sd,

+                           struct VP9_COMP *cpi);

 #endif  // VP9_ENCODER_VP9_PICKLPF_H_

--- a/vp9/encoder/vp9_psnr.c

+++ b/vp9/encoder/vp9_psnr.c

@@ -11,17 +11,16 @@

 #include "vpx_scale/yv12config.h"

 #include "math.h"

-#include "vp9/common/vp9_systemdependent.h" /* for vp9_clear_system_state() */

 #define MAX_PSNR 100

-double vp9_mse2psnr(double Samples, double Peak, double Mse) {

+double vp9_mse2psnr(double samples, double peak, double mse) {

   double psnr;

-  if ((double)Mse > 0.0)

-    psnr = 10.0 * log10(Peak * Peak * Samples / Mse);

+  if (mse > 0.0)

+    psnr = 10.0 * log10(peak * peak * samples / mse);

   else

-    psnr = MAX_PSNR;      // Limit to prevent / 0

+    psnr = MAX_PSNR;  // Limit to prevent / 0

   if (psnr > MAX_PSNR)

     psnr = MAX_PSNR;

--- a/vp9/encoder/vp9_psnr.h

+++ b/vp9/encoder/vp9_psnr.h

@@ -12,6 +12,6 @@

 #ifndef VP9_ENCODER_VP9_PSNR_H_

 #define VP9_ENCODER_VP9_PSNR_H_

-extern double vp9_mse2psnr(double Samples, double Peak, double Mse);

+double vp9_mse2psnr(double samples, double peak, double mse);

 #endif  // VP9_ENCODER_VP9_PSNR_H_

--- a/vp9/encoder/vp9_quantize.c

+++ b/vp9/encoder/vp9_quantize.c

@@ -21,32 +21,46 @@

 extern int enc_debug;

 #endif

-void vp9_ht_quantize_b_4x4(BLOCK *b, BLOCKD *d, TX_TYPE tx_type) {

+static INLINE int plane_idx(MACROBLOCKD *xd, int b_idx) {

+  const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+  if (b_idx < (16 << (sb_type * 2)))

+    return 0;  // Y

+  else if (b_idx < (20 << (sb_type * 2)))

+    return 16;  // U

+  assert(b_idx < (24 << (sb_type * 2)));

+  return 20;  // V

+}

+void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

+  BLOCK *const b = &mb->block[0];

+  BLOCKD *const d = &xd->block[0];

   int i, rc, eob;

   int zbin;

   int x, y, z, sz;

+  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;

+  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;

+  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;

   int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;

-  int16_t *coeff_ptr       = b->coeff;

   int16_t *zbin_ptr        = b->zbin;

   int16_t *round_ptr       = b->round;

   int16_t *quant_ptr       = b->quant;

   uint8_t *quant_shift_ptr = b->quant_shift;

-  int16_t *qcoeff_ptr      = d->qcoeff;

-  int16_t *dqcoeff_ptr     = d->dqcoeff;

   int16_t *dequant_ptr     = d->dequant;

   int zbin_oq_value        = b->zbin_extra;

+  const int *pt_scan;

+#if CONFIG_CODE_NONZEROCOUNT

+  int nzc = 0;

+#endif

-  int const *pt_scan ;

+  assert(plane_idx(xd, b_idx) == 0);

   switch (tx_type) {

     case ADST_DCT:

       pt_scan = vp9_row_scan_4x4;

       break;

     case DCT_ADST:

       pt_scan = vp9_col_scan_4x4;

       break;

     default:

       pt_scan = vp9_default_zig_zag1d_4x4;

       break;

@@ -57,48 +71,63 @@

   eob = -1;

-  for (i = 0; i < b->eob_max_offset; i++) {

-    rc   = pt_scan[i];

-    z    = coeff_ptr[rc];

+  if (!b->skip_block) {

+    for (i = 0; i < 16; i++) {

+      rc   = pt_scan[i];

+      z    = coeff_ptr[rc];

-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;

-    zbin_boost_ptr ++;

+      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;

+      zbin_boost_ptr++;

-    sz = (z >> 31);                                 // sign of z

-    x  = (z ^ sz) - sz;                             // x = abs(z)

+      sz = (z >> 31);                                 // sign of z

+      x  = (z ^ sz) - sz;                             // x = abs(z)

-    if (x >= zbin) {

-      x += round_ptr[rc];

-      y  = (((x * quant_ptr[rc]) >> 16) + x)

-           >> quant_shift_ptr[rc];                // quantize (x)

-      x  = (y ^ sz) - sz;                         // get the sign back

-      qcoeff_ptr[rc]  = x;                        // write to destination

-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

+      if (x >= zbin) {

+        x += round_ptr[rc];

+        y  = (((x * quant_ptr[rc]) >> 16) + x)

+             >> quant_shift_ptr[rc];                // quantize (x)

+        x  = (y ^ sz) - sz;                         // get the sign back

+        qcoeff_ptr[rc]  = x;                        // write to destination

+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

-      if (y) {

-        eob = i;                                // last nonzero coeffs

-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength

+        if (y) {

+          eob = i;                                // last nonzero coeffs

+#if CONFIG_CODE_NONZEROCOUNT

+          ++nzc;                                  // number of nonzero coeffs

+#endif

+          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength

+        }

-  d->eob = eob + 1;

+  xd->eobs[b_idx] = eob + 1;

+#if CONFIG_CODE_NONZEROCOUNT

+  xd->nzcs[b_idx] = nzc;

+#endif

-void vp9_regular_quantize_b_4x4(BLOCK *b, BLOCKD *d) {

+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

+  const int c_idx = plane_idx(xd, b_idx);

+  BLOCK *const b = &mb->block[c_idx];

+  BLOCKD *const d = &xd->block[c_idx];

   int i, rc, eob;

   int zbin;

   int x, y, z, sz;

+  int16_t *coeff_ptr       = mb->coeff + b_idx * 16;

+  int16_t *qcoeff_ptr      = xd->qcoeff + b_idx * 16;

+  int16_t *dqcoeff_ptr     = xd->dqcoeff + b_idx * 16;

   int16_t *zbin_boost_ptr  = b->zrun_zbin_boost;

-  int16_t *coeff_ptr       = b->coeff;

   int16_t *zbin_ptr        = b->zbin;

   int16_t *round_ptr       = b->round;

   int16_t *quant_ptr       = b->quant;

   uint8_t *quant_shift_ptr = b->quant_shift;

-  int16_t *qcoeff_ptr      = d->qcoeff;

-  int16_t *dqcoeff_ptr     = d->dqcoeff;

   int16_t *dequant_ptr     = d->dequant;

   int zbin_oq_value        = b->zbin_extra;

+#if CONFIG_CODE_NONZEROCOUNT

+  int nzc = 0;

+#endif

   vpx_memset(qcoeff_ptr, 0, 32);

   vpx_memset(dqcoeff_ptr, 0, 32);

@@ -105,203 +134,203 @@

   eob = -1;

-  for (i = 0; i < b->eob_max_offset; i++) {

-    rc   = vp9_default_zig_zag1d_4x4[i];

-    z    = coeff_ptr[rc];

+  if (!b->skip_block) {

+    for (i = 0; i < 16; i++) {

+      rc   = vp9_default_zig_zag1d_4x4[i];

+      z    = coeff_ptr[rc];

-    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;

-    zbin_boost_ptr ++;

+      zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;

+      zbin_boost_ptr++;

-    sz = (z >> 31);                                 // sign of z

-    x  = (z ^ sz) - sz;                             // x = abs(z)

+      sz = (z >> 31);                                 // sign of z

+      x  = (z ^ sz) - sz;                             // x = abs(z)

-    if (x >= zbin) {

-      x += round_ptr[rc];

+      if (x >= zbin) {

+        x += round_ptr[rc];

-      y  = (((x * quant_ptr[rc]) >> 16) + x)

-           >> quant_shift_ptr[rc];                // quantize (x)

-      x  = (y ^ sz) - sz;                         // get the sign back

-      qcoeff_ptr[rc]  = x;                        // write to destination

-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

+        y  = (((x * quant_ptr[rc]) >> 16) + x)

+             >> quant_shift_ptr[rc];                // quantize (x)

+        x  = (y ^ sz) - sz;                         // get the sign back

+        qcoeff_ptr[rc]  = x;                        // write to destination

+        dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

-      if (y) {

-        eob = i;                                // last nonzero coeffs

-        zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength

+        if (y) {

+          eob = i;                                // last nonzero coeffs

+#if CONFIG_CODE_NONZEROCOUNT

+          ++nzc;                                  // number of nonzero coeffs

+#endif

+          zbin_boost_ptr = b->zrun_zbin_boost;    // reset zero runlength

+        }

-  d->eob = eob + 1;

+  xd->eobs[b_idx] = eob + 1;

+#if CONFIG_CODE_NONZEROCOUNT

+  xd->nzcs[b_idx] = nzc;

+#endif

-void vp9_quantize_mby_4x4_c(MACROBLOCK *x) {

+void vp9_quantize_mby_4x4(MACROBLOCK *x) {

   int i;

-  int has_2nd_order = get_2nd_order_usage(&x->e_mbd);

   for (i = 0; i < 16; i++) {

-    TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, &x->e_mbd.block[i]);

+    TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, i);

     if (tx_type != DCT_DCT) {

-      assert(has_2nd_order == 0);

-      vp9_ht_quantize_b_4x4(&x->block[i], &x->e_mbd.block[i], tx_type);

+      vp9_ht_quantize_b_4x4(x, i, tx_type);

     } else {

-      x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);

+      x->quantize_b_4x4(x, i);

-  if (has_2nd_order) {

-    x->quantize_b_4x4(&x->block[24], &x->e_mbd.block[24]);

-  } else {

-    vpx_memset(x->e_mbd.block[24].qcoeff, 0,

-               16 * sizeof(x->e_mbd.block[24].qcoeff[0]));

-    vpx_memset(x->e_mbd.block[24].dqcoeff, 0,

-               16 * sizeof(x->e_mbd.block[24].dqcoeff[0]));

-    x->e_mbd.block[24].eob = 0;

-  }

-void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) {

+void vp9_quantize_mbuv_4x4(MACROBLOCK *x) {

   int i;

   for (i = 16; i < 24; i++)

-    x->quantize_b_4x4(&x->block[i], &x->e_mbd.block[i]);

+    x->quantize_b_4x4(x, i);

-void vp9_quantize_mb_4x4_c(MACROBLOCK *x) {

-  vp9_quantize_mby_4x4_c(x);

-  vp9_quantize_mbuv_4x4_c(x);

+void vp9_quantize_mb_4x4(MACROBLOCK *x) {

+  vp9_quantize_mby_4x4(x);

+  vp9_quantize_mbuv_4x4(x);

-void vp9_regular_quantize_b_2x2(BLOCK *b, BLOCKD *d) {

-  int i, rc, eob;

-  int zbin;

-  int x, y, z, sz;

-  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;

-  int zbin_zrun_index = 0;

-  int16_t *coeff_ptr  = b->coeff;

-  int16_t *zbin_ptr   = b->zbin;

-  int16_t *round_ptr  = b->round;

-  int16_t *quant_ptr  = b->quant;

-  uint8_t *quant_shift_ptr = b->quant_shift;

-  int16_t *qcoeff_ptr = d->qcoeff;

-  int16_t *dqcoeff_ptr = d->dqcoeff;

-  int16_t *dequant_ptr = d->dequant;

-  int zbin_oq_value    = b->zbin_extra;

-  // double q2nd = 4;

-  vpx_memset(qcoeff_ptr, 0, 32);

-  vpx_memset(dqcoeff_ptr, 0, 32);

+void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

+  int16_t *qcoeff_ptr = xd->qcoeff + 16 * b_idx;

+  int16_t *dqcoeff_ptr = xd->dqcoeff + 16 * b_idx;

+  const int c_idx = plane_idx(xd, b_idx);

+  BLOCK *const b = &mb->block[c_idx];

+  BLOCKD *const d = &xd->block[c_idx];

+  const int *pt_scan;

-  eob = -1;

-  for (i = 0; i < b->eob_max_offset_8x8; i++) {

-    rc   = vp9_default_zig_zag1d_4x4[i];

-    z    = coeff_ptr[rc];

-    zbin_boost_ptr = &b->zrun_zbin_boost[zbin_zrun_index];

-    zbin_zrun_index += 4;

-    zbin = (zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value);

-    sz = (z >> 31);                               // sign of z

-    x  = (z ^ sz) - sz;                           // x = abs(z)

-    if (x >= zbin) {

-      x += (round_ptr[rc]);

-      y  = ((int)((int)(x * quant_ptr[rc]) >> 16) + x)

-           >> quant_shift_ptr[rc];                // quantize (x)

-      x  = (y ^ sz) - sz;                         // get the sign back

-      qcoeff_ptr[rc]  = x;                        // write to destination

-      dqcoeff_ptr[rc] = x * dequant_ptr[rc];      // dequantized value

-      if (y) {

-        eob = i;                                  // last nonzero coeffs

-        zbin_zrun_index = 0;

-      }

-    }

+  switch (tx_type) {

+    case ADST_DCT:

+      pt_scan = vp9_row_scan_8x8;

+      break;

+    case DCT_ADST:

+      pt_scan = vp9_col_scan_8x8;

+      break;

+    default:

+      pt_scan = vp9_default_zig_zag1d_8x8;

+      break;

-  d->eob = eob + 1;

-}

-void vp9_regular_quantize_b_8x8(BLOCK *b, BLOCKD *d) {

-  int i, rc, eob;

-  int zbin;

-  int x, y, z, sz;

-  int16_t *zbin_boost_ptr = b->zrun_zbin_boost_8x8;

-  int16_t *coeff_ptr  = b->coeff;

-  int16_t *zbin_ptr   = b->zbin_8x8;

-  int16_t *round_ptr  = b->round;

-  int16_t *quant_ptr  = b->quant;

-  uint8_t *quant_shift_ptr = b->quant_shift;

-  int16_t *qcoeff_ptr = d->qcoeff;

-  int16_t *dqcoeff_ptr = d->dqcoeff;

-  int16_t *dequant_ptr = d->dequant;

-  int zbin_oq_value = b->zbin_extra;

   vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t));

   vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t));

-  eob = -1;

+  if (!b->skip_block) {

+    int i, rc, eob;

+    int zbin;

+    int x, y, z, sz;

+    int zero_run;

+    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;

+    int16_t *coeff_ptr  = mb->coeff + 16 * b_idx;

+    int16_t *zbin_ptr   = b->zbin;

+    int16_t *round_ptr  = b->round;

+    int16_t *quant_ptr  = b->quant;

+    uint8_t *quant_shift_ptr = b->quant_shift;

+    int16_t *dequant_ptr = d->dequant;

+    int zbin_oq_value = b->zbin_extra;

+#if CONFIG_CODE_NONZEROCOUNT

+    int nzc = 0;

+#endif

-  for (i = 0; i < b->eob_max_offset_8x8; i++) {

-    rc   = vp9_default_zig_zag1d_8x8[i];

-    z    = coeff_ptr[rc];

+    eob = -1;

-    zbin = (zbin_ptr[rc != 0] + *zbin_boost_ptr + zbin_oq_value);

-    zbin_boost_ptr++;

+    // Special case for DC as it is the one triggering access in various

+    // tables: {zbin, quant, quant_shift, dequant}_ptr[rc != 0]

+    {

+      z    = coeff_ptr[0];

+      zbin = (zbin_ptr[0] + zbin_boost_ptr[0] + zbin_oq_value);

+      zero_run = 1;

-    sz = (z >> 31);                               // sign of z

-    x  = (z ^ sz) - sz;                           // x = abs(z)

+      sz = (z >> 31);                                // sign of z

+      x  = (z ^ sz) - sz;                            // x = abs(z)

-    if (x >= zbin) {

-      x += (round_ptr[rc != 0]);

-      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))

-           >> quant_shift_ptr[rc != 0];            // quantize (x)

-      x  = (y ^ sz) - sz;                         // get the sign back

-      qcoeff_ptr[rc]  = x;                        // write to destination

-      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value

+      if (x >= zbin) {

+        x += (round_ptr[0]);

+        y  = ((int)(((int)(x * quant_ptr[0]) >> 16) + x))

+             >> quant_shift_ptr[0];                  // quantize (x)

+        x  = (y ^ sz) - sz;                          // get the sign back

+        qcoeff_ptr[0]  = x;                          // write to destination

+        dqcoeff_ptr[0] = x * dequant_ptr[0];         // dequantized value

-      if (y) {

-        eob = i;                                  // last nonzero coeffs

-        zbin_boost_ptr = b->zrun_zbin_boost_8x8;

+        if (y) {

+          eob = 0;                                   // last nonzero coeffs

+#if CONFIG_CODE_NONZEROCOUNT

+          ++nzc;                                  // number of nonzero coeffs

+#endif

+          zero_run = 0;

+        }

-  }

+    for (i = 1; i < 64; i++) {

+      rc   = pt_scan[i];

+      z    = coeff_ptr[rc];

+      zbin = (zbin_ptr[1] + zbin_boost_ptr[zero_run] + zbin_oq_value);

+      // The original code was incrementing zero_run while keeping it at

+      // maximum 15 by adding "(zero_run < 15)". The same is achieved by

+      // removing the opposite of the sign mask of "(zero_run - 15)".

+      zero_run -= (zero_run - 15) >> 31;

-  d->eob = eob + 1;

+      sz = (z >> 31);                                // sign of z

+      x  = (z ^ sz) - sz;                            // x = abs(z)

+      if (x >= zbin) {

+        x += (round_ptr[rc != 0]);

+        y  = ((int)(((int)(x * quant_ptr[1]) >> 16) + x))

+             >> quant_shift_ptr[1];                  // quantize (x)

+        x  = (y ^ sz) - sz;                          // get the sign back

+        qcoeff_ptr[rc]  = x;                         // write to destination

+        dqcoeff_ptr[rc] = x * dequant_ptr[1];        // dequantized value

+        if (y) {

+          eob = i;                                   // last nonzero coeffs

+#if CONFIG_CODE_NONZEROCOUNT

+          ++nzc;                                     // number of nonzero coeffs

+#endif

+          zero_run = 0;

+        }

+      }

+    }

+    xd->eobs[b_idx] = eob + 1;

+#if CONFIG_CODE_NONZEROCOUNT

+    xd->nzcs[b_idx] = nzc;

+#endif

+  } else {

+    xd->eobs[b_idx] = 0;

+#if CONFIG_CODE_NONZEROCOUNT

+    xd->nzcs[b_idx] = 0;

+#endif

+  }

 void vp9_quantize_mby_8x8(MACROBLOCK *x) {

   int i;

-  int has_2nd_order = get_2nd_order_usage(&x->e_mbd);

+#if CONFIG_CODE_NONZEROCOUNT

   for (i = 0; i < 16; i ++) {

-    x->e_mbd.block[i].eob = 0;

+    x->e_mbd.nzcs[i] = 0;

-  x->e_mbd.block[24].eob = 0;

+#endif

   for (i = 0; i < 16; i += 4) {

-    int ib = (i & 8) + ((i & 4) >> 1);

-    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, &x->e_mbd.block[ib]);

-    if (tx_type != DCT_DCT)

-      assert(has_2nd_order == 0);

-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);

+    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd, (i & 8) + ((i & 4) >> 1));

+    x->quantize_b_8x8(x, i, tx_type);

-  if (has_2nd_order) {

-    x->quantize_b_2x2(&x->block[24], &x->e_mbd.block[24]);

-  } else {

-    vpx_memset(x->e_mbd.block[24].qcoeff, 0,

-               16 * sizeof(x->e_mbd.block[24].qcoeff[0]));

-    vpx_memset(x->e_mbd.block[24].dqcoeff, 0,

-               16 * sizeof(x->e_mbd.block[24].dqcoeff[0]));

-    x->e_mbd.block[24].eob = 0;

-  }

 void vp9_quantize_mbuv_8x8(MACROBLOCK *x) {

   int i;

-  for (i = 16; i < 24; i ++)

-    x->e_mbd.block[i].eob = 0;

+#if CONFIG_CODE_NONZEROCOUNT

+  for (i = 16; i < 24; i ++) {

+    x->e_mbd.nzcs[i] = 0;

+  }

+#endif

   for (i = 16; i < 24; i += 4)

-    x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);

+    x->quantize_b_8x8(x, i, DCT_DCT);

 void vp9_quantize_mb_8x8(MACROBLOCK *x) {

@@ -310,12 +339,14 @@

 void vp9_quantize_mby_16x16(MACROBLOCK *x) {

+  TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd, 0);

+#if CONFIG_CODE_NONZEROCOUNT

   int i;

-  for (i = 0; i < 16; i++)

-    x->e_mbd.block[i].eob = 0;

-  x->e_mbd.block[24].eob = 0;

-  x->quantize_b_16x16(&x->block[0], &x->e_mbd.block[0]);

+  for (i = 0; i < 16; i++) {

+    x->e_mbd.nzcs[i] = 0;

+  }

+#endif

+  x->quantize_b_16x16(x, 0, tx_type);

 void vp9_quantize_mb_16x16(MACROBLOCK *x) {

@@ -324,107 +355,256 @@

 static void quantize(int16_t *zbin_boost_orig_ptr,

-                     int16_t *coeff_ptr, int n_coeffs, int max_coeffs,

+                     int16_t *coeff_ptr, int n_coeffs, int skip_block,

                      int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,

                      uint8_t *quant_shift_ptr,

                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,

                      int16_t *dequant_ptr, int zbin_oq_value,

-                     int *eob_ptr, const int *scan, int mul) {

+                     uint16_t *eob_ptr,

+#if CONFIG_CODE_NONZEROCOUNT

+                     uint16_t *nzc_ptr,

+#endif

+                     const int *scan, int mul) {

   int i, rc, eob;

   int zbin;

   int x, y, z, sz;

+  int zero_run = 0;

   int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;

+#if CONFIG_CODE_NONZEROCOUNT

+  int nzc = 0;

+#endif

   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));

   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));

   eob = -1;

-  for (i = 0; i < max_coeffs; i++) {

-    rc   = scan[i];

-    z    = coeff_ptr[rc] * mul;

-    zbin = (zbin_ptr[rc!=0] + *zbin_boost_ptr + zbin_oq_value);

-    zbin_boost_ptr ++;

+  if (!skip_block) {

+    for (i = 0; i < n_coeffs; i++) {

+      rc   = scan[i];

+      z    = coeff_ptr[rc] * mul;

-    sz = (z >> 31);                               // sign of z

-    x  = (z ^ sz) - sz;                           // x = abs(z)

+      zbin = (zbin_ptr[rc != 0] + zbin_boost_ptr[zero_run] + zbin_oq_value);

+      zero_run += (zero_run < 15);

-    if (x >= zbin) {

-      x += (round_ptr[rc!=0]);

-      y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))

-          >> quant_shift_ptr[rc!=0];              // quantize (x)

-      x  = (y ^ sz) - sz;                         // get the sign back

-      qcoeff_ptr[rc]  = x;                        // write to destination

-      dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value

+      sz = (z >> 31);                               // sign of z

+      x  = (z ^ sz) - sz;                           // x = abs(z)

-      if (y) {

-        eob = i;                                  // last nonzero coeffs

-        zbin_boost_ptr = zbin_boost_orig_ptr;

+      if (x >= zbin) {

+        x += (round_ptr[rc != 0]);

+        y  = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))

+            >> quant_shift_ptr[rc != 0];            // quantize (x)

+        x  = (y ^ sz) - sz;                         // get the sign back

+        qcoeff_ptr[rc]  = x;                        // write to destination

+        dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul;  // dequantized value

+        if (y) {

+          eob = i;                                  // last nonzero coeffs

+          zero_run = 0;

+#if CONFIG_CODE_NONZEROCOUNT

+          ++nzc;                                    // number of nonzero coeffs

+#endif

+        }

   *eob_ptr = eob + 1;

+#if CONFIG_CODE_NONZEROCOUNT

+  *nzc_ptr = nzc;

+#endif

-void vp9_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {

-  quantize(b->zrun_zbin_boost_16x16,

-           b->coeff,

-           256, b->eob_max_offset_16x16,

-           b->zbin_16x16, b->round, b->quant, b->quant_shift,

-           d->qcoeff,

-           d->dqcoeff,

+void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

+  const int c_idx = plane_idx(xd, b_idx);

+  BLOCK *const b = &mb->block[c_idx];

+  BLOCKD *const d = &xd->block[c_idx];

+  const int *pt_scan;

+  switch (tx_type) {

+    case ADST_DCT:

+      pt_scan = vp9_row_scan_16x16;

+      break;

+    case DCT_ADST:

+      pt_scan = vp9_col_scan_16x16;

+      break;

+    default:

+      pt_scan = vp9_default_zig_zag1d_16x16;

+      break;

+  }

+  quantize(b->zrun_zbin_boost,

+           mb->coeff + 16 * b_idx,

+           256, b->skip_block,

+           b->zbin, b->round, b->quant, b->quant_shift,

+           xd->qcoeff + 16 * b_idx,

+           xd->dqcoeff + 16 * b_idx,

            d->dequant,

            b->zbin_extra,

-           &d->eob, vp9_default_zig_zag1d_16x16, 1);

+           &xd->eobs[b_idx],

+#if CONFIG_CODE_NONZEROCOUNT

+           &xd->nzcs[b_idx],

+#endif

+           pt_scan, 1);

-void vp9_quantize_sby_32x32(MACROBLOCK *x) {

-  x->e_mbd.block[0].eob = 0;

-  quantize(x->block[0].zrun_zbin_boost_32x32,

-           x->sb_coeff_data.coeff,

-           1024, x->block[0].eob_max_offset_32x32,

-           x->block[0].zbin_32x32,

-           x->block[0].round, x->block[0].quant, x->block[0].quant_shift,

-           x->e_mbd.sb_coeff_data.qcoeff,

-           x->e_mbd.sb_coeff_data.dqcoeff,

-           x->e_mbd.block[0].dequant,

-           x->block[0].zbin_extra,

-           &x->e_mbd.block[0].eob,

+void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

+  const int c_idx = plane_idx(xd, b_idx);

+  BLOCK *const b = &mb->block[c_idx];

+  BLOCKD *const d = &xd->block[c_idx];

+  quantize(b->zrun_zbin_boost,

+           mb->coeff + b_idx * 16,

+           1024, b->skip_block,

+           b->zbin,

+           b->round, b->quant, b->quant_shift,

+           xd->qcoeff + b_idx * 16,

+           xd->dqcoeff + b_idx * 16,

+           d->dequant,

+           b->zbin_extra,

+           &xd->eobs[b_idx],

+#if CONFIG_CODE_NONZEROCOUNT

+           &xd->nzcs[b_idx],

+#endif

            vp9_default_zig_zag1d_32x32, 2);

+void vp9_quantize_sby_32x32(MACROBLOCK *x) {

+  vp9_regular_quantize_b_32x32(x, 0);

+}

+void vp9_quantize_sby_16x16(MACROBLOCK *x) {

+  int n;

+  for (n = 0; n < 4; n++) {

+    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,

+                                        (16 * (n & 2)) + ((n & 1) * 4));

+    x->quantize_b_16x16(x, n * 16, tx_type);

+  }

+}

+void vp9_quantize_sby_8x8(MACROBLOCK *x) {

+  int n;

+  for (n = 0; n < 16; n++) {

+    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,

+                                      (4 * (n & 12)) + ((n & 3) * 2));

+    x->quantize_b_8x8(x, n * 4, tx_type);

+  }

+}

+void vp9_quantize_sby_4x4(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 64; n++) {

+    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);

+    if (tx_type != DCT_DCT) {

+      vp9_ht_quantize_b_4x4(x, n, tx_type);

+    } else {

+      x->quantize_b_4x4(x, n);

+    }

+  }

+}

 void vp9_quantize_sbuv_16x16(MACROBLOCK *x) {

+  x->quantize_b_16x16(x, 64, DCT_DCT);

+  x->quantize_b_16x16(x, 80, DCT_DCT);

+}

+void vp9_quantize_sbuv_8x8(MACROBLOCK *x) {

   int i;

-  x->e_mbd.block[16].eob = 0;

-  x->e_mbd.block[20].eob = 0;

-  for (i = 16; i < 24; i += 4)

-    quantize(x->block[i].zrun_zbin_boost_16x16,

-             x->sb_coeff_data.coeff + 1024 + (i - 16) * 64,

-             256, x->block[i].eob_max_offset_16x16,

-             x->block[i].zbin_16x16,

-             x->block[i].round, x->block[0].quant, x->block[i].quant_shift,

-             x->e_mbd.sb_coeff_data.qcoeff + 1024 + (i - 16) * 64,

-             x->e_mbd.sb_coeff_data.dqcoeff + 1024 + (i - 16) * 64,

-             x->e_mbd.block[i].dequant,

-             x->block[i].zbin_extra,

-             &x->e_mbd.block[i].eob,

-             vp9_default_zig_zag1d_16x16, 1);

+  for (i = 64; i < 96; i += 4)

+    x->quantize_b_8x8(x, i, DCT_DCT);

+void vp9_quantize_sbuv_4x4(MACROBLOCK *x) {

+  int i;

+  for (i = 64; i < 96; i++)

+    x->quantize_b_4x4(x, i);

+}

+void vp9_quantize_sb64y_32x32(MACROBLOCK *x) {

+  int n;

+  for (n = 0; n < 4; n++)

+    vp9_regular_quantize_b_32x32(x, n * 64);

+}

+void vp9_quantize_sb64y_16x16(MACROBLOCK *x) {

+  int n;

+  for (n = 0; n < 16; n++) {

+    TX_TYPE tx_type = get_tx_type_16x16(&x->e_mbd,

+                                        (16 * (n & 12)) + ((n & 3) * 4));

+    x->quantize_b_16x16(x, n * 16, tx_type);

+  }

+}

+void vp9_quantize_sb64y_8x8(MACROBLOCK *x) {

+  int n;

+  for (n = 0; n < 64; n++) {

+    TX_TYPE tx_type = get_tx_type_8x8(&x->e_mbd,

+                                      (4 * (n & 56)) + ((n & 7) * 2));

+    x->quantize_b_8x8(x, n * 4, tx_type);

+  }

+}

+void vp9_quantize_sb64y_4x4(MACROBLOCK *x) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int n;

+  for (n = 0; n < 256; n++) {

+    const TX_TYPE tx_type = get_tx_type_4x4(xd, n);

+    if (tx_type != DCT_DCT) {

+      vp9_ht_quantize_b_4x4(x, n, tx_type);

+    } else {

+      x->quantize_b_4x4(x, n);

+    }

+  }

+}

+void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) {

+  vp9_regular_quantize_b_32x32(x, 256);

+  vp9_regular_quantize_b_32x32(x, 320);

+}

+void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) {

+  int i;

+  for (i = 256; i < 384; i += 16)

+    x->quantize_b_16x16(x, i, DCT_DCT);

+}

+void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) {

+  int i;

+  for (i = 256; i < 384; i += 4)

+    x->quantize_b_8x8(x, i, DCT_DCT);

+}

+void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) {

+  int i;

+  for (i = 256; i < 384; i++)

+    x->quantize_b_4x4(x, i);

+}

 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of

  * these two C functions if corresponding optimized routine is not available.

  * NEON optimized version implements currently the fast quantization for pair

  * of blocks. */

-void vp9_regular_quantize_b_4x4_pair(BLOCK *b1, BLOCK *b2,

-                                     BLOCKD *d1, BLOCKD *d2) {

-  vp9_regular_quantize_b_4x4(b1, d1);

-  vp9_regular_quantize_b_4x4(b2, d2);

+void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *x, int b_idx1, int b_idx2) {

+  vp9_regular_quantize_b_4x4(x, b_idx1);

+  vp9_regular_quantize_b_4x4(x, b_idx2);

-static void invert_quant(int16_t *quant,

-                         uint8_t *shift, int d) {

+static void invert_quant(int16_t *quant, uint8_t *shift, int d) {

   unsigned t;

   int l;

   t = d;

@@ -438,248 +618,53 @@

 void vp9_init_quantizer(VP9_COMP *cpi) {

   int i;

   int quant_val;

-  int Q;

-  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,

-                                      24, 28, 32, 36, 40, 44, 44, 44

-                                    };

+  int q;

-  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,

-                                          14, 16, 18, 20, 22, 24, 26, 28,

-                                          30, 32, 34, 36, 38, 40, 42, 44,

-                                          46, 48, 48, 48, 48, 48, 48, 48,

-                                          48, 48, 48, 48, 48, 48, 48, 48,

-                                          48, 48, 48, 48, 48, 48, 48, 48,

-                                          48, 48, 48, 48, 48, 48, 48, 48,

-                                          48, 48, 48, 48, 48, 48, 48, 48

-                                        };

-  static const int zbin_boost_16x16[256] = {

-     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,

-    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-  };

-  static const int zbin_boost_32x32[1024] = {

-    0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,

-    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

-  };

-  int qrounding_factor = 48;

+  static const int zbin_boost[16] = { 0,  0,  0,  8,  8,  8, 10, 12,

+                                     14, 16, 20, 24, 28, 32, 36, 40 };

-  for (Q = 0; Q < QINDEX_RANGE; Q++) {

-    int qzbin_factor = (vp9_dc_quant(Q, 0) < 148) ? 84 : 80;

-#if CONFIG_LOSSLESS

-    if (cpi->oxcf.lossless) {

-      if (Q == 0) {

-        qzbin_factor = 64;

-        qrounding_factor = 64;

-      }

+  for (q = 0; q < QINDEX_RANGE; q++) {

+    int qzbin_factor = (vp9_dc_quant(q, 0) < 148) ? 84 : 80;

+    int qrounding_factor = 48;

+    if (q == 0) {

+      qzbin_factor = 64;

+      qrounding_factor = 64;

-#endif

     // dc values

-    quant_val = vp9_dc_quant(Q, cpi->common.y1dc_delta_q);

-    invert_quant(cpi->Y1quant[Q] + 0,

-                 cpi->Y1quant_shift[Q] + 0, quant_val);

-    cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;

-    cpi->common.Y1dequant[Q][0] = quant_val;

-    cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;

-    cpi->zrun_zbin_boost_y1_8x8[Q][0] =

-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;

-    cpi->zrun_zbin_boost_y1_16x16[Q][0] =

-      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;

-    cpi->Y1zbin_32x32[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->zrun_zbin_boost_y1_32x32[Q][0] =

-     ((quant_val * zbin_boost_32x32[0]) + 64) >> 7;

+    quant_val = vp9_dc_quant(q, cpi->common.y1dc_delta_q);

+    invert_quant(cpi->Y1quant[q] + 0, cpi->Y1quant_shift[q] + 0, quant_val);

+    cpi->Y1zbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);

+    cpi->Y1round[q][0] = (qrounding_factor * quant_val) >> 7;

+    cpi->common.Y1dequant[q][0] = quant_val;

+    cpi->zrun_zbin_boost_y1[q][0] = (quant_val * zbin_boost[0]) >> 7;

+    quant_val = vp9_dc_uv_quant(q, cpi->common.uvdc_delta_q);

+    invert_quant(cpi->UVquant[q] + 0, cpi->UVquant_shift[q] + 0, quant_val);

+    cpi->UVzbin[q][0] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);

+    cpi->UVround[q][0] = (qrounding_factor * quant_val) >> 7;

+    cpi->common.UVdequant[q][0] = quant_val;

+    cpi->zrun_zbin_boost_uv[q][0] = (quant_val * zbin_boost[0]) >> 7;

-    quant_val = vp9_dc2quant(Q, cpi->common.y2dc_delta_q);

-    invert_quant(cpi->Y2quant[Q] + 0,

-                 cpi->Y2quant_shift[Q] + 0, quant_val);

-    cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;

-    cpi->common.Y2dequant[Q][0] = quant_val;

-    cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;

-    cpi->zrun_zbin_boost_y2_8x8[Q][0] =

-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;

-    cpi->zrun_zbin_boost_y2_16x16[Q][0] =

-      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;

-    quant_val = vp9_dc_uv_quant(Q, cpi->common.uvdc_delta_q);

-    invert_quant(cpi->UVquant[Q] + 0,

-                 cpi->UVquant_shift[Q] + 0, quant_val);

-    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

-    cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;

-    cpi->common.UVdequant[Q][0] = quant_val;

-    cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;

-    cpi->zrun_zbin_boost_uv_8x8[Q][0] =

-      ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;

-    cpi->zrun_zbin_boost_uv_16x16[Q][0] =

-      ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;

     // all the 4x4 ac values =;

     for (i = 1; i < 16; i++) {

       int rc = vp9_default_zig_zag1d_4x4[i];

-      quant_val = vp9_ac_yquant(Q);

-      invert_quant(cpi->Y1quant[Q] + rc,

-                   cpi->Y1quant_shift[Q] + rc, quant_val);

-      cpi->Y1zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->Y1round[Q][rc] = (qrounding_factor * quant_val) >> 7;

-      cpi->common.Y1dequant[Q][rc] = quant_val;

-      cpi->zrun_zbin_boost_y1[Q][i] =

-        ((quant_val * zbin_boost[i]) + 64) >> 7;

+      quant_val = vp9_ac_yquant(q);

+      invert_quant(cpi->Y1quant[q] + rc, cpi->Y1quant_shift[q] + rc, quant_val);

+      cpi->Y1zbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);

+      cpi->Y1round[q][rc] = (qrounding_factor * quant_val) >> 7;

+      cpi->common.Y1dequant[q][rc] = quant_val;

+      cpi->zrun_zbin_boost_y1[q][i] =

+          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);

-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);

-      invert_quant(cpi->Y2quant[Q] + rc,

-                   cpi->Y2quant_shift[Q] + rc, quant_val);

-      cpi->Y2zbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->Y2round[Q][rc] = (qrounding_factor * quant_val) >> 7;

-      cpi->common.Y2dequant[Q][rc] = quant_val;

-      cpi->zrun_zbin_boost_y2[Q][i] =

-        ((quant_val * zbin_boost[i]) + 64) >> 7;

-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);

-      invert_quant(cpi->UVquant[Q] + rc,

-                   cpi->UVquant_shift[Q] + rc, quant_val);

-      cpi->UVzbin[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->UVround[Q][rc] = (qrounding_factor * quant_val) >> 7;

-      cpi->common.UVdequant[Q][rc] = quant_val;

-      cpi->zrun_zbin_boost_uv[Q][i] =

-        ((quant_val * zbin_boost[i]) + 64) >> 7;

+      quant_val = vp9_ac_uv_quant(q, cpi->common.uvac_delta_q);

+      invert_quant(cpi->UVquant[q] + rc, cpi->UVquant_shift[q] + rc, quant_val);

+      cpi->UVzbin[q][rc] = ROUND_POWER_OF_TWO(qzbin_factor * quant_val, 7);

+      cpi->UVround[q][rc] = (qrounding_factor * quant_val) >> 7;

+      cpi->common.UVdequant[q][rc] = quant_val;

+      cpi->zrun_zbin_boost_uv[q][i] =

+          ROUND_POWER_OF_TWO(quant_val * zbin_boost[i], 7);

-    // 8x8 structures... only zbin seperated out for now

-    // This needs cleaning up for 8x8 especially if we are to add

-    // support for non flat Q matices

-    for (i = 1; i < 64; i++) {

-      int rc = vp9_default_zig_zag1d_8x8[i];

-      quant_val = vp9_ac_yquant(Q);

-      cpi->Y1zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->zrun_zbin_boost_y1_8x8[Q][i] =

-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;

-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);

-      cpi->Y2zbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->zrun_zbin_boost_y2_8x8[Q][i] =

-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;

-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);

-      cpi->UVzbin_8x8[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->zrun_zbin_boost_uv_8x8[Q][i] =

-        ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;

-    }

-    // 16x16 structures. Same comment above applies.

-    for (i = 1; i < 256; i++) {

-      int rc = vp9_default_zig_zag1d_16x16[i];

-      quant_val = vp9_ac_yquant(Q);

-      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->zrun_zbin_boost_y1_16x16[Q][i] =

-        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;

-      quant_val = vp9_ac2quant(Q, cpi->common.y2ac_delta_q);

-      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->zrun_zbin_boost_y2_16x16[Q][i] =

-        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;

-      quant_val = vp9_ac_uv_quant(Q, cpi->common.uvac_delta_q);

-      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->zrun_zbin_boost_uv_16x16[Q][i] =

-        ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;

-    }

-    // 32x32 structures. Same comment above applies.

-    for (i = 1; i < 1024; i++) {

-      int rc = vp9_default_zig_zag1d_32x32[i];

-      quant_val = vp9_ac_yquant(Q);

-      cpi->Y1zbin_32x32[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

-      cpi->zrun_zbin_boost_y1_32x32[Q][i] =

-        ((quant_val * zbin_boost_32x32[i]) + 64) >> 7;

-    }

@@ -709,8 +694,7 @@

   // Y

   zbin_extra = (cpi->common.Y1dequant[QIndex][1] *

-                (cpi->zbin_over_quant +

-                 cpi->zbin_mode_boost +

+                (cpi->zbin_mode_boost +

                  x->act_zbin_adj)) >> 7;

   for (i = 0; i < 16; i++) {

@@ -717,39 +701,19 @@

     x->block[i].quant = cpi->Y1quant[QIndex];

     x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];

     x->block[i].zbin = cpi->Y1zbin[QIndex];

-    x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];

-    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];

-    x->block[i].zbin_32x32 = cpi->Y1zbin_32x32[QIndex];

     x->block[i].round = cpi->Y1round[QIndex];

     x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];

     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];

-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];

-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];

-    x->block[i].zrun_zbin_boost_32x32 = cpi->zrun_zbin_boost_y1_32x32[QIndex];

     x->block[i].zbin_extra = (int16_t)zbin_extra;

-    // Segment max eob offset feature.

-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {

-      x->block[i].eob_max_offset =

-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-      x->block[i].eob_max_offset_8x8 =

-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-      x->block[i].eob_max_offset_16x16 =

-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-      x->block[i].eob_max_offset_32x32 =

-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-    } else {

-      x->block[i].eob_max_offset = 16;

-      x->block[i].eob_max_offset_8x8 = 64;

-      x->block[i].eob_max_offset_16x16 = 256;

-      x->block[i].eob_max_offset_32x32 = 1024;

-    }

+    // Segment skip feature.

+    x->block[i].skip_block =

+      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);

   // UV

   zbin_extra = (cpi->common.UVdequant[QIndex][1] *

-                (cpi->zbin_over_quant +

-                 cpi->zbin_mode_boost +

+                (cpi->zbin_mode_boost +

                  x->act_zbin_adj)) >> 7;

   for (i = 16; i < 24; i++) {

@@ -756,61 +720,16 @@

     x->block[i].quant = cpi->UVquant[QIndex];

     x->block[i].quant_shift = cpi->UVquant_shift[QIndex];

     x->block[i].zbin = cpi->UVzbin[QIndex];

-    x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];

-    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];

     x->block[i].round = cpi->UVround[QIndex];

     x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];

     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];

-    x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];

-    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];

     x->block[i].zbin_extra = (int16_t)zbin_extra;

-    // Segment max eob offset feature.

-    if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {

-      x->block[i].eob_max_offset =

-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-      x->block[i].eob_max_offset_8x8 =

-        vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-      x->block[i].eob_max_offset_16x16 =

-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-    } else {

-      x->block[i].eob_max_offset = 16;

-      x->block[i].eob_max_offset_8x8 = 64;

-      x->block[i].eob_max_offset_16x16 = 256;

-    }

+    // Segment skip feature.

+    x->block[i].skip_block =

+      vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);

-  // Y2

-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *

-                ((cpi->zbin_over_quant / 2) +

-                 cpi->zbin_mode_boost +

-                 x->act_zbin_adj)) >> 7;

-  x->block[24].quant = cpi->Y2quant[QIndex];

-  x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];

-  x->block[24].zbin = cpi->Y2zbin[QIndex];

-  x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];

-  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];

-  x->block[24].round = cpi->Y2round[QIndex];

-  x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];

-  x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];

-  x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];

-  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];

-  x->block[24].zbin_extra = (int16_t)zbin_extra;

-  // TBD perhaps not use for Y2

-  // Segment max eob offset feature.

-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB)) {

-    x->block[24].eob_max_offset =

-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-    x->block[24].eob_max_offset_8x8 =

-      vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-  } else {

-    x->block[24].eob_max_offset = 16;

-    x->block[24].eob_max_offset_8x8 = 4;

-  }

   /* save this macroblock QIndex for vp9_update_zbin_extra() */

   x->e_mbd.q_index = QIndex;

@@ -822,8 +741,7 @@

   // Y

   zbin_extra = (cpi->common.Y1dequant[QIndex][1] *

-                (cpi->zbin_over_quant +

-                 cpi->zbin_mode_boost +

+                (cpi->zbin_mode_boost +

                  x->act_zbin_adj)) >> 7;

   for (i = 0; i < 16; i++) {

     x->block[i].zbin_extra = (int16_t)zbin_extra;

@@ -831,21 +749,12 @@

   // UV

   zbin_extra = (cpi->common.UVdequant[QIndex][1] *

-                (cpi->zbin_over_quant +

-                 cpi->zbin_mode_boost +

+                (cpi->zbin_mode_boost +

                  x->act_zbin_adj)) >> 7;

   for (i = 16; i < 24; i++) {

     x->block[i].zbin_extra = (int16_t)zbin_extra;

-  // Y2

-  zbin_extra = (cpi->common.Y2dequant[QIndex][1] *

-                ((cpi->zbin_over_quant / 2) +

-                 cpi->zbin_mode_boost +

-                 x->act_zbin_adj)) >> 7;

-  x->block[24].zbin_extra = (int16_t)zbin_extra;

 void vp9_frame_init_quantizer(VP9_COMP *cpi) {

@@ -861,13 +770,15 @@

   cm->base_qindex = Q;

+  // Set lossless mode

+  if (cm->base_qindex <= 4)

+    cm->base_qindex = 0;

   // if any of the delta_q values are changing update flag will

   // have to be set.

   cm->y1dc_delta_q = 0;

-  cm->y2ac_delta_q = 0;

   cm->uvdc_delta_q = 0;

   cm->uvac_delta_q = 0;

-  cm->y2dc_delta_q = 0;

   // quantizer has to be reinitialized if any delta_q changes.

   // As there are not any here for now this is inactive code.

--- a/vp9/encoder/vp9_quantize.h

+++ b/vp9/encoder/vp9_quantize.h

@@ -14,10 +14,10 @@

 #include "vp9/encoder/vp9_block.h"

 #define prototype_quantize_block(sym) \

-  void (sym)(BLOCK *b,BLOCKD *d)

+  void (sym)(MACROBLOCK *mb, int b_idx)

 #define prototype_quantize_block_pair(sym) \

-  void (sym)(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)

+  void (sym)(MACROBLOCK *mb, int b_idx1, int b_idx2)

 #define prototype_quantize_mb(sym) \

   void (sym)(MACROBLOCK *x)

@@ -26,60 +26,41 @@

 #include "x86/vp9_quantize_x86.h"

 #endif

-#define prototype_quantize_block_type(sym) \

-  void (sym)(BLOCK *b, BLOCKD *d, TX_TYPE type)

-extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4);

+void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_ix, TX_TYPE type);

+void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx);

+void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2);

+void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type);

+void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type);

+void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx);

-#ifndef vp9_quantize_quantb_4x4

-#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4

-#endif

-extern prototype_quantize_block(vp9_quantize_quantb_4x4);

-#ifndef vp9_quantize_quantb_4x4_pair

-#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair

-#endif

-extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair);

-#ifndef vp9_quantize_quantb_8x8

-#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8

-#endif

-extern prototype_quantize_block(vp9_quantize_quantb_8x8);

-#ifndef vp9_quantize_quantb_16x16

-#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16

-#endif

-extern prototype_quantize_block(vp9_quantize_quantb_16x16);

-#ifndef vp9_quantize_quantb_2x2

-#define vp9_quantize_quantb_2x2 vp9_regular_quantize_b_2x2

-#endif

-extern prototype_quantize_block(vp9_quantize_quantb_2x2);

-#ifndef vp9_quantize_mb_4x4

-#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c

-#endif

-extern prototype_quantize_mb(vp9_quantize_mb_4x4);

+void vp9_quantize_mb_4x4(MACROBLOCK *x);

 void vp9_quantize_mb_8x8(MACROBLOCK *x);

-#ifndef vp9_quantize_mbuv_4x4

-#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c

-#endif

-extern prototype_quantize_mb(vp9_quantize_mbuv_4x4);

+void vp9_quantize_mbuv_4x4(MACROBLOCK *x);

+void vp9_quantize_mby_4x4(MACROBLOCK *x);

-#ifndef vp9_quantize_mby_4x4

-#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c

-#endif

-extern prototype_quantize_mb(vp9_quantize_mby_4x4);

+void vp9_quantize_mby_8x8(MACROBLOCK *x);

+void vp9_quantize_mbuv_8x8(MACROBLOCK *x);

-extern prototype_quantize_mb(vp9_quantize_mby_8x8);

-extern prototype_quantize_mb(vp9_quantize_mbuv_8x8);

 void vp9_quantize_mb_16x16(MACROBLOCK *x);

-extern prototype_quantize_block(vp9_quantize_quantb_16x16);

-extern prototype_quantize_mb(vp9_quantize_mby_16x16);

+void vp9_quantize_mby_16x16(MACROBLOCK *x);

 void vp9_quantize_sby_32x32(MACROBLOCK *x);

+void vp9_quantize_sby_16x16(MACROBLOCK *x);

+void vp9_quantize_sby_8x8(MACROBLOCK *x);

+void vp9_quantize_sby_4x4(MACROBLOCK *x);

 void vp9_quantize_sbuv_16x16(MACROBLOCK *x);

+void vp9_quantize_sbuv_8x8(MACROBLOCK *x);

+void vp9_quantize_sbuv_4x4(MACROBLOCK *x);

+void vp9_quantize_sb64y_32x32(MACROBLOCK *x);

+void vp9_quantize_sb64y_16x16(MACROBLOCK *x);

+void vp9_quantize_sb64y_8x8(MACROBLOCK *x);

+void vp9_quantize_sb64y_4x4(MACROBLOCK *x);

+void vp9_quantize_sb64uv_32x32(MACROBLOCK *x);

+void vp9_quantize_sb64uv_16x16(MACROBLOCK *x);

+void vp9_quantize_sb64uv_8x8(MACROBLOCK *x);

+void vp9_quantize_sb64uv_4x4(MACROBLOCK *x);

 struct VP9_COMP;

--- a/vp9/encoder/vp9_ratectrl.c

+++ b/vp9/encoder/vp9_ratectrl.c

@@ -14,8 +14,8 @@

 #include <string.h>

 #include <limits.h>

 #include <assert.h>

+#include <math.h>

-#include "math.h"

 #include "vp9/common/vp9_alloccommon.h"

 #include "vp9/common/vp9_modecont.h"

 #include "vp9/common/vp9_common.h"

@@ -25,9 +25,10 @@

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/encoder/vp9_encodemv.h"

 #include "vp9/common/vp9_quant_common.h"

+#include "vp9/common/vp9_seg_common.h"

-#define MIN_BPB_FACTOR          0.005

-#define MAX_BPB_FACTOR          50

+#define MIN_BPB_FACTOR 0.005

+#define MAX_BPB_FACTOR 50

 #ifdef MODE_STATS

 extern unsigned int y_modes[VP9_YMODES];

@@ -88,39 +89,34 @@

 // tables if and when things settle down in the experimental bitstream

 double vp9_convert_qindex_to_q(int qindex) {

   // Convert the index to a real Q value (scaled down to match old Q values)

-  return (double)vp9_ac_yquant(qindex) / 4.0;

+  return vp9_ac_yquant(qindex) / 4.0;

 int vp9_gfboost_qadjust(int qindex) {

-  int retval;

-  double q;

-  q = vp9_convert_qindex_to_q(qindex);

-  retval = (int)((0.00000828 * q * q * q) +

-                 (-0.0055 * q * q) +

-                 (1.32 * q) + 79.3);

-  return retval;

+  const double q = vp9_convert_qindex_to_q(qindex);

+  return (int)((0.00000828 * q * q * q) +

+               (-0.0055 * q * q) +

+               (1.32 * q) + 79.3);

 static int kfboost_qadjust(int qindex) {

-  int retval;

-  double q;

-  q = vp9_convert_qindex_to_q(qindex);

-  retval = (int)((0.00000973 * q * q * q) +

-                 (-0.00613 * q * q) +

-                 (1.316 * q) + 121.2);

-  return retval;

+  const double q = vp9_convert_qindex_to_q(qindex);

+  return (int)((0.00000973 * q * q * q) +

+               (-0.00613 * q * q) +

+               (1.316 * q) + 121.2);

-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex) {

-  if (frame_type == KEY_FRAME)

-    return (int)(4500000 / vp9_convert_qindex_to_q(qindex));

-  else

-    return (int)(2850000 / vp9_convert_qindex_to_q(qindex));

-}

+int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,

+                    double correction_factor) {

+  const double q = vp9_convert_qindex_to_q(qindex);

+  int enumerator = frame_type == KEY_FRAME ? 4000000 : 2500000;

+  // q based adjustment to baseline enumberator

+  enumerator += (int)(enumerator * q) >> 12;

+  return (int)(0.5 + (enumerator * correction_factor / q));

+}

 void vp9_save_coding_context(VP9_COMP *cpi) {

   CODING_CONTEXT *const cc = &cpi->coding_context;

   VP9_COMMON *cm = &cpi->common;

@@ -168,16 +164,20 @@

   vp9_copy(cc->last_mode_lf_deltas, xd->last_mode_lf_deltas);

   vp9_copy(cc->coef_probs_4x4, cm->fc.coef_probs_4x4);

-  vp9_copy(cc->hybrid_coef_probs_4x4, cm->fc.hybrid_coef_probs_4x4);

   vp9_copy(cc->coef_probs_8x8, cm->fc.coef_probs_8x8);

-  vp9_copy(cc->hybrid_coef_probs_8x8, cm->fc.hybrid_coef_probs_8x8);

   vp9_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);

-  vp9_copy(cc->hybrid_coef_probs_16x16, cm->fc.hybrid_coef_probs_16x16);

   vp9_copy(cc->coef_probs_32x32, cm->fc.coef_probs_32x32);

   vp9_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);

 #if CONFIG_COMP_INTERINTRA_PRED

   cc->interintra_prob = cm->fc.interintra_prob;

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_copy(cc->nzc_probs_4x4, cm->fc.nzc_probs_4x4);

+  vp9_copy(cc->nzc_probs_8x8, cm->fc.nzc_probs_8x8);

+  vp9_copy(cc->nzc_probs_16x16, cm->fc.nzc_probs_16x16);

+  vp9_copy(cc->nzc_probs_32x32, cm->fc.nzc_probs_32x32);

+  vp9_copy(cc->nzc_pcat_probs, cm->fc.nzc_pcat_probs);

+#endif

 void vp9_restore_coding_context(VP9_COMP *cpi) {

@@ -226,89 +226,55 @@

   vp9_copy(xd->last_mode_lf_deltas, cc->last_mode_lf_deltas);

   vp9_copy(cm->fc.coef_probs_4x4, cc->coef_probs_4x4);

-  vp9_copy(cm->fc.hybrid_coef_probs_4x4, cc->hybrid_coef_probs_4x4);

   vp9_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);

-  vp9_copy(cm->fc.hybrid_coef_probs_8x8, cc->hybrid_coef_probs_8x8);

   vp9_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);

-  vp9_copy(cm->fc.hybrid_coef_probs_16x16, cc->hybrid_coef_probs_16x16);

   vp9_copy(cm->fc.coef_probs_32x32, cc->coef_probs_32x32);

   vp9_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);

 #if CONFIG_COMP_INTERINTRA_PRED

   cm->fc.interintra_prob = cc->interintra_prob;

 #endif

+#if CONFIG_CODE_NONZEROCOUNT

+  vp9_copy(cm->fc.nzc_probs_4x4, cc->nzc_probs_4x4);

+  vp9_copy(cm->fc.nzc_probs_8x8, cc->nzc_probs_8x8);

+  vp9_copy(cm->fc.nzc_probs_16x16, cc->nzc_probs_16x16);

+  vp9_copy(cm->fc.nzc_probs_32x32, cc->nzc_probs_32x32);

+  vp9_copy(cm->fc.nzc_pcat_probs, cc->nzc_pcat_probs);

+#endif

 void vp9_setup_key_frame(VP9_COMP *cpi) {

   VP9_COMMON *cm = &cpi->common;

-  // Setup for Key frame:

-  vp9_default_coef_probs(& cpi->common);

-  vp9_kf_default_bmode_probs(cpi->common.kf_bmode_prob);

-  vp9_init_mbmode_probs(& cpi->common);

-  vp9_default_bmode_probs(cm->fc.bmode_prob);

+  MACROBLOCKD *xd = &cpi->mb.e_mbd;

-  if(cm->last_frame_seg_map)

-    vpx_memset(cm->last_frame_seg_map, 0, (cm->mb_rows * cm->mb_cols));

+  vp9_setup_past_independence(cm, xd);

-  vp9_init_mv_probs(& cpi->common);

-  // cpi->common.filter_level = 0;      // Reset every key frame.

-  cpi->common.filter_level = cpi->common.base_qindex * 3 / 8;

   // interval before next GF

   cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;

-  cpi->common.refresh_golden_frame = TRUE;

-  cpi->common.refresh_alt_ref_frame = TRUE;

-  vp9_init_mode_contexts(&cpi->common);

-  vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));

-  vpx_memcpy(&cpi->common.lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));

-  vpx_memset(cm->prev_mip, 0,

-    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));

-  vpx_memset(cm->mip, 0,

-    (cm->mb_cols + 1) * (cm->mb_rows + 1)* sizeof(MODE_INFO));

-  vp9_update_mode_info_border(cm, cm->mip);

-  vp9_update_mode_info_in_image(cm, cm->mi);

-#if CONFIG_NEW_MVREF

-  if (1) {

-    MACROBLOCKD *xd = &cpi->mb.e_mbd;

-    // Defaults probabilities for encoding the MV ref id signal

-    vpx_memset(xd->mb_mv_ref_probs, VP9_DEFAULT_MV_REF_PROB,

-               sizeof(xd->mb_mv_ref_probs));

-  }

-#endif

+  /* All buffers are implicitly updated on key frames. */

+  cpi->refresh_golden_frame = 1;

+  cpi->refresh_alt_ref_frame = 1;

 void vp9_setup_inter_frame(VP9_COMP *cpi) {

-  if (cpi->common.refresh_alt_ref_frame) {

-    vpx_memcpy(&cpi->common.fc,

-               &cpi->common.lfc_a,

-               sizeof(cpi->common.fc));

-  } else {

-    vpx_memcpy(&cpi->common.fc,

-               &cpi->common.lfc,

-               sizeof(cpi->common.fc));

-  }

+  VP9_COMMON *cm = &cpi->common;

+  MACROBLOCKD *xd = &cpi->mb.e_mbd;

+  if (cm->error_resilient_mode)

+    vp9_setup_past_independence(cm, xd);

+  assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);

+  vpx_memcpy(&cm->fc, &cm->frame_contexts[cm->frame_context_idx],

+             sizeof(cm->fc));

-static int estimate_bits_at_q(int frame_kind, int Q, int MBs,

+static int estimate_bits_at_q(int frame_kind, int q, int mbs,

                               double correction_factor) {

-  int Bpm = (int)(.5 + correction_factor * vp9_bits_per_mb(frame_kind, Q));

+  const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor));

-  /* Attempt to retain reasonable accuracy without overflow. The cutoff is

-   * chosen such that the maximum product of Bpm and MBs fits 31 bits. The

-   * largest Bpm takes 20 bits.

-   */

-  if (MBs > (1 << 11))

-    return (Bpm >> BPER_MB_NORMBITS) * MBs;

-  else

-    return (Bpm * MBs) >> BPER_MB_NORMBITS;

+  // Attempt to retain reasonable accuracy without overflow. The cutoff is

+  // chosen such that the maximum product of Bpm and MBs fits 31 bits. The

+  // largest Bpm takes 20 bits.

+  return (mbs > (1 << 11)) ? (bpm >> BPER_MB_NORMBITS) * mbs

+                           : (bpm * mbs) >> BPER_MB_NORMBITS;

@@ -331,7 +297,6 @@

   cpi->this_frame_target = target;

@@ -347,25 +312,15 @@

 static void calc_pframe_target_size(VP9_COMP *cpi) {

-  int min_frame_target;

-  min_frame_target = 0;

-  min_frame_target = cpi->min_frame_bandwidth;

-  if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))

-    min_frame_target = cpi->av_per_frame_bandwidth >> 5;

-  // Special alt reference frame case

-  if (cpi->common.refresh_alt_ref_frame) {

+  const int min_frame_target = MAX(cpi->min_frame_bandwidth,

+                                   cpi->av_per_frame_bandwidth >> 5);

+  if (cpi->refresh_alt_ref_frame) {

+    // Special alt reference frame case

     // Per frame bit target for the alt ref frame

     cpi->per_frame_bandwidth = cpi->twopass.gf_bits;

     cpi->this_frame_target = cpi->per_frame_bandwidth;

-  }

-  // Normal frames (gf,and inter)

-  else {

+  } else {

+    // Normal frames (gf,and inter)

     cpi->this_frame_target = cpi->per_frame_bandwidth;

@@ -377,16 +332,16 @@

   if (cpi->this_frame_target < min_frame_target)

     cpi->this_frame_target = min_frame_target;

-  if (!cpi->common.refresh_alt_ref_frame)

+  if (!cpi->refresh_alt_ref_frame)

     // Note the baseline target data rate for this inter frame.

     cpi->inter_frame_target = cpi->this_frame_target;

   // Adjust target frame size for Golden Frames:

   if (cpi->frames_till_gf_update_due == 0) {

-    // int Boost = 0;

-    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;

+    const int q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME]

+                                          : cpi->oxcf.fixed_q;

-    cpi->common.refresh_golden_frame = TRUE;

+    cpi->refresh_golden_frame = 1;

     calc_gf_params(cpi);

@@ -398,17 +353,17 @@

         // The spend on the GF is defined in the two pass code

         // for two pass encodes

         cpi->this_frame_target = cpi->per_frame_bandwidth;

-      } else

+      } else {

         cpi->this_frame_target =

-          (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0)

+          (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0)

            * cpi->last_boost) / 100;

+      }

-    }

-    // If there is an active ARF at this location use the minimum

-    // bits on this frame even if it is a contructed arf.

-    // The active maximum quantizer insures that an appropriate

-    // number of bits will be spent if needed for contstructed ARFs.

-    else {

+    } else {

+      // If there is an active ARF at this location use the minimum

+      // bits on this frame even if it is a contructed arf.

+      // The active maximum quantizer insures that an appropriate

+      // number of bits will be spent if needed for contstructed ARFs.

       cpi->this_frame_target = 0;

@@ -418,12 +373,12 @@

 void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {

-  int    Q = cpi->common.base_qindex;

-  int    correction_factor = 100;

+  const int q = cpi->common.base_qindex;

+  int correction_factor = 100;

   double rate_correction_factor;

   double adjustment_limit;

-  int    projected_size_based_on_q = 0;

+  int projected_size_based_on_q = 0;

   // Clear down mmx registers to allow floating point in what follows

   vp9_clear_system_state();  // __asm emms;

@@ -431,36 +386,19 @@

   if (cpi->common.frame_type == KEY_FRAME) {

     rate_correction_factor = cpi->key_frame_rate_correction_factor;

   } else {

-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)

+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)

       rate_correction_factor = cpi->gf_rate_correction_factor;

     else

       rate_correction_factor = cpi->rate_correction_factor;

-  // Work out how big we would have expected the frame to be at this Q given the current correction factor.

+  // Work out how big we would have expected the frame to be at this Q given

+  // the current correction factor.

   // Stay in double to avoid int overflow when values are large

-  projected_size_based_on_q =

-    (int)(((.5 + rate_correction_factor *

-            vp9_bits_per_mb(cpi->common.frame_type, Q)) *

-           cpi->common.MBs) / (1 << BPER_MB_NORMBITS));

+  projected_size_based_on_q = estimate_bits_at_q(cpi->common.frame_type, q,

+                                                 cpi->common.MBs,

+                                                 rate_correction_factor);

-  // Make some allowance for cpi->zbin_over_quant

-  if (cpi->zbin_over_quant > 0) {

-    int Z = cpi->zbin_over_quant;

-    double Factor = 0.99;

-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;

-    while (Z > 0) {

-      Z--;

-      projected_size_based_on_q =

-        (int)(Factor * projected_size_based_on_q);

-      Factor += factor_adjustment;

-      if (Factor  >= 0.999)

-        Factor = 0.999;

-    }

-  }

   // Work out a size correction factor.

   // if ( cpi->this_frame_target > 0 )

   //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;

@@ -505,7 +443,7 @@

   if (cpi->common.frame_type == KEY_FRAME)

     cpi->key_frame_rate_correction_factor = rate_correction_factor;

   else {

-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)

+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)

       cpi->gf_rate_correction_factor = rate_correction_factor;

     else

       cpi->rate_correction_factor = rate_correction_factor;

@@ -514,7 +452,7 @@

 int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {

-  int Q = cpi->active_worst_quality;

+  int q = cpi->active_worst_quality;

   int i;

   int last_error = INT_MAX;

@@ -522,14 +460,11 @@

   int bits_per_mb_at_this_q;

   double correction_factor;

-  // Reset Zbin OQ value

-  cpi->zbin_over_quant = 0;

   // Select the appropriate correction factor based upon type of frame.

   if (cpi->common.frame_type == KEY_FRAME)

     correction_factor = cpi->key_frame_rate_correction_factor;

   else {

-    if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)

+    if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)

       correction_factor = cpi->gf_rate_correction_factor;

     else

       correction_factor = cpi->rate_correction_factor;

@@ -544,61 +479,22 @@

   i = cpi->active_best_quality;

   do {

-    bits_per_mb_at_this_q =

-      (int)(.5 + correction_factor *

-            vp9_bits_per_mb(cpi->common.frame_type, i));

+    bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i,

+                                                 correction_factor);

     if (bits_per_mb_at_this_q <= target_bits_per_mb) {

       if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)

-        Q = i;

+        q = i;

       else

-        Q = i - 1;

+        q = i - 1;

       break;

-    } else

+    } else {

       last_error = bits_per_mb_at_this_q - target_bits_per_mb;

+    }

   } while (++i <= cpi->active_worst_quality);

-  // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like

-  // the RD multiplier and zero bin size.

-  if (Q >= MAXQ) {

-    int zbin_oqmax;

-    double Factor = 0.99;

-    double factor_adjustment = 0.01 / 256.0; // (double)ZBIN_OQ_MAX;

-    if (cpi->common.frame_type == KEY_FRAME)

-      zbin_oqmax = 0; // ZBIN_OQ_MAX/16

-    else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))

-      zbin_oqmax = 16;

-    else

-      zbin_oqmax = ZBIN_OQ_MAX;

-    // Each incrment in the zbin is assumed to have a fixed effect on bitrate. This is not of course true.

-    // The effect will be highly clip dependent and may well have sudden steps.

-    // The idea here is to acheive higher effective quantizers than the normal maximum by expanding the zero

-    // bin and hence decreasing the number of low magnitude non zero coefficients.

-    while (cpi->zbin_over_quant < zbin_oqmax) {

-      cpi->zbin_over_quant++;

-      if (cpi->zbin_over_quant > zbin_oqmax)

-        cpi->zbin_over_quant = zbin_oqmax;

-      // Adjust bits_per_mb_at_this_q estimate

-      bits_per_mb_at_this_q = (int)(Factor * bits_per_mb_at_this_q);

-      Factor += factor_adjustment;

-      if (Factor  >= 0.999)

-        Factor = 0.999;

-      if (bits_per_mb_at_this_q <= target_bits_per_mb)    // Break out if we get down to the target rate

-        break;

-    }

-  }

-  return Q;

+  return q;

@@ -643,7 +539,7 @@

       total_weight += prior_key_frame_weight[i];

-    av_key_frame_frequency  /= total_weight;

+    av_key_frame_frequency /= total_weight;

   return av_key_frame_frequency;

@@ -671,7 +567,7 @@

       *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;

       *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;

     } else {

-      if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) {

+      if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {

         *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;

         *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;

       } else {

--- a/vp9/encoder/vp9_ratectrl.h

+++ b/vp9/encoder/vp9_ratectrl.h

@@ -16,23 +16,24 @@

 #define FRAME_OVERHEAD_BITS 200

-extern void vp9_save_coding_context(VP9_COMP *cpi);

-extern void vp9_restore_coding_context(VP9_COMP *cpi);

+void vp9_save_coding_context(VP9_COMP *cpi);

+void vp9_restore_coding_context(VP9_COMP *cpi);

-extern void vp9_setup_key_frame(VP9_COMP *cpi);

-extern void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);

-extern int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);

-extern void vp9_adjust_key_frame_context(VP9_COMP *cpi);

-extern void vp9_compute_frame_size_bounds(VP9_COMP *cpi,

-                                          int *frame_under_shoot_limit,

-                                          int *frame_over_shoot_limit);

+void vp9_setup_key_frame(VP9_COMP *cpi);

+void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);

+int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);

+void vp9_adjust_key_frame_context(VP9_COMP *cpi);

+void vp9_compute_frame_size_bounds(VP9_COMP *cpi,

+                                   int *frame_under_shoot_limit,

+                                   int *frame_over_shoot_limit);

 // return of 0 means drop frame

-extern int vp9_pick_frame_size(VP9_COMP *cpi);

+int vp9_pick_frame_size(VP9_COMP *cpi);

-extern double vp9_convert_qindex_to_q(int qindex);

-extern int vp9_gfboost_qadjust(int qindex);

-extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex);

+double vp9_convert_qindex_to_q(int qindex);

+int vp9_gfboost_qadjust(int qindex);

+extern int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,

+                           double correction_factor);

 void vp9_setup_inter_frame(VP9_COMP *cpi);

 #endif  // VP9_ENCODER_VP9_RATECTRL_H_

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -23,7 +23,6 @@

 #include "vp9/common/vp9_entropymode.h"

 #include "vp9/common/vp9_reconinter.h"

 #include "vp9/common/vp9_reconintra.h"

-#include "vp9/common/vp9_reconintra4x4.h"

 #include "vp9/common/vp9_findnearmv.h"

 #include "vp9/common/vp9_quant_common.h"

 #include "vp9/encoder/vp9_encodemb.h"

@@ -151,21 +150,70 @@

 static void fill_token_costs(vp9_coeff_count *c,

                              vp9_coeff_probs *p,

                              int block_type_counts) {

-  int i, j, k;

+  int i, j, k, l;

   for (i = 0; i < block_type_counts; i++)

-    for (j = 0; j < COEF_BANDS; j++)

-      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {

-        if (k == 0 && ((j > 0 && i > 0) || (j > 1 && i == 0)))

-          vp9_cost_tokens_skip((int *)(c[i][j][k]),

-                               p[i][j][k],

+    for (j = 0; j < REF_TYPES; j++)

+      for (k = 0; k < COEF_BANDS; k++)

+        for (l = 0; l < PREV_COEF_CONTEXTS; l++) {

+          vp9_cost_tokens_skip((int *)(c[i][j][k][l]),

+                               p[i][j][k][l],

                                vp9_coef_tree);

-        else

-          vp9_cost_tokens((int *)(c[i][j][k]),

-                          p[i][j][k],

-                          vp9_coef_tree);

+        }

+}

+#if CONFIG_CODE_NONZEROCOUNT

+static void fill_nzc_costs(VP9_COMP *cpi, int block_size) {

+  int nzc_context, r, b, nzc, values;

+  int cost[16];

+  values = block_size * block_size + 1;

+  for (nzc_context = 0; nzc_context < MAX_NZC_CONTEXTS; ++nzc_context) {

+    for (r = 0; r < REF_TYPES; ++r) {

+      for (b = 0; b < BLOCK_TYPES; ++b) {

+        unsigned int *nzc_costs;

+        if (block_size == 4) {

+          vp9_cost_tokens(cost,

+                          cpi->common.fc.nzc_probs_4x4[nzc_context][r][b],

+                          vp9_nzc4x4_tree);

+          nzc_costs = cpi->mb.nzc_costs_4x4[nzc_context][r][b];

+        } else if (block_size == 8) {

+          vp9_cost_tokens(cost,

+                          cpi->common.fc.nzc_probs_8x8[nzc_context][r][b],

+                          vp9_nzc8x8_tree);

+          nzc_costs = cpi->mb.nzc_costs_8x8[nzc_context][r][b];

+        } else if (block_size == 16) {

+          vp9_cost_tokens(cost,

+                          cpi->common.fc.nzc_probs_16x16[nzc_context][r][b],

+                          vp9_nzc16x16_tree);

+          nzc_costs = cpi->mb.nzc_costs_16x16[nzc_context][r][b];

+        } else {

+          vp9_cost_tokens(cost,

+                          cpi->common.fc.nzc_probs_32x32[nzc_context][r][b],

+                          vp9_nzc32x32_tree);

+          nzc_costs = cpi->mb.nzc_costs_32x32[nzc_context][r][b];

+        }

+        for (nzc = 0; nzc < values; ++nzc) {

+          int e, c, totalcost = 0;

+          c = codenzc(nzc);

+          totalcost = cost[c];

+          if ((e = vp9_extranzcbits[c])) {

+            int x = nzc - vp9_basenzcvalue[c];

+            while (e--) {

+              totalcost += vp9_cost_bit(

+                  cpi->common.fc.nzc_pcat_probs[nzc_context]

+                                               [c - NZC_TOKENS_NOEXTRA][e],

+                  ((x >> e) & 1));

+            }

+          }

+          nzc_costs[nzc] = totalcost;

+        }

+    }

+  }

+#endif

 static int rd_iifactor[32] =  { 4, 4, 3, 2, 1, 0, 0, 0,

@@ -193,19 +241,17 @@

 static int compute_rd_mult(int qindex) {

-  int q;

-  q = vp9_dc_quant(qindex, 0);

-  return (11 * q * q) >> 6;

+  int q = vp9_dc_quant(qindex, 0);

+  return (11 * q * q) >> 2;

-void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex) {

-  cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];

-  cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];

+void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {

+  cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];

+  cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];

-void vp9_initialize_rd_consts(VP9_COMP *cpi, int QIndex) {

+void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {

   int q, i;

   vp9_clear_system_state();  // __asm emms;

@@ -214,40 +260,23 @@

   // for key frames, golden frames and arf frames.

   // if (cpi->common.refresh_golden_frame ||

   //     cpi->common.refresh_alt_ref_frame)

-  QIndex = (QIndex < 0) ? 0 : ((QIndex > MAXQ) ? MAXQ : QIndex);

+  qindex = (qindex < 0) ? 0 : ((qindex > MAXQ) ? MAXQ : qindex);

-  cpi->RDMULT = compute_rd_mult(QIndex);

-  // Extend rate multiplier along side quantizer zbin increases

-  if (cpi->zbin_over_quant  > 0) {

-    double oq_factor;

-    // Experimental code using the same basic equation as used for Q above

-    // The units of cpi->zbin_over_quant are 1/128 of Q bin size

-    oq_factor = 1.0 + ((double)0.0015625 * cpi->zbin_over_quant);

-    cpi->RDMULT = (int)((double)cpi->RDMULT * oq_factor * oq_factor);

-  }

+  cpi->RDMULT = compute_rd_mult(qindex);

   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {

     if (cpi->twopass.next_iiratio > 31)

       cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;

     else

       cpi->RDMULT +=

-        (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;

+          (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;

-  if (cpi->RDMULT < 7)

-    cpi->RDMULT = 7;

-  cpi->mb.errorperbit = (cpi->RDMULT / 110);

+  cpi->mb.errorperbit = cpi->RDMULT >> 6;

   cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);

   vp9_set_speed_features(cpi);

-  q = (int)pow(vp9_dc_quant(QIndex, 0) >> 2, 1.25);

-  q = q << 2;

-  cpi->RDMULT = cpi->RDMULT << 4;

+  q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);

+  q <<= 2;

   if (q < 8)

     q = 8;

@@ -279,22 +308,19 @@

   fill_token_costs(cpi->mb.token_costs[TX_4X4],

-                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES_4X4);

-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_4X4],

-                   cpi->common.fc.hybrid_coef_probs_4x4, BLOCK_TYPES_4X4);

+                   cpi->common.fc.coef_probs_4x4, BLOCK_TYPES);

   fill_token_costs(cpi->mb.token_costs[TX_8X8],

-                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES_8X8);

-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_8X8],

-                   cpi->common.fc.hybrid_coef_probs_8x8, BLOCK_TYPES_8X8);

+                   cpi->common.fc.coef_probs_8x8, BLOCK_TYPES);

   fill_token_costs(cpi->mb.token_costs[TX_16X16],

-                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES_16X16);

-  fill_token_costs(cpi->mb.hybrid_token_costs[TX_16X16],

-                   cpi->common.fc.hybrid_coef_probs_16x16, BLOCK_TYPES_16X16);

+                   cpi->common.fc.coef_probs_16x16, BLOCK_TYPES);

   fill_token_costs(cpi->mb.token_costs[TX_32X32],

-                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES_32X32);

+                   cpi->common.fc.coef_probs_32x32, BLOCK_TYPES);

+#if CONFIG_CODE_NONZEROCOUNT

+  fill_nzc_costs(cpi, 4);

+  fill_nzc_costs(cpi, 8);

+  fill_nzc_costs(cpi, 16);

+  fill_nzc_costs(cpi, 32);

+#endif

   /*rough estimate for costing*/

   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;

@@ -321,36 +347,17 @@

   return error;

-int vp9_mbblock_error_8x8_c(MACROBLOCK *mb, int dc) {

+int vp9_mbblock_error_c(MACROBLOCK *mb) {

   BLOCK  *be;

   BLOCKD *bd;

   int i, j;

   int berror, error = 0;

-  for (i = 0; i < 16; i+=4) {

-    be = &mb->block[i];

-    bd = &mb->e_mbd.block[i];

-    berror = 0;

-    for (j = dc; j < 64; j++) {

-      int this_diff = be->coeff[j] - bd->dqcoeff[j];

-      berror += this_diff * this_diff;

-    }

-    error += berror;

-  }

-  return error;

-}

-int vp9_mbblock_error_c(MACROBLOCK *mb, int dc) {

-  BLOCK  *be;

-  BLOCKD *bd;

-  int i, j;

-  int berror, error = 0;

   for (i = 0; i < 16; i++) {

     be = &mb->block[i];

     bd = &mb->e_mbd.block[i];

     berror = 0;

-    for (j = dc; j < 16; j++) {

+    for (j = 0; j < 16; j++) {

       int this_diff = be->coeff[j] - bd->dqcoeff[j];

       berror += this_diff * this_diff;

@@ -417,75 +424,143 @@

     sse2 += sse1;

   return sse2;

-#if CONFIG_NEWCOEFCONTEXT

-#define PT pn

-#else

-#define PT pt

-#endif

-static int cost_coeffs(MACROBLOCK *mb,

-                       BLOCKD *b, PLANE_TYPE type,

-                       ENTROPY_CONTEXT *a,

-                       ENTROPY_CONTEXT *l,

-                       TX_SIZE tx_size) {

+static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,

+                              int ib, PLANE_TYPE type,

+                              ENTROPY_CONTEXT *a,

+                              ENTROPY_CONTEXT *l,

+                              TX_SIZE tx_size) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

   int pt;

-  const int eob = b->eob;

-  MACROBLOCKD *xd = &mb->e_mbd;

-  const int ib = (int)(b - xd->block);

-  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;

-  int cost = 0, seg_eob;

+  const int eob = xd->eobs[ib];

+  int c = 0;

+  int cost = 0, pad;

+  const int *scan, *nb;

+  const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16;

+  const int ref = mbmi->ref_frame != INTRA_FRAME;

+  unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =

+      mb->token_costs[tx_size][type][ref];

+  ENTROPY_CONTEXT a_ec, l_ec;

+  ENTROPY_CONTEXT *const a1 = a +

+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);

+  ENTROPY_CONTEXT *const l1 = l +

+      sizeof(ENTROPY_CONTEXT_PLANES)/sizeof(ENTROPY_CONTEXT);

+#if CONFIG_CODE_NONZEROCOUNT

+  int nzc_context = vp9_get_nzc_context(cm, xd, ib);

+  unsigned int *nzc_cost;

+#else

   const int segment_id = xd->mode_info_context->mbmi.segment_id;

-  const int *scan, *band;

-  int16_t *qcoeff_ptr = b->qcoeff;

-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

-                          get_tx_type(xd, b) : DCT_DCT;

-#if CONFIG_NEWCOEFCONTEXT

-  const int *neighbors;

-  int pn;

+  vp9_prob (*coef_probs)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]

+                        [ENTROPY_NODES];

 #endif

+  int seg_eob, default_eob;

+  uint8_t token_cache[1024];

-  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;

+  // Check for consistency of tx_size with mode info

+  if (type == PLANE_TYPE_Y_WITH_DC) {

+    assert(xd->mode_info_context->mbmi.txfm_size == tx_size);

+  } else {

+    TX_SIZE tx_size_uv = get_uv_tx_size(xd);

+    assert(tx_size == tx_size_uv);

+  }

   switch (tx_size) {

-    case TX_4X4:

-      scan = vp9_default_zig_zag1d_4x4;

-      band = vp9_coef_bands_4x4;

+    case TX_4X4: {

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_4x4(xd, ib) : DCT_DCT;

+      a_ec = *a;

+      l_ec = *l;

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type];

+#else

+      coef_probs = cm->fc.coef_probs_4x4;

+#endif

       seg_eob = 16;

-      if (type == PLANE_TYPE_Y_WITH_DC) {

-        if (tx_type == ADST_DCT) {

-          scan = vp9_row_scan_4x4;

-        } else if (tx_type == DCT_ADST) {

-          scan = vp9_col_scan_4x4;

-        }

+      if (tx_type == ADST_DCT) {

+        scan = vp9_row_scan_4x4;

+      } else if (tx_type == DCT_ADST) {

+        scan = vp9_col_scan_4x4;

+      } else {

+        scan = vp9_default_zig_zag1d_4x4;

       break;

-    case TX_8X8:

-      if (type == PLANE_TYPE_Y2) {

-        scan = vp9_default_zig_zag1d_4x4;

-        band = vp9_coef_bands_4x4;

-        seg_eob = 4;

+    }

+    case TX_8X8: {

+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;

+      a_ec = (a[0] + a[1]) != 0;

+      l_ec = (l[0] + l[1]) != 0;

+      if (tx_type == ADST_DCT) {

+        scan = vp9_row_scan_8x8;

+      } else if (tx_type == DCT_ADST) {

+        scan = vp9_col_scan_8x8;

       } else {

         scan = vp9_default_zig_zag1d_8x8;

-        band = vp9_coef_bands_8x8;

-        seg_eob = 64;

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_8x8[nzc_context][ref][type];

+#else

+      coef_probs = cm->fc.coef_probs_8x8;

+#endif

+      seg_eob = 64;

       break;

-    case TX_16X16:

-      scan = vp9_default_zig_zag1d_16x16;

-      band = vp9_coef_bands_16x16;

+    }

+    case TX_16X16: {

+      const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type;

+      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;

+      if (tx_type == ADST_DCT) {

+        scan = vp9_row_scan_16x16;

+      } else if (tx_type == DCT_ADST) {

+        scan = vp9_col_scan_16x16;

+      } else {

+        scan = vp9_default_zig_zag1d_16x16;

+      }

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_16x16[nzc_context][ref][type];

+#else

+      coef_probs = cm->fc.coef_probs_16x16;

+#endif

       seg_eob = 256;

       if (type == PLANE_TYPE_UV) {

-        const int uv_idx = ib - 16;

-        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 64 * uv_idx;

+        a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;

+        l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;

+      } else {

+        a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;

+        l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;

       break;

+    }

     case TX_32X32:

       scan = vp9_default_zig_zag1d_32x32;

-      band = vp9_coef_bands_32x32;

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc_cost = mb->nzc_costs_32x32[nzc_context][ref][type];

+#else

+      coef_probs = cm->fc.coef_probs_32x32;

+#endif

       seg_eob = 1024;

-      qcoeff_ptr = xd->sb_coeff_data.qcoeff;

+      if (type == PLANE_TYPE_UV) {

+        ENTROPY_CONTEXT *a2, *a3, *l2, *l3;

+        a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+        a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+        l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+        l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+        a_ec = (a[0] + a[1] + a1[0] + a1[1] +

+                a2[0] + a2[1] + a3[0] + a3[1]) != 0;

+        l_ec = (l[0] + l[1] + l1[0] + l1[1] +

+                l2[0] + l2[1] + l3[0] + l3[1]) != 0;

+      } else {

+        a_ec = (a[0] + a[1] + a[2] + a[3] +

+                a1[0] + a1[1] + a1[2] + a1[3]) != 0;

+        l_ec = (l[0] + l[1] + l[2] + l[3] +

+                l1[0] + l1[1] + l1[2] + l1[3]) != 0;

+      }

       break;

     default:

       abort();

@@ -493,202 +568,152 @@

   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);

-#if CONFIG_NEWCOEFCONTEXT

-  neighbors = vp9_get_coef_neighbors_handle(scan);

-  pn = pt;

+  nb = vp9_get_coef_neighbors_handle(scan, &pad);

+  default_eob = seg_eob;

+#if CONFIG_CODE_NONZEROCOUNT == 0

+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))

+    seg_eob = 0;

 #endif

-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))

-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

-  if (tx_type != DCT_DCT) {

-    for (; c < eob; c++) {

-      int v = qcoeff_ptr[scan[c]];

-      int t = vp9_dct_value_tokens_ptr[v].Token;

-      cost += mb->hybrid_token_costs[tx_size][type][band[c]][PT][t];

-      cost += vp9_dct_value_cost_ptr[v];

-      pt = vp9_prev_token_class[t];

-#if CONFIG_NEWCOEFCONTEXT

-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))

-        pn = vp9_get_coef_neighbor_context(

-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);

-      else

-        pn = pt;

+  {

+#if CONFIG_CODE_NONZEROCOUNT

+    int nzc = 0;

 #endif

-    }

-    if (c < seg_eob)

-      cost += mb->hybrid_token_costs[tx_size][type][band[c]]

-          [PT][DCT_EOB_TOKEN];

-  } else {

     for (; c < eob; c++) {

       int v = qcoeff_ptr[scan[c]];

       int t = vp9_dct_value_tokens_ptr[v].Token;

-      cost += mb->token_costs[tx_size][type][band[c]][pt][t];

+#if CONFIG_CODE_NONZEROCOUNT

+      nzc += (v != 0);

+#endif

+      token_cache[c] = t;

+      cost += token_costs[get_coef_band(scan, tx_size, c)][pt][t];

       cost += vp9_dct_value_cost_ptr[v];

-      pt = vp9_prev_token_class[t];

-#if CONFIG_NEWCOEFCONTEXT

-      if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(band[c + 1]))

-        pn = vp9_get_coef_neighbor_context(

-            qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);

-      else

-        pn = pt;

+#if !CONFIG_CODE_NONZEROCOUNT

+      if (!c || token_cache[c - 1])

+        cost += vp9_cost_bit(coef_probs[type][ref]

+                                       [get_coef_band(scan, tx_size, c)]

+                                       [pt][0], 1);

 #endif

+      pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);

+#if CONFIG_CODE_NONZEROCOUNT

+    cost += nzc_cost[nzc];

+#else

     if (c < seg_eob)

-      cost += mb->token_costs[tx_size][type][band[c]]

-          [PT][DCT_EOB_TOKEN];

+      cost += mb->token_costs[tx_size][type][ref]

+                             [get_coef_band(scan, tx_size, c)]

+                             [pt][DCT_EOB_TOKEN];

+#endif

   // is eob first coefficient;

-  pt = (c > !type);

+  pt = (c > 0);

   *a = *l = pt;

+  if (tx_size >= TX_8X8) {

+    a[1] = l[1] = pt;

+    if (tx_size >= TX_16X16) {

+      if (type == PLANE_TYPE_UV) {

+        a1[0] = a1[1] = l1[0] = l1[1] = pt;

+      } else {

+        a[2] = a[3] = l[2] = l[3] = pt;

+        if (tx_size >= TX_32X32) {

+          a1[0] = a1[1] = a1[2] = a1[3] = pt;

+          l1[0] = l1[1] = l1[2] = l1[3] = pt;

+        }

+      }

+    }

+  }

   return cost;

-static int rdcost_mby_4x4(MACROBLOCK *mb, int has_2nd_order, int backup) {

+static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb) {

   int cost = 0;

   int b;

   MACROBLOCKD *xd = &mb->e_mbd;

   ENTROPY_CONTEXT_PLANES t_above, t_left;

-  ENTROPY_CONTEXT *ta;

-  ENTROPY_CONTEXT *tl;

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;

-  if (backup) {

-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));

-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));

-    ta = (ENTROPY_CONTEXT *)&t_above;

-    tl = (ENTROPY_CONTEXT *)&t_left;

-  } else {

-    ta = (ENTROPY_CONTEXT *)xd->above_context;

-    tl = (ENTROPY_CONTEXT *)xd->left_context;

-  }

   for (b = 0; b < 16; b++)

-    cost += cost_coeffs(mb, xd->block + b,

-                        (has_2nd_order ?

-                         PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),

+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,

                         ta + vp9_block2above[TX_4X4][b],

                         tl + vp9_block2left[TX_4X4][b],

                         TX_4X4);

-  if (has_2nd_order)

-    cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,

-                        ta + vp9_block2above[TX_4X4][24],

-                        tl + vp9_block2left[TX_4X4][24],

-                        TX_4X4);

   return cost;

-static void macro_block_yrd_4x4(MACROBLOCK *mb,

-                                int *Rate,

-                                int *Distortion,

-                                int *skippable, int backup) {

+static void macro_block_yrd_4x4(VP9_COMMON *const cm,

+                                MACROBLOCK *mb,

+                                int *rate,

+                                int *distortion,

+                                int *skippable) {

   MACROBLOCKD *const xd = &mb->e_mbd;

-  BLOCK   *const mb_y2 = mb->block + 24;

-  BLOCKD *const x_y2  = xd->block + 24;

-  int d, has_2nd_order;

   xd->mode_info_context->mbmi.txfm_size = TX_4X4;

-  has_2nd_order = get_2nd_order_usage(xd);

-  // Fdct and building the 2nd order block

   vp9_transform_mby_4x4(mb);

   vp9_quantize_mby_4x4(mb);

-  d = vp9_mbblock_error(mb, has_2nd_order);

-  if (has_2nd_order)

-    d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);

-  *Distortion = (d >> 2);

-  // rate

-  *Rate = rdcost_mby_4x4(mb, has_2nd_order, backup);

-  *skippable = vp9_mby_is_skippable_4x4(&mb->e_mbd, has_2nd_order);

+  *distortion = vp9_mbblock_error(mb) >> 2;

+  *rate = rdcost_mby_4x4(cm, mb);

+  *skippable = vp9_mby_is_skippable_4x4(xd);

-static int rdcost_mby_8x8(MACROBLOCK *mb, int has_2nd_order, int backup) {

+static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb) {

   int cost = 0;

   int b;

   MACROBLOCKD *xd = &mb->e_mbd;

   ENTROPY_CONTEXT_PLANES t_above, t_left;

-  ENTROPY_CONTEXT *ta;

-  ENTROPY_CONTEXT *tl;

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;

-  if (backup) {

-    vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));

-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context, sizeof(t_left));

-    ta = (ENTROPY_CONTEXT *)&t_above;

-    tl = (ENTROPY_CONTEXT *)&t_left;

-  } else {

-    ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context;

-    tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context;

-  }

   for (b = 0; b < 16; b += 4)

-    cost += cost_coeffs(mb, xd->block + b,

-                        (has_2nd_order ?

-                         PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC),

+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC,

                         ta + vp9_block2above[TX_8X8][b],

                         tl + vp9_block2left[TX_8X8][b],

                         TX_8X8);

-  if (has_2nd_order)

-    cost += cost_coeffs(mb, xd->block + 24, PLANE_TYPE_Y2,

-                            ta + vp9_block2above[TX_8X8][24],

-                            tl + vp9_block2left[TX_8X8][24],

-                            TX_8X8);

   return cost;

-static void macro_block_yrd_8x8(MACROBLOCK *mb,

-                                int *Rate,

-                                int *Distortion,

-                                int *skippable, int backup) {

+static void macro_block_yrd_8x8(VP9_COMMON *const cm,

+                                MACROBLOCK *mb,

+                                int *rate,

+                                int *distortion,

+                                int *skippable) {

   MACROBLOCKD *const xd = &mb->e_mbd;

-  BLOCK   *const mb_y2 = mb->block + 24;

-  BLOCKD *const x_y2  = xd->block + 24;

-  int d, has_2nd_order;

   xd->mode_info_context->mbmi.txfm_size = TX_8X8;

   vp9_transform_mby_8x8(mb);

   vp9_quantize_mby_8x8(mb);

-  has_2nd_order = get_2nd_order_usage(xd);

-  d = vp9_mbblock_error_8x8_c(mb, has_2nd_order);

-  if (has_2nd_order)

-    d += vp9_block_error(mb_y2->coeff, x_y2->dqcoeff, 16);

-  *Distortion = (d >> 2);

-  // rate

-  *Rate = rdcost_mby_8x8(mb, has_2nd_order, backup);

-  *skippable = vp9_mby_is_skippable_8x8(&mb->e_mbd, has_2nd_order);

+  *distortion = vp9_mbblock_error(mb) >> 2;

+  *rate = rdcost_mby_8x8(cm, mb);

+  *skippable = vp9_mby_is_skippable_8x8(xd);

-static int rdcost_mby_16x16(MACROBLOCK *mb, int backup) {

-  int cost;

-  MACROBLOCKD *xd = &mb->e_mbd;

+static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

   ENTROPY_CONTEXT_PLANES t_above, t_left;

-  ENTROPY_CONTEXT *ta, *tl;

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left;

-  if (backup) {

-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));

-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left, xd->left_context, sizeof(t_left));

-    ta = (ENTROPY_CONTEXT *)&t_above;

-    tl = (ENTROPY_CONTEXT *)&t_left;

-  } else {

-    ta = (ENTROPY_CONTEXT *)xd->above_context;

-    tl = (ENTROPY_CONTEXT *)xd->left_context;

-  }

-  cost = cost_coeffs(mb, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);

-  return cost;

+  return cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16);

-static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,

-                                  int *skippable, int backup) {

-  int d;

-  MACROBLOCKD *xd = &mb->e_mbd;

+static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb,

+                                  int *rate, int *distortion, int *skippable) {

+  MACROBLOCKD *const xd = &mb->e_mbd;

   xd->mode_info_context->mbmi.txfm_size = TX_16X16;

   vp9_transform_mby_16x16(mb);

@@ -696,15 +721,13 @@

   // TODO(jingning) is it possible to quickly determine whether to force

   //                trailing coefficients to be zero, instead of running trellis

   //                optimization in the rate-distortion optimization loop?

-  if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)

-    vp9_optimize_mby_16x16(mb);

+  if (mb->optimize &&

+      xd->mode_info_context->mbmi.mode < I8X8_PRED)

+    vp9_optimize_mby_16x16(cm, mb);

-  d = vp9_mbblock_error(mb, 0);

-  *Distortion = (d >> 2);

-  // rate

-  *Rate = rdcost_mby_16x16(mb, backup);

-  *skippable = vp9_mby_is_skippable_16x16(&mb->e_mbd);

+  *distortion = vp9_mbblock_error(mb) >> 2;

+  *rate = rdcost_mby_16x16(cm, mb);

+  *skippable = vp9_mby_is_skippable_16x16(xd);

 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,

@@ -795,6 +818,7 @@

 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,

                             int *distortion, int *skippable,

                             int64_t txfm_cache[NB_TXFM_MODES]) {

+  VP9_COMMON *const cm = &cpi->common;

   MACROBLOCKD *const xd = &x->e_mbd;

   int r[TX_SIZE_MAX_MB][2], d[TX_SIZE_MAX_MB], s[TX_SIZE_MAX_MB];

@@ -801,9 +825,9 @@

   vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor,

                    x->block[0].src_stride);

-  macro_block_yrd_16x16(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1);

-  macro_block_yrd_8x8(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1);

-  macro_block_yrd_4x4(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1);

+  macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);

+  macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);

+  macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);

   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,

                            txfm_cache, TX_16X16);

@@ -818,27 +842,8 @@

   d[12] = p[12];

-static int rdcost_sby_32x32(MACROBLOCK *x, int backup) {

-  MACROBLOCKD * const xd = &x->e_mbd;

-  ENTROPY_CONTEXT_PLANES t_above, t_left;

-  ENTROPY_CONTEXT *ta, *tl;

-  if (backup) {

-    ta = (ENTROPY_CONTEXT *) &t_above,

-    tl = (ENTROPY_CONTEXT *) &t_left;

-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));

-    vpx_memcpy(&t_left,  xd->left_context,  sizeof(ENTROPY_CONTEXT_PLANES));

-  } else {

-    ta = (ENTROPY_CONTEXT *) xd->above_context;

-    tl = (ENTROPY_CONTEXT *) xd->left_context;

-  }

-  return cost_coeffs(x, xd->block, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);

-}

 static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff,

-                                int block_size) {

+                                int block_size, int shift) {

   int i;

   int64_t error = 0;

@@ -846,38 +851,127 @@

     unsigned int this_diff = coeff[i] - dqcoeff[i];

     error += this_diff * this_diff;

+  error >>= shift;

   return error > INT_MAX ? INT_MAX : (int)error;

-#define DEBUG_ERROR 0

-static void super_block_yrd_32x32(MACROBLOCK *x,

-                                  int *rate, int *distortion, int *skippable,

-                                  int backup) {

-  SUPERBLOCK  * const x_sb = &x->sb_coeff_data;

+static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

+  int cost = 0, b;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

+  for (b = 0; b < 64; b++)

+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,

+                        ta + vp9_block2above_sb[TX_4X4][b],

+                        tl + vp9_block2left_sb[TX_4X4][b], TX_4X4);

+  return cost;

+}

+static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,

+                                int *rate, int *distortion, int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;

+  vp9_transform_sby_4x4(x);

+  vp9_quantize_sby_4x4(x);

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);

+  *rate       = rdcost_sby_4x4(cm, x);

+  *skippable  = vp9_sby_is_skippable_4x4(xd);

+}

+static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

+  int cost = 0, b;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

+  for (b = 0; b < 64; b += 4)

+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,

+                        ta + vp9_block2above_sb[TX_8X8][b],

+                        tl + vp9_block2left_sb[TX_8X8][b], TX_8X8);

+  return cost;

+}

+static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,

+                                int *rate, int *distortion, int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;

+  vp9_transform_sby_8x8(x);

+  vp9_quantize_sby_8x8(x);

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);

+  *rate       = rdcost_sby_8x8(cm, x);

+  *skippable  = vp9_sby_is_skippable_8x8(xd);

+}

+static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

+  int cost = 0, b;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

+  for (b = 0; b < 64; b += 16)

+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,

+                        ta + vp9_block2above_sb[TX_16X16][b],

+                        tl + vp9_block2left_sb[TX_16X16][b], TX_16X16);

+  return cost;

+}

+static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,

+                                  int *rate, int *distortion, int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  xd->mode_info_context->mbmi.txfm_size = TX_16X16;

+  vp9_transform_sby_16x16(x);

+  vp9_quantize_sby_16x16(x);

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2);

+  *rate       = rdcost_sby_16x16(cm, x);

+  *skippable  = vp9_sby_is_skippable_16x16(xd);

+}

+static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {

   MACROBLOCKD * const xd = &x->e_mbd;

-  SUPERBLOCKD * const xd_sb = &xd->sb_coeff_data;

-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID

-  int16_t out[1024];

-#endif

+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

+  return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32);

+}

+static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,

+                                  int *rate, int *distortion, int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  xd->mode_info_context->mbmi.txfm_size = TX_32X32;

   vp9_transform_sby_32x32(x);

   vp9_quantize_sby_32x32(x);

-#if DEBUG_ERROR || CONFIG_DWTDCTHYBRID

-  vp9_short_idct32x32(xd_sb->dqcoeff, out, 64);

-#endif

-#if !CONFIG_DWTDCTHYBRID

-  *distortion = vp9_sb_block_error_c(x_sb->coeff, xd_sb->dqcoeff, 1024);

-#else

-  *distortion = vp9_block_error_c(x_sb->src_diff, out, 1024) << 4;

-#endif

-#if DEBUG_ERROR

-  printf("IDCT/FDCT error 32x32: %d (d: %d)\n",

-         vp9_block_error_c(x_sb->src_diff, out, 1024), *distortion);

-#endif

-  *rate       = rdcost_sby_32x32(x, backup);

-  *skippable  = vp9_sby_is_skippable_32x32(&x->e_mbd);

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 0);

+  *rate       = rdcost_sby_32x32(cm, x);

+  *skippable  = vp9_sby_is_skippable_32x32(xd);

 static void super_block_yrd(VP9_COMP *cpi,

@@ -884,179 +978,166 @@

                             MACROBLOCK *x, int *rate, int *distortion,

                             int *skip,

                             int64_t txfm_cache[NB_TXFM_MODES]) {

+  VP9_COMMON *const cm = &cpi->common;

   MACROBLOCKD *const xd = &x->e_mbd;

-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;

+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];

   const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;

   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;

-  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2],

-                        *orig_above = xd->above_context;

-  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2],

-                        *orig_left = xd->left_context;

-  for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) {

-    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));

-    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));

-    r[n][0] = 0;

-    d[n] = 0;

-    s[n] = 1;

-  }

+  vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);

+  super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);

+  super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);

+  super_block_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);

+  super_block_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);

-  vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff, src, src_y_stride,

-                       dst, dst_y_stride);

-  super_block_yrd_32x32(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1);

+  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,

+                           TX_SIZE_MAX_SB - 1);

+}

-#if DEBUG_ERROR

-  int err[3] = { 0, 0, 0 };

-#endif

-  for (n = 0; n < 4; n++) {

-    int x_idx = n & 1, y_idx = n >> 1;

-    int r_tmp, d_tmp, s_tmp;

+static int rdcost_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) {

+  int cost = 0, b;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

-    vp9_subtract_mby_s_c(x->src_diff,

-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,

-                         src_y_stride,

-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,

-                         dst_y_stride);

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

-    xd->above_context = &t_above[TX_16X16][x_idx];

-    xd->left_context = &t_left[TX_16X16][y_idx];

-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);

-    d[TX_16X16] += d_tmp;

-    r[TX_16X16][0] += r_tmp;

-    s[TX_16X16] = s[TX_16X16] && s_tmp;

-#if DEBUG_ERROR

-    vp9_inverse_transform_mby_16x16(xd);

-    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);

-#endif

+  for (b = 0; b < 256; b++)

+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,

+                        ta + vp9_block2above_sb64[TX_4X4][b],

+                        tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4);

-    xd->above_context = &t_above[TX_4X4][x_idx];

-    xd->left_context = &t_left[TX_4X4][y_idx];

-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);

-    d[TX_4X4] += d_tmp;

-    r[TX_4X4][0] += r_tmp;

-    s[TX_4X4] = s[TX_4X4] && s_tmp;

-#if DEBUG_ERROR

-    vp9_inverse_transform_mby_4x4(xd);

-    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);

-#endif

+  return cost;

+}

-    xd->above_context = &t_above[TX_8X8][x_idx];

-    xd->left_context = &t_left[TX_8X8][y_idx];

-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);

-    d[TX_8X8] += d_tmp;

-    r[TX_8X8][0] += r_tmp;

-    s[TX_8X8] = s[TX_8X8] && s_tmp;

-#if DEBUG_ERROR

-    vp9_inverse_transform_mby_8x8(xd);

-    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);

-#endif

-  }

-#if DEBUG_ERROR

-  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);

-  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);

-  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);

-#endif

-  choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,

-                           TX_SIZE_MAX_SB - 1);

+static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x,

+                                  int *rate, int *distortion, int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

-  xd->above_context = orig_above;

-  xd->left_context = orig_left;

+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;

+  vp9_transform_sb64y_4x4(x);

+  vp9_quantize_sb64y_4x4(x);

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);

+  *rate       = rdcost_sb64y_4x4(cm, x);

+  *skippable  = vp9_sb64y_is_skippable_4x4(xd);

-static void super_block_64_yrd(VP9_COMP *cpi,

-                               MACROBLOCK *x, int *rate, int *distortion,

-                               int *skip,

-                               int64_t txfm_cache[NB_TXFM_MODES]) {

+static int rdcost_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) {

+  int cost = 0, b;

   MACROBLOCKD *const xd = &x->e_mbd;

-  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n;

-  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;

-  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;

-  ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_SB][4],

-                        *orig_above = xd->above_context;

-  ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_SB][4],

-                        *orig_left = xd->left_context;

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

-  for (n = TX_4X4; n < TX_SIZE_MAX_SB; n++) {

-    vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n]));

-    vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n]));

-    r[n][0] = 0;

-    d[n] = 0;

-    s[n] = 1;

-  }

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

-  for (n = 0; n < 4; n++) {

-    int x_idx = n & 1, y_idx = n >> 1;

-    int r_tmp, d_tmp, s_tmp;

+  for (b = 0; b < 256; b += 4)

+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,

+                        ta + vp9_block2above_sb64[TX_8X8][b],

+                        tl + vp9_block2left_sb64[TX_8X8][b], TX_8X8);

-    xd->above_context = &t_above[TX_32X32][x_idx << 1];

-    xd->left_context = &t_left[TX_32X32][y_idx << 1];

-    vp9_subtract_sby_s_c(x->sb_coeff_data.src_diff,

-                         src + 32 * x_idx + 32 * y_idx * src_y_stride,

-                         src_y_stride,

-                         dst + 32 * x_idx + 32 * y_idx * dst_y_stride,

-                         dst_y_stride);

-    super_block_yrd_32x32(x, &r_tmp, &d_tmp, &s_tmp, 0);

-    r[TX_32X32][0] += r_tmp;

-    d[TX_32X32] += d_tmp;

-    s[TX_32X32] = s[TX_32X32] && s_tmp;

-  }

+  return cost;

+}

-#if DEBUG_ERROR

-  int err[3] = { 0, 0, 0 };

-#endif

-  for (n = 0; n < 16; n++) {

-    int x_idx = n & 3, y_idx = n >> 2;

-    int r_tmp, d_tmp, s_tmp;

+static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x,

+                                  int *rate, int *distortion, int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

-    vp9_subtract_mby_s_c(x->src_diff,

-                         src + x_idx * 16 + y_idx * 16 * src_y_stride,

-                         src_y_stride,

-                         dst + x_idx * 16 + y_idx * 16 * dst_y_stride,

-                         dst_y_stride);

+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;

+  vp9_transform_sb64y_8x8(x);

+  vp9_quantize_sb64y_8x8(x);

-    xd->above_context = &t_above[TX_16X16][x_idx];

-    xd->left_context = &t_left[TX_16X16][y_idx];

-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);

-    d[TX_16X16] += d_tmp;

-    r[TX_16X16][0] += r_tmp;

-    s[TX_16X16] = s[TX_16X16] && s_tmp;

-#if DEBUG_ERROR

-    vp9_inverse_transform_mby_16x16(xd);

-    err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256);

-#endif

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);

+  *rate       = rdcost_sb64y_8x8(cm, x);

+  *skippable  = vp9_sb64y_is_skippable_8x8(xd);

+}

-    xd->above_context = &t_above[TX_4X4][x_idx];

-    xd->left_context = &t_left[TX_4X4][y_idx];

-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);

-    d[TX_4X4] += d_tmp;

-    r[TX_4X4][0] += r_tmp;

-    s[TX_4X4] = s[TX_4X4] && s_tmp;

-#if DEBUG_ERROR

-    vp9_inverse_transform_mby_4x4(xd);

-    err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256);

-#endif

+static int rdcost_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) {

+  int cost = 0, b;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

-    xd->above_context = &t_above[TX_8X8][x_idx];

-    xd->left_context = &t_left[TX_8X8][y_idx];

-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);

-    d[TX_8X8] += d_tmp;

-    r[TX_8X8][0] += r_tmp;

-    s[TX_8X8] = s[TX_8X8] && s_tmp;

-#if DEBUG_ERROR

-    vp9_inverse_transform_mby_8x8(xd);

-    err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256);

-#endif

-  }

-#if DEBUG_ERROR

-  printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]);

-  printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]);

-  printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]);

-#endif

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

+  for (b = 0; b < 256; b += 16)

+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,

+                        ta + vp9_block2above_sb64[TX_16X16][b],

+                        tl + vp9_block2left_sb64[TX_16X16][b], TX_16X16);

+  return cost;

+}

+static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x,

+                                    int *rate, int *distortion,

+                                    int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  xd->mode_info_context->mbmi.txfm_size = TX_16X16;

+  vp9_transform_sb64y_16x16(x);

+  vp9_quantize_sb64y_16x16(x);

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2);

+  *rate       = rdcost_sb64y_16x16(cm, x);

+  *skippable  = vp9_sb64y_is_skippable_16x16(xd);

+}

+static int rdcost_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) {

+  int cost = 0, b;

+  MACROBLOCKD * const xd = &x->e_mbd;

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above;

+  ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left;

+  vpx_memcpy(&t_above, xd->above_context, sizeof(t_above));

+  vpx_memcpy(&t_left,  xd->left_context,  sizeof(t_left));

+  for (b = 0; b < 256; b += 64)

+    cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC,

+                        ta + vp9_block2above_sb64[TX_32X32][b],

+                        tl + vp9_block2left_sb64[TX_32X32][b], TX_32X32);

+  return cost;

+}

+static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x,

+                                    int *rate, int *distortion,

+                                    int *skippable) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  xd->mode_info_context->mbmi.txfm_size = TX_32X32;

+  vp9_transform_sb64y_32x32(x);

+  vp9_quantize_sb64y_32x32(x);

+  *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 0);

+  *rate       = rdcost_sb64y_32x32(cm, x);

+  *skippable  = vp9_sb64y_is_skippable_32x32(xd);

+}

+static void super_block_64_yrd(VP9_COMP *cpi,

+                               MACROBLOCK *x, int *rate, int *distortion,

+                               int *skip,

+                               int64_t txfm_cache[NB_TXFM_MODES]) {

+  VP9_COMMON *const cm = &cpi->common;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];

+  const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;

+  int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;

+  vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride);

+  super_block64_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);

+  super_block64_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);

+  super_block64_yrd_8x8(cm, x,   &r[TX_8X8][0],   &d[TX_8X8],   &s[TX_8X8]);

+  super_block64_yrd_4x4(cm, x,   &r[TX_4X4][0],   &d[TX_4X4],   &s[TX_4X4]);

   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache,

                            TX_SIZE_MAX_SB - 1);

-  xd->above_context = orig_above;

-  xd->left_context = orig_left;

 static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) {

@@ -1091,6 +1172,7 @@

   int64_t best_rd = INT64_MAX;

   int rate = 0;

   int distortion;

+  VP9_COMMON *const cm = &cpi->common;

   ENTROPY_CONTEXT ta = *a, tempa = *a;

   ENTROPY_CONTEXT tl = *l, templ = *l;

@@ -1105,8 +1187,9 @@

   DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16);

 #if CONFIG_NEWBINTRAMODES

-  b->bmi.as_mode.context = vp9_find_bpred_context(b);

+  b->bmi.as_mode.context = vp9_find_bpred_context(xd, b);

 #endif

+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;

   for (mode = B_DC_PRED; mode < LEFT4X4; mode++) {

     int64_t this_rd;

     int ratey;

@@ -1129,23 +1212,24 @@

     rate = bmode_costs[mode];

 #endif

-    vp9_intra4x4_predict(b, mode, b->predictor);

+    vp9_intra4x4_predict(xd, b, mode, b->predictor);

     vp9_subtract_b(be, b, 16);

     b->bmi.as_mode.first = mode;

-    tx_type = get_tx_type_4x4(xd, b);

+    tx_type = get_tx_type_4x4(xd, be - x->block);

     if (tx_type != DCT_DCT) {

-      vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);

-      vp9_ht_quantize_b_4x4(be, b, tx_type);

+      vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+      vp9_ht_quantize_b_4x4(x, be - x->block, tx_type);

     } else {

-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);

-      x->quantize_b_4x4(be, b);

+      x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+      x->quantize_b_4x4(x, be - x->block);

     tempa = ta;

     templ = tl;

-    ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);

+    ratey = cost_coeffs(cm, x, b - xd->block,

+                        PLANE_TYPE_Y_WITH_DC, &tempa, &templ, TX_4X4);

     rate += ratey;

     distortion = vp9_block_error(be->coeff, b->dqcoeff, 16) >> 2;

@@ -1168,9 +1252,9 @@

   // inverse transform

   if (best_tx_type != DCT_DCT)

-    vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);

+    vp9_short_iht4x4(best_dqcoeff, b->diff, 16, best_tx_type);

   else

-    xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);

+    xd->inv_txm4x4(best_dqcoeff, b->diff, 32);

   vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);

@@ -1179,8 +1263,7 @@

 static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,

                                          int *Rate, int *rate_y,

-                                         int *Distortion, int64_t best_rd,

-                                         int update_contexts) {

+                                         int *Distortion, int64_t best_rd) {

   int i;

   MACROBLOCKD *const xd = &mb->e_mbd;

   int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];

@@ -1191,18 +1274,13 @@

   ENTROPY_CONTEXT *ta, *tl;

   int *bmode_costs;

-  if (update_contexts) {

-    ta = (ENTROPY_CONTEXT *)xd->above_context;

-    tl = (ENTROPY_CONTEXT *)xd->left_context;

-  } else {

-    vpx_memcpy(&t_above, xd->above_context,

-               sizeof(ENTROPY_CONTEXT_PLANES));

-    vpx_memcpy(&t_left, xd->left_context,

-               sizeof(ENTROPY_CONTEXT_PLANES));

+  vpx_memcpy(&t_above, xd->above_context,

+             sizeof(ENTROPY_CONTEXT_PLANES));

+  vpx_memcpy(&t_left, xd->left_context,

+             sizeof(ENTROPY_CONTEXT_PLANES));

-    ta = (ENTROPY_CONTEXT *)&t_above;

-    tl = (ENTROPY_CONTEXT *)&t_left;

-  }

+  ta = (ENTROPY_CONTEXT *)&t_above;

+  tl = (ENTROPY_CONTEXT *)&t_left;

   xd->mode_info_context->mbmi.mode = B_PRED;

   bmode_costs = mb->inter_bmode_costs;

@@ -1220,7 +1298,7 @@

       bmode_costs  = mb->bmode_costs[A][L];

 #if CONFIG_NEWBINTRAMODES

-    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd->block + i);

+    mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i);

 #endif

     total_rd += rd_pick_intra4x4block(

@@ -1401,6 +1479,7 @@

                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,

                                      int *bestrate, int *bestratey,

                                      int *bestdistortion) {

+  VP9_COMMON *const cm = &cpi->common;

   MB_PREDICTION_MODE mode;

   MACROBLOCKD *xd = &x->e_mbd;

   int64_t best_rd = INT64_MAX;

@@ -1407,8 +1486,9 @@

   int distortion = 0, rate = 0;

   BLOCK  *be = x->block + ib;

   BLOCKD *b = xd->block + ib;

-  ENTROPY_CONTEXT ta0, ta1, besta0 = 0, besta1 = 0;

-  ENTROPY_CONTEXT tl0, tl1, bestl0 = 0, bestl1 = 0;

+  ENTROPY_CONTEXT_PLANES ta, tl;

+  ENTROPY_CONTEXT *ta0, *ta1, besta0 = 0, besta1 = 0;

+  ENTROPY_CONTEXT *tl0, *tl1, bestl0 = 0, bestl1 = 0;

/*

    * The predictor buffer is a 2d buffer with a stride of 16.  Create

@@ -1430,58 +1510,76 @@

     rate = mode_costs[mode];

     b->bmi.as_mode.first = mode;

-    vp9_intra8x8_predict(b, mode, b->predictor);

+    vp9_intra8x8_predict(xd, b, mode, b->predictor);

     vp9_subtract_4b_c(be, b, 16);

-    assert(get_2nd_order_usage(xd) == 0);

     if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {

-      TX_TYPE tx_type = get_tx_type_8x8(xd, b);

+      TX_TYPE tx_type = get_tx_type_8x8(xd, ib);

       if (tx_type != DCT_DCT)

-        vp9_fht(be->src_diff, 32, (x->block + idx)->coeff, tx_type, 8);

+        vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type);

       else

-        x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);

-      x->quantize_b_8x8(x->block + idx, xd->block + idx);

+        x->fwd_txm8x8(be->src_diff, (x->block + idx)->coeff, 32);

+      x->quantize_b_8x8(x, idx, tx_type);

       // compute quantization mse of 8x8 block

       distortion = vp9_block_error_c((x->block + idx)->coeff,

                                      (xd->block + idx)->dqcoeff, 64);

-      ta0 = a[vp9_block2above[TX_8X8][idx]];

-      tl0 = l[vp9_block2left[TX_8X8][idx]];

-      rate_t = cost_coeffs(x, xd->block + idx, PLANE_TYPE_Y_WITH_DC,

-                           &ta0, &tl0, TX_8X8);

+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));

+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));

+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_8X8][idx];

+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_8X8][idx];

+      ta1 = ta0 + 1;

+      tl1 = tl0 + 1;

+      rate_t = cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,

+                           ta0, tl0, TX_8X8);

       rate += rate_t;

-      ta1 = ta0;

-      tl1 = tl0;

     } else {

       static const int iblock[4] = {0, 1, 4, 5};

       TX_TYPE tx_type;

       int i;

-      ta0 = a[vp9_block2above[TX_4X4][ib]];

-      ta1 = a[vp9_block2above[TX_4X4][ib + 1]];

-      tl0 = l[vp9_block2left[TX_4X4][ib]];

-      tl1 = l[vp9_block2left[TX_4X4][ib + 4]];

+      vpx_memcpy(&ta, a, sizeof(ENTROPY_CONTEXT_PLANES));

+      vpx_memcpy(&tl, l, sizeof(ENTROPY_CONTEXT_PLANES));

+      ta0 = ((ENTROPY_CONTEXT*)&ta) + vp9_block2above[TX_4X4][ib];

+      tl0 = ((ENTROPY_CONTEXT*)&tl) + vp9_block2left[TX_4X4][ib];

+      ta1 = ta0 + 1;

+      tl1 = tl0 + 1;

       distortion = 0;

       rate_t = 0;

       for (i = 0; i < 4; ++i) {

+        int do_two = 0;

         b = &xd->block[ib + iblock[i]];

         be = &x->block[ib + iblock[i]];

-        tx_type = get_tx_type_4x4(xd, b);

+        tx_type = get_tx_type_4x4(xd, ib + iblock[i]);

         if (tx_type != DCT_DCT) {

-          vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);

-          vp9_ht_quantize_b_4x4(be, b, tx_type);

+          vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type);

+          vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type);

+        } else if (!(i & 1) &&

+                   get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) {

+          x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+          x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1);

+          do_two = 1;

         } else {

-          x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);

-          x->quantize_b_4x4(be, b);

+          x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+          x->quantize_b_4x4(x, ib + iblock[i]);

-        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16);

-        rate_t += cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC,

-                              // i&1 ? &ta1 : &ta0, i&2 ? &tl1 : &tl0,

-                              &ta0, &tl0,

+        distortion += vp9_block_error_c(be->coeff, b->dqcoeff, 16 << do_two);

+        rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,

+                              i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,

                               TX_4X4);

+        if (do_two) {

+          i++;

+          rate_t += cost_coeffs(cm, x, ib + iblock[i], PLANE_TYPE_Y_WITH_DC,

+                                i&1 ? ta1 : ta0, i&2 ? tl1 : tl0,

+                                TX_4X4);

+        }

+      b = &xd->block[ib];

+      be = &x->block[ib];

       rate += rate_t;

@@ -1491,10 +1589,10 @@

       *bestrate = rate;

       *bestratey = rate_t;

       *bestdistortion = distortion;

-      besta0 = ta0;

-      besta1 = ta1;

-      bestl0 = tl0;

-      bestl1 = tl1;

+      besta0 = *ta0;

+      besta1 = *ta1;

+      bestl0 = *tl0;

+      bestl1 = *tl1;

       best_rd = this_rd;

       *best_mode = mode;

       copy_predictor_8x8(best_predictor, b->predictor);

@@ -1563,7 +1661,80 @@

   return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);

-static int rd_cost_mbuv_4x4(MACROBLOCK *mb, int backup) {

+static int64_t rd_pick_intra8x8mby_modes_and_txsz(VP9_COMP *cpi, MACROBLOCK *x,

+                                                  int *rate, int *rate_y,

+                                                  int *distortion,

+                                                  int *mode8x8,

+                                                  int64_t best_yrd,

+                                                  int64_t *txfm_cache) {

+  VP9_COMMON *const cm = &cpi->common;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

+  int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);

+  int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);

+  int64_t tmp_rd_4x4s, tmp_rd_8x8s;

+  int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;

+  int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;

+  mbmi->txfm_size = TX_4X4;

+  tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,

+                                         &d4x4, best_yrd);

+  mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;

+  mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;

+  mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;

+  mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;

+  mbmi->txfm_size = TX_8X8;

+  tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,

+                                         &d8x8, best_yrd);

+  txfm_cache[ONLY_4X4]  = tmp_rd_4x4;

+  txfm_cache[ALLOW_8X8] = tmp_rd_8x8;

+  txfm_cache[ALLOW_16X16] = tmp_rd_8x8;

+  tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);

+  tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);

+  txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ?

+                               tmp_rd_4x4s : tmp_rd_8x8s;

+  if (cm->txfm_mode == TX_MODE_SELECT) {

+    if (tmp_rd_4x4s < tmp_rd_8x8s) {

+      *rate = r4x4 + cost0;

+      *rate_y = tok4x4 + cost0;

+      *distortion = d4x4;

+      mbmi->txfm_size = TX_4X4;

+      tmp_rd = tmp_rd_4x4s;

+    } else {

+      *rate = r8x8 + cost1;

+      *rate_y = tok8x8 + cost1;

+      *distortion = d8x8;

+      mbmi->txfm_size = TX_8X8;

+      tmp_rd = tmp_rd_8x8s;

+      mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;

+      mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;

+      mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;

+      mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;

+    }

+  } else if (cm->txfm_mode == ONLY_4X4) {

+    *rate = r4x4;

+    *rate_y = tok4x4;

+    *distortion = d4x4;

+    mbmi->txfm_size = TX_4X4;

+    tmp_rd = tmp_rd_4x4;

+  } else {

+    *rate = r8x8;

+    *rate_y = tok8x8;

+    *distortion = d8x8;

+    mbmi->txfm_size = TX_8X8;

+    tmp_rd = tmp_rd_8x8;

+    mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;

+    mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;

+    mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;

+    mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;

+  }

+  return tmp_rd;

+}

+static int rd_cost_mbuv_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {

   int b;

   int cost = 0;

   MACROBLOCKD *xd = &mb->e_mbd;

@@ -1582,7 +1753,7 @@

   for (b = 16; b < 24; b++)

-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,

+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,

                         ta + vp9_block2above[TX_4X4][b],

                         tl + vp9_block2left[TX_4X4][b],

                         TX_4X4);

@@ -1597,7 +1768,7 @@

   vp9_transform_mbuv_4x4(x);

   vp9_quantize_mbuv_4x4(x);

-  *rate       = rd_cost_mbuv_4x4(x, do_ctx_backup);

+  *rate       = rd_cost_mbuv_4x4(&cpi->common, x, do_ctx_backup);

   *distortion = vp9_mbuverror(x) / 4;

   *skip       = vp9_mbuv_is_skippable_4x4(&x->e_mbd);

@@ -1604,7 +1775,7 @@

   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);

-static int rd_cost_mbuv_8x8(MACROBLOCK *mb, int backup) {

+static int rd_cost_mbuv_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) {

   int b;

   int cost = 0;

   MACROBLOCKD *xd = &mb->e_mbd;

@@ -1623,7 +1794,7 @@

   for (b = 16; b < 24; b += 4)

-    cost += cost_coeffs(mb, xd->block + b, PLANE_TYPE_UV,

+    cost += cost_coeffs(cm, mb, b, PLANE_TYPE_UV,

                         ta + vp9_block2above[TX_8X8][b],

                         tl + vp9_block2left[TX_8X8][b], TX_8X8);

@@ -1636,7 +1807,7 @@

   vp9_transform_mbuv_8x8(x);

   vp9_quantize_mbuv_8x8(x);

-  *rate       = rd_cost_mbuv_8x8(x, do_ctx_backup);

+  *rate       = rd_cost_mbuv_8x8(&cpi->common, x, do_ctx_backup);

   *distortion = vp9_mbuverror(x) / 4;

   *skip       = vp9_mbuv_is_skippable_8x8(&x->e_mbd);

@@ -1643,16 +1814,16 @@

   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);

-static int rd_cost_sbuv_16x16(MACROBLOCK *x, int backup) {

+static int rd_cost_sbuv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, int backup) {

   int b;

   int cost = 0;

   MACROBLOCKD *const xd = &x->e_mbd;

-  ENTROPY_CONTEXT_PLANES t_above, t_left;

+  ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

   ENTROPY_CONTEXT *ta, *tl;

   if (backup) {

-    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));

-    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));

+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);

+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2);

     ta = (ENTROPY_CONTEXT *) &t_above;

     tl = (ENTROPY_CONTEXT *) &t_left;

@@ -1662,7 +1833,7 @@

   for (b = 16; b < 24; b += 4)

-    cost += cost_coeffs(x, xd->block + b, PLANE_TYPE_UV,

+    cost += cost_coeffs(cm, x, b * 4, PLANE_TYPE_UV,

                         ta + vp9_block2above[TX_8X8][b],

                         tl + vp9_block2left[TX_8X8][b], TX_16X16);

@@ -1669,8 +1840,8 @@

   return cost;

-static void rd_inter32x32_uv_16x16(MACROBLOCK *x, int *rate,

-                                   int *distortion, int *skip,

+static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x,

+                                   int *rate, int *distortion, int *skip,

                                    int backup) {

   MACROBLOCKD *const xd = &x->e_mbd;

@@ -1677,9 +1848,9 @@

   vp9_transform_sbuv_16x16(x);

   vp9_quantize_sbuv_16x16(x);

-  *rate       = rd_cost_sbuv_16x16(x, backup);

-  *distortion = vp9_block_error_c(x->sb_coeff_data.coeff + 1024,

-                                   xd->sb_coeff_data.dqcoeff + 1024, 512) >> 2;

+  *rate       = rd_cost_sbuv_16x16(cm, x, backup);

+  *distortion = vp9_sb_block_error_c(x->coeff + 1024,

+                                     xd->dqcoeff + 1024, 512, 2);

   *skip       = vp9_sbuv_is_skippable_16x16(xd);

@@ -1691,11 +1862,11 @@

   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;

   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;

-  if (mbmi->txfm_size == TX_32X32) {

-    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,

+  if (mbmi->txfm_size >= TX_16X16) {

+    vp9_subtract_sbuv_s_c(x->src_diff,

                           usrc, vsrc, src_uv_stride,

                           udst, vdst, dst_uv_stride);

-    rd_inter32x32_uv_16x16(x, rate, distortion, skip, 1);

+    rd_inter32x32_uv_16x16(&cpi->common, x, rate, distortion, skip, 1);

   } else {

     int n, r = 0, d = 0;

     int skippable = 1;

@@ -1743,22 +1914,14 @@

   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);

-static void super_block_64_uvrd(MACROBLOCK *x, int *rate,

+static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, int *rate,

                                 int *distortion, int *skip);

 static int64_t rd_inter64x64_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,

                                 int *distortion, int fullpixel, int *skip) {

-  super_block_64_uvrd(x, rate, distortion, skip);

+  super_block_64_uvrd(&cpi->common, x, rate, distortion, skip);

   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);

-static int64_t rd_inter4x4_uv(VP9_COMP *cpi, MACROBLOCK *x, int *rate,

-                              int *distortion, int *skip, int fullpixel) {

-  vp9_build_inter4x4_predictors_mbuv(&x->e_mbd);

-  vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,

-                    x->e_mbd.predictor, x->src.uv_stride);

-  return rd_inter16x16_uv_4x4(cpi, x, rate, distortion, fullpixel, skip, 1);

-}

 static void rd_pick_intra_mbuv_mode(VP9_COMP *cpi,

                                     MACROBLOCK *x,

                                     int *rate,

@@ -1773,6 +1936,7 @@

   int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);

   int rate_to, UNINITIALIZED_IS_SAFE(skip);

+  xd->mode_info_context->mbmi.txfm_size = TX_4X4;

   for (mode = DC_PRED; mode <= TM_PRED; mode++) {

     int rate;

     int distortion;

@@ -1786,7 +1950,7 @@

     vp9_transform_mbuv_4x4(x);

     vp9_quantize_mbuv_4x4(x);

-    rate_to = rd_cost_mbuv_4x4(x, 1);

+    rate_to = rd_cost_mbuv_4x4(&cpi->common, x, 1);

     rate = rate_to

            + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];

@@ -1825,6 +1989,7 @@

   int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);

   int rate_to, UNINITIALIZED_IS_SAFE(skip);

+  xd->mode_info_context->mbmi.txfm_size = TX_8X8;

   for (mode = DC_PRED; mode <= TM_PRED; mode++) {

     int rate;

     int distortion;

@@ -1838,7 +2003,7 @@

     vp9_quantize_mbuv_8x8(x);

-    rate_to = rd_cost_mbuv_8x8(x, 1);

+    rate_to = rd_cost_mbuv_8x8(&cpi->common, x, 1);

     rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][mbmi->uv_mode];

     distortion = vp9_mbuverror(x) / 4;

@@ -1860,7 +2025,8 @@

 // TODO(rbultje) very similar to rd_inter32x32_uv(), merge?

-static void super_block_uvrd(MACROBLOCK *x,

+static void super_block_uvrd(VP9_COMMON *const cm,

+                             MACROBLOCK *x,

                              int *rate,

                              int *distortion,

                              int *skippable) {

@@ -1870,11 +2036,11 @@

   const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;

   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;

-  if (mbmi->txfm_size == TX_32X32) {

-    vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,

+  if (mbmi->txfm_size >= TX_16X16) {

+    vp9_subtract_sbuv_s_c(x->src_diff,

                           usrc, vsrc, src_uv_stride,

                           udst, vdst, dst_uv_stride);

-    rd_inter32x32_uv_16x16(x, rate, distortion, skippable, 1);

+    rd_inter32x32_uv_16x16(cm, x, rate, distortion, skippable, 1);

   } else {

     int d = 0, r = 0, n, s = 1;

     ENTROPY_CONTEXT_PLANES t_above[2], t_left[2];

@@ -1908,9 +2074,9 @@

       xd->above_context = t_above + x_idx;

       xd->left_context = t_left + y_idx;

       if (mbmi->txfm_size == TX_4X4) {

-        r += rd_cost_mbuv_4x4(x, 0);

+        r += rd_cost_mbuv_4x4(cm, x, 0);

       } else {

-        r += rd_cost_mbuv_8x8(x, 0);

+        r += rd_cost_mbuv_8x8(cm, x, 0);

@@ -1923,7 +2089,48 @@

-static void super_block_64_uvrd(MACROBLOCK *x,

+static int rd_cost_sb64uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,

+                                int backup) {

+  int b;

+  int cost = 0;

+  MACROBLOCKD *const xd = &x->e_mbd;

+  ENTROPY_CONTEXT_PLANES t_above[4], t_left[4];

+  ENTROPY_CONTEXT *ta, *tl;

+  if (backup) {

+    vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);

+    vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 4);

+    ta = (ENTROPY_CONTEXT *) &t_above;

+    tl = (ENTROPY_CONTEXT *) &t_left;

+  } else {

+    ta = (ENTROPY_CONTEXT *)xd->above_context;

+    tl = (ENTROPY_CONTEXT *)xd->left_context;

+  }

+  for (b = 16; b < 24; b += 4)

+    cost += cost_coeffs(cm, x, b * 16, PLANE_TYPE_UV,

+                        ta + vp9_block2above[TX_8X8][b],

+                        tl + vp9_block2left[TX_8X8][b], TX_32X32);

+  return cost;

+}

+static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x,

+                                   int *rate, int *distortion, int *skip,

+                                   int backup) {

+  MACROBLOCKD *const xd = &x->e_mbd;

+  vp9_transform_sb64uv_32x32(x);

+  vp9_quantize_sb64uv_32x32(x);

+  *rate       = rd_cost_sb64uv_32x32(cm, x, backup);

+  *distortion = vp9_sb_block_error_c(x->coeff + 4096,

+                                     xd->dqcoeff + 4096, 2048, 0);

+  *skip       = vp9_sb64uv_is_skippable_32x32(xd);

+}

+static void super_block_64_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,

                                 int *rate,

                                 int *distortion,

                                 int *skippable) {

@@ -1937,10 +2144,15 @@

   ENTROPY_CONTEXT_PLANES *tl_orig = xd->left_context;

   int d = 0, r = 0, n, s = 1;

+  // FIXME not needed if tx=32x32

   memcpy(t_above, xd->above_context, sizeof(t_above));

   memcpy(t_left,  xd->left_context,  sizeof(t_left));

   if (mbmi->txfm_size == TX_32X32) {

+    vp9_subtract_sb64uv_s_c(x->src_diff, usrc, vsrc, src_uv_stride,

+                            udst, vdst, dst_uv_stride);

+    rd_inter64x64_uv_32x32(cm, x, &r, &d, &s, 1);

+  } else if (mbmi->txfm_size == TX_16X16) {

     int n;

     *rate = 0;

@@ -1948,7 +2160,7 @@

       int x_idx = n & 1, y_idx = n >> 1;

       int r_tmp, d_tmp, s_tmp;

-      vp9_subtract_sbuv_s_c(x->sb_coeff_data.src_diff,

+      vp9_subtract_sbuv_s_c(x->src_diff,

                             usrc + x_idx * 16 + y_idx * 16 * src_uv_stride,

                             vsrc + x_idx * 16 + y_idx * 16 * src_uv_stride,

                             src_uv_stride,

@@ -1957,7 +2169,7 @@

                             dst_uv_stride);

       xd->above_context = t_above + x_idx * 2;

       xd->left_context = t_left + y_idx * 2;

-      rd_inter32x32_uv_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);

+      rd_inter32x32_uv_16x16(cm, x, &r_tmp, &d_tmp, &s_tmp, 0);

       r += r_tmp;

       d += d_tmp;

       s = s && s_tmp;

@@ -1987,9 +2199,9 @@

       xd->left_context = t_left + y_idx;

       d += vp9_mbuverror(x) >> 2;

       if (mbmi->txfm_size == TX_4X4) {

-        r += rd_cost_mbuv_4x4(x, 0);

+        r += rd_cost_mbuv_4x4(cm, x, 0);

       } else {

-        r += rd_cost_mbuv_8x8(x, 0);

+        r += rd_cost_mbuv_8x8(cm, x, 0);

@@ -2018,7 +2230,7 @@

     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;

     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);

-    super_block_uvrd(x, &this_rate_tokenonly,

+    super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,

                      &this_distortion, &s);

     this_rate = this_rate_tokenonly +

                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];

@@ -2055,7 +2267,7 @@

     x->e_mbd.mode_info_context->mbmi.uv_mode = mode;

     vp9_build_intra_predictors_sb64uv_s(&x->e_mbd);

-    super_block_64_uvrd(x, &this_rate_tokenonly,

+    super_block_64_uvrd(&cpi->common, x, &this_rate_tokenonly,

                         &this_distortion, &s);

     this_rate = this_rate_tokenonly +

     x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];

@@ -2082,12 +2294,8 @@

   MACROBLOCKD *xd = &cpi->mb.e_mbd;

   int segment_id = xd->mode_info_context->mbmi.segment_id;

-  // If the mode coding is done entirely at the segment level

-  // we should not account for it at the per mb level in rd code.

-  // Note that if the segment level coding is expanded from single mode

-  // to multiple mode masks as per reference frame coding we will need

-  // to do something different here.

-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {

+  // Dont account for mode here if segment skip is enabled.

+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {

     VP9_COMMON *pc = &cpi->common;

     vp9_prob p [VP9_MVREFS - 1];

@@ -2156,14 +2364,18 @@

           break;

         case LEFT4X4:

-          this_mv->as_int = col ? d[-1].bmi.as_mv.first.as_int : left_block_mv(mic, i);

+          this_mv->as_int = col ? d[-1].bmi.as_mv[0].as_int :

+                                  left_block_mv(xd, mic, i);

           if (mbmi->second_ref_frame > 0)

-            this_second_mv->as_int = col ? d[-1].bmi.as_mv.second.as_int : left_block_second_mv(mic, i);

+            this_second_mv->as_int = col ? d[-1].bmi.as_mv[1].as_int :

+                                           left_block_second_mv(xd, mic, i);

           break;

         case ABOVE4X4:

-          this_mv->as_int = row ? d[-4].bmi.as_mv.first.as_int : above_block_mv(mic, i, mis);

+          this_mv->as_int = row ? d[-4].bmi.as_mv[0].as_int :

+                                  above_block_mv(mic, i, mis);

           if (mbmi->second_ref_frame > 0)

-            this_second_mv->as_int = row ? d[-4].bmi.as_mv.second.as_int : above_block_second_mv(mic, i, mis);

+            this_second_mv->as_int = row ? d[-4].bmi.as_mv[1].as_int :

+                                           above_block_second_mv(mic, i, mis);

           break;

         case ZERO4X4:

           this_mv->as_int = 0;

@@ -2178,11 +2390,11 @@

         int_mv left_mv, left_second_mv;

         left_second_mv.as_int = 0;

-        left_mv.as_int = col ? d[-1].bmi.as_mv.first.as_int :

-                         left_block_mv(mic, i);

+        left_mv.as_int = col ? d[-1].bmi.as_mv[0].as_int :

+                         left_block_mv(xd, mic, i);

         if (mbmi->second_ref_frame > 0)

-          left_second_mv.as_int = col ? d[-1].bmi.as_mv.second.as_int :

-                                  left_block_second_mv(mic, i);

+          left_second_mv.as_int = col ? d[-1].bmi.as_mv[1].as_int :

+                                  left_block_second_mv(xd, mic, i);

         if (left_mv.as_int == this_mv->as_int &&

             (mbmi->second_ref_frame <= 0 ||

@@ -2198,9 +2410,9 @@

 #endif

-    d->bmi.as_mv.first.as_int = this_mv->as_int;

+    d->bmi.as_mv[0].as_int = this_mv->as_int;

     if (mbmi->second_ref_frame > 0)

-      d->bmi.as_mv.second.as_int = this_second_mv->as_int;

+      d->bmi.as_mv[1].as_int = this_second_mv->as_int;

     x->partition_info->bmi[i].mode = m;

     x->partition_info->bmi[i].mv.as_int = this_mv->as_int;

@@ -2212,7 +2424,8 @@

   return cost;

-static int64_t encode_inter_mb_segment(MACROBLOCK *x,

+static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,

+                                       MACROBLOCK *x,

                                        int const *labels,

                                        int which_label,

                                        int *labelyrate,

@@ -2230,15 +2443,30 @@

       BLOCK *be = &x->block[i];

       int thisdistortion;

-      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4);

-      if (xd->mode_info_context->mbmi.second_ref_frame > 0)

-        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4);

+      vp9_build_inter_predictor(*(bd->base_pre) + bd->pre,

+                                bd->pre_stride,

+                                bd->predictor, 16,

+                                &bd->bmi.as_mv[0],

+                                &xd->scale_factor[0],

+                                4, 4, 0 /* no avg */, &xd->subpix);

+      // TODO(debargha): Make this work properly with the

+      // implicit-compoundinter-weight experiment when implicit

+      // weighting for splitmv modes is turned on.

+      if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

+        vp9_build_inter_predictor(

+            *(bd->base_second_pre) + bd->pre, bd->pre_stride, bd->predictor, 16,

+            &bd->bmi.as_mv[1], &xd->scale_factor[1], 4, 4,

+            1 << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT) /* avg */,

+            &xd->subpix);

+      }

       vp9_subtract_b(be, bd, 16);

-      x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);

-      x->quantize_b_4x4(be, bd);

+      x->fwd_txm4x4(be->src_diff, be->coeff, 32);

+      x->quantize_b_4x4(x, i);

       thisdistortion = vp9_block_error(be->coeff, bd->dqcoeff, 16);

       *distortion += thisdistortion;

-      *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,

+      *labelyrate += cost_coeffs(cm, x, i, PLANE_TYPE_Y_WITH_DC,

                                  ta + vp9_block2above[TX_4X4][i],

                                  tl + vp9_block2left[TX_4X4][i], TX_4X4);

@@ -2247,7 +2475,8 @@

   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);

-static int64_t encode_inter_mb_segment_8x8(MACROBLOCK *x,

+static int64_t encode_inter_mb_segment_8x8(VP9_COMMON *const cm,

+                                           MACROBLOCK *x,

                                            int const *labels,

                                            int which_label,

                                            int *labelyrate,

@@ -2274,42 +2503,60 @@

     int ib = vp9_i8x8_block[i];

     if (labels[ib] == which_label) {

+      const int use_second_ref =

+          xd->mode_info_context->mbmi.second_ref_frame > 0;

+      int which_mv;

       int idx = (ib & 8) + ((ib & 2) << 1);

       BLOCKD *bd = &xd->block[ib], *bd2 = &xd->block[idx];

       BLOCK *be = &x->block[ib], *be2 = &x->block[idx];

       int thisdistortion;

-      vp9_build_inter_predictors4b(xd, bd, 16);

-      if (xd->mode_info_context->mbmi.second_ref_frame > 0)

-        vp9_build_2nd_inter_predictors4b(xd, bd, 16);

+      for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {

+        uint8_t **base_pre = which_mv ? bd->base_second_pre : bd->base_pre;

+        // TODO(debargha): Make this work properly with the

+        // implicit-compoundinter-weight experiment when implicit

+        // weighting for splitmv modes is turned on.

+        vp9_build_inter_predictor(

+            *base_pre + bd->pre, bd->pre_stride, bd->predictor, 16,

+            &bd->bmi.as_mv[which_mv], &xd->scale_factor[which_mv], 8, 8,

+            which_mv << (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),

+            &xd->subpix);

+      }

       vp9_subtract_4b_c(be, bd, 16);

       if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {

         if (otherrd) {

-          x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);

-          x->quantize_b_8x8(be2, bd2);

+          x->fwd_txm8x8(be->src_diff, be2->coeff, 32);

+          x->quantize_b_8x8(x, idx, DCT_DCT);

           thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);

           otherdist += thisdistortion;

-          othercost += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,

-                                     tacp + vp9_block2above[TX_8X8][idx],

-                                     tlcp + vp9_block2left[TX_8X8][idx],

-                                     TX_8X8);

+          xd->mode_info_context->mbmi.txfm_size = TX_8X8;

+          othercost += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,

+                                   tacp + vp9_block2above[TX_8X8][idx],

+                                   tlcp + vp9_block2left[TX_8X8][idx],

+                                   TX_8X8);

+          xd->mode_info_context->mbmi.txfm_size = TX_4X4;

         for (j = 0; j < 4; j += 2) {

           bd = &xd->block[ib + iblock[j]];

           be = &x->block[ib + iblock[j]];

-          x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);

-          x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);

+          x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+          x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j] + 1);

           thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);

           *distortion += thisdistortion;

-          *labelyrate += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,

-                           ta + vp9_block2above[TX_4X4][ib + iblock[j]],

-                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],

-                           TX_4X4);

-          *labelyrate += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,

-                           ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],

-                           tl + vp9_block2left[TX_4X4][ib + iblock[j]],

-                           TX_4X4);

+          *labelyrate +=

+              cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,

+                          ta + vp9_block2above[TX_4X4][ib + iblock[j]],

+                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],

+                          TX_4X4);

+          *labelyrate +=

+              cost_coeffs(cm, x, ib + iblock[j] + 1,

+                          PLANE_TYPE_Y_WITH_DC,

+                          ta + vp9_block2above[TX_4X4][ib + iblock[j] + 1],

+                          tl + vp9_block2left[TX_4X4][ib + iblock[j]],

+                          TX_4X4);

       } else /* 8x8 */ {

         if (otherrd) {

@@ -2316,25 +2563,30 @@

           for (j = 0; j < 4; j += 2) {

             BLOCKD *bd = &xd->block[ib + iblock[j]];

             BLOCK *be = &x->block[ib + iblock[j]];

-            x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);

-            x->quantize_b_4x4_pair(be, be + 1, bd, bd + 1);

+            x->fwd_txm8x4(be->src_diff, be->coeff, 32);

+            x->quantize_b_4x4_pair(x, ib + iblock[j], ib + iblock[j]);

             thisdistortion = vp9_block_error_c(be->coeff, bd->dqcoeff, 32);

             otherdist += thisdistortion;

-            othercost += cost_coeffs(x, bd, PLANE_TYPE_Y_WITH_DC,

-                           tacp + vp9_block2above[TX_4X4][ib + iblock[j]],

-                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],

-                           TX_4X4);

-            othercost += cost_coeffs(x, bd + 1, PLANE_TYPE_Y_WITH_DC,

-                           tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],

-                           tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],

-                           TX_4X4);

+            xd->mode_info_context->mbmi.txfm_size = TX_4X4;

+            othercost +=

+                cost_coeffs(cm, x, ib + iblock[j], PLANE_TYPE_Y_WITH_DC,

+                            tacp + vp9_block2above[TX_4X4][ib + iblock[j]],

+                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],

+                            TX_4X4);

+            othercost +=

+                cost_coeffs(cm, x, ib + iblock[j] + 1,

+                            PLANE_TYPE_Y_WITH_DC,

+                            tacp + vp9_block2above[TX_4X4][ib + iblock[j] + 1],

+                            tlcp + vp9_block2left[TX_4X4][ib + iblock[j]],

+                            TX_4X4);

+            xd->mode_info_context->mbmi.txfm_size = TX_8X8;

-        x->vp9_short_fdct8x8(be->src_diff, be2->coeff, 32);

-        x->quantize_b_8x8(be2, bd2);

+        x->fwd_txm8x8(be->src_diff, be2->coeff, 32);

+        x->quantize_b_8x8(x, idx, DCT_DCT);

         thisdistortion = vp9_block_error_c(be2->coeff, bd2->dqcoeff, 64);

         *distortion += thisdistortion;

-        *labelyrate += cost_coeffs(x, bd2, PLANE_TYPE_Y_WITH_DC,

+        *labelyrate += cost_coeffs(cm, x, idx, PLANE_TYPE_Y_WITH_DC,

                                    ta + vp9_block2above[TX_8X8][idx],

                                    tl + vp9_block2left[TX_8X8][idx], TX_8X8);

@@ -2373,8 +2625,7 @@

 } BEST_SEG_INFO;

-static __inline

-int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {

+static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {

   int r = 0;

   r |= (mv->as_mv.row >> 3) < x->mv_row_min;

   r |= (mv->as_mv.row >> 3) > x->mv_row_max;

@@ -2487,9 +2738,9 @@

           // use previous block's result as next block's MV predictor.

           if (segmentation == PARTITIONING_4X4 && i > 0) {

-            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv.first.as_int;

+            bsi->mvp.as_int = x->e_mbd.block[i - 1].bmi.as_mv[0].as_int;

             if (i == 4 || i == 8 || i == 12)

-              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv.first.as_int;

+              bsi->mvp.as_int = x->e_mbd.block[i - 4].bmi.as_mv[0].as_int;

             step_param = 2;

@@ -2528,11 +2779,11 @@

             if (thissme < bestsme) {

               bestsme = thissme;

-              mode_mv[NEW4X4].as_int = e->bmi.as_mv.first.as_int;

+              mode_mv[NEW4X4].as_int = e->bmi.as_mv[0].as_int;

             } else {

               /* The full search result is actually worse so re-instate the

                * previous best vector */

-              e->bmi.as_mv.first.as_int = mode_mv[NEW4X4].as_int;

+              e->bmi.as_mv[0].as_int = mode_mv[NEW4X4].as_int;

@@ -2575,11 +2826,13 @@

         continue;

       if (segmentation == PARTITIONING_4X4) {

-        this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,

+        this_rd = encode_inter_mb_segment(&cpi->common,

+                                          x, labels, i, &labelyrate,

                                           &distortion, ta_s, tl_s);

         other_rd = this_rd;

       } else {

-        this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,

+        this_rd = encode_inter_mb_segment_8x8(&cpi->common,

+                                              x, labels, i, &labelyrate,

                                               &distortion, &other_rd,

                                               ta_s, tl_s);

@@ -2595,13 +2848,13 @@

         if (x->e_mbd.mode_info_context->mbmi.txfm_size == TX_4X4) {

           for (j = 0; j < 16; j++)

             if (labels[j] == i)

-              best_eobs[j] = x->e_mbd.block[j].eob;

+              best_eobs[j] = x->e_mbd.eobs[j];

         } else {

           for (j = 0; j < 4; j++) {

             int ib = vp9_i8x8_block[j], idx = j * 4;

             if (labels[ib] == i)

-              best_eobs[idx] = x->e_mbd.block[idx].eob;

+              best_eobs[idx] = x->e_mbd.eobs[idx];

         if (other_rd < best_other_rd)

@@ -2734,8 +2987,9 @@

       if (base_rd < txfm_cache[ONLY_4X4]) {

         txfm_cache[ONLY_4X4] = base_rd;

-      if (base_rd + diff < txfm_cache[1]) {

-        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] = base_rd + diff;

+      if (base_rd + diff < txfm_cache[ALLOW_8X8]) {

+        txfm_cache[ALLOW_8X8] = txfm_cache[ALLOW_16X16] =

+            txfm_cache[ALLOW_32X32] = base_rd + diff;

       if (diff < 0) {

         base_rd += diff + RDCOST(x->rdmult, x->rddiv, cost8x8, 0);

@@ -2749,7 +3003,7 @@

-static __inline void cal_step_param(int sr, int *sp) {

+static INLINE void cal_step_param(int sr, int *sp) {

   int step = 0;

   if (sr > MAX_FIRST_STEP) sr = MAX_FIRST_STEP;

@@ -2872,10 +3126,10 @@

   for (i = 0; i < 16; i++) {

     BLOCKD *bd = &x->e_mbd.block[i];

-    bd->bmi.as_mv.first.as_int = bsi.mvs[i].as_int;

+    bd->bmi.as_mv[0].as_int = bsi.mvs[i].as_int;

     if (mbmi->second_ref_frame > 0)

-      bd->bmi.as_mv.second.as_int = bsi.second_mvs[i].as_int;

-    bd->eob = bsi.eobs[i];

+      bd->bmi.as_mv[1].as_int = bsi.second_mvs[i].as_int;

+    x->e_mbd.eobs[i] = bsi.eobs[i];

   *returntotrate = bsi.r;

@@ -2882,8 +3136,8 @@

   *returndistortion = bsi.d;

   *returnyrate = bsi.segment_yrate;

   *skippable = bsi.txfm_size == TX_4X4 ?

-                    vp9_mby_is_skippable_4x4(&x->e_mbd, 0) :

-                    vp9_mby_is_skippable_8x8(&x->e_mbd, 0);

+                    vp9_mby_is_skippable_4x4(&x->e_mbd) :

+                    vp9_mby_is_skippable_8x8(&x->e_mbd);

   /* save partitions */

   mbmi->txfm_size = bsi.txfm_size;

@@ -3016,7 +3270,8 @@

-static __inline unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1, int idx, int val, int weight) {

+static INLINE unsigned weighted_cost(vp9_prob *tab0, vp9_prob *tab1,

+                                     int idx, int val, int weight) {

   unsigned cost0 = tab0[idx] ? vp9_cost_bit(tab0[idx], val) : 0;

   unsigned cost1 = tab1[idx] ? vp9_cost_bit(tab1[idx], val) : 0;

   // weight is 16-bit fixed point, so this basically calculates:

@@ -3145,7 +3400,9 @@

   // UV cost and distortion

   vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,

                     x->e_mbd.predictor, x->src.uv_stride);

-  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4)

+  if (x->e_mbd.mode_info_context->mbmi.txfm_size != TX_4X4 &&

+      x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED &&

+      x->e_mbd.mode_info_context->mbmi.mode != SPLITMV)

     rd_inter16x16_uv_8x8(cpi, x, rate_uv, distortion_uv,

                          cpi->common.full_pixel, &uv_skippable, 1);

   else

@@ -3160,41 +3417,104 @@

 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,

                                int idx, MV_REFERENCE_FRAME frame_type,

                                int block_size,

-                               int recon_yoffset, int recon_uvoffset,

+                               int mb_row, int mb_col,

                                int_mv frame_nearest_mv[MAX_REF_FRAMES],

                                int_mv frame_near_mv[MAX_REF_FRAMES],

                                int frame_mdcounts[4][4],

-                               uint8_t *y_buffer[4],

-                               uint8_t *u_buffer[4],

-                               uint8_t *v_buffer[4]) {

-  YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];

+                               YV12_BUFFER_CONFIG yv12_mb[4],

+                               struct scale_factors scale[MAX_REF_FRAMES]) {

+  VP9_COMMON *cm = &cpi->common;

+  YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];

   MACROBLOCKD *const xd = &x->e_mbd;

   MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;

+  int use_prev_in_find_mv_refs, use_prev_in_find_best_ref;

-  y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;

-  u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;

-  v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;

+  // set up scaling factors

+  scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];

+  scale[frame_type].x_offset_q4 =

+      (mb_col * 16 * scale[frame_type].x_num / scale[frame_type].x_den) & 0xf;

+  scale[frame_type].y_offset_q4 =

+      (mb_row * 16 * scale[frame_type].y_num / scale[frame_type].y_den) & 0xf;

+  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this

+  // use the UV scaling factors.

+  setup_pred_block(&yv12_mb[frame_type], yv12, mb_row, mb_col,

+                   &scale[frame_type], &scale[frame_type]);

   // Gets an initial list of candidate vectors from neighbours and orders them

-  vp9_find_mv_refs(xd, xd->mode_info_context,

-                   xd->prev_mode_info_context,

+  use_prev_in_find_mv_refs = cm->width == cm->last_width &&

+                             cm->height == cm->last_height &&

+                             !cpi->common.error_resilient_mode;

+  vp9_find_mv_refs(&cpi->common, xd, xd->mode_info_context,

+                   use_prev_in_find_mv_refs ? xd->prev_mode_info_context : NULL,

                    frame_type,

                    mbmi->ref_mvs[frame_type],

                    cpi->common.ref_frame_sign_bias);

   // Candidate refinement carried out at encoder and decoder

-  vp9_find_best_ref_mvs(xd, y_buffer[frame_type],

+  use_prev_in_find_best_ref =

+      scale[frame_type].x_num == scale[frame_type].x_den &&

+      scale[frame_type].y_num == scale[frame_type].y_den &&

+      !cm->error_resilient_mode &&

+      !cm->frame_parallel_decoding_mode;

+  vp9_find_best_ref_mvs(xd,

+                        use_prev_in_find_best_ref ?

+                            yv12_mb[frame_type].y_buffer : NULL,

                         yv12->y_stride,

                         mbmi->ref_mvs[frame_type],

                         &frame_nearest_mv[frame_type],

                         &frame_near_mv[frame_type]);

   // Further refinement that is encode side only to test the top few candidates

   // in full and choose the best as the centre point for subsequent searches.

-  mv_pred(cpi, x, y_buffer[frame_type], yv12->y_stride,

-          frame_type, block_size);

+  // The current implementation doesn't support scaling.

+  if (scale[frame_type].x_num == scale[frame_type].x_den &&

+      scale[frame_type].y_num == scale[frame_type].y_den)

+    mv_pred(cpi, x, yv12_mb[frame_type].y_buffer, yv12->y_stride,

+            frame_type, block_size);

+}

+static void model_rd_from_var_lapndz(int var, int n, int qstep,

+                                     int *rate, int *dist) {

+  // This function models the rate and distortion for a Laplacian

+  // source with given variance when quantized with a uniform quantizer

+  // with given stepsize. The closed form expressions are in:

+  // Hang and Chen, "Source Model for transform video coder and its

+  // application - Part I: Fundamental Theory", IEEE Trans. Circ.

+  // Sys. for Video Tech., April 1997.

+  // The function is implemented as piecewise approximation to the

+  // exact computation.

+  // TODO(debargha): Implement the functions by interpolating from a

+  // look-up table

+  vp9_clear_system_state();

+  {

+    double D, R;

+    double s2 = (double) var / n;

+    double s = sqrt(s2);

+    double x = qstep / s;

+    if (x > 1.0) {

+      double y = exp(-x / 2);

+      double y2 = y * y;

+      D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275;

+      R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017;

+    } else {

+      double x2 = x * x;

+      D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807;

+      if (x > 0.125)

+        R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x +

+                 0.1626989668625);

+      else

+        R = -1.442252874826093 * log(x) + 1.944647760719664;

+    }

+    if (R < 0) {

+      *rate = 0;

+      *dist = var;

+    } else {

+      *rate = (n * R * 256 + 0.5);

+      *dist = (n * D * s2 + 0.5);

+    }

+  }

+  vp9_clear_system_state();

 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,

@@ -3209,9 +3529,12 @@

                                  int *rate_y, int *distortion_y,

                                  int *rate_uv, int *distortion_uv,

                                  int *mode_excluded, int *disable_skip,

-                                 int recon_yoffset, int mode_index,

+                                 int mode_index,

+                                 INTERPOLATIONFILTERTYPE *best_filter,

                                  int_mv frame_mv[MB_MODE_COUNT]

-                                                [MAX_REF_FRAMES]) {

+                                                [MAX_REF_FRAMES],

+                                 YV12_BUFFER_CONFIG *scaled_ref_frame,

+                                 int mb_row, int mb_col) {

   VP9_COMMON *cm = &cpi->common;

   MACROBLOCKD *xd = &x->e_mbd;

   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

@@ -3229,6 +3552,13 @@

   int_mv cur_mv[2];

   int_mv ref_mv[2];

   int64_t this_rd = 0;

+  unsigned char tmp_ybuf[64 * 64];

+  unsigned char tmp_ubuf[32 * 32];

+  unsigned char tmp_vbuf[32 * 32];

+  int pred_exists = 0;

+  int interpolating_intpel_seen = 0;

+  int intpel_mv;

+  int64_t rd, best_rd = INT64_MAX;

   switch (this_mode) {

     case NEWMV:

@@ -3248,6 +3578,7 @@

                                   x->nmvjointcost, x->mvcost, 96,

                                   x->e_mbd.allow_high_precision_mv);

       } else {

+        YV12_BUFFER_CONFIG backup_yv12 = xd->pre;

         int bestsme = INT_MAX;

         int further_steps, step_param = cpi->sf.first_step;

         int sadpb = x->sadperbit16;

@@ -3259,8 +3590,20 @@

         int tmp_row_min = x->mv_row_min;

         int tmp_row_max = x->mv_row_max;

+        if (scaled_ref_frame) {

+          // Swap out the reference frame for a version that's been scaled to

+          // match the resolution of the current frame, allowing the existing

+          // motion search code to be used without additional modifications.

+          xd->pre = *scaled_ref_frame;

+          xd->pre.y_buffer += mb_row * 16 * xd->pre.y_stride + mb_col * 16;

+          xd->pre.u_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;

+          xd->pre.v_buffer += mb_row * 8 * xd->pre.uv_stride + mb_col * 8;

+        }

         vp9_clamp_mv_min_max(x, &ref_mv[0]);

+        sr = vp9_init_search_range(cpi->common.width, cpi->common.height);

         // mvp_full.as_int = ref_mv[0].as_int;

         mvp_full.as_int =

          mbmi->ref_mvs[refs[0]][x->mv_best_ref_index[refs[0]]].as_int;

@@ -3267,9 +3610,6 @@

         mvp_full.as_mv.col >>= 3;

         mvp_full.as_mv.row >>= 3;

-        if (mvp_full.as_int != mvp_full.as_int) {

-          mvp_full.as_int = mvp_full.as_int;

-        }

         // adjust search range according to sr from mv prediction

         step_param = MAX(step_param, sr);

@@ -3297,22 +3637,22 @@

                                        x->nmvjointcost, x->mvcost,

                                        &dis, &sse);

-        d->bmi.as_mv.first.as_int = tmp_mv.as_int;

-        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;

+        d->bmi.as_mv[0].as_int = tmp_mv.as_int;

+        frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv[0].as_int;

         // Add the new motion vector cost to our rolling cost variable

         *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],

                                   x->nmvjointcost, x->mvcost,

                                   96, xd->allow_high_precision_mv);

+        // restore the predictor, if required

+        if (scaled_ref_frame) {

+          xd->pre = backup_yv12;

+        }

       break;

-    case NEARESTMV:

     case NEARMV:

-      // Do not bother proceeding if the vector (from newmv, nearest or

-      // near) is 0,0 as this should then be coded using the zeromv mode.

-      for (i = 0; i < num_refs; ++i)

-        if (frame_mv[this_mode][refs[i]].as_int == 0)

-          return INT64_MAX;

+    case NEARESTMV:

     case ZEROMV:

     default:

       break;

@@ -3326,11 +3666,6 @@

     mbmi->mv[i].as_int = cur_mv[i].as_int;

-  if (cpi->common.mcomp_filter_type == SWITCHABLE) {

-    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);

-    const int m = vp9_switchable_interp_map[mbmi->interp_filter];

-    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];

-  }

   /* We don't include the cost of the second reference here, because there

    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other

@@ -3355,36 +3690,332 @@

 #endif

+  pred_exists = 0;

+  interpolating_intpel_seen = 0;

+  // Are all MVs integer pel for Y and UV

+  intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&

+              (mbmi->mv[0].as_mv.col & 15) == 0;

+  if (is_comp_pred)

+    intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&

+                 (mbmi->mv[1].as_mv.col & 15) == 0;

+  // Search for best switchable filter by checking the variance of

+  // pred error irrespective of whether the filter will be used

   if (block_size == BLOCK_64X64) {

-    vp9_build_inter64x64_predictors_sb(xd,

-                                       xd->dst.y_buffer,

-                                       xd->dst.u_buffer,

-                                       xd->dst.v_buffer,

-                                       xd->dst.y_stride,

-                                       xd->dst.uv_stride);

+    int switchable_filter_index, newbest;

+    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;

+    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;

+    for (switchable_filter_index = 0;

+         switchable_filter_index < VP9_SWITCHABLE_FILTERS;

+         ++switchable_filter_index) {

+      int rs = 0;

+      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];

+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {

+        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);

+        const int m = vp9_switchable_interp_map[mbmi->interp_filter];

+        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];

+      }

+      if (interpolating_intpel_seen && intpel_mv &&

+          vp9_is_interpolating_filter[mbmi->interp_filter]) {

+        rd = RDCOST(x->rdmult, x->rddiv,

+                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,

+                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);

+      } else {

+        unsigned int sse, var;

+        int tmp_rate_y, tmp_rate_u, tmp_rate_v;

+        int tmp_dist_y, tmp_dist_u, tmp_dist_v;

+        vp9_build_inter64x64_predictors_sb(xd,

+                                           xd->dst.y_buffer,

+                                           xd->dst.u_buffer,

+                                           xd->dst.v_buffer,

+                                           xd->dst.y_stride,

+                                           xd->dst.uv_stride,

+                                           mb_row, mb_col);

+        var = vp9_variance64x64(*(b->base_src), b->src_stride,

+                                xd->dst.y_buffer, xd->dst.y_stride, &sse);

+        // Note our transform coeffs are 8 times an orthogonal transform.

+        // Hence quantizer step is also 8 times. To get effective quantizer

+        // we need to divide by 8 before sending to modeling function.

+        model_rd_from_var_lapndz(var, 64 * 64, xd->block[0].dequant[1] >> 3,

+                                 &tmp_rate_y, &tmp_dist_y);

+        var = vp9_variance32x32(x->src.u_buffer, x->src.uv_stride,

+                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);

+        model_rd_from_var_lapndz(var, 32 * 32, xd->block[16].dequant[1] >> 3,

+                                 &tmp_rate_u, &tmp_dist_u);

+        var = vp9_variance32x32(x->src.v_buffer, x->src.uv_stride,

+                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);

+        model_rd_from_var_lapndz(var, 32 * 32, xd->block[20].dequant[1] >> 3,

+                                 &tmp_rate_v, &tmp_dist_v);

+        rd = RDCOST(x->rdmult, x->rddiv,

+                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,

+                    tmp_dist_y + tmp_dist_u + tmp_dist_v);

+        if (!interpolating_intpel_seen && intpel_mv &&

+            vp9_is_interpolating_filter[mbmi->interp_filter]) {

+          tmp_rate_y_i = tmp_rate_y;

+          tmp_rate_u_i = tmp_rate_u;

+          tmp_rate_v_i = tmp_rate_v;

+          tmp_dist_y_i = tmp_dist_y;

+          tmp_dist_u_i = tmp_dist_u;

+          tmp_dist_v_i = tmp_dist_v;

+        }

+      }

+      newbest = (switchable_filter_index == 0 || rd < best_rd);

+      if (newbest) {

+        best_rd = rd;

+        *best_filter = mbmi->interp_filter;

+      }

+      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||

+          (cm->mcomp_filter_type != SWITCHABLE &&

+           cm->mcomp_filter_type == mbmi->interp_filter)) {

+        int i;

+        for (i = 0; i < 64; ++i)

+          vpx_memcpy(tmp_ybuf + i * 64,

+                     xd->dst.y_buffer + i * xd->dst.y_stride,

+                     sizeof(unsigned char) * 64);

+        for (i = 0; i < 32; ++i)

+          vpx_memcpy(tmp_ubuf + i * 32,

+                     xd->dst.u_buffer + i * xd->dst.uv_stride,

+                     sizeof(unsigned char) * 32);

+        for (i = 0; i < 32; ++i)

+          vpx_memcpy(tmp_vbuf + i * 32,

+                     xd->dst.v_buffer + i * xd->dst.uv_stride,

+                     sizeof(unsigned char) * 32);

+        pred_exists = 1;

+      }

+      interpolating_intpel_seen |=

+        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];

+    }

   } else if (block_size == BLOCK_32X32) {

-    vp9_build_inter32x32_predictors_sb(xd,

-                                       xd->dst.y_buffer,

-                                       xd->dst.u_buffer,

-                                       xd->dst.v_buffer,

-                                       xd->dst.y_stride,

-                                       xd->dst.uv_stride);

+    int switchable_filter_index, newbest;

+    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;

+    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;

+    for (switchable_filter_index = 0;

+       switchable_filter_index < VP9_SWITCHABLE_FILTERS;

+       ++switchable_filter_index) {

+      int rs = 0;

+      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];

+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {

+        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);

+        const int m = vp9_switchable_interp_map[mbmi->interp_filter];

+        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];

+      }

+      if (interpolating_intpel_seen && intpel_mv &&

+          vp9_is_interpolating_filter[mbmi->interp_filter]) {

+        rd = RDCOST(x->rdmult, x->rddiv,

+                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,

+                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);

+      } else {

+        unsigned int sse, var;

+        int tmp_rate_y, tmp_rate_u, tmp_rate_v;

+        int tmp_dist_y, tmp_dist_u, tmp_dist_v;

+        vp9_build_inter32x32_predictors_sb(xd,

+                                           xd->dst.y_buffer,

+                                           xd->dst.u_buffer,

+                                           xd->dst.v_buffer,

+                                           xd->dst.y_stride,

+                                           xd->dst.uv_stride,

+                                           mb_row, mb_col);

+        var = vp9_variance32x32(*(b->base_src), b->src_stride,

+                                xd->dst.y_buffer, xd->dst.y_stride, &sse);

+        // Note our transform coeffs are 8 times an orthogonal transform.

+        // Hence quantizer step is also 8 times. To get effective quantizer

+        // we need to divide by 8 before sending to modeling function.

+        model_rd_from_var_lapndz(var, 32 * 32, xd->block[0].dequant[1] >> 3,

+                                 &tmp_rate_y, &tmp_dist_y);

+        var = vp9_variance16x16(x->src.u_buffer, x->src.uv_stride,

+                                xd->dst.u_buffer, xd->dst.uv_stride, &sse);

+        model_rd_from_var_lapndz(var, 16 * 16, xd->block[16].dequant[1] >> 3,

+                                 &tmp_rate_u, &tmp_dist_u);

+        var = vp9_variance16x16(x->src.v_buffer, x->src.uv_stride,

+                                xd->dst.v_buffer, xd->dst.uv_stride, &sse);

+        model_rd_from_var_lapndz(var, 16 * 16, xd->block[20].dequant[1] >> 3,

+                                 &tmp_rate_v, &tmp_dist_v);

+        rd = RDCOST(x->rdmult, x->rddiv,

+                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,

+                    tmp_dist_y + tmp_dist_u + tmp_dist_v);

+        if (!interpolating_intpel_seen && intpel_mv &&

+            vp9_is_interpolating_filter[mbmi->interp_filter]) {

+          tmp_rate_y_i = tmp_rate_y;

+          tmp_rate_u_i = tmp_rate_u;

+          tmp_rate_v_i = tmp_rate_v;

+          tmp_dist_y_i = tmp_dist_y;

+          tmp_dist_u_i = tmp_dist_u;

+          tmp_dist_v_i = tmp_dist_v;

+        }

+      }

+      newbest = (switchable_filter_index == 0 || rd < best_rd);

+      if (newbest) {

+        best_rd = rd;

+        *best_filter = mbmi->interp_filter;

+      }

+      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||

+          (cm->mcomp_filter_type != SWITCHABLE &&

+           cm->mcomp_filter_type == mbmi->interp_filter)) {

+        int i;

+        for (i = 0; i < 32; ++i)

+          vpx_memcpy(tmp_ybuf + i * 64,

+                     xd->dst.y_buffer + i * xd->dst.y_stride,

+                     sizeof(unsigned char) * 32);

+        for (i = 0; i < 16; ++i)

+          vpx_memcpy(tmp_ubuf + i * 32,

+                     xd->dst.u_buffer + i * xd->dst.uv_stride,

+                     sizeof(unsigned char) * 16);

+        for (i = 0; i < 16; ++i)

+          vpx_memcpy(tmp_vbuf + i * 32,

+                     xd->dst.v_buffer + i * xd->dst.uv_stride,

+                     sizeof(unsigned char) * 16);

+        pred_exists = 1;

+      }

+      interpolating_intpel_seen |=

+        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];

+    }

   } else {

+    int switchable_filter_index, newbest;

+    int tmp_rate_y_i = 0, tmp_rate_u_i = 0, tmp_rate_v_i = 0;

+    int tmp_dist_y_i = 0, tmp_dist_u_i = 0, tmp_dist_v_i = 0;

     assert(block_size == BLOCK_16X16);

-    vp9_build_1st_inter16x16_predictors_mby(xd, xd->predictor, 16, 0);

-    if (is_comp_pred)

-      vp9_build_2nd_inter16x16_predictors_mby(xd, xd->predictor, 16);

-#if CONFIG_COMP_INTERINTRA_PRED

-    if (is_comp_interintra_pred) {

-      vp9_build_interintra_16x16_predictors_mby(xd, xd->predictor, 16);

+    for (switchable_filter_index = 0;

+       switchable_filter_index < VP9_SWITCHABLE_FILTERS;

+       ++switchable_filter_index) {

+      int rs = 0;

+      mbmi->interp_filter = vp9_switchable_interp[switchable_filter_index];

+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

+      if (cpi->common.mcomp_filter_type == SWITCHABLE) {

+        const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);

+        const int m = vp9_switchable_interp_map[mbmi->interp_filter];

+        rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];

+      }

+      if (interpolating_intpel_seen && intpel_mv &&

+          vp9_is_interpolating_filter[mbmi->interp_filter]) {

+        rd = RDCOST(x->rdmult, x->rddiv,

+                    rs + tmp_rate_y_i + tmp_rate_u_i + tmp_rate_v_i,

+                    tmp_dist_y_i + tmp_dist_u_i + tmp_dist_v_i);

+      } else {

+        unsigned int sse, var;

+        int tmp_rate_y, tmp_rate_u, tmp_rate_v;

+        int tmp_dist_y, tmp_dist_u, tmp_dist_v;

+        vp9_build_inter16x16_predictors_mb(xd, xd->predictor,

+                                           xd->predictor + 256,

+                                           xd->predictor + 320,

+                                           16, 8, mb_row, mb_col);

+        var = vp9_variance16x16(*(b->base_src), b->src_stride,

+                                xd->predictor, 16, &sse);

+        // Note our transform coeffs are 8 times an orthogonal transform.

+        // Hence quantizer step is also 8 times. To get effective quantizer

+        // we need to divide by 8 before sending to modeling function.

+        model_rd_from_var_lapndz(var, 16 * 16, xd->block[0].dequant[1] >> 3,

+                                 &tmp_rate_y, &tmp_dist_y);

+        var = vp9_variance8x8(x->src.u_buffer, x->src.uv_stride,

+                              &xd->predictor[256], 8, &sse);

+        model_rd_from_var_lapndz(var, 8 * 8, xd->block[16].dequant[1] >> 3,

+                                 &tmp_rate_u, &tmp_dist_u);

+        var = vp9_variance8x8(x->src.v_buffer, x->src.uv_stride,

+                              &xd->predictor[320], 8, &sse);

+        model_rd_from_var_lapndz(var, 8 * 8, xd->block[20].dequant[1] >> 3,

+                                 &tmp_rate_v, &tmp_dist_v);

+        rd = RDCOST(x->rdmult, x->rddiv,

+                    rs + tmp_rate_y + tmp_rate_u + tmp_rate_v,

+                    tmp_dist_y + tmp_dist_u + tmp_dist_v);

+        if (!interpolating_intpel_seen && intpel_mv &&

+            vp9_is_interpolating_filter[mbmi->interp_filter]) {

+          tmp_rate_y_i = tmp_rate_y;

+          tmp_rate_u_i = tmp_rate_u;

+          tmp_rate_v_i = tmp_rate_v;

+          tmp_dist_y_i = tmp_dist_y;

+          tmp_dist_u_i = tmp_dist_u;

+          tmp_dist_v_i = tmp_dist_v;

+        }

+      }

+      newbest = (switchable_filter_index == 0 || rd < best_rd);

+      if (newbest) {

+        best_rd = rd;

+        *best_filter = mbmi->interp_filter;

+      }

+      if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||

+          (cm->mcomp_filter_type != SWITCHABLE &&

+           cm->mcomp_filter_type == mbmi->interp_filter)) {

+        vpx_memcpy(tmp_ybuf, xd->predictor, sizeof(unsigned char) * 256);

+        vpx_memcpy(tmp_ubuf, xd->predictor + 256, sizeof(unsigned char) * 64);

+        vpx_memcpy(tmp_vbuf, xd->predictor + 320, sizeof(unsigned char) * 64);

+        pred_exists = 1;

+      }

+      interpolating_intpel_seen |=

+        intpel_mv && vp9_is_interpolating_filter[mbmi->interp_filter];

-#endif

+  // Set the appripriate filter

+  if (cm->mcomp_filter_type != SWITCHABLE)

+    mbmi->interp_filter = cm->mcomp_filter_type;

+  else

+    mbmi->interp_filter = *best_filter;

+  vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

+  if (pred_exists) {

+    if (block_size == BLOCK_64X64) {

+      for (i = 0; i < 64; ++i)

+        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,

+                   sizeof(unsigned char) * 64);

+      for (i = 0; i < 32; ++i)

+        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,

+                   sizeof(unsigned char) * 32);

+      for (i = 0; i < 32; ++i)

+        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,

+                   sizeof(unsigned char) * 32);

+    } else if (block_size == BLOCK_32X32) {

+      for (i = 0; i < 32; ++i)

+        vpx_memcpy(xd->dst.y_buffer + i * xd->dst.y_stride,  tmp_ybuf + i * 64,

+                   sizeof(unsigned char) * 32);

+      for (i = 0; i < 16; ++i)

+        vpx_memcpy(xd->dst.u_buffer + i * xd->dst.uv_stride, tmp_ubuf + i * 32,

+                   sizeof(unsigned char) * 16);

+      for (i = 0; i < 16; ++i)

+        vpx_memcpy(xd->dst.v_buffer + i * xd->dst.uv_stride, tmp_vbuf + i * 32,

+                   sizeof(unsigned char) * 16);

+    } else {

+      vpx_memcpy(xd->predictor, tmp_ybuf, sizeof(unsigned char) * 256);

+      vpx_memcpy(xd->predictor + 256, tmp_ubuf, sizeof(unsigned char) * 64);

+      vpx_memcpy(xd->predictor + 320, tmp_vbuf, sizeof(unsigned char) * 64);

+    }

+  } else {

+    // Handles the special case when a filter that is not in the

+    // switchable list (ex. bilinear, 6-tap) is indicated at the frame level

+    if (block_size == BLOCK_64X64) {

+      vp9_build_inter64x64_predictors_sb(xd,

+                                         xd->dst.y_buffer,

+                                         xd->dst.u_buffer,

+                                         xd->dst.v_buffer,

+                                         xd->dst.y_stride,

+                                         xd->dst.uv_stride,

+                                         mb_row, mb_col);

+    } else if (block_size == BLOCK_32X32) {

+      vp9_build_inter32x32_predictors_sb(xd,

+                                         xd->dst.y_buffer,

+                                         xd->dst.u_buffer,

+                                         xd->dst.v_buffer,

+                                         xd->dst.y_stride,

+                                         xd->dst.uv_stride,

+                                         mb_row, mb_col);

+    } else {

+      vp9_build_inter16x16_predictors_mb(xd, xd->predictor,

+                                         xd->predictor + 256,

+                                         xd->predictor + 320,

+                                         16, 8, mb_row, mb_col);

+    }

+  }

+  if (cpi->common.mcomp_filter_type == SWITCHABLE) {

+    const int c = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);

+    const int m = vp9_switchable_interp_map[mbmi->interp_filter];

+    *rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs[c][m];

+  }

   if (cpi->active_map_enabled && x->active_ptr[0] == 0)

     x->skip = 1;

   else if (x->encode_breakout) {

-    unsigned int sse, var;

+    unsigned int var, sse;

     int threshold = (xd->block[0].dequant[1]

                      * xd->block[0].dequant[1] >> 4);

@@ -3404,9 +4035,9 @@

     if ((int)sse < threshold) {

-      unsigned int q2dc = xd->block[24].dequant[0];

+      unsigned int q2dc = xd->block[0].dequant[0];

       /* If there is no codeable 2nd order dc

-       or a very small uniform pixel change change */

+         or a very small uniform pixel change change */

       if ((sse - var < q2dc * q2dc >> 4) ||

           (sse / 2 > var && sse - var < 64)) {

         // Check u and v to make sure skip is ok

@@ -3447,17 +4078,6 @@

-  if (!(*mode_excluded)) {

-    if (is_comp_pred) {

-      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);

-    } else {

-      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);

-    }

-#if CONFIG_COMP_INTERINTRA_PRED

-    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;

-#endif

-  }

   if (!x->skip) {

     if (block_size == BLOCK_64X64) {

       int skippable_y, skippable_uv;

@@ -3491,30 +4111,32 @@

       *skippable = skippable_y && skippable_uv;

     } else {

       assert(block_size == BLOCK_16X16);

-      vp9_build_1st_inter16x16_predictors_mbuv(xd, &xd->predictor[256],

-                                               &xd->predictor[320], 8);

-      if (is_comp_pred)

-        vp9_build_2nd_inter16x16_predictors_mbuv(xd, &xd->predictor[256],

-                                                 &xd->predictor[320], 8);

-#if CONFIG_COMP_INTERINTRA_PRED

-      if (is_comp_interintra_pred) {

-        vp9_build_interintra_16x16_predictors_mbuv(xd, &xd->predictor[256],

-                                                   &xd->predictor[320], 8);

-      }

-#endif

       inter_mode_cost(cpi, x, rate2, distortion,

                       rate_y, distortion_y, rate_uv, distortion_uv,

                       skippable, txfm_cache);

+  if (!(*mode_excluded)) {

+    if (is_comp_pred) {

+      *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);

+    } else {

+      *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);

+    }

+#if CONFIG_COMP_INTERINTRA_PRED

+    if (is_comp_interintra_pred && !cm->use_interintra) *mode_excluded = 1;

+#endif

+  }

   return this_rd;  // if 0, this will be re-calculated by caller

 static void rd_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,

-                               int recon_yoffset, int recon_uvoffset,

+                               int mb_row, int mb_col,

                                int *returnrate, int *returndistortion,

                                int64_t *returnintra) {

+  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,

+    VP9_ALT_FLAG };

   VP9_COMMON *cm = &cpi->common;

   MACROBLOCKD *xd = &x->e_mbd;

   union b_mode_info best_bmodes[16];

@@ -3540,10 +4162,14 @@

 #if CONFIG_COMP_INTERINTRA_PRED

   int is_best_interintra = 0;

   int64_t best_intra16_rd = INT64_MAX;

-  int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;

+  int best_intra16_mode = DC_PRED;

+#if SEPARATE_INTERINTRA_UV

+  int best_intra16_uv_mode = DC_PRED;

 #endif

+#endif

   int64_t best_overall_rd = INT64_MAX;

   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;

+  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;

   int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;

   int uv_intra_skippable = 0;

   int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;

@@ -3551,7 +4177,6 @@

   int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);

   int distortion_uv = INT_MAX;

   int64_t best_yrd = INT64_MAX;

-  int switchable_filter_index = 0;

   MB_PREDICTION_MODE uv_intra_mode;

   MB_PREDICTION_MODE uv_intra_mode_8x8 = 0;

@@ -3561,7 +4186,7 @@

   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];

   int frame_mdcounts[4][4];

-  uint8_t *y_buffer[4], *u_buffer[4], *v_buffer[4];

+  YV12_BUFFER_CONFIG yv12_mb[4];

   unsigned int ref_costs[MAX_REF_FRAMES];

   int_mv seg_mvs[NB_PARTITIONINGS][16 /* n_blocks */][MAX_REF_FRAMES - 1];

@@ -3569,6 +4194,8 @@

   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,

                                              cpi->common.y1dc_delta_q);

+  struct scale_factors scale_factor[4];

   vpx_memset(mode8x8, 0, sizeof(mode8x8));

   vpx_memset(&frame_mv, 0, sizeof(frame_mv));

   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));

@@ -3592,24 +4219,24 @@

   if (cpi->ref_frame_flags & VP9_LAST_FLAG) {

-    setup_buffer_inter(cpi, x, cpi->common.lst_fb_idx, LAST_FRAME,

-                       BLOCK_16X16, recon_yoffset, recon_uvoffset,

+    setup_buffer_inter(cpi, x, cpi->lst_fb_idx,

+                       LAST_FRAME, BLOCK_16X16, mb_row, mb_col,

                        frame_mv[NEARESTMV], frame_mv[NEARMV],

-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);

+                       frame_mdcounts, yv12_mb, scale_factor);

   if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {

-    setup_buffer_inter(cpi, x, cpi->common.gld_fb_idx, GOLDEN_FRAME,

-                       BLOCK_16X16, recon_yoffset, recon_uvoffset,

+    setup_buffer_inter(cpi, x, cpi->gld_fb_idx,

+                       GOLDEN_FRAME, BLOCK_16X16, mb_row, mb_col,

                        frame_mv[NEARESTMV], frame_mv[NEARMV],

-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);

+                       frame_mdcounts, yv12_mb, scale_factor);

   if (cpi->ref_frame_flags & VP9_ALT_FLAG) {

-    setup_buffer_inter(cpi, x, cpi->common.alt_fb_idx, ALTREF_FRAME,

-                       BLOCK_16X16, recon_yoffset, recon_uvoffset,

+    setup_buffer_inter(cpi, x, cpi->alt_fb_idx,

+                       ALTREF_FRAME, BLOCK_16X16, mb_row, mb_col,

                        frame_mv[NEARESTMV], frame_mv[NEARMV],

-                       frame_mdcounts, y_buffer, u_buffer, v_buffer);

+                       frame_mdcounts, yv12_mb, scale_factor);

   *returnintra = INT64_MAX;

@@ -3620,6 +4247,8 @@

   cpi->zbin_mode_boost = 0;

   vp9_update_zbin_extra(cpi, x);

+  xd->mode_info_context->mbmi.mode = DC_PRED;

   rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate,

                           &uv_intra_rate_tokenonly, &uv_intra_distortion,

                           &uv_intra_skippable);

@@ -3638,8 +4267,7 @@

   // that depend on the current prediction etc.

   estimate_ref_frame_costs(cpi, segment_id, ref_costs);

-  for (mode_index = 0; mode_index < MAX_MODES;

-       mode_index += (!switchable_filter_index)) {

+  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {

     int64_t this_rd = INT64_MAX;

     int disable_skip = 0, skippable = 0;

     int other_cost = 0;

@@ -3649,6 +4277,7 @@

 #endif

     int mode_excluded = 0;

     int64_t txfm_cache[NB_TXFM_MODES] = { 0 };

+    YV12_BUFFER_CONFIG *scaled_ref_frame;

     // These variables hold are rolling total cost and distortion for this mode

     rate2 = 0;

@@ -3664,24 +4293,38 @@

     mbmi->ref_frame = vp9_mode_order[mode_index].ref_frame;

     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;

-    // Evaluate all sub-pel filters irrespective of whether we can use

-    // them for this frame.

-    if (this_mode >= NEARESTMV && this_mode <= SPLITMV) {

-      mbmi->interp_filter =

-          vp9_switchable_interp[switchable_filter_index++];

-      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)

-        switchable_filter_index = 0;

-      if ((cm->mcomp_filter_type != SWITCHABLE) &&

-          (cm->mcomp_filter_type != mbmi->interp_filter)) {

-        mode_excluded = 1;

-      }

-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

-    }

+    mbmi->interp_filter = cm->mcomp_filter_type;

+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,

+                      scale_factor);

+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

     // Test best rd so far against threshold for trying this mode.

     if (best_rd <= cpi->rd_threshes[mode_index])

       continue;

+    // Ensure that the references used by this mode are available.

+    if (mbmi->ref_frame &&

+        !(cpi->ref_frame_flags & flag_list[mbmi->ref_frame]))

+      continue;

+    if (mbmi->second_ref_frame > 0 &&

+        !(cpi->ref_frame_flags & flag_list[mbmi->second_ref_frame]))

+      continue;

+    // only scale on zeromv.

+    if (mbmi->ref_frame > 0 &&

+          (yv12_mb[mbmi->ref_frame].y_width != cm->mb_cols * 16 ||

+           yv12_mb[mbmi->ref_frame].y_height != cm->mb_rows * 16) &&

+        this_mode != ZEROMV)

+      continue;

+    if (mbmi->second_ref_frame > 0 &&

+          (yv12_mb[mbmi->second_ref_frame].y_width != cm->mb_cols * 16 ||

+           yv12_mb[mbmi->second_ref_frame].y_height != cm->mb_rows * 16) &&

+        this_mode != ZEROMV)

+      continue;

     // current coding mode under rate-distortion optimization test loop

 #if CONFIG_COMP_INTERINTRA_PRED

     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);

@@ -3693,18 +4336,16 @@

     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&

         !vp9_check_segref(xd, segment_id, mbmi->ref_frame)) {

       continue;

-    // If the segment mode feature is enabled....

+    // If the segment skip feature is enabled....

     // then do nothing if the current mode is not allowed..

-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&

-               (this_mode !=

-                vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {

+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&

+               (this_mode != ZEROMV)) {

       continue;

-    // Disable this drop out case if either the mode or ref frame

-    // segment level feature is enabled for this segment. This is to

+    // Disable this drop out case if  the ref frame segment

+    // level feature is enabled for this segment. This is to

     // prevent the possibility that the we end up unable to pick any mode.

-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&

-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {

-      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,

+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {

+      // Only consider ZEROMV/ALTREF_FRAME for alt ref frame overlay,

       // unless ARNR filtering is enabled in which case we want

       // an unfiltered alternative

       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {

@@ -3716,22 +4357,31 @@

     /* everything but intra */

+    scaled_ref_frame = NULL;

     if (mbmi->ref_frame) {

       int ref = mbmi->ref_frame;

+      int fb;

-      xd->pre.y_buffer = y_buffer[ref];

-      xd->pre.u_buffer = u_buffer[ref];

-      xd->pre.v_buffer = v_buffer[ref];

+      xd->pre = yv12_mb[ref];

       best_ref_mv = mbmi->ref_mvs[ref][0];

       vpx_memcpy(mdcounts, frame_mdcounts[ref], sizeof(mdcounts));

+      if (mbmi->ref_frame == LAST_FRAME) {

+        fb = cpi->lst_fb_idx;

+      } else if (mbmi->ref_frame == GOLDEN_FRAME) {

+        fb = cpi->gld_fb_idx;

+      } else {

+        fb = cpi->alt_fb_idx;

+      }

+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])

+        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];

     if (mbmi->second_ref_frame > 0) {

       int ref = mbmi->second_ref_frame;

-      xd->second_pre.y_buffer = y_buffer[ref];

-      xd->second_pre.u_buffer = u_buffer[ref];

-      xd->second_pre.v_buffer = v_buffer[ref];

+      xd->second_pre = yv12_mb[ref];

       second_best_ref_mv = mbmi->ref_mvs[ref][0];

@@ -3798,8 +4448,7 @@

           // the BPRED mode : x->mbmode_cost[xd->frame_type][BPRED];

           mbmi->txfm_size = TX_4X4;

           tmp_rd = rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y,

-                                             &distortion, best_yrd,

-                                             cpi->update_context);

+                                             &distortion, best_yrd);

           rate2 += rate;

           rate2 += intra_cost_penalty;

           distortion2 += distortion;

@@ -3816,65 +4465,11 @@

         break;

         case I8X8_PRED: {

-          int cost0 = vp9_cost_bit(cm->prob_tx[0], 0);

-          int cost1 = vp9_cost_bit(cm->prob_tx[0], 1);

-          int64_t tmp_rd_4x4s, tmp_rd_8x8s;

-          int64_t tmp_rd_4x4, tmp_rd_8x8, tmp_rd;

-          int r4x4, tok4x4, d4x4, r8x8, tok8x8, d8x8;

-          mbmi->txfm_size = TX_4X4;

-          tmp_rd_4x4 = rd_pick_intra8x8mby_modes(cpi, x, &r4x4, &tok4x4,

-                                                 &d4x4, best_yrd);

-          mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;

-          mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;

-          mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;

-          mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;

-          mbmi->txfm_size = TX_8X8;

-          tmp_rd_8x8 = rd_pick_intra8x8mby_modes(cpi, x, &r8x8, &tok8x8,

-                                                 &d8x8, best_yrd);

-          txfm_cache[ONLY_4X4]  = tmp_rd_4x4;

-          txfm_cache[ALLOW_8X8] = tmp_rd_8x8;

-          txfm_cache[ALLOW_16X16] = tmp_rd_8x8;

-          tmp_rd_4x4s = tmp_rd_4x4 + RDCOST(x->rdmult, x->rddiv, cost0, 0);

-          tmp_rd_8x8s = tmp_rd_8x8 + RDCOST(x->rdmult, x->rddiv, cost1, 0);

-          txfm_cache[TX_MODE_SELECT] = tmp_rd_4x4s < tmp_rd_8x8s ? tmp_rd_4x4s : tmp_rd_8x8s;

-          if (cm->txfm_mode == TX_MODE_SELECT) {

-            if (tmp_rd_4x4s < tmp_rd_8x8s) {

-              rate = r4x4 + cost0;

-              rate_y = tok4x4 + cost0;

-              distortion = d4x4;

-              mbmi->txfm_size = TX_4X4;

-              tmp_rd = tmp_rd_4x4s;

-            } else {

-              rate = r8x8 + cost1;

-              rate_y = tok8x8 + cost1;

-              distortion = d8x8;

-              mbmi->txfm_size = TX_8X8;

-              tmp_rd = tmp_rd_8x8s;

+          int64_t tmp_rd;

-              mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;

-              mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;

-              mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;

-              mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;

-            }

-          } else if (cm->txfm_mode == ONLY_4X4) {

-            rate = r4x4;

-            rate_y = tok4x4;

-            distortion = d4x4;

-            mbmi->txfm_size = TX_4X4;

-            tmp_rd = tmp_rd_4x4;

-          } else {

-            rate = r8x8;

-            rate_y = tok8x8;

-            distortion = d8x8;

-            mbmi->txfm_size = TX_8X8;

-            tmp_rd = tmp_rd_8x8;

-            mode8x8[0] = xd->mode_info_context->bmi[0].as_mode.first;

-            mode8x8[1] = xd->mode_info_context->bmi[2].as_mode.first;

-            mode8x8[2] = xd->mode_info_context->bmi[8].as_mode.first;

-            mode8x8[3] = xd->mode_info_context->bmi[10].as_mode.first;

-          }

+          tmp_rd = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate, &rate_y,

+                                                      &distortion, mode8x8,

+                                                      best_yrd, txfm_cache);

           rate2 += rate;

           rate2 += intra_cost_penalty;

           distortion2 += distortion;

@@ -3898,22 +4493,102 @@

     // special case it.

     else if (this_mode == SPLITMV) {

       const int is_comp_pred = mbmi->second_ref_frame > 0;

-      int64_t tmp_rd, this_rd_thresh;

+      int64_t this_rd_thresh;

+      int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;

+      int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;

+      int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;

+      int switchable_filter_index;

       int_mv *second_ref = is_comp_pred ? &second_best_ref_mv : NULL;

+      union b_mode_info tmp_best_bmodes[16];

+      MB_MODE_INFO tmp_best_mbmode;

+      PARTITION_INFO tmp_best_partition;

+      int pred_exists = 0;

       this_rd_thresh =

-              (mbmi->ref_frame == LAST_FRAME) ?

+          (mbmi->ref_frame == LAST_FRAME) ?

           cpi->rd_threshes[THR_NEWMV] : cpi->rd_threshes[THR_NEWA];

       this_rd_thresh =

-              (mbmi->ref_frame == GOLDEN_FRAME) ?

+          (mbmi->ref_frame == GOLDEN_FRAME) ?

           cpi->rd_threshes[THR_NEWG] : this_rd_thresh;

+      xd->mode_info_context->mbmi.txfm_size = TX_4X4;

-      tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,

-                                           second_ref, best_yrd, mdcounts,

-                                           &rate, &rate_y, &distortion,

-                                           &skippable,

-                                           (int)this_rd_thresh, seg_mvs,

-                                           txfm_cache);

+      for (switchable_filter_index = 0;

+           switchable_filter_index < VP9_SWITCHABLE_FILTERS;

+           ++switchable_filter_index) {

+        int newbest;

+        mbmi->interp_filter =

+            vp9_switchable_interp[switchable_filter_index];

+        vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,

+                                             second_ref, best_yrd, mdcounts,

+                                             &rate, &rate_y, &distortion,

+                                             &skippable,

+                                             (int)this_rd_thresh, seg_mvs,

+                                             txfm_cache);

+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {

+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs

+                   [vp9_get_pred_context(&cpi->common, xd,

+                                         PRED_SWITCHABLE_INTERP)]

+                   [vp9_switchable_interp_map[mbmi->interp_filter]];

+          tmp_rd += RDCOST(x->rdmult, x->rddiv, rs, 0);

+        }

+        newbest = (tmp_rd < tmp_best_rd);

+        if (newbest) {

+          tmp_best_filter = mbmi->interp_filter;

+          tmp_best_rd = tmp_rd;

+        }

+        if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||

+            (mbmi->interp_filter == cm->mcomp_filter_type &&

+             cm->mcomp_filter_type != SWITCHABLE)) {

+          tmp_best_rdu = tmp_rd;

+          tmp_best_rate = rate;

+          tmp_best_ratey = rate_y;

+          tmp_best_distortion = distortion;

+          tmp_best_skippable = skippable;

+          vpx_memcpy(&tmp_best_mbmode, mbmi, sizeof(MB_MODE_INFO));

+          vpx_memcpy(&tmp_best_partition, x->partition_info,

+                     sizeof(PARTITION_INFO));

+          for (i = 0; i < 16; i++) {

+            tmp_best_bmodes[i] = xd->block[i].bmi;

+          }

+          pred_exists = 1;

+        }

+      }  // switchable_filter_index loop

+      mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?

+                             tmp_best_filter : cm->mcomp_filter_type);

+      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

+      if (!pred_exists) {

+        // Handles the special case when a filter that is not in the

+        // switchable list (bilinear, 6-tap) is indicated at the frame level

+        tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv,

+                                             second_ref, best_yrd, mdcounts,

+                                             &rate, &rate_y, &distortion,

+                                             &skippable,

+                                             (int)this_rd_thresh, seg_mvs,

+                                             txfm_cache);

+      } else {

+        if (cpi->common.mcomp_filter_type == SWITCHABLE) {

+          int rs = SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs

+                   [vp9_get_pred_context(&cpi->common, xd,

+                                         PRED_SWITCHABLE_INTERP)]

+                   [vp9_switchable_interp_map[mbmi->interp_filter]];

+          tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);

+        }

+        tmp_rd = tmp_best_rdu;

+        rate = tmp_best_rate;

+        rate_y = tmp_best_ratey;

+        distortion = tmp_best_distortion;

+        skippable = tmp_best_skippable;

+        vpx_memcpy(mbmi, &tmp_best_mbmode, sizeof(MB_MODE_INFO));

+        vpx_memcpy(x->partition_info, &tmp_best_partition,

+                   sizeof(PARTITION_INFO));

+        for (i = 0; i < 16; i++) {

+          xd->block[i].bmi = xd->mode_info_context->bmi[i] = tmp_best_bmodes[i];

+        }

+      }

       rate2 += rate;

       distortion2 += distortion;

@@ -3920,7 +4595,7 @@

       if (cpi->common.mcomp_filter_type == SWITCHABLE)

         rate2 += SWITCHABLE_INTERP_RATE_FACTOR * x->switchable_interp_costs

             [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]

-                [vp9_switchable_interp_map[mbmi->interp_filter]];

+            [vp9_switchable_interp_map[mbmi->interp_filter]];

       // If even the 'Y' rd value of split is higher than best so far

       // then dont bother looking at UV

@@ -3927,8 +4602,11 @@

       if (tmp_rd < best_yrd) {

         int uv_skippable;

-        rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,

-                       cpi->common.full_pixel);

+        vp9_build_inter4x4_predictors_mbuv(&x->e_mbd, mb_row, mb_col);

+        vp9_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,

+                          x->e_mbd.predictor, x->src.uv_stride);

+        rd_inter16x16_uv_4x4(cpi, x, &rate_uv, &distortion_uv,

+                             cpi->common.full_pixel, &uv_skippable, 1);

         rate2 += rate_uv;

         distortion2 += distortion_uv;

         skippable = skippable && uv_skippable;

@@ -3969,8 +4647,9 @@

 #endif

                                   &rate_y, &distortion,

                                   &rate_uv, &distortion_uv,

-                                  &mode_excluded, &disable_skip, recon_yoffset,

-                                  mode_index, frame_mv);

+                                  &mode_excluded, &disable_skip,

+                                  mode_index, &tmp_best_filter, frame_mv,

+                                  scaled_ref_frame, mb_row, mb_col);

       if (this_rd == INT64_MAX)

         continue;

@@ -3995,10 +4674,8 @@

       if (cpi->common.mb_no_coeff_skip) {

         int mb_skip_allowed;

-        // Is Mb level skip allowed for this mb.

-        mb_skip_allowed =

-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||

-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

+        // Is Mb level skip allowed (i.e. not coded at segment level).

+        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);

         if (skippable) {

           mbmi->mb_skip_coeff = 1;

@@ -4050,8 +4727,10 @@

         (this_rd < best_intra16_rd)) {

       best_intra16_rd = this_rd;

       best_intra16_mode = this_mode;

+#if SEPARATE_INTERINTRA_UV

       best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?

                               uv_intra_mode_8x8 : uv_intra_mode);

+#endif

 #endif

@@ -4061,7 +4740,7 @@

     if (this_rd < best_overall_rd) {

       best_overall_rd = this_rd;

-      best_filter = mbmi->interp_filter;

+      best_filter = tmp_best_filter;

       best_mode = this_mode;

 #if CONFIG_COMP_INTERINTRA_PRED

       is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);

@@ -4175,7 +4854,7 @@

     if (x->skip && !mode_excluded)

       break;

-    }

+  }

   assert((cm->mcomp_filter_type == SWITCHABLE) ||

          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||

@@ -4204,12 +4883,11 @@

         cpi->rd_thresh_mult[best_mode_index];

-  // This code force Altref,0,0 and skip for the frame that overlays a

+  // This code forces Altref,0,0 and skip for the frame that overlays a

   // an alrtef unless Altref is filtered. However, this is unsafe if

-  // segment level coding of ref frame or mode is enabled for this

+  // segment level coding of ref frame is enabled for this

   // segment.

   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&

-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&

       cpi->is_src_frame_alt_ref &&

       (cpi->oxcf.arnr_max_frames == 0) &&

       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {

@@ -4224,6 +4902,8 @@

     mbmi->mb_skip_coeff =

       (cpi->common.mb_no_coeff_skip) ? 1 : 0;

     mbmi->partitioning = 0;

+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,

+                      scale_factor);

     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));

     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));

@@ -4244,10 +4924,12 @@

   if (best_mbmode.mode == SPLITMV) {

     for (i = 0; i < 16; i++)

-      xd->mode_info_context->bmi[i].as_mv.first.as_int = best_bmodes[i].as_mv.first.as_int;

+      xd->mode_info_context->bmi[i].as_mv[0].as_int =

+          best_bmodes[i].as_mv[0].as_int;

     if (mbmi->second_ref_frame > 0)

       for (i = 0; i < 16; i++)

-        xd->mode_info_context->bmi[i].as_mv.second.as_int = best_bmodes[i].as_mv.second.as_int;

+        xd->mode_info_context->bmi[i].as_mv[1].as_int =

+            best_bmodes[i].as_mv[1].as_int;

     vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));

@@ -4265,7 +4947,7 @@

   if (!x->skip) {

     for (i = 0; i < NB_TXFM_MODES; i++) {

       if (best_txfm_rd[i] == INT64_MAX)

-        best_txfm_diff[i] = INT_MIN;

+        best_txfm_diff[i] = 0;

       else

         best_txfm_diff[i] = best_rd - best_txfm_rd[i];

@@ -4274,6 +4956,8 @@

 end:

+  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,

+                    scale_factor);

   store_coding_context(x, &x->mb_context[xd->sb_index][xd->mb_index],

                        best_mode_index, &best_partition,

                        &mbmi->ref_mvs[mbmi->ref_frame][0],

@@ -4291,22 +4975,29 @@

   int rate_y_tokenonly = 0, rate_uv_tokenonly;

   int dist_y = 0, dist_uv;

   int y_skip = 0, uv_skip;

-  int64_t txfm_cache[NB_TXFM_MODES];

+  int64_t txfm_cache[NB_TXFM_MODES], err;

+  int i;

-  rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,

-                                   &dist_y, &y_skip, txfm_cache);

+  xd->mode_info_context->mbmi.mode = DC_PRED;

+  err = rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,

+                               &dist_y, &y_skip, txfm_cache);

   rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,

-                                     &dist_uv, &uv_skip);

+                          &dist_uv, &uv_skip);

   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {

     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +

                   vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);

     *returndist = dist_y + (dist_uv >> 2);

+    memset(x->sb32_context[xd->sb_index].txfm_rd_diff, 0,

+           sizeof(x->sb32_context[xd->sb_index].txfm_rd_diff));

   } else {

     *returnrate = rate_y + rate_uv;

     if (cpi->common.mb_no_coeff_skip)

       *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);

     *returndist = dist_y + (dist_uv >> 2);

+    for (i = 0; i < NB_TXFM_MODES; i++) {

+      x->sb32_context[xd->sb_index].txfm_rd_diff[i] = err - txfm_cache[i];

+    }

@@ -4319,22 +5010,29 @@

   int rate_y_tokenonly = 0, rate_uv_tokenonly;

   int dist_y = 0, dist_uv;

   int y_skip = 0, uv_skip;

-  int64_t txfm_cache[NB_TXFM_MODES];

+  int64_t txfm_cache[NB_TXFM_MODES], err;

+  int i;

-  rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,

-                                     &dist_y, &y_skip, txfm_cache);

+  xd->mode_info_context->mbmi.mode = DC_PRED;

+  err = rd_pick_intra_sb64y_mode(cpi, x, &rate_y, &rate_y_tokenonly,

+                                 &dist_y, &y_skip, txfm_cache);

   rd_pick_intra_sb64uv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,

-                                       &dist_uv, &uv_skip);

+                            &dist_uv, &uv_skip);

   if (cpi->common.mb_no_coeff_skip && y_skip && uv_skip) {

     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +

     vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);

     *returndist = dist_y + (dist_uv >> 2);

+    memset(x->sb64_context.txfm_rd_diff, 0,

+           sizeof(x->sb64_context.txfm_rd_diff));

   } else {

     *returnrate = rate_y + rate_uv;

     if (cm->mb_no_coeff_skip)

       *returnrate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);

     *returndist = dist_y + (dist_uv >> 2);

+    for (i = 0; i < NB_TXFM_MODES; i++) {

+      x->sb64_context.txfm_rd_diff[i] = err - txfm_cache[i];

+    }

@@ -4356,13 +5054,14 @@

   int mode16x16;

   int mode8x8[4];

   int dist;

-  int modeuv, uv_intra_skippable, uv_intra_skippable_8x8;

+  int modeuv, modeuv8x8, uv_intra_skippable, uv_intra_skippable_8x8;

   int y_intra16x16_skippable = 0;

-  int64_t txfm_cache[NB_TXFM_MODES];

-  TX_SIZE txfm_size_16x16;

+  int64_t txfm_cache[2][NB_TXFM_MODES];

+  TX_SIZE txfm_size_16x16, txfm_size_8x8;

   int i;

   mbmi->ref_frame = INTRA_FRAME;

+  mbmi->mode = DC_PRED;

   rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv,

                           &uv_intra_skippable);

   modeuv = mbmi->uv_mode;

@@ -4369,47 +5068,71 @@

   if (cpi->common.txfm_mode != ONLY_4X4) {

     rd_pick_intra_mbuv_mode_8x8(cpi, x, &rateuv8x8, &rateuv8x8_tokenonly,

                                 &distuv8x8, &uv_intra_skippable_8x8);

+    modeuv8x8 = mbmi->uv_mode;

   } else {

     uv_intra_skippable_8x8 = uv_intra_skippable;

     rateuv8x8 = rateuv;

     distuv8x8 = distuv;

     rateuv8x8_tokenonly = rateuv_tokenonly;

+    modeuv8x8 = modeuv;

   // current macroblock under rate-distortion optimization test loop

   error16x16 = rd_pick_intra16x16mby_mode(cpi, x, &rate16x16,

                                           &rate16x16_tokenonly, &dist16x16,

-                                          &y_intra16x16_skippable, txfm_cache);

+                                          &y_intra16x16_skippable,

+                                          txfm_cache[1]);

   mode16x16 = mbmi->mode;

   txfm_size_16x16 = mbmi->txfm_size;

+  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&

+      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||

+       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {

+    error16x16 -= RDCOST(x->rdmult, x->rddiv, rate16x16_tokenonly, 0);

+    rate16x16 -= rate16x16_tokenonly;

+  }

+  for (i = 0; i < NB_TXFM_MODES; i++) {

+    txfm_cache[0][i] = error16x16 - txfm_cache[1][cm->txfm_mode] +

+                       txfm_cache[1][i];

+  }

-  // FIXME(rbultje) support transform-size selection

-  mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;

-  error8x8 = rd_pick_intra8x8mby_modes(cpi, x, &rate8x8, &rate8x8_tokenonly,

-                                       &dist8x8, error16x16);

-  mode8x8[0]= xd->mode_info_context->bmi[0].as_mode.first;

-  mode8x8[1]= xd->mode_info_context->bmi[2].as_mode.first;

-  mode8x8[2]= xd->mode_info_context->bmi[8].as_mode.first;

-  mode8x8[3]= xd->mode_info_context->bmi[10].as_mode.first;

+  error8x8 = rd_pick_intra8x8mby_modes_and_txsz(cpi, x, &rate8x8,

+                                                &rate8x8_tokenonly,

+                                                &dist8x8, mode8x8,

+                                                error16x16, txfm_cache[1]);

+  txfm_size_8x8 = mbmi->txfm_size;

+  for (i = 0; i < NB_TXFM_MODES; i++) {

+    int64_t tmp_rd = error8x8 - txfm_cache[1][cm->txfm_mode] + txfm_cache[1][i];

+    if (tmp_rd < txfm_cache[0][i])

+      txfm_cache[0][i] = tmp_rd;

+  }

+  mbmi->txfm_size = TX_4X4;

   error4x4 = rd_pick_intra4x4mby_modes(cpi, x,

                                        &rate4x4, &rate4x4_tokenonly,

-                                       &dist4x4, error16x16,

-                                       cpi->update_context);

+                                       &dist4x4, error16x16);

+  for (i = 0; i < NB_TXFM_MODES; i++) {

+    if (error4x4 < txfm_cache[0][i])

+      txfm_cache[0][i] = error4x4;

+  }

   mbmi->mb_skip_coeff = 0;

-  if (cpi->common.mb_no_coeff_skip &&

-      y_intra16x16_skippable && uv_intra_skippable_8x8) {

+  if (cpi->common.mb_no_coeff_skip && y_intra16x16_skippable &&

+      ((cm->txfm_mode == ONLY_4X4 && uv_intra_skippable) ||

+       (cm->txfm_mode != ONLY_4X4 && uv_intra_skippable_8x8))) {

     mbmi->mb_skip_coeff = 1;

     mbmi->mode = mode16x16;

-    mbmi->uv_mode = modeuv;

-    rate = rateuv8x8 + rate16x16 - rateuv8x8_tokenonly - rate16x16_tokenonly +

-           vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);

-    dist = dist16x16 + (distuv8x8 >> 2);

+    mbmi->uv_mode = (cm->txfm_mode == ONLY_4X4) ? modeuv : modeuv8x8;

+    rate = rate16x16 + vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 1);

+    dist = dist16x16;

+    if (cm->txfm_mode == ONLY_4X4) {

+      rate += rateuv - rateuv_tokenonly;

+      dist += (distuv >> 2);

+    } else {

+      rate += rateuv8x8 - rateuv8x8_tokenonly;

+      dist += (distuv8x8 >> 2);

+    }

     mbmi->txfm_size = txfm_size_16x16;

-    memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,

-           sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));

   } else if (error8x8 > error16x16) {

     if (error4x4 < error16x16) {

       rate = rateuv + rate4x4;

@@ -4416,17 +5139,11 @@

       mbmi->mode = B_PRED;

       mbmi->txfm_size = TX_4X4;

       dist = dist4x4 + (distuv >> 2);

-      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,

-             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));

     } else {

       mbmi->txfm_size = txfm_size_16x16;

       mbmi->mode = mode16x16;

       rate = rate16x16 + rateuv8x8;

       dist = dist16x16 + (distuv8x8 >> 2);

-      for (i = 0; i < NB_TXFM_MODES; i++) {

-        x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =

-            error16x16 - txfm_cache[i];

-      }

     if (cpi->common.mb_no_coeff_skip)

       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);

@@ -4436,28 +5153,28 @@

       mbmi->mode = B_PRED;

       mbmi->txfm_size = TX_4X4;

       dist = dist4x4 + (distuv >> 2);

-      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,

-             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));

     } else {

-      // FIXME(rbultje) support transform-size selection

       mbmi->mode = I8X8_PRED;

-      mbmi->txfm_size = (cm->txfm_mode == ONLY_4X4) ? TX_4X4 : TX_8X8;

+      mbmi->txfm_size = txfm_size_8x8;

       set_i8x8_block_modes(x, mode8x8);

       rate = rate8x8 + rateuv;

       dist = dist8x8 + (distuv >> 2);

-      memset(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff, 0,

-             sizeof(x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff));

     if (cpi->common.mb_no_coeff_skip)

       rate += vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_MBSKIP), 0);

+  for (i = 0; i < NB_TXFM_MODES; i++) {

+    x->mb_context[xd->sb_index][xd->mb_index].txfm_rd_diff[i] =

+        txfm_cache[0][cm->txfm_mode] - txfm_cache[0][i];

+  }

   *returnrate = rate;

   *returndist = dist;

 static int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,

-                                         int recon_yoffset, int recon_uvoffset,

+                                         int mb_row, int mb_col,

                                          int *returnrate,

                                          int *returndistortion,

                                          int block_size) {

@@ -4471,13 +5188,13 @@

   int comp_pred, i;

   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];

   int frame_mdcounts[4][4];

-  uint8_t *y_buffer[4];

-  uint8_t *u_buffer[4];

-  uint8_t *v_buffer[4];

+  YV12_BUFFER_CONFIG yv12_mb[4];

   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,

                                     VP9_ALT_FLAG };

-  int idx_list[4] = { 0, cpi->common.lst_fb_idx, cpi->common.gld_fb_idx,

-                      cpi->common.alt_fb_idx };

+  int idx_list[4] = {0,

+                     cpi->lst_fb_idx,

+                     cpi->gld_fb_idx,

+                     cpi->alt_fb_idx};

   int mdcounts[4];

   int near_sadidx[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

   int saddone = 0;

@@ -4492,20 +5209,23 @@

 #if CONFIG_COMP_INTERINTRA_PRED

   int is_best_interintra = 0;

   int64_t best_intra16_rd = INT64_MAX;

-  int best_intra16_mode = DC_PRED, best_intra16_uv_mode = DC_PRED;

+  int best_intra16_mode = DC_PRED;

+#if SEPARATE_INTERINTRA_UV

+  int best_intra16_uv_mode = DC_PRED;

 #endif

+#endif

   int64_t best_overall_rd = INT64_MAX;

   INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;

+  INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;

   int rate_uv_4x4 = 0, rate_uv_8x8 = 0, rate_uv_tokenonly_4x4 = 0,

       rate_uv_tokenonly_8x8 = 0;

   int dist_uv_4x4 = 0, dist_uv_8x8 = 0, uv_skip_4x4 = 0, uv_skip_8x8 = 0;

   MB_PREDICTION_MODE mode_uv_4x4 = NEARESTMV, mode_uv_8x8 = NEARESTMV;

-  int switchable_filter_index = 0;

   int rate_uv_16x16 = 0, rate_uv_tokenonly_16x16 = 0;

   int dist_uv_16x16 = 0, uv_skip_16x16 = 0;

   MB_PREDICTION_MODE mode_uv_16x16 = NEARESTMV;

+  struct scale_factors scale_factor[4];

-  x->skip = 0;

   xd->mode_info_context->mbmi.segment_id = segment_id;

   estimate_ref_frame_costs(cpi, segment_id, ref_costs);

   vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));

@@ -4518,9 +5238,9 @@

   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {

     if (cpi->ref_frame_flags & flag_list[ref_frame]) {

       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,

-                         recon_yoffset, recon_uvoffset, frame_mv[NEARESTMV],

+                         mb_row, mb_col, frame_mv[NEARESTMV],

                          frame_mv[NEARMV], frame_mdcounts,

-                         y_buffer, u_buffer, v_buffer);

+                         yv12_mb, scale_factor);

     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;

     frame_mv[ZEROMV][ref_frame].as_int = 0;

@@ -4570,8 +5290,7 @@

-  for (mode_index = 0; mode_index < MAX_MODES;

-       mode_index += (!switchable_filter_index)) {

+  for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {

     int mode_excluded = 0;

     int64_t this_rd = INT64_MAX;

     int disable_skip = 0;

@@ -4588,10 +5307,10 @@

     // Test best rd so far against threshold for trying this mode.

     if (best_rd <= cpi->rd_threshes[mode_index] ||

         cpi->rd_threshes[mode_index] == INT_MAX) {

-      switchable_filter_index = 0;

       continue;

+    x->skip = 0;

     this_mode = vp9_mode_order[mode_index].mode;

     ref_frame = vp9_mode_order[mode_index].ref_frame;

     if (!(ref_frame == INTRA_FRAME ||

@@ -4600,6 +5319,8 @@

     mbmi->ref_frame = ref_frame;

     mbmi->second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;

+    set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,

+                      scale_factor);

     comp_pred = mbmi->second_ref_frame > INTRA_FRAME;

     mbmi->mode = this_mode;

     mbmi->uv_mode = DC_PRED;

@@ -4607,19 +5328,11 @@

     mbmi->interintra_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);

     mbmi->interintra_uv_mode = (MB_PREDICTION_MODE)(DC_PRED - 1);

 #endif

     // Evaluate all sub-pel filters irrespective of whether we can use

     // them for this frame.

-    if (this_mode >= NEARESTMV && this_mode <= SPLITMV) {

-      mbmi->interp_filter =

-          vp9_switchable_interp[switchable_filter_index++];

-      if (switchable_filter_index == VP9_SWITCHABLE_FILTERS)

-        switchable_filter_index = 0;

-      if ((cm->mcomp_filter_type != SWITCHABLE) &&

-          (cm->mcomp_filter_type != mbmi->interp_filter)) {

-        mode_excluded = 1;

-      }

-      vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

-    }

+    mbmi->interp_filter = cm->mcomp_filter_type;

+    vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);

     // if (!(cpi->ref_frame_flags & flag_list[ref_frame]))

     //  continue;

@@ -4640,10 +5353,10 @@

       if (!(cpi->ref_frame_flags & flag_list[second_ref]))

         continue;

       mbmi->second_ref_frame = second_ref;

+      set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,

+                        scale_factor);

-      xd->second_pre.y_buffer = y_buffer[second_ref];

-      xd->second_pre.u_buffer = u_buffer[second_ref];

-      xd->second_pre.v_buffer = v_buffer[second_ref];

+      xd->second_pre = yv12_mb[second_ref];

       mode_excluded =

           mode_excluded ?

               mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;

@@ -4661,9 +5374,7 @@

-    xd->pre.y_buffer = y_buffer[ref_frame];

-    xd->pre.u_buffer = u_buffer[ref_frame];

-    xd->pre.v_buffer = v_buffer[ref_frame];

+    xd->pre = yv12_mb[ref_frame];

     vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));

     // If the segment reference frame feature is enabled....

@@ -4671,16 +5382,15 @@

     if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&

         !vp9_check_segref(xd, segment_id, ref_frame)) {

       continue;

-    // If the segment mode feature is enabled....

+    // If the segment skip feature is enabled....

     // then do nothing if the current mode is not allowed..

-    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&

-               (this_mode != vp9_get_segdata(xd, segment_id, SEG_LVL_MODE))) {

+    } else if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) &&

+               (this_mode != ZEROMV)) {

       continue;

-    // Disable this drop out case if either the mode or ref frame

+    // Disable this drop out case if the ref frame

     // segment level feature is enabled for this segment. This is to

     // prevent the possibility that we end up unable to pick any mode.

-    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&

-               !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE)) {

+    } else if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {

       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,

       // unless ARNR filtering is enabled in which case we want

       // an unfiltered alternative

@@ -4722,6 +5432,20 @@

       rate2 = rate_y + x->mbmode_cost[cm->frame_type][mbmi->mode] + rate_uv;

       distortion2 = distortion_y + distortion_uv;

     } else {

+      YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;

+      int fb;

+      if (mbmi->ref_frame == LAST_FRAME) {

+        fb = cpi->lst_fb_idx;

+      } else if (mbmi->ref_frame == GOLDEN_FRAME) {

+        fb = cpi->gld_fb_idx;

+      } else {

+        fb = cpi->alt_fb_idx;

+      }

+      if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])

+        scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];

 #if CONFIG_COMP_INTERINTRA_PRED

       if (mbmi->second_ref_frame == INTRA_FRAME) {

         if (best_intra16_mode == DC_PRED - 1) continue;

@@ -4742,8 +5466,9 @@

 #endif

                                   &rate_y, &distortion_y,

                                   &rate_uv, &distortion_uv,

-                                  &mode_excluded, &disable_skip, recon_yoffset,

-                                  mode_index, frame_mv);

+                                  &mode_excluded, &disable_skip,

+                                  mode_index, &tmp_best_filter, frame_mv,

+                                  scaled_ref_frame, mb_row, mb_col);

       if (this_rd == INT64_MAX)

         continue;

@@ -4769,10 +5494,8 @@

       if (cpi->common.mb_no_coeff_skip) {

         int mb_skip_allowed;

-        // Is Mb level skip allowed for this mb.

-        mb_skip_allowed =

-          !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||

-          vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

+        // Is Mb level skip allowed (i.e. not coded at segment level).

+        mb_skip_allowed = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);

         if (skippable) {

           // Back out the coefficient coding costs

@@ -4821,8 +5544,10 @@

         (this_rd < best_intra16_rd)) {

       best_intra16_rd = this_rd;

       best_intra16_mode = this_mode;

+#if SEPARATE_INTERINTRA_UV

       best_intra16_uv_mode = (mbmi->txfm_size != TX_4X4 ?

                               mode_uv_8x8 : mode_uv_4x4);

+#endif

 #endif

@@ -4832,7 +5557,7 @@

     if (this_rd < best_overall_rd) {

       best_overall_rd = this_rd;

-      best_filter = mbmi->interp_filter;

+      best_filter = tmp_best_filter;

       best_mode = this_mode;

 #if CONFIG_COMP_INTERINTRA_PRED

       is_best_interintra = (mbmi->second_ref_frame == INTRA_FRAME);

@@ -4956,10 +5681,8 @@

   // This code forces Altref,0,0 and skip for the frame that overlays a

   // an alrtef unless Altref is filtered. However, this is unsafe if

-  // segment level coding of ref frame or mode is enabled for this

-  // segment.

+  // segment level coding of ref frame is enabled for this segment.

   if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&

-      !vp9_segfeature_active(xd, segment_id, SEG_LVL_MODE) &&

       cpi->is_src_frame_alt_ref &&

       (cpi->oxcf.arnr_max_frames == 0) &&

       (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME)) {

@@ -4971,7 +5694,7 @@

     mbmi->mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;

     mbmi->partitioning = 0;

     mbmi->txfm_size = cm->txfm_mode == TX_MODE_SELECT ?

-                      TX_16X16 : cm->txfm_mode;

+                      TX_32X32 : cm->txfm_mode;

     vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff));

     vpx_memset(best_pred_diff, 0, sizeof(best_pred_diff));

@@ -4991,7 +5714,7 @@

   if (!x->skip) {

     for (i = 0; i < NB_TXFM_MODES; i++) {

       if (best_txfm_rd[i] == INT64_MAX)

-        best_txfm_diff[i] = INT_MIN;

+        best_txfm_diff[i] = 0;

       else

         best_txfm_diff[i] = best_rd - best_txfm_rd[i];

@@ -5000,6 +5723,8 @@

  end:

+  set_scale_factors(xd, mbmi->ref_frame, mbmi->second_ref_frame,

+                    scale_factor);

     PICK_MODE_CONTEXT *p = (block_size == BLOCK_32X32) ?

                             &x->sb32_context[xd->sb_index] :

@@ -5015,24 +5740,23 @@

 int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,

-                                    int recon_yoffset, int recon_uvoffset,

+                                    int mb_row, int mb_col,

                                     int *returnrate,

                                     int *returndistortion) {

-  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,

+  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,

                                    returnrate, returndistortion, BLOCK_32X32);

 int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,

-                                    int recon_yoffset, int recon_uvoffset,

+                                    int mb_row, int mb_col,

                                     int *returnrate,

                                     int *returndistortion) {

-  return vp9_rd_pick_inter_mode_sb(cpi, x, recon_yoffset, recon_uvoffset,

+  return vp9_rd_pick_inter_mode_sb(cpi, x, mb_row, mb_col,

                                    returnrate, returndistortion, BLOCK_64X64);

 void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,

-                                    int recon_yoffset,

-                                    int recon_uvoffset,

+                                    int mb_row, int mb_col,

                                     int *totalrate, int *totaldist) {

   MACROBLOCKD *const xd = &x->e_mbd;

   MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;

@@ -5050,7 +5774,7 @@

     int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled;

-    rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate,

+    rd_pick_inter_mode(cpi, x, mb_row, mb_col, &rate,

                        &distortion, &intra_error);

     /* restore cpi->zbin_mode_boost_enabled */

--- a/vp9/encoder/vp9_rdopt.h

+++ b/vp9/encoder/vp9_rdopt.h

@@ -15,34 +15,34 @@

 #define RDCOST(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )

 #define RDCOST_8x8(RM,DM,R,D) ( ((128+((int64_t)R)*(RM)) >> 8) + ((int64_t)DM)*(D) )

-extern void vp9_initialize_rd_consts(VP9_COMP *cpi, int Qvalue);

+void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);

-extern void vp9_initialize_me_consts(VP9_COMP *cpi, int QIndex);

+void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);

-extern void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,

-                                   int *r, int *d);

+void vp9_rd_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x,

+                            int *r, int *d);

-extern void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,

-                                        int *r, int *d);

+void vp9_rd_pick_intra_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,

+                                 int *r, int *d);

-extern void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,

-                                        int *r, int *d);

+void vp9_rd_pick_intra_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,

+                                 int *r, int *d);

-extern void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,

-                                           int ref_yoffset, int ref_uvoffset,

-                                           int *r, int *d);

+void vp9_pick_mode_inter_macroblock(VP9_COMP *cpi, MACROBLOCK *x,

+                                    int mb_row, int mb_col,

+                                    int *r, int *d);

-extern int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,

-                                           int ref_yoffset, int ref_uvoffset,

-                                           int *r, int *d);

+int64_t vp9_rd_pick_inter_mode_sb32(VP9_COMP *cpi, MACROBLOCK *x,

+                                    int mb_row, int mb_col,

+                                    int *r, int *d);

-extern int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,

-                                           int ref_yoffset, int ref_uvoffset,

-                                           int *r, int *d);

+int64_t vp9_rd_pick_inter_mode_sb64(VP9_COMP *cpi, MACROBLOCK *x,

+                                    int mb_row, int mb_col,

+                                    int *r, int *d);

-extern void vp9_init_me_luts();

+void vp9_init_me_luts();

-extern void vp9_set_mbmode_and_mvs(MACROBLOCK *x,

-                                   MB_PREDICTION_MODE mb, int_mv *mv);

+void vp9_set_mbmode_and_mvs(MACROBLOCK *x,

+                            MB_PREDICTION_MODE mb, int_mv *mv);

 #endif  // VP9_ENCODER_VP9_RDOPT_H_

--- a/vp9/encoder/vp9_sad_c.c

+++ b/vp9/encoder/vp9_sad_c.c

@@ -13,12 +13,13 @@

 #include "vp9/common/vp9_sadmxn.h"

 #include "./vpx_config.h"

 #include "vpx/vpx_integer.h"

+#include "./vp9_rtcd.h"

 unsigned int vp9_sad64x64_c(const uint8_t *src_ptr,

                             int  src_stride,

                             const uint8_t *ref_ptr,

                             int  ref_stride,

-                            int max_sad) {

+                            unsigned int max_sad) {

   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 64, 64);

@@ -26,7 +27,7 @@

                             int  src_stride,

                             const uint8_t *ref_ptr,

                             int  ref_stride,

-                            int max_sad) {

+                            unsigned int max_sad) {

   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32);

@@ -34,7 +35,7 @@

                             int  src_stride,

                             const uint8_t *ref_ptr,

                             int  ref_stride,

-                            int max_sad) {

+                            unsigned int max_sad) {

   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16);

@@ -42,7 +43,7 @@

                           int  src_stride,

                           const uint8_t *ref_ptr,

                           int  ref_stride,

-                          int max_sad) {

+                          unsigned int max_sad) {

   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);

@@ -51,7 +52,7 @@

                            int  src_stride,

                            const uint8_t *ref_ptr,

                            int  ref_stride,

-                           int max_sad) {

+                           unsigned int max_sad) {

   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);

@@ -59,7 +60,7 @@

                            int  src_stride,

                            const uint8_t *ref_ptr,

                            int  ref_stride,

-                           int max_sad) {

+                           unsigned int max_sad) {

   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);

@@ -68,7 +69,7 @@

                           int  src_stride,

                           const uint8_t *ref_ptr,

                           int  ref_stride,

-                          int max_sad) {

+                          unsigned int max_sad) {

   return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);

@@ -77,12 +78,12 @@

                       const uint8_t *ref_ptr,

                       int  ref_stride,

                       unsigned int *sad_array) {

-  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,

-                                ref_ptr, ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,

-                                ref_ptr + 1, ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,

-                                ref_ptr + 2, ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride,

+                              0x7fffffff);

+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride,

+                              0x7fffffff);

+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride,

+                              0x7fffffff);

 void vp9_sad32x32x3_c(const uint8_t *src_ptr,

@@ -90,12 +91,12 @@

                       const uint8_t *ref_ptr,

                       int  ref_stride,

                       unsigned int *sad_array) {

-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,

-                                ref_ptr, ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,

-                                ref_ptr + 1, ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,

-                                ref_ptr + 2, ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr, ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 1, ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 2, ref_stride, 0x7fffffff);

 void vp9_sad64x64x8_c(const uint8_t *src_ptr,

@@ -102,31 +103,31 @@

                       int  src_stride,

                       const uint8_t *ref_ptr,

                       int  ref_stride,

-                      uint16_t *sad_array) {

-  sad_array[0] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr, ref_stride,

-                                          0x7fffffff);

-  sad_array[1] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr + 1, ref_stride,

-                                          0x7fffffff);

-  sad_array[2] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr + 2, ref_stride,

-                                          0x7fffffff);

-  sad_array[3] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr + 3, ref_stride,

-                                          0x7fffffff);

-  sad_array[4] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr + 4, ref_stride,

-                                          0x7fffffff);

-  sad_array[5] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr + 5, ref_stride,

-                                          0x7fffffff);

-  sad_array[6] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr + 6, ref_stride,

-                                          0x7fffffff);

-  sad_array[7] = (uint16_t)vp9_sad64x64_c(src_ptr, src_stride,

-                                          ref_ptr + 7, ref_stride,

-                                          0x7fffffff);

+                      unsigned int *sad_array) {

+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr, ref_stride,

+                              0x7fffffff);

+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr + 1, ref_stride,

+                              0x7fffffff);

+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr + 2, ref_stride,

+                              0x7fffffff);

+  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr + 3, ref_stride,

+                              0x7fffffff);

+  sad_array[4] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr + 4, ref_stride,

+                              0x7fffffff);

+  sad_array[5] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr + 5, ref_stride,

+                              0x7fffffff);

+  sad_array[6] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr + 6, ref_stride,

+                              0x7fffffff);

+  sad_array[7] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr + 7, ref_stride,

+                              0x7fffffff);

 void vp9_sad32x32x8_c(const uint8_t *src_ptr,

@@ -133,31 +134,31 @@

                       int  src_stride,

                       const uint8_t *ref_ptr,

                       int  ref_stride,

-                      uint16_t *sad_array) {

-  sad_array[0] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr, ref_stride,

-                                          0x7fffffff);

-  sad_array[1] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr + 1, ref_stride,

-                                          0x7fffffff);

-  sad_array[2] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr + 2, ref_stride,

-                                          0x7fffffff);

-  sad_array[3] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr + 3, ref_stride,

-                                          0x7fffffff);

-  sad_array[4] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr + 4, ref_stride,

-                                          0x7fffffff);

-  sad_array[5] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr + 5, ref_stride,

-                                          0x7fffffff);

-  sad_array[6] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr + 6, ref_stride,

-                                          0x7fffffff);

-  sad_array[7] = (uint16_t)vp9_sad32x32_c(src_ptr, src_stride,

-                                          ref_ptr + 7, ref_stride,

-                                          0x7fffffff);

+                      unsigned int *sad_array) {

+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr, ref_stride,

+                              0x7fffffff);

+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 1, ref_stride,

+                              0x7fffffff);

+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 2, ref_stride,

+                              0x7fffffff);

+  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 3, ref_stride,

+                              0x7fffffff);

+  sad_array[4] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 4, ref_stride,

+                              0x7fffffff);

+  sad_array[5] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 5, ref_stride,

+                              0x7fffffff);

+  sad_array[6] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 6, ref_stride,

+                              0x7fffffff);

+  sad_array[7] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr + 7, ref_stride,

+                              0x7fffffff);

 void vp9_sad16x16x3_c(const uint8_t *src_ptr,

@@ -165,12 +166,12 @@

                       const uint8_t *ref_ptr,

                       int  ref_stride,

                       unsigned int *sad_array) {

-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,

-                                ref_ptr, ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,

-                                ref_ptr + 1, ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,

-                                ref_ptr + 2, ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr, ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 1, ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 2, ref_stride, 0x7fffffff);

 void vp9_sad16x16x8_c(const uint8_t *src_ptr,

@@ -177,31 +178,31 @@

                       int  src_stride,

                       const uint8_t *ref_ptr,

                       int  ref_stride,

-                      uint16_t *sad_array) {

-  sad_array[0] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr, ref_stride,

-                                          0x7fffffff);

-  sad_array[1] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr + 1, ref_stride,

-                                          0x7fffffff);

-  sad_array[2] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr + 2, ref_stride,

-                                          0x7fffffff);

-  sad_array[3] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr + 3, ref_stride,

-                                          0x7fffffff);

-  sad_array[4] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr + 4, ref_stride,

-                                          0x7fffffff);

-  sad_array[5] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr + 5, ref_stride,

-                                          0x7fffffff);

-  sad_array[6] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr + 6, ref_stride,

-                                          0x7fffffff);

-  sad_array[7] = (uint16_t)vp9_sad16x16_c(src_ptr, src_stride,

-                                          ref_ptr + 7, ref_stride,

-                                          0x7fffffff);

+                      uint32_t *sad_array) {

+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr, ref_stride,

+                              0x7fffffff);

+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 1, ref_stride,

+                              0x7fffffff);

+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 2, ref_stride,

+                              0x7fffffff);

+  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 3, ref_stride,

+                              0x7fffffff);

+  sad_array[4] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 4, ref_stride,

+                              0x7fffffff);

+  sad_array[5] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 5, ref_stride,

+                              0x7fffffff);

+  sad_array[6] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 6, ref_stride,

+                              0x7fffffff);

+  sad_array[7] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr + 7, ref_stride,

+                              0x7fffffff);

 void vp9_sad16x8x3_c(const uint8_t *src_ptr,

@@ -209,12 +210,12 @@

                      const uint8_t *ref_ptr,

                      int  ref_stride,

                      unsigned int *sad_array) {

-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,

-                               ref_ptr, ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,

-                               ref_ptr + 1, ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,

-                               ref_ptr + 2, ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr, ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 1, ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 2, ref_stride, 0x7fffffff);

 void vp9_sad16x8x8_c(const uint8_t *src_ptr,

@@ -221,31 +222,31 @@

                      int  src_stride,

                      const uint8_t *ref_ptr,

                      int  ref_stride,

-                     uint16_t *sad_array) {

-  sad_array[0] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr, ref_stride,

-                                         0x7fffffff);

-  sad_array[1] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr + 1, ref_stride,

-                                         0x7fffffff);

-  sad_array[2] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr + 2, ref_stride,

-                                         0x7fffffff);

-  sad_array[3] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr + 3, ref_stride,

-                                         0x7fffffff);

-  sad_array[4] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr + 4, ref_stride,

-                                         0x7fffffff);

-  sad_array[5] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr + 5, ref_stride,

-                                         0x7fffffff);

-  sad_array[6] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr + 6, ref_stride,

-                                         0x7fffffff);

-  sad_array[7] = (uint16_t)vp9_sad16x8_c(src_ptr, src_stride,

-                                         ref_ptr + 7, ref_stride,

-                                         0x7fffffff);

+                     uint32_t *sad_array) {

+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr, ref_stride,

+                             0x7fffffff);

+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 1, ref_stride,

+                             0x7fffffff);

+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 2, ref_stride,

+                             0x7fffffff);

+  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 3, ref_stride,

+                             0x7fffffff);

+  sad_array[4] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 4, ref_stride,

+                             0x7fffffff);

+  sad_array[5] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 5, ref_stride,

+                             0x7fffffff);

+  sad_array[6] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 6, ref_stride,

+                             0x7fffffff);

+  sad_array[7] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr + 7, ref_stride,

+                             0x7fffffff);

 void vp9_sad8x8x3_c(const uint8_t *src_ptr,

@@ -253,12 +254,12 @@

                     const uint8_t *ref_ptr,

                     int  ref_stride,

                     unsigned int *sad_array) {

-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,

-                              ref_ptr, ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,

-                              ref_ptr + 1, ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,

-                              ref_ptr + 2, ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr, ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 1, ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 2, ref_stride, 0x7fffffff);

 void vp9_sad8x8x8_c(const uint8_t *src_ptr,

@@ -265,31 +266,31 @@

                     int  src_stride,

                     const uint8_t *ref_ptr,

                     int  ref_stride,

-                    uint16_t *sad_array) {

-  sad_array[0] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr, ref_stride,

-                                        0x7fffffff);

-  sad_array[1] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr + 1, ref_stride,

-                                        0x7fffffff);

-  sad_array[2] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr + 2, ref_stride,

-                                        0x7fffffff);

-  sad_array[3] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr + 3, ref_stride,

-                                        0x7fffffff);

-  sad_array[4] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr + 4, ref_stride,

-                                        0x7fffffff);

-  sad_array[5] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr + 5, ref_stride,

-                                        0x7fffffff);

-  sad_array[6] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr + 6, ref_stride,

-                                        0x7fffffff);

-  sad_array[7] = (uint16_t)vp9_sad8x8_c(src_ptr, src_stride,

-                                        ref_ptr + 7, ref_stride,

-                                        0x7fffffff);

+                    uint32_t *sad_array) {

+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr, ref_stride,

+                            0x7fffffff);

+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 1, ref_stride,

+                            0x7fffffff);

+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 2, ref_stride,

+                            0x7fffffff);

+  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 3, ref_stride,

+                            0x7fffffff);

+  sad_array[4] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 4, ref_stride,

+                            0x7fffffff);

+  sad_array[5] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 5, ref_stride,

+                            0x7fffffff);

+  sad_array[6] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 6, ref_stride,

+                            0x7fffffff);

+  sad_array[7] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr + 7, ref_stride,

+                            0x7fffffff);

 void vp9_sad8x16x3_c(const uint8_t *src_ptr,

@@ -297,12 +298,12 @@

                      const uint8_t *ref_ptr,

                      int  ref_stride,

                      unsigned int *sad_array) {

-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,

-                               ref_ptr, ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,

-                               ref_ptr + 1, ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,

-                               ref_ptr + 2, ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr, ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 1, ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 2, ref_stride, 0x7fffffff);

 void vp9_sad8x16x8_c(const uint8_t *src_ptr,

@@ -309,31 +310,31 @@

                      int  src_stride,

                      const uint8_t *ref_ptr,

                      int  ref_stride,

-                     uint16_t *sad_array) {

-  sad_array[0] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr, ref_stride,

-                                         0x7fffffff);

-  sad_array[1] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr + 1, ref_stride,

-                                         0x7fffffff);

-  sad_array[2] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr + 2, ref_stride,

-                                         0x7fffffff);

-  sad_array[3] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr + 3, ref_stride,

-                                         0x7fffffff);

-  sad_array[4] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr + 4, ref_stride,

-                                         0x7fffffff);

-  sad_array[5] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr + 5, ref_stride,

-                                         0x7fffffff);

-  sad_array[6] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr + 6, ref_stride,

-                                         0x7fffffff);

-  sad_array[7] = (uint16_t)vp9_sad8x16_c(src_ptr, src_stride,

-                                         ref_ptr + 7, ref_stride,

-                                         0x7fffffff);

+                     uint32_t *sad_array) {

+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr, ref_stride,

+                             0x7fffffff);

+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 1, ref_stride,

+                             0x7fffffff);

+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 2, ref_stride,

+                             0x7fffffff);

+  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 3, ref_stride,

+                             0x7fffffff);

+  sad_array[4] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 4, ref_stride,

+                             0x7fffffff);

+  sad_array[5] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 5, ref_stride,

+                             0x7fffffff);

+  sad_array[6] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 6, ref_stride,

+                             0x7fffffff);

+  sad_array[7] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr + 7, ref_stride,

+                             0x7fffffff);

 void vp9_sad4x4x3_c(const uint8_t *src_ptr,

@@ -341,12 +342,12 @@

                     const uint8_t *ref_ptr,

                     int  ref_stride,

                     unsigned int *sad_array) {

-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,

-                              ref_ptr, ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,

-                              ref_ptr + 1, ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,

-                              ref_ptr + 2, ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr, ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 1, ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 2, ref_stride, 0x7fffffff);

 void vp9_sad4x4x8_c(const uint8_t *src_ptr,

@@ -353,192 +354,134 @@

                     int  src_stride,

                     const uint8_t *ref_ptr,

                     int  ref_stride,

-                    uint16_t *sad_array) {

-  sad_array[0] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr, ref_stride,

-                                        0x7fffffff);

-  sad_array[1] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr + 1, ref_stride,

-                                        0x7fffffff);

-  sad_array[2] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr + 2, ref_stride,

-                                        0x7fffffff);

-  sad_array[3] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr + 3, ref_stride,

-                                        0x7fffffff);

-  sad_array[4] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr + 4, ref_stride,

-                                        0x7fffffff);

-  sad_array[5] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr + 5, ref_stride,

-                                        0x7fffffff);

-  sad_array[6] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr + 6, ref_stride,

-                                        0x7fffffff);

-  sad_array[7] = (uint16_t)vp9_sad4x4_c(src_ptr, src_stride,

-                                        ref_ptr + 7, ref_stride,

-                                        0x7fffffff);

+                    uint32_t *sad_array) {

+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr, ref_stride,

+                            0x7fffffff);

+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 1, ref_stride,

+                            0x7fffffff);

+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 2, ref_stride,

+                            0x7fffffff);

+  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 3, ref_stride,

+                            0x7fffffff);

+  sad_array[4] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 4, ref_stride,

+                            0x7fffffff);

+  sad_array[5] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 5, ref_stride,

+                            0x7fffffff);

+  sad_array[6] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 6, ref_stride,

+                            0x7fffffff);

+  sad_array[7] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr + 7, ref_stride,

+                            0x7fffffff);

 void vp9_sad64x64x4d_c(const uint8_t *src_ptr,

                        int  src_stride,

-                       uint8_t *ref_ptr[],

+                       const uint8_t* const ref_ptr[],

                        int  ref_stride,

                        unsigned int *sad_array) {

-  sad_array[0] = vp9_sad64x64_c(src_ptr, src_stride,

-                                ref_ptr[0], ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad64x64_c(src_ptr, src_stride,

-                                ref_ptr[1], ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad64x64_c(src_ptr, src_stride,

-                                ref_ptr[2], ref_stride, 0x7fffffff);

-  sad_array[3] = vp9_sad64x64_c(src_ptr, src_stride,

-                                ref_ptr[3], ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr[0], ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr[1], ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr[2], ref_stride, 0x7fffffff);

+  sad_array[3] = vp9_sad64x64(src_ptr, src_stride,

+                              ref_ptr[3], ref_stride, 0x7fffffff);

 void vp9_sad32x32x4d_c(const uint8_t *src_ptr,

                        int  src_stride,

-                       uint8_t *ref_ptr[],

+                       const uint8_t* const ref_ptr[],

                        int  ref_stride,

                        unsigned int *sad_array) {

-  sad_array[0] = vp9_sad32x32_c(src_ptr, src_stride,

-                                ref_ptr[0], ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad32x32_c(src_ptr, src_stride,

-                                ref_ptr[1], ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad32x32_c(src_ptr, src_stride,

-                                ref_ptr[2], ref_stride, 0x7fffffff);

-  sad_array[3] = vp9_sad32x32_c(src_ptr, src_stride,

-                                ref_ptr[3], ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr[0], ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr[1], ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr[2], ref_stride, 0x7fffffff);

+  sad_array[3] = vp9_sad32x32(src_ptr, src_stride,

+                              ref_ptr[3], ref_stride, 0x7fffffff);

 void vp9_sad16x16x4d_c(const uint8_t *src_ptr,

                        int  src_stride,

-                       uint8_t *ref_ptr[],

+                       const uint8_t* const ref_ptr[],

                        int  ref_stride,

                        unsigned int *sad_array) {

-  sad_array[0] = vp9_sad16x16_c(src_ptr, src_stride,

-                                ref_ptr[0], ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad16x16_c(src_ptr, src_stride,

-                                ref_ptr[1], ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad16x16_c(src_ptr, src_stride,

-                                ref_ptr[2], ref_stride, 0x7fffffff);

-  sad_array[3] = vp9_sad16x16_c(src_ptr, src_stride,

-                                ref_ptr[3], ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr[0], ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr[1], ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr[2], ref_stride, 0x7fffffff);

+  sad_array[3] = vp9_sad16x16(src_ptr, src_stride,

+                              ref_ptr[3], ref_stride, 0x7fffffff);

 void vp9_sad16x8x4d_c(const uint8_t *src_ptr,

                       int  src_stride,

-                      uint8_t *ref_ptr[],

+                      const uint8_t* const ref_ptr[],

                       int  ref_stride,

                       unsigned int *sad_array) {

-  sad_array[0] = vp9_sad16x8_c(src_ptr, src_stride,

-                               ref_ptr[0], ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad16x8_c(src_ptr, src_stride,

-                               ref_ptr[1], ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad16x8_c(src_ptr, src_stride,

-                               ref_ptr[2], ref_stride, 0x7fffffff);

-  sad_array[3] = vp9_sad16x8_c(src_ptr, src_stride,

-                               ref_ptr[3], ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr[0], ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr[1], ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr[2], ref_stride, 0x7fffffff);

+  sad_array[3] = vp9_sad16x8(src_ptr, src_stride,

+                             ref_ptr[3], ref_stride, 0x7fffffff);

 void vp9_sad8x8x4d_c(const uint8_t *src_ptr,

                      int  src_stride,

-                     uint8_t *ref_ptr[],

+                     const uint8_t* const ref_ptr[],

                      int  ref_stride,

                      unsigned int *sad_array) {

-  sad_array[0] = vp9_sad8x8_c(src_ptr, src_stride,

-                              ref_ptr[0], ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad8x8_c(src_ptr, src_stride,

-                              ref_ptr[1], ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad8x8_c(src_ptr, src_stride,

-                              ref_ptr[2], ref_stride, 0x7fffffff);

-  sad_array[3] = vp9_sad8x8_c(src_ptr, src_stride,

-                              ref_ptr[3], ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr[0], ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr[1], ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr[2], ref_stride, 0x7fffffff);

+  sad_array[3] = vp9_sad8x8(src_ptr, src_stride,

+                            ref_ptr[3], ref_stride, 0x7fffffff);

 void vp9_sad8x16x4d_c(const uint8_t *src_ptr,

                       int  src_stride,

-                      uint8_t *ref_ptr[],

+                      const uint8_t* const ref_ptr[],

                       int  ref_stride,

                       unsigned int *sad_array) {

-  sad_array[0] = vp9_sad8x16_c(src_ptr, src_stride,

-                               ref_ptr[0], ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad8x16_c(src_ptr, src_stride,

-                               ref_ptr[1], ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad8x16_c(src_ptr, src_stride,

-                               ref_ptr[2], ref_stride, 0x7fffffff);

-  sad_array[3] = vp9_sad8x16_c(src_ptr, src_stride,

-                               ref_ptr[3], ref_stride, 0x7fffffff);

+  sad_array[0] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr[0], ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr[1], ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr[2], ref_stride, 0x7fffffff);

+  sad_array[3] = vp9_sad8x16(src_ptr, src_stride,

+                             ref_ptr[3], ref_stride, 0x7fffffff);

 void vp9_sad4x4x4d_c(const uint8_t *src_ptr,

                      int  src_stride,

-                     uint8_t *ref_ptr[],

+                     const uint8_t* const ref_ptr[],

                      int  ref_stride,

                      unsigned int *sad_array) {

-  sad_array[0] = vp9_sad4x4_c(src_ptr, src_stride,

-                              ref_ptr[0], ref_stride, 0x7fffffff);

-  sad_array[1] = vp9_sad4x4_c(src_ptr, src_stride,

-                              ref_ptr[1], ref_stride, 0x7fffffff);

-  sad_array[2] = vp9_sad4x4_c(src_ptr, src_stride,

-                              ref_ptr[2], ref_stride, 0x7fffffff);

-  sad_array[3] = vp9_sad4x4_c(src_ptr, src_stride,

-                              ref_ptr[3], ref_stride, 0x7fffffff);

-}

-/* Copy 2 macroblocks to a buffer */

-void vp9_copy32xn_c(uint8_t *src_ptr,

-                    int  src_stride,

-                    uint8_t *dst_ptr,

-                    int  dst_stride,

-                    int height) {

-  int r;

-  for (r = 0; r < height; r++) {

-#if !(CONFIG_FAST_UNALIGNED)

-    dst_ptr[0] = src_ptr[0];

-    dst_ptr[1] = src_ptr[1];

-    dst_ptr[2] = src_ptr[2];

-    dst_ptr[3] = src_ptr[3];

-    dst_ptr[4] = src_ptr[4];

-    dst_ptr[5] = src_ptr[5];

-    dst_ptr[6] = src_ptr[6];

-    dst_ptr[7] = src_ptr[7];

-    dst_ptr[8] = src_ptr[8];

-    dst_ptr[9] = src_ptr[9];

-    dst_ptr[10] = src_ptr[10];

-    dst_ptr[11] = src_ptr[11];

-    dst_ptr[12] = src_ptr[12];

-    dst_ptr[13] = src_ptr[13];

-    dst_ptr[14] = src_ptr[14];

-    dst_ptr[15] = src_ptr[15];

-    dst_ptr[16] = src_ptr[16];

-    dst_ptr[17] = src_ptr[17];

-    dst_ptr[18] = src_ptr[18];

-    dst_ptr[19] = src_ptr[19];

-    dst_ptr[20] = src_ptr[20];

-    dst_ptr[21] = src_ptr[21];

-    dst_ptr[22] = src_ptr[22];

-    dst_ptr[23] = src_ptr[23];

-    dst_ptr[24] = src_ptr[24];

-    dst_ptr[25] = src_ptr[25];

-    dst_ptr[26] = src_ptr[26];

-    dst_ptr[27] = src_ptr[27];

-    dst_ptr[28] = src_ptr[28];

-    dst_ptr[29] = src_ptr[29];

-    dst_ptr[30] = src_ptr[30];

-    dst_ptr[31] = src_ptr[31];

-#else

-    ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0];

-    ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1];

-    ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2];

-    ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3];

-    ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4];

-    ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5];

-    ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6];

-    ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7];

-#endif

-    src_ptr += src_stride;

-    dst_ptr += dst_stride;

-  }

+  sad_array[0] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr[0], ref_stride, 0x7fffffff);

+  sad_array[1] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr[1], ref_stride, 0x7fffffff);

+  sad_array[2] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr[2], ref_stride, 0x7fffffff);

+  sad_array[3] = vp9_sad4x4(src_ptr, src_stride,

+                            ref_ptr[3], ref_stride, 0x7fffffff);

--- a/vp9/encoder/vp9_satd_c.c

+++ /dev/null

@@ -1,48 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <stdlib.h>

-#include "vpx_ports/mem.h"

-#include "./vp9_rtcd.h"

-unsigned int vp9_satd16x16_c(const uint8_t *src_ptr,

-                             int  src_stride,

-                             const uint8_t *ref_ptr,

-                             int  ref_stride,

-                             unsigned int *psatd) {

-  int r, c, i;

-  unsigned int satd = 0;

-  DECLARE_ALIGNED(16, int16_t, diff_in[256]);

-  DECLARE_ALIGNED(16, int16_t, diff_out[16]);

-  int16_t *in;

-  for (r = 0; r < 16; r++) {

-    for (c = 0; c < 16; c++) {

-      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];

-    }

-    src_ptr += src_stride;

-    ref_ptr += ref_stride;

-  }

-  in = diff_in;

-  for (r = 0; r < 16; r += 4) {

-    for (c = 0; c < 16; c += 4) {

-      vp9_short_walsh4x4_c(in + c, diff_out, 32);

-      for (i = 0; i < 16; i++)

-        satd += abs(diff_out[i]);

-    }

-    in += 64;

-  }

-  if (psatd)

-    *psatd = satd;

-  return satd;

-}

--- a/vp9/encoder/vp9_segmentation.c

+++ b/vp9/encoder/vp9_segmentation.c

@@ -9,10 +9,11 @@

*/

-#include "limits.h"

+#include <limits.h>

 #include "vpx_mem/vpx_mem.h"

 #include "vp9/encoder/vp9_segmentation.h"

 #include "vp9/common/vp9_pred_common.h"

+#include "vp9/common/vp9_tile_common.h"

 void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x) {

   int mb_row, mb_col;

@@ -21,7 +22,7 @@

   x->gf_active_ptr = (signed char *)cpi->gf_active_flags;

-  if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) {

+  if ((cm->frame_type == KEY_FRAME) || (cpi->refresh_golden_frame)) {

     // Reset Gf useage monitors

     vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));

     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;

@@ -143,11 +144,74 @@

   return cost;

+// Based on set of segment counts calculate a probability tree

+static void calc_segtree_probs_pred(MACROBLOCKD *xd,

+                                    int (*segcounts)[MAX_MB_SEGMENTS],

+                                    vp9_prob *segment_tree_probs,

+                                    vp9_prob *mod_probs) {

+  int count[4];

+  assert(!segcounts[0][0] && !segcounts[1][1] &&

+         !segcounts[2][2] && !segcounts[3][3]);

+  // Total count for all segments

+  count[0] = segcounts[3][0] + segcounts[1][0] + segcounts[2][0];

+  count[1] = segcounts[2][1] + segcounts[0][1] + segcounts[3][1];

+  count[2] = segcounts[0][2] + segcounts[3][2] + segcounts[1][2];

+  count[3] = segcounts[1][3] + segcounts[2][3] + segcounts[0][3];

+  // Work out probabilities of each segment

+  segment_tree_probs[0] = get_binary_prob(count[0] + count[1],

+                                          count[2] + count[3]);

+  segment_tree_probs[1] = get_binary_prob(count[0], count[1]);

+  segment_tree_probs[2] = get_binary_prob(count[2], count[3]);

+  // now work out modified counts that the decoder would have

+  count[0] =        segment_tree_probs[0]  *        segment_tree_probs[1];

+  count[1] =        segment_tree_probs[0]  * (256 - segment_tree_probs[1]);

+  count[2] = (256 - segment_tree_probs[0]) *        segment_tree_probs[2];

+  count[3] = (256 - segment_tree_probs[0]) * (256 - segment_tree_probs[2]);

+  // Work out modified probabilties depending on what segment was predicted

+  mod_probs[0] = get_binary_prob(count[1], count[2] + count[3]);

+  mod_probs[1] = get_binary_prob(count[0], count[2] + count[3]);

+  mod_probs[2] = get_binary_prob(count[0] + count[1], count[3]);

+  mod_probs[3] = get_binary_prob(count[0] + count[1], count[2]);

+}

+// Based on set of segment counts and probabilities calculate a cost estimate

+static int cost_segmap_pred(MACROBLOCKD *xd,

+                            int (*segcounts)[MAX_MB_SEGMENTS],

+                            vp9_prob *probs, vp9_prob *mod_probs) {

+  int pred_seg, cost = 0;

+  for (pred_seg = 0; pred_seg < MAX_MB_SEGMENTS; pred_seg++) {

+    int count1, count2;

+    // Cost the top node of the tree

+    count1 = segcounts[pred_seg][0] + segcounts[pred_seg][1];

+    count2 = segcounts[pred_seg][2] + segcounts[pred_seg][3];

+    cost += count1 * vp9_cost_zero(mod_probs[pred_seg]) +

+            count2 * vp9_cost_one(mod_probs[pred_seg]);

+    // Now add the cost of each individual segment branch

+    if (pred_seg >= 2 && count1) {

+      cost += segcounts[pred_seg][0] * vp9_cost_zero(probs[1]) +

+              segcounts[pred_seg][1] * vp9_cost_one(probs[1]);

+    } else if (pred_seg < 2 && count2 > 0) {

+      cost += segcounts[pred_seg][2] * vp9_cost_zero(probs[2]) +

+              segcounts[pred_seg][3] * vp9_cost_one(probs[2]);

+    }

+  }

+  return cost;

+}

 static void count_segs(VP9_COMP *cpi,

                        MODE_INFO *mi,

                        int *no_pred_segcounts,

                        int (*temporal_predictor_count)[2],

-                       int *t_unpred_seg_counts,

+                       int (*t_unpred_seg_counts)[MAX_MB_SEGMENTS],

                        int mb_size, int mb_row, int mb_col) {

   VP9_COMMON *const cm = &cpi->common;

   MACROBLOCKD *const xd = &cpi->mb.e_mbd;

@@ -155,10 +219,8 @@

   const int segment_id = mi->mbmi.segment_id;

   xd->mode_info_context = mi;

-  xd->mb_to_top_edge = -((mb_row * 16) << 3);

-  xd->mb_to_left_edge = -((mb_col * 16) << 3);

-  xd->mb_to_bottom_edge = ((cm->mb_rows - mb_size - mb_row) * 16) << 3;

-  xd->mb_to_right_edge  = ((cm->mb_cols - mb_size - mb_col) * 16) << 3;

+  set_mb_row(cm, xd, mb_row, mb_size);

+  set_mb_col(cm, xd, mb_col, mb_size);

   // Count the number of hits on each segment with no prediction

   no_pred_segcounts[segment_id]++;

@@ -166,8 +228,8 @@

   // Temporal prediction not allowed on key frames

   if (cm->frame_type != KEY_FRAME) {

     // Test to see if the segment id matches the predicted value.

-    const int seg_predicted =

-        (segment_id == vp9_get_pred_mb_segid(cm, xd, segmap_index));

+    const int pred_seg_id = vp9_get_pred_mb_segid(cm, xd, segmap_index);

+    const int seg_predicted = (segment_id == pred_seg_id);

     // Get the segment id prediction context

     const int pred_context = vp9_get_pred_context(cm, xd, PRED_SEG_ID);

@@ -179,7 +241,7 @@

     if (!seg_predicted)

       // Update the "unpredicted" segment count

-      t_unpred_seg_counts[segment_id]++;

+      t_unpred_seg_counts[pred_seg_id][segment_id]++;

@@ -191,18 +253,19 @@

   int t_pred_cost = INT_MAX;

   int i;

-  int mb_row, mb_col;

+  int tile_col, mb_row, mb_col;

   int temporal_predictor_count[PREDICTION_PROBS][2];

   int no_pred_segcounts[MAX_MB_SEGMENTS];

-  int t_unpred_seg_counts[MAX_MB_SEGMENTS];

+  int t_unpred_seg_counts[MAX_MB_SEGMENTS][MAX_MB_SEGMENTS];

   vp9_prob no_pred_tree[MB_FEATURE_TREE_PROBS];

   vp9_prob t_pred_tree[MB_FEATURE_TREE_PROBS];

+  vp9_prob t_pred_tree_mod[MAX_MB_SEGMENTS];

   vp9_prob t_nopred_prob[PREDICTION_PROBS];

   const int mis = cm->mode_info_stride;

-  MODE_INFO *mi_ptr = cm->mi, *mi;

+  MODE_INFO *mi_ptr, *mi;

   // Set default state for the segment tree probabilities and the

   // temporal coding probabilities

@@ -218,42 +281,49 @@

   // First of all generate stats regarding how well the last segment map

   // predicts this one

-  for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {

-    mi = mi_ptr;

-    for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 4, mi += 4) {

-      if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

-        count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,

-                   t_unpred_seg_counts, 4, mb_row, mb_col);

-      } else {

-        for (i = 0; i < 4; i++) {

-          int x_idx = (i & 1) << 1, y_idx = i & 2;

-          MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;

+  for (tile_col = 0; tile_col < cm->tile_columns; tile_col++) {

+    vp9_get_tile_col_offsets(cm, tile_col);

+    mi_ptr = cm->mi + cm->cur_tile_mb_col_start;

+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 4, mi_ptr += 4 * mis) {

+      mi = mi_ptr;

+      for (mb_col = cm->cur_tile_mb_col_start;

+           mb_col < cm->cur_tile_mb_col_end; mb_col += 4, mi += 4) {

+        if (mi->mbmi.sb_type == BLOCK_SIZE_SB64X64) {

+          count_segs(cpi, mi, no_pred_segcounts, temporal_predictor_count,

+                     t_unpred_seg_counts, 4, mb_row, mb_col);

+        } else {

+          for (i = 0; i < 4; i++) {

+            int x_idx = (i & 1) << 1, y_idx = i & 2;

+            MODE_INFO *sb_mi = mi + y_idx * mis + x_idx;

-          if (mb_col + x_idx >= cm->mb_cols ||

-              mb_row + y_idx >= cm->mb_rows) {

-            continue;

-          }

+            if (mb_col + x_idx >= cm->mb_cols ||

+                mb_row + y_idx >= cm->mb_rows) {

+              continue;

+            }

-          if (sb_mi->mbmi.sb_type) {

-            assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);

-            count_segs(cpi, sb_mi, no_pred_segcounts, temporal_predictor_count,

-                       t_unpred_seg_counts, 2, mb_row + y_idx, mb_col + x_idx);

-          } else {

-            int j;

+            if (sb_mi->mbmi.sb_type) {

+              assert(sb_mi->mbmi.sb_type == BLOCK_SIZE_SB32X32);

+              count_segs(cpi, sb_mi, no_pred_segcounts,

+                         temporal_predictor_count, t_unpred_seg_counts, 2,

+                         mb_row + y_idx, mb_col + x_idx);

+            } else {

+              int j;

-            for (j = 0; j < 4; j++) {

-              const int x_idx_mb = x_idx + (j & 1), y_idx_mb = y_idx + (j >> 1);

-              MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;

+              for (j = 0; j < 4; j++) {

+                const int x_idx_mb = x_idx + (j & 1);

+                const int y_idx_mb = y_idx + (j >> 1);

+                MODE_INFO *mb_mi = mi + x_idx_mb + y_idx_mb * mis;

-              if (mb_col + x_idx_mb >= cm->mb_cols ||

-                  mb_row + y_idx_mb >= cm->mb_rows) {

-                continue;

-              }

+                if (mb_col + x_idx_mb >= cm->mb_cols ||

+                    mb_row + y_idx_mb >= cm->mb_rows) {

+                  continue;

+                }

-              assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);

-              count_segs(cpi, mb_mi, no_pred_segcounts,

-                         temporal_predictor_count, t_unpred_seg_counts,

-                         1, mb_row + y_idx_mb, mb_col + x_idx_mb);

+                assert(mb_mi->mbmi.sb_type == BLOCK_SIZE_MB16X16);

+                count_segs(cpi, mb_mi, no_pred_segcounts,

+                           temporal_predictor_count, t_unpred_seg_counts,

+                           1, mb_row + y_idx_mb, mb_col + x_idx_mb);

+              }

@@ -270,8 +340,10 @@

   if (cm->frame_type != KEY_FRAME) {

     // Work out probability tree for coding those segments not

     // predicted using the temporal method and the cost.

-    calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree);

-    t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree);

+    calc_segtree_probs_pred(xd, t_unpred_seg_counts, t_pred_tree,

+                            t_pred_tree_mod);

+    t_pred_cost = cost_segmap_pred(xd, t_unpred_seg_counts, t_pred_tree,

+                                   t_pred_tree_mod);

     // Add in the cost of the signalling for each prediction context

     for (i = 0; i < PREDICTION_PROBS; i++) {

@@ -291,6 +363,8 @@

     cm->temporal_update = 1;

     vpx_memcpy(xd->mb_segment_tree_probs,

                t_pred_tree, sizeof(t_pred_tree));

+    vpx_memcpy(xd->mb_segment_mispred_tree_probs,

+               t_pred_tree_mod, sizeof(t_pred_tree_mod));

     vpx_memcpy(&cm->segment_pred_probs,

                t_nopred_prob, sizeof(t_nopred_prob));

   } else {

--- a/vp9/encoder/vp9_segmentation.h

+++ b/vp9/encoder/vp9_segmentation.h

@@ -9,23 +9,20 @@

*/

-#include "string.h"

-#include "vp9/common/vp9_blockd.h"

-#include "vp9/encoder/vp9_onyx_int.h"

 #ifndef VP9_ENCODER_VP9_SEGMENTATION_H_

 #define VP9_ENCODER_VP9_SEGMENTATION_H_

-extern void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm,

-                                      MACROBLOCK *x);

+#include "vp9/common/vp9_blockd.h"

+#include "vp9/encoder/vp9_onyx_int.h"

-extern void vp9_enable_segmentation(VP9_PTR ptr);

-extern void vp9_disable_segmentation(VP9_PTR ptr);

+void vp9_update_gf_useage_maps(VP9_COMP *cpi, VP9_COMMON *cm, MACROBLOCK *x);

+void vp9_enable_segmentation(VP9_PTR ptr);

+void vp9_disable_segmentation(VP9_PTR ptr);

 // Valid values for a segment are 0 to 3

 // Segmentation map is arrange as [Rows][Columns]

-extern void vp9_set_segmentation_map(VP9_PTR ptr,

-                                     unsigned char *segmentation_map);

+void vp9_set_segmentation_map(VP9_PTR ptr, unsigned char *segmentation_map);

 // The values given for each segment can be either deltas (from the default

 // value chosen for the frame) or absolute values.

@@ -37,10 +34,9 @@

//

 // abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use

 // the absolute values given).

-//

-extern void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,

-                                 unsigned char abs_delta);

+void vp9_set_segment_data(VP9_PTR ptr, signed char *feature_data,

+                          unsigned char abs_delta);

-extern void vp9_choose_segmap_coding_method(VP9_COMP *cpi);

+void vp9_choose_segmap_coding_method(VP9_COMP *cpi);

 #endif  // VP9_ENCODER_VP9_SEGMENTATION_H_

--- a/vp9/encoder/vp9_temporal_filter.c

+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -8,8 +8,11 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <math.h>

+#include <limits.h>

 #include "vp9/common/vp9_onyxc_int.h"

+#include "vp9/common/vp9_reconinter.h"

 #include "vp9/encoder/vp9_onyx_int.h"

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/encoder/vp9_quantize.h"

@@ -26,15 +29,9 @@

 #include "vp9/common/vp9_swapyv12buffer.h"

 #include "vpx_ports/vpx_timer.h"

-#include <math.h>

-#include <limits.h>

 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering

 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering

-#if VP9_TEMPORAL_ALT_REF

 static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,

                                             uint8_t *y_mb_ptr,

                                             uint8_t *u_mb_ptr,

@@ -43,39 +40,44 @@

                                             int mv_row,

                                             int mv_col,

                                             uint8_t *pred) {

-  int offset;

-  uint8_t *yptr, *uptr, *vptr;

-  int omv_row, omv_col;

+  const int which_mv = 0;

+  int_mv subpel_mv;

+  int_mv fullpel_mv;

-  // Y

-  yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);

+  subpel_mv.as_mv.row = mv_row;

+  subpel_mv.as_mv.col = mv_col;

+  // TODO(jkoleszar): Make this rounding consistent with the rest of the code

+  fullpel_mv.as_mv.row = (mv_row >> 1) & ~7;

+  fullpel_mv.as_mv.col = (mv_col >> 1) & ~7;

-  if ((mv_row | mv_col) & 7) {

-    xd->subpixel_predict16x16(yptr, stride,

-                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);

-  } else {

-    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);

-  }

+  vp9_build_inter_predictor(y_mb_ptr, stride,

+                            &pred[0], 16,

+                            &subpel_mv,

+                            &xd->scale_factor[which_mv],

+                            16, 16,

+                            which_mv <<

+                            (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),

+                            &xd->subpix);

-  // U & V

-  omv_row = mv_row;

-  omv_col = mv_col;

-  mv_row >>= 1;

-  mv_col >>= 1;

   stride = (stride + 1) >> 1;

-  offset = (mv_row >> 3) * stride + (mv_col >> 3);

-  uptr = u_mb_ptr + offset;

-  vptr = v_mb_ptr + offset;

-  if ((omv_row | omv_col) & 15) {

-    xd->subpixel_predict8x8(uptr, stride,

-                           (omv_col & 15), (omv_row & 15), &pred[256], 8);

-    xd->subpixel_predict8x8(vptr, stride,

-                           (omv_col & 15), (omv_row & 15), &pred[320], 8);

-  } else {

-    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);

-    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);

-  }

+  vp9_build_inter_predictor_q4(u_mb_ptr, stride,

+                               &pred[256], 8,

+                               &fullpel_mv, &subpel_mv,

+                               &xd->scale_factor_uv[which_mv],

+                               8, 8,

+                               which_mv <<

+                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),

+                               &xd->subpix);

+  vp9_build_inter_predictor_q4(v_mb_ptr, stride,

+                               &pred[320], 8,

+                               &fullpel_mv, &subpel_mv,

+                               &xd->scale_factor_uv[which_mv],

+                               8, 8,

+                               which_mv <<

+                               (2 * CONFIG_IMPLICIT_COMPOUNDINTER_WEIGHT),

+                               &xd->subpix);

 void vp9_temporal_filter_apply_c(uint8_t *frame1,

@@ -170,7 +172,7 @@

   /*cpi->sf.search_method == HEX*/

   // TODO Check that the 16x16 vf & sdf are selected here

   // Ignore mv costing by sending NULL pointer instead of cost arrays

-  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv.first,

+  bestsme = vp9_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.as_mv[0],

                            step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16],

                            NULL, NULL, NULL, NULL,

                            &best_ref_mv1);

@@ -182,7 +184,7 @@

     int distortion;

     unsigned int sse;

     // Ignore mv costing by sending NULL pointer instead of cost array

-    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv.first,

+    bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.as_mv[0],

                                            &best_ref_mv1,

                                            x->errorperbit,

                                            &cpi->fn_ptr[BLOCK_16X16],

@@ -262,8 +264,8 @@

         if (cpi->frames[frame] == NULL)

           continue;

-        mbd->block[0].bmi.as_mv.first.as_mv.row = 0;

-        mbd->block[0].bmi.as_mv.first.as_mv.col = 0;

+        mbd->block[0].bmi.as_mv[0].as_mv.row = 0;

+        mbd->block[0].bmi.as_mv[0].as_mv.col = 0;

         if (frame == alt_ref_index) {

           filter_weight = 2;

@@ -296,8 +298,8 @@

            cpi->frames[frame]->u_buffer + mb_uv_offset,

            cpi->frames[frame]->v_buffer + mb_uv_offset,

            cpi->frames[frame]->y_stride,

-           mbd->block[0].bmi.as_mv.first.as_mv.row,

-           mbd->block[0].bmi.as_mv.first.as_mv.col,

+           mbd->block[0].bmi.as_mv[0].as_mv.row,

+           mbd->block[0].bmi.as_mv[0].as_mv.col,

            predictor);

           // Apply the filter (YUV)

@@ -375,11 +377,7 @@

   mbd->pre.v_buffer = v_buffer;

-void vp9_temporal_filter_prepare

-(

-  VP9_COMP *cpi,

-  int distance

-) {

+void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {

   int frame = 0;

   int num_frames_backward = 0;

@@ -389,10 +387,8 @@

   int frames_to_blur = 0;

   int start_frame = 0;

-  int strength = cpi->oxcf.arnr_strength;

+  int strength = cpi->active_arnr_strength;

   int blur_type = cpi->oxcf.arnr_type;

   int max_frames = cpi->active_arnr_frames;

   num_frames_backward = distance;

@@ -464,6 +460,13 @@

 , start_frame);

 #endif

+  // Setup scaling factors. Scaling on each of the arnr frames is not supported

+  vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],

+      &cpi->common.yv12_fb[cpi->common.new_fb_idx],

+      cpi->common.width,

+      cpi->common.height);

+  cpi->mb.e_mbd.scale_factor_uv[0] = cpi->mb.e_mbd.scale_factor[0];

   // Setup frame pointers, NULL indicates frame not included in filter

   vpx_memset(cpi->frames, 0, max_frames * sizeof(YV12_BUFFER_CONFIG *));

   for (frame = 0; frame < frames_to_blur; frame++) {

@@ -479,4 +482,3 @@

     frames_to_blur_backward,

     strength);

-#endif

--- a/vp9/encoder/vp9_temporal_filter.h

+++ b/vp9/encoder/vp9_temporal_filter.h

@@ -11,6 +11,6 @@

 #ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_

 #define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_

-extern void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);

+void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance);

 #endif  // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_

--- a/vp9/encoder/vp9_tokenize.c

+++ b/vp9/encoder/vp9_tokenize.c

@@ -25,23 +25,32 @@

    compressions, then generating vp9_context.c = initial stats. */

 #ifdef ENTROPY_STATS

-vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4];

-vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4];

-vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8];

-vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8];

-vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16];

-vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16];

-vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];

+vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];

+vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];

+vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];

+vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];

-extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES_4X4];

-extern vp9_coeff_stats hybrid_tree_update_hist_4x4[BLOCK_TYPES_4X4];

-extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES_8X8];

-extern vp9_coeff_stats hybrid_tree_update_hist_8x8[BLOCK_TYPES_8X8];

-extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES_16X16];

-extern vp9_coeff_stats hybrid_tree_update_hist_16x16[BLOCK_TYPES_16X16];

-extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES_32X32];

+extern vp9_coeff_stats tree_update_hist_4x4[BLOCK_TYPES];

+extern vp9_coeff_stats tree_update_hist_8x8[BLOCK_TYPES];

+extern vp9_coeff_stats tree_update_hist_16x16[BLOCK_TYPES];

+extern vp9_coeff_stats tree_update_hist_32x32[BLOCK_TYPES];

 #endif  /* ENTROPY_STATS */

+#if CONFIG_CODE_NONZEROCOUNT

+#ifdef NZC_STATS

+unsigned int nzc_counts_4x4[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                           [NZC4X4_TOKENS];

+unsigned int nzc_counts_8x8[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                           [NZC8X8_TOKENS];

+unsigned int nzc_counts_16x16[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                             [NZC16X16_TOKENS];

+unsigned int nzc_counts_32x32[MAX_NZC_CONTEXTS][REF_TYPES][BLOCK_TYPES]

+                             [NZC32X32_TOKENS];

+unsigned int nzc_pcat_counts[MAX_NZC_CONTEXTS][NZC_TOKENS_EXTRA]

+                            [NZC_BITS_EXTRA][2];

+#endif

+#endif

 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];

 const TOKENVALUE *vp9_dct_value_tokens_ptr;

 static int dct_value_cost[DCT_MAX_VALUE * 2];

@@ -100,11 +109,7 @@

   vp9_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;

-#if CONFIG_NEWCOEFCONTEXT

-#define PT pn

-#else

-#define PT pt

-#endif

+extern const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad);

 static void tokenize_b(VP9_COMP *cpi,

                        MACROBLOCKD *xd,

@@ -113,79 +118,92 @@

                        PLANE_TYPE type,

                        TX_SIZE tx_size,

                        int dry_run) {

+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

   int pt; /* near block/prev token context index */

-  int c = (type == PLANE_TYPE_Y_NO_DC) ? 1 : 0;

-  const BLOCKD * const b = xd->block + ib;

-  const int eob = b->eob;     /* one beyond last nonzero coeff */

+  int c = 0;

+  const int eob = xd->eobs[ib];     /* one beyond last nonzero coeff */

   TOKENEXTRA *t = *tp;        /* store tokens starting here */

-  int16_t *qcoeff_ptr = b->qcoeff;

-  int seg_eob;

-  const int segment_id = xd->mode_info_context->mbmi.segment_id;

-  const int *bands, *scan;

+  int16_t *qcoeff_ptr = xd->qcoeff + 16 * ib;

+  int seg_eob, default_eob, pad;

+  const int segment_id = mbmi->segment_id;

+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;

+  const int *scan, *nb;

   vp9_coeff_count *counts;

   vp9_coeff_probs *probs;

-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

-                          get_tx_type(xd, b) : DCT_DCT;

-#if CONFIG_NEWCOEFCONTEXT

-  const int *neighbors;

-  int pn;

+  const int ref = mbmi->ref_frame != INTRA_FRAME;

+  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;

+  uint8_t token_cache[1024];

+#if CONFIG_CODE_NONZEROCOUNT

+  int zerosleft, nzc = 0;

+  if (eob == 0)

+    assert(xd->nzcs[ib] == 0);

 #endif

-  ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +

-      vp9_block2above[tx_size][ib];

-  ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context +

-      vp9_block2left[tx_size][ib];

-  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;

+  if (sb_type == BLOCK_SIZE_SB64X64) {

+    a = (ENTROPY_CONTEXT *)xd->above_context +

+                                             vp9_block2above_sb64[tx_size][ib];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];

+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+  } else if (sb_type == BLOCK_SIZE_SB32X32) {

+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];

+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a2 = a3 = l2 = l3 = NULL;

+  } else {

+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];

+    a1 = l1 = a2 = l2 = a3 = l3 = NULL;

+  }

-  ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) +

-      vp9_block2above[tx_size][ib];

-  ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) +

-      vp9_block2left[tx_size][ib];

   switch (tx_size) {

     default:

-    case TX_4X4:

+    case TX_4X4: {

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_4x4(xd, ib) : DCT_DCT;

+      a_ec = *a;

+      l_ec = *l;

       seg_eob = 16;

-      bands = vp9_coef_bands_4x4;

       scan = vp9_default_zig_zag1d_4x4;

       if (tx_type != DCT_DCT) {

-        counts = cpi->hybrid_coef_counts_4x4;

-        probs = cpi->common.fc.hybrid_coef_probs_4x4;

         if (tx_type == ADST_DCT) {

           scan = vp9_row_scan_4x4;

         } else if (tx_type == DCT_ADST) {

           scan = vp9_col_scan_4x4;

-      } else {

-        counts = cpi->coef_counts_4x4;

-        probs = cpi->common.fc.coef_probs_4x4;

+      counts = cpi->coef_counts_4x4;

+      probs = cpi->common.fc.coef_probs_4x4;

       break;

-    case TX_8X8:

-      if (type == PLANE_TYPE_Y2) {

-        seg_eob = 4;

-        bands = vp9_coef_bands_4x4;

-        scan = vp9_default_zig_zag1d_4x4;

-      } else {

-#if CONFIG_CNVCONTEXT

-        a_ec = (a[0] + a[1]) != 0;

-        l_ec = (l[0] + l[1]) != 0;

-#endif

-        seg_eob = 64;

-        bands = vp9_coef_bands_8x8;

-        scan = vp9_default_zig_zag1d_8x8;

-      }

+    }

+    case TX_8X8: {

+      const int sz = 3 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_8x8(xd, y + (x >> 1)) : DCT_DCT;

+      a_ec = (a[0] + a[1]) != 0;

+      l_ec = (l[0] + l[1]) != 0;

+      seg_eob = 64;

+      scan = vp9_default_zig_zag1d_8x8;

       if (tx_type != DCT_DCT) {

-        counts = cpi->hybrid_coef_counts_8x8;

-        probs = cpi->common.fc.hybrid_coef_probs_8x8;

-      } else {

-        counts = cpi->coef_counts_8x8;

-        probs = cpi->common.fc.coef_probs_8x8;

+        if (tx_type == ADST_DCT) {

+          scan = vp9_row_scan_8x8;

+        } else if (tx_type == DCT_ADST) {

+          scan = vp9_col_scan_8x8;

+        }

+      counts = cpi->coef_counts_8x8;

+      probs = cpi->common.fc.coef_probs_8x8;

       break;

-    case TX_16X16:

-#if CONFIG_CNVCONTEXT

+    }

+    case TX_16X16: {

+      const int sz = 4 + sb_type, x = ib & ((1 << sz) - 1), y = ib - x;

+      const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

+                              get_tx_type_16x16(xd, y + (x >> 2)) : DCT_DCT;

       if (type != PLANE_TYPE_UV) {

         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;

         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;

@@ -193,89 +211,99 @@

         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;

         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;

-#endif

       seg_eob = 256;

-      bands = vp9_coef_bands_16x16;

       scan = vp9_default_zig_zag1d_16x16;

       if (tx_type != DCT_DCT) {

-        counts = cpi->hybrid_coef_counts_16x16;

-        probs = cpi->common.fc.hybrid_coef_probs_16x16;

-      } else {

-        counts = cpi->coef_counts_16x16;

-        probs = cpi->common.fc.coef_probs_16x16;

+        if (tx_type == ADST_DCT) {

+          scan = vp9_row_scan_16x16;

+        } else if (tx_type == DCT_ADST) {

+          scan = vp9_col_scan_16x16;

+        }

-      if (type == PLANE_TYPE_UV) {

-        int uv_idx = (ib - 16) >> 2;

-        qcoeff_ptr = xd->sb_coeff_data.qcoeff + 1024 + 256 * uv_idx;

-      }

+      counts = cpi->coef_counts_16x16;

+      probs = cpi->common.fc.coef_probs_16x16;

       break;

+    }

     case TX_32X32:

-#if CONFIG_CNVCONTEXT

-      a_ec = a[0] + a[1] + a[2] + a[3] +

-             a1[0] + a1[1] + a1[2] + a1[3];

-      l_ec = l[0] + l[1] + l[2] + l[3] +

-             l1[0] + l1[1] + l1[2] + l1[3];

-      a_ec = a_ec != 0;

-      l_ec = l_ec != 0;

-#endif

+      if (type != PLANE_TYPE_UV) {

+        a_ec = (a[0] + a[1] + a[2] + a[3] +

+                a1[0] + a1[1] + a1[2] + a1[3]) != 0;

+        l_ec = (l[0] + l[1] + l[2] + l[3] +

+                l1[0] + l1[1] + l1[2] + l1[3]) != 0;

+      } else {

+        a_ec = (a[0] + a[1] + a1[0] + a1[1] +

+                a2[0] + a2[1] + a3[0] + a3[1]) != 0;

+        l_ec = (l[0] + l[1] + l1[0] + l1[1] +

+                l2[0] + l2[1] + l3[0] + l3[1]) != 0;

+      }

       seg_eob = 1024;

-      bands = vp9_coef_bands_32x32;

       scan = vp9_default_zig_zag1d_32x32;

       counts = cpi->coef_counts_32x32;

       probs = cpi->common.fc.coef_probs_32x32;

-      qcoeff_ptr = xd->sb_coeff_data.qcoeff;

       break;

   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);

-#if CONFIG_NEWCOEFCONTEXT

-  neighbors = vp9_get_coef_neighbors_handle(scan);

-  pn = pt;

-#endif

+  nb = vp9_get_coef_neighbors_handle(scan, &pad);

+  default_eob = seg_eob;

-  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB))

-    seg_eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);

+  if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))

+    seg_eob = 0;

   do {

-    const int band = bands[c];

+    const int band = get_coef_band(scan, tx_size, c);

     int token;

+    int v = 0;

+#if CONFIG_CODE_NONZEROCOUNT

+    zerosleft = seg_eob - xd->nzcs[ib] - c + nzc;

+#endif

     if (c < eob) {

       const int rc = scan[c];

-      const int v = qcoeff_ptr[rc];

+      v = qcoeff_ptr[rc];

       assert(-DCT_MAX_VALUE <= v  &&  v < DCT_MAX_VALUE);

       t->Extra = vp9_dct_value_tokens_ptr[v].Extra;

       token    = vp9_dct_value_tokens_ptr[v].Token;

     } else {

+#if CONFIG_CODE_NONZEROCOUNT

+      break;

+#else

       token = DCT_EOB_TOKEN;

+#endif

     t->Token = token;

-    t->context_tree = probs[type][band][PT];

-    t->skip_eob_node = (pt == 0) && ((band > 0 && type != PLANE_TYPE_Y_NO_DC) ||

-                                     (band > 1 && type == PLANE_TYPE_Y_NO_DC));

+    t->context_tree = probs[type][ref][band][pt];

+#if CONFIG_CODE_NONZEROCOUNT

+    // Skip zero node if there are no zeros left

+    t->skip_eob_node = 1 + (zerosleft == 0);

+#else

+    t->skip_eob_node = (c > 0) && (token_cache[c - 1] == 0);

+#endif

     assert(vp9_coef_encodings[t->Token].Len - t->skip_eob_node > 0);

     if (!dry_run) {

-      ++counts[type][band][PT][token];

+      ++counts[type][ref][band][pt][token];

+      if (!t->skip_eob_node)

+        ++cpi->common.fc.eob_branch_counts[tx_size][type][ref][band][pt];

-    pt = vp9_prev_token_class[token];

-#if CONFIG_NEWCOEFCONTEXT

-    if (c < seg_eob - 1 && NEWCOEFCONTEXT_BAND_COND(bands[c + 1]))

-      pn = vp9_get_coef_neighbor_context(

-          qcoeff_ptr, (type == PLANE_TYPE_Y_NO_DC), neighbors, scan[c + 1]);

-    else

-      pn = pt;

+#if CONFIG_CODE_NONZEROCOUNT

+    nzc += (v != 0);

 #endif

+    token_cache[c] = token;

+    pt = vp9_get_coef_context(scan, nb, pad, token_cache, c + 1, default_eob);

     ++t;

   } while (c < eob && ++c < seg_eob);

+#if CONFIG_CODE_NONZEROCOUNT

+  assert(nzc == xd->nzcs[ib]);

+#endif

   *tp = t;

-  a_ec = l_ec = (c > !type); /* 0 <-> all coeff data is zero */

+  a_ec = l_ec = (c > 0); /* 0 <-> all coeff data is zero */

   a[0] = a_ec;

   l[0] = l_ec;

-  if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) {

+  if (tx_size == TX_8X8) {

     a[1] = a_ec;

     l[1] = l_ec;

   } else if (tx_size == TX_16X16) {

@@ -287,25 +315,27 @@

       l1[0] = l1[1] = l[1] = l_ec;

   } else if (tx_size == TX_32X32) {

-    a[1] = a[2] = a[3] = a_ec;

-    l[1] = l[2] = l[3] = l_ec;

-    a1[0] = a1[1] = a1[2] = a1[3] = a_ec;

-    l1[0] = l1[1] = l1[2] = l1[3] = l_ec;

+    if (type != PLANE_TYPE_UV) {

+      a[1] = a[2] = a[3] = a_ec;

+      l[1] = l[2] = l[3] = l_ec;

+      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;

+      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;

+    } else {

+      a[1] = a1[0] = a1[1] = a_ec;

+      l[1] = l1[0] = l1[1] = l_ec;

+      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;

+      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;

+    }

-int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) {

+int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd) {

   int skip = 1;

   int i = 0;

-  if (has_2nd_order) {

-    for (i = 0; i < 16; i++)

-      skip &= (xd->block[i].eob < 2);

-    skip &= (!xd->block[24].eob);

-  } else {

-    for (i = 0; i < 16; i++)

-      skip &= (!xd->block[i].eob);

-  }

+  for (i = 0; i < 16; i++)

+    skip &= (!xd->eobs[i]);

   return skip;

@@ -314,48 +344,41 @@

   int i;

   for (i = 16; i < 24; i++)

-    skip &= (!xd->block[i].eob);

+    skip &= (!xd->eobs[i]);

   return skip;

-static int mb_is_skippable_4x4(MACROBLOCKD *xd, int has_2nd_order) {

-  return (vp9_mby_is_skippable_4x4(xd, has_2nd_order) &

+static int mb_is_skippable_4x4(MACROBLOCKD *xd) {

+  return (vp9_mby_is_skippable_4x4(xd) &

           vp9_mbuv_is_skippable_4x4(xd));

-int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) {

+int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd) {

   int skip = 1;

   int i = 0;

-  if (has_2nd_order) {

-    for (i = 0; i < 16; i += 4)

-      skip &= (xd->block[i].eob < 2);

-    skip &= (!xd->block[24].eob);

-  } else {

-    for (i = 0; i < 16; i += 4)

-      skip &= (!xd->block[i].eob);

-  }

+  for (i = 0; i < 16; i += 4)

+    skip &= (!xd->eobs[i]);

   return skip;

 int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd) {

-  return (!xd->block[16].eob) & (!xd->block[20].eob);

+  return (!xd->eobs[16]) & (!xd->eobs[20]);

-static int mb_is_skippable_8x8(MACROBLOCKD *xd, int has_2nd_order) {

-  return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) &

+static int mb_is_skippable_8x8(MACROBLOCKD *xd) {

+  return (vp9_mby_is_skippable_8x8(xd) &

           vp9_mbuv_is_skippable_8x8(xd));

-static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd, int has_2nd_order) {

-  return (vp9_mby_is_skippable_8x8(xd, has_2nd_order) &

+static int mb_is_skippable_8x8_4x4uv(MACROBLOCKD *xd) {

+  return (vp9_mby_is_skippable_8x8(xd) &

           vp9_mbuv_is_skippable_4x4(xd));

 int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd) {

-  int skip = 1;

-  skip &= !xd->block[0].eob;

-  return skip;

+  return (!xd->eobs[0]);

 static int mb_is_skippable_16x16(MACROBLOCKD *xd) {

@@ -363,13 +386,11 @@

 int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd) {

-  int skip = 1;

-  skip &= !xd->block[0].eob;

-  return skip;

+  return (!xd->eobs[0]);

 int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd) {

-  return (!xd->block[16].eob) & (!xd->block[20].eob);

+  return (!xd->eobs[64]) & (!xd->eobs[80]);

 static int sb_is_skippable_32x32(MACROBLOCKD *xd) {

@@ -377,6 +398,68 @@

          vp9_sbuv_is_skippable_16x16(xd);

+int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 0; i < 64; i += 16)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+static int sb_is_skippable_16x16(MACROBLOCKD *xd) {

+  return vp9_sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd);

+}

+int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 0; i < 64; i += 4)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 64; i < 96; i += 4)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+static int sb_is_skippable_8x8(MACROBLOCKD *xd) {

+  return vp9_sby_is_skippable_8x8(xd) & vp9_sbuv_is_skippable_8x8(xd);

+}

+int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 0; i < 64; i++)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 64; i < 96; i++)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+static int sb_is_skippable_4x4(MACROBLOCKD *xd) {

+  return vp9_sby_is_skippable_4x4(xd) & vp9_sbuv_is_skippable_4x4(xd);

+}

 void vp9_tokenize_sb(VP9_COMP *cpi,

                      MACROBLOCKD *xd,

                      TOKENEXTRA **t,

@@ -384,17 +467,26 @@

   VP9_COMMON * const cm = &cpi->common;

   MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;

   TOKENEXTRA *t_backup = *t;

-  ENTROPY_CONTEXT *A[2] = { (ENTROPY_CONTEXT *) (xd->above_context + 0),

-                            (ENTROPY_CONTEXT *) (xd->above_context + 1), };

-  ENTROPY_CONTEXT *L[2] = { (ENTROPY_CONTEXT *) (xd->left_context + 0),

-                            (ENTROPY_CONTEXT *) (xd->left_context + 1), };

   const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);

   const int segment_id = mbmi->segment_id;

-  const int skip_inc =  !vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||

-                        (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0);

+  const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);

   int b;

-  mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);

+  switch (mbmi->txfm_size) {

+    case TX_32X32:

+      mbmi->mb_skip_coeff = sb_is_skippable_32x32(xd);

+      break;

+    case TX_16X16:

+      mbmi->mb_skip_coeff = sb_is_skippable_16x16(xd);

+      break;

+    case TX_8X8:

+      mbmi->mb_skip_coeff = sb_is_skippable_8x8(xd);

+      break;

+    case TX_4X4:

+      mbmi->mb_skip_coeff = sb_is_skippable_4x4(xd);

+      break;

+    default: assert(0);

+  }

   if (mbmi->mb_skip_coeff) {

     if (!dry_run)

@@ -402,7 +494,7 @@

     if (!cm->mb_no_coeff_skip) {

       vp9_stuff_sb(cpi, xd, t, dry_run);

     } else {

-      vp9_fix_contexts_sb(xd);

+      vp9_reset_sb_tokens_context(xd);

     if (dry_run)

       *t = t_backup;

@@ -412,14 +504,215 @@

   if (!dry_run)

     cpi->skip_false_count[mb_skip_context] += skip_inc;

-  tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC,

-             TX_32X32, dry_run);

+  switch (mbmi->txfm_size) {

+    case TX_32X32:

+      tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC,

+                 TX_32X32, dry_run);

+      for (b = 64; b < 96; b += 16)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_16X16, dry_run);

+      break;

+    case TX_16X16:

+      for (b = 0; b < 64; b += 16)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,

+                   TX_16X16, dry_run);

+      for (b = 64; b < 96; b += 16)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_16X16, dry_run);

+      break;

+    case TX_8X8:

+      for (b = 0; b < 64; b += 4)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,

+                   TX_8X8, dry_run);

+      for (b = 64; b < 96; b += 4)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_8X8, dry_run);

+      break;

+    case TX_4X4:

+      for (b = 0; b < 64; b++)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,

+                   TX_4X4, dry_run);

+      for (b = 64; b < 96; b++)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_4X4, dry_run);

+      break;

+    default: assert(0);

+  }

-  for (b = 16; b < 24; b += 4) {

-    tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

-               TX_16X16, dry_run);

+  if (dry_run)

+    *t = t_backup;

+}

+int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 0; i < 256; i += 64)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd) {

+  return (!xd->eobs[256]) & (!xd->eobs[320]);

+}

+static int sb64_is_skippable_32x32(MACROBLOCKD *xd) {

+  return vp9_sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd);

+}

+int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 0; i < 256; i += 16)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 256; i < 384; i += 16)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+static int sb64_is_skippable_16x16(MACROBLOCKD *xd) {

+  return vp9_sb64y_is_skippable_16x16(xd) & vp9_sb64uv_is_skippable_16x16(xd);

+}

+int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 0; i < 256; i += 4)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 256; i < 384; i += 4)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+static int sb64_is_skippable_8x8(MACROBLOCKD *xd) {

+  return vp9_sb64y_is_skippable_8x8(xd) & vp9_sb64uv_is_skippable_8x8(xd);

+}

+int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 0; i < 256; i++)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd) {

+  int skip = 1;

+  int i = 0;

+  for (i = 256; i < 384; i++)

+    skip &= (!xd->eobs[i]);

+  return skip;

+}

+static int sb64_is_skippable_4x4(MACROBLOCKD *xd) {

+  return vp9_sb64y_is_skippable_4x4(xd) & vp9_sb64uv_is_skippable_4x4(xd);

+}

+void vp9_tokenize_sb64(VP9_COMP *cpi,

+                       MACROBLOCKD *xd,

+                       TOKENEXTRA **t,

+                       int dry_run) {

+  VP9_COMMON * const cm = &cpi->common;

+  MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;

+  TOKENEXTRA *t_backup = *t;

+  const int mb_skip_context = vp9_get_pred_context(cm, xd, PRED_MBSKIP);

+  const int segment_id = mbmi->segment_id;

+  const int skip_inc = !vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP);

+  int b;

+  switch (mbmi->txfm_size) {

+    case TX_32X32:

+      mbmi->mb_skip_coeff = sb64_is_skippable_32x32(xd);

+      break;

+    case TX_16X16:

+      mbmi->mb_skip_coeff = sb64_is_skippable_16x16(xd);

+      break;

+    case TX_8X8:

+      mbmi->mb_skip_coeff = sb64_is_skippable_8x8(xd);

+      break;

+    case TX_4X4:

+      mbmi->mb_skip_coeff = sb64_is_skippable_4x4(xd);

+      break;

+    default: assert(0);

-  A[0][8] = L[0][8] = A[1][8] = L[1][8] = 0;

+  if (mbmi->mb_skip_coeff) {

+    if (!dry_run)

+      cpi->skip_true_count[mb_skip_context] += skip_inc;

+    if (!cm->mb_no_coeff_skip) {

+      vp9_stuff_sb64(cpi, xd, t, dry_run);

+    } else {

+      vp9_reset_sb64_tokens_context(xd);

+    }

+    if (dry_run)

+      *t = t_backup;

+    return;

+  }

+  if (!dry_run)

+    cpi->skip_false_count[mb_skip_context] += skip_inc;

+  switch (mbmi->txfm_size) {

+    case TX_32X32:

+      for (b = 0; b < 256; b += 64)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,

+                   TX_32X32, dry_run);

+      for (b = 256; b < 384; b += 64)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_32X32, dry_run);

+      break;

+    case TX_16X16:

+      for (b = 0; b < 256; b += 16)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,

+                   TX_16X16, dry_run);

+      for (b = 256; b < 384; b += 16)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_16X16, dry_run);

+      break;

+    case TX_8X8:

+      for (b = 0; b < 256; b += 4)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,

+                   TX_8X8, dry_run);

+      for (b = 256; b < 384; b += 4)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_8X8, dry_run);

+      break;

+    case TX_4X4:

+      for (b = 0; b < 256; b++)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC,

+                   TX_4X4, dry_run);

+      for (b = 256; b < 384; b++)

+        tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV,

+                   TX_4X4, dry_run);

+      break;

+    default: assert(0);

+  }

   if (dry_run)

     *t = t_backup;

@@ -428,8 +721,6 @@

                      MACROBLOCKD *xd,

                      TOKENEXTRA **t,

                      int dry_run) {

-  PLANE_TYPE plane_type;

-  int has_2nd_order;

   int b;

   int tx_size = xd->mode_info_context->mbmi.txfm_size;

   int mb_skip_context = vp9_get_pred_context(&cpi->common, xd, PRED_MBSKIP);

@@ -441,14 +732,11 @@

   int skip_inc;

   int segment_id = xd->mode_info_context->mbmi.segment_id;

-  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB) ||

-      (vp9_get_segdata(xd, segment_id, SEG_LVL_EOB) != 0)) {

+  if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) {

     skip_inc = 1;

   } else

     skip_inc = 0;

-  has_2nd_order = get_2nd_order_usage(xd);

   switch (tx_size) {

     case TX_16X16:

@@ -458,15 +746,15 @@

       if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||

           xd->mode_info_context->mbmi.mode == SPLITMV)

         xd->mode_info_context->mbmi.mb_skip_coeff =

-            mb_is_skippable_8x8_4x4uv(xd, 0);

+            mb_is_skippable_8x8_4x4uv(xd);

       else

         xd->mode_info_context->mbmi.mb_skip_coeff =

-            mb_is_skippable_8x8(xd, has_2nd_order);

+            mb_is_skippable_8x8(xd);

       break;

     default:

       xd->mode_info_context->mbmi.mb_skip_coeff =

-          mb_is_skippable_4x4(xd, has_2nd_order);

+          mb_is_skippable_4x4(xd);

       break;

@@ -487,15 +775,6 @@

   if (!dry_run)

     cpi->skip_false_count[mb_skip_context] += skip_inc;

-  if (has_2nd_order) {

-    tokenize_b(cpi, xd, 24, t, PLANE_TYPE_Y2, tx_size, dry_run);

-    plane_type = PLANE_TYPE_Y_NO_DC;

-  } else {

-    xd->above_context->y2 = 0;

-    xd->left_context->y2 = 0;

-    plane_type = PLANE_TYPE_Y_WITH_DC;

-  }

   if (tx_size == TX_16X16) {

     tokenize_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);

     for (b = 16; b < 24; b += 4) {

@@ -503,7 +782,7 @@

   } else if (tx_size == TX_8X8) {

     for (b = 0; b < 16; b += 4) {

-      tokenize_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);

+      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);

     if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||

         xd->mode_info_context->mbmi.mode == SPLITMV) {

@@ -516,11 +795,10 @@

   } else {

-    for (b = 0; b < 24; b++) {

-      if (b >= 16)

-        plane_type = PLANE_TYPE_UV;

-      tokenize_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run);

-    }

+    for (b = 0; b < 16; b++)

+      tokenize_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);

+    for (b = 16; b < 24; b++)

+      tokenize_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);

   if (dry_run)

     *t = t_backup;

@@ -531,25 +809,13 @@

   FILE *f = fopen("context.bin", "rb");

   if (!f) {

     vpx_memset(context_counters_4x4, 0, sizeof(context_counters_4x4));

-    vpx_memset(hybrid_context_counters_4x4, 0,

-               sizeof(hybrid_context_counters_4x4));

     vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));

-    vpx_memset(hybrid_context_counters_8x8, 0,

-               sizeof(hybrid_context_counters_8x8));

     vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));

-    vpx_memset(hybrid_context_counters_16x16, 0,

-               sizeof(hybrid_context_counters_16x16));

     vpx_memset(context_counters_32x32, 0, sizeof(context_counters_32x32));

   } else {

     fread(context_counters_4x4, sizeof(context_counters_4x4), 1, f);

-    fread(hybrid_context_counters_4x4,

-          sizeof(hybrid_context_counters_4x4), 1, f);

     fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);

-    fread(hybrid_context_counters_8x8,

-          sizeof(hybrid_context_counters_8x8), 1, f);

     fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);

-    fread(hybrid_context_counters_16x16,

-          sizeof(hybrid_context_counters_16x16), 1, f);

     fread(context_counters_32x32, sizeof(context_counters_32x32), 1, f);

     fclose(f);

@@ -557,25 +823,13 @@

   f = fopen("treeupdate.bin", "rb");

   if (!f) {

     vpx_memset(tree_update_hist_4x4, 0, sizeof(tree_update_hist_4x4));

-    vpx_memset(hybrid_tree_update_hist_4x4, 0,

-               sizeof(hybrid_tree_update_hist_4x4));

     vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));

-    vpx_memset(hybrid_tree_update_hist_8x8, 0,

-               sizeof(hybrid_tree_update_hist_8x8));

     vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));

-    vpx_memset(hybrid_tree_update_hist_16x16, 0,

-               sizeof(hybrid_tree_update_hist_16x16));

     vpx_memset(tree_update_hist_32x32, 0, sizeof(tree_update_hist_32x32));

   } else {

     fread(tree_update_hist_4x4, sizeof(tree_update_hist_4x4), 1, f);

-    fread(hybrid_tree_update_hist_4x4,

-          sizeof(hybrid_tree_update_hist_4x4), 1, f);

     fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);

-    fread(hybrid_tree_update_hist_8x8,

-          sizeof(hybrid_tree_update_hist_8x8), 1, f);

     fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);

-    fread(hybrid_tree_update_hist_16x16,

-          sizeof(hybrid_tree_update_hist_16x16), 1, f);

     fread(tree_update_hist_32x32, sizeof(tree_update_hist_32x32), 1, f);

     fclose(f);

@@ -583,7 +837,7 @@

 static void print_counter(FILE *f, vp9_coeff_accum *context_counters,

                           int block_types, const char *header) {

-  int type, band, pt, t;

+  int type, ref, band, pt, t;

   fprintf(f, "static const vp9_coeff_count %s = {\n", header);

@@ -590,26 +844,31 @@

 #define Comma(X) (X ? "," : "")

   type = 0;

   do {

+    ref = 0;

     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);

-    band = 0;

     do {

-      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);

-      pt = 0;

+      fprintf(f, "%s\n    { /* %s */", Comma(type), ref ? "Inter" : "Intra");

+      band = 0;

       do {

-        fprintf(f, "%s\n      {", Comma(pt));

-        t = 0;

+        fprintf(f, "%s\n      { /* Coeff Band %d */", Comma(band), band);

+        pt = 0;

         do {

-          const int64_t x = context_counters[type][band][pt][t];

-          const int y = (int) x;

+          fprintf(f, "%s\n        {", Comma(pt));

-          assert(x == (int64_t) y);  /* no overflow handling yet */

-          fprintf(f, "%s %d", Comma(t), y);

-        } while (++t < MAX_ENTROPY_TOKENS);

-        fprintf(f, "}");

-      } while (++pt < PREV_COEF_CONTEXTS);

+          t = 0;

+          do {

+            const int64_t x = context_counters[type][ref][band][pt][t];

+            const int y = (int) x;

+            assert(x == (int64_t) y);  /* no overflow handling yet */

+            fprintf(f, "%s %d", Comma(t), y);

+          } while (++t < 1 + MAX_ENTROPY_TOKENS);

+          fprintf(f, "}");

+        } while (++pt < PREV_COEF_CONTEXTS);

+        fprintf(f, "\n      }");

+      } while (++band < COEF_BANDS);

       fprintf(f, "\n    }");

-    } while (++band < COEF_BANDS);

+    } while (++ref < REF_TYPES);

     fprintf(f, "\n  }");

   } while (++type < block_types);

   fprintf(f, "\n};\n");

@@ -617,7 +876,7 @@

 static void print_probs(FILE *f, vp9_coeff_accum *context_counters,

                         int block_types, const char *header) {

-  int type, band, pt, t;

+  int type, ref, band, pt, t;

   fprintf(f, "static const vp9_coeff_probs %s = {", header);

@@ -626,32 +885,41 @@

   do {

     fprintf(f, "%s%s{ /* block Type %d */",

             Comma(type), Newline(type, "  "), type);

-    band = 0;

+    ref = 0;

     do {

-      fprintf(f, "%s%s{ /* Coeff Band %d */",

-              Comma(band), Newline(band, "    "), band);

-      pt = 0;

+      fprintf(f, "%s%s{ /* %s */",

+              Comma(band), Newline(band, "    "), ref ? "Inter" : "Intra");

+      band = 0;

       do {

-        unsigned int branch_ct[ENTROPY_NODES][2];

-        unsigned int coef_counts[MAX_ENTROPY_TOKENS];

-        vp9_prob coef_probs[ENTROPY_NODES];

+        fprintf(f, "%s%s{ /* Coeff Band %d */",

+                Comma(band), Newline(band, "      "), band);

+        pt = 0;

+        do {

+          unsigned int branch_ct[ENTROPY_NODES][2];

+          unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1];

+          vp9_prob coef_probs[ENTROPY_NODES];

-        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

-          coef_counts[t] = context_counters[type][band][pt][t];

-        vp9_tree_probs_from_distribution(MAX_ENTROPY_TOKENS,

-                                         vp9_coef_encodings, vp9_coef_tree,

-                                         coef_probs, branch_ct, coef_counts);

-        fprintf(f, "%s\n      {", Comma(pt));

+          if (pt >= 3 && band == 0)

+            break;

+          for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t)

+            coef_counts[t] = context_counters[type][ref][band][pt][t];

+          vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs,

+                                           branch_ct, coef_counts, 0);

+          branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0];

+          coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);

+          fprintf(f, "%s\n      {", Comma(pt));

-        t = 0;

-        do {

-          fprintf(f, "%s %3d", Comma(t), coef_probs[t]);

-        } while (++t < ENTROPY_NODES);

+          t = 0;

+          do {

+            fprintf(f, "%s %3d", Comma(t), coef_probs[t]);

+          } while (++t < ENTROPY_NODES);

-        fprintf(f, " }");

-      } while (++pt < PREV_COEF_CONTEXTS);

+          fprintf(f, " }");

+        } while (++pt < PREV_COEF_CONTEXTS);

+        fprintf(f, "\n      }");

+      } while (++band < COEF_BANDS);

       fprintf(f, "\n    }");

-    } while (++band < COEF_BANDS);

+    } while (++ref < REF_TYPES);

     fprintf(f, "\n  }");

   } while (++type < block_types);

   fprintf(f, "\n};\n");

@@ -664,49 +932,31 @@

   fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");

   /* print counts */

-  print_counter(f, context_counters_4x4, BLOCK_TYPES_4X4,

-                "vp9_default_coef_counts_4x4[BLOCK_TYPES_4X4]");

-  print_counter(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4,

-                "vp9_default_hybrid_coef_counts_4x4[BLOCK_TYPES_4X4]");

-  print_counter(f, context_counters_8x8, BLOCK_TYPES_8X8,

-                "vp9_default_coef_counts_8x8[BLOCK_TYPES_8X8]");

-  print_counter(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8,

-                "vp9_default_hybrid_coef_counts_8x8[BLOCK_TYPES_8X8]");

-  print_counter(f, context_counters_16x16, BLOCK_TYPES_16X16,

-                "vp9_default_coef_counts_16x16[BLOCK_TYPES_16X16]");

-  print_counter(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16,

-                "vp9_default_hybrid_coef_counts_16x16[BLOCK_TYPES_16X16]");

-  print_counter(f, context_counters_32x32, BLOCK_TYPES_32X32,

-                "vp9_default_coef_counts_32x32[BLOCK_TYPES_32X32]");

+  print_counter(f, context_counters_4x4, BLOCK_TYPES,

+                "vp9_default_coef_counts_4x4[BLOCK_TYPES]");

+  print_counter(f, context_counters_8x8, BLOCK_TYPES,

+                "vp9_default_coef_counts_8x8[BLOCK_TYPES]");

+  print_counter(f, context_counters_16x16, BLOCK_TYPES,

+                "vp9_default_coef_counts_16x16[BLOCK_TYPES]");

+  print_counter(f, context_counters_32x32, BLOCK_TYPES,

+                "vp9_default_coef_counts_32x32[BLOCK_TYPES]");

   /* print coefficient probabilities */

-  print_probs(f, context_counters_4x4, BLOCK_TYPES_4X4,

-              "default_coef_probs_4x4[BLOCK_TYPES_4X4]");

-  print_probs(f, hybrid_context_counters_4x4, BLOCK_TYPES_4X4,

-              "default_hybrid_coef_probs_4x4[BLOCK_TYPES_4X4]");

-  print_probs(f, context_counters_8x8, BLOCK_TYPES_8X8,

-              "default_coef_probs_8x8[BLOCK_TYPES_8X8]");

-  print_probs(f, hybrid_context_counters_8x8, BLOCK_TYPES_8X8,

-              "default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8]");

-  print_probs(f, context_counters_16x16, BLOCK_TYPES_16X16,

-              "default_coef_probs_16x16[BLOCK_TYPES_16X16]");

-  print_probs(f, hybrid_context_counters_16x16, BLOCK_TYPES_16X16,

-              "default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16]");

-  print_probs(f, context_counters_32x32, BLOCK_TYPES_32X32,

-              "default_coef_probs_32x32[BLOCK_TYPES_32X32]");

+  print_probs(f, context_counters_4x4, BLOCK_TYPES,

+              "default_coef_probs_4x4[BLOCK_TYPES]");

+  print_probs(f, context_counters_8x8, BLOCK_TYPES,

+              "default_coef_probs_8x8[BLOCK_TYPES]");

+  print_probs(f, context_counters_16x16, BLOCK_TYPES,

+              "default_coef_probs_16x16[BLOCK_TYPES]");

+  print_probs(f, context_counters_32x32, BLOCK_TYPES,

+              "default_coef_probs_32x32[BLOCK_TYPES]");

   fclose(f);

   f = fopen("context.bin", "wb");

   fwrite(context_counters_4x4, sizeof(context_counters_4x4), 1, f);

-  fwrite(hybrid_context_counters_4x4,

-         sizeof(hybrid_context_counters_4x4), 1, f);

   fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);

-  fwrite(hybrid_context_counters_8x8,

-         sizeof(hybrid_context_counters_8x8), 1, f);

   fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);

-  fwrite(hybrid_context_counters_16x16,

-         sizeof(hybrid_context_counters_16x16), 1, f);

   fwrite(context_counters_32x32, sizeof(context_counters_32x32), 1, f);

   fclose(f);

@@ -716,61 +966,65 @@

   fill_value_tokens();

-static __inline void stuff_b(VP9_COMP *cpi,

-                             MACROBLOCKD *xd,

-                             const int ib,

-                             TOKENEXTRA **tp,

-                             PLANE_TYPE type,

-                             TX_SIZE tx_size,

-                             int dry_run) {

-  const BLOCKD * const b = xd->block + ib;

-  const int *bands;

+static void stuff_b(VP9_COMP *cpi,

+                    MACROBLOCKD *xd,

+                    const int ib,

+                    TOKENEXTRA **tp,

+                    PLANE_TYPE type,

+                    TX_SIZE tx_size,

+                    int dry_run) {

+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;

+  const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type;

+#if CONFIG_CODE_NONZEROCOUNT == 0

   vp9_coeff_count *counts;

   vp9_coeff_probs *probs;

   int pt, band;

   TOKENEXTRA *t = *tp;

-  const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ?

-                          get_tx_type(xd, b) : DCT_DCT;

-  ENTROPY_CONTEXT *const a = (ENTROPY_CONTEXT *)xd->above_context +

-      vp9_block2above[tx_size][ib];

-  ENTROPY_CONTEXT *const l = (ENTROPY_CONTEXT *)xd->left_context +

-      vp9_block2left[tx_size][ib];

-  ENTROPY_CONTEXT a_ec = *a, l_ec = *l;

-  ENTROPY_CONTEXT *const a1 = (ENTROPY_CONTEXT *)(&xd->above_context[1]) +

-      vp9_block2above[tx_size][ib];

-  ENTROPY_CONTEXT *const l1 = (ENTROPY_CONTEXT *)(&xd->left_context[1]) +

-      vp9_block2left[tx_size][ib];

+  const int ref = mbmi->ref_frame != INTRA_FRAME;

+#endif

+  ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec;

+  if (sb_type == BLOCK_SIZE_SB32X32) {

+    a = (ENTROPY_CONTEXT *)xd->above_context +

+                                             vp9_block2above_sb64[tx_size][ib];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb64[tx_size][ib];

+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a2 = a1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l2 = l1 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a3 = a2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l3 = l2 + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+  } else if (sb_type == BLOCK_SIZE_SB32X32) {

+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above_sb[tx_size][ib];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left_sb[tx_size][ib];

+    a1 = a + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    l1 = l + sizeof(ENTROPY_CONTEXT_PLANES) / sizeof(ENTROPY_CONTEXT);

+    a2 = l2 = a3 = l3 = NULL;

+  } else {

+    a = (ENTROPY_CONTEXT *)xd->above_context + vp9_block2above[tx_size][ib];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp9_block2left[tx_size][ib];

+    a1 = l1 = a2 = l2 = a3 = l3 = NULL;

+  }

   switch (tx_size) {

     default:

     case TX_4X4:

-      bands = vp9_coef_bands_4x4;

-      if (tx_type != DCT_DCT) {

-        counts = cpi->hybrid_coef_counts_4x4;

-        probs = cpi->common.fc.hybrid_coef_probs_4x4;

-      } else {

-        counts = cpi->coef_counts_4x4;

-        probs = cpi->common.fc.coef_probs_4x4;

-      }

+      a_ec = a[0];

+      l_ec = l[0];

+#if CONFIG_CODE_NONZEROCOUNT == 0

+      counts = cpi->coef_counts_4x4;

+      probs = cpi->common.fc.coef_probs_4x4;

+#endif

       break;

     case TX_8X8:

-#if CONFIG_CNVCONTEXT

-      if (type != PLANE_TYPE_Y2) {

-        a_ec = (a[0] + a[1]) != 0;

-        l_ec = (l[0] + l[1]) != 0;

-      }

+      a_ec = (a[0] + a[1]) != 0;

+      l_ec = (l[0] + l[1]) != 0;

+#if CONFIG_CODE_NONZEROCOUNT == 0

+      counts = cpi->coef_counts_8x8;

+      probs = cpi->common.fc.coef_probs_8x8;

 #endif

-      bands = vp9_coef_bands_8x8;

-      if (tx_type != DCT_DCT) {

-        counts = cpi->hybrid_coef_counts_8x8;

-        probs = cpi->common.fc.hybrid_coef_probs_8x8;

-      } else {

-        counts = cpi->coef_counts_8x8;

-        probs = cpi->common.fc.coef_probs_8x8;

-      }

       break;

     case TX_16X16:

-#if CONFIG_CNVCONTEXT

       if (type != PLANE_TYPE_UV) {

         a_ec = (a[0] + a[1] + a[2] + a[3]) != 0;

         l_ec = (l[0] + l[1] + l[2] + l[3]) != 0;

@@ -778,41 +1032,44 @@

         a_ec = (a[0] + a[1] + a1[0] + a1[1]) != 0;

         l_ec = (l[0] + l[1] + l1[0] + l1[1]) != 0;

+#if CONFIG_CODE_NONZEROCOUNT == 0

+      counts = cpi->coef_counts_16x16;

+      probs = cpi->common.fc.coef_probs_16x16;

 #endif

-      bands = vp9_coef_bands_16x16;

-      if (tx_type != DCT_DCT) {

-        counts = cpi->hybrid_coef_counts_16x16;

-        probs = cpi->common.fc.hybrid_coef_probs_16x16;

-      } else {

-        counts = cpi->coef_counts_16x16;

-        probs = cpi->common.fc.coef_probs_16x16;

-      }

       break;

     case TX_32X32:

-#if CONFIG_CNVCONTEXT

-      a_ec = a[0] + a[1] + a[2] + a[3] +

-             a1[0] + a1[1] + a1[2] + a1[3];

-      l_ec = l[0] + l[1] + l[2] + l[3] +

-             l1[0] + l1[1] + l1[2] + l1[3];

-      a_ec = a_ec != 0;

-      l_ec = l_ec != 0;

-#endif

-      bands = vp9_coef_bands_32x32;

+      if (type != PLANE_TYPE_UV) {

+        a_ec = (a[0] + a[1] + a[2] + a[3] +

+                a1[0] + a1[1] + a1[2] + a1[3]) != 0;

+        l_ec = (l[0] + l[1] + l[2] + l[3] +

+                l1[0] + l1[1] + l1[2] + l1[3]) != 0;

+      } else {

+        a_ec = (a[0] + a[1] + a1[0] + a1[1] +

+                a2[0] + a2[1] + a3[0] + a3[1]) != 0;

+        l_ec = (l[0] + l[1] + l1[0] + l1[1] +

+                l2[0] + l2[1] + l3[0] + l3[1]) != 0;

+      }

+#if CONFIG_CODE_NONZEROCOUNT == 0

       counts = cpi->coef_counts_32x32;

       probs = cpi->common.fc.coef_probs_32x32;

+#endif

       break;

+#if CONFIG_CODE_NONZEROCOUNT == 0

   VP9_COMBINEENTROPYCONTEXTS(pt, a_ec, l_ec);

-  band = bands[(type == PLANE_TYPE_Y_NO_DC) ? 1 : 0];

+  band = 0;

   t->Token = DCT_EOB_TOKEN;

-  t->context_tree = probs[type][band][pt];

+  t->context_tree = probs[type][ref][band][pt];

   t->skip_eob_node = 0;

   ++t;

   *tp = t;

+  if (!dry_run) {

+    ++counts[type][ref][band][pt][DCT_EOB_TOKEN];

+  }

+#endif

   *a = *l = 0;

-  if (tx_size == TX_8X8 && type != PLANE_TYPE_Y2) {

+  if (tx_size == TX_8X8) {

     a[1] = 0;

     l[1] = 0;

   } else if (tx_size == TX_16X16) {

@@ -824,39 +1081,28 @@

       l1[0] = l1[1] = l[1] = l_ec;

   } else if (tx_size == TX_32X32) {

-    a[1] = a[2] = a[3] = a_ec;

-    l[1] = l[2] = l[3] = l_ec;

-    a1[0] = a1[1] = a1[2] = a1[3] = a_ec;

-    l1[0] = l1[1] = l1[2] = l1[3] = l_ec;

+    if (type != PLANE_TYPE_Y_WITH_DC) {

+      a[1] = a[2] = a[3] = a_ec;

+      l[1] = l[2] = l[3] = l_ec;

+      a1[0] = a1[1] = a1[2] = a1[3] = a_ec;

+      l1[0] = l1[1] = l1[2] = l1[3] = l_ec;

+    } else {

+      a[1] = a1[0] = a1[1] = a_ec;

+      l[1] = l1[0] = l1[1] = l_ec;

+      a2[0] = a2[1] = a3[0] = a3[1] = a_ec;

+      l2[0] = l2[1] = l3[0] = l3[1] = l_ec;

+    }

-  if (!dry_run) {

-    ++counts[type][band][pt][DCT_EOB_TOKEN];

-  }

 static void stuff_mb_8x8(VP9_COMP *cpi, MACROBLOCKD *xd,

                          TOKENEXTRA **t, int dry_run) {

-  PLANE_TYPE plane_type;

   int b;

-  int has_2nd_order = get_2nd_order_usage(xd);

-  if (has_2nd_order) {

-    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run);

-    plane_type = PLANE_TYPE_Y_NO_DC;

-  } else {

-#if CONFIG_CNVCONTEXT

-    xd->above_context->y2 = 0;

-    xd->left_context->y2 = 0;

-#endif

-    plane_type = PLANE_TYPE_Y_WITH_DC;

-  }

-  for (b = 0; b < 24; b += 4) {

-    if (b >= 16)

-      plane_type = PLANE_TYPE_UV;

-    stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);

-  }

+  for (b = 0; b < 16; b += 4)

+    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);

+  for (b = 16; b < 24; b += 4)

+    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);

 static void stuff_mb_16x16(VP9_COMP *cpi, MACROBLOCKD *xd,

@@ -867,56 +1113,26 @@

   for (b = 16; b < 24; b += 4) {

     stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);

-#if CONFIG_CNVCONTEXT

-  xd->above_context->y2 = 0;

-  xd->left_context->y2 = 0;

-#endif

 static void stuff_mb_4x4(VP9_COMP *cpi, MACROBLOCKD *xd,

                          TOKENEXTRA **t, int dry_run) {

   int b;

-  PLANE_TYPE plane_type;

-  int has_2nd_order = get_2nd_order_usage(xd);

-  if (has_2nd_order) {

-    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_4X4, dry_run);

-    plane_type = PLANE_TYPE_Y_NO_DC;

-  } else {

-    xd->above_context->y2 = 0;

-    xd->left_context->y2 = 0;

-    plane_type = PLANE_TYPE_Y_WITH_DC;

-  }

-  for (b = 0; b < 24; b++) {

-    if (b >= 16)

-      plane_type = PLANE_TYPE_UV;

-    stuff_b(cpi, xd, b, t, plane_type, TX_4X4, dry_run);

-  }

+  for (b = 0; b < 16; b++)

+    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);

+  for (b = 16; b < 24; b++)

+    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);

 static void stuff_mb_8x8_4x4uv(VP9_COMP *cpi, MACROBLOCKD *xd,

                                TOKENEXTRA **t, int dry_run) {

-  PLANE_TYPE plane_type;

   int b;

-  int has_2nd_order = get_2nd_order_usage(xd);

-  if (has_2nd_order) {

-    stuff_b(cpi, xd, 24, t, PLANE_TYPE_Y2, TX_8X8, dry_run);

-    plane_type = PLANE_TYPE_Y_NO_DC;

-  } else {

-    xd->above_context->y2 = 0;

-    xd->left_context->y2 = 0;

-    plane_type = PLANE_TYPE_Y_WITH_DC;

-  }

-  for (b = 0; b < 16; b += 4) {

-    stuff_b(cpi, xd, b, t, plane_type, TX_8X8, dry_run);

-  }

-  for (b = 16; b < 24; b++) {

+  for (b = 0; b < 16; b += 4)

+    stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);

+  for (b = 16; b < 24; b++)

     stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);

-  }

 void vp9_stuff_mb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {

@@ -941,27 +1157,76 @@

-static void stuff_sb_32x32(VP9_COMP *cpi, MACROBLOCKD *xd,

-                               TOKENEXTRA **t, int dry_run) {

+void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {

+  TOKENEXTRA * const t_backup = *t;

   int b;

-  stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);

-  for (b = 16; b < 24; b += 4) {

-    stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);

+  switch (xd->mode_info_context->mbmi.txfm_size) {

+    case TX_32X32:

+      stuff_b(cpi, xd, 0, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);

+      for (b = 64; b < 96; b += 16)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);

+      break;

+    case TX_16X16:

+      for (b = 0; b < 64; b += 16)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);

+      for (b = 64; b < 96; b += 16)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);

+      break;

+    case TX_8X8:

+      for (b = 0; b < 64; b += 4)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);

+      for (b = 64; b < 96; b += 4)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);

+      break;

+    case TX_4X4:

+      for (b = 0; b < 64; b++)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);

+      for (b = 64; b < 96; b++)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);

+      break;

+    default: assert(0);

+  if (dry_run) {

+    *t = t_backup;

+  }

-void vp9_stuff_sb(VP9_COMP *cpi, MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run) {

+void vp9_stuff_sb64(VP9_COMP *cpi, MACROBLOCKD *xd,

+                    TOKENEXTRA **t, int dry_run) {

   TOKENEXTRA * const t_backup = *t;

+  int b;

-  stuff_sb_32x32(cpi, xd, t, dry_run);

+  switch (xd->mode_info_context->mbmi.txfm_size) {

+    case TX_32X32:

+      for (b = 0; b < 256; b += 64)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_32X32, dry_run);

+      for (b = 256; b < 384; b += 64)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_32X32, dry_run);

+      break;

+    case TX_16X16:

+      for (b = 0; b < 256; b += 16)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_16X16, dry_run);

+      for (b = 256; b < 384; b += 16)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_16X16, dry_run);

+      break;

+    case TX_8X8:

+      for (b = 0; b < 256; b += 4)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_8X8, dry_run);

+      for (b = 256; b < 384; b += 4)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_8X8, dry_run);

+      break;

+    case TX_4X4:

+      for (b = 0; b < 256; b++)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_Y_WITH_DC, TX_4X4, dry_run);

+      for (b = 256; b < 384; b++)

+        stuff_b(cpi, xd, b, t, PLANE_TYPE_UV, TX_4X4, dry_run);

+      break;

+    default: assert(0);

+  }

   if (dry_run) {

     *t = t_backup;

-}

-void vp9_fix_contexts_sb(MACROBLOCKD *xd) {

-  vpx_memset(xd->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);

-  vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * 2);

--- a/vp9/encoder/vp9_tokenize.h

+++ b/vp9/encoder/vp9_tokenize.h

@@ -28,42 +28,54 @@

   uint8_t         skip_eob_node;

 } TOKENEXTRA;

-typedef int64_t vp9_coeff_accum[COEF_BANDS][PREV_COEF_CONTEXTS]

-                               [MAX_ENTROPY_TOKENS];

+typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]

+                               [MAX_ENTROPY_TOKENS + 1];

-extern int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd, int has_y2_block);

-extern int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);

-extern int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd, int has_y2_block);

-extern int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);

-extern int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);

-extern int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);

-extern int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);

+int vp9_mby_is_skippable_4x4(MACROBLOCKD *xd);

+int vp9_mbuv_is_skippable_4x4(MACROBLOCKD *xd);

+int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd);

+int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd);

+int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd);

+int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd);

+int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd);

+int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd);

+int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd);

+int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd);

+int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd);

+int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd);

+int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd);

+int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd);

+int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd);

+int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd);

+int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd);

+int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd);

+int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd);

+int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd);

 struct VP9_COMP;

-extern void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

-                            TOKENEXTRA **t, int dry_run);

-extern void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

-                            TOKENEXTRA **t, int dry_run);

+void vp9_tokenize_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

+                     TOKENEXTRA **t, int dry_run);

+void vp9_tokenize_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

+                     TOKENEXTRA **t, int dry_run);

+void vp9_tokenize_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,

+                       TOKENEXTRA **t, int dry_run);

-extern void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

-                         TOKENEXTRA **t, int dry_run);

-extern void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

-                         TOKENEXTRA **t, int dry_run);

+void vp9_stuff_mb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

+                  TOKENEXTRA **t, int dry_run);

+void vp9_stuff_sb(struct VP9_COMP *cpi, MACROBLOCKD *xd,

+                  TOKENEXTRA **t, int dry_run);

+void vp9_stuff_sb64(struct VP9_COMP *cpi, MACROBLOCKD *xd,

+                    TOKENEXTRA **t, int dry_run);

-extern void vp9_fix_contexts_sb(MACROBLOCKD *xd);

 #ifdef ENTROPY_STATS

 void init_context_counters();

 void print_context_counters();

-extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES_4X4];

-extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES_8X8];

-extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES_16X16];

-extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES_32X32];

-extern vp9_coeff_accum hybrid_context_counters_4x4[BLOCK_TYPES_4X4];

-extern vp9_coeff_accum hybrid_context_counters_8x8[BLOCK_TYPES_8X8];

-extern vp9_coeff_accum hybrid_context_counters_16x16[BLOCK_TYPES_16X16];

+extern vp9_coeff_accum context_counters_4x4[BLOCK_TYPES];

+extern vp9_coeff_accum context_counters_8x8[BLOCK_TYPES];

+extern vp9_coeff_accum context_counters_16x16[BLOCK_TYPES];

+extern vp9_coeff_accum context_counters_32x32[BLOCK_TYPES];

 #endif

 extern const int *vp9_dct_value_cost_ptr;

--- a/vp9/encoder/vp9_treewriter.c

+++ b/vp9/encoder/vp9_treewriter.c

@@ -10,6 +10,7 @@

 #include "vp9/encoder/vp9_treewriter.h"

+#include "vp9/common/vp9_common.h"

 static void cost(

   int *const C,

@@ -35,5 +36,7 @@

 void vp9_cost_tokens_skip(int *c, const vp9_prob *p, vp9_tree t) {

+  assert(t[1] > 0 && t[0] <= 0);

+  c[-t[0]] = vp9_cost_bit(p[0], 0);

   cost(c, t, p, 2, 0);

--- a/vp9/encoder/vp9_treewriter.h

+++ b/vp9/encoder/vp9_treewriter.h

@@ -36,30 +36,28 @@

 /* Both of these return bits, not scaled bits. */

-static __inline unsigned int cost_branch(const unsigned int ct[2],

-                                         vp9_prob p) {

+static INLINE unsigned int cost_branch256(const unsigned int ct[2],

+                                          vp9_prob p) {

   /* Imitate existing calculation */

-  return ((ct[0] * vp9_cost_zero(p))

-          + (ct[1] * vp9_cost_one(p))) >> 8;

+  return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);

-static __inline unsigned int cost_branch256(const unsigned int ct[2],

-                                            vp9_prob p) {

+static INLINE unsigned int cost_branch(const unsigned int ct[2],

+                                       vp9_prob p) {

   /* Imitate existing calculation */

-  return ((ct[0] * vp9_cost_zero(p))

-          + (ct[1] * vp9_cost_one(p)));

+  return cost_branch256(ct, p) >> 8;

 /* Small functions to write explicit values and tokens, as well as

    estimate their lengths. */

-static __inline void treed_write(vp9_writer *const w,

-                                 vp9_tree t,

-                                 const vp9_prob *const p,

-                                 int v,

-                                 /* number of bits in v, assumed nonzero */

-                                 int n) {

+static INLINE void treed_write(vp9_writer *const w,

+                               vp9_tree t,

+                               const vp9_prob *const p,

+                               int v,

+                               /* number of bits in v, assumed nonzero */

+                               int n) {

   vp9_tree_index i = 0;

   do {

@@ -69,18 +67,18 @@

   } while (n);

-static __inline void write_token(vp9_writer *const w,

-                                 vp9_tree t,

-                                 const vp9_prob *const p,

-                                 vp9_token *const x) {

+static INLINE void write_token(vp9_writer *const w,

+                               vp9_tree t,

+                               const vp9_prob *const p,

+                               vp9_token *const x) {

   treed_write(w, t, p, x->value, x->Len);

-static __inline int treed_cost(vp9_tree t,

-                               const vp9_prob *const p,

-                               int v,

-                               /* number of bits in v, assumed nonzero */

-                               int n) {

+static INLINE int treed_cost(vp9_tree t,

+                             const vp9_prob *const p,

+                             int v,

+                             /* number of bits in v, assumed nonzero */

+                             int n) {

   int c = 0;

   vp9_tree_index i = 0;

@@ -93,9 +91,9 @@

   return c;

-static __inline int cost_token(vp9_tree t,

-                               const vp9_prob *const p,

-                               vp9_token *const x) {

+static INLINE int cost_token(vp9_tree t,

+                             const vp9_prob *const p,

+                             vp9_token *const x) {

   return treed_cost(t, p, x->value, x->Len);

--- a/vp9/encoder/vp9_variance.h

+++ b/vp9/encoder/vp9_variance.h

@@ -19,12 +19,6 @@

                                     int ref_stride,

                                     unsigned int max_sad);

-typedef void (*vp9_copy32xn_fn_t)(const uint8_t *src_ptr,

-                                  int source_stride,

-                                  const uint8_t *ref_ptr,

-                                  int ref_stride,

-                                  int n);

 typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,

                                    int source_stride,

                                    const uint8_t *ref_ptr,

@@ -35,11 +29,11 @@

                                     int source_stride,

                                     const uint8_t *ref_ptr,

                                     int  ref_stride,

-                                    unsigned short *sad_array);

+                                    unsigned int *sad_array);

 typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,

                                      int source_stride,

-                                     const uint8_t ** ref_ptr,

+                                     const uint8_t* const ref_ptr[],

                                      int  ref_stride, unsigned int *sad_array);

 typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,

@@ -79,7 +73,6 @@

     vp9_sad_multi_fn_t      sdx3f;

     vp9_sad_multi1_fn_t     sdx8f;

     vp9_sad_multi_d_fn_t    sdx4df;

-    vp9_copy32xn_fn_t       copymem;

 } vp9_variance_fn_ptr_t;

 #endif  // VP9_ENCODER_VP9_VARIANCE_H_

--- a/vp9/encoder/vp9_variance_c.c

+++ b/vp9/encoder/vp9_variance_c.c

@@ -142,8 +142,8 @@

   const int16_t *HFilter, *VFilter;

   uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   // First filter 1d Horizontal

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);

@@ -166,8 +166,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);

@@ -186,8 +186,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);

@@ -206,8 +206,8 @@

   uint8_t temp2[68 * 64];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,

                                     1, 65, 64, HFilter);

@@ -227,8 +227,8 @@

   uint8_t temp2[36 * 32];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);

@@ -367,8 +367,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);

@@ -387,8 +387,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,

                                     1, 17, 8, HFilter);

--- /dev/null

+++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c

@@ -1,0 +1,895 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <emmintrin.h>  // SSE2

+#include "vp9/common/vp9_idct.h"  // for cospi constants

+void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {

+  const int stride = pitch >> 1;

+  int pass;

+  // Constants

+  //    When we use them, in one case, they are all the same. In all others

+  //    it's a pair of them that we need to repeat four times. This is done

+  //    by constructing the 32 bit constant corresponding to that pair.

+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);

+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);

+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);

+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);

+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);

+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);

+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  // Load input

+  __m128i in0  = _mm_loadu_si128((const __m128i *)(input + 0 * stride));

+  __m128i in1  = _mm_loadu_si128((const __m128i *)(input + 1 * stride));

+  __m128i in2  = _mm_loadu_si128((const __m128i *)(input + 2 * stride));

+  __m128i in3  = _mm_loadu_si128((const __m128i *)(input + 3 * stride));

+  __m128i in4  = _mm_loadu_si128((const __m128i *)(input + 4 * stride));

+  __m128i in5  = _mm_loadu_si128((const __m128i *)(input + 5 * stride));

+  __m128i in6  = _mm_loadu_si128((const __m128i *)(input + 6 * stride));

+  __m128i in7  = _mm_loadu_si128((const __m128i *)(input + 7 * stride));

+  // Pre-condition input (shift by two)

+  in0 = _mm_slli_epi16(in0, 2);

+  in1 = _mm_slli_epi16(in1, 2);

+  in2 = _mm_slli_epi16(in2, 2);

+  in3 = _mm_slli_epi16(in3, 2);

+  in4 = _mm_slli_epi16(in4, 2);

+  in5 = _mm_slli_epi16(in5, 2);

+  in6 = _mm_slli_epi16(in6, 2);

+  in7 = _mm_slli_epi16(in7, 2);

+  // We do two passes, first the columns, then the rows. The results of the

+  // first pass are transposed so that the same column code can be reused. The

+  // results of the second pass are also transposed so that the rows (processed

+  // as columns) are put back in row positions.

+  for (pass = 0; pass < 2; pass++) {

+    // To store results of each pass before the transpose.

+    __m128i res0, res1, res2, res3, res4, res5, res6, res7;

+    // Add/substract

+    const __m128i q0 = _mm_add_epi16(in0, in7);

+    const __m128i q1 = _mm_add_epi16(in1, in6);

+    const __m128i q2 = _mm_add_epi16(in2, in5);

+    const __m128i q3 = _mm_add_epi16(in3, in4);

+    const __m128i q4 = _mm_sub_epi16(in3, in4);

+    const __m128i q5 = _mm_sub_epi16(in2, in5);

+    const __m128i q6 = _mm_sub_epi16(in1, in6);

+    const __m128i q7 = _mm_sub_epi16(in0, in7);

+    // Work on first four results

+    {

+      // Add/substract

+      const __m128i r0 = _mm_add_epi16(q0, q3);

+      const __m128i r1 = _mm_add_epi16(q1, q2);

+      const __m128i r2 = _mm_sub_epi16(q1, q2);

+      const __m128i r3 = _mm_sub_epi16(q0, q3);

+      // Interleave to do the multiply by constants which gets us into 32bits

+      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);

+      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);

+      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);

+      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);

+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);

+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);

+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);

+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);

+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);

+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);

+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);

+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);

+      // dct_const_round_shift

+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);

+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);

+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);

+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);

+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);

+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);

+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

+      // Combine

+      res0 = _mm_packs_epi32(w0, w1);

+      res4 = _mm_packs_epi32(w2, w3);

+      res2 = _mm_packs_epi32(w4, w5);

+      res6 = _mm_packs_epi32(w6, w7);

+    }

+    // Work on next four results

+    {

+      // Interleave to do the multiply by constants which gets us into 32bits

+      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);

+      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);

+      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);

+      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);

+      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);

+      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);

+      // dct_const_round_shift

+      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);

+      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);

+      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);

+      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);

+      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);

+      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);

+      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);

+      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);

+      // Combine

+      const __m128i r0 = _mm_packs_epi32(s0, s1);

+      const __m128i r1 = _mm_packs_epi32(s2, s3);

+      // Add/substract

+      const __m128i x0 = _mm_add_epi16(q4, r0);

+      const __m128i x1 = _mm_sub_epi16(q4, r0);

+      const __m128i x2 = _mm_sub_epi16(q7, r1);

+      const __m128i x3 = _mm_add_epi16(q7, r1);

+      // Interleave to do the multiply by constants which gets us into 32bits

+      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);

+      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);

+      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);

+      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);

+      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);

+      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);

+      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);

+      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);

+      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);

+      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);

+      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);

+      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);

+      // dct_const_round_shift

+      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);

+      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);

+      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);

+      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

+      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);

+      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);

+      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);

+      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

+      // Combine

+      res1 = _mm_packs_epi32(w0, w1);

+      res7 = _mm_packs_epi32(w2, w3);

+      res5 = _mm_packs_epi32(w4, w5);

+      res3 = _mm_packs_epi32(w6, w7);

+    }

+    // Transpose the 8x8.

+    {

+      // 00 01 02 03 04 05 06 07

+      // 10 11 12 13 14 15 16 17

+      // 20 21 22 23 24 25 26 27

+      // 30 31 32 33 34 35 36 37

+      // 40 41 42 43 44 45 46 47

+      // 50 51 52 53 54 55 56 57

+      // 60 61 62 63 64 65 66 67

+      // 70 71 72 73 74 75 76 77

+      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);

+      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);

+      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);

+      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);

+      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);

+      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);

+      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);

+      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);

+      // 00 10 01 11 02 12 03 13

+      // 20 30 21 31 22 32 23 33

+      // 04 14 05 15 06 16 07 17

+      // 24 34 25 35 26 36 27 37

+      // 40 50 41 51 42 52 43 53

+      // 60 70 61 71 62 72 63 73

+      // 54 54 55 55 56 56 57 57

+      // 64 74 65 75 66 76 67 77

+      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);

+      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);

+      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);

+      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);

+      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);

+      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);

+      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);

+      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

+      // 00 10 20 30 01 11 21 31

+      // 40 50 60 70 41 51 61 71

+      // 02 12 22 32 03 13 23 33

+      // 42 52 62 72 43 53 63 73

+      // 04 14 24 34 05 15 21 36

+      // 44 54 64 74 45 55 61 76

+      // 06 16 26 36 07 17 27 37

+      // 46 56 66 76 47 57 67 77

+      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);

+      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);

+      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);

+      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);

+      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);

+      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);

+      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);

+      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);

+      // 00 10 20 30 40 50 60 70

+      // 01 11 21 31 41 51 61 71

+      // 02 12 22 32 42 52 62 72

+      // 03 13 23 33 43 53 63 73

+      // 04 14 24 34 44 54 64 74

+      // 05 15 25 35 45 55 65 75

+      // 06 16 26 36 46 56 66 76

+      // 07 17 27 37 47 57 67 77

+    }

+  }

+  // Post-condition output and store it

+  {

+    // Post-condition (division by two)

+    //    division of two 16 bits signed numbers using shifts

+    //    n / 2 = (n - (n >> 15)) >> 1

+    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);

+    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);

+    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);

+    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);

+    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);

+    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);

+    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);

+    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);

+    in0 = _mm_sub_epi16(in0, sign_in0);

+    in1 = _mm_sub_epi16(in1, sign_in1);

+    in2 = _mm_sub_epi16(in2, sign_in2);

+    in3 = _mm_sub_epi16(in3, sign_in3);

+    in4 = _mm_sub_epi16(in4, sign_in4);

+    in5 = _mm_sub_epi16(in5, sign_in5);

+    in6 = _mm_sub_epi16(in6, sign_in6);

+    in7 = _mm_sub_epi16(in7, sign_in7);

+    in0 = _mm_srai_epi16(in0, 1);

+    in1 = _mm_srai_epi16(in1, 1);

+    in2 = _mm_srai_epi16(in2, 1);

+    in3 = _mm_srai_epi16(in3, 1);

+    in4 = _mm_srai_epi16(in4, 1);

+    in5 = _mm_srai_epi16(in5, 1);

+    in6 = _mm_srai_epi16(in6, 1);

+    in7 = _mm_srai_epi16(in7, 1);

+    // store results

+    _mm_storeu_si128 ((__m128i *)(output + 0 * 8), in0);

+    _mm_storeu_si128 ((__m128i *)(output + 1 * 8), in1);

+    _mm_storeu_si128 ((__m128i *)(output + 2 * 8), in2);

+    _mm_storeu_si128 ((__m128i *)(output + 3 * 8), in3);

+    _mm_storeu_si128 ((__m128i *)(output + 4 * 8), in4);

+    _mm_storeu_si128 ((__m128i *)(output + 5 * 8), in5);

+    _mm_storeu_si128 ((__m128i *)(output + 6 * 8), in6);

+    _mm_storeu_si128 ((__m128i *)(output + 7 * 8), in7);

+  }

+}

+void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {

+  // The 2D transform is done with two passes which are actually pretty

+  // similar. In the first one, we transform the columns and transpose

+  // the results. In the second one, we transform the rows. To achieve that,

+  // as the first pass results are transposed, we tranpose the columns (that

+  // is the transposed rows) and transpose the results (so that it goes back

+  // in normal/row positions).

+  const int stride = pitch >> 1;

+  int pass;

+  // We need an intermediate buffer between passes.

+  int16_t intermediate[256];

+  int16_t *in = input;

+  int16_t *out = intermediate;

+  // Constants

+  //    When we use them, in one case, they are all the same. In all others

+  //    it's a pair of them that we need to repeat four times. This is done

+  //    by constructing the 32 bit constant corresponding to that pair.

+  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);

+  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);

+  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);

+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);

+  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);

+  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);

+  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);

+  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);

+  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);

+  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);

+  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);

+  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);

+  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);

+  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);

+  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);

+  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);

+  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);

+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

+  const __m128i kOne = _mm_set1_epi16(1);

+  // Do the two transform/transpose passes

+  for (pass = 0; pass < 2; ++pass) {

+    // We process eight columns (transposed rows in second pass) at a time.

+    int column_start;

+    for (column_start = 0; column_start < 16; column_start += 8) {

+      __m128i in00, in01, in02, in03, in04, in05, in06, in07;

+      __m128i in08, in09, in10, in11, in12, in13, in14, in15;

+      __m128i input0, input1, input2, input3, input4, input5, input6, input7;

+      __m128i step1_0, step1_1, step1_2, step1_3;

+      __m128i step1_4, step1_5, step1_6, step1_7;

+      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;

+      __m128i step3_0, step3_1, step3_2, step3_3;

+      __m128i step3_4, step3_5, step3_6, step3_7;

+      __m128i res00, res01, res02, res03, res04, res05, res06, res07;

+      __m128i res08, res09, res10, res11, res12, res13, res14, res15;

+      // Load and pre-condition input.

+      if (0 == pass) {

+        in00  = _mm_loadu_si128((const __m128i *)(in +  0 * stride));

+        in01  = _mm_loadu_si128((const __m128i *)(in +  1 * stride));

+        in02  = _mm_loadu_si128((const __m128i *)(in +  2 * stride));

+        in03  = _mm_loadu_si128((const __m128i *)(in +  3 * stride));

+        in04  = _mm_loadu_si128((const __m128i *)(in +  4 * stride));

+        in05  = _mm_loadu_si128((const __m128i *)(in +  5 * stride));

+        in06  = _mm_loadu_si128((const __m128i *)(in +  6 * stride));

+        in07  = _mm_loadu_si128((const __m128i *)(in +  7 * stride));

+        in08  = _mm_loadu_si128((const __m128i *)(in +  8 * stride));

+        in09  = _mm_loadu_si128((const __m128i *)(in +  9 * stride));

+        in10  = _mm_loadu_si128((const __m128i *)(in + 10 * stride));

+        in11  = _mm_loadu_si128((const __m128i *)(in + 11 * stride));

+        in12  = _mm_loadu_si128((const __m128i *)(in + 12 * stride));

+        in13  = _mm_loadu_si128((const __m128i *)(in + 13 * stride));

+        in14  = _mm_loadu_si128((const __m128i *)(in + 14 * stride));

+        in15  = _mm_loadu_si128((const __m128i *)(in + 15 * stride));

+        // x = x << 2

+        in00 = _mm_slli_epi16(in00, 2);

+        in01 = _mm_slli_epi16(in01, 2);

+        in02 = _mm_slli_epi16(in02, 2);

+        in03 = _mm_slli_epi16(in03, 2);

+        in04 = _mm_slli_epi16(in04, 2);

+        in05 = _mm_slli_epi16(in05, 2);

+        in06 = _mm_slli_epi16(in06, 2);

+        in07 = _mm_slli_epi16(in07, 2);

+        in08 = _mm_slli_epi16(in08, 2);

+        in09 = _mm_slli_epi16(in09, 2);

+        in10 = _mm_slli_epi16(in10, 2);

+        in11 = _mm_slli_epi16(in11, 2);

+        in12 = _mm_slli_epi16(in12, 2);

+        in13 = _mm_slli_epi16(in13, 2);

+        in14 = _mm_slli_epi16(in14, 2);

+        in15 = _mm_slli_epi16(in15, 2);

+      } else {

+        in00  = _mm_loadu_si128((const __m128i *)(in +  0 * 16));

+        in01  = _mm_loadu_si128((const __m128i *)(in +  1 * 16));

+        in02  = _mm_loadu_si128((const __m128i *)(in +  2 * 16));

+        in03  = _mm_loadu_si128((const __m128i *)(in +  3 * 16));

+        in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 16));

+        in05  = _mm_loadu_si128((const __m128i *)(in +  5 * 16));

+        in06  = _mm_loadu_si128((const __m128i *)(in +  6 * 16));

+        in07  = _mm_loadu_si128((const __m128i *)(in +  7 * 16));

+        in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 16));

+        in09  = _mm_loadu_si128((const __m128i *)(in +  9 * 16));

+        in10  = _mm_loadu_si128((const __m128i *)(in + 10 * 16));

+        in11  = _mm_loadu_si128((const __m128i *)(in + 11 * 16));

+        in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 16));

+        in13  = _mm_loadu_si128((const __m128i *)(in + 13 * 16));

+        in14  = _mm_loadu_si128((const __m128i *)(in + 14 * 16));

+        in15  = _mm_loadu_si128((const __m128i *)(in + 15 * 16));

+        // x = (x + 1) >> 2

+        in00 = _mm_add_epi16(in00, kOne);

+        in01 = _mm_add_epi16(in01, kOne);

+        in02 = _mm_add_epi16(in02, kOne);

+        in03 = _mm_add_epi16(in03, kOne);

+        in04 = _mm_add_epi16(in04, kOne);

+        in05 = _mm_add_epi16(in05, kOne);

+        in06 = _mm_add_epi16(in06, kOne);

+        in07 = _mm_add_epi16(in07, kOne);

+        in08 = _mm_add_epi16(in08, kOne);

+        in09 = _mm_add_epi16(in09, kOne);

+        in10 = _mm_add_epi16(in10, kOne);

+        in11 = _mm_add_epi16(in11, kOne);

+        in12 = _mm_add_epi16(in12, kOne);

+        in13 = _mm_add_epi16(in13, kOne);

+        in14 = _mm_add_epi16(in14, kOne);

+        in15 = _mm_add_epi16(in15, kOne);

+        in00 = _mm_srai_epi16(in00, 2);

+        in01 = _mm_srai_epi16(in01, 2);

+        in02 = _mm_srai_epi16(in02, 2);

+        in03 = _mm_srai_epi16(in03, 2);

+        in04 = _mm_srai_epi16(in04, 2);

+        in05 = _mm_srai_epi16(in05, 2);

+        in06 = _mm_srai_epi16(in06, 2);

+        in07 = _mm_srai_epi16(in07, 2);

+        in08 = _mm_srai_epi16(in08, 2);

+        in09 = _mm_srai_epi16(in09, 2);

+        in10 = _mm_srai_epi16(in10, 2);

+        in11 = _mm_srai_epi16(in11, 2);

+        in12 = _mm_srai_epi16(in12, 2);

+        in13 = _mm_srai_epi16(in13, 2);

+        in14 = _mm_srai_epi16(in14, 2);

+        in15 = _mm_srai_epi16(in15, 2);

+      }

+      in += 8;

+      // Calculate input for the first 8 results.

+      {

+        input0 = _mm_add_epi16(in00, in15);

+        input1 = _mm_add_epi16(in01, in14);

+        input2 = _mm_add_epi16(in02, in13);

+        input3 = _mm_add_epi16(in03, in12);

+        input4 = _mm_add_epi16(in04, in11);

+        input5 = _mm_add_epi16(in05, in10);

+        input6 = _mm_add_epi16(in06, in09);

+        input7 = _mm_add_epi16(in07, in08);

+      }

+      // Calculate input for the next 8 results.

+      {

+        step1_0 = _mm_sub_epi16(in07, in08);

+        step1_1 = _mm_sub_epi16(in06, in09);

+        step1_2 = _mm_sub_epi16(in05, in10);

+        step1_3 = _mm_sub_epi16(in04, in11);

+        step1_4 = _mm_sub_epi16(in03, in12);

+        step1_5 = _mm_sub_epi16(in02, in13);

+        step1_6 = _mm_sub_epi16(in01, in14);

+        step1_7 = _mm_sub_epi16(in00, in15);

+      }

+      // Work on the first eight values; fdct8_1d(input, even_results);

+      {

+        // Add/substract

+        const __m128i q0 = _mm_add_epi16(input0, input7);

+        const __m128i q1 = _mm_add_epi16(input1, input6);

+        const __m128i q2 = _mm_add_epi16(input2, input5);

+        const __m128i q3 = _mm_add_epi16(input3, input4);

+        const __m128i q4 = _mm_sub_epi16(input3, input4);

+        const __m128i q5 = _mm_sub_epi16(input2, input5);

+        const __m128i q6 = _mm_sub_epi16(input1, input6);

+        const __m128i q7 = _mm_sub_epi16(input0, input7);

+        // Work on first four results

+        {

+          // Add/substract

+          const __m128i r0 = _mm_add_epi16(q0, q3);

+          const __m128i r1 = _mm_add_epi16(q1, q2);

+          const __m128i r2 = _mm_sub_epi16(q1, q2);

+          const __m128i r3 = _mm_sub_epi16(q0, q3);

+          // Interleave to do the multiply by constants which gets us

+          // into 32 bits.

+          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);

+          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);

+          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);

+          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);

+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);

+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);

+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);

+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);

+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);

+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);

+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);

+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);

+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);

+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);

+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);

+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

+          // Combine

+          res00 = _mm_packs_epi32(w0, w1);

+          res08 = _mm_packs_epi32(w2, w3);

+          res04 = _mm_packs_epi32(w4, w5);

+          res12 = _mm_packs_epi32(w6, w7);

+        }

+        // Work on next four results

+        {

+          // Interleave to do the multiply by constants which gets us

+          // into 32 bits.

+          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);

+          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);

+          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);

+          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);

+          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);

+          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);

+          // dct_const_round_shift

+          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);

+          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);

+          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);

+          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);

+          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);

+          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);

+          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);

+          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);

+          // Combine

+          const __m128i r0 = _mm_packs_epi32(s0, s1);

+          const __m128i r1 = _mm_packs_epi32(s2, s3);

+          // Add/substract

+          const __m128i x0 = _mm_add_epi16(q4, r0);

+          const __m128i x1 = _mm_sub_epi16(q4, r0);

+          const __m128i x2 = _mm_sub_epi16(q7, r1);

+          const __m128i x3 = _mm_add_epi16(q7, r1);

+          // Interleave to do the multiply by constants which gets us

+          // into 32 bits.

+          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);

+          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);

+          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);

+          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);

+          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);

+          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);

+          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);

+          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);

+          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);

+          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);

+          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);

+          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);

+          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);

+          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);

+          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);

+          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

+          // Combine

+          res02 = _mm_packs_epi32(w0, w1);

+          res14 = _mm_packs_epi32(w2, w3);

+          res10 = _mm_packs_epi32(w4, w5);

+          res06 = _mm_packs_epi32(w6, w7);

+        }

+      }

+      // Work on the next eight values; step1 -> odd_results

+      {

+        // step 2

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);

+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);

+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);

+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          step2_2 = _mm_packs_epi32(w0, w1);

+          step2_3 = _mm_packs_epi32(w2, w3);

+        }

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);

+          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);

+          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);

+          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          step2_5 = _mm_packs_epi32(w0, w1);

+          step2_4 = _mm_packs_epi32(w2, w3);

+        }

+        // step 3

+        {

+          step3_0 = _mm_add_epi16(step1_0, step2_3);

+          step3_1 = _mm_add_epi16(step1_1, step2_2);

+          step3_2 = _mm_sub_epi16(step1_1, step2_2);

+          step3_3 = _mm_sub_epi16(step1_0, step2_3);

+          step3_4 = _mm_sub_epi16(step1_7, step2_4);

+          step3_5 = _mm_sub_epi16(step1_6, step2_5);

+          step3_6 = _mm_add_epi16(step1_6, step2_5);

+          step3_7 = _mm_add_epi16(step1_7, step2_4);

+        }

+        // step 4

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);

+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);

+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);

+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          step2_1 = _mm_packs_epi32(w0, w1);

+          step2_2 = _mm_packs_epi32(w2, w3);

+        }

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);

+          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);

+          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);

+          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          step2_6 = _mm_packs_epi32(w0, w1);

+          step2_5 = _mm_packs_epi32(w2, w3);

+        }

+        // step 5

+        {

+          step1_0 = _mm_add_epi16(step3_0, step2_1);

+          step1_1 = _mm_sub_epi16(step3_0, step2_1);

+          step1_2 = _mm_sub_epi16(step3_3, step2_2);

+          step1_3 = _mm_add_epi16(step3_3, step2_2);

+          step1_4 = _mm_add_epi16(step3_4, step2_5);

+          step1_5 = _mm_sub_epi16(step3_4, step2_5);

+          step1_6 = _mm_sub_epi16(step3_7, step2_6);

+          step1_7 = _mm_add_epi16(step3_7, step2_6);

+        }

+        // step 6

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);

+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);

+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);

+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          res01 = _mm_packs_epi32(w0, w1);

+          res09 = _mm_packs_epi32(w2, w3);

+        }

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);

+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);

+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);

+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          res05 = _mm_packs_epi32(w0, w1);

+          res13 = _mm_packs_epi32(w2, w3);

+        }

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);

+          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);

+          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);

+          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          res11 = _mm_packs_epi32(w0, w1);

+          res03 = _mm_packs_epi32(w2, w3);

+        }

+        {

+          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);

+          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);

+          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);

+          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);

+          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);

+          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);

+          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);

+          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);

+          // dct_const_round_shift

+          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);

+          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);

+          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);

+          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);

+          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);

+          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);

+          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);

+          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);

+          // Combine

+          res15 = _mm_packs_epi32(w0, w1);

+          res07 = _mm_packs_epi32(w2, w3);

+        }

+      }

+      // Transpose the results, do it as two 8x8 transposes.

+      {

+        // 00 01 02 03 04 05 06 07

+        // 10 11 12 13 14 15 16 17

+        // 20 21 22 23 24 25 26 27

+        // 30 31 32 33 34 35 36 37

+        // 40 41 42 43 44 45 46 47

+        // 50 51 52 53 54 55 56 57

+        // 60 61 62 63 64 65 66 67

+        // 70 71 72 73 74 75 76 77

+        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);

+        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);

+        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);

+        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);

+        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);

+        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);

+        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);

+        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);

+        // 00 10 01 11 02 12 03 13

+        // 20 30 21 31 22 32 23 33

+        // 04 14 05 15 06 16 07 17

+        // 24 34 25 35 26 36 27 37

+        // 40 50 41 51 42 52 43 53

+        // 60 70 61 71 62 72 63 73

+        // 54 54 55 55 56 56 57 57

+        // 64 74 65 75 66 76 67 77

+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);

+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);

+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);

+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);

+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);

+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);

+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);

+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

+        // 00 10 20 30 01 11 21 31

+        // 40 50 60 70 41 51 61 71

+        // 02 12 22 32 03 13 23 33

+        // 42 52 62 72 43 53 63 73

+        // 04 14 24 34 05 15 21 36

+        // 44 54 64 74 45 55 61 76

+        // 06 16 26 36 07 17 27 37

+        // 46 56 66 76 47 57 67 77

+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);

+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);

+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);

+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);

+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);

+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);

+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);

+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);

+        // 00 10 20 30 40 50 60 70

+        // 01 11 21 31 41 51 61 71

+        // 02 12 22 32 42 52 62 72

+        // 03 13 23 33 43 53 63 73

+        // 04 14 24 34 44 54 64 74

+        // 05 15 25 35 45 55 65 75

+        // 06 16 26 36 46 56 66 76

+        // 07 17 27 37 47 57 67 77

+        _mm_storeu_si128 ((__m128i *)(out + 0 * 16), tr2_0);

+        _mm_storeu_si128 ((__m128i *)(out + 1 * 16), tr2_1);

+        _mm_storeu_si128 ((__m128i *)(out + 2 * 16), tr2_2);

+        _mm_storeu_si128 ((__m128i *)(out + 3 * 16), tr2_3);

+        _mm_storeu_si128 ((__m128i *)(out + 4 * 16), tr2_4);

+        _mm_storeu_si128 ((__m128i *)(out + 5 * 16), tr2_5);

+        _mm_storeu_si128 ((__m128i *)(out + 6 * 16), tr2_6);

+        _mm_storeu_si128 ((__m128i *)(out + 7 * 16), tr2_7);

+      }

+      {

+        // 00 01 02 03 04 05 06 07

+        // 10 11 12 13 14 15 16 17

+        // 20 21 22 23 24 25 26 27

+        // 30 31 32 33 34 35 36 37

+        // 40 41 42 43 44 45 46 47

+        // 50 51 52 53 54 55 56 57

+        // 60 61 62 63 64 65 66 67

+        // 70 71 72 73 74 75 76 77

+        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);

+        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);

+        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);

+        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);

+        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);

+        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);

+        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);

+        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);

+        // 00 10 01 11 02 12 03 13

+        // 20 30 21 31 22 32 23 33

+        // 04 14 05 15 06 16 07 17

+        // 24 34 25 35 26 36 27 37

+        // 40 50 41 51 42 52 43 53

+        // 60 70 61 71 62 72 63 73

+        // 54 54 55 55 56 56 57 57

+        // 64 74 65 75 66 76 67 77

+        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);

+        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);

+        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);

+        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);

+        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);

+        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);

+        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);

+        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

+        // 00 10 20 30 01 11 21 31

+        // 40 50 60 70 41 51 61 71

+        // 02 12 22 32 03 13 23 33

+        // 42 52 62 72 43 53 63 73

+        // 04 14 24 34 05 15 21 36

+        // 44 54 64 74 45 55 61 76

+        // 06 16 26 36 07 17 27 37

+        // 46 56 66 76 47 57 67 77

+        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);

+        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);

+        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);

+        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);

+        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);

+        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);

+        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);

+        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);

+        // 00 10 20 30 40 50 60 70

+        // 01 11 21 31 41 51 61 71

+        // 02 12 22 32 42 52 62 72

+        // 03 13 23 33 43 53 63 73

+        // 04 14 24 34 44 54 64 74

+        // 05 15 25 35 45 55 65 75

+        // 06 16 26 36 46 56 66 76

+        // 07 17 27 37 47 57 67 77

+        // Store results

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 0 * 16), tr2_0);

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 1 * 16), tr2_1);

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 2 * 16), tr2_2);

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 3 * 16), tr2_3);

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 4 * 16), tr2_4);

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 5 * 16), tr2_5);

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 6 * 16), tr2_6);

+        _mm_storeu_si128 ((__m128i *)(out + 8 + 7 * 16), tr2_7);

+      }

+      out += 8*16;

+    }

+    // Setup in/out for next pass.

+    in = intermediate;

+    out = output;

+  }

+}

--- a/vp9/encoder/x86/vp9_encodeopt.asm

+++ b/vp9/encoder/x86/vp9_encodeopt.asm

@@ -125,7 +125,7 @@

ret

-;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

+;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);

 global sym(vp9_mbblock_error_mmx_impl) PRIVATE

 sym(vp9_mbblock_error_mmx_impl):

     push        rbp

@@ -142,10 +142,6 @@

         mov         rdi,        arg(1) ;dcoef_ptr

         pxor        mm2,        mm2

-        movd        mm1,        dword ptr arg(2) ;dc

-        por         mm1,        mm2

-        pcmpeqw     mm1,        mm7

         mov         rcx,        16

 .mberror_loop_mmx:

@@ -160,7 +156,6 @@

         pmaddwd     mm5,        mm5

         psubw       mm3,        mm4

-        pand        mm3,        mm1

         pmaddwd     mm3,        mm3

         paddd       mm2,        mm5

@@ -202,13 +197,13 @@

ret

-;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

+;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);

 global sym(vp9_mbblock_error_xmm_impl) PRIVATE

 sym(vp9_mbblock_error_xmm_impl):

     push        rbp

     mov         rbp, rsp

     SHADOW_ARGS_TO_STACK 3

-    SAVE_XMM 6

+    SAVE_XMM 5

     push rsi

     push rdi

     ; end prolog

@@ -215,15 +210,11 @@

         mov         rsi,        arg(0) ;coeff_ptr

-        pxor        xmm6,       xmm6

+        pxor        xmm5,       xmm5

         mov         rdi,        arg(1) ;dcoef_ptr

         pxor        xmm4,       xmm4

-        movd        xmm5,       dword ptr arg(2) ;dc

-        por         xmm5,       xmm4

-        pcmpeqw     xmm5,       xmm6

         mov         rcx,        16

 .mberror_loop:

@@ -238,7 +229,6 @@

         pmaddwd     xmm2,       xmm2

         psubw       xmm0,       xmm1

-        pand        xmm0,       xmm5

         pmaddwd     xmm0,       xmm0

         add         rsi,        32

@@ -252,9 +242,9 @@

         jnz         .mberror_loop

         movdqa      xmm0,       xmm4

-        punpckldq   xmm0,       xmm6

+        punpckldq   xmm0,       xmm5

-        punpckhdq   xmm4,       xmm6

+        punpckhdq   xmm4,       xmm5

         paddd       xmm0,       xmm4

         movdqa      xmm1,       xmm0

--- /dev/null

+++ b/vp9/encoder/x86/vp9_sad4d_sse2.asm

@@ -1,0 +1,225 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "third_party/x86inc/x86inc.asm"

+SECTION .text

+; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end

+%macro PROCESS_4x2x4 5-6 0

+  movd                  m0, [srcq +%2]

+%if %1 == 1

+  movd                  m6, [ref1q+%3]

+  movd                  m4, [ref2q+%3]

+  movd                  m7, [ref3q+%3]

+  movd                  m5, [ref4q+%3]

+  punpckldq             m0, [srcq +%4]

+  punpckldq             m6, [ref1q+%5]

+  punpckldq             m4, [ref2q+%5]

+  punpckldq             m7, [ref3q+%5]

+  punpckldq             m5, [ref4q+%5]

+  psadbw                m6, m0

+  psadbw                m4, m0

+  psadbw                m7, m0

+  psadbw                m5, m0

+  punpckldq             m6, m4

+  punpckldq             m7, m5

+%else

+  movd                  m1, [ref1q+%3]

+  movd                  m2, [ref2q+%3]

+  movd                  m3, [ref3q+%3]

+  movd                  m4, [ref4q+%3]

+  punpckldq             m0, [srcq +%4]

+  punpckldq             m1, [ref1q+%5]

+  punpckldq             m2, [ref2q+%5]

+  punpckldq             m3, [ref3q+%5]

+  punpckldq             m4, [ref4q+%5]

+  psadbw                m1, m0

+  psadbw                m2, m0

+  psadbw                m3, m0

+  psadbw                m4, m0

+  punpckldq             m1, m2

+  punpckldq             m3, m4

+  paddd                 m6, m1

+  paddd                 m7, m3

+%endif

+%if %6 == 1

+  lea                 srcq, [srcq +src_strideq*2]

+  lea                ref1q, [ref1q+ref_strideq*2]

+  lea                ref2q, [ref2q+ref_strideq*2]

+  lea                ref3q, [ref3q+ref_strideq*2]

+  lea                ref4q, [ref4q+ref_strideq*2]

+%endif

+%endmacro

+; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end

+%macro PROCESS_8x2x4 5-6 0

+  movh                  m0, [srcq +%2]

+%if %1 == 1

+  movh                  m4, [ref1q+%3]

+  movh                  m5, [ref2q+%3]

+  movh                  m6, [ref3q+%3]

+  movh                  m7, [ref4q+%3]

+  movhps                m0, [srcq +%4]

+  movhps                m4, [ref1q+%5]

+  movhps                m5, [ref2q+%5]

+  movhps                m6, [ref3q+%5]

+  movhps                m7, [ref4q+%5]

+  psadbw                m4, m0

+  psadbw                m5, m0

+  psadbw                m6, m0

+  psadbw                m7, m0

+%else

+  movh                  m1, [ref1q+%3]

+  movh                  m2, [ref2q+%3]

+  movh                  m3, [ref3q+%3]

+  movhps                m0, [srcq +%4]

+  movhps                m1, [ref1q+%5]

+  movhps                m2, [ref2q+%5]

+  movhps                m3, [ref3q+%5]

+  psadbw                m1, m0

+  psadbw                m2, m0

+  psadbw                m3, m0

+  paddd                 m4, m1

+  movh                  m1, [ref4q+%3]

+  movhps                m1, [ref4q+%5]

+  paddd                 m5, m2

+  paddd                 m6, m3

+  psadbw                m1, m0

+  paddd                 m7, m1

+%endif

+%if %6 == 1

+  lea                 srcq, [srcq +src_strideq*2]

+  lea                ref1q, [ref1q+ref_strideq*2]

+  lea                ref2q, [ref2q+ref_strideq*2]

+  lea                ref3q, [ref3q+ref_strideq*2]

+  lea                ref4q, [ref4q+ref_strideq*2]

+%endif

+%endmacro

+; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end

+%macro PROCESS_16x2x4 5-6 0

+  ; 1st 16 px

+  mova                  m0, [srcq +%2]

+%if %1 == 1

+  movu                  m4, [ref1q+%3]

+  movu                  m5, [ref2q+%3]

+  movu                  m6, [ref3q+%3]

+  movu                  m7, [ref4q+%3]

+  psadbw                m4, m0

+  psadbw                m5, m0

+  psadbw                m6, m0

+  psadbw                m7, m0

+%else

+  movu                  m1, [ref1q+%3]

+  movu                  m2, [ref2q+%3]

+  movu                  m3, [ref3q+%3]

+  psadbw                m1, m0

+  psadbw                m2, m0

+  psadbw                m3, m0

+  paddd                 m4, m1

+  movu                  m1, [ref4q+%3]

+  paddd                 m5, m2

+  paddd                 m6, m3

+  psadbw                m1, m0

+  paddd                 m7, m1

+%endif

+  ; 2nd 16 px

+  mova                  m0, [srcq +%4]

+  movu                  m1, [ref1q+%5]

+  movu                  m2, [ref2q+%5]

+  movu                  m3, [ref3q+%5]

+  psadbw                m1, m0

+  psadbw                m2, m0

+  psadbw                m3, m0

+  paddd                 m4, m1

+  movu                  m1, [ref4q+%5]

+  paddd                 m5, m2

+  paddd                 m6, m3

+%if %6 == 1

+  lea                 srcq, [srcq +src_strideq*2]

+  lea                ref1q, [ref1q+ref_strideq*2]

+  lea                ref2q, [ref2q+ref_strideq*2]

+  lea                ref3q, [ref3q+ref_strideq*2]

+  lea                ref4q, [ref4q+ref_strideq*2]

+%endif

+  psadbw                m1, m0

+  paddd                 m7, m1

+%endmacro

+; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end

+%macro PROCESS_32x2x4 5-6 0

+  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16

+  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6

+%endmacro

+; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end

+%macro PROCESS_64x2x4 5-6 0

+  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32

+  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6

+%endmacro

+; void vp9_sadNxNx4d_sse2(uint8_t *src,    int src_stride,

+;                         uint8_t *ref[4], int ref_stride,

+;                         unsigned int res[4]);

+; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8

+%macro SADNXN4D 2

+%if UNIX64

+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \

+                              res, ref2, ref3, ref4

+%else

+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \

+                              ref2, ref3, ref4

+%endif

+  movsxdifnidn src_strideq, src_strided

+  movsxdifnidn ref_strideq, ref_strided

+  mov                ref2q, [ref1q+gprsize*1]

+  mov                ref3q, [ref1q+gprsize*2]

+  mov                ref4q, [ref1q+gprsize*3]

+  mov                ref1q, [ref1q+gprsize*0]

+  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1

+%rep (%2-4)/2

+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1

+%endrep

+  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

+%if mmsize == 16

+  pslldq                m5, 4

+  pslldq                m7, 4

+  por                   m4, m5

+  por                   m6, m7

+  mova                  m5, m4

+  mova                  m7, m6

+  punpcklqdq            m4, m6

+  punpckhqdq            m5, m7

+  movifnidn             r4, r4mp

+  paddd                 m4, m5

+  movu                [r4], m4

+  RET

+%else

+  movifnidn             r4, r4mp

+  movq               [r4+0], m6

+  movq               [r4+8], m7

+  RET

+%endif

+%endmacro

+INIT_XMM sse2

+SADNXN4D 64, 64

+SADNXN4D 32, 32

+SADNXN4D 16, 16

+SADNXN4D 16,  8

+SADNXN4D  8, 16

+SADNXN4D  8,  8

+INIT_MMX sse

+SADNXN4D  4,  4

--- a/vp9/encoder/x86/vp9_sad_sse2.asm

+++ b/vp9/encoder/x86/vp9_sad_sse2.asm

@@ -8,403 +8,175 @@

 ;  be found in the AUTHORS file in the root of the source tree.

+%include "third_party/x86inc/x86inc.asm"

-%include "vpx_ports/x86_abi_support.asm"

+SECTION .text

-;unsigned int vp9_sad16x16_wmt(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride)

-global sym(vp9_sad16x16_wmt) PRIVATE

-sym(vp9_sad16x16_wmt):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 4

-    SAVE_XMM 6

-    push        rsi

-    push        rdi

-    ; end prolog

+; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,

+;                                uint8_t *ref, int ref_stride);

+INIT_XMM sse2

+cglobal sad64x64, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows

+  movsxdifnidn src_strideq, src_strided

+  movsxdifnidn ref_strideq, ref_strided

+  mov              n_rowsd, 64

+  pxor                  m0, m0

+.loop:

+  movu                  m1, [refq]

+  movu                  m2, [refq+16]

+  movu                  m3, [refq+32]

+  movu                  m4, [refq+48]

+  psadbw                m1, [srcq]

+  psadbw                m2, [srcq+16]

+  psadbw                m3, [srcq+32]

+  psadbw                m4, [srcq+48]

+  paddd                 m1, m2

+  paddd                 m3, m4

+  add                 refq, ref_strideq

+  paddd                 m0, m1

+  add                 srcq, src_strideq

+  paddd                 m0, m3

+  dec              n_rowsd

+  jg .loop

-        mov             rsi,        arg(0) ;src_ptr

-        mov             rdi,        arg(2) ;ref_ptr

+  movhlps               m1, m0

+  paddd                 m0, m1

+  movd                 eax, m0

+  RET

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

+; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,

+;                                uint8_t *ref, int ref_stride);

+INIT_XMM sse2

+cglobal sad32x32, 4, 5, 5, src, src_stride, ref, ref_stride, n_rows

+  movsxdifnidn src_strideq, src_strided

+  movsxdifnidn ref_strideq, ref_strided

+  mov              n_rowsd, 16

+  pxor                  m0, m0

-        lea             rcx,        [rsi+rax*8]

+.loop:

+  movu                  m1, [refq]

+  movu                  m2, [refq+16]

+  movu                  m3, [refq+ref_strideq]

+  movu                  m4, [refq+ref_strideq+16]

+  psadbw                m1, [srcq]

+  psadbw                m2, [srcq+16]

+  psadbw                m3, [srcq+src_strideq]

+  psadbw                m4, [srcq+src_strideq+16]

+  paddd                 m1, m2

+  paddd                 m3, m4

+  lea                 refq, [refq+ref_strideq*2]

+  paddd                 m0, m1

+  lea                 srcq, [srcq+src_strideq*2]

+  paddd                 m0, m3

+  dec              n_rowsd

+  jg .loop

-        lea             rcx,        [rcx+rax*8]

-        pxor            xmm6,       xmm6

+  movhlps               m1, m0

+  paddd                 m0, m1

+  movd                 eax, m0

+  RET

-.x16x16sad_wmt_loop:

+; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,

+;                                    uint8_t *ref, int ref_stride);

+%macro SAD16XN 1

+cglobal sad16x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \

+                           src_stride3, ref_stride3, n_rows

+  movsxdifnidn src_strideq, src_strided

+  movsxdifnidn ref_strideq, ref_strided

+  lea         src_stride3q, [src_strideq*3]

+  lea         ref_stride3q, [ref_strideq*3]

+  mov              n_rowsd, %1/4

+  pxor                  m0, m0

-        movq            xmm0,       QWORD PTR [rsi]

-        movq            xmm2,       QWORD PTR [rsi+8]

+.loop:

+  movu                  m1, [refq]

+  movu                  m2, [refq+ref_strideq]

+  movu                  m3, [refq+ref_strideq*2]

+  movu                  m4, [refq+ref_stride3q]

+  psadbw                m1, [srcq]

+  psadbw                m2, [srcq+src_strideq]

+  psadbw                m3, [srcq+src_strideq*2]

+  psadbw                m4, [srcq+src_stride3q]

+  paddd                 m1, m2

+  paddd                 m3, m4

+  lea                 refq, [refq+ref_strideq*4]

+  paddd                 m0, m1

+  lea                 srcq, [srcq+src_strideq*4]

+  paddd                 m0, m3

+  dec              n_rowsd

+  jg .loop

-        movq            xmm1,       QWORD PTR [rdi]

-        movq            xmm3,       QWORD PTR [rdi+8]

+  movhlps               m1, m0

+  paddd                 m0, m1

+  movd                 eax, m0

+  RET

+%endmacro

-        movq            xmm4,       QWORD PTR [rsi+rax]

-        movq            xmm5,       QWORD PTR [rdi+rdx]

+INIT_XMM sse2

+SAD16XN 16 ; sad16x16_sse2

+SAD16XN  8 ; sad16x8_sse2

+; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,

+;                                   uint8_t *ref, int ref_stride);

+%macro SAD8XN 1

+cglobal sad8x%1, 4, 7, 5, src, src_stride, ref, ref_stride, \

+                          src_stride3, ref_stride3, n_rows

+  movsxdifnidn src_strideq, src_strided

+  movsxdifnidn ref_strideq, ref_strided

+  lea         src_stride3q, [src_strideq*3]

+  lea         ref_stride3q, [ref_strideq*3]

+  mov              n_rowsd, %1/4

+  pxor                  m0, m0

-        punpcklbw       xmm0,       xmm2

-        punpcklbw       xmm1,       xmm3

+.loop:

+  movh                  m1, [refq]

+  movhps                m1, [refq+ref_strideq]

+  movh                  m2, [refq+ref_strideq*2]

+  movhps                m2, [refq+ref_stride3q]

+  movh                  m3, [srcq]

+  movhps                m3, [srcq+src_strideq]

+  movh                  m4, [srcq+src_strideq*2]

+  movhps                m4, [srcq+src_stride3q]

+  psadbw                m1, m3

+  psadbw                m2, m4

+  lea                 refq, [refq+ref_strideq*4]

+  paddd                 m0, m1

+  lea                 srcq, [srcq+src_strideq*4]

+  paddd                 m0, m2

+  dec              n_rowsd

+  jg .loop

-        psadbw          xmm0,       xmm1

-        movq            xmm2,       QWORD PTR [rsi+rax+8]

+  movhlps               m1, m0

+  paddd                 m0, m1

+  movd                 eax, m0

+  RET

+%endmacro

-        movq            xmm3,       QWORD PTR [rdi+rdx+8]

-        lea             rsi,        [rsi+rax*2]

+INIT_XMM sse2

+SAD8XN 16 ; sad8x16_sse2

+SAD8XN  8 ; sad8x8_sse2

-        lea             rdi,        [rdi+rdx*2]

-        punpcklbw       xmm4,       xmm2

-        punpcklbw       xmm5,       xmm3

-        psadbw          xmm4,       xmm5

-        paddw           xmm6,       xmm0

-        paddw           xmm6,       xmm4

-        cmp             rsi,        rcx

-        jne             .x16x16sad_wmt_loop

-        movq            xmm0,       xmm6

-        psrldq          xmm6,       8

-        paddw           xmm0,       xmm6

-        movq            rax,        xmm0

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;unsigned int vp9_sad8x16_wmt(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride,

-;    int  max_err)

-global sym(vp9_sad8x16_wmt) PRIVATE

-sym(vp9_sad8x16_wmt):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    push        rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov             rsi,        arg(0) ;src_ptr

-        mov             rdi,        arg(2) ;ref_ptr

-        movsxd          rbx,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        lea             rcx,        [rsi+rbx*8]

-        lea             rcx,        [rcx+rbx*8]

-        pxor            mm7,        mm7

-.x8x16sad_wmt_loop:

-        movq            rax,        mm7

-        cmp             eax,        arg(4)

-        jg              .x8x16sad_wmt_early_exit

-        movq            mm0,        QWORD PTR [rsi]

-        movq            mm1,        QWORD PTR [rdi]

-        movq            mm2,        QWORD PTR [rsi+rbx]

-        movq            mm3,        QWORD PTR [rdi+rdx]

-        psadbw          mm0,        mm1

-        psadbw          mm2,        mm3

-        lea             rsi,        [rsi+rbx*2]

-        lea             rdi,        [rdi+rdx*2]

-        paddw           mm7,        mm0

-        paddw           mm7,        mm2

-        cmp             rsi,        rcx

-        jne             .x8x16sad_wmt_loop

-        movq            rax,        mm7

-.x8x16sad_wmt_early_exit:

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    pop         rbx

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;unsigned int vp9_sad8x8_wmt(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride)

-global sym(vp9_sad8x8_wmt) PRIVATE

-sym(vp9_sad8x8_wmt):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    push        rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov             rsi,        arg(0) ;src_ptr

-        mov             rdi,        arg(2) ;ref_ptr

-        movsxd          rbx,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        lea             rcx,        [rsi+rbx*8]

-        pxor            mm7,        mm7

-.x8x8sad_wmt_loop:

-        movq            rax,        mm7

-        cmp             eax,        arg(4)

-        jg              .x8x8sad_wmt_early_exit

-        movq            mm0,        QWORD PTR [rsi]

-        movq            mm1,        QWORD PTR [rdi]

-        psadbw          mm0,        mm1

-        lea             rsi,        [rsi+rbx]

-        add             rdi,        rdx

-        paddw           mm7,        mm0

-        cmp             rsi,        rcx

-        jne             .x8x8sad_wmt_loop

-        movq            rax,        mm7

-.x8x8sad_wmt_early_exit:

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    pop         rbx

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;unsigned int vp9_sad4x4_wmt(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride)

-global sym(vp9_sad4x4_wmt) PRIVATE

-sym(vp9_sad4x4_wmt):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 4

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov             rsi,        arg(0) ;src_ptr

-        mov             rdi,        arg(2) ;ref_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        movd            mm0,        DWORD PTR [rsi]

-        movd            mm1,        DWORD PTR [rdi]

-        movd            mm2,        DWORD PTR [rsi+rax]

-        movd            mm3,        DWORD PTR [rdi+rdx]

-        punpcklbw       mm0,        mm2

-        punpcklbw       mm1,        mm3

-        psadbw          mm0,        mm1

-        lea             rsi,        [rsi+rax*2]

-        lea             rdi,        [rdi+rdx*2]

-        movd            mm4,        DWORD PTR [rsi]

-        movd            mm5,        DWORD PTR [rdi]

-        movd            mm6,        DWORD PTR [rsi+rax]

-        movd            mm7,        DWORD PTR [rdi+rdx]

-        punpcklbw       mm4,        mm6

-        punpcklbw       mm5,        mm7

-        psadbw          mm4,        mm5

-        paddw           mm0,        mm4

-        movq            rax,        mm0

-    ; begin epilog

-    pop rdi

-    pop rsi

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;unsigned int vp9_sad16x8_wmt(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride)

-global sym(vp9_sad16x8_wmt) PRIVATE

-sym(vp9_sad16x8_wmt):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    push        rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov             rsi,        arg(0) ;src_ptr

-        mov             rdi,        arg(2) ;ref_ptr

-        movsxd          rbx,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        lea             rcx,        [rsi+rbx*8]

-        pxor            mm7,        mm7

-.x16x8sad_wmt_loop:

-        movq            rax,        mm7

-        cmp             eax,        arg(4)

-        jg              .x16x8sad_wmt_early_exit

-        movq            mm0,        QWORD PTR [rsi]

-        movq            mm2,        QWORD PTR [rsi+8]

-        movq            mm1,        QWORD PTR [rdi]

-        movq            mm3,        QWORD PTR [rdi+8]

-        movq            mm4,        QWORD PTR [rsi+rbx]

-        movq            mm5,        QWORD PTR [rdi+rdx]

-        psadbw          mm0,        mm1

-        psadbw          mm2,        mm3

-        movq            mm1,        QWORD PTR [rsi+rbx+8]

-        movq            mm3,        QWORD PTR [rdi+rdx+8]

-        psadbw          mm4,        mm5

-        psadbw          mm1,        mm3

-        lea             rsi,        [rsi+rbx*2]

-        lea             rdi,        [rdi+rdx*2]

-        paddw           mm0,        mm2

-        paddw           mm4,        mm1

-        paddw           mm7,        mm0

-        paddw           mm7,        mm4

-        cmp             rsi,        rcx

-        jne             .x16x8sad_wmt_loop

-        movq            rax,        mm7

-.x16x8sad_wmt_early_exit:

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    pop         rbx

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_copy32xn_sse2(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *dst_ptr,

-;    int  dst_stride,

-;    int height);

-global sym(vp9_copy32xn_sse2) PRIVATE

-sym(vp9_copy32xn_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    SAVE_XMM 7

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov             rsi,        arg(0) ;src_ptr

-        mov             rdi,        arg(2) ;dst_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;dst_stride

-        movsxd          rcx,        dword ptr arg(4) ;height

-.block_copy_sse2_loopx4:

-        movdqu          xmm0,       XMMWORD PTR [rsi]

-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]

-        movdqu          xmm2,       XMMWORD PTR [rsi + rax]

-        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

-        lea             rsi,        [rsi+rax*2]

-        movdqu          xmm4,       XMMWORD PTR [rsi]

-        movdqu          xmm5,       XMMWORD PTR [rsi + 16]

-        movdqu          xmm6,       XMMWORD PTR [rsi + rax]

-        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

-        lea             rsi,    [rsi+rax*2]

-        movdqa          XMMWORD PTR [rdi], xmm0

-        movdqa          XMMWORD PTR [rdi + 16], xmm1

-        movdqa          XMMWORD PTR [rdi + rdx], xmm2

-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

-        lea             rdi,    [rdi+rdx*2]

-        movdqa          XMMWORD PTR [rdi], xmm4

-        movdqa          XMMWORD PTR [rdi + 16], xmm5

-        movdqa          XMMWORD PTR [rdi + rdx], xmm6

-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

-        lea             rdi,    [rdi+rdx*2]

-        sub             rcx,     4

-        cmp             rcx,     4

-        jge             .block_copy_sse2_loopx4

-        cmp             rcx, 0

-        je              .copy_is_done

-.block_copy_sse2_loop:

-        movdqu          xmm0,       XMMWORD PTR [rsi]

-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]

-        lea             rsi,    [rsi+rax]

-        movdqa          XMMWORD PTR [rdi], xmm0

-        movdqa          XMMWORD PTR [rdi + 16], xmm1

-        lea             rdi,    [rdi+rdx]

-        sub             rcx,     1

-        jne             .block_copy_sse2_loop

-.copy_is_done:

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

+; unsigned int vp9_sad4x4_sse(uint8_t *src, int src_stride,

+;                             uint8_t *ref, int ref_stride);

+INIT_MMX sse

+cglobal sad4x4, 4, 4, 8, src, src_stride, ref, ref_stride

+  movsxdifnidn src_strideq, src_strided

+  movsxdifnidn ref_strideq, ref_strided

+  movd                  m0, [refq]

+  movd                  m1, [refq+ref_strideq]

+  movd                  m2, [srcq]

+  movd                  m3, [srcq+src_strideq]

+  lea                 refq, [refq+ref_strideq*2]

+  lea                 srcq, [srcq+src_strideq*2]

+  movd                  m4, [refq]

+  movd                  m5, [refq+ref_strideq]

+  movd                  m6, [srcq]

+  movd                  m7, [srcq+src_strideq]

+  punpckldq             m0, m1

+  punpckldq             m2, m3

+  punpckldq             m4, m5

+  punpckldq             m6, m7

+  psadbw                m0, m2

+  psadbw                m4, m6

+  paddd                 m0, m4

+  movd                 eax, m0

+  RET

--- a/vp9/encoder/x86/vp9_sad_sse3.asm

+++ b/vp9/encoder/x86/vp9_sad_sse3.asm

@@ -83,87 +83,6 @@

ret

 %endmacro

-%macro STACK_FRAME_CREATE_X4 0

-%if ABI_IS_32BIT

-  %define     src_ptr       rsi

-  %define     src_stride    rax

-  %define     r0_ptr        rcx

-  %define     r1_ptr        rdx

-  %define     r2_ptr        rbx

-  %define     r3_ptr        rdi

-  %define     ref_stride    rbp

-  %define     result_ptr    arg(4)

-    push        rbp

-    mov         rbp,        rsp

-    push        rsi

-    push        rdi

-    push        rbx

-    push        rbp

-    mov         rdi,        arg(2)              ; ref_ptr_base

-    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

-    mov         rsi,        arg(0)              ; src_ptr

-    movsxd      rbx,        dword ptr arg(1)    ; src_stride

-    movsxd      rbp,        dword ptr arg(3)    ; ref_stride

-    xchg        rbx,        rax

-%else

-  %if LIBVPX_YASM_WIN64

-    SAVE_XMM 7, u

-    %define     src_ptr     rcx

-    %define     src_stride  rdx

-    %define     r0_ptr      rsi

-    %define     r1_ptr      r10

-    %define     r2_ptr      r11

-    %define     r3_ptr      r8

-    %define     ref_stride  r9

-    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]

-    push        rsi

-    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr

-  %else

-    %define     src_ptr     rdi

-    %define     src_stride  rsi

-    %define     r0_ptr      r9

-    %define     r1_ptr      r10

-    %define     r2_ptr      r11

-    %define     r3_ptr      rdx

-    %define     ref_stride  rcx

-    %define     result_ptr  r8

-    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

-  %endif

-%endif

-%endmacro

-%macro STACK_FRAME_DESTROY_X4 0

-  %define     src_ptr

-  %define     src_stride

-  %define     r0_ptr

-  %define     r1_ptr

-  %define     r2_ptr

-  %define     r3_ptr

-  %define     ref_stride

-  %define     result_ptr

-%if ABI_IS_32BIT

-    pop         rbx

-    pop         rdi

-    pop         rsi

-    pop         rbp

-%else

-  %if LIBVPX_YASM_WIN64

-    pop         rsi

-    RESTORE_XMM

-  %endif

-%endif

-    ret

-%endmacro

 %macro PROCESS_16X2X3 5

 %if %1==0

         movdqa          xmm0,       XMMWORD PTR [%2]

@@ -250,130 +169,6 @@

         paddw           mm7,       mm3

 %endmacro

-%macro LOAD_X4_ADDRESSES 5

-        mov             %2,         [%1+REG_SZ_BYTES*0]

-        mov             %3,         [%1+REG_SZ_BYTES*1]

-        mov             %4,         [%1+REG_SZ_BYTES*2]

-        mov             %5,         [%1+REG_SZ_BYTES*3]

-%endmacro

-%macro PROCESS_16X2X4 8

-%if %1==0

-        movdqa          xmm0,       XMMWORD PTR [%2]

-        lddqu           xmm4,       XMMWORD PTR [%3]

-        lddqu           xmm5,       XMMWORD PTR [%4]

-        lddqu           xmm6,       XMMWORD PTR [%5]

-        lddqu           xmm7,       XMMWORD PTR [%6]

-        psadbw          xmm4,       xmm0

-        psadbw          xmm5,       xmm0

-        psadbw          xmm6,       xmm0

-        psadbw          xmm7,       xmm0

-%else

-        movdqa          xmm0,       XMMWORD PTR [%2]

-        lddqu           xmm1,       XMMWORD PTR [%3]

-        lddqu           xmm2,       XMMWORD PTR [%4]

-        lddqu           xmm3,       XMMWORD PTR [%5]

-        psadbw          xmm1,       xmm0

-        psadbw          xmm2,       xmm0

-        psadbw          xmm3,       xmm0

-        paddw           xmm4,       xmm1

-        lddqu           xmm1,       XMMWORD PTR [%6]

-        paddw           xmm5,       xmm2

-        paddw           xmm6,       xmm3

-        psadbw          xmm1,       xmm0

-        paddw           xmm7,       xmm1

-%endif

-        movdqa          xmm0,       XMMWORD PTR [%2+%7]

-        lddqu           xmm1,       XMMWORD PTR [%3+%8]

-        lddqu           xmm2,       XMMWORD PTR [%4+%8]

-        lddqu           xmm3,       XMMWORD PTR [%5+%8]

-        psadbw          xmm1,       xmm0

-        psadbw          xmm2,       xmm0

-        psadbw          xmm3,       xmm0

-        paddw           xmm4,       xmm1

-        lddqu           xmm1,       XMMWORD PTR [%6+%8]

-        paddw           xmm5,       xmm2

-        paddw           xmm6,       xmm3

-%if %1==0 || %1==1

-        lea             %2,         [%2+%7*2]

-        lea             %3,         [%3+%8*2]

-        lea             %4,         [%4+%8*2]

-        lea             %5,         [%5+%8*2]

-        lea             %6,         [%6+%8*2]

-%endif

-        psadbw          xmm1,       xmm0

-        paddw           xmm7,       xmm1

-%endmacro

-%macro PROCESS_8X2X4 8

-%if %1==0

-        movq            mm0,        QWORD PTR [%2]

-        movq            mm4,        QWORD PTR [%3]

-        movq            mm5,        QWORD PTR [%4]

-        movq            mm6,        QWORD PTR [%5]

-        movq            mm7,        QWORD PTR [%6]

-        psadbw          mm4,        mm0

-        psadbw          mm5,        mm0

-        psadbw          mm6,        mm0

-        psadbw          mm7,        mm0

-%else

-        movq            mm0,        QWORD PTR [%2]

-        movq            mm1,        QWORD PTR [%3]

-        movq            mm2,        QWORD PTR [%4]

-        movq            mm3,        QWORD PTR [%5]

-        psadbw          mm1,        mm0

-        psadbw          mm2,        mm0

-        psadbw          mm3,        mm0

-        paddw           mm4,        mm1

-        movq            mm1,        QWORD PTR [%6]

-        paddw           mm5,        mm2

-        paddw           mm6,        mm3

-        psadbw          mm1,        mm0

-        paddw           mm7,        mm1

-%endif

-        movq            mm0,        QWORD PTR [%2+%7]

-        movq            mm1,        QWORD PTR [%3+%8]

-        movq            mm2,        QWORD PTR [%4+%8]

-        movq            mm3,        QWORD PTR [%5+%8]

-        psadbw          mm1,        mm0

-        psadbw          mm2,        mm0

-        psadbw          mm3,        mm0

-        paddw           mm4,        mm1

-        movq            mm1,        QWORD PTR [%6+%8]

-        paddw           mm5,        mm2

-        paddw           mm6,        mm3

-%if %1==0 || %1==1

-        lea             %2,         [%2+%7*2]

-        lea             %3,         [%3+%8*2]

-        lea             %4,         [%4+%8*2]

-        lea             %5,         [%5+%8*2]

-        lea             %6,         [%6+%8*2]

-%endif

-        psadbw          mm1,        mm0

-        paddw           mm7,        mm1

-%endmacro

 ;void int vp9_sad16x16x3_sse3(

 ;    unsigned char *src_ptr,

 ;    int  src_stride,

@@ -581,380 +376,3 @@

         movd            [rcx+8],    mm7

     STACK_FRAME_DESTROY_X3

-;unsigned int vp9_sad16x16_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride,

-;    int  max_err)

-;%define lddqu movdqu

-global sym(vp9_sad16x16_sse3) PRIVATE

-sym(vp9_sad16x16_sse3):

-    STACK_FRAME_CREATE_X3

-        mov             end_ptr,    4

-        pxor            xmm7,        xmm7

-.vp9_sad16x16_sse3_loop:

-        movdqa          xmm0,       XMMWORD PTR [src_ptr]

-        movdqu          xmm1,       XMMWORD PTR [ref_ptr]

-        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]

-        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

-        lea             src_ptr,    [src_ptr+src_stride*2]

-        lea             ref_ptr,    [ref_ptr+ref_stride*2]

-        movdqa          xmm4,       XMMWORD PTR [src_ptr]

-        movdqu          xmm5,       XMMWORD PTR [ref_ptr]

-        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]

-        psadbw          xmm0,       xmm1

-        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]

-        psadbw          xmm2,       xmm3

-        psadbw          xmm4,       xmm5

-        psadbw          xmm6,       xmm1

-        lea             src_ptr,    [src_ptr+src_stride*2]

-        lea             ref_ptr,    [ref_ptr+ref_stride*2]

-        paddw           xmm7,        xmm0

-        paddw           xmm7,        xmm2

-        paddw           xmm7,        xmm4

-        paddw           xmm7,        xmm6

-        sub             end_ptr,     1

-        jne             .vp9_sad16x16_sse3_loop

-        movq            xmm0,       xmm7

-        psrldq          xmm7,       8

-        paddw           xmm0,       xmm7

-        movq            rax,        xmm0

-    STACK_FRAME_DESTROY_X3

-;void vp9_copy32xn_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *dst_ptr,

-;    int  dst_stride,

-;    int height);

-global sym(vp9_copy32xn_sse3) PRIVATE

-sym(vp9_copy32xn_sse3):

-    STACK_FRAME_CREATE_X3

-.block_copy_sse3_loopx4:

-        lea             end_ptr,    [src_ptr+src_stride*2]

-        movdqu          xmm0,       XMMWORD PTR [src_ptr]

-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]

-        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]

-        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]

-        movdqu          xmm4,       XMMWORD PTR [end_ptr]

-        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]

-        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]

-        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

-        lea             src_ptr,    [src_ptr+src_stride*4]

-        lea             end_ptr,    [ref_ptr+ref_stride*2]

-        movdqa          XMMWORD PTR [ref_ptr], xmm0

-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1

-        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2

-        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3

-        movdqa          XMMWORD PTR [end_ptr], xmm4

-        movdqa          XMMWORD PTR [end_ptr + 16], xmm5

-        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6

-        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

-        lea             ref_ptr,    [ref_ptr+ref_stride*4]

-        sub             height,     4

-        cmp             height,     4

-        jge             .block_copy_sse3_loopx4

-        ;Check to see if there is more rows need to be copied.

-        cmp             height, 0

-        je              .copy_is_done

-.block_copy_sse3_loop:

-        movdqu          xmm0,       XMMWORD PTR [src_ptr]

-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]

-        lea             src_ptr,    [src_ptr+src_stride]

-        movdqa          XMMWORD PTR [ref_ptr], xmm0

-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1

-        lea             ref_ptr,    [ref_ptr+ref_stride]

-        sub             height,     1

-        jne             .block_copy_sse3_loop

-.copy_is_done:

-    STACK_FRAME_DESTROY_X3

-;void vp9_sad16x16x4d_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr_base,

-;    int  ref_stride,

-;    int  *results)

-global sym(vp9_sad16x16x4d_sse3) PRIVATE

-sym(vp9_sad16x16x4d_sse3):

-    STACK_FRAME_CREATE_X4

-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-%if ABI_IS_32BIT

-        pop             rbp

-%endif

-        mov             rcx,        result_ptr

-        movq            xmm0,       xmm4

-        psrldq          xmm4,       8

-        paddw           xmm0,       xmm4

-        movd            [rcx],      xmm0

-;-

-        movq            xmm0,       xmm5

-        psrldq          xmm5,       8

-        paddw           xmm0,       xmm5

-        movd            [rcx+4],    xmm0

-;-

-        movq            xmm0,       xmm6

-        psrldq          xmm6,       8

-        paddw           xmm0,       xmm6

-        movd            [rcx+8],    xmm0

-;-

-        movq            xmm0,       xmm7

-        psrldq          xmm7,       8

-        paddw           xmm0,       xmm7

-        movd            [rcx+12],   xmm0

-    STACK_FRAME_DESTROY_X4

-;void vp9_sad16x8x4d_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr_base,

-;    int  ref_stride,

-;    int  *results)

-global sym(vp9_sad16x8x4d_sse3) PRIVATE

-sym(vp9_sad16x8x4d_sse3):

-    STACK_FRAME_CREATE_X4

-        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-%if ABI_IS_32BIT

-        pop             rbp

-%endif

-        mov             rcx,        result_ptr

-        movq            xmm0,       xmm4

-        psrldq          xmm4,       8

-        paddw           xmm0,       xmm4

-        movd            [rcx],      xmm0

-;-

-        movq            xmm0,       xmm5

-        psrldq          xmm5,       8

-        paddw           xmm0,       xmm5

-        movd            [rcx+4],    xmm0

-;-

-        movq            xmm0,       xmm6

-        psrldq          xmm6,       8

-        paddw           xmm0,       xmm6

-        movd            [rcx+8],    xmm0

-;-

-        movq            xmm0,       xmm7

-        psrldq          xmm7,       8

-        paddw           xmm0,       xmm7

-        movd            [rcx+12],   xmm0

-    STACK_FRAME_DESTROY_X4

-;void int vp9_sad8x16x4d_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride,

-;    int  *results)

-global sym(vp9_sad8x16x4d_sse3) PRIVATE

-sym(vp9_sad8x16x4d_sse3):

-    STACK_FRAME_CREATE_X4

-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-%if ABI_IS_32BIT

-        pop             rbp

-%endif

-        mov             rcx,        result_ptr

-        punpckldq       mm4,        mm5

-        punpckldq       mm6,        mm7

-        movq            [rcx],      mm4

-        movq            [rcx+8],    mm6

-    STACK_FRAME_DESTROY_X4

-;void int vp9_sad8x8x4d_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride,

-;    int  *results)

-global sym(vp9_sad8x8x4d_sse3) PRIVATE

-sym(vp9_sad8x8x4d_sse3):

-    STACK_FRAME_CREATE_X4

-        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

-%if ABI_IS_32BIT

-        pop             rbp

-%endif

-        mov             rcx,        result_ptr

-        punpckldq       mm4,        mm5

-        punpckldq       mm6,        mm7

-        movq            [rcx],      mm4

-        movq            [rcx+8],    mm6

-    STACK_FRAME_DESTROY_X4

-;void int vp9_sad4x4x4d_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *ref_ptr,

-;    int  ref_stride,

-;    int  *results)

-global sym(vp9_sad4x4x4d_sse3) PRIVATE

-sym(vp9_sad4x4x4d_sse3):

-    STACK_FRAME_CREATE_X4

-        movd            mm0,        DWORD PTR [src_ptr]

-        movd            mm1,        DWORD PTR [r0_ptr]

-        movd            mm2,        DWORD PTR [src_ptr+src_stride]

-        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]

-        punpcklbw       mm0,        mm2

-        punpcklbw       mm1,        mm3

-        movd            mm4,        DWORD PTR [r1_ptr]

-        movd            mm5,        DWORD PTR [r2_ptr]

-        movd            mm6,        DWORD PTR [r3_ptr]

-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]

-        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]

-        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]

-        psadbw          mm1,        mm0

-        punpcklbw       mm4,        mm2

-        punpcklbw       mm5,        mm3

-        punpcklbw       mm6,        mm7

-        psadbw          mm4,        mm0

-        psadbw          mm5,        mm0

-        psadbw          mm6,        mm0

-        lea             src_ptr,    [src_ptr+src_stride*2]

-        lea             r0_ptr,     [r0_ptr+ref_stride*2]

-        lea             r1_ptr,     [r1_ptr+ref_stride*2]

-        lea             r2_ptr,     [r2_ptr+ref_stride*2]

-        lea             r3_ptr,     [r3_ptr+ref_stride*2]

-        movd            mm0,        DWORD PTR [src_ptr]

-        movd            mm2,        DWORD PTR [r0_ptr]

-        movd            mm3,        DWORD PTR [src_ptr+src_stride]

-        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]

-        punpcklbw       mm0,        mm3

-        punpcklbw       mm2,        mm7

-        movd            mm3,        DWORD PTR [r1_ptr]

-        movd            mm7,        DWORD PTR [r2_ptr]

-        psadbw          mm2,        mm0

-%if ABI_IS_32BIT

-        mov             rax,        rbp

-        pop             rbp

-%define     ref_stride    rax

-%endif

-        mov             rsi,        result_ptr

-        paddw           mm1,        mm2

-        movd            [rsi],      mm1

-        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]

-        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]

-        punpcklbw       mm3,        mm2

-        punpcklbw       mm7,        mm1

-        psadbw          mm3,        mm0

-        psadbw          mm7,        mm0

-        movd            mm2,        DWORD PTR [r3_ptr]

-        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]

-        paddw           mm3,        mm4

-        paddw           mm7,        mm5

-        movd            [rsi+4],    mm3

-        punpcklbw       mm2,        mm1

-        movd            [rsi+8],    mm7

-        psadbw          mm2,        mm0

-        paddw           mm2,        mm6

-        movd            [rsi+12],   mm2

-    STACK_FRAME_DESTROY_X4

--- a/vp9/encoder/x86/vp9_sad_sse4.asm

+++ b/vp9/encoder/x86/vp9_sad_sse4.asm

@@ -154,7 +154,17 @@

         paddw           xmm1,       xmm5

 %endmacro

+%macro WRITE_AS_INTS 0

+    mov             rdi,        arg(4)           ;Results

+    pxor            xmm0, xmm0

+    movdqa          xmm2, xmm1

+    punpcklwd       xmm1, xmm0

+    punpckhwd       xmm2, xmm0

+    movdqa          [rdi],    xmm1

+    movdqa          [rdi + 16],    xmm2

+%endmacro

 ;void vp9_sad16x16x8_sse4(

 ;    const unsigned char *src_ptr,

 ;    int  src_stride,

@@ -170,23 +180,22 @@

     push        rdi

     ; end prolog

-        mov             rsi,        arg(0)           ;src_ptr

-        mov             rdi,        arg(2)           ;ref_ptr

+    mov             rsi,        arg(0)           ;src_ptr

+    mov             rdi,        arg(2)           ;ref_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

+    movsxd          rax,        dword ptr arg(1) ;src_stride

+    movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        PROCESS_16X2X8 1

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

+    PROCESS_16X2X8 1

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

-        mov             rdi,        arg(4)           ;Results

-        movdqa          XMMWORD PTR [rdi],    xmm1

+    WRITE_AS_INTS

     ; begin epilog

     pop         rdi

@@ -212,19 +221,18 @@

     push        rdi

     ; end prolog

-        mov             rsi,        arg(0)           ;src_ptr

-        mov             rdi,        arg(2)           ;ref_ptr

+    mov             rsi,        arg(0)           ;src_ptr

+    mov             rdi,        arg(2)           ;ref_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

+    movsxd          rax,        dword ptr arg(1) ;src_stride

+    movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        PROCESS_16X2X8 1

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

-        PROCESS_16X2X8 0

+    PROCESS_16X2X8 1

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

+    PROCESS_16X2X8 0

-        mov             rdi,        arg(4)           ;Results

-        movdqa          XMMWORD PTR [rdi],    xmm1

+    WRITE_AS_INTS

     ; begin epilog

     pop         rdi

@@ -250,19 +258,18 @@

     push        rdi

     ; end prolog

-        mov             rsi,        arg(0)           ;src_ptr

-        mov             rdi,        arg(2)           ;ref_ptr

+    mov             rsi,        arg(0)           ;src_ptr

+    mov             rdi,        arg(2)           ;ref_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

+    movsxd          rax,        dword ptr arg(1) ;src_stride

+    movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        PROCESS_8X2X8 1

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

+    PROCESS_8X2X8 1

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

-        mov             rdi,        arg(4)           ;Results

-        movdqa          XMMWORD PTR [rdi],    xmm1

+    WRITE_AS_INTS

     ; begin epilog

     pop         rdi

@@ -288,23 +295,23 @@

     push        rdi

     ; end prolog

-        mov             rsi,        arg(0)           ;src_ptr

-        mov             rdi,        arg(2)           ;ref_ptr

+    mov             rsi,        arg(0)           ;src_ptr

+    mov             rdi,        arg(2)           ;ref_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

+    movsxd          rax,        dword ptr arg(1) ;src_stride

+    movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        PROCESS_8X2X8 1

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

-        PROCESS_8X2X8 0

-        mov             rdi,        arg(4)           ;Results

-        movdqa          XMMWORD PTR [rdi],    xmm1

+    PROCESS_8X2X8 1

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

+    PROCESS_8X2X8 0

+    WRITE_AS_INTS

     ; begin epilog

     pop         rdi

     pop         rsi

@@ -329,17 +336,16 @@

     push        rdi

     ; end prolog

-        mov             rsi,        arg(0)           ;src_ptr

-        mov             rdi,        arg(2)           ;ref_ptr

+    mov             rsi,        arg(0)           ;src_ptr

+    mov             rdi,        arg(2)           ;ref_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;ref_stride

+    movsxd          rax,        dword ptr arg(1) ;src_stride

+    movsxd          rdx,        dword ptr arg(3) ;ref_stride

-        PROCESS_4X2X8 1

-        PROCESS_4X2X8 0

+    PROCESS_4X2X8 1

+    PROCESS_4X2X8 0

-        mov             rdi,        arg(4)           ;Results

-        movdqa          XMMWORD PTR [rdi],    xmm1

+    WRITE_AS_INTS

     ; begin epilog

     pop         rdi

--- a/vp9/encoder/x86/vp9_variance_sse2.c

+++ b/vp9/encoder/x86/vp9_variance_sse2.c

@@ -186,6 +186,7 @@

   *sse = sse0;

   return (sse0 - (((unsigned int)sum0 * sum0) >> 8));

 unsigned int vp9_mse16x16_wmt(

   const unsigned char *src_ptr,

   int  source_stride,

@@ -305,20 +306,16 @@

   return (xxsum - (((unsigned int)xsum * xsum) >> 6));

-unsigned int vp9_sub_pixel_variance16x16_wmt

-(

-  const unsigned char  *src_ptr,

-  int  src_pixels_per_line,

-  int  xoffset,

-  int  yoffset,

-  const unsigned char *dst_ptr,

-  int dst_pixels_per_line,

-  unsigned int *sse

-) {

+static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,

+                                         int src_pixels_per_line,

+                                         int xoffset,

+                                         int yoffset,

+                                         const uint8_t *dst_ptr,

+                                         int dst_pixels_per_line,

+                                         unsigned int *sse, int *avg) {

   int xsum0, xsum1;

   unsigned int xxsum0, xxsum1;

   // note we could avoid these if statements if the calling function

   // just called the appropriate functions inside.

   if (xoffset == HALFNDX && yoffset == 0) {

@@ -355,10 +352,136 @@

   *sse = xxsum0;

-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));

+  *avg = xsum0;

-unsigned int vp9_sub_pixel_mse16x16_wmt(

+unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,

+                                              int src_pixels_per_line,

+                                              int xoffset,

+                                              int yoffset,

+                                              const uint8_t *dst_ptr,

+                                              int dst_pixels_per_line,

+                                              unsigned int *sse_ptr) {

+  int avg;

+  unsigned int sse;

+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr, dst_pixels_per_line,

+                               &sse, &avg);

+  *sse_ptr = sse;

+  return (sse - (((unsigned int) avg * avg) >> 8));

+}

+unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,

+                                              int src_pixels_per_line,

+                                              int xoffset,

+                                              int yoffset,

+                                              const uint8_t *dst_ptr,

+                                              int dst_pixels_per_line,

+                                              unsigned int *sse_ptr) {

+  int avg0, avg1, avg2, avg3;

+  unsigned int sse0, sse1, sse2, sse3;

+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr, dst_pixels_per_line,

+                               &sse0, &avg0);

+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 16, dst_pixels_per_line,

+                               &sse1, &avg1);

+  src_ptr += 16 * src_pixels_per_line;

+  dst_ptr += 16 * dst_pixels_per_line;

+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr, dst_pixels_per_line,

+                               &sse2, &avg2);

+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 16, dst_pixels_per_line,

+                               &sse3, &avg3);

+  sse0 += sse1 + sse2 + sse3;

+  avg0 += avg1 + avg2 + avg3;

+  *sse_ptr = sse0;

+  return (sse0 - (((unsigned int) avg0 * avg0) >> 10));

+}

+unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,

+                                              int src_pixels_per_line,

+                                              int xoffset,

+                                              int yoffset,

+                                              const uint8_t *dst_ptr,

+                                              int dst_pixels_per_line,

+                                              unsigned int *sse_ptr) {

+  int avg0, avg1, avg2, avg3, avg4;

+  unsigned int sse0, sse1, sse2, sse3, sse4;

+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr, dst_pixels_per_line,

+                               &sse0, &avg0);

+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 16, dst_pixels_per_line,

+                               &sse1, &avg1);

+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 32, dst_pixels_per_line,

+                               &sse2, &avg2);

+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 48, dst_pixels_per_line,

+                               &sse3, &avg3);

+  src_ptr += 16 * src_pixels_per_line;

+  dst_ptr += 16 * dst_pixels_per_line;

+  avg0 += avg1 + avg2 + avg3;

+  sse0 += sse1 + sse2 + sse3;

+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr, dst_pixels_per_line,

+                               &sse1, &avg1);

+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 16, dst_pixels_per_line,

+                               &sse2, &avg2);

+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 32, dst_pixels_per_line,

+                               &sse3, &avg3);

+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 48, dst_pixels_per_line,

+                               &sse4, &avg4);

+  src_ptr += 16 * src_pixels_per_line;

+  dst_ptr += 16 * dst_pixels_per_line;

+  avg0 += avg1 + avg2 + avg3 + avg4;

+  sse0 += sse1 + sse2 + sse3 + sse4;

+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr, dst_pixels_per_line,

+                               &sse1, &avg1);

+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 16, dst_pixels_per_line,

+                               &sse2, &avg2);

+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 32, dst_pixels_per_line,

+                               &sse3, &avg3);

+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 48, dst_pixels_per_line,

+                               &sse4, &avg4);

+  src_ptr += 16 * src_pixels_per_line;

+  dst_ptr += 16 * dst_pixels_per_line;

+  avg0 += avg1 + avg2 + avg3 + avg4;

+  sse0 += sse1 + sse2 + sse3 + sse4;

+  sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr, dst_pixels_per_line,

+                               &sse1, &avg1);

+  sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 16, dst_pixels_per_line,

+                               &sse2, &avg2);

+  sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 32, dst_pixels_per_line,

+                               &sse3, &avg3);

+  sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,

+                               yoffset, dst_ptr + 48, dst_pixels_per_line,

+                               &sse4, &avg4);

+  avg0 += avg1 + avg2 + avg3 + avg4;

+  sse0 += sse1 + sse2 + sse3 + sse4;

+  *sse_ptr = sse0;

+  return (sse0 - (((unsigned int) avg0 * avg0) >> 12));

+}

+unsigned int vp9_sub_pixel_mse16x16_sse2(

   const unsigned char  *src_ptr,

   int  src_pixels_per_line,

   int  xoffset,

@@ -367,7 +490,8 @@

   int dst_pixels_per_line,

   unsigned int *sse

) {

-  vp9_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);

+  vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,

+                                   yoffset, dst_ptr, dst_pixels_per_line, sse);

   return *sse;

--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c

+++ b/vp9/encoder/x86/vp9_x86_csystemdependent.c

@@ -23,11 +23,11 @@

   vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);

-int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

-int vp9_mbblock_error_mmx(MACROBLOCK *mb, int dc) {

+int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr);

+int vp9_mbblock_error_mmx(MACROBLOCK *mb) {

   short *coeff_ptr =  mb->block[0].coeff;

   short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;

-  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);

+  return vp9_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr);

 int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);

@@ -51,11 +51,11 @@

 #endif

 #if HAVE_SSE2

-int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);

-int vp9_mbblock_error_xmm(MACROBLOCK *mb, int dc) {

+int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr);

+int vp9_mbblock_error_xmm(MACROBLOCK *mb) {

   short *coeff_ptr =  mb->block[0].coeff;

   short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;

-  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);

+  return vp9_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr);

 int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);

--- a/vp9/vp9_common.mk

+++ b/vp9/vp9_common.mk

@@ -9,6 +9,7 @@

##

 VP9_COMMON_SRCS-yes += vp9_common.mk

+VP9_COMMON_SRCS-yes += vp9_iface_common.h

 VP9_COMMON_SRCS-yes += common/vp9_pragmas.h

 VP9_COMMON_SRCS-yes += common/vp9_ppflags.h

 VP9_COMMON_SRCS-yes += common/vp9_onyx.h

@@ -16,6 +17,8 @@

 VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c

 VP9_COMMON_SRCS-yes += common/vp9_blockd.c

 VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h

+VP9_COMMON_SRCS-yes += common/vp9_convolve.c

+VP9_COMMON_SRCS-yes += common/vp9_convolve.h

 VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c

 VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h

 VP9_COMMON_SRCS-yes += common/vp9_entropy.c

@@ -26,7 +29,7 @@

 VP9_COMMON_SRCS-yes += common/vp9_filter.h

 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.c

 VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c

-VP9_COMMON_SRCS-yes += common/vp9_idctllm.c

+VP9_COMMON_SRCS-yes += common/vp9_idct.c

 VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h

 VP9_COMMON_SRCS-yes += common/vp9_blockd.h

 VP9_COMMON_SRCS-yes += common/vp9_common.h

@@ -36,6 +39,7 @@

 VP9_COMMON_SRCS-yes += common/vp9_extend.h

 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h

 VP9_COMMON_SRCS-yes += common/vp9_header.h

+VP9_COMMON_SRCS-yes += common/vp9_idct.h

 VP9_COMMON_SRCS-yes += common/vp9_invtrans.h

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h

 VP9_COMMON_SRCS-yes += common/vp9_modecont.h

@@ -46,7 +50,6 @@

 VP9_COMMON_SRCS-yes += common/vp9_quant_common.h

 VP9_COMMON_SRCS-yes += common/vp9_reconinter.h

 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h

-VP9_COMMON_SRCS-yes += common/vp9_reconintra4x4.h

 VP9_COMMON_SRCS-yes += common/vp9_rtcd.c

 VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh

 VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h

@@ -54,10 +57,11 @@

 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h

 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c

 VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h

-VP9_COMMON_SRCS-yes += common/vp9_subpixel.h

 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h

 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h

 VP9_COMMON_SRCS-yes += common/vp9_textblit.h

+VP9_COMMON_SRCS-yes += common/vp9_tile_common.h

+VP9_COMMON_SRCS-yes += common/vp9_tile_common.c

 VP9_COMMON_SRCS-yes += common/vp9_treecoder.h

 VP9_COMMON_SRCS-yes += common/vp9_invtrans.c

 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c

@@ -79,7 +83,6 @@

 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h

-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c

@@ -89,18 +92,15 @@

 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm

-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm

-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm

+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm

-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c

 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm

-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm

 ifeq ($(CONFIG_POSTPROC),yes)

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm

@@ -112,19 +112,13 @@

 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm

 endif

-VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c

-ifeq ($(HAVE_SSE4_1),yes)

-vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4

-vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4

-endif

-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c

+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.c

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_sse2.c

 ifeq ($(HAVE_SSE2),yes)

-vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2

+vp9/common/x86/vp9_idct_x86.c.o: CFLAGS += -msse2

 vp9/common/x86/vp9_loopfilter_intrin_sse2.c.o: CFLAGS += -msse2

 vp9/common/x86/vp9_sadmxn_sse2.c.o: CFLAGS += -msse2

-vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2

+vp9/common/x86/vp9_idct_x86.c.d: CFLAGS += -msse2

 vp9/common/x86/vp9_loopfilter_intrin_sse2.c.d: CFLAGS += -msse2

 vp9/common/x86/vp9_sadmxn_sse2.c.d: CFLAGS += -msse2

 endif

--- a/vp9/vp9_cx_iface.c

+++ b/vp9/vp9_cx_iface.c

@@ -16,6 +16,7 @@

 #include "vpx/vp8cx.h"

 #include "vp9/encoder/vp9_firstpass.h"

 #include "vp9/common/vp9_onyx.h"

+#include "vp9/vp9_iface_common.h"

 #include <stdlib.h>

 #include <string.h>

@@ -26,7 +27,8 @@

   unsigned int                noise_sensitivity;

   unsigned int                Sharpness;

   unsigned int                static_thresh;

-  unsigned int                token_partitions;

+  unsigned int                tile_columns;

+  unsigned int                tile_rows;

   unsigned int                arnr_max_frames;    /* alt_ref Noise Reduction Max Frame Count */

   unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */

   unsigned int                arnr_type;        /* alt_ref filter type */

@@ -34,9 +36,8 @@

   vp8e_tuning                 tuning;

   unsigned int                cq_level;         /* constrained quality level */

   unsigned int                rc_max_intra_bitrate_pct;

-#if CONFIG_LOSSLESS

   unsigned int                lossless;

-#endif

+  unsigned int                frame_parallel_decoding_mode;

};

 struct extraconfig_map {

@@ -54,7 +55,8 @@

       0,                          /* noise_sensitivity */

       0,                          /* Sharpness */

       0,                          /* static_thresh */

-      VP8_ONE_TOKENPARTITION,     /* token_partitions */

+      0,                          /* tile_columns */

+      0,                          /* tile_rows */

       0,                          /* arnr_max_frames */

       3,                          /* arnr_strength */

       3,                          /* arnr_type*/

@@ -62,9 +64,8 @@

       0,                          /* tuning*/

       10,                         /* cq_level */

       0,                          /* rc_max_intra_bitrate_pct */

-#if CONFIG_LOSSLESS

       0,                          /* lossless */

-#endif

+      0,                          /* frame_parallel_decoding_mode */

};

@@ -79,8 +80,10 @@

   unsigned int            cx_data_sz;

   unsigned char          *pending_cx_data;

   unsigned int            pending_cx_data_sz;

+  int                     pending_frame_count;

+  uint32_t                pending_frame_sizes[8];

+  uint32_t                pending_frame_magnitude;

   vpx_image_t             preview_img;

-  unsigned int            next_frame_flag;

   vp8_postproc_cfg_t      preview_ppcfg;

   vpx_codec_pkt_list_decl(64) pkt_list;              // changed to accomendate the maximum number of lagged frames allowed

   unsigned int                fixed_kf_cntr;

@@ -129,8 +132,8 @@

 static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,

                                        const vpx_codec_enc_cfg_t *cfg,

                                        const struct vp8_extracfg *vp8_cfg) {

-  RANGE_CHECK(cfg, g_w,                   1, 16383); /* 14 bits available */

-  RANGE_CHECK(cfg, g_h,                   1, 16383); /* 14 bits available */

+  RANGE_CHECK(cfg, g_w,                   1, 65535); /* 16 bits available */

+  RANGE_CHECK(cfg, g_h,                   1, 65535); /* 16 bits available */

   RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);

   RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);

   RANGE_CHECK_HI(cfg, g_profile,          3);

@@ -137,13 +140,11 @@

   RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);

   RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);

-#if CONFIG_LOSSLESS

   RANGE_CHECK_BOOL(vp8_cfg, lossless);

   if (vp8_cfg->lossless) {

     RANGE_CHECK_HI(cfg, rc_max_quantizer, 0);

     RANGE_CHECK_HI(cfg, rc_min_quantizer, 0);

-#endif

   RANGE_CHECK_HI(cfg, g_threads,          64);

   RANGE_CHECK_HI(cfg, g_lag_in_frames,    MAX_LAG_BUFFERS);

@@ -172,7 +173,8 @@

   RANGE_CHECK_HI(vp8_cfg, noise_sensitivity,  6);

-  RANGE_CHECK(vp8_cfg, token_partitions,   VP8_ONE_TOKENPARTITION, VP8_EIGHT_TOKENPARTITION);

+  RANGE_CHECK(vp8_cfg, tile_columns, 0, 6);

+  RANGE_CHECK(vp8_cfg, tile_rows, 0, 2);

   RANGE_CHECK_HI(vp8_cfg, Sharpness,       7);

   RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);

   RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);

@@ -226,11 +228,9 @@

 static vpx_codec_err_t set_vp8e_config(VP9_CONFIG *oxcf,

                                        vpx_codec_enc_cfg_t cfg,

                                        struct vp8_extracfg vp8_cfg) {

-  oxcf->Version               = cfg.g_profile;

-  oxcf->Version              |= vp8_cfg.experimental ? 0x4 : 0;

-  oxcf->Width                 = cfg.g_w;

-  oxcf->Height                = cfg.g_h;

+  oxcf->version = cfg.g_profile | (vp8_cfg.experimental ? 0x4 : 0);

+  oxcf->width   = cfg.g_w;

+  oxcf->height  = cfg.g_h;

   /* guess a frame rate if out of whack, use 30 */

   oxcf->frame_rate             = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);

@@ -309,37 +309,43 @@

   oxcf->tuning = vp8_cfg.tuning;

-#if CONFIG_LOSSLESS

+  oxcf->tile_columns = vp8_cfg.tile_columns;

+  oxcf->tile_rows = vp8_cfg.tile_rows;

   oxcf->lossless = vp8_cfg.lossless;

-#endif

+  oxcf->error_resilient_mode = cfg.g_error_resilient;

+  oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;

/*

-      printf("Current VP8 Settings: \n");

-      printf("target_bandwidth: %d\n", oxcf->target_bandwidth);

-      printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);

-      printf("Sharpness: %d\n",    oxcf->Sharpness);

-      printf("cpu_used: %d\n",  oxcf->cpu_used);

-      printf("Mode: %d\n",     oxcf->Mode);

-      printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);

-      printf("auto_key: %d\n",  oxcf->auto_key);

-      printf("key_freq: %d\n", oxcf->key_freq);

-      printf("end_usage: %d\n", oxcf->end_usage);

-      printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);

-      printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);

-      printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);

-      printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);

-      printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);

-      printf("fixed_q: %d\n",  oxcf->fixed_q);

-      printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);

-      printf("best_allowed_q: %d\n", oxcf->best_allowed_q);

-      printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);

-      printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);

-      printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);

-      printf("allow_lag: %d\n", oxcf->allow_lag);

-      printf("lag_in_frames: %d\n", oxcf->lag_in_frames);

-      printf("play_alternate: %d\n", oxcf->play_alternate);

-      printf("Version: %d\n", oxcf->Version);

-      printf("encode_breakout: %d\n", oxcf->encode_breakout);

+  printf("Current VP9 Settings: \n");

+  printf("target_bandwidth: %d\n", oxcf->target_bandwidth);

+  printf("noise_sensitivity: %d\n", oxcf->noise_sensitivity);

+  printf("Sharpness: %d\n",    oxcf->Sharpness);

+  printf("cpu_used: %d\n",  oxcf->cpu_used);

+  printf("Mode: %d\n",     oxcf->Mode);

+  // printf("delete_first_pass_file: %d\n",  oxcf->delete_first_pass_file);

+  printf("auto_key: %d\n",  oxcf->auto_key);

+  printf("key_freq: %d\n", oxcf->key_freq);

+  printf("end_usage: %d\n", oxcf->end_usage);

+  printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct);

+  printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct);

+  printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level);

+  printf("optimal_buffer_level: %d\n",  oxcf->optimal_buffer_level);

+  printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size);

+  printf("fixed_q: %d\n",  oxcf->fixed_q);

+  printf("worst_allowed_q: %d\n", oxcf->worst_allowed_q);

+  printf("best_allowed_q: %d\n", oxcf->best_allowed_q);

+  printf("two_pass_vbrbias: %d\n",  oxcf->two_pass_vbrbias);

+  printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);

+  printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);

+  printf("allow_lag: %d\n", oxcf->allow_lag);

+  printf("lag_in_frames: %d\n", oxcf->lag_in_frames);

+  printf("play_alternate: %d\n", oxcf->play_alternate);

+  printf("Version: %d\n", oxcf->Version);

+  printf("encode_breakout: %d\n", oxcf->encode_breakout);

+  printf("error resilient: %d\n", oxcf->error_resilient_mode);

+  printf("frame parallel detokenization: %d\n",

+         oxcf->frame_parallel_decoding_mode);

*/

   return VPX_CODEC_OK;

@@ -409,7 +415,8 @@

       MAP(VP8E_SET_NOISE_SENSITIVITY,     xcfg.noise_sensitivity);

       MAP(VP8E_SET_SHARPNESS,             xcfg.Sharpness);

       MAP(VP8E_SET_STATIC_THRESHOLD,      xcfg.static_thresh);

-      MAP(VP8E_SET_TOKEN_PARTITIONS,      xcfg.token_partitions);

+      MAP(VP9E_SET_TILE_COLUMNS,          xcfg.tile_columns);

+      MAP(VP9E_SET_TILE_ROWS,             xcfg.tile_rows);

       MAP(VP8E_SET_ARNR_MAXFRAMES,        xcfg.arnr_max_frames);

       MAP(VP8E_SET_ARNR_STRENGTH,        xcfg.arnr_strength);

@@ -417,9 +424,8 @@

       MAP(VP8E_SET_TUNING,                xcfg.tuning);

       MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);

       MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);

-#if CONFIG_LOSSLESS

       MAP(VP9E_SET_LOSSLESS,              xcfg.lossless);

-#endif

+      MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode);

   res = validate_config(ctx, &ctx->cfg, &xcfg);

@@ -540,6 +546,8 @@

   yv12->u_buffer = img->planes[VPX_PLANE_U];

   yv12->v_buffer = img->planes[VPX_PLANE_V];

+  yv12->y_crop_width  = img->d_w;

+  yv12->y_crop_height = img->d_h;

   yv12->y_width  = img->d_w;

   yv12->y_height = img->d_h;

   yv12->uv_width = (1 + yv12->y_width) / 2;

@@ -578,6 +586,46 @@

+static int write_superframe_index(vpx_codec_alg_priv_t *ctx) {

+  uint8_t marker = 0xc0;

+  int mag, mask, index_sz;

+  assert(ctx->pending_frame_count);

+  assert(ctx->pending_frame_count <= 8);

+  /* Add the number of frames to the marker byte */

+  marker |= ctx->pending_frame_count - 1;

+  /* Choose the magnitude */

+  for (mag = 0, mask = 0xff; mag < 4; mag++) {

+    if (ctx->pending_frame_magnitude < mask)

+      break;

+    mask <<= 8;

+    mask |= 0xff;

+  }

+  marker |= mag << 3;

+  /* Write the index */

+  index_sz = 2 + (mag + 1) * ctx->pending_frame_count;

+  if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {

+    uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;

+    int i, j;

+    *x++ = marker;

+    for (i = 0; i < ctx->pending_frame_count; i++) {

+      int this_sz = ctx->pending_frame_sizes[i];

+      for (j = 0; j <= mag; j++) {

+        *x++ = this_sz & 0xff;

+        this_sz >>= 8;

+      }

+    }

+    *x++ = marker;

+    ctx->pending_cx_data_sz += index_sz;

+  }

+  return index_sz;

+}

 static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,

                                    const vpx_image_t     *img,

                                    vpx_codec_pts_t        pts,

@@ -670,14 +718,11 @@

     if (img != NULL) {

       res = image2yuvconfig(img, &sd);

-      if (vp9_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,

+      if (vp9_receive_raw_frame(ctx->cpi, lib_flags,

                                 &sd, dst_time_stamp, dst_end_time_stamp)) {

         VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;

         res = update_error_state(ctx, &cpi->common.error);

-      /* reset for next frame */

-      ctx->next_frame_flag = 0;

     cx_data = ctx->cx_data;

@@ -714,6 +759,8 @@

           if (!ctx->pending_cx_data)

             ctx->pending_cx_data = cx_data;

           ctx->pending_cx_data_sz += size;

+          ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;

+          ctx->pending_frame_magnitude |= size;

           cx_data += size;

           cx_data_sz -= size;

           continue;

@@ -773,10 +820,16 @@

         else*/

           if (ctx->pending_cx_data) {

+            ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;

+            ctx->pending_frame_magnitude |= size;

+            ctx->pending_cx_data_sz += size;

+            size += write_superframe_index(ctx);

             pkt.data.frame.buf = ctx->pending_cx_data;

-            pkt.data.frame.sz  = ctx->pending_cx_data_sz + size;

+            pkt.data.frame.sz  = ctx->pending_cx_data_sz;

             ctx->pending_cx_data = NULL;

             ctx->pending_cx_data_sz = 0;

+            ctx->pending_frame_count = 0;

+            ctx->pending_frame_magnitude = 0;

           } else {

             pkt.data.frame.buf = cx_data;

             pkt.data.frame.sz  = size;

@@ -818,9 +871,9 @@

-static vpx_codec_err_t vp8e_get_reference(vpx_codec_alg_priv_t *ctx,

-                                          int ctr_id,

-                                          va_list args) {

+static vpx_codec_err_t vp8e_copy_reference(vpx_codec_alg_priv_t *ctx,

+                                           int ctr_id,

+                                           va_list args) {

   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);

@@ -829,12 +882,28 @@

     YV12_BUFFER_CONFIG sd;

     image2yuvconfig(&frame->img, &sd);

-    vp9_get_reference_enc(ctx->cpi, frame->frame_type, &sd);

+    vp9_copy_reference_enc(ctx->cpi, frame->frame_type, &sd);

     return VPX_CODEC_OK;

   } else

     return VPX_CODEC_INVALID_PARAM;

+static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,

+                                     int ctr_id,

+                                     va_list args) {

+  vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);

+  if (data) {

+    YV12_BUFFER_CONFIG* fb;

+    vp9_get_reference_enc(ctx->cpi, data->idx, &fb);

+    yuvconfig2image(&data->img, fb, NULL);

+    return VPX_CODEC_OK;

+  } else {

+    return VPX_CODEC_INVALID_PARAM;

+  }

+}

 static vpx_codec_err_t vp8e_set_previewpp(vpx_codec_alg_priv_t *ctx,

                                           int ctr_id,

                                           va_list args) {

@@ -979,8 +1048,6 @@

                                 scalemode.v_scaling_mode);

     if (!res) {

-      /*force next frame a key frame to effect scaling mode */

-      ctx->next_frame_flag |= FRAMEFLAGS_KEY;

       return VPX_CODEC_OK;

     } else

       return VPX_CODEC_INVALID_PARAM;

@@ -991,7 +1058,7 @@

 static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = {

   {VP8_SET_REFERENCE,                 vp8e_set_reference},

-  {VP8_COPY_REFERENCE,                vp8e_get_reference},

+  {VP8_COPY_REFERENCE,                vp8e_copy_reference},

   {VP8_SET_POSTPROC,                  vp8e_set_previewpp},

   {VP8E_UPD_ENTROPY,                  vp8e_update_entropy},

   {VP8E_UPD_REFERENCE,                vp8e_update_reference},

@@ -1004,7 +1071,8 @@

   {VP8E_SET_ENABLEAUTOALTREF,         set_param},

   {VP8E_SET_SHARPNESS,                set_param},

   {VP8E_SET_STATIC_THRESHOLD,         set_param},

-  {VP8E_SET_TOKEN_PARTITIONS,         set_param},

+  {VP9E_SET_TILE_COLUMNS,             set_param},

+  {VP9E_SET_TILE_ROWS,                set_param},

   {VP8E_GET_LAST_QUANTIZER,           get_param},

   {VP8E_GET_LAST_QUANTIZER_64,        get_param},

   {VP8E_SET_ARNR_MAXFRAMES,           set_param},

@@ -1013,9 +1081,8 @@

   {VP8E_SET_TUNING,                   set_param},

   {VP8E_SET_CQ_LEVEL,                 set_param},

   {VP8E_SET_MAX_INTRA_BITRATE_PCT,    set_param},

-#if CONFIG_LOSSLESS

   {VP9E_SET_LOSSLESS,                 set_param},

-#endif

+  {VP9_GET_REFERENCE,                 get_reference},

   { -1, NULL},

};

--- a/vp9/vp9_dx_iface.c

+++ b/vp9/vp9_dx_iface.c

@@ -17,6 +17,7 @@

 #include "vpx_version.h"

 #include "decoder/vp9_onyxd.h"

 #include "decoder/vp9_onyxd_int.h"

+#include "vp9/vp9_iface_common.h"

 #define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)

 typedef vpx_codec_stream_info_t  vp8_stream_info_t;

@@ -63,6 +64,7 @@

   vpx_image_t             img;

   int                     img_setup;

   int                     img_avail;

+  int                     invert_tile_order;

};

 static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si,

@@ -229,8 +231,8 @@

       if (c[0] != 0x9d || c[1] != 0x01 || c[2] != 0x2a)

         res = VPX_CODEC_UNSUP_BITSTREAM;

-      si->w = (c[3] | (c[4] << 8)) & 0x3fff;

-      si->h = (c[5] | (c[6] << 8)) & 0x3fff;

+      si->w = (c[3] | (c[4] << 8));

+      si->h = (c[5] | (c[6] << 8));

       /*printf("w=%d, h=%d\n", si->w, si->h);*/

       if (!(si->h | si->w))

@@ -273,36 +275,6 @@

   return res;

-static void yuvconfig2image(vpx_image_t               *img,

-                            const YV12_BUFFER_CONFIG  *yv12,

-                            void                      *user_priv) {

-  /** vpx_img_wrap() doesn't allow specifying independent strides for

-    * the Y, U, and V planes, nor other alignment adjustments that

-    * might be representable by a YV12_BUFFER_CONFIG, so we just

-    * initialize all the fields.*/

-  img->fmt = yv12->clrtype == REG_YUV ?

-             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;

-  img->w = yv12->y_stride;

-  img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15;

-  img->d_w = yv12->y_width;

-  img->d_h = yv12->y_height;

-  img->x_chroma_shift = 1;

-  img->y_chroma_shift = 1;

-  img->planes[VPX_PLANE_Y] = yv12->y_buffer;

-  img->planes[VPX_PLANE_U] = yv12->u_buffer;

-  img->planes[VPX_PLANE_V] = yv12->v_buffer;

-  img->planes[VPX_PLANE_ALPHA] = NULL;

-  img->stride[VPX_PLANE_Y] = yv12->y_stride;

-  img->stride[VPX_PLANE_U] = yv12->uv_stride;

-  img->stride[VPX_PLANE_V] = yv12->uv_stride;

-  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;

-  img->bps = 12;

-  img->user_priv = user_priv;

-  img->img_data = yv12->buffer_alloc;

-  img->img_data_owner = 0;

-  img->self_allocd = 0;

-}

 static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,

                                   const uint8_t        **data,

                                   unsigned int           data_sz,

@@ -362,6 +334,7 @@

       oxcf.Version = 9;

       oxcf.postprocess = 0;

       oxcf.max_threads = ctx->cfg.threads;

+      oxcf.inv_tile_order = ctx->invert_tile_order;

       optr = vp9_create_decompressor(&oxcf);

       /* If postprocessing was enabled by the application and a

@@ -424,6 +397,39 @@

   return res;

+static void parse_superframe_index(const uint8_t *data,

+                                   size_t         data_sz,

+                                   uint32_t       sizes[8],

+                                   int           *count) {

+  uint8_t marker;

+  assert(data_sz);

+  marker = data[data_sz - 1];

+  *count = 0;

+  if ((marker & 0xe0) == 0xc0) {

+    const int frames = (marker & 0x7) + 1;

+    const int mag = ((marker >> 3) & 3) + 1;

+    const int index_sz = 2 + mag  * frames;

+    if (data_sz >= index_sz && data[data_sz - index_sz] == marker) {

+      // found a valid superframe index

+      int i, j;

+      const uint8_t *x = data + data_sz - index_sz + 1;

+      for (i = 0; i < frames; i++) {

+        int this_sz = 0;

+        for (j = 0; j < mag; j++)

+          this_sz |= (*x++) << (j * 8);

+        sizes[i] = this_sz;

+      }

+      *count = frames;

+    }

+  }

+}

 static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t  *ctx,

                                   const uint8_t         *data,

                                   unsigned int           data_sz,

@@ -431,9 +437,43 @@

                                   long                   deadline) {

   const uint8_t *data_start = data;

   const uint8_t *data_end = data + data_sz;

-  vpx_codec_err_t res;

+  vpx_codec_err_t res = 0;

+  uint32_t sizes[8];

+  int frames_this_pts, frame_count = 0;

+  parse_superframe_index(data, data_sz, sizes, &frames_this_pts);

   do {

+    // Skip over the superframe index, if present

+    if (data_sz && (*data_start & 0xe0) == 0xc0) {

+      const uint8_t marker = *data_start;

+      const int frames = (marker & 0x7) + 1;

+      const int mag = ((marker >> 3) & 3) + 1;

+      const int index_sz = 2 + mag  * frames;

+      if (data_sz >= index_sz && data_start[index_sz - 1] == marker) {

+        data_start += index_sz;

+        data_sz -= index_sz;

+        if (data_start < data_end)

+          continue;

+        else

+          break;

+      }

+    }

+    // Use the correct size for this frame, if an index is present.

+    if (frames_this_pts) {

+      uint32_t this_sz = sizes[frame_count];

+      if (data_sz < this_sz) {

+        ctx->base.err_detail = "Invalid frame size in index";

+        return VPX_CODEC_CORRUPT_FRAME;

+      }

+      data_sz = this_sz;

+      frame_count++;

+    }

     res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);

     assert(data_start >= data);

     assert(data_start <= data_end);

@@ -545,6 +585,8 @@

   yv12->u_buffer = img->planes[VPX_PLANE_U];

   yv12->v_buffer = img->planes[VPX_PLANE_V];

+  yv12->y_crop_width  = img->d_w;

+  yv12->y_crop_height = img->d_h;

   yv12->y_width  = img->d_w;

   yv12->y_height = img->d_h;

   yv12->uv_width = yv12->y_width / 2;

@@ -580,9 +622,9 @@

-static vpx_codec_err_t vp9_get_reference(vpx_codec_alg_priv_t *ctx,

-                                         int ctr_id,

-                                         va_list args) {

+static vpx_codec_err_t vp9_copy_reference(vpx_codec_alg_priv_t *ctx,

+                                          int ctr_id,

+                                          va_list args) {

   vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);

@@ -592,13 +634,29 @@

     image2yuvconfig(&frame->img, &sd);

-    return vp9_get_reference_dec(ctx->pbi,

-                                 (VP9_REFFRAME)frame->frame_type, &sd);

+    return vp9_copy_reference_dec(ctx->pbi,

+                                  (VP9_REFFRAME)frame->frame_type, &sd);

   } else

     return VPX_CODEC_INVALID_PARAM;

+static vpx_codec_err_t get_reference(vpx_codec_alg_priv_t *ctx,

+                                     int ctr_id,

+                                     va_list args) {

+  vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);

+  if (data) {

+    YV12_BUFFER_CONFIG* fb;

+    vp9_get_reference_dec(ctx->pbi, data->idx, &fb);

+    yuvconfig2image(&data->img, fb, NULL);

+    return VPX_CODEC_OK;

+  } else {

+    return VPX_CODEC_INVALID_PARAM;

+  }

+}

 static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,

                                         int ctr_id,

                                         va_list args) {

@@ -645,9 +703,7 @@

   VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;

   if (update_info) {

-    *update_info = pbi->common.refresh_alt_ref_frame * (int) VP8_ALTR_FRAME

-                   + pbi->common.refresh_golden_frame * (int) VP8_GOLD_FRAME

-                   + pbi->common.refresh_last_frame * (int) VP8_LAST_FRAME;

+    *update_info = pbi->refresh_frame_flags;

     return VPX_CODEC_OK;

   } else

@@ -671,9 +727,16 @@

+static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,

+                                             int ctr_id,

+                                             va_list args) {

+  ctx->invert_tile_order = va_arg(args, int);

+  return VPX_CODEC_OK;

+}

 static vpx_codec_ctrl_fn_map_t ctf_maps[] = {

   {VP8_SET_REFERENCE,             vp9_set_reference},

-  {VP8_COPY_REFERENCE,            vp9_get_reference},

+  {VP8_COPY_REFERENCE,            vp9_copy_reference},

   {VP8_SET_POSTPROC,              vp8_set_postproc},

   {VP8_SET_DBG_COLOR_REF_FRAME,   vp8_set_dbg_options},

   {VP8_SET_DBG_COLOR_MB_MODES,    vp8_set_dbg_options},

@@ -681,6 +744,8 @@

   {VP8_SET_DBG_DISPLAY_MV,        vp8_set_dbg_options},

   {VP8D_GET_LAST_REF_UPDATES,     vp8_get_last_ref_updates},

   {VP8D_GET_FRAME_CORRUPTED,      vp8_get_frame_corrupted},

+  {VP9_GET_REFERENCE,             get_reference},

+  {VP9_INVERT_TILE_DECODE_ORDER,  set_invert_tile_order},

   { -1, NULL},

};

--- /dev/null

+++ b/vp9/vp9_iface_common.h

@@ -1,0 +1,43 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_VP9_IFACE_COMMON_H_

+#define VP9_VP9_IFACE_COMMON_H_

+static void yuvconfig2image(vpx_image_t               *img,

+                            const YV12_BUFFER_CONFIG  *yv12,

+                            void                      *user_priv) {

+  /** vpx_img_wrap() doesn't allow specifying independent strides for

+    * the Y, U, and V planes, nor other alignment adjustments that

+    * might be representable by a YV12_BUFFER_CONFIG, so we just

+    * initialize all the fields.*/

+  img->fmt = yv12->clrtype == REG_YUV ?

+             VPX_IMG_FMT_I420 : VPX_IMG_FMT_VPXI420;

+  img->w = yv12->y_stride;

+  img->h = (yv12->y_height + 2 * VP9BORDERINPIXELS + 15) & ~15;

+  img->d_w = yv12->y_width;

+  img->d_h = yv12->y_height;

+  img->x_chroma_shift = 1;

+  img->y_chroma_shift = 1;

+  img->planes[VPX_PLANE_Y] = yv12->y_buffer;

+  img->planes[VPX_PLANE_U] = yv12->u_buffer;

+  img->planes[VPX_PLANE_V] = yv12->v_buffer;

+  img->planes[VPX_PLANE_ALPHA] = NULL;

+  img->stride[VPX_PLANE_Y] = yv12->y_stride;

+  img->stride[VPX_PLANE_U] = yv12->uv_stride;

+  img->stride[VPX_PLANE_V] = yv12->uv_stride;

+  img->stride[VPX_PLANE_ALPHA] = yv12->y_stride;

+  img->bps = 12;

+  img->user_priv = user_priv;

+  img->img_data = yv12->buffer_alloc;

+  img->img_data_owner = 0;

+  img->self_allocd = 0;

+}

+#endif

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -65,7 +65,6 @@

 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c

 VP9_CX_SRCS-yes += encoder/vp9_rdopt.c

 VP9_CX_SRCS-yes += encoder/vp9_sad_c.c

-VP9_CX_SRCS-yes += encoder/vp9_satd_c.c

 VP9_CX_SRCS-yes += encoder/vp9_segmentation.c

 VP9_CX_SRCS-yes += encoder/vp9_segmentation.h

 VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c

@@ -95,8 +94,9 @@

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm

+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm

-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm

+#VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm

 VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm

@@ -103,12 +103,18 @@

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c

 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm

-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm

+#VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm

 VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm

-VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm

+#VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_quantize_sse4.asm

 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_quantize_mmx.asm

 VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm

 VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm

+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2_intrinsics.c

+ifeq ($(HAVE_SSE2),yes)

+vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.d: CFLAGS += -msse2

+vp9/encoder/x86/vp9_dct_sse2_intrinsics.c.o: CFLAGS += -msse2

+endif

 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))

--- a/vp9/vp9dx.mk

+++ b/vp9/vp9dx.mk

@@ -38,5 +38,11 @@

 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c

+VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/vp9_dequantize_x86.c

+ifeq ($(HAVE_SSE2),yes)

+vp9/decoder/x86/vp9_dequantize_x86.c.o: CFLAGS += -msse2

+vp9/decoder/x86/vp9_dequantize_x86.c.d: CFLAGS += -msse2

+endif

 $(eval $(call asm_offsets_template,\

          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))

--- a/vpx/vp8.h

+++ b/vpx/vp8.h

@@ -44,6 +44,12 @@

   VP8_SET_DBG_COLOR_MB_MODES  = 5,    /**< set which macro block modes to color */

   VP8_SET_DBG_COLOR_B_MODES   = 6,    /**< set which blocks modes to color */

   VP8_SET_DBG_DISPLAY_MV      = 7,    /**< set which motion vector modes to draw */

+  /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+)

+   * for its control ids. These should be migrated to something like the

+   * VP8_DECODER_CTRL_ID_START range next time we're ready to break the ABI.

+   */

+  VP9_GET_REFERENCE           = 128,  /**< get a pointer to a reference frame */

   VP8_COMMON_CTRL_ID_MAX,

   VP8_DECODER_CTRL_ID_START   = 256

};

@@ -97,6 +103,10 @@

   vpx_image_t           img;          /**< reference frame data in image format */

 } vpx_ref_frame_t;

+typedef struct vp9_ref_frame {

+  int idx; /**< frame index to get (input) */

+  vpx_image_t  img; /**< img structure to populate (output) */

+} vp9_ref_frame_t;

 /*!\brief vp8 decoder control function parameter type

@@ -110,6 +120,7 @@

 VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_MB_MODES,  int)

 VPX_CTRL_USE_TYPE(VP8_SET_DBG_COLOR_B_MODES,   int)

 VPX_CTRL_USE_TYPE(VP8_SET_DBG_DISPLAY_MV,      int)

+VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)

 /*! @} - end defgroup vp8 */

--- a/vpx/vp8cx.h

+++ b/vpx/vp8cx.h

@@ -187,7 +187,10 @@

   /* TODO(jkoleszar): Move to vp9cx.h */

-  VP9E_SET_LOSSLESS

+  VP9E_SET_LOSSLESS,

+  VP9E_SET_TILE_COLUMNS,

+  VP9E_SET_TILE_ROWS,

+  VP9E_SET_FRAME_PARALLEL_DECODING

};

 /*!\brief vpx 1-D scaling mode

@@ -298,6 +301,9 @@

 VPX_CTRL_USE_TYPE(VP8E_SET_TUNING,             int) /* vp8e_tuning */

 VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL,      unsigned int)

+VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS,  int)

+VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS,  int)

 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER,     int *)

 VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64,  int *)

@@ -305,6 +311,7 @@

 VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)

+VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)

 /*! @} - end defgroup vp8_encoder */

 #include "vpx_codec_impl_bottom.h"

 #endif

--- a/vpx/vp8dx.h

+++ b/vpx/vp8dx.h

@@ -69,6 +69,9 @@

*/

   VP8_SET_DECRYPT_KEY,

+  /** For testing. */

+  VP9_INVERT_TILE_DECODE_ORDER,

   VP8_DECODER_CTRL_ID_MAX

};

@@ -85,6 +88,7 @@

 VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED,    int *)

 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED,      int *)

 VPX_CTRL_USE_TYPE(VP8_SET_DECRYPT_KEY,         const unsigned char *)

+VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)

 /*! @} - end defgroup vp8_decoder */

--- a/vpx/vpx_integer.h

+++ b/vpx/vpx_integer.h

@@ -28,6 +28,8 @@

 typedef signed __int64   int64_t;

 typedef unsigned __int64 uint64_t;

 #define INT64_MAX _I64_MAX

+#define INT16_MAX _I16_MAX

+#define INT16_MIN _I16_MIN

 #endif

 #ifndef _UINTPTR_T_DEFINED

--- a/vpx_ports/mem.h

+++ b/vpx_ports/mem.h

@@ -11,6 +11,7 @@

 #ifndef VPX_PORTS_MEM_H

 #define VPX_PORTS_MEM_H

 #include "vpx_config.h"

 #include "vpx/vpx_integer.h"

--- a/vpx_scale/generic/yv12config.c

+++ b/vpx_scale/generic/yv12config.c

@@ -35,36 +35,41 @@

   return 0;

-/****************************************************************************

- *

- ****************************************************************************/

-int

-vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border) {

-  /*NOTE:*/

+int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,

+                                  int width, int height, int border) {

   if (ybf) {

-    int y_stride = ((width + 2 * border) + 31) & ~31;

-    int yplane_size = (height + 2 * border) * y_stride;

-    int uv_width = width >> 1;

-    int uv_height = height >> 1;

+    int aligned_width = (width + 15) & ~15;

+    int aligned_height = (height + 15) & ~15;

+    int y_stride = ((aligned_width + 2 * border) + 31) & ~31;

+    int yplane_size = (aligned_height + 2 * border) * y_stride;

+    int uv_width = aligned_width >> 1;

+    int uv_height = aligned_height >> 1;

     /** There is currently a bunch of code which assumes

       *  uv_stride == y_stride/2, so enforce this here. */

     int uv_stride = y_stride >> 1;

     int uvplane_size = (uv_height + border) * uv_stride;

+    const int frame_size = yplane_size + 2 * uvplane_size;

-    vp8_yv12_de_alloc_frame_buffer(ybf);

+    if (!ybf->buffer_alloc) {

+      ybf->buffer_alloc = vpx_memalign(32, frame_size);

+      ybf->buffer_alloc_sz = frame_size;

+    }

-    /** Only support allocating buffers that have a height and width that

-      *  are multiples of 16, and a border that's a multiple of 32.

-      * The border restriction is required to get 16-byte alignment of the

-      *  start of the chroma rows without intoducing an arbitrary gap

-      *  between planes, which would break the semantics of things like

-      *  vpx_img_set_rect(). */

-    if ((width & 0xf) | (height & 0xf) | (border & 0x1f))

+    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)

+      return -1;

+    /* Only support allocating buffers that have a border that's a multiple

+     * of 32. The border restriction is required to get 16-byte alignment of

+     * the start of the chroma rows without intoducing an arbitrary gap

+     * between planes, which would break the semantics of things like

+     * vpx_img_set_rect(). */

+    if (border & 0x1f)

       return -3;

-    ybf->y_width  = width;

-    ybf->y_height = height;

+    ybf->y_crop_width = width;

+    ybf->y_crop_height = height;

+    ybf->y_width  = aligned_width;

+    ybf->y_height = aligned_height;

     ybf->y_stride = y_stride;

     ybf->uv_width = uv_width;

@@ -72,21 +77,23 @@

     ybf->uv_stride = uv_stride;

     ybf->border = border;

-    ybf->frame_size = yplane_size + 2 * uvplane_size;

+    ybf->frame_size = frame_size;

-    ybf->buffer_alloc = (unsigned char *) vpx_memalign(32, ybf->frame_size);

-    if (ybf->buffer_alloc == NULL)

-      return -1;

     ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;

     ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2  * uv_stride) + border / 2;

     ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2  * uv_stride) + border / 2;

     ybf->corrupted = 0; /* assume not currupted by errors */

-  } else {

-    return -2;

+    return 0;

+  return -2;

+}

-  return 0;

+int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,

+                                int width, int height, int border) {

+  if (ybf) {

+    vp8_yv12_de_alloc_frame_buffer(ybf);

+    return vp8_yv12_realloc_frame_buffer(ybf, width, height, border);

+  }

+  return -2;

--- a/vpx_scale/generic/yv12extend.c

+++ b/vpx_scale/generic/yv12extend.c

@@ -8,7 +8,7 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

 #include "vpx_scale/yv12config.h"

 #include "vpx_mem/vpx_mem.h"

 #include "vpx_scale/vpx_scale.h"

@@ -20,180 +20,81 @@

 /****************************************************************************

  ****************************************************************************/

-void

-vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {

+static void extend_plane(uint8_t *s,       /* source */

+                         int sp,           /* source pitch */

+                         int w,            /* width */

+                         int h,            /* height */

+                         int et,           /* extend top border */

+                         int el,           /* extend left border */

+                         int eb,           /* extend bottom border */

+                         int er) {         /* extend right border */

   int i;

-  unsigned char *src_ptr1, *src_ptr2;

-  unsigned char *dest_ptr1, *dest_ptr2;

+  uint8_t *src_ptr1, *src_ptr2;

+  uint8_t *dest_ptr1, *dest_ptr2;

+  int linesize;

-  unsigned int Border;

-  int plane_stride;

-  int plane_height;

-  int plane_width;

-  /***********/

-  /* Y Plane */

-  /***********/

-  Border = ybf->border;

-  plane_stride = ybf->y_stride;

-  plane_height = ybf->y_height;

-  plane_width = ybf->y_width;

   /* copy the left and right most columns out */

-  src_ptr1 = ybf->y_buffer;

-  src_ptr2 = src_ptr1 + plane_width - 1;

-  dest_ptr1 = src_ptr1 - Border;

-  dest_ptr2 = src_ptr2 + 1;

+  src_ptr1 = s;

+  src_ptr2 = s + w - 1;

+  dest_ptr1 = s - el;

+  dest_ptr2 = s + w;

-  for (i = 0; i < plane_height; i++) {

-    vpx_memset(dest_ptr1, src_ptr1[0], Border);

-    vpx_memset(dest_ptr2, src_ptr2[0], Border);

-    src_ptr1  += plane_stride;

-    src_ptr2  += plane_stride;

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

+  for (i = 0; i < h; i++) {

+    vpx_memset(dest_ptr1, src_ptr1[0], el);

+    vpx_memset(dest_ptr2, src_ptr2[0], er);

+    src_ptr1  += sp;

+    src_ptr2  += sp;

+    dest_ptr1 += sp;

+    dest_ptr2 += sp;

-  /* Now copy the top and bottom source lines into each line of the respective borders */

-  src_ptr1 = ybf->y_buffer - Border;

-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;

-  dest_ptr1 = src_ptr1 - (Border * plane_stride);

-  dest_ptr2 = src_ptr2 + plane_stride;

+  /* Now copy the top and bottom lines into each line of the respective

+   * borders

+   */

+  src_ptr1 = s - el;

+  src_ptr2 = s + sp * (h - 1) - el;

+  dest_ptr1 = s + sp * (-et) - el;

+  dest_ptr2 = s + sp * (h) - el;

+  linesize = el + er + w;

-  for (i = 0; i < (int)Border; i++) {

-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);

-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

+  for (i = 0; i < et; i++) {

+    vpx_memcpy(dest_ptr1, src_ptr1, linesize);

+    dest_ptr1 += sp;

-  /***********/

-  /* U Plane */

-  /***********/

-  plane_stride = ybf->uv_stride;

-  plane_height = ybf->uv_height;

-  plane_width = ybf->uv_width;

-  Border /= 2;

-  /* copy the left and right most columns out */

-  src_ptr1 = ybf->u_buffer;

-  src_ptr2 = src_ptr1 + plane_width - 1;

-  dest_ptr1 = src_ptr1 - Border;

-  dest_ptr2 = src_ptr2 + 1;

-  for (i = 0; i < plane_height; i++) {

-    vpx_memset(dest_ptr1, src_ptr1[0], Border);

-    vpx_memset(dest_ptr2, src_ptr2[0], Border);

-    src_ptr1  += plane_stride;

-    src_ptr2  += plane_stride;

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

+  for (i = 0; i < eb; i++) {

+    vpx_memcpy(dest_ptr2, src_ptr2, linesize);

+    dest_ptr2 += sp;

-  /* Now copy the top and bottom source lines into each line of the respective borders */

-  src_ptr1 = ybf->u_buffer - Border;

-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;

-  dest_ptr1 = src_ptr1 - (Border * plane_stride);

-  dest_ptr2 = src_ptr2 + plane_stride;

-  for (i = 0; i < (int)(Border); i++) {

-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);

-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

-  }

-  /***********/

-  /* V Plane */

-  /***********/

-  /* copy the left and right most columns out */

-  src_ptr1 = ybf->v_buffer;

-  src_ptr2 = src_ptr1 + plane_width - 1;

-  dest_ptr1 = src_ptr1 - Border;

-  dest_ptr2 = src_ptr2 + 1;

-  for (i = 0; i < plane_height; i++) {

-    vpx_memset(dest_ptr1, src_ptr1[0], Border);

-    vpx_memset(dest_ptr2, src_ptr2[0], Border);

-    src_ptr1  += plane_stride;

-    src_ptr2  += plane_stride;

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

-  }

-  /* Now copy the top and bottom source lines into each line of the respective borders */

-  src_ptr1 = ybf->v_buffer - Border;

-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;

-  dest_ptr1 = src_ptr1 - (Border * plane_stride);

-  dest_ptr2 = src_ptr2 + plane_stride;

-  for (i = 0; i < (int)(Border); i++) {

-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);

-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

-  }

+void

+vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {

+  assert(ybf->y_height - ybf->y_crop_height < 16);

+  assert(ybf->y_width - ybf->y_crop_width < 16);

+  assert(ybf->y_height - ybf->y_crop_height >= 0);

+  assert(ybf->y_width - ybf->y_crop_width >= 0);

-static void

-extend_frame_borders_yonly_c(YV12_BUFFER_CONFIG *ybf) {

-  int i;

-  unsigned char *src_ptr1, *src_ptr2;

-  unsigned char *dest_ptr1, *dest_ptr2;

+  extend_plane(ybf->y_buffer, ybf->y_stride,

+               ybf->y_crop_width, ybf->y_crop_height,

+               ybf->border, ybf->border,

+               ybf->border + ybf->y_height - ybf->y_crop_height,

+               ybf->border + ybf->y_width - ybf->y_crop_width);

-  unsigned int Border;

-  int plane_stride;

-  int plane_height;

-  int plane_width;

+  extend_plane(ybf->u_buffer, ybf->uv_stride,

+               (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,

+               ybf->border / 2, ybf->border / 2,

+               (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,

+               (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);

-  /***********/

-  /* Y Plane */

-  /***********/

-  Border = ybf->border;

-  plane_stride = ybf->y_stride;

-  plane_height = ybf->y_height;

-  plane_width = ybf->y_width;

-  /* copy the left and right most columns out */

-  src_ptr1 = ybf->y_buffer;

-  src_ptr2 = src_ptr1 + plane_width - 1;

-  dest_ptr1 = src_ptr1 - Border;

-  dest_ptr2 = src_ptr2 + 1;

-  for (i = 0; i < plane_height; i++) {

-    vpx_memset(dest_ptr1, src_ptr1[0], Border);

-    vpx_memset(dest_ptr2, src_ptr2[0], Border);

-    src_ptr1  += plane_stride;

-    src_ptr2  += plane_stride;

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

-  }

-  /* Now copy the top and bottom source lines into each line of the respective borders */

-  src_ptr1 = ybf->y_buffer - Border;

-  src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;

-  dest_ptr1 = src_ptr1 - (Border * plane_stride);

-  dest_ptr2 = src_ptr2 + plane_stride;

-  for (i = 0; i < (int)Border; i++) {

-    vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);

-    vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);

-    dest_ptr1 += plane_stride;

-    dest_ptr2 += plane_stride;

-  }

-  plane_stride /= 2;

-  plane_height /= 2;

-  plane_width /= 2;

-  Border /= 2;

+  extend_plane(ybf->v_buffer, ybf->uv_stride,

+               (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,

+               ybf->border / 2, ybf->border / 2,

+               (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,

+               (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);

 /****************************************************************************

  *  ROUTINE       : vp8_yv12_copy_frame

@@ -215,6 +116,14 @@

                       YV12_BUFFER_CONFIG *dst_ybc) {

   int row;

   unsigned char *source, *dest;

+#if 0

+  /* These assertions are valid in the codec, but the libvpx-tester uses

+   * this code slightly differently.

+   */

+  assert(src_ybc->y_width == dst_ybc->y_width);

+  assert(src_ybc->y_height == dst_ybc->y_height);

+#endif

   source = src_ybc->y_buffer;

   dest = dst_ybc->y_buffer;

--- a/vpx_scale/yv12config.h

+++ b/vpx_scale/yv12config.h

@@ -42,6 +42,8 @@

   typedef struct yv12_buffer_config {

     int   y_width;

     int   y_height;

+    int   y_crop_width;

+    int   y_crop_height;

     int   y_stride;

     /*    int   yinternal_width; */

@@ -55,6 +57,7 @@

     uint8_t *v_buffer;

     uint8_t *buffer_alloc;

+    int buffer_alloc_sz;

     int border;

     int frame_size;

     YUV_TYPE clrtype;

@@ -65,6 +68,8 @@

   int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,

                                   int width, int height, int border);

+  int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,

+                                    int width, int height, int border);

   int vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);

 #ifdef __cplusplus

--- a/vpxenc.c

+++ b/vpxenc.c

@@ -1028,7 +1028,8 @@

 static const arg_def_t *global_args[] = {

   &use_yv12, &use_i420, &usage, &threads, &profile,

-  &width, &height, &stereo_mode, &timebase, &framerate, &error_resilient,

+  &width, &height, &stereo_mode, &timebase, &framerate,

+  &error_resilient,

   &lag_in_frames, NULL

};

@@ -1103,7 +1104,11 @@

 static const arg_def_t cpu_used = ARG_DEF(NULL, "cpu-used", 1,

                                           "CPU Used (-16..16)");

 static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1,

-                                             "Number of token partitions to use, log2");

+                                     "Number of token partitions to use, log2");

+static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1,

+                                         "Number of tile columns to use, log2");

+static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1,

+                                           "Number of tile rows to use, log2");

 static const arg_def_t auto_altref = ARG_DEF(NULL, "auto-alt-ref", 1,

                                              "Enable automatic alt reference frames");

 static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1,

@@ -1123,8 +1128,10 @@

                                           "Constrained Quality Level");

 static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1,

                                                     "Max I-frame bitrate (pct)");

-#if CONFIG_LOSSLESS

 static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");

+#if CONFIG_VP9_ENCODER

+static const arg_def_t frame_parallel_decoding  = ARG_DEF(

+    NULL, "frame-parallel", 1, "Enable frame parallel decodability features");

 #endif

 #if CONFIG_VP8_ENCODER

@@ -1147,22 +1154,18 @@

 #if CONFIG_VP9_ENCODER

 static const arg_def_t *vp9_args[] = {

   &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,

-  &token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,

-  &tune_ssim, &cq_level, &max_intra_rate_pct,

-#if CONFIG_LOSSLESS

-  &lossless,

-#endif

+  &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,

+  &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,

+  &frame_parallel_decoding,

   NULL

};

 static const int vp9_arg_ctrl_map[] = {

   VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF,

   VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD,

-  VP8E_SET_TOKEN_PARTITIONS,

+  VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,

   VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,

   VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,

-#if CONFIG_LOSSLESS

-  VP9E_SET_LOSSLESS,

-#endif

+  VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING,

};

 #endif

@@ -1479,14 +1482,16 @@

 #define mmin(a, b)  ((a) < (b) ? (a) : (b))

 static void find_mismatch(vpx_image_t *img1, vpx_image_t *img2,

                           int yloc[2], int uloc[2], int vloc[2]) {

-  int match = 1;

-  int i, j;

-  yloc[0] = yloc[1] = -1;

-  for (i = 0, match = 1; match && i < img1->d_h; i+=32) {

-    for (j = 0; match && j < img1->d_w; j+=32) {

+  const unsigned int bsize = 64;

+  const unsigned int bsize2 = bsize >> 1;

+  unsigned int match = 1;

+  unsigned int i, j;

+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;

+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {

+    for (j = 0; match && j < img1->d_w; j += bsize) {

       int k, l;

-      int si = mmin(i + 32, img1->d_h) - i;

-      int sj = mmin(j + 32, img1->d_w) - j;

+      int si = mmin(i + bsize, img1->d_h) - i;

+      int sj = mmin(j + bsize, img1->d_w) - j;

       for (k = 0; match && k < si; k++)

         for (l = 0; match && l < sj; l++) {

           if (*(img1->planes[VPX_PLANE_Y] +

@@ -1495,6 +1500,10 @@

                 (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {

             yloc[0] = i + k;

             yloc[1] = j + l;

+            yloc[2] = *(img1->planes[VPX_PLANE_Y] +

+                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);

+            yloc[3] = *(img2->planes[VPX_PLANE_Y] +

+                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);

             match = 0;

             break;

@@ -1501,12 +1510,12 @@

-  uloc[0] = uloc[1] = -1;

-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i+=16) {

-    for (j = 0; j < match && (img1->d_w + 1) / 2; j+=16) {

+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;

+  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {

+    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {

       int k, l;

-      int si = mmin(i + 16, (img1->d_h + 1) / 2) - i;

-      int sj = mmin(j + 16, (img1->d_w + 1) / 2) - j;

+      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;

+      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;

       for (k = 0; match && k < si; k++)

         for (l = 0; match && l < sj; l++) {

           if (*(img1->planes[VPX_PLANE_U] +

@@ -1515,6 +1524,10 @@

                 (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {

             uloc[0] = i + k;

             uloc[1] = j + l;

+            uloc[2] = *(img1->planes[VPX_PLANE_U] +

+                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);

+            uloc[3] = *(img2->planes[VPX_PLANE_U] +

+                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);

             match = 0;

             break;

@@ -1521,12 +1534,12 @@

-  vloc[0] = vloc[1] = -1;

-  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i+=16) {

-    for (j = 0; j < match && (img1->d_w + 1) / 2; j+=16) {

+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;

+  for (i = 0, match = 1; match && i < (img1->d_h + 1) / 2; i += bsize2) {

+    for (j = 0; j < match && (img1->d_w + 1) / 2; j += bsize2) {

       int k, l;

-      int si = mmin(i + 16, (img1->d_h + 1) / 2) - i;

-      int sj = mmin(j + 16, (img1->d_w + 1) / 2) - j;

+      int si = mmin(i + bsize2, (img1->d_h + 1) / 2) - i;

+      int sj = mmin(j + bsize2, (img1->d_w + 1) / 2) - j;

       for (k = 0; match && k < si; k++)

         for (l = 0; match && l < sj; l++) {

           if (*(img1->planes[VPX_PLANE_V] +

@@ -1535,6 +1548,10 @@

                 (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {

             vloc[0] = i + k;

             vloc[1] = j + l;

+            vloc[2] = *(img1->planes[VPX_PLANE_V] +

+                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);

+            vloc[3] = *(img2->planes[VPX_PLANE_V] +

+                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);

             match = 0;

             break;

@@ -1546,7 +1563,7 @@

 static int compare_img(vpx_image_t *img1, vpx_image_t *img2)

   int match = 1;

-  int i;

+  unsigned int i;

   match &= (img1->fmt == img2->fmt);

   match &= (img1->w == img2->w);

@@ -1638,8 +1655,6 @@

   stats_io_t                stats;

   struct vpx_image         *img;

   vpx_codec_ctx_t           decoder;

-  vpx_ref_frame_t           ref_enc;

-  vpx_ref_frame_t           ref_dec;

   int                       mismatch_seen;

};

@@ -2221,16 +2236,7 @@

 #if CONFIG_DECODERS

   if (global->test_decode != TEST_DECODE_OFF) {

-    int width, height;

     vpx_codec_dec_init(&stream->decoder, global->codec->dx_iface(), NULL, 0);

-    width = (stream->config.cfg.g_w + 15) & ~15;

-    height = (stream->config.cfg.g_h + 15) & ~15;

-    vpx_img_alloc(&stream->ref_enc.img, VPX_IMG_FMT_I420, width, height, 1);

-    vpx_img_alloc(&stream->ref_dec.img, VPX_IMG_FMT_I420, width, height, 1);

-    stream->ref_enc.frame_type = VP8_LAST_FRAME;

-    stream->ref_dec.frame_type = VP8_LAST_FRAME;

 #endif

@@ -2311,6 +2317,8 @@

         if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {

           stream->frames_out++;

+        if (!global->quiet)

+          fprintf(stderr, " %6luF", (unsigned long)pkt->data.frame.sz);

         update_rate_histogram(&stream->rate_hist, cfg, pkt);

         if (stream->config.write_webm) {

@@ -2373,6 +2381,8 @@

           stream->psnr_sse_total += pkt->data.psnr.sse[0];

           stream->psnr_samples_total += pkt->data.psnr.samples[0];

           for (i = 0; i < 4; i++) {

+            if (!global->quiet)

+              fprintf(stderr, "%.3f ", pkt->data.psnr.psnr[i]);

             stream->psnr_totals[i] += pkt->data.psnr.psnr[i];

           stream->psnr_count++;

@@ -2411,26 +2421,59 @@

 static void test_decode(struct stream_state  *stream,

-                        enum TestDecodeFatality fatal) {

+                        enum TestDecodeFatality fatal,

+                        const struct codec_item *codec) {

+  vpx_image_t enc_img, dec_img;

   if (stream->mismatch_seen)

     return;

-  vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &stream->ref_enc);

+  /* Get the internal reference frame */

+  if (codec->fourcc == VP8_FOURCC) {

+    struct vpx_ref_frame ref_enc, ref_dec;

+    int width, height;

+    width = (stream->config.cfg.g_w + 15) & ~15;

+    height = (stream->config.cfg.g_h + 15) & ~15;

+    vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1);

+    enc_img = ref_enc.img;

+    vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1);

+    dec_img = ref_dec.img;

+    ref_enc.frame_type = VP8_LAST_FRAME;

+    ref_dec.frame_type = VP8_LAST_FRAME;

+    vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);

+    vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);

+  } else {

+    struct vp9_ref_frame ref;

+    ref.idx = 0;

+    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref);

+    enc_img = ref.img;

+    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref);

+    dec_img = ref.img;

+  }

   ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");

-  vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &stream->ref_dec);

   ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");

-  if (!compare_img(&stream->ref_enc.img, &stream->ref_dec.img)) {

-    int y[2], u[2], v[2];

-    find_mismatch(&stream->ref_enc.img, &stream->ref_dec.img,

-                  y, u, v);

+  if (!compare_img(&enc_img, &dec_img)) {

+    int y[4], u[4], v[4];

+    find_mismatch(&enc_img, &dec_img, y, u, v);

+    stream->decoder.err = 1;

     warn_or_exit_on_error(&stream->decoder, fatal == TEST_DECODE_FATAL,

-                          "Stream %d: Encode/decode mismatch on frame %d"

-                          " at Y[%d, %d], U[%d, %d], V[%d, %d]",

+                          "Stream %d: Encode/decode mismatch on frame %d at"

+                          " Y[%d, %d] {%d/%d},"

+                          " U[%d, %d] {%d/%d},"

+                          " V[%d, %d] {%d/%d}",

                           stream->index, stream->frames_out,

-                          y[0], y[1], u[0], u[1], v[0], v[1]);

+                          y[0], y[1], y[2], y[3],

+                          u[0], u[1], u[2], u[3],

+                          v[0], v[1], v[2], v[3]);

     stream->mismatch_seen = stream->frames_out;

+  vpx_img_free(&enc_img);

+  vpx_img_free(&dec_img);

@@ -2544,7 +2587,6 @@

         " and --passes=2\n", stream->index, global.pass);

});

     /* Use the frame rate from the file only if none was specified

      * on the command-line.

*/

@@ -2656,7 +2698,7 @@

         if (got_data && global.test_decode != TEST_DECODE_OFF)

-          FOREACH_STREAM(test_decode(stream, global.test_decode));

+          FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec));

       fflush(stdout);

@@ -2688,8 +2730,6 @@

     if (global.test_decode != TEST_DECODE_OFF) {

       FOREACH_STREAM(vpx_codec_destroy(&stream->decoder));

-      FOREACH_STREAM(vpx_img_free(&stream->ref_enc.img));

-      FOREACH_STREAM(vpx_img_free(&stream->ref_dec.img));

     close_input_file(&input);

--

⑨