shithub: libvpx

--- a/configure

+++ b/configure

@@ -225,6 +225,7 @@

     hybridtransform

     switchable_interp

     htrans8x8

+    tx16x16

 CONFIG_LIST="

     external_build

--- /dev/null

+++ b/test/dct16x16_test.cc

@@ -1,0 +1,356 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <math.h>

+#include <stdlib.h>

+#include <string.h>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+extern "C" {

+#include "vp8/common/entropy.h"

+#include "vp8/common/idct.h"

+#include "vp8/encoder/dct.h"

+}

+#include "acm_random.h"

+#include "vpx/vpx_integer.h"

+using libvpx_test::ACMRandom;

+namespace {

+const double PI = 3.1415926535898;  // pi truncated to 14 significant digits; used by the 16x16 reference code below

+void reference2_16x16_idct_2d(double *input, double *output) {  /*

+   * Direct floating-point 2-D inverse DCT of one 16x16 block.

+   * Each output sample is a sum over all 256 coefficients: input[i*16+j] is

+   * weighted by cos(PI*j*(l+0.5)/16) * cos(PI*i*(k+0.5)/16) / 256 and by an

+   * extra sqrt(2) for each of i and j that is non-zero; the sum is written

+   * to output[k*16+l].

+   * NOTE(review): none of the tests in this file call this function.

+   */

+  double x;

+  for (int l = 0; l < 16; ++l) {

+    for (int k = 0; k < 16; ++k) {

+      double s = 0;

+      for (int i = 0; i < 16; ++i) {

+        for (int j = 0; j < 16; ++j) {

+          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/256;

+          if (i != 0)

+            x *= sqrt(2.0);

+          if (j != 0)

+            x *= sqrt(2.0);

+          s += x;

+        }

+      }

+      output[k*16+l] = s;

+    }

+  }

+}

+static void butterfly_16x16_dct_1d(double input[16], double output[16]) {  /*

+   * 16-point 1-D forward transform in double precision, evaluated as a

+   * four-step butterfly network.  Ck below is cos(k*PI/32).

+   * input[] is only read in step 1.  step[] and output[] alternate as

+   * scratch storage between steps, so output[] holds intermediate values

+   * until step 4 has written all sixteen final results; intermediate[]

+   * holds partial sums used only inside step 4.

+   * Used by reference_16x16_dct_2d() for both the column and the row pass.

+   * NOTE(review): presumably a scaled DCT-II; the exact gain is not

+   * derivable from this file alone - confirm before reusing elsewhere.

+   */

+  double step[16];

+  double intermediate[16];

+  double temp1, temp2;

+  const double C1 = cos(1*PI/(double)32);

+  const double C2 = cos(2*PI/(double)32);

+  const double C3 = cos(3*PI/(double)32);

+  const double C4 = cos(4*PI/(double)32);

+  const double C5 = cos(5*PI/(double)32);

+  const double C6 = cos(6*PI/(double)32);

+  const double C7 = cos(7*PI/(double)32);

+  const double C8 = cos(8*PI/(double)32);

+  const double C9 = cos(9*PI/(double)32);

+  const double C10 = cos(10*PI/(double)32);

+  const double C11 = cos(11*PI/(double)32);

+  const double C12 = cos(12*PI/(double)32);

+  const double C13 = cos(13*PI/(double)32);

+  const double C14 = cos(14*PI/(double)32);

+  const double C15 = cos(15*PI/(double)32);

+  // step 1: sums (0..7) and differences (8..15) of mirrored input pairs

+  step[ 0] = input[0] + input[15];

+  step[ 1] = input[1] + input[14];

+  step[ 2] = input[2] + input[13];

+  step[ 3] = input[3] + input[12];

+  step[ 4] = input[4] + input[11];

+  step[ 5] = input[5] + input[10];

+  step[ 6] = input[6] + input[ 9];

+  step[ 7] = input[7] + input[ 8];

+  step[ 8] = input[7] - input[ 8];

+  step[ 9] = input[6] - input[ 9];

+  step[10] = input[5] - input[10];

+  step[11] = input[4] - input[11];

+  step[12] = input[3] - input[12];

+  step[13] = input[2] - input[13];

+  step[14] = input[1] - input[14];

+  step[15] = input[0] - input[15];

+  // step 2: butterflies on the sum half, odd-cosine rotations on the difference half

+  output[0] = step[0] + step[7];

+  output[1] = step[1] + step[6];

+  output[2] = step[2] + step[5];

+  output[3] = step[3] + step[4];

+  output[4] = step[3] - step[4];

+  output[5] = step[2] - step[5];

+  output[6] = step[1] - step[6];

+  output[7] = step[0] - step[7];

+  temp1 = step[ 8]*C7;

+  temp2 = step[15]*C9;

+  output[ 8] = temp1 + temp2;

+  temp1 = step[ 9]*C11;

+  temp2 = step[14]*C5;

+  output[ 9] = temp1 - temp2;

+  temp1 = step[10]*C3;

+  temp2 = step[13]*C13;

+  output[10] = temp1 + temp2;

+  temp1 = step[11]*C15;

+  temp2 = step[12]*C1;

+  output[11] = temp1 - temp2;

+  temp1 = step[11]*C1;

+  temp2 = step[12]*C15;

+  output[12] = temp2 + temp1;

+  temp1 = step[10]*C13;

+  temp2 = step[13]*C3;

+  output[13] = temp2 - temp1;

+  temp1 = step[ 9]*C5;

+  temp2 = step[14]*C11;

+  output[14] = temp2 + temp1;

+  temp1 = step[ 8]*C9;

+  temp2 = step[15]*C7;

+  output[15] = temp2 - temp1;

+  // step 3: reads the step-2 values held in output[], writes step[]

+  step[ 0] = output[0] + output[3];

+  step[ 1] = output[1] + output[2];

+  step[ 2] = output[1] - output[2];

+  step[ 3] = output[0] - output[3];

+  temp1 = output[4]*C14;

+  temp2 = output[7]*C2;

+  step[ 4] = temp1 + temp2;

+  temp1 = output[5]*C10;

+  temp2 = output[6]*C6;

+  step[ 5] = temp1 + temp2;

+  temp1 = output[5]*C6;

+  temp2 = output[6]*C10;

+  step[ 6] = temp2 - temp1;

+  temp1 = output[4]*C2;

+  temp2 = output[7]*C14;

+  step[ 7] = temp2 - temp1;

+  step[ 8] = output[ 8] + output[11];

+  step[ 9] = output[ 9] + output[10];

+  step[10] = output[ 9] - output[10];

+  step[11] = output[ 8] - output[11];

+  step[12] = output[12] + output[15];

+  step[13] = output[13] + output[14];

+  step[14] = output[13] - output[14];

+  step[15] = output[12] - output[15];

+  // step 4: final results, written to output[] in permuted index order

+  output[ 0] = (step[ 0] + step[ 1]);

+  output[ 8] = (step[ 0] - step[ 1]);

+  temp1 = step[2]*C12;

+  temp2 = step[3]*C4;

+  temp1 = temp1 + temp2;

+  output[ 4] = 2*(temp1*C8);

+  temp1 = step[2]*C4;

+  temp2 = step[3]*C12;

+  temp1 = temp2 - temp1;

+  output[12] = 2*(temp1*C8);

+  output[ 2] = 2*((step[4] + step[ 5])*C8);

+  output[14] = 2*((step[7] - step[ 6])*C8);

+  temp1 = step[4] - step[5];

+  temp2 = step[6] + step[7];

+  output[ 6] = (temp1 + temp2);

+  output[10] = (temp1 - temp2);

+  intermediate[8] = step[8] + step[14];

+  intermediate[9] = step[9] + step[15];

+  temp1 = intermediate[8]*C12;

+  temp2 = intermediate[9]*C4;

+  temp1 = temp1 - temp2;

+  output[3] = 2*(temp1*C8);

+  temp1 = intermediate[8]*C4;

+  temp2 = intermediate[9]*C12;

+  temp1 = temp2 + temp1;

+  output[13] = 2*(temp1*C8);

+  output[ 9] = 2*((step[10] + step[11])*C8);

+  intermediate[11] = step[10] - step[11];

+  intermediate[12] = step[12] + step[13];

+  intermediate[13] = step[12] - step[13];

+  intermediate[14] = step[ 8] - step[14];

+  intermediate[15] = step[ 9] - step[15];

+  output[15] = (intermediate[11] + intermediate[12]);

+  output[ 1] = -(intermediate[11] - intermediate[12]);

+  output[ 7] = 2*(intermediate[13]*C8);

+  temp1 = intermediate[14]*C12;

+  temp2 = intermediate[15]*C4;

+  temp1 = temp1 - temp2;

+  output[11] = -2*(temp1*C8);

+  temp1 = intermediate[14]*C4;

+  temp2 = intermediate[15]*C12;

+  temp1 = temp2 + temp1;

+  output[ 5] = 2*(temp1*C8);

+}

+static void reference_16x16_dct_1d(double in[16], double out[16]) {  /*

+   * Direct (O(N^2)) 16-point forward DCT:

+   *   out[k] = sum over n of in[n] * cos(kPi * (2n+1) * k / 32),

+   * with the k == 0 term additionally multiplied by 1/sqrt(2).

+   * NOTE(review): nothing in this file calls this function; the 2-D

+   * reference uses butterfly_16x16_dct_1d() instead.

+   */

+  const double kPi = 3.141592653589793238462643383279502884;

+  const double kInvSqrt2 = 0.707106781186547524400844362104;

+  for (int k = 0; k < 16; k++) {

+    out[k] = 0.0;

+    for (int n = 0; n < 16; n++)

+      out[k] += in[n]*cos(kPi*(2*n+1)*k/32.0);

+    if (k == 0)

+      out[k] = out[k]*kInvSqrt2;

+  }

+}

+void reference_16x16_dct_2d(int16_t input[16*16], double output[16*16]) {  /*

+   * Floating-point 2-D forward transform of a 16x16 block stored with a

+   * stride of 16 elements: butterfly_16x16_dct_1d() is applied to every

+   * column of input[], then to every row of the intermediate result held in

+   * output[].  The row-pass results are halved before being stored.

+   */

+  // First transform columns

+  for (int i = 0; i < 16; ++i) {

+    double temp_in[16], temp_out[16];

+    for (int j = 0; j < 16; ++j)

+      temp_in[j] = input[j*16 + i];

+    butterfly_16x16_dct_1d(temp_in, temp_out);

+    for (int j = 0; j < 16; ++j)

+      output[j*16 + i] = temp_out[j];

+  }

+  // Then transform rows

+  for (int i = 0; i < 16; ++i) {

+    double temp_in[16], temp_out[16];

+    for (int j = 0; j < 16; ++j)

+      temp_in[j] = output[j + i*16];

+    butterfly_16x16_dct_1d(temp_in, temp_out);

+    // Halve each result; this is the only scaling applied in either pass.

+    for (int j = 0; j < 16; ++j)

+      output[j + i*16] = temp_out[j]/2;

+  }

+}

+TEST(VP8Idct16x16Test, AccuracyCheck) {  /*

+   * For 1000 pseudo-random blocks (deterministic seed):

+   *  - build reference coefficients with reference_16x16_dct_2d() and round

+   *    them to int16_t;

+   *  - vp8_short_idct16x16_c() applied to those coefficients must reproduce

+   *    the input with a squared error of at most 1 per sample;

+   *  - vp8_short_fdct16x16_c() applied to the input must match the rounded

+   *    reference coefficients with a squared error of at most 1 each.

+   */

+  ACMRandom rnd(ACMRandom::DeterministicSeed());

+  const int count_test_block = 1000;

+  for (int i = 0; i < count_test_block; ++i) {

+    int16_t in[256], coeff[256];

+    int16_t out_c[256];

+    double out_r[256];

+    // Initialize a test block with input range [-255, 255].

+    for (int j = 0; j < 256; ++j)

+      in[j] = rnd.Rand8() - rnd.Rand8();

+    reference_16x16_dct_2d(in, out_r);

+    for (int j = 0; j < 256; j++)

+      coeff[j] = round(out_r[j]);

+    vp8_short_idct16x16_c(coeff, out_c, 32);  // 32 is the value the other tests in this file pass as "pitch"

+    for (int j = 0; j < 256; ++j) {

+      const int diff = out_c[j] - in[j];

+      const int error = diff * diff;

+      EXPECT_GE(1, error)

+          << "Error: 16x16 IDCT has error " << error

+          << " at index " << j;

+    }

+    vp8_short_fdct16x16_c(in, out_c, 32);  // out_c is reused for the forward-transform output

+    for (int j = 0; j < 256; ++j) {

+      const double diff = coeff[j] - out_c[j];

+      const double error = diff * diff;

+      EXPECT_GE(1.0, error)

+          << "Error: 16x16 FDCT has error " << error

+          << " at index " << j;

+    }

+  }

+}

+TEST(VP8Fdct16x16Test, AccuracyCheck) {  /*

+   * Round-trip check: 1000 pseudo-random blocks are passed through

+   * vp8_short_fdct16x16_c() and then vp8_short_idct16x16_c().

+   * The largest squared error of any single sample must be at most 1, and

+   * the squared error summed over all blocks must not exceed

+   * count_test_block/10 (i.e. 100).

+   */

+  ACMRandom rnd(ACMRandom::DeterministicSeed());

+  int max_error = 0;

+  double total_error = 0;

+  const int count_test_block = 1000;

+  for (int i = 0; i < count_test_block; ++i) {

+    int16_t test_input_block[256];

+    int16_t test_temp_block[256];

+    int16_t test_output_block[256];

+    // Initialize a test block with input range [-255, 255].

+    for (int j = 0; j < 256; ++j)

+      test_input_block[j] = rnd.Rand8() - rnd.Rand8();

+    const int pitch = 32;  // NOTE(review): presumably bytes per row (16 int16_t) - confirm against the transform

+    vp8_short_fdct16x16_c(test_input_block, test_temp_block, pitch);

+    vp8_short_idct16x16_c(test_temp_block, test_output_block, pitch);

+    for (int j = 0; j < 256; ++j) {

+      const int diff = test_input_block[j] - test_output_block[j];

+      const int error = diff * diff;

+      if (max_error < error)

+        max_error = error;

+      total_error += error;

+    }

+  }

+  EXPECT_GE(1, max_error)

+      << "Error: 16x16 FDCT/IDCT has an individual roundtrip error > 1";

+  EXPECT_GE(count_test_block/10, total_error)

+      << "Error: 16x16 FDCT/IDCT has average roundtrip error > 1/10 per block";

+}

+TEST(VP8Fdct16x16Test, CoeffSizeCheck) {  /*

+   * Range check on forward-transform output.  For each of 1000 iterations a

+   * random block and an "extreme" block (every sample +255 or -255; all

+   * +255 on the first iteration) are transformed, and every resulting

+   * coefficient must satisfy |coefficient| <= 4*DCT_MAX_VALUE.

+   */

+  ACMRandom rnd(ACMRandom::DeterministicSeed());

+  const int count_test_block = 1000;

+  for (int i = 0; i < count_test_block; ++i) {

+    int16_t input_block[256], input_extreme_block[256];

+    int16_t output_block[256], output_extreme_block[256];

+    // Initialize a test block with input range [-255, 255].

+    for (int j = 0; j < 256; ++j) {

+      input_block[j] = rnd.Rand8() - rnd.Rand8();

+      input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;

+    }

+    if (i == 0)

+      for (int j = 0; j < 256; ++j)

+        input_extreme_block[j] = 255;

+    const int pitch = 32;

+    vp8_short_fdct16x16_c(input_block, output_block, pitch);

+    vp8_short_fdct16x16_c(input_extreme_block, output_extreme_block, pitch);

+    // The minimum quant value is 4.

+    for (int j = 0; j < 256; ++j) {

+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_block[j]))

+          << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";

+      EXPECT_GE(4*DCT_MAX_VALUE, abs(output_extreme_block[j]))

+          << "Error: 16x16 FDCT extreme has coefficient larger than 4*DCT_MAX_VALUE";

+    }

+  }

+}

+}  // namespace

--- a/test/fdct8x8_test.cc

+++ b/test/fdct8x8_test.cc

@@ -115,8 +115,8 @@

   EXPECT_GE(1, max_error)

       << "Error: 8x8 FDCT/IDCT has an individual roundtrip error > 1";

-  EXPECT_GE(count_test_block, total_error)

-      << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1 per block";

+  EXPECT_GE(count_test_block/5, total_error)

+      << "Error: 8x8 FDCT/IDCT has average roundtrip error > 1/5 per block";

};

 TEST(VP8Fdct8x8Test, ExtremalCheck) {

@@ -149,9 +149,9 @@

         << "Error: Extremal 8x8 FDCT/IDCT has an"

         << " individual roundtrip error > 1";

-    EXPECT_GE(count_test_block, total_error)

+    EXPECT_GE(count_test_block/5, total_error)

         << "Error: Extremal 8x8 FDCT/IDCT has average"

-        << " roundtrip error > 1 per block";

+        << " roundtrip error > 1/5 per block";

};

--- /dev/null

+++ b/test/idct8x8_test.cc

@@ -1,0 +1,154 @@

+/*

+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <math.h>

+#include <stdlib.h>

+#include <string.h>

+#include "third_party/googletest/src/include/gtest/gtest.h"

+extern "C" {

+#include "vp8/encoder/dct.h"

+#include "vp8/common/idct.h"

+}

+#include "acm_random.h"

+#include "vpx/vpx_integer.h"

+using libvpx_test::ACMRandom;

+namespace {

+void reference_dct_1d(double input[8], double output[8]) {  /*

+   * Direct (O(N^2)) 8-point forward DCT:

+   *   output[k] = sum over n of input[n] * cos(kPi * (2n+1) * k / 16),

+   * with the k == 0 term additionally multiplied by 1/sqrt(2).

+   */

+  const double kPi = 3.141592653589793238462643383279502884;

+  const double kInvSqrt2 = 0.707106781186547524400844362104;

+  for (int k = 0; k < 8; k++) {

+    output[k] = 0.0;

+    for (int n = 0; n < 8; n++)

+      output[k] += input[n]*cos(kPi*(2*n+1)*k/16.0);

+    if (k == 0)

+      output[k] = output[k]*kInvSqrt2;

+  }

+}

+void reference_dct_2d(int16_t input[64], double output[64]) {  /*

+   * Floating-point 2-D forward DCT of an 8x8 block stored with a stride of

+   * 8 elements: reference_dct_1d() is applied to every column of input[],

+   * then to every row of the intermediate result in output[], and finally

+   * all 64 coefficients are doubled.

+   */

+  // First transform columns

+  for (int i = 0; i < 8; ++i) {

+    double temp_in[8], temp_out[8];

+    for (int j = 0; j < 8; ++j)

+      temp_in[j] = input[j*8 + i];

+    reference_dct_1d(temp_in, temp_out);

+    for (int j = 0; j < 8; ++j)

+      output[j*8 + i] = temp_out[j];

+  }

+  // Then transform rows

+  for (int i = 0; i < 8; ++i) {

+    double temp_in[8], temp_out[8];

+    for (int j = 0; j < 8; ++j)

+      temp_in[j] = output[j + i*8];

+    reference_dct_1d(temp_in, temp_out);

+    for (int j = 0; j < 8; ++j)

+      output[j + i*8] = temp_out[j];

+  }

+  // Double every coefficient (presumably to match the gain of vp8_short_fdct8x8_c, which the test compares against - confirm)

+  for (int i = 0; i < 64; ++i)

+    output[i] *= 2;

+}

+void reference_idct_1d(double input[8], double output[8]) {  /*

+   * Direct (O(N^2)) 8-point inverse DCT:

+   *   output[k] = input[0]/sqrt(2)

+   *               + sum over n >= 1 of input[n] * cos(kPi * (2k+1) * n / 16).

+   * Called only by reference_idct_2d() in this file.

+   */

+  const double kPi = 3.141592653589793238462643383279502884;

+  const double kSqrt2 = 1.414213562373095048801688724209698;

+  for (int k = 0; k < 8; k++) {

+    output[k] = 0.0;

+    for (int n = 0; n < 8; n++) {

+      output[k] += input[n]*cos(kPi*(2*k+1)*n/16.0);

+      if (n == 0)  // runs on the first iteration only, when output[k] holds just the input[0] term

+        output[k] = output[k]/kSqrt2;

+    }

+  }

+}

+void reference_idct_2d(double input[64], int16_t output[64]) {  /*

+   * Floating-point 2-D inverse DCT of an 8x8 block stored with a stride of

+   * 8 elements: reference_idct_1d() is applied to every row of input[],

+   * then to every column of that result; each value is then divided by 32,

+   * rounded and stored as int16_t.

+   * NOTE(review): the only call in this file sits inside the "#if 0"

+   * section of the test, so this function is currently not exercised.

+   */

+  double out[64], out2[64];

+  // First transform rows

+  for (int i = 0; i < 8; ++i) {

+    double temp_in[8], temp_out[8];

+    for (int j = 0; j < 8; ++j)

+      temp_in[j] = input[j + i*8];

+    reference_idct_1d(temp_in, temp_out);

+    for (int j = 0; j < 8; ++j)

+      out[j + i*8] = temp_out[j];

+  }

+  // Then transform columns

+  for (int i = 0; i < 8; ++i) {

+    double temp_in[8], temp_out[8];

+    for (int j = 0; j < 8; ++j)

+      temp_in[j] = out[j*8 + i];

+    reference_idct_1d(temp_in, temp_out);

+    for (int j = 0; j < 8; ++j)

+      out2[j*8 + i] = temp_out[j];

+  }

+  for (int i = 0; i < 64; ++i)

+    output[i] = round(out2[i]/32);

+}

+// Checks the integer 8x8 transforms against the floating-point reference for

+// 10000 pseudo-random blocks (deterministic seed):

+//  1. vp8_short_fdct8x8_c() must match reference_dct_2d() with a squared

+//     error of at most 2 per coefficient.

+//  2. vp8_short_idct8x8_c() applied to the rounded reference coefficients

+//     must reproduce the input with a squared error of at most 1 per sample.

+// The two checks report distinct messages ("FDCT" / "IDCT") so a failure can

+// be attributed to the right transform.

+TEST(VP8Idct8x8Test, AccuracyCheck) {

+  ACMRandom rnd(ACMRandom::DeterministicSeed());

+  const int count_test_block = 10000;

+  for (int i = 0; i < count_test_block; ++i) {

+    int16_t input[64], coeff[64];

+    int16_t output_c[64];

+    double output_r[64];

+    // Initialize a test block with input range [-255, 255].

+    for (int j = 0; j < 64; ++j)

+      input[j] = rnd.Rand8() - rnd.Rand8();

+    const int pitch = 16;

+    vp8_short_fdct8x8_c(input, output_c, pitch);

+    reference_dct_2d(input, output_r);

+    for (int j = 0; j < 64; ++j) {

+      const double diff = output_c[j] - output_r[j];

+      const double error = diff * diff;

+      // An error in a DCT coefficient isn't that bad.

+      // We care more about the reconstructed pixels.

+      EXPECT_GE(2.0, error)

+          << "Error: 8x8 FDCT has error " << error

+          << " at index " << j;

+    }

+#if 0

+    // Tests that the reference iDCT and fDCT match.

+    reference_dct_2d(input, output_r);

+    reference_idct_2d(output_r, output_c);

+    for (int j = 0; j < 64; ++j) {

+      const int diff = output_c[j] -input[j];

+      const int error = diff * diff;

+      EXPECT_EQ(0, error)

+          << "Error: 8x8 FDCT/IDCT has error " << error

+          << " at index " << j;

+    }

+#endif

+    // output_r still holds the reference coefficients computed above; input

+    // has not changed, so they are not recomputed here.

+    for (int j = 0; j < 64; ++j)

+      coeff[j] = round(output_r[j]);

+    vp8_short_idct8x8_c(coeff, output_c, pitch);

+    for (int j = 0; j < 64; ++j) {

+      const int diff = output_c[j] - input[j];

+      const int error = diff * diff;

+      EXPECT_GE(1, error)

+          << "Error: 8x8 IDCT has error " << error

+          << " at index " << j;

+    }

+  }

+}

+}  // namespace

--- a/test/test.mk

+++ b/test/test.mk

@@ -1,8 +1,10 @@

 LIBVPX_TEST_SRCS-yes += test.mk

 LIBVPX_TEST_SRCS-yes += acm_random.h

 LIBVPX_TEST_SRCS-yes += boolcoder_test.cc

+LIBVPX_TEST_SRCS-$(CONFIG_TX16X16) += dct16x16_test.cc

 LIBVPX_TEST_SRCS-yes += fdct4x4_test.cc

 LIBVPX_TEST_SRCS-yes += fdct8x8_test.cc

+LIBVPX_TEST_SRCS-yes += idct8x8_test.cc

 LIBVPX_TEST_SRCS-yes += test_libvpx.cc

 LIBVPX_TEST_DATA-yes += hantro_collage_w352h288.yuv

--- a/vp8/common/alloccommon.c

+++ b/vp8/common/alloccommon.c

@@ -218,7 +218,4 @@

   vp8_entropy_mode_init();

   vp8_entropy_mv_init();

-  vp8_init_scan_order_mask();

--- a/vp8/common/blockd.h

+++ b/vp8/common/blockd.h

@@ -129,11 +129,12 @@

 // Segment level features.

 typedef enum {

-  TX_4X4 = 0,                      // 4x4 dct transform

-  TX_8X8 = 1,                      // 8x8 dct transform

-  TX_SIZE_MAX = 2                  // Number of differnt transforms avaialble

+  TX_4X4,                      // 4x4 dct transform

+  TX_8X8,                      // 8x8 dct transform

+#if CONFIG_TX16X16

+  TX_16X16,                    // 16x16 dct transform

+#endif

+  TX_SIZE_MAX                  // Number of different transforms available

 } TX_SIZE;

 #if CONFIG_HYBRIDTRANSFORM

--- a/vp8/common/coefupdateprobs.h

+++ b/vp8/common/coefupdateprobs.h

@@ -13,4 +13,7 @@

    Generated file included by entropy.c */

 #define COEF_UPDATE_PROB 252

 #define COEF_UPDATE_PROB_8X8 252

+#if CONFIG_TX16X16

+#define COEF_UPDATE_PROB_16X16 252

+#endif

--- a/vp8/common/default_coef_probs.h

+++ b/vp8/common/default_coef_probs.h

@@ -488,3 +488,211 @@

 #endif

};

+#if CONFIG_TX16X16

+static const vp8_prob

+vp8_default_coef_probs_16x16[BLOCK_TYPES_16X16]

+                            [COEF_BANDS]

+                            [PREV_COEF_CONTEXTS]

+                            [ENTROPY_NODES] =

+{

+  { /* block Type 0 */

+    { /* Coeff Band 0 */

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}

+    },

+    { /* Coeff Band 1 */

+      { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128},

+      { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128},

+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128},

+      { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}

+    },

+    { /* Coeff Band 2 */

+      { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128},

+      { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128},

+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128},

+      { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}

+    },

+    { /* Coeff Band 3 */

+      { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128},

+      { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128},

+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128},

+      { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}

+    },

+    { /* Coeff Band 4 */

+      { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128},

+      { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128},

+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128},

+      { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}

+    },

+    { /* Coeff Band 5 */

+      { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128},

+      { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128},

+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},

+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}

+    },

+    { /* Coeff Band 6 */

+      { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128},

+      { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128},

+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128},

+      { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}

+    },

+    { /* Coeff Band 7 */

+      { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128},

+      { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128},

+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128},

+      { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}

+    }

+  },

+  { /* block Type 1 */

+    { /* Coeff Band 0 */

+      { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128},

+      { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128},

+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128},

+      /* Fourth context: explicit placeholder (copy of the row above, as in

+       * the other bands) so the slot is not implicitly zero-initialized;

+       * 0 is not a usable probability. */

+      { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128}

+    },

+    { /* Coeff Band 1 */

+      { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128},

+      { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128},

+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128},

+      { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}

+    },

+    { /* Coeff Band 2 */

+      { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128},

+      { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128},

+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128},

+      { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}

+    },

+    { /* Coeff Band 3 */

+      { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128},

+      { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128},

+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128},

+      { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}

+    },

+    { /* Coeff Band 4 */

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}

+    },

+    { /* Coeff Band 5 */

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}

+    },

+    { /* Coeff Band 6 */

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}

+    },

+    { /* Coeff Band 7 */

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}

+    }

+  },

+  { /* block Type 2 */

+    { /* Coeff Band 0 */

+      { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128},

+      { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128},

+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128},

+      { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}

+    },

+    { /* Coeff Band 1 */

+      { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128},

+      { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128},

+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128},

+      { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}

+    },

+    { /* Coeff Band 2 */

+      { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128},

+      { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128},

+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128},

+      { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}

+    },

+    { /* Coeff Band 3 */

+      { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128},

+      { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128},

+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128},

+      { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}

+    },

+    { /* Coeff Band 4 */

+      { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128},

+      { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128},

+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128},

+      { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}

+    },

+    { /* Coeff Band 5 */

+      { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128},

+      { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128},

+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128},

+      { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}

+    },

+    { /* Coeff Band 6 */

+      { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128},

+      { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128},

+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128},

+      { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}

+    },

+    { /* Coeff Band 7 */

+      { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128},

+      { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128},

+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128},

+      { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}

+    }

+  },

+  { /* block Type 3 */

+    { /* Coeff Band 0 */

+      { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184},

+      { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200},

+      { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247},

+      { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}

+    },

+    { /* Coeff Band 1 */

+      { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128},

+      { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128},

+      { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255},

+      { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255}

+    },

+    { /* Coeff Band 2 */

+      { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128},

+      { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255},

+      { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255},

+      { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255}

+    },

+    { /* Coeff Band 3 */

+      { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128},

+      { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128},

+      { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128},

+      { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255}

+    },

+    { /* Coeff Band 4 */

+      { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128},

+      { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128},

+      { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128},

+      { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128}

+    },

+    { /* Coeff Band 5 */

+      { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128},

+      { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128},

+      { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128},

+      { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128}

+    },

+    { /* Coeff Band 6 */

+      { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128},

+      { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128},

+      { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128},

+      { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255}

+    },

+    { /* Coeff Band 7 */

+      { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128},

+      { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128},

+      { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128},

+      { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128}

+    }

+  }

+};

+#endif

--- a/vp8/common/entropy.c

+++ b/vp8/common/entropy.c

@@ -47,7 +47,7 @@

   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

};

-DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) = {

+DECLARE_ALIGNED(16, const int, vp8_coef_bands[16]) = {

   0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7

};

@@ -79,15 +79,15 @@

 #endif

-DECLARE_ALIGNED(64, cuchar, vp8_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5,

-                                                        5, 3, 6, 3, 5, 4, 6, 6,

-                                                        6, 5, 5, 6, 6, 6, 6, 6,

-                                                        6, 6, 6, 6, 6, 6, 6, 6,

-                                                        6, 6, 6, 6, 7, 7, 7, 7,

-                                                        7, 7, 7, 7, 7, 7, 7, 7,

-                                                        7, 7, 7, 7, 7, 7, 7, 7,

-                                                        7, 7, 7, 7, 7, 7, 7, 7

-                                                      };

+DECLARE_ALIGNED(64, const int, vp8_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5,

+                                                           5, 3, 6, 3, 5, 4, 6, 6,

+                                                           6, 5, 5, 6, 6, 6, 6, 6,

+                                                           6, 6, 6, 6, 6, 6, 6, 6,

+                                                           6, 6, 6, 6, 7, 7, 7, 7,

+                                                           7, 7, 7, 7, 7, 7, 7, 7,

+                                                           7, 7, 7, 7, 7, 7, 7, 7,

+                                                           7, 7, 7, 7, 7, 7, 7, 7

+                                                         };

 DECLARE_ALIGNED(64, const int, vp8_default_zig_zag1d_8x8[64]) = {

   0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,

   12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,

@@ -95,9 +95,46 @@

   58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,

};

+#if CONFIG_TX16X16

+/* Coefficient band (0-7) for each of the 256 coefficient positions of a

+ * 16x16 block (presumably indexed by scan position, like the smaller

+ * band tables - confirm).  The first 64 entries repeat vp8_coef_bands_8x8;

+ * the remaining 192 are all band 7.  Table can be optimized. */

+DECLARE_ALIGNED(16, const int, vp8_coef_bands_16x16[256]) = {

+    0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,

+    6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,

+    6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

+};

+DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d_16x16[256]) = {  /*

+   * Zig-zag scan order for a 16x16 block: entry n is the raster index

+   * (row*16 + column) of the n-th coefficient visited, walking the

+   * anti-diagonals from index 0 (top-left) to index 255 (bottom-right).

+   */

+      0,   1,  16,  32,  17,   2,   3,  18,  33,  48,  64,  49,  34,  19,   4,   5,

+     20,  35,  50,  65,  80,  96,  81,  66,  51,  36,  21,   6,   7,  22,  37,  52,

+     67,  82,  97, 112, 128, 113,  98,  83,  68,  53,  38,  23,   8,   9,  24,  39,

+     54,  69,  84,  99, 114, 129, 144, 160, 145, 130, 115, 100,  85,  70,  55,  40,

+     25,  10,  11,  26,  41,  56,  71,  86, 101, 116, 131, 146, 161, 176, 192, 177,

+    162, 147, 132, 117, 102,  87,  72,  57,  42,  27,  12,  13,  28,  43,  58,  73,

+     88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,

+    119, 104,  89,  74,  59,  44,  29,  14,  15,  30,  45,  60,  75,  90, 105, 120,

+    135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,

+    121, 106,  91,  76,  61,  46,  31,  47,  62,  77,  92, 107, 122, 137, 152, 167,

+    182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108,  93,

+     78,  63,  79,  94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,

+    215, 200, 185, 170, 155, 140, 125, 110,  95, 111, 126, 141, 156, 171, 186, 201,

+    216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,

+    203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,

+    250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,

+};

+#endif

-DECLARE_ALIGNED(16, short, vp8_default_zig_zag_mask[16]);

-DECLARE_ALIGNED(64, short, vp8_default_zig_zag_mask_8x8[64]);// int64_t

 /* Array indices are identical to previously-existing CONTEXT_NODE indices */

@@ -131,17 +168,6 @@

 static vp8_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26];

-void vp8_init_scan_order_mask() {

-  int i;

-  for (i = 0; i < 16; i++) {

-    vp8_default_zig_zag_mask[vp8_default_zig_zag1d[i]] = 1 << i;

-  }

-  for (i = 0; i < 64; i++) {

-    vp8_default_zig_zag_mask_8x8[vp8_default_zig_zag1d_8x8[i]] = 1 << i;

-  }

-}

 static void init_bit_tree(vp8_tree_index *p, int n) {

   int i = 0;

@@ -181,11 +207,15 @@

 void vp8_default_coef_probs(VP8_COMMON *pc) {

   vpx_memcpy(pc->fc.coef_probs, default_coef_probs,

-             sizeof(default_coef_probs));

+             sizeof(pc->fc.coef_probs));

   vpx_memcpy(pc->fc.coef_probs_8x8, vp8_default_coef_probs_8x8,

-             sizeof(vp8_default_coef_probs_8x8));

+             sizeof(pc->fc.coef_probs_8x8));

+#if CONFIG_TX16X16

+  vpx_memcpy(pc->fc.coef_probs_16x16, vp8_default_coef_probs_16x16,

+             sizeof(pc->fc.coef_probs_16x16));

+#endif

 void vp8_coef_tree_initialize() {

@@ -304,4 +334,27 @@

           else cm->fc.coef_probs_8x8[i][j][k][t] = prob;

+#if CONFIG_TX16X16

+  for (i = 0; i < BLOCK_TYPES_16X16; ++i)

+    for (j = 0; j < COEF_BANDS; ++j)

+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

+          continue;

+        vp8_tree_probs_from_distribution(

+          MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,

+          coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1);

+        for (t = 0; t < ENTROPY_NODES; ++t) {

+          int prob;

+          count = branch_ct[t][0] + branch_ct[t][1];

+          count = count > count_sat ? count_sat : count;

+          factor = (update_factor * count / count_sat);

+          prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] * (256 - factor) +

+                  (int)coef_probs[t] * factor + 128) >> 8;

+          if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1;

+          else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255;

+          else cm->fc.coef_probs_16x16[i][j][k][t] = prob;

+        }

+      }

+#endif

--- a/vp8/common/entropy.h

+++ b/vp8/common/entropy.h

@@ -62,19 +62,22 @@

 /* Outside dimension.  0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */

 #define BLOCK_TYPES 4

 #if CONFIG_HTRANS8X8

 #define BLOCK_TYPES_8X8 4

 #else

 #define BLOCK_TYPES_8X8 3

 #endif

+#define BLOCK_TYPES_16X16 4

 /* Middle dimension is a coarsening of the coefficient's

    position within the 4x4 DCT. */

 #define COEF_BANDS 8

-extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);

-extern DECLARE_ALIGNED(64, const unsigned char, vp8_coef_bands_8x8[64]);

+extern DECLARE_ALIGNED(16, const int, vp8_coef_bands[16]);

+extern DECLARE_ALIGNED(64, const int, vp8_coef_bands_8x8[64]);

+#if CONFIG_TX16X16

+extern DECLARE_ALIGNED(16, const int, vp8_coef_bands_16x16[256]);

+#endif

 /* Inside dimension is 3-valued measure of nearby complexity, that is,

    the extent to which nearby coefficients are nonzero.  For the first

@@ -113,8 +116,11 @@

 extern short vp8_default_zig_zag_mask[16];

 extern DECLARE_ALIGNED(64, const int, vp8_default_zig_zag1d_8x8[64]);

-extern short vp8_default_zig_zag_mask_8x8[64];// int64_t

 void vp8_coef_tree_initialize(void);

+#if CONFIG_TX16X16

+extern DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d_16x16[256]);

+#endif

 void vp8_adapt_coef_probs(struct VP8Common *);

 #endif

--- a/vp8/common/entropymode.c

+++ b/vp8/common/entropymode.c

@@ -249,11 +249,8 @@

 void vp8_init_mbmode_probs(VP8_COMMON *x) {

   unsigned int bct [VP8_YMODES] [2];      /* num Ymodes > num UV modes */

-  vp8_tree_probs_from_distribution(

-    VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,

-    x->fc.ymode_prob, bct, y_mode_cts,

-    256, 1

-  );

+  vp8_tree_probs_from_distribution(VP8_YMODES, vp8_ymode_encodings,

+    vp8_ymode_tree, x->fc.ymode_prob, bct, y_mode_cts, 256, 1);

     int i;

     for (i = 0; i < 8; i++)

@@ -260,8 +257,7 @@

       vp8_tree_probs_from_distribution(

         VP8_YMODES, vp8_kf_ymode_encodings, vp8_kf_ymode_tree,

         x->kf_ymode_prob[i], bct, kf_y_mode_cts[i],

-        256, 1

-      );

+        256, 1);

     int i;

@@ -295,13 +291,9 @@

 static void intra_bmode_probs_from_distribution(

   vp8_prob p [VP8_BINTRAMODES - 1],

   unsigned int branch_ct [VP8_BINTRAMODES - 1] [2],

-  const unsigned int events [VP8_BINTRAMODES]

-) {

-  vp8_tree_probs_from_distribution(

-    VP8_BINTRAMODES, vp8_bmode_encodings, vp8_bmode_tree,

-    p, branch_ct, events,

-    256, 1

-  );

+  const unsigned int events [VP8_BINTRAMODES]) {

+  vp8_tree_probs_from_distribution(VP8_BINTRAMODES, vp8_bmode_encodings,

+    vp8_bmode_tree, p, branch_ct, events, 256, 1);

 void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES - 1]) {

--- a/vp8/common/generic/systemdependent.c

+++ b/vp8/common/generic/systemdependent.c

@@ -32,6 +32,9 @@

   rtcd->idct.idct8        = vp8_short_idct8x8_c;

   rtcd->idct.idct1_scalar_add_8x8 = vp8_dc_only_idct_add_8x8_c;

   rtcd->idct.ihaar2       = vp8_short_ihaar2x2_c;

+#if CONFIG_TX16X16

+  rtcd->idct.idct16x16    = vp8_short_idct16x16_c;

+#endif

   rtcd->recon.copy16x16   = vp8_copy_mem16x16_c;

   rtcd->recon.copy8x8     = vp8_copy_mem8x8_c;

   rtcd->recon.avg16x16    = vp8_avg_mem16x16_c;

--- a/vp8/common/idct.h

+++ b/vp8/common/idct.h

@@ -36,6 +36,13 @@

 #define Y2_WHT_UPSCALE_FACTOR 2

 #endif

+#if CONFIG_TX16X16

+#ifndef vp8_idct_idct16x16

+#define vp8_idct_idct16x16 vp8_short_idct16x16_c

+#endif

+extern prototype_idct(vp8_idct_idct16x16);

+#endif

 #ifndef vp8_idct_idct8

 #define vp8_idct_idct8 vp8_short_idct8x8_c

 #endif

@@ -120,6 +127,10 @@

   vp8_idct_scalar_add_fn_t idct1_scalar_add_8x8;

   vp8_idct_fn_t ihaar2;

   vp8_idct_fn_t ihaar2_1;

+#if CONFIG_TX16X16

+  vp8_idct_fn_t            idct16x16;

+#endif

 } vp8_idct_rtcd_vtable_t;

 #if CONFIG_RUNTIME_CPU_DETECT

--- a/vp8/common/idctllm.c

+++ b/vp8/common/idctllm.c

@@ -647,3 +647,275 @@

   op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1;

+#if CONFIG_TX16X16

+#if 0

+// Keep a slow, direct (four nested loops per block) float version as reference for now.

+void vp8_short_idct16x16_c(short *input, short *output, int pitch) {

+  double x;

+  const int short_pitch = pitch >> 1;

+  int i, j, k, l;

+  for (l = 0; l < 16; ++l) {

+    for (k = 0; k < 16; ++k) {

+      double s = 0;

+      for (i = 0; i < 16; ++i) {

+        for (j = 0; j < 16; ++j) {

+          x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;

+          if (i != 0)

+            x *= sqrt(2.0);

+          if (j != 0)

+            x *= sqrt(2.0);

+          s += x;

+        }

+      }

+      output[k*short_pitch+l] = (short)round(s);

+    }

+  }

+}

+#endif

+static void butterfly_16x16_idct_1d(double input[16], double output[16]) {  // One 16-point 1-D pass in double precision: reads input[0..15], writes output[0..15]; input[] itself is never written.

+  double step[16];

+  double intermediate[16];

+  double temp1, temp2;

+  const double PI = M_PI;  // NOTE(review): M_PI is not an ISO C macro; presumably supplied by <math.h> on the supported toolchains - confirm

+  const double C1 = cos(1*PI/(double)32);  // Cn = cos(n * PI / 32) for n = 1..15

+  const double C2 = cos(2*PI/(double)32);

+  const double C3 = cos(3*PI/(double)32);

+  const double C4 = cos(4*PI/(double)32);

+  const double C5 = cos(5*PI/(double)32);

+  const double C6 = cos(6*PI/(double)32);

+  const double C7 = cos(7*PI/(double)32);

+  const double C8 = cos(8*PI/(double)32);

+  const double C9 = cos(9*PI/(double)32);

+  const double C10 = cos(10*PI/(double)32);

+  const double C11 = cos(11*PI/(double)32);

+  const double C12 = cos(12*PI/(double)32);

+  const double C13 = cos(13*PI/(double)32);

+  const double C14 = cos(14*PI/(double)32);

+  const double C15 = cos(15*PI/(double)32);

+  // step 1 and 2: the even-indexed inputs fill step[0..7]

+  step[ 0] = input[0] + input[8];

+  step[ 1] = input[0] - input[8];

+  temp1 = input[4]*C12;

+  temp2 = input[12]*C4;

+  temp1 -= temp2;

+  temp1 *= C8;

+  step[ 2] = 2*(temp1);

+  temp1 = input[4]*C4;

+  temp2 = input[12]*C12;

+  temp1 += temp2;

+  temp1 = (temp1);  // no-op self-assignment (redundant parentheses)

+  temp1 *= C8;

+  step[ 3] = 2*(temp1);

+  temp1 = input[2]*C8;

+  temp1 = 2*(temp1);

+  temp2 = input[6] + input[10];

+  step[ 4] = temp1 + temp2;

+  step[ 5] = temp1 - temp2;

+  temp1 = input[14]*C8;

+  temp1 = 2*(temp1);

+  temp2 = input[6] - input[10];

+  step[ 6] = temp2 - temp1;

+  step[ 7] = temp2 + temp1;

+  // for odd input: the odd-indexed inputs fill intermediate[8..15], which are then combined into step[8..15]

+  temp1 = input[3]*C12;

+  temp2 = input[13]*C4;

+  temp1 += temp2;

+  temp1 = (temp1);  // no-op self-assignment

+  temp1 *= C8;

+  intermediate[ 8] = 2*(temp1);

+  temp1 = input[3]*C4;

+  temp2 = input[13]*C12;

+  temp2 -= temp1;

+  temp2 = (temp2);  // no-op self-assignment

+  temp2 *= C8;

+  intermediate[ 9] = 2*(temp2);

+  intermediate[10] = 2*(input[9]*C8);

+  intermediate[11] = input[15] - input[1];

+  intermediate[12] = input[15] + input[1];

+  intermediate[13] = 2*((input[7]*C8));

+  temp1 = input[11]*C12;

+  temp2 = input[5]*C4;

+  temp2 -= temp1;

+  temp2 = (temp2);  // no-op self-assignment

+  temp2 *= C8;

+  intermediate[14] = 2*(temp2);

+  temp1 = input[11]*C4;

+  temp2 = input[5]*C12;

+  temp1 += temp2;

+  temp1 = (temp1);  // no-op self-assignment

+  temp1 *= C8;

+  intermediate[15] = 2*(temp1);

+  step[ 8] = intermediate[ 8] + intermediate[14];

+  step[ 9] = intermediate[ 9] + intermediate[15];

+  step[10] = intermediate[10] + intermediate[11];

+  step[11] = intermediate[10] - intermediate[11];

+  step[12] = intermediate[12] + intermediate[13];

+  step[13] = intermediate[12] - intermediate[13];

+  step[14] = intermediate[ 8] - intermediate[14];

+  step[15] = intermediate[ 9] - intermediate[15];

+  // step 3: output[] is used as scratch here; it is overwritten again in step 5

+  output[0] = step[ 0] + step[ 3];

+  output[1] = step[ 1] + step[ 2];

+  output[2] = step[ 1] - step[ 2];

+  output[3] = step[ 0] - step[ 3];

+  temp1 = step[ 4]*C14;

+  temp2 = step[ 7]*C2;

+  temp1 -= temp2;

+  output[4] =  (temp1);

+  temp1 = step[ 4]*C2;

+  temp2 = step[ 7]*C14;

+  temp1 += temp2;

+  output[7] =  (temp1);

+  temp1 = step[ 5]*C10;

+  temp2 = step[ 6]*C6;

+  temp1 -= temp2;

+  output[5] =  (temp1);

+  temp1 = step[ 5]*C6;

+  temp2 = step[ 6]*C10;

+  temp1 += temp2;

+  output[6] =  (temp1);

+  output[8] = step[ 8] + step[11];

+  output[9] = step[ 9] + step[10];

+  output[10] = step[ 9] - step[10];

+  output[11] = step[ 8] - step[11];

+  output[12] = step[12] + step[15];

+  output[13] = step[13] + step[14];

+  output[14] = step[13] - step[14];

+  output[15] = step[12] - step[15];

+  // step 4: step[] is refilled from the scratch values held in output[]

+  step[ 0] = output[0] + output[7];

+  step[ 1] = output[1] + output[6];

+  step[ 2] = output[2] + output[5];

+  step[ 3] = output[3] + output[4];

+  step[ 4] = output[3] - output[4];

+  step[ 5] = output[2] - output[5];

+  step[ 6] = output[1] - output[6];

+  step[ 7] = output[0] - output[7];

+  temp1 = output[8]*C7;

+  temp2 = output[15]*C9;

+  temp1 -= temp2;

+  step[ 8] = (temp1);

+  temp1 = output[9]*C11;

+  temp2 = output[14]*C5;

+  temp1 += temp2;

+  step[ 9] = (temp1);

+  temp1 = output[10]*C3;

+  temp2 = output[13]*C13;

+  temp1 -= temp2;

+  step[10] = (temp1);

+  temp1 = output[11]*C15;

+  temp2 = output[12]*C1;

+  temp1 += temp2;

+  step[11] = (temp1);

+  temp1 = output[11]*C1;

+  temp2 = output[12]*C15;

+  temp2 -= temp1;

+  step[12] = (temp2);

+  temp1 = output[10]*C13;

+  temp2 = output[13]*C3;

+  temp1 += temp2;

+  step[13] = (temp1);

+  temp1 = output[9]*C5;

+  temp2 = output[14]*C11;

+  temp2 -= temp1;

+  step[14] = (temp2);

+  temp1 = output[8]*C9;

+  temp2 = output[15]*C7;

+  temp1 += temp2;

+  step[15] = (temp1);

+  // step 5: final butterfly, output[k] = step[k] + step[15-k] and output[15-k] = step[k] - step[15-k] for k = 0..7

+  output[0] = (step[0] + step[15]);

+  output[1] = (step[1] + step[14]);

+  output[2] = (step[2] + step[13]);

+  output[3] = (step[3] + step[12]);

+  output[4] = (step[4] + step[11]);

+  output[5] = (step[5] + step[10]);

+  output[6] = (step[6] + step[ 9]);

+  output[7] = (step[7] + step[ 8]);

+  output[15] = (step[0] - step[15]);

+  output[14] = (step[1] - step[14]);

+  output[13] = (step[2] - step[13]);

+  output[12] = (step[3] - step[12]);

+  output[11] = (step[4] - step[11]);

+  output[10] = (step[5] - step[10]);

+  output[9] = (step[6] - step[ 9]);

+  output[8] = (step[7] - step[ 8]);

+}

+// Remove once an int version of iDCT is written

+#if 0

+void reference_16x16_idct_1d(double input[16], double output[16]) {

+  const double kPi = 3.141592653589793238462643383279502884;

+  const double kSqrt2 = 1.414213562373095048801688724209698;

+  for (int k = 0; k < 16; k++) {

+    output[k] = 0.0;

+    for (int n = 0; n < 16; n++) {

+      output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);

+      if (n == 0)

+        output[k] = output[k]/kSqrt2;

+    }

+  }

+}

+#endif

+void vp8_short_idct16x16_c(short *input, short *output, int pitch) {  // 2-D 16x16 inverse transform in double precision: a 1-D pass over every row, then over every column, then scale and round into output.

+  double out[16*16], out2[16*16];

+  const int short_pitch = pitch >> 1;  // used below only as the row stride (in shorts) when reading input

+  int i, j;

+    // First transform rows

+  for (i = 0; i < 16; ++i) {

+    double temp_in[16], temp_out[16];

+    for (j = 0; j < 16; ++j)

+      temp_in[j] = input[j + i*short_pitch];

+    butterfly_16x16_idct_1d(temp_in, temp_out);

+    for (j = 0; j < 16; ++j)

+      out[j + i*16] = temp_out[j];  // intermediate buffer always uses a row stride of 16

+  }

+  // Then transform columns

+  for (i = 0; i < 16; ++i) {

+    double temp_in[16], temp_out[16];

+    for (j = 0; j < 16; ++j)

+      temp_in[j] = out[j*16 + i];  // gather column i of the row-transformed data

+    butterfly_16x16_idct_1d(temp_in, temp_out);

+    for (j = 0; j < 16; ++j)

+      out2[j*16 + i] = temp_out[j];

+  }

+  for (i = 0; i < 16*16; ++i)

+    output[i] = round(out2[i]/128);  // divide by 128 and round; NOTE(review): output is written contiguously (row stride 16) whatever pitch is, while input rows use short_pitch - the two agree only for pitch == 32; confirm this is intended

+}

+#endif

--- a/vp8/common/invtrans.c

+++ b/vp8/common/invtrans.c

@@ -153,3 +153,33 @@

+#if CONFIG_TX16X16

+void vp8_inverse_transform_b_16x16(const vp8_idct_rtcd_vtable_t *rtcd,  // Inverse-transforms one 16x16 block through the idct16x16 entry resolved by IDCT_INVOKE.

+                                   short *input_dqcoeff,

+                                   short *output_coeff, int pitch) {

+  IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);  // all three arguments are forwarded unchanged

+}

+void vp8_inverse_transform_mby_16x16(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) {  // Single 16x16 inverse transform on block[0] of the macroblock.

+    vp8_inverse_transform_b_16x16(rtcd, &x->block[0].dqcoeff[0], &x->block[0].diff[0], 32);  // block[0].dqcoeff -> block[0].diff, pitch 32

+}

+// U,V blocks are 8x8 per macroblock, so just run 8x8

+void vp8_inverse_transform_mbuv_16x16(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) {  // Runs the 8x8 inverse transform on blocks 16 and 20 of the macroblock.

+  int i;

+  for (i = 16; i < 24; i += 4)  // i takes the values 16 and 20 only

+    vp8_inverse_transform_b_8x8(rtcd, &x->block[i].dqcoeff[0], &x->block[i].diff[0], 16);  // block[i].dqcoeff -> block[i].diff, pitch 16

+}

+void vp8_inverse_transform_mb_16x16(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) {  // Whole macroblock: one 16x16 inverse transform on block[0], then 8x8 inverse transforms on blocks 16 and 20.

+  int i;

+  // Luma

+  vp8_inverse_transform_b_16x16(rtcd, &x->block[0].dqcoeff[0], &x->block[0].diff[0], 32);  // same call as vp8_inverse_transform_mby_16x16

+  // U, V

+  // Chroma blocks are downscaled, so run an 8x8 on them.

+  for (i = 16; i < 24; i+= 4)  // i takes the values 16 and 20 only

+    vp8_inverse_transform_b_8x8(rtcd, &x->block[i].dqcoeff[0], &x->block[i].diff[0], 16);

+}

+#endif

--- a/vp8/common/invtrans.h

+++ b/vp8/common/invtrans.h

@@ -30,4 +30,12 @@

 extern void vp8_inverse_transform_mby_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

 extern void vp8_inverse_transform_mbuv_8x8(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

+#if CONFIG_TX16X16

+extern void vp8_inverse_transform_b_16x16(const vp8_idct_rtcd_vtable_t *rtcd,

+                                          short *input_dqcoeff, short *output_coeff,

+                                          int pitch);

+extern void vp8_inverse_transform_mb_16x16(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

+extern void vp8_inverse_transform_mby_16x16(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

+extern void vp8_inverse_transform_mbuv_16x16(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);

+#endif

 #endif

--- a/vp8/common/loopfilter.c

+++ b/vp8/common/loopfilter.c

@@ -329,7 +329,11 @@

             vp8_loop_filter_mbv_c

             (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

-          if (!skip_lf) {

+          if (!skip_lf

+#if CONFIG_TX16X16

+              && tx_type != TX_16X16

+#endif

+              ) {

             if (tx_type == TX_8X8)

               vp8_loop_filter_bv8x8_c

               (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

@@ -344,7 +348,11 @@

             vp8_loop_filter_mbh_c

             (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

-          if (!skip_lf) {

+          if (!skip_lf

+#if CONFIG_TX16X16

+              && tx_type != TX_16X16

+#endif

+              ) {

             if (tx_type == TX_8X8)

               vp8_loop_filter_bh8x8_c

               (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

@@ -353,6 +361,7 @@

               (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);

         } else {

+          // FIXME: Not 8x8 aware

           if (mb_col > 0)

             LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)

             (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

@@ -431,7 +440,6 @@

       const int seg = mode_info_context->mbmi.segment_id;

       const int ref_frame = mode_info_context->mbmi.ref_frame;

       int tx_type = mode_info_context->mbmi.txfm_size;

       filter_level = lfi_n->lvl[seg][ref_frame][mode_index];

       if (filter_level) {

@@ -446,7 +454,11 @@

             vp8_loop_filter_mbv_c

             (y_ptr, 0, 0, post->y_stride, 0, &lfi);

-          if (!skip_lf) {

+          if (!skip_lf

+#if CONFIG_TX16X16

+              && tx_type != TX_16X16

+#endif

+              ) {

             if (tx_type == TX_8X8)

               vp8_loop_filter_bv8x8_c

               (y_ptr, 0, 0, post->y_stride, 0, &lfi);

@@ -460,7 +472,11 @@

             vp8_loop_filter_mbh_c

             (y_ptr, 0, 0, post->y_stride, 0, &lfi);

-          if (!skip_lf) {

+          if (!skip_lf

+#if CONFIG_TX16X16

+              && tx_type != TX_16X16

+#endif

+              ) {

             if (tx_type == TX_8X8)

               vp8_loop_filter_bh8x8_c

               (y_ptr, 0, 0, post->y_stride, 0, &lfi);

@@ -469,6 +485,7 @@

               (y_ptr, 0, 0, post->y_stride, 0, &lfi);

         } else {

+          // FIXME: Not 8x8 aware

           if (mb_col > 0)

             LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)

             (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);

--- a/vp8/common/onyxc_int.h

+++ b/vp8/common/onyxc_int.h

@@ -52,6 +52,9 @@

   vp8_prob mbsplit_prob [VP8_NUMMBSPLITS - 1];

   vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

   vp8_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

+#if CONFIG_TX16X16

+  vp8_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

+#endif

   MV_CONTEXT mvc[2];

 #if CONFIG_HIGH_PRECISION_MV

   MV_CONTEXT_HP mvc_hp[2];

@@ -73,12 +76,22 @@

   unsigned int sub_mv_ref_counts [SUBMVREF_COUNT][VP8_SUBMVREFS];

   unsigned int mbsplit_counts [VP8_NUMMBSPLITS];

-  vp8_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

-  vp8_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

+  vp8_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS]

+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

+  vp8_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]

+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

+#if CONFIG_TX16X16

+  vp8_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]

+      [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

+#endif

   unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS]

-  [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

   unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS]

-  [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

+#if CONFIG_TX16X16

+  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS]

+      [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

+#endif

   unsigned int MVcount [2] [MVvals];

 #if CONFIG_HIGH_PRECISION_MV

   unsigned int MVcount_hp [2] [MVvals_hp];

--- a/vp8/decoder/decodframe.c

+++ b/vp8/decoder/decodframe.c

@@ -217,22 +217,46 @@

 #endif

   if (pbi->common.frame_type == KEY_FRAME) {

-    if (pbi->common.txfm_mode == ALLOW_8X8 &&

+#if CONFIG_TX16X16

+    if (xd->mode_info_context->mbmi.mode <= TM_PRED ||

+        xd->mode_info_context->mbmi.mode == NEWMV ||

+        xd->mode_info_context->mbmi.mode == ZEROMV ||

+        xd->mode_info_context->mbmi.mode == NEARMV ||

+        xd->mode_info_context->mbmi.mode == NEARESTMV)

+      xd->mode_info_context->mbmi.txfm_size = TX_16X16;

+    else if (pbi->common.txfm_mode == ALLOW_8X8 &&

         xd->mode_info_context->mbmi.mode != I8X8_PRED &&

         xd->mode_info_context->mbmi.mode != B_PRED)

+#else

+      if (pbi->common.txfm_mode == ALLOW_8X8 &&

+          xd->mode_info_context->mbmi.mode != I8X8_PRED &&

+          xd->mode_info_context->mbmi.mode != B_PRED)

+#endif

       xd->mode_info_context->mbmi.txfm_size = TX_8X8;

     else

       xd->mode_info_context->mbmi.txfm_size = TX_4X4;

   } else {

-    if (pbi->common.txfm_mode == ONLY_4X4) {

+#if CONFIG_TX16X16

+    if (xd->mode_info_context->mbmi.mode <= TM_PRED ||

+        xd->mode_info_context->mbmi.mode == NEWMV ||

+        xd->mode_info_context->mbmi.mode == ZEROMV ||

+        xd->mode_info_context->mbmi.mode == NEARMV ||

+        xd->mode_info_context->mbmi.mode == NEARESTMV) {

+      xd->mode_info_context->mbmi.txfm_size = TX_16X16;

+    } else if (pbi->common.txfm_mode == ALLOW_8X8 &&

+        xd->mode_info_context->mbmi.mode != I8X8_PRED &&

+        xd->mode_info_context->mbmi.mode != B_PRED &&

+        xd->mode_info_context->mbmi.mode != SPLITMV) {

+#else

+    if (pbi->common.txfm_mode == ALLOW_8X8 &&

+        xd->mode_info_context->mbmi.mode != I8X8_PRED &&

+        xd->mode_info_context->mbmi.mode != B_PRED &&

+        xd->mode_info_context->mbmi.mode != SPLITMV) {

+#endif

+      xd->mode_info_context->mbmi.txfm_size = TX_8X8;

+    }

+    else {

       xd->mode_info_context->mbmi.txfm_size = TX_4X4;

-    } else if (pbi->common.txfm_mode == ALLOW_8X8) {

-      if (xd->mode_info_context->mbmi.mode == B_PRED

-          || xd->mode_info_context->mbmi.mode == I8X8_PRED

-          || xd->mode_info_context->mbmi.mode == SPLITMV)

-        xd->mode_info_context->mbmi.txfm_size = TX_4X4;

-      else

-        xd->mode_info_context->mbmi.txfm_size = TX_8X8;

@@ -251,6 +275,11 @@

       xd->block[i].eob = 0;

       xd->eobs[i] = 0;

+#if CONFIG_TX16X16

+    if (tx_type == TX_16X16)

+      eobtotal = vp8_decode_mb_tokens_16x16(pbi, xd);

+    else

+#endif

     if (tx_type == TX_8X8)

       eobtotal = vp8_decode_mb_tokens_8x8(pbi, xd);

     else

@@ -462,6 +491,15 @@

      xd->dst.y_stride, xd->eobs);

   } else {

     BLOCKD *b = &xd->block[24];

+#if CONFIG_TX16X16

+    if (tx_type == TX_16X16) {

+      vp8_dequant_idct_add_16x16_c(xd->qcoeff, xd->block[0].dequant,

+                                   xd->predictor, xd->dst.y_buffer,

+                                   16, xd->dst.y_stride);

+    }

+    else

+#endif

     if (tx_type == TX_8X8) {

       DEQUANT_INVOKE(&pbi->dequant, block_2x2)(b);

 #ifdef DEC_DEBUG

@@ -511,7 +549,11 @@

-  if (tx_type == TX_8X8)

+  if (tx_type == TX_8X8

+#if CONFIG_TX16X16

+      || tx_type == TX_16X16

+#endif

+      )

     DEQUANT_INVOKE(&pbi->dequant, idct_add_uv_block_8x8) //

     (xd->qcoeff + 16 * 16, xd->block[16].dequant,

      xd->predictor + 16 * 16, xd->dst.u_buffer, xd->dst.v_buffer,

@@ -900,7 +942,7 @@

-    }

+        }

   if (pbi->common.txfm_mode == ALLOW_8X8 && vp8_read_bit(bc)) {

@@ -921,6 +963,28 @@

+#if CONFIG_TX16X16

+  // 16x16

+  if (vp8_read_bit(bc)) {

+    // read coef probability tree

+    for (i = 0; i < BLOCK_TYPES_16X16; ++i)

+      for (j = !i; j < COEF_BANDS; ++j)

+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+          if (k >= 3 && ((i == 0 && j == 1) ||

+                         (i > 0 && j == 0)))

+            continue;

+          for (l = 0; l < ENTROPY_NODES; ++l) {

+            vp8_prob *const p = pc->fc.coef_probs_16x16[i][j][k] + l;

+            if (vp8_read(bc, COEF_UPDATE_PROB_16X16)) {

+              *p = read_prob_diff_update(bc, *p);

+            }

+          }

+        }

+  }

+#endif

 int vp8_decode_frame(VP8D_COMP *pbi) {

@@ -1281,6 +1345,9 @@

   vp8_copy(pbi->common.fc.pre_coef_probs, pbi->common.fc.coef_probs);

   vp8_copy(pbi->common.fc.pre_coef_probs_8x8, pbi->common.fc.coef_probs_8x8);

+#if CONFIG_TX16X16

+  vp8_copy(pbi->common.fc.pre_coef_probs_16x16, pbi->common.fc.coef_probs_16x16);

+#endif

   vp8_copy(pbi->common.fc.pre_ymode_prob, pbi->common.fc.ymode_prob);

   vp8_copy(pbi->common.fc.pre_uv_mode_prob, pbi->common.fc.uv_mode_prob);

   vp8_copy(pbi->common.fc.pre_bmode_prob, pbi->common.fc.bmode_prob);

@@ -1293,6 +1360,9 @@

 #endif

   vp8_zero(pbi->common.fc.coef_counts);

   vp8_zero(pbi->common.fc.coef_counts_8x8);

+#if CONFIG_TX16X16

+  vp8_zero(pbi->common.fc.coef_counts_16x16);

+#endif

   vp8_zero(pbi->common.fc.ymode_counts);

   vp8_zero(pbi->common.fc.uv_mode_counts);

   vp8_zero(pbi->common.fc.bmode_counts);

--- a/vp8/decoder/dequantize.c

+++ b/vp8/decoder/dequantize.c

@@ -422,3 +422,39 @@

 #endif

+#if CONFIG_TX16X16

+void vp8_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,  // Dequantizes the 256 coefficients in input in place, inverse-transforms them, adds pred and writes the result clamped to [0, 255] into dest; input is zeroed afterwards.

+                                  unsigned char *dest, int pitch, int stride) {  // pitch: row stride of pred; stride: row stride of dest

+  short output[256];

+  short *diff_ptr = output;

+  int r, c, i;

+  input[0]= input[0] * dq[0];  // coefficient 0 is scaled by dq[0]

+  // every remaining coefficient (1..255) of the 16x16 block is scaled by dq[1]

+  for (i = 1; i < 256; i++)

+    input[i] = input[i] * dq[1];

+  // the idct halves ( >> 1) the pitch, so 32 gives a row stride of 16 shorts

+  vp8_short_idct16x16_c(input, output, 32);

+  vpx_memset(input, 0, 256 * sizeof(input[0]));  // clear all 256 coefficients; sized from the element type instead of a hard-coded byte count

+  for (r = 0; r < 16; r++) {

+    for (c = 0; c < 16; c++) {

+      int a = diff_ptr[c] + pred[c];  // residual + prediction

+      if (a < 0)

+        a = 0;

+      else if (a > 255)

+        a = 255;

+      dest[c] = (unsigned char) a;

+    }

+    dest += stride;

+    diff_ptr += 16;  // the residual buffer is a dense 16x16 array

+    pred += pitch;

+  }

+}

+#endif

--- a/vp8/decoder/dequantize.h

+++ b/vp8/decoder/dequantize.h

@@ -145,6 +145,12 @@

 #endif

 extern prototype_dequant_idct_add_uv_block_8x8(vp8_dequant_idct_add_uv_block_8x8);

+#if CONFIG_TX16X16

+#ifndef vp8_dequant_idct_add_16x16

+#define vp8_dequant_idct_add_16x16 vp8_dequant_idct_add_16x16_c

+#endif

+extern prototype_dequant_idct_add(vp8_dequant_idct_add_16x16);

+#endif

 typedef prototype_dequant_block((*vp8_dequant_block_fn_t));

@@ -178,6 +184,9 @@

   vp8_dequant_dc_idct_add_y_block_fn_t_8x8 dc_idct_add_y_block_8x8;

   vp8_dequant_idct_add_y_block_fn_t_8x8    idct_add_y_block_8x8;

   vp8_dequant_idct_add_uv_block_fn_t_8x8   idct_add_uv_block_8x8;

+#if CONFIG_TX16X16

+  vp8_dequant_idct_add_fn_t            idct_add_16x16;

+#endif

 } vp8_dequant_rtcd_vtable_t;

 #if CONFIG_RUNTIME_CPU_DETECT

--- a/vp8/decoder/detokenize.c

+++ b/vp8/decoder/detokenize.c

@@ -22,13 +22,13 @@

 #define OCB_X PREV_COEF_CONTEXTS * ENTROPY_NODES

-DECLARE_ALIGNED(16, int, coef_bands_x[16]) = {

+DECLARE_ALIGNED(16, const int, coef_bands_x[16]) = {

   0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X,

   6 * OCB_X, 4 * OCB_X, 5 * OCB_X, 6 * OCB_X,

   6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,

   6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X

};

-DECLARE_ALIGNED(16, int, coef_bands_x_8x8[64]) = {

+DECLARE_ALIGNED(16, const int, coef_bands_x_8x8[64]) = {

   0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X,

   5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,

   6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,

@@ -39,6 +39,27 @@

   7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

};

+#if CONFIG_TX16X16

+DECLARE_ALIGNED(16, const int, coef_bands_x_16x16[256]) = {  // 256 entries, each a band number (0..7) times OCB_X; bands below 7 occur only in the first 36 entries, every later entry is band 7

+  0 * OCB_X, 1 * OCB_X, 2 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 4 * OCB_X, 5 * OCB_X, 5 * OCB_X, 3 * OCB_X, 6 * OCB_X, 3 * OCB_X, 5 * OCB_X, 4 * OCB_X, 6 * OCB_X, 6 * OCB_X,

+  6 * OCB_X, 5 * OCB_X, 5 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X,

+  6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 6 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X,

+  7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X, 7 * OCB_X

+};

+#endif

 #define EOB_CONTEXT_NODE            0

 #define ZERO_CONTEXT_NODE           1

 #define ONE_CONTEXT_NODE            2

@@ -81,9 +102,13 @@

 void vp8_reset_mb_tokens_context(MACROBLOCKD *x) {

   /* Clear entropy contexts for Y2 blocks */

-  if (x->mode_info_context->mbmi.mode != B_PRED &&

+  if ((x->mode_info_context->mbmi.mode != B_PRED &&

       x->mode_info_context->mbmi.mode != I8X8_PRED &&

-      x->mode_info_context->mbmi.mode != SPLITMV) {

+      x->mode_info_context->mbmi.mode != SPLITMV)

+#if CONFIG_TX16X16

+      || x->mode_info_context->mbmi.txfm_size == TX_16X16

+#endif

+      ) {

     vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));

     vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));

   } else {

@@ -200,7 +225,28 @@

+#if CONFIG_TX16X16

+static void count_tokens_16x16(INT16 *qcoeff_ptr, int block, int type,  // tallies one 16x16 block's tokens into fc->coef_counts_16x16

+                               ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,  // *a and *l seed the starting context pt

+                               int eob, int seg_eob, FRAME_CONTEXT *fc) {  // note: 'block' is not referenced in the body

+  int c, pt, token;

+  VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);  // sets pt from *a and *l (macro defined elsewhere)

+  for (c = !type; c < eob; ++c) {  // starts at 1 when type == 0, otherwise at 0

+    int rc = vp8_default_zig_zag1d_16x16[c];  // scan position -> index into qcoeff_ptr

+    int v = qcoeff_ptr[rc];

+    int band = vp8_coef_bands_16x16[c];

+    token = get_token(v);

+    fc->coef_counts_16x16[type][band][pt][token]++;

+    pt = vp8_prev_token_class[token];  // context used for the next token

+  }

+  if (eob < seg_eob) {  // block stopped before seg_eob: count the EOB token as well

+    int band = vp8_coef_bands_16x16[c];

+    fc->coef_counts_16x16[type][band][pt][DCT_EOB_TOKEN]++;

+  }

+}

+#endif

 static int vp8_get_signed(BOOL_DECODER *br, int value_to_sign) {

   const int split = (br->range + 1) >> 1;

   const VP8_BD_VALUE bigsplit = (VP8_BD_VALUE)split << (VP8_BD_VALUE_SIZE - 8);

@@ -224,16 +270,16 @@

   return v;

-#define WRITE_COEF_CONTINUE(val)                                  \

-  {                                                             \

-    Prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\

+#define WRITE_COEF_CONTINUE(val)                              \

+  {                                                           \

+    prob = coef_probs + (ENTROPY_NODES*PREV_CONTEXT_INC(val));\

     qcoeff_ptr[scan[c]] = (INT16) vp8_get_signed(br, val);    \

     c++;                                                      \

     continue;                                                 \

-#define ADJUST_COEF(prob, bits_count)      \

-  do {                                   \

+#define ADJUST_COEF(prob, bits_count)  \

+  do {                                 \

     if (vp8_read(br, prob))            \

       val += (UINT16)(1 << bits_count);\

   } while (0);

@@ -246,48 +292,59 @@

   FRAME_CONTEXT *const fc = &dx->common.fc;

   BOOL_DECODER *br = xd->current_bc;

   int tmp, c = (type == 0);

-  const vp8_prob *Prob;

-  const vp8_prob *coef_probs =

-    (block_type == TX_4X4) ? fc->coef_probs[type][0][0]

-    : fc->coef_probs_8x8[type][0][0];

+  const vp8_prob *prob, *coef_probs;

+  switch (block_type) {

+    case TX_4X4:

+      coef_probs = fc->coef_probs[type][0][0];

+      break;

+    case TX_8X8:

+      coef_probs = fc->coef_probs_8x8[type][0][0];

+      break;

+#if CONFIG_TX16X16

+    default:

+      coef_probs = fc->coef_probs_16x16[type][0][0];

+      break;

+#endif

+  }

   VP8_COMBINEENTROPYCONTEXTS(tmp, *a, *l);

-  Prob = coef_probs + tmp * ENTROPY_NODES;

+  prob = coef_probs + tmp * ENTROPY_NODES;

   while (1) {

     int val;

     const uint8_t *cat6 = cat6_prob;

     if (c == seg_eob) break;

-    Prob += coef_bands[c];

-    if (!vp8_read(br, Prob[EOB_CONTEXT_NODE]))

+    prob += coef_bands[c];

+    if (!vp8_read(br, prob[EOB_CONTEXT_NODE]))

       break;

-  SKIP_START:

+SKIP_START:

     if (c == seg_eob) break;

-    if (!vp8_read(br, Prob[ZERO_CONTEXT_NODE])) {

+    if (!vp8_read(br, prob[ZERO_CONTEXT_NODE])) {

       ++c;

-      Prob = coef_probs + coef_bands[c];

+      prob = coef_probs + coef_bands[c];

       goto SKIP_START;

     // ONE_CONTEXT_NODE_0_

-    if (!vp8_read(br, Prob[ONE_CONTEXT_NODE])) {

-      Prob = coef_probs + ENTROPY_NODES;

+    if (!vp8_read(br, prob[ONE_CONTEXT_NODE])) {

+      prob = coef_probs + ENTROPY_NODES;

       qcoeff_ptr[scan[c]] = (INT16) vp8_get_signed(br, 1);

       ++c;

       continue;

     // LOW_VAL_CONTEXT_NODE_0_

-    if (!vp8_read(br, Prob[LOW_VAL_CONTEXT_NODE])) {

-      if (!vp8_read(br, Prob[TWO_CONTEXT_NODE])) {

+    if (!vp8_read(br, prob[LOW_VAL_CONTEXT_NODE])) {

+      if (!vp8_read(br, prob[TWO_CONTEXT_NODE])) {

         WRITE_COEF_CONTINUE(2);

-      if (!vp8_read(br, Prob[THREE_CONTEXT_NODE])) {

+      if (!vp8_read(br, prob[THREE_CONTEXT_NODE])) {

         WRITE_COEF_CONTINUE(3);

       WRITE_COEF_CONTINUE(4);

     // HIGH_LOW_CONTEXT_NODE_0_

-    if (!vp8_read(br, Prob[HIGH_LOW_CONTEXT_NODE])) {

-      if (!vp8_read(br, Prob[CAT_ONE_CONTEXT_NODE])) {

+    if (!vp8_read(br, prob[HIGH_LOW_CONTEXT_NODE])) {

+      if (!vp8_read(br, prob[CAT_ONE_CONTEXT_NODE])) {

         val = CAT1_MIN_VAL;

         ADJUST_COEF(CAT1_PROB0, 0);

         WRITE_COEF_CONTINUE(val);

@@ -298,8 +355,8 @@

       WRITE_COEF_CONTINUE(val);

     // CAT_THREEFOUR_CONTEXT_NODE_0_

-    if (!vp8_read(br, Prob[CAT_THREEFOUR_CONTEXT_NODE])) {

-      if (!vp8_read(br, Prob[CAT_THREE_CONTEXT_NODE])) {

+    if (!vp8_read(br, prob[CAT_THREEFOUR_CONTEXT_NODE])) {

+      if (!vp8_read(br, prob[CAT_THREE_CONTEXT_NODE])) {

         val = CAT3_MIN_VAL;

         ADJUST_COEF(CAT3_PROB2, 2);

         ADJUST_COEF(CAT3_PROB1, 1);

@@ -314,7 +371,7 @@

       WRITE_COEF_CONTINUE(val);

     // CAT_FIVE_CONTEXT_NODE_0_:

-    if (!vp8_read(br, Prob[CAT_FIVE_CONTEXT_NODE])) {

+    if (!vp8_read(br, prob[CAT_FIVE_CONTEXT_NODE])) {

       val = CAT5_MIN_VAL;

       ADJUST_COEF(CAT5_PROB4, 4);

       ADJUST_COEF(CAT5_PROB3, 3);

@@ -331,18 +388,81 @@

     WRITE_COEF_CONTINUE(val);

-  if (block_type == TX_4X4)

+  if (block_type == TX_4X4) {

 #if CONFIG_HYBRIDTRANSFORM

     count_tokens_adaptive_scan(xd, qcoeff_ptr, i, type, a, l, c, seg_eob, fc);

 #else

     count_tokens(qcoeff_ptr, i, type, a, l, c, seg_eob, fc);

 #endif

-  else

+  }

+  else if (block_type == TX_8X8)

     count_tokens_8x8(qcoeff_ptr, i, type, a, l, c, seg_eob, fc);

+#if CONFIG_TX16X16

+  else

+    count_tokens_16x16(qcoeff_ptr, i, type, a, l, c, seg_eob, fc);

+#endif

   return c;

+#if CONFIG_TX16X16

+int vp8_decode_mb_tokens_16x16(VP8D_COMP *pbi, MACROBLOCKD *xd) {  // decodes one 16x16 luma block and two 8x8 chroma blocks; returns the sum of their EOBs

+  ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;

+  ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;

+  char* const eobs = xd->eobs;  // NOTE(review): c may reach seg_eob (256 by default) but eobs is char -- confirm the store cannot truncate

+  int c, i, type, eobtotal = 0, seg_eob;

+  const int segment_id = xd->mode_info_context->mbmi.segment_id;

+  const int seg_active = segfeature_active(xd, segment_id, SEG_LVL_EOB);

+  INT16 *qcoeff_ptr = &xd->qcoeff[0];

+  type = PLANE_TYPE_Y_WITH_DC;

+  if (seg_active)

+      seg_eob = get_segdata(xd, segment_id, SEG_LVL_EOB);

+  else

+      seg_eob = 256;  // default limit: all 256 coefficients of the 16x16 block

+  // Luma block: a single call with TX_16X16 and the 16x16 zig-zag scan

+  {

+    const int* const scan = vp8_default_zig_zag1d_16x16;

+    c = vp8_decode_coefs(pbi, xd, A, L, type, seg_eob, qcoeff_ptr,

+                         0, scan, TX_16X16, coef_bands_x_16x16);

+    eobs[0] = c;

+    *A = *L = (c != !type);  // context flag: 1 when c differs from !type, else 0

+    for (i = 1; i < 16; i++) {  // copy that flag to the entries mapped for blocks 1..15

+      *(A + vp8_block2above[i]) = *(A);

+      *(L +  vp8_block2left[i]) = *(L);

+    }

+    eobtotal += c;

+  }

+  // 8x8 chroma blocks: i = 16 and i = 20

+  qcoeff_ptr += 256;  // step past the luma coefficients

+  type = PLANE_TYPE_UV;

+  if (seg_active)

+    seg_eob = get_segdata(xd, segment_id, SEG_LVL_EOB);

+  else

+    seg_eob = 64;  // default limit: all 64 coefficients of an 8x8 block

+  for (i = 16; i < 24; i += 4) {

+    ENTROPY_CONTEXT* const a = A + vp8_block2above_8x8[i];

+    ENTROPY_CONTEXT* const l = L + vp8_block2left_8x8[i];

+    const int* const scan = vp8_default_zig_zag1d_8x8;

+    c = vp8_decode_coefs(pbi, xd, a, l, type, seg_eob, qcoeff_ptr,

+                         i, scan, TX_8X8, coef_bands_x_8x8);

+    a[0] = l[0] = ((eobs[i] = c) != !type);  // record the EOB and set the context flag in one step

+    a[1] = a[0];  // duplicate the flag into the adjacent context entry

+    l[1] = l[0];

+    eobtotal += c;

+    qcoeff_ptr += 64;

+  }

+  vpx_memset(&A[8], 0, sizeof(A[8]));  // clears entry 8 -- presumably the Y2 slot, which this path never decodes; TODO confirm

+  vpx_memset(&L[8], 0, sizeof(L[8]));

+  return eobtotal;

+}

+#endif

 int vp8_decode_mb_tokens_8x8(VP8D_COMP *pbi, MACROBLOCKD *xd) {

   ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;

   ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;

@@ -464,7 +584,6 @@

     c = vp8_decode_coefs(dx, xd, a, l, type, seg_eob, qcoeff_ptr + 24 * 16, 24,

                          scan, TX_4X4, coef_bands_x);

     a[0] = l[0] = ((eobs[24] = c) != !type);

     eobtotal += c - 16;

     type = PLANE_TYPE_Y_NO_DC;

--- a/vp8/decoder/detokenize.h

+++ b/vp8/decoder/detokenize.h

@@ -17,5 +17,8 @@

 void vp8_reset_mb_tokens_context(MACROBLOCKD *x);

 int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *);

 int vp8_decode_mb_tokens_8x8(VP8D_COMP *, MACROBLOCKD *);

+#if CONFIG_TX16X16

+int vp8_decode_mb_tokens_16x16(VP8D_COMP *, MACROBLOCKD *);

+#endif

 #endif /* DETOKENIZE_H */

--- a/vp8/decoder/generic/dsystemdependent.c

+++ b/vp8/decoder/generic/dsystemdependent.c

@@ -22,6 +22,9 @@

   pbi->mb.rtcd                     = &pbi->common.rtcd;

   pbi->dequant.block_2x2           = vp8_dequantize_b_2x2_c;

   pbi->dequant.idct_add_8x8        = vp8_dequant_idct_add_8x8_c;

+#if CONFIG_TX16X16

+  pbi->dequant.idct_add_16x16      = vp8_dequant_idct_add_16x16_c;

+#endif

   pbi->dequant.dc_idct_add_8x8     = vp8_dequant_dc_idct_add_8x8_c;

   pbi->dequant.dc_idct_add_y_block_8x8 = vp8_dequant_dc_idct_add_y_block_8x8_c;

   pbi->dequant.idct_add_y_block_8x8 = vp8_dequant_idct_add_y_block_8x8_c;

--- a/vp8/decoder/onyxd_int.h

+++ b/vp8/decoder/onyxd_int.h

@@ -54,6 +54,9 @@

   vp8_prob const *coef_probs[BLOCK_TYPES];

   vp8_prob const *coef_probs_8x8[BLOCK_TYPES_8X8];

+#if CONFIG_TX16X16

+  vp8_prob const *coef_probs_16X16[BLOCK_TYPES_16X16];

+#endif

   UINT8 eob[25];

--- a/vp8/encoder/bitstream.c

+++ b/vp8/encoder/bitstream.c

@@ -42,6 +42,12 @@

                                   [COEF_BANDS]

                                   [PREV_COEF_CONTEXTS]

                                   [ENTROPY_NODES] [2];

+#if CONFIG_TX16X16

+unsigned int tree_update_hist_16x16 [BLOCK_TYPES_16X16]

+                                    [COEF_BANDS]

+                                    [PREV_COEF_CONTEXTS]

+                                    [ENTROPY_NODES] [2];

+#endif

 extern unsigned int active_section;

 #endif

@@ -1283,15 +1289,13 @@

 void build_coeff_contexts(VP8_COMP *cpi) {

-  int i = 0;

-  do {

-    int j = 0;

-    do {

-      int k = 0;

-      do {

+  int i = 0, j, k;

 #ifdef ENTROPY_STATS

-        int t;

+  int t = 0;

 #endif

+  for (i = 0; i < BLOCK_TYPES; ++i) {

+    for (j = 0; j < COEF_BANDS; ++j) {

+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

         if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

           continue;

         vp8_tree_probs_from_distribution(

@@ -1302,33 +1306,23 @@

           256, 1

);

 #ifdef ENTROPY_STATS

-        if (!cpi->dummy_packing) {

-          t = 0;

-          do {

-            context_counters [i][j][k][t] +=

-              cpi->coef_counts [i][j][k][t];

-          } while (++t < MAX_ENTROPY_TOKENS);

-        }

+        if (!cpi->dummy_packing)

+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

+            context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t];

 #endif

-      } while (++k < PREV_COEF_CONTEXTS);

-    } while (++j < COEF_BANDS);

-  } while (++i < BLOCK_TYPES);

+      }

+    }

+  }

-  i = 0;

   if (cpi->common.txfm_mode == ALLOW_8X8) {

-    do {

-      int j = 0;      /* token/prob index */

-      do {

-        int k = 0;

-        do {

+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {

+      for (j = 0; j < COEF_BANDS; ++j) {

+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

           /* at every context */

           /* calc probs and branch cts for this frame only */

           // vp8_prob new_p           [ENTROPY_NODES];

           // unsigned int branch_ct   [ENTROPY_NODES] [2];

-#ifdef ENTROPY_STATS

-          int t = 0;      /* token/prob index */

-#endif

           if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

             continue;

           vp8_tree_probs_from_distribution(

@@ -1339,20 +1333,36 @@

             256, 1

);

 #ifdef ENTROPY_STATS

-          if (!cpi->dummy_packing) {

-            t = 0;

-            do {

-              context_counters_8x8 [i][j][k][t] +=

-                cpi->coef_counts_8x8 [i][j][k][t];

-            } while (++t < MAX_ENTROPY_TOKENS);

-          }

+          if (!cpi->dummy_packing)

+            for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

+              context_counters_8x8[i][j][k][t] += cpi->coef_counts_8x8[i][j][k][t];

 #endif

-        } while (++k < PREV_COEF_CONTEXTS);

-      } while (++j < COEF_BANDS);

-    } while (++i < BLOCK_TYPES_8X8);

+        }

+      }

+    }

+#if CONFIG_TX16X16

+  //16x16

+  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {

+    for (j = 0; j < COEF_BANDS; ++j) {

+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+        if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

+          continue;

+        vp8_tree_probs_from_distribution(

+          MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,

+          cpi->frame_coef_probs_16x16[i][j][k],

+          cpi->frame_branch_ct_16x16[i][j][k],

+          cpi->coef_counts_16x16[i][j][k], 256, 1);

+#ifdef ENTROPY_STATS

+        if (!cpi->dummy_packing)

+          for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

+            context_counters_16x16[i][j][k][t] += cpi->coef_counts_16x16[i][j][k][t];

+#endif

+      }

+    }

+  }

+#endif

 static void update_coef_probs3(VP8_COMP *cpi) {

@@ -1696,7 +1706,7 @@

 static void update_coef_probs(VP8_COMP *cpi) {

-  int i = 0;

+  int i, j, k, t;

   vp8_writer *const w = & cpi->bc;

   int update[2] = {0, 0};

   int savings;

@@ -1704,7 +1714,6 @@

   vp8_clear_system_state(); // __asm emms;

   // Build the cofficient contexts based on counts collected in encode loop

   build_coeff_contexts(cpi);

   // vp8_prob bestupd = find_coef_update_prob(cpi);

@@ -1711,14 +1720,11 @@

   /* dry run to see if there is any udpate at all needed */

   savings = 0;

-  do {

-    int j = !i;

-    do {

-      int k = 0;

+  for (i = 0; i < BLOCK_TYPES; ++i) {

+    for (j = !i; j < COEF_BANDS; ++j) {

       int prev_coef_savings[ENTROPY_NODES] = {0};

-      do {

-        int t = 0;      /* token/prob index */

-        do {

+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+        for (t = 0; t < ENTROPY_NODES; ++t) {

           vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];

           vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;

           const vp8_prob upd = COEF_UPDATE_PROB;

@@ -1747,29 +1753,23 @@

 #endif

           update[u]++;

-        } while (++t < ENTROPY_NODES);

-      } while (++k < PREV_COEF_CONTEXTS);

-    } while (++j < COEF_BANDS);

-  } while (++i < BLOCK_TYPES);

+        }

+      }

+    }

+  }

   // printf("Update %d %d, savings %d\n", update[0], update[1], savings);

   /* Is coef updated at all */

   if (update[1] == 0 || savings < 0)

-  {

     vp8_write_bit(w, 0);

-  } else {

+  else {

     vp8_write_bit(w, 1);

-    i = 0;

-    do {

-      int j = !i;

-      do {

-        int k = 0;

+    for (i = 0; i < BLOCK_TYPES; ++i) {

+      for (j = !i; j < COEF_BANDS; ++j) {

         int prev_coef_savings[ENTROPY_NODES] = {0};

-        do {

+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

           // calc probs and branch cts for this frame only

-          int t = 0;      /* token/prob index */

-          do {

+          for (t = 0; t < ENTROPY_NODES; ++t) {

             vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];

             vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;

             const vp8_prob upd = COEF_UPDATE_PROB;

@@ -1791,8 +1791,6 @@

             if (s > 0)

               u = 1;

 #endif

             vp8_write(w, u, upd);

 #ifdef ENTROPY_STATS

             if (!cpi->dummy_packing)

@@ -1803,28 +1801,23 @@

               write_prob_diff_update(w, newp, *Pold);

               *Pold = newp;

-          } while (++t < ENTROPY_NODES);

-        } while (++k < PREV_COEF_CONTEXTS);

-      } while (++j < COEF_BANDS);

-    } while (++i < BLOCK_TYPES);

+          }

+        }

+      }

+    }

-  /* do not do this if not evena allowed */

+  /* do not do this if not even allowed */

   if (cpi->common.txfm_mode == ALLOW_8X8) {

     /* dry run to see if update is necessary */

     update[0] = update[1] = 0;

     savings = 0;

-    i = 0;

-    do {

-      int j = !i;

-      do {

-        int k = 0;

-        do {

+    for (i = 0; i < BLOCK_TYPES_8X8; ++i) {

+      for (j = !i; j < COEF_BANDS; ++j) {

+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

           // calc probs and branch cts for this frame only

-          int t = 0;      /* token/prob index */

-          do {

+          for (t = 0; t < ENTROPY_NODES; ++t) {

             const unsigned int *ct  = cpi->frame_branch_ct_8x8 [i][j][k][t];

             vp8_prob newp = cpi->frame_coef_probs_8x8 [i][j][k][t];

             vp8_prob *Pold = cpi->common.fc.coef_probs_8x8 [i][j][k] + t;

@@ -1846,26 +1839,20 @@

             if (u)

               savings += s;

 #endif

             update[u]++;

-          } while (++t < MAX_ENTROPY_TOKENS - 1);

-        } while (++k < PREV_COEF_CONTEXTS);

-      } while (++j < COEF_BANDS);

-    } while (++i < BLOCK_TYPES_8X8);

+          }

+        }

+      }

+    }

     if (update[1] == 0 || savings < 0)

-    {

       vp8_write_bit(w, 0);

-    } else {

+    else {

       vp8_write_bit(w, 1);

-      i = 0;

-      do {

-        int j = !i;

-        do {

-          int k = 0;

-          do {

-            int t = 0;      /* token/prob index */

-            do {

+      for (i = 0; i < BLOCK_TYPES_8X8; ++i) {

+        for (j = !i; j < COEF_BANDS; ++j) {

+          for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+            for (t = 0; t < ENTROPY_NODES; ++t) {

               const unsigned int *ct  = cpi->frame_branch_ct_8x8 [i][j][k][t];

               vp8_prob newp = cpi->frame_coef_probs_8x8 [i][j][k][t];

               vp8_prob *Pold = cpi->common.fc.coef_probs_8x8 [i][j][k] + t;

@@ -1892,12 +1879,90 @@

                 write_prob_diff_update(w, newp, oldp);

                 *Pold = newp;

-            } while (++t < MAX_ENTROPY_TOKENS - 1);

-          } while (++k < PREV_COEF_CONTEXTS);

-        } while (++j < COEF_BANDS);

-      } while (++i < BLOCK_TYPES_8X8);

+            }

+          }

+        }

+      }

+#if CONFIG_TX16X16

+  // 16x16

+  /* dry run to see if update is necessary */

+  update[0] = update[1] = 0;

+  savings = 0;

+  for (i = 0; i < BLOCK_TYPES_16X16; ++i) {

+    for (j = !i; j < COEF_BANDS; ++j) {

+      for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+        // calc probs and branch cts for this frame only

+        for (t = 0; t < ENTROPY_NODES; ++t) {

+          const unsigned int *ct  = cpi->frame_branch_ct_16x16[i][j][k][t];

+          vp8_prob newp = cpi->frame_coef_probs_16x16[i][j][k][t];

+          vp8_prob *Pold = cpi->common.fc.coef_probs_16x16[i][j][k] + t;

+          const vp8_prob oldp = *Pold;

+          int s, u;

+          const vp8_prob upd = COEF_UPDATE_PROB_16X16;

+          if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0)))

+            continue;

+#if defined(SEARCH_NEWP)

+          s = prob_diff_update_savings_search(ct, oldp, &newp, upd);

+          u = s > 0 && newp != oldp ? 1 : 0;

+          if (u)

+            savings += s - (int)(vp8_cost_zero(upd));

+          else

+            savings -= (int)(vp8_cost_zero(upd));

+#else

+          s = prob_update_savings(ct, oldp, newp, upd);

+          u = s > 0 ? 1 : 0;

+          if (u)

+            savings += s;

+#endif

+          update[u]++;

+        }

+      }

+    }

+  }

+  if (update[1] == 0 || savings < 0)

+    vp8_write_bit(w, 0);

+  else {

+    vp8_write_bit(w, 1);

+    for (i = 0; i < BLOCK_TYPES_16X16; ++i) {

+      for (j = !i; j < COEF_BANDS; ++j) {

+        for (k = 0; k < PREV_COEF_CONTEXTS; ++k) {

+          for (t = 0; t < ENTROPY_NODES; ++t) {

+            const unsigned int *ct  = cpi->frame_branch_ct_16x16[i][j][k][t];

+            vp8_prob newp = cpi->frame_coef_probs_16x16[i][j][k][t];

+            vp8_prob *Pold = cpi->common.fc.coef_probs_16x16[i][j][k] + t;

+            const vp8_prob oldp = *Pold;

+            const vp8_prob upd = COEF_UPDATE_PROB_16X16;

+            int s, u;

+            if (k >= 3 && ((i == 0 && j == 1) ||

+                           (i > 0 && j == 0)))

+              continue;

+#if defined(SEARCH_NEWP)

+            s = prob_diff_update_savings_search(ct, oldp, &newp, upd);

+            u = s > 0 && newp != oldp ? 1 : 0;

+#else

+            s = prob_update_savings(ct, oldp, newp, upd);

+            u = s > 0 ? 1 : 0;

+#endif

+            vp8_write(w, u, upd);

+#ifdef ENTROPY_STATS

+            if (!cpi->dummy_packing)

+              ++tree_update_hist_16x16[i][j][k][t][u];

+#endif

+            if (u) {

+              /* send/use new probability */

+              write_prob_diff_update(w, newp, oldp);

+              *Pold = newp;

+            }

+          }

+        }

+      }

+    }

+  }

+#endif

 #ifdef PACKET_TESTING

@@ -2308,12 +2373,10 @@

     vp8_write_bit(bc, pc->refresh_last_frame);

 #ifdef ENTROPY_STATS

   if (pc->frame_type == INTER_FRAME)

     active_section = 0;

   else

     active_section = 7;

 #endif

   vp8_clear_system_state();  // __asm emms;

@@ -2320,6 +2383,9 @@

   vp8_copy(cpi->common.fc.pre_coef_probs, cpi->common.fc.coef_probs);

   vp8_copy(cpi->common.fc.pre_coef_probs_8x8, cpi->common.fc.coef_probs_8x8);

+#if CONFIG_TX16X16

+  vp8_copy(cpi->common.fc.pre_coef_probs_16x16, cpi->common.fc.coef_probs_16x16);

+#endif

   vp8_copy(cpi->common.fc.pre_ymode_prob, cpi->common.fc.ymode_prob);

   vp8_copy(cpi->common.fc.pre_uv_mode_prob, cpi->common.fc.uv_mode_prob);

   vp8_copy(cpi->common.fc.pre_bmode_prob, cpi->common.fc.bmode_prob);

@@ -2399,24 +2465,20 @@

   FILE *f = fopen("coefupdprob.h", "w");

   int Sum;

   fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");

   fprintf(f, "const vp8_prob\n"

           "vp8_coef_update_probs[BLOCK_TYPES]\n"

           "                     [COEF_BANDS]\n"

           "                     [PREV_COEF_CONTEXTS]\n"

           "                     [ENTROPY_NODES] = {\n");

   for (i = 0; i < BLOCK_TYPES; i++) {

     fprintf(f, "  { \n");

     for (j = 0; j < COEF_BANDS; j++) {

       fprintf(f, "    {\n");

       for (k = 0; k < PREV_COEF_CONTEXTS; k++) {

         fprintf(f, "      {");

         for (l = 0; l < ENTROPY_NODES; l++) {

           Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1];

           if (Sum > 0) {

             if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0)

               fprintf(f, "%3ld, ", (tree_update_hist[i][j][k][l][0] * 255) / Sum);

@@ -2425,16 +2487,12 @@

           } else

             fprintf(f, "%3ld, ", 128);

         fprintf(f, "},\n");

       fprintf(f, "    },\n");

     fprintf(f, "  },\n");

   fprintf(f, "};\n");

   fprintf(f, "const vp8_prob\n"

@@ -2442,20 +2500,14 @@

           "                         [COEF_BANDS]\n"

           "                         [PREV_COEF_CONTEXTS]\n"

           "                         [ENTROPY_NODES] = {\n");

   for (i = 0; i < BLOCK_TYPES_8X8; i++) {

     fprintf(f, "  { \n");

     for (j = 0; j < COEF_BANDS; j++) {

       fprintf(f, "    {\n");

       for (k = 0; k < PREV_COEF_CONTEXTS; k++) {

         fprintf(f, "      {");

         for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {

           Sum = tree_update_hist_8x8[i][j][k][l][0] + tree_update_hist_8x8[i][j][k][l][1];

           if (Sum > 0) {

             if (((tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum) > 0)

               fprintf(f, "%3ld, ", (tree_update_hist_8x8[i][j][k][l][0] * 255) / Sum);

@@ -2464,20 +2516,50 @@

           } else

             fprintf(f, "%3ld, ", 128);

         fprintf(f, "},\n");

       fprintf(f, "    },\n");

+    fprintf(f, "  },\n");

+  }

+#if CONFIG_TX16X16

+  fprintf(f, "const vp8_prob\n"

+          "vp8_coef_update_probs_16x16[BLOCK_TYPES_16X16]\n"

+          "                           [COEF_BANDS]\n"

+          "                           [PREV_COEF_CONTEXTS]\n"

+          "                           [ENTROPY_NODES] = {\n");

+  for (i = 0; i < BLOCK_TYPES_16X16; i++) {

+    fprintf(f, "  { \n");

+    for (j = 0; j < COEF_BANDS; j++) {

+      fprintf(f, "    {\n");

+      for (k = 0; k < PREV_COEF_CONTEXTS; k++) {

+        fprintf(f, "      {");

+        for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++) {

+          Sum = tree_update_hist_16x16[i][j][k][l][0] + tree_update_hist_16x16[i][j][k][l][1];

+          if (Sum > 0) {

+            if (((tree_update_hist_16x16[i][j][k][l][0] * 255) / Sum) > 0)

+              fprintf(f, "%3ld, ", (tree_update_hist_16x16[i][j][k][l][0] * 255) / Sum);

+            else

+              fprintf(f, "%3ld, ", 1);

+          } else

+            fprintf(f, "%3ld, ", 128);

+        }

+        fprintf(f, "},\n");

+      }

+      fprintf(f, "    },\n");

+    }

     fprintf(f, "  },\n");

+#endif

   fclose(f);

   f = fopen("treeupdate.bin", "wb");

   fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);

   fwrite(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);

+#if CONFIG_TX16X16

+  fwrite(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);

+#endif

   fclose(f);

 #endif

--- a/vp8/encoder/block.h

+++ b/vp8/encoder/block.h

@@ -35,8 +35,14 @@

   unsigned char *quant_shift;

   short *zbin;

   short *zbin_8x8;

+#if CONFIG_TX16X16

+  short *zbin_16x16;

+#endif

   short *zrun_zbin_boost;

   short *zrun_zbin_boost_8x8;

+#if CONFIG_TX16X16

+  short *zrun_zbin_boost_16x16;

+#endif

   short *round;

   // Zbin Over Quant value

@@ -49,7 +55,9 @@

   int eob_max_offset;

   int eob_max_offset_8x8;

+#if CONFIG_TX16X16

+  int eob_max_offset_16x16;

+#endif

 } BLOCK;

 typedef struct {

@@ -153,9 +161,13 @@

 #endif

   unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS]

-  [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];

+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];

   unsigned int token_costs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]

-  [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];

+#if CONFIG_TX16X16

+  unsigned int token_costs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]

+    [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];

+#endif

   int optimize;

   int q_index;

@@ -176,7 +188,13 @@

   void (*quantize_b)(BLOCK *b, BLOCKD *d);

   void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);

   void (*vp8_short_fdct8x8)(short *input, short *output, int pitch);

+#if CONFIG_TX16X16

+  void (*vp8_short_fdct16x16)(short *input, short *output, int pitch);

+#endif

   void (*short_fhaar2x2)(short *input, short *output, int pitch);

+#if CONFIG_TX16X16

+  void (*quantize_b_16x16)(BLOCK *b, BLOCKD *d);

+#endif

   void (*quantize_b_8x8)(BLOCK *b, BLOCKD *d);

   void (*quantize_b_2x2)(BLOCK *b, BLOCKD *d);

--- a/vp8/encoder/dct.c

+++ b/vp8/encoder/dct.c

@@ -575,3 +575,205 @@

   vp8_short_walsh4x4_x8_c(input + 4, output + 16, pitch);

 #endif

+#if CONFIG_TX16X16

+static void dct16x16_1d(double input[16], double output[16]) {  // one 16-point 1-D DCT pass in double precision: reads input[0..15], writes output[0..15]

+  double step[16];  // scratch; output[] is also used as scratch between steps

+  double intermediate[16];  // only entries 8, 9 and 11..15 are ever used

+  double temp1, temp2;

+  const double PI = 3.1415926535898;  // each C<k> below is cos(k*PI/32)

+  const double C1 = cos(1*PI/(double)32);

+  const double C2 = cos(2*PI/(double)32);

+  const double C3 = cos(3*PI/(double)32);

+  const double C4 = cos(4*PI/(double)32);

+  const double C5 = cos(5*PI/(double)32);

+  const double C6 = cos(6*PI/(double)32);

+  const double C7 = cos(7*PI/(double)32);

+  const double C8 = cos(8*PI/(double)32);

+  const double C9 = cos(9*PI/(double)32);

+  const double C10 = cos(10*PI/(double)32);

+  const double C11 = cos(11*PI/(double)32);

+  const double C12 = cos(12*PI/(double)32);

+  const double C13 = cos(13*PI/(double)32);

+  const double C14 = cos(14*PI/(double)32);

+  const double C15 = cos(15*PI/(double)32);

+  // step 1: sums (step[0..7]) and differences (step[8..15]) of mirrored input pairs

+  step[ 0] = input[0] + input[15];

+  step[ 1] = input[1] + input[14];

+  step[ 2] = input[2] + input[13];

+  step[ 3] = input[3] + input[12];

+  step[ 4] = input[4] + input[11];

+  step[ 5] = input[5] + input[10];

+  step[ 6] = input[6] + input[ 9];

+  step[ 7] = input[7] + input[ 8];

+  step[ 8] = input[7] - input[ 8];

+  step[ 9] = input[6] - input[ 9];

+  step[10] = input[5] - input[10];

+  step[11] = input[4] - input[11];

+  step[12] = input[3] - input[12];

+  step[13] = input[2] - input[13];

+  step[14] = input[1] - input[14];

+  step[15] = input[0] - input[15];

+  // step 2: sum/difference butterfly on step[0..7]; cosine-weighted pairs on step[8..15]

+  output[0] = step[0] + step[7];

+  output[1] = step[1] + step[6];

+  output[2] = step[2] + step[5];

+  output[3] = step[3] + step[4];

+  output[4] = step[3] - step[4];

+  output[5] = step[2] - step[5];

+  output[6] = step[1] - step[6];

+  output[7] = step[0] - step[7];

+  temp1 = step[ 8]*C7;

+  temp2 = step[15]*C9;

+  output[ 8] = temp1 + temp2;

+  temp1 = step[ 9]*C11;

+  temp2 = step[14]*C5;

+  output[ 9] = temp1 - temp2;

+  temp1 = step[10]*C3;

+  temp2 = step[13]*C13;

+  output[10] = temp1 + temp2;

+  temp1 = step[11]*C15;

+  temp2 = step[12]*C1;

+  output[11] = temp1 - temp2;

+  temp1 = step[11]*C1;

+  temp2 = step[12]*C15;

+  output[12] = temp2 + temp1;

+  temp1 = step[10]*C13;

+  temp2 = step[13]*C3;

+  output[13] = temp2 - temp1;

+  temp1 = step[ 9]*C5;

+  temp2 = step[14]*C11;

+  output[14] = temp2 + temp1;

+  temp1 = step[ 8]*C9;

+  temp2 = step[15]*C7;

+  output[15] = temp2 - temp1;

+  // step 3: results go back into step[], reading the step-2 values held in output[]

+  step[ 0] = output[0] + output[3];

+  step[ 1] = output[1] + output[2];

+  step[ 2] = output[1] - output[2];

+  step[ 3] = output[0] - output[3];

+  temp1 = output[4]*C14;

+  temp2 = output[7]*C2;

+  step[ 4] = temp1 + temp2;

+  temp1 = output[5]*C10;

+  temp2 = output[6]*C6;

+  step[ 5] = temp1 + temp2;

+  temp1 = output[5]*C6;

+  temp2 = output[6]*C10;

+  step[ 6] = temp2 - temp1;

+  temp1 = output[4]*C2;

+  temp2 = output[7]*C14;

+  step[ 7] = temp2 - temp1;

+  step[ 8] = output[ 8] + output[11];

+  step[ 9] = output[ 9] + output[10];

+  step[10] = output[ 9] - output[10];

+  step[11] = output[ 8] - output[11];

+  step[12] = output[12] + output[15];

+  step[13] = output[13] + output[14];

+  step[14] = output[13] - output[14];

+  step[15] = output[12] - output[15];

+  // step 4: write the final output[] values from step[] and intermediate[]

+  output[ 0] = (step[ 0] + step[ 1]);

+  output[ 8] = (step[ 0] - step[ 1]);

+  temp1 = step[2]*C12;

+  temp2 = step[3]*C4;

+  temp1 = temp1 + temp2;

+  output[ 4] = 2*(temp1*C8);

+  temp1 = step[2]*C4;

+  temp2 = step[3]*C12;

+  temp1 = temp2 - temp1;

+  output[12] = 2*(temp1*C8);

+  output[ 2] = 2*((step[4] + step[ 5])*C8);

+  output[14] = 2*((step[7] - step[ 6])*C8);

+  temp1 = step[4] - step[5];

+  temp2 = step[6] + step[7];

+  output[ 6] = (temp1 + temp2);

+  output[10] = (temp1 - temp2);

+  intermediate[8] = step[8] + step[14];

+  intermediate[9] = step[9] + step[15];

+  temp1 = intermediate[8]*C12;

+  temp2 = intermediate[9]*C4;

+  temp1 = temp1 - temp2;

+  output[3] = 2*(temp1*C8);

+  temp1 = intermediate[8]*C4;

+  temp2 = intermediate[9]*C12;

+  temp1 = temp2 + temp1;

+  output[13] = 2*(temp1*C8);

+  output[ 9] = 2*((step[10] + step[11])*C8);

+  intermediate[11] = step[10] - step[11];

+  intermediate[12] = step[12] + step[13];

+  intermediate[13] = step[12] - step[13];

+  intermediate[14] = step[ 8] - step[14];

+  intermediate[15] = step[ 9] - step[15];

+  output[15] = (intermediate[11] + intermediate[12]);

+  output[ 1] = -(intermediate[11] - intermediate[12]);

+  output[ 7] = 2*(intermediate[13]*C8);

+  temp1 = intermediate[14]*C12;

+  temp2 = intermediate[15]*C4;

+  temp1 = temp1 - temp2;

+  output[11] = -2*(temp1*C8);

+  temp1 = intermediate[14]*C4;

+  temp2 = intermediate[15]*C12;

+  temp1 = temp2 + temp1;

+  output[ 5] = 2*(temp1*C8);

+}

+void vp8_short_fdct16x16_c(short *input, short *out, int pitch) {  // 2-D 16x16 transform: dct16x16_1d over columns, then rows, then halve and round into out[0..255]

+    int shortpitch = pitch >> 1;  // row stride in shorts; presumably pitch is given in bytes -- confirm with callers

+    int i, j;

+    double output[256];  // 16x16 working buffer, row-major with stride 16

+    // First transform columns

+    for (i = 0; i < 16; i++) {

+        double temp_in[16], temp_out[16];

+        for (j = 0; j < 16; j++)

+            temp_in[j] = input[j*shortpitch + i];  // gather column i of the input

+        dct16x16_1d(temp_in, temp_out);

+        for (j = 0; j < 16; j++)

+            output[j*16 + i] = temp_out[j];  // store the result back as column i

+    }

+    // Then transform rows

+    for (i = 0; i < 16; ++i) {

+        double temp_in[16], temp_out[16];

+        for (j = 0; j < 16; ++j)

+            temp_in[j] = output[j + i*16];

+        dct16x16_1d(temp_in, temp_out);

+        for (j = 0; j < 16; ++j)

+            output[j + i*16] = temp_out[j];  // row i is overwritten in place

+    }

+    // Scale by some magic number (each value is halved, then rounded with round())

+    for (i = 0; i < 256; i++)

+        out[i] = (short)round(output[i]/2);

+}

+#endif

--- a/vp8/encoder/dct.h

+++ b/vp8/encoder/dct.h

@@ -28,6 +28,13 @@

 void vp8_fht8x4_c(short *input, short *output, int pitch, TX_TYPE tx_type);

 #endif

+#if CONFIG_TX16X16

+#ifndef vp8_fdct_short16x16

+#define vp8_fdct_short16x16 vp8_short_fdct16x16_c

+#endif

+extern prototype_fdct(vp8_fdct_short16x16);

+#endif

 #ifndef vp8_fdct_short8x8

 #define vp8_fdct_short8x8  vp8_short_fdct8x8_c

 #endif

@@ -71,6 +78,9 @@

 typedef prototype_fdct(*vp8_fdct_fn_t);

 typedef struct {

+#if CONFIG_TX16X16

+  vp8_fdct_fn_t    short16x16;

+#endif

   vp8_fdct_fn_t    short8x8;

   vp8_fdct_fn_t    haar_short2x2;

   vp8_fdct_fn_t    short4x4;

--- a/vp8/encoder/encodeframe.c

+++ b/vp8/encoder/encodeframe.c

@@ -1132,6 +1132,9 @@

 #endif

   vp8_zero(cpi->coef_counts);

   vp8_zero(cpi->coef_counts_8x8);

+#if CONFIG_TX16X16

+  vp8_zero(cpi->coef_counts_16x16);

+#endif

   vp8cx_frame_init_quantizer(cpi);

@@ -1437,6 +1440,13 @@

   /* test code: set transform size based on mode selection */

+#if CONFIG_TX16X16

+  if (x->e_mbd.mode_info_context->mbmi.mode <= TM_PRED) {

+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_16X16;

+    cpi->t16x16_count++;

+  }

+  else

+#endif

   if (cpi->common.txfm_mode == ALLOW_8X8

       && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED

       && x->e_mbd.mode_info_context->mbmi.mode != B_PRED) {

@@ -1470,12 +1480,9 @@

 extern void vp8_fix_contexts(MACROBLOCKD *x);

-void vp8cx_encode_inter_macroblock

-(

-  VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,

-  int recon_yoffset, int recon_uvoffset,

-  int output_enabled

-) {

+void vp8cx_encode_inter_macroblock (VP8_COMP *cpi, MACROBLOCK *x,

+                                    TOKENEXTRA **t, int recon_yoffset,

+                                    int recon_uvoffset, int output_enabled) {

   VP8_COMMON *cm = &cpi->common;

   MACROBLOCKD *const xd = &x->e_mbd;

   unsigned char *segment_id = &xd->mode_info_context->mbmi.segment_id;

@@ -1523,6 +1530,16 @@

   set_pred_flag(xd, PRED_REF, ref_pred_flag);

   /* test code: set transform size based on mode selection */

+#if CONFIG_TX16X16

+  if (x->e_mbd.mode_info_context->mbmi.mode <= TM_PRED ||

+      x->e_mbd.mode_info_context->mbmi.mode == NEWMV ||

+      x->e_mbd.mode_info_context->mbmi.mode == ZEROMV ||

+      x->e_mbd.mode_info_context->mbmi.mode == NEARMV ||

+      x->e_mbd.mode_info_context->mbmi.mode == NEARESTMV) {

+    x->e_mbd.mode_info_context->mbmi.txfm_size = TX_16X16;

+    cpi->t16x16_count++;

+  } else

+#endif

   if (cpi->common.txfm_mode == ALLOW_8X8

       && x->e_mbd.mode_info_context->mbmi.mode != I8X8_PRED

       && x->e_mbd.mode_info_context->mbmi.mode != B_PRED

--- a/vp8/encoder/encodeintra.c

+++ b/vp8/encoder/encodeintra.c

@@ -160,11 +160,21 @@

   ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_transform_intra_mby_16x16(x);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_transform_intra_mby_8x8(x);

   else

     vp8_transform_intra_mby(x);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_quantize_mby_16x16(x);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_quantize_mby_8x8(x);

   else

@@ -171,6 +181,11 @@

     vp8_quantize_mby(x);

   if (x->optimize) {

+#if CONFIG_TX16X16

+    if (tx_type == TX_16X16)

+      vp8_optimize_mby_16x16(x, rtcd);

+    else

+#endif

     if (tx_type == TX_8X8)

       vp8_optimize_mby_8x8(x, rtcd);

     else

@@ -177,6 +192,11 @@

       vp8_optimize_mby(x, rtcd);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

   else

@@ -220,6 +240,9 @@

 void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {

   int tx_type = x->e_mbd.mode_info_context->mbmi.txfm_size;

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16) tx_type = TX_8X8; // 16x16 for U and V should default to 8x8 behavior.

+#endif

 #if CONFIG_COMP_INTRA_PRED

   if (x->e_mbd.mode_info_context->mbmi.second_uv_mode == (MB_PREDICTION_MODE)(DC_PRED - 1)) {

 #endif

--- a/vp8/encoder/encodemb.c

+++ b/vp8/encoder/encodemb.c

@@ -282,7 +282,43 @@

+#if CONFIG_TX16X16

+/* Transform step for the mbuv blocks of a macroblock that uses the 16x16
+ * transform: there is no 16x16 path here, so blocks 16 and 20 are each run
+ * through the 8x8 forward DCT (pitch argument 16).
+ */
+void vp8_transform_mbuv_16x16(MACROBLOCK *x) {
+  int blk = 16;
+  vp8_clear_system_state();
+  while (blk < 24) {
+    BLOCK *const cur = &x->block[blk];
+    x->vp8_short_fdct8x8(cur->src_diff, cur->coeff, 16);
+    blk += 4;
+  }
+}

+/* Runs the 16x16 forward DCT over block 0's src_diff buffer and stores the
+ * result in block 0's coeff buffer; the pitch argument is 32.
+ */
+void vp8_transform_intra_mby_16x16(MACROBLOCK *x) {
+  BLOCK *const first = &x->block[0];
+  vp8_clear_system_state();
+  x->vp8_short_fdct16x16(first->src_diff, first->coeff, 32);
+}

+/* Transforms a whole macroblock for the 16x16 mode: one 16x16 forward DCT
+ * on block 0 (pitch 32), then an 8x8 forward DCT on blocks 16 and 20
+ * (pitch 16).
+ */
+void vp8_transform_mb_16x16(MACROBLOCK *x) {
+  BLOCK *const blocks = &x->block[0];
+  int n;
+  vp8_clear_system_state();
+  x->vp8_short_fdct16x16(blocks[0].src_diff, blocks[0].coeff, 32);
+  for (n = 16; n <= 20; n += 4)
+    x->vp8_short_fdct8x8(blocks[n].src_diff, blocks[n].coeff, 16);
+}

+/* 16x16 forward DCT of block 0: src_diff in, coeff out, pitch argument 32. */
+void vp8_transform_mby_16x16(MACROBLOCK *x) {
+  short *const residual = &x->block[0].src_diff[0];
+  short *const coefficients = &x->block[0].coeff[0];
+  vp8_clear_system_state();
+  x->vp8_short_fdct16x16(residual, coefficients, 32);
+}

+#endif

 #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )

 #define RDTRUNC_8x8(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF )

 typedef struct vp8_token_state vp8_token_state;

@@ -290,7 +326,7 @@

 struct vp8_token_state {

   int           rate;

   int           error;

-  signed char   next;

+  int           next;

   signed char   token;

   short         qc;

};

@@ -1017,6 +1053,237 @@

+#if CONFIG_TX16X16

+/* Recomputes rd_cost0 and rd_cost1 from (rate0, error0) and (rate1, error1)
+ * using RDCOST. If the two costs come out equal, both are replaced by their
+ * RDTRUNC values so the comparison that follows can still separate them.
+ * Expects rdmult, rddiv, rate0, rate1, error0, error1, rd_cost0 and
+ * rd_cost1 to be in scope where the macro is expanded.
+ */
+#define UPDATE_RD_COST()\

+{\

+    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\

+    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\

+    if (rd_cost0 == rd_cost1) {\

+        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\

+        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\

+    }\

+}

+/* Trellis optimization of the quantized coefficients of one 16x16 block.
+ *
+ * Scan positions are visited from eob-1 down to 0 in
+ * vp8_default_zig_zag1d_16x16 order. Every non-zero quantized coefficient
+ * gets two candidate states: the value as quantized, and -- when the
+ * dequantized magnitude overshoots the source coefficient by less than one
+ * quantizer step -- the value moved one step toward zero. Candidates are
+ * compared by rate-distortion cost (UPDATE_RD_COST). The cheapest chain is
+ * then written back to d->qcoeff / d->dqcoeff and d->eob is updated.
+ *
+ * mb    macroblock supplying the block, token costs and RD multipliers
+ * i     index of the block in mb->block / mb->e_mbd.block; reused as the
+ *       scan position once the block pointers have been taken
+ * type  plane type; indexes plane_rd_mult and token_costs_16x16
+ * a, l  entropy contexts; combined to cost the first token, and both
+ *       overwritten with (d->eob != !type) before returning
+ * rtcd  not used by this function
+ */
+void optimize_b_16x16(MACROBLOCK *mb, int i, int type,
+                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
+                      const VP8_ENCODER_RTCD *rtcd) {
+  BLOCK *b = &mb->block[i];
+  BLOCKD *d = &mb->e_mbd.block[i];
+  /* One extra row (index 256) so a full block still has a sentinel node. */
+  vp8_token_state tokens[257][2];
+  unsigned best_index[257][2];
+  const short *dequant_ptr = d->dequant, *coeff_ptr = b->coeff;
+  /* Plain initializers: no self-assignment inside the declaration. */
+  short *qcoeff_ptr = d->qcoeff;
+  short *dqcoeff_ptr = d->dqcoeff;
+  int eob = d->eob, final_eob, sz = 0;
+  int rc, x, next;
+  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
+  int rate0, rate1, error0, error1, t0, t1;
+  int best, band, pt;
+  int err_mult = plane_rd_mult[type];
+  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
+  rdmult = mb->rdmult * err_mult;
+  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+      rdmult = (rdmult * 9)>>4;
+  rddiv = mb->rddiv;
+  memset(best_index, 0, sizeof(best_index));
+  /* Initialize the sentinel node of the trellis. */
+  tokens[eob][0].rate = 0;
+  tokens[eob][0].error = 0;
+  tokens[eob][0].next = 256;
+  tokens[eob][0].token = DCT_EOB_TOKEN;
+  tokens[eob][0].qc = 0;
+  *(tokens[eob] + 1) = *(tokens[eob] + 0);
+  next = eob;
+  for (i = eob; i-- > 0;) {
+    int base_bits, d2, dx;
+    rc = vp8_default_zig_zag1d_16x16[i];
+    x = qcoeff_ptr[rc];
+    /* Only add a trellis state for non-zero coefficients. */
+    if (x) {
+      int shortcut = 0;
+      error0 = tokens[next][0].error;
+      error1 = tokens[next][1].error;
+      /* Evaluate the first possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      t0 = (vp8_dct_value_tokens_ptr + x)->Token;
+      /* Consider both possible successor states. */
+      if (next < 256) {
+        band = vp8_coef_bands_16x16[i + 1];
+        pt = vp8_prev_token_class[t0];
+        rate0 += mb->token_costs_16x16[type][band][pt][tokens[next][0].token];
+        rate1 += mb->token_costs_16x16[type][band][pt][tokens[next][1].token];
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp8_dct_value_cost_ptr + x);
+      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
+      d2 = dx*dx;
+      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][0].error = d2 + (best ? error1 : error0);
+      tokens[i][0].next = next;
+      tokens[i][0].token = t0;
+      tokens[i][0].qc = x;
+      best_index[i][0] = best;
+      /* Evaluate the second possibility for this state. */
+      rate0 = tokens[next][0].rate;
+      rate1 = tokens[next][1].rate;
+      /* The alternative is only tried when the dequantized magnitude lies
+       * strictly between |coeff| and |coeff| + one quantizer step.
+       */
+      if((abs(x)*dequant_ptr[rc!=0]>abs(coeff_ptr[rc])) &&
+         (abs(x)*dequant_ptr[rc!=0]<abs(coeff_ptr[rc])+dequant_ptr[rc!=0]))
+        shortcut = 1;
+      else
+        shortcut = 0;
+      if (shortcut) {
+        /* sz is -1 for negative x, 0 otherwise; move x one step toward 0. */
+        sz = -(x < 0);
+        x -= 2*sz + 1;
+      }
+      /* Consider both possible successor states. */
+      if (!x) {
+        /* If we reduced this coefficient to zero, check to see if
+         *  we need to move the EOB back here.
+         */
+        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
+             DCT_EOB_TOKEN : ZERO_TOKEN;
+      }
+      else
+        t0=t1 = (vp8_dct_value_tokens_ptr + x)->Token;
+      if (next < 256) {
+        band = vp8_coef_bands_16x16[i + 1];
+        if (t0 != DCT_EOB_TOKEN) {
+            pt = vp8_prev_token_class[t0];
+            rate0 += mb->token_costs_16x16[type][band][pt]
+                [tokens[next][0].token];
+        }
+        if (t1!=DCT_EOB_TOKEN) {
+            pt = vp8_prev_token_class[t1];
+            rate1 += mb->token_costs_16x16[type][band][pt]
+                [tokens[next][1].token];
+        }
+      }
+      UPDATE_RD_COST();
+      /* And pick the best. */
+      best = rd_cost1 < rd_cost0;
+      base_bits = *(vp8_dct_value_cost_ptr + x);
+      if(shortcut) {
+        /* Adjust the reconstruction error by one signed quantizer step. */
+        dx -= (dequant_ptr[rc!=0] + sz) ^ sz;
+        d2 = dx*dx;
+      }
+      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
+      tokens[i][1].error = d2 + (best ? error1 : error0);
+      tokens[i][1].next = next;
+      tokens[i][1].token = best ? t1 : t0;
+      tokens[i][1].qc = x;
+      best_index[i][1] = best;
+      /* Finally, make this the new head of the trellis. */
+      next = i;
+    }
+    /* There's no choice to make for a zero coefficient, so we don't
+     *  add a new trellis node, but we do need to update the costs.
+     */
+    else {
+      band = vp8_coef_bands_16x16[i + 1];
+      t0 = tokens[next][0].token;
+      t1 = tokens[next][1].token;
+      /* Update the cost of each path if we're past the EOB token. */
+      if (t0 != DCT_EOB_TOKEN) {
+        tokens[next][0].rate += mb->token_costs_16x16[type][band][0][t0];
+        tokens[next][0].token = ZERO_TOKEN;
+      }
+      if (t1 != DCT_EOB_TOKEN) {
+        tokens[next][1].rate += mb->token_costs_16x16[type][band][0][t1];
+        tokens[next][1].token = ZERO_TOKEN;
+      }
+      /* Don't update next, because we didn't add a new node. */
+    }
+  }
+  /* Now pick the best path through the whole trellis. */
+  /* The loop above leaves i at -1, so this reads band entry 0. */
+  band = vp8_coef_bands_16x16[i + 1];
+  VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+  rate0 = tokens[next][0].rate;
+  rate1 = tokens[next][1].rate;
+  error0 = tokens[next][0].error;
+  error1 = tokens[next][1].error;
+  t0 = tokens[next][0].token;
+  t1 = tokens[next][1].token;
+  rate0 += mb->token_costs_16x16[type][band][pt][t0];
+  rate1 += mb->token_costs_16x16[type][band][pt][t1];
+  UPDATE_RD_COST();
+  best = rd_cost1 < rd_cost0;
+  final_eob = -1;
+  /* Walk the chosen chain forward, committing values and tracking the
+   * last position that still holds a non-zero coefficient.
+   */
+  for (i = next; i < eob; i = next) {
+    x = tokens[i][best].qc;
+    if (x)
+      final_eob = i;
+    rc = vp8_default_zig_zag1d_16x16[i];
+    qcoeff_ptr[rc] = x;
+    dqcoeff_ptr[rc] = (x * dequant_ptr[rc!=0]);
+    next = tokens[i][best].next;
+    best = best_index[i][best];
+  }
+  final_eob++;
+  d->eob = final_eob;
+  *a = *l = (d->eob != !type);
+}

+/* Runs optimize_b_16x16() on block 0 with PLANE_TYPE_Y_WITH_DC.
+ * The above/left entropy contexts are copied into locals first, so the
+ * macroblock's own context planes are not written by this function.
+ * Returns without doing anything when either context pointer is null.
+ */
+void vp8_optimize_mby_16x16(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) {
+  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+  ENTROPY_CONTEXT *above = (ENTROPY_CONTEXT *)&above_copy;
+  ENTROPY_CONTEXT *left = (ENTROPY_CONTEXT *)&left_copy;
+  if (!x->e_mbd.above_context || !x->e_mbd.left_context)
+    return;
+  vpx_memcpy(&above_copy, x->e_mbd.above_context, sizeof(above_copy));
+  vpx_memcpy(&left_copy, x->e_mbd.left_context, sizeof(left_copy));
+  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, above, left, rtcd);
+  /* Mirror the resulting context into the neighbouring entry. */
+  above[1] = above[0];
+  left[1] = left[0];
+}

+/* Trellis-optimizes a whole macroblock in 16x16 mode: block 0 through
+ * optimize_b_16x16() (PLANE_TYPE_Y_WITH_DC), then blocks 16 and 20 through
+ * optimize_b_8x8() (PLANE_TYPE_UV). Works on local copies of the above/left
+ * entropy context planes.
+ */
+void optimize_mb_16x16(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) {
+  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+  ENTROPY_CONTEXT *const above = (ENTROPY_CONTEXT *)&above_copy;
+  ENTROPY_CONTEXT *const left = (ENTROPY_CONTEXT *)&left_copy;
+  int blk;
+  vpx_memcpy(&above_copy, x->e_mbd.above_context, sizeof(above_copy));
+  vpx_memcpy(&left_copy, x->e_mbd.left_context, sizeof(left_copy));
+  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, above, left, rtcd);
+  above[1] = above[0];
+  left[1] = left[0];
+  for (blk = 16; blk < 24; blk += 4) {
+    /* Context slots for this block, looked up in the 8x8 mapping tables. */
+    ENTROPY_CONTEXT *const blk_above = above + vp8_block2above_8x8[blk];
+    ENTROPY_CONTEXT *const blk_left = left + vp8_block2left_8x8[blk];
+    optimize_b_8x8(x, blk, PLANE_TYPE_UV, blk_above, blk_left, rtcd);
+    blk_above[1] = blk_above[0];
+    blk_left[1] = blk_left[0];
+  }
+}

+#endif

 void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) {

   int tx_type = x->e_mbd.mode_info_context->mbmi.txfm_size;

   vp8_build_inter_predictors_mb(&x->e_mbd);

@@ -1023,11 +1290,21 @@

   vp8_subtract_mb(rtcd, x);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_transform_mb_16x16(x);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_transform_mb_8x8(x);

   else

     transform_mb(x);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_quantize_mb_16x16(x);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_quantize_mb_8x8(x);

   else

@@ -1034,6 +1311,11 @@

     vp8_quantize_mb(x);

   if (x->optimize) {

+#if CONFIG_TX16X16

+    if (tx_type == TX_16X16)

+      optimize_mb_16x16(x, rtcd);

+    else

+#endif

     if (tx_type == TX_8X8)

       optimize_mb_8x8(x, rtcd);

     else

@@ -1040,6 +1322,11 @@

       optimize_mb(x, rtcd);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

   else

@@ -1111,6 +1398,11 @@

   ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_transform_mby_16x16(x);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_transform_mby_8x8(x);

   else

@@ -1118,6 +1410,11 @@

   vp8_quantize_mby(x);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16)

+    vp8_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

+  else

+#endif

   if (tx_type == TX_8X8)

     vp8_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), &x->e_mbd);

   else

@@ -1126,3 +1423,4 @@

   RECON_INVOKE(&rtcd->common->recon, recon_mby)

   (IF_RTCD(&rtcd->common->recon), &x->e_mbd);

--- a/vp8/encoder/encodemb.h

+++ b/vp8/encoder/encodemb.h

@@ -121,6 +121,15 @@

 void vp8_optimize_mby_8x8(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);

 void vp8_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);

+#if CONFIG_TX16X16

+void vp8_transform_mb_16x16(MACROBLOCK *mb);

+void vp8_transform_mby_16x16(MACROBLOCK *x);

+void vp8_transform_mbuv_16x16(MACROBLOCK *x);

+void vp8_transform_intra_mby_16x16(MACROBLOCK *x);

+void vp8_build_dcblock_16x16(MACROBLOCK *b);

+void vp8_optimize_mby_16x16(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);

+#endif

 void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);

 #endif

--- a/vp8/encoder/generic/csystemdependent.c

+++ b/vp8/encoder/generic/csystemdependent.c

@@ -69,6 +69,9 @@

   cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;

   cpi->rtcd.fdct.short8x8                  = vp8_short_fdct8x8_c;

+#if CONFIG_TX16X16

+  cpi->rtcd.fdct.short16x16                = vp8_short_fdct16x16_c;

+#endif

   cpi->rtcd.fdct.haar_short2x2             = vp8_short_fhaar2x2_c;

   cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;

   cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;

--- a/vp8/encoder/onyx_if.c

+++ b/vp8/encoder/onyx_if.c

@@ -1161,10 +1161,16 @@

   if (cpi->sf.improved_dct) {

+#if CONFIG_TX16X16

+    cpi->mb.vp8_short_fdct16x16 = FDCT_INVOKE(&cpi->rtcd.fdct, short16x16);

+#endif

     cpi->mb.vp8_short_fdct8x8 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x8);

     cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);

     cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);

   } else {

+#if CONFIG_TX16X16

+    cpi->mb.vp8_short_fdct16x16 = FDCT_INVOKE(&cpi->rtcd.fdct, short16x16);

+#endif

     cpi->mb.vp8_short_fdct8x8 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x8);

     cpi->mb.vp8_short_fdct8x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);

     cpi->mb.vp8_short_fdct4x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);

@@ -1177,6 +1183,9 @@

   cpi->mb.quantize_b      = vp8_regular_quantize_b;

   cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;

   cpi->mb.quantize_b_8x8  = vp8_regular_quantize_b_8x8;

+#if CONFIG_TX16X16

+  cpi->mb.quantize_b_16x16= vp8_regular_quantize_b_16x16;

+#endif

   cpi->mb.quantize_b_2x2  = vp8_regular_quantize_b_2x2;

   vp8cx_init_quantizer(cpi);

@@ -3629,6 +3638,9 @@

   update_reference_frames(cm);

   vp8_copy(cpi->common.fc.coef_counts, cpi->coef_counts);

   vp8_copy(cpi->common.fc.coef_counts_8x8, cpi->coef_counts_8x8);

+#if CONFIG_TX16X16

+  vp8_copy(cpi->common.fc.coef_counts_16x16, cpi->coef_counts_16x16);

+#endif

   vp8_adapt_coef_probs(&cpi->common);

   if (cpi->common.frame_type != KEY_FRAME) {

     vp8_copy(cpi->common.fc.ymode_counts, cpi->ymode_count);

--- a/vp8/encoder/onyx_int.h

+++ b/vp8/encoder/onyx_int.h

@@ -91,9 +91,13 @@

   signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];

   vp8_prob coef_probs[BLOCK_TYPES]

-  [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];

+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];

   vp8_prob coef_probs_8x8[BLOCK_TYPES_8X8]

-  [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];

+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];

+#if CONFIG_TX16X16

+  vp8_prob coef_probs_16x16[BLOCK_TYPES_16X16]

+      [COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];

+#endif

   vp8_prob ymode_prob [VP8_YMODES - 1]; /* interframe intra mode probs */

   vp8_prob uv_mode_prob [VP8_YMODES][VP8_UV_MODES - 1];

@@ -388,6 +392,15 @@

   DECLARE_ALIGNED(64, short, zrun_zbin_boost_y2_8x8[QINDEX_RANGE][64]);

   DECLARE_ALIGNED(64, short, zrun_zbin_boost_uv_8x8[QINDEX_RANGE][64]);

+#if CONFIG_TX16X16

+  DECLARE_ALIGNED(16, short, Y1zbin_16x16[QINDEX_RANGE][256]);

+  DECLARE_ALIGNED(16, short, Y2zbin_16x16[QINDEX_RANGE][256]);

+  DECLARE_ALIGNED(16, short, UVzbin_16x16[QINDEX_RANGE][256]);

+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1_16x16[QINDEX_RANGE][256]);

+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2_16x16[QINDEX_RANGE][256]);

+  DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv_16x16[QINDEX_RANGE][256]);

+#endif

   MACROBLOCK mb;

   VP8_COMMON common;

   vp8_writer bc, bc2;

@@ -538,6 +551,11 @@

   unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */

   vp8_prob frame_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

   unsigned int frame_branch_ct_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];

+#if CONFIG_TX16X16

+  unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];  /* for this frame */

+  vp8_prob frame_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];

+  unsigned int frame_branch_ct_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES][2];

+#endif

   int gfu_boost;

   int last_boost;

@@ -596,6 +614,9 @@

   int skip_false_count[3];

   int t4x4_count;

   int t8x8_count;

+#if CONFIG_TX16X16

+  int t16x16_count;

+#endif

   unsigned char *segmentation_map;

--- a/vp8/encoder/quantize.c

+++ b/vp8/encoder/quantize.c

@@ -302,9 +302,8 @@

 void vp8_quantize_mbuv_8x8(MACROBLOCK *x) {

   int i;

-  for (i = 16; i < 24; i ++) {

+  for (i = 16; i < 24; i ++)

     x->e_mbd.block[i].eob = 0;

-  }

   for (i = 16; i < 24; i += 4)

     x->quantize_b_8x8(&x->block[i], &x->e_mbd.block[i]);

@@ -311,6 +310,85 @@

+#if CONFIG_TX16X16

+/* Clears the eob of blocks 0..15 and of block 24, then quantizes block 0
+ * through the quantize_b_16x16 function pointer.
+ */
+void vp8_quantize_mby_16x16(MACROBLOCK *x) {
+  BLOCKD *const dblocks = &x->e_mbd.block[0];
+  int n = 16;
+  while (n-- > 0)
+    dblocks[n].eob = 0;
+  dblocks[24].eob = 0;
+  x->quantize_b_16x16(&x->block[0], dblocks);
+}

+/* Quantizes a whole macroblock in 16x16 mode: resets the eob of all 25
+ * blocks, quantizes block 0 with quantize_b_16x16, then blocks 16 and 20
+ * with quantize_b_8x8.
+ */
+void vp8_quantize_mb_16x16(MACROBLOCK *x) {
+  BLOCK *const blocks = &x->block[0];
+  BLOCKD *const dblocks = &x->e_mbd.block[0];
+  int n;
+  for (n = 0; n <= 24; ++n)
+    dblocks[n].eob = 0;
+  x->quantize_b_16x16(&blocks[0], &dblocks[0]);
+  for (n = 16; n <= 20; n += 4)
+    x->quantize_b_8x8(&blocks[n], &dblocks[n]);
+}

+// U and V keep using the 8x8 quantizer in 16x16 mode: clear the eob of
+// blocks 16..23, then quantize blocks 16 and 20 with quantize_b_8x8.
+void vp8_quantize_mbuv_16x16(MACROBLOCK *x) {
+  BLOCKD *const dblocks = &x->e_mbd.block[0];
+  int n = 16;
+  while (n < 24) {
+    dblocks[n].eob = 0;
+    ++n;
+  }
+  n = 16;
+  while (n < 24) {
+    x->quantize_b_8x8(&x->block[n], &dblocks[n]);
+    n += 4;
+  }
+}

+/* Regular (zero-bin) quantizer for one 16x16 block.
+ *
+ * b  source block: coeff, zbin_16x16, round, quant, quant_shift,
+ *    zrun_zbin_boost_16x16, zbin_extra and eob_max_offset_16x16 are read
+ * d  destination block: qcoeff and dqcoeff (256 entries each) are zeroed
+ *    and then filled in; eob is set to one past the last scan position
+ *    that produced a non-zero quantized value
+ *
+ * Coefficients are visited in vp8_default_zig_zag1d_16x16 order. Per-
+ * coefficient tables are indexed with 0 for raster position 0 and 1 for
+ * every other position.
+ */
+void vp8_regular_quantize_b_16x16(BLOCK *b, BLOCKD *d) {
+  short *boost = b->zrun_zbin_boost_16x16;
+  short *const coeffs = b->coeff;
+  short *const zbins = b->zbin_16x16;
+  short *const rounds = b->round;
+  short *const quants = b->quant;
+  unsigned char *const shifts = b->quant_shift;
+  short *const qcoeffs = d->qcoeff;
+  short *const dqcoeffs = d->dqcoeff;
+  short *const dequants = d->dequant;
+  const short zbin_extra = b->zbin_extra;
+  int last_nonzero = -1;
+  int pos;
+  vpx_memset(qcoeffs, 0, 256*sizeof(short));
+  vpx_memset(dqcoeffs, 0, 256*sizeof(short));
+  for (pos = 0; pos < b->eob_max_offset_16x16; ++pos) {
+    const int rc = vp8_default_zig_zag1d_16x16[pos];
+    const int ac = (rc != 0);
+    const int z = coeffs[rc];
+    const int sign = (z >> 31);              // sign mask of z
+    int mag = (z ^ sign) - sign;             // magnitude of z
+    const int zbin = (zbins[ac] + *boost + zbin_extra);
+    ++boost;                                 // boost advances with each step
+    if (mag < zbin)
+      continue;                              // inside the zero bin
+    mag += rounds[ac];
+    {
+      const int level = ((int)(((int)(mag * quants[ac]) >> 16) + mag))
+          >> shifts[ac];
+      const int signed_level = (level ^ sign) - sign;
+      qcoeffs[rc] = signed_level;
+      dqcoeffs[rc] = signed_level * dequants[ac];
+      if (level) {
+        last_nonzero = pos;
+        boost = b->zrun_zbin_boost_16x16;    // restart the boost table
+      }
+    }
+  }
+  d->eob = last_nonzero + 1;
+}

+#endif

 /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of

  * these two C functions if corresponding optimized routine is not available.

  * NEON optimized version implements currently the fast quantization for pair

@@ -337,20 +415,39 @@

   int i;

   int quant_val;

   int Q;

-  int zbin_boost[16] = { 0,  0,  8, 10, 12, 14, 16, 20,

-                         24, 28, 32, 36, 40, 44, 44, 44

-                       };

+  static const int zbin_boost[16] = {  0,  0,  8, 10, 12, 14, 16, 20,

+                                      24, 28, 32, 36, 40, 44, 44, 44

+                                    };

-  int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,

-                              14, 16, 18, 20, 22, 24, 26, 28,

-                              30, 32, 34, 36, 38, 40, 42, 44,

-                              46, 48, 48, 48, 48, 48, 48, 48,

-                              48, 48, 48, 48, 48, 48, 48, 48,

-                              48, 48, 48, 48, 48, 48, 48, 48,

-                              48, 48, 48, 48, 48, 48, 48, 48,

-                              48, 48, 48, 48, 48, 48, 48, 48

-                           };

+  static const int zbin_boost_8x8[64] = {  0,  0,  0,  8,  8,  8, 10, 12,

+                                          14, 16, 18, 20, 22, 24, 26, 28,

+                                          30, 32, 34, 36, 38, 40, 42, 44,

+                                          46, 48, 48, 48, 48, 48, 48, 48,

+                                          48, 48, 48, 48, 48, 48, 48, 48,

+                                          48, 48, 48, 48, 48, 48, 48, 48,

+                                          48, 48, 48, 48, 48, 48, 48, 48,

+                                          48, 48, 48, 48, 48, 48, 48, 48

+                                        };

+#if CONFIG_TX16X16

+  static const int zbin_boost_16x16[256] = {

+     0,  0,  0,  8,  8,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28,

+    30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,

+  };

+#endif

   int qrounding_factor = 48;

@@ -372,33 +469,52 @@

                  cpi->Y1quant_shift[Q] + 0, quant_val);

     cpi->Y1zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

     cpi->Y1zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

+#if CONFIG_TX16X16

+    cpi->Y1zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

+#endif

     cpi->Y1round[Q][0] = (qrounding_factor * quant_val) >> 7;

     cpi->common.Y1dequant[Q][0] = quant_val;

     cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;

     cpi->zrun_zbin_boost_y1_8x8[Q][0] =

       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;

+#if CONFIG_TX16X16

+    cpi->zrun_zbin_boost_y1_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;

+#endif

     quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);

     invert_quant(cpi->Y2quant[Q] + 0,

                  cpi->Y2quant_shift[Q] + 0, quant_val);

     cpi->Y2zbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

     cpi->Y2zbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

+#if CONFIG_TX16X16

+    cpi->Y2zbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

+#endif

     cpi->Y2round[Q][0] = (qrounding_factor * quant_val) >> 7;

     cpi->common.Y2dequant[Q][0] = quant_val;

     cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;

     cpi->zrun_zbin_boost_y2_8x8[Q][0] =

       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;

+#if CONFIG_TX16X16

+    cpi->zrun_zbin_boost_y2_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;

+#endif

     quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);

     invert_quant(cpi->UVquant[Q] + 0,

                  cpi->UVquant_shift[Q] + 0, quant_val);

-    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;;

-    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;;

+    cpi->UVzbin[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

+    cpi->UVzbin_8x8[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

+#if CONFIG_TX16X16

+    cpi->UVzbin_16x16[Q][0] = ((qzbin_factor * quant_val) + 64) >> 7;

+#endif

     cpi->UVround[Q][0] = (qrounding_factor * quant_val) >> 7;

     cpi->common.UVdequant[Q][0] = quant_val;

     cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;

     cpi->zrun_zbin_boost_uv_8x8[Q][0] =

       ((quant_val * zbin_boost_8x8[0]) + 64) >> 7;

+#if CONFIG_TX16X16

+    cpi->zrun_zbin_boost_uv_16x16[Q][0] = ((quant_val * zbin_boost_16x16[0]) + 64) >> 7;

+#endif

     // all the 4x4 ac values =;

     for (i = 1; i < 16; i++) {

@@ -453,6 +569,25 @@

       cpi->zrun_zbin_boost_uv_8x8[Q][i] =

         ((quant_val * zbin_boost_8x8[i]) + 64) >> 7;

+#if CONFIG_TX16X16

+    // 16x16 structures. Same comment above applies.

+    for (i = 1; i < 256; i++) {

+      int rc = vp8_default_zig_zag1d_16x16[i];

+      quant_val = vp8_ac_yquant(Q);

+      cpi->Y1zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

+      cpi->zrun_zbin_boost_y1_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;

+      quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);

+      cpi->Y2zbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

+      cpi->zrun_zbin_boost_y2_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;

+      quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);

+      cpi->UVzbin_16x16[Q][rc] = ((qzbin_factor * quant_val) + 64) >> 7;

+      cpi->zrun_zbin_boost_uv_16x16[Q][i] = ((quant_val * zbin_boost_16x16[i]) + 64) >> 7;

+    }

+#endif

@@ -491,10 +626,16 @@

     x->block[i].quant_shift = cpi->Y1quant_shift[QIndex];

     x->block[i].zbin = cpi->Y1zbin[QIndex];

     x->block[i].zbin_8x8 = cpi->Y1zbin_8x8[QIndex];

+#if CONFIG_TX16X16

+    x->block[i].zbin_16x16 = cpi->Y1zbin_16x16[QIndex];

+#endif

     x->block[i].round = cpi->Y1round[QIndex];

     x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];

     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];

     x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y1_8x8[QIndex];

+#if CONFIG_TX16X16

+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y1_16x16[QIndex];

+#endif

     x->block[i].zbin_extra = (short)zbin_extra;

     // Segment max eob offset feature.

@@ -503,9 +644,16 @@

         get_segdata(xd, segment_id, SEG_LVL_EOB);

       x->block[i].eob_max_offset_8x8 =

         get_segdata(xd, segment_id, SEG_LVL_EOB);

+#if CONFIG_TX16X16

+      x->block[i].eob_max_offset_16x16 =

+        get_segdata(xd, segment_id, SEG_LVL_EOB);

+#endif

     } else {

       x->block[i].eob_max_offset = 16;

       x->block[i].eob_max_offset_8x8 = 64;

+#if CONFIG_TX16X16

+      x->block[i].eob_max_offset_16x16 = 256;

+#endif

@@ -520,10 +668,16 @@

     x->block[i].quant_shift = cpi->UVquant_shift[QIndex];

     x->block[i].zbin = cpi->UVzbin[QIndex];

     x->block[i].zbin_8x8 = cpi->UVzbin_8x8[QIndex];

+#if CONFIG_TX16X16

+    x->block[i].zbin_16x16 = cpi->UVzbin_16x16[QIndex];

+#endif

     x->block[i].round = cpi->UVround[QIndex];

     x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];

     x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];

     x->block[i].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_uv_8x8[QIndex];

+#if CONFIG_TX16X16

+    x->block[i].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_uv_16x16[QIndex];

+#endif

     x->block[i].zbin_extra = (short)zbin_extra;

@@ -549,10 +703,16 @@

   x->block[24].quant_shift = cpi->Y2quant_shift[QIndex];

   x->block[24].zbin = cpi->Y2zbin[QIndex];

   x->block[24].zbin_8x8 = cpi->Y2zbin_8x8[QIndex];

+#if CONFIG_TX16X16

+  x->block[24].zbin_16x16 = cpi->Y2zbin_16x16[QIndex];

+#endif

   x->block[24].round = cpi->Y2round[QIndex];

   x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];

   x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];

   x->block[24].zrun_zbin_boost_8x8 = cpi->zrun_zbin_boost_y2_8x8[QIndex];

+#if CONFIG_TX16X16

+  x->block[24].zrun_zbin_boost_16x16 = cpi->zrun_zbin_boost_y2_16x16[QIndex];

+#endif

   x->block[24].zbin_extra = (short)zbin_extra;

   // TBD perhaps not use for Y2

--- a/vp8/encoder/quantize.h

+++ b/vp8/encoder/quantize.h

@@ -46,6 +46,13 @@

 #endif

 extern prototype_quantize_block(vp8_quantize_quantb_8x8);

+#if CONFIG_TX16X16

+#ifndef vp8_quantize_quantb_16x16

+#define vp8_quantize_quantb_16x16 vp8_regular_quantize_b_16x16

+#endif

+extern prototype_quantize_block(vp8_quantize_quantb_16x16);

+#endif

 #ifndef vp8_quantize_quantb_2x2

 #define vp8_quantize_quantb_2x2 vp8_regular_quantize_b_2x2

 #endif

@@ -69,6 +76,13 @@

 extern prototype_quantize_mb(vp8_quantize_mby_8x8);

 extern prototype_quantize_mb(vp8_quantize_mbuv_8x8);

+#if CONFIG_TX16X16

+void vp8_quantize_mb_16x16(MACROBLOCK *x);

+extern prototype_quantize_block(vp8_quantize_quantb_16x16);

+extern prototype_quantize_mb(vp8_quantize_mby_16x16);

+extern prototype_quantize_mb(vp8_quantize_mbuv_16x16);

+#endif

 struct VP8_COMP;

 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);

--- a/vp8/encoder/ratectrl.c

+++ b/vp8/encoder/ratectrl.c

@@ -177,6 +177,9 @@

 #if CONFIG_SWITCHABLE_INTERP

   vp8_copy(cc->switchable_interp_prob, cm->fc.switchable_interp_prob);

 #endif

+#if CONFIG_TX16X16

+  vp8_copy(cc->coef_probs_16x16, cm->fc.coef_probs_16x16);

+#endif

 void vp8_restore_coding_context(VP8_COMP *cpi) {

@@ -232,6 +235,9 @@

   vp8_copy(cm->fc.coef_probs_8x8, cc->coef_probs_8x8);

 #if CONFIG_SWITCHABLE_INTERP

   vp8_copy(cm->fc.switchable_interp_prob, cc->switchable_interp_prob);

+#endif

+#if CONFIG_TX16X16

+  vp8_copy(cm->fc.coef_probs_16x16, cc->coef_probs_16x16);

 #endif

--- a/vp8/encoder/rdopt.c

+++ b/vp8/encoder/rdopt.c

@@ -366,6 +366,13 @@

     (const vp8_prob( *)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_8x8,

     BLOCK_TYPES_8X8);

+#if CONFIG_TX16X16

+  fill_token_costs(

+    cpi->mb.token_costs_16x16,

+    (const vp8_prob(*)[8][PREV_COEF_CONTEXTS][11]) cpi->common.fc.coef_probs_16x16,

+    BLOCK_TYPES_16X16);

+#endif

   /*rough estimate for costing*/

   cpi->common.kf_ymode_probs_index = cpi->common.base_qindex >> 4;

   vp8_init_mode_costs(cpi);

@@ -809,6 +816,72 @@

   *Rate = vp8_rdcost_mby_8x8(mb);

+#if CONFIG_TX16X16

+/* Token cost of one 16x16 transform block.
+ *
+ * Walks the quantized coefficients of `b` in 16x16 zig-zag order up to
+ * b->eob, summing the per-token cost from mb->token_costs_16x16 and the
+ * per-value cost from vp8_dct_value_cost_ptr. Both *a and *l are
+ * overwritten with 1 if the walk advanced past its starting position and
+ * 0 otherwise. Returns the accumulated cost.
+ */
+static int cost_coeffs_16x16(MACROBLOCK *mb, BLOCKD *b, int type,
+                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+  const int first = !type;  /* position 1 when type is 0, otherwise 0 */
+  const int last = b->eob;
+  const short *coeffs = b->qcoeff;
+  int pos, ctx, total = 0;
+  VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+  for (pos = first; pos < last; pos++) {
+    const int value = coeffs[vp8_default_zig_zag1d_16x16[pos]];
+    const int token = vp8_dct_value_tokens_ptr[value].Token;
+    total += mb->token_costs_16x16[type][vp8_coef_bands_16x16[pos]][ctx][token];
+    total += vp8_dct_value_cost_ptr[value];
+    /* The token just costed selects the context for the next position. */
+    ctx = vp8_prev_token_class[token];
+  }
+  /* Add the end-of-block token unless all 256 positions were consumed. */
+  if (pos < 256)
+    total += mb->token_costs_16x16[type][vp8_coef_bands_16x16[pos]]
+             [ctx][DCT_EOB_TOKEN];
+  *a = *l = (pos != first);
+  return total;
+}

+/* Rate of the luma plane when coded as a single 16x16 transform block.
+ *
+ * The above/left entropy contexts are copied into locals first, so the
+ * updates made by cost_coeffs_16x16() land in the copies and not in the
+ * macroblock's own contexts.
+ */
+static int vp8_rdcost_mby_16x16(MACROBLOCK *mb) {
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
+  vpx_memcpy(&above_copy, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  vpx_memcpy(&left_copy, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+  return cost_coeffs_16x16(mb, xd->block, PLANE_TYPE_Y_WITH_DC,
+                           (ENTROPY_CONTEXT *)&above_copy,
+                           (ENTROPY_CONTEXT *)&left_copy);
+}

+/* Luma rate and distortion for a macroblock using the 16x16 transform.
+ *
+ * Runs the submby hook, the 16x16 forward transform and the 16x16
+ * quantizer, then stores the mberr result shifted right by two in
+ * *Distortion and the 16x16 luma token cost in *Rate.
+ */
+static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
+                                  const VP8_ENCODER_RTCD *rtcd) {
+  int err;
+  ENCODEMB_INVOKE(&rtcd->encodemb, submby)(mb->src_diff,
+                                           *(mb->block[0].base_src),
+                                           mb->e_mbd.predictor,
+                                           mb->block[0].src_stride);
+  vp8_transform_mby_16x16(mb);
+  vp8_quantize_mby_16x16(mb);
+  err = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(mb, 0);
+  *Distortion = err >> 2;
+  *Rate = vp8_rdcost_mby_16x16(mb);
+}

+#endif

 static void copy_predictor(unsigned char *dst, const unsigned char *predictor) {

   const unsigned int *p = (const unsigned int *)predictor;

   unsigned int *d = (unsigned int *)dst;

@@ -1121,7 +1194,12 @@

 #endif

-      macro_block_yrd_8x8(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));

+#if CONFIG_TX16X16

+      if (mode <= TM_PRED)

+        macro_block_yrd_16x16(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));

+      else

+#endif

+        macro_block_yrd_8x8(x, &ratey, &distortion, IF_RTCD(&cpi->rtcd));

       // FIXME add compoundmode cost

       // FIXME add rate for mode2

       rate = ratey + x->mbmode_cost[x->e_mbd.frame_type]

@@ -3081,6 +3159,10 @@

             vp8_cost_bit(get_pred_prob(cm, xd, PRED_COMP), 0);

         break;

+        case DC_PRED:

+        case V_PRED:

+        case H_PRED:

+        case TM_PRED:

         case D45_PRED:

         case D135_PRED:

         case D117_PRED:

@@ -3087,13 +3169,26 @@

         case D153_PRED:

         case D27_PRED:

         case D63_PRED:

-        case DC_PRED:

-        case V_PRED:

-        case H_PRED:

-        case TM_PRED:

+#if CONFIG_TX16X16

+          // FIXME: breaks lossless since 4x4 isn't allowed

           x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

           // FIXME compound intra prediction

           RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

+              (&x->e_mbd);

+          macro_block_yrd_16x16(x, &rate_y, &distortion,

+                                IF_RTCD(&cpi->rtcd));

+          rate2 += rate_y;

+          distortion2 += distortion;

+          rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];

+          rate2 += uv_intra_rate_8x8;

+          rate_uv = uv_intra_rate_tokenonly_8x8;

+          distortion2 += uv_intra_distortion_8x8;

+          distortion_uv = uv_intra_distortion_8x8;

+          break;

+#else

+          x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;

+          // FIXME compound intra prediction

+          RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)

           (&x->e_mbd);

           if (cpi->common.txfm_mode == ALLOW_8X8)

             macro_block_yrd_8x8(x, &rate_y, &distortion,

@@ -3116,6 +3211,7 @@

             distortion_uv = uv_intra_distortion;

           break;

+#endif

         case NEWMV: {

           int thissme;

@@ -3269,7 +3365,6 @@

         case ZEROMV:

           // Trap vectors that reach beyond the UMV borders

           // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point

           // because of the lack of break statements in the previous two cases.

@@ -3348,12 +3443,23 @@

           rate2 += vp8_cost_mv_ref(cpi, this_mode, mdcounts);

           // Y cost and distortion

-          if (cpi->common.txfm_mode == ALLOW_8X8)

-            macro_block_yrd_8x8(x, &rate_y, &distortion,

-                                IF_RTCD(&cpi->rtcd));

-          else

-            macro_block_yrd(x, &rate_y, &distortion,

-                            IF_RTCD(&cpi->rtcd));

+#if CONFIG_TX16X16

+          if (this_mode == ZEROMV ||

+              this_mode == NEARESTMV ||

+              this_mode == NEARMV ||

+              this_mode == NEWMV)

+            macro_block_yrd_16x16(x, &rate_y, &distortion, IF_RTCD(&cpi->rtcd));

+          else {

+#endif

+            if (cpi->common.txfm_mode == ALLOW_8X8)

+              macro_block_yrd_8x8(x, &rate_y, &distortion,

+                                  IF_RTCD(&cpi->rtcd));

+            else

+              macro_block_yrd(x, &rate_y, &distortion,

+                              IF_RTCD(&cpi->rtcd));

+#if CONFIG_TX16X16

+          }

+#endif

           rate2 += rate_y;

           distortion2 += distortion;

@@ -3361,7 +3467,14 @@

           // UV cost and distortion

           vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);

-          if (cpi->common.txfm_mode == ALLOW_8X8)

+          if (cpi->common.txfm_mode == ALLOW_8X8

+#if CONFIG_TX16X16

+              || this_mode == ZEROMV ||

+              this_mode == NEARESTMV ||

+              this_mode == NEARMV ||

+              this_mode == NEWMV

+#endif

+              )

             rd_inter16x16_uv_8x8(cpi, x, &rate_uv,

                                  &distortion_uv,

                                  cpi->common.full_pixel);

@@ -3487,9 +3600,21 @@

                                                &x->e_mbd.predictor[320], 16, 8);

         /* Y cost and distortion */

-        if (cpi->common.txfm_mode == ALLOW_8X8)

+        if (cpi->common.txfm_mode == ALLOW_8X8

+#if CONFIG_TX16X16

+            || this_mode == ZEROMV ||

+            this_mode == NEARESTMV ||

+            this_mode == NEARMV ||

+            this_mode == NEWMV

+#endif

+            )

+#if CONFIG_TX16X16

+          macro_block_yrd_16x16(x, &rate_y, &distortion,

+                                IF_RTCD(&cpi->rtcd));

+#else

           macro_block_yrd_8x8(x, &rate_y, &distortion,

                               IF_RTCD(&cpi->rtcd));

+#endif

         else

           macro_block_yrd(x, &rate_y, &distortion,

                           IF_RTCD(&cpi->rtcd));

@@ -3498,7 +3623,14 @@

         distortion2 += distortion;

         /* UV cost and distortion */

-        if (cpi->common.txfm_mode == ALLOW_8X8)

+        if (cpi->common.txfm_mode == ALLOW_8X8

+#if CONFIG_TX16X16

+            || this_mode == ZEROMV ||

+            this_mode == NEARESTMV ||

+            this_mode == NEARMV ||

+            this_mode == NEWMV

+#endif

+            )

           rd_inter16x16_uv_8x8(cpi, x, &rate_uv,

                                &distortion_uv,

                                cpi->common.full_pixel);

@@ -3541,6 +3673,15 @@

                       && this_mode != B_PRED

                       && this_mode != I8X8_PRED);

+#if CONFIG_TX16X16
+        /* Modes whose luma is costed with the 16x16 transform above must use
+         * the matching 16x16 skip test. The guard was previously spelled
+         * CONFIGURE_TX16X16, a macro that is never defined, so this branch
+         * was compiled out.
+         * NOTE(review): mb_is_skippable_16x16() is defined in
+         * vp8/encoder/tokenize.c; confirm a prototype is visible here. */
+        if (this_mode <= TM_PRED ||
+            this_mode == NEWMV ||
+            this_mode == ZEROMV ||
+            this_mode == NEARESTMV ||
+            this_mode == NEARMV)
+          mb_skippable = mb_is_skippable_16x16(&x->e_mbd);
+        else
+#endif

         if ((cpi->common.txfm_mode == ALLOW_8X8) && has_y2) {

           if (x->e_mbd.mode_info_context->mbmi.ref_frame != INTRA_FRAME)

             mb_skippable = mb_is_skippable_8x8(&x->e_mbd);

@@ -4002,10 +4143,25 @@

     /* test code: set transform size based on mode selection */

+#if CONFIG_TX16X16

+    if (xd->mode_info_context->mbmi.mode <= TM_PRED ||

+        xd->mode_info_context->mbmi.mode == NEWMV ||

+        xd->mode_info_context->mbmi.mode == ZEROMV ||

+        xd->mode_info_context->mbmi.mode == NEARMV ||

+        xd->mode_info_context->mbmi.mode == NEARESTMV) {

+      xd->mode_info_context->mbmi.txfm_size = TX_16X16;

+      cpi->t16x16_count++;

+    }

+    else if (cpi->common.txfm_mode == ALLOW_8X8

+        && xd->mode_info_context->mbmi.mode != I8X8_PRED

+        && xd->mode_info_context->mbmi.mode != B_PRED

+        && xd->mode_info_context->mbmi.mode != SPLITMV) {

+#else

     if (cpi->common.txfm_mode == ALLOW_8X8

         && xd->mode_info_context->mbmi.mode != I8X8_PRED

         && xd->mode_info_context->mbmi.mode != B_PRED

         && xd->mode_info_context->mbmi.mode != SPLITMV) {

+#endif

       xd->mode_info_context->mbmi.txfm_size = TX_8X8;

       cpi->t8x8_count++;

     } else {

--- a/vp8/encoder/tokenize.c

+++ b/vp8/encoder/tokenize.c

@@ -26,17 +26,23 @@

 #ifdef ENTROPY_STATS

 INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

 INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

-extern unsigned int tree_update_hist [BLOCK_TYPES]

-[COEF_BANDS]

-[PREV_COEF_CONTEXTS]

-[ENTROPY_NODES][2];

-extern unsigned int tree_update_hist_8x8 [BLOCK_TYPES_8X8]

-[COEF_BANDS]

-[PREV_COEF_CONTEXTS]

-[ENTROPY_NODES] [2];

+#if CONFIG_TX16X16

+INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

 #endif

+extern unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS]

+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES][2];

+extern unsigned int tree_update_hist_8x8[BLOCK_TYPES_8X8][COEF_BANDS]

+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];

+#if CONFIG_TX16X16

+extern unsigned int tree_update_hist_16x16[BLOCK_TYPES_16X16][COEF_BANDS]

+                    [PREV_COEF_CONTEXTS][ENTROPY_NODES] [2];

+#endif

+#endif

 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);

 void vp8_stuff_mb_8x8(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);

+#if CONFIG_TX16X16

+void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);

+#endif

 void vp8_fix_contexts(MACROBLOCKD *x);

 static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];

@@ -103,6 +109,54 @@

   vp8_dct_value_cost_ptr   = dct_value_cost + DCT_MAX_VALUE;

+#if CONFIG_TX16X16

+/* Emit the tokens of one 16x16 transform block.
+ *
+ * Writes one TOKENEXTRA per coded coefficient (16x16 zig-zag order) starting
+ * at *tp, followed by an end-of-block token when the walk stops at b->eob,
+ * and bumps cpi->coef_counts_16x16 for every token written. The walk is
+ * capped at 256 positions, or at the SEG_LVL_EOB segment value when that
+ * feature is active. On return *tp points past the last token written and
+ * both *a and *l hold (final position != !type). `frametype` is unused.
+ */
+static void tokenize1st_order_b_16x16(MACROBLOCKD *xd, const BLOCKD *const b, TOKENEXTRA **tp,
+                                      const int type, const FRAME_TYPE frametype, ENTROPY_CONTEXT *a,
+                                      ENTROPY_CONTEXT *l, VP8_COMP *cpi) {
+  const short *const coeffs = b->qcoeff;
+  const int last = b->eob;     /* one beyond last nonzero coeff */
+  const int seg_id = xd->mode_info_context->mbmi.segment_id;
+  TOKENEXTRA *out = *tp;       /* next token slot to fill */
+  int pos = 0;                 /* scan position, always starts at 0 */
+  int ctx;                     /* near block/prev token context index */
+  int limit = 256;
+  if (segfeature_active(xd, seg_id, SEG_LVL_EOB))
+    limit = get_segdata(xd, seg_id, SEG_LVL_EOB);
+  VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+  for (;;) {
+    const int band = vp8_coef_bands_16x16[pos];
+    int token = DCT_EOB_TOKEN;
+    if (pos < last) {
+      const int value = coeffs[vp8_default_zig_zag1d_16x16[pos]];
+      assert(-DCT_MAX_VALUE <= value  &&  value < (DCT_MAX_VALUE));
+      out->Extra = vp8_dct_value_tokens_ptr[value].Extra;
+      token      = vp8_dct_value_tokens_ptr[value].Token;
+    }
+    out->Token = token;
+    out->context_tree = cpi->common.fc.coef_probs_16x16[type][band][ctx];
+    out->skip_eob_node = ctx == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+    ++cpi->coef_counts_16x16[type][band][ctx][token];
+    ctx = vp8_prev_token_class[token];
+    ++out;
+    /* Stop once the end-of-block token has been written ... */
+    if (pos >= last)
+      break;
+    /* ... or when the next position reaches the coefficient limit. */
+    if (++pos >= limit)
+      break;
+  }
+  *tp = out;
+  *a = *l = (pos != !type);  /* 0 <-> all coeff data is zero */
+}

+#endif

 static void tokenize2nd_order_b_8x8

   MACROBLOCKD *xd,

@@ -170,12 +224,8 @@

-static void tokenize2nd_order_b

-(

-  MACROBLOCKD *xd,

-  TOKENEXTRA **tp,

-  VP8_COMP *cpi

-) {

+static void tokenize2nd_order_b(MACROBLOCKD *xd, TOKENEXTRA **tp,

+                                VP8_COMP *cpi) {

   int pt;             /* near block/prev token context index */

   int c;              /* start at DC */

   TOKENEXTRA *t = *tp;/* store tokens starting here */

@@ -188,9 +238,8 @@

   int seg_eob = 16;

   int segment_id = xd->mode_info_context->mbmi.segment_id;

-  if (segfeature_active(xd, segment_id, SEG_LVL_EOB)) {

+  if (segfeature_active(xd, segment_id, SEG_LVL_EOB))

     seg_eob = get_segdata(xd, segment_id, SEG_LVL_EOB);

-  }

   b = xd->block + 24;

   qcoeff_ptr = b->qcoeff;

@@ -542,14 +591,10 @@

   unsigned int block;

   const BLOCKD *b;

   int pt;             /* near block/prev token context index */

-  int c;

-  int token;

+  int band, rc, v, c, token;

   TOKENEXTRA *t = *tp;/* store tokens starting here */

   const short *qcoeff_ptr;

-  ENTROPY_CONTEXT *a;

-  ENTROPY_CONTEXT *l;

-  int band, rc, v;

-  int tmp1, tmp2;

+  ENTROPY_CONTEXT *a, *l;

   int seg_eob = 16;

   int segment_id = xd->mode_info_context->mbmi.segment_id;

@@ -561,11 +606,9 @@

   b = xd->block;

   /* Luma */

   for (block = 0; block < 16; block++, b++) {

-    tmp1 = vp8_block2above[block];

-    tmp2 = vp8_block2left[block];

     qcoeff_ptr = b->qcoeff;

-    a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;

-    l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;

+    a = (ENTROPY_CONTEXT *)xd->above_context + vp8_block2above[block];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp8_block2left[block];

     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

     c = type ? 0 : 1;

@@ -609,11 +652,9 @@

   /* Chroma */

   for (block = 16; block < 24; block++, b++) {

-    tmp1 = vp8_block2above[block];

-    tmp2 = vp8_block2left[block];

     qcoeff_ptr = b->qcoeff;

-    a = (ENTROPY_CONTEXT *)xd->above_context + tmp1;

-    l = (ENTROPY_CONTEXT *)xd->left_context + tmp2;

+    a = (ENTROPY_CONTEXT *)xd->above_context + vp8_block2above[block];

+    l = (ENTROPY_CONTEXT *)xd->left_context + vp8_block2left[block];

     VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);

@@ -701,7 +742,21 @@

   return (mby_is_skippable_8x8(x) & mbuv_is_skippable_8x8(x));

+#if CONFIG_TX16X16

+/* Returns 1 when the 16x16 luma block (block 0) has no coded coefficients
+ * (eob == 0), otherwise 0. */
+int mby_is_skippable_16x16(MACROBLOCKD *x) {
+  return !x->block[0].eob;
+}

+/* A 16x16-transform macroblock is skippable when both the 16x16 luma test
+ * and the 8x8 chroma test report skippable. */
+int mb_is_skippable_16x16(MACROBLOCKD *x) {
+  const int luma_skip = mby_is_skippable_16x16(x);
+  const int chroma_skip = mbuv_is_skippable_8x8(x);
+  return luma_skip & chroma_skip;
+}

+#endif

 void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {

   int plane_type;

   int has_y2_block;

@@ -730,16 +785,32 @@

   has_y2_block = (x->mode_info_context->mbmi.mode != B_PRED

                   && x->mode_info_context->mbmi.mode != I8X8_PRED

                   && x->mode_info_context->mbmi.mode != SPLITMV);

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16) has_y2_block = 0; // Because of inter frames

+#endif

-  x->mode_info_context->mbmi.mb_skip_coeff =

-    ((tx_type == TX_8X8) ?

-     mb_is_skippable_8x8(x) :

-     mb_is_skippable(x, has_y2_block));

+  switch (tx_type) {

+#if CONFIG_TX16X16

+    case TX_16X16:

+      x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_16x16(x);

+      break;

+#endif

+    case TX_8X8:

+      x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable_8x8(x);

+      break;

+    default:

+      x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x, has_y2_block);

+      break;

+  }

   if (x->mode_info_context->mbmi.mb_skip_coeff) {

     cpi->skip_true_count[mb_skip_context] += skip_inc;

     if (!cpi->common.mb_no_coeff_skip) {

+#if CONFIG_TX16X16

+      if (tx_type == TX_16X16)

+        vp8_stuff_mb_16x16(cpi, x, t);

+      else

+#endif

       if (tx_type == TX_8X8)

         vp8_stuff_mb_8x8(cpi, x, t);

       else

@@ -766,9 +837,28 @@

       tokenize2nd_order_b(x, t, cpi);

     plane_type = 0;

+#if CONFIG_TX16X16

+  if (tx_type == TX_16X16) {

+    ENTROPY_CONTEXT * A = (ENTROPY_CONTEXT *)x->above_context;

+    ENTROPY_CONTEXT * L = (ENTROPY_CONTEXT *)x->left_context;

+    tokenize1st_order_b_16x16(x, x->block, t, 3, x->frame_type, A, L, cpi);

+    for (b = 1; b < 16; b++) {

+      *(A + vp8_block2above[b]) = *(A);

+      *(L + vp8_block2left[b] ) = *(L);

+    }

+    for (b = 16; b < 24; b += 4) {

+      tokenize1st_order_b_8x8(x, x->block + b, t, 2, x->frame_type,

+          A + vp8_block2above_8x8[b], L + vp8_block2left_8x8[b], cpi);

+      *(A + vp8_block2above_8x8[b]+1) = *(A + vp8_block2above_8x8[b]);

+      *(L + vp8_block2left_8x8[b]+1 ) = *(L + vp8_block2left_8x8[b]);

+    }

+    vpx_memset(&A[8], 0, sizeof(A[8]));

+    vpx_memset(&L[8], 0, sizeof(L[8]));

+  }

+  else

+#endif

   if (tx_type == TX_8X8) {

     ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;

     ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;

@@ -827,15 +917,20 @@

 #ifdef ENTROPY_STATS

 void init_context_counters(void) {

   FILE *f = fopen("context.bin", "rb");

   if (!f) {

     vpx_memset(context_counters, 0, sizeof(context_counters));

     vpx_memset(context_counters_8x8, 0, sizeof(context_counters_8x8));

+#if CONFIG_TX16X16

+    vpx_memset(context_counters_16x16, 0, sizeof(context_counters_16x16));

+#endif

   } else {

     fread(context_counters, sizeof(context_counters), 1, f);

     fread(context_counters_8x8, sizeof(context_counters_8x8), 1, f);

+#if CONFIG_TX16X16

+    fread(context_counters_16x16, sizeof(context_counters_16x16), 1, f);

+#endif

     fclose(f);

@@ -843,15 +938,20 @@

   if (!f) {

     vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));

     vpx_memset(tree_update_hist_8x8, 0, sizeof(tree_update_hist_8x8));

+#if CONFIG_TX16X16

+    vpx_memset(tree_update_hist_16x16, 0, sizeof(tree_update_hist_16x16));

+#endif

   } else {

     fread(tree_update_hist, sizeof(tree_update_hist), 1, f);

     fread(tree_update_hist_8x8, sizeof(tree_update_hist_8x8), 1, f);

+#if CONFIG_TX16X16

+    fread(tree_update_hist_16x16, sizeof(tree_update_hist_16x16), 1, f);

+#endif

     fclose(f);

 void print_context_counters() {

   int type, band, pt, t;

   FILE *f = fopen("context.c", "w");

@@ -892,7 +992,6 @@

   fprintf(f, "static const unsigned int\nvp8_default_coef_counts_8x8"

           "[BLOCK_TYPES_8X8] [COEF_BANDS]"

           "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");

   type = 0;

   do {

     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);

@@ -921,26 +1020,54 @@

     fprintf(f, "\n  }");

   } while (++type < BLOCK_TYPES_8X8);

+  fprintf(f, "\n};\n");

+#if CONFIG_TX16X16

+  fprintf(f, "static const unsigned int\nvp8_default_coef_counts_16x16"

+          "[BLOCK_TYPES_16X16] [COEF_BANDS]"

+          "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {");

+  type = 0;

+  do {

+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);

+    band = 0;

+    do {

+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);

+      pt = 0;

+      do {

+        fprintf(f, "%s\n      {", Comma(pt));

+        t = 0;

+        do {

+          const INT64 x = context_counters_16x16 [type] [band] [pt] [t];

+          const int y = (int) x;

+          assert(x == (INT64) y);  /* no overflow handling yet */

+          fprintf(f, "%s %d", Comma(t), y);

+        } while (++t < MAX_ENTROPY_TOKENS);

+        fprintf(f, "}");

+      } while (++pt < PREV_COEF_CONTEXTS);

+      fprintf(f, "\n    }");

+    } while (++band < COEF_BANDS);

+    fprintf(f, "\n  }");

+  } while (++type < BLOCK_TYPES_16X16);

   fprintf(f, "\n};\n");

+#endif

   fprintf(f, "static const vp8_prob\n"

           "vp8_default_coef_probs[BLOCK_TYPES] [COEF_BANDS] \n"

           "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");

   type = 0;

   do {

     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);

     band = 0;

     do {

       fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);

       pt = 0;

       do {

         unsigned int branch_ct [ENTROPY_NODES] [2];

         unsigned int coef_counts[MAX_ENTROPY_TOKENS];

         vp8_prob coef_probs[ENTROPY_NODES];

@@ -952,7 +1079,6 @@

         fprintf(f, "%s\n      {", Comma(pt));

         t = 0;

         do {

           fprintf(f, "%s %d", Comma(t), coef_probs[t]);

@@ -960,11 +1086,8 @@

         fprintf(f, "}");

       } while (++pt < PREV_COEF_CONTEXTS);

       fprintf(f, "\n    }");

     } while (++band < COEF_BANDS);

     fprintf(f, "\n  }");

   } while (++type < BLOCK_TYPES);

   fprintf(f, "\n};\n");

@@ -973,19 +1096,13 @@

           "vp8_default_coef_probs_8x8[BLOCK_TYPES_8X8] [COEF_BANDS]\n"

           "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");

   type = 0;

   do {

     fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);

     band = 0;

     do {

       fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);

       pt = 0;

       do {

         unsigned int branch_ct [ENTROPY_NODES] [2];

         unsigned int coef_counts[MAX_ENTROPY_TOKENS];

         vp8_prob coef_probs[ENTROPY_NODES];

@@ -994,34 +1111,65 @@

         vp8_tree_probs_from_distribution(

           MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,

           coef_probs, branch_ct, coef_counts, 256, 1);

         fprintf(f, "%s\n      {", Comma(pt));

-        t = 0;

+        t = 0;

         do {

           fprintf(f, "%s %d", Comma(t), coef_probs[t]);

         } while (++t < ENTROPY_NODES);

         fprintf(f, "}");

       } while (++pt < PREV_COEF_CONTEXTS);

       fprintf(f, "\n    }");

     } while (++band < COEF_BANDS);

     fprintf(f, "\n  }");

   } while (++type < BLOCK_TYPES_8X8);

   fprintf(f, "\n};\n");

+#if CONFIG_TX16X16

+  fprintf(f, "static const vp8_prob\n"

+          "vp8_default_coef_probs_16x16[BLOCK_TYPES_16X16] [COEF_BANDS]\n"

+          "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {");

+  type = 0;

+  do {

+    fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);

+    band = 0;

+    do {

+      fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);

+      pt = 0;

+      do {

+        unsigned int branch_ct [ENTROPY_NODES] [2];

+        unsigned int coef_counts[MAX_ENTROPY_TOKENS];

+        vp8_prob coef_probs[ENTROPY_NODES];

+        for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)

+          coef_counts[t] = context_counters_16x16[type] [band] [pt] [t];

+        vp8_tree_probs_from_distribution(

+          MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,

+          coef_probs, branch_ct, coef_counts, 256, 1);

+        fprintf(f, "%s\n      {", Comma(pt));

+        t = 0;

+        do {

+          fprintf(f, "%s %d", Comma(t), coef_probs[t]);

+        } while (++t < ENTROPY_NODES);

+        fprintf(f, "}");

+      } while (++pt < PREV_COEF_CONTEXTS);

+      fprintf(f, "\n    }");

+    } while (++band < COEF_BANDS);

+    fprintf(f, "\n  }");

+  } while (++type < BLOCK_TYPES_16X16);

+  fprintf(f, "\n};\n");

+#endif

   fclose(f);

   f = fopen("context.bin", "wb");

   fwrite(context_counters, sizeof(context_counters), 1, f);

   fwrite(context_counters_8x8, sizeof(context_counters_8x8), 1, f);

+#if CONFIG_TX16X16

+  fwrite(context_counters_16x16, sizeof(context_counters_16x16), 1, f);

+#endif

   fclose(f);

 #endif

@@ -1151,6 +1299,50 @@

+#if CONFIG_TX16X16

+/* Write a lone end-of-block token for a 16x16 block.
+ *
+ * Fills the slot at *tp with DCT_EOB_TOKEN using the type-3 / band-1
+ * 16x16 probabilities, advances *tp by one, counts the token in
+ * cpi->coef_counts_16x16 and clears both *a and *l. `b` and `frametype`
+ * are unused.
+ */
+static __inline
+void stuff1st_order_b_16x16(const BLOCKD *const b, TOKENEXTRA **tp, const FRAME_TYPE frametype,
+                            ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, VP8_COMP *cpi)
+{
+    TOKENEXTRA *const slot = *tp;  /* token slot to fill */
+    int ctx;                       /* near block/prev token context index */
+    VP8_COMBINEENTROPYCONTEXTS(ctx, *a, *l);
+    (void) b;
+    (void) frametype;
+    slot->Token = DCT_EOB_TOKEN;
+    slot->context_tree = cpi->common.fc.coef_probs_16x16[3][1][ctx];
+    slot->skip_eob_node = 0;
+    *tp = slot + 1;
+    ++cpi->coef_counts_16x16[3][1][ctx][DCT_EOB_TOKEN];
+    *a = *l = 0;  /* 0 <-> all coeff data is zero */
+}

+/* Stuff end-of-block tokens for a macroblock coded with the 16x16 transform.
+ *
+ * Emits one EOB token for the 16x16 luma block and one per chroma block
+ * visited (b = 16 and b = 20), appending to *t, and updates the above/left
+ * entropy contexts of `x` as it goes.
+ */
+void vp8_stuff_mb_16x16(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {
+  ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
+  ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
+  int b, i;
+  stuff1st_order_b_16x16(x->block, t, x->frame_type, A, L, cpi);
+  /* Copy the context written for block 0 into the slots of luma blocks
+   * 1..15. */
+  for (i = 1; i < 16; i++) {
+    *(A + vp8_block2above[i]) = *(A);
+    *(L +  vp8_block2left[i]) = *(L);
+  }
+  /* Chroma: index the contexts with the _8x8 tables for both the stuffing
+   * call and the duplication below, matching the 16x16 tokenizing path
+   * (previously the call used the non-_8x8 tables). */
+  for (b = 16; b < 24; b += 4) {
+    stuff1st_order_buv_8x8(x->block + b, t, 2, x->frame_type,
+        A + vp8_block2above_8x8[b],
+        L + vp8_block2left_8x8[b],
+        cpi);
+    *(A + vp8_block2above_8x8[b]+1) = *(A + vp8_block2above_8x8[b]);
+    *(L + vp8_block2left_8x8[b]+1 ) = *(L + vp8_block2left_8x8[b]);
+  }
+  /* Zero context slot 8 in both directions. */
+  vpx_memset(&A[8], 0, sizeof(A[8]));
+  vpx_memset(&L[8], 0, sizeof(L[8]));
+}

+#endif

 static __inline void stuff2nd_order_b

   TOKENEXTRA **tp,

@@ -1215,7 +1407,6 @@

   ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];

   pt = 0; /* 0 <-> all coeff data is zero */

   *a = *l = pt;

 void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) {

@@ -1241,9 +1432,13 @@

 void vp8_fix_contexts(MACROBLOCKD *x) {

   /* Clear entropy contexts for Y2 blocks */

-  if (x->mode_info_context->mbmi.mode != B_PRED

+  if ((x->mode_info_context->mbmi.mode != B_PRED

       && x->mode_info_context->mbmi.mode != I8X8_PRED

-      && x->mode_info_context->mbmi.mode != SPLITMV) {

+      && x->mode_info_context->mbmi.mode != SPLITMV)

+#if CONFIG_TX16X16

+      || x->mode_info_context->mbmi.txfm_size == TX_16X16

+#endif

+      ) {

     vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));

     vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));

   } else {

--- a/vp8/encoder/tokenize.h

+++ b/vp8/encoder/tokenize.h

@@ -44,8 +44,11 @@

 extern INT64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

 extern INT64 context_counters_8x8[BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

+#if CONFIG_TX16X16

+extern INT64 context_counters_16x16[BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS];

 #endif

+#endif

 extern const int *vp8_dct_value_cost_ptr;

 /* TODO: The Token field should be broken out into a separate char array to

  *  improve cache locality, since it's needed for costing when the rest of the

--

⑨