shithub: libvpx

ref: cd907b160133296b74c249537fc636e7462d21d2
parent: ae81d3a03f03b46329104c52cb5173b103b1e489
author: Jingning Han <jingning@google.com>
date: Sat Feb 16 09:08:36 EST 2013

16x16 butterfly inverse ADST/DCT hybrid transform

rebased.

This patch includes a 16x16 butterfly inverse ADST/DCT hybrid
transform. It uses a variant ADST with the kernel
    sin((2k+1)*(2n+1)*pi/(4N)),
which allows a butterfly implementation.
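
For reference, a minimal floating-point sketch (not part of this
patch) of the direct O(N^2) form of that kernel is given below;
the name reference_iadst16 is illustrative only, and the fixed-point
scaling, sign flips and output permutation used by the butterfly
version in this patch are omitted:

    #include <math.h>

    #define N 16

    /* Direct-form length-16 ADST with kernel sin((2k+1)(2n+1)*pi/(4N)).
     * The kernel matrix is symmetric and orthogonal up to scale, so the
     * same routine also serves as the (unnormalized) inverse. */
    static void reference_iadst16(const double *input, double *output) {
      static const double kPi = 3.14159265358979323846;
      int n, k;
      for (n = 0; n < N; ++n) {
        double sum = 0.0;
        for (k = 0; k < N; ++k)
          sum += input[k] * sin((2 * k + 1) * (2 * n + 1) * kPi / (4.0 * N));
        output[n] = sum;  /* normalization (2/N) omitted for clarity */
      }
    }

The fixed-point butterfly iadst16_1d in the diff below realizes this
kernel with the cospi_*_64 constants and dct_const_round_shift rounding.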

The coding gain over 16x16 DCT is about 0.1% for both the derf
and std-hd sets. Notably, in the std-hd set many sequences gain
about 0.5% and some about 0.2%, while a few points lose 1% to
3%, which brings the average down to about 0.1%.

Change-Id: Ie80ac84cf403390f6e5d282caa58723739e5ec17

--- a/configure
+++ b/configure
@@ -249,6 +249,7 @@
     abovesprefmv
     intht
     intht4x4
+    intht16x16
 "
 CONFIG_LIST="
     external_build
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -399,7 +399,7 @@
 
 #define ACTIVE_HT8  300
 
-#define ACTIVE_HT16 0
+#define ACTIVE_HT16 300
 
 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE
 static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -986,6 +986,231 @@
     }
 }
 
+#if CONFIG_INTHT16X16
+void iadst16_1d(int16_t *input, int16_t *output) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+  x0 = input[15];
+  x1 = input[0];
+  x2 = input[13];
+  x3 = input[2];
+  x4 = input[11];
+  x5 = input[4];
+  x6 = input[9];
+  x7 = input[6];
+  x8 = input[7];
+  x9 = input[8];
+  x10 = input[5];
+  x11 = input[10];
+  x12 = input[3];
+  x13 = input[12];
+  x14 = input[1];
+  x15 = input[14];
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    output[0] = output[1] = output[2] = output[3] = output[4]
+              = output[5] = output[6] = output[7] = output[8]
+              = output[9] = output[10] = output[11] = output[12]
+              = output[13] = output[14] = output[15] = 0;
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = dct_const_round_shift(s0 + s8);
+  x1 = dct_const_round_shift(s1 + s9);
+  x2 = dct_const_round_shift(s2 + s10);
+  x3 = dct_const_round_shift(s3 + s11);
+  x4 = dct_const_round_shift(s4 + s12);
+  x5 = dct_const_round_shift(s5 + s13);
+  x6 = dct_const_round_shift(s6 + s14);
+  x7 = dct_const_round_shift(s7 + s15);
+  x8  = dct_const_round_shift(s0 - s8);
+  x9  = dct_const_round_shift(s1 - s9);
+  x10 = dct_const_round_shift(s2 - s10);
+  x11 = dct_const_round_shift(s3 - s11);
+  x12 = dct_const_round_shift(s4 - s12);
+  x13 = dct_const_round_shift(s5 - s13);
+  x14 = dct_const_round_shift(s6 - s14);
+  x15 = dct_const_round_shift(s7 - s15);
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 =    x8 * cospi_4_64   + x9 * cospi_28_64;
+  s9 =    x8 * cospi_28_64  - x9 * cospi_4_64;
+  s10 =   x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 =   x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 =   x12 * cospi_4_64  + x13 * cospi_28_64;
+  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 =   x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
+  x4 = s0 - s4;
+  x5 = s1 - s5;
+  x6 = s2 - s6;
+  x7 = s3 - s7;
+  x8 = dct_const_round_shift(s8 + s12);
+  x9 = dct_const_round_shift(s9 + s13);
+  x10 = dct_const_round_shift(s10 + s14);
+  x11 = dct_const_round_shift(s11 + s15);
+  x12 = dct_const_round_shift(s8 - s12);
+  x13 = dct_const_round_shift(s9 - s13);
+  x14 = dct_const_round_shift(s10 - s14);
+  x15 = dct_const_round_shift(s11 - s15);
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64  + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 =   x6 * cospi_8_64  + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64  + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 =   x14 * cospi_8_64  + x15 * cospi_24_64;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = dct_const_round_shift(s4 + s6);
+  x5 = dct_const_round_shift(s5 + s7);
+  x6 = dct_const_round_shift(s4 - s6);
+  x7 = dct_const_round_shift(s5 - s7);
+  x8 = s8 + s10;
+  x9 = s9 + s11;
+  x10 = s8 - s10;
+  x11 = s9 - s11;
+  x12 = dct_const_round_shift(s12 + s14);
+  x13 = dct_const_round_shift(s13 + s15);
+  x14 = dct_const_round_shift(s12 - s14);
+  x15 = dct_const_round_shift(s13 - s15);
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (- x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (- x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = dct_const_round_shift(s2);
+  x3 = dct_const_round_shift(s3);
+  x6 = dct_const_round_shift(s6);
+  x7 = dct_const_round_shift(s7);
+  x10 = dct_const_round_shift(s10);
+  x11 = dct_const_round_shift(s11);
+  x14 = dct_const_round_shift(s14);
+  x15 = dct_const_round_shift(s15);
+
+  output[0] = x0;
+  output[1] = - x8;
+  output[2] = x12;
+  output[3] = - x4;
+  output[4] = x6;
+  output[5] = x14;
+  output[6] = x10;
+  output[7] = x2;
+  output[8] = x3;
+  output[9] =  x11;
+  output[10] = x15;
+  output[11] = x7;
+  output[12] = x5;
+  output[13] = - x13;
+  output[14] = x9;
+  output[15] = - x1;
+}
+
+void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
+                          int pitch, TX_TYPE tx_type) {
+  int16_t out[16 * 16];
+  int16_t *outptr = &out[0];
+  const int short_pitch = pitch >> 1;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
+
+  void (*invr)(int16_t*, int16_t*);
+  void (*invc)(int16_t*, int16_t*);
+
+  switch (tx_type) {
+    case ADST_ADST:
+      invc = &iadst16_1d;
+      invr = &iadst16_1d;
+      break;
+    case ADST_DCT:
+      invc = &iadst16_1d;
+      invr = &idct16_1d;
+      break;
+    case DCT_ADST:
+      invc = &idct16_1d;
+      invr = &iadst16_1d;
+      break;
+    case DCT_DCT:
+      invc = &idct16_1d;
+      invr = &idct16_1d;
+      break;
+    default:
+      assert(0);
+  }
+
+  // inverse transform row vectors
+  for (i = 0; i < 16; ++i) {
+    invr(input, outptr);
+    input += short_pitch;
+    outptr += 16;
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    invc(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      output[j * 16 + i] = (temp_out[j] + 32) >> 6;
+  }
+}
+#endif
+
+
+
 void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
     int16_t out[16 * 16];
     int16_t *outptr = &out[0];
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -116,7 +116,11 @@
   BLOCKD *bd = &xd->block[0];
   TX_TYPE tx_type = get_tx_type_16x16(xd, bd);
   if (tx_type != DCT_DCT) {
+#if CONFIG_INTHT16X16
+    vp9_short_iht16x16(bd->dqcoeff, bd->diff, 32, tx_type);
+#else
     vp9_ihtllm(bd->dqcoeff, bd->diff, 32, tx_type, 16, bd->eob);
+#endif
   } else {
     vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
                                   &xd->block[0].diff[0], 32);
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -294,6 +294,11 @@
 specialize vp9_short_iht4x4
 #endif
 
+#if CONFIG_INTHT16X16
+prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16
+#endif
+
 prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"
 specialize vp9_ihtllm
 
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -267,7 +267,11 @@
       input[i] = input[i] * dq[1];
 
     // inverse hybrid transform
+#if CONFIG_INTHT16X16
+    vp9_short_iht16x16(input, output, 32, tx_type);
+#else
     vp9_ihtllm(input, output, 32, tx_type, 16, eobs);
+#endif
 
     // the idct halves ( >> 1) the pitch
     // vp9_short_idct16x16_c(input, output, 32);
--