shithub: libvpx

--- a/configure

+++ b/configure

@@ -250,6 +250,7 @@

     enable_6tap

     abovesprefmv

     intht

+    intht4x4

 CONFIG_LIST="

     external_build

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -408,7 +408,7 @@

 #define ACTIVE_HT8  300

-#define ACTIVE_HT16 300

+#define ACTIVE_HT16 0

 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE

 static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {

--- a/vp9/common/vp9_idct.h

+++ b/vp9/common/vp9_idct.h

@@ -50,6 +50,14 @@

 static const int cospi_30_64 = 1606;

 static const int cospi_31_64 = 804;

+#if CONFIG_INTHT4X4

+// sinpi_k_9 = round(16384 * sqrt(2) * sin(k * pi / 9) * 2 / 3), k = 1..4

+static const int sinpi_1_9 = 5283;

+static const int sinpi_2_9 = 9929;

+static const int sinpi_3_9 = 13377;

+static const int sinpi_4_9 = 15212;

+#endif

 static INLINE int dct_const_round_shift(int input) {

   int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;

   assert((rv <= INT16_MAX) && (rv >= INT16_MIN));

--- a/vp9/common/vp9_idctllm.c

+++ b/vp9/common/vp9_idctllm.c

@@ -494,7 +494,6 @@

 #endif

 void idct4_1d(int16_t *input, int16_t *output) {

   int16_t step[4];

   int temp1, temp2;

@@ -651,6 +650,100 @@

+#if CONFIG_INTHT4X4

+// 1-D 4-point inverse transform that vp9_short_iht4x4_c selects for the ADST

+// direction(s) of a hybrid transform. Reads input[0..3], writes output[0..3].

+static void iadst4_1d(int16_t *input, int16_t *output) {

+  const int in0 = input[0];

+  const int in1 = input[1];

+  const int in2 = input[2];

+  const int in3 = input[3];

+  int t0, t1, t2, t3;

+  // An all-zero input produces an all-zero output; skip the multiplies.

+  if ((in0 | in1 | in2 | in3) == 0) {

+    output[0] = 0;

+    output[1] = 0;

+    output[2] = 0;

+    output[3] = 0;

+    return;

+  }

+  // Products with the sinpi_k_9 fixed-point constants, grouped by the

+  // output terms they feed.

+  t0 = sinpi_1_9 * in0 + sinpi_4_9 * in2 + sinpi_2_9 * in3;

+  t1 = sinpi_2_9 * in0 - sinpi_1_9 * in2 - sinpi_4_9 * in3;

+  t2 = sinpi_3_9 * (in0 - in2 + in3);

+  t3 = sinpi_3_9 * in1;

+  // The 1-D transform scaling factor is sqrt(2). Dynamic range before the

+  // shift: 14b (input) + 14b (multiplication scaling) + 1b (addition) = 29b,

+  // so each rounded output has a bit depth of 15b.

+  output[0] = dct_const_round_shift(t0 + t3);

+  output[1] = dct_const_round_shift(t1 + t3);

+  output[2] = dct_const_round_shift(t2);

+  output[3] = dct_const_round_shift(t0 + t1 - t3);

+}

+// Inverse 4x4 hybrid transform: a 1-D pass over each row, then over each

+// column, with the 1-D transform for each direction chosen by tx_type.

+//   input   - 16 coefficients, consumed as four consecutive rows of four.

+//   output  - destination; the value for row j, column i is stored at

+//             output[j * (pitch >> 1) + i].

+//   pitch   - destination stride, halved before indexing int16_t elements

+//             (presumably a stride in bytes - confirm against callers).

+//   tx_type - ADST_ADST, ADST_DCT, DCT_ADST or DCT_DCT. Any other value

+//             asserts in debug builds and returns without writing output.

+void vp9_short_iht4x4_c(int16_t *input, int16_t *output,

+                        int pitch, TX_TYPE tx_type) {

+  int16_t out[16];

+  int16_t *outptr = &out[0];

+  const int short_pitch = pitch >> 1;

+  int i, j;

+  int16_t temp_in[4], temp_out[4];

+  void (*invr)(int16_t*, int16_t*);

+  void (*invc)(int16_t*, int16_t*);

+  // invc is applied to columns, invr to rows.

+  switch (tx_type) {

+    case ADST_ADST:

+      invc = &iadst4_1d;

+      invr = &iadst4_1d;

+      break;

+    case ADST_DCT:

+      invc = &iadst4_1d;

+      invr = &idct4_1d;

+      break;

+    case DCT_ADST:

+      invc = &idct4_1d;

+      invr = &iadst4_1d;

+      break;

+    case DCT_DCT:

+      invc = &idct4_1d;

+      invr = &idct4_1d;

+      break;

+    default:

+      // assert() compiles away under NDEBUG; return explicitly so invr/invc

+      // are never called while still uninitialized.

+      assert(0);

+      return;

+  }

+  // inverse transform row vectors

+  for (i = 0; i < 4; ++i) {

+    invr(input, outptr);

+    input  += 4;

+    outptr += 4;

+  }

+  // inverse transform column vectors

+  for (i = 0; i < 4; ++i) {

+    // Gather column i of the row-pass result into a contiguous vector.

+    for (j = 0; j < 4; ++j)

+      temp_in[j] = out[j * 4 + i];

+    invc(temp_in, temp_out);

+    // Round to nearest and drop 4 bits while scattering the column out.

+    for (j = 0; j < 4; ++j)

+      output[j * short_pitch + i] = (temp_out[j] + 8) >> 4;

+  }

+}

+#endif

 #if CONFIG_INTHT

 static void iadst8_1d(int16_t *input, int16_t *output) {

   int x0, x1, x2, x3, x4, x5, x6, x7;

@@ -733,7 +826,7 @@

 void vp9_short_iht8x8_c(int16_t *input, int16_t *output,

-                        TX_TYPE tx_type, int pitch) {

+                        int pitch, TX_TYPE tx_type) {

   int16_t out[8 * 8];

   int16_t *outptr = &out[0];

   const int short_pitch = pitch >> 1;

--- a/vp9/common/vp9_invtrans.c

+++ b/vp9/common/vp9_invtrans.c

@@ -51,8 +51,13 @@

   for (i = 0; i < 16; i++) {

     TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]);

     if (tx_type != DCT_DCT) {

+#if CONFIG_INTHT4X4

+      vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff,

+                       32, tx_type);

+#else

       vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32,

                    tx_type, 4, xd->block[i].eob);

+#endif

     } else {

       vp9_inverse_transform_b_4x4(xd, i, 32);

@@ -93,7 +98,7 @@

     if (tx_type != DCT_DCT) {

 #if CONFIG_INTHT

       vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff,

-                           tx_type, 32);

+                           32, tx_type);

 #else

       vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8,

                  xd->block[i].eob);

@@ -108,7 +113,7 @@

     if (tx_type != DCT_DCT) {

 #if CONFIG_INTHT

       vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff,

-                           tx_type, 32);

+                           32, tx_type);

 #else

       vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8,

                  xd->block[i + 2].eob);

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -300,8 +300,13 @@

 specialize vp9_short_idct1_32x32

 #if CONFIG_INTHT

-prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int tx_type, int pitch"

+prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"

 specialize vp9_short_iht8x8

+#endif

+#if CONFIG_INTHT4X4

+prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"

+specialize vp9_short_iht4x4

 #endif

 prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs"

--- a/vp9/decoder/vp9_dequantize.c

+++ b/vp9/decoder/vp9_dequantize.c

@@ -69,7 +69,11 @@

     input[i] = dq[i] * input[i];

+#if CONFIG_INTHT4X4

+  vp9_short_iht4x4(input, output, 8, tx_type);

+#else

   vp9_ihtllm(input, output, 4 << 1, tx_type, 4, eobs);

+#endif

   vpx_memset(input, 0, 32);

@@ -93,7 +97,7 @@

 #if CONFIG_INTHT

-    vp9_short_iht8x8(input, output, tx_type, 16);

+    vp9_short_iht8x8(input, output, 16, tx_type);

 #else

     vp9_ihtllm(input, output, 16, tx_type, 8, eobs);

 #endif

--- a/vp9/encoder/vp9_encodeintra.c

+++ b/vp9/encoder/vp9_encodeintra.c

@@ -56,7 +56,11 @@

   if (tx_type != DCT_DCT) {

     vp9_fht(be->src_diff, 32, be->coeff, tx_type, 4);

     vp9_ht_quantize_b_4x4(be, b, tx_type);

+#if CONFIG_INTHT4X4

+    vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);

+#else

     vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);

+#endif

   } else {

     x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);

     x->quantize_b_4x4(be, b) ;

@@ -155,7 +159,7 @@

 #if CONFIG_INTHT

       vp9_short_iht8x8(xd->block[idx].dqcoeff, xd->block[ib].diff,

-                            tx_type, 32);

+                            32, tx_type);

 #else

       vp9_ihtllm(xd->block[idx].dqcoeff, xd->block[ib].diff, 32,

                    tx_type, 8, xd->block[idx].eob);

@@ -173,7 +177,11 @@

       if (tx_type != DCT_DCT) {

         vp9_fht_c(be->src_diff, 32, be->coeff, tx_type, 4);

         vp9_ht_quantize_b_4x4(be, b, tx_type);

+#if CONFIG_INTHT4X4

+        vp9_short_iht4x4(b->dqcoeff, b->diff, 32, tx_type);

+#else

         vp9_ihtllm(b->dqcoeff, b->diff, 32, tx_type, 4, b->eob);

+#endif

       } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) {

         x->vp9_short_fdct8x4(be->src_diff, be->coeff, 32);

         x->quantize_b_4x4_pair(be, be + 1, b, b + 1);

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -1170,7 +1170,11 @@

   // inverse transform

   if (best_tx_type != DCT_DCT)

+#if CONFIG_INTHT4X4

+    vp9_short_iht4x4(best_dqcoeff, b->diff, 32, best_tx_type);

+#else

     vp9_ihtllm(best_dqcoeff, b->diff, 32, best_tx_type, 4, b->eob);

+#endif

   else

     xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);

--

⑨