shithub: libvpx

--- a/vp9/common/generic/systemdependent.c

+++ b/vp9/common/generic/systemdependent.c

@@ -32,6 +32,7 @@

   rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;

   rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;

   rtcd->idct.idct16x16    = vp9_short_idct16x16_c;

+  rtcd->idct.idct10_16x16    = vp9_short_idct10_16x16_c;

   rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;

   rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;

--- a/vp9/common/idct.h

+++ b/vp9/common/idct.h

@@ -50,6 +50,11 @@

 #endif

 extern prototype_idct(vp9_idct_idct16x16);

+#ifndef vp9_idct_idct10_16x16

+#define vp9_idct_idct10_16x16 vp9_short_idct10_16x16_c

+#endif

+extern prototype_idct(vp9_idct_idct10_16x16);

 #ifndef vp9_idct_idct8

 #define vp9_idct_idct8 vp9_short_idct8x8_c

 #endif

@@ -133,6 +138,7 @@

   vp9_idct_fn_t ihaar2_1;

   vp9_idct_fn_t            idct16x16;

+  vp9_idct_fn_t            idct10_16x16;

 } vp9_idct_rtcd_vtable_t;

 #if CONFIG_RUNTIME_CPU_DETECT

--- a/vp9/common/idctllm.c

+++ b/vp9/common/idctllm.c

@@ -1502,6 +1502,161 @@

         output[j * 16 + i] = temp_out[j];

+/* One-dimensional pass of the reduced 16x16 inverse DCT. The following

+ * function is called when we know the maximum number of non-zero dct

+ * coefficients is less than or equal to 10.

+ *

+ * input:           only input[0] through input[3] are read; input[4] through

+ *                  input[15] are ignored.

+ * output:          all 16 entries are written. It is also used as scratch

+ *                  storage between stages, so it must not alias input.

+ * last_shift_bits: right shift applied to every final sum. When it is

+ *                  greater than 0, 1 << (last_shift_bits - 1) is added

+ *                  before shifting; when it is 0 nothing is added.

+ *

+ * NOTE(review): C1..C15 and the INITIAL_ and RIGHT_ rounding/shift macros are

+ * defined outside this hunk; presumably they are the ones shared with the

+ * full 16x16 inverse transform in this file -- confirm.

+ */

+static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16],

+                                      int last_shift_bits) {

+    int16_t step[16] = {0};

+    int intermediate[16] = {0};  // only entries 8 and 9 are used below

+    int temp1, temp2;

+    int last_rounding = 0;

+    if (last_shift_bits > 0)

+      last_rounding = 1 << (last_shift_bits - 1);  // half of 1 << last_shift_bits

+    // step 1 and 2 (step[2], step[3], step[6], step[7] keep their initial 0)

+    step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    // for odd input (input[1] and input[3])

+    temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 *= C8;

+    intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 *= C8;

+    intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT;

+    // step 3 (results are staged in output[] and consumed by step 4)

+    output[0] = step[ 0];

+    output[1] = step[ 1];

+    output[2] = step[ 1];

+    output[3] = step[ 0];

+    temp1 = step[ 4] * C14;

+    output[4] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = step[ 4] * C2;

+    output[7] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = step[ 5] * C10;

+    output[5] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = step[ 5] * C6;

+    output[6] =  (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    output[8] = step[ 8] + step[11];

+    output[9] = step[ 9] + step[10];

+    output[10] = step[ 9] - step[10];

+    output[11] = step[ 8] - step[11];

+    output[12] = step[12] + step[15];

+    output[13] = step[13] + step[14];

+    output[14] = step[13] - step[14];

+    output[15] = step[12] - step[15];

+    // step 4 (reads the values staged in output[], writes step[] again)

+    step[ 0] = output[0] + output[7];

+    step[ 1] = output[1] + output[6];

+    step[ 2] = output[2] + output[5];

+    step[ 3] = output[3] + output[4];

+    step[ 4] = output[3] - output[4];

+    step[ 5] = output[2] - output[5];

+    step[ 6] = output[1] - output[6];

+    step[ 7] = output[0] - output[7];

+    temp1 = output[8] * C7;

+    temp2 = output[15] * C9;

+    step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = output[9] * C11;

+    temp2 = output[14] * C5;

+    step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = output[10] * C3;

+    temp2 = output[13] * C13;

+    step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = output[11] * C15;

+    temp2 = output[12] * C1;

+    step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = output[11] * C1;

+    temp2 = output[12] * C15;

+    step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = output[10] * C13;

+    temp2 = output[13] * C3;

+    step[13] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = output[9] * C5;

+    temp2 = output[14] * C11;

+    step[14] = (temp2 - temp1 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    temp1 = output[8] * C9;

+    temp2 = output[15] * C7;

+    step[15] = (temp1 + temp2 +   RIGHT_ROUNDING) >> RIGHT_SHIFT;

+    // step 5: pair step[k] with step[15 - k], then round and shift

+    output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits;

+    output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits;

+    output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits;

+    output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits;

+    output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits;

+    output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits;

+    output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits;

+    output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits;

+    output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits;

+    output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits;

+    output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits;

+    output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits;

+    output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits;

+    output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits;

+    output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits;

+    output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits;

+}

+void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {  /*

+     * Reduced 16x16 inverse transform: butterfly_16x16_idct10_1d is run on

+     * the first 4 input rows, then on all 16 columns of the intermediate

+     * result. output receives 16 rows of 16 values. */

+    int16_t out[16 * 16];  // row-pass results, 16 values per row

+    int16_t *outptr = &out[0];

+    const int short_pitch = pitch >> 1;  // pitch halved: per-row step into input

+    int i, j;

+    int16_t temp_in[16], temp_out[16];  // one column in / out for the column pass

+    /* First transform rows. Since all non-zero dct coefficients are in the

+     * upper-left 4x4 area, we only need to calculate the first 4 rows here.

+     * Rows 4 through 15 of out keep the zeros written by the memset below.

+     */

+    vpx_memset(out, 0, sizeof(out));

+    for (i = 0; i < 4; ++i) {

+      butterfly_16x16_idct10_1d(input, outptr, 0);  // no final shift on rows

+      input += short_pitch;

+      outptr += 16;

+    }

+    // Then transform columns

+    for (i = 0; i < 16; ++i) {

+      for (j = 0; j < 16; ++j)

+        temp_in[j] = out[j*16 + i];  // gather column i of the row-pass result

+      butterfly_16x16_idct10_1d(temp_in, temp_out, 3);  // rounded shift by 3 bits

+      for (j = 0; j < 16; ++j)

+        output[j*16 + i] = temp_out[j];  // output rows are 16 apart; pitch unused

+    }

+}

 #undef INITIAL_SHIFT

 #undef INITIAL_ROUNDING

 #undef RIGHT_SHIFT

--- a/vp9/common/rtcd_defs.sh

+++ b/vp9/common/rtcd_defs.sh

@@ -54,7 +54,7 @@

 prototype void vp9_dequant_idct_add_uv_block_8x8 "short *q, short *dq, unsigned char *pre, unsigned char *dstu, unsigned char *dstv, int stride, unsigned short *eobs, struct macroblockd *xd"

 specialize vp9_dequant_idct_add_uv_block_8x8

-prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"

+prototype void vp9_dequant_idct_add_16x16 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, unsigned short eobs"

 specialize vp9_dequant_idct_add_16x16

 prototype void vp9_dequant_idct_add_8x8 "short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride"

--- a/vp9/decoder/decodframe.c

+++ b/vp9/decoder/decodframe.c

@@ -401,7 +401,7 @@

       } else {

         vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,

                                      xd->predictor, xd->dst.y_buffer,

-                                     16, xd->dst.y_stride);

+                                     16, xd->dst.y_stride, xd->eobs[0]);

     } else if (tx_size == TX_8X8) {

 #if CONFIG_SUPERBLOCKS

--- a/vp9/decoder/dequantize.c

+++ b/vp9/decoder/dequantize.c

@@ -19,6 +19,28 @@

 extern int dec_debug;

 #endif

+static void recon(int16_t *diff, uint8_t *pred, int pitch, uint8_t *dest,

+                  int stride, int width, int height) {  /* Adds the residual

+   * in diff to the prediction in pred, clamps each sum to [0, 255] and stores

+   * it in dest, for a block of width x height samples. Rows of diff are

+   * width apart, rows of pred are pitch apart, rows of dest are stride

+   * apart. */

+  int r, c;

+  for (r = 0; r < height; r++) {

+    for (c = 0; c < width; c++) {

+      int a = diff[c] + pred[c];  // computed as int so the sum can leave 0..255

+      if (a < 0)

+        a = 0;

+      else if (a > 255)

+        a = 255;

+      dest[c] = (uint8_t) a;

+    }

+    dest += stride;  // advance all three pointers to the next row

+    diff += width;

+    pred += pitch;

+  }

+}

 void vp9_dequantize_b_c(BLOCKD *d) {

   int i;

@@ -37,7 +59,6 @@

                                int pitch, int stride) {

   short output[16];

   short *diff_ptr = output;

-  int r, c;

   int i;

   for (i = 0; i < 16; i++) {

@@ -48,23 +69,7 @@

   vpx_memset(input, 0, 32);

-  for (r = 0; r < 4; r++) {

-      for (c = 0; c < 4; c++) {

-        int a = diff_ptr[c] + pred[c];

-        if (a < 0)

-            a = 0;

-        if (a > 255)

-            a = 255;

-        dest[c] = (unsigned char) a;

-    }

-      dest += stride;

-      diff_ptr += 4;

-      pred += pitch;

-  }

+  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,

@@ -115,7 +120,6 @@

                             unsigned char *dest, int pitch, int stride) {

   short output[16];

   short *diff_ptr = output;

-  int r, c;

   int i;

   for (i = 0; i < 16; i++) {

@@ -127,23 +131,7 @@

   vpx_memset(input, 0, 32);

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

-      int a = diff_ptr[c] + pred[c];

-      if (a < 0)

-        a = 0;

-      if (a > 255)

-        a = 255;

-      dest[c] = (unsigned char) a;

-    }

-    dest += stride;

-    diff_ptr += 4;

-    pred += pitch;

-  }

+  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

 void vp9_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,

@@ -152,7 +140,6 @@

   int i;

   short output[16];

   short *diff_ptr = output;

-  int r, c;

   input[0] = (short)Dc;

@@ -165,23 +152,7 @@

   vpx_memset(input, 0, 32);

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

-      int a = diff_ptr[c] + pred[c];

-      if (a < 0)

-        a = 0;

-      if (a > 255)

-        a = 255;

-      dest[c] = (unsigned char) a;

-    }

-    dest += stride;

-    diff_ptr += 4;

-    pred += pitch;

-  }

+  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

 #if CONFIG_LOSSLESS

@@ -190,7 +161,6 @@

                                      int pitch, int stride) {

   short output[16];

   short *diff_ptr = output;

-  int r, c;

   int i;

   for (i = 0; i < 16; i++) {

@@ -201,23 +171,7 @@

   vpx_memset(input, 0, 32);

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

-      int a = diff_ptr[c] + pred[c];

-      if (a < 0)

-        a = 0;

-      if (a > 255)

-        a = 255;

-      dest[c] = (unsigned char) a;

-    }

-    dest += stride;

-    diff_ptr += 4;

-    pred += pitch;

-  }

+  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

 void vp9_dequant_dc_idct_add_lossless_c(short *input, short *dq,

@@ -227,7 +181,6 @@

   int i;

   short output[16];

   short *diff_ptr = output;

-  int r, c;

   input[0] = (short)dc;

@@ -238,23 +191,7 @@

   vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);

   vpx_memset(input, 0, 32);

-  for (r = 0; r < 4; r++) {

-    for (c = 0; c < 4; c++) {

-      int a = diff_ptr[c] + pred[c];

-      if (a < 0)

-        a = 0;

-      if (a > 255)

-        a = 255;

-      dest[c] = (unsigned char) a;

-    }

-    dest += stride;

-    diff_ptr += 4;

-    pred += pitch;

-  }

+  recon(diff_ptr, pred, pitch, dest, stride, 4, 4);

 #endif

@@ -461,7 +398,7 @@

                                      int pitch, int stride) {

   short output[256];

   short *diff_ptr = output;

-  int r, c, i;

+  int i;

   input[0]= input[0] * dq[0];

@@ -477,55 +414,80 @@

   vpx_memset(input, 0, 512);

-  for (r = 0; r < 16; r++) {

-    for (c = 0; c < 16; c++) {

-      int a = diff_ptr[c] + pred[c];

-      if (a < 0)

-        a = 0;

-      else if (a > 255)

-        a = 255;

-      dest[c] = (unsigned char) a;

-    }

-    dest += stride;

-    diff_ptr += 16;

-    pred += pitch;

-  }

+  recon(diff_ptr, pred, pitch, dest, stride, 16, 16);

-void vp9_dequant_idct_add_16x16_c(short *input, short *dq, unsigned char *pred,

-                                  unsigned char *dest, int pitch, int stride) {

-  short output[256];

-  short *diff_ptr = output;

+void vp9_dequant_idct_add_16x16_c(int16_t *input, int16_t *dq, uint8_t *pred,

+                                  uint8_t *dest, int pitch, int stride,

+                                  uint16_t eobs) {

+  int16_t output[256];

+  int16_t *diff_ptr = output;

   int r, c, i;

-  input[0]= input[0] * dq[0];

+  /* The calculation can be simplified if there are not many non-zero dct

+   * coefficients. Use eobs to separate different cases. */

+  if (eobs == 0) {

+    /* All 0 DCT coefficient */

+    vp9_copy_mem16x16(pred, pitch, dest, stride);

+  } else if (eobs == 1) {

+    /* DC only DCT coefficient. */

+    int16_t out;

-  // recover quantizer for 4 4x4 blocks

-  for (i = 1; i < 256; i++)

-    input[i] = input[i] * dq[1];

+    out = (input[0] * dq[0] + 2) >> 2;

+    out = (out + 2) >> 2;

+    out = (out + 4) >> 3;

-  // the idct halves ( >> 1) the pitch

-  vp9_short_idct16x16_c(input, output, 32);

+    input[0] = 0;

-  vpx_memset(input, 0, 512);

+    for (r = 0; r < 16; r++) {

+      for (c = 0; c < 16; c++) {

+        int a = out + pred[c];

-  for (r = 0; r < 16; r++) {

-    for (c = 0; c < 16; c++) {

-      int a = diff_ptr[c] + pred[c];

+        if (a < 0)

+          a = 0;

+        else if (a > 255)

+          a = 255;

-      if (a < 0)

-        a = 0;

-      else if (a > 255)

-        a = 255;

+        dest[c] = (uint8_t) a;

+      }

-      dest[c] = (unsigned char) a;

+      dest += stride;

+      pred += pitch;

-    dest += stride;

-    diff_ptr += 16;

-    pred += pitch;

+  } else if (eobs <= 10) {

+    input[0]= input[0] * dq[0];

+    input[1] = input[1] * dq[1];

+    input[2] = input[2] * dq[1];

+    input[3] = input[3] * dq[1];

+    input[16] = input[16] * dq[1];

+    input[17] = input[17] * dq[1];

+    input[18] = input[18] * dq[1];

+    input[32] = input[32] * dq[1];

+    input[33] = input[33] * dq[1];

+    input[48] = input[48] * dq[1];

+    // the idct halves ( >> 1) the pitch

+    vp9_short_idct10_16x16_c(input, output, 32);

+    input[0] = input[1] = input[2] = input[3] = 0;

+    input[16] = input[17] = input[18] = 0;

+    input[32] = input[33] = 0;

+    input[48] = 0;

+    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);

+  } else {

+    input[0]= input[0] * dq[0];

+    // recover quantizer for 4 4x4 blocks

+    for (i = 1; i < 256; i++)

+      input[i] = input[i] * dq[1];

+    // the idct halves ( >> 1) the pitch

+    vp9_short_idct16x16_c(input, output, 32);

+    vpx_memset(input, 0, 512);

+    recon(diff_ptr, pred, pitch, dest, stride, 16, 16);

--

⑨