shithub: libvpx

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -11,40 +11,10 @@

 #include <math.h>

 #include "./vp9_rtcd.h"

-#include "vp9/common/vp9_systemdependent.h"

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_idct.h"

+#include "vp9/common/vp9_systemdependent.h"

-#if CONFIG_EMULATE_HARDWARE

-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a

-// non-normative method to handle overflows. A stream that causes

-// overflows  in the inverse transform is considered invalid in VP9,

-// and a hardware implementer is free to choose any reasonable

-// method to handle overflows. However to aid in hardware

-// verification they can use a specific implementation of the

-// WRAPLOW() macro below that is identical to their intended

-// hardware implementation (and also use configure options to trigger

-// the C-implementation of the transform).

-//

-// The particular WRAPLOW implementation below performs strict

-// overflow wrapping to match common hardware implementations.

-// bd of 8 uses trans_low with 16bits, need to remove 16bits

-// bd of 10 uses trans_low with 18bits, need to remove 14bits

-// bd of 12 uses trans_low with 20bits, need to remove 12bits

-// bd of x uses trans_low with 8+x bits, need to remove 24-x bits

-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))

-#else

-#define WRAPLOW(x, bd) ((int32_t)(x))

-#endif  // CONFIG_EMULATE_HARDWARE

-#if CONFIG_VP9_HIGHBITDEPTH

-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,

-                                             int bd) {

-  trans = WRAPLOW(trans, bd);

-  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);

-}

-#endif  // CONFIG_VP9_HIGHBITDEPTH

 static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {

   trans = WRAPLOW(trans, 8);

   return clip_pixel(WRAPLOW(dest + trans, 8));

@@ -1540,7 +1510,7 @@

-static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {

+void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {

   tran_low_t step[4];

   tran_high_t temp1, temp2;

   (void) bd;

@@ -1571,7 +1541,7 @@

   // Rows

   for (i = 0; i < 4; ++i) {

-    highbd_idct4(input, outptr, bd);

+    vp9_highbd_idct4(input, outptr, bd);

     input += 4;

     outptr += 4;

@@ -1580,7 +1550,7 @@

   for (i = 0; i < 4; ++i) {

     for (j = 0; j < 4; ++j)

       temp_in[j] = out[j * 4 + i];

-    highbd_idct4(temp_in, temp_out, bd);

+    vp9_highbd_idct4(temp_in, temp_out, bd);

     for (j = 0; j < 4; ++j) {

       dest[j * stride + i] = highbd_clip_pixel_add(

           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);

@@ -1607,7 +1577,7 @@

-static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {

+void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {

   tran_low_t step1[8], step2[8];

   tran_high_t temp1, temp2;

   // stage 1

@@ -1625,7 +1595,7 @@

   step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);

   // stage 2 & stage 3 - even half

-  highbd_idct4(step1, step1, bd);

+  vp9_highbd_idct4(step1, step1, bd);

   // stage 2 - odd half

   step2[4] = WRAPLOW(step1[4] + step1[5], bd);

@@ -1662,7 +1632,7 @@

   // First transform rows.

   for (i = 0; i < 8; ++i) {

-    highbd_idct8(input, outptr, bd);

+    vp9_highbd_idct8(input, outptr, bd);

     input += 8;

     outptr += 8;

@@ -1671,7 +1641,7 @@

   for (i = 0; i < 8; ++i) {

     for (j = 0; j < 8; ++j)

       temp_in[j] = out[j * 8 + i];

-    highbd_idct8(temp_in, temp_out, bd);

+    vp9_highbd_idct8(temp_in, temp_out, bd);

     for (j = 0; j < 8; ++j) {

       dest[j * stride + i] = highbd_clip_pixel_add(

           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

@@ -1735,9 +1705,9 @@

 void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,

                                 int stride, int tx_type, int bd) {

   const highbd_transform_2d IHT_4[] = {

-    { highbd_idct4, highbd_idct4  },    // DCT_DCT  = 0

-    { highbd_iadst4, highbd_idct4 },    // ADST_DCT = 1

-    { highbd_idct4, highbd_iadst4 },    // DCT_ADST = 2

+    { vp9_highbd_idct4, vp9_highbd_idct4  },    // DCT_DCT  = 0

+    { highbd_iadst4, vp9_highbd_idct4 },    // ADST_DCT = 1

+    { vp9_highbd_idct4, highbd_iadst4 },    // DCT_ADST = 2

     { highbd_iadst4, highbd_iadst4 }    // ADST_ADST = 3

};

   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

@@ -1844,9 +1814,9 @@

 static const highbd_transform_2d HIGH_IHT_8[] = {

-  { highbd_idct8,  highbd_idct8  },  // DCT_DCT  = 0

-  { highbd_iadst8, highbd_idct8  },  // ADST_DCT = 1

-  { highbd_idct8,  highbd_iadst8 },  // DCT_ADST = 2

+  { vp9_highbd_idct8,  vp9_highbd_idct8  },  // DCT_DCT  = 0

+  { highbd_iadst8, vp9_highbd_idct8  },  // ADST_DCT = 1

+  { vp9_highbd_idct8,  highbd_iadst8 },  // DCT_ADST = 2

   { highbd_iadst8, highbd_iadst8 }   // ADST_ADST = 3

};

@@ -1889,7 +1859,7 @@

   // First transform rows.

   // Only first 4 row has non-zero coefs.

   for (i = 0; i < 4; ++i) {

-    highbd_idct8(input, outptr, bd);

+    vp9_highbd_idct8(input, outptr, bd);

     input += 8;

     outptr += 8;

@@ -1897,7 +1867,7 @@

   for (i = 0; i < 8; ++i) {

     for (j = 0; j < 8; ++j)

       temp_in[j] = out[j * 8 + i];

-    highbd_idct8(temp_in, temp_out, bd);

+    vp9_highbd_idct8(temp_in, temp_out, bd);

     for (j = 0; j < 8; ++j) {

       dest[j * stride + i] = highbd_clip_pixel_add(

           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);

@@ -1905,7 +1875,7 @@

-static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {

+void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {

   tran_low_t step1[16], step2[16];

   tran_high_t temp1, temp2;

   (void) bd;

@@ -2081,7 +2051,7 @@

   // First transform rows.

   for (i = 0; i < 16; ++i) {

-    highbd_idct16(input, outptr, bd);

+    vp9_highbd_idct16(input, outptr, bd);

     input += 16;

     outptr += 16;

@@ -2090,7 +2060,7 @@

   for (i = 0; i < 16; ++i) {

     for (j = 0; j < 16; ++j)

       temp_in[j] = out[j * 16 + i];

-    highbd_idct16(temp_in, temp_out, bd);

+    vp9_highbd_idct16(temp_in, temp_out, bd);

     for (j = 0; j < 16; ++j) {

       dest[j * stride + i] = highbd_clip_pixel_add(

           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

@@ -2270,9 +2240,9 @@

 static const highbd_transform_2d HIGH_IHT_16[] = {

-  { highbd_idct16,  highbd_idct16  },  // DCT_DCT  = 0

-  { highbd_iadst16, highbd_idct16  },  // ADST_DCT = 1

-  { highbd_idct16,  highbd_iadst16 },  // DCT_ADST = 2

+  { vp9_highbd_idct16,  vp9_highbd_idct16  },  // DCT_DCT  = 0

+  { highbd_iadst16, vp9_highbd_idct16  },  // ADST_DCT = 1

+  { vp9_highbd_idct16,  highbd_iadst16 },  // DCT_ADST = 2

   { highbd_iadst16, highbd_iadst16 }   // ADST_ADST = 3

};

@@ -2315,7 +2285,7 @@

   // First transform rows. Since all non-zero dct coefficients are in

   // upper-left 4x4 area, we only need to calculate first 4 rows here.

   for (i = 0; i < 4; ++i) {

-    highbd_idct16(input, outptr, bd);

+    vp9_highbd_idct16(input, outptr, bd);

     input += 16;

     outptr += 16;

@@ -2324,7 +2294,7 @@

   for (i = 0; i < 16; ++i) {

     for (j = 0; j < 16; ++j)

       temp_in[j] = out[j*16 + i];

-    highbd_idct16(temp_in, temp_out, bd);

+    vp9_highbd_idct16(temp_in, temp_out, bd);

     for (j = 0; j < 16; ++j) {

       dest[j * stride + i] = highbd_clip_pixel_add(

           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);

--- a/vp9/common/vp9_idct.h

+++ b/vp9/common/vp9_idct.h

@@ -118,6 +118,28 @@

 } highbd_transform_2d;

 #endif  // CONFIG_VP9_HIGHBITDEPTH

+#if CONFIG_EMULATE_HARDWARE

+// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a

+// non-normative method to handle overflows. A stream that causes

+// overflows in the inverse transform is considered invalid in VP9,

+// and a hardware implementer is free to choose any reasonable

+// method to handle overflows. However to aid in hardware

+// verification they can use a specific implementation of the

+// WRAPLOW() macro below that is identical to their intended

+// hardware implementation (and also use configure options to trigger

+// the C-implementation of the transform).

+//

+// The particular WRAPLOW implementation below performs strict

+// overflow wrapping to match common hardware implementations.

+// bd of 8 uses trans_low with 16bits, need to remove 16bits

+// bd of 10 uses trans_low with 18bits, need to remove 14bits

+// bd of 12 uses trans_low with 20bits, need to remove 12bits

+// bd of x uses trans_low with 8+x bits, need to remove 24-x bits

+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))

+#else

+#define WRAPLOW(x, bd) ((int32_t)(x))

+#endif  // CONFIG_EMULATE_HARDWARE

 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

                      int eob);

 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

@@ -137,6 +159,9 @@

                       int stride, int eob);

 #if CONFIG_VP9_HIGHBITDEPTH

+void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);

+void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);

+void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);

 void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

                             int eob, int bd);

 void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,

@@ -153,6 +178,11 @@

                            uint8_t *dest, int stride, int eob, int bd);

 void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,

                              uint8_t *dest, int stride, int eob, int bd);

+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,

+                                             int bd) {

+  trans = WRAPLOW(trans, bd);  // Wrap the residual on its own first.

+  return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);  // Then the sum.

+}

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #ifdef __cplusplus

 }  // extern "C"

--- a/vp9/encoder/vp9_dct.c

+++ b/vp9/encoder/vp9_dct.c

@@ -17,6 +17,7 @@

 #include "vp9/common/vp9_blockd.h"

 #include "vp9/common/vp9_idct.h"

 #include "vp9/common/vp9_systemdependent.h"

+#include "vp9/encoder/vp9_dct.h"

 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {

   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);

@@ -26,7 +27,7 @@

   return rv;

-static void fdct4(const tran_low_t *input, tran_low_t *output) {

+void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {

   tran_high_t step[4];

   tran_high_t temp1, temp2;

@@ -123,7 +124,7 @@

-static void fadst4(const tran_low_t *input, tran_low_t *output) {

+void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {

   tran_high_t x0, x1, x2, x3;

   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

@@ -163,13 +164,6 @@

   output[3] = (tran_low_t)fdct_round_shift(s3);

-static const transform_2d FHT_4[] = {

-  { fdct4,  fdct4  },  // DCT_DCT  = 0

-  { fadst4, fdct4  },  // ADST_DCT = 1

-  { fdct4,  fadst4 },  // DCT_ADST = 2

-  { fadst4, fadst4 }   // ADST_ADST = 3

-};

 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,

                   int stride, int tx_type) {

   if (tx_type == DCT_DCT) {

@@ -203,7 +197,7 @@

-static void fdct8(const tran_low_t *input, tran_low_t *output) {

+void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {

   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16

   tran_high_t t0, t1, t2, t3;                  // needs32

   tran_high_t x0, x1, x2, x3;                  // canbe16

@@ -331,7 +325,7 @@

   // Rows

   for (i = 0; i < 8; ++i) {

-    fdct8(&intermediate[i * 8], &final_output[i * 8]);

+    vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);

     for (j = 0; j < 8; ++j)

       final_output[j + i * 8] /= 2;

@@ -413,7 +407,7 @@

   // Rows

   for (i = 0; i < 8; ++i) {

-    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);

+    vp9_fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);

     for (j = 0; j < 8; ++j)

       coeff_ptr[j + i * 8] /= 2;

@@ -641,7 +635,7 @@

-static void fadst8(const tran_low_t *input, tran_low_t *output) {

+void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {

   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

   tran_high_t x0 = input[7];

@@ -712,13 +706,6 @@

   output[7] = (tran_low_t)-x1;

-static const transform_2d FHT_8[] = {

-  { fdct8,  fdct8  },  // DCT_DCT  = 0

-  { fadst8, fdct8  },  // ADST_DCT = 1

-  { fdct8,  fadst8 },  // DCT_ADST = 2

-  { fadst8, fadst8 }   // ADST_ADST = 3

-};

 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,

                   int stride, int tx_type) {

   if (tx_type == DCT_DCT) {

@@ -807,7 +794,7 @@

 // Rewrote to use same algorithm as others.

-static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {

+void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {

   tran_high_t step1[8];      // canbe16

   tran_high_t step2[8];      // canbe16

   tran_high_t step3[8];      // canbe16

@@ -948,7 +935,7 @@

   out[15] = (tran_low_t)fdct_round_shift(temp2);

-static void fadst16(const tran_low_t *input, tran_low_t *output) {

+void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {

   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;

   tran_high_t s9, s10, s11, s12, s13, s14, s15;

@@ -1111,13 +1098,6 @@

   output[15] = (tran_low_t)-x1;

-static const transform_2d FHT_16[] = {

-  { fdct16,  fdct16  },  // DCT_DCT  = 0

-  { fadst16, fdct16  },  // ADST_DCT = 1

-  { fdct16,  fadst16 },  // DCT_ADST = 2

-  { fadst16, fadst16 }   // ADST_ADST = 3

-};

 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,

                     int stride, int tx_type) {

   if (tx_type == DCT_DCT) {

@@ -1162,7 +1142,7 @@

   return rv;

-static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {

+void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) {

   tran_high_t step[32];

   // Stage 1

   step[0] = input[0] + input[(32 - 1)];

@@ -1505,7 +1485,7 @@

     tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = input[j * stride + i] * 4;

-    fdct32(temp_in, temp_out, 0);

+    vp9_fdct32(temp_in, temp_out, 0);

     for (j = 0; j < 32; ++j)

       output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

@@ -1515,7 +1495,7 @@

     tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = output[j + i * 32];

-    fdct32(temp_in, temp_out, 0);

+    vp9_fdct32(temp_in, temp_out, 0);

     for (j = 0; j < 32; ++j)

       out[j + i * 32] =

           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);

@@ -1534,7 +1514,7 @@

     tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = input[j * stride + i] * 4;

-    fdct32(temp_in, temp_out, 0);

+    vp9_fdct32(temp_in, temp_out, 0);

     for (j = 0; j < 32; ++j)

       // TODO(cd): see quality impact of only doing

       //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;

@@ -1547,7 +1527,7 @@

     tran_high_t temp_in[32], temp_out[32];

     for (j = 0; j < 32; ++j)

       temp_in[j] = output[j + i * 32];

-    fdct32(temp_in, temp_out, 1);

+    vp9_fdct32(temp_in, temp_out, 1);

     for (j = 0; j < 32; ++j)

       out[j + i * 32] = (tran_low_t)temp_out[j];

--- /dev/null

+++ b/vp9/encoder/vp9_dct.h

@@ -1,0 +1,61 @@

+/*

+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_ENCODER_VP9_DCT_H_

+#define VP9_ENCODER_VP9_DCT_H_

+#include "vp9/common/vp9_idct.h"

+#ifdef __cplusplus

+extern "C" {

+#endif

+void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);

+void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);

+void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,

+                            int stride);

+void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);

+void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,

+                               int stride);

+void vp9_fdct4(const tran_low_t *input, tran_low_t *output);

+void vp9_fadst4(const tran_low_t *input, tran_low_t *output);

+void vp9_fdct8(const tran_low_t *input, tran_low_t *output);

+void vp9_fadst8(const tran_low_t *input, tran_low_t *output);

+void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);

+void vp9_fadst16(const tran_low_t *input, tran_low_t *output);

+void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);

+static const transform_2d FHT_4[] = {

+  { vp9_fdct4,  vp9_fdct4  },  // DCT_DCT  = 0

+  { vp9_fadst4, vp9_fdct4  },  // ADST_DCT = 1

+  { vp9_fdct4,  vp9_fadst4 },  // DCT_ADST = 2

+  { vp9_fadst4, vp9_fadst4 }   // ADST_ADST = 3

+};

+static const transform_2d FHT_8[] = {

+  { vp9_fdct8,  vp9_fdct8  },  // DCT_DCT  = 0

+  { vp9_fadst8, vp9_fdct8  },  // ADST_DCT = 1

+  { vp9_fdct8,  vp9_fadst8 },  // DCT_ADST = 2

+  { vp9_fadst8, vp9_fadst8 }   // ADST_ADST = 3

+};

+static const transform_2d FHT_16[] = {

+  { vp9_fdct16,  vp9_fdct16  },  // DCT_DCT  = 0

+  { vp9_fadst16, vp9_fdct16  },  // ADST_DCT = 1

+  { vp9_fdct16,  vp9_fadst16 },  // DCT_ADST = 2

+  { vp9_fadst16, vp9_fadst16 }   // ADST_ADST = 3

+};

+#ifdef __cplusplus

+}  // extern "C"

+#endif

+#endif  // VP9_ENCODER_VP9_DCT_H_

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -24,6 +24,7 @@

 VP9_CX_SRCS-yes += encoder/vp9_cost.h

 VP9_CX_SRCS-yes += encoder/vp9_cost.c

 VP9_CX_SRCS-yes += encoder/vp9_dct.c

+VP9_CX_SRCS-yes += encoder/vp9_dct.h

 VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c

 VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h

 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c

--

⑨