shithub: libvpx

Download patch

ref: 969957f9f2a124861145a0d18781b855e98caa54
parent: 9efc42f4f89eeb05aba384e9179281ece3be6429
author: Jingning Han <jingning@google.com>
date: Thu Jan 26 10:00:04 EST 2017

Fix real-time compression regression in hbd mode

This commit resolves the compression performance regression in the
real-time encoding setting when high bit-depth mode is enabled.

The current solution temporarily disables the SIMD implementations of
vpx_satd, vpx_hadamard_8x8, and vpx_hadamard_16x16 in high bit-depth mode.

The commit makes the coding results bit-wise identical between the
regular coding pipeline and the high bit-depth pipeline at profile 0.

BUG=webm:1365

Change-Id: Icfb900821733749685370460a1a5a7e07f76f4bf

--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -315,11 +315,13 @@
     ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
                       make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(C, SatdTest,
                         ::testing::Values(make_tuple(16, &vpx_satd_c),
                                           make_tuple(64, &vpx_satd_c),
                                           make_tuple(256, &vpx_satd_c),
                                           make_tuple(1024, &vpx_satd_c)));
+#endif
 
 #if HAVE_SSE2
 INSTANTIATE_TEST_CASE_P(
@@ -345,6 +347,7 @@
                       make_tuple(64, &vpx_int_pro_col_sse2,
                                  &vpx_int_pro_col_c)));
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
                         ::testing::Values(make_tuple(16, &vpx_satd_sse2),
                                           make_tuple(64, &vpx_satd_sse2),
@@ -351,6 +354,7 @@
                                           make_tuple(256, &vpx_satd_sse2),
                                           make_tuple(1024, &vpx_satd_sse2)));
 #endif
+#endif
 
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
@@ -376,12 +380,14 @@
                       make_tuple(64, &vpx_int_pro_col_neon,
                                  &vpx_int_pro_col_c)));
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
                         ::testing::Values(make_tuple(16, &vpx_satd_neon),
                                           make_tuple(64, &vpx_satd_neon),
                                           make_tuple(256, &vpx_satd_neon),
                                           make_tuple(1024, &vpx_satd_neon)));
-#endif
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
 
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
@@ -407,11 +413,13 @@
                       make_tuple(64, &vpx_int_pro_col_msa,
                                  &vpx_int_pro_col_c)));
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
                         ::testing::Values(make_tuple(16, &vpx_satd_msa),
                                           make_tuple(64, &vpx_satd_msa),
                                           make_tuple(256, &vpx_satd_msa),
                                           make_tuple(1024, &vpx_satd_msa)));
-#endif
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_MSA
 
 }  // namespace
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -144,6 +144,7 @@
   }
 }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_c));
 
@@ -166,6 +167,7 @@
 INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
                         ::testing::Values(&vpx_hadamard_8x8_msa));
 #endif  // HAVE_MSA
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
 
 class Hadamard16x16Test : public HadamardTestBase {};
 
@@ -210,6 +212,7 @@
   }
 }
 
+#if !CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_c));
 
@@ -227,4 +230,5 @@
 INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test,
                         ::testing::Values(&vpx_hadamard_16x16_msa));
 #endif  // HAVE_MSA
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -132,6 +132,9 @@
   add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
   specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
 
+  add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
+  specialize qw/vp9_block_error_fp/;
+
   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 
   add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1815,7 +1815,9 @@
   }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  {
+  // TODO(jingning): Implement integral projection functions for high bit-depth
+  // setting and remove this part of code.
+  if (xd->bd != 8) {
     unsigned int this_sad;
     tmp_mv->row = 0;
     tmp_mv->col = 0;
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -590,25 +590,10 @@
   *out_dist_sum += dist << 4;
 }
 
-#if CONFIG_VP9_HIGHBITDEPTH
 static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
                       int *skippable, int64_t *sse, BLOCK_SIZE bsize,
                       TX_SIZE tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
-  unsigned int var_y, sse_y;
-
-  (void)tx_size;
-  model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, &var_y,
-                    &sse_y);
-  *sse = INT_MAX;
-  *skippable = 0;
-  return;
-}
-#else
-static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
-                      int *skippable, int64_t *sse, BLOCK_SIZE bsize,
-                      TX_SIZE tx_size) {
-  MACROBLOCKD *xd = &x->e_mbd;
   const struct macroblockd_plane *pd = &xd->plane[0];
   struct macroblock_plane *const p = &x->plane[0];
   const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -624,6 +609,20 @@
   const int bw = 4 * num_4x4_w;
   const int bh = 4 * num_4x4_h;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  // TODO(jingning): Implement the high bit-depth Hadamard transforms and
+  // remove this check condition.
+  if (xd->bd != 8) {
+    unsigned int var_y, sse_y;
+    (void)tx_size;
+    model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist,
+                      &var_y, &sse_y);
+    *sse = INT_MAX;
+    *skippable = 0;
+    return;
+  }
+#endif
+
   (void)cpi;
 
   // The max tx_size passed in is TX_16X16.
@@ -648,7 +647,7 @@
 
         switch (tx_size) {
           case TX_16X16:
-            vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_16x16(src_diff, diff_stride, coeff);
             vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob, scan_order->scan,
@@ -655,7 +654,7 @@
                             scan_order->iscan);
             break;
           case TX_8X8:
-            vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_8x8(src_diff, diff_stride, coeff);
             vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob, scan_order->scan,
@@ -699,7 +698,7 @@
         if (*eob == 1)
           this_rdc->rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          this_rdc->rate += vpx_satd((const int16_t *)qcoeff, step << 4);
+          this_rdc->rate += vpx_satd(qcoeff, step << 4);
 
         this_rdc->dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> 2;
       }
@@ -711,7 +710,6 @@
   this_rdc->rate <<= (2 + VP9_PROB_COST_SHIFT);
   this_rdc->rate += (eob_cost << VP9_PROB_COST_SHIFT);
 }
-#endif
 
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE plane_bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -321,7 +321,7 @@
   return error;
 }
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              int block_size) {
   int i;
   int64_t error = 0;
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -67,9 +67,10 @@
 // The order of the output coeff of the hadamard is not important. For
 // optimization purposes the final transpose may be skipped.
 void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
-                        int16_t *coeff) {
+                        tran_low_t *coeff) {
   int idx;
   int16_t buffer[64];
+  int16_t buffer2[64];
   int16_t *tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
     hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
@@ -80,17 +81,19 @@
 
   tmp_buf = &buffer[0];
   for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;                        // coeff: 15 bit
-                                       // dynamic range [-16320, 16320]
+    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
+    // dynamic range [-2040, 2040]
+    // buffer2: 15 bit
+    // dynamic range [-16320, 16320]
     ++tmp_buf;
   }
+
+  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
 }
 
 // In place 16x16 2D Hadamard transform
 void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
+                          tran_low_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     // src_diff: 9 bit, dynamic range [-255, 255]
@@ -101,15 +104,15 @@
 
   // coeff: 15 bit, dynamic range [-16320, 16320]
   for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
+    tran_low_t a0 = coeff[0];
+    tran_low_t a1 = coeff[64];
+    tran_low_t a2 = coeff[128];
+    tran_low_t a3 = coeff[192];
 
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
+    tran_low_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    tran_low_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    tran_low_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    tran_low_t b3 = (a2 - a3) >> 1;
 
     coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
     coeff[64] = b1 + b3;
@@ -122,7 +125,7 @@
 
 // coeff: 16 bits, dynamic range [-32640, 32640].
 // length: value range {16, 64, 256, 1024}.
-int vpx_satd_c(const int16_t *coeff, int length) {
+int vpx_satd_c(const tran_low_t *coeff, int length) {
   int i;
   int satd = 0;
   for (i = 0; i < length; ++i) satd += abs(coeff[i]);
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -885,14 +885,26 @@
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/vpx_minmax_8x8 sse2 neon msa/;
 
-  add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
 
-  add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
-  specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
+    specialize qw/vpx_hadamard_8x8/;
 
-  add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
-  specialize qw/vpx_satd sse2 neon msa/;
+    add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
+    specialize qw/vpx_hadamard_16x16/;
+
+    add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
+    specialize qw/vpx_satd/;
+  } else {
+    add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+    specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
+
+    add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+    specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
+
+    add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+    specialize qw/vpx_satd sse2 neon msa/;
+  }
 
   add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
   specialize qw/vpx_int_pro_row sse2 neon msa/;