shithub: libvpx

Download patch

ref: 5fbc7a286b4d72883392fdbb10ec52bace662f66
parent: 356174583506fb6654a3de7264348fbbfb7ca62c
author: Johann <johann.koenig@duck.com>
date: Fri Nov 30 10:42:57 EST 2018

quantize 32x32: saturate dqcoeff on x86

This slows down low bitdepth builds but is necessary to obtain correct
values.

BUG=webm:1448

Change-Id: I4ca9145f576089bb8496fcfeedeb556dc8fe6574

--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -496,7 +496,6 @@
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-#if CONFIG_VP9_HIGHBITDEPTH
 #if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VP9QuantizeTest,
@@ -521,35 +520,9 @@
                                  false)));
 
 #endif  // ARCH_X86_64
-#else
-#if ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
-                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
-                                 16, true),
-                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
-                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
-                                 VPX_BITS_8, 32, true)));
-
-#else
-INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
-                        ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
-                                                     &vpx_quantize_b_c,
-                                                     VPX_BITS_8, 16, false)));
-#endif  // ARCH_X86_64
-// TODO(webm:1448): lowbd truncates results in C.
-INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,
-                        ::testing::Values(make_tuple(
-                            &vpx_quantize_b_32x32_ssse3,
-                            &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX
-#if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest,
                         ::testing::Values(make_tuple(&vpx_quantize_b_avx,
                                                      &vpx_quantize_b_c,
@@ -557,17 +530,6 @@
                                           make_tuple(&vpx_quantize_b_32x32_avx,
                                                      &vpx_quantize_b_32x32_c,
                                                      VPX_BITS_8, 32, false)));
-#else
-INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest,
-                        ::testing::Values(make_tuple(&vpx_quantize_b_avx,
-                                                     &vpx_quantize_b_c,
-                                                     VPX_BITS_8, 16, false)));
-// TODO(webm:1448): lowbd truncates results in C.
-INSTANTIATE_TEST_CASE_P(DISABLED_AVX, VP9QuantizeTest,
-                        ::testing::Values(make_tuple(&vpx_quantize_b_32x32_avx,
-                                                     &vpx_quantize_b_32x32_c,
-                                                     VPX_BITS_8, 32, false)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_AVX
 
 #if ARCH_X86_64 && HAVE_AVX2
@@ -576,7 +538,7 @@
     ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
                                  &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
                                  16, true)));
-#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
+#endif  // ARCH_X86_64 && HAVE_AVX2
 
 // TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
 #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/quantize.c
+++ b/vpx_dsp/quantize.c
@@ -12,6 +12,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/quantize.h"
+#include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 
 void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
@@ -259,7 +260,15 @@
           15;
 
     qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+#if (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
+    // When tran_low_t is only 16 bits, dqcoeff can exceed its range. Rather
+    // than truncating with a cast, saturate the value. This is easier to
+    // implement on x86 and preserves the sign of the value.
+    dqcoeff_ptr[rc] =
+        clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX);
+#else
     dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+#endif  // (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH
 
     if (tmp) eob = idx_arr[i];
   }
--- a/vpx_dsp/x86/quantize_ssse3.h
+++ b/vpx_dsp/x86/quantize_ssse3.h
@@ -24,7 +24,6 @@
   // Un-sign to bias rounding like C.
   const __m128i coeff = _mm_abs_epi16(qcoeff);
 
-#if CONFIG_VP9_HIGHBITDEPTH
   const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
   const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
 
@@ -40,17 +39,12 @@
   dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
   dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
 
+#if CONFIG_VP9_HIGHBITDEPTH
   _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
   _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
 #else
-  __m128i dqcoeff16 = _mm_mullo_epi16(coeff, dequant);
-  (void)zero;
-
-  dqcoeff16 = _mm_srli_epi16(dqcoeff16, 1);
-
-  dqcoeff16 = _mm_sign_epi16(dqcoeff16, qcoeff);
-
-  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);
+  _mm_store_si128((__m128i *)(dqcoeff),
+                  _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }