shithub: libvpx

--- a/test/vp9_quantize_test.cc

+++ b/test/vp9_quantize_test.cc

@@ -466,11 +466,11 @@

 #if HAVE_SSE2

 #if CONFIG_VP9_HIGHBITDEPTH

-// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds.

-// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8),

 INSTANTIATE_TEST_CASE_P(

     SSE2, VP9QuantizeTest,

     ::testing::Values(

+        make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,

+                   false),

         make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,

                    VPX_BITS_8, 16, false),

         make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,

@@ -495,7 +495,7 @@

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #endif  // HAVE_SSE2

-#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH

+#if HAVE_SSSE3

 #if ARCH_X86_64

 INSTANTIATE_TEST_CASE_P(

     SSSE3, VP9QuantizeTest,

@@ -512,30 +512,24 @@

                         ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,

                                                      &vpx_quantize_b_c,

                                                      VPX_BITS_8, 16, false)));

-#endif

+#endif  // ARCH_X86_64

-#if ARCH_X86_64

-// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test.

+// TODO(johannkoenig): fix 32x32 SSSE3, which does not yet pass this test.

 INSTANTIATE_TEST_CASE_P(DISABLED_SSSE3, VP9QuantizeTest,

                         ::testing::Values(make_tuple(

                             &vpx_quantize_b_32x32_ssse3,

                             &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, false)));

-#endif  // ARCH_X86_64

-#endif  // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH

+#endif  // HAVE_SSSE3

-// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or

-// highbitdepth configurations.

-#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH

+#if HAVE_AVX

 INSTANTIATE_TEST_CASE_P(

     AVX, VP9QuantizeTest,

     ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c,

                                  VPX_BITS_8, 16, false),

-                      // Even though SSSE3 and AVX do not match the reference

-                      // code, we can keep them in sync with each other.

                       make_tuple(&vpx_quantize_b_32x32_avx,

                                  &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32,

                                  false)));

-#endif  // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH

+#endif  // HAVE_AVX

 #if ARCH_X86_64 && HAVE_AVX2

 INSTANTIATE_TEST_CASE_P(

--- a/vpx_dsp/x86/quantize_avx.c

+++ b/vpx_dsp/x86/quantize_avx.c

@@ -90,14 +90,12 @@

     store_tran_low(qcoeff0, qcoeff_ptr);

     store_tran_low(qcoeff1, qcoeff_ptr + 8);

-    coeff0 = calculate_dqcoeff(qcoeff0, dequant);

+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);

     dequant = _mm_unpackhi_epi64(dequant, dequant);

-    coeff1 = calculate_dqcoeff(qcoeff1, dequant);

+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);

-    store_tran_low(coeff0, dqcoeff_ptr);

-    store_tran_low(coeff1, dqcoeff_ptr + 8);

-    eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

+    eob =

+        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

@@ -134,13 +132,10 @@

     store_tran_low(qcoeff0, qcoeff_ptr + index);

     store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

-    coeff0 = calculate_dqcoeff(qcoeff0, dequant);

-    coeff1 = calculate_dqcoeff(qcoeff1, dequant);

+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);

+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);

-    store_tran_low(coeff0, dqcoeff_ptr + index);

-    store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,

                         zero);

     eob = _mm_max_epi16(eob, eob0);

--- a/vpx_dsp/x86/quantize_sse2.c

+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -74,15 +74,12 @@

   store_tran_low(qcoeff0, qcoeff_ptr);

   store_tran_low(qcoeff1, qcoeff_ptr + 8);

-  coeff0 = calculate_dqcoeff(qcoeff0, dequant);

+  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);

   dequant = _mm_unpackhi_epi64(dequant, dequant);

-  coeff1 = calculate_dqcoeff(qcoeff1, dequant);

+  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);

-  store_tran_low(coeff0, dqcoeff_ptr);

-  store_tran_low(coeff1, dqcoeff_ptr + 8);

+  eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

-  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

   while (index < n_coeffs) {

     coeff0 = load_tran_low(coeff_ptr + index);

@@ -108,13 +105,10 @@

     store_tran_low(qcoeff0, qcoeff_ptr + index);

     store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

-    coeff0 = calculate_dqcoeff(qcoeff0, dequant);

-    coeff1 = calculate_dqcoeff(qcoeff1, dequant);

+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);

+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);

-    store_tran_low(coeff0, dqcoeff_ptr + index);

-    store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,

                         zero);

     eob = _mm_max_epi16(eob, eob0);

--- a/vpx_dsp/x86/quantize_sse2.h

+++ b/vpx_dsp/x86/quantize_sse2.h

@@ -48,6 +48,24 @@

   return _mm_mullo_epi16(qcoeff, dequant);

+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,

+                                               tran_low_t *dqcoeff) {  // Stores the lane-wise products qcoeff * dequant to dqcoeff.

+#if CONFIG_VP9_HIGHBITDEPTH  // Keep the full 32-bit product of each 16-bit lane.

+  const __m128i low = _mm_mullo_epi16(qcoeff, dequant);  // Low 16 bits of each product.

+  const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);  // High 16 bits of each signed product.

+  const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);  // Interleave low/high halves: 32-bit products 0-3.

+  const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);  // 32-bit products 4-7.

+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);  // Aligned store: dqcoeff must be 16-byte aligned.

+  _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);  // NOTE(review): +4 presumes tran_low_t is 32 bits here - confirm.

+#else  // Keep only the low 16 bits of each product.

+  const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant);  // Eight truncated 16-bit products.

+  _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16);  // NOTE(review): presumes tran_low_t is 16 bits here - confirm.

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+}

 // Scan 16 values for eob reference in scan. Use masks (-1) from comparing to

 // zbin to add 1 to the index in 'scan'.

 static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,

--- a/vpx_dsp/x86/quantize_ssse3.c

+++ b/vpx_dsp/x86/quantize_ssse3.c

@@ -67,15 +67,12 @@

   store_tran_low(qcoeff0, qcoeff_ptr);

   store_tran_low(qcoeff1, qcoeff_ptr + 8);

-  coeff0 = calculate_dqcoeff(qcoeff0, dequant);

+  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);

   dequant = _mm_unpackhi_epi64(dequant, dequant);

-  coeff1 = calculate_dqcoeff(qcoeff1, dequant);

+  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);

-  store_tran_low(coeff0, dqcoeff_ptr);

-  store_tran_low(coeff1, dqcoeff_ptr + 8);

+  eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

-  eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);

   // AC only loop.

   while (index < n_coeffs) {

     coeff0 = load_tran_low(coeff_ptr + index);

@@ -99,13 +96,10 @@

     store_tran_low(qcoeff0, qcoeff_ptr + index);

     store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

-    coeff0 = calculate_dqcoeff(qcoeff0, dequant);

-    coeff1 = calculate_dqcoeff(qcoeff1, dequant);

+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);

+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);

-    store_tran_low(coeff0, dqcoeff_ptr + index);

-    store_tran_low(coeff1, dqcoeff_ptr + index + 8);

-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,

+    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,

                         zero);

     eob = _mm_max_epi16(eob, eob0);

--

⑨