shithub: libvpx

--- a/test/convolve_test.cc

+++ b/test/convolve_test.cc

@@ -603,6 +603,75 @@

          UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);

+// Benchmark (disabled by default through the DISABLED_ prefix): calls the
+// convolve function stored in UUT_->hv8_[0] a fixed number of times on the
+// fixture's buffers and prints the total elapsed time in microseconds.
+TEST_P(ConvolveTest, DISABLED_8Tap_Speed) {
+  const int kIterations = 5000000;
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const uint8_t *const src = input();
+  uint8_t *const dst = output();
+  const int block_w = Width();
+  const int block_h = Height();
+
+  // Input setup happens before the timer starts, so it is not measured.
+  SetConstantInput(127);
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  // NOTE(review): 8, 16, 8, 16 are presumably the x/y sub-pixel offsets and
+  // steps in 1/16-pel units; confirm against the convolve function signature.
+  for (int iter = kIterations; iter > 0; --iter) {
+    UUT_->hv8_[0](src, kInputStride, dst, kOutputStride, kernel, 8, 16, 8, 16,
+                  block_w, block_h);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  // The timer result is narrowed to int to match the %d conversion below.
+  const int elapsed_us = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  // Reports 8 when use_highbd_ is zero, otherwise the use_highbd_ value.
+  printf("convolve8_%dx%d_%d: %d us\n", block_w, block_h,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_us);
+}

+// Benchmark (disabled by default through the DISABLED_ prefix): calls the
+// convolve function stored in UUT_->h8_[0] a fixed number of times on the
+// fixture's buffers and prints the total elapsed time in microseconds.
+TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) {
+  const int kIterations = 5000000;
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const uint8_t *const src = input();
+  uint8_t *const dst = output();
+  const int block_w = Width();
+  const int block_h = Height();
+
+  // Input setup happens before the timer starts, so it is not measured.
+  SetConstantInput(127);
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  // NOTE(review): 8, 16, 8, 16 are presumably the x/y sub-pixel offsets and
+  // steps in 1/16-pel units; confirm against the convolve function signature.
+  for (int iter = kIterations; iter > 0; --iter) {
+    UUT_->h8_[0](src, kInputStride, dst, kOutputStride, kernel, 8, 16, 8, 16,
+                 block_w, block_h);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  // The timer result is narrowed to int to match the %d conversion below.
+  const int elapsed_us = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  // Reports 8 when use_highbd_ is zero, otherwise the use_highbd_ value.
+  printf("convolve8_horiz_%dx%d_%d: %d us\n", block_w, block_h,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_us);
+}

+// Benchmark (disabled by default through the DISABLED_ prefix): calls the
+// convolve function stored in UUT_->v8_[0] a fixed number of times on the
+// fixture's buffers and prints the total elapsed time in microseconds.
+TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) {
+  const int kIterations = 5000000;
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP];
+  const uint8_t *const src = input();
+  uint8_t *const dst = output();
+  const int block_w = Width();
+  const int block_h = Height();
+
+  // Input setup happens before the timer starts, so it is not measured.
+  SetConstantInput(127);
+
+  vpx_usec_timer timer;
+  vpx_usec_timer_start(&timer);
+  // NOTE(review): 8, 16, 8, 16 are presumably the x/y sub-pixel offsets and
+  // steps in 1/16-pel units; confirm against the convolve function signature.
+  for (int iter = kIterations; iter > 0; --iter) {
+    UUT_->v8_[0](src, kInputStride, dst, kOutputStride, kernel, 8, 16, 8, 16,
+                 block_w, block_h);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  // The timer result is narrowed to int to match the %d conversion below.
+  const int elapsed_us = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  // Reports 8 when use_highbd_ is zero, otherwise the use_highbd_ value.
+  printf("convolve8_vert_%dx%d_%d: %d us\n", block_w, block_h,
+         UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_us);
+}

 TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {

   const uint8_t *const in = input();

   uint8_t *const out = output();

--- a/vpx_dsp/x86/convolve_avx2.h

+++ b/vpx_dsp/x86/convolve_avx2.h

@@ -58,16 +58,19 @@

   const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]);

   const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]);

   const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]);

-  // add and saturate the results together

-  const __m256i min_x2x1 = _mm256_min_epi16(x2, x1);

-  const __m256i max_x2x1 = _mm256_max_epi16(x2, x1);

-  __m256i temp = _mm256_adds_epi16(x0, x3);

-  temp = _mm256_adds_epi16(temp, min_x2x1);

-  temp = _mm256_adds_epi16(temp, max_x2x1);

+  __m256i sum1, sum2;

+  // sum the results together, saturating only on the final step

+  // adding x0 with x2 and x1 with x3 is the only order that prevents

+  // outranges for all filters

+  sum1 = _mm256_add_epi16(x0, x2);

+  sum2 = _mm256_add_epi16(x1, x3);

+  // add the rounding offset early to avoid another saturated add

+  sum1 = _mm256_add_epi16(sum1, k_64);

+  sum1 = _mm256_adds_epi16(sum1, sum2);

   // round and shift by 7 bit each 16 bit

-  temp = _mm256_adds_epi16(temp, k_64);

-  temp = _mm256_srai_epi16(temp, 7);

-  return temp;

+  sum1 = _mm256_srai_epi16(sum1, 7);

+  return sum1;

 static INLINE __m128i convolve8_8_avx2(const __m256i *const s,

@@ -82,16 +85,19 @@

                                        _mm256_castsi256_si128(f[2]));

   const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]),

                                        _mm256_castsi256_si128(f[3]));

-  // add and saturate the results together

-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);

-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);

-  __m128i temp = _mm_adds_epi16(x0, x3);

-  temp = _mm_adds_epi16(temp, min_x2x1);

-  temp = _mm_adds_epi16(temp, max_x2x1);

-  // round and shift by 7 bit each 16 bit

-  temp = _mm_adds_epi16(temp, k_64);

-  temp = _mm_srai_epi16(temp, 7);

-  return temp;

+  __m128i sum1, sum2;

+  // sum the results together, saturating only on the final step

+  // adding x0 with x2 and x1 with x3 is the only order that prevents

+  // outranges for all filters

+  sum1 = _mm_add_epi16(x0, x2);

+  sum2 = _mm_add_epi16(x1, x3);

+  // add the rounding offset early to avoid another saturated add

+  sum1 = _mm_add_epi16(sum1, k_64);

+  sum1 = _mm_adds_epi16(sum1, sum2);

+  // shift by 7 bit each 16 bit

+  sum1 = _mm_srai_epi16(sum1, 7);

+  return sum1;

 #undef MM256_BROADCASTSI128_SI256

--- a/vpx_dsp/x86/convolve_ssse3.h

+++ b/vpx_dsp/x86/convolve_ssse3.h

@@ -48,16 +48,19 @@

   const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);

   const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);

   const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);

-  // add and saturate the results together

-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);

-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);

-  __m128i temp = _mm_adds_epi16(x0, x3);

-  temp = _mm_adds_epi16(temp, min_x2x1);

-  temp = _mm_adds_epi16(temp, max_x2x1);

-  // round and shift by 7 bit each 16 bit

-  temp = _mm_adds_epi16(temp, k_64);

-  temp = _mm_srai_epi16(temp, 7);

-  return temp;

+  __m128i sum1, sum2;

+  // sum the results together, saturating only on the final step

+  // adding x0 with x2 and x1 with x3 is the only order that prevents

+  // outranges for all filters

+  sum1 = _mm_add_epi16(x0, x2);

+  sum2 = _mm_add_epi16(x1, x3);

+  // add the rounding offset early to avoid another saturated add

+  sum1 = _mm_add_epi16(sum1, k_64);

+  sum1 = _mm_adds_epi16(sum1, sum2);

+  // shift by 7 bit each 16 bit

+  sum1 = _mm_srai_epi16(sum1, 7);

+  return sum1;

 static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,

--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c

+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c

@@ -38,8 +38,8 @@

     const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,

     ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {

   __m128i firstFilters, secondFilters, shuffle1, shuffle2;

-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;

-  __m128i addFilterReg64, filtersReg, srcReg, minReg;

+  __m128i srcRegFilt1, srcRegFilt2;

+  __m128i addFilterReg64, filtersReg, srcReg;

   unsigned int i;

   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64

@@ -75,18 +75,16 @@

     srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);

     srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

-    // extract the higher half of the lane

-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);

-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

+    // sum the results together, saturating only on the final step

+    // the specific order of the additions prevents outranges

+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2);

-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

+    // extract the higher half of the register

+    srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8);

-    // add and saturate all the results together

-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);

-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);

-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);

-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

+    // add the rounding offset early to avoid another saturated add

+    srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64);

+    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);

     // shift by 7 bit each 16 bits

     srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

--

⑨