shithub: libvpx

--- a/test/avg_test.cc

+++ b/test/avg_test.cc

@@ -368,6 +368,21 @@

   Check(expected);

+TEST_P(SatdTest, DISABLED_Speed) {

+  const int kCountSpeedTestBlock = 20000;

+  vpx_usec_timer timer;

+  DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);

+  const int blocksize = GET_PARAM(0);

+  vpx_usec_timer_start(&timer);

+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {

+    GET_PARAM(1)(coeff, blocksize);

+  }

+  vpx_usec_timer_mark(&timer);

+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));

+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);

+}

 TEST_P(BlockErrorTestFP, MinValue) {

   const int64_t kMin = -32640;

   const int64_t expected = kMin * kMin * txfm_size_;

@@ -472,6 +487,12 @@

 #endif  // HAVE_SSE2

 #if HAVE_AVX2

+INSTANTIATE_TEST_CASE_P(AVX2, SatdTest,

+                        ::testing::Values(make_tuple(16, &vpx_satd_avx2),

+                                          make_tuple(64, &vpx_satd_avx2),

+                                          make_tuple(256, &vpx_satd_avx2),

+                                          make_tuple(1024, &vpx_satd_avx2)));

 INSTANTIATE_TEST_CASE_P(

     AVX2, BlockErrorTestFP,

     ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),

@@ -478,7 +499,7 @@

                       make_tuple(64, &vp9_block_error_fp_avx2),

                       make_tuple(256, &vp9_block_error_fp_avx2),

                       make_tuple(1024, &vp9_block_error_fp_avx2)));

-#endif  // HAVE_AVX2

+#endif

 #if HAVE_NEON

 INSTANTIATE_TEST_CASE_P(

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -773,7 +773,7 @@

     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;

     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";

-    specialize qw/vpx_satd sse2 neon/;

+    specialize qw/vpx_satd avx2 sse2 neon/;

   } else {

     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";

     specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";

@@ -782,7 +782,7 @@

     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;

     add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";

-    specialize qw/vpx_satd sse2 neon msa/;

+    specialize qw/vpx_satd avx2 sse2 neon msa/;

   add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";

--- a/vpx_dsp/x86/avg_intrin_avx2.c

+++ b/vpx_dsp/x86/avg_intrin_avx2.c

@@ -171,3 +171,27 @@

     t_coeff += 16;

+int vpx_satd_avx2(const tran_low_t *coeff, int length) {

+  const __m256i one = _mm256_set1_epi16(1);

+  __m256i accum = _mm256_setzero_si256();

+  int i;

+  for (i = 0; i < length; i += 16) {

+    const __m256i src_line = load_tran_low(coeff);

+    const __m256i abs = _mm256_abs_epi16(src_line);

+    const __m256i sum = _mm256_madd_epi16(abs, one);

+    accum = _mm256_add_epi32(accum, sum);

+    coeff += 16;

+  }

+  {  // 32 bit horizontal add

+    const __m256i a = _mm256_srli_si256(accum, 8);

+    const __m256i b = _mm256_add_epi32(accum, a);

+    const __m256i c = _mm256_srli_epi64(b, 32);

+    const __m256i d = _mm256_add_epi32(b, c);

+    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),

+                                            _mm256_extractf128_si256(d, 1));

+    return _mm_cvtsi128_si32(accum_128);

+  }

+}

--

⑨