shithub: libvpx

--- a/test/hadamard_test.cc

+++ b/test/hadamard_test.cc

@@ -311,8 +311,9 @@

 INSTANTIATE_TEST_CASE_P(

     AVX2, HadamardHighbdTest,

     ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8),

-                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2,

-                                           16)));

+                      HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16),

+                      HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2,

+                                           32)));

 #endif  // HAVE_AVX2

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -792,7 +792,7 @@

     specialize qw/vpx_highbd_hadamard_16x16 avx2/;

     add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";

-    specialize qw/vpx_highbd_hadamard_32x32/;

+    specialize qw/vpx_highbd_hadamard_32x32 avx2/;

     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";

     specialize qw/vpx_satd avx2 sse2 neon/;

--- a/vpx_dsp/x86/avg_intrin_avx2.c

+++ b/vpx_dsp/x86/avg_intrin_avx2.c

@@ -175,6 +175,47 @@

     t_coeff += 8;

+void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,

+                                    ptrdiff_t src_stride, tran_low_t *coeff) {

+  int idx;

+  tran_low_t *t_coeff = coeff;

+  for (idx = 0; idx < 4; ++idx) {

+    const int16_t *src_ptr =

+        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;

+    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);

+  }

+  for (idx = 0; idx < 256; idx += 8) {

+    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);

+    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));

+    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));

+    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

+    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);

+    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);

+    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);

+    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

+    b0 = _mm256_srai_epi32(b0, 2);

+    b1 = _mm256_srai_epi32(b1, 2);

+    b2 = _mm256_srai_epi32(b2, 2);

+    b3 = _mm256_srai_epi32(b3, 2);

+    coeff0 = _mm256_add_epi32(b0, b2);

+    coeff1 = _mm256_add_epi32(b1, b3);

+    coeff2 = _mm256_sub_epi32(b0, b2);

+    coeff3 = _mm256_sub_epi32(b1, b3);

+    _mm256_storeu_si256((__m256i *)coeff, coeff0);

+    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);

+    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);

+    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

+    coeff += 8;

+    t_coeff += 8;

+  }

+}

 #endif  // CONFIG_VP9_HIGHBITDEPTH

 static void hadamard_col8x2_avx2(__m256i *in, int iter) {

--

⑨