shithub: libvpx

--- a/test/avg_test.cc

+++ b/test/avg_test.cc

@@ -348,9 +348,6 @@

                       make_tuple(64, &vpx_int_pro_col_sse2,

                                  &vpx_int_pro_col_c)));

-// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are

-// in place.

-#if !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,

                         ::testing::Values(make_tuple(16, &vpx_satd_sse2),

                                           make_tuple(64, &vpx_satd_sse2),

@@ -357,7 +354,6 @@

                                           make_tuple(256, &vpx_satd_sse2),

                                           make_tuple(1024, &vpx_satd_sse2)));

 #endif

-#endif

 #if HAVE_NEON

 INSTANTIATE_TEST_CASE_P(

@@ -383,13 +379,11 @@

                       make_tuple(64, &vpx_int_pro_col_neon,

                                  &vpx_int_pro_col_c)));

-#if !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(NEON, SatdTest,

                         ::testing::Values(make_tuple(16, &vpx_satd_neon),

                                           make_tuple(64, &vpx_satd_neon),

                                           make_tuple(256, &vpx_satd_neon),

                                           make_tuple(1024, &vpx_satd_neon)));

-#endif  // !CONFIG_VP9_HIGHBITDEPTH

 #endif  // HAVE_NEON

 #if HAVE_MSA

@@ -416,6 +410,8 @@

                       make_tuple(64, &vpx_int_pro_col_msa,

                                  &vpx_int_pro_col_c)));

+// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are

+// in place.

 #if !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(MSA, SatdTest,

                         ::testing::Values(make_tuple(16, &vpx_satd_msa),

--- a/vpx_dsp/arm/avg_neon.c

+++ b/vpx_dsp/arm/avg_neon.c

@@ -15,6 +15,7 @@

 #include "./vpx_config.h"

 #include "vpx/vpx_integer.h"

+#include "vpx_dsp/arm/idct_neon.h"

 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {

   const uint32x4_t a = vpaddlq_u16(v_16x8);

@@ -64,13 +65,13 @@

 // coeff: 16 bits, dynamic range [-32640, 32640].

 // length: value range {16, 64, 256, 1024}.

-int vpx_satd_neon(const int16_t *coeff, int length) {

+int vpx_satd_neon(const tran_low_t *coeff, int length) {

   const int16x4_t zero = vdup_n_s16(0);

   int32x4_t accum = vdupq_n_s32(0);

   do {

-    const int16x8_t src0 = vld1q_s16(coeff);

-    const int16x8_t src8 = vld1q_s16(coeff + 8);

+    const int16x8_t src0 = load_tran_low_to_s16q(coeff);

+    const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8);

     accum = vabal_s16(accum, vget_low_s16(src0), zero);

     accum = vabal_s16(accum, vget_high_s16(src0), zero);

     accum = vabal_s16(accum, vget_low_s16(src8), zero);

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -894,7 +894,7 @@

     specialize qw/vpx_hadamard_16x16 sse2 neon/;

     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";

-    specialize qw/vpx_satd/;

+    specialize qw/vpx_satd sse2 neon/;

   } else {

     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";

     specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";

--- a/vpx_dsp/x86/avg_intrin_sse2.c

+++ b/vpx_dsp/x86/avg_intrin_sse2.c

@@ -285,13 +285,13 @@

-int vpx_satd_sse2(const int16_t *coeff, int length) {

+int vpx_satd_sse2(const tran_low_t *coeff, int length) {

   int i;

   const __m128i zero = _mm_setzero_si128();

   __m128i accum = zero;

   for (i = 0; i < length; i += 8) {

-    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);

+    const __m128i src_line = load_tran_low(coeff);

     const __m128i inv = _mm_sub_epi16(zero, src_line);

     const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)

     const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);

--

⑨