shithub: libvpx

--- a/test/avg_test.cc

+++ b/test/avg_test.cc

@@ -379,15 +379,11 @@

                       make_tuple(64, &vpx_int_pro_col_neon,

                                  &vpx_int_pro_col_c)));

-// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are

-// in place.

-#if !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(NEON, SatdTest,

                         ::testing::Values(make_tuple(16, &vpx_satd_neon),

                                           make_tuple(64, &vpx_satd_neon),

                                           make_tuple(256, &vpx_satd_neon),

                                           make_tuple(1024, &vpx_satd_neon)));

-#endif  // !CONFIG_VP9_HIGHBITDEPTH

 #endif  // HAVE_NEON

 #if HAVE_MSA

@@ -414,6 +410,8 @@

                       make_tuple(64, &vpx_int_pro_col_msa,

                                  &vpx_int_pro_col_c)));

+// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are

+// in place.

 #if !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(MSA, SatdTest,

                         ::testing::Values(make_tuple(16, &vpx_satd_msa),

--- a/vpx_dsp/arm/avg_neon.c

+++ b/vpx_dsp/arm/avg_neon.c

@@ -15,6 +15,7 @@

 #include "./vpx_config.h"

 #include "vpx/vpx_integer.h"

+#include "vpx_dsp/arm/idct_neon.h"

 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {

   const uint32x4_t a = vpaddlq_u16(v_16x8);

@@ -64,13 +65,13 @@

 // coeff: 16 bits, dynamic range [-32640, 32640].

 // length: value range {16, 64, 256, 1024}.

-int vpx_satd_neon(const int16_t *coeff, int length) {

+int vpx_satd_neon(const tran_low_t *coeff, int length) {

   const int16x4_t zero = vdup_n_s16(0);

   int32x4_t accum = vdupq_n_s32(0);

   do {

-    const int16x8_t src0 = vld1q_s16(coeff);

-    const int16x8_t src8 = vld1q_s16(coeff + 8);

+    const int16x8_t src0 = load_tran_low_to_s16q(coeff);

+    const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8);

     accum = vabal_s16(accum, vget_low_s16(src0), zero);

     accum = vabal_s16(accum, vget_high_s16(src0), zero);

     accum = vabal_s16(accum, vget_low_s16(src8), zero);

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -894,7 +894,7 @@

     specialize qw/vpx_hadamard_16x16/;

     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";

-    specialize qw/vpx_satd sse2/;

+    specialize qw/vpx_satd sse2 neon/;

   } else {

     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";

     specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";

--

⑨