shithub: libvpx

--- a/test/idct_test.cc

+++ b/test/idct_test.cc

@@ -115,6 +115,10 @@

 INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));

+#if HAVE_NEON

+INSTANTIATE_TEST_CASE_P(NEON, IDCTTest,

+                        ::testing::Values(vp8_short_idct4x4llm_neon));

+#endif

 #if HAVE_MMX

 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,

                         ::testing::Values(vp8_short_idct4x4llm_mmx));

--- a/vp8/common/arm/neon/dequant_idct_neon.c

+++ b/vp8/common/arm/neon/dequant_idct_neon.c

@@ -11,7 +11,11 @@

 #include <arm_neon.h>

 static const int16_t cospi8sqrt2minus1 = 20091;

-static const int16_t sinpi8sqrt2 = 35468;

+// 35468 exceeds INT16_MAX and would wrap to a negative value as an int16_t.

+// Because vqdmulh doubles its result, the constant can be halved beforehand

+// instead. This avoids both compensating for the negative value and shifting

+// the result afterwards.

+static const int16_t sinpi8sqrt2 = 35468 >> 1;

 void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,

                                int stride) {

@@ -60,10 +64,8 @@

   q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);

   q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);

-  q3 = vshrq_n_s16(q3, 1);

   q4 = vshrq_n_s16(q4, 1);

-  q3 = vqaddq_s16(q3, q2);

   q4 = vqaddq_s16(q4, q2);

   d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));

@@ -90,10 +92,8 @@

   d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);

   d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);

-  q3 = vshrq_n_s16(q3, 1);

   q4 = vshrq_n_s16(q4, 1);

-  q3 = vqaddq_s16(q3, q2);

   q4 = vqaddq_s16(q4, q2);

   d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));

--- a/vp8/common/arm/neon/shortidct4x4llm_neon.c

+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c

@@ -11,7 +11,11 @@

 #include <arm_neon.h>

 static const int16_t cospi8sqrt2minus1 = 20091;

-static const int16_t sinpi8sqrt2 = 35468;

+// 35468 exceeds INT16_MAX and would wrap to a negative value as an int16_t.

+// Because vqdmulh doubles its result, the constant can be halved beforehand

+// instead. This avoids both compensating for the negative value and shifting

+// the result afterwards.

+static const int16_t sinpi8sqrt2 = 35468 >> 1;

 void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,

                                int pred_stride, unsigned char *dst_ptr,

@@ -40,10 +44,8 @@

   d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1

   d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

-  q3s16 = vshrq_n_s16(q3s16, 1);

   q4s16 = vshrq_n_s16(q4s16, 1);

-  q3s16 = vqaddq_s16(q3s16, q2s16);

   q4s16 = vqaddq_s16(q4s16, q2s16);

   d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1

@@ -71,10 +73,8 @@

   d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1

   d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1

-  q3s16 = vshrq_n_s16(q3s16, 1);

   q4s16 = vshrq_n_s16(q4s16, 1);

-  q3s16 = vqaddq_s16(q3s16, q2s16);

   q4s16 = vqaddq_s16(q4s16, q2s16);

   d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1

--

⑨