shithub: libvpx

Download patch

ref: 88dc0d606255a5cba721c40b078b5802e891df57
parent: 7b278e30723d7c9e769ed8ac54daf45a721172da
author: Linfeng Zhang <linfengz@google.com>
date: Tue Mar 13 12:10:00 EDT 2018

Fix a bug in vp9_highbd_iht4x4_16_add_neon()

This bug was introduced in 36363304.

BUG=webm:1403

Change-Id: I695b409047e41ab7e0460981524310d78753751a

--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -629,11 +629,11 @@
 
 static const FuncInfo ht_neon_func_info[] = {
 #if CONFIG_VP9_HIGHBITDEPTH
+  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
+    2 },
 // TODO(linfengz): reenable these functions once test vector failures are
 // addressed.
 #if 0
-  { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
-    2 },
   { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
     2 },
 #endif
--- a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -23,34 +23,55 @@
 static INLINE void highbd_iadst4(int32x4_t *const io) {
   const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
   const int32x4_t sinpi = vld1q_s32(sinpis);
-  int32x4_t s[8];
+  int64x2x2_t s[7], t[4];
+  int32x4_t s7;
 
-  s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
-  s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
-  s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
-  s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
-  s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
-  s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
-  s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
-  s[7] = vsubq_s32(io[0], io[2]);
-  s[7] = vaddq_s32(s[7], io[3]);
+  s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+  s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+  s7 = vsubq_s32(io[0], io[2]);
+  s7 = vaddq_s32(s7, io[3]);
 
-  s[0] = vaddq_s32(s[0], s[3]);
-  s[0] = vaddq_s32(s[0], s[5]);
-  s[1] = vsubq_s32(s[1], s[4]);
-  s[1] = vsubq_s32(s[1], s[6]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]);
   s[3] = s[2];
-  s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0);
 
-  io[0] = vaddq_s32(s[0], s[3]);
-  io[1] = vaddq_s32(s[1], s[3]);
-  io[2] = s[2];
-  io[3] = vaddq_s32(s[0], s[1]);
-  io[3] = vsubq_s32(io[3], s[3]);
-  io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
-  io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
-  io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
-  io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
+  t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]);
+  t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]);
+  t[2] = s[2];
+  t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]);
+  t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]);
+  t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]);
+  t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]);
+  io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+  io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+  io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+  io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+                       vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
 }
 
 void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -103,7 +103,7 @@
   add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
 
   if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
-    specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
+    specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
     specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
     specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
   }