ref: 88dc0d606255a5cba721c40b078b5802e891df57
parent: 7b278e30723d7c9e769ed8ac54daf45a721172da
author: Linfeng Zhang <linfengz@google.com>
date: Tue Mar 13 12:10:00 EDT 2018
Fix a bug in vp9_highbd_iht4x4_16_add_neon(). This bug was introduced in 36363304. BUG=webm:1403 Change-Id: I695b409047e41ab7e0460981524310d78753751a
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -629,11 +629,11 @@
static const FuncInfo ht_neon_func_info[] = {
#if CONFIG_VP9_HIGHBITDEPTH
+ { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
+ 2 },
// TODO(linfengz): reenable these functions once test vector failures are
// addressed.
#if 0
- { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
- 2 },
{ &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
2 },
#endif
--- a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
+++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -23,34 +23,55 @@
static INLINE void highbd_iadst4(int32x4_t *const io) {
const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
const int32x4_t sinpi = vld1q_s32(sinpis);
- int32x4_t s[8];
+ int64x2x2_t s[7], t[4];
+ int32x4_t s7;
- s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
- s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
- s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
- s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
- s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
- s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
- s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
- s[7] = vsubq_s32(io[0], io[2]);
- s[7] = vaddq_s32(s[7], io[3]);
+ s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+ s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+ s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+ s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+ s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+ s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+ s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+ s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+ s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+ s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+ s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+ s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+ s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+ s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+ s7 = vsubq_s32(io[0], io[2]);
+ s7 = vaddq_s32(s7, io[3]);
- s[0] = vaddq_s32(s[0], s[3]);
- s[0] = vaddq_s32(s[0], s[5]);
- s[1] = vsubq_s32(s[1], s[4]);
- s[1] = vsubq_s32(s[1], s[6]);
+ s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+ s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+ s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+ s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+ s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+ s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+ s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]);
+ s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]);
s[3] = s[2];
- s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
+ s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0);
+ s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0);
- io[0] = vaddq_s32(s[0], s[3]);
- io[1] = vaddq_s32(s[1], s[3]);
- io[2] = s[2];
- io[3] = vaddq_s32(s[0], s[1]);
- io[3] = vsubq_s32(io[3], s[3]);
- io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
- io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
- io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
- io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
+ t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+ t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+ t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]);
+ t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]);
+ t[2] = s[2];
+ t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]);
+ t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]);
+ t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]);
+ t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]);
+ io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS));
+ io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS));
+ io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS));
+ io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS),
+ vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS));
}
void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -103,7 +103,7 @@
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
+ specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
}