ref: 5ab46e0ecdccd4ff9cc876a7f9a5887eceddd054
parent: 16d6aaceb8525b6d5cb4e7452473381f84c624e9
parent: 38bc1d0f4b25856c6c62286f14863ca5e0365827
author: James Zern <jzern@google.com>
date: Tue Apr 5 22:51:53 EDT 2016
Merge changes I7a1c0cba,Ie02b5caf,I2cbd85d7,I644f35b0

* changes:
  vpx_fdct16x16_1_sse2: improve load pattern
  vpx_fdct16x16_1_c/msa: fix accumulator overflow
  vpx_fdctNxN_1_sse2: reduce store size
  dct32x32_test: add PartialTrans32x32Test, Random
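The accumulator change is the subtle one of the four: in non-high-bitdepth
builds tran_low_t is int16_t, so summing a 16x16 block of 8-bit-range
residuals can wrap (255 * 256 = 65280 > INT16_MAX) even though the final
value after the >> 1 scaling (32640) still fits. A minimal standalone
sketch of that failure mode, independent of the library (two's-complement
wrap assumed):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int16_t narrow = 0; /* old accumulator type: tran_low_t in non-HBD builds */
      int32_t wide = 0;   /* new accumulator type */
      int i;
      for (i = 0; i < 16 * 16; ++i) {
        narrow += 255; /* true sum 65280 wraps to -256 in int16_t */
        wide += 255;
      }
      /* prints "-128 vs 32640"; only the widened sum survives the shift */
      printf("%d vs %d\n", narrow >> 1, (int)(wide >> 1));
      return 0;
    }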
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -792,6 +792,67 @@
CompareInvReference(ref_txfm_, thresh_);
}
+class PartialTrans16x16Test
+ : public ::testing::TestWithParam<
+ std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > {
+ public:
+ virtual ~PartialTrans16x16Test() {}
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ bit_depth_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ vpx_bit_depth_t bit_depth_;
+ FdctFunc fwd_txfm_;
+};
+
+TEST_P(PartialTrans16x16Test, Extremes) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t maxval =
+ static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+ const int16_t maxval = 255;
+#endif
+ const int minval = -maxval;
+ DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+
+ for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+ EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
+
+ for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+ EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
+}
+
+TEST_P(PartialTrans16x16Test, Random) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t maxval =
+ static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+ const int16_t maxval = 255;
+#endif
+ DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ int sum = 0;
+ for (int i = 0; i < kNumCoeffs; ++i) {
+ const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+ input[i] = val;
+ sum += val;
+ }
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+ EXPECT_EQ(sum >> 1, output[0]);
+}
+
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -824,6 +885,11 @@
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+ C, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
@@ -832,6 +898,9 @@
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&vpx_fdct16x16_1_c,
+ VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -859,6 +928,9 @@
VPX_BITS_8),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
+ VPX_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -896,6 +968,9 @@
&idct16x16_10_add_12_sse2, 3167, VPX_BITS_12),
make_tuple(&idct16x16_12,
&idct16x16_256_add_12_sse2, 3167, VPX_BITS_12)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2,
+ VPX_BITS_8)));
#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -912,5 +987,8 @@
make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8),
make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3,
VPX_BITS_8)));
+INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa,
+ VPX_BITS_8)));
#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
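For reference, PartialTrans16x16Test pins the DC-only forward transform to
this scalar model (the function name below is illustrative, not a library
symbol; the 32x32 test extended in the next file is identical except the
final shift is 3):

    /* Only output[0] is defined for the _1 transforms: the sum of all
     * 256 inputs, scaled down by one bit. */
    static int32_t fdct16x16_dc_ref(const int16_t *input, int stride) {
      int32_t sum = 0;
      int r, c;
      for (r = 0; r < 16; ++r)
        for (c = 0; c < 16; ++c) sum += input[r * stride + c];
      return sum >> 1;
    }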
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -344,6 +344,28 @@
EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
}
+TEST_P(PartialTrans32x32Test, Random) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t maxval =
+ static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+ const int16_t maxval = 255;
+#endif
+ DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ int sum = 0;
+ for (int i = 0; i < kNumCoeffs; ++i) {
+ const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+ input[i] = val;
+ sum += val;
+ }
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+ EXPECT_EQ(sum >> 3, output[0]);
+}
+
using std::tr1::make_tuple;
#if CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/fwd_txfm.c
+++ b/vpx_dsp/fwd_txfm.c
@@ -365,12 +365,12 @@
void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- tran_low_t sum = 0;
+ int sum = 0;
for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c)
sum += input[r * stride + c];
- output[0] = sum >> 1;
+ output[0] = (tran_low_t)(sum >> 1);
}
static INLINE tran_high_t dct_32_round(tran_high_t input) {
--- a/vpx_dsp/mips/fwd_txfm_msa.c
+++ b/vpx_dsp/mips/fwd_txfm_msa.c
@@ -237,11 +237,9 @@
}
void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- out[1] = 0;
-
- out[0] = LD_HADD(input, stride);
- out[0] += LD_HADD(input + 8, stride);
- out[0] += LD_HADD(input + 16 * 8, stride);
- out[0] += LD_HADD(input + 16 * 8 + 8, stride);
- out[0] >>= 1;
+ int sum = LD_HADD(input, stride);
+ sum += LD_HADD(input + 8, stride);
+ sum += LD_HADD(input + 16 * 8, stride);
+ sum += LD_HADD(input + 16 * 8 + 8, stride);
+ out[0] = (int16_t)(sum >> 1);
}
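The MSA version leans on LD_HADD, a libvpx helper assumed here to
horizontally sum one 8x8 tile of int16 values; under that assumption (and
stride == 16, which the hard-coded 16 * 8 offsets imply), the four calls
cover the block's quadrants. A scalar stand-in (hadd_8x8 is hypothetical,
not the real macro):

    static int hadd_8x8(const int16_t *p, int stride) {
      int sum = 0;
      int r, c;
      for (r = 0; r < 8; ++r)
        for (c = 0; c < 8; ++c) sum += p[r * stride + c];
      return sum;
    }
    /* sum = hadd_8x8(input, 16)        top-left
     *     + hadd_8x8(input + 8, 16)    top-right
     *     + hadd_8x8(input + 128, 16)  bottom-left: 16 * 8
     *     + hadd_8x8(input + 136, 16)  bottom-right: 16 * 8 + 8 */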
--- a/vpx_dsp/x86/fwd_txfm_sse2.c
+++ b/vpx_dsp/x86/fwd_txfm_sse2.c
@@ -40,7 +40,7 @@
in1 = _mm_add_epi32(tmp, in0);
in0 = _mm_slli_epi32(in1, 1);
- store_output(&in0, output);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
}
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
@@ -80,7 +80,7 @@
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
- store_output(&in1, output);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
@@ -91,20 +91,19 @@
int i;
for (i = 0; i < 2; ++i) {
- input += 8 * i;
- in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
@@ -111,10 +110,10 @@
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
@@ -121,10 +120,10 @@
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
@@ -132,6 +131,7 @@
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
+ input += 8 * stride;
}
u0 = _mm_setzero_si128();
@@ -149,7 +149,7 @@
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 1);
- store_output(&in1, output);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
@@ -221,7 +221,7 @@
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 3);
- store_output(&in1, output);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
#define DCT_HIGH_BIT_DEPTH 0
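The store-size change is mechanical: the _1 kernels produce a single
meaningful coefficient, so the old store_output() of a full vector is
replaced by moving the low 32-bit lane to a scalar. What
_mm_cvtsi128_si32 does, in isolation:

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void) {
      const __m128i v = _mm_set_epi32(4, 3, 2, 1); /* lanes 3..0 */
      /* extracts lane 0 (prints 1); the kernels above use this to write
       * only output[0] instead of a 128-bit store */
      printf("%d\n", _mm_cvtsi128_si32(v));
      return 0;
    }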