ref: 0b7e4af7446bd2ed962776010812393d2a3dcf09
parent: 673ebe8d2b1417eadaa769436d14f51de6258e3f
parent: f6a002f2a67579cce9d53a4314f0676228517c12
author: Sai Deng <sdeng@google.com>
date: Sat Dec 8 13:43:55 EST 2018
Merge "Add satd avx2 implementation"
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -34,6 +34,18 @@
return (value >> 15) & 0xffff;
}
+ int32_t Rand20Signed(void) {
+   // Draw 20 bits and recentre them: values between 524287 and -524288.
+   const int32_t kHalfRange = 524288;
+   const uint32_t raw = random_.Generate(2 * kHalfRange);
+   const int32_t centred = static_cast<int32_t>(raw) - kHalfRange;
+   return centred;
+ }
+
+ int16_t Rand16Signed(void) {
+   // Use 16 bits: values between 32767 and -32768.
+   const uint32_t value = random_.Generate(65536);
+   // Recentre in 32-bit arithmetic, as Rand20Signed does, and narrow only
+   // once the result is known to fit. Casting the raw draw straight to
+   // int16_t is implementation-defined for draws of 32768 and above, and the
+   // subtraction that follows can leave the int16_t range, so the return
+   // would need a second, implicit narrowing.
+   return static_cast<int16_t>(static_cast<int32_t>(value) - 32768);
+ }
+
int16_t Rand13Signed(void) {
// Use 13 bits: values between 4095 and -4096.
const uint32_t value = random_.Generate(8192);
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -251,12 +251,7 @@
for (int i = 0; i < satd_size_; ++i) src_[i] = val;
}
- void FillRandom() {
- for (int i = 0; i < satd_size_; ++i) {
- const int16_t tmp = rnd_.Rand16();
- src_[i] = (tran_low_t)tmp;
- }
- }
+ virtual void FillRandom() = 0;
void Check(const int expected) {
int total;
@@ -267,13 +262,23 @@
tran_low_t *GetCoeff() const { return src_; }
int satd_size_;
+ ACMRandom rnd_;
+ tran_low_t *src_;
private:
- tran_low_t *src_;
SatdFunc satd_func_;
- ACMRandom rnd_;
};
+// SATD fixture whose random input is drawn from the signed 16-bit generator.
+class SatdLowbdTest : public SatdTest {
+ protected:
+  // Fills all satd_size_ coefficients, in order, with Rand16Signed() draws.
+  virtual void FillRandom() {
+    tran_low_t *out = src_;
+    for (int remaining = satd_size_; remaining > 0; --remaining) {
+      const int16_t draw = rnd_.Rand16Signed();
+      *out++ = static_cast<tran_low_t>(draw);
+    }
+  }
+};
+
typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff,
const tran_low_t *dqcoeff, int block_size);
typedef std::tuple<int, BlockErrorFunc> BlockErrorTestFPParam;
@@ -403,7 +408,7 @@
RunComparison();
}
-TEST_P(SatdTest, MinValue) {
+TEST_P(SatdLowbdTest, MinValue) {
const int kMin = -32640;
const int expected = -kMin * satd_size_;
FillConstant(kMin);
@@ -410,7 +415,7 @@
Check(expected);
}
-TEST_P(SatdTest, MaxValue) {
+TEST_P(SatdLowbdTest, MaxValue) {
const int kMax = 32640;
const int expected = kMax * satd_size_;
FillConstant(kMax);
@@ -417,13 +422,13 @@
Check(expected);
}
-TEST_P(SatdTest, Random) {
+TEST_P(SatdLowbdTest, Random) {
int expected;
switch (satd_size_) {
- case 16: expected = 205298; break;
- case 64: expected = 1113950; break;
- case 256: expected = 4268415; break;
- case 1024: expected = 16954082; break;
+ case 16: expected = 263252; break;
+ case 64: expected = 1105420; break;
+ case 256: expected = 4252250; break;
+ case 1024: expected = 16876840; break;
default:
FAIL() << "Invalid satd size (" << satd_size_
<< ") valid: 16/64/256/1024";
@@ -432,7 +437,7 @@
Check(expected);
}
-TEST_P(SatdTest, DISABLED_Speed) {
+TEST_P(SatdLowbdTest, DISABLED_Speed) {
const int kCountSpeedTestBlock = 20000;
vpx_usec_timer timer;
const int blocksize = GET_PARAM(0);
@@ -448,6 +453,62 @@
printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+// SATD fixture whose random input is drawn from the signed 20-bit generator.
+class SatdHighbdTest : public SatdTest {
+ protected:
+  // Fills all satd_size_ coefficients, in order, with Rand20Signed() draws.
+  virtual void FillRandom() {
+    tran_low_t *out = src_;
+    for (int remaining = satd_size_; remaining > 0; --remaining) {
+      *out++ = rnd_.Rand20Signed();
+    }
+  }
+};
+
+TEST_P(SatdHighbdTest, MinValue) {
+  // With every coefficient equal to kMin, the sum of absolute values is
+  // |kMin| times the number of coefficients.
+  const int kMin = -524280;
+  FillConstant(kMin);
+  Check(-kMin * satd_size_);
+}
+
+TEST_P(SatdHighbdTest, MaxValue) {
+  // With every coefficient equal to kMax, the sum of absolute values is
+  // kMax times the number of coefficients.
+  const int kMax = 524280;
+  FillConstant(kMax);
+  Check(kMax * satd_size_);
+}
+
+TEST_P(SatdHighbdTest, Random) {
+  // Expected sums for the FillRandom() data, keyed by block size. Any other
+  // size fails the test before the kernel is called.
+  int expected;
+  if (satd_size_ == 16) {
+    expected = 5249712;
+  } else if (satd_size_ == 64) {
+    expected = 18362120;
+  } else if (satd_size_ == 256) {
+    expected = 66100520;
+  } else if (satd_size_ == 1024) {
+    expected = 266094734;
+  } else {
+    FAIL() << "Invalid satd size (" << satd_size_
+           << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
+TEST_P(SatdHighbdTest, DISABLED_Speed) {
+  // Times kCountSpeedTestBlock back-to-back calls of the function under test
+  // on one randomly filled block and prints the elapsed microseconds.
+  const int kCountSpeedTestBlock = 20000;
+  const int blocksize = GET_PARAM(0);
+  vpx_usec_timer timer;
+
+  FillRandom();
+  tran_low_t *const coeff = GetCoeff();
+
+  vpx_usec_timer_start(&timer);
+  for (int run = kCountSpeedTestBlock; run > 0; --run) {
+    GET_PARAM(1)(coeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
TEST_P(BlockErrorTestFP, MinValue) {
const int64_t kMin = -32640;
const int64_t expected = kMin * kMin * txfm_size_;
@@ -513,9 +574,15 @@
::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2),
make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2)));
#endif // HAVE_SSE2
+
+// Run the high bit-depth SATD tests against vpx_highbd_satd_c, the C
+// implementation of vpx_highbd_satd, rather than vpx_satd_c, so that the
+// high bit-depth C code path is itself under test.
+INSTANTIATE_TEST_CASE_P(
+    C, SatdHighbdTest,
+    ::testing::Values(make_tuple(16, &vpx_highbd_satd_c),
+                      make_tuple(64, &vpx_highbd_satd_c),
+                      make_tuple(256, &vpx_highbd_satd_c),
+                      make_tuple(1024, &vpx_highbd_satd_c)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(C, SatdTest,
+INSTANTIATE_TEST_CASE_P(C, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_c),
make_tuple(64, &vpx_satd_c),
make_tuple(256, &vpx_satd_c),
@@ -552,7 +619,7 @@
make_tuple(64, &vpx_int_pro_col_sse2,
&vpx_int_pro_col_c)));
-INSTANTIATE_TEST_CASE_P(SSE2, SatdTest,
+INSTANTIATE_TEST_CASE_P(SSE2, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_sse2),
make_tuple(64, &vpx_satd_sse2),
make_tuple(256, &vpx_satd_sse2),
@@ -567,13 +634,22 @@
#endif // HAVE_SSE2
#if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(AVX2, SatdTest,
+INSTANTIATE_TEST_CASE_P(AVX2, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_avx2),
make_tuple(64, &vpx_satd_avx2),
make_tuple(256, &vpx_satd_avx2),
make_tuple(1024, &vpx_satd_avx2)));
+#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
+ AVX2, SatdHighbdTest,
+ ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2),
+ make_tuple(64, &vpx_highbd_satd_avx2),
+ make_tuple(256, &vpx_highbd_satd_avx2),
+ make_tuple(1024, &vpx_highbd_satd_avx2)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+INSTANTIATE_TEST_CASE_P(
AVX2, BlockErrorTestFP,
::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
make_tuple(64, &vp9_block_error_fp_avx2),
@@ -605,7 +681,7 @@
make_tuple(64, &vpx_int_pro_col_neon,
&vpx_int_pro_col_c)));
-INSTANTIATE_TEST_CASE_P(NEON, SatdTest,
+INSTANTIATE_TEST_CASE_P(NEON, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_neon),
make_tuple(64, &vpx_satd_neon),
make_tuple(256, &vpx_satd_neon),
@@ -650,7 +726,7 @@
// TODO(jingning): Remove the highbitdepth flag once the SIMD functions are
// in place.
#if !CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(MSA, SatdTest,
+INSTANTIATE_TEST_CASE_P(MSA, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_msa),
make_tuple(64, &vpx_satd_msa),
make_tuple(256, &vpx_satd_msa),
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -5972,8 +5972,7 @@
vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
dst_stride, xd->bd);
highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- // TODO(sdeng): Implement SIMD based high bit-depth satd.
- intra_cost = vpx_satd_c(coeff, pix_num);
+ intra_cost = vpx_highbd_satd(coeff, pix_num);
} else {
vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
dst_stride);
@@ -6019,7 +6018,7 @@
bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
- inter_cost = vpx_satd_c(coeff, pix_num);
+ inter_cost = vpx_highbd_satd(coeff, pix_num);
} else {
vp9_build_inter_predictor(
ref_frame[rf_idx]->y_buffer + mb_y_offset,
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -314,6 +314,19 @@
}
}
+#if CONFIG_VP9_HIGHBITDEPTH
+// Returns the sum of the absolute values of the first |length| coefficients.
+// coeff: dynamic range 20 bit.
+// length: value range {16, 64, 256, 1024}.
+int vpx_highbd_satd_c(const tran_low_t *coeff, int length) {
+  int total = 0;
+  int remaining;
+
+  for (remaining = length; remaining > 0; --remaining, ++coeff) {
+    total += abs(*coeff);
+  }
+
+  // total: 30 bits
+  return total;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int vpx_satd_c(const tran_low_t *coeff, int length) {
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -796,6 +796,9 @@
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon/;
+
+ add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/vpx_highbd_satd avx2/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
--- a/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/vpx_dsp/x86/avg_intrin_avx2.c
@@ -457,3 +457,26 @@
return _mm_cvtsi128_si32(accum_128);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// AVX2 sum of absolute coefficient values. Coefficients are consumed eight at
+// a time, so the final group is loaded in full even when |length| is not a
+// multiple of 8.
+int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
+  __m256i total = _mm256_setzero_si256();
+  int remaining;
+
+  // Accumulate absolute values in eight independent 32-bit lanes.
+  for (remaining = length; remaining > 0; remaining -= 8) {
+    const __m256i group = _mm256_loadu_si256((const __m256i *)coeff);
+    total = _mm256_add_epi32(total, _mm256_abs_epi32(group));
+    coeff += 8;
+  }
+
+  {  // Fold the eight 32-bit partial sums into lane 0.
+    const __m128i lo = _mm256_castsi256_si128(total);
+    const __m128i hi = _mm256_extractf128_si256(total, 1);
+    const __m128i sum4 = _mm_add_epi32(lo, hi);
+    const __m128i sum2 = _mm_add_epi32(sum4, _mm_srli_si128(sum4, 8));
+    const __m128i sum1 = _mm_add_epi32(sum2, _mm_srli_epi64(sum2, 32));
+    return _mm_cvtsi128_si32(sum1);
+  }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH