ref: cb9133c72f06fcc595c8117078c9689258dd0cf6
parent: a10a5cb3567a0cfcea73c2e3765bc86d427646e3
author: Johann <johannkoenig@google.com>
date: Mon May 1 05:10:06 EDT 2017
neon variance: add small missing sizes Some of the mixed sizes were missing. They can be implemented trivially using the existing helper function. When comparing the previous 16x8 and 8x16 implementations, the helper function is about 10% faster than the 16x8 version. The 8x16 is very close, but the existing version appears to be faster. BUG=webm:1422 Change-Id: Ib0e856083c1893e1bd399373c5fbcd6271a7f004
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -22,6 +22,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
+#include "vpx_ports/vpx_timer.h"
namespace {
@@ -345,6 +346,7 @@
void RefTest();
void RefStrideTest();
void OneQuarterTest();
+ void SpeedTest();
// MSE/SSE tests
void RefTestMse();
@@ -363,6 +365,7 @@
int byte_shift() const { return params_.bit_depth - 8; }
int block_size() const { return params_.block_size; }
int width() const { return params_.width; }
+ int height() const { return params_.height; }
uint32_t mask() const { return params_.mask; }
};
@@ -471,7 +474,36 @@
EXPECT_EQ(expected, var);
}
-////////////////////////////////////////////////////////////////////////////////
+template <typename VarianceFunctionType>
+void MainTestClass<VarianceFunctionType>::SpeedTest() {
+ const int half = block_size() / 2;
+ if (!use_high_bit_depth()) {
+ memset(src_, 255, block_size());
+ memset(ref_, 255, half);
+ memset(ref_ + half, 0, half);
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half);
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ unsigned int sse;
+
+ vpx_usec_timer timer;
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < 100000000 / block_size(); ++i) {
+ const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse);
+ // Ignore return value.
+ (void)variance;
+ }
+ vpx_usec_timer_mark(&timer);
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("Variance %dx%d time: %5d ms\n", width(), height(),
+ elapsed_time / 1000);
+}
+
+////////////////////////////////////////////////////////////////////////////////
// Tests related to MSE / SSE.
template <typename FunctionType>
@@ -727,6 +759,7 @@
TEST_P(VpxVarianceTest, Ref) { RefTest(); }
TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
@@ -809,6 +842,7 @@
TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
@@ -1219,10 +1253,13 @@
VarianceParams(6, 5, &vpx_variance64x32_neon),
VarianceParams(5, 6, &vpx_variance32x64_neon),
VarianceParams(5, 5, &vpx_variance32x32_neon),
+ VarianceParams(5, 4, &vpx_variance32x16_neon),
+ VarianceParams(4, 5, &vpx_variance16x32_neon),
VarianceParams(4, 4, &vpx_variance16x16_neon),
VarianceParams(4, 3, &vpx_variance16x8_neon),
VarianceParams(3, 4, &vpx_variance8x16_neon),
- VarianceParams(3, 3, &vpx_variance8x8_neon)));
+ VarianceParams(3, 3, &vpx_variance8x8_neon),
+ VarianceParams(3, 2, &vpx_variance8x4_neon)));
INSTANTIATE_TEST_CASE_P(
NEON, VpxSubpelVarianceTest,
--- a/vpx_dsp/arm/variance_neon.c
+++ b/vpx_dsp/arm/variance_neon.c
@@ -70,30 +70,28 @@
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
}
-unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - ((sum * sum) >> 6);
-}
+#define varianceNxM(n, m, shift) \
+ unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_neon_w8(a, a_stride, b, b_stride, n, m, sse, &sum); \
+ if (n * m < 16 * 16) \
+ return *sse - ((sum * sum) >> shift); \
+ else \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
-unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
- return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
-}
+varianceNxM(8, 4, 5);
+varianceNxM(8, 8, 6);
+// TODO(johannkoenig) Investigate why the implementation below is faster.
+// varianceNxM(8, 16, 7);
+varianceNxM(16, 8, 7);
+varianceNxM(16, 16, 8);
+varianceNxM(16, 32, 9);
+varianceNxM(32, 16, 9);
+varianceNxM(32, 32, 10);
-unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
@@ -142,82 +140,6 @@
*sse = sse1 + sse2;
sum1 += sum2;
return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
-
-unsigned int vpx_variance16x8_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride, unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) {
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
}
unsigned int vpx_variance8x16_neon(const unsigned char *src_ptr,
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1177,10 +1177,10 @@
specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;
add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance32x16 sse2 avx2 msa/;
+ specialize qw/vpx_variance32x16 sse2 avx2 neon msa/;
add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance16x32 sse2 msa/;
+ specialize qw/vpx_variance16x32 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
@@ -1195,12 +1195,14 @@
specialize qw/vpx_variance8x8 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_variance8x4 sse2 msa/;
+ specialize qw/vpx_variance8x4 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
specialize qw/vpx_variance4x8 sse2 msa/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+# TODO(johannkoenig): neon
specialize qw/vpx_variance4x4 sse2 msa/;
#