ref: caa116c9be96508c18d533dedc95b2df4f8e3812
parent: bc4098a8e969f1ab3281a801a4ba3b78d788cec3
parent: 1b2f92ee8e3a8bf1d3f5a39bfae92e8136ec370b
author: Kyle Siefring <kylesiefring@gmail.com>
date: Thu Oct 12 12:12:38 EDT 2017
Merge changes I38783d97,If5160c0c * changes: Extend 16 wide AVX2 convolve8 code to support averaging. Add AVX2 version of vpx_convolve8_avg.
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -603,6 +603,29 @@
UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
}
+TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) {
+ const uint8_t *const in = input();
+ uint8_t *const out = output();
+ const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP];
+ const int kNumTests = 5000000;
+ const int width = Width();
+ const int height = Height();
+ vpx_usec_timer timer;
+
+ SetConstantInput(127);
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16,
+ width, height);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ printf("convolve8_avg_%dx%d_%d: %d us\n", width, height,
+ UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time);
+}
+
TEST_P(ConvolveTest, Copy) {
uint8_t *const in = input();
uint8_t *const out = output();
@@ -1177,8 +1200,8 @@
#else // !CONFIG_VP9_HIGHBITDEPTH
const ConvolveFunctions convolve8_avx2(
vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2,
- vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2,
- vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3,
+ vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2,
+ vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2,
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -364,13 +364,13 @@
specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_scaled_2d ssse3 neon/;
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -59,10 +59,11 @@
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
-static void vpx_filter_block1d16_h8_avx2(
+static INLINE void vpx_filter_block1d16_h8_X_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i filtersReg, outReg1, outReg2;
__m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
__m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
@@ -185,13 +186,21 @@
src_ptr += src_stride;
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(srcRegFilt32b1_1);
+ outReg2 = _mm256_extractf128_si256(srcRegFilt32b1_1, 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+ }
+
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr,
- _mm256_castsi256_si128(srcRegFilt32b1_1));
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
// save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + output_pitch),
- _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
+ _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
+
output_ptr += dst_stride;
}
@@ -280,17 +289,37 @@
// shrink to 8 bit each 16 bits, the first lane contain the first
// convolve result and the second lane contain the second convolve
// result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+ outReg1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+ // average if necessary
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ }
+
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
}
}
-static void vpx_filter_block1d16_v8_avx2(
+static void vpx_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_X_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 0);
+}
+
+static void vpx_filter_block1d16_h8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr,
+ ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) {
+ vpx_filter_block1d16_h8_X_avx2(src_ptr, src_stride, output_ptr, dst_stride,
+ output_height, filter, 1);
+}
+
+static INLINE void vpx_filter_block1d16_v8_X_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
+ const int avg) {
+ __m128i filtersReg, outReg1, outReg2;
__m256i addFilterReg64;
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
__m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
@@ -435,12 +464,20 @@
src_ptr += src_stride;
+ // average if necessary
+ outReg1 = _mm256_castsi256_si128(srcReg32b1);
+ outReg2 = _mm256_extractf128_si256(srcReg32b1, 1);
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ outReg2 = _mm_avg_epu8(
+ outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+ }
+
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
// save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + out_pitch),
- _mm256_extractf128_si256(srcReg32b1, 1));
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
output_ptr += dst_stride;
@@ -515,13 +552,33 @@
// shrink to 8 bit each 16 bits, the first lane contain the first
// convolve result and the second lane contain the second convolve
// result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+ outReg1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+ // average if necessary
+ if (avg) {
+ outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
+ }
+
// save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+ _mm_store_si128((__m128i *)output_ptr, outReg1);
}
}
+static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height,
+ const int16_t *filter) {
+ vpx_filter_block1d16_v8_X_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 0);
+}
+
+static void vpx_filter_block1d16_v8_avg_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr,
+ ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) {
+ vpx_filter_block1d16_v8_X_avx2(src_ptr, src_stride, dst_ptr, dst_stride,
+ height, filter, 1);
+}
+
#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
@@ -539,6 +596,14 @@
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif // ARCH_X86_64
+filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
+#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3
+#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3
+#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3
+#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
@@ -552,6 +617,18 @@
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
+filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
+#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3
+#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3
+#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3
+#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3
+#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3
+#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const InterpKernel *filter, int x0_q4,
@@ -562,8 +639,20 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
+// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
+// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4,
+// int y_step_q4, int w, int h);
FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -570,5 +659,11 @@
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
+// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const InterpKernel *filter, int x0_q4,
+// int32_t x_step_q4, int y0_q4, int y_step_q4,
+// int w, int h);
FUN_CONV_2D(, avx2);
+FUN_CONV_2D(avg_, avx2);
#endif // HAVE_AX2 && HAVE_SSSE3