ref: 6f35ac956b8315b466c7f66af1172fbc473f1464
parent: 979f0c0e5a901e9d6d21bcc5d566024acaaa22f9
author: Johann <johann.koenig@duck.com>
date: Wed Oct 24 08:22:35 EDT 2018
vp8 bilinear: rewrite in intrinsics 8x8 is 15% faster than the assembly. 8x4 is 200% faster than MMX. Remove MMX version. Change-Id: I55642ebd276db265911f2c79616177a3a9a7e04f
--- a/test/predict_test.cc
+++ b/test/predict_test.cc
@@ -16,6 +16,7 @@
#include "./vp8_rtcd.h"
#include "./vpx_config.h"
#include "test/acm_random.h"
+#include "test/bench.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
@@ -33,7 +34,8 @@
typedef ::testing::tuple<int, int, PredictFunc> PredictParam;
-class PredictTestBase : public ::testing::TestWithParam<PredictParam> {
+class PredictTestBase : public AbstractBench,
+ public ::testing::TestWithParam<PredictParam> {
public:
PredictTestBase()
: width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)),
@@ -204,8 +206,21 @@
}
}
}
-};
+ void Run() {
+ for (int xoffset = 0; xoffset < 8; ++xoffset) {
+ for (int yoffset = 0; yoffset < 8; ++yoffset) {
+ if (xoffset == 0 && yoffset == 0) {
+ continue;
+ }
+
+ predict_(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, dst_,
+ dst_stride_);
+ }
+ }
+ }
+}; // namespace
+
class SixtapPredictTest : public PredictTestBase {};
TEST_P(SixtapPredictTest, TestWithRandomData) {
@@ -341,7 +356,15 @@
TEST_P(BilinearPredictTest, TestWithUnalignedDst) {
TestWithUnalignedDst(vp8_bilinear_predict16x16_c);
}
+TEST_P(BilinearPredictTest, DISABLED_Speed) {
+ const int kCountSpeedTestBlock = 5000000 / (width_ * height_);
+ RunNTimes(kCountSpeedTestBlock);
+ char title[16];
+ snprintf(title, sizeof(title), "%dx%d", width_, height_);
+ PrintMedian(title);
+}
+
INSTANTIATE_TEST_CASE_P(
C, BilinearPredictTest,
::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_c),
@@ -359,14 +382,14 @@
#if HAVE_MMX
INSTANTIATE_TEST_CASE_P(
MMX, BilinearPredictTest,
- ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx),
- make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
+ ::testing::Values(make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx)));
#endif
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, BilinearPredictTest,
::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2),
- make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2)));
+ make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2),
+ make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2)));
#endif
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -164,7 +164,7 @@
specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/;
add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict8x4 mmx neon msa/;
+specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/;
add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict4x4 mmx neon msa/;
--- /dev/null
+++ b/vp8/common/x86/bilinear_filter_sse2.c
@@ -1,0 +1,119 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <xmmintrin.h>
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst,
+ const int xoffset, const int height) {
+ int h;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (xoffset == 0) {
+ for (h = 0; h < height; ++h) {
+ const __m128i a = _mm_loadl_epi64((__m128i *)src);
+ const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+ _mm_store_si128((__m128i *)dst, a_u16);
+ src += stride;
+ dst += 8;
+ }
+ return;
+ }
+
+ {
+ const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+ const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]);
+ const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]);
+
+ // Filter horizontally. Rather than load the whole array and transpose, load
+ // 16 values (overreading) and shift to set up the second value. Do an
+ // "extra" 9th line so the vertical pass has the necessary context.
+ for (h = 0; h < height; ++h) {
+ const __m128i a = _mm_loadu_si128((__m128i *)src);
+ const __m128i b = _mm_srli_si128(a, 1);
+ const __m128i a_u16 = _mm_unpacklo_epi8(a, zero);
+ const __m128i b_u16 = _mm_unpacklo_epi8(b, zero);
+ const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0);
+ const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1);
+ const __m128i sum = _mm_add_epi16(a_filtered, b_filtered);
+ const __m128i compensated = _mm_add_epi16(sum, round_factor);
+ const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+ _mm_store_si128((__m128i *)dst, shifted);
+ src += stride;
+ dst += 8;
+ }
+ }
+}
+
+static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride,
+ const int yoffset, const int height) {
+ int h;
+
+ if (yoffset == 0) {
+ for (h = 0; h < height; ++h) {
+ const __m128i row = _mm_load_si128((__m128i *)src);
+ const __m128i packed = _mm_packus_epi16(row, row);
+ _mm_storel_epi64((__m128i *)dst, packed);
+ src += 8;
+ dst += stride;
+ }
+ return;
+ }
+
+ {
+ const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1));
+ const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]);
+ const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]);
+
+ __m128i row_0 = _mm_load_si128((__m128i *)src);
+ src += 8;
+ for (h = 0; h < height; ++h) {
+ const __m128i row_1 = _mm_load_si128((__m128i *)src);
+ const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0);
+ const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1);
+ const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered);
+ const __m128i compensated = _mm_add_epi16(sum, round_factor);
+ const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
+ const __m128i packed = _mm_packus_epi16(shifted, shifted);
+ _mm_storel_epi64((__m128i *)dst, packed);
+ row_0 = row_1;
+ src += 8;
+ dst += stride;
+ }
+ }
+}
+
+void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, uint8_t *dst_ptr,
+ int dst_pitch) {
+ uint16_t FData[8 * 9];
+
+ assert((xoffset | yoffset) != 0);
+
+ horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9);
+
+ vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8);
+}
+
+void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line,
+ int xoffset, int yoffset, uint8_t *dst_ptr,
+ int dst_pitch) {
+ uint16_t FData[8 * 5];
+
+ assert((xoffset | yoffset) != 0);
+
+ horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5);
+
+ vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4);
+}
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -205,161 +205,6 @@
ret
-;void bilinear_predict8x4_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
-sym(vp8_bilinear_predict8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
-
- movsxd rax, dword ptr arg(2) ;xoffset
- mov rdi, arg(4) ;dst_ptr ;
-
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
- shl rax, 5
-
- mov rsi, arg(0) ;src_ptr ;
- add rax, rcx
-
- movsxd rdx, dword ptr arg(5) ;dst_pitch
- movq mm1, [rax] ;
-
- movq mm2, [rax+16] ;
- movsxd rax, dword ptr arg(3) ;yoffset
-
- pxor mm0, mm0 ;
- shl rax, 5
-
- add rax, rcx
- lea rcx, [rdi+rdx*4] ;
-
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
-
- ; get the first horizontal line done ;
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
- add rsi, rdx ; next line
-.next_row_8x4:
- movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movq mm4, mm3 ; make a copy of current line
-
- punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, mm1 ;
- pmullw mm4, mm1 ;
-
- movq mm5, [rsi+1] ;
- movq mm6, mm5 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0 ;
-
- pmullw mm5, mm2 ;
- pmullw mm6, mm2 ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
- movq mm5, mm7 ;
- movq mm6, mm7 ;
-
- punpcklbw mm5, mm0 ;
- punpckhbw mm6, mm0
-
- pmullw mm5, [rax] ;
- pmullw mm6, [rax] ;
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- movq mm7, mm3 ;
- packuswb mm7, mm4 ;
-
-
- pmullw mm3, [rax+16] ;
- pmullw mm4, [rax+16] ;
-
- paddw mm3, mm5 ;
- paddw mm4, mm6 ;
-
-
- paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- paddw mm4, [GLOBAL(rd)] ;
- psraw mm4, VP8_FILTER_SHIFT ;
-
- packuswb mm3, mm4
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, rdx ; next line
- add rdi, dword ptr arg(5) ;dst_pitch ;
-%else
- movsxd r8, dword ptr arg(5) ;dst_pitch
- add rsi, rdx ; next line
- add rdi, r8
-%endif
- cmp rdi, rcx ;
- jne .next_row_8x4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
;void bilinear_predict4x4_mmx
;(
; unsigned char *src_ptr,
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -1226,151 +1226,6 @@
pop rbp
ret
-
-;void vp8_bilinear_predict8x8_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
-sym(vp8_bilinear_predict8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
- ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
- lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read 9-line unaligned data in and put them on stack. This gives a big
- ;performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ;xoffset
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm5, [rax]
- movdqa xmm6, [rax+16]
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqa xmm3, XMMWORD PTR [rsp]
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- add rsp, 16 ; next line
-.next_row8x8:
- movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
- pmullw xmm7, xmm5
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm4, xmm3
-
- pmullw xmm3, xmm6
- paddw xmm3, xmm7
-
- movdqa xmm7, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb xmm3, xmm0
- movq [rdi], xmm3 ; store the results in the destination
-
- add rsp, 16 ; next line
- add rdi, rdx
-
- cmp rdi, rcx
- jne .next_row8x8
-
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
SECTION_RODATA
align 16
rd:
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -86,6 +86,7 @@
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/bilinear_filter_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm