ref: 9e4dc99e4be809fd7f07423eac39f9828594197c
parent: 2626b1545e4b5493729850d14185b0ec00a21383
parent: d468fd90e05ba7f5173d849c63f6a50115c9769b
author: James Zern <jzern@google.com>
date: Fri Jun 8 02:15:24 EDT 2018
Merge changes I89ce12b6,Id91b52d6,Icd7d4453

* changes:
  Implement subtract_block for VSX
  Cast bsize as int to print meaningful debug info
  Speed test for subtract_block
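
The VSX kernel below vectorizes the generic residual computation. For reference, the portable path (vpx_subtract_block_c, using the argument names from the rtcd prototype at the end of this change) is roughly:

    for (int r = 0; r < rows; ++r) {
      for (int c = 0; c < cols; ++c)
        diff_ptr[c] = (int16_t)(src_ptr[c] - pred_ptr[c]);
      diff_ptr += diff_stride;
      pred_ptr += pred_stride;
      src_ptr += src_stride;
    }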
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -14,6 +14,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
+#include "test/bench.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "vp9/common/vp9_blockd.h"
@@ -26,13 +27,58 @@
namespace vp9 {
-class VP9SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
+class VP9SubtractBlockTest : public AbstractBench,
+ public ::testing::TestWithParam<SubtractFunc> {
public:
virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ int block_width_;
+ int block_height_;
+ int16_t *diff_;
+ uint8_t *pred_;
+ uint8_t *src_;
+
+ virtual void Run() {
+ GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+ block_width_, pred_, block_width_);
+ }
+
+ void SetupBlocks(BLOCK_SIZE bsize) {
+ block_width_ = 4 * num_4x4_blocks_wide_lookup[bsize];
+ block_height_ = 4 * num_4x4_blocks_high_lookup[bsize];
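+    // Buffers are allocated at twice the block area so the tests can also
+    // run the kernel with strides of 2 * block_width_.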
+ diff_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, sizeof(*diff_) * block_width_ * block_height_ * 2));
+ pred_ = reinterpret_cast<uint8_t *>(
+ vpx_memalign(16, block_width_ * block_height_ * 2));
+ src_ = reinterpret_cast<uint8_t *>(
+ vpx_memalign(16, block_width_ * block_height_ * 2));
+ }
};
using libvpx_test::ACMRandom;
+TEST_P(VP9SubtractBlockTest, DISABLED_Speed) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+ bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+ SetupBlocks(bsize);
+
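+    // Scale the iteration count so every block size covers roughly the same
+    // number of pixels (~10^8) per measurement.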
+ RunNTimes(100000000 / (block_height_ * block_width_));
+ char block_size[16];
+ snprintf(block_size, sizeof(block_size), "%dx%d", block_height_,
+ block_width_);
+ char title[100];
+ snprintf(title, sizeof(title), "%8s ", block_size);
+ PrintMedian(title);
+
+ vpx_free(diff_);
+ vpx_free(pred_);
+ vpx_free(src_);
+ }
+}
+
TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -39,49 +85,42 @@
// FIXME(rbultje) split in its own file
for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
- const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize];
- const int block_height = 4 * num_4x4_blocks_high_lookup[bsize];
- int16_t *diff = reinterpret_cast<int16_t *>(
- vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
- uint8_t *pred = reinterpret_cast<uint8_t *>(
- vpx_memalign(16, block_width * block_height * 2));
- uint8_t *src = reinterpret_cast<uint8_t *>(
- vpx_memalign(16, block_width * block_height * 2));
+ SetupBlocks(bsize);
for (int n = 0; n < 100; n++) {
- for (int r = 0; r < block_height; ++r) {
- for (int c = 0; c < block_width * 2; ++c) {
- src[r * block_width * 2 + c] = rnd.Rand8();
- pred[r * block_width * 2 + c] = rnd.Rand8();
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_ * 2; ++c) {
+ src_[r * block_width_ * 2 + c] = rnd.Rand8();
+ pred_[r * block_width_ * 2 + c] = rnd.Rand8();
}
}
- GetParam()(block_height, block_width, diff, block_width, src, block_width,
- pred, block_width);
+ GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+ block_width_, pred_, block_width_);
- for (int r = 0; r < block_height; ++r) {
- for (int c = 0; c < block_width; ++c) {
- EXPECT_EQ(diff[r * block_width + c],
- (src[r * block_width + c] - pred[r * block_width + c]))
- << "r = " << r << ", c = " << c << ", bs = " << bsize;
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ (src_[r * block_width_ + c] - pred_[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", bs = " << (int)bsize;
}
}
- GetParam()(block_height, block_width, diff, block_width * 2, src,
- block_width * 2, pred, block_width * 2);
+ GetParam()(block_height_, block_width_, diff_, block_width_ * 2, src_,
+ block_width_ * 2, pred_, block_width_ * 2);
- for (int r = 0; r < block_height; ++r) {
- for (int c = 0; c < block_width; ++c) {
- EXPECT_EQ(
- diff[r * block_width * 2 + c],
- (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c]))
- << "r = " << r << ", c = " << c << ", bs = " << bsize;
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ * 2 + c],
+ (src_[r * block_width_ * 2 + c] -
+ pred_[r * block_width_ * 2 + c]))
+ << "r = " << r << ", c = " << c << ", bs = " << (int)bsize;
}
}
}
- vpx_free(diff);
- vpx_free(pred);
- vpx_free(src);
+ vpx_free(diff_);
+ vpx_free(pred_);
+ vpx_free(src_);
}
}
@@ -104,6 +143,11 @@
#if HAVE_MMI
INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_mmi));
+#endif
+
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, VP9SubtractBlockTest,
+ ::testing::Values(vpx_subtract_block_vsx));
#endif
} // namespace vp9
--- /dev/null
+++ b/vpx_dsp/ppc/subtract_vsx.c
@@ -1,0 +1,117 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static VPX_FORCE_INLINE void subtract_block4x4(
+ int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
+ int16_t *diff1 = diff + 2 * diff_stride;
+ const uint8_t *src1 = src + 2 * src_stride;
+ const uint8_t *pred1 = pred + 2 * pred_stride;
+
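+  // A 4-wide block only produces four int16 diffs per row, but the stores
+  // below write 16 bytes; load the current destination contents so the
+  // unrelated half of each store can be preserved.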
+ const int16x8_t d0 = vec_vsx_ld(0, diff);
+ const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
+ const int16x8_t d2 = vec_vsx_ld(0, diff1);
+ const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
+
+ const uint8x16_t s0 = read4x2(src, (int)src_stride);
+ const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
+ const uint8x16_t s1 = read4x2(src1, (int)src_stride);
+ const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
+
+ const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+
+ vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
+ vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
+ vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
+ vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
+}
+
+void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r = rows, c;
+
+ switch (cols) {
+ case 64:
+ case 32:
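+      // 64- and 32-wide rows: process 32 pixels (two 16-byte loads) per
+      // inner iteration.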
+ do {
+ for (c = 0; c < cols; c += 32) {
+ const uint8x16_t s0 = vec_vsx_ld(0, src + c);
+ const uint8x16_t s1 = vec_vsx_ld(16, src + c);
+ const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
+ const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
+ const int16x8_t d0l =
+ vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+ const int16x8_t d0h =
+ vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ const int16x8_t d1l =
+ vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
+ const int16x8_t d1h =
+ vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+ vec_vsx_st(d0h, 0, diff + c);
+ vec_vsx_st(d0l, 16, diff + c);
+ vec_vsx_st(d1h, 0, diff + c + 16);
+ vec_vsx_st(d1l, 16, diff + c + 16);
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ break;
+ case 16:
+ do {
+ const uint8x16_t s0 = vec_vsx_ld(0, src);
+ const uint8x16_t p0 = vec_vsx_ld(0, pred);
+ const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+ const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ vec_vsx_st(d0h, 0, diff);
+ vec_vsx_st(d0l, 16, diff);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ break;
+ case 8:
+ do {
+ const uint8x16_t s0 = vec_vsx_ld(0, src);
+ const uint8x16_t p0 = vec_vsx_ld(0, pred);
+ const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+ vec_vsx_st(d0h, 0, diff);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ break;
+ case 4:
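+      // 4-wide blocks are 4x4 or 4x8; process the second four rows when
+      // rows > 4.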
+ subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
+ if (r > 4) {
+ diff += 4 * diff_stride;
+ pred += 4 * pred_stride;
+ src += 4 * src_stride;
+
+        subtract_block4x4(diff, diff_stride, src, src_stride, pred,
+                          pred_stride);
+ }
+ break;
+ default:
+ assert(0); // unreachable
+ }
+}
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -68,6 +68,13 @@
#endif
#endif
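+// Load rows a and a + stride and interleave them 4 bytes at a time, so the
+// first 8 bytes of the result hold the leading 4 bytes of each row.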
+static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
+ const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
+ const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
+
+ return (uint8x16_t)vec_mergeh(a0, a1);
+}
+
static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 };
--- a/vpx_dsp/ppc/variance_vsx.c
+++ b/vpx_dsp/ppc/variance_vsx.c
@@ -14,13 +14,6 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"
-static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
- const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
- const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
-
- return (uint8x16_t)vec_mergeh(a0, a1);
-}
-
uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride) {
int distortion;
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -342,6 +342,7 @@
DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm
DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
+DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -718,7 +718,7 @@
# Block subtraction
#
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
#
# Single block SAD