shithub: libvpx

Download patch

ref: 9e4dc99e4be809fd7f07423eac39f9828594197c
parent: 2626b1545e4b5493729850d14185b0ec00a21383
parent: d468fd90e05ba7f5173d849c63f6a50115c9769b
author: James Zern <jzern@google.com>
date: Fri Jun 8 02:15:24 EDT 2018

Merge changes I89ce12b6,Id91b52d6,Icd7d4453

* changes:
  Implement subtract_block for VSX
  Cast bsize as int to print a meaningful debug info
  Speed test for subtract_block

--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "vp9/common/vp9_blockd.h"
@@ -26,13 +27,58 @@
 
 namespace vp9 {
 
-class VP9SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
+class VP9SubtractBlockTest : public AbstractBench,
+                             public ::testing::TestWithParam<SubtractFunc> {
  public:
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  int block_width_;
+  int block_height_;
+  int16_t *diff_;
+  uint8_t *pred_;
+  uint8_t *src_;
+
+  virtual void Run() {
+    GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+               block_width_, pred_, block_width_);
+  }
+
+  void SetupBlocks(BLOCK_SIZE bsize) {
+    block_width_ = 4 * num_4x4_blocks_wide_lookup[bsize];
+    block_height_ = 4 * num_4x4_blocks_high_lookup[bsize];
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(*diff_) * block_width_ * block_height_ * 2));
+    pred_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width_ * block_height_ * 2));
+    src_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width_ * block_height_ * 2));
+  }
 };
 
 using libvpx_test::ACMRandom;
 
+TEST_P(VP9SubtractBlockTest, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+    SetupBlocks(bsize);
+
+    RunNTimes(100000000 / (block_height_ * block_width_));
+    char block_size[16];
+    snprintf(block_size, sizeof(block_size), "%dx%d", block_height_,
+             block_width_);
+    char title[100];
+    snprintf(title, sizeof(title), "%8s ", block_size);
+    PrintMedian(title);
+
+    vpx_free(diff_);
+    vpx_free(pred_);
+    vpx_free(src_);
+  }
+}
+
 TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
@@ -39,49 +85,42 @@
   // FIXME(rbultje) split in its own file
   for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
        bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
-    const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize];
-    const int block_height = 4 * num_4x4_blocks_high_lookup[bsize];
-    int16_t *diff = reinterpret_cast<int16_t *>(
-        vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
-    uint8_t *pred = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
-    uint8_t *src = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
+    SetupBlocks(bsize);
 
     for (int n = 0; n < 100; n++) {
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width * 2; ++c) {
-          src[r * block_width * 2 + c] = rnd.Rand8();
-          pred[r * block_width * 2 + c] = rnd.Rand8();
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_ * 2; ++c) {
+          src_[r * block_width_ * 2 + c] = rnd.Rand8();
+          pred_[r * block_width_ * 2 + c] = rnd.Rand8();
         }
       }
 
-      GetParam()(block_height, block_width, diff, block_width, src, block_width,
-                 pred, block_width);
+      GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+                 block_width_, pred_, block_width_);
 
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(diff[r * block_width + c],
-                    (src[r * block_width + c] - pred[r * block_width + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_; ++c) {
+          EXPECT_EQ(diff_[r * block_width_ + c],
+                    (src_[r * block_width_ + c] - pred_[r * block_width_ + c]))
+              << "r = " << r << ", c = " << c << ", bs = " << (int)bsize;
         }
       }
 
-      GetParam()(block_height, block_width, diff, block_width * 2, src,
-                 block_width * 2, pred, block_width * 2);
+      GetParam()(block_height_, block_width_, diff_, block_width_ * 2, src_,
+                 block_width_ * 2, pred_, block_width_ * 2);
 
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(
-              diff[r * block_width * 2 + c],
-              (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_; ++c) {
+          EXPECT_EQ(diff_[r * block_width_ * 2 + c],
+                    (src_[r * block_width_ * 2 + c] -
+                     pred_[r * block_width_ * 2 + c]))
+              << "r = " << r << ", c = " << c << ", bs = " << (int)bsize;
         }
       }
     }
-    vpx_free(diff);
-    vpx_free(pred);
-    vpx_free(src);
+    vpx_free(diff_);
+    vpx_free(pred_);
+    vpx_free(src_);
   }
 }
 
@@ -104,6 +143,11 @@
 #if HAVE_MMI
 INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest,
                         ::testing::Values(vpx_subtract_block_mmi));
+#endif
+
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_vsx));
 #endif
 
 }  // namespace vp9
--- /dev/null
+++ b/vpx_dsp/ppc/subtract_vsx.c
@@ -1,0 +1,117 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static VPX_FORCE_INLINE void subtract_block4x4(
+    int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
+    ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
+  int16_t *diff1 = diff + 2 * diff_stride;
+  const uint8_t *src1 = src + 2 * src_stride;
+  const uint8_t *pred1 = pred + 2 * pred_stride;
+
+  const int16x8_t d0 = vec_vsx_ld(0, diff);
+  const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
+  const int16x8_t d2 = vec_vsx_ld(0, diff1);
+  const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
+
+  const uint8x16_t s0 = read4x2(src, (int)src_stride);
+  const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
+  const uint8x16_t s1 = read4x2(src1, (int)src_stride);
+  const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
+
+  const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+  const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+
+  vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
+  vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
+  vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
+  vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
+}
+
+void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
+                            ptrdiff_t diff_stride, const uint8_t *src,
+                            ptrdiff_t src_stride, const uint8_t *pred,
+                            ptrdiff_t pred_stride) {
+  int r = rows, c;
+
+  switch (cols) {
+    case 64:
+    case 32:
+      do {
+        for (c = 0; c < cols; c += 32) {
+          const uint8x16_t s0 = vec_vsx_ld(0, src + c);
+          const uint8x16_t s1 = vec_vsx_ld(16, src + c);
+          const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
+          const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
+          const int16x8_t d0l =
+              vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+          const int16x8_t d0h =
+              vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+          const int16x8_t d1l =
+              vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
+          const int16x8_t d1h =
+              vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+          vec_vsx_st(d0h, 0, diff + c);
+          vec_vsx_st(d0l, 16, diff + c);
+          vec_vsx_st(d1h, 0, diff + c + 16);
+          vec_vsx_st(d1l, 16, diff + c + 16);
+        }
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 16:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        vec_vsx_st(d0l, 16, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 8:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 4:
+      subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
+      if (r > 4) {
+        diff += 4 * diff_stride;
+        pred += 4 * pred_stride;
+        src += 4 * src_stride;
+
+        subtract_block4x4(diff, diff_stride,
+
+                          src, src_stride,
+
+                          pred, pred_stride);
+      }
+      break;
+    default:
+      assert(0);  // unreachable
+  }
+}
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -68,6 +68,13 @@
 #endif
 #endif
 
+static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
+  const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
+  const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
+
+  return (uint8x16_t)vec_mergeh(a0, a1);
+}
+
 static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0,
                                          0, 0, 0, 0, 0, 0, 0, 0 };
 static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 };
--- a/vpx_dsp/ppc/variance_vsx.c
+++ b/vpx_dsp/ppc/variance_vsx.c
@@ -14,13 +14,6 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/ppc/types_vsx.h"
 
-static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
-  const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
-  const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
-
-  return (uint8x16_t)vec_mergeh(a0, a1);
-}
-
 uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b,
                               int b_stride) {
   int distortion;
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -342,6 +342,7 @@
 DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
 
 DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
+DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -718,7 +718,7 @@
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
 
 #
 # Single block SAD