shithub: libvpx

ref: 41b0888a84d12926c66cb7f235ed0e850e0ca99e
parent: 27e1bacdb3ab3317c0a1c8f62fec3f5b0824cf99
author: Johann <johannkoenig@google.com>
date: Mon Dec 12 11:47:05 EST 2016

postproc: neon down and across macroblock filter

Implement vpx_post_proc_down_and_across_mb_row in NEON.
Runs about 6-7x faster than C.

BUG=webm:1320

Change-Id: Ic5c7d3552a88cfcf999ec5bf2bd46fee460642c2
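
For context, the kernel being vectorized is a 5-tap smoothing filter gated by
a per-pixel threshold: the center pixel is replaced by a rounded average of
itself and its four neighbors only when every neighbor lies within the filter
limit. The scalar sketch below mirrors the rounding averages (vrhadd) used by
the NEON helpers added in vpx_dsp/arm/deblock_neon.c; the function name and
exact formulation are illustrative, not part of the patch.

#include <stdlib.h> /* abs() */

/* Illustrative scalar sketch of the per-pixel step that average_k_out(),
 * generate_mask() and generate_output() implement with NEON intrinsics.
 * v0 is the center pixel; a2, a1 and b1, b2 are the two neighbors on either
 * side (above/below in the "down" pass, left/right in the "across" pass). */
unsigned char filter_pixel_sketch(int a2, int a1, int v0, int b1, int b2,
                                  int flimit) {
  /* Rounding averages, matching vrhadd_u8(): roughly (a2+a1+b1+b2)/8 + v0/2. */
  const int k1 = (a2 + a1 + 1) >> 1;
  const int k2 = (b2 + b1 + 1) >> 1;
  const int k3 = (k1 + k2 + 1) >> 1;
  const int k_out = (k3 + v0 + 1) >> 1;

  /* Largest absolute difference between the center and any neighbor. */
  int max_diff = abs(a2 - v0);
  if (abs(a1 - v0) > max_diff) max_diff = abs(a1 - v0);
  if (abs(b1 - v0) > max_diff) max_diff = abs(b1 - v0);
  if (abs(b2 - v0) > max_diff) max_diff = abs(b2 - v0);

  /* Smooth only when all neighbors are within the filter limit. */
  return (unsigned char)(max_diff < flimit ? k_out : v0);
}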

--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -120,6 +120,94 @@
   vpx_free(flimits);
 };
 
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
+  // Size of the underlying data block that will be filtered.
+  // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V
+  // blocks are always a multiple of 8 wide and exactly 8 high.
+  const int block_width = 136;
+  const int block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  const int input_width = block_width;
+  const int input_height = block_height + 4;
+  const int input_stride = input_width;
+  const int input_size = input_stride * input_height;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  const int output_width = block_width + 16;
+  const int output_height = block_height;
+  const int output_stride = output_width;
+  const int output_size = output_stride * output_height;
+
+  uint8_t *const src_image = new uint8_t[input_size];
+  ASSERT_TRUE(src_image != NULL);
+
+  // Though the left padding is only 8 bytes, the assembly code tries to
+  // read 16 bytes before the pointer.
+  uint8_t *const dst_image = new uint8_t[output_size + 8];
+  ASSERT_TRUE(dst_image != NULL);
+  uint8_t *const dst_image_ref = new uint8_t[output_size + 8];
+  ASSERT_TRUE(dst_image_ref != NULL);
+
+  // Pointers to top-left pixel of block in the input and output images.
+  uint8_t *const src_image_ptr = src_image + (input_stride << 1);
+
+  // The assembly works in increments of 16. The first read may be offset by
+  // this amount.
+  uint8_t *const dst_image_ptr = dst_image + 16;
+  uint8_t *const dst_image_ref_ptr = dst_image_ref + 16;
+
+  // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock
+  // can have a different filter.
+  uint8_t *const flimits =
+      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+
+  ACMRandom rnd;
+  rnd.Reset(ACMRandom::DeterministicSeed());
+  // Initialize pixels in the input:
+  //   block pixels to random values.
+  //   border pixels to value 10.
+  (void)memset(src_image, 10, input_size);
+  uint8_t *pixel_ptr = src_image_ptr;
+  for (int i = 0; i < block_height; ++i) {
+    for (int j = 0; j < block_width; ++j) {
+      pixel_ptr[j] = rnd.Rand8();
+    }
+    pixel_ptr += input_stride;
+  }
+
+  for (int blocks = 0; blocks < block_width; blocks += 8) {
+    (void)memset(flimits, 0, sizeof(*flimits) * block_width);
+
+    for (int f = 0; f < 255; f++) {
+      (void)memset(flimits + blocks, f, sizeof(*flimits) * 8);
+
+      (void)memset(dst_image, 0, output_size);
+      (void)memset(dst_image_ref, 0, output_size);
+
+      vpx_post_proc_down_and_across_mb_row_c(
+          src_image_ptr, dst_image_ref_ptr, input_stride, output_stride,
+          block_width, flimits, block_height);
+      ASM_REGISTER_STATE_CHECK(GetParam()(src_image_ptr, dst_image_ptr,
+                                          input_stride, output_stride,
+                                          block_width, flimits, 16));
+
+      for (int i = 0; i < block_height; ++i) {
+        for (int j = 0; j < block_width; ++j) {
+          ASSERT_EQ(dst_image_ref_ptr[j + i * output_stride],
+                    dst_image_ptr[j + i * output_stride])
+              << "at (" << i << ", " << j << ")";
+        }
+      }
+    }
+  }
+
+  delete[] src_image;
+  delete[] dst_image;
+  delete[] dst_image_ref;
+  vpx_free(flimits);
+}
+
 class VpxMbPostProcAcrossIpTest
     : public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
  public:
@@ -497,8 +585,14 @@
 
 INSTANTIATE_TEST_CASE_P(SSE2, VpxMbPostProcDownTest,
                         ::testing::Values(vpx_mbpost_proc_down_sse2));
-#endif
+#endif  // HAVE_SSE2
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, VpxPostProcDownAndAcrossMbRowTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_neon));
+#endif  // HAVE_NEON
+
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(
     MSA, VpxPostProcDownAndAcrossMbRowTest,
@@ -509,6 +603,6 @@
 
 INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest,
                         ::testing::Values(vpx_mbpost_proc_down_msa));
-#endif
+#endif  // HAVE_MSA
 
 }  // namespace
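
The test drives the function through GetParam(); for orientation, here is a
hedged sketch of a direct call, with the padding requirements taken from the
comments in the test above (the wrapper name is illustrative):

#include "./vpx_dsp_rtcd.h"

/* Illustrative only: filter one 16-row luma stripe the way CheckCvsAssembly
 * sets it up. Per the test's comments, src needs 2 valid rows above and below
 * the stripe, dst needs 8 samples of padding at the left and right edges, and
 * flimits holds one limit per pixel column, set in blocks of 16 (Y) or
 * 8 (U/V). */
static void filter_luma_mb_row_sketch(unsigned char *src, unsigned char *dst,
                                      int src_stride, int dst_stride, int cols,
                                      unsigned char *flimits) {
  vpx_post_proc_down_and_across_mb_row(src, dst, src_stride, dst_stride, cols,
                                       flimits, /*size=*/16);
}
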
--- /dev/null
+++ b/vpx_dsp/arm/deblock_neon.c
@@ -1,0 +1,259 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
+                               const uint8x8_t v0, const uint8x8_t b1,
+                               const uint8x8_t b2) {
+  const uint8x8_t k1 = vrhadd_u8(a2, a1);
+  const uint8x8_t k2 = vrhadd_u8(b2, b1);
+  const uint8x8_t k3 = vrhadd_u8(k1, k2);
+  return vrhadd_u8(k3, v0);
+}
+
+static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1,
+                               const uint8x8_t v0, const uint8x8_t b1,
+                               const uint8x8_t b2, const uint8x8_t filter) {
+  const uint8x8_t a2_v0 = vabd_u8(a2, v0);
+  const uint8x8_t a1_v0 = vabd_u8(a1, v0);
+  const uint8x8_t b1_v0 = vabd_u8(b1, v0);
+  const uint8x8_t b2_v0 = vabd_u8(b2, v0);
+
+  uint8x8_t max = vmax_u8(a2_v0, a1_v0);
+  max = vmax_u8(b1_v0, max);
+  max = vmax_u8(b2_v0, max);
+  return vclt_u8(max, filter);
+}
+
+static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1,
+                                 const uint8x8_t v0, const uint8x8_t b1,
+                                 const uint8x8_t b2, const uint8x8_t filter) {
+  const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2);
+  const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter);
+
+  return vbsl_u8(mask, k_out, v0);
+}
+
+// Same functions but for uint8x16_t.
+static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1,
+                                 const uint8x16_t v0, const uint8x16_t b1,
+                                 const uint8x16_t b2) {
+  const uint8x16_t k1 = vrhaddq_u8(a2, a1);
+  const uint8x16_t k2 = vrhaddq_u8(b2, b1);
+  const uint8x16_t k3 = vrhaddq_u8(k1, k2);
+  return vrhaddq_u8(k3, v0);
+}
+
+static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1,
+                                 const uint8x16_t v0, const uint8x16_t b1,
+                                 const uint8x16_t b2, const uint8x16_t filter) {
+  const uint8x16_t a2_v0 = vabdq_u8(a2, v0);
+  const uint8x16_t a1_v0 = vabdq_u8(a1, v0);
+  const uint8x16_t b1_v0 = vabdq_u8(b1, v0);
+  const uint8x16_t b2_v0 = vabdq_u8(b2, v0);
+
+  uint8x16_t max = vmaxq_u8(a2_v0, a1_v0);
+  max = vmaxq_u8(b1_v0, max);
+  max = vmaxq_u8(b2_v0, max);
+  return vcltq_u8(max, filter);
+}
+
+static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1,
+                                   const uint8x16_t v0, const uint8x16_t b1,
+                                   const uint8x16_t b2,
+                                   const uint8x16_t filter) {
+  const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2);
+  const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter);
+
+  return vbslq_u8(mask, k_out, v0);
+}
+
+void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
+                                               uint8_t *dst_ptr, int src_stride,
+                                               int dst_stride, int cols,
+                                               uint8_t *f, int size) {
+  uint8_t *src, *dst;
+  int row;
+  int col;
+
+  // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for
+  // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8
+  // (for U/V).
+  assert((size == 8 || size == 16) && cols % 8 == 0);
+
+  // Process blocks of 16 columns while at least 16 columns remain.
+  for (col = 0; col < cols - 8; col += 16) {
+    uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
+    src = src_ptr - 2 * src_stride;
+    dst = dst_ptr;
+
+    a0 = vld1q_u8(src);
+    src += src_stride;
+    a1 = vld1q_u8(src);
+    src += src_stride;
+    a2 = vld1q_u8(src);
+    src += src_stride;
+    a3 = vld1q_u8(src);
+    src += src_stride;
+
+    for (row = 0; row < size; row += 4) {
+      uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3;
+      const uint8x16_t filterq = vld1q_u8(f + col);
+
+      a4 = vld1q_u8(src);
+      src += src_stride;
+      a5 = vld1q_u8(src);
+      src += src_stride;
+      a6 = vld1q_u8(src);
+      src += src_stride;
+      a7 = vld1q_u8(src);
+      src += src_stride;
+
+      v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq);
+      v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq);
+      v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq);
+      v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq);
+
+      vst1q_u8(dst, v_out_0);
+      dst += dst_stride;
+      vst1q_u8(dst, v_out_1);
+      dst += dst_stride;
+      vst1q_u8(dst, v_out_2);
+      dst += dst_stride;
+      vst1q_u8(dst, v_out_3);
+      dst += dst_stride;
+
+      // Rotate over to the next slot.
+      a0 = a4;
+      a1 = a5;
+      a2 = a6;
+      a3 = a7;
+    }
+
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+
+  // Handle any remaining block of 8 columns.
+  if (col != cols) {
+    uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
+    src = src_ptr - 2 * src_stride;
+    dst = dst_ptr;
+
+    a0 = vld1_u8(src);
+    src += src_stride;
+    a1 = vld1_u8(src);
+    src += src_stride;
+    a2 = vld1_u8(src);
+    src += src_stride;
+    a3 = vld1_u8(src);
+    src += src_stride;
+
+    for (row = 0; row < size; row += 4) {
+      uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3;
+      const uint8x8_t filter = vld1_u8(f + col);
+
+      a4 = vld1_u8(src);
+      src += src_stride;
+      a5 = vld1_u8(src);
+      src += src_stride;
+      a6 = vld1_u8(src);
+      src += src_stride;
+      a7 = vld1_u8(src);
+      src += src_stride;
+
+      v_out_0 = generate_output(a0, a1, a2, a3, a4, filter);
+      v_out_1 = generate_output(a1, a2, a3, a4, a5, filter);
+      v_out_2 = generate_output(a2, a3, a4, a5, a6, filter);
+      v_out_3 = generate_output(a3, a4, a5, a6, a7, filter);
+
+      vst1_u8(dst, v_out_0);
+      dst += dst_stride;
+      vst1_u8(dst, v_out_1);
+      dst += dst_stride;
+      vst1_u8(dst, v_out_2);
+      dst += dst_stride;
+      vst1_u8(dst, v_out_3);
+      dst += dst_stride;
+
+      // Rotate over to the next slot.
+      a0 = a4;
+      a1 = a5;
+      a2 = a6;
+      a3 = a7;
+    }
+
+    // Not strictly necessary but makes resetting dst_ptr easier.
+    dst_ptr += 8;
+  }
+
+  dst_ptr -= cols;
+
+  for (row = 0; row < size; row += 8) {
+    uint8x8_t a0, a1, a2, a3;
+    uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+    src = dst_ptr;
+    dst = dst_ptr;
+
+    // Load 8 rows and transpose the first 4 columns. Columns 2 and 3 are
+    // discarded because they will be reloaded by the 8x8 loads below.
+    load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3);
+    a3 = a1;
+    a2 = a1 = a0;  // Extend left border.
+
+    src += 2;
+
+    for (col = 0; col < cols; col += 8) {
+      uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6,
+          v_out_7;
+      // Although the filter is meant to be applied vertically, applying it
+      // horizontally here is OK because the limits are constant within each
+      // block of 8 (or 16) columns.
+      const uint8x8_t filter = vld1_u8(f + col);
+
+      load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5,
+                                &b6, &b7);
+
+      if (col + 8 == cols) {
+        // Last block of columns. Extend the right border (b5).
+        b6 = b7 = b5;
+      }
+
+      v_out_0 = generate_output(a0, a1, a2, a3, b0, filter);
+      v_out_1 = generate_output(a1, a2, a3, b0, b1, filter);
+      v_out_2 = generate_output(a2, a3, b0, b1, b2, filter);
+      v_out_3 = generate_output(a3, b0, b1, b2, b3, filter);
+      v_out_4 = generate_output(b0, b1, b2, b3, b4, filter);
+      v_out_5 = generate_output(b1, b2, b3, b4, b5, filter);
+      v_out_6 = generate_output(b2, b3, b4, b5, b6, filter);
+      v_out_7 = generate_output(b3, b4, b5, b6, b7, filter);
+
+      transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2,
+                                 v_out_3, v_out_4, v_out_5, v_out_6, v_out_7);
+
+      a0 = b4;
+      a1 = b5;
+      a2 = b6;
+      a3 = b7;
+
+      src += 8;
+      dst += 8;
+    }
+
+    dst_ptr += 8 * dst_stride;
+  }
+}
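
The second loop above handles the "across" direction by transposing 8x8 tiles
so the same vertical kernel can be reused, extending the left and right
borders by duplicating the edge columns. As a plain-C illustration of what one
row ends up receiving from that pass (not part of the patch, and unlike the
in-place NEON code it reads from a separate source buffer for clarity):

/* Scalar kernel sketched near the top of this page (illustrative only). */
unsigned char filter_pixel_sketch(int a2, int a1, int v0, int b1, int b2,
                                  int flimit);

/* Illustrative scalar equivalent of the "across" pass for a single row.
 * The index clamping mirrors the border handling above
 * (a2 = a1 = a0 on the left, b6 = b7 = b5 on the right). */
static void filter_row_across_sketch(const unsigned char *src_row,
                                     unsigned char *dst_row, int cols,
                                     const unsigned char *flimits) {
  int col;
  for (col = 0; col < cols; ++col) {
    const int l2 = col - 2 < 0 ? 0 : col - 2;
    const int l1 = col - 1 < 0 ? 0 : col - 1;
    const int r1 = col + 1 > cols - 1 ? cols - 1 : col + 1;
    const int r2 = col + 2 > cols - 1 ? cols - 1 : col + 2;
    dst_row[col] = filter_pixel_sketch(src_row[l2], src_row[l1], src_row[col],
                                       src_row[r1], src_row[r2], flimits[col]);
  }
}
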
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -179,6 +179,62 @@
   *a1 = d0.val[1];
 }
 
+static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+                                    uint8x8_t *a3, const uint8x8_t a4,
+                                    const uint8x8_t a5, const uint8x8_t a6,
+                                    const uint8x8_t a7) {
+  // Swap 32 bit elements. Goes from:
+  // a0: 00 01 02 03 XX XX XX XX
+  // a1: 10 11 12 13 XX XX XX XX
+  // a2: 20 21 22 23 XX XX XX XX
+  // a3: 30 31 32 33 XX XX XX XX
+  // a4: 40 41 42 43 XX XX XX XX
+  // a5: 50 51 52 53 XX XX XX XX
+  // a6: 60 61 62 63 XX XX XX XX
+  // a7: 70 71 72 73 XX XX XX XX
+  // to:
+  // b0.val[0]: 00 01 02 03 40 41 42 43
+  // b1.val[0]: 10 11 12 13 50 51 52 53
+  // b2.val[0]: 20 21 22 23 60 61 62 63
+  // b3.val[0]: 30 31 32 33 70 71 72 73
+
+  const uint32x2x2_t b0 =
+      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+  const uint32x2x2_t b1 =
+      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+  const uint32x2x2_t b2 =
+      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+  const uint32x2x2_t b3 =
+      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 01 20 21 40 41 60 61
+  // c0.val[1]: 02 03 22 23 42 43 62 63
+  // c1.val[0]: 10 11 30 31 50 51 70 71
+  // c1.val[1]: 12 13 32 33 52 53 72 73
+
+  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+                                   vreinterpret_u16_u32(b2.val[0]));
+  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+                                   vreinterpret_u16_u32(b3.val[0]));
+
+  // Swap 8 bit elements resulting in:
+  // d0.val[0]: 00 10 20 30 40 50 60 70
+  // d0.val[1]: 01 11 21 31 41 51 61 71
+  // d1.val[0]: 02 12 22 32 42 52 62 72
+  // d1.val[1]: 03 13 23 33 43 53 63 73
+
+  const uint8x8x2_t d0 =
+      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+  const uint8x8x2_t d1 =
+      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+  *a0 = d0.val[0];
+  *a1 = d0.val[1];
+  *a2 = d1.val[0];
+  *a3 = d1.val[1];
+}
+
 static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                      int32x4_t *a2, int32x4_t *a3) {
   // Swap 32 bit elements. Goes from:
@@ -936,11 +992,85 @@
   *o15 = e7.val[1];
 }
 
-static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride,
-                                              int16x8_t *a0, int16x8_t *a1,
-                                              int16x8_t *a2, int16x8_t *a3,
-                                              int16x8_t *a4, int16x8_t *a5,
-                                              int16x8_t *a6, int16x8_t *a7) {
+static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
+                                             const int a_stride, uint8x8_t *a0,
+                                             uint8x8_t *a1, uint8x8_t *a2,
+                                             uint8x8_t *a3) {
+  uint8x8_t a4, a5, a6, a7;
+  *a0 = vld1_u8(a);
+  a += a_stride;
+  *a1 = vld1_u8(a);
+  a += a_stride;
+  *a2 = vld1_u8(a);
+  a += a_stride;
+  *a3 = vld1_u8(a);
+  a += a_stride;
+  a4 = vld1_u8(a);
+  a += a_stride;
+  a5 = vld1_u8(a);
+  a += a_stride;
+  a6 = vld1_u8(a);
+  a += a_stride;
+  a7 = vld1_u8(a);
+
+  transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
+                                             const int a_stride, uint8x8_t *a0,
+                                             uint8x8_t *a1, uint8x8_t *a2,
+                                             uint8x8_t *a3, uint8x8_t *a4,
+                                             uint8x8_t *a5, uint8x8_t *a6,
+                                             uint8x8_t *a7) {
+  *a0 = vld1_u8(a);
+  a += a_stride;
+  *a1 = vld1_u8(a);
+  a += a_stride;
+  *a2 = vld1_u8(a);
+  a += a_stride;
+  *a3 = vld1_u8(a);
+  a += a_stride;
+  *a4 = vld1_u8(a);
+  a += a_stride;
+  *a5 = vld1_u8(a);
+  a += a_stride;
+  *a6 = vld1_u8(a);
+  a += a_stride;
+  *a7 = vld1_u8(a);
+
+  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
+}
+
+static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
+                                              uint8x8_t a0, uint8x8_t a1,
+                                              uint8x8_t a2, uint8x8_t a3,
+                                              uint8x8_t a4, uint8x8_t a5,
+                                              uint8x8_t a6, uint8x8_t a7) {
+  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+  vst1_u8(a, a0);
+  a += a_stride;
+  vst1_u8(a, a1);
+  a += a_stride;
+  vst1_u8(a, a2);
+  a += a_stride;
+  vst1_u8(a, a3);
+  a += a_stride;
+  vst1_u8(a, a4);
+  a += a_stride;
+  vst1_u8(a, a5);
+  a += a_stride;
+  vst1_u8(a, a6);
+  a += a_stride;
+  vst1_u8(a, a7);
+}
+
+static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
+                                              const int a_stride, int16x8_t *a0,
+                                              int16x8_t *a1, int16x8_t *a2,
+                                              int16x8_t *a3, int16x8_t *a4,
+                                              int16x8_t *a5, int16x8_t *a6,
+                                              int16x8_t *a7) {
   *a0 = vld1q_s16(a);
   a += a_stride;
   *a1 = vld1q_s16(a);
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -57,6 +57,7 @@
 DSP_SRCS-yes += postproc.h
 DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c
 DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c
+DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c
 DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm
 endif # CONFIG_POSTPROC
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1756,7 +1756,7 @@
     specialize qw/vpx_mbpost_proc_across_ip sse2 msa/;
 
     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
-    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/;
+    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
 
 }
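
Adding neon to the specialize list above makes the generated vpx_dsp_rtcd.h
pick the NEON version at run time when the CPU reports NEON support. The
snippet below is only a sketch of that dispatch under the assumption of
runtime CPU detection; the real header is generated by build/make/rtcd.pl and
uses its own macros.

#include "./vpx_dsp_rtcd.h" /* prototypes for the _c and _neon versions */
#include "vpx_ports/arm.h"  /* arm_cpu_caps(), HAS_NEON */

/* Function-pointer dispatch sketch (assumption, not the generated code). */
static void (*post_proc_down_and_across_mb_row_ptr)(
    unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch,
    int cols, unsigned char *flimits, int size);

static void setup_post_proc_dispatch_sketch(void) {
  post_proc_down_and_across_mb_row_ptr = vpx_post_proc_down_and_across_mb_row_c;
  if (arm_cpu_caps() & HAS_NEON) {
    post_proc_down_and_across_mb_row_ptr =
        vpx_post_proc_down_and_across_mb_row_neon;
  }
}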