ref: cd3d7cf4acd47f13e65ba5d644aafe15fd340590
parent: ba8bfaafa7df4807626583bca6ed149068b91ba0
parent: 405b94c661563d3ebcf57751da3b83ca943d2bcf
author: Johann Koenig <johannkoenig@google.com>
date: Thu Mar 16 17:52:15 EDT 2017
Merge "Add Hadamard for Power8"
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -13,6 +13,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_timer.h"
#include "test/acm_random.h"
#include "test/register_state_check.h"
@@ -99,8 +100,31 @@
ACMRandom rnd_;
};
+// Benchmark helper. Calls |func| with (|input|, |stride|, |output|) |times|
+// times back to back, then prints the elapsed time reported by the
+// vpx_usec_timer, labelled with |name| and the run count.
+void HadamardSpeedTest(const char *name, HadamardFunc const func,
+                       const int16_t *input, int stride, tran_low_t *output,
+                       int times) {
+  vpx_usec_timer stopwatch;
+  vpx_usec_timer_start(&stopwatch);
+  for (int run = 0; run < times; ++run) {
+    func(input, stride, output);
+  }
+  vpx_usec_timer_mark(&stopwatch);
+
+  // Cast so the value matches the %d conversion below.
+  const int total_us = static_cast<int>(vpx_usec_timer_elapsed(&stopwatch));
+  printf("%s[%12d runs]: %d us\n", name, times, total_us);
+}
+
class Hadamard8x8Test : public HadamardTestBase {};
+// Times |times| calls of |func| on a local 8x8 block (stride 8).
+void HadamardSpeedTest8x8(HadamardFunc const func, int times) {
+  DECLARE_ALIGNED(16, int16_t, src[8 * 8]);
+  DECLARE_ALIGNED(16, tran_low_t, dst[8 * 8]);
+  // Fill every byte of the source with 1 so the transform reads initialized,
+  // non-zero data.
+  memset(src, 1, sizeof(src));
+  HadamardSpeedTest("Hadamard8x8", func, src, 8, dst, times);
+}
+
TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
DECLARE_ALIGNED(16, int16_t, a[64]);
DECLARE_ALIGNED(16, tran_low_t, b[64]);
@@ -142,6 +166,12 @@
}
}
+// Timing run for the 8x8 transform at three run counts. The DISABLED_ prefix
+// keeps it out of normal test runs.
+TEST_P(Hadamard8x8Test, DISABLED_Speed) {
+  const int run_counts[3] = { 10, 10000, 10000000 };
+  for (int i = 0; i < 3; ++i) {
+    HadamardSpeedTest8x8(h_func_, run_counts[i]);
+  }
+}
+
INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_c));
@@ -169,8 +199,20 @@
#endif // HAVE_MSA
#endif // !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_VSX  // Also run the 8x8 tests against vpx_hadamard_8x8_vsx.
+INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test,
+ ::testing::Values(&vpx_hadamard_8x8_vsx));
+#endif // HAVE_VSX
+
class Hadamard16x16Test : public HadamardTestBase {};
+// Times |times| calls of |func| on a local 16x16 block (stride 16).
+void HadamardSpeedTest16x16(HadamardFunc const func, int times) {
+  DECLARE_ALIGNED(16, int16_t, src[16 * 16]);
+  DECLARE_ALIGNED(16, tran_low_t, dst[16 * 16]);
+  // Fill every byte of the source with 1 so the transform reads initialized,
+  // non-zero data.
+  memset(src, 1, sizeof(src));
+  HadamardSpeedTest("Hadamard16x16", func, src, 16, dst, times);
+}
+
TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]);
@@ -212,6 +254,12 @@
}
}
+// Timing run for the 16x16 transform at three run counts. The DISABLED_
+// prefix keeps it out of normal test runs.
+TEST_P(Hadamard16x16Test, DISABLED_Speed) {
+  const int run_counts[3] = { 10, 10000, 10000000 };
+  for (int i = 0; i < 3; ++i) {
+    HadamardSpeedTest16x16(h_func_, run_counts[i]);
+  }
+}
+
INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_c));
@@ -219,6 +267,11 @@
INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
::testing::Values(&vpx_hadamard_16x16_sse2));
#endif // HAVE_SSE2
+
+#if HAVE_VSX  // Also run the 16x16 tests against vpx_hadamard_16x16_vsx.
+INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test,
+ ::testing::Values(&vpx_hadamard_16x16_vsx));
+#endif // HAVE_VSX
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
--- /dev/null
+++ b/vpx_dsp/ppc/bitdepth_conversion_vsx.h
@@ -1,0 +1,47 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Read eight coefficients from |s| (offset |c| is forwarded to vec_vsx_ld)
+// into a vector of 16 bit lanes. When tran_low_t is the 32 bit
+// high-bitdepth type, two 4-lane loads are narrowed with vec_packs, which
+// saturates values that do not fit in 16 bits.
+static INLINE int16x8_t load_tran_low(int32_t c, const tran_low_t *s) {
+#if !CONFIG_VP9_HIGHBITDEPTH
+  return vec_vsx_ld(c, s);
+#else
+  const int32x4_t first_four = vec_vsx_ld(c, s);
+  const int32x4_t last_four = vec_vsx_ld(c, s + 4);
+  return vec_packs(first_four, last_four);
+#endif
+}
+
+// Write the eight 16 bit lanes of |v| to |s| (offset |c| is forwarded to
+// vec_vsx_st). When tran_low_t is 32 bits wide, each lane is first widened
+// with sign extension by multiplying it by 1.
+static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) {
+#if !CONFIG_VP9_HIGHBITDEPTH
+  vec_vsx_st(v, c, s);
+#else
+  const int16x8_t unit = vec_splat_s16(1);
+  // The even and odd lanes are widened separately, then merged back
+  // together so the two stores see the elements in their original order.
+  const int32x4_t even_lanes = vec_mule(v, unit);
+  const int32x4_t odd_lanes = vec_mulo(v, unit);
+  vec_vsx_st(vec_mergeh(even_lanes, odd_lanes), c, s);
+  vec_vsx_st(vec_mergel(even_lanes, odd_lanes), c, s + 4);
+#endif
+}
+
+#endif // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_
--- /dev/null
+++ b/vpx_dsp/ppc/hadamard_vsx.c
@@ -1,0 +1,119 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+#include "vpx_dsp/ppc/transpose_vsx.h"
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+
+/* One in-place pass of three add/subtract butterfly stages over v[0]..v[7].
+ * Every operation works on whole vectors, so all lanes of each vector are
+ * processed together. */
+static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) {
+  /* Stage 1: sum and difference of each adjacent pair of inputs. */
+  const int16x8_t p0 = vec_add(v[0], v[1]);
+  const int16x8_t p1 = vec_sub(v[0], v[1]);
+  const int16x8_t p2 = vec_add(v[2], v[3]);
+  const int16x8_t p3 = vec_sub(v[2], v[3]);
+  const int16x8_t p4 = vec_add(v[4], v[5]);
+  const int16x8_t p5 = vec_sub(v[4], v[5]);
+  const int16x8_t p6 = vec_add(v[6], v[7]);
+  const int16x8_t p7 = vec_sub(v[6], v[7]);
+
+  /* Stage 2: combine stage 1 results two apart, separately for the first
+   * four and the last four values. */
+  const int16x8_t q0 = vec_add(p0, p2);
+  const int16x8_t q1 = vec_add(p1, p3);
+  const int16x8_t q2 = vec_sub(p0, p2);
+  const int16x8_t q3 = vec_sub(p1, p3);
+  const int16x8_t q4 = vec_add(p4, p6);
+  const int16x8_t q5 = vec_add(p5, p7);
+  const int16x8_t q6 = vec_sub(p4, p6);
+  const int16x8_t q7 = vec_sub(p5, p7);
+
+  /* Stage 3: combine the two halves and write the results back into v.
+   * Note that the output order is not a plain sum/difference interleave. */
+  v[0] = vec_add(q0, q4);
+  v[1] = vec_sub(q2, q6);
+  v[2] = vec_sub(q0, q4);
+  v[3] = vec_add(q2, q6);
+  v[4] = vec_add(q3, q7);
+  v[5] = vec_sub(q3, q7);
+  v[6] = vec_sub(q1, q5);
+  v[7] = vec_add(q1, q5);
+}
+
+/* 8x8 Hadamard transform. Loads eight rows of eight int16_t values from
+ * |src_diff| (consecutive rows are |src_stride| elements apart), applies a
+ * butterfly pass, transposes, applies the pass again, and stores the 64
+ * results contiguously at |coeff|. */
+void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride,
+                          tran_low_t *coeff) {
+  int16x8_t rows[8];
+  int r;
+
+  for (r = 0; r < 8; ++r) {
+    rows[r] = vec_vsx_ld(0, src_diff + (r * src_stride));
+  }
+
+  vpx_hadamard_s16_8x8_one_pass(rows);
+  vpx_transpose_s16_8x8(rows);
+  vpx_hadamard_s16_8x8_one_pass(rows);
+
+  /* Output rows are packed back to back: row r starts at coeff + 8 * r. */
+  for (r = 0; r < 8; ++r) {
+    store_tran_low(rows[r], 0, coeff + 8 * r);
+  }
+}
+
+/* 16x16 Hadamard transform built from four 8x8 transforms followed by a
+ * combining step. Reads a 16x16 block of int16_t from |src_diff| (rows are
+ * |src_stride| elements apart) and writes 256 results to |coeff|. */
+void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride,
+                            tran_low_t *coeff) {
+  const uint16x8_t shift_by_one = vec_splat_u16(1);
+  int quad;
+  int offset;
+
+  /* Transform each 8x8 quadrant on its own. Quadrant 0 is top left, 1 is top
+   * right, 2 is bottom left and 3 is bottom right; quadrant q is written,
+   * with the stride removed, to the 64 values starting at coeff + 64 * q. */
+  for (quad = 0; quad < 4; ++quad) {
+    const int16_t *const block =
+        src_diff + 8 * (quad & 1) + 8 * (quad >> 1) * src_stride;
+    vpx_hadamard_8x8_vsx(block, src_stride, coeff + 64 * quad);
+  }
+
+  /* Overlay the four quadrants and combine them, eight values at a time. */
+  for (offset = 0; offset < 64; offset += 8) {
+    tran_low_t *const q0 = coeff + offset;
+    tran_low_t *const q1 = q0 + 64;
+    tran_low_t *const q2 = q0 + 128;
+    tran_low_t *const q3 = q0 + 192;
+
+    /* Halve every input with an arithmetic shift so that the sums below do
+     * not escape int16_t. All four loads happen before any store. */
+    const int16x8_t h0 = vec_sra(load_tran_low(0, q0), shift_by_one);
+    const int16x8_t h1 = vec_sra(load_tran_low(0, q1), shift_by_one);
+    const int16x8_t h2 = vec_sra(load_tran_low(0, q2), shift_by_one);
+    const int16x8_t h3 = vec_sra(load_tran_low(0, q3), shift_by_one);
+
+    const int16x8_t sum01 = vec_add(h0, h1);
+    const int16x8_t diff01 = vec_sub(h0, h1);
+    const int16x8_t sum23 = vec_add(h2, h3);
+    const int16x8_t diff23 = vec_sub(h2, h3);
+
+    store_tran_low(vec_add(sum01, sum23), 0, q0);
+    store_tran_low(vec_add(diff01, diff23), 0, q1);
+    store_tran_low(vec_sub(sum01, sum23), 0, q2);
+    store_tran_low(vec_sub(diff01, diff23), 0, q3);
+  }
+}
--- /dev/null
+++ b/vpx_dsp/ppc/transpose_vsx.h
@@ -1,0 +1,101 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_
+#define VPX_DSP_PPC_TRANSPOSE_VSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) {
+ // Transpose, in place, the 8x8 matrix whose rows are v[0]..v[7], using
+ // three rounds of merges. The diagrams below track every element.
+ //
+ // d = vec_mergeh(a,b):
+ // Interleaves the high elements of a and b: the even elements of d come
+ // left-to-right from a, the odd elements of d come left-to-right from b.
+ //
+ // d = vec_mergel(a,b):
+ // Interleaves the low elements of a and b: the even elements of d come
+ // left-to-right from a, the odd elements of d come left-to-right from b.
+ //
+
+ // Example, with each element written as <row><column> of the input:
+ // v[0]: 00 01 02 03 04 05 06 07
+ // v[1]: 10 11 12 13 14 15 16 17
+ // v[2]: 20 21 22 23 24 25 26 27
+ // v[3]: 30 31 32 33 34 35 36 37
+ // v[4]: 40 41 42 43 44 45 46 47
+ // v[5]: 50 51 52 53 54 55 56 57
+ // v[6]: 60 61 62 63 64 65 66 67
+ // v[7]: 70 71 72 73 74 75 76 77
+
+ int16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+ int16x8_t c0, c1, c2, c3, c4, c5, c6, c7;
+
+ b0 = vec_mergeh(v[0], v[4]);
+ b1 = vec_mergel(v[0], v[4]);
+ b2 = vec_mergeh(v[1], v[5]);
+ b3 = vec_mergel(v[1], v[5]);
+ b4 = vec_mergeh(v[2], v[6]);
+ b5 = vec_mergel(v[2], v[6]);
+ b6 = vec_mergeh(v[3], v[7]);
+ b7 = vec_mergel(v[3], v[7]);
+
+ // After the first merge operation (rows k and k + 4 interleaved):
+ // b0: 00 40 01 41 02 42 03 43
+ // b1: 04 44 05 45 06 46 07 47
+ // b2: 10 50 11 51 12 52 13 53
+ // b3: 14 54 15 55 16 56 17 57
+ // b4: 20 60 21 61 22 62 23 63
+ // b5: 24 64 25 65 26 66 27 67
+ // b6: 30 70 31 71 32 72 33 73
+ // b7: 34 74 35 75 36 76 37 77
+
+ c0 = vec_mergeh(b0, b4);
+ c1 = vec_mergel(b0, b4);
+ c2 = vec_mergeh(b1, b5);
+ c3 = vec_mergel(b1, b5);
+ c4 = vec_mergeh(b2, b6);
+ c5 = vec_mergel(b2, b6);
+ c6 = vec_mergeh(b3, b7);
+ c7 = vec_mergel(b3, b7);
+
+ // After the second merge operation (four input rows per vector):
+ // c0: 00 20 40 60 01 21 41 61
+ // c1: 02 22 42 62 03 23 43 63
+ // c2: 04 24 44 64 05 25 45 65
+ // c3: 06 26 46 66 07 27 47 67
+ // c4: 10 30 50 70 11 31 51 71
+ // c5: 12 32 52 72 13 33 53 73
+ // c6: 14 34 54 74 15 35 55 75
+ // c7: 16 36 56 76 17 37 57 77
+
+ v[0] = vec_mergeh(c0, c4);
+ v[1] = vec_mergel(c0, c4);
+ v[2] = vec_mergeh(c1, c5);
+ v[3] = vec_mergel(c1, c5);
+ v[4] = vec_mergeh(c2, c6);
+ v[5] = vec_mergel(c2, c6);
+ v[6] = vec_mergeh(c3, c7);
+ v[7] = vec_mergel(c3, c7);
+
+ // After the last merge operation, v[k] holds column k of the input:
+ // v[0]: 00 10 20 30 40 50 60 70
+ // v[1]: 01 11 21 31 41 51 61 71
+ // v[2]: 02 12 22 32 42 52 62 72
+ // v[3]: 03 13 23 33 43 53 63 73
+ // v[4]: 04 14 24 34 44 54 64 74
+ // v[5]: 05 15 25 35 45 55 65 75
+ // v[6]: 06 16 26 36 46 56 66 76
+ // v[7]: 07 17 27 37 47 57 67 77
+}
+
+#endif // VPX_DSP_PPC_TRANSPOSE_VSX_H_
--- /dev/null
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -1,0 +1,20 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PPC_TYPES_VSX_H_
+#define VPX_DSP_PPC_TYPES_VSX_H_
+
+#include <altivec.h>
+
+typedef vector signed short int16x8_t;  // eight signed 16 bit lanes
+typedef vector unsigned short uint16x8_t;  // eight unsigned 16 bit lanes
+typedef vector signed int int32x4_t;  // four signed 32 bit lanes
+
+#endif // VPX_DSP_PPC_TYPES_VSX_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -264,11 +264,12 @@
DSP_SRCS-yes += avg.c
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
-DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
+DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
+DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c
endif # CONFIG_VP9_ENCODER
@@ -336,6 +337,11 @@
# Neon utilities
DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
+
+# PPC VSX utilities
+DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h
+DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h
+DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -908,22 +908,21 @@
add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vpx_minmax_8x8 sse2 neon msa/;
-
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
+ specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon/;
+ specialize qw/vpx_hadamard_16x16 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
specialize qw/vpx_satd sse2 neon/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
+ specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
+ specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd sse2 neon msa/;