ref: 0f751ecee314b90a551bc6b7fc09a40d20a3eab6
parent: 1eb8a718bfe08dd6e6ffdfff2e4613c4b0d4d7d6
author: Johann <johannkoenig@google.com>
date: Tue Jan 31 03:16:19 EST 2017
hadamard highbd ssse3: use tran_low_t for coeff BUG=webm:1365 Change-Id: I374dfc08732932382043905f128e928b08cb4f57
--- a/test/hadamard_test.cc
+++ b/test/hadamard_test.cc
@@ -150,14 +150,10 @@
::testing::Values(&vpx_hadamard_8x8_sse2));
#endif // HAVE_SSE2
-// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
-// in place and turn on the unit test.
-#if !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSSE3 && ARCH_X86_64
INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
::testing::Values(&vpx_hadamard_8x8_ssse3));
#endif // HAVE_SSSE3 && ARCH_X86_64
-#endif // !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
@@ -164,6 +160,8 @@
::testing::Values(&vpx_hadamard_8x8_neon));
#endif // HAVE_NEON
+// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
+// in place and turn on the unit test.
#if !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test,
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -888,7 +888,7 @@
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon/;
+ specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
specialize qw/vpx_hadamard_16x16 sse2 neon/;
--- a/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -8,8 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
-%define private_prefix vpx
-
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -96,6 +94,21 @@
SWAP 7, 9
%endmacro
+%if CONFIG_VP9_HIGHBITDEPTH
+; sign-extend the eight 16-bit words of m%1 to 32 bits and store the resulting 32 bytes at outputq + %2
+; uses m8-m10 as scratch registers
+%macro STORE_TRAN_LOW 2
+ pxor m8, m8 ; m8 = 0
+ mova m9, m%1 ; copy of the source for the low four words
+ mova m10, m%1 ; copy of the source for the high four words
+ pcmpgtw m8, m%1 ; m8 = per-word sign mask: 0xFFFF where the source word is negative, else 0
+ punpcklwd m9, m8 ; interleave low words with their sign masks -> four sign-extended dwords
+ punpckhwd m10, m8 ; same for the high four words
+ mova [outputq + %2], m9 ; first 16 bytes: elements 0-3
+ mova [outputq + %2 + 16], m10 ; next 16 bytes: elements 4-7
+%endmacro
+%endif
+
INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 11, input, stride, output
lea r3, [2 * strideq]
@@ -117,6 +130,16 @@
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
HMD8_1D
+%if CONFIG_VP9_HIGHBITDEPTH
+ STORE_TRAN_LOW 0, 0
+ STORE_TRAN_LOW 1, 32
+ STORE_TRAN_LOW 2, 64
+ STORE_TRAN_LOW 3, 96
+ STORE_TRAN_LOW 4, 128
+ STORE_TRAN_LOW 5, 160
+ STORE_TRAN_LOW 6, 192
+ STORE_TRAN_LOW 7, 224
+%else
mova [outputq + 0], m0
mova [outputq + 16], m1
mova [outputq + 32], m2
@@ -125,6 +148,7 @@
mova [outputq + 80], m5
mova [outputq + 96], m6
mova [outputq + 112], m7
+%endif
RET
%endif