shithub: libvpx

--- a/test/avg_test.cc

+++ b/test/avg_test.cc

@@ -446,9 +446,6 @@

                                           make_tuple(256, &vpx_satd_sse2),

                                           make_tuple(1024, &vpx_satd_sse2)));

-// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are

-// in place.

-#if !CONFIG_VP9_HIGHBITDEPTH

 INSTANTIATE_TEST_CASE_P(

     SSE2, BlockErrorTest,

     ::testing::Values(make_tuple(16, &vp9_block_error_fp_sse2),

@@ -455,7 +452,6 @@

                       make_tuple(64, &vp9_block_error_fp_sse2),

                       make_tuple(256, &vp9_block_error_fp_sse2),

                       make_tuple(1024, &vp9_block_error_fp_sse2)));

-#endif  // !CONFIG_VP9_HIGHBITDEPTH

 #endif  // HAVE_SSE2

 #if HAVE_NEON

--- a/vp9/common/vp9_rtcd_defs.pl

+++ b/vp9/common/vp9_rtcd_defs.pl

@@ -133,7 +133,7 @@

   specialize qw/vp9_highbd_block_error_8bit sse2 avx/;

   add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";

-  specialize qw/vp9_block_error_fp/;

+  specialize qw/vp9_block_error_fp sse2/;

   add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";

--- a/vp9/encoder/x86/vp9_error_sse2.asm

+++ b/vp9/encoder/x86/vp9_error_sse2.asm

@@ -11,9 +11,12 @@

 %define private_prefix vp9

 %include "third_party/x86inc/x86inc.asm"

+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION .text

+%if CONFIG_VP9_HIGHBITDEPTH

+%else

 ; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,

 ;                         int64_t *ssz)

@@ -74,9 +77,11 @@

   movd    edx, m5

 %endif

RET

+%endif  ; CONFIG_VP9_HIGHBITDEPTH

-; Compute the sum of squared difference between two int16_t vectors.

-; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,

+; Compute the sum of squared differences between two tran_low_t vectors.

+; Vectors are converted (if necessary) to int16_t for calculations.

+; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,

 ;                            intptr_t block_size)

 INIT_XMM sse2

@@ -83,14 +88,14 @@

 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size

   pxor      m4, m4                 ; sse accumulator

   pxor      m5, m5                 ; dedicated zero register

-  lea     uqcq, [uqcq+sizeq*2]

-  lea     dqcq, [dqcq+sizeq*2]

-  neg    sizeq

 .loop:

-  mova      m2, [uqcq+sizeq*2]

-  mova      m0, [dqcq+sizeq*2]

-  mova      m3, [uqcq+sizeq*2+mmsize]

-  mova      m1, [dqcq+sizeq*2+mmsize]

+  LOAD_TRAN_LOW 2, uqcq, 0         ; macro not defined in this file; presumably m2 <- coeff as int16_t

+  LOAD_TRAN_LOW 0, dqcq, 0         ; presumably m0 <- dqcoeff, same lanes as m2

+  LOAD_TRAN_LOW 3, uqcq, 1         ; presumably m3 <- next register's worth of coeff

+  LOAD_TRAN_LOW 1, dqcq, 1         ; presumably m1 <- dqcoeff, same lanes as m3

+  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16   ; step both pointers past the 16 elements

+  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16   ; of this iteration (macro defined elsewhere)

+  sub    sizeq, 16                 ; ZF feeds jnz .loop below (TODO confirm flags survive); exits only if size is a nonzero multiple of 16

   psubw     m0, m2

   psubw     m1, m3

   ; individual errors are max. 15bit+sign, so squares are 30bit, and

@@ -106,8 +111,7 @@

   punpckhdq m1, m5

   paddq     m4, m3

   paddq     m4, m1

-  add    sizeq, mmsize

-  jl .loop

+  jnz .loop

   ; accumulate horizontally and store in return value

   movhlps   m5, m4

--- a/vp9/vp9cx.mk

+++ b/vp9/vp9cx.mk

@@ -107,11 +107,10 @@

 endif

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm

+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm

 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm

 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm

-else

-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm

 endif

 ifeq ($(ARCH_X86_64),yes)

--

⑨