shithub: libvpx

--- a/libs.mk

+++ b/libs.mk

@@ -149,6 +149,7 @@

 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c

 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)

 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm

+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm

 endif

 CODEC_EXPORTS-yes += vpx/exports_com

 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc

@@ -204,6 +205,7 @@

     third_party/x86inc/x86inc.asm \

     vpx_config.asm \

     vpx_ports/x86_abi_support.asm \

+    vpx_dsp/x86/bitdepth_conversion_sse2.asm \

 vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def

 	@echo "    [CREATE] $@"

--- a/vp9/encoder/x86/vp9_dct_sse2.asm

+++ b/vp9/encoder/x86/vp9_dct_sse2.asm

@@ -11,6 +11,7 @@

 %define private_prefix vp9

 %include "third_party/x86inc/x86inc.asm"

+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION .text

@@ -62,25 +63,7 @@

   psllw           m0,        2

   psllw           m1,        2

-%if CONFIG_VP9_HIGHBITDEPTH

-  ; sign extension

-  mova            m2,             m0

-  mova            m3,             m1

-  punpcklwd       m0,             m0

-  punpcklwd       m1,             m1

-  punpckhwd       m2,             m2

-  punpckhwd       m3,             m3

-  psrad           m0,             16

-  psrad           m1,             16

-  psrad           m2,             16

-  psrad           m3,             16

-  mova            [outputq],      m0

-  mova            [outputq + 16], m2

-  mova            [outputq + 32], m1

-  mova            [outputq + 48], m3

-%else

-  mova            [outputq],      m0

-  mova            [outputq + 16], m1

-%endif

+  STORE_TRAN_LOW 0, outputq, 0, 2, 3

+  STORE_TRAN_LOW 1, outputq, 1, 2, 3

RET

--- a/vp9/encoder/x86/vp9_dct_ssse3.c

+++ b/vp9/encoder/x86/vp9_dct_ssse3.c

@@ -14,7 +14,7 @@

 #include "./vp9_rtcd.h"

 #include "./vpx_config.h"

 #include "vpx_dsp/vpx_dsp_common.h"

-#include "vpx_dsp/x86/fdct.h"

+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"

 #include "vpx_dsp/x86/inv_txfm_sse2.h"

 #include "vpx_dsp/x86/txfm_common_sse2.h"

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -13,6 +13,12 @@

 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h

+DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2.h

+# This file is included in libs.mk. Including it here would cause it to be

+# compiled into an object. Even as an empty file, this would create an

+# executable section on the stack.

+#DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2$(ASM)

 # bit reader

 DSP_SRCS-yes += prob.h

 DSP_SRCS-yes += prob.c

@@ -245,7 +251,6 @@

 DSP_SRCS-yes            += quantize.c

 DSP_SRCS-yes            += quantize.h

-DSP_SRCS-$(HAVE_SSE2)   += x86/fdct.h

 DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c

 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c

--- a/vpx_dsp/x86/avg_intrin_sse2.c

+++ b/vpx_dsp/x86/avg_intrin_sse2.c

@@ -12,7 +12,7 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx/vpx_integer.h"

-#include "vpx_dsp/x86/fdct.h"

+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"

 #include "vpx_ports/mem.h"

 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,

--- a/vpx_dsp/x86/avg_ssse3_x86_64.asm

+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm

@@ -9,6 +9,7 @@

 %include "third_party/x86inc/x86inc.asm"

+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION .text

@@ -94,20 +95,6 @@

   SWAP               7, 9

 %endmacro

-%if CONFIG_VP9_HIGHBITDEPTH

-; store %1 to outputq + %2

-; uses m8-m10 as scratch registers

-%macro STORE_TRAN_LOW 2

-  pxor                           m8, m8

-  mova                           m9, m%1

-  mova                          m10, m%1

-  pcmpgtw                        m8, m%1

-  punpcklwd                      m9, m8

-  punpckhwd                     m10, m8

-  mova               [outputq + %2], m9

-  mova          [outputq + %2 + 16], m10

-%endmacro

-%endif

 INIT_XMM ssse3

 cglobal hadamard_8x8, 3, 5, 11, input, stride, output

@@ -130,25 +117,14 @@

   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10

   HMD8_1D

-%if CONFIG_VP9_HIGHBITDEPTH

-  STORE_TRAN_LOW                  0, 0

-  STORE_TRAN_LOW                  1, 32

-  STORE_TRAN_LOW                  2, 64

-  STORE_TRAN_LOW                  3, 96

-  STORE_TRAN_LOW                  4, 128

-  STORE_TRAN_LOW                  5, 160

-  STORE_TRAN_LOW                  6, 192

-  STORE_TRAN_LOW                  7, 224

-%else

-  mova              [outputq +   0], m0

-  mova              [outputq +  16], m1

-  mova              [outputq +  32], m2

-  mova              [outputq +  48], m3

-  mova              [outputq +  64], m4

-  mova              [outputq +  80], m5

-  mova              [outputq +  96], m6

-  mova              [outputq + 112], m7

-%endif

+  STORE_TRAN_LOW 0, outputq, 0, 8, 9

+  STORE_TRAN_LOW 1, outputq, 1, 8, 9

+  STORE_TRAN_LOW 2, outputq, 2, 8, 9

+  STORE_TRAN_LOW 3, outputq, 3, 8, 9

+  STORE_TRAN_LOW 4, outputq, 4, 8, 9

+  STORE_TRAN_LOW 5, outputq, 5, 8, 9

+  STORE_TRAN_LOW 6, outputq, 6, 8, 9

+  STORE_TRAN_LOW 7, outputq, 7, 8, 9

RET

 %endif

--- /dev/null

+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm

@@ -1,0 +1,66 @@

+;

+;  Copyright (c) 2017 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.

+; vpx_config.asm is not guarded, so it cannot be included twice. Because this

+; file is used in conjunction with x86_abi_support.asm or x86inc.asm, this file

+; must be included after those files.

+; Advance pointer register %1 by 8 * sizeof(tran_low_t) bytes:

+; 8 * 4 when CONFIG_VP9_HIGHBITDEPTH is set, 8 * 2 otherwise.

+%macro INCREMENT_TRAN_LOW 1

+%if CONFIG_VP9_HIGHBITDEPTH

+  add %1, 8 * 4

+%else

+  add %1, 8 * 2

+%endif

+%endmacro

+; Increment %1 by sizeof(tran_low_t) * %2 bytes, i.e. advance the pointer in

+; %1 by %2 tran_low_t elements.

+; %1: register to advance (used as both source and destination of the lea).

+; %2: element count. It is parenthesized before scaling so that an expression

+;     argument (for example "a + b") is scaled as a whole.

+%macro INCREMENT_ELEMENTS_TRAN_LOW 2

+%if CONFIG_VP9_HIGHBITDEPTH

+  lea %1, [%1 + (%2) * 4]

+%else

+  lea %1, [%1 + (%2) * 2]

+%endif

+%endmacro

+; Load one register of 16 bit coefficients from %2 + %3 into m%1.

+; %1: number of the destination register (m%1).

+; %2: base address register.

+; %3: offset in groups of 8 tran_low_t elements, not in bytes. It is scaled by

+;     16 bytes (low bit depth) or 32 bytes (high bit depth) and parenthesized

+;     so that an expression argument is scaled as a whole.

+; If tran_low_t is 16 bits (low bit depth configuration) then load the value

+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack

+; the values down to 16 bits. packssdw packs with signed saturation.

+%macro LOAD_TRAN_LOW 3

+%if CONFIG_VP9_HIGHBITDEPTH

+  mova     m%1, [%2 + (%3) * 32]

+  packssdw m%1, [%2 + (%3) * 32 + 16]

+%else

+  mova     m%1, [%2 + (%3) * 16]

+%endif

+%endmacro

+; Store m%1 to %2 + %3.

+; %1: number of the source register (m%1).

+; %2: base address register.

+; %3: offset in groups of 8 tran_low_t elements, not in bytes. It is scaled by

+;     16 bytes (low bit depth) or 32 bytes (high bit depth) and parenthesized

+;     so that an expression argument is scaled as a whole.

+; %4, %5: numbers of the scratch registers (m%4, m%5).

+; If tran_low_t is 16 bits (low bit depth configuration) then store the value

+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign

+; extend the values first.

+; For high bit depth, m%4 and m%5 are used as scratch registers and m%1 is

+; overwritten. Low bit depth leaves all registers unchanged.

+%macro STORE_TRAN_LOW 5

+%if CONFIG_VP9_HIGHBITDEPTH

+  ; m%4 = per-word sign mask: all ones where the word in m%1 is negative.

+  pxor                        m%4, m%4

+  mova                        m%5, m%1

+  pcmpgtw                     m%4, m%1

+  ; Interleave each word with its sign mask to form sign-extended dwords.

+  punpcklwd                   m%5, m%4

+  punpckhwd                   m%1, m%4

+  mova      [%2 + (%3) * 32 +  0], m%5

+  mova      [%2 + (%3) * 32 + 16], m%1

+%else

+  mova           [%2 + (%3) * 16], m%1

+%endif

+%endmacro

--- /dev/null

+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.h

@@ -1,0 +1,57 @@

+/*

+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_

+#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_

+#include <emmintrin.h>  // SSE2

+#include "./vpx_config.h"

+#include "vpx/vpx_integer.h"

+#include "vpx_dsp/vpx_dsp_common.h"

+// Helpers for moving 8 coefficients between a tran_low_t buffer and a register

+// of 16 bit lanes. The memory layout depends on CONFIG_VP9_HIGHBITDEPTH.

+// Load 8 16 bit values. If the source is 32 bits then cast down.

+// This does not saturate values. It only truncates.

+// In the low bit depth build this is an aligned 128 bit load, so |a| must be

+// 16 byte aligned.

+static INLINE __m128i load_tran_low(const tran_low_t *a) {

+#if CONFIG_VP9_HIGHBITDEPTH

+  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],

+                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],

+                        (int16_t)a[6], (int16_t)a[7]);

+#else

+  return _mm_load_si128((const __m128i *)a);

+#endif

+}

+// Store 8 16 bit values. If the destination is 32 bits then sign extend the

+// values by multiplying by 1.

+// Uses aligned 128 bit stores, so |b| must be 16 byte aligned.

+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {

+#if CONFIG_VP9_HIGHBITDEPTH

+  const __m128i one = _mm_set1_epi16(1);

+  // High and low 16 bits of the 32 bit products a[i] * 1.

+  const __m128i a_hi = _mm_mulhi_epi16(a, one);

+  const __m128i a_lo = _mm_mullo_epi16(a, one);

+  // Interleave the halves to form the 32 bit values.

+  const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);

+  const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);

+  _mm_store_si128((__m128i *)(b), a_1);

+  _mm_store_si128((__m128i *)(b + 4), a_2);

+#else

+  _mm_store_si128((__m128i *)(b), a);

+#endif

+}

+// Zero fill 8 positions in the output buffer.

+// Uses aligned 128 bit stores, so |a| must be 16 byte aligned.

+static INLINE void store_zero_tran_low(tran_low_t *a) {

+  const __m128i zero = _mm_setzero_si128();

+  _mm_store_si128((__m128i *)(a), zero);

+#if CONFIG_VP9_HIGHBITDEPTH

+  // 32 bit elements: the second group of 4 needs its own store.

+  _mm_store_si128((__m128i *)(a + 4), zero);

+#endif

+}

+#endif  // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_

--- a/vpx_dsp/x86/fdct.h

+++ /dev/null

@@ -1,57 +1,0 @@

-/*

- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VPX_DSP_X86_FDCT_H_

-#define VPX_DSP_X86_FDCT_H_

-#include <xmmintrin.h>

-#include "./vpx_config.h"

-#include "vpx/vpx_integer.h"

-#include "vpx_dsp/vpx_dsp_common.h"

-// Load 8 16 bit values. If the source is 32 bits then cast down.

-// This does not saturate values. It only truncates.

-static INLINE __m128i load_tran_low(const tran_low_t *a) {

-#if CONFIG_VP9_HIGHBITDEPTH

-  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],

-                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],

-                        (int16_t)a[6], (int16_t)a[7]);

-#else

-  return _mm_load_si128((const __m128i *)a);

-#endif

-}

-// Store 8 16 bit values. If the destination is 32 bits then sign extend the

-// values by multiplying by 1.

-static INLINE void store_tran_low(__m128i a, tran_low_t *b) {

-#if CONFIG_VP9_HIGHBITDEPTH

-  const __m128i one = _mm_set1_epi16(1);

-  const __m128i a_hi = _mm_mulhi_epi16(a, one);

-  const __m128i a_lo = _mm_mullo_epi16(a, one);

-  const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);

-  const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);

-  _mm_store_si128((__m128i *)(b), a_1);

-  _mm_store_si128((__m128i *)(b + 4), a_2);

-#else

-  _mm_store_si128((__m128i *)(b), a);

-#endif

-}

-// Zero fill 8 positions in the output buffer.

-static INLINE void store_zero_tran_low(tran_low_t *a) {

-  const __m128i zero = _mm_setzero_si128();

-#if CONFIG_VP9_HIGHBITDEPTH

-  _mm_store_si128((__m128i *)(a), zero);

-  _mm_store_si128((__m128i *)(a + 4), zero);

-#else

-  _mm_store_si128((__m128i *)(a), zero);

-#endif

-}

-#endif  // VPX_DSP_X86_FDCT_H_

--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm

+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm

@@ -9,6 +9,7 @@

 %include "third_party/x86inc/x86inc.asm"

+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION_RODATA

@@ -230,21 +231,10 @@

   lea        r3, [2 * strideq]

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova       m0, [inputq +   0]

-  packssdw   m0, [inputq +  16]

-  mova       m1, [inputq +  32]

-  packssdw   m1, [inputq +  48]

-  mova       m2, [inputq +  64]

-  packssdw   m2, [inputq +  80]

-  mova       m3, [inputq +  96]

-  packssdw   m3, [inputq + 112]

-%else

-  mova       m0, [inputq +  0]

-  mova       m1, [inputq + 16]

-  mova       m2, [inputq + 32]

-  mova       m3, [inputq + 48]

-%endif

+  LOAD_TRAN_LOW 0, inputq, 0

+  LOAD_TRAN_LOW 1, inputq, 1

+  LOAD_TRAN_LOW 2, inputq, 2

+  LOAD_TRAN_LOW 3, inputq, 3

   punpcklwd  m0, m1

   punpcklwd  m2, m3

@@ -752,33 +742,14 @@

   lea             r4, [rsp + transposed_in]

 idct32x32_34_transpose:

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova            m0, [r3 +       0]

-  packssdw        m0, [r3 +      16]

-  mova            m1, [r3 + 32 *  4]

-  packssdw        m1, [r3 + 32 *  4 + 16]

-  mova            m2, [r3 + 32 *  8]

-  packssdw        m2, [r3 + 32 *  8 + 16]

-  mova            m3, [r3 + 32 * 12]

-  packssdw        m3, [r3 + 32 * 12 + 16]

-  mova            m4, [r3 + 32 * 16]

-  packssdw        m4, [r3 + 32 * 16 + 16]

-  mova            m5, [r3 + 32 * 20]

-  packssdw        m5, [r3 + 32 * 20 + 16]

-  mova            m6, [r3 + 32 * 24]

-  packssdw        m6, [r3 + 32 * 24 + 16]

-  mova            m7, [r3 + 32 * 28]

-  packssdw        m7, [r3 + 32 * 28 + 16]

-%else

-  mova            m0, [r3 +       0]

-  mova            m1, [r3 + 16 *  4]

-  mova            m2, [r3 + 16 *  8]

-  mova            m3, [r3 + 16 * 12]

-  mova            m4, [r3 + 16 * 16]

-  mova            m5, [r3 + 16 * 20]

-  mova            m6, [r3 + 16 * 24]

-  mova            m7, [r3 + 16 * 28]

-%endif

+  LOAD_TRAN_LOW 0, r3,  0

+  LOAD_TRAN_LOW 1, r3,  4

+  LOAD_TRAN_LOW 2, r3,  8

+  LOAD_TRAN_LOW 3, r3, 12

+  LOAD_TRAN_LOW 4, r3, 16

+  LOAD_TRAN_LOW 5, r3, 20

+  LOAD_TRAN_LOW 6, r3, 24

+  LOAD_TRAN_LOW 7, r3, 28

   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10

@@ -1182,33 +1153,15 @@

   mov             r7, 2

 idct32x32_135_transpose:

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova            m0, [r3 +       0]

-  packssdw        m0, [r3 +      16]

-  mova            m1, [r3 + 32 *  4]

-  packssdw        m1, [r3 + 32 *  4 + 16]

-  mova            m2, [r3 + 32 *  8]

-  packssdw        m2, [r3 + 32 *  8 + 16]

-  mova            m3, [r3 + 32 * 12]

-  packssdw        m3, [r3 + 32 * 12 + 16]

-  mova            m4, [r3 + 32 * 16]

-  packssdw        m4, [r3 + 32 * 16 + 16]

-  mova            m5, [r3 + 32 * 20]

-  packssdw        m5, [r3 + 32 * 20 + 16]

-  mova            m6, [r3 + 32 * 24]

-  packssdw        m6, [r3 + 32 * 24 + 16]

-  mova            m7, [r3 + 32 * 28]

-  packssdw        m7, [r3 + 32 * 28 + 16]

-%else

-  mova            m0, [r3 +       0]

-  mova            m1, [r3 + 16 *  4]

-  mova            m2, [r3 + 16 *  8]

-  mova            m3, [r3 + 16 * 12]

-  mova            m4, [r3 + 16 * 16]

-  mova            m5, [r3 + 16 * 20]

-  mova            m6, [r3 + 16 * 24]

-  mova            m7, [r3 + 16 * 28]

-%endif

+  LOAD_TRAN_LOW 0, r3,  0

+  LOAD_TRAN_LOW 1, r3,  4

+  LOAD_TRAN_LOW 2, r3,  8

+  LOAD_TRAN_LOW 3, r3, 12

+  LOAD_TRAN_LOW 4, r3, 16

+  LOAD_TRAN_LOW 5, r3, 20

+  LOAD_TRAN_LOW 6, r3, 24

+  LOAD_TRAN_LOW 7, r3, 28

   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10

   mova [r4 +      0], m0

@@ -1220,11 +1173,7 @@

   mova [r4 + 16 * 6], m6

   mova [r4 + 16 * 7], m7

-%if CONFIG_VP9_HIGHBITDEPTH

-  add             r3, 32

-%else

-  add             r3, 16

-%endif

+  INCREMENT_TRAN_LOW r3

   add             r4, 16 * 8

   dec             r7

   jne idct32x32_135_transpose

@@ -1231,11 +1180,7 @@

   IDCT32X32_135 16*0, 16*32, 16*64, 16*96

   lea            stp, [stp + 16 * 8]

-%if CONFIG_VP9_HIGHBITDEPTH

-  lea         inputq, [inputq + 32 * 32]

-%else

-  lea         inputq, [inputq + 16 * 32]

-%endif

+  INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32

   dec             r6

   jnz idct32x32_135

@@ -1646,33 +1591,14 @@

   mov             r7, 4

 idct32x32_1024_transpose:

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova            m0, [r3 +       0]

-  packssdw        m0, [r3 +      16]

-  mova            m1, [r3 + 32 *  4]

-  packssdw        m1, [r3 + 32 *  4 + 16]

-  mova            m2, [r3 + 32 *  8]

-  packssdw        m2, [r3 + 32 *  8 + 16]

-  mova            m3, [r3 + 32 * 12]

-  packssdw        m3, [r3 + 32 * 12 + 16]

-  mova            m4, [r3 + 32 * 16]

-  packssdw        m4, [r3 + 32 * 16 + 16]

-  mova            m5, [r3 + 32 * 20]

-  packssdw        m5, [r3 + 32 * 20 + 16]

-  mova            m6, [r3 + 32 * 24]

-  packssdw        m6, [r3 + 32 * 24 + 16]

-  mova            m7, [r3 + 32 * 28]

-  packssdw        m7, [r3 + 32 * 28 + 16]

-%else

-  mova            m0, [r3 +       0]

-  mova            m1, [r3 + 16 *  4]

-  mova            m2, [r3 + 16 *  8]

-  mova            m3, [r3 + 16 * 12]

-  mova            m4, [r3 + 16 * 16]

-  mova            m5, [r3 + 16 * 20]

-  mova            m6, [r3 + 16 * 24]

-  mova            m7, [r3 + 16 * 28]

-%endif

+  LOAD_TRAN_LOW 0, r3,  0

+  LOAD_TRAN_LOW 1, r3,  4

+  LOAD_TRAN_LOW 2, r3,  8

+  LOAD_TRAN_LOW 3, r3, 12

+  LOAD_TRAN_LOW 4, r3, 16

+  LOAD_TRAN_LOW 5, r3, 20

+  LOAD_TRAN_LOW 6, r3, 24

+  LOAD_TRAN_LOW 7, r3, 28

   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10

@@ -1684,11 +1610,7 @@

   mova [r4 + 16 * 5], m5

   mova [r4 + 16 * 6], m6

   mova [r4 + 16 * 7], m7

-%if CONFIG_VP9_HIGHBITDEPTH

-  add             r3, 32

-%else

-  add             r3, 16

-%endif

+  INCREMENT_TRAN_LOW r3

   add             r4, 16 * 8

   dec             r7

   jne idct32x32_1024_transpose

@@ -1696,11 +1618,7 @@

   IDCT32X32_1024 16*0, 16*32, 16*64, 16*96

   lea            stp, [stp + 16 * 8]

-%if CONFIG_VP9_HIGHBITDEPTH

-  lea         inputq, [inputq + 32 * 32]

-%else

-  lea         inputq, [inputq + 16 * 32]

-%endif

+  INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32

   dec             r6

   jnz idct32x32_1024

--- a/vpx_dsp/x86/inv_wht_sse2.asm

+++ b/vpx_dsp/x86/inv_wht_sse2.asm

@@ -9,6 +9,7 @@

 %include "third_party/x86inc/x86inc.asm"

+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION .text

@@ -82,15 +83,8 @@

 INIT_XMM sse2

 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride

-%if CONFIG_VP9_HIGHBITDEPTH

-  mova            m0,        [inputq +  0]

-  packssdw        m0,        [inputq + 16]

-  mova            m1,        [inputq + 32]

-  packssdw        m1,        [inputq + 48]

-%else

-  mova            m0,        [inputq +  0]

-  mova            m1,        [inputq + 16]

-%endif

+  LOAD_TRAN_LOW    0, inputq, 0

+  LOAD_TRAN_LOW    1, inputq, 1

   psraw           m0,        2

   psraw           m1,        2

--- a/vpx_dsp/x86/quantize_sse2.c

+++ b/vpx_dsp/x86/quantize_sse2.c

@@ -13,7 +13,7 @@

 #include "./vpx_dsp_rtcd.h"

 #include "vpx/vpx_integer.h"

-#include "vpx_dsp/x86/fdct.h"

+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"

 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,

                          int skip_block, const int16_t *zbin_ptr,

--

⑨