shithub: libvpx

Download patch

ref: 641fda79bb357599e5b38f750196bce66ec5df6b
parent: a16ca80b09cb4d698606d706486c2d22577af124
author: Johann <johannkoenig@google.com>
date: Thu Feb 2 09:17:26 EST 2017

highbd x86: consolidate tran_low_t conversions

Create new helper files specifically for converting tran_low_t types.

Change-Id: I7c4c458ef910f3b3d10a3cfbf9df4de7682fd905

--- a/libs.mk
+++ b/libs.mk
@@ -149,6 +149,7 @@
 INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
+INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm
 endif
 CODEC_EXPORTS-yes += vpx/exports_com
 CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
@@ -204,6 +205,7 @@
     third_party/x86inc/x86inc.asm \
     vpx_config.asm \
     vpx_ports/x86_abi_support.asm \
+    vpx_dsp/x86/bitdepth_conversion_sse2.asm \
 
 vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
 	@echo "    [CREATE] $@"
--- a/vp9/encoder/x86/vp9_dct_sse2.asm
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -11,6 +11,7 @@
 %define private_prefix vp9
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -62,25 +63,7 @@
   psllw           m0,        2
   psllw           m1,        2
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  ; sign extension
-  mova            m2,             m0
-  mova            m3,             m1
-  punpcklwd       m0,             m0
-  punpcklwd       m1,             m1
-  punpckhwd       m2,             m2
-  punpckhwd       m3,             m3
-  psrad           m0,             16
-  psrad           m1,             16
-  psrad           m2,             16
-  psrad           m3,             16
-  mova            [outputq],      m0
-  mova            [outputq + 16], m2
-  mova            [outputq + 32], m1
-  mova            [outputq + 48], m3
-%else
-  mova            [outputq],      m0
-  mova            [outputq + 16], m1
-%endif
+  STORE_TRAN_LOW 0, outputq, 0, 2, 3
+  STORE_TRAN_LOW 1, outputq, 1, 2, 3
 
   RET
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -14,7 +14,7 @@
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -13,6 +13,12 @@
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 
+DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2.h
+# This file is listed in libs.mk instead. Listing it here would cause it to be
+# compiled into an object. Even as an empty object, it would cause the stack
+# to be marked executable.
+#DSP_SRCS-$(HAVE_SSE2)   += x86/bitdepth_conversion_sse2$(ASM)
+
 # bit reader
 DSP_SRCS-yes += prob.h
 DSP_SRCS-yes += prob.c
@@ -245,7 +251,6 @@
 DSP_SRCS-yes            += quantize.c
 DSP_SRCS-yes            += quantize.h
 
-DSP_SRCS-$(HAVE_SSE2)   += x86/fdct.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -12,7 +12,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_ports/mem.h"
 
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
--- a/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -9,6 +9,7 @@
 ;
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -94,20 +95,6 @@
   SWAP               7, 9
 %endmacro
 
-%if CONFIG_VP9_HIGHBITDEPTH
-; store %1 to outputq + %2
-; uses m8-m10 as scratch registers
-%macro STORE_TRAN_LOW 2
-  pxor                           m8, m8
-  mova                           m9, m%1
-  mova                          m10, m%1
-  pcmpgtw                        m8, m%1
-  punpcklwd                      m9, m8
-  punpckhwd                     m10, m8
-  mova               [outputq + %2], m9
-  mova          [outputq + %2 + 16], m10
-%endmacro
-%endif
 
 INIT_XMM ssse3
 cglobal hadamard_8x8, 3, 5, 11, input, stride, output
@@ -130,25 +117,14 @@
   TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
   HMD8_1D
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  STORE_TRAN_LOW                  0, 0
-  STORE_TRAN_LOW                  1, 32
-  STORE_TRAN_LOW                  2, 64
-  STORE_TRAN_LOW                  3, 96
-  STORE_TRAN_LOW                  4, 128
-  STORE_TRAN_LOW                  5, 160
-  STORE_TRAN_LOW                  6, 192
-  STORE_TRAN_LOW                  7, 224
-%else
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-%endif
+  STORE_TRAN_LOW 0, outputq, 0, 8, 9
+  STORE_TRAN_LOW 1, outputq, 1, 8, 9
+  STORE_TRAN_LOW 2, outputq, 2, 8, 9
+  STORE_TRAN_LOW 3, outputq, 3, 8, 9
+  STORE_TRAN_LOW 4, outputq, 4, 8, 9
+  STORE_TRAN_LOW 5, outputq, 5, 8, 9
+  STORE_TRAN_LOW 6, outputq, 6, 8, 9
+  STORE_TRAN_LOW 7, outputq, 7, 8, 9
 
   RET
 %endif
--- /dev/null
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -1,0 +1,66 @@
+;
+;  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so it cannot be included twice. Because this
+; file is used in conjunction with x86_abi_support.asm or x86inc.asm, it must
+; be included after those files.
+
+; Advance register %1 by 8 tran_low_t elements (8 * sizeof(tran_low_t) bytes).
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+  add %1, 32  ; 8 elements * 4 bytes
+%else
+  add %1, 16  ; 8 elements * 2 bytes
+%endif
+%endmacro
+
+; Advance %1 by %2 tran_low_t elements. %2 may be an expression (e.g. 8*32).
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea %1, [%1 + (%2) * 4]  ; 4 bytes per element
+%else
+  lea %1, [%1 + (%2) * 2]  ; 2 bytes per element
+%endif
+%endmacro
+
+; Load 8 values from %2 + %3 into m%1.
+; %3 is the offset in groups of 8 elements (16 or 32 bytes), not in bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the values
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits with signed saturation (packssdw).
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     m%1, [%2 + (%3) * 32]
+  packssdw m%1, [%2 + (%3) * 32 + 16]
+%else
+  mova     m%1, [%2 + (%3) * 16]
+%endif
+%endmacro
+
+; Store the 8 values in m%1 to %2 + %3.
+; %3 is the offset in groups of 8 elements (16 or 32 bytes), not in bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the values
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; High bit depth uses m%4 and m%5 as scratch registers and clobbers m%1.
+%macro STORE_TRAN_LOW 5
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor                      m%4, m%4
+  mova                      m%5, m%1
+  pcmpgtw                   m%4, m%1  ; all-ones lanes where input is negative
+  punpcklwd                 m%5, m%4
+  punpckhwd                 m%1, m%4
+  mova    [%2 + (%3) * 32 +  0], m%5
+  mova    [%2 + (%3) * 32 + 16], m%1
+%else
+  mova         [%2 + (%3) * 16], m%1
+%endif
+%endmacro
--- /dev/null
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -1,0 +1,57 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 8 values into 16 bit lanes. If the source is 32 bits then cast down.
+// This does not saturate values. It only truncates.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
+                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
+                        (int16_t)a[6], (int16_t)a[7]);
+#else
+  return _mm_load_si128((const __m128i *)a);
+#endif
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i a_hi = _mm_mulhi_epi16(a, one);  // sign bits of each lane
+  const __m128i a_lo = _mm_mullo_epi16(a, one);  // unchanged low 16 bits
+  const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+  const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+  _mm_store_si128((__m128i *)(b), a_1);
+  _mm_store_si128((__m128i *)(b + 4), a_2);
+#else
+  _mm_store_si128((__m128i *)(b), a);
+#endif
+}
+
+// Zero fill 8 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+  const __m128i zero = _mm_setzero_si128();
+  // The first 16 bytes are cleared in both configurations.
+  _mm_store_si128((__m128i *)(a), zero);
+#if CONFIG_VP9_HIGHBITDEPTH
+  // 32 bit tran_low_t: the 8 positions need a second 16 byte store.
+  _mm_store_si128((__m128i *)(a + 4), zero);
+#endif
+}
+#endif  // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
--- a/vpx_dsp/x86/fdct.h
+++ /dev/null
@@ -1,57 +1,0 @@
-/*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef VPX_DSP_X86_FDCT_H_
-#define VPX_DSP_X86_FDCT_H_
-
-#include <xmmintrin.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-// Load 8 16 bit values. If the source is 32 bits then cast down.
-// This does not saturate values. It only truncates.
-static INLINE __m128i load_tran_low(const tran_low_t *a) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
-                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
-                        (int16_t)a[6], (int16_t)a[7]);
-#else
-  return _mm_load_si128((const __m128i *)a);
-#endif
-}
-
-// Store 8 16 bit values. If the destination is 32 bits then sign extend the
-// values by multiplying by 1.
-static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i a_hi = _mm_mulhi_epi16(a, one);
-  const __m128i a_lo = _mm_mullo_epi16(a, one);
-  const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
-  const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
-  _mm_store_si128((__m128i *)(b), a_1);
-  _mm_store_si128((__m128i *)(b + 4), a_2);
-#else
-  _mm_store_si128((__m128i *)(b), a);
-#endif
-}
-
-// Zero fill 8 positions in the output buffer.
-static INLINE void store_zero_tran_low(tran_low_t *a) {
-  const __m128i zero = _mm_setzero_si128();
-#if CONFIG_VP9_HIGHBITDEPTH
-  _mm_store_si128((__m128i *)(a), zero);
-  _mm_store_si128((__m128i *)(a + 4), zero);
-#else
-  _mm_store_si128((__m128i *)(a), zero);
-#endif
-}
-#endif  // VPX_DSP_X86_FDCT_H_
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -9,6 +9,7 @@
 ;
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION_RODATA
 
@@ -230,21 +231,10 @@
 
   lea        r3, [2 * strideq]
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova       m0, [inputq +   0]
-  packssdw   m0, [inputq +  16]
-  mova       m1, [inputq +  32]
-  packssdw   m1, [inputq +  48]
-  mova       m2, [inputq +  64]
-  packssdw   m2, [inputq +  80]
-  mova       m3, [inputq +  96]
-  packssdw   m3, [inputq + 112]
-%else
-  mova       m0, [inputq +  0]
-  mova       m1, [inputq + 16]
-  mova       m2, [inputq + 32]
-  mova       m3, [inputq + 48]
-%endif
+  LOAD_TRAN_LOW 0, inputq, 0
+  LOAD_TRAN_LOW 1, inputq, 1
+  LOAD_TRAN_LOW 2, inputq, 2
+  LOAD_TRAN_LOW 3, inputq, 3
 
   punpcklwd  m0, m1
   punpcklwd  m2, m3
@@ -752,33 +742,14 @@
   lea             r4, [rsp + transposed_in]
 
 idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
+  LOAD_TRAN_LOW 0, r3,  0
+  LOAD_TRAN_LOW 1, r3,  4
+  LOAD_TRAN_LOW 2, r3,  8
+  LOAD_TRAN_LOW 3, r3, 12
+  LOAD_TRAN_LOW 4, r3, 16
+  LOAD_TRAN_LOW 5, r3, 20
+  LOAD_TRAN_LOW 6, r3, 24
+  LOAD_TRAN_LOW 7, r3, 28
 
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
 
@@ -1182,33 +1153,15 @@
   mov             r7, 2
 
 idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
+  LOAD_TRAN_LOW 0, r3,  0
+  LOAD_TRAN_LOW 1, r3,  4
+  LOAD_TRAN_LOW 2, r3,  8
+  LOAD_TRAN_LOW 3, r3, 12
+  LOAD_TRAN_LOW 4, r3, 16
+  LOAD_TRAN_LOW 5, r3, 20
+  LOAD_TRAN_LOW 6, r3, 24
+  LOAD_TRAN_LOW 7, r3, 28
+
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
 
   mova [r4 +      0], m0
@@ -1220,11 +1173,7 @@
   mova [r4 + 16 * 6], m6
   mova [r4 + 16 * 7], m7
 
-%if CONFIG_VP9_HIGHBITDEPTH
-  add             r3, 32
-%else
-  add             r3, 16
-%endif
+  INCREMENT_TRAN_LOW r3
   add             r4, 16 * 8
   dec             r7
   jne idct32x32_135_transpose
@@ -1231,11 +1180,7 @@
 
   IDCT32X32_135 16*0, 16*32, 16*64, 16*96
   lea            stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea         inputq, [inputq + 32 * 32]
-%else
-  lea         inputq, [inputq + 16 * 32]
-%endif
+  INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
   dec             r6
   jnz idct32x32_135
 
@@ -1646,33 +1591,14 @@
   mov             r7, 4
 
 idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0, [r3 +       0]
-  packssdw        m0, [r3 +      16]
-  mova            m1, [r3 + 32 *  4]
-  packssdw        m1, [r3 + 32 *  4 + 16]
-  mova            m2, [r3 + 32 *  8]
-  packssdw        m2, [r3 + 32 *  8 + 16]
-  mova            m3, [r3 + 32 * 12]
-  packssdw        m3, [r3 + 32 * 12 + 16]
-  mova            m4, [r3 + 32 * 16]
-  packssdw        m4, [r3 + 32 * 16 + 16]
-  mova            m5, [r3 + 32 * 20]
-  packssdw        m5, [r3 + 32 * 20 + 16]
-  mova            m6, [r3 + 32 * 24]
-  packssdw        m6, [r3 + 32 * 24 + 16]
-  mova            m7, [r3 + 32 * 28]
-  packssdw        m7, [r3 + 32 * 28 + 16]
-%else
-  mova            m0, [r3 +       0]
-  mova            m1, [r3 + 16 *  4]
-  mova            m2, [r3 + 16 *  8]
-  mova            m3, [r3 + 16 * 12]
-  mova            m4, [r3 + 16 * 16]
-  mova            m5, [r3 + 16 * 20]
-  mova            m6, [r3 + 16 * 24]
-  mova            m7, [r3 + 16 * 28]
-%endif
+  LOAD_TRAN_LOW 0, r3,  0
+  LOAD_TRAN_LOW 1, r3,  4
+  LOAD_TRAN_LOW 2, r3,  8
+  LOAD_TRAN_LOW 3, r3, 12
+  LOAD_TRAN_LOW 4, r3, 16
+  LOAD_TRAN_LOW 5, r3, 20
+  LOAD_TRAN_LOW 6, r3, 24
+  LOAD_TRAN_LOW 7, r3, 28
 
   TRANSPOSE8X8  0, 1, 2, 3, 4, 5, 6, 7, 9, 10
 
@@ -1684,11 +1610,7 @@
   mova [r4 + 16 * 5], m5
   mova [r4 + 16 * 6], m6
   mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
-  add             r3, 32
-%else
-  add             r3, 16
-%endif
+  INCREMENT_TRAN_LOW r3
   add             r4, 16 * 8
   dec             r7
   jne idct32x32_1024_transpose
@@ -1696,11 +1618,7 @@
   IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
 
   lea            stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea         inputq, [inputq + 32 * 32]
-%else
-  lea         inputq, [inputq + 16 * 32]
-%endif
+  INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
   dec             r6
   jnz idct32x32_1024
 
--- a/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/vpx_dsp/x86/inv_wht_sse2.asm
@@ -9,6 +9,7 @@
 ;
 
 %include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
 
 SECTION .text
 
@@ -82,15 +83,8 @@
 
 INIT_XMM sse2
 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova            m0,        [inputq +  0]
-  packssdw        m0,        [inputq + 16]
-  mova            m1,        [inputq + 32]
-  packssdw        m1,        [inputq + 48]
-%else
-  mova            m0,        [inputq +  0]
-  mova            m1,        [inputq + 16]
-%endif
+  LOAD_TRAN_LOW    0, inputq, 0
+  LOAD_TRAN_LOW    1, inputq, 1
   psraw           m0,        2
   psraw           m1,        2
 
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -13,7 +13,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 
 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *zbin_ptr,