ref: 44600442dca48a5586105b01403ac03fad54d05b
parent: 60a10116d13c4394bed943e0213631735d03bb74
author: Johann <johannkoenig@google.com>
date: Wed Feb 15 12:17:45 EST 2017
bitdepth conversion: really use num elements. The previous implementation confused bits/bytes/elements. It was using '32' as the multiplier, but that was mistakenly adopted because a 32x32 transform embedded the stride. Change-Id: Ieeb867a332416b9a40580b5e7c9b20088e9e691a
--- a/vp9/encoder/x86/vp9_dct_sse2.asm
+++ b/vp9/encoder/x86/vp9_dct_sse2.asm
@@ -64,6 +64,6 @@
psllw m1, 2
STORE_TRAN_LOW 0, outputq, 0, 2, 3
- STORE_TRAN_LOW 1, outputq, 1, 2, 3
+ STORE_TRAN_LOW 1, outputq, 8, 2, 3
RET
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -91,8 +91,8 @@
.loop:
LOAD_TRAN_LOW 2, uqcq, 0
LOAD_TRAN_LOW 0, dqcq, 0
- LOAD_TRAN_LOW 3, uqcq, 1
- LOAD_TRAN_LOW 1, dqcq, 1
+ LOAD_TRAN_LOW 3, uqcq, 8
+ LOAD_TRAN_LOW 1, dqcq, 8
INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
sub sizeq, 16
--- a/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -117,14 +117,14 @@
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
HMD8_1D
- STORE_TRAN_LOW 0, outputq, 0, 8, 9
- STORE_TRAN_LOW 1, outputq, 1, 8, 9
- STORE_TRAN_LOW 2, outputq, 2, 8, 9
- STORE_TRAN_LOW 3, outputq, 3, 8, 9
- STORE_TRAN_LOW 4, outputq, 4, 8, 9
- STORE_TRAN_LOW 5, outputq, 5, 8, 9
- STORE_TRAN_LOW 6, outputq, 6, 8, 9
- STORE_TRAN_LOW 7, outputq, 7, 8, 9
+ STORE_TRAN_LOW 0, outputq, 0, 8, 9
+ STORE_TRAN_LOW 1, outputq, 8, 8, 9
+ STORE_TRAN_LOW 2, outputq, 16, 8, 9
+ STORE_TRAN_LOW 3, outputq, 24, 8, 9
+ STORE_TRAN_LOW 4, outputq, 32, 8, 9
+ STORE_TRAN_LOW 5, outputq, 40, 8, 9
+ STORE_TRAN_LOW 6, outputq, 48, 8, 9
+ STORE_TRAN_LOW 7, outputq, 56, 8, 9
RET
%endif
--- a/vpx_dsp/x86/bitdepth_conversion_sse2.asm
+++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -32,21 +32,21 @@
%endmacro
; Load %2 + %3 into m%1.
-; %3 is the offset in elements, not bits.
+; %3 is the offset in elements, not bytes.
; If tran_low_t is 16 bits (low bit depth configuration) then load the value
; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
; the values down to 16 bits.
%macro LOAD_TRAN_LOW 3
%if CONFIG_VP9_HIGHBITDEPTH
- mova m%1, [%2 + %3 * 32]
- packssdw m%1, [%2 + %3 * 32 + 16]
+ mova m%1, [%2 + %3 * 4]
+ packssdw m%1, [%2 + %3 * 4 + 16]
%else
- mova m%1, [%2 + %3 * 16]
+ mova m%1, [%2 + %3 * 2]
%endif
%endmacro
; Store m%1 to %2 + %3.
-; %3 is the offset in elements, not bits.
+; %3 is the offset in elements, not bytes.
; If tran_low_t is 16 bits (low bit depth configuration) then store the value
; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
; extend the values first.
@@ -58,9 +58,9 @@
pcmpgtw m%4, m%1
punpcklwd m%5, m%4
punpckhwd m%1, m%4
- mova [%2 + %3 * 32 + 0], m%5
- mova [%2 + %3 * 32 + 16], m%1
+ mova [%2 + %3 * 4 + 0], m%5
+ mova [%2 + %3 * 4 + 16], m%1
%else
- mova [%2 + %3 * 16], m%1
+ mova [%2 + %3 * 2], m%1
%endif
%endmacro
--- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
@@ -984,14 +984,14 @@
mov r7, 2
idct32x32_135_transpose:
- LOAD_TRAN_LOW 0, r3, 0
- LOAD_TRAN_LOW 1, r3, 4
- LOAD_TRAN_LOW 2, r3, 8
- LOAD_TRAN_LOW 3, r3, 12
- LOAD_TRAN_LOW 4, r3, 16
- LOAD_TRAN_LOW 5, r3, 20
- LOAD_TRAN_LOW 6, r3, 24
- LOAD_TRAN_LOW 7, r3, 28
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 32
+ LOAD_TRAN_LOW 2, r3, 64
+ LOAD_TRAN_LOW 3, r3, 96
+ LOAD_TRAN_LOW 4, r3, 128
+ LOAD_TRAN_LOW 5, r3, 160
+ LOAD_TRAN_LOW 6, r3, 192
+ LOAD_TRAN_LOW 7, r3, 224
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
@@ -1422,14 +1422,14 @@
mov r7, 4
idct32x32_1024_transpose:
- LOAD_TRAN_LOW 0, r3, 0
- LOAD_TRAN_LOW 1, r3, 4
- LOAD_TRAN_LOW 2, r3, 8
- LOAD_TRAN_LOW 3, r3, 12
- LOAD_TRAN_LOW 4, r3, 16
- LOAD_TRAN_LOW 5, r3, 20
- LOAD_TRAN_LOW 6, r3, 24
- LOAD_TRAN_LOW 7, r3, 28
+ LOAD_TRAN_LOW 0, r3, 0
+ LOAD_TRAN_LOW 1, r3, 32
+ LOAD_TRAN_LOW 2, r3, 64
+ LOAD_TRAN_LOW 3, r3, 96
+ LOAD_TRAN_LOW 4, r3, 128
+ LOAD_TRAN_LOW 5, r3, 160
+ LOAD_TRAN_LOW 6, r3, 192
+ LOAD_TRAN_LOW 7, r3, 224
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
--- a/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/vpx_dsp/x86/inv_wht_sse2.asm
@@ -84,7 +84,7 @@
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
LOAD_TRAN_LOW 0, inputq, 0
- LOAD_TRAN_LOW 1, inputq, 1
+ LOAD_TRAN_LOW 1, inputq, 8
psraw m0, 2
psraw m1, 2