ref: ac00db794878c03c54c8b91577ebb41a9c7723ea
parent: fd918cf9a33494787c198ed32a3a484bf05a15d7
parent: 1e1caad16570a526b17c1d861cb8011f50ab6425
author: James Zern <jzern@google.com>
date: Thu Oct 6 15:37:18 EDT 2016
Merge changes from topic '8bit-hbd-idct'

* changes:
  vpx_dsp/idct*_neon.asm: simplify immediate loads
  enable idct*_1_add_neon in high-bitdepth builds
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -201,7 +201,19 @@
make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
&vpx_idct4x4_1_add_c, TX_4X4, 1)));
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ NEON, PartialIDctTest,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
+ &vpx_idct32x32_1_add_neon, TX_32X32, 1),
+ make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c,
+ &vpx_idct16x16_1_add_neon, TX_16X16, 1),
+ make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c,
+ &vpx_idct8x8_1_add_neon, TX_8X8, 1),
+ make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
+ &vpx_idct4x4_1_add_neon, TX_4X4, 1)));
+#else // !CONFIG_VP9_HIGHBITDEPTH
// 32x32_34_ 32x32_135_ are implemented using the 1024 version.
INSTANTIATE_TEST_CASE_P(
NEON, PartialIDctTest,
@@ -229,7 +241,8 @@
&vpx_idct4x4_16_add_neon, TX_4X4, 16),
make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
&vpx_idct4x4_1_add_neon, TX_4X4, 1)));
-#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
// 32x32_135_ is implemented using the 1024 version.
--- a/vpx_dsp/arm/idct16x16_1_add_neon.asm
+++ b/vpx_dsp/arm/idct16x16_1_add_neon.asm
@@ -25,9 +25,8 @@
|vpx_idct16x16_1_add_neon| PROC
ldrsh r0, [r0]
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct16x16_add_neon.asm
+++ b/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -60,13 +60,11 @@
vld2.s16 {q1,q2}, [r0]!
vmov.s16 q15, q1
- ; generate cospi_28_64 = 3196
- mov r3, #0xc00
- add r3, #0x7c
+ ; cospi_28_64 = 3196
+ movw r3, #0x0c7c
- ; generate cospi_4_64 = 16069
- mov r12, #0x3e00
- add r12, #0xc5
+ ; cospi_4_64 = 16069
+ movw r12, #0x3ec5
; transpose the input data
TRANSPOSE8X8
@@ -76,13 +74,11 @@
vdup.16 d1, r12 ; duplicate cospi_4_64
; preloading to avoid stall
- ; generate cospi_12_64 = 13623
- mov r3, #0x3500
- add r3, #0x37
+ ; cospi_12_64 = 13623
+ movw r3, #0x3537
- ; generate cospi_20_64 = 9102
- mov r12, #0x2300
- add r12, #0x8e
+ ; cospi_20_64 = 9102
+ movw r12, #0x238e
; step2[4] * cospi_28_64
vmull.s16 q2, d18, d0
@@ -112,13 +108,11 @@
vqrshrn.s32 d15, q6, #14 ; >> 14
; preloading to avoid stall
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
- ; generate cospi_24_64 = 6270
- mov r12, #0x1800
- add r12, #0x7e
+ ; cospi_24_64 = 6270
+ movw r12, #0x187e
; step2[5] * cospi_12_64
vmull.s16 q2, d26, d2
@@ -155,9 +149,8 @@
vmull.s16 q0, d24, d30
vmull.s16 q1, d25, d30
- ; generate cospi_8_64 = 15137
- mov r3, #0x3b00
- add r3, #0x21
+ ; cospi_8_64 = 15137
+ movw r3, #0x3b21
vdup.16 d30, r12 ; duplicate cospi_24_64
vdup.16 d31, r3 ; duplicate cospi_8_64
@@ -208,9 +201,8 @@
vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7];
vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7];
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
; stage 5
vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3];
@@ -307,13 +299,11 @@
vld2.s16 {q0,q1}, [r0]!
vmov.s16 q15, q0;
- ; generate cospi_30_64 = 1606
- mov r3, #0x0600
- add r3, #0x46
+ ; cospi_30_64 = 1606
+ movw r3, #0x0646
- ; generate cospi_2_64 = 16305
- mov r12, #0x3f00
- add r12, #0xb1
+ ; cospi_2_64 = 16305
+ movw r12, #0x3fb1
; transpose the input data
TRANSPOSE8X8
@@ -323,13 +313,11 @@
vdup.16 d13, r12 ; duplicate cospi_2_64
; preloading to avoid stall
- ; generate cospi_14_64 = 12665
- mov r3, #0x3100
- add r3, #0x79
+ ; cospi_14_64 = 12665
+ movw r3, #0x3179
- ; generate cospi_18_64 = 10394
- mov r12, #0x2800
- add r12, #0x9a
+ ; cospi_18_64 = 10394
+ movw r12, #0x289a
; step1[8] * cospi_30_64
vmull.s16 q2, d16, d12
@@ -359,13 +347,11 @@
vqrshrn.s32 d15, q4, #14 ; >> 14
; preloading to avoid stall
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
+ ; cospi_22_64 = 7723
+ movw r3, #0x1e2b
- ; generate cospi_10_64 = 14449
- mov r12, #0x3800
- add r12, #0x71
+ ; cospi_10_64 = 14449
+ movw r12, #0x3871
; step1[9] * cospi_14_64
vmull.s16 q2, d24, d30
@@ -411,13 +397,11 @@
vmlal.s16 q5, d27, d30
; preloading to avoid stall
- ; generate cospi_6_64 = 15679
- mov r3, #0x3d00
- add r3, #0x3f
+ ; cospi_6_64 = 15679
+ movw r3, #0x3d3f
- ; generate cospi_26_64 = 4756
- mov r12, #0x1200
- add r12, #0x94
+ ; cospi_26_64 = 4756
+ movw r12, #0x1294
vdup.16 d30, r3 ; duplicate cospi_6_64
vdup.16 d31, r12 ; duplicate cospi_26_64
@@ -466,13 +450,11 @@
vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15]
; stage 4
- ; generate cospi_24_64 = 6270
- mov r3, #0x1800
- add r3, #0x7e
+ ; cospi_24_64 = 6270
+ movw r3, #0x187e
- ; generate cospi_8_64 = 15137
- mov r12, #0x3b00
- add r12, #0x21
+ ; cospi_8_64 = 15137
+ movw r12, #0x3b21
; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
vdup.16 d30, r12 ; duplicate cospi_8_64
@@ -543,9 +525,8 @@
vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
; stage 6.
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
vdup.16 d14, r12 ; duplicate cospi_16_64
@@ -810,13 +791,11 @@
vld2.s16 {q1,q2}, [r0]!
vmov.s16 q15, q1
- ; generate cospi_28_64*2 = 6392
- mov r3, #0x1800
- add r3, #0xf8
+ ; cospi_28_64*2 = 6392
+ movw r3, #0x18f8
- ; generate cospi_4_64*2 = 32138
- mov r12, #0x7d00
- add r12, #0x8a
+ ; cospi_4_64*2 = 32138
+ movw r12, #0x7d8a
; transpose the input data
TRANSPOSE8X8
@@ -833,9 +812,8 @@
vqrdmulh.s16 q4, q9, q0
; preloading to avoid stall
- ; generate cospi_16_64*2 = 23170
- mov r3, #0x5a00
- add r3, #0x82
+ ; cospi_16_64*2 = 23170
+ movw r3, #0x5a82
; dct_const_round_shift(step2[4] * cospi_4_64);
vqrdmulh.s16 q7, q9, q1
@@ -843,9 +821,8 @@
; stage 4
vdup.16 q1, r3 ; cospi_16_64*2
- ; generate cospi_16_64 = 11585
- mov r3, #0x2d00
- add r3, #0x41
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
vdup.16 d4, r3; ; duplicate cospi_16_64
@@ -939,13 +916,11 @@
vld2.s16 {q0,q1}, [r0]!
vmov.s16 q15, q0;
- ; generate 2*cospi_30_64 = 3212
- mov r3, #0xc00
- add r3, #0x8c
+ ; 2*cospi_30_64 = 3212
+ movw r3, #0x0c8c
- ; generate 2*cospi_2_64 = 32610
- mov r12, #0x7f00
- add r12, #0x62
+ ; 2*cospi_2_64 = 32610
+ movw r12, #0x7f62
; transpose the input data
TRANSPOSE8X8
@@ -962,15 +937,13 @@
vqrdmulh.s16 q7, q8, q6
; preloading to avoid stall
- ; generate 2*cospi_26_64 = 9512
- mov r12, #0x2500
- add r12, #0x28
+ ; 2*cospi_26_64 = 9512
+ movw r12, #0x2528
rsb r12, #0
vdup.16 q15, r12 ; duplicate -2*cospi_26_64
- ; generate 2*cospi_6_64 = 31358
- mov r3, #0x7a00
- add r3, #0x7e
+ ; 2*cospi_6_64 = 31358
+ movw r3, #0x7a7e
vdup.16 q14, r3 ; duplicate 2*cospi_6_64
; dct_const_round_shift(- step1[12] * cospi_26_64)
@@ -980,14 +953,12 @@
vqrdmulh.s16 q4, q9, q14
; stage 4
- ; generate cospi_24_64 = 6270
- mov r3, #0x1800
- add r3, #0x7e
+ ; cospi_24_64 = 6270
+ movw r3, #0x187e
vdup.16 d31, r3 ; duplicate cospi_24_64
- ; generate cospi_8_64 = 15137
- mov r12, #0x3b00
- add r12, #0x21
+ ; cospi_8_64 = 15137
+ movw r12, #0x3b21
vdup.16 d30, r12 ; duplicate cospi_8_64
; step1[14] * cospi_24_64
@@ -1052,9 +1023,8 @@
vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
; stage 6.
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
vdup.16 d14, r12 ; duplicate cospi_16_64
--- a/vpx_dsp/arm/idct32x32_1_add_neon.asm
+++ b/vpx_dsp/arm/idct32x32_1_add_neon.asm
@@ -77,9 +77,8 @@
add r3, r1, #16 ; r3 dest + 16 for second loop
ldrsh r0, [r0]
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct4x4_1_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -25,9 +25,8 @@
|vpx_idct4x4_1_add_neon| PROC
ldrsh r0, [r0]
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -36,15 +36,12 @@
vld1.s16 {q8,q9}, [r0]!
; generate scalar constants
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x 187e
- mov r12, #0x1800
- add r12, #0x7e
+ ; cospi_8_64 = 15137
+ movw r0, #0x3b21
+ ; cospi_16_64 = 11585
+ movw r3, #0x2d41
+ ; cospi_24_64 = 6270
+ movw r12, #0x187e
; transpose the input data
; 00 01 02 03 d16
--- a/vpx_dsp/arm/idct8x8_1_add_neon.asm
+++ b/vpx_dsp/arm/idct8x8_1_add_neon.asm
@@ -25,9 +25,8 @@
|vpx_idct8x8_1_add_neon| PROC
ldrsh r0, [r0]
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
+ ; cospi_16_64 = 11585
+ movw r12, #0x2d41
; out = dct_const_round_shift(input[0] * cospi_16_64)
mul r0, r0, r12 ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct8x8_add_neon.asm
+++ b/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -215,33 +215,26 @@
; transpose the input data
TRANSPOSE8X8
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
+ ; cospi_28_64 = 3196
+ movw r3, #0x0c7c
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
+ ; cospi_4_64 = 16069
+ movw r4, #0x3ec5
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
+ ; cospi_12_64 = 13623
+ movw r5, #0x3537
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
+ ; cospi_20_64 = 9102
+ movw r6, #0x238e
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
+ ; cospi_16_64 = 11585
+ movw r7, #0x2d41
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
+ ; cospi_24_64 = 6270
+ movw r8, #0x187e
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
+ ; cospi_8_64 = 15137
+ movw r9, #0x3b21
; First transform rows
IDCT8x8_1D
@@ -327,33 +320,26 @@
; transpose the input data
TRANSPOSE8X8
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
+ ; cospi_28_64 = 3196
+ movw r3, #0x0c7c
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
+ ; cospi_4_64 = 16069
+ movw r4, #0x3ec5
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
+ ; cospi_12_64 = 13623
+ movw r5, #0x3537
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
+ ; cospi_20_64 = 9102
+ movw r6, #0x238e
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
+ ; cospi_16_64 = 11585
+ movw r7, #0x2d41
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
+ ; cospi_24_64 = 6270
+ movw r8, #0x187e
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
+ ; cospi_8_64 = 15137
+ movw r9, #0x3b21
; First transform rows
; stage 1
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -199,23 +199,15 @@
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM)
else
ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes += arm/idct4x4_1_add_neon.c
DSP_SRCS-yes += arm/idct4x4_add_neon.c
-DSP_SRCS-yes += arm/idct8x8_1_add_neon.c
DSP_SRCS-yes += arm/idct8x8_add_neon.c
-DSP_SRCS-yes += arm/idct16x16_1_add_neon.c
DSP_SRCS-yes += arm/idct16x16_add_neon.c
-DSP_SRCS-yes += arm/idct32x32_1_add_neon.c
DSP_SRCS-yes += arm/idct32x32_add_neon.c
endif # HAVE_NEON
endif # HAVE_NEON_ASM
@@ -233,7 +225,20 @@
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
-endif # CONFIG_VP9_HIGHBITDEPTH
+endif # !CONFIG_VP9_HIGHBITDEPTH
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)
+else
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
+endif # HAVE_NEON_ASM
+
endif # CONFIG_VP9
# quantization
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -647,7 +647,7 @@
specialize qw/vpx_idct4x4_16_add sse2/;
add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct4x4_1_add sse2/;
+ specialize qw/vpx_idct4x4_1_add neon sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";
@@ -656,7 +656,7 @@
specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct8x8_1_add sse2/;
+ specialize qw/vpx_idct8x8_1_add neon sse2/;
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct16x16_256_add sse2/;
@@ -665,7 +665,7 @@
specialize qw/vpx_idct16x16_10_add sse2/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct16x16_1_add sse2/;
+ specialize qw/vpx_idct16x16_1_add neon sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
@@ -679,7 +679,7 @@
specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vpx_idct32x32_1_add sse2/;
+ specialize qw/vpx_idct32x32_1_add neon sse2/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vpx_highbd_idct4x4_16_add sse2/;