shithub: libvpx

--- a/test/partial_idct_test.cc

+++ b/test/partial_idct_test.cc

@@ -201,7 +201,19 @@

                       make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,

                                  &vpx_idct4x4_1_add_c, TX_4X4, 1)));

-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE

+#if CONFIG_VP9_HIGHBITDEPTH

+INSTANTIATE_TEST_CASE_P(

+    NEON, PartialIDctTest,

+    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,

+                                 &vpx_idct32x32_1_add_neon, TX_32X32, 1),

+                      make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c,

+                                 &vpx_idct16x16_1_add_neon, TX_16X16, 1),

+                      make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c,

+                                 &vpx_idct8x8_1_add_neon, TX_8X8, 1),

+                      make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,

+                                 &vpx_idct4x4_1_add_neon, TX_4X4, 1)));

+#else   // !CONFIG_VP9_HIGHBITDEPTH

 // 32x32_34_ 32x32_135_ are implemented using the 1024 version.

 INSTANTIATE_TEST_CASE_P(

     NEON, PartialIDctTest,

@@ -229,7 +241,8 @@

                                  &vpx_idct4x4_16_add_neon, TX_4X4, 16),

                       make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,

                                  &vpx_idct4x4_1_add_neon, TX_4X4, 1)));

-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

+#endif  // CONFIG_VP9_HIGHBITDEPTH

+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 // 32x32_135_ is implemented using the 1024 version.

--- a/vpx_dsp/arm/idct16x16_1_add_neon.asm

+++ b/vpx_dsp/arm/idct16x16_1_add_neon.asm

@@ -25,9 +25,8 @@

 |vpx_idct16x16_1_add_neon| PROC

     ldrsh            r0, [r0]

-    ; generate cospi_16_64 = 11585

-    mov              r12, #0x2d00

-    add              r12, #0x41

+    ; cospi_16_64 = 11585

+    movw             r12, #0x2d41

     ; out = dct_const_round_shift(input[0] * cospi_16_64)

     mul              r0, r0, r12               ; input[0] * cospi_16_64

--- a/vpx_dsp/arm/idct16x16_add_neon.asm

+++ b/vpx_dsp/arm/idct16x16_add_neon.asm

@@ -60,13 +60,11 @@

     vld2.s16        {q1,q2}, [r0]!

     vmov.s16        q15, q1

-    ; generate  cospi_28_64 = 3196

-    mov             r3, #0xc00

-    add             r3, #0x7c

+    ; cospi_28_64 = 3196

+    movw            r3, #0x0c7c

-    ; generate cospi_4_64  = 16069

-    mov             r12, #0x3e00

-    add             r12, #0xc5

+    ; cospi_4_64  = 16069

+    movw            r12, #0x3ec5

     ; transpose the input data

     TRANSPOSE8X8

@@ -76,13 +74,11 @@

     vdup.16         d1, r12                   ; duplicate cospi_4_64

     ; preloading to avoid stall

-    ; generate cospi_12_64 = 13623

-    mov             r3, #0x3500

-    add             r3, #0x37

+    ; cospi_12_64 = 13623

+    movw            r3, #0x3537

-    ; generate cospi_20_64 = 9102

-    mov             r12, #0x2300

-    add             r12, #0x8e

+    ; cospi_20_64 = 9102

+    movw            r12, #0x238e

     ; step2[4] * cospi_28_64

     vmull.s16       q2, d18, d0

@@ -112,13 +108,11 @@

     vqrshrn.s32     d15, q6, #14              ; >> 14

     ; preloading to avoid stall

-    ; generate cospi_16_64 = 11585

-    mov             r3, #0x2d00

-    add             r3, #0x41

+    ; cospi_16_64 = 11585

+    movw            r3, #0x2d41

-    ; generate cospi_24_64 = 6270

-    mov             r12, #0x1800

-    add             r12, #0x7e

+    ; cospi_24_64 = 6270

+    movw            r12, #0x187e

     ; step2[5] * cospi_12_64

     vmull.s16       q2, d26, d2

@@ -155,9 +149,8 @@

     vmull.s16       q0, d24, d30

     vmull.s16       q1, d25, d30

-    ; generate cospi_8_64 = 15137

-    mov             r3, #0x3b00

-    add             r3, #0x21

+    ; cospi_8_64 = 15137

+    movw            r3, #0x3b21

     vdup.16         d30, r12                  ; duplicate cospi_24_64

     vdup.16         d31, r3                   ; duplicate cospi_8_64

@@ -208,9 +201,8 @@

     vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];

     vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];

-    ; generate cospi_16_64 = 11585

-    mov             r3, #0x2d00

-    add             r3, #0x41

+    ; cospi_16_64 = 11585

+    movw            r3, #0x2d41

     ; stage 5

     vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];

@@ -307,13 +299,11 @@

     vld2.s16        {q0,q1}, [r0]!

     vmov.s16        q15, q0;

-    ; generate  cospi_30_64 = 1606

-    mov             r3, #0x0600

-    add             r3, #0x46

+    ; cospi_30_64 = 1606

+    movw            r3, #0x0646

-    ; generate cospi_2_64  = 16305

-    mov             r12, #0x3f00

-    add             r12, #0xb1

+    ; cospi_2_64  = 16305

+    movw            r12, #0x3fb1

     ; transpose the input data

     TRANSPOSE8X8

@@ -323,13 +313,11 @@

     vdup.16         d13, r12                  ; duplicate cospi_2_64

     ; preloading to avoid stall

-    ; generate cospi_14_64 = 12665

-    mov             r3, #0x3100

-    add             r3, #0x79

+    ; cospi_14_64 = 12665

+    movw            r3, #0x3179

-    ; generate cospi_18_64 = 10394

-    mov             r12, #0x2800

-    add             r12, #0x9a

+    ; cospi_18_64 = 10394

+    movw            r12, #0x289a

     ; step1[8] * cospi_30_64

     vmull.s16       q2, d16, d12

@@ -359,13 +347,11 @@

     vqrshrn.s32     d15, q4, #14              ; >> 14

     ; preloading to avoid stall

-    ; generate cospi_22_64 = 7723

-    mov             r3, #0x1e00

-    add             r3, #0x2b

+    ; cospi_22_64 = 7723

+    movw            r3, #0x1e2b

-    ; generate cospi_10_64 = 14449

-    mov             r12, #0x3800

-    add             r12, #0x71

+    ; cospi_10_64 = 14449

+    movw            r12, #0x3871

     ; step1[9] * cospi_14_64

     vmull.s16       q2, d24, d30

@@ -411,13 +397,11 @@

     vmlal.s16       q5, d27, d30

     ; preloading to avoid stall

-    ; generate cospi_6_64 = 15679

-    mov             r3, #0x3d00

-    add             r3, #0x3f

+    ; cospi_6_64 = 15679

+    movw            r3, #0x3d3f

-    ; generate cospi_26_64 = 4756

-    mov             r12, #0x1200

-    add             r12, #0x94

+    ; cospi_26_64 = 4756

+    movw            r12, #0x1294

     vdup.16         d30, r3                   ; duplicate cospi_6_64

     vdup.16         d31, r12                  ; duplicate cospi_26_64

@@ -466,13 +450,11 @@

     vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]

     ; stage 4

-    ; generate cospi_24_64 = 6270

-    mov             r3, #0x1800

-    add             r3, #0x7e

+    ; cospi_24_64 = 6270

+    movw            r3, #0x187e

-    ; generate cospi_8_64 = 15137

-    mov             r12, #0x3b00

-    add             r12, #0x21

+    ; cospi_8_64 = 15137

+    movw            r12, #0x3b21

     ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64

     vdup.16         d30, r12                  ; duplicate cospi_8_64

@@ -543,9 +525,8 @@

     vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];

     ; stage 6.

-    ; generate cospi_16_64 = 11585

-    mov             r12, #0x2d00

-    add             r12, #0x41

+    ; cospi_16_64 = 11585

+    movw            r12, #0x2d41

     vdup.16         d14, r12                  ; duplicate cospi_16_64

@@ -810,13 +791,11 @@

     vld2.s16        {q1,q2}, [r0]!

     vmov.s16        q15, q1

-    ; generate  cospi_28_64*2 = 6392

-    mov             r3, #0x1800

-    add             r3, #0xf8

+    ; cospi_28_64*2 = 6392

+    movw            r3, #0x18f8

-    ; generate cospi_4_64*2  = 32138

-    mov             r12, #0x7d00

-    add             r12, #0x8a

+    ; cospi_4_64*2  = 32138

+    movw            r12, #0x7d8a

     ; transpose the input data

     TRANSPOSE8X8

@@ -833,9 +812,8 @@

     vqrdmulh.s16    q4, q9, q0

     ; preloading to avoid stall

-    ; generate cospi_16_64*2 = 23170

-    mov             r3, #0x5a00

-    add             r3, #0x82

+    ; cospi_16_64*2 = 23170

+    movw            r3, #0x5a82

     ; dct_const_round_shift(step2[4] * cospi_4_64);

     vqrdmulh.s16    q7, q9, q1

@@ -843,9 +821,8 @@

     ; stage 4

     vdup.16         q1, r3                    ; cospi_16_64*2

-    ; generate cospi_16_64 = 11585

-    mov             r3, #0x2d00

-    add             r3, #0x41

+    ; cospi_16_64 = 11585

+    movw            r3, #0x2d41

     vdup.16         d4, r3;                   ; duplicate cospi_16_64

@@ -939,13 +916,11 @@

     vld2.s16        {q0,q1}, [r0]!

     vmov.s16        q15, q0;

-    ; generate 2*cospi_30_64 = 3212

-    mov             r3, #0xc00

-    add             r3, #0x8c

+    ; 2*cospi_30_64 = 3212

+    movw            r3, #0x0c8c

-    ; generate 2*cospi_2_64  = 32610

-    mov             r12, #0x7f00

-    add             r12, #0x62

+    ; 2*cospi_2_64  = 32610

+    movw            r12, #0x7f62

     ; transpose the input data

     TRANSPOSE8X8

@@ -962,15 +937,13 @@

     vqrdmulh.s16    q7, q8, q6

     ; preloading to avoid stall

-    ; generate 2*cospi_26_64 = 9512

-    mov             r12, #0x2500

-    add             r12, #0x28

+    ; 2*cospi_26_64 = 9512

+    movw            r12, #0x2528

     rsb             r12, #0

     vdup.16         q15, r12                  ; duplicate -2*cospi_26_64

-    ; generate 2*cospi_6_64 = 31358

-    mov             r3, #0x7a00

-    add             r3, #0x7e

+    ; 2*cospi_6_64 = 31358

+    movw            r3, #0x7a7e

     vdup.16         q14, r3                   ; duplicate 2*cospi_6_64

     ; dct_const_round_shift(- step1[12] * cospi_26_64)

@@ -980,14 +953,12 @@

     vqrdmulh.s16    q4, q9, q14

     ; stage 4

-    ; generate cospi_24_64 = 6270

-    mov             r3, #0x1800

-    add             r3, #0x7e

+    ; cospi_24_64 = 6270

+    movw            r3, #0x187e

     vdup.16         d31, r3                   ; duplicate cospi_24_64

-    ; generate cospi_8_64 = 15137

-    mov             r12, #0x3b00

-    add             r12, #0x21

+    ; cospi_8_64 = 15137

+    movw            r12, #0x3b21

     vdup.16         d30, r12                  ; duplicate cospi_8_64

     ; step1[14] * cospi_24_64

@@ -1052,9 +1023,8 @@

     vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];

     ; stage 6.

-    ; generate cospi_16_64 = 11585

-    mov             r12, #0x2d00

-    add             r12, #0x41

+    ; cospi_16_64 = 11585

+    movw            r12, #0x2d41

     vdup.16         d14, r12                  ; duplicate cospi_16_64

--- a/vpx_dsp/arm/idct32x32_1_add_neon.asm

+++ b/vpx_dsp/arm/idct32x32_1_add_neon.asm

@@ -77,9 +77,8 @@

     add              r3, r1, #16               ; r3 dest + 16 for second loop

     ldrsh            r0, [r0]

-    ; generate cospi_16_64 = 11585

-    mov              r12, #0x2d00

-    add              r12, #0x41

+    ; cospi_16_64 = 11585

+    movw             r12, #0x2d41

     ; out = dct_const_round_shift(input[0] * cospi_16_64)

     mul              r0, r0, r12               ; input[0] * cospi_16_64

--- a/vpx_dsp/arm/idct4x4_1_add_neon.asm

+++ b/vpx_dsp/arm/idct4x4_1_add_neon.asm

@@ -25,9 +25,8 @@

 |vpx_idct4x4_1_add_neon| PROC

     ldrsh            r0, [r0]

-    ; generate cospi_16_64 = 11585

-    mov              r12, #0x2d00

-    add              r12, #0x41

+    ; cospi_16_64 = 11585

+    movw             r12, #0x2d41

     ; out = dct_const_round_shift(input[0] * cospi_16_64)

     mul              r0, r0, r12               ; input[0] * cospi_16_64

--- a/vpx_dsp/arm/idct4x4_add_neon.asm

+++ b/vpx_dsp/arm/idct4x4_add_neon.asm

@@ -36,15 +36,12 @@

     vld1.s16        {q8,q9}, [r0]!

     ; generate scalar constants

-    ; cospi_8_64 = 15137 = 0x3b21

-    mov             r0, #0x3b00

-    add             r0, #0x21

-    ; cospi_16_64 = 11585 = 0x2d41

-    mov             r3, #0x2d00

-    add             r3, #0x41

-    ; cospi_24_64 = 6270 = 0x 187e

-    mov             r12, #0x1800

-    add             r12, #0x7e

+    ; cospi_8_64 = 15137

+    movw            r0, #0x3b21

+    ; cospi_16_64 = 11585

+    movw            r3, #0x2d41

+    ; cospi_24_64 = 6270

+    movw            r12, #0x187e

     ; transpose the input data

     ; 00 01 02 03   d16

--- a/vpx_dsp/arm/idct8x8_1_add_neon.asm

+++ b/vpx_dsp/arm/idct8x8_1_add_neon.asm

@@ -25,9 +25,8 @@

 |vpx_idct8x8_1_add_neon| PROC

     ldrsh            r0, [r0]

-    ; generate cospi_16_64 = 11585

-    mov              r12, #0x2d00

-    add              r12, #0x41

+    ; cospi_16_64 = 11585

+    movw             r12, #0x2d41

     ; out = dct_const_round_shift(input[0] * cospi_16_64)

     mul              r0, r0, r12               ; input[0] * cospi_16_64

--- a/vpx_dsp/arm/idct8x8_add_neon.asm

+++ b/vpx_dsp/arm/idct8x8_add_neon.asm

@@ -215,33 +215,26 @@

     ; transpose the input data

     TRANSPOSE8X8

-    ; generate  cospi_28_64 = 3196

-    mov             r3, #0x0c00

-    add             r3, #0x7c

+    ; cospi_28_64 = 3196

+    movw            r3, #0x0c7c

-    ; generate cospi_4_64  = 16069

-    mov             r4, #0x3e00

-    add             r4, #0xc5

+    ; cospi_4_64  = 16069

+    movw            r4, #0x3ec5

-    ; generate cospi_12_64 = 13623

-    mov             r5, #0x3500

-    add             r5, #0x37

+    ; cospi_12_64 = 13623

+    movw            r5, #0x3537

-    ; generate cospi_20_64 = 9102

-    mov             r6, #0x2300

-    add             r6, #0x8e

+    ; cospi_20_64 = 9102

+    movw            r6, #0x238e

-    ; generate cospi_16_64 = 11585

-    mov             r7, #0x2d00

-    add             r7, #0x41

+    ; cospi_16_64 = 11585

+    movw            r7, #0x2d41

-    ; generate cospi_24_64 = 6270

-    mov             r8, #0x1800

-    add             r8, #0x7e

+    ; cospi_24_64 = 6270

+    movw            r8, #0x187e

-    ; generate cospi_8_64 = 15137

-    mov             r9, #0x3b00

-    add             r9, #0x21

+    ; cospi_8_64 = 15137

+    movw            r9, #0x3b21

     ; First transform rows

     IDCT8x8_1D

@@ -327,33 +320,26 @@

     ; transpose the input data

     TRANSPOSE8X8

-    ; generate  cospi_28_64 = 3196

-    mov             r3, #0x0c00

-    add             r3, #0x7c

+    ; cospi_28_64 = 3196

+    movw            r3, #0x0c7c

-    ; generate cospi_4_64  = 16069

-    mov             r4, #0x3e00

-    add             r4, #0xc5

+    ; cospi_4_64  = 16069

+    movw            r4, #0x3ec5

-    ; generate cospi_12_64 = 13623

-    mov             r5, #0x3500

-    add             r5, #0x37

+    ; cospi_12_64 = 13623

+    movw            r5, #0x3537

-    ; generate cospi_20_64 = 9102

-    mov             r6, #0x2300

-    add             r6, #0x8e

+    ; cospi_20_64 = 9102

+    movw            r6, #0x238e

-    ; generate cospi_16_64 = 11585

-    mov             r7, #0x2d00

-    add             r7, #0x41

+    ; cospi_16_64 = 11585

+    movw            r7, #0x2d41

-    ; generate cospi_24_64 = 6270

-    mov             r8, #0x1800

-    add             r8, #0x7e

+    ; cospi_24_64 = 6270

+    movw            r8, #0x187e

-    ; generate cospi_8_64 = 15137

-    mov             r9, #0x3b00

-    add             r9, #0x21

+    ; cospi_8_64 = 15137

+    movw            r9, #0x3b21

     ; First transform rows

     ; stage 1

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -199,23 +199,15 @@

 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

 ifeq ($(HAVE_NEON_ASM),yes)

-DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)

 DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)

-DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)

 DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)

-DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)

 DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)

-DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)

 DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)

 else

 ifeq ($(HAVE_NEON),yes)

-DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c

 DSP_SRCS-yes  += arm/idct4x4_add_neon.c

-DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c

 DSP_SRCS-yes  += arm/idct8x8_add_neon.c

-DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c

 DSP_SRCS-yes  += arm/idct16x16_add_neon.c

-DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c

 DSP_SRCS-yes  += arm/idct32x32_add_neon.c

 endif  # HAVE_NEON

 endif  # HAVE_NEON_ASM

@@ -233,7 +225,20 @@

 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c

 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c

 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c

-endif  # CONFIG_VP9_HIGHBITDEPTH

+endif  # !CONFIG_VP9_HIGHBITDEPTH

+ifeq ($(HAVE_NEON_ASM),yes)

+DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)

+DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)

+DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)

+DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)

+else

+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c

+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c

+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c

+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c

+endif  # HAVE_NEON_ASM

 endif  # CONFIG_VP9

 # quantization

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -647,7 +647,7 @@

     specialize qw/vpx_idct4x4_16_add sse2/;

     add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct4x4_1_add sse2/;

+    specialize qw/vpx_idct4x4_1_add neon sse2/;

     add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

     specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";

@@ -656,7 +656,7 @@

     specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";

     add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct8x8_1_add sse2/;

+    specialize qw/vpx_idct8x8_1_add neon sse2/;

     add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

     specialize qw/vpx_idct16x16_256_add sse2/;

@@ -665,7 +665,7 @@

     specialize qw/vpx_idct16x16_10_add sse2/;

     add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct16x16_1_add sse2/;

+    specialize qw/vpx_idct16x16_1_add neon sse2/;

     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

     specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";

@@ -679,7 +679,7 @@

     specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";

     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct32x32_1_add sse2/;

+    specialize qw/vpx_idct32x32_1_add neon sse2/;

     add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

     specialize qw/vpx_highbd_idct4x4_16_add sse2/;

--

⑨