shithub: libvpx

Download patch

ref: ac00db794878c03c54c8b91577ebb41a9c7723ea
parent: fd918cf9a33494787c198ed32a3a484bf05a15d7
parent: 1e1caad16570a526b17c1d861cb8011f50ab6425
author: James Zern <jzern@google.com>
date: Thu Oct 6 15:37:18 EDT 2016

Merge changes from topic '8bit-hbd-idct'

* changes:
  vpx_dsp/idct*_neon.asm: simplify immediate loads
  enable idct*_1_add_neon in high-bitdepth builds

--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -201,7 +201,19 @@
                       make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
                                  &vpx_idct4x4_1_add_c, TX_4X4, 1)));
 
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    NEON, PartialIDctTest,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c,
+                                 &vpx_idct32x32_1_add_neon, TX_32X32, 1),
+                      make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c,
+                                 &vpx_idct16x16_1_add_neon, TX_16X16, 1),
+                      make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c,
+                                 &vpx_idct8x8_1_add_neon, TX_8X8, 1),
+                      make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
+                                 &vpx_idct4x4_1_add_neon, TX_4X4, 1)));
+#else   // !CONFIG_VP9_HIGHBITDEPTH
 // 32x32_34_ 32x32_135_ are implemented using the 1024 version.
 INSTANTIATE_TEST_CASE_P(
     NEON, PartialIDctTest,
@@ -229,7 +241,8 @@
                                  &vpx_idct4x4_16_add_neon, TX_4X4, 16),
                       make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c,
                                  &vpx_idct4x4_1_add_neon, TX_4X4, 1)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 // 32x32_135_ is implemented using the 1024 version.
--- a/vpx_dsp/arm/idct16x16_1_add_neon.asm
+++ b/vpx_dsp/arm/idct16x16_1_add_neon.asm
@@ -25,9 +25,8 @@
 |vpx_idct16x16_1_add_neon| PROC
     ldrsh            r0, [r0]
 
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
+    ; cospi_16_64 = 11585
+    movw             r12, #0x2d41
 
     ; out = dct_const_round_shift(input[0] * cospi_16_64)
     mul              r0, r0, r12               ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct16x16_add_neon.asm
+++ b/vpx_dsp/arm/idct16x16_add_neon.asm
@@ -60,13 +60,11 @@
     vld2.s16        {q1,q2}, [r0]!
     vmov.s16        q15, q1
 
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0xc00
-    add             r3, #0x7c
+    ; cospi_28_64 = 3196
+    movw            r3, #0x0c7c
 
-    ; generate cospi_4_64  = 16069
-    mov             r12, #0x3e00
-    add             r12, #0xc5
+    ; cospi_4_64  = 16069
+    movw            r12, #0x3ec5
 
     ; transpose the input data
     TRANSPOSE8X8
@@ -76,13 +74,11 @@
     vdup.16         d1, r12                   ; duplicate cospi_4_64
 
     ; preloading to avoid stall
-    ; generate cospi_12_64 = 13623
-    mov             r3, #0x3500
-    add             r3, #0x37
+    ; cospi_12_64 = 13623
+    movw            r3, #0x3537
 
-    ; generate cospi_20_64 = 9102
-    mov             r12, #0x2300
-    add             r12, #0x8e
+    ; cospi_20_64 = 9102
+    movw            r12, #0x238e
 
     ; step2[4] * cospi_28_64
     vmull.s16       q2, d18, d0
@@ -112,13 +108,11 @@
     vqrshrn.s32     d15, q6, #14              ; >> 14
 
     ; preloading to avoid stall
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
+    ; cospi_16_64 = 11585
+    movw            r3, #0x2d41
 
-    ; generate cospi_24_64 = 6270
-    mov             r12, #0x1800
-    add             r12, #0x7e
+    ; cospi_24_64 = 6270
+    movw            r12, #0x187e
 
     ; step2[5] * cospi_12_64
     vmull.s16       q2, d26, d2
@@ -155,9 +149,8 @@
     vmull.s16       q0, d24, d30
     vmull.s16       q1, d25, d30
 
-    ; generate cospi_8_64 = 15137
-    mov             r3, #0x3b00
-    add             r3, #0x21
+    ; cospi_8_64 = 15137
+    movw            r3, #0x3b21
 
     vdup.16         d30, r12                  ; duplicate cospi_24_64
     vdup.16         d31, r3                   ; duplicate cospi_8_64
@@ -208,9 +201,8 @@
     vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7];
     vadd.s16        q15, q6, q7               ; step2[7] = step1[6] + step1[7];
 
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
+    ; cospi_16_64 = 11585
+    movw            r3, #0x2d41
 
     ; stage 5
     vadd.s16        q0, q8, q11               ; step1[0] = step2[0] + step2[3];
@@ -307,13 +299,11 @@
     vld2.s16        {q0,q1}, [r0]!
     vmov.s16        q15, q0;
 
-    ; generate  cospi_30_64 = 1606
-    mov             r3, #0x0600
-    add             r3, #0x46
+    ; cospi_30_64 = 1606
+    movw            r3, #0x0646
 
-    ; generate cospi_2_64  = 16305
-    mov             r12, #0x3f00
-    add             r12, #0xb1
+    ; cospi_2_64  = 16305
+    movw            r12, #0x3fb1
 
     ; transpose the input data
     TRANSPOSE8X8
@@ -323,13 +313,11 @@
     vdup.16         d13, r12                  ; duplicate cospi_2_64
 
     ; preloading to avoid stall
-    ; generate cospi_14_64 = 12665
-    mov             r3, #0x3100
-    add             r3, #0x79
+    ; cospi_14_64 = 12665
+    movw            r3, #0x3179
 
-    ; generate cospi_18_64 = 10394
-    mov             r12, #0x2800
-    add             r12, #0x9a
+    ; cospi_18_64 = 10394
+    movw            r12, #0x289a
 
     ; step1[8] * cospi_30_64
     vmull.s16       q2, d16, d12
@@ -359,13 +347,11 @@
     vqrshrn.s32     d15, q4, #14              ; >> 14
 
     ; preloading to avoid stall
-    ; generate cospi_22_64 = 7723
-    mov             r3, #0x1e00
-    add             r3, #0x2b
+    ; cospi_22_64 = 7723
+    movw            r3, #0x1e2b
 
-    ; generate cospi_10_64 = 14449
-    mov             r12, #0x3800
-    add             r12, #0x71
+    ; cospi_10_64 = 14449
+    movw            r12, #0x3871
 
     ; step1[9] * cospi_14_64
     vmull.s16       q2, d24, d30
@@ -411,13 +397,11 @@
     vmlal.s16       q5, d27, d30
 
     ; preloading to avoid stall
-    ; generate cospi_6_64 = 15679
-    mov             r3, #0x3d00
-    add             r3, #0x3f
+    ; cospi_6_64 = 15679
+    movw            r3, #0x3d3f
 
-    ; generate cospi_26_64 = 4756
-    mov             r12, #0x1200
-    add             r12, #0x94
+    ; cospi_26_64 = 4756
+    movw            r12, #0x1294
 
     vdup.16         d30, r3                   ; duplicate cospi_6_64
     vdup.16         d31, r12                  ; duplicate cospi_26_64
@@ -466,13 +450,11 @@
     vadd.s16        q7, q6, q7                ; step1[15]=step2[14]+step2[15]
 
     ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
+    ; cospi_24_64 = 6270
+    movw            r3, #0x187e
 
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
+    ; cospi_8_64 = 15137
+    movw            r12, #0x3b21
 
     ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
     vdup.16         d30, r12                  ; duplicate cospi_8_64
@@ -543,9 +525,8 @@
     vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
 
     ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
+    ; cospi_16_64 = 11585
+    movw            r12, #0x2d41
 
     vdup.16         d14, r12                  ; duplicate cospi_16_64
 
@@ -810,13 +791,11 @@
     vld2.s16        {q1,q2}, [r0]!
     vmov.s16        q15, q1
 
-    ; generate  cospi_28_64*2 = 6392
-    mov             r3, #0x1800
-    add             r3, #0xf8
+    ; cospi_28_64*2 = 6392
+    movw            r3, #0x18f8
 
-    ; generate cospi_4_64*2  = 32138
-    mov             r12, #0x7d00
-    add             r12, #0x8a
+    ; cospi_4_64*2  = 32138
+    movw            r12, #0x7d8a
 
     ; transpose the input data
     TRANSPOSE8X8
@@ -833,9 +812,8 @@
     vqrdmulh.s16    q4, q9, q0
 
     ; preloading to avoid stall
-    ; generate cospi_16_64*2 = 23170
-    mov             r3, #0x5a00
-    add             r3, #0x82
+    ; cospi_16_64*2 = 23170
+    movw            r3, #0x5a82
 
     ; dct_const_round_shift(step2[4] * cospi_4_64);
     vqrdmulh.s16    q7, q9, q1
@@ -843,9 +821,8 @@
     ; stage 4
     vdup.16         q1, r3                    ; cospi_16_64*2
 
-    ; generate cospi_16_64 = 11585
-    mov             r3, #0x2d00
-    add             r3, #0x41
+    ; cospi_16_64 = 11585
+    movw            r3, #0x2d41
 
     vdup.16         d4, r3;                   ; duplicate cospi_16_64
 
@@ -939,13 +916,11 @@
     vld2.s16        {q0,q1}, [r0]!
     vmov.s16        q15, q0;
 
-    ; generate 2*cospi_30_64 = 3212
-    mov             r3, #0xc00
-    add             r3, #0x8c
+    ; 2*cospi_30_64 = 3212
+    movw            r3, #0x0c8c
 
-    ; generate 2*cospi_2_64  = 32610
-    mov             r12, #0x7f00
-    add             r12, #0x62
+    ; 2*cospi_2_64  = 32610
+    movw            r12, #0x7f62
 
     ; transpose the input data
     TRANSPOSE8X8
@@ -962,15 +937,13 @@
     vqrdmulh.s16    q7, q8, q6
 
     ; preloading to avoid stall
-    ; generate 2*cospi_26_64 = 9512
-    mov             r12, #0x2500
-    add             r12, #0x28
+    ; 2*cospi_26_64 = 9512
+    movw            r12, #0x2528
     rsb             r12, #0
     vdup.16         q15, r12                  ; duplicate -2*cospi_26_64
 
-    ; generate 2*cospi_6_64 = 31358
-    mov             r3, #0x7a00
-    add             r3, #0x7e
+    ; 2*cospi_6_64 = 31358
+    movw            r3, #0x7a7e
     vdup.16         q14, r3                   ; duplicate 2*cospi_6_64
 
     ; dct_const_round_shift(- step1[12] * cospi_26_64)
@@ -980,14 +953,12 @@
     vqrdmulh.s16    q4, q9, q14
 
     ; stage 4
-    ; generate cospi_24_64 = 6270
-    mov             r3, #0x1800
-    add             r3, #0x7e
+    ; cospi_24_64 = 6270
+    movw            r3, #0x187e
     vdup.16         d31, r3                   ; duplicate cospi_24_64
 
-    ; generate cospi_8_64 = 15137
-    mov             r12, #0x3b00
-    add             r12, #0x21
+    ; cospi_8_64 = 15137
+    movw            r12, #0x3b21
     vdup.16         d30, r12                  ; duplicate cospi_8_64
 
     ; step1[14] * cospi_24_64
@@ -1052,9 +1023,8 @@
     vadd.s16        q15, q7, q4               ; step1[15] =step2[12]+step2[15];
 
     ; stage 6.
-    ; generate cospi_16_64 = 11585
-    mov             r12, #0x2d00
-    add             r12, #0x41
+    ; cospi_16_64 = 11585
+    movw            r12, #0x2d41
 
     vdup.16         d14, r12                  ; duplicate cospi_16_64
 
--- a/vpx_dsp/arm/idct32x32_1_add_neon.asm
+++ b/vpx_dsp/arm/idct32x32_1_add_neon.asm
@@ -77,9 +77,8 @@
     add              r3, r1, #16               ; r3 dest + 16 for second loop
     ldrsh            r0, [r0]
 
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
+    ; cospi_16_64 = 11585
+    movw             r12, #0x2d41
 
     ; out = dct_const_round_shift(input[0] * cospi_16_64)
     mul              r0, r0, r12               ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct4x4_1_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.asm
@@ -25,9 +25,8 @@
 |vpx_idct4x4_1_add_neon| PROC
     ldrsh            r0, [r0]
 
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
+    ; cospi_16_64 = 11585
+    movw             r12, #0x2d41
 
     ; out = dct_const_round_shift(input[0] * cospi_16_64)
     mul              r0, r0, r12               ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -36,15 +36,12 @@
     vld1.s16        {q8,q9}, [r0]!
 
     ; generate scalar constants
-    ; cospi_8_64 = 15137 = 0x3b21
-    mov             r0, #0x3b00
-    add             r0, #0x21
-    ; cospi_16_64 = 11585 = 0x2d41
-    mov             r3, #0x2d00
-    add             r3, #0x41
-    ; cospi_24_64 = 6270 = 0x 187e
-    mov             r12, #0x1800
-    add             r12, #0x7e
+    ; cospi_8_64 = 15137
+    movw            r0, #0x3b21
+    ; cospi_16_64 = 11585
+    movw            r3, #0x2d41
+    ; cospi_24_64 = 6270
+    movw            r12, #0x187e
 
     ; transpose the input data
     ; 00 01 02 03   d16
--- a/vpx_dsp/arm/idct8x8_1_add_neon.asm
+++ b/vpx_dsp/arm/idct8x8_1_add_neon.asm
@@ -25,9 +25,8 @@
 |vpx_idct8x8_1_add_neon| PROC
     ldrsh            r0, [r0]
 
-    ; generate cospi_16_64 = 11585
-    mov              r12, #0x2d00
-    add              r12, #0x41
+    ; cospi_16_64 = 11585
+    movw             r12, #0x2d41
 
     ; out = dct_const_round_shift(input[0] * cospi_16_64)
     mul              r0, r0, r12               ; input[0] * cospi_16_64
--- a/vpx_dsp/arm/idct8x8_add_neon.asm
+++ b/vpx_dsp/arm/idct8x8_add_neon.asm
@@ -215,33 +215,26 @@
     ; transpose the input data
     TRANSPOSE8X8
 
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
+    ; cospi_28_64 = 3196
+    movw            r3, #0x0c7c
 
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
+    ; cospi_4_64  = 16069
+    movw            r4, #0x3ec5
 
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
+    ; cospi_12_64 = 13623
+    movw            r5, #0x3537
 
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
+    ; cospi_20_64 = 9102
+    movw            r6, #0x238e
 
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
+    ; cospi_16_64 = 11585
+    movw            r7, #0x2d41
 
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
+    ; cospi_24_64 = 6270
+    movw            r8, #0x187e
 
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
+    ; cospi_8_64 = 15137
+    movw            r9, #0x3b21
 
     ; First transform rows
     IDCT8x8_1D
@@ -327,33 +320,26 @@
     ; transpose the input data
     TRANSPOSE8X8
 
-    ; generate  cospi_28_64 = 3196
-    mov             r3, #0x0c00
-    add             r3, #0x7c
+    ; cospi_28_64 = 3196
+    movw            r3, #0x0c7c
 
-    ; generate cospi_4_64  = 16069
-    mov             r4, #0x3e00
-    add             r4, #0xc5
+    ; cospi_4_64  = 16069
+    movw            r4, #0x3ec5
 
-    ; generate cospi_12_64 = 13623
-    mov             r5, #0x3500
-    add             r5, #0x37
+    ; cospi_12_64 = 13623
+    movw            r5, #0x3537
 
-    ; generate cospi_20_64 = 9102
-    mov             r6, #0x2300
-    add             r6, #0x8e
+    ; cospi_20_64 = 9102
+    movw            r6, #0x238e
 
-    ; generate cospi_16_64 = 11585
-    mov             r7, #0x2d00
-    add             r7, #0x41
+    ; cospi_16_64 = 11585
+    movw            r7, #0x2d41
 
-    ; generate cospi_24_64 = 6270
-    mov             r8, #0x1800
-    add             r8, #0x7e
+    ; cospi_24_64 = 6270
+    movw            r8, #0x187e
 
-    ; generate cospi_8_64 = 15137
-    mov             r9, #0x3b00
-    add             r9, #0x21
+    ; cospi_8_64 = 15137
+    movw            r9, #0x3b21
 
     ; First transform rows
     ; stage 1
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -199,23 +199,15 @@
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 ifeq ($(HAVE_NEON_ASM),yes)
-DSP_SRCS-yes  += arm/idct4x4_1_add_neon$(ASM)
 DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct8x8_1_add_neon$(ASM)
 DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct16x16_1_add_neon$(ASM)
 DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)
-DSP_SRCS-yes  += arm/idct32x32_1_add_neon$(ASM)
 DSP_SRCS-yes  += arm/idct32x32_add_neon$(ASM)
 else
 ifeq ($(HAVE_NEON),yes)
-DSP_SRCS-yes  += arm/idct4x4_1_add_neon.c
 DSP_SRCS-yes  += arm/idct4x4_add_neon.c
-DSP_SRCS-yes  += arm/idct8x8_1_add_neon.c
 DSP_SRCS-yes  += arm/idct8x8_add_neon.c
-DSP_SRCS-yes  += arm/idct16x16_1_add_neon.c
 DSP_SRCS-yes  += arm/idct16x16_add_neon.c
-DSP_SRCS-yes  += arm/idct32x32_1_add_neon.c
 DSP_SRCS-yes  += arm/idct32x32_add_neon.c
 endif  # HAVE_NEON
 endif  # HAVE_NEON_ASM
@@ -233,7 +225,20 @@
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
-endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # !CONFIG_VP9_HIGHBITDEPTH
+
+ifeq ($(HAVE_NEON_ASM),yes)
+DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
+DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM)
+else
+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
+endif  # HAVE_NEON_ASM
+
 endif  # CONFIG_VP9
 
 # quantization
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -647,7 +647,7 @@
     specialize qw/vpx_idct4x4_16_add sse2/;
 
     add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct4x4_1_add sse2/;
+    specialize qw/vpx_idct4x4_1_add neon sse2/;
 
     add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";
@@ -656,7 +656,7 @@
     specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct8x8_1_add sse2/;
+    specialize qw/vpx_idct8x8_1_add neon sse2/;
 
     add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct16x16_256_add sse2/;
@@ -665,7 +665,7 @@
     specialize qw/vpx_idct16x16_10_add sse2/;
 
     add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct16x16_1_add sse2/;
+    specialize qw/vpx_idct16x16_1_add neon sse2/;
 
     add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
     specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64";
@@ -679,7 +679,7 @@
     specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-    specialize qw/vpx_idct32x32_1_add sse2/;
+    specialize qw/vpx_idct32x32_1_add neon sse2/;
 
     add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
     specialize qw/vpx_highbd_idct4x4_16_add sse2/;