shithub: dav1d

ref: eaedb95de65fb5612297b148a306398240e8ad6f
parent: ff3054feb26ed4476f632965d6a76b6af1d4f31c
author: Martin Storsjö <martin@martin.st>
date: Sat Apr 11 09:27:37 EDT 2020

arm64: itx: Add NEON implementation of itx for 10 bpc

Add an element size specifier to the existing individual transform
functions for 8 bpc, renaming them to e.g. inv_dct_8h_x8_neon to
clarify that they operate on input vectors of .8h elements, and export
the symbols so that the 10 bpc case can call them from a different
object file. The new itx16.S uses the same convention, e.g.
inv_dct_4s_x8_neon.

Compile the existing itx.S regardless of whether 8 bpc support is
enabled. Builds with 8 bpc support disabled then include the unused
8 bpc frontend functions, but this is hopefully tolerable compared to
splitting the file into a shareable file for the transforms and a
separate one for the frontends.

This only implements the 10 bpc case, as that case can use transforms
operating on 16 bit coefficients in the second pass.
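
For reference, the two passes fit together as in this minimal excerpt
from the 8x8 frontend below (eob handling and transposes omitted): the
first pass runs a 4s transform on 32 bit coefficients, the result is
narrowed to 16 bit with rounding, and the second pass reuses the
exported 8 bpc 8h transform:

        blr             x4                    // e.g. inv_dct_4s_x8_neon
        sqrshrn         v16.4h,  v16.4s,  #1  // narrow 32 -> 16 bit
        sqrshrn2        v16.8h,  v20.4s,  #1
        blr             x5                    // e.g. X(inv_dct_8h_x8_neon)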

Relative speedup vs C for a few functions:

                                     Cortex A53    A72    A73
inv_txfm_add_4x4_dct_dct_0_10bpc_neon:     4.14   4.06   4.49
inv_txfm_add_4x4_dct_dct_1_10bpc_neon:     6.51   6.49   6.42
inv_txfm_add_8x8_dct_dct_0_10bpc_neon:     5.02   4.63   6.23
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:     8.54   7.13  11.96
inv_txfm_add_16x16_dct_dct_0_10bpc_neon:   5.52   6.60   8.03
inv_txfm_add_16x16_dct_dct_1_10bpc_neon:  11.27   9.62  12.22
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:   9.60   6.97   8.59
inv_txfm_add_32x32_dct_dct_0_10bpc_neon:   2.60   3.48   3.19
inv_txfm_add_32x32_dct_dct_1_10bpc_neon:  14.65  12.64  16.86
inv_txfm_add_32x32_dct_dct_2_10bpc_neon:  11.57   8.80  12.68
inv_txfm_add_32x32_dct_dct_3_10bpc_neon:   8.79   8.00   9.21
inv_txfm_add_32x32_dct_dct_4_10bpc_neon:   7.58   6.21   7.80
inv_txfm_add_64x64_dct_dct_0_10bpc_neon:   2.41   2.85   2.75
inv_txfm_add_64x64_dct_dct_1_10bpc_neon:  12.91  10.27  12.24
inv_txfm_add_64x64_dct_dct_2_10bpc_neon:  10.96   7.97  10.31
inv_txfm_add_64x64_dct_dct_3_10bpc_neon:   8.95   7.42   9.55
inv_txfm_add_64x64_dct_dct_4_10bpc_neon:   7.97   6.12   7.82

--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -449,7 +449,7 @@
         sqsub           \r2\sz,  v3\sz,   v7\sz
 .endm
 
-function inv_dct_4x4_neon
+function inv_dct_4h_x4_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.4h}, [x16]
         idct_4          v16, v17, v18, v19, .4h
@@ -456,7 +456,7 @@
         ret
 endfunc
 
-function inv_dct_8x4_neon
+function inv_dct_8h_x4_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.4h}, [x16]
         idct_4          v16, v17, v18, v19, .8h
@@ -489,12 +489,12 @@
         rshrn           \o3\().4h, \o3\().4s, #12
 .endm
 
-function inv_adst_4x4_neon
+function inv_adst_4h_x4_neon, export=1
         iadst_4x4       v16, v17, v18, v19
         ret
 endfunc
 
-function inv_flipadst_4x4_neon
+function inv_flipadst_4h_x4_neon, export=1
         iadst_4x4       v19, v18, v17, v16
         ret
 endfunc
@@ -555,17 +555,17 @@
         rshrn2          \o3\().8h, v5.4s,  #12
 .endm
 
-function inv_adst_8x4_neon
+function inv_adst_8h_x4_neon, export=1
         iadst_8x4       v16, v17, v18, v19
         ret
 endfunc
 
-function inv_flipadst_8x4_neon
+function inv_flipadst_8h_x4_neon, export=1
         iadst_8x4       v19, v18, v17, v16
         ret
 endfunc
 
-function inv_identity_4x4_neon
+function inv_identity_4h_x4_neon, export=1
         mov             w16, #(5793-4096)*8
         dup             v0.4h,   w16
         sqrdmulh        v4.4h,   v16.4h,  v0.h[0]
@@ -579,7 +579,7 @@
         ret
 endfunc
 
-function inv_identity_8x4_neon
+function inv_identity_8h_x4_neon, export=1
         mov             w16, #(5793-4096)*8
         dup             v0.4h,   w16
         sqrdmulh        v4.8h,   v16.8h,  v0.h[0]
@@ -684,8 +684,8 @@
         b               L(itx_4x4_end)
 1:
 .endif
-        adr             x4,  inv_\txfm1\()_4x4_neon
-        adr             x5,  inv_\txfm2\()_4x4_neon
+        adr             x4,  inv_\txfm1\()_4h_x4_neon
+        adr             x5,  inv_\txfm2\()_4h_x4_neon
         b               inv_txfm_add_4x4_neon
 endfunc
 .endm
@@ -741,7 +741,7 @@
         mov             \r6\szb, v6\szb         // out6
 .endm
 
-function inv_dct_8x8_neon
+function inv_dct_8h_x8_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h}, [x16]
         idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
@@ -748,7 +748,7 @@
         ret
 endfunc
 
-function inv_dct_4x8_neon
+function inv_dct_4h_x8_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h}, [x16]
         idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
@@ -822,27 +822,27 @@
         sqneg           \o5\()\sz, v3\sz     // out5
 .endm
 
-function inv_adst_8x8_neon
+function inv_adst_8h_x8_neon, export=1
         iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .8h
         ret
 endfunc
 
-function inv_flipadst_8x8_neon
+function inv_flipadst_8h_x8_neon, export=1
         iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .8h
         ret
 endfunc
 
-function inv_adst_4x8_neon
+function inv_adst_4h_x8_neon, export=1
         iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .4h
         ret
 endfunc
 
-function inv_flipadst_4x8_neon
+function inv_flipadst_4h_x8_neon, export=1
         iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .4h
         ret
 endfunc
 
-function inv_identity_8x8_neon
+function inv_identity_8h_x8_neon, export=1
         sqshl           v16.8h,  v16.8h,  #1
         sqshl           v17.8h,  v17.8h,  #1
         sqshl           v18.8h,  v18.8h,  #1
@@ -854,7 +854,7 @@
         ret
 endfunc
 
-function inv_identity_4x8_neon
+function inv_identity_4h_x8_neon, export=1
         sqshl           v16.4h,  v16.4h,  #1
         sqshl           v17.4h,  v17.4h,  #1
         sqshl           v18.4h,  v18.4h,  #1
@@ -911,11 +911,11 @@
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         8,   8,   1
 .endif
-        adr             x5,  inv_\txfm2\()_8x8_neon
+        adr             x5,  inv_\txfm2\()_8h_x8_neon
 .ifc \txfm1, identity
         b               inv_txfm_identity_add_8x8_neon
 .else
-        adr             x4,  inv_\txfm1\()_8x8_neon
+        adr             x4,  inv_\txfm1\()_8h_x8_neon
         b               inv_txfm_add_8x8_neon
 .endif
 endfunc
@@ -998,8 +998,8 @@
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  0
 .endif
-        adr             x4,  inv_\txfm1\()_\h\()x\w\()_neon
-        adr             x5,  inv_\txfm2\()_\w\()x\h\()_neon
+        adr             x4,  inv_\txfm1\()_\h\()h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_\w\()h_x\h\()_neon
         b               inv_txfm_add_\w\()x\h\()_neon
 endfunc
 .endm
@@ -1110,7 +1110,7 @@
         mov             v22\szb, v3\szb
 .endm
 
-function inv_dct_8x16_neon
+function inv_dct_8h_x16_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h, v1.8h}, [x16]
         idct_16         .8h, .16b
@@ -1117,7 +1117,7 @@
         ret
 endfunc
 
-function inv_dct_4x16_neon
+function inv_dct_4h_x16_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h, v1.8h}, [x16]
         idct_16         .4h, .8b
@@ -1294,27 +1294,27 @@
         sqneg           \o9\sz,  v7\sz // out9
 .endm
 
-function inv_adst_8x16_neon
+function inv_adst_8h_x16_neon, export=1
         iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
         ret
 endfunc
 
-function inv_flipadst_8x16_neon
+function inv_flipadst_8h_x16_neon, export=1
         iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
         ret
 endfunc
 
-function inv_adst_4x16_neon
+function inv_adst_4h_x16_neon, export=1
         iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
         ret
 endfunc
 
-function inv_flipadst_4x16_neon
+function inv_flipadst_4h_x16_neon, export=1
         iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
         ret
 endfunc
 
-function inv_identity_8x16_neon
+function inv_identity_8h_x16_neon, export=1
         mov             w16, #2*(5793-4096)*8
         dup             v0.4h,   w16
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1325,7 +1325,7 @@
         ret
 endfunc
 
-function inv_identity_4x16_neon
+function inv_identity_4h_x16_neon, export=1
         mov             w16, #2*(5793-4096)*8
         dup             v0.4h,   w16
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1465,9 +1465,9 @@
         adr             x9,  inv_txfm_horz_identity_16x8_neon
 .else
         adr             x9,  inv_txfm_horz_16x8_neon
-        adr             x4,  inv_\txfm1\()_8x16_neon
+        adr             x4,  inv_\txfm1\()_8h_x16_neon
 .endif
-        adr             x5,  inv_\txfm2\()_8x16_neon
+        adr             x5,  inv_\txfm2\()_8h_x16_neon
         mov             x13, #\eob_half
         b               inv_txfm_add_16x16_neon
 endfunc
@@ -1634,12 +1634,12 @@
         idct_dc         \w,  \h,  1
 .endif
 .if \w == 4
-        adr             x4,  inv_\txfm1\()_8x\w\()_neon
-        adr             x5,  inv_\txfm2\()_4x\h\()_neon
+        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_4h_x\h\()_neon
         mov             w13, #\eob_half
 .else
-        adr             x4,  inv_\txfm1\()_4x\w\()_neon
-        adr             x5,  inv_\txfm2\()_8x\h\()_neon
+        adr             x4,  inv_\txfm1\()_4h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
 .endif
 .ifc \txfm1, identity
         b               inv_txfm_identity_add_\w\()x\h\()_neon
@@ -1816,8 +1816,8 @@
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  1
 .endif
-        adr             x4,  inv_\txfm1\()_8x\w\()_neon
-        adr             x5,  inv_\txfm2\()_8x\h\()_neon
+        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
 .if \w == 8
         mov             x13, #\eob_half
 .endif
@@ -1851,7 +1851,7 @@
 def_fns_816 8, 16
 def_fns_816 16, 8
 
-function inv_dct32_odd_8x16_neon
+function inv_dct32_odd_8h_x16_neon, export=1
         movrel          x16, idct_coeffs, 2*16
         ld1             {v0.8h, v1.8h}, [x16]
         sub             x16, x16, #2*16
@@ -2029,7 +2029,7 @@
         scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
         scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
 .endif
-        bl              inv_dct_8x16_neon
+        bl              inv_dct_8h_x16_neon
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
         transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
 
@@ -2059,7 +2059,7 @@
         scale_input     .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
         scale_input     .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
 .endif
-        bl              inv_dct32_odd_8x16_neon
+        bl              inv_dct32_odd_8h_x16_neon
         transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
         transpose_8x8h  v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
 .macro store2 r0, r1, shift
@@ -2105,7 +2105,7 @@
 .endr
         sub             x7,  x7,  x8, lsl #4
 
-        bl              inv_dct_8x16_neon
+        bl              inv_dct_8h_x16_neon
 
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         st1             {v\i\().8h}, [x7], x8
@@ -2118,7 +2118,7 @@
 .endr
         sub             x7,  x7,  x8, lsl #4
         sub             x7,  x7,  x8, lsr #1
-        bl              inv_dct32_odd_8x16_neon
+        bl              inv_dct32_odd_8h_x16_neon
 
         neg             x9,  x8
         mov             x10, x6
@@ -2384,7 +2384,7 @@
         sub             sp,  sp,  #1024
         movrel          x13, eob_16x32
         ldrh            w12, [x13], #2
-        adr             x4,  inv_dct_8x16_neon
+        adr             x4,  inv_dct_8h_x16_neon
 
 .irp i, 0, 8, 16, 24
         add             x6,  sp,  #(\i*16*2)
@@ -2432,7 +2432,7 @@
         mov             x15, x30
         sub             sp,  sp,  #1024
 
-        adr             x5,  inv_dct_8x16_neon
+        adr             x5,  inv_dct_8h_x16_neon
 
 .irp i, 0, 8
         add             x6,  sp,  #(\i*32*2)
@@ -2493,7 +2493,7 @@
         sub             w9,  w9,  #8
         add             x2,  x2,  #2*8
 
-        bl              inv_dct_8x8_neon
+        bl              inv_dct_8h_x8_neon
 
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         srshr           v\i\().8h,  v\i\().8h,  #2
@@ -2550,7 +2550,7 @@
 .endr
         add             w9,  w9,  #8
 
-        bl              inv_dct_8x8_neon
+        bl              inv_dct_8h_x8_neon
 
         cmp             w9,  #32
 
@@ -2755,7 +2755,7 @@
 .endm
 
 .macro def_dct64_func suffix, clear=0, scale=0
-function inv_txfm_dct\suffix\()_8x64_neon
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
         mov             x14, x30
         mov             x6,  sp
         lsl             x8,  x8,  #2
@@ -2768,7 +2768,7 @@
         add             x7,  x7,  x8, lsr #1
         scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
 
-        bl              inv_dct_8x16_neon
+        bl              inv_dct_8h_x16_neon
 
         store16         x6
 
@@ -2781,7 +2781,7 @@
         sub             x7,  x7,  x8, lsr #1
         scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
 
-        bl              inv_dct32_odd_8x16_neon
+        bl              inv_dct32_odd_8h_x16_neon
 
         add             x10, x6,  #16*15
         sub             x6,  x6,  #16*16
@@ -3043,7 +3043,7 @@
         add             x7,  x2,  #(\i*2)
         mov             x8,  #32*2
         mov             x12, #-2 // shift
-        bl              inv_txfm_dct_clear_8x64_neon
+        bl              inv_txfm_dct_clear_8h_x64_neon
         add             x6,  x5,  #(\i*64*2)
         bl              inv_txfm_horz_dct_64x8_neon
 .if \i < 24
@@ -3068,7 +3068,7 @@
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x7,  x5,  #(\i*2)
         mov             x8,  #64*2
-        bl              inv_txfm_dct_8x64_neon
+        bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
@@ -3097,7 +3097,7 @@
         add             x7,  x2,  #(\i*2)
         mov             x8,  #32*2
         mov             x12, #-1 // shift
-        bl              inv_txfm_dct_clear_scale_8x64_neon
+        bl              inv_txfm_dct_clear_scale_8h_x64_neon
         add             x6,  x5,  #(\i*64*2)
         bl              inv_txfm_horz_dct_64x8_neon
 .if \i < 24
@@ -3171,7 +3171,7 @@
 .irp i, 0, 8, 16, 24
         add             x7,  x5,  #(\i*2)
         mov             x8,  #32*2
-        bl              inv_txfm_dct_8x64_neon
+        bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
@@ -3200,7 +3200,7 @@
         add             x7,  x2,  #(\i*2)
         mov             x8,  #16*2
         mov             x12, #-2 // shift
-        bl              inv_txfm_dct_clear_8x64_neon
+        bl              inv_txfm_dct_clear_8h_x64_neon
         add             x6,  x4,  #(\i*64*2)
         bl              inv_txfm_horz_dct_64x8_neon
 .if \i < 8
@@ -3222,7 +3222,7 @@
         b.gt            2b
 
 3:
-        adr             x5,  inv_dct_8x16_neon
+        adr             x5,  inv_dct_8h_x16_neon
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x6,  x0,  #(\i)
         add             x7,  x4,  #(\i*2)
@@ -3245,7 +3245,7 @@
         movrel          x13, eob_16x32
         ldrh            w12, [x13], #2
 
-        adr             x4,  inv_dct_8x16_neon
+        adr             x4,  inv_dct_8h_x16_neon
 .irp i, 0, 8, 16, 24
         add             x6,  x5,  #(\i*16*2)
 .if \i > 0
@@ -3276,7 +3276,7 @@
 .irp i, 0, 8
         add             x7,  x5,  #(\i*2)
         mov             x8,  #16*2
-        bl              inv_txfm_dct_8x64_neon
+        bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
--- /dev/null
+++ b/src/arm/64/itx16.S
@@ -1,0 +1,3514 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+//                int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3  external parameters
+// x4     function pointer to first transform
+// x5     function pointer to second transform
+// x6     output parameter for helper function
+// x7     input parameter for helper function
+// x8     input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13    pointer to list of eob thresholds
+// x14    return pointer for helper function
+// x15    return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1   multiplication coefficients
+// v2-v7   scratch registers
+// v8-v15  unused
+// v16-v31 inputs/outputs of transforms
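+
+// Note: this file only implements the 10 bpc case; output pixels are
+// clamped to the constant 0x3ff (the 10 bit maximum) rather than to a
+// limit derived from the bitdepth_max argument.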
+
+const idct_coeffs, align=4
+        // idct4
+        .int            2896, 2896*8*(1<<16), 1567, 3784
+        // idct8
+        .int            799, 4017, 3406, 2276
+        // idct16
+        .int            401, 4076, 3166, 2598
+        .int            1931, 3612, 3920, 1189
+        // idct32
+        .int            201, 4091, 3035, 2751
+        .int            1751, 3703, 3857, 1380
+        .int            995, 3973, 3513, 2106
+        .int            2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4
+        .int            101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16)
+        .int            1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+        .int            4076, 401, 4017, 799
+
+        .int            4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+        .int            3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+        .int            -3166, -2598, -799, -4017
+
+        .int            501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+        .int            2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+        .int            3612, 1931, 2276, 3406
+
+        .int            4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+        .int            3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+        .int            -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4
+        .int            1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4
+        .int            4076, 401, 3612, 1931
+        .int            2598, 3166, 1189, 3920
+        // idct_coeffs
+        .int            2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4
+        .int            4091, 201, 3973, 995
+        .int            3703, 1751, 3290, 2440
+        .int            2751, 3035, 2106, 3513
+        .int            1380, 3857, 601, 4052
+endconst
+
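+// mul_mla computes d = s0*c0 + s1*c1 and mul_mls computes
+// d = s0*c0 - s1*c1, on 32 bit lanes; the callers then round-shift the
+// products right by 12 (srshr #12).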
+.macro mul_mla d, s0, s1, c0, c1
+        mul             \d\().4s, \s0\().4s, \c0
+        mla             \d\().4s, \s1\().4s, \c1
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1
+        mul             \d\().4s, \s0\().4s, \c0
+        mls             \d\().4s, \s1\().4s, \c1
+.endm
+
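+// Multiply each of r0-r3 (and optionally r4-r7) by the constant c,
+// using the saturating rounding doubling high-half multiply (sqrdmulh).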
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+        sqrdmulh        \r0\sz,  \r0\sz,  \c
+        sqrdmulh        \r1\sz,  \r1\sz,  \c
+        sqrdmulh        \r2\sz,  \r2\sz,  \c
+        sqrdmulh        \r3\sz,  \r3\sz,  \c
+.ifnb \r4
+        sqrdmulh        \r4\sz,  \r4\sz,  \c
+        sqrdmulh        \r5\sz,  \r5\sz,  \c
+        sqrdmulh        \r6\sz,  \r6\sz,  \c
+        sqrdmulh        \r7\sz,  \r7\sz,  \c
+.endif
+.endm
+
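+// One stage of a software-pipelined add-and-store sequence: optionally
+// load a row of destination pixels, round-shift a coefficient row, add
+// it to a previously loaded row, clamp to [0, 0x3ff] (v6/v7) and store.
+// The load_add_store_* macros below interleave these stages across rows.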
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
+.ifnb \load
+        ld1             {\load},  [\src], x1
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #\shiftbits
+.endif
+.ifnb \addsrc
+        sqadd           \adddst, \adddst, \addsrc
+.endif
+.ifnb \max
+        smax            \max,  \max,  v6.8h
+.endif
+.ifnb \min
+        smin            \min,  \min,  v7.8h
+.endif
+.ifnb \store
+        st1             {\store},  [\dst], x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src
+        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src
+        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
+        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+        load_add_store  v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+        load_add_store  v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+        load_add_store  v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+        load_add_store  v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+        load_add_store  v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+        load_add_store  v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
+        load_add_store  v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+        load_add_store  v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
+        load_add_store       ,       , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
+        load_add_store       ,       , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
+        load_add_store       ,       ,      ,       , v31.8h, v30.8h, v29.8h, \dst, \src
+        load_add_store       ,       ,      ,       ,       , v31.8h, v30.8h, \dst, \src
+        load_add_store       ,       ,      ,       ,       ,       , v31.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
+        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
+        load_add_store       ,       , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
+        load_add_store       ,       , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       , v23.8h, v22.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       ,       , v23.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
+        load_add_store       ,       , v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
+        load_add_store       ,       , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       , v19.8h, v18.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       ,       , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
+.ifnb \load
+        ld1             {\load}[0],  [\src], x1
+.endif
+.ifnb \inssrc
+        ins             \insdst\().d[1],   \inssrc\().d[0]
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #4
+.endif
+.ifnb \load
+        ld1             {\load}[1],  [\src], x1
+.endif
+.ifnb \addsrc
+        sqadd           \adddst, \adddst, \addsrc
+.endif
+.ifnb \store
+        st1             {\store}[0],  [\dst], x1
+.endif
+.ifnb \max
+        smax            \max,  \max,  v6.8h
+.endif
+.ifnb \min
+        smin            \min,  \min,  v7.8h
+.endif
+.ifnb \store
+        st1             {\store}[1],  [\dst], x1
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src
+        load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src
+        load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
+        load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
+        load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
+        load_add_store4     ,    ,    , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
+        load_add_store4     ,    ,    , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
+        load_add_store4     ,    ,    ,       , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       , v30.8h, v28.8h, v26.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       , v30.8h, v28.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v30.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+        mov             \src, \dst
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src
+        load_add_store4     ,    ,    , v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src
+        load_add_store4     ,    ,    , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
+        load_add_store4     ,    ,    ,       , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       , v22.8h, v20.8h, v18.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       , v22.8h, v20.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v22.d, \dst, \src
+.endm
+
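+// DC-only special case (taken when eob/w3 is 0): broadcast the single
+// coefficient, apply the 2896/4096 scaling for each pass (plus an extra
+// factor for 2:1 rectangular blocks), round, and branch to the
+// width-specific add/clamp loop below.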
+.macro idct_dc w, h, shift
+        cbnz            w3,  1f
+        movz            w16, #2896*8, lsl #16
+        ld1r            {v16.4s}, [x2]
+        dup             v0.2s,   w16
+        sqrdmulh        v20.4s,  v16.4s,  v0.s[0]
+        str             wzr, [x2]
+.if (\w == 2*\h) || (2*\w == \h)
+        sqrdmulh        v20.4s,  v20.4s,  v0.s[0]
+.endif
+.if \shift > 0
+        sqrshrn         v16.4h,  v20.4s,  #\shift
+        sqrshrn2        v16.8h,  v20.4s,  #\shift
+.else
+        sqxtn           v16.4h,  v20.4s
+        sqxtn2          v16.8h,  v20.4s
+.endif
+        sqrdmulh        v16.8h,  v16.8h,  v0.h[1]
+        srshr           v16.8h,  v16.8h,  #4
+        mov             w4,  #\h
+        b               idct_dc_w\w\()_neon
+1:
+.endm
+
+function idct_dc_w4_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.d}[0], [x0], x1
+        ld1             {v0.d}[1], [x0], x1
+        ld1             {v1.d}[0], [x0], x1
+        subs            w4,  w4,  #4
+        ld1             {v1.d}[1], [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #2
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        st1             {v0.d}[0], [x0], x1
+        smin            v1.8h,   v1.8h,   v31.8h
+        st1             {v0.d}[1], [x0], x1
+        st1             {v1.d}[0], [x0], x1
+        st1             {v1.d}[1], [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w8_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h}, [x0], x1
+        subs            w4,  w4,  #4
+        ld1             {v1.8h}, [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        ld1             {v2.8h}, [x0], x1
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        ld1             {v3.8h}, [x0], x1
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #2
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        st1             {v0.8h}, [x0], x1
+        smin            v2.8h,   v2.8h,   v31.8h
+        st1             {v1.8h}, [x0], x1
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h}, [x0], x1
+        st1             {v3.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w16_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h, v1.8h}, [x0], x1
+        subs            w4,  w4,  #2
+        ld1             {v2.8h, v3.8h}, [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #1
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        st1             {v0.8h, v1.8h}, [x0], x1
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h, v3.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w32_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
+        subs            w4,  w4,  #1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+function idct_dc_w64_neon
+        movi            v30.8h,  #0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+        sub             x1,  x1,  #64
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        subs            w4,  w4,  #1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sub             x0,  x0,  #64
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        sqadd           v4.8h,   v4.8h,   v16.8h
+        sqadd           v5.8h,   v5.8h,   v16.8h
+        sqadd           v6.8h,   v6.8h,   v16.8h
+        sqadd           v7.8h,   v7.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smax            v4.8h,   v4.8h,   v30.8h
+        smax            v5.8h,   v5.8h,   v30.8h
+        smax            v6.8h,   v6.8h,   v30.8h
+        smax            v7.8h,   v7.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        smin            v6.8h,   v6.8h,   v31.8h
+        smin            v7.8h,   v7.8h,   v31.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        b.gt            1b
+        ret
+endfunc
+
+.macro iwht4
+        add             v16.4s,  v16.4s,  v17.4s
+        sub             v21.4s,  v18.4s,  v19.4s
+        sub             v20.4s,  v16.4s,  v21.4s
+        sshr            v20.4s,  v20.4s,  #1
+        sub             v18.4s,  v20.4s,  v17.4s
+        sub             v17.4s,  v20.4s,  v19.4s
+        add             v19.4s,  v21.4s,  v18.4s
+        sub             v16.4s,  v16.4s,  v17.4s
+.endm
+
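+// 4-point inverse DCT on 32 bit lanes; expects the first row of
+// idct_coeffs in v0 (2896, 1567 and 3784 are used).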
+.macro idct_4 r0, r1, r2, r3
+        mul_mla         v6,  \r1, \r3, v0.s[3], v0.s[2]
+        mul_mls         v4,  \r1, \r3, v0.s[2], v0.s[3]
+        mul_mla         v2,  \r0, \r2, v0.s[0], v0.s[0]
+        mul_mls         v3,  \r0, \r2, v0.s[0], v0.s[0]
+        srshr           v6.4s,  v6.4s,  #12
+        srshr           v7.4s,  v4.4s,  #12
+        srshr           v2.4s,  v2.4s,  #12
+        srshr           v3.4s,  v3.4s,  #12
+        sqadd           \r0\().4s,  v2.4s,   v6.4s
+        sqsub           \r3\().4s,  v2.4s,   v6.4s
+        sqadd           \r1\().4s,  v3.4s,   v7.4s
+        sqsub           \r2\().4s,  v3.4s,   v7.4s
+.endm
+
+function inv_dct_4s_x4_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s}, [x16]
+        idct_4          v16, v17, v18, v19
+        ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+        movrel          x16, iadst4_coeffs
+        ld1             {v0.4s}, [x16]
+
+        sub             v3.4s,   v16.4s,  v18.4s
+        mul             v4.4s,   v16.4s,  v0.s[0]
+        mla             v4.4s,   v18.4s,  v0.s[1]
+        mla             v4.4s,   v19.4s,  v0.s[2]
+        mul             v7.4s,   v17.4s,  v0.s[3]
+        add             v3.4s,   v3.4s,   v19.4s
+        mul             v5.4s,   v16.4s,  v0.s[2]
+        mls             v5.4s,   v18.4s,  v0.s[0]
+        mls             v5.4s,   v19.4s,  v0.s[1]
+
+        add             \o3\().4s, v4.4s,     v5.4s
+        mul             \o2\().4s, v3.4s,     v0.s[3]
+        add             \o0\().4s, v4.4s,     v7.4s
+        add             \o1\().4s, v5.4s,     v7.4s
+        sub             \o3\().4s, \o3\().4s, v7.4s
+
+        srshr           \o0\().4s, \o0\().4s, #12
+        srshr           \o2\().4s, \o2\().4s, #12
+        srshr           \o1\().4s, \o1\().4s, #12
+        srshr           \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon
+        iadst_4x4       v16, v17, v18, v19
+        ret
+endfunc
+
+function inv_flipadst_4s_x4_neon
+        iadst_4x4       v19, v18, v17, v16
+        ret
+endfunc
+
+function inv_identity_4s_x4_neon
+        movz            w16, #(5793-4096)*8, lsl #16
+        dup             v0.2s,   w16
+        sqrdmulh        v4.4s,   v16.4s,  v0.s[0]
+        sqrdmulh        v5.4s,   v17.4s,  v0.s[0]
+        sqrdmulh        v6.4s,   v18.4s,  v0.s[0]
+        sqrdmulh        v7.4s,   v19.4s,  v0.s[0]
+        sqadd           v16.4s,  v16.4s,  v4.4s
+        sqadd           v17.4s,  v17.4s,  v5.4s
+        sqadd           v18.4s,  v18.4s,  v6.4s
+        sqadd           v19.4s,  v19.4s,  v7.4s
+        ret
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1
+        mov             x15, x30
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+        st1             {v30.4s, v31.4s}, [x2], #32
+
+        sshr            v16.4s,  v16.4s,  #2
+        sshr            v17.4s,  v17.4s,  #2
+        sshr            v18.4s,  v18.4s,  #2
+        sshr            v19.4s,  v19.4s,  #2
+
+        iwht4
+
+        st1             {v30.4s, v31.4s}, [x2], #32
+        transpose_4x4s  v16, v17, v18, v19, v20, v21, v22, v23
+
+        iwht4
+
+        ld1             {v0.d}[0], [x0], x1
+        sqxtn           v16.4h,  v16.4s
+        ld1             {v0.d}[1], [x0], x1
+        sqxtn2          v16.8h,  v17.4s
+        ld1             {v1.d}[0], [x0], x1
+        sqxtn           v18.4h,  v18.4s
+        ld1             {v1.d}[1], [x0], x1
+        sqxtn2          v18.8h,  v19.4s
+
+        b               L(itx_4x4_end)
+endfunc
+
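+// Generic 4x4 frontend: the first pass (via x4) works on 32 bit
+// coefficients, the result is narrowed to 16 bit and transposed, the
+// second pass (via x5) runs one of the shared 8 bpc 4h transforms, and
+// the result is added to the destination with clamping to 10 bit.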
+function inv_txfm_add_4x4_neon
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+        st1             {v30.4s, v31.4s}, [x2], #32
+
+        blr             x4
+
+        st1             {v30.4s, v31.4s}, [x2], #32
+        sqxtn           v16.4h,  v16.4s
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        blr             x5
+
+        ld1             {v0.d}[0], [x0], x1
+        ld1             {v0.d}[1], [x0], x1
+        ins             v16.d[1], v17.d[0]
+        ins             v18.d[1], v19.d[0]
+        ld1             {v1.d}[0], [x0], x1
+        ld1             {v1.d}[1], [x0], x1
+        srshr           v16.8h,  v16.8h,  #4
+        srshr           v18.8h,  v18.8h,  #4
+
+L(itx_4x4_end):
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+        sub             x0,  x0,  x1, lsl #2
+        sqadd           v16.8h,  v16.8h,  v0.8h
+        sqadd           v18.8h,  v18.8h,  v1.8h
+        smax            v16.8h,  v16.8h,  v30.8h
+        smax            v18.8h,  v18.8h,  v30.8h
+        smin            v16.8h,  v16.8h,  v31.8h
+        st1             {v16.d}[0], [x0], x1
+        smin            v18.8h,  v18.8h,  v31.8h
+        st1             {v16.d}[1], [x0], x1
+        st1             {v18.d}[0], [x0], x1
+        st1             {v18.d}[1], [x0], x1
+
+        br              x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        cbnz            w3,  1f
+        movz            w16, #2896*8, lsl #16
+        ld1r            {v16.4s}, [x2]
+        dup             v4.2s,   w16
+        str             wzr, [x2]
+        sqrdmulh        v16.4s,  v16.4s,  v4.s[0]
+        ld1             {v0.d}[0], [x0], x1
+        sqxtn           v20.4h,  v16.4s
+        sqxtn2          v20.8h,  v16.4s
+        ld1             {v0.d}[1], [x0], x1
+        sqrdmulh        v20.8h,  v20.8h,  v4.h[1]
+        ld1             {v1.d}[0], [x0], x1
+        srshr           v16.8h,  v20.8h,  #4
+        ld1             {v1.d}[1], [x0], x1
+        srshr           v18.8h,  v20.8h,  #4
+        movi            v30.8h,  #0
+        b               L(itx_4x4_end)
+1:
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x4_neon
+        movrel          x5,  X(inv_\txfm2\()_4h_x4_neon)
+        b               inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
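+// 8-point inverse DCT on 32 bit lanes: idct_4 handles the even inputs,
+// the remaining butterflies handle the odd inputs (t4a-t7a); expects the
+// first two rows of idct_coeffs in v0 and v1.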
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
+        idct_4          \r0, \r2, \r4, \r6
+
+        mul_mls         v2,  \r1, \r7, v1.s[0], v1.s[1]  // -> t4a
+        mul_mla         v4,  \r1, \r7, v1.s[1], v1.s[0]  // -> t7a
+        mul_mls         v6,  \r5, \r3, v1.s[2], v1.s[3]  // -> t5a
+        mul_mla         v7,  \r5, \r3, v1.s[3], v1.s[2]  // -> t6a
+        srshr           \r1\().4s, v2.4s,  #12           // t4a
+        srshr           \r7\().4s, v4.4s,  #12           // t7a
+        srshr           \r3\().4s, v6.4s,  #12           // t5a
+        srshr           \r5\().4s, v7.4s,  #12           // t6a
+
+        sqadd           v2.4s,     \r1\().4s,  \r3\().4s // t4
+        sqsub           \r1\().4s, \r1\().4s,  \r3\().4s // t5a
+        sqadd           v3.4s,     \r7\().4s,  \r5\().4s // t7
+        sqsub           \r3\().4s, \r7\().4s,  \r5\().4s // t6a
+
+        mul_mls         v4,  \r3, \r1, v0.s[0], v0.s[0]  // -> t5
+        mul_mla         v6,  \r3, \r1, v0.s[0], v0.s[0]  // -> t6
+        srshr           v4.4s,  v4.4s,  #12              // t5
+        srshr           v5.4s,  v6.4s,  #12              // t6
+
+        sqsub           \r7\().4s,  \r0\().4s,  v3.4s    // out7
+        sqadd           \r0\().4s,  \r0\().4s,  v3.4s    // out0
+        sqadd           \r1\().4s,  \r2\().4s,  v5.4s    // out1
+        sqsub           v6.4s,      \r2\().4s,  v5.4s    // out6
+        sqadd           \r2\().4s,  \r4\().4s,  v4.4s    // out2
+        sqsub           \r5\().4s,  \r4\().4s,  v4.4s    // out5
+        sqadd           \r3\().4s,  \r6\().4s,  v2.4s    // out3
+        sqsub           \r4\().4s,  \r6\().4s,  v2.4s    // out4
+        mov             \r6\().16b, v6.16b               // out6
+.endm
+
+function inv_dct_4s_x8_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s, v1.4s}, [x16]
+        idct_8          v16, v17, v18, v19, v20, v21, v22, v23
+        ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+        movrel          x16, iadst8_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        mul_mla         v2,  v23, v16, v0.s[0], v0.s[1]
+        mul_mls         v4,  v23, v16, v0.s[1], v0.s[0]
+        mul_mla         v6,  v21, v18, v0.s[2], v0.s[3]
+        srshr           v16.4s, v2.4s,  #12  // t0a
+        srshr           v23.4s, v4.4s,  #12  // t1a
+        mul_mls         v2,  v21, v18, v0.s[3], v0.s[2]
+        mul_mla         v4,  v19, v20, v1.s[0], v1.s[1]
+        srshr           v18.4s, v6.4s,  #12  // t2a
+        srshr           v21.4s, v2.4s,  #12  // t3a
+        mul_mls         v6,  v19, v20, v1.s[1], v1.s[0]
+        mul_mla         v2,  v17, v22, v1.s[2], v1.s[3]
+        srshr           v20.4s, v4.4s,  #12  // t4a
+        srshr           v19.4s, v6.4s,  #12  // t5a
+        mul_mls         v4,  v17, v22, v1.s[3], v1.s[2]
+        srshr           v22.4s, v2.4s,  #12  // t6a
+        srshr           v17.4s, v4.4s,  #12  // t7a
+
+        ld1             {v0.4s}, [x16]
+
+        sqadd           v2.4s,   v16.4s,  v20.4s // t0
+        sqsub           v3.4s,   v16.4s,  v20.4s // t4
+        sqadd           v4.4s,   v23.4s,  v19.4s // t1
+        sqsub           v5.4s,   v23.4s,  v19.4s // t5
+        sqadd           v6.4s,   v18.4s,  v22.4s // t2
+        sqsub           v7.4s,   v18.4s,  v22.4s // t6
+        sqadd           v18.4s,  v21.4s,  v17.4s // t3
+        sqsub           v19.4s,  v21.4s,  v17.4s // t7
+
+        mul_mla         v16, v3,  v5,  v0.s[3], v0.s[2]
+        mul_mls         v20, v3,  v5,  v0.s[2], v0.s[3]
+        mul_mls         v22, v19, v7,  v0.s[3], v0.s[2]
+
+        srshr           v3.4s,  v16.4s, #12  // t4a
+        srshr           v5.4s,  v20.4s, #12  // t5a
+
+        mul_mla         v16, v19, v7,  v0.s[2], v0.s[3]
+
+        srshr           v7.4s,  v22.4s, #12  // t6a
+        srshr           v19.4s, v16.4s, #12  // t7a
+
+        sqadd           \o0\().4s, v2.4s, v6.4s  // out0
+        sqsub           v2.4s,     v2.4s, v6.4s  // t2
+        sqadd           \o7\().4s, v4.4s, v18.4s // out7
+        sqsub           v4.4s,     v4.4s, v18.4s // t3
+        sqneg           \o7\().4s, \o7\().4s     // out7
+
+        sqadd           \o1\().4s, v3.4s, v7.4s  // out1
+        sqsub           v3.4s,     v3.4s, v7.4s  // t6
+        sqadd           \o6\().4s, v5.4s, v19.4s // out6
+        sqsub           v5.4s,     v5.4s, v19.4s // t7
+        sqneg           \o1\().4s, \o1\().4s     // out1
+
+        mul_mla         v18, v2,  v4,  v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+        mul_mls         v6,  v2,  v4,  v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+        mul_mls         v20, v3,  v5,  v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+        srshr           v2.4s,  v18.4s, #12 // out3
+        mul_mla         v18, v3,  v5,  v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+        srshr           v3.4s,  v20.4s, #12 // out5
+        srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+        srshr           \o4\().4s, v6.4s,  #12 // out4 (v20 or v19)
+
+        sqneg           \o3\().4s, v2.4s     // out3
+        sqneg           \o5\().4s, v3.4s     // out5
+.endm
+
+function inv_adst_4s_x8_neon
+        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
+        ret
+endfunc
+
+function inv_flipadst_4s_x8_neon
+        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
+        ret
+endfunc
+
+function inv_identity_4s_x8_neon
+        sqshl           v16.4s,  v16.4s,  #1
+        sqshl           v17.4s,  v17.4s,  #1
+        sqshl           v18.4s,  v18.4s,  #1
+        sqshl           v19.4s,  v19.4s,  #1
+        sqshl           v20.4s,  v20.4s,  #1
+        sqshl           v21.4s,  v21.4s,  #1
+        sqshl           v22.4s,  v22.4s,  #1
+        sqshl           v23.4s,  v23.4s,  #1
+        ret
+endfunc
+
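+// Generic 8x8 frontend: w13 holds an eob threshold that decides whether
+// the second half of the coefficients needs a first pass at all (it is
+// zeroed otherwise); each half is transformed on .4s vectors, narrowed
+// with rounding to .8h, transposed, and fed to the shared 8 bpc
+// second-pass transform in x5.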
+function inv_txfm_add_8x8_neon
+        movi            v31.4s,  #0
+
+        cmp             w3,  w13
+        mov             x11, #32
+        b.lt            1f
+
+        add             x6,  x2,  #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},     [x6]
+        st1             {v31.4s}, [x6], x11
+.endr
+
+        blr             x4
+
+        sqrshrn         v24.4h,  v16.4s,  #1
+        sqrshrn         v25.4h,  v17.4s,  #1
+        sqrshrn         v26.4h,  v18.4s,  #1
+        sqrshrn         v27.4h,  v19.4s,  #1
+        sqrshrn2        v24.8h,  v20.4s,  #1
+        sqrshrn2        v25.8h,  v21.4s,  #1
+        sqrshrn2        v26.8h,  v22.4s,  #1
+        sqrshrn2        v27.8h,  v23.4s,  #1
+
+        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+        movi            \i,  #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},     [x2]
+        st1             {v31.4s}, [x2], x11
+.endr
+
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+
+        transpose_4x8h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        mov             v20.16b, v24.16b
+        mov             v21.16b, v25.16b
+        mov             v22.16b, v26.16b
+        mov             v23.16b, v27.16b
+
+        blr             x5
+
+        load_add_store_8x8 x0, x7
+        br              x15
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         8,   8,   1
+.endif
+        movrel          x5,  X(inv_\txfm2\()_8h_x8_neon)
+        mov             w13, #\eob_half
+        adr             x4,  inv_\txfm1\()_4s_x8_neon
+        b               inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
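+// Rectangular 8x4/4x8 frontends: the loaded coefficients are prescaled
+// by 2896/4096 (~1/sqrt(2)) before the first pass, matching the extra
+// scaling used for 2:1 rectangular transforms.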
+function inv_txfm_add_8x4_neon
+        movi            v28.4s,  #0
+        movi            v29.4s,  #0
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        blr             x4
+
+        sqxtn           v16.4h,  v16.4s
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        sqxtn           v20.4h,  v20.4s
+        sqxtn           v21.4h,  v21.4s
+        sqxtn           v22.4h,  v22.4s
+        sqxtn           v23.4h,  v23.4s
+
+        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+        ins             v16.d[1], v20.d[0]
+        ins             v17.d[1], v21.d[0]
+        ins             v18.d[1], v22.d[0]
+        ins             v19.d[1], v23.d[0]
+
+        blr             x5
+
+        load_add_store_8x4 x0, x7
+        br              x15
+endfunc
+
+function inv_txfm_add_4x8_neon
+        movz            w16, #2896*8, lsl #16
+        movi            v31.4s,  #0
+        dup             v30.2s,  w16
+
+        cmp             w3,  w13
+        mov             x11, #32
+        b.lt            1f
+
+        add             x6,  x2,  #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},     [x6]
+        st1             {v31.4s}, [x6], x11
+.endr
+        scale_input     .4s, v30.s[0], v16, v17, v18, v19
+        blr             x4
+        sqxtn           v20.4h,  v16.4s
+        sqxtn           v21.4h,  v17.4s
+        sqxtn           v22.4h,  v18.4s
+        sqxtn           v23.4h,  v19.4s
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+        b               2f
+
+1:
+.irp i, v20, v21, v22, v23
+        movi            \i\().4h, #0
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},     [x2]
+        st1             {v31.4s}, [x2], x11
+.endr
+        scale_input     .4s, v30.s[0], v16, v17, v18, v19
+        blr             x4
+        sqxtn           v16.4h,  v16.4s
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+
+        blr             x5
+
+        load_add_store_4x8 x0, x7
+        br              x15
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+        mov             x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  0
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+        mov             w13, #\eob_half
+.endif
+        movrel          x5,  X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
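+// 16 point inverse DCT over the 16 inputs in v16-v31 (.4s), i.e. four
+// independent 16 point transforms in parallel, one per 32 bit lane. The Q12
+// fixed point multiplies are rounded back down with srshr #12.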
+function inv_dct_4s_x16_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        idct_8          v16, v18, v20, v22, v24, v26, v28, v30
+
+        ld1             {v0.4s, v1.4s}, [x16]
+        sub             x16, x16, #32
+
+        mul_mls         v2,  v17, v31, v0.s[0], v0.s[1] // -> t8a
+        mul_mla         v4,  v17, v31, v0.s[1], v0.s[0] // -> t15a
+        mul_mls         v6,  v25, v23, v0.s[2], v0.s[3] // -> t9a
+        srshr           v17.4s, v2.4s,  #12             // t8a
+        srshr           v31.4s, v4.4s,  #12             // t15a
+        mul_mla         v2,  v25, v23, v0.s[3], v0.s[2] // -> t14a
+        mul_mls         v4,  v21, v27, v1.s[0], v1.s[1] // -> t10a
+        srshr           v23.4s, v6.4s,  #12             // t9a
+        srshr           v25.4s, v2.4s,  #12             // t14a
+        mul_mla         v6,  v21, v27, v1.s[1], v1.s[0] // -> t13a
+        mul_mls         v2,  v29, v19, v1.s[2], v1.s[3] // -> t11a
+        srshr           v21.4s, v4.4s,  #12             // t10a
+        srshr           v27.4s, v6.4s,  #12             // t13a
+        mul_mla         v4,  v29, v19, v1.s[3], v1.s[2] // -> t12a
+        srshr           v19.4s, v2.4s,  #12             // t11a
+        srshr           v29.4s, v4.4s,  #12             // t12a
+
+        ld1             {v0.4s}, [x16]
+
+        sqsub           v2.4s,   v17.4s,  v23.4s  // t9
+        sqadd           v17.4s,  v17.4s,  v23.4s  // t8
+        sqsub           v3.4s,   v31.4s,  v25.4s  // t14
+        sqadd           v31.4s,  v31.4s,  v25.4s  // t15
+        sqsub           v23.4s,  v19.4s,  v21.4s  // t10
+        sqadd           v19.4s,  v19.4s,  v21.4s  // t11
+        sqadd           v25.4s,  v29.4s,  v27.4s  // t12
+        sqsub           v29.4s,  v29.4s,  v27.4s  // t13
+
+        mul_mls         v4,  v3,  v2,  v0.s[2], v0.s[3] // -> t9a
+        mul_mla         v6,  v3,  v2,  v0.s[3], v0.s[2] // -> t14a
+        srshr           v21.4s, v4.4s,  #12             // t9a
+        srshr           v27.4s, v6.4s,  #12             // t14a
+
+        mul_mls         v4,  v29, v23, v0.s[2], v0.s[3] // -> t13a
+        mul_mla         v6,  v29, v23, v0.s[3], v0.s[2] // -> t10a
+        srshr           v29.4s, v4.4s,  #12             // t13a
+        neg             v6.4s,   v6.4s
+        srshr           v23.4s, v6.4s,  #12             // t10a
+
+        sqsub           v2.4s,   v17.4s,  v19.4s  // t11a
+        sqadd           v17.4s,  v17.4s,  v19.4s  // t8a
+        sqsub           v3.4s,   v31.4s,  v25.4s  // t12a
+        sqadd           v31.4s,  v31.4s,  v25.4s  // t15a
+        sqadd           v19.4s,  v21.4s,  v23.4s  // t9
+        sqsub           v21.4s,  v21.4s,  v23.4s  // t10
+        sqsub           v25.4s,  v27.4s,  v29.4s  // t13
+        sqadd           v27.4s,  v27.4s,  v29.4s  // t14
+
+        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t11
+        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t12
+        mul_mls         v2,  v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+        srshr           v4.4s,  v4.4s,  #12   // t11
+        srshr           v5.4s,  v6.4s,  #12   // t12
+        mul_mla         v6,  v25, v21, v0.s[0], v0.s[0] // -> t10a
+        srshr           v2.4s,  v2.4s,  #12   // t10a
+        srshr           v3.4s,  v6.4s,  #12   // t13a
+
+        sqadd           v6.4s,   v16.4s,  v31.4s  // out0
+        sqsub           v31.4s,  v16.4s,  v31.4s  // out15
+        mov             v16.16b, v6.16b
+        sqadd           v23.4s,  v30.4s,  v17.4s  // out7
+        sqsub           v7.4s,   v30.4s,  v17.4s  // out8
+        sqadd           v17.4s,  v18.4s,  v27.4s  // out1
+        sqsub           v30.4s,  v18.4s,  v27.4s  // out14
+        sqadd           v18.4s,  v20.4s,  v3.4s   // out2
+        sqsub           v29.4s,  v20.4s,  v3.4s   // out13
+        sqadd           v3.4s,   v28.4s,  v19.4s  // out6
+        sqsub           v25.4s,  v28.4s,  v19.4s  // out9
+        sqadd           v19.4s,  v22.4s,  v5.4s   // out3
+        sqsub           v28.4s,  v22.4s,  v5.4s   // out12
+        sqadd           v20.4s,  v24.4s,  v4.4s   // out4
+        sqsub           v27.4s,  v24.4s,  v4.4s   // out11
+        sqadd           v21.4s,  v26.4s,  v2.4s   // out5
+        sqsub           v26.4s,  v26.4s,  v2.4s   // out10
+        mov             v24.16b, v7.16b
+        mov             v22.16b, v3.16b
+
+        ret
+endfunc
+
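+// 16 point inverse ADST; the output registers are macro parameters so that
+// the same body gives both inv_adst_4s_x16_neon and inv_flipadst_4s_x16_neon
+// (the flipadst variant simply takes the outputs in reversed order).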
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+        movrel          x16, iadst16_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        mul_mla         v2,  v31, v16, v0.s[0], v0.s[1] // -> t0
+        mul_mls         v4,  v31, v16, v0.s[1], v0.s[0] // -> t1
+        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t2
+        srshr           v16.4s, v2.4s,  #12             // t0
+        srshr           v31.4s, v4.4s,  #12             // t1
+        mul_mls         v2,  v29, v18, v0.s[3], v0.s[2] // -> t3
+        mul_mla         v4,  v27, v20, v1.s[0], v1.s[1] // -> t4
+        srshr           v18.4s, v6.4s,  #12             // t2
+        srshr           v29.4s, v2.4s,  #12             // t3
+        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t5
+        mul_mla         v2,  v25, v22, v1.s[2], v1.s[3] // -> t6
+        srshr           v20.4s, v4.4s,  #12             // t4
+        srshr           v27.4s, v6.4s,  #12             // t5
+        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t7
+        ld1             {v0.4s, v1.4s}, [x16]
+        movrel          x16, idct_coeffs
+        mul_mla         v6,  v23, v24, v0.s[0], v0.s[1] // -> t8
+        srshr           v22.4s, v2.4s,  #12             // t6
+        srshr           v25.4s, v4.4s,  #12             // t7
+        mul_mls         v2,  v23, v24, v0.s[1], v0.s[0] // -> t9
+        mul_mla         v4,  v21, v26, v0.s[2], v0.s[3] // -> t10
+        srshr           v23.4s, v6.4s,  #12             // t8
+        srshr           v24.4s, v2.4s,  #12             // t9
+        mul_mls         v6,  v21, v26, v0.s[3], v0.s[2] // -> t11
+        mul_mla         v2,  v19, v28, v1.s[0], v1.s[1] // -> t12
+        srshr           v21.4s, v4.4s,  #12             // t10
+        srshr           v26.4s, v6.4s,  #12             // t11
+        mul_mls         v4,  v19, v28, v1.s[1], v1.s[0] // -> t13
+        mul_mla         v6,  v17, v30, v1.s[2], v1.s[3] // -> t14
+        srshr           v19.4s, v2.4s,  #12             // t12
+        srshr           v28.4s, v4.4s,  #12             // t13
+        mul_mls         v2,  v17, v30, v1.s[3], v1.s[2] // -> t15
+        srshr           v17.4s, v6.4s,  #12             // t14
+        srshr           v30.4s, v2.4s,  #12             // t15
+
+        ld1             {v0.4s, v1.4s}, [x16]
+
+        sqsub           v2.4s,   v16.4s,  v23.4s // t8a
+        sqadd           v16.4s,  v16.4s,  v23.4s // t0a
+        sqsub           v3.4s,   v31.4s,  v24.4s // t9a
+        sqadd           v31.4s,  v31.4s,  v24.4s // t1a
+        sqadd           v23.4s,  v18.4s,  v21.4s // t2a
+        sqsub           v18.4s,  v18.4s,  v21.4s // t10a
+        sqadd           v24.4s,  v29.4s,  v26.4s // t3a
+        sqsub           v29.4s,  v29.4s,  v26.4s // t11a
+        sqadd           v21.4s,  v20.4s,  v19.4s // t4a
+        sqsub           v20.4s,  v20.4s,  v19.4s // t12a
+        sqadd           v26.4s,  v27.4s,  v28.4s // t5a
+        sqsub           v27.4s,  v27.4s,  v28.4s // t13a
+        sqadd           v19.4s,  v22.4s,  v17.4s // t6a
+        sqsub           v22.4s,  v22.4s,  v17.4s // t14a
+        sqadd           v28.4s,  v25.4s,  v30.4s // t7a
+        sqsub           v25.4s,  v25.4s,  v30.4s // t15a
+
+        mul_mla         v4,  v2,  v3,  v1.s[1], v1.s[0] // -> t8
+        mul_mls         v6,  v2,  v3,  v1.s[0], v1.s[1] // -> t9
+        mul_mla         v2,  v18, v29, v1.s[3], v1.s[2] // -> t10
+        srshr           v17.4s, v4.4s,  #12             // t8
+        srshr           v30.4s, v6.4s,  #12             // t9
+        mul_mls         v4,  v18, v29, v1.s[2], v1.s[3] // -> t11
+        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t12
+        srshr           v18.4s, v2.4s,  #12             // t10
+        srshr           v29.4s, v4.4s,  #12             // t11
+        mul_mla         v2,  v27, v20, v1.s[0], v1.s[1] // -> t13
+        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t14
+        srshr           v27.4s, v6.4s,  #12             // t12
+        srshr           v20.4s, v2.4s,  #12             // t13
+        mul_mla         v6,  v25, v22, v1.s[2], v1.s[3] // -> t15
+        srshr           v25.4s, v4.4s,  #12             // t14
+        srshr           v22.4s, v6.4s,  #12             // t15
+
+        sqsub           v2.4s,   v16.4s,  v21.4s // t4
+        sqadd           v16.4s,  v16.4s,  v21.4s // t0
+        sqsub           v3.4s,   v31.4s,  v26.4s // t5
+        sqadd           v31.4s,  v31.4s,  v26.4s // t1
+        sqadd           v21.4s,  v23.4s,  v19.4s // t2
+        sqsub           v23.4s,  v23.4s,  v19.4s // t6
+        sqadd           v26.4s,  v24.4s,  v28.4s // t3
+        sqsub           v24.4s,  v24.4s,  v28.4s // t7
+        sqadd           v19.4s,  v17.4s,  v27.4s // t8a
+        sqsub           v17.4s,  v17.4s,  v27.4s // t12a
+        sqadd           v28.4s,  v30.4s,  v20.4s // t9a
+        sqsub           v30.4s,  v30.4s,  v20.4s // t13a
+        sqadd           v27.4s,  v18.4s,  v25.4s // t10a
+        sqsub           v18.4s,  v18.4s,  v25.4s // t14a
+        sqadd           v20.4s,  v29.4s,  v22.4s // t11a
+        sqsub           v29.4s,  v29.4s,  v22.4s // t15a
+
+        mul_mla         v4,  v2,  v3,  v0.s[3], v0.s[2] // -> t4a
+        mul_mls         v6,  v2,  v3,  v0.s[2], v0.s[3] // -> t5a
+        mul_mls         v2,  v24, v23, v0.s[3], v0.s[2] // -> t6a
+        srshr           v22.4s, v4.4s,  #12             // t4a
+        srshr           v25.4s, v6.4s,  #12             // t5a
+        mul_mla         v4,  v24, v23, v0.s[2], v0.s[3] // -> t7a
+        mul_mla         v6,  v17, v30, v0.s[3], v0.s[2] // -> t12
+        srshr           v24.4s, v2.4s,  #12             // t6a
+        srshr           v23.4s, v4.4s,  #12             // t7a
+        mul_mls         v2,  v17, v30, v0.s[2], v0.s[3] // -> t13
+        mul_mls         v4,  v29, v18, v0.s[3], v0.s[2] // -> t14
+        srshr           v17.4s, v6.4s,  #12             // t12
+        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t15
+        srshr           v29.4s, v2.4s,  #12             // t13
+        srshr           v30.4s, v4.4s,  #12             // t14
+        srshr           v18.4s, v6.4s,  #12             // t15
+
+        sqsub           v2.4s,   v16.4s,  v21.4s // t2a
+.ifc \o0, v16
+        sqadd           \o0\().4s,  v16.4s,  v21.4s // out0
+        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
+        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
+.else
+        sqadd           v4.4s,      v16.4s,  v21.4s // out0
+        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
+        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
+        mov             \o0\().16b, v4.16b
+.endif
+        sqneg           \o15\().4s, \o15\().4s      // out15
+
+        sqsub           v3.4s,      v29.4s,  v18.4s // t15a
+        sqadd           \o13\().4s, v29.4s,  v18.4s // out13
+        sqadd           \o2\().4s,  v17.4s,  v30.4s // out2
+        sqsub           v26.4s,     v17.4s,  v30.4s // t14a
+        sqneg           \o13\().4s, \o13\().4s      // out13
+
+        sqadd           \o1\().4s,  v19.4s,  v27.4s // out1
+        sqsub           v27.4s,     v19.4s,  v27.4s // t10
+        sqadd           \o14\().4s, v28.4s,  v20.4s // out14
+        sqsub           v20.4s,     v28.4s,  v20.4s // t11
+        sqneg           \o1\().4s,  \o1\().4s       // out1
+
+        sqadd           \o3\().4s,  v22.4s,  v24.4s // out3
+        sqsub           v22.4s,     v22.4s,  v24.4s // t6
+        sqadd           \o12\().4s, v25.4s,  v23.4s // out12
+        sqsub           v23.4s,     v25.4s,  v23.4s // t7
+        sqneg           \o3\().4s,  \o3\().4s       // out3
+
+        mul_mls         v24, v2,  v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+        mul_mla         v4,  v2,  v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+        mul_mla         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+        srshr           v24.4s, v24.4s, #12             // out8
+        srshr           v4.4s,  v4.4s,  #12             // out7
+        srshr           v5.4s,  v6.4s,  #12             // out5
+        mul_mls         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+        mul_mla         v2,  v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+        srshr           v26.4s, v6.4s,  #12             // out10
+
+        mul_mls         v6,  v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+        mul_mla         v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+        mul_mls         v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+        srshr           \o4\().4s,   v2.4s,  #12        // out4
+        srshr           v6.4s,       v6.4s,  #12        // out11
+        srshr           v7.4s,       v21.4s, #12        // out9
+        srshr           \o6\().4s,   v22.4s, #12        // out6
+
+.ifc \o8, v23
+        mov             \o8\().16b,  v24.16b
+        mov             \o10\().16b, v26.16b
+.endif
+
+        sqneg           \o7\().4s,   v4.4s // out7
+        sqneg           \o5\().4s,   v5.4s // out5
+        sqneg           \o11\().4s,  v6.4s // out11
+        sqneg           \o9\().4s,   v7.4s // out9
+.endm
+
+function inv_adst_4s_x16_neon
+        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        ret
+endfunc
+
+function inv_flipadst_4s_x16_neon
+        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+        ret
+endfunc
+
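+// 16 point identity "transform": scales each coefficient by 2*5793/4096
+// (~2*sqrt(2)). As sqrdmulh can only apply factors below 1, this is done as
+// 2*x plus x scaled by 2*(5793-4096)/4096.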
+function inv_identity_4s_x16_neon
+        movz            w16, #2*(5793-4096)*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        sqrdmulh        v2.4s,      v\i\().4s,  v0.s[0]
+        sqadd           v\i\().4s,  v\i\().4s,  v\i\().4s
+        sqadd           v\i\().4s,  v\i\().4s,  v2.4s
+.endr
+        ret
+endfunc
+
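+// Identity16 scaling of 16 .4s registers with the multiplier given in \c;
+// the _shift1 variant halves the scaled term (srshr #1), giving the same
+// result as identity_4x16 followed by a right shift by one.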
+.macro identity_4x16_shift1 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        sqrdmulh        v3.4s,   \i,      \c
+        srshr           v3.4s,   v3.4s,   #1
+        sqadd           \i,      \i,      v3.4s
+.endr
+.endm
+
+.macro identity_4x16 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        sqrdmulh        v3.4s,   \i,      \c
+        sqadd           \i,      \i,      \i
+        sqadd           \i,      \i,      v3.4s
+.endr
+.endm
+
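+// First pass helper: loads a 16x4 strip of 32 bit coefficients from x7
+// (stride x8), zeroing the buffer behind it, optionally prescales by
+// 2896/4096 (~1/sqrt(2)) for the rectangular sizes, runs the transform in
+// x4, then narrows to 16 bit with a rounding shift and stores the result
+// transposed to x6.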
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+        mov             x14, x30
+        movi            v7.4s,  #0
+.if \scale
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]
+        st1             {v7.4s}, [x7], x8
+.endr
+.if \scale
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        blr             x4
+        sqrshrn         v16.4h,  v16.4s,  #\shift
+        sqrshrn         v17.4h,  v17.4s,  #\shift
+        sqrshrn         v18.4h,  v18.4s,  #\shift
+        sqrshrn         v19.4h,  v19.4s,  #\shift
+        sqrshrn2        v16.8h,  v20.4s,  #\shift
+        sqrshrn2        v17.8h,  v21.4s,  #\shift
+        sqrshrn2        v18.8h,  v22.4s,  #\shift
+        sqrshrn2        v19.8h,  v23.4s,  #\shift
+        sqrshrn         v20.4h,  v24.4s,  #\shift
+        sqrshrn         v21.4h,  v25.4s,  #\shift
+        sqrshrn         v22.4h,  v26.4s,  #\shift
+        sqrshrn         v23.4h,  v27.4s,  #\shift
+        sqrshrn2        v20.8h,  v28.4s,  #\shift
+        sqrshrn2        v21.8h,  v29.4s,  #\shift
+        sqrshrn2        v22.8h,  v30.4s,  #\shift
+        sqrshrn2        v23.8h,  v31.4s,  #\shift
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+        st1             {\i}, [x6], #16
+.endr
+
+        br              x14
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
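+// Second pass helper: loads 16 rows of 8 16 bit coefficients from x7
+// (stride x8), runs the vertical transform in x5 and adds the result to the
+// destination at x6.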
+function inv_txfm_add_vert_8x16_neon
+        mov             x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        blr             x5
+        load_add_store_8x16 x6, x7
+        br              x14
+endfunc
+
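+// 16x16: the first pass output is staged as 16 bit in a 512 byte buffer on
+// the stack; the eob in w3 is compared against the thresholds loaded from
+// x13 so that 16x4 strips beyond the last nonzero coefficient are filled
+// with zeros instead of being transformed.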
+function inv_txfm_add_16x16_neon
+        mov             x15, x30
+        sub             sp,  sp,  #512
+        ldrh            w12, [x13], #2
+.irp i, 0, 4, 8, 12
+        add             x6,  sp,  #(\i*16*2)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 12
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #16*4
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+3:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
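+// eob thresholds deciding how many 16x4 strips of the first pass actually
+// need to be computed.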
+const eob_16x16
+        .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+        .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         16,  16,  2
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x16_neon
+        movrel          x5,  X(inv_\txfm2\()_8h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_16x16
+.else
+        movrel          x13, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_16x16_identity
+.else
+        movrel          x13, eob_16x16
+.endif
+.endif
+        b               inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
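+// 16x4: all 64 coefficients fit in v16-v31, so the 16 point first pass is a
+// single call; the result is narrowed to 16 bit and the 4 point second pass
+// is run twice, 8 columns at a time.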
+function inv_txfm_add_16x4_neon
+        mov             x15, x30
+        movi            v4.4s,  #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], #16
+.endr
+
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5
+        mov             x6,  x0
+        load_add_store_8x4 x6, x7
+
+        sqrshrn         v16.4h,  v24.4s,  #1
+        sqrshrn         v17.4h,  v25.4s,  #1
+        sqrshrn         v18.4h,  v26.4s,  #1
+        sqrshrn         v19.4h,  v27.4s,  #1
+        sqrshrn2        v16.8h,  v28.4s,  #1
+        sqrshrn2        v17.8h,  v29.4s,  #1
+        sqrshrn2        v18.8h,  v30.4s,  #1
+        sqrshrn2        v19.8h,  v31.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5
+        add             x6,  x0,  #16
+        load_add_store_8x4 x6, x7
+
+        br              x15
+endfunc
+
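+// 4x16: each 4x4 quarter of the input is either transformed or, if the eob
+// threshold from x13 says it is all zero, replaced with zeros; the quarters
+// are handled from the last one down before running the 16 point second
+// pass over the whole height.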
+function inv_txfm_add_4x16_neon
+        ldrh            w12, [x13, #4]
+        mov             x15, x30
+
+        mov             x11, #64
+
+        cmp             w3,  w12
+        ldrh            w12, [x13, #2]
+        b.lt            1f
+
+        add             x6,  x2,  #48
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v28.4h,  v16.4s,  #1
+        rshrn           v29.4h,  v17.4s,  #1
+        rshrn           v30.4h,  v18.4s,  #1
+        rshrn           v31.4h,  v19.4s,  #1
+        transpose_4x4h  v28, v29, v30, v31, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+        movi            \i,  #0
+.endr
+2:
+        cmp             w3,  w12
+        ldrh            w12, [x13, #0]
+        b.lt            1f
+
+        add             x6,  x2,  #32
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v24.4h,  v16.4s,  #1
+        rshrn           v25.4h,  v17.4s,  #1
+        rshrn           v26.4h,  v18.4s,  #1
+        rshrn           v27.4h,  v19.4s,  #1
+        transpose_4x4h  v24, v25, v26, v27, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+        movi            \i,  #0
+.endr
+2:
+        cmp             w3,  w12
+        b.lt            1f
+
+        add             x6,  x2,  #16
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v20.4h,  v16.4s,  #1
+        rshrn           v21.4h,  v17.4s,  #1
+        rshrn           v22.4h,  v18.4s,  #1
+        rshrn           v23.4h,  v19.4s,  #1
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+        movi            \i,  #0
+.endr
+2:
+
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x2]
+        st1             {v2.4s}, [x2], x11
+.endr
+        blr             x4
+        rshrn           v16.4h,  v16.4s,  #1
+        rshrn           v17.4h,  v17.4s,  #1
+        rshrn           v18.4h,  v18.4s,  #1
+        rshrn           v19.4h,  v19.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+
+        blr             x5
+
+        load_add_store_4x16 x0, x6
+
+        br              x15
+endfunc
+
+const eob_4x16
+        .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+        .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+        .short 4, 8, 12, 64
+endconst
+
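+// Frontends for the 4x16 and 16x4 transforms; the 4 wide case also needs
+// the per-quarter eob thresholds in x13, picked according to which of the
+// two passes (if any) is an identity transform.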
+.macro def_fn_416 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+.if \w == 4
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+        movrel          x5,  X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_4x16
+.else
+        movrel          x13, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_4x16_identity2
+.else
+        movrel          x13, eob_4x16
+.endif
+.endif
+.else
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
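+// 16x8: both input halves get the rectangular 2896/4096 prescale; since the
+// 16 point first pass needs all of v16-v31, the narrowed result of the
+// first half is parked in v8-v15 (saved/restored as d8-d15) while the
+// second half is transformed.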
+function inv_txfm_add_16x8_neon
+        mov             x15, x30
+        stp             d8,  d9,  [sp, #-0x40]!
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+        movi            v4.4s,  #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+
+        mov             x11, #32
+
+        add             x6,  x2,  #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+        blr             x4
+
+        sqrshrn         v8.4h,   v16.4s,  #1
+        sqrshrn         v9.4h,   v17.4s,  #1
+        sqrshrn         v10.4h,  v18.4s,  #1
+        sqrshrn         v11.4h,  v19.4s,  #1
+        sqrshrn2        v8.8h,   v20.4s,  #1
+        sqrshrn2        v9.8h,   v21.4s,  #1
+        sqrshrn2        v10.8h,  v22.4s,  #1
+        sqrshrn2        v11.8h,  v23.4s,  #1
+        sqrshrn         v12.4h,  v24.4s,  #1
+        sqrshrn         v13.4h,  v25.4s,  #1
+        sqrshrn         v14.4h,  v26.4s,  #1
+        sqrshrn         v15.4h,  v27.4s,  #1
+        sqrshrn2        v12.8h,  v28.4s,  #1
+        sqrshrn2        v13.8h,  v29.4s,  #1
+        sqrshrn2        v14.8h,  v30.4s,  #1
+        sqrshrn2        v15.8h,  v31.4s,  #1
+
+        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
+        transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5
+
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+
+        movi            v4.4s,  #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], x11
+.endr
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+
+        mov             v20.16b, v8.16b
+        mov             v21.16b, v9.16b
+        mov             v22.16b, v10.16b
+        mov             v23.16b, v11.16b
+
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+
+        sqrshrn         v8.4h,   v24.4s,  #1
+        sqrshrn         v9.4h,   v25.4s,  #1
+        sqrshrn         v10.4h,  v26.4s,  #1
+        sqrshrn         v11.4h,  v27.4s,  #1
+        sqrshrn2        v8.8h,   v28.4s,  #1
+        sqrshrn2        v9.8h,   v29.4s,  #1
+        sqrshrn2        v10.8h,  v30.4s,  #1
+        sqrshrn2        v11.8h,  v31.4s,  #1
+
+        transpose_4x8h  v8,  v9, v10, v11, v2,  v3,  v4,  v5
+
+        blr             x5
+
+        mov             x6,  x0
+        load_add_store_8x8 x6, x7
+
+        mov             v16.16b, v8.16b
+        mov             v17.16b, v9.16b
+        mov             v18.16b, v10.16b
+        mov             v19.16b, v11.16b
+        mov             v20.16b, v12.16b
+        mov             v21.16b, v13.16b
+        mov             v22.16b, v14.16b
+        mov             v23.16b, v15.16b
+
+        blr             x5
+
+        add             x0,  x0,  #16
+        load_add_store_8x8 x0, x7
+
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x40
+        br              x15
+endfunc
+
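+// 8x16: like 4x16 but with 8x4 quarters and the rectangular 2896/4096 input
+// prescale; one quarter of the narrowed first pass output is kept in the
+// callee saved v8-v11 registers.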
+function inv_txfm_add_8x16_neon
+        mov             x15, x30
+        stp             d8,  d9,  [sp, #-0x20]!
+        stp             d10, d11, [sp, #0x10]
+        ldrh            w12, [x13, #4]
+
+        mov             x11, #64
+
+        cmp             w3,  w12
+        ldrh            w12, [x13, #2]
+        b.lt            1f
+
+        add             x6,  x2,  #48
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v28.4h,  v16.4s,  #1
+        sqrshrn         v29.4h,  v17.4s,  #1
+        sqrshrn         v30.4h,  v18.4s,  #1
+        sqrshrn         v31.4h,  v19.4s,  #1
+        sqrshrn2        v28.8h,  v20.4s,  #1
+        sqrshrn2        v29.8h,  v21.4s,  #1
+        sqrshrn2        v30.8h,  v22.4s,  #1
+        sqrshrn2        v31.8h,  v23.4s,  #1
+        transpose_4x8h  v28, v29, v30, v31, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+        movi            \i,  #0
+.endr
+
+2:
+        cmp             w3,  w12
+        ldrh            w12, [x13, #0]
+        b.lt            1f
+
+        add             x6,  x2,  #32
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v24.4h,  v16.4s,  #1
+        sqrshrn         v25.4h,  v17.4s,  #1
+        sqrshrn         v26.4h,  v18.4s,  #1
+        sqrshrn         v27.4h,  v19.4s,  #1
+        sqrshrn2        v24.8h,  v20.4s,  #1
+        sqrshrn2        v25.8h,  v21.4s,  #1
+        sqrshrn2        v26.8h,  v22.4s,  #1
+        sqrshrn2        v27.8h,  v23.4s,  #1
+        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+        movi            \i,  #0
+.endr
+
+2:
+        cmp             w3,  w12
+        b.lt            1f
+
+        add             x6,  x2,  #16
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v8.4h,   v16.4s,  #1
+        sqrshrn         v9.4h,   v17.4s,  #1
+        sqrshrn         v10.4h,  v18.4s,  #1
+        sqrshrn         v11.4h,  v19.4s,  #1
+        sqrshrn2        v8.8h,   v20.4s,  #1
+        sqrshrn2        v9.8h,   v21.4s,  #1
+        sqrshrn2        v10.8h,  v22.4s,  #1
+        sqrshrn2        v11.8h,  v23.4s,  #1
+        transpose_4x8h  v8,  v9,  v10, v11, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+        movi            \i,  #0
+.endr
+
+2:
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5
+
+        mov             v20.16b, v8.16b
+        mov             v21.16b, v9.16b
+        mov             v22.16b, v10.16b
+        mov             v23.16b, v11.16b
+
+        blr             x5
+
+        load_add_store_8x16 x0, x6
+
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x20
+
+        br              x15
+endfunc
+
+const eob_8x16
+        .short 10, 43, 75, 128
+endconst
+
+const eob_8x16_identity1
+        .short 4, 64, 96, 128
+endconst
+
+const eob_8x16_identity2
+        .short 4, 8, 12, 128
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon
+        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)
+.if \w == 8
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_8x16
+.else
+        movrel          x13, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_8x16_identity2
+.else
+        movrel          x13, eob_8x16
+.endif
+.endif
+.endif
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+def_fns_816 8, 16
+def_fns_816 16, 8
+
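+// The odd half of a 32 point inverse DCT: takes the 16 odd indexed inputs
+// in v16-v31 and produces out16-out31, using the second half of idct_coeffs
+// (hence the 4*16 byte offset into the table).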
+function inv_dct32_odd_4s_x16_neon
+        movrel          x16, idct_coeffs, 4*16
+        ld1             {v0.4s, v1.4s}, [x16], #32
+
+        mul_mls         v2,  v16, v31, v0.s[0], v0.s[1] // -> t16a
+        mul_mla         v4,  v16, v31, v0.s[1], v0.s[0] // -> t31a
+        mul_mls         v6,  v24, v23, v0.s[2], v0.s[3] // -> t17a
+        srshr           v16.4s, v2.4s,  #12             // t16a
+        srshr           v31.4s, v4.4s,  #12             // t31a
+        mul_mla         v2,  v24, v23, v0.s[3], v0.s[2] // -> t30a
+        mul_mls         v4,  v20, v27, v1.s[0], v1.s[1] // -> t18a
+        srshr           v24.4s, v6.4s,  #12             // t17a
+        srshr           v23.4s, v2.4s,  #12             // t30a
+        mul_mla         v6,  v20, v27, v1.s[1], v1.s[0] // -> t29a
+        mul_mls         v2,  v28, v19, v1.s[2], v1.s[3] // -> t19a
+        srshr           v20.4s, v4.4s,  #12             // t18a
+        srshr           v27.4s, v6.4s,  #12             // t29a
+        mul_mla         v4,  v28, v19, v1.s[3], v1.s[2] // -> t28a
+        ld1             {v0.4s, v1.4s}, [x16]
+        sub             x16, x16, #4*24
+        mul_mls         v6,  v18, v29, v0.s[0], v0.s[1] // -> t20a
+        srshr           v28.4s, v2.4s,  #12             // t19a
+        srshr           v19.4s, v4.4s,  #12             // t28a
+        mul_mla         v2,  v18, v29, v0.s[1], v0.s[0] // -> t27a
+        mul_mls         v4,  v26, v21, v0.s[2], v0.s[3] // -> t21a
+        srshr           v18.4s, v6.4s,  #12             // t20a
+        srshr           v29.4s, v2.4s,  #12             // t27a
+        mul_mla         v6,  v26, v21, v0.s[3], v0.s[2] // -> t26a
+        mul_mls         v2,  v22, v25, v1.s[0], v1.s[1] // -> t22a
+        srshr           v26.4s, v4.4s,  #12             // t21a
+        srshr           v21.4s, v6.4s,  #12             // t26a
+        mul_mla         v4,  v22, v25, v1.s[1], v1.s[0] // -> t25a
+        mul_mls         v6,  v30, v17, v1.s[2], v1.s[3] // -> t23a
+        srshr           v22.4s, v2.4s,  #12             // t22a
+        srshr           v25.4s, v4.4s,  #12             // t25a
+        mul_mla         v2,  v30, v17, v1.s[3], v1.s[2] // -> t24a
+        srshr           v30.4s, v6.4s,  #12             // t23a
+        srshr           v17.4s, v2.4s,  #12             // t24a
+
+        ld1             {v0.4s, v1.4s}, [x16]
+
+        sqsub           v2.4s,   v16.4s,  v24.4s // t17
+        sqadd           v16.4s,  v16.4s,  v24.4s // t16
+        sqsub           v3.4s,   v31.4s,  v23.4s // t30
+        sqadd           v31.4s,  v31.4s,  v23.4s // t31
+        sqsub           v24.4s,  v28.4s,  v20.4s // t18
+        sqadd           v28.4s,  v28.4s,  v20.4s // t19
+        sqadd           v23.4s,  v18.4s,  v26.4s // t20
+        sqsub           v18.4s,  v18.4s,  v26.4s // t21
+        sqsub           v20.4s,  v30.4s,  v22.4s // t22
+        sqadd           v30.4s,  v30.4s,  v22.4s // t23
+        sqadd           v26.4s,  v17.4s,  v25.4s // t24
+        sqsub           v17.4s,  v17.4s,  v25.4s // t25
+        sqsub           v22.4s,  v29.4s,  v21.4s // t26
+        sqadd           v29.4s,  v29.4s,  v21.4s // t27
+        sqadd           v25.4s,  v19.4s,  v27.4s // t28
+        sqsub           v19.4s,  v19.4s,  v27.4s // t29
+
+        mul_mls         v4,  v3,  v2,  v1.s[0], v1.s[1] // -> t17a
+        mul_mla         v6,  v3,  v2,  v1.s[1], v1.s[0] // -> t30a
+        mul_mla         v2,  v19, v24, v1.s[1], v1.s[0] // -> t18a
+        srshr           v21.4s, v4.4s,  #12             // t17a
+        srshr           v27.4s, v6.4s,  #12             // t30a
+        neg             v2.4s,   v2.4s                  // -> t18a
+        mul_mls         v4,  v19, v24, v1.s[0], v1.s[1] // -> t29a
+        mul_mls         v6,  v22, v18, v1.s[2], v1.s[3] // -> t21a
+        srshr           v19.4s, v2.4s,  #12             // t18a
+        srshr           v24.4s, v4.4s,  #12             // t29a
+        mul_mla         v2,  v22, v18, v1.s[3], v1.s[2] // -> t26a
+        mul_mla         v4,  v17, v20, v1.s[3], v1.s[2] // -> t22a
+        srshr           v22.4s, v6.4s,  #12             // t21a
+        srshr           v18.4s, v2.4s,  #12             // t26a
+        neg             v4.4s,   v4.4s                  // -> t22a
+        mul_mls         v6,  v17, v20, v1.s[2], v1.s[3] // -> t25a
+        srshr           v17.4s, v4.4s,  #12             // t22a
+        srshr           v20.4s, v6.4s,  #12             // t25a
+
+        sqsub           v2.4s,   v27.4s,  v24.4s // t29
+        sqadd           v27.4s,  v27.4s,  v24.4s // t30
+        sqsub           v3.4s,   v21.4s,  v19.4s // t18
+        sqadd           v21.4s,  v21.4s,  v19.4s // t17
+        sqsub           v24.4s,  v16.4s,  v28.4s // t19a
+        sqadd           v16.4s,  v16.4s,  v28.4s // t16a
+        sqsub           v19.4s,  v30.4s,  v23.4s // t20a
+        sqadd           v30.4s,  v30.4s,  v23.4s // t23a
+        sqsub           v28.4s,  v17.4s,  v22.4s // t21
+        sqadd           v17.4s,  v17.4s,  v22.4s // t22
+        sqadd           v23.4s,  v26.4s,  v29.4s // t24a
+        sqsub           v26.4s,  v26.4s,  v29.4s // t27a
+        sqadd           v22.4s,  v20.4s,  v18.4s // t25
+        sqsub           v20.4s,  v20.4s,  v18.4s // t26
+        sqsub           v29.4s,  v31.4s,  v25.4s // t28a
+        sqadd           v31.4s,  v31.4s,  v25.4s // t31a
+
+        mul_mls         v4,  v2,  v3,  v0.s[2], v0.s[3] // -> t18a
+        mul_mla         v6,  v2,  v3,  v0.s[3], v0.s[2] // -> t29a
+        mul_mls         v2,  v29, v24, v0.s[2], v0.s[3] // -> t19
+        srshr           v18.4s, v4.4s,  #12             // t18a
+        srshr           v25.4s, v6.4s,  #12             // t29a
+        mul_mla         v4,  v29, v24, v0.s[3], v0.s[2] // -> t28
+        mul_mla         v6,  v26, v19, v0.s[3], v0.s[2] // -> t20
+        srshr           v29.4s, v2.4s,  #12             // t19
+        srshr           v24.4s, v4.4s,  #12             // t28
+        neg             v6.4s,   v6.4s                  // -> t20
+        mul_mls         v2,  v26, v19, v0.s[2], v0.s[3] // -> t27
+        mul_mla         v4,  v20, v28, v0.s[3], v0.s[2] // -> t21a
+        srshr           v26.4s, v6.4s,  #12             // t20
+        srshr           v19.4s, v2.4s,  #12             // t27
+        neg             v4.4s,   v4.4s                  // -> t21a
+        mul_mls         v6,  v20, v28, v0.s[2], v0.s[3] // -> t26a
+        srshr           v20.4s, v4.4s,  #12             // t21a
+        srshr           v28.4s, v6.4s,  #12             // t26a
+
+        sqsub           v2.4s,   v16.4s,  v30.4s // t23
+        sqadd           v16.4s,  v16.4s,  v30.4s // t16 = out16
+        sqsub           v3.4s,   v31.4s,  v23.4s // t24
+        sqadd           v31.4s,  v31.4s,  v23.4s // t31 = out31
+        sqsub           v23.4s,  v21.4s,  v17.4s // t22a
+        sqadd           v17.4s,  v21.4s,  v17.4s // t17a = out17
+        sqadd           v30.4s,  v27.4s,  v22.4s // t30a = out30
+        sqsub           v21.4s,  v27.4s,  v22.4s // t25a
+        sqsub           v27.4s,  v18.4s,  v20.4s // t21
+        sqadd           v18.4s,  v18.4s,  v20.4s // t18 = out18
+        sqadd           v4.4s,   v29.4s,  v26.4s // t19a = out19
+        sqsub           v26.4s,  v29.4s,  v26.4s // t20a
+        sqadd           v29.4s,  v25.4s,  v28.4s // t29 = out29
+        sqsub           v25.4s,  v25.4s,  v28.4s // t26
+        sqadd           v28.4s,  v24.4s,  v19.4s // t28a = out28
+        sqsub           v24.4s,  v24.4s,  v19.4s // t27a
+        mov             v19.16b, v4.16b          // out19
+
+        mul_mls         v4,  v24, v26, v0.s[0], v0.s[0] // -> t20
+        mul_mla         v6,  v24, v26, v0.s[0], v0.s[0] // -> t27
+        srshr           v20.4s, v4.4s,  #12             // t20
+        srshr           v22.4s, v6.4s,  #12             // t27
+
+        mul_mla         v4,  v25, v27, v0.s[0], v0.s[0] // -> t26a
+        mul_mls         v6,  v25, v27, v0.s[0], v0.s[0] // -> t21a
+        mov             v27.16b,  v22.16b               // t27
+        srshr           v26.4s, v4.4s,  #12             // t26a
+
+        mul_mls         v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+        mul_mla         v4,  v21, v23, v0.s[0], v0.s[0] // -> t25
+        srshr           v21.4s, v6.4s,  #12             // t21a
+        srshr           v22.4s, v24.4s, #12             // t22
+        srshr           v25.4s, v4.4s,  #12             // t25
+
+        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t23a
+        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t24a
+        srshr           v23.4s, v4.4s,  #12             // t23a
+        srshr           v24.4s, v6.4s,  #12             // t24a
+
+        ret
+endfunc
+
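+// First pass helper for the 32 point DCT: the even inputs go through
+// inv_dct_4s_x16_neon (staged at x6 via store1) and the odd ones through
+// inv_dct32_odd_4s_x16_neon, and store2 then applies the final butterflies
+// between the two halves, narrows to 16 bit and stores the combined rows
+// to x6.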
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+        mov             x14, x30
+        movi            v7.4s,  #0
+        lsl             x8,  x8,  #1
+.if \scale
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]
+        st1             {v7.4s}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        add             x7,  x7,  x8, lsr #1
+.if \scale
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct_4s_x16_neon
+        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
+        transpose_4x4s  v24, v25, v26, v27, v2,  v3,  v4,  v5
+        transpose_4x4s  v28, v29, v30, v31, v2,  v3,  v4,  v5
+
+.macro store1 r0, r1, r2, r3
+        st1             {\r0}, [x6], #16
+        st1             {\r1}, [x6], #16
+        st1             {\r2}, [x6], #16
+        st1             {\r3}, [x6], #16
+.endm
+        store1          v16.4s,  v20.4s,  v24.4s,  v28.4s
+        store1          v17.4s,  v21.4s,  v25.4s,  v29.4s
+        store1          v18.4s,  v22.4s,  v26.4s,  v30.4s
+        store1          v19.4s,  v23.4s,  v27.4s,  v31.4s
+.purgem store1
+        sub             x6,  x6,  #64*4
+
+        movi            v7.4s,  #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]
+        st1             {v7.4s}, [x7], x8
+.endr
+.if \scale
+        // This relies on the fact that the idct also leaves the right coeff
+        // (the same 2896 scale constant) in v0.s[1]
+        scale_input     .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct32_odd_4s_x16_neon
+        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
+        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
+        transpose_4x4s  v23, v22, v21, v20, v2,  v3,  v4,  v5
+        transpose_4x4s  v19, v18, v17, v16, v2,  v3,  v4,  v5
+.macro store2 r0, r1, r2, r3, shift
+        ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
+        sqsub           v4.4s,   v0.4s,   \r0
+        sqadd           v0.4s,   v0.4s,   \r0
+        sqsub           v5.4s,   v1.4s,   \r1
+        sqadd           v1.4s,   v1.4s,   \r1
+        sqsub           v6.4s,   v2.4s,   \r2
+        sqadd           v2.4s,   v2.4s,   \r2
+        sqsub           v7.4s,   v3.4s,   \r3
+        sqadd           v3.4s,   v3.4s,   \r3
+        sqrshrn         v0.4h,   v0.4s,   #\shift
+        sqrshrn2        v0.8h,   v1.4s,   #\shift
+        sqrshrn         v1.4h,   v2.4s,   #\shift
+        sqrshrn2        v1.8h,   v3.4s,   #\shift
+        sqrshrn         v2.4h,   v7.4s,   #\shift
+        sqrshrn2        v2.8h,   v6.4s,   #\shift
+        sqrshrn         v3.4h,   v5.4s,   #\shift
+        sqrshrn2        v3.8h,   v4.4s,   #\shift
+        st1             {v0.8h, v1.8h}, [x6], #32
+        rev64           v2.8h,   v2.8h
+        rev64           v3.8h,   v3.8h
+        st1             {v2.8h, v3.8h}, [x6], #32
+.endm
+
+        store2          v31.4s,  v27.4s,  v23.4s,  v19.4s,  \shift
+        store2          v30.4s,  v26.4s,  v22.4s,  v18.4s,  \shift
+        store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
+        store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
+.purgem store2
+        br              x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
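+// Second pass of a 32 point DCT on 8 pixel wide columns, reusing the 16 bit
+// 8 bpc transforms via X(); the combine macro does the final butterflies,
+// adds the result to the destination rows and clamps to the 10 bit pixel
+// range (0 .. 0x3ff kept in v1).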
+function inv_txfm_add_vert_dct_8x32_neon
+        mov             x14, x30
+        lsl             x8,  x8,  #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+
+        bl              X(inv_dct_8h_x16_neon)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        st1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        add             x7,  x7,  x8, lsr #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        sub             x7,  x7,  x8, lsl #4
+        sub             x7,  x7,  x8, lsr #1
+        bl              X(inv_dct32_odd_8h_x16_neon)
+
+        neg             x9,  x8
+        mov             x10, x6
+        movi            v0.8h,   #0
+        mvni            v1.8h,   #0xfc, lsl #8 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride
+        ld1             {v5.8h}, [x7],    \stride
+        ld1             {v2.8h}, [x10],   x1
+        ld1             {v6.8h}, [x7],    \stride
+        ld1             {v3.8h}, [x10],   x1
+        \op             v5.8h,   v5.8h,   \r0
+        ld1             {v7.8h}, [x7],    \stride
+        ld1             {v4.8h}, [x10],   x1
+        srshr           v5.8h,   v5.8h,   #4
+        \op             v6.8h,   v6.8h,   \r1
+        sqadd           v5.8h,   v5.8h,   v2.8h
+        srshr           v6.8h,   v6.8h,   #4
+        \op             v7.8h,   v7.8h,   \r2
+        smax            v2.8h,   v5.8h,   v0.8h
+        ld1             {v5.8h}, [x7],    \stride
+        sqadd           v6.8h,   v6.8h,   v3.8h
+        smin            v2.8h,   v2.8h,   v1.8h
+        srshr           v7.8h,   v7.8h,   #4
+        \op             v5.8h,   v5.8h,   \r3
+        st1             {v2.8h}, [x6],    x1
+        ld1             {v2.8h}, [x10],   x1
+        smax            v3.8h,   v6.8h,   v0.8h
+        sqadd           v7.8h,   v7.8h,   v4.8h
+        smin            v3.8h,   v3.8h,   v1.8h
+        srshr           v5.8h,   v5.8h,   #4
+        st1             {v3.8h}, [x6],    x1
+        smax            v4.8h,   v7.8h,   v0.8h
+        sqadd           v5.8h,   v5.8h,   v2.8h
+        smin            v4.8h,   v4.8h,   v1.8h
+        st1             {v4.8h}, [x6],    x1
+        smax            v2.8h,   v5.8h,   v0.8h
+        smin            v2.8h,   v2.8h,   v1.8h
+        st1             {v2.8h}, [x6],    x1
+.endm
+        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+        sub             x7,  x7,  x8
+        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+        br              x14
+endfunc
+
+const eob_32x32
+        .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32
+        .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside
+        .short 10, 36, 78, 512
+endconst
+
+const eob_8x32
+        .short 10, 43, 75, 107, 139, 171, 203, 256
+endconst
+
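+// identity_identity 32x32 is handled as plain 8x8 blocks of coefficients,
+// narrowed from 32 to 16 bit and added to the destination with an overall
+// >> 2; the eob lists are read every other entry since each loop iteration
+// covers two 4 line strips.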
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+        movi            v0.8h,  #0
+        movi            v1.8h,  #0
+        movrel          x13, eob_32x32, 2
+
+        mov             x8,  #4*32
+1:
+        mov             w9,  #0
+        movrel          x12, eob_32x32, 2
+2:
+        add             w9,  w9,  #8
+        ld1             {v16.4s, v17.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        sqxtn           v16.4h,  v16.4s
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+        load_add_store_8x8 x0, x7, shiftbits=2
+        ldrh            w11, [x12], #4
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #2*8
+        cmp             w3,  w11
+        b.ge            2b
+
+        ldrh            w11, [x13], #4
+        cmp             w3,  w11
+        b.lt            9f
+
+        sub             x0,  x0,  w9, uxtw #1
+        add             x0,  x0,  x1, lsl #3
+        msub            x2,  x8,  x9,  x2
+        add             x2,  x2,  #4*8
+        b               1b
+9:
+        ret
+endfunc
+
+.macro shift_16_regs op, shift
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        \op             \i,  \i,  #\shift
+.endr
+.endm
+
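+// identity_identity 16x32 and 32x16: the coefficients get the 2896/4096
+// prescale (v2.s[0]) and the identity16 scaling (multiplier in v2.s[1]);
+// the 16 wide case uses identity_4x16_shift1 with a final >> 2, the 32 wide
+// case doubles the coefficients first (sqshl #1), uses identity_4x16 and
+// shifts by 4 at the end.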
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+        movz            w16, #2896*8, lsl #16
+        movz            w17, #2*(5793-4096)*8, lsl #16
+        movi            v0.4s,   #0
+        movi            v1.4s,   #0
+        movrel          x13, eob_16x32\hshort, 2
+
+        mov             x8,  #4*\h
+1:
+        mov             w9,  #0
+        movrel          x12, eob_16x32\wshort, 2
+2:
+        add             w9,  w9,  #8
+        ld1             {v16.4s, v17.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        dup             v2.2s,   w16
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        mov             v2.s[1], w17
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        scale_input     .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+        // 16x32
+        identity_4x16_shift1 v2.s[1]
+.else
+        // 32x16
+        shift_16_regs   sqshl, 1
+        identity_4x16   v2.s[1]
+.endif
+        sqxtn           v16.4h,  v16.4s
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+        ldrh            w11, [x12], #4
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #16
+        cmp             w3,  w11
+        b.ge            2b
+
+        ldrh            w11, [x13], #4
+        cmp             w3,  w11
+        b.lt            9f
+
+        sub             x0,  x0,  w9, uxtw #1
+        add             x0,  x0,  x1, lsl #3
+        msub            x2,  x8,  x9,  x2
+        add             x2,  x2,  #4*8
+        b               1b
+9:
+        ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
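+
+// Note on the constants loaded in def_identity_1632 above: 2896*8 placed in
+// the high half of a lane (presumably applied via sqrdmulh in scale_input)
+// computes round2(in*2896, 12), the 1/sqrt(2) downscale used for rectangular
+// transforms, and 2*(5793-4096)*8 likewise computes (in*1697 + 1024) >> 11,
+// which the identity_4x16 helpers defined earlier are assumed to combine
+// with 2*in (plus the variant's extra shift) into round2(in*2*5793, 12),
+// i.e. the 2*sqrt(2) scale of the 16-point identity transform.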
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+        movi            v0.4s,  #0
+        movi            v1.4s,  #0
+        // Working on 8x8 blocks, read every other entry from eob_8x32
+        movrel          x13, eob_8x32, 2
+
+        mov             w8,  #4*\h
+1:
+        // Working on 8x8 blocks, read every other entry from eob_8x32
+        ldrh            w12, [x13], #4
+        ld1             {v16.4s, v17.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+
+.if \w == 8
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn2        v16.8h,  v17.4s,  #1
+        sqrshrn         v17.4h,  v18.4s,  #1
+        sqrshrn2        v17.8h,  v19.4s,  #1
+        sqrshrn         v18.4h,  v20.4s,  #1
+        sqrshrn2        v18.8h,  v21.4s,  #1
+        sqrshrn         v19.4h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        sqrshrn         v20.4h,  v24.4s,  #1
+        sqrshrn2        v20.8h,  v25.4s,  #1
+        sqrshrn         v21.4h,  v26.4s,  #1
+        sqrshrn2        v21.8h,  v27.4s,  #1
+        sqrshrn         v22.4h,  v28.4s,  #1
+        sqrshrn2        v22.8h,  v29.4s,  #1
+        sqrshrn         v23.4h,  v30.4s,  #1
+        sqrshrn2        v23.8h,  v31.4s,  #1
+.else
+        sqxtn           v16.4h,  v16.4s
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+
+        cmp             w3,  w12
+.if \w == 8
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+        b.lt            9f
+.if \w == 8
+        sub             x2,  x2,  x8, lsl #3
+        add             x2,  x2,  #4*8
+.else
+        sub             x0,  x0,  x1, lsl #3
+        add             x0,  x0,  #2*8
+.endif
+        b               1b
+
+9:
+        ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
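+
+// Note on the eob handling in the identity functions above and the dct
+// frontends below: w3 holds the eob, and the eob_* tables defined earlier in
+// this file are assumed to hold one threshold per 4-line step of the
+// coefficient block. The loops keep going while "cmp w3, <threshold>" shows
+// that the eob reaches into the next step, and otherwise either return early
+// (identity cases) or branch to a path that zero-fills the rest of the
+// intermediate buffer (dct cases). Loading thresholds with
+// "ldrh wN, [xN], #4" skips every other table entry, since each iteration
+// here covers 8 lines rather than 4.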
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+        idct_dc         32,  32,  2
+
+        mov             x15, x30
+        sub             sp,  sp,  #2048
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  sp,  #(\i*32*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #2048
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+        idct_dc         16,  32,  1
+
+        mov             x15, x30
+        sub             sp,  sp,  #1024
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2
+        adr             x4,  inv_dct_4s_x16_neon
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  sp,  #(\i*16*2)
+        add             x7,  x2,  #(\i*4)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endif
+        mov             x8,  #4*32
+        bl              inv_txfm_horz_scale_16x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #16*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #1024
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+        idct_dc         32,  16,  1
+
+        mov             x15, x30
+        sub             sp,  sp,  #1024
+
+        movrel          x13, eob_16x32
+        movrel          x5,  X(inv_dct_8h_x16_neon)
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 4, 8, 12
+        add             x6,  sp,  #(\i*32*2)
+        add             x7,  x2,  #(\i*4)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        mov             x8,  #4*16
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i*2)
+        add             x7,  sp,  #(\i*2)
+        mov             x8,  #32*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #1024
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+        idct_dc         8,   32, 2
+
+        mov             x15, x30
+        sub             sp,  sp,  #512
+
+        movrel          x13, eob_8x32
+
+        movi            v28.4s,  #0
+        mov             x8,  #4*32
+        mov             w9,  #32
+        mov             x6,  sp
+        mov             x7,  x2
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().4s}, [x7]
+        st1             {v28.4s}, [x7], x8
+.endr
+        ldrh            w12, [x13], #2
+        sub             w9,  w9,  #4
+        sub             x7,  x7,  x8, lsl #3
+        add             x7,  x7,  #4*4
+
+        bl              inv_dct_4s_x8_neon
+
+        sqrshrn         v16.4h,  v16.4s,  #2
+        sqrshrn         v17.4h,  v17.4s,  #2
+        sqrshrn         v18.4h,  v18.4s,  #2
+        sqrshrn         v19.4h,  v19.4s,  #2
+        sqrshrn2        v16.8h,  v20.4s,  #2
+        sqrshrn2        v17.8h,  v21.4s,  #2
+        sqrshrn2        v18.8h,  v22.4s,  #2
+        sqrshrn2        v19.8h,  v23.4s,  #2
+
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+
+        cmp             w3,  w12
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+
+        b.ge            1b
+        cbz             w9,  3f
+
+        movi            v29.8h,  #0
+        movi            v30.8h,  #0
+        movi            v31.8h,  #0
+2:
+        subs            w9,  w9,  #4
+        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+        b.gt            2b
+
+3:
+        mov             x6,  x0
+        mov             x7,  sp
+        mov             x8,  #8*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
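+
+// The sqrshrn #2 pairs above are rounding narrowing stores of the first
+// pass: each 32-bit result becomes clip_int16((x + 2) >> 2), fusing the
+// intermediate downshift of 2 with saturation to 16 bits so that the
+// vertical pass can work on .8h vectors.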
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+        idct_dc         32,  8,   2
+
+        mov             x15, x30
+        sub             sp,  sp,  #512
+
+.irp i, 0, 4
+        add             x6,  sp,  #(\i*32*2)
+        add             x7,  x2,  #(\i*4)
+.if \i > 0
+        cmp             w3,  #10
+        b.lt            1f
+.endif
+        mov             x8,  #8*4
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               2f
+
+1:
+        movi            v4.8h,   #0
+        movi            v5.8h,   #0
+        movi            v6.8h,   #0
+        movi            v7.8h,   #0
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+        mov             x8,  #2*32
+        mov             w9,  #0
+1:
+        add             x6,  x0,  x9, lsl #1
+        add             x7,  sp,  x9, lsl #1 // x9 columns * 2 bytes into the intermediate buffer
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        add             w9,  w9,  #8
+
+        bl              X(inv_dct_8h_x8_neon)
+
+        cmp             w9,  #32
+
+        load_add_store_8x8 x6, x7
+
+        b.lt            1b
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
+function inv_dct64_step1_neon
+        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+        ld1             {v0.4s, v1.4s}, [x17], #32
+
+        sqrdmulh        v23.4s,  v16.4s,  v0.s[1]       // t63a
+        sqrdmulh        v16.4s,  v16.4s,  v0.s[0]       // t32a
+        sqrdmulh        v22.4s,  v17.4s,  v0.s[2]       // t62a
+        sqrdmulh        v17.4s,  v17.4s,  v0.s[3]       // t33a
+        sqrdmulh        v21.4s,  v18.4s,  v1.s[1]       // t61a
+        sqrdmulh        v18.4s,  v18.4s,  v1.s[0]       // t34a
+        sqrdmulh        v20.4s,  v19.4s,  v1.s[2]       // t60a
+        sqrdmulh        v19.4s,  v19.4s,  v1.s[3]       // t35a
+
+        ld1             {v0.4s}, [x17], #16
+
+        sqadd           v24.4s,  v16.4s,  v17.4s        // t32
+        sqsub           v25.4s,  v16.4s,  v17.4s        // t33
+        sqsub           v26.4s,  v19.4s,  v18.4s        // t34
+        sqadd           v27.4s,  v19.4s,  v18.4s        // t35
+        sqadd           v28.4s,  v20.4s,  v21.4s        // t60
+        sqsub           v29.4s,  v20.4s,  v21.4s        // t61
+        sqsub           v30.4s,  v23.4s,  v22.4s        // t62
+        sqadd           v31.4s,  v23.4s,  v22.4s        // t63
+
+        mul_mla         v2,  v29, v26, v0.s[0], v0.s[1] // -> t34a
+        mul_mls         v4,  v29, v26, v0.s[1], v0.s[0] // -> t61a
+        neg             v2.4s,   v2.4s                  // t34a
+        mul_mls         v6,  v30, v25, v0.s[1], v0.s[0] // -> t33a
+        srshr           v26.4s, v2.4s,  #12             // t34a
+        mul_mla         v2,  v30, v25, v0.s[0], v0.s[1] // -> t62a
+        srshr           v29.4s, v4.4s,  #12             // t61a
+        srshr           v25.4s, v6.4s,  #12             // t33a
+        srshr           v30.4s, v2.4s,  #12             // t62a
+
+        sqadd           v16.4s,  v24.4s,  v27.4s        // t32a
+        sqsub           v19.4s,  v24.4s,  v27.4s        // t35a
+        sqadd           v17.4s,  v25.4s,  v26.4s        // t33
+        sqsub           v18.4s,  v25.4s,  v26.4s        // t34
+        sqsub           v20.4s,  v31.4s,  v28.4s        // t60a
+        sqadd           v23.4s,  v31.4s,  v28.4s        // t63a
+        sqsub           v21.4s,  v30.4s,  v29.4s        // t61
+        sqadd           v22.4s,  v30.4s,  v29.4s        // t62
+
+        mul_mla         v2,  v21, v18, v0.s[2], v0.s[3] // -> t61a
+        mul_mls         v4,  v21, v18, v0.s[3], v0.s[2] // -> t34a
+        mul_mla         v6,  v20, v19, v0.s[2], v0.s[3] // -> t60
+        srshr           v21.4s, v2.4s,  #12             // t61a
+        srshr           v18.4s, v4.4s,  #12             // t34a
+        mul_mls         v2,  v20, v19, v0.s[3], v0.s[2] // -> t35
+        srshr           v20.4s, v6.4s,  #12             // t60
+        srshr           v19.4s, v2.4s,  #12             // t35
+
+        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
+        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
+
+        ret
+endfunc
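+
+// Scalar model of the rotations above and in inv_dct64_step2_neon below,
+// assuming mul_mla/mul_mls (defined earlier in this file) compute
+// a*c0 + b*c1 and a*c0 - b*c1 on .4s lanes: together with the following
+// srshr #12 each pair evaluates
+//     t = (a*c0 +/- b*c1 + 2048) >> 12
+// i.e. a fixed-point butterfly rotation with coefficients scaled by 2^12,
+// with the occasional neg supplying the sign for -(a*c0 + b*c1) terms.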
+
+function inv_dct64_step2_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s}, [x16]
+1:
+        // t32a/33/34a/35/60/61a/62/63a
+        // t56a/57/58a/59/36/37a/38/39a
+        // t40a/41/42a/43/52/53a/54/55a
+        // t48a/49/50a/51/44/45a/46/47a
+        ldr             q16, [x6, #4*4*0]  // t32a
+        ldr             q17, [x9, #4*4*8]  // t39a
+        ldr             q18, [x9, #4*4*0]  // t63a
+        ldr             q19, [x6, #4*4*8]  // t56a
+        ldr             q20, [x6, #4*4*16] // t40a
+        ldr             q21, [x9, #4*4*24] // t47a
+        ldr             q22, [x9, #4*4*16] // t55a
+        ldr             q23, [x6, #4*4*24] // t48a
+
+        sqadd           v24.4s,  v16.4s, v17.4s         // t32
+        sqsub           v25.4s,  v16.4s, v17.4s         // t39
+        sqadd           v26.4s,  v18.4s, v19.4s         // t63
+        sqsub           v27.4s,  v18.4s, v19.4s         // t56
+        sqsub           v28.4s,  v21.4s, v20.4s         // t40
+        sqadd           v29.4s,  v21.4s, v20.4s         // t47
+        sqadd           v30.4s,  v23.4s, v22.4s         // t48
+        sqsub           v31.4s,  v23.4s, v22.4s         // t55
+
+        mul_mla         v2,  v27, v25, v0.s[3], v0.s[2] // -> t56a
+        mul_mls         v4,  v27, v25, v0.s[2], v0.s[3] // -> t39a
+        mul_mla         v6,  v31, v28, v0.s[3], v0.s[2] // -> t40a
+        srshr           v25.4s, v2.4s,  #12             // t56a
+        srshr           v27.4s, v4.4s,  #12             // t39a
+        neg             v6.4s,   v6.4s                  // t40a
+        mul_mls         v2,  v31, v28, v0.s[2], v0.s[3] // -> t55a
+        srshr           v31.4s, v6.4s,  #12             // t40a
+        srshr           v28.4s, v2.4s,  #12             // t55a
+
+        sqadd           v16.4s,  v24.4s,  v29.4s        // t32a
+        sqsub           v19.4s,  v24.4s,  v29.4s        // t47a
+        sqadd           v17.4s,  v27.4s,  v31.4s        // t39
+        sqsub           v18.4s,  v27.4s,  v31.4s        // t40
+        sqsub           v20.4s,  v26.4s,  v30.4s        // t48a
+        sqadd           v23.4s,  v26.4s,  v30.4s        // t63a
+        sqsub           v21.4s,  v25.4s,  v28.4s        // t55
+        sqadd           v22.4s,  v25.4s,  v28.4s        // t56
+
+        mul_mls         v2,  v21, v18, v0.s[0], v0.s[0] // -> t40a
+        mul_mla         v4,  v21, v18, v0.s[0], v0.s[0] // -> t55a
+        mul_mls         v6,  v20, v19, v0.s[0], v0.s[0] // -> t47
+        srshr           v18.4s, v2.4s,  #12             // t40a
+        srshr           v21.4s, v4.4s,  #12             // t55a
+        mul_mla         v2,  v20, v19, v0.s[0], v0.s[0] // -> t48
+        srshr           v19.4s, v6.4s,  #12             // t47
+        srshr           v20.4s, v2.4s,  #12             // t48
+
+        str             q16, [x6, #4*4*0]  // t32a
+        str             q17, [x9, #4*4*0]  // t39
+        str             q18, [x6, #4*4*8]  // t40a
+        str             q19, [x9, #4*4*8]  // t47
+        str             q20, [x6, #4*4*16] // t48
+        str             q21, [x9, #4*4*16] // t55a
+        str             q22, [x6, #4*4*24] // t56
+        str             q23, [x9, #4*4*24] // t63a
+
+        add             x6,  x6,  #4*4
+        sub             x9,  x9,  #4*4
+        cmp             x6,  x9
+        b.lt            1b
+        ret
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+.if \clear
+        ld1             {\i}, [\src]
+        st1             {\zero}, [\src], \strd
+.else
+        ld1             {\i}, [\src], \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        st1             {\i}, [\dst], #16
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        movi            \i,  #0
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+        movi            \reg, \val
+.endif
+.endm
+
+.macro movz16dup_if reg, gpr, val, cond
+.if \cond
+        movz            \gpr, \val, lsl #16
+        dup             \reg, \gpr
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+        st1             \regs, \dst
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+        str             \reg, \dst
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+        str             \reg, \dst, \dstoff
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
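+
+// These *_if wrappers emit their instruction only when the assemble-time
+// flag is nonzero, so the single def_dct64_func body below can expand into
+// the variants instantiated after it (with or without clearing the
+// coefficient buffer as it is read, with or without the extra
+// rectangular-transform scaling) without duplicating the code by hand.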
+
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4s_x64_neon
+        mov             x14, x30
+        mov             x6,  sp
+        lsl             x8,  x8,  #2
+
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        load8           x7,  x8,  v7.4s, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3
+        add             x7,  x7,  x8, lsr #1
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct_4s_x16_neon
+
+        store16         x6
+
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        load8           x7,  x8,  v7.4s, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3
+        lsr             x8,  x8,  #1
+        sub             x7,  x7,  x8, lsr #1
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct32_odd_4s_x16_neon
+
+        add             x10, x6,  #16*15
+        sub             x6,  x6,  #16*16
+
+        mov             x9,  #-16
+
+.macro store_addsub r0, r1, r2, r3
+        ld1             {v2.4s}, [x6], #16
+        ld1             {v3.4s}, [x6], #16
+        sqadd           v6.4s,  v2.4s,  \r0
+        sqsub           \r0,    v2.4s,  \r0
+        ld1             {v4.4s}, [x6], #16
+        sqadd           v7.4s,  v3.4s,  \r1
+        sqsub           \r1,    v3.4s,  \r1
+        ld1             {v5.4s}, [x6], #16
+        sqadd           v2.4s,  v4.4s,  \r2
+        sub             x6,  x6,  #16*4
+        sqsub           \r2,    v4.4s,  \r2
+        st1             {v6.4s}, [x6], #16
+        st1             {\r0},   [x10], x9
+        sqadd           v3.4s,  v5.4s,  \r3
+        sqsub           \r3,    v5.4s,  \r3
+        st1             {v7.4s}, [x6], #16
+        st1             {\r1},   [x10], x9
+        st1             {v2.4s}, [x6], #16
+        st1             {\r2},   [x10], x9
+        st1             {v3.4s}, [x6], #16
+        st1             {\r3},   [x10], x9
+.endm
+        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
+        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
+        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
+        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+        add             x6,  x6,  #4*4*16
+
+        movrel          x17, idct64_coeffs
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        add             x9,  x7,  x8, lsl #4 // offset 16
+        add             x10, x7,  x8, lsl #3 // offset 8
+        sub             x9,  x9,  x8         // offset 15
+        sub             x11, x10, x8         // offset 7
+        ld1             {v16.4s}, [x7]  // in1  (offset 0)
+        ld1             {v17.4s}, [x9]  // in31 (offset 15)
+        ld1             {v18.4s}, [x10] // in17 (offset 8)
+        ld1             {v19.4s}, [x11] // in15 (offset 7)
+        st1_if          {v7.4s}, [x7],  \clear
+        st1_if          {v7.4s}, [x9],  \clear
+        st1_if          {v7.4s}, [x10], \clear
+        st1_if          {v7.4s}, [x11], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        add             x7,  x7,  x8, lsl #2 // offset 4
+        sub             x9,  x9,  x8, lsl #2 // offset 11
+        sub             x10, x7,  x8         // offset 3
+        add             x11, x9,  x8         // offset 12
+        ld1             {v16.4s}, [x10] // in7  (offset 3)
+        ld1             {v17.4s}, [x11] // in25 (offset 12)
+        ld1             {v18.4s}, [x9]  // in23 (offset 11)
+        ld1             {v19.4s}, [x7]  // in9  (offset 4)
+        st1_if          {v7.4s}, [x7],  \clear
+        st1_if          {v7.4s}, [x9],  \clear
+        st1_if          {v7.4s}, [x10], \clear
+        st1_if          {v7.4s}, [x11], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        sub             x10, x10, x8, lsl #1 // offset 1
+        sub             x9,  x9,  x8, lsl #1 // offset 9
+        add             x7,  x7,  x8         // offset 5
+        add             x11, x11, x8         // offset 13
+        ldr             q16, [x10, x8] // in5  (offset 2)
+        ldr             q17, [x11]     // in27 (offset 13)
+        ldr             q18, [x9,  x8] // in21 (offset 10)
+        ldr             q19, [x7]      // in11 (offset 5)
+        stroff_if       q7,  [x10, x8], \clear
+        str_if          q7,  [x11],     \clear
+        stroff_if       q7,  [x9,  x8], \clear
+        str_if          q7,  [x7],      \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        ldr             q16, [x10]     // in3  (offset 1)
+        ldr             q17, [x11, x8] // in29 (offset 14)
+        ldr             q18, [x9]      // in19 (offset 9)
+        ldr             q19, [x7,  x8] // in13 (offset 6)
+        str_if          q7,  [x10],     \clear
+        stroff_if       q7,  [x11, x8], \clear
+        str_if          q7,  [x9],      \clear
+        stroff_if       q7,  [x7,  x8], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+
+        sub             x6,  x6,  #4*4*32
+        add             x9,  x6,  #4*4*7
+
+        bl              inv_dct64_step2_neon
+
+        br              x14
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
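+
+// Overview of the dct64 helpers above, as read from the code: only the first
+// 32 coefficients per line can be nonzero for 64-point transforms, so
+// inv_txfm_dct*_4s_x64_neon runs a 16-point DCT on inputs 0,4,8,..., the odd
+// half of a 32-point DCT on inputs 2,6,10,... (combined by the store_addsub
+// butterflies into the even half of the 64-point output), and then feeds the
+// odd inputs 1,3,...,31 in groups of four through inv_dct64_step1_neon
+// before inv_dct64_step2_neon completes the odd half.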
+
+
+function inv_txfm_horz_dct_64x4_neon
+        mov             x14, x30
+
+        mov             x7,  sp
+        add             x8,  sp,  #4*4*(64 - 4)
+        add             x9,  x6,  #2*56
+        mov             x10, #2*64
+        mov             x11, #-4*4*4
+
+        dup             v7.4s,  w12
+1:
+        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
+        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
+        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
+
+.macro store_addsub src0, src1, src2, src3
+        sqsub           v1.4s,   \src0,   \src1
+        sqadd           v0.4s,   \src0,   \src1
+        sqsub           v3.4s,   \src2,   \src3
+        srshl           v1.4s,   v1.4s,   v7.4s
+        sqadd           v2.4s,   \src2,   \src3
+        srshl           v3.4s,   v3.4s,   v7.4s
+        srshl           v0.4s,   v0.4s,   v7.4s
+        srshl           v2.4s,   v2.4s,   v7.4s
+        sqxtn           v3.4h,   v3.4s
+        sqxtn2          v3.8h,   v1.4s
+        sqxtn           v0.4h,   v0.4s
+        sqxtn2          v0.8h,   v2.4s
+        rev64           v3.8h,   v3.8h
+        st1             {v0.8h},  [x6], x10
+        st1             {v3.8h},  [x9], x10
+.endm
+        store_addsub    v16.4s,  v31.4s,  v20.4s,  v27.4s
+        store_addsub    v17.4s,  v30.4s,  v21.4s,  v26.4s
+        store_addsub    v18.4s,  v29.4s,  v22.4s,  v25.4s
+        store_addsub    v19.4s,  v28.4s,  v23.4s,  v24.4s
+.purgem store_addsub
+        sub             x6,  x6,  x10, lsl #2
+        sub             x9,  x9,  x10, lsl #2
+        add             x6,  x6,  #16
+        sub             x9,  x9,  #16
+
+        cmp             x7,  x8
+        b.lt            1b
+        br              x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon
+        mov             x14, x30
+        lsl             x8,  x8,  #1
+
+        mov             x7,  sp
+        add             x8,  sp,  #2*8*(64 - 4)
+        add             x9,  x6,  x1, lsl #6
+        sub             x9,  x9,  x1
+        neg             x10, x1
+        mov             x11, #-2*8*4
+
+1:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3
+        ld1             {v0.8h}, [x6], x1
+        ld1             {v1.8h}, [x9], x10
+        sqadd           v4.8h,   \src0,   \src1
+        ld1             {v2.8h}, [x6]
+        sqsub           \src0,   \src0,   \src1
+        ld1             {v3.8h}, [x9]
+        sqadd           v5.8h,   \src2,   \src3
+        sqsub           \src2,   \src2,   \src3
+        sub             x6,  x6,  x1
+        sub             x9,  x9,  x10
+        srshr           v4.8h,   v4.8h,   #4
+        srshr           v5.8h,   v5.8h,   #4
+        srshr           \src0,   \src0,   #4
+        sqadd           v0.8h,   v0.8h,   v4.8h
+        srshr           \src2,   \src2,   #4
+        sqadd           v1.8h,   v1.8h,   \src0
+        sqadd           v2.8h,   v2.8h,   v5.8h
+        smax            v0.8h,   v0.8h,   v6.8h
+        sqadd           v3.8h,   v3.8h,   \src2
+        smax            v1.8h,   v1.8h,   v6.8h
+        smin            v0.8h,   v0.8h,   v7.8h
+        smax            v2.8h,   v2.8h,   v6.8h
+        smin            v1.8h,   v1.8h,   v7.8h
+        st1             {v0.8h}, [x6], x1
+        smax            v3.8h,   v3.8h,   v6.8h
+        smin            v2.8h,   v2.8h,   v7.8h
+        st1             {v1.8h}, [x9], x10
+        smin            v3.8h,   v3.8h,   v7.8h
+        st1             {v2.8h}, [x6], x1
+        st1             {v3.8h}, [x9], x10
+.endm
+        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
+        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
+        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
+        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
+.purgem add_dest_addsub
+        cmp             x7,  x8
+        b.lt            1b
+
+        br              x14
+endfunc
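+
+// The mvni in the function above builds the constant 0x3ff, so the
+// smax/smin pairs clamp the reconstructed pixels to the 10 bpc range
+// [0, 1023] after the final srshr #4 rounding.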
+
+.macro sub_sp space
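+// On Windows the stack has to be touched at least once per 4 KiB page as it
+// grows, so allocations larger than one page probe the intermediate page
+// with an ldr before moving sp (the allocations below stay under 8 KiB, so
+// one probe is enough). On other systems the only concern is that
+// "sub sp, sp, #imm" encodes a 12-bit immediate (optionally shifted left by
+// 12), so larger amounts are split into a 4096-aligned part and a remainder.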
+#ifdef _WIN32
+.if \space > 4096
+        sub             x16, sp,  #4096
+        ldr             xzr, [x16]
+        sub             sp,  x16, #(\space - 4096)
+.else
+        sub             sp,  sp,  #\space
+.endif
+#else
+.if \space >= 4096
+        sub             sp,  sp,  #(\space)/4096*4096
+.if (\space % 4096) != 0
+        sub             sp,  sp,  #(\space)%4096
+.endif
+.else
+        sub             sp,  sp,  #\space
+.endif
+#endif
+.endm
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+        idct_dc         64,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #64*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+        idct_dc         64,  32,  1
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-1 // shift
+        bl              inv_txfm_dct_clear_scale_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2)
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  x5,  #64*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+        idct_dc         32,  64,  1
+
+        mov             x15, x30
+
+        sub_sp          32*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*32*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #32*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #32*32*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+        idct_dc         64,  16,  2
+
+        mov             x15, x30
+
+        sub_sp          64*16*2+64*4*4
+        add             x4,  sp, #64*4*4
+
+        movrel          x13, eob_16x32
+
+.irp i, 0, 4, 8, 12
+        add             x6,  x4,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(16 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #16*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x4,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        movrel          x5,  X(inv_dct_8h_x16_neon)
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2)
+        add             x7,  x4,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  x4,  #64*16*2
+        br              x15
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+        idct_dc         16,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          16*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2
+
+        adr             x4,  inv_dct_4s_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*16*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)
+        cmp             w3,  w12
+        b.lt            1f
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #16*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #16*32*2
+        br              x15
+endfunc
--- a/src/arm/64/util.S
+++ b/src/arm/64/util.S
@@ -170,6 +170,18 @@
         trn2            \r3\().2s,  \t5\().2s,  \t7\().2s
 .endm
 
+.macro  transpose_4x4s  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().4s,  \r0\().4s,  \r1\().4s
+        trn2            \t5\().4s,  \r0\().4s,  \r1\().4s
+        trn1            \t6\().4s,  \r2\().4s,  \r3\().4s
+        trn2            \t7\().4s,  \r2\().4s,  \r3\().4s
+
+        trn1            \r0\().2d,  \t4\().2d,  \t6\().2d
+        trn2            \r2\().2d,  \t4\().2d,  \t6\().2d
+        trn1            \r1\().2d,  \t5\().2d,  \t7\().2d
+        trn2            \r3\().2d,  \t5\().2d,  \t7\().2d
+.endm
+
 .macro  transpose_4x8h  r0, r1, r2, r3, t4, t5, t6, t7
         trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
         trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
--- a/src/arm/itx_init_tmpl.c
+++ b/src/arm/itx_init_tmpl.c
@@ -117,7 +117,9 @@
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 && ARCH_AARCH64
+    if (bpc > 10) return;
+
+#if ARCH_AARCH64
     assign_itx17_fn( ,  4,  4, neon);
     assign_itx16_fn(R,  4,  8, neon);
     assign_itx16_fn(R,  4, 16, neon);
--- a/src/meson.build
+++ b/src/meson.build
@@ -102,6 +102,8 @@
         )
         if host_machine.cpu_family() == 'aarch64'
             libdav1d_sources += files(
+                # itx.S is used for both 8 and 16 bpc.
+                'arm/64/itx.S',
                 'arm/64/looprestoration_common.S',
                 'arm/64/msac.S',
             )
@@ -110,7 +112,6 @@
                 libdav1d_sources += files(
                     'arm/64/cdef.S',
                     'arm/64/ipred.S',
-                    'arm/64/itx.S',
                     'arm/64/loopfilter.S',
                     'arm/64/looprestoration.S',
                     'arm/64/mc.S',
@@ -121,6 +122,7 @@
                 libdav1d_sources += files(
                     'arm/64/cdef16.S',
                     'arm/64/ipred16.S',
+                    'arm/64/itx16.S',
                     'arm/64/loopfilter16.S',
                     'arm/64/looprestoration16.S',
                     'arm/64/mc16.S',