ref: eaedb95de65fb5612297b148a306398240e8ad6f
parent: ff3054feb26ed4476f632965d6a76b6af1d4f31c
author: Martin Storsjö <martin@martin.st>
date: Sat Apr 11 09:27:37 EDT 2020
arm64: itx: Add NEON implementation of itx for 10 bpc Add an element size specifier to the existing individual transform functions for 8 bpc, naming them e.g. inv_dct_8h_x8_neon, to clarify that they operate on input vectors of 8h, and make the symbols public, to let the 10 bpc case call them from a different object file. The same convention is used in the new itx16.S, like inv_dct_4s_x8_neon. Make the existing itx.S compiled regardless of whether 8 bpc support is enabled. For builds with 8 bpc support disabled, this does include the unused frontend functions though, but this is hopefully tolerable to avoid having to split the file into a sharable file for transforms and a separate one for frontends. This only implements the 10 bpc case, as that case can use transforms operating on 16 bit coefficients in the second pass. Relative speedup vs C for a few functions: Cortex A53 A72 A73 inv_txfm_add_4x4_dct_dct_0_10bpc_neon: 4.14 4.06 4.49 inv_txfm_add_4x4_dct_dct_1_10bpc_neon: 6.51 6.49 6.42 inv_txfm_add_8x8_dct_dct_0_10bpc_neon: 5.02 4.63 6.23 inv_txfm_add_8x8_dct_dct_1_10bpc_neon: 8.54 7.13 11.96 inv_txfm_add_16x16_dct_dct_0_10bpc_neon: 5.52 6.60 8.03 inv_txfm_add_16x16_dct_dct_1_10bpc_neon: 11.27 9.62 12.22 inv_txfm_add_16x16_dct_dct_2_10bpc_neon: 9.60 6.97 8.59 inv_txfm_add_32x32_dct_dct_0_10bpc_neon: 2.60 3.48 3.19 inv_txfm_add_32x32_dct_dct_1_10bpc_neon: 14.65 12.64 16.86 inv_txfm_add_32x32_dct_dct_2_10bpc_neon: 11.57 8.80 12.68 inv_txfm_add_32x32_dct_dct_3_10bpc_neon: 8.79 8.00 9.21 inv_txfm_add_32x32_dct_dct_4_10bpc_neon: 7.58 6.21 7.80 inv_txfm_add_64x64_dct_dct_0_10bpc_neon: 2.41 2.85 2.75 inv_txfm_add_64x64_dct_dct_1_10bpc_neon: 12.91 10.27 12.24 inv_txfm_add_64x64_dct_dct_2_10bpc_neon: 10.96 7.97 10.31 inv_txfm_add_64x64_dct_dct_3_10bpc_neon: 8.95 7.42 9.55 inv_txfm_add_64x64_dct_dct_4_10bpc_neon: 7.97 6.12 7.82
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -449,7 +449,7 @@
sqsub \r2\sz, v3\sz, v7\sz
.endm
-function inv_dct_4x4_neon
+function inv_dct_4h_x4_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.4h}, [x16]
idct_4 v16, v17, v18, v19, .4h
@@ -456,7 +456,7 @@
ret
endfunc
-function inv_dct_8x4_neon
+function inv_dct_8h_x4_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.4h}, [x16]
idct_4 v16, v17, v18, v19, .8h
@@ -489,12 +489,12 @@
rshrn \o3\().4h, \o3\().4s, #12
.endm
-function inv_adst_4x4_neon
+function inv_adst_4h_x4_neon, export=1
iadst_4x4 v16, v17, v18, v19
ret
endfunc
-function inv_flipadst_4x4_neon
+function inv_flipadst_4h_x4_neon, export=1
iadst_4x4 v19, v18, v17, v16
ret
endfunc
@@ -555,17 +555,17 @@
rshrn2 \o3\().8h, v5.4s, #12
.endm
-function inv_adst_8x4_neon
+function inv_adst_8h_x4_neon, export=1
iadst_8x4 v16, v17, v18, v19
ret
endfunc
-function inv_flipadst_8x4_neon
+function inv_flipadst_8h_x4_neon, export=1
iadst_8x4 v19, v18, v17, v16
ret
endfunc
-function inv_identity_4x4_neon
+function inv_identity_4h_x4_neon, export=1
mov w16, #(5793-4096)*8
dup v0.4h, w16
sqrdmulh v4.4h, v16.4h, v0.h[0]
@@ -579,7 +579,7 @@
ret
endfunc
-function inv_identity_8x4_neon
+function inv_identity_8h_x4_neon, export=1
mov w16, #(5793-4096)*8
dup v0.4h, w16
sqrdmulh v4.8h, v16.8h, v0.h[0]
@@ -684,8 +684,8 @@
b L(itx_4x4_end)
1:
.endif
- adr x4, inv_\txfm1\()_4x4_neon
- adr x5, inv_\txfm2\()_4x4_neon
+ adr x4, inv_\txfm1\()_4h_x4_neon
+ adr x5, inv_\txfm2\()_4h_x4_neon
b inv_txfm_add_4x4_neon
endfunc
.endm
@@ -741,7 +741,7 @@
mov \r6\szb, v6\szb // out6
.endm
-function inv_dct_8x8_neon
+function inv_dct_8h_x8_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h}, [x16]
idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
@@ -748,7 +748,7 @@
ret
endfunc
-function inv_dct_4x8_neon
+function inv_dct_4h_x8_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h}, [x16]
idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
@@ -822,27 +822,27 @@
sqneg \o5\()\sz, v3\sz // out5
.endm
-function inv_adst_8x8_neon
+function inv_adst_8h_x8_neon, export=1
iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h
ret
endfunc
-function inv_flipadst_8x8_neon
+function inv_flipadst_8h_x8_neon, export=1
iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h
ret
endfunc
-function inv_adst_4x8_neon
+function inv_adst_4h_x8_neon, export=1
iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h
ret
endfunc
-function inv_flipadst_4x8_neon
+function inv_flipadst_4h_x8_neon, export=1
iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h
ret
endfunc
-function inv_identity_8x8_neon
+function inv_identity_8h_x8_neon, export=1
sqshl v16.8h, v16.8h, #1
sqshl v17.8h, v17.8h, #1
sqshl v18.8h, v18.8h, #1
@@ -854,7 +854,7 @@
ret
endfunc
-function inv_identity_4x8_neon
+function inv_identity_4h_x8_neon, export=1
sqshl v16.4h, v16.4h, #1
sqshl v17.4h, v17.4h, #1
sqshl v18.4h, v18.4h, #1
@@ -911,11 +911,11 @@
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc 8, 8, 1
.endif
- adr x5, inv_\txfm2\()_8x8_neon
+ adr x5, inv_\txfm2\()_8h_x8_neon
.ifc \txfm1, identity
b inv_txfm_identity_add_8x8_neon
.else
- adr x4, inv_\txfm1\()_8x8_neon
+ adr x4, inv_\txfm1\()_8h_x8_neon
b inv_txfm_add_8x8_neon
.endif
endfunc
@@ -998,8 +998,8 @@
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 0
.endif
- adr x4, inv_\txfm1\()_\h\()x\w\()_neon
- adr x5, inv_\txfm2\()_\w\()x\h\()_neon
+ adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon
+ adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon
b inv_txfm_add_\w\()x\h\()_neon
endfunc
.endm
@@ -1110,7 +1110,7 @@
mov v22\szb, v3\szb
.endm
-function inv_dct_8x16_neon
+function inv_dct_8h_x16_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h, v1.8h}, [x16]
idct_16 .8h, .16b
@@ -1117,7 +1117,7 @@
ret
endfunc
-function inv_dct_4x16_neon
+function inv_dct_4h_x16_neon, export=1
movrel x16, idct_coeffs
ld1 {v0.8h, v1.8h}, [x16]
idct_16 .4h, .8b
@@ -1294,27 +1294,27 @@
sqneg \o9\sz, v7\sz // out9
.endm
-function inv_adst_8x16_neon
+function inv_adst_8h_x16_neon, export=1
iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
ret
endfunc
-function inv_flipadst_8x16_neon
+function inv_flipadst_8h_x16_neon, export=1
iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
ret
endfunc
-function inv_adst_4x16_neon
+function inv_adst_4h_x16_neon, export=1
iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
ret
endfunc
-function inv_flipadst_4x16_neon
+function inv_flipadst_4h_x16_neon, export=1
iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
ret
endfunc
-function inv_identity_8x16_neon
+function inv_identity_8h_x16_neon, export=1
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1325,7 +1325,7 @@
ret
endfunc
-function inv_identity_4x16_neon
+function inv_identity_4h_x16_neon, export=1
mov w16, #2*(5793-4096)*8
dup v0.4h, w16
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1465,9 +1465,9 @@
adr x9, inv_txfm_horz_identity_16x8_neon
.else
adr x9, inv_txfm_horz_16x8_neon
- adr x4, inv_\txfm1\()_8x16_neon
+ adr x4, inv_\txfm1\()_8h_x16_neon
.endif
- adr x5, inv_\txfm2\()_8x16_neon
+ adr x5, inv_\txfm2\()_8h_x16_neon
mov x13, #\eob_half
b inv_txfm_add_16x16_neon
endfunc
@@ -1634,12 +1634,12 @@
idct_dc \w, \h, 1
.endif
.if \w == 4
- adr x4, inv_\txfm1\()_8x\w\()_neon
- adr x5, inv_\txfm2\()_4x\h\()_neon
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_4h_x\h\()_neon
mov w13, #\eob_half
.else
- adr x4, inv_\txfm1\()_4x\w\()_neon
- adr x5, inv_\txfm2\()_8x\h\()_neon
+ adr x4, inv_\txfm1\()_4h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
.endif
.ifc \txfm1, identity
b inv_txfm_identity_add_\w\()x\h\()_neon
@@ -1816,8 +1816,8 @@
.ifc \txfm1\()_\txfm2, dct_dct
idct_dc \w, \h, 1
.endif
- adr x4, inv_\txfm1\()_8x\w\()_neon
- adr x5, inv_\txfm2\()_8x\h\()_neon
+ adr x4, inv_\txfm1\()_8h_x\w\()_neon
+ adr x5, inv_\txfm2\()_8h_x\h\()_neon
.if \w == 8
mov x13, #\eob_half
.endif
@@ -1851,7 +1851,7 @@
def_fns_816 8, 16
def_fns_816 16, 8
-function inv_dct32_odd_8x16_neon
+function inv_dct32_odd_8h_x16_neon, export=1
movrel x16, idct_coeffs, 2*16
ld1 {v0.8h, v1.8h}, [x16]
sub x16, x16, #2*16
@@ -2029,7 +2029,7 @@
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
- bl inv_dct_8x16_neon
+ bl inv_dct_8h_x16_neon
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
@@ -2059,7 +2059,7 @@
scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
.endif
- bl inv_dct32_odd_8x16_neon
+ bl inv_dct32_odd_8h_x16_neon
transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
.macro store2 r0, r1, shift
@@ -2105,7 +2105,7 @@
.endr
sub x7, x7, x8, lsl #4
- bl inv_dct_8x16_neon
+ bl inv_dct_8h_x16_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
st1 {v\i\().8h}, [x7], x8
@@ -2118,7 +2118,7 @@
.endr
sub x7, x7, x8, lsl #4
sub x7, x7, x8, lsr #1
- bl inv_dct32_odd_8x16_neon
+ bl inv_dct32_odd_8h_x16_neon
neg x9, x8
mov x10, x6
@@ -2384,7 +2384,7 @@
sub sp, sp, #1024
movrel x13, eob_16x32
ldrh w12, [x13], #2
- adr x4, inv_dct_8x16_neon
+ adr x4, inv_dct_8h_x16_neon
.irp i, 0, 8, 16, 24
add x6, sp, #(\i*16*2)
@@ -2432,7 +2432,7 @@
mov x15, x30
sub sp, sp, #1024
- adr x5, inv_dct_8x16_neon
+ adr x5, inv_dct_8h_x16_neon
.irp i, 0, 8
add x6, sp, #(\i*32*2)
@@ -2493,7 +2493,7 @@
sub w9, w9, #8
add x2, x2, #2*8
- bl inv_dct_8x8_neon
+ bl inv_dct_8h_x8_neon
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
srshr v\i\().8h, v\i\().8h, #2
@@ -2550,7 +2550,7 @@
.endr
add w9, w9, #8
- bl inv_dct_8x8_neon
+ bl inv_dct_8h_x8_neon
cmp w9, #32
@@ -2755,7 +2755,7 @@
.endm
.macro def_dct64_func suffix, clear=0, scale=0
-function inv_txfm_dct\suffix\()_8x64_neon
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
mov x14, x30
mov x6, sp
lsl x8, x8, #2
@@ -2768,7 +2768,7 @@
add x7, x7, x8, lsr #1
scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
- bl inv_dct_8x16_neon
+ bl inv_dct_8h_x16_neon
store16 x6
@@ -2781,7 +2781,7 @@
sub x7, x7, x8, lsr #1
scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
- bl inv_dct32_odd_8x16_neon
+ bl inv_dct32_odd_8h_x16_neon
add x10, x6, #16*15
sub x6, x6, #16*16
@@ -3043,7 +3043,7 @@
add x7, x2, #(\i*2)
mov x8, #32*2
mov x12, #-2 // shift
- bl inv_txfm_dct_clear_8x64_neon
+ bl inv_txfm_dct_clear_8h_x64_neon
add x6, x5, #(\i*64*2)
bl inv_txfm_horz_dct_64x8_neon
.if \i < 24
@@ -3068,7 +3068,7 @@
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
add x7, x5, #(\i*2)
mov x8, #64*2
- bl inv_txfm_dct_8x64_neon
+ bl inv_txfm_dct_8h_x64_neon
add x6, x0, #(\i)
bl inv_txfm_add_vert_dct_8x64_neon
.endr
@@ -3097,7 +3097,7 @@
add x7, x2, #(\i*2)
mov x8, #32*2
mov x12, #-1 // shift
- bl inv_txfm_dct_clear_scale_8x64_neon
+ bl inv_txfm_dct_clear_scale_8h_x64_neon
add x6, x5, #(\i*64*2)
bl inv_txfm_horz_dct_64x8_neon
.if \i < 24
@@ -3171,7 +3171,7 @@
.irp i, 0, 8, 16, 24
add x7, x5, #(\i*2)
mov x8, #32*2
- bl inv_txfm_dct_8x64_neon
+ bl inv_txfm_dct_8h_x64_neon
add x6, x0, #(\i)
bl inv_txfm_add_vert_dct_8x64_neon
.endr
@@ -3200,7 +3200,7 @@
add x7, x2, #(\i*2)
mov x8, #16*2
mov x12, #-2 // shift
- bl inv_txfm_dct_clear_8x64_neon
+ bl inv_txfm_dct_clear_8h_x64_neon
add x6, x4, #(\i*64*2)
bl inv_txfm_horz_dct_64x8_neon
.if \i < 8
@@ -3222,7 +3222,7 @@
b.gt 2b
3:
- adr x5, inv_dct_8x16_neon
+ adr x5, inv_dct_8h_x16_neon
.irp i, 0, 8, 16, 24, 32, 40, 48, 56
add x6, x0, #(\i)
add x7, x4, #(\i*2)
@@ -3245,7 +3245,7 @@
movrel x13, eob_16x32
ldrh w12, [x13], #2
- adr x4, inv_dct_8x16_neon
+ adr x4, inv_dct_8h_x16_neon
.irp i, 0, 8, 16, 24
add x6, x5, #(\i*16*2)
.if \i > 0
@@ -3276,7 +3276,7 @@
.irp i, 0, 8
add x7, x5, #(\i*2)
mov x8, #16*2
- bl inv_txfm_dct_8x64_neon
+ bl inv_txfm_dct_8h_x64_neon
add x6, x0, #(\i)
bl inv_txfm_add_vert_dct_8x64_neon
.endr
--- /dev/null
+++ b/src/arm/64/itx16.S
@@ -1,0 +1,3514 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+// int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3 external parameters
+// x4 function pointer to first transform
+// x5 function pointer to second transform
+// x6 output parameter for helper function
+// x7 input parameter for helper function
+// x8 input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13 pointer to list of eob thresholds
+// x14 return pointer for helper function
+// x15 return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1 multiplication coefficients
+// v2-v7 scratch registers
+// v8-v15 unused
+// v16-v31 inputs/outputs of transforms
+
+const idct_coeffs, align=4 // 32 bit DCT multipliers; the inv_dct_4s_* functions load the leading rows into v0 (and v1)
+ // idct4 (idct_4 reads v0.s[0], v0.s[2] and v0.s[3])
+ .int 2896, 2896*8*(1<<16), 1567, 3784 // second entry is 2896 prescaled by 8<<16; idct_4 does not read it
+ // idct8 (idct_8 reads these as v1.s[0..3])
+ .int 799, 4017, 3406, 2276
+ // idct16
+ .int 401, 4076, 3166, 2598
+ .int 1931, 3612, 3920, 1189
+ // idct32
+ .int 201, 4091, 3035, 2751
+ .int 1751, 3703, 3857, 1380
+ .int 995, 3973, 3513, 2106
+ .int 2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4 // NOTE(review): the code reading this table is outside this part of the file
+ .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) // entries scaled by 8<<16 presumably feed sqrdmulh, as in idct_dc - confirm
+ .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+ .int 4076, 401, 4017, 799 // unscaled entries presumably feed mul/mla followed by a shift by 12 - confirm
+
+ .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+ .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+ .int -3166, -2598, -799, -4017
+
+ .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+ .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+ .int 3612, 1931, 2276, 3406
+
+ .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+ .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+ .int -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4 // loaded into v0.4s by the iadst_4x4 macro
+ .int 1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4 // iadst_8 loads the first two rows into v0/v1 with a post-increment of 32 bytes
+ .int 4076, 401, 3612, 1931
+ .int 2598, 3166, 1189, 3920
+ // idct_coeffs (idct4 row with the second entry zeroed; iadst_8 reloads v0 from here for its later stages)
+ .int 2896, 0, 1567, 3784
+endconst
+
+const iadst16_coeffs, align=4 // NOTE(review): the code reading this table is outside this part of the file
+ .int 4091, 201, 3973, 995 // four rows of four 32 bit multipliers, 64 bytes in total
+ .int 3703, 1751, 3290, 2440
+ .int 2751, 3035, 2106, 3513
+ .int 1380, 3857, 601, 4052
+endconst
+
+.macro mul_mla d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0 // d = s0 * c0 in each 32 bit lane
+ mla \d\().4s, \s1\().4s, \c1 // d += s1 * c1; d must not alias s1, which is read after d is written
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1
+ mul \d\().4s, \s0\().4s, \c0 // d = s0 * c0 in each 32 bit lane
+ mls \d\().4s, \s1\().4s, \c1 // d -= s1 * c1; d must not alias s1, which is read after d is written
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7
+ sqrdmulh \r0\sz, \r0\sz, \c // scale r0-r3 in place by the fixed point factor \c (saturating, rounding)
+ sqrdmulh \r1\sz, \r1\sz, \c
+ sqrdmulh \r2\sz, \r2\sz, \c
+ sqrdmulh \r3\sz, \r3\sz, \c
+.ifnb \r4
+ sqrdmulh \r4\sz, \r4\sz, \c // r4-r7 are optional; when r4 is supplied all four are scaled
+ sqrdmulh \r5\sz, \r5\sz, \c
+ sqrdmulh \r6\sz, \r6\sz, \c
+ sqrdmulh \r7\sz, \r7\sz, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
+.ifnb \load
+ ld1 {\load}, [\src], x1 // one pipeline step; each stage is skipped when its argument is blank. Load a row, advance \src by x1
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #\shiftbits // rounding right shift of a row held in a transform register
+.endif
+.ifnb \addsrc
+ sqadd \adddst, \adddst, \addsrc // saturating add of a previously loaded row
+.endif
+.ifnb \max
+ smax \max, \max, v6.8h // lower clamp; the load_add_store_8x* wrappers set v6 to 0
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h // upper clamp; the load_add_store_8x* wrappers set v7 to 0x3ff
+.endif
+.ifnb \store
+ st1 {\store}, [\dst], x1 // store a finished row, advance \dst by x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src
+ mov \src, \dst // separate read pointer so that loads can run ahead of the stores
+ movi v6.8h, #0 // lower clamp bound used by load_add_store
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound used by load_add_store
+ load_add_store v2.8h, v16.8h, , , , , , \dst, \src // pipeline fill: v16-v31 are shifted (default 4 bits) and combined with 16 loaded rows
+ load_add_store v3.8h, v17.8h, , , , , , \dst, \src
+ load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
+ load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
+ load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
+ load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src // first step with all stages active; v2-v5 rotate as load buffers
+ load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+ load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+ load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+ load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+ load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+ load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+ load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+ load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
+ load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+ load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
+ load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src // pipeline drain: no more loads or shifts
+ load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
+ load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src
+ load_add_store , , , , , v31.8h, v30.8h, \dst, \src
+ load_add_store , , , , , , v31.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4
+ mov \src, \dst // separate read pointer so that loads can run ahead of the stores
+ movi v6.8h, #0 // lower clamp bound used by load_add_store
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound used by load_add_store
+ load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits // pipeline fill: v16-v23 are shifted by \shiftbits and combined with 8 loaded rows
+ load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
+ load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
+ load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits // first step with all stages active
+ load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+ load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
+ load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits // pipeline drain: no more loads or shifts
+ load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4
+ mov \src, \dst // separate read pointer so that loads can run ahead of the stores
+ movi v6.8h, #0 // lower clamp bound used by load_add_store
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound used by load_add_store
+ load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits // pipeline fill: v16-v19 are shifted by \shiftbits and combined with 4 loaded rows
+ load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
+ load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
+ load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
+ load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits // pipeline drain: no more loads or shifts
+ load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+ load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
+ load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
+.ifnb \load
+ ld1 {\load}[0], [\src], x1 // one pipeline step covering two 64 bit rows; blank arguments skip a stage. First row into lane 0
+.endif
+.ifnb \inssrc
+ ins \insdst\().d[1], \inssrc\().d[0] // pack two 64 bit transform rows into a single 128 bit register
+.endif
+.ifnb \shift
+ srshr \shift, \shift, #4 // rounding right shift by 4 (fixed, unlike load_add_store)
+.endif
+.ifnb \load
+ ld1 {\load}[1], [\src], x1 // second row into lane 1
+.endif
+.ifnb \addsrc
+ sqadd \adddst, \adddst, \addsrc // saturating add of a previously loaded row pair
+.endif
+.ifnb \store
+ st1 {\store}[0], [\dst], x1 // first row of a finished pair
+.endif
+.ifnb \max
+ smax \max, \max, v6.8h // lower clamp; the load_add_store_4x* wrappers set v6 to 0
+.endif
+.ifnb \min
+ smin \min, \min, v7.8h // upper clamp; the load_add_store_4x* wrappers set v7 to 0x3ff
+.endif
+.ifnb \store
+ st1 {\store}[1], [\dst], x1 // second row of the finished pair
+.endif
+.endm
+.macro load_add_store_4x16 dst, src
+ mov \src, \dst // separate read pointer so that loads can run ahead of the stores
+ movi v6.8h, #0 // lower clamp bound used by load_add_store4
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound used by load_add_store4
+ load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src // pipeline fill: odd registers v17..v31 are merged into the upper half of v16..v30
+ load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
+ load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
+ load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
+ load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src // first step with all stages active; v0-v3 rotate as load buffers
+ load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
+ load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src // pipeline drain: no more loads
+ load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
+ load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
+ load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
+ load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
+ load_add_store4 , , , , , , , , v30.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src
+ mov \src, \dst // separate read pointer so that loads can run ahead of the stores
+ movi v6.8h, #0 // lower clamp bound used by load_add_store4
+ mvni v7.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound used by load_add_store4
+ load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src // pipeline fill: odd registers v17..v23 are merged into the upper half of v16..v22
+ load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
+ load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
+ load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
+ load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src // pipeline drain: no more loads
+ load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
+ load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
+ load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
+ load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
+ load_add_store4 , , , , , , , , v22.d, \dst, \src
+.endm
+
+.macro idct_dc w, h, shift
+ cbnz w3, 1f // w3 (eob) != 0: skip this DC-only shortcut and continue after the macro
+ movz w16, #2896*8, lsl #16 // 2896*8 in the upper half word, so it can also be read as v0.h[1] below
+ ld1r {v16.4s}, [x2] // broadcast the first 32 bit coefficient
+ dup v0.2s, w16
+ sqrdmulh v20.4s, v16.4s, v0.s[0] // dc * 2896/4096, rounded
+ str wzr, [x2] // clear the consumed coefficient
+.if (\w == 2*\h) || (2*\w == \h)
+ sqrdmulh v20.4s, v20.4s, v0.s[0] // one more 2896/4096 scaling when one dimension is twice the other
+.endif
+.if \shift > 0
+ sqrshrn v16.4h, v20.4s, #\shift // rounding shift by \shift, narrowing to 16 bit with saturation
+ sqrshrn2 v16.8h, v20.4s, #\shift
+.else
+ sqxtn v16.4h, v20.4s // no shift requested: only narrow with saturation
+ sqxtn2 v16.8h, v20.4s
+.endif
+ sqrdmulh v16.8h, v16.8h, v0.h[1] // another 2896/4096 scaling, now in 16 bit lanes
+ srshr v16.8h, v16.8h, #4 // final rounding shift by 4
+ mov w4, #\h // row counter consumed by the idct_dc_w*_neon loops
+ b idct_dc_w\w\()_neon // tail call; x30 is untouched here, so the ret there returns to our caller
+1:
+.endm
+
+function idct_dc_w4_neon // add v16.8h to w4 rows of four 16 bit pixels at x0 (stride x1), clamped to 0..0x3ff
+ movi v30.8h, #0 // lower clamp bound
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound
+1:
+ ld1 {v0.d}[0], [x0], x1 // four rows per iteration, two 64 bit rows packed per register
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[0], [x0], x1
+ subs w4, w4, #4 // rows remaining; the flags are consumed by b.gt at the bottom
+ ld1 {v1.d}[1], [x0], x1
+ sqadd v0.8h, v0.8h, v16.8h
+ sub x0, x0, x1, lsl #2 // rewind four rows so the stores hit the rows just loaded
+ sqadd v1.8h, v1.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ st1 {v0.d}[0], [x0], x1
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[0], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w8_neon // add v16.8h to w4 rows of eight 16 bit pixels at x0 (stride x1), clamped to 0..0x3ff
+ movi v30.8h, #0 // lower clamp bound
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound
+1:
+ ld1 {v0.8h}, [x0], x1 // four rows per iteration, one register per row
+ subs w4, w4, #4 // rows remaining; the flags are consumed by b.gt at the bottom
+ ld1 {v1.8h}, [x0], x1
+ sqadd v0.8h, v0.8h, v16.8h
+ ld1 {v2.8h}, [x0], x1
+ sqadd v1.8h, v1.8h, v16.8h
+ ld1 {v3.8h}, [x0], x1
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ sub x0, x0, x1, lsl #2 // rewind four rows so the stores hit the rows just loaded
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ st1 {v0.8h}, [x0], x1
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h}, [x0], x1
+ st1 {v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w16_neon // add v16.8h to w4 rows of sixteen 16 bit pixels at x0 (stride x1), clamped to 0..0x3ff
+ movi v30.8h, #0 // lower clamp bound
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound
+1:
+ ld1 {v0.8h, v1.8h}, [x0], x1 // two rows per iteration, two registers per row
+ subs w4, w4, #2 // rows remaining; the flags are consumed by b.gt at the bottom
+ ld1 {v2.8h, v3.8h}, [x0], x1
+ sqadd v0.8h, v0.8h, v16.8h
+ sqadd v1.8h, v1.8h, v16.8h
+ sub x0, x0, x1, lsl #1 // rewind two rows so the stores hit the rows just loaded
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ st1 {v0.8h, v1.8h}, [x0], x1
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v2.8h, v3.8h}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w32_neon // add v16.8h to w4 rows of 32 16 bit pixels at x0 (stride x1), clamped to 0..0x3ff
+ movi v30.8h, #0 // lower clamp bound
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // one full 64 byte row per iteration; no post-increment, the store advances x0
+ subs w4, w4, #1 // rows remaining; the flags are consumed by b.gt at the bottom
+ sqadd v0.8h, v0.8h, v16.8h
+ sqadd v1.8h, v1.8h, v16.8h
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // write back and step to the next row
+ b.gt 1b
+ ret
+endfunc
+
+function idct_dc_w64_neon // add v16.8h to w4 rows of 64 16 bit pixels at x0 (stride x1), clamped to 0..0x3ff
+ movi v30.8h, #0 // lower clamp bound
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound
+ sub x1, x1, #64 // the first store of each row already advances x0 by 64 bytes
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // first 64 bytes of the row
+ subs w4, w4, #1 // rows remaining; the flags are consumed by b.gt at the bottom
+ sqadd v0.8h, v0.8h, v16.8h
+ ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] // second 64 bytes of the row
+ sqadd v1.8h, v1.8h, v16.8h
+ sub x0, x0, #64 // back to the start of the row for the stores
+ sqadd v2.8h, v2.8h, v16.8h
+ sqadd v3.8h, v3.8h, v16.8h
+ sqadd v4.8h, v4.8h, v16.8h
+ sqadd v5.8h, v5.8h, v16.8h
+ sqadd v6.8h, v6.8h, v16.8h
+ sqadd v7.8h, v7.8h, v16.8h
+ smax v0.8h, v0.8h, v30.8h
+ smax v1.8h, v1.8h, v30.8h
+ smax v2.8h, v2.8h, v30.8h
+ smax v3.8h, v3.8h, v30.8h
+ smax v4.8h, v4.8h, v30.8h
+ smax v5.8h, v5.8h, v30.8h
+ smax v6.8h, v6.8h, v30.8h
+ smax v7.8h, v7.8h, v30.8h
+ smin v0.8h, v0.8h, v31.8h
+ smin v1.8h, v1.8h, v31.8h
+ smin v2.8h, v2.8h, v31.8h
+ smin v3.8h, v3.8h, v31.8h
+ smin v4.8h, v4.8h, v31.8h
+ smin v5.8h, v5.8h, v31.8h
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+ smin v6.8h, v6.8h, v31.8h
+ smin v7.8h, v7.8h, v31.8h
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 // x1 was reduced by 64 above, so this lands on the next row
+ b.gt 1b
+ ret
+endfunc
+
+.macro iwht4
+ add v16.4s, v16.4s, v17.4s // one 4 point pass, in place on v16-v19 (32 bit lanes); v20 and v21 are scratch
+ sub v21.4s, v18.4s, v19.4s
+ sub v20.4s, v16.4s, v21.4s
+ sshr v20.4s, v20.4s, #1 // arithmetic halving, no rounding
+ sub v18.4s, v20.4s, v17.4s
+ sub v17.4s, v20.4s, v19.4s
+ add v19.4s, v21.4s, v18.4s
+ sub v16.4s, v16.4s, v17.4s
+.endm
+
+.macro idct_4 r0, r1, r2, r3
+ mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] // expects v0.4s = first row of idct_coeffs; v2, v3, v4, v6, v7 are scratch
+ mul_mls v4, \r1, \r3, v0.s[2], v0.s[3]
+ mul_mla v2, \r0, \r2, v0.s[0], v0.s[0]
+ mul_mls v3, \r0, \r2, v0.s[0], v0.s[0]
+ srshr v6.4s, v6.4s, #12 // rounding shift by 12 after each multiply pair
+ srshr v7.4s, v4.4s, #12
+ srshr v2.4s, v2.4s, #12
+ srshr v3.4s, v3.4s, #12
+ sqadd \r0\().4s, v2.4s, v6.4s // saturating butterflies, written back over the input registers
+ sqsub \r3\().4s, v2.4s, v6.4s
+ sqadd \r1\().4s, v3.4s, v7.4s
+ sqsub \r2\().4s, v3.4s, v7.4s
+.endm
+
+function inv_dct_4s_x4_neon // idct_4 applied in place to v16-v19 (32 bit lanes); clobbers x16, v0, v2-v4, v6, v7
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16] // idct4 row of the table
+ idct_4 v16, v17, v18, v19
+ ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3
+ movrel x16, iadst4_coeffs // inputs are always v16-v19; o0-o3 select the output registers (reversed for flipadst)
+ ld1 {v0.4s}, [x16]
+
+ sub v3.4s, v16.4s, v18.4s
+ mul v4.4s, v16.4s, v0.s[0]
+ mla v4.4s, v18.4s, v0.s[1]
+ mla v4.4s, v19.4s, v0.s[2]
+ mul v7.4s, v17.4s, v0.s[3]
+ add v3.4s, v3.4s, v19.4s // v3 = in0 - in2 + in3
+ mul v5.4s, v16.4s, v0.s[2]
+ mls v5.4s, v18.4s, v0.s[0]
+ mls v5.4s, v19.4s, v0.s[1]
+
+ add \o3\().4s, v4.4s, v5.4s // from here on only v0, v3, v4, v5, v7 are read, so outputs may overwrite v16-v19
+ mul \o2\().4s, v3.4s, v0.s[3]
+ add \o0\().4s, v4.4s, v7.4s
+ add \o1\().4s, v5.4s, v7.4s
+ sub \o3\().4s, \o3\().4s, v7.4s
+
+ srshr \o0\().4s, \o0\().4s, #12 // rounding shift by 12 on all four outputs
+ srshr \o2\().4s, \o2\().4s, #12
+ srshr \o1\().4s, \o1\().4s, #12
+ srshr \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon // iadst_4x4 on v16-v19 with outputs in the same order
+ iadst_4x4 v16, v17, v18, v19
+ ret
+endfunc
+
+function inv_flipadst_4s_x4_neon // iadst_4x4 on v16-v19 with the output registers in reverse order
+ iadst_4x4 v19, v18, v17, v16
+ ret
+endfunc
+
+function inv_identity_4s_x4_neon // scales v16-v19 (32 bit lanes) in place by 5793/4096; clobbers w16, v0, v4-v7
+ movz w16, #(5793-4096)*8, lsl #16 // sqrdmulh by this constant yields x * (5793-4096)/4096
+ dup v0.2s, w16
+ sqrdmulh v4.4s, v16.4s, v0.s[0] // fractional part of the scaling
+ sqrdmulh v5.4s, v17.4s, v0.s[0]
+ sqrdmulh v6.4s, v18.4s, v0.s[0]
+ sqrdmulh v7.4s, v19.4s, v0.s[0]
+ sqadd v16.4s, v16.4s, v4.4s // x + x*(5793-4096)/4096 = x*5793/4096, with saturation
+ sqadd v17.4s, v17.4s, v5.4s
+ sqadd v18.4s, v18.4s, v6.4s
+ sqadd v19.4s, v19.4s, v7.4s
+ ret
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 // two iwht4 passes over the 4x4 coefficients at x2, result added to the pixels at x0
+ mov x15, x30 // the shared tail at L(itx_4x4_end) returns through x15
+ movi v30.4s, #0 // zeros: clear the coefficient buffer and serve as lower clamp in the shared tail
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // sixteen 32 bit coefficients
+ st1 {v30.4s, v31.4s}, [x2], #32 // clear the first 32 bytes of the coefficient buffer
+
+ sshr v16.4s, v16.4s, #2 // arithmetic shift right by 2 before the first pass, no rounding
+ sshr v17.4s, v17.4s, #2
+ sshr v18.4s, v18.4s, #2
+ sshr v19.4s, v19.4s, #2
+
+ iwht4
+
+ st1 {v30.4s, v31.4s}, [x2], #32 // clear the remaining 32 bytes
+ transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23
+
+ iwht4
+
+ ld1 {v0.d}[0], [x0], x1 // fetch four destination rows, interleaved with the narrowing
+ sqxtn v16.4h, v16.4s // saturating narrow to 16 bit; rows 0/1 end up in v16, rows 2/3 in v18
+ ld1 {v0.d}[1], [x0], x1
+ sqxtn2 v16.8h, v17.4s
+ ld1 {v1.d}[0], [x0], x1
+ sqxtn v18.4h, v18.4s
+ ld1 {v1.d}[1], [x0], x1
+ sqxtn2 v18.8h, v19.4s
+
+ b L(itx_4x4_end) // join inv_txfm_add_4x4_neon for the add, clamp and store
+endfunc
+
+function inv_txfm_add_4x4_neon // shared 4x4 body: x4 = first transform (32 bit lanes), x5 = second (16 bit lanes), x15 = return address
+ movi v30.4s, #0 // zeros: clear the coefficient buffer and serve as lower clamp in the tail
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // sixteen 32 bit coefficients
+ st1 {v30.4s, v31.4s}, [x2], #32 // clear the first 32 bytes of the coefficient buffer
+
+ blr x4 // first transform; clobbers x30, which is why the return address lives in x15
+
+ st1 {v30.4s, v31.4s}, [x2], #32 // clear the remaining 32 bytes
+ sqxtn v16.4h, v16.4s // saturating narrow to 16 bit for the second transform
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x5 // second transform on four 4h vectors
+
+ ld1 {v0.d}[0], [x0], x1 // fetch four destination rows, two per register
+ ld1 {v0.d}[1], [x0], x1
+ ins v16.d[1], v17.d[0] // pack rows 0/1 into v16 and rows 2/3 into v18
+ ins v18.d[1], v19.d[0]
+ ld1 {v1.d}[0], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ srshr v16.8h, v16.8h, #4 // final rounding shift by 4
+ srshr v18.8h, v18.8h, #4
+
+L(itx_4x4_end):
+ mvni v31.8h, #0xfc, lsl #8 // 0x3ff, upper clamp bound; entry needs v16/v18 = residual, v0/v1 = pixels, v30 = 0
+ sub x0, x0, x1, lsl #2 // rewind four rows for the stores
+ sqadd v16.8h, v16.8h, v0.8h
+ sqadd v18.8h, v18.8h, v1.8h
+ smax v16.8h, v16.8h, v30.8h
+ smax v18.8h, v18.8h, v30.8h
+ smin v16.8h, v16.8h, v31.8h
+ st1 {v16.d}[0], [x0], x1
+ smin v18.8h, v18.8h, v31.8h
+ st1 {v16.d}[1], [x0], x1
+ st1 {v18.d}[0], [x0], x1
+ st1 {v18.d}[1], [x0], x1
+
+ br x15 // return to the address every entry point saved in x15
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 // exported 4x4 entry point for the transform pair txfm1/txfm2
+ mov x15, x30 // inv_txfm_add_4x4_neon and L(itx_4x4_end) return through x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ cbnz w3, 1f // w3 (eob) != 0: take the general path below
+ movz w16, #2896*8, lsl #16 // 2896*8 in the upper half word, also readable as v4.h[1]
+ ld1r {v16.4s}, [x2] // broadcast the first 32 bit coefficient
+ dup v4.2s, w16
+ str wzr, [x2] // clear the consumed coefficient
+ sqrdmulh v16.4s, v16.4s, v4.s[0] // dc * 2896/4096, rounded
+ ld1 {v0.d}[0], [x0], x1 // destination rows are fetched in between the arithmetic
+ sqxtn v20.4h, v16.4s // saturating narrow to 16 bit
+ sqxtn2 v20.8h, v16.4s
+ ld1 {v0.d}[1], [x0], x1
+ sqrdmulh v20.8h, v20.8h, v4.h[1] // second 2896/4096 scaling in 16 bit lanes
+ ld1 {v1.d}[0], [x0], x1
+ srshr v16.8h, v20.8h, #4 // the same value goes into both row pair registers
+ ld1 {v1.d}[1], [x0], x1
+ srshr v18.8h, v20.8h, #4
+ movi v30.8h, #0 // lower clamp bound expected by L(itx_4x4_end)
+ b L(itx_4x4_end)
+1:
+.endif
+ adr x4, inv_\txfm1\()_4s_x4_neon // first transform: 32 bit version from this file
+ movrel x5, X(inv_\txfm2\()_4h_x4_neon) // second transform: 16 bit version, an exported symbol of itx.S
+ b inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct // 16 instantiations follow, one exported function per (txfm1, txfm2) pair; only this one gets the DC-only shortcut
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
+ idct_4 \r0, \r2, \r4, \r6 // even inputs go through the 4 point DCT; expects v0/v1 = first two rows of idct_coeffs
+
+ mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
+ mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
+ mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
+ srshr \r1\().4s, v2.4s, #12 // t4a
+ srshr \r7\().4s, v4.4s, #12 // t7a
+ srshr \r3\().4s, v6.4s, #12 // t5a
+ srshr \r5\().4s, v7.4s, #12 // t6a
+
+ sqadd v2.4s, \r1\().4s, \r3\().4s // t4
+ sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a
+ sqadd v3.4s, \r7\().4s, \r5\().4s // t7
+ sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
+
+ mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+ mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
+ srshr v4.4s, v4.4s, #12 // t5
+ srshr v5.4s, v6.4s, #12 // t6
+
+ sqsub \r7\().4s, \r0\().4s, v3.4s // out7
+ sqadd \r0\().4s, \r0\().4s, v3.4s // out0
+ sqadd \r1\().4s, \r2\().4s, v5.4s // out1
+ sqsub v6.4s, \r2\().4s, v5.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v4.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v4.4s // out5
+ sqadd \r3\().4s, \r6\().4s, v2.4s // out3
+ sqsub \r4\().4s, \r6\().4s, v2.4s // out4
+ mov \r6\().16b, v6.16b // out6
+.endm
+
+// 8-point inverse DCT on v16-v23 (4s lanes), in place.
+// Loads 32 bytes of constants from idct_coeffs into v0/v1 for idct_8.
+// Clobbers x16 and the scratch registers used by idct_8.
+function inv_dct_4s_x8_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16]
+ idct_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+// iadst_8 o0..o7
+//
+// 8-point inverse ADST. The inputs are always taken from v16-v23 (4s
+// lanes); the outputs out0..out7 are written to \o0..\o7. Passing
+// v16..v23 gives the plain ordering, passing v23..v16 gives the flipped
+// one (see inv_adst_4s_x8_neon / inv_flipadst_4s_x8_neon).
+// Loads 48 bytes of constants from iadst8_coeffs (v0/v1 first, then v0 is
+// reloaded with the third vector).
+// Clobbers x16, v0-v7 and all of v16-v23.
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
+ movrel x16, iadst8_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v23, v16, v0.s[0], v0.s[1]
+ mul_mls v4, v23, v16, v0.s[1], v0.s[0]
+ mul_mla v6, v21, v18, v0.s[2], v0.s[3]
+ srshr v16.4s, v2.4s, #12 // t0a
+ srshr v23.4s, v4.4s, #12 // t1a
+ mul_mls v2, v21, v18, v0.s[3], v0.s[2]
+ mul_mla v4, v19, v20, v1.s[0], v1.s[1]
+ srshr v18.4s, v6.4s, #12 // t2a
+ srshr v21.4s, v2.4s, #12 // t3a
+ mul_mls v6, v19, v20, v1.s[1], v1.s[0]
+ mul_mla v2, v17, v22, v1.s[2], v1.s[3]
+ srshr v20.4s, v4.4s, #12 // t4a
+ srshr v19.4s, v6.4s, #12 // t5a
+ mul_mls v4, v17, v22, v1.s[3], v1.s[2]
+ srshr v22.4s, v2.4s, #12 // t6a
+ srshr v17.4s, v4.4s, #12 // t7a
+
+ ld1 {v0.4s}, [x16] // third vector of iadst8_coeffs
+
+ sqadd v2.4s, v16.4s, v20.4s // t0
+ sqsub v3.4s, v16.4s, v20.4s // t4
+ sqadd v4.4s, v23.4s, v19.4s // t1
+ sqsub v5.4s, v23.4s, v19.4s // t5
+ sqadd v6.4s, v18.4s, v22.4s // t2
+ sqsub v7.4s, v18.4s, v22.4s // t6
+ sqadd v18.4s, v21.4s, v17.4s // t3
+ sqsub v19.4s, v21.4s, v17.4s // t7
+
+ mul_mla v16, v3, v5, v0.s[3], v0.s[2]
+ mul_mls v20, v3, v5, v0.s[2], v0.s[3]
+ mul_mls v22, v19, v7, v0.s[3], v0.s[2]
+
+ srshr v3.4s, v16.4s, #12 // t4a
+ srshr v5.4s, v20.4s, #12 // t5a
+
+ mul_mla v16, v19, v7, v0.s[2], v0.s[3]
+
+ srshr v7.4s, v22.4s, #12 // t6a
+ srshr v19.4s, v16.4s, #12 // t7a
+
+ sqadd \o0\().4s, v2.4s, v6.4s // out0
+ sqsub v2.4s, v2.4s, v6.4s // t2
+ sqadd \o7\().4s, v4.4s, v18.4s // out7
+ sqsub v4.4s, v4.4s, v18.4s // t3
+ sqneg \o7\().4s, \o7\().4s // out7
+
+ sqadd \o1\().4s, v3.4s, v7.4s // out1
+ sqsub v3.4s, v3.4s, v7.4s // t6
+ sqadd \o6\().4s, v5.4s, v19.4s // out6
+ sqsub v5.4s, v5.4s, v19.4s // t7
+ sqneg \o1\().4s, \o1\().4s // out1
+
+ // The "(vA or vB)" notes name the destination register for the plain and
+ // the flipped output ordering respectively.
+ mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+ mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+ mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+ srshr v2.4s, v18.4s, #12 // out3
+ mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+ srshr v3.4s, v20.4s, #12 // out5
+ srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+ srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19)
+
+ sqneg \o3\().4s, v2.4s // out3
+ sqneg \o5\().4s, v3.4s // out5
+.endm
+
+// 8-point inverse ADST on v16-v23 (4s lanes); out0..out7 land in v16..v23.
+function inv_adst_4s_x8_neon
+ iadst_8 v16, v17, v18, v19, v20, v21, v22, v23
+ ret
+endfunc
+
+// 8-point inverse flipped ADST on v16-v23 (4s lanes): same computation as
+// inv_adst_4s_x8_neon, but out0..out7 land in v23..v16 (reversed order).
+function inv_flipadst_4s_x8_neon
+ iadst_8 v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+// 8-point identity transform on v16-v23 (4s lanes), in place: every lane is
+// doubled with a saturating left shift by one. No other register is touched.
+function inv_identity_4s_x8_neon
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ sqshl \i, \i, #1
+.endr
+ ret
+endfunc
+
+// Shared body of the 8x8 16 bpc inverse transform entry points.
+// Register contract, as used below:
+//   x0   destination, handed to load_add_store_8x8 (with x7 as 2nd operand)
+//   x2   coefficient buffer; every vector read from it is overwritten with 0
+//   w3   compared against w13 (presumably eob vs. the eob_half threshold
+//        set by def_fn_8x8 - confirm against the caller)
+//   x4   transform called on v16-v23 as .4s, once per group of four lanes
+//   x5   transform called on v16-v23 as .8h
+//   x15  return address (the function returns with br x15)
+// Uses x6 and x11 as scratch.
+function inv_txfm_add_8x8_neon
+ movi v31.4s, #0 // zero vector used to clear the coefficients
+
+ cmp w3, w13
+ mov x11, #32 // byte stride between consecutive 4s loads
+ b.lt 1f // w3 < w13: skip the group at offset 16
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+
+ blr x4
+
+ // Narrow to 16 bit with a rounding, saturating shift by 1; keep in v24-v27.
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+ // Skipped group: its contribution is all zero.
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+
+ // Group at offset 0 of the coefficient buffer; always processed.
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23
+
+ // Append the result of the other group as v20-v23.
+ mov v20.16b, v24.16b
+ mov v21.16b, v25.16b
+ mov v22.16b, v26.16b
+ mov v23.16b, v27.16b
+
+ blr x5
+
+ load_add_store_8x8 x0, x7
+ br x15
+endfunc
+
+// def_fn_8x8 txfm1, txfm2, eob_half
+//
+// Expands to the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_8x8_16bpc_neon. It keeps the return address
+// in x15, expands idct_dc 8, 8, 1 for the dct_dct combination only (that
+// macro is defined outside this section), then sets
+//   x5  = address of the 8h \txfm2 transform (external symbol via X()),
+//   w13 = \eob_half, the threshold inv_txfm_add_8x8_neon compares w3 with,
+//   x4  = address of the local 4s \txfm1 transform,
+// and branches to inv_txfm_add_8x8_neon.
+.macro def_fn_8x8 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 8, 8, 1
+.endif
+ movrel x5, X(inv_\txfm2\()_8h_x8_neon)
+ mov w13, #\eob_half
+ adr x4, inv_\txfm1\()_4s_x8_neon
+ b inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+// Instantiate the 8x8 entry points; the third argument is the eob_half
+// threshold loaded into w13.
+def_fn_8x8 dct, dct, 10
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+// Shared body of the 8x4 16 bpc inverse transform entry points.
+//   x0   destination, handed to load_add_store_8x4 (with x7 as 2nd operand)
+//   x2   coefficient buffer: 128 bytes are read into v16-v23 and zeroed
+//   x4   transform called on v16-v23 as .4s
+//   x5   transform called on v16-v19 as .8h
+//   x15  return address (the function returns with br x15)
+// The input is passed through scale_input with the constant 2896*8 << 16
+// before the first transform. Uses w16 as scratch.
+function inv_txfm_add_8x4_neon
+ movi v28.4s, #0
+ movi v29.4s, #0
+ movi v30.4s, #0
+ movi v31.4s, #0
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+ ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2]
+ st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ blr x4
+
+ // Saturating narrow to 16 bit, without any shift.
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ sqxtn v20.4h, v20.4s
+ sqxtn v21.4h, v21.4s
+ sqxtn v22.4h, v22.4s
+ sqxtn v23.4h, v23.4s
+
+ // Transpose both 4x4 halves and merge them into four 8h registers.
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+ ins v16.d[1], v20.d[0]
+ ins v17.d[1], v21.d[0]
+ ins v18.d[1], v22.d[0]
+ ins v19.d[1], v23.d[0]
+
+ blr x5
+
+ load_add_store_8x4 x0, x7
+ br x15
+endfunc
+
+// Shared body of the 4x8 16 bpc inverse transform entry points.
+//   x0   destination, handed to load_add_store_4x8 (with x7 as 2nd operand)
+//   x2   coefficient buffer; every vector read from it is overwritten with 0
+//   w3   compared against w13 (presumably eob vs. the eob_half threshold
+//        set by def_fn_48 - confirm against the caller)
+//   x4   transform called on v16-v19 as .4s, once per group of four lanes
+//   x5   transform called on v16-v23 as .4h
+//   x15  return address (the function returns with br x15)
+// v30 holds the scale_input constant and v31 the zero vector across both
+// calls through x4. Uses x6, x11 and w16 as scratch.
+function inv_txfm_add_4x8_neon
+ movz w16, #2896*8, lsl #16
+ movi v31.4s, #0
+ dup v30.2s, w16
+
+ cmp w3, w13
+ mov x11, #32 // byte stride between consecutive 4s loads
+ b.lt 1f // w3 < w13: skip the group at offset 16
+
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v31.4s}, [x6], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v20.4h, v16.4s
+ sqxtn v21.4h, v17.4s
+ sqxtn v22.4h, v18.4s
+ sqxtn v23.4h, v19.4s
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+
+1:
+ // Skipped group: its contribution is all zero.
+.irp i, v20, v21, v22, v23
+ movi \i\().4h, #0
+.endr
+
+2:
+
+ // Group at offset 0 of the coefficient buffer; always processed.
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v31.4s}, [x2], x11
+.endr
+ scale_input .4s, v30.s[0], v16, v17, v18, v19
+ blr x4
+ sqxtn v16.4h, v16.4s
+ sqxtn v17.4h, v17.4s
+ sqxtn v18.4h, v18.4s
+ sqxtn v19.4h, v19.4s
+ transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x8 x0, x7
+ br x15
+endfunc
+
+// def_fn_48 w, h, txfm1, txfm2, eob_half
+//
+// Expands to the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_16bpc_neon for the 4x8 and 8x4
+// sizes. It keeps the return address in x15, expands idct_dc \w, \h, 0 for
+// dct_dct only, sets x4 to the local 4s \txfm1 transform of length \w and
+// x5 to the external \txfm2 transform of length \h operating on \w 16 bit
+// lanes, then branches to inv_txfm_add_<w>x<h>_neon.
+// w13 = \eob_half is only set for w == 4, since only
+// inv_txfm_add_4x8_neon compares w3 against it.
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+ mov x15, x30
+
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 0
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+.if \w == 4
+ mov w13, #\eob_half
+.endif
+ movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon)
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+// def_fns_48 w, h
+//
+// Instantiates def_fn_48 for all 16 (txfm1, txfm2) combinations of one
+// block size. The last argument of each line is the eob_half threshold,
+// which def_fn_48 only uses when w == 4.
+.macro def_fns_48 w, h
+def_fn_48 \w, \h, dct, dct, 13
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+
+// 16-point inverse DCT on v16-v31 (4s lanes), in place.
+// The even-numbered registers (v16, v18, ..., v30) are run through idct_8;
+// the odd-numbered ones form t8..t15. On return out0..out15 are in
+// v16..v31 in that order.
+// Constants come from idct_coeffs: bytes 0-31 for idct_8, bytes 32-63 for
+// the first odd stage, then bytes 0-15 again for the remaining rotations.
+// Clobbers x16 and v0-v7.
+function inv_dct_4s_x16_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #32 // rewind to the start of idct_coeffs
+
+ mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
+ mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
+ srshr v17.4s, v2.4s, #12 // t8a
+ srshr v31.4s, v4.4s, #12 // t15a
+ mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
+ mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ srshr v23.4s, v6.4s, #12 // t9a
+ srshr v25.4s, v2.4s, #12 // t14a
+ mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
+ mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
+ srshr v21.4s, v4.4s, #12 // t10a
+ srshr v27.4s, v6.4s, #12 // t13a
+ mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ srshr v19.4s, v2.4s, #12 // t11a
+ srshr v29.4s, v4.4s, #12 // t12a
+
+ ld1 {v0.4s}, [x16] // first vector of idct_coeffs again
+
+ sqsub v2.4s, v17.4s, v23.4s // t9
+ sqadd v17.4s, v17.4s, v23.4s // t8
+ sqsub v3.4s, v31.4s, v25.4s // t14
+ sqadd v31.4s, v31.4s, v25.4s // t15
+ sqsub v23.4s, v19.4s, v21.4s // t10
+ sqadd v19.4s, v19.4s, v21.4s // t11
+ sqadd v25.4s, v29.4s, v27.4s // t12
+ sqsub v29.4s, v29.4s, v27.4s // t13
+
+ mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
+ mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
+ srshr v21.4s, v4.4s, #12 // t9a
+ srshr v27.4s, v6.4s, #12 // t14a
+
+ mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
+ srshr v29.4s, v4.4s, #12 // t13a
+ neg v6.4s, v6.4s // t10a takes the negated sum
+ srshr v23.4s, v6.4s, #12 // t10a
+
+ sqsub v2.4s, v17.4s, v19.4s // t11a
+ sqadd v17.4s, v17.4s, v19.4s // t8a
+ sqsub v3.4s, v31.4s, v25.4s // t12a
+ sqadd v31.4s, v31.4s, v25.4s // t15a
+ sqadd v19.4s, v21.4s, v23.4s // t9
+ sqsub v21.4s, v21.4s, v23.4s // t10
+ sqsub v25.4s, v27.4s, v29.4s // t13
+ sqadd v27.4s, v27.4s, v29.4s // t14
+
+ mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
+ mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+ srshr v4.4s, v4.4s, #12 // t11
+ srshr v5.4s, v6.4s, #12 // t12
+ mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v2.4s, v2.4s, #12 // t10a
+ srshr v3.4s, v6.4s, #12 // t13a
+
+ // Final butterflies. Some outputs go through v3/v6/v7 first because
+ // their destination register still holds an input that is needed later.
+ sqadd v6.4s, v16.4s, v31.4s // out0
+ sqsub v31.4s, v16.4s, v31.4s // out15
+ mov v16.16b, v6.16b
+ sqadd v23.4s, v30.4s, v17.4s // out7
+ sqsub v7.4s, v30.4s, v17.4s // out8
+ sqadd v17.4s, v18.4s, v27.4s // out1
+ sqsub v30.4s, v18.4s, v27.4s // out14
+ sqadd v18.4s, v20.4s, v3.4s // out2
+ sqsub v29.4s, v20.4s, v3.4s // out13
+ sqadd v3.4s, v28.4s, v19.4s // out6
+ sqsub v25.4s, v28.4s, v19.4s // out9
+ sqadd v19.4s, v22.4s, v5.4s // out3
+ sqsub v28.4s, v22.4s, v5.4s // out12
+ sqadd v20.4s, v24.4s, v4.4s // out4
+ sqsub v27.4s, v24.4s, v4.4s // out11
+ sqadd v21.4s, v26.4s, v2.4s // out5
+ sqsub v26.4s, v26.4s, v2.4s // out10
+ mov v24.16b, v7.16b // out8
+ mov v22.16b, v3.16b // out6
+
+ ret
+endfunc
+
+// iadst_16 o0..o15
+//
+// 16-point inverse ADST. The inputs are always taken from v16-v31 (4s
+// lanes); the outputs out0..out15 are written to \o0..\o15. Passing
+// v16..v31 gives the plain ordering, passing v31..v16 the flipped one
+// (see inv_adst_4s_x16_neon / inv_flipadst_4s_x16_neon). The two .ifc
+// blocks below handle the cases where an output register would overwrite
+// a value that is still needed in one of the two orderings.
+// Loads 64 bytes of constants from iadst16_coeffs and then 32 bytes from
+// idct_coeffs. Clobbers x16, v0-v7 and all of v16-v31.
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
+ movrel x16, iadst16_coeffs
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0
+ mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t2
+ srshr v16.4s, v2.4s, #12 // t0
+ srshr v31.4s, v4.4s, #12 // t1
+ mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3
+ mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4
+ srshr v18.4s, v6.4s, #12 // t2
+ srshr v29.4s, v2.4s, #12 // t3
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5
+ mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6
+ srshr v20.4s, v4.4s, #12 // t4
+ srshr v27.4s, v6.4s, #12 // t5
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7
+ ld1 {v0.4s, v1.4s}, [x16] // second half of iadst16_coeffs
+ movrel x16, idct_coeffs
+ mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8
+ srshr v22.4s, v2.4s, #12 // t6
+ srshr v25.4s, v4.4s, #12 // t7
+ mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9
+ mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10
+ srshr v23.4s, v6.4s, #12 // t8
+ srshr v24.4s, v2.4s, #12 // t9
+ mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11
+ mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12
+ srshr v21.4s, v4.4s, #12 // t10
+ srshr v26.4s, v6.4s, #12 // t11
+ mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13
+ mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14
+ srshr v19.4s, v2.4s, #12 // t12
+ srshr v28.4s, v4.4s, #12 // t13
+ mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15
+ srshr v17.4s, v6.4s, #12 // t14
+ srshr v30.4s, v2.4s, #12 // t15
+
+ ld1 {v0.4s, v1.4s}, [x16] // first 32 bytes of idct_coeffs
+
+ sqsub v2.4s, v16.4s, v23.4s // t8a
+ sqadd v16.4s, v16.4s, v23.4s // t0a
+ sqsub v3.4s, v31.4s, v24.4s // t9a
+ sqadd v31.4s, v31.4s, v24.4s // t1a
+ sqadd v23.4s, v18.4s, v21.4s // t2a
+ sqsub v18.4s, v18.4s, v21.4s // t10a
+ sqadd v24.4s, v29.4s, v26.4s // t3a
+ sqsub v29.4s, v29.4s, v26.4s // t11a
+ sqadd v21.4s, v20.4s, v19.4s // t4a
+ sqsub v20.4s, v20.4s, v19.4s // t12a
+ sqadd v26.4s, v27.4s, v28.4s // t5a
+ sqsub v27.4s, v27.4s, v28.4s // t13a
+ sqadd v19.4s, v22.4s, v17.4s // t6a
+ sqsub v22.4s, v22.4s, v17.4s // t14a
+ sqadd v28.4s, v25.4s, v30.4s // t7a
+ sqsub v25.4s, v25.4s, v30.4s // t15a
+
+ mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
+ mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
+ mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
+ srshr v17.4s, v4.4s, #12 // t8
+ srshr v30.4s, v6.4s, #12 // t9
+ mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11
+ mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12
+ srshr v18.4s, v2.4s, #12 // t10
+ srshr v29.4s, v4.4s, #12 // t11
+ mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13
+ mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14
+ srshr v27.4s, v6.4s, #12 // t12
+ srshr v20.4s, v2.4s, #12 // t13
+ mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15
+ srshr v25.4s, v4.4s, #12 // t14
+ srshr v22.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t4
+ sqadd v16.4s, v16.4s, v21.4s // t0
+ sqsub v3.4s, v31.4s, v26.4s // t5
+ sqadd v31.4s, v31.4s, v26.4s // t1
+ sqadd v21.4s, v23.4s, v19.4s // t2
+ sqsub v23.4s, v23.4s, v19.4s // t6
+ sqadd v26.4s, v24.4s, v28.4s // t3
+ sqsub v24.4s, v24.4s, v28.4s // t7
+ sqadd v19.4s, v17.4s, v27.4s // t8a
+ sqsub v17.4s, v17.4s, v27.4s // t12a
+ sqadd v28.4s, v30.4s, v20.4s // t9a
+ sqsub v30.4s, v30.4s, v20.4s // t13a
+ sqadd v27.4s, v18.4s, v25.4s // t10a
+ sqsub v18.4s, v18.4s, v25.4s // t14a
+ sqadd v20.4s, v29.4s, v22.4s // t11a
+ sqsub v29.4s, v29.4s, v22.4s // t15a
+
+ mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
+ mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
+ mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
+ srshr v22.4s, v4.4s, #12 // t4a
+ srshr v25.4s, v6.4s, #12 // t5a
+ mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a
+ mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12
+ srshr v24.4s, v2.4s, #12 // t6a
+ srshr v23.4s, v4.4s, #12 // t7a
+ mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13
+ mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14
+ srshr v17.4s, v6.4s, #12 // t12
+ mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15
+ srshr v29.4s, v2.4s, #12 // t13
+ srshr v30.4s, v4.4s, #12 // t14
+ srshr v18.4s, v6.4s, #12 // t15
+
+ sqsub v2.4s, v16.4s, v21.4s // t2a
+ // In the flipped ordering \o0 is v31, which is still read as an input
+ // by the next two instructions, so out0 is staged in v4 there.
+.ifc \o0, v16
+ sqadd \o0\().4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+.else
+ sqadd v4.4s, v16.4s, v21.4s // out0
+ sqsub v21.4s, v31.4s, v26.4s // t3a
+ sqadd \o15\().4s, v31.4s, v26.4s // out15
+ mov \o0\().16b, v4.16b
+.endif
+ sqneg \o15\().4s, \o15\().4s // out15
+
+ sqsub v3.4s, v29.4s, v18.4s // t15a
+ sqadd \o13\().4s, v29.4s, v18.4s // out13
+ sqadd \o2\().4s, v17.4s, v30.4s // out2
+ sqsub v26.4s, v17.4s, v30.4s // t14a
+ sqneg \o13\().4s, \o13\().4s // out13
+
+ sqadd \o1\().4s, v19.4s, v27.4s // out1
+ sqsub v27.4s, v19.4s, v27.4s // t10
+ sqadd \o14\().4s, v28.4s, v20.4s // out14
+ sqsub v20.4s, v28.4s, v20.4s // t11
+ sqneg \o1\().4s, \o1\().4s // out1
+
+ sqadd \o3\().4s, v22.4s, v24.4s // out3
+ sqsub v22.4s, v22.4s, v24.4s // t6
+ sqadd \o12\().4s, v25.4s, v23.4s // out12
+ sqsub v23.4s, v25.4s, v23.4s // t7
+ sqneg \o3\().4s, \o3\().4s // out3
+
+ // The "(vA or vB)" notes name the destination register for the plain and
+ // the flipped output ordering respectively.
+ mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+ mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+ mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+ srshr v24.4s, v24.4s, #12 // out8
+ srshr v4.4s, v4.4s, #12 // out7
+ srshr v5.4s, v6.4s, #12 // out5
+ mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+ mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+ srshr v26.4s, v6.4s, #12 // out10
+
+ mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+ mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+ mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+ srshr \o4\().4s, v2.4s, #12 // out4
+ srshr v6.4s, v6.4s, #12 // out11
+ srshr v7.4s, v21.4s, #12 // out9
+ srshr \o6\().4s, v22.4s, #12 // out6
+
+ // out8 and out10 were computed into v24 and v26; in the flipped ordering
+ // they have to be moved to \o8 (v23) and \o10 (v21).
+.ifc \o8, v23
+ mov \o8\().16b, v24.16b
+ mov \o10\().16b, v26.16b
+.endif
+
+ sqneg \o7\().4s, v4.4s // out7
+ sqneg \o5\().4s, v5.4s // out5
+ sqneg \o11\().4s, v6.4s // out11
+ sqneg \o9\().4s, v7.4s // out9
+.endm
+
+// 16-point inverse ADST on v16-v31 (4s lanes); out0..out15 land in v16..v31.
+function inv_adst_4s_x16_neon
+ iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ ret
+endfunc
+
+// 16-point inverse flipped ADST on v16-v31 (4s lanes): same computation as
+// inv_adst_4s_x16_neon, but out0..out15 land in v31..v16 (reversed order).
+function inv_flipadst_4s_x16_neon
+ iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+ ret
+endfunc
+
+// 16-point identity transform on v16-v31 (4s lanes), in place.
+// Each lane x becomes 2*x + x*2*(5793-4096)/4096, i.e. x*2*5793/4096, using
+// saturating adds and a rounding doubling multiply (sqrdmulh) by the
+// constant 2*(5793-4096)*8 << 16.
+// Clobbers w16, v0 and v2.
+function inv_identity_4s_x16_neon
+ movz w16, #2*(5793-4096)*8, lsl #16
+ dup v0.2s, w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ sqrdmulh v2.4s, v\i\().4s, v0.s[0] // fractional part
+ sqadd v\i\().4s, v\i\().4s, v\i\().4s // 2*x
+ sqadd v\i\().4s, v\i\().4s, v2.4s
+.endr
+ ret
+endfunc
+
+// identity_4x16_shift1 c
+//
+// For each of v16-v31 (4s lanes): x += round((x *sqrdmulh* \c) >> 1).
+// \c is a by-element multiplier operand supplied by the caller.
+// Clobbers v3.
+.macro identity_4x16_shift1 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ srshr v3.4s, v3.4s, #1
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
+// identity_4x16 c
+//
+// For each of v16-v31 (4s lanes): x = 2*x + (x *sqrdmulh* \c), with
+// saturating adds. Same per-lane operation as inv_identity_4s_x16_neon,
+// but with the multiplier operand \c supplied by the caller.
+// Clobbers v3.
+.macro identity_4x16 c
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ sqrdmulh v3.4s, \i, \c
+ sqadd \i, \i, \i
+ sqadd \i, \i, v3.4s
+.endr
+.endm
+
+// def_horz_16 scale=0, shift=2, suffix
+//
+// Expands to the local function inv_txfm_horz<suffix>_16x4_neon, which
+// transforms four lanes of a 16-point transform:
+//   x7   source pointer; 16 vectors of 4s are read with stride x8 and each
+//        one is overwritten with zeros
+//   x8   byte stride between the source vectors
+//   x4   transform called on v16-v31 as .4s
+//   x6   output pointer; eight 8h vectors (128 bytes) are stored and x6 is
+//        advanced past them
+//   x14  used to keep the return address (the function returns with br x14)
+// With \scale set, the input is first passed through scale_input with the
+// constant 2896*8 << 16. \shift is the rounding, saturating right shift
+// applied while narrowing the result to 16 bit.
+.macro def_horz_16 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_16x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ blr x4
+ // Pack outputs 0-7 into v16-v19 and outputs 8-15 into v20-v23.
+ sqrshrn v16.4h, v16.4s, #\shift
+ sqrshrn v17.4h, v17.4s, #\shift
+ sqrshrn v18.4h, v18.4s, #\shift
+ sqrshrn v19.4h, v19.4s, #\shift
+ sqrshrn2 v16.8h, v20.4s, #\shift
+ sqrshrn2 v17.8h, v21.4s, #\shift
+ sqrshrn2 v18.8h, v22.4s, #\shift
+ sqrshrn2 v19.8h, v23.4s, #\shift
+ sqrshrn v20.4h, v24.4s, #\shift
+ sqrshrn v21.4h, v25.4s, #\shift
+ sqrshrn v22.4h, v26.4s, #\shift
+ sqrshrn v23.4h, v27.4s, #\shift
+ sqrshrn2 v20.8h, v28.4s, #\shift
+ sqrshrn2 v21.8h, v29.4s, #\shift
+ sqrshrn2 v22.8h, v30.4s, #\shift
+ sqrshrn2 v23.8h, v31.4s, #\shift
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+ transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7
+
+ // Store the two halves alternately: v16/v20, v17/v21, ...
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+ st1 {\i}, [x6], #16
+.endr
+
+ br x14
+endfunc
+.endm
+
+// inv_txfm_horz_16x4_neon (no input scaling, shift 2) and
+// inv_txfm_horz_scale_16x4_neon (input scaling, shift 1).
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
+// Second-stage helper for an 8 wide, 16 tall strip:
+//   x7   source pointer; 16 vectors of 8h are read with stride x8 into
+//        v16-v31
+//   x5   transform called on v16-v31 as .8h
+//   x6   destination, handed to load_add_store_8x16 (with x7 as 2nd operand)
+//   x14  used to keep the return address (the function returns with br x14)
+function inv_txfm_add_vert_8x16_neon
+ mov x14, x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ blr x5
+ load_add_store_8x16 x6, x7
+ br x14
+endfunc
+
+// Shared body of the 16x16 16 bpc inverse transform entry points.
+//   x0   destination
+//   x2   coefficient buffer (read and zeroed by inv_txfm_horz_16x4_neon)
+//   w3   compared against the thresholds read from x13 (presumably eob -
+//        confirm against the caller)
+//   x4   transform used by inv_txfm_horz_16x4_neon
+//   x5   transform used by inv_txfm_add_vert_8x16_neon
+//   x13  pointer to a table of 16 bit thresholds (eob_16x16*)
+// Keeps the return address in x15. A 512 byte buffer on the stack holds
+// the 16 bit intermediate between the two stages.
+function inv_txfm_add_16x16_neon
+ mov x15, x30
+ sub sp, sp, #512
+ ldrh w12, [x13], #2
+ // One iteration per group of four lanes; each produces 128 bytes at x6.
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*16*2)
+.if \i > 0
+ mov w8, #(16 - \i) // lanes left, used by the zero fill at 1: below
+ cmp w3, w12
+ b.lt 1f // w3 below threshold: zero fill the rest
+.if \i < 12
+ ldrh w12, [x13], #2 // threshold for the next group
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #16*4
+ bl inv_txfm_horz_16x4_neon
+.endr
+ b 3f
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ // Each pass writes 128 zero bytes, matching one skipped group.
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+3:
+ // Second stage, one 8 column wide strip at a time.
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #512
+ br x15
+endfunc
+
+// Threshold tables for inv_txfm_add_16x16_neon (selected via x13 in
+// def_fn_16x16). Entries 0..2 are compared with w3 before the 2nd, 3rd and
+// 4th group of four lanes; inv_txfm_add_16x16_neon never loads the 4th entry.
+const eob_16x16
+ .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity
+ .short 4, 8, 12, 256
+endconst
+
+// def_fn_16x16 txfm1, txfm2, eob_half
+//
+// Expands to the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_16x16_16bpc_neon. It expands
+// idct_dc 16, 16, 2 for dct_dct only, sets x4 to the local 4s \txfm1
+// transform, x5 to the external 8h \txfm2 transform and x13 to a threshold
+// table, then branches to inv_txfm_add_16x16_neon (which saves x30 itself).
+// Table selection: eob_16x16_identity when exactly one of the two
+// transforms is identity, eob_16x16 otherwise.
+// NOTE: \eob_half is accepted but not referenced in the macro body.
+.macro def_fn_16x16 txfm1, txfm2, eob_half
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc 16, 16, 2
+.endif
+ adr x4, inv_\txfm1\()_4s_x16_neon
+ movrel x5, X(inv_\txfm2\()_8h_x16_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_16x16
+.else
+ movrel x13, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_16x16_identity
+.else
+ movrel x13, eob_16x16
+.endif
+.endif
+ b inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+// Instantiate the 16x16 entry points (12 combinations). The third argument
+// is passed as eob_half, which def_fn_16x16 does not use.
+def_fn_16x16 dct, dct, 36
+def_fn_16x16 identity, identity, 36
+def_fn_16x16 dct, adst, 36
+def_fn_16x16 dct, flipadst, 36
+def_fn_16x16 dct, identity, 8
+def_fn_16x16 adst, dct, 36
+def_fn_16x16 adst, adst, 36
+def_fn_16x16 adst, flipadst, 36
+def_fn_16x16 flipadst, dct, 36
+def_fn_16x16 flipadst, adst, 36
+def_fn_16x16 flipadst, flipadst, 36
+def_fn_16x16 identity, dct, 8
+
+// Shared body of the 16x4 16 bpc inverse transform entry points.
+//   x0   destination; the left 8 columns are written at x0, the right 8
+//        at x0 + 16 bytes
+//   x2   coefficient buffer: 16 consecutive 4s vectors are read and zeroed
+//   x4   transform called once on v16-v31 as .4s
+//   x5   transform called twice on v16-v19 as .8h (once per 8 column half)
+// Keeps the return address in x15. The second half is narrowed from
+// v24-v31 only after the first call through x5, so this relies on that
+// transform and load_add_store_8x4 leaving v24-v31 intact.
+function inv_txfm_add_16x4_neon
+ mov x15, x30
+ movi v4.4s, #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], #16
+.endr
+
+ blr x4
+
+ // Left half: outputs 0-7.
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ mov x6, x0
+ load_add_store_8x4 x6, x7
+
+ // Right half: outputs 8-15.
+ sqrshrn v16.4h, v24.4s, #1
+ sqrshrn v17.4h, v25.4s, #1
+ sqrshrn v18.4h, v26.4s, #1
+ sqrshrn v19.4h, v27.4s, #1
+ sqrshrn2 v16.8h, v28.4s, #1
+ sqrshrn2 v17.8h, v29.4s, #1
+ sqrshrn2 v18.8h, v30.4s, #1
+ sqrshrn2 v19.8h, v31.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+ blr x5
+ add x6, x0, #16
+ load_add_store_8x4 x6, x7
+
+ br x15
+endfunc
+
+// Shared body of the 4x16 16 bpc inverse transform entry points.
+//   x0   destination, handed to load_add_store_4x16 (with x6 as 2nd operand)
+//   x2   coefficient buffer; every vector read from it is overwritten with 0
+//   w3   compared against the thresholds read from x13 (presumably eob -
+//        confirm against the caller)
+//   x4   transform called on v16-v19 as .4s, once per group of four lanes
+//   x5   transform called on v16-v31 as .4h
+//   x13  pointer to a table of 16 bit thresholds (eob_4x16*); entry 2
+//        guards the group at byte offset 48, entry 1 the one at 32 and
+//        entry 0 the one at 16. The group at offset 0 is always processed.
+// Keeps the return address in x15. Uses x6, x11 and w12 as scratch.
+//
+// The 32 bit results of the first transform are narrowed with a rounding
+// shift by 1. This uses the saturating sqrshrn, like every other narrowing
+// between the two stages in this file, so that out-of-range intermediates
+// clamp to the 16 bit range instead of wrapping around.
+function inv_txfm_add_4x16_neon
+ ldrh w12, [x13, #4]
+ mov x15, x30
+
+ mov x11, #64 // byte stride between consecutive 4s loads
+
+ cmp w3, w12
+ ldrh w12, [x13, #2] // threshold for the next group
+ b.lt 1f
+
+ // Group at byte offset 48 -> v28-v31.
+ add x6, x2, #48
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0] // threshold for the next group
+ b.lt 1f
+
+ // Group at byte offset 32 -> v24-v27.
+ add x6, x2, #32
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+ movi \i, #0
+.endr
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ // Group at byte offset 16 -> v20-v23.
+ add x6, x2, #16
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x6]
+ st1 {v2.4s}, [x6], x11
+.endr
+ blr x4
+ sqrshrn v20.4h, v16.4s, #1
+ sqrshrn v21.4h, v17.4s, #1
+ sqrshrn v22.4h, v18.4s, #1
+ sqrshrn v23.4h, v19.4s, #1
+ transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7
+
+ b 2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+ movi \i, #0
+.endr
+2:
+
+ // Group at byte offset 0 -> v16-v19; always processed.
+ movi v2.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+ ld1 {\i}, [x2]
+ st1 {v2.4s}, [x2], x11
+.endr
+ blr x4
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
+
+ blr x5
+
+ load_add_store_4x16 x0, x6
+
+ br x15
+endfunc
+
+// Threshold tables for inv_txfm_add_4x16_neon (selected via x13 in
+// def_fn_416). Entries 0..2 are compared with w3 to decide whether the
+// groups at byte offset 16, 32 and 48 are processed; inv_txfm_add_4x16_neon
+// never loads the 4th entry.
+const eob_4x16
+ .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1
+ .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2
+ .short 4, 8, 12, 64
+endconst
+
+// def_fn_416 w, h, txfm1, txfm2
+//
+// Expands to the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_16bpc_neon for the 4x16 and 16x4
+// sizes. It expands idct_dc \w, \h, 1 for dct_dct only, sets x4 to the
+// local 4s \txfm1 transform of length \w and x5 to the external \txfm2
+// transform of length \h (4h lanes for w == 4, 8h lanes otherwise), then
+// branches to inv_txfm_add_<w>x<h>_neon, which saves x30 itself.
+// Only for w == 4, x13 is set to a threshold table:
+//   txfm1 and txfm2 both identity -> eob_4x16
+//   only txfm1 identity           -> eob_4x16_identity1
+//   only txfm2 identity           -> eob_4x16_identity2
+//   neither                       -> eob_4x16
+.macro def_fn_416 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+.if \w == 4
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_4x16
+.else
+ movrel x13, eob_4x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_4x16_identity2
+.else
+ movrel x13, eob_4x16
+.endif
+.endif
+.else
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+// def_fns_416 w, h
+//
+// Instantiates def_fn_416 for all 16 (txfm1, txfm2) combinations of one
+// block size.
+.macro def_fns_416 w, h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
+// Shared body of the 16x8 16 bpc inverse transform entry points.
+//   x0   destination; the left 8 columns are written at x0, the right 8
+//        at x0 + 16 bytes (x0 itself is advanced for the second half)
+//   x2   coefficient buffer; every vector read from it is overwritten with 0
+//   x4   transform called on v16-v31 as .4s, once per group of four lanes
+//   x5   transform called on v16-v23 as .8h, once per 8 column half
+// Keeps the return address in x15. d8-d15 are saved on the stack and
+// restored, since v8-v15 hold intermediates across the calls.
+// Both groups of four lanes are always processed; w3 is not consulted.
+// Uses x6, x11 and w16 as scratch.
+function inv_txfm_add_16x8_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x40]!
+ stp d10, d11, [sp, #0x10]
+ stp d12, d13, [sp, #0x20]
+ stp d14, d15, [sp, #0x30]
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ mov x11, #32 // byte stride between consecutive 4s loads
+
+ // Group at byte offset 16.
+ add x6, x2, #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ // Outputs 0-7 of this group -> v8-v11, outputs 8-15 -> v12-v15.
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ sqrshrn v12.4h, v24.4s, #1
+ sqrshrn v13.4h, v25.4s, #1
+ sqrshrn v14.4h, v26.4s, #1
+ sqrshrn v15.4h, v27.4s, #1
+ sqrshrn2 v12.8h, v28.4s, #1
+ sqrshrn2 v13.8h, v29.4s, #1
+ sqrshrn2 v14.8h, v30.4s, #1
+ sqrshrn2 v15.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+ transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5
+
+ // Set up the scale constant and the zero vector again for the second group.
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+
+ movi v4.4s, #0
+ // Group at byte offset 0.
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+
+ // Left half of the first group joins the left half of this group.
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ // Right half of this group is parked in v8-v11 (now free).
+ sqrshrn v8.4h, v24.4s, #1
+ sqrshrn v9.4h, v25.4s, #1
+ sqrshrn v10.4h, v26.4s, #1
+ sqrshrn v11.4h, v27.4s, #1
+ sqrshrn2 v8.8h, v28.4s, #1
+ sqrshrn2 v9.8h, v29.4s, #1
+ sqrshrn2 v10.8h, v30.4s, #1
+ sqrshrn2 v11.8h, v31.4s, #1
+
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ blr x5
+
+ mov x6, x0
+ load_add_store_8x8 x6, x7
+
+ // Right 8 columns: v8-v11 and v12-v15.
+ mov v16.16b, v8.16b
+ mov v17.16b, v9.16b
+ mov v18.16b, v10.16b
+ mov v19.16b, v11.16b
+ mov v20.16b, v12.16b
+ mov v21.16b, v13.16b
+ mov v22.16b, v14.16b
+ mov v23.16b, v15.16b
+
+ blr x5
+
+ add x0, x0, #16
+ load_add_store_8x8 x0, x7
+
+ ldp d14, d15, [sp, #0x30]
+ ldp d12, d13, [sp, #0x20]
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x40
+ br x15
+endfunc
+
+// Shared body of the 8x16 16 bpc inverse transform entry points.
+//   x0   destination, handed to load_add_store_8x16 (with x6 as 2nd operand)
+//   x2   coefficient buffer; every vector read from it is overwritten with 0
+//   w3   compared against the thresholds read from x13 (presumably eob -
+//        confirm against the caller)
+//   x4   transform called on v16-v23 as .4s, once per group of four lanes
+//   x5   transform called on v16-v31 as .8h
+//   x13  pointer to a table of 16 bit thresholds (eob_8x16*); entry 2
+//        guards the group at byte offset 48, entry 1 the one at 32 and
+//        entry 0 the one at 16. The group at offset 0 is always processed.
+// Keeps the return address in x15. d8-d11 are saved on the stack and
+// restored, since v8-v11 hold one group across the last call through x4.
+// The scale constant in v0 and the zero vector in v4 are set up again
+// before every group. Uses x6, x11, w12 and w16 as scratch.
+function inv_txfm_add_8x16_neon
+ mov x15, x30
+ stp d8, d9, [sp, #-0x20]!
+ stp d10, d11, [sp, #0x10]
+ ldrh w12, [x13, #4]
+
+ mov x11, #64 // byte stride between consecutive 4s loads
+
+ cmp w3, w12
+ ldrh w12, [x13, #2] // threshold for the next group
+ b.lt 1f
+
+ // Group at byte offset 48 -> v28-v31.
+ add x6, x2, #48
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v28.4h, v16.4s, #1
+ sqrshrn v29.4h, v17.4s, #1
+ sqrshrn v30.4h, v18.4s, #1
+ sqrshrn v31.4h, v19.4s, #1
+ sqrshrn2 v28.8h, v20.4s, #1
+ sqrshrn2 v29.8h, v21.4s, #1
+ sqrshrn2 v30.8h, v22.4s, #1
+ sqrshrn2 v31.8h, v23.4s, #1
+ transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ ldrh w12, [x13, #0] // threshold for the next group
+ b.lt 1f
+
+ // Group at byte offset 32 -> v24-v27.
+ add x6, x2, #32
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v24.4h, v16.4s, #1
+ sqrshrn v25.4h, v17.4s, #1
+ sqrshrn v26.4h, v18.4s, #1
+ sqrshrn v27.4h, v19.4s, #1
+ sqrshrn2 v24.8h, v20.4s, #1
+ sqrshrn2 v25.8h, v21.4s, #1
+ sqrshrn2 v26.8h, v22.4s, #1
+ sqrshrn2 v27.8h, v23.4s, #1
+ transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+ movi \i, #0
+.endr
+
+2:
+ cmp w3, w12
+ b.lt 1f
+
+ // Group at byte offset 16 -> v8-v11 (v20-v23 are still needed as inputs
+ // of the last call through x4).
+ add x6, x2, #16
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x6]
+ st1 {v4.4s}, [x6], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v8.4h, v16.4s, #1
+ sqrshrn v9.4h, v17.4s, #1
+ sqrshrn v10.4h, v18.4s, #1
+ sqrshrn v11.4h, v19.4s, #1
+ sqrshrn2 v8.8h, v20.4s, #1
+ sqrshrn2 v9.8h, v21.4s, #1
+ sqrshrn2 v10.8h, v22.4s, #1
+ sqrshrn2 v11.8h, v23.4s, #1
+ transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5
+
+ b 2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+ movi \i, #0
+.endr
+
+2:
+ // Group at byte offset 0 -> v16-v19; always processed.
+ movi v4.4s, #0
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+ ld1 {\i}, [x2]
+ st1 {v4.4s}, [x2], x11
+.endr
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ blr x4
+
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn v17.4h, v17.4s, #1
+ sqrshrn v18.4h, v18.4s, #1
+ sqrshrn v19.4h, v19.4s, #1
+ sqrshrn2 v16.8h, v20.4s, #1
+ sqrshrn2 v17.8h, v21.4s, #1
+ sqrshrn2 v18.8h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ // Move the group parked in v8-v11 into place.
+ mov v20.16b, v8.16b
+ mov v21.16b, v9.16b
+ mov v22.16b, v10.16b
+ mov v23.16b, v11.16b
+
+ blr x5
+
+ load_add_store_8x16 x0, x6
+
+ ldp d10, d11, [sp, #0x10]
+ ldp d8, d9, [sp], 0x20
+
+ br x15
+endfunc
+
+// Threshold tables for inv_txfm_add_8x16_neon (selected via x13 in
+// def_fn_816). Entries 0..2 are compared with w3 to decide whether the
+// groups at byte offset 16, 32 and 48 are processed; inv_txfm_add_8x16_neon
+// never loads the 4th entry.
+const eob_8x16
+ .short 10, 43, 75, 128
+endconst
+
+const eob_8x16_identity1
+ .short 4, 64, 96, 128
+endconst
+
+const eob_8x16_identity2
+ .short 4, 8, 12, 128
+endconst
+
+// def_fn_816 w, h, txfm1, txfm2
+//
+// Expands to the exported entry point
+// inv_txfm_add_<txfm1>_<txfm2>_<w>x<h>_16bpc_neon for the 8x16 and 16x8
+// sizes. It expands idct_dc \w, \h, 1 for dct_dct only, sets x4 to the
+// local 4s \txfm1 transform of length \w and x5 to the external 8h \txfm2
+// transform of length \h, then branches to inv_txfm_add_<w>x<h>_neon,
+// which saves x30 itself.
+// Only for w == 8, x13 is set to a threshold table:
+//   txfm1 and txfm2 both identity -> eob_8x16
+//   only txfm1 identity           -> eob_8x16_identity1
+//   only txfm2 identity           -> eob_8x16_identity2
+//   neither                       -> eob_8x16
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+ idct_dc \w, \h, 1
+.endif
+ adr x4, inv_\txfm1\()_4s_x\w\()_neon
+ movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon)
+.if \w == 8
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+ movrel x13, eob_8x16
+.else
+ movrel x13, eob_8x16_identity1
+.endif
+.else
+.ifc \txfm2, identity
+ movrel x13, eob_8x16_identity2
+.else
+ movrel x13, eob_8x16
+.endif
+.endif
+.endif
+ b inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+// Instantiates def_fn_816 for all 16 first/second pass combinations of
+// dct, adst, flipadst and identity for one block size.
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+
+// Entry points for the 8x16 and the 16x8 block sizes.
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+// Odd half of a 32 point inverse DCT, working on 32 bit lanes (4 lanes per
+// register).
+//
+// In:    v16-v31 = the 16 input vectors (.4s)
+// Out:   v16-v31 = t16/out16 ... t31/out31, as labelled by the per-line
+//        comments below
+// Clobb: x16, v0, v1 (coefficients), v2, v3, v4, v6 (temporaries)
+// NOTE(review): mul_mla/mul_mls are defined elsewhere and may touch more
+// registers than their destination operand - confirm before relying on the
+// clobber list.
+// Every product is brought back to scale with a rounding shift right by 12.
+function inv_dct32_odd_4s_x16_neon
+ // NOTE(review): the third movrel operand looks like a byte offset into
+ // idct_coeffs - confirm against the macro definition.
+ movrel x16, idct_coeffs, 4*16
+ ld1 {v0.4s, v1.4s}, [x16], #32
+
+ // First stage: rotate each input pair by its own coefficient pair.
+ mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a
+ mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a
+ mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a
+ srshr v16.4s, v2.4s, #12 // t16a
+ srshr v31.4s, v4.4s, #12 // t31a
+ mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a
+ mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a
+ srshr v24.4s, v6.4s, #12 // t17a
+ srshr v23.4s, v2.4s, #12 // t30a
+ mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a
+ mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a
+ srshr v20.4s, v4.4s, #12 // t18a
+ srshr v27.4s, v6.4s, #12 // t29a
+ mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a
+ // Second group of eight coefficients; afterwards x16 is moved back by
+ // 4*24 bytes for the reload further down.
+ ld1 {v0.4s, v1.4s}, [x16]
+ sub x16, x16, #4*24
+ mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a
+ srshr v28.4s, v2.4s, #12 // t19a
+ srshr v19.4s, v4.4s, #12 // t28a
+ mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a
+ mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a
+ srshr v18.4s, v6.4s, #12 // t20a
+ srshr v29.4s, v2.4s, #12 // t27a
+ mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a
+ mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a
+ srshr v26.4s, v4.4s, #12 // t21a
+ srshr v21.4s, v6.4s, #12 // t26a
+ mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a
+ mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a
+ srshr v22.4s, v2.4s, #12 // t22a
+ srshr v25.4s, v4.4s, #12 // t25a
+ mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a
+ srshr v30.4s, v6.4s, #12 // t23a
+ srshr v17.4s, v2.4s, #12 // t24a
+
+ // Coefficients for the remaining stages.
+ ld1 {v0.4s, v1.4s}, [x16]
+
+ // Saturating butterflies.
+ sqsub v2.4s, v16.4s, v24.4s // t17
+ sqadd v16.4s, v16.4s, v24.4s // t16
+ sqsub v3.4s, v31.4s, v23.4s // t30
+ sqadd v31.4s, v31.4s, v23.4s // t31
+ sqsub v24.4s, v28.4s, v20.4s // t18
+ sqadd v28.4s, v28.4s, v20.4s // t19
+ sqadd v23.4s, v18.4s, v26.4s // t20
+ sqsub v18.4s, v18.4s, v26.4s // t21
+ sqsub v20.4s, v30.4s, v22.4s // t22
+ sqadd v30.4s, v30.4s, v22.4s // t23
+ sqadd v26.4s, v17.4s, v25.4s // t24
+ sqsub v17.4s, v17.4s, v25.4s // t25
+ sqsub v22.4s, v29.4s, v21.4s // t26
+ sqadd v29.4s, v29.4s, v21.4s // t27
+ sqadd v25.4s, v19.4s, v27.4s // t28
+ sqsub v19.4s, v19.4s, v27.4s // t29
+
+ mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
+ mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
+ mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
+ srshr v21.4s, v4.4s, #12 // t17a
+ srshr v27.4s, v6.4s, #12 // t30a
+ neg v2.4s, v2.4s // -> t18a
+ mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
+ srshr v19.4s, v2.4s, #12 // t18a
+ srshr v24.4s, v4.4s, #12 // t29a
+ mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
+ mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ srshr v22.4s, v6.4s, #12 // t21a
+ srshr v18.4s, v2.4s, #12 // t26a
+ neg v4.4s, v4.4s // -> t22a
+ mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
+ srshr v17.4s, v4.4s, #12 // t22a
+ srshr v20.4s, v6.4s, #12 // t25a
+
+ sqsub v2.4s, v27.4s, v24.4s // t29
+ sqadd v27.4s, v27.4s, v24.4s // t30
+ sqsub v3.4s, v21.4s, v19.4s // t18
+ sqadd v21.4s, v21.4s, v19.4s // t17
+ sqsub v24.4s, v16.4s, v28.4s // t19a
+ sqadd v16.4s, v16.4s, v28.4s // t16a
+ sqsub v19.4s, v30.4s, v23.4s // t20a
+ sqadd v30.4s, v30.4s, v23.4s // t23a
+ sqsub v28.4s, v17.4s, v22.4s // t21
+ sqadd v17.4s, v17.4s, v22.4s // t22
+ sqadd v23.4s, v26.4s, v29.4s // t24a
+ sqsub v26.4s, v26.4s, v29.4s // t27a
+ sqadd v22.4s, v20.4s, v18.4s // t25
+ sqsub v20.4s, v20.4s, v18.4s // t26
+ sqsub v29.4s, v31.4s, v25.4s // t28a
+ sqadd v31.4s, v31.4s, v25.4s // t31a
+
+ mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
+ mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
+ mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
+ srshr v18.4s, v4.4s, #12 // t18a
+ srshr v25.4s, v6.4s, #12 // t29a
+ mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
+ srshr v29.4s, v2.4s, #12 // t19
+ srshr v24.4s, v4.4s, #12 // t28
+ neg v6.4s, v6.4s // -> t20
+ mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
+ mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ srshr v26.4s, v6.4s, #12 // t20
+ srshr v19.4s, v2.4s, #12 // t27
+ neg v4.4s, v4.4s // -> t21a
+ mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
+ srshr v20.4s, v4.4s, #12 // t21a
+ srshr v28.4s, v6.4s, #12 // t26a
+
+ // The results marked "= outNN" are final from here on.
+ sqsub v2.4s, v16.4s, v30.4s // t23
+ sqadd v16.4s, v16.4s, v30.4s // t16 = out16
+ sqsub v3.4s, v31.4s, v23.4s // t24
+ sqadd v31.4s, v31.4s, v23.4s // t31 = out31
+ sqsub v23.4s, v21.4s, v17.4s // t22a
+ sqadd v17.4s, v21.4s, v17.4s // t17a = out17
+ sqadd v30.4s, v27.4s, v22.4s // t30a = out30
+ sqsub v21.4s, v27.4s, v22.4s // t25a
+ sqsub v27.4s, v18.4s, v20.4s // t21
+ sqadd v18.4s, v18.4s, v20.4s // t18 = out18
+ sqadd v4.4s, v29.4s, v26.4s // t19a = out19
+ sqsub v26.4s, v29.4s, v26.4s // t20a
+ sqadd v29.4s, v25.4s, v28.4s // t29 = out29
+ sqsub v25.4s, v25.4s, v28.4s // t26
+ sqadd v28.4s, v24.4s, v19.4s // t28a = out28
+ sqsub v24.4s, v24.4s, v19.4s // t27a
+ mov v19.16b, v4.16b // out19
+
+ // Last stage: both operands of each rotation use the same coefficient
+ // v0.s[0].
+ mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
+ mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
+ srshr v20.4s, v4.4s, #12 // t20
+ srshr v22.4s, v6.4s, #12 // t27
+
+ mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
+ mov v27.16b, v22.16b // t27
+ srshr v26.4s, v4.4s, #12 // t26a
+
+ mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+ mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
+ srshr v21.4s, v6.4s, #12 // t21a
+ srshr v22.4s, v24.4s, #12 // t22
+ srshr v25.4s, v4.4s, #12 // t25
+
+ mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
+ srshr v23.4s, v4.4s, #12 // t23a
+ srshr v24.4s, v6.4s, #12 // t24a
+
+ ret
+endfunc
+
+// Defines inv_txfm_horz<suffix>_dct_32x4_neon: the first (horizontal) pass
+// of a 32 point DCT over a slice of four 32 bit lanes.
+//
+// Macro parameters:
+//   scale  - if nonzero, the input is first scaled with scale_input using
+//            the constant 2896*8 << 16
+//   shift  - rounding right shift applied when narrowing to 16 bit
+//   suffix - appended to "inv_txfm_horz" in the function name
+//
+// Function registers:
+//   In:    x7 = coefficient pointer; the coefficients read are overwritten
+//               with zero
+//          x8 = coefficient stride in bytes
+//          x6 = output pointer; 4 rows of 32 int16 are written
+//   Out:   x6 advanced by 4*64 bytes; x7 and x8 are modified
+//   Clobb: x14 (holds the return address), x16 when scaling, v0-v7, v16-v31
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+ mov x14, x30
+ movi v7.4s, #0
+ // Step over every other input: the even inputs go to the 16 point DCT.
+ lsl x8, x8, #1
+.if \scale
+ movz w16, #2896*8, lsl #16
+ dup v0.2s, w16
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+ // Rewind to the start and move forward by the original stride, i.e. to
+ // the first odd input.
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+.if \scale
+ scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct_4s_x16_neon
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
+ transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5
+
+// Park the even half, still 32 bit, in the output buffer (16 vectors).
+.macro store1 r0, r1, r2, r3
+ st1 {\r0}, [x6], #16
+ st1 {\r1}, [x6], #16
+ st1 {\r2}, [x6], #16
+ st1 {\r3}, [x6], #16
+.endm
+ store1 v16.4s, v20.4s, v24.4s, v28.4s
+ store1 v17.4s, v21.4s, v25.4s, v29.4s
+ store1 v18.4s, v22.4s, v26.4s, v30.4s
+ store1 v19.4s, v23.4s, v27.4s, v31.4s
+.purgem store1
+ sub x6, x6, #64*4
+
+ // Load (and clear) the odd inputs.
+ movi v7.4s, #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+ ld1 {\i}, [x7]
+ st1 {v7.4s}, [x7], x8
+.endr
+.if \scale
+ // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
+ scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+ bl inv_dct32_odd_4s_x16_neon
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+ transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5
+ transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5
+// Combine one row: reload the parked even half, form even+odd and even-odd,
+// narrow both to 16 bit with a rounding shift and store them over the
+// 32 bit data in place. The even-odd half is stored in reversed order.
+.macro store2 r0, r1, r2, r3, shift
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]
+ sqsub v4.4s, v0.4s, \r0
+ sqadd v0.4s, v0.4s, \r0
+ sqsub v5.4s, v1.4s, \r1
+ sqadd v1.4s, v1.4s, \r1
+ sqsub v6.4s, v2.4s, \r2
+ sqadd v2.4s, v2.4s, \r2
+ sqsub v7.4s, v3.4s, \r3
+ sqadd v3.4s, v3.4s, \r3
+ sqrshrn v0.4h, v0.4s, #\shift
+ sqrshrn2 v0.8h, v1.4s, #\shift
+ sqrshrn v1.4h, v2.4s, #\shift
+ sqrshrn2 v1.8h, v3.4s, #\shift
+ sqrshrn v2.4h, v7.4s, #\shift
+ sqrshrn2 v2.8h, v6.4s, #\shift
+ sqrshrn v3.4h, v5.4s, #\shift
+ sqrshrn2 v3.8h, v4.4s, #\shift
+ st1 {v0.8h, v1.8h}, [x6], #32
+ rev64 v2.8h, v2.8h
+ rev64 v3.8h, v3.8h
+ st1 {v2.8h, v3.8h}, [x6], #32
+.endm
+
+ store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift
+ store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift
+ store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift
+ store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift
+.purgem store2
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x14
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+// Second (vertical) pass of a 32 point DCT for a strip 8 pixels wide: the
+// result is shifted right by 4 with rounding, added to the destination and
+// clamped to 0..0x3ff.
+//
+// In:    x7 = intermediate buffer, 32 rows of int16; its even rows are
+//             overwritten with the output of the 16 point DCT
+//        x8 = row stride of that buffer in bytes
+//        x6 = destination pointer, x1 = destination stride
+// Out:   x6 advanced by 32 destination rows
+// Clobb: x7-x10, x14 (holds the return address), v0-v7, v16-v31, plus
+//        whatever the two transform functions called here clobber
+function inv_txfm_add_vert_dct_8x32_neon
+ mov x14, x30
+ // Double the stride to visit every other row.
+ lsl x8, x8, #1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+
+ bl X(inv_dct_8h_x16_neon)
+
+ // Write the even half back in place; the registers are needed for the
+ // odd half.
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ add x7, x7, x8, lsr #1
+
+ // Odd rows, starting one original stride further in.
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ sub x7, x7, x8, lsl #4
+ sub x7, x7, x8, lsr #1
+ bl X(inv_dct32_odd_8h_x16_neon)
+
+ neg x9, x8
+ mov x10, x6
+ movi v0.8h, #0
+ mvni v1.8h, #0xfc, lsl #8 // 0x3ff
+// Produce four output rows: read four even-half rows through x7 (stepping
+// by \stride), apply \op with the odd-half registers, round, add the
+// destination rows read through x10, clamp to [v0, v1] and store through x6.
+.macro combine r0, r1, r2, r3, op, stride
+ ld1 {v5.8h}, [x7], \stride
+ ld1 {v2.8h}, [x10], x1
+ ld1 {v6.8h}, [x7], \stride
+ ld1 {v3.8h}, [x10], x1
+ \op v5.8h, v5.8h, \r0
+ ld1 {v7.8h}, [x7], \stride
+ ld1 {v4.8h}, [x10], x1
+ srshr v5.8h, v5.8h, #4
+ \op v6.8h, v6.8h, \r1
+ sqadd v5.8h, v5.8h, v2.8h
+ srshr v6.8h, v6.8h, #4
+ \op v7.8h, v7.8h, \r2
+ smax v2.8h, v5.8h, v0.8h
+ ld1 {v5.8h}, [x7], \stride
+ sqadd v6.8h, v6.8h, v3.8h
+ smin v2.8h, v2.8h, v1.8h
+ srshr v7.8h, v7.8h, #4
+ \op v5.8h, v5.8h, \r3
+ st1 {v2.8h}, [x6], x1
+ ld1 {v2.8h}, [x10], x1
+ smax v3.8h, v6.8h, v0.8h
+ sqadd v7.8h, v7.8h, v4.8h
+ smin v3.8h, v3.8h, v1.8h
+ srshr v5.8h, v5.8h, #4
+ st1 {v3.8h}, [x6], x1
+ smax v4.8h, v7.8h, v0.8h
+ sqadd v5.8h, v5.8h, v2.8h
+ smin v4.8h, v4.8h, v1.8h
+ st1 {v4.8h}, [x6], x1
+ smax v2.8h, v5.8h, v0.8h
+ smin v2.8h, v2.8h, v1.8h
+ st1 {v2.8h}, [x6], x1
+.endm
+ // Output rows 0-15: even + odd, walking the even half forwards.
+ combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8
+ combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+ combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+ combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+ // Output rows 16-31: even - odd, walking the even half backwards.
+ sub x7, x7, x8
+ combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9
+ combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+ combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+ combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x14
+endfunc
+
+// Thresholds compared against w3 by the 32x32 functions. The dct_dct
+// version reads one entry per slice of 4; the identity version reads
+// every other entry.
+const eob_32x32
+ .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+// Thresholds compared against w3 by the 16x32 and 32x16 functions, along
+// the dimension of length 32.
+const eob_16x32
+ .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+// Thresholds used by the 16x32/32x16 identity functions along the
+// dimension of length 16.
+const eob_16x32_shortside
+ .short 10, 36, 78, 512
+endconst
+
+// Thresholds compared against w3 by the 8x32 and 32x8 functions. The
+// dct_dct 8x32 version reads one entry per slice of 4; the identity
+// versions read every other entry.
+const eob_8x32
+ .short 10, 43, 75, 107, 139, 171, 203, 256
+endconst
+
+// 32x32 identity/identity inverse transform, added to the destination.
+// The block is processed as 8x8 tiles; tiles beyond the thresholds from
+// eob_32x32 are skipped.
+//
+// In:    x0 = destination pointer, x1 = destination stride
+//        x2 = coefficient pointer (32 bit coefficients; the ones read are
+//             overwritten with zero)
+//        w3 = value compared against the thresholds
+//             (NOTE(review): presumably the eob - confirm against caller)
+// Clobb: x0, x2, x7-x9, x11-x13, v0, v1, v4, v5, v16-v31, plus whatever
+//        load_add_store_8x8 uses
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1
+ movi v0.8h, #0
+ movi v1.8h, #0
+ // x13 walks the thresholds for the outer loop, two entries at a time.
+ movrel x13, eob_32x32, 2
+
+ mov x8, #4*32
+1:
+ // w9 counts the columns handled in the current band of 8 rows.
+ mov w9, #0
+ movrel x12, eob_32x32, 2
+2:
+ add w9, w9, #8
+ // Load an 8x8 tile of 32 bit coefficients, clearing it behind us.
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ // Narrow to 16 bit with saturation; no scaling is applied here.
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+ load_add_store_8x8 x0, x7, shiftbits=2
+ ldrh w11, [x12], #4
+ // Back up 8 rows and step 8 pixels (16 bytes) to the right.
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ // Next band: return to the left edge and go down 8 rows; rewind the
+ // coefficient pointer by the w9 strides consumed, then advance 8
+ // coefficients.
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+
+// Applies "\op reg, reg, #\shift" to each of v16-v31, treated as .4s vectors.
+.macro shift_16_regs op, shift
+.irp reg, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ \op v\reg\().4s, v\reg\().4s, #\shift
+.endr
+.endm
+
+// Defines inv_txfm_add_identity_identity_<w>x<h>_16bpc_neon for the 16x32
+// and 32x16 block sizes, processed as 8x8 tiles.
+//
+// Macro parameters:
+//   w, h   - block width and height
+//   wshort - suffix appended to "eob_16x32" for the inner loop thresholds
+//   hshort - suffix appended to "eob_16x32" for the outer loop thresholds
+//
+// Function registers:
+//   In:    x0 = destination pointer, x1 = destination stride
+//          x2 = coefficient pointer (32 bit coefficients; the ones read
+//               are overwritten with zero)
+//          w3 = value compared against the thresholds
+//               (NOTE(review): presumably the eob - confirm against caller)
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ // w16 = constant handed to scale_input, w17 = constant handed to the
+ // identity_4x16 macros.
+ movz w16, #2896*8, lsl #16
+ movz w17, #2*(5793-4096)*8, lsl #16
+ movi v0.4s, #0
+ movi v1.4s, #0
+ movrel x13, eob_16x32\hshort, 2
+
+ mov x8, #4*\h
+1:
+ // w9 counts the columns handled in the current band of 8 rows.
+ mov w9, #0
+ movrel x12, eob_16x32\wshort, 2
+2:
+ add w9, w9, #8
+ // Load an 8x8 tile, clearing it behind us. v2 = {w16, w17} is rebuilt on
+ // every iteration, interleaved with the loads.
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ dup v2.2s, w16
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ mov v2.s[1], w17
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+ scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+ // 16x32
+ identity_4x16_shift1 v2.s[1]
+.else
+ // 32x16
+ shift_16_regs sqshl, 1
+ identity_4x16 v2.s[1]
+.endif
+ // Narrow to 16 bit with saturation.
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+ ldrh w11, [x12], #4
+ // Back up 8 rows and step 8 pixels (16 bytes) to the right.
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #16
+ cmp w3, w11
+ b.ge 2b
+
+ ldrh w11, [x13], #4
+ cmp w3, w11
+ b.lt 9f
+
+ // Next band: return to the left edge and go down 8 rows; rewind the
+ // coefficient pointer by the w9 strides consumed, then advance 8
+ // coefficients.
+ sub x0, x0, w9, uxtw #1
+ add x0, x0, x1, lsl #3
+ msub x2, x8, x9, x2
+ add x2, x2, #4*8
+ b 1b
+9:
+ ret
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+// Defines inv_txfm_add_identity_identity_<w>x<h>_16bpc_neon for the 8x32
+// and 32x8 block sizes, processed as a single run of 8x8 tiles.
+//
+// Function registers:
+//   In:    x0 = destination pointer, x1 = destination stride
+//          x2 = coefficient pointer (32 bit coefficients; the ones read
+//               are overwritten with zero)
+//          w3 = value compared against the eob_8x32 thresholds
+//               (NOTE(review): presumably the eob - confirm against caller)
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1
+ movi v0.4s, #0
+ movi v1.4s, #0
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ movrel x13, eob_8x32, 2
+
+ // Writing w8 zeroes the upper half of x8, which is used as the stride.
+ mov w8, #4*\h
+1:
+ // Working on 8x8 blocks, read every other entry from eob_8x32
+ ldrh w12, [x13], #4
+ ld1 {v16.4s, v17.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v18.4s, v19.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v20.4s, v21.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v22.4s, v23.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v24.4s, v25.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v26.4s, v27.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v28.4s, v29.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+ ld1 {v30.4s, v31.4s}, [x2]
+ st1 {v0.4s, v1.4s}, [x2], x8
+
+.if \w == 8
+ // 8x32: narrow with a rounding shift right by 1.
+ sqrshrn v16.4h, v16.4s, #1
+ sqrshrn2 v16.8h, v17.4s, #1
+ sqrshrn v17.4h, v18.4s, #1
+ sqrshrn2 v17.8h, v19.4s, #1
+ sqrshrn v18.4h, v20.4s, #1
+ sqrshrn2 v18.8h, v21.4s, #1
+ sqrshrn v19.4h, v22.4s, #1
+ sqrshrn2 v19.8h, v23.4s, #1
+ sqrshrn v20.4h, v24.4s, #1
+ sqrshrn2 v20.8h, v25.4s, #1
+ sqrshrn v21.4h, v26.4s, #1
+ sqrshrn2 v21.8h, v27.4s, #1
+ sqrshrn v22.4h, v28.4s, #1
+ sqrshrn2 v22.8h, v29.4s, #1
+ sqrshrn v23.4h, v30.4s, #1
+ sqrshrn2 v23.8h, v31.4s, #1
+.else
+ // 32x8: plain saturating narrow.
+ sqxtn v16.4h, v16.4s
+ sqxtn2 v16.8h, v17.4s
+ sqxtn v17.4h, v18.4s
+ sqxtn2 v17.8h, v19.4s
+ sqxtn v18.4h, v20.4s
+ sqxtn2 v18.8h, v21.4s
+ sqxtn v19.4h, v22.4s
+ sqxtn2 v19.8h, v23.4s
+ sqxtn v20.4h, v24.4s
+ sqxtn2 v20.8h, v25.4s
+ sqxtn v21.4h, v26.4s
+ sqxtn2 v21.8h, v27.4s
+ sqxtn v22.4h, v28.4s
+ sqxtn2 v22.8h, v29.4s
+ sqxtn v23.4h, v30.4s
+ sqxtn2 v23.8h, v31.4s
+.endif
+
+ transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+
+ // The flags set here are consumed by the b.lt after load_add_store_8x8,
+ // so that macro has to leave the condition flags alone.
+ cmp w3, w12
+.if \w == 8
+ load_add_store_8x8 x0, x7, shiftbits=2
+.else
+ load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+ b.lt 9f
+.if \w == 8
+ // Rewind the 8 strides just consumed and advance 8 coefficients.
+ sub x2, x2, x8, lsl #3
+ add x2, x2, #4*8
+.else
+ // Back up 8 rows and step 8 pixels (16 bytes) to the right.
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #2*8
+.endif
+ b 1b
+
+9:
+ ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+// 32x32 dct/dct inverse transform, added to the destination.
+//
+// In:    x0 = destination pointer, x1 = destination stride
+//        x2 = coefficient pointer (32 bit coefficients)
+//        w3 = value compared against the eob_32x32 thresholds
+//             (NOTE(review): presumably the eob - confirm against caller)
+// Stack: 2048 bytes (32x32 int16 intermediates)
+// NOTE(review): idct_dc presumably handles the DC-only case - confirm
+// against its definition.
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1
+ idct_dc 32, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #2048
+ movrel x13, eob_32x32
+ ldrh w12, [x13], #2
+
+ // First pass, one slice of 4 at a time. Once w3 is below the current
+ // threshold the remaining slices are not transformed but zero filled.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*32*2)
+.if \i > 0
+ // w8 = number of lines left to zero fill if we bail out here.
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ add x7, x2, #(\i*4)
+ mov x8, #32*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ // Each iteration clears 4 lines of 32 int16 (256 bytes).
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ // Second pass, one strip of 8 pixels at a time.
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #2048
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x15
+endfunc
+
+// 16x32 dct/dct inverse transform, added to the destination.
+//
+// In:    x0 = destination pointer, x1 = destination stride
+//        x2 = coefficient pointer (32 bit coefficients)
+//        w3 = value compared against the eob_16x32 thresholds
+//             (NOTE(review): presumably the eob - confirm against caller)
+// Stack: 1024 bytes (16x32 int16 intermediates)
+// NOTE(review): idct_dc presumably handles the DC-only case - confirm
+// against its definition.
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1
+ idct_dc 16, 32, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+ movrel x13, eob_16x32
+ ldrh w12, [x13], #2
+ // NOTE(review): x4 is presumably the transform that
+ // inv_txfm_horz_scale_16x4_neon calls - confirm against that function.
+ adr x4, inv_dct_4s_x16_neon
+
+ // First pass, one slice of 4 at a time. Once w3 is below the current
+ // threshold the remaining slices are not transformed but zero filled.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+ add x6, sp, #(\i*16*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ // w8 = number of lines left to zero fill if we bail out here.
+ mov w8, #(32 - \i)
+ cmp w3, w12
+ b.lt 1f
+.if \i < 28
+ ldrh w12, [x13], #2
+.endif
+.endif
+ mov x8, #4*32
+ bl inv_txfm_horz_scale_16x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ // Each iteration clears 4 lines of 16 int16 (128 bytes).
+ subs w8, w8, #4
+.rept 2
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ // Second pass, one strip of 8 pixels at a time.
+.irp i, 0, 8
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #16*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+ add sp, sp, #1024
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x15
+endfunc
+
+// 32x16 dct/dct inverse transform, added to the destination.
+//
+// In:    x0 = destination pointer, x1 = destination stride
+//        x2 = coefficient pointer (32 bit coefficients)
+//        w3 = value compared against the eob_16x32 thresholds
+//             (NOTE(review): presumably the eob - confirm against caller)
+// Stack: 1024 bytes (32x16 int16 intermediates)
+// NOTE(review): idct_dc presumably handles the DC-only case - confirm
+// against its definition.
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1
+ idct_dc 32, 16, 1
+
+ mov x15, x30
+ sub sp, sp, #1024
+
+ movrel x13, eob_16x32
+ // NOTE(review): x5 is presumably the transform that
+ // inv_txfm_add_vert_8x16_neon calls - confirm against that function.
+ movrel x5, X(inv_dct_8h_x16_neon)
+ ldrh w12, [x13], #2
+
+ // First pass, one slice of 4 at a time. Once w3 is below the current
+ // threshold the remaining slices are not transformed but zero filled.
+.irp i, 0, 4, 8, 12
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ // w8 = number of lines left to zero fill if we bail out here.
+ mov w8, #(16 - \i)
+ cmp w3, w12
+ b.lt 1f
+ ldrh w12, [x13], #2
+.endif
+ mov x8, #4*16
+ bl inv_txfm_horz_scale_dct_32x4_neon
+.endr
+ b 3f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+2:
+ // Each iteration clears 4 lines of 32 int16 (256 bytes).
+ subs w8, w8, #4
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+ b.gt 2b
+
+3:
+ // Second pass, one strip of 8 pixels at a time.
+.irp i, 0, 8, 16, 24
+ add x6, x0, #(\i*2)
+ add x7, sp, #(\i*2)
+ mov x8, #32*2
+ bl inv_txfm_add_vert_8x16_neon
+.endr
+
+ add sp, sp, #1024
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x15
+endfunc
+
+// 8x32 dct/dct inverse transform, added to the destination.
+//
+// In:    x0 = destination pointer, x1 = destination stride
+//        x2 = coefficient pointer (32 bit coefficients; the ones read are
+//             overwritten with zero)
+//        w3 = value compared against the eob_8x32 thresholds
+//             (NOTE(review): presumably the eob - confirm against caller)
+// Stack: 512 bytes (8x32 int16 intermediates)
+// NOTE(review): idct_dc presumably handles the DC-only case - confirm
+// against its definition.
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+ idct_dc 8, 32, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ movrel x13, eob_8x32
+
+ // v28 stays zero throughout the first pass: it clears the coefficients
+ // and is reused by the zero fill below, so inv_dct_4s_x8_neon must not
+ // touch it.
+ movi v28.4s, #0
+ mov x8, #4*32
+ // w9 = number of lines not yet produced by the first pass.
+ mov w9, #32
+ mov x6, sp
+ mov x7, x2
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().4s}, [x7]
+ st1 {v28.4s}, [x7], x8
+.endr
+ ldrh w12, [x13], #2
+ sub w9, w9, #4
+ // Rewind the 8 strides and advance 4 coefficients to the next slice.
+ sub x7, x7, x8, lsl #3
+ add x7, x7, #4*4
+
+ bl inv_dct_4s_x8_neon
+
+ // Narrow to 16 bit with a rounding shift right by 2.
+ sqrshrn v16.4h, v16.4s, #2
+ sqrshrn v17.4h, v17.4s, #2
+ sqrshrn v18.4h, v18.4s, #2
+ sqrshrn v19.4h, v19.4s, #2
+ sqrshrn2 v16.8h, v20.4s, #2
+ sqrshrn2 v17.8h, v21.4s, #2
+ sqrshrn2 v18.8h, v22.4s, #2
+ sqrshrn2 v19.8h, v23.4s, #2
+
+ transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
+
+ // The store does not alter the flags that b.ge consumes.
+ cmp w3, w12
+ st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
+
+ b.ge 1b
+ cbz w9, 3f
+
+ // Zero fill the lines that were skipped.
+ movi v29.8h, #0
+ movi v30.8h, #0
+ movi v31.8h, #0
+2:
+ subs w9, w9, #4
+ st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+ b.gt 2b
+
+3:
+ // Second pass over the single strip of 8 pixels.
+ mov x6, x0
+ mov x7, sp
+ mov x8, #8*2
+ bl inv_txfm_add_vert_dct_8x32_neon
+
+ add sp, sp, #512
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x15
+endfunc
+
+// 32x8 dct/dct inverse transform, added to the destination.
+//
+// In:    x0 = destination pointer, x1 = destination stride
+//        x2 = coefficient pointer (32 bit coefficients)
+//        w3 = value compared against the threshold 10
+//             (NOTE(review): presumably the eob - confirm against caller)
+// Stack: 512 bytes (32x8 int16 intermediates)
+// NOTE(review): idct_dc presumably handles the DC-only case - confirm
+// against its definition.
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+ idct_dc 32, 8, 2
+
+ mov x15, x30
+ sub sp, sp, #512
+
+ // First pass: two slices of 4; the second one is zero filled instead of
+ // transformed when w3 < 10.
+.irp i, 0, 4
+ add x6, sp, #(\i*32*2)
+ add x7, x2, #(\i*4)
+.if \i > 0
+ cmp w3, #10
+ b.lt 1f
+.endif
+ mov x8, #8*4
+ bl inv_txfm_horz_dct_32x4_neon
+.endr
+ b 2f
+
+1:
+ movi v4.8h, #0
+ movi v5.8h, #0
+ movi v6.8h, #0
+ movi v7.8h, #0
+.rept 4
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+ // Second pass: w9 = column of the current 8 pixel wide strip.
+ mov x8, #2*32
+ mov w9, #0
+1:
+ add x6, x0, x9, lsl #1
+ add x7, sp, x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x7], x8
+.endr
+ add w9, w9, #8
+
+ bl X(inv_dct_8h_x8_neon)
+
+ // The flags set here are consumed by the b.lt after load_add_store_8x8,
+ // so that macro has to leave the condition flags alone.
+ cmp w9, #32
+
+ load_add_store_8x8 x6, x7
+
+ b.lt 1b
+
+ add sp, sp, #512
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x15
+endfunc
+
+// First step of the odd half of a 64 point inverse DCT, handling one group
+// of four inputs per call.
+//
+// In:    v16-v19 = four input vectors (.4s)
+//        x17     = coefficient pointer; 48 bytes are consumed per call
+//        x6      = output pointer; 8 vectors (128 bytes) are written
+// Out:   x6 and x17 advanced as described above
+// Clobb: v0, v1, v2, v4, v6, v16-v31
+// NOTE(review): mul_mla/mul_mls are defined elsewhere and may touch more
+// registers than their destination operand - confirm.
+function inv_dct64_step1_neon
+ // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+ // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+ // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+ // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+ ld1 {v0.4s, v1.4s}, [x17], #32
+
+ // Each input is multiplied by two different coefficients with sqrdmulh.
+ sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a
+ sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a
+ sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a
+ sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a
+ sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a
+ sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a
+ sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a
+ sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a
+
+ ld1 {v0.4s}, [x17], #16
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t33
+ sqsub v26.4s, v19.4s, v18.4s // t34
+ sqadd v27.4s, v19.4s, v18.4s // t35
+ sqadd v28.4s, v20.4s, v21.4s // t60
+ sqsub v29.4s, v20.4s, v21.4s // t61
+ sqsub v30.4s, v23.4s, v22.4s // t62
+ sqadd v31.4s, v23.4s, v22.4s // t63
+
+ mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
+ mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ neg v2.4s, v2.4s // t34a
+ mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
+ srshr v26.4s, v2.4s, #12 // t34a
+ mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
+ srshr v29.4s, v4.4s, #12 // t61a
+ srshr v25.4s, v6.4s, #12 // t33a
+ srshr v30.4s, v2.4s, #12 // t62a
+
+ sqadd v16.4s, v24.4s, v27.4s // t32a
+ sqsub v19.4s, v24.4s, v27.4s // t35a
+ sqadd v17.4s, v25.4s, v26.4s // t33
+ sqsub v18.4s, v25.4s, v26.4s // t34
+ sqsub v20.4s, v31.4s, v28.4s // t60a
+ sqadd v23.4s, v31.4s, v28.4s // t63a
+ sqsub v21.4s, v30.4s, v29.4s // t61
+ sqadd v22.4s, v30.4s, v29.4s // t62
+
+ mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
+ mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
+ srshr v21.4s, v2.4s, #12 // t61a
+ srshr v18.4s, v4.4s, #12 // t34a
+ mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
+ srshr v20.4s, v6.4s, #12 // t60
+ srshr v19.4s, v2.4s, #12 // t35
+
+ // Store the 8 results for this group.
+ st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64
+ st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64
+
+ ret
+endfunc
+
+// Second step of the odd half of a 64 point inverse DCT, working in place
+// on the buffer written by inv_dct64_step1_neon.
+//
+// In:    x6 = pointer to the low end of the buffer
+//        x9 = pointer near its high end; the two pointers move towards
+//             each other by one vector per iteration
+// Out:   buffer updated in place; x6 and x9 have met
+// Clobb: x16, v0, v2, v4, v6, v16-v31
+// NOTE(review): mul_mla/mul_mls are defined elsewhere and may touch more
+// registers than their destination operand - confirm.
+function inv_dct64_step2_neon
+ movrel x16, idct_coeffs
+ ld1 {v0.4s}, [x16]
+1:
+ // t32a/33/34a/35/60/61a/62/63a
+ // t56a/57/58a/59/36/37a/38/39a
+ // t40a/41/42a/43/52/53a/54/55a
+ // t48a/49/50a/51/44/45a/46/47a
+ ldr q16, [x6, #4*4*0] // t32a
+ ldr q17, [x9, #4*4*8] // t39a
+ ldr q18, [x9, #4*4*0] // t63a
+ ldr q19, [x6, #4*4*8] // t56a
+ ldr q20, [x6, #4*4*16] // t40a
+ ldr q21, [x9, #4*4*24] // t47a
+ ldr q22, [x9, #4*4*16] // t55a
+ ldr q23, [x6, #4*4*24] // t48a
+
+ sqadd v24.4s, v16.4s, v17.4s // t32
+ sqsub v25.4s, v16.4s, v17.4s // t39
+ sqadd v26.4s, v18.4s, v19.4s // t63
+ sqsub v27.4s, v18.4s, v19.4s // t56
+ sqsub v28.4s, v21.4s, v20.4s // t40
+ sqadd v29.4s, v21.4s, v20.4s // t47
+ sqadd v30.4s, v23.4s, v22.4s // t48
+ sqsub v31.4s, v23.4s, v22.4s // t55
+
+ mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
+ mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
+ srshr v25.4s, v2.4s, #12 // t56a
+ srshr v27.4s, v4.4s, #12 // t39a
+ neg v6.4s, v6.4s // t40a
+ mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
+ srshr v31.4s, v6.4s, #12 // t40a
+ srshr v28.4s, v2.4s, #12 // t55a
+
+ sqadd v16.4s, v24.4s, v29.4s // t32a
+ sqsub v19.4s, v24.4s, v29.4s // t47a
+ sqadd v17.4s, v27.4s, v31.4s // t39
+ sqsub v18.4s, v27.4s, v31.4s // t40
+ sqsub v20.4s, v26.4s, v30.4s // t48a
+ sqadd v23.4s, v26.4s, v30.4s // t63a
+ sqsub v21.4s, v25.4s, v28.4s // t55
+ sqadd v22.4s, v25.4s, v28.4s // t56
+
+ mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
+ mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
+ srshr v18.4s, v2.4s, #12 // t40a
+ srshr v21.4s, v4.4s, #12 // t55a
+ mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
+ srshr v19.4s, v6.4s, #12 // t47
+ srshr v20.4s, v2.4s, #12 // t48
+
+ // Store the results over the same eight slots that were loaded above.
+ str q16, [x6, #4*4*0] // t32a
+ str q17, [x9, #4*4*0] // t39
+ str q18, [x6, #4*4*8] // t40a
+ str q19, [x9, #4*4*8] // t47
+ str q20, [x6, #4*4*16] // t48
+ str q21, [x9, #4*4*16] // t55a
+ str q22, [x6, #4*4*24] // t56
+ str q23, [x9, #4*4*24] // t63a
+
+ // Move the two pointers one vector closer; done once they have met.
+ add x6, x6, #4*4
+ sub x9, x9, #4*4
+ cmp x6, x9
+ b.lt 1b
+ ret
+endfunc
+
+// Loads v16-v23 (.4s) from \src, advancing \src by \strd after each vector.
+// When \clear is nonzero, every vector that was read is overwritten in
+// memory with \zero.
+.macro load8 src, strd, zero, clear
+.if \clear
+.irp reg, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\reg\().4s}, [\src]
+ st1 {\zero}, [\src], \strd
+.endr
+.else
+.irp reg, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\reg\().4s}, [\src], \strd
+.endr
+.endif
+.endm
+
+// Stores v16-v31 (.4s) contiguously to \dst, advancing \dst by 16 bytes
+// per vector.
+.macro store16 dst
+.irp reg, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ st1 {v\reg\().4s}, [\dst], #16
+.endr
+.endm
+
+// Sets v24-v31 to zero.
+.macro clear_upper8
+.irp reg, 24, 25, 26, 27, 28, 29, 30, 31
+ movi v\reg\().4s, #0
+.endr
+.endm
+
+// Emits "movi \reg, \val" only if \cond is nonzero at assembly time.
+.macro movi_if reg, val, cond
+.if \cond
+ movi \reg, \val
+.endif
+.endm
+
+// If \cond is nonzero at assembly time: loads \val << 16 into \gpr and
+// broadcasts it to the lanes of \reg. Emits nothing otherwise.
+.macro movz16dup_if reg, gpr, val, cond
+.if \cond
+ movz \gpr, \val, lsl #16
+ dup \reg, \gpr
+.endif
+.endm
+
+// Emits "st1 \regs, \dst" only if \cond is nonzero at assembly time.
+.macro st1_if regs, dst, cond
+.if \cond
+ st1 \regs, \dst
+.endif
+.endm
+
+// Emits "str \reg, \dst" only if \cond is nonzero at assembly time.
+.macro str_if reg, dst, cond
+.if \cond
+ str \reg, \dst
+.endif
+.endm
+
+// Like str_if, for an address operand that itself contains a comma: the
+// two halves of e.g. "[x10, x8]" arrive as \dst and \dstoff and are joined
+// again here.
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+ str \reg, \dst, \dstoff
+.endif
+.endm
+
+// Invokes scale_input on up to eight .4s registers with the coefficient \c,
+// only if \cond is nonzero at assembly time.
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+ scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
+.endif
+.endm
+
+// Defines inv_txfm_dct<suffix>_4s_x64_neon: a 64 point inverse DCT over four
+// 32 bit lanes that only reads the first 32 inputs.
+//
+// Macro parameters:
+//   suffix - appended to "inv_txfm_dct" in the function name
+//   clear  - if nonzero, every input vector read is overwritten with zero
+//   scale  - if nonzero, inputs are scaled with scale_input using the
+//            constant 2896*8 << 16
+//
+// Function registers:
+//   In:    x7 = input pointer, x8 = input stride in bytes
+//          sp = scratch/output area; 64 vectors of 16 bytes (1024 bytes)
+//               starting at sp are written, so the caller must have
+//               reserved them
+//   Clobb: x6-x11, x14 (holds the return address), x16, x17, v0-v7, v16-v31
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4s_x64_neon
+ mov x14, x30
+ mov x6, sp
+ // Inputs 0, 4, 8, ... feed the 16 point DCT, whose upper eight inputs
+ // are set to zero.
+ lsl x8, x8, #2
+
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ add x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct_4s_x16_neon
+
+ store16 x6
+
+ // Inputs 2, 6, 10, ... feed the odd half of the 32 point DCT.
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ // Uses the same .4s arrangement as the other three clears of v7.
+ movi_if v7.4s, #0, \clear
+ load8 x7, x8, v7.4s, \clear
+ clear_upper8
+ sub x7, x7, x8, lsl #3
+ lsr x8, x8, #1
+ sub x7, x7, x8, lsr #1
+ scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+ bl inv_dct32_odd_4s_x16_neon
+
+ // x6 walks up from the first stored vector, x10 walks down from 16
+ // vectors above it.
+ add x10, x6, #16*15
+ sub x6, x6, #16*16
+
+ mov x9, #-16
+
+// Combine four stored even-half vectors with four odd-half registers: the
+// sums go back to the slots just read through x6, the differences are
+// stored through x10, which walks downwards.
+.macro store_addsub r0, r1, r2, r3
+ ld1 {v2.4s}, [x6], #16
+ ld1 {v3.4s}, [x6], #16
+ sqadd v6.4s, v2.4s, \r0
+ sqsub \r0, v2.4s, \r0
+ ld1 {v4.4s}, [x6], #16
+ sqadd v7.4s, v3.4s, \r1
+ sqsub \r1, v3.4s, \r1
+ ld1 {v5.4s}, [x6], #16
+ sqadd v2.4s, v4.4s, \r2
+ sub x6, x6, #16*4
+ sqsub \r2, v4.4s, \r2
+ st1 {v6.4s}, [x6], #16
+ st1 {\r0}, [x10], x9
+ sqadd v3.4s, v5.4s, \r3
+ sqsub \r3, v5.4s, \r3
+ st1 {v7.4s}, [x6], #16
+ st1 {\r1}, [x10], x9
+ st1 {v2.4s}, [x6], #16
+ st1 {\r2}, [x10], x9
+ st1 {v3.4s}, [x6], #16
+ st1 {\r3}, [x10], x9
+.endm
+ store_addsub v31.4s, v30.4s, v29.4s, v28.4s
+ store_addsub v27.4s, v26.4s, v25.4s, v24.4s
+ store_addsub v23.4s, v22.4s, v21.4s, v20.4s
+ store_addsub v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+ // Skip the upper 16 of the 32 vectors just written; the step1 calls
+ // below append their output after them.
+ add x6, x6, #4*4*16
+
+ // The odd inputs are handled in four groups of four by
+ // inv_dct64_step1_neon. The "offset" comments count odd inputs.
+ movrel x17, idct64_coeffs
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x9, x7, x8, lsl #4 // offset 16
+ add x10, x7, x8, lsl #3 // offset 8
+ sub x9, x9, x8 // offset 15
+ sub x11, x10, x8 // offset 7
+ ld1 {v16.4s}, [x7] // in1 (offset 0)
+ ld1 {v17.4s}, [x9] // in31 (offset 15)
+ ld1 {v18.4s}, [x10] // in17 (offset 8)
+ ld1 {v19.4s}, [x11] // in15 (offset 7)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ add x7, x7, x8, lsl #2 // offset 4
+ sub x9, x9, x8, lsl #2 // offset 11
+ sub x10, x7, x8 // offset 3
+ add x11, x9, x8 // offset 12
+ ld1 {v16.4s}, [x10] // in7 (offset 3)
+ ld1 {v17.4s}, [x11] // in25 (offset 12)
+ ld1 {v18.4s}, [x9] // in23 (offset 11)
+ ld1 {v19.4s}, [x7] // in9 (offset 4)
+ st1_if {v7.4s}, [x7], \clear
+ st1_if {v7.4s}, [x9], \clear
+ st1_if {v7.4s}, [x10], \clear
+ st1_if {v7.4s}, [x11], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ sub x10, x10, x8, lsl #1 // offset 1
+ sub x9, x9, x8, lsl #1 // offset 9
+ add x7, x7, x8 // offset 5
+ add x11, x11, x8 // offset 13
+ ldr q16, [x10, x8] // in5 (offset 2)
+ ldr q17, [x11] // in27 (offset 13)
+ ldr q18, [x9, x8] // in21 (offset 10)
+ ldr q19, [x7] // in11 (offset 5)
+ stroff_if q7, [x10, x8], \clear
+ str_if q7, [x11], \clear
+ stroff_if q7, [x9, x8], \clear
+ str_if q7, [x7], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+ movz16dup_if v0.2s, w16, #2896*8, \scale
+ movi_if v7.4s, #0, \clear
+ ldr q16, [x10] // in3 (offset 1)
+ ldr q17, [x11, x8] // in29 (offset 14)
+ ldr q18, [x9] // in19 (offset 9)
+ ldr q19, [x7, x8] // in13 (offset 6)
+ str_if q7, [x10], \clear
+ stroff_if q7, [x11, x8], \clear
+ str_if q7, [x9], \clear
+ stroff_if q7, [x7, x8], \clear
+ scale_if \scale, v0.s[0], v16, v17, v18, v19
+ bl inv_dct64_step1_neon
+
+ // Point x6 back at the start of the step1 output and x9 seven vectors
+ // above it, as inv_dct64_step2_neon walks them towards each other.
+ sub x6, x6, #4*4*32
+ add x9, x6, #4*4*7
+
+ bl inv_dct64_step2_neon
+
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x14
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+
+// Final butterfly of the horizontal 64 point DCT: combines the two halves
+// held in the scratch area at sp, shifts, narrows to 16 bit and writes four
+// output rows of 64 int16.
+//
+// In:    sp  = scratch area with 64 vectors of 16 bytes
+//        x6  = output pointer (row stride is 2*64 bytes)
+//        w12 = per-lane shift count handed to srshl
+//              (NOTE(review): presumably negative, i.e. a rounding right
+//              shift - confirm against the callers)
+// Out:   x6 advanced by 64 bytes
+// Clobb: x7-x11, x14 (holds the return address), v0-v5, v7, v16-v31
+function inv_txfm_horz_dct_64x4_neon
+ mov x14, x30
+
+ // x7 walks up from the start of the scratch area, x8 walks down from
+ // its last four vectors; x6/x9 are the matching output positions.
+ mov x7, sp
+ add x8, sp, #4*4*(64 - 4)
+ add x9, x6, #2*56
+ mov x10, #2*64
+ mov x11, #-4*4*4
+
+ dup v7.4s, w12
+1:
+ ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+ ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+ ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+ ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+ transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
+ transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
+ transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5
+ transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5
+
+// One output row: the sums go to the left part (x6), the differences, in
+// reversed order, to the mirrored right part (x9).
+.macro store_addsub src0, src1, src2, src3
+ sqsub v1.4s, \src0, \src1
+ sqadd v0.4s, \src0, \src1
+ sqsub v3.4s, \src2, \src3
+ srshl v1.4s, v1.4s, v7.4s
+ sqadd v2.4s, \src2, \src3
+ srshl v3.4s, v3.4s, v7.4s
+ srshl v0.4s, v0.4s, v7.4s
+ srshl v2.4s, v2.4s, v7.4s
+ sqxtn v3.4h, v3.4s
+ sqxtn2 v3.8h, v1.4s
+ sqxtn v0.4h, v0.4s
+ sqxtn2 v0.8h, v2.4s
+ rev64 v3.8h, v3.8h
+ st1 {v0.8h}, [x6], x10
+ st1 {v3.8h}, [x9], x10
+.endm
+ store_addsub v16.4s, v31.4s, v20.4s, v27.4s
+ store_addsub v17.4s, v30.4s, v21.4s, v26.4s
+ store_addsub v18.4s, v29.4s, v22.4s, v25.4s
+ store_addsub v19.4s, v28.4s, v23.4s, v24.4s
+.purgem store_addsub
+ // Back up the four rows and move both output positions 8 elements
+ // towards the middle.
+ sub x6, x6, x10, lsl #2
+ sub x9, x9, x10, lsl #2
+ add x6, x6, #16
+ sub x9, x9, #16
+
+ cmp x7, x8
+ b.lt 1b
+ // Return through the saved link register; ret (rather than br) marks the
+ // branch as a function return, pairing it with the caller's bl.
+ ret x14
+endfunc
+
+// Final butterfly of the vertical 64-point pass for an 8 pixel wide column
+// strip, added to the destination.
+//
+// Reads 64 vectors of 8 x int16 from the scratch area at sp, walking it from
+// both ends. Sums go to rows counted from the top of the strip, differences
+// to the mirrored rows counted from the bottom. Every result is rounded with
+// a right shift by 4, added to the existing pixels and clamped to
+// [0, 0x3ff].
+// NOTE(review): the scratch area is presumably filled by the preceding
+// X(inv_txfm_dct_8h_x64_neon) call - confirm.
+//
+// In:   sp  = scratch buffer, 2*8*64 bytes
+//       x6  = destination, top row of the strip
+//       x1  = destination stride in bytes
+//       x30 = return address
+// Out:  x6 advanced by 32 rows
+// Clobbers: x7-x11, x14, v0-v7, v16-v31, condition flags
+function inv_txfm_add_vert_dct_8x64_neon
+        mov             x14, x30
+
+        mov             x7,  sp                   // front cursor, moves up
+        add             x8,  sp,  #2*8*(64 - 4)   // back cursor, starts at the last 4 vectors
+        add             x9,  x6,  x1, lsl #6
+        sub             x9,  x9,  x1              // x9 = x6 + 63*stride: bottom row
+        neg             x10, x1                   // stride for walking upwards
+        mov             x11, #-2*8*4              // back cursor step: 4 vectors backwards
+
+        // Clamp bounds; nothing in the loop writes v6/v7, so build them once.
+        movi            v6.8h,   #0
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+
+1:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+// Handles two rows from the top (via x6) and the two mirrored rows from the
+// bottom (via x9). All four destination rows are loaded first; the pointers
+// are then rewound by one row and the results stored with post-increment,
+// leaving x6 two rows further down and x9 two rows further up.
+// \src0 and \src2 are overwritten with the differences.
+.macro add_dest_addsub src0, src1, src2, src3
+        ld1             {v0.8h}, [x6], x1
+        ld1             {v1.8h}, [x9], x10
+        sqadd           v4.8h,   \src0,   \src1
+        ld1             {v2.8h}, [x6]
+        sqsub           \src0,   \src0,   \src1
+        ld1             {v3.8h}, [x9]
+        sqadd           v5.8h,   \src2,   \src3
+        sqsub           \src2,   \src2,   \src3
+        sub             x6,  x6,  x1
+        sub             x9,  x9,  x10
+        srshr           v4.8h,   v4.8h,   #4
+        srshr           v5.8h,   v5.8h,   #4
+        srshr           \src0,   \src0,   #4
+        sqadd           v0.8h,   v0.8h,   v4.8h
+        srshr           \src2,   \src2,   #4
+        sqadd           v1.8h,   v1.8h,   \src0
+        sqadd           v2.8h,   v2.8h,   v5.8h
+        smax            v0.8h,   v0.8h,   v6.8h
+        sqadd           v3.8h,   v3.8h,   \src2
+        smax            v1.8h,   v1.8h,   v6.8h
+        smin            v0.8h,   v0.8h,   v7.8h
+        smax            v2.8h,   v2.8h,   v6.8h
+        smin            v1.8h,   v1.8h,   v7.8h
+        st1             {v0.8h}, [x6], x1
+        smax            v3.8h,   v3.8h,   v6.8h
+        smin            v2.8h,   v2.8h,   v7.8h
+        st1             {v1.8h}, [x9], x10
+        smin            v3.8h,   v3.8h,   v7.8h
+        st1             {v2.8h}, [x6], x1
+        st1             {v3.8h}, [x9], x10
+.endm
+        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
+        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
+        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
+        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
+.purgem add_dest_addsub
+        // Stop once the cursors have crossed. These are addresses, so use
+        // the unsigned condition.
+        cmp             x7,  x8
+        b.lo            1b
+
+        // Callers arrive via bl; ret keeps the return predictor balanced and
+        // is not an indirect branch that would need a BTI landing pad.
+        ret             x14
+endfunc
+
+// Lower sp by the assemble-time constant \space.
+//
+// A single "sub sp, sp, #imm" only encodes a 12-bit immediate, optionally
+// shifted left by 12, so sizes of 4096 and above are split into a multiple
+// of 4096 plus a remainder.
+//
+// On Windows the stack is committed through a guard page, so sp must not
+// move past a page that has not been touched. For sizes above 4096 the page
+// in between is probed with a load before sp is moved the rest of the way;
+// this clobbers x16. Only one intermediate probe is emitted, so sizes above
+// 8192 are rejected at assembly time rather than silently skipping a page.
+//
+// \space is parenthesised wherever it is used, so that an argument written
+// as a sum (e.g. 64*32*2+64*4*4) is evaluated as a whole.
+.macro sub_sp space
+#ifdef _WIN32
+.if (\space) > 8192
+        // Two or more pages would need to be touched while decrementing sp.
+        .error          "sub_sp doesn't support values over 8K on Windows"
+.elseif (\space) > 4096
+        sub             x16, sp,  #4096
+        ldr             xzr, [x16]
+        sub             sp,  x16, #((\space) - 4096)
+.else
+        sub             sp,  sp,  #(\space)
+.endif
+#else
+.if (\space) >= 4096
+        sub             sp,  sp,  #(\space)/4096*4096
+.if ((\space) % 4096) != 0
+        sub             sp,  sp,  #(\space)%4096
+.endif
+.else
+        sub             sp,  sp,  #(\space)
+.endif
+#endif
+.endm
+
+// Inverse 64x64 DCT_DCT transform, added to the destination.
+//
+// Register use, as read from the code below:
+//   x0 = destination, x1 = destination stride (consumed by
+//        inv_txfm_add_vert_dct_8x64_neon)
+//   x2 = coefficient buffer, w3 = eob
+// NOTE(review): idct_dc is defined outside this view; presumably it handles
+// the DC-only case and returns early - confirm.
+//
+// Stack layout: 64*4*4 bytes of scratch at sp for the 64-point helpers,
+// followed by a 32 row x 64 column int16 intermediate buffer addressed
+// through x5.
+// Clobbers x4-x16 style temporaries used by the helpers; x15 holds the
+// return address for the whole function.
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+        idct_dc         64,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4
+
+        movrel          x13, eob_32x32
+
+        // First pass, four intermediate rows per step. x12 serves two
+        // purposes: it carries the shift into the helpers, and between steps
+        // it holds the next eob threshold read from the table at x13.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)           // rows left, used by the zero fill at 1:
+        cmp             w3,  w12
+        b.lt            1f                        // eob below threshold: the rest is zero
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        // Zero the remaining w8 intermediate rows. Each pass of the loop
+        // writes 4*64 bytes, i.e. two rows of 64 int16.
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        // Second pass, eight columns per step: 64-point transform of the
+        // column strip, then add it to the destination.
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #64*32*2        // release both stack areas
+        // Return with ret rather than br: pairs with the caller's bl for
+        // return prediction and is not treated as an indirect jump by BTI.
+        ret             x15
+endfunc
+
+// Inverse 64x32 (64 wide, 32 high) DCT_DCT transform, added to the
+// destination.
+//
+// Register use, as read from the code below:
+//   x0 = destination, x2 = coefficient buffer, w3 = eob
+//   x1 is not touched here. NOTE(review): presumably the destination stride
+//   consumed by inv_txfm_add_vert_dct_8x32_neon, which is outside this
+//   view - confirm.
+// NOTE(review): idct_dc is defined outside this view; presumably it handles
+// the DC-only case and returns early - confirm.
+//
+// Stack layout: 64*4*4 bytes of scratch at sp for the 64-point helpers,
+// followed by a 32 row x 64 column int16 intermediate buffer addressed
+// through x5.
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+        idct_dc         64,  32,  1
+
+        mov             x15, x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4
+
+        movrel          x13, eob_32x32
+
+        // First pass, four intermediate rows per step, using the scaling
+        // variant of the 64-point transform. x12 carries the shift into the
+        // helpers and holds the next eob threshold between steps.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)           // rows left, used by the zero fill at 1:
+        cmp             w3,  w12
+        b.lt            1f                        // eob below threshold: the rest is zero
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-1 // shift
+        bl              inv_txfm_dct_clear_scale_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        // Zero the remaining w8 intermediate rows. Each pass of the loop
+        // writes 4*64 bytes, i.e. two rows of 64 int16.
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        // Second pass, eight columns per step.
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2)
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  x5,  #64*32*2        // release both stack areas
+        // Return with ret rather than br: pairs with the caller's bl for
+        // return prediction and is not treated as an indirect jump by BTI.
+        ret             x15
+endfunc
+
+// Inverse 32x64 (32 wide, 64 high) DCT_DCT transform, added to the
+// destination.
+//
+// Register use, as read from the code below:
+//   x0 = destination, x1 = destination stride (consumed by
+//        inv_txfm_add_vert_dct_8x64_neon)
+//   x2 = coefficient buffer, w3 = eob
+// NOTE(review): idct_dc is defined outside this view; presumably it handles
+// the DC-only case and returns early - confirm.
+//
+// Stack layout: 64*8*2 bytes of scratch at sp for the vertical 64-point
+// helpers, followed by a 32 row x 32 column int16 intermediate buffer
+// addressed through x5.
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+        idct_dc         32,  64,  1
+
+        mov             x15, x30
+
+        sub_sp          32*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2            // first eob threshold
+
+        // First pass, four intermediate rows per step.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*32*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)           // rows left, used by the zero fill at 1:
+        cmp             w3,  w12
+        b.lt            1f                        // eob below threshold: the rest is zero
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        // Zero the remaining w8 intermediate rows. Each pass of the loop
+        // writes 4*64 bytes, i.e. four rows of 32 int16.
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        // Second pass, eight columns per step: 64-point transform of the
+        // column strip, then add it to the destination.
+.irp i, 0, 8, 16, 24
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #32*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #32*32*2        // release both stack areas
+        // Return with ret rather than br: pairs with the caller's bl for
+        // return prediction and is not treated as an indirect jump by BTI.
+        ret             x15
+endfunc
+
+// Inverse 64x16 (64 wide, 16 high) DCT_DCT transform, added to the
+// destination.
+//
+// Register use, as read from the code below:
+//   x0 = destination, x2 = coefficient buffer, w3 = eob
+//   x1 is not touched here. NOTE(review): presumably the destination stride
+//   consumed by inv_txfm_add_vert_8x16_neon, which is outside this view -
+//   confirm.
+// NOTE(review): idct_dc is defined outside this view; presumably it handles
+// the DC-only case and returns early - confirm.
+//
+// Stack layout: 64*4*4 bytes of scratch at sp for the 64-point helpers,
+// followed by a 16 row x 64 column int16 intermediate buffer addressed
+// through x4.
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+        idct_dc         64,  16,  2
+
+        mov             x15, x30
+
+        sub_sp          64*16*2+64*4*4
+        add             x4,  sp, #64*4*4
+
+        movrel          x13, eob_16x32
+
+        // First pass, four intermediate rows per step. x12 carries the shift
+        // into the helpers and holds the next eob threshold between steps.
+.irp i, 0, 4, 8, 12
+        add             x6,  x4,  #(\i*64*2)
+.if \i > 0
+        mov             w8,  #(16 - \i)           // rows left, used by the zero fill at 1:
+        cmp             w3,  w12
+        b.lt            1f                        // eob below threshold: the rest is zero
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #16*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x4,  #(\i*64*2)
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+        ldrh            w12, [x13], #2
+.endif
+.endr
+        b               3f
+
+1:
+        // Zero the remaining w8 intermediate rows. Each pass of the loop
+        // writes 4*64 bytes, i.e. two rows of 64 int16.
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        // Second pass, eight columns per step.
+        // NOTE(review): x5 presumably tells inv_txfm_add_vert_8x16_neon
+        // which 16-point transform to call - confirm against that helper.
+        movrel          x5,  X(inv_dct_8h_x16_neon)
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2)
+        add             x7,  x4,  #(\i*2)
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  x4,  #64*16*2        // release both stack areas
+        // Return with ret rather than br: pairs with the caller's bl for
+        // return prediction and is not treated as an indirect jump by BTI.
+        ret             x15
+endfunc
+
+// Inverse 16x64 (16 wide, 64 high) DCT_DCT transform, added to the
+// destination.
+//
+// Register use, as read from the code below:
+//   x0 = destination, x1 = destination stride (consumed by
+//        inv_txfm_add_vert_dct_8x64_neon)
+//   x2 = coefficient buffer, w3 = eob
+// NOTE(review): idct_dc is defined outside this view; presumably it handles
+// the DC-only case and returns early - confirm.
+//
+// Stack layout: 64*8*2 bytes of scratch at sp for the vertical 64-point
+// helpers, followed by a 32 row x 16 column int16 intermediate buffer
+// addressed through x5.
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+        idct_dc         16,  64,  2
+
+        mov             x15, x30
+
+        sub_sp          16*32*2+64*8*2
+        add             x5,  sp, #64*8*2
+
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2            // first eob threshold
+
+        // First pass, four intermediate rows per step.
+        // NOTE(review): x4 presumably tells inv_txfm_horz_16x4_neon which
+        // 16-point transform to call - confirm against that helper.
+        adr             x4,  inv_dct_4s_x16_neon
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*16*2)
+.if \i > 0
+        mov             w8,  #(32 - \i)           // rows left, used by the zero fill at 1:
+        cmp             w3,  w12
+        b.lt            1f                        // eob below threshold: the rest is zero
+        ldrh            w12, [x13], #2
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+
+1:
+        // Zero the remaining w8 intermediate rows. Each pass of the loop
+        // writes 2*64 bytes, i.e. four rows of 16 int16.
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        // Second pass, eight columns per step: 64-point transform of the
+        // column strip, then add it to the destination.
+.irp i, 0, 8
+        add             x7,  x5,  #(\i*2)
+        mov             x8,  #16*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2)
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #16*32*2        // release both stack areas
+        // Return with ret rather than br: pairs with the caller's bl for
+        // return prediction and is not treated as an indirect jump by BTI.
+        ret             x15
+endfunc
--- a/src/arm/64/util.S
+++ b/src/arm/64/util.S
@@ -170,6 +170,18 @@
trn2 \r3\().2s, \t5\().2s, \t7\().2s
.endm
+.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().4s, \r0\().4s, \r1\().4s // t4 = {r0[0], r1[0], r0[2], r1[2]}
+ trn2 \t5\().4s, \r0\().4s, \r1\().4s // t5 = {r0[1], r1[1], r0[3], r1[3]}
+ trn1 \t6\().4s, \r2\().4s, \r3\().4s // t6 = {r2[0], r3[0], r2[2], r3[2]}
+ trn2 \t7\().4s, \r2\().4s, \r3\().4s // t7 = {r2[1], r3[1], r2[3], r3[3]}
+ // Pair up the 64-bit halves to finish the 4x4 transpose of the 32-bit elements in r0-r3 (in place; t4-t7 are clobbered).
+ trn1 \r0\().2d, \t4\().2d, \t6\().2d // r0 = {r0[0], r1[0], r2[0], r3[0]}
+ trn2 \r2\().2d, \t4\().2d, \t6\().2d // r2 = {r0[2], r1[2], r2[2], r3[2]}
+ trn1 \r1\().2d, \t5\().2d, \t7\().2d // r1 = {r0[1], r1[1], r2[1], r3[1]}
+ trn2 \r3\().2d, \t5\().2d, \t7\().2d // r3 = {r0[3], r1[3], r2[3], r3[3]}
+.endm
+
.macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7
trn1 \t4\().8h, \r0\().8h, \r1\().8h
trn2 \t5\().8h, \r0\().8h, \r1\().8h
--- a/src/arm/itx_init_tmpl.c
+++ b/src/arm/itx_init_tmpl.c
@@ -117,7 +117,9 @@
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
-#if BITDEPTH == 8 && ARCH_AARCH64
+ if (bpc > 10) return;
+
+#if ARCH_AARCH64
assign_itx17_fn( , 4, 4, neon);
assign_itx16_fn(R, 4, 8, neon);
assign_itx16_fn(R, 4, 16, neon);
--- a/src/meson.build
+++ b/src/meson.build
@@ -102,6 +102,8 @@
)
if host_machine.cpu_family() == 'aarch64'
libdav1d_sources += files(
+ # itx.S is used for both 8 and 16 bpc.
+ 'arm/64/itx.S',
'arm/64/looprestoration_common.S',
'arm/64/msac.S',
)
@@ -110,7 +112,6 @@
libdav1d_sources += files(
'arm/64/cdef.S',
'arm/64/ipred.S',
- 'arm/64/itx.S',
'arm/64/loopfilter.S',
'arm/64/looprestoration.S',
'arm/64/mc.S',
@@ -121,6 +122,7 @@
libdav1d_sources += files(
'arm/64/cdef16.S',
'arm/64/ipred16.S',
+ 'arm/64/itx16.S',
'arm/64/loopfilter16.S',
'arm/64/looprestoration16.S',
'arm/64/mc16.S',