ref: 33e65d80de3e3e17c11d6bc6a8da25bcca099962
parent: 4504ae3f469f4beb1c9f0c1c703c5778cc0f32d1
author: Martin Storsjö <martin@martin.st>
date: Thu Jan 2 02:58:56 EST 2020
arm64: itx: Adjust .irp in the 4x16/16x4/8x16/16x8 functions. Don't use the \() token concatenation operator in the .irp loops: if the function definition is enclosed in a .macro, \() cannot be used inside the loop, because it is already expanded when the macro is expanded, before the loop itself is expanded.
--- a/src/arm/64/itx.S
+++ b/src/arm/64/itx.S
@@ -1473,8 +1473,8 @@
mov x15, x30
movi v4.8h, #0
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().4h}, [x2]
+.irp i, v16.4h, v17.4h, v18.4h, v19.4h, v20.4h, v21.4h, v22.4h, v23.4h, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ ld1 {\i}, [x2]
st1 {v4.4h}, [x2], #8
.endr
@@ -1484,8 +1484,8 @@
ins v17.d[1], v21.d[0]
ins v18.d[1], v22.d[0]
ins v19.d[1], v23.d[0]
-.irp i, 16, 17, 18, 19
- srshr v\i\().8h, v\i\().8h, #1
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
.endr
transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5
blr x5
@@ -1517,8 +1517,8 @@
b.lt 1f
add x6, x2, #16
-.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x6]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x6]
st1 {v2.8h}, [x6], x11
.endr
blr x4
@@ -1534,18 +1534,18 @@
b 2f
1:
-.irp i, 24, 25, 26, 27, 28, 29, 30, 31
- movi v\i\().4h, #0
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h, v28.4h, v29.4h, v30.4h, v31.4h
+ movi \i, #0
.endr
2:
movi v2.8h, #0
-.irp i, 16, 17, 18, 19
- ld1 {v\i\().8h}, [x2]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ ld1 {\i}, [x2]
st1 {v2.8h}, [x2], x11
.endr
blr x4
-.irp i, 16, 17, 18, 19
- srshr v\i\().8h, v\i\().8h, #1
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h
+ srshr \i, \i, #1
.endr
transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7
ins v20.d[0], v16.d[1]
@@ -1606,8 +1606,8 @@
mov w16, #2896*8
dup v0.4h, w16
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
- ld1 {v\i\().8h}, [x2]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ ld1 {\i}, [x2]
st1 {v4.8h}, [x2], #16
.endr
@@ -1615,8 +1615,8 @@
scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
blr x4
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- srshr v\i\().8h, v\i\().8h, #1
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
.endr
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
@@ -1655,8 +1655,8 @@
b.lt 1f
add x6, x2, #16
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x6]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x6]
st1 {v4.8h}, [x6], x11
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
@@ -1675,8 +1675,8 @@
b 2f
1:
-.irp i, 24, 25, 26, 27, 28, 29, 30, 31
- movi v\i\().8h, #0
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+ movi \i, #0
.endr
2:
@@ -1684,15 +1684,15 @@
mov w16, #2896*8
dup v0.4h, w16
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- ld1 {v\i\().8h}, [x2]
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ ld1 {\i}, [x2]
st1 {v4.8h}, [x2], x11
.endr
scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
blr x4
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
- srshr v\i\().8h, v\i\().8h, #1
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
+ srshr \i, \i, #1
.endr
transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v2, v3