ref: b37f87df78571df7bdc79a1cf207c00dc720f97c
dir: /src/arm/32/mc.S
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Janne Grunau
* Copyright © 2018, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro avg dst0, dst1, t0, t1, t2, t3
vld1.16 {\t0,\t1}, [r2, :128]!
vld1.16 {\t2,\t3}, [r3, :128]!
vadd.i16 \t0, \t0, \t2
vadd.i16 \t1, \t1, \t3
vqrshrun.s16 \dst0, \t0, #5
vqrshrun.s16 \dst1, \t1, #5
.endm
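// w_avg: weighted average of the two intermediates; bidir_fn sets up
// q15 = -(weight << 11) so that vqdmulh computes ((tmp1 - tmp2) * weight) >> 4,
// giving dst ~= clip_u8((tmp1*weight + tmp2*(16-weight) + 128) >> 8).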
.macro w_avg dst0, dst1, t0, t1, t2, t3
vld1.16 {\t0,\t1}, [r2, :128]!
vld1.16 {\t2,\t3}, [r3, :128]!
vsub.i16 \t0, \t2, \t0
vsub.i16 \t1, \t3, \t1
vqdmulh.s16 \t0, \t0, q15
vqdmulh.s16 \t1, \t1, q15
vadd.i16 \t0, \t2, \t0
vadd.i16 \t1, \t3, \t1
vqrshrun.s16 \dst0, \t0, #4
vqrshrun.s16 \dst1, \t1, #4
.endm
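// mask: per-pixel blend of the two intermediates. The 6-bit mask bytes are
// read from [lr], scaled by q15 = -2 and widened so that vqdmulh applies them
// as ((tmp1 - tmp2) * m) >> 6, giving dst ~= clip_u8((tmp1*m + tmp2*(64-m) + 512) >> 10).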
.macro mask dst0, dst1, t0, t1, t2, t3
vld1.8 {q14}, [lr, :128]!
vld1.16 {\t0,\t1}, [r2, :128]!
vmul.i8 q14, q14, q15
vld1.16 {\t2,\t3}, [r3, :128]!
vshll.i8 q13, d28, #8
vshll.i8 q14, d29, #8
vsub.i16 \t0, \t2, \t0
vsub.i16 \t1, \t3, \t1
vqdmulh.s16 \t0, \t0, q13
vqdmulh.s16 \t1, \t1, q14
vadd.i16 \t0, \t2, \t0
vadd.i16 \t1, \t3, \t1
vqrshrun.s16 \dst0, \t0, #4
vqrshrun.s16 \dst1, \t1, #4
.endm
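// Emits \type\()_8bpc_neon(dst, dst_stride, tmp1, tmp2, w, h, ...):
// r0-r3 hold the first four arguments, while w, h (plus the weight for w_avg
// and the mask pointer for mask) come from the stack. Rows are processed via
// the jump table below, indexed by clz(w)-24.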
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
push {r4-r6,lr}
ldr r4, [sp, #16]
ldr r5, [sp, #20]
clz r4, r4
.ifnc \type, avg
ldr lr, [sp, #24]
.endif
.ifc \type, w_avg
vdup.s16 q15, lr
vneg.s16 q15, q15
vshl.i16 q15, q15, #11
.endif
.ifc \type, mask
vmov.i8 q15, #256-2
.endif
adr r12, L(\type\()_tbl)
sub r4, r4, #24
ldr r4, [r12, r4, lsl #2]
\type d16, d17, q0, q1, q2, q3
add r12, r12, r4
bx r12
.align 2
L(\type\()_tbl):
.word 1280f - L(\type\()_tbl) + CONFIG_THUMB
.word 640f - L(\type\()_tbl) + CONFIG_THUMB
.word 320f - L(\type\()_tbl) + CONFIG_THUMB
.word 160f - L(\type\()_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_tbl) + CONFIG_THUMB
.word 4f - L(\type\()_tbl) + CONFIG_THUMB
4:
add r6, r0, r1
lsl r1, r1, #1
cmp r5, #4
vst1.32 {d16[0]}, [r0, :32], r1
vst1.32 {d16[1]}, [r6, :32], r1
vst1.32 {d17[0]}, [r0, :32], r1
vst1.32 {d17[1]}, [r6, :32], r1
beq 0f
\type d18, d19, q0, q1, q2, q3
cmp r5, #8
vst1.32 {d18[0]}, [r0, :32], r1
vst1.32 {d18[1]}, [r6, :32], r1
vst1.32 {d19[0]}, [r0, :32], r1
vst1.32 {d19[1]}, [r6, :32], r1
beq 0f
\type d16, d17, q0, q1, q2, q3
vst1.32 {d16[0]}, [r0, :32], r1
vst1.32 {d16[1]}, [r6, :32], r1
\type d18, d19, q0, q1, q2, q3
vst1.32 {d17[0]}, [r0, :32], r1
vst1.32 {d17[1]}, [r6, :32], r1
vst1.32 {d18[0]}, [r0, :32], r1
vst1.32 {d18[1]}, [r6, :32], r1
vst1.32 {d19[0]}, [r0, :32], r1
vst1.32 {d19[1]}, [r6, :32], r1
pop {r4-r6,pc}
80:
add r6, r0, r1
lsl r1, r1, #1
8:
vst1.8 {d16}, [r0, :64], r1
\type d18, d19, q0, q1, q2, q3
vst1.8 {d17}, [r6, :64], r1
vst1.8 {d18}, [r0, :64], r1
subs r5, r5, #4
vst1.8 {d19}, [r6, :64], r1
ble 0f
\type d16, d17, q0, q1, q2, q3
b 8b
160:
add r6, r0, r1
lsl r1, r1, #1
16:
\type d18, d19, q0, q1, q2, q3
vst1.8 {q8}, [r0, :128], r1
\type d20, d21, q0, q1, q2, q3
vst1.8 {q9}, [r6, :128], r1
\type d22, d23, q0, q1, q2, q3
vst1.8 {q10}, [r0, :128], r1
subs r5, r5, #4
vst1.8 {q11}, [r6, :128], r1
ble 0f
\type d16, d17, q0, q1, q2, q3
b 16b
320:
add r6, r0, r1
lsl r1, r1, #1
32:
\type d18, d19, q0, q1, q2, q3
\type d20, d21, q0, q1, q2, q3
vst1.8 {q8, q9}, [r0, :128], r1
\type d22, d23, q0, q1, q2, q3
subs r5, r5, #2
vst1.8 {q10, q11}, [r6, :128], r1
ble 0f
\type d16, d17, q0, q1, q2, q3
b 32b
640:
add r6, r0, #32
64:
\type d18, d19, q0, q1, q2, q3
\type d20, d21, q0, q1, q2, q3
\type d22, d23, q0, q1, q2, q3
vst1.8 {q8, q9}, [r0, :128], r1
\type d16, d17, q0, q1, q2, q3
vst1.8 {q10, q11}, [r6, :128], r1
\type d18, d19, q0, q1, q2, q3
\type d20, d21, q0, q1, q2, q3
vst1.8 {q8, q9}, [r0, :128], r1
\type d22, d23, q0, q1, q2, q3
subs r5, r5, #2
vst1.8 {q10, q11}, [r6, :128], r1
ble 0f
\type d16, d17, q0, q1, q2, q3
b 64b
1280:
sub r1, r1, #32
add r6, r0, #64
128:
\type d18, d19, q0, q1, q2, q3
\type d20, d21, q0, q1, q2, q3
\type d22, d23, q0, q1, q2, q3
vst1.8 {q8, q9}, [r0, :128]!
\type d16, d17, q0, q1, q2, q3
vst1.8 {q10, q11}, [r0, :128], r1
\type d18, d19, q0, q1, q2, q3
\type d20, d21, q0, q1, q2, q3
vst1.8 {q8, q9}, [r6, :128]!
\type d22, d23, q0, q1, q2, q3
subs r5, r5, #1
vst1.8 {q10, q11}, [r6, :128], r1
ble 0f
\type d16, d17, q0, q1, q2, q3
b 128b
0:
pop {r4-r6,pc}
endfunc
.endm
bidir_fn avg
bidir_fn w_avg
bidir_fn mask
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r5,
// and assumes that r8 is set to (clz(w)-24).
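// It performs a plain, unfiltered copy of the block from src to dst.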
function put_neon
adr r9, L(put_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(put_tbl):
.word 1280f - L(put_tbl) + CONFIG_THUMB
.word 640f - L(put_tbl) + CONFIG_THUMB
.word 32f - L(put_tbl) + CONFIG_THUMB
.word 160f - L(put_tbl) + CONFIG_THUMB
.word 8f - L(put_tbl) + CONFIG_THUMB
.word 4f - L(put_tbl) + CONFIG_THUMB
.word 2f - L(put_tbl) + CONFIG_THUMB
2:
vld1.16 {d0[]}, [r2], r3
vld1.16 {d1[]}, [r2], r3
subs r5, r5, #2
vst1.16 {d0[0]}, [r0, :16], r1
vst1.16 {d1[0]}, [r0, :16], r1
bgt 2b
pop {r4-r11,pc}
4:
vld1.32 {d0[]}, [r2], r3
vld1.32 {d1[]}, [r2], r3
subs r5, r5, #2
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d1[0]}, [r0, :32], r1
bgt 4b
pop {r4-r11,pc}
8:
vld1.8 {d0}, [r2], r3
vld1.8 {d1}, [r2], r3
subs r5, r5, #2
vst1.8 {d0}, [r0, :64], r1
vst1.8 {d1}, [r0, :64], r1
bgt 8b
pop {r4-r11,pc}
160:
add r8, r0, r1
lsl r1, r1, #1
add r9, r2, r3
lsl r3, r3, #1
16:
vld1.8 {q0}, [r2], r3
vld1.8 {q1}, [r9], r3
subs r5, r5, #2
vst1.8 {q0}, [r0, :128], r1
vst1.8 {q1}, [r8, :128], r1
bgt 16b
pop {r4-r11,pc}
32:
vld1.8 {q0, q1}, [r2], r3
subs r5, r5, #1
vst1.8 {q0, q1}, [r0, :128], r1
bgt 32b
pop {r4-r11,pc}
640:
sub r1, r1, #32
sub r3, r3, #32
64:
vld1.8 {q0, q1}, [r2]!
vst1.8 {q0, q1}, [r0, :128]!
vld1.8 {q2, q3}, [r2], r3
subs r5, r5, #1
vst1.8 {q2, q3}, [r0, :128], r1
bgt 64b
pop {r4-r11,pc}
1280:
sub r1, r1, #96
sub r3, r3, #96
128:
vld1.8 {q8, q9}, [r2]!
vst1.8 {q8, q9}, [r0, :128]!
vld1.8 {q10, q11}, [r2]!
vst1.8 {q10, q11}, [r0, :128]!
vld1.8 {q12, q13}, [r2]!
vst1.8 {q12, q13}, [r0, :128]!
vld1.8 {q14, q15}, [r2], r3
subs r5, r5, #1
vst1.8 {q14, q15}, [r0, :128], r1
bgt 128b
pop {r4-r11,pc}
endfunc
// This has got the same signature as the put_8tap functions,
// assumes that the caller has loaded the h argument into r4,
// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
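// It widens the unfiltered source pixels to 16 bit, shifted left by 4 (the
// intermediate precision), and stores them tightly packed in the tmp buffer.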
function prep_neon
adr r9, L(prep_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(prep_tbl):
.word 1280f - L(prep_tbl) + CONFIG_THUMB
.word 640f - L(prep_tbl) + CONFIG_THUMB
.word 320f - L(prep_tbl) + CONFIG_THUMB
.word 160f - L(prep_tbl) + CONFIG_THUMB
.word 8f - L(prep_tbl) + CONFIG_THUMB
.word 4f - L(prep_tbl) + CONFIG_THUMB
4:
vld1.32 {d0[]}, [r1], r2
vld1.32 {d2[]}, [r1], r2
subs r4, r4, #2
vshll.u8 q0, d0, #4
vshll.u8 q1, d2, #4
vst1.16 {d1, d2}, [r0, :64]!
bgt 4b
pop {r4-r11,pc}
8:
vld1.8 {d0}, [r1], r2
vld1.8 {d2}, [r1], r2
subs r4, r4, #2
vshll.u8 q0, d0, #4
vshll.u8 q1, d2, #4
vst1.16 {q0, q1}, [r0, :128]!
bgt 8b
pop {r4-r11,pc}
160:
add r9, r1, r2
lsl r2, r2, #1
add r8, r0, r7
lsl r7, r7, #1
16:
vld1.8 {q2}, [r1], r2
vld1.8 {q3}, [r9], r2
subs r4, r4, #2
vshll.u8 q0, d4, #4
vshll.u8 q1, d5, #4
vshll.u8 q2, d6, #4
vshll.u8 q3, d7, #4
vst1.16 {q0, q1}, [r0, :128], r7
vst1.16 {q2, q3}, [r8, :128], r7
bgt 16b
pop {r4-r11,pc}
320:
add r8, r0, r3
32:
vld1.8 {q0, q1}, [r1], r2
subs r4, r4, #2
vshll.u8 q8, d0, #4
vshll.u8 q9, d1, #4
vld1.8 {q2, q3}, [r1], r2
vshll.u8 q10, d2, #4
vshll.u8 q11, d3, #4
vshll.u8 q12, d4, #4
vst1.16 {q8, q9}, [r0, :128], r7
vshll.u8 q13, d5, #4
vst1.16 {q10, q11}, [r8, :128], r7
vshll.u8 q14, d6, #4
vst1.16 {q12, q13}, [r0, :128], r7
vshll.u8 q15, d7, #4
vst1.16 {q14, q15}, [r8, :128], r7
bgt 32b
pop {r4-r11,pc}
640:
sub r2, r2, #32
add r8, r0, #32
mov r6, #64
64:
vld1.8 {q0, q1}, [r1]!
subs r4, r4, #1
vshll.u8 q8, d0, #4
vshll.u8 q9, d1, #4
vld1.8 {q2, q3}, [r1], r2
vshll.u8 q10, d2, #4
vshll.u8 q11, d3, #4
vshll.u8 q12, d4, #4
vst1.16 {q8, q9}, [r0, :128], r6
vshll.u8 q13, d5, #4
vshll.u8 q14, d6, #4
vst1.16 {q10, q11}, [r8, :128], r6
vshll.u8 q15, d7, #4
vst1.16 {q12, q13}, [r0, :128], r6
vst1.16 {q14, q15}, [r8, :128], r6
bgt 64b
pop {r4-r11,pc}
1280:
sub r2, r2, #96
add r8, r0, #32
mov r6, #64
128:
vld1.8 {q0, q1}, [r1]!
vld1.8 {q2, q3}, [r1]!
vshll.u8 q10, d0, #4
vshll.u8 q11, d1, #4
vshll.u8 q12, d2, #4
vshll.u8 q13, d3, #4
vshll.u8 q14, d4, #4
vshll.u8 q15, d5, #4
vld1.8 {q8, q9}, [r1]!
vst1.16 {q10, q11}, [r0, :128], r6
vst1.16 {q12, q13}, [r8, :128], r6
vshll.u8 q0, d6, #4
vshll.u8 q1, d7, #4
vshll.u8 q2, d16, #4
vshll.u8 q3, d17, #4
vshll.u8 q8, d18, #4
vshll.u8 q9, d19, #4
vld1.8 {q10, q11}, [r1], r2
vst1.16 {q14, q15}, [r0, :128], r6
vst1.16 {q0, q1}, [r8, :128], r6
vshll.u8 q12, d20, #4
vshll.u8 q13, d21, #4
vshll.u8 q14, d22, #4
vshll.u8 q15, d23, #4
subs r4, r4, #1
vst1.16 {q2, q3}, [r0, :128], r6
vst1.16 {q8, q9}, [r8, :128], r6
vst1.16 {q12, q13}, [r0, :128], r6
vst1.16 {q14, q15}, [r8, :128], r6
bgt 128b
pop {r4-r11,pc}
endfunc
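// Loaders for the vertical and hv filter paths: fetch 2-7 rows (whole
// registers, or replicated 16/32 bit lanes), alternating between the two row
// pointers \s0 and \s1 and advancing both by \strd.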
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
vld1.\wd {\d0[]}, [\s0], \strd
vld1.\wd {\d1[]}, [\s1], \strd
.ifnb \d2
vld1.\wd {\d2[]}, [\s0], \strd
vld1.\wd {\d3[]}, [\s1], \strd
.endif
.ifnb \d4
vld1.\wd {\d4[]}, [\s0], \strd
.endif
.ifnb \d5
vld1.\wd {\d5[]}, [\s1], \strd
.endif
.ifnb \d6
vld1.\wd {\d6[]}, [\s0], \strd
.endif
.endm
.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
vld1.8 {\d0}, [\s0], \strd
vld1.8 {\d1}, [\s1], \strd
.ifnb \d2
vld1.8 {\d2}, [\s0], \strd
vld1.8 {\d3}, [\s1], \strd
.endif
.ifnb \d4
vld1.8 {\d4}, [\s0], \strd
.endif
.ifnb \d5
vld1.8 {\d5}, [\s1], \strd
.endif
.ifnb \d6
vld1.8 {\d6}, [\s0], \strd
.endif
.endm
.macro load_16 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_slice \s0, \s1, \strd, 16, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
load_slice \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
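// Shift the leading pixels of the following row into each register, so that
// one register covers two consecutive rows for the 2 and 4 pixel wide
// vertical filters.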
.macro interleave_1_16 r0, r1, r2, r3, r4
vext.8 \r0, \r0, \r1, #6
vext.8 \r1, \r1, \r2, #6
.ifnb \r3
vext.8 \r2, \r2, \r3, #6
vext.8 \r3, \r3, \r4, #6
.endif
.endm
.macro interleave_1_32 r0, r1, r2, r3, r4
vext.8 \r0, \r0, \r1, #4
vext.8 \r1, \r1, \r2, #4
.ifnb \r3
vext.8 \r2, \r2, \r3, #4
vext.8 \r3, \r3, \r4, #4
.endif
.endm
.macro vmovl_u8 q0, d0, q1, d1, q2, d2, q3, d3, q4, d4, q5, d5, q6, d6
vmovl.u8 \q0, \d0
vmovl.u8 \q1, \d1
.ifnb \q2
vmovl.u8 \q2, \d2
vmovl.u8 \q3, \d3
.endif
.ifnb \q4
vmovl.u8 \q4, \d4
.endif
.ifnb \q5
vmovl.u8 \q5, \d5
.endif
.ifnb \q6
vmovl.u8 \q6, \d6
.endif
.endm
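// Multiply-accumulate helpers for the 4-tap and 8-tap filters; the
// coefficients are taken from d0 (and d1). The mul_mla_8_N variants produce
// two results, with the second one's source registers offset by N.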
.macro mul_mla_4 d, s0, s1, s2, s3
vmul.s16 \d, \s0, d0[0]
vmla.s16 \d, \s1, d0[1]
vmla.s16 \d, \s2, d0[2]
vmla.s16 \d, \s3, d0[3]
.endm
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
vmul.s16 \d0, \s0, d0[0]
vmla.s16 \d0, \s1, d0[1]
vmla.s16 \d0, \s2, d0[2]
vmla.s16 \d0, \s3, d0[3]
vmla.s16 \d0, \s4, d1[0]
vmla.s16 \d0, \s5, d1[1]
vmla.s16 \d0, \s6, d1[2]
vmla.s16 \d0, \s7, d1[3]
vmul.s16 \d1, \s1, d0[0]
vmla.s16 \d1, \s2, d0[1]
vmla.s16 \d1, \s3, d0[2]
vmla.s16 \d1, \s4, d0[3]
vmla.s16 \d1, \s5, d1[0]
vmla.s16 \d1, \s6, d1[1]
vmla.s16 \d1, \s7, d1[2]
vmla.s16 \d1, \s8, d1[3]
.endm
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
vmul.s16 \d0, \s0, d0[0]
vmla.s16 \d0, \s1, d0[1]
vmla.s16 \d0, \s2, d0[2]
vmla.s16 \d0, \s3, d0[3]
vmla.s16 \d0, \s4, d1[0]
vmla.s16 \d0, \s5, d1[1]
vmla.s16 \d0, \s6, d1[2]
vmla.s16 \d0, \s7, d1[3]
vmul.s16 \d1, \s2, d0[0]
vmla.s16 \d1, \s3, d0[1]
vmla.s16 \d1, \s4, d0[2]
vmla.s16 \d1, \s5, d0[3]
vmla.s16 \d1, \s6, d1[0]
vmla.s16 \d1, \s7, d1[1]
vmla.s16 \d1, \s8, d1[2]
vmla.s16 \d1, \s9, d1[3]
.endm
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
vmul.s16 \d0, \s0, d0[0]
vmla.s16 \d0, \s1, d0[1]
vmla.s16 \d0, \s2, d0[2]
vmla.s16 \d0, \s3, d0[3]
vmla.s16 \d0, \s4, d1[0]
vmla.s16 \d0, \s5, d1[1]
vmla.s16 \d0, \s6, d1[2]
vmla.s16 \d0, \s7, d1[3]
vmul.s16 \d1, \s4, d0[0]
vmla.s16 \d1, \s5, d0[1]
vmla.s16 \d1, \s6, d0[2]
vmla.s16 \d1, \s7, d0[3]
vmla.s16 \d1, \s8, d1[0]
vmla.s16 \d1, \s9, d1[1]
vmla.s16 \d1, \s10, d1[2]
vmla.s16 \d1, \s11, d1[3]
.endm
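// Rounding/narrowing and store helpers used by the shift_store_* macros
// below: put narrows to 8 bpc with a final rounding shift by 6, prep keeps
// 16 bit intermediates with a rounding shift by 2. Stores alternate between
// r0 and r8.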
.macro vqrshrun_s16 shift, q0, d0, q1, d1, q2, d2, q3, d3
vqrshrun.s16 \d0, \q0, #\shift
.ifnb \q1
vqrshrun.s16 \d1, \q1, #\shift
.endif
.ifnb \q2
vqrshrun.s16 \d2, \q2, #\shift
vqrshrun.s16 \d3, \q3, #\shift
.endif
.endm
.macro vrshr_s16 shift, r0, r1, r2, r3
vrshr.s16 \r0, \r0, #\shift
.ifnb \r1
vrshr.s16 \r1, \r1, #\shift
.endif
.ifnb \r2
vrshr.s16 \r2, \r2, #\shift
vrshr.s16 \r3, \r3, #\shift
.endif
.endm
.macro st_16 strd, reg, lanes
vst1.16 {\reg[0]}, [r0, :16], \strd
vst1.16 {\reg[1]}, [r8, :16], \strd
.if \lanes > 2
vst1.16 {\reg[2]}, [r0, :16], \strd
vst1.16 {\reg[3]}, [r8, :16], \strd
.endif
.endm
.macro st_32 strd, r0, r1
vst1.32 {\r0[0]}, [r0, :32], \strd
vst1.32 {\r0[1]}, [r8, :32], \strd
.ifnb \r1
vst1.32 {\r1[0]}, [r0, :32], \strd
vst1.32 {\r1[1]}, [r8, :32], \strd
.endif
.endm
.macro st_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
vst1.8 {\r0}, [r0, \align], \strd
vst1.8 {\r1}, [r8, \align], \strd
.ifnb \r2
vst1.8 {\r2}, [r0, \align], \strd
vst1.8 {\r3}, [r8, \align], \strd
.endif
.ifnb \r4
vst1.8 {\r4}, [r0, \align], \strd
vst1.8 {\r5}, [r8, \align], \strd
vst1.8 {\r6}, [r0, \align], \strd
vst1.8 {\r7}, [r8, \align], \strd
.endif
.endm
.macro shift_store_4 type, strd, q0, d0, d1, q1, d2, d3
.ifc \type, put
vqrshrun_s16 6, \q0, \d0, \q1, \d2
st_32 \strd, \d0, \d2
.else
vrshr_s16 2, \q0, \q1
st_reg \strd, :64, \d0, \d1, \d2, \d3
.endif
.endm
.macro shift_store_8 type, strd, q0, d0, q1, d1, q2, d2, q3, d3
.ifc \type, put
vqrshrun_s16 6, \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
st_reg \strd, :64, \d0, \d1, \d2, \d3
.else
vrshr_s16 2, \q0, \q1, \q2, \q3
st_reg \strd, :128,\q0, \q1, \q2, \q3
.endif
.endm
.macro shift_store_16 type, strd, q0, d0, d1, q1, q2, d4, d5, q3
.ifc \type, put
vqrshrun.s16 \d0, \q0, #6
vqrshrun.s16 \d1, \q1, #6
vqrshrun.s16 \d4, \q2, #6
vqrshrun.s16 \d5, \q3, #6
st_reg \strd, :128, \q0, \q2
.else
vrshr_s16 2, \q0, \q1, \q2, \q3
vst1.16 {\q0, \q1}, [r0, :128], \strd
vst1.16 {\q2, \q3}, [r8, :128], \strd
.endif
.endm
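// Thin entry points for each horizontal/vertical filter type combination;
// they load the packed filter type constants into r8/r9 and branch to the
// shared \op\()_8tap_neon implementation.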
.macro make_8tap_fn op, type, type_h, type_v
function \op\()_8tap_\type\()_8bpc_neon, export=1
push {r4-r11,lr}
movw r8, \type_h
movw r9, \type_v
b \op\()_8tap_neon
endfunc
.endm
// No spaces in these expressions, due to gas-preprocessor.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH ((1*15<<7)|4*15)
#define SHARP ((2*15<<7)|3*15)
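// Each constant packs two row offsets into X(mc_subpel_filters) (15 rows of
// 8 coefficients per filter type): bits 7-13 hold the filter set used for
// w > 4, bits 0-6 the 4-tap set used for w <= 4. Multiplying mx/my by 0x4081
// below replicates the subpel position into both fields (and into bits 14+,
// which are used to test whether there is a fractional offset at all).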
.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, ds2, sr2, shift_hv
make_8tap_fn \type, regular, REGULAR, REGULAR
make_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
make_8tap_fn \type, regular_sharp, REGULAR, SHARP
make_8tap_fn \type, smooth, SMOOTH, SMOOTH
make_8tap_fn \type, smooth_regular, SMOOTH, REGULAR
make_8tap_fn \type, smooth_sharp, SMOOTH, SHARP
make_8tap_fn \type, sharp, SHARP, SHARP
make_8tap_fn \type, sharp_regular, SHARP, REGULAR
make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH
function \type\()_8tap_neon
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
movw r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
mul \mx, \mx, r10
mul \my, \my, r10
add \mx, \mx, r8 // mx, 8tap_h, 4tap_h
add \my, \my, r9 // my, 8tap_v, 4tap_v
.ifc \type, prep
lsl \d_strd, \w, #1
.endif
clz r8, \w
tst \mx, #(0x7f << 14)
sub r8, r8, #24
movrel r10, X(mc_subpel_filters), -8
bne L(\type\()_8tap_h)
tst \my, #(0x7f << 14)
bne L(\type\()_8tap_v)
b \type\()_neon
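// Horizontal-only 8tap filtering; if my also has a fractional part, this
// branches on to the hv path further down.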
L(\type\()_8tap_h):
cmp \w, #4
ubfx r9, \mx, #7, #7
and \mx, \mx, #0x7f
it gt
movgt \mx, r9
tst \my, #(0x7f << 14)
add \mx, r10, \mx, lsl #3
bne L(\type\()_8tap_hv)
adr r9, L(\type\()_8tap_h_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(\type\()_8tap_h_tbl):
.word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
.word 640f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
.word 320f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
.word 160f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
.word 40f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
.word 20f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
20: // 2xN h
.ifc \type, put
add \mx, \mx, #2
vld1.32 {d0[]}, [\mx]
sub \src, \src, #1
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
vmovl.s8 q0, d0
2:
vld1.8 {d4}, [\src], \s_strd
vld1.8 {d6}, [\sr2], \s_strd
vmovl.u8 q2, d4
vmovl.u8 q3, d6
vext.8 d5, d4, d5, #2
vext.8 d7, d6, d7, #2
subs \h, \h, #2
vtrn.32 d4, d6
vtrn.32 d5, d7
vmul.s16 d2, d4, d0[0]
vmla.s16 d2, d5, d0[1]
vmla.s16 d2, d6, d0[2]
vmla.s16 d2, d7, d0[3]
vrshr.s16 d2, d2, #2
vqrshrun.s16 d2, q1, #4
vst1.16 {d2[0]}, [\dst, :16], \d_strd
vst1.16 {d2[1]}, [\ds2, :16], \d_strd
bgt 2b
pop {r4-r11,pc}
.endif
40: // 4xN h
add \mx, \mx, #2
vld1.32 {d0[]}, [\mx]
sub \src, \src, #1
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
vmovl.s8 q0, d0
4:
vld1.8 {d16}, [\src], \s_strd
vld1.8 {d24}, [\sr2], \s_strd
vmovl.u8 q8, d16
vmovl.u8 q12, d24
vext.8 q9, q8, q8, #2
vext.8 q10, q8, q8, #4
vext.8 q11, q8, q8, #6
vext.8 q13, q12, q12, #2
vext.8 q14, q12, q12, #4
vext.8 q15, q12, q12, #6
subs \h, \h, #2
vmul.s16 d4, d16, d0[0]
vmla.s16 d4, d18, d0[1]
vmla.s16 d4, d20, d0[2]
vmla.s16 d4, d22, d0[3]
vmul.s16 d5, d24, d0[0]
vmla.s16 d5, d26, d0[1]
vmla.s16 d5, d28, d0[2]
vmla.s16 d5, d30, d0[3]
vrshr.s16 q2, q2, #2
.ifc \type, put
vqrshrun.s16 d4, q2, #4
vst1.32 {d4[0]}, [\dst, :32], \d_strd
vst1.32 {d4[1]}, [\ds2, :32], \d_strd
.else
vst1.16 {d4}, [\dst, :64], \d_strd
vst1.16 {d5}, [\ds2, :64], \d_strd
.endif
bgt 4b
pop {r4-r11,pc}
80: // 8xN h
vld1.8 {d0}, [\mx]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
vmovl.s8 q0, d0
8:
vld1.8 {q8}, [\src], \s_strd
vld1.8 {q12}, [\sr2], \s_strd
vmovl.u8 q9, d17
vmovl.u8 q8, d16
vmovl.u8 q13, d25
vmovl.u8 q12, d24
vmul.s16 q10, q8, d0[0]
vmul.s16 q14, q12, d0[0]
.irpc i, 1234567
vext.8 q11, q8, q9, #(2*\i)
vext.8 q15, q12, q13, #(2*\i)
.if \i < 4
vmla.s16 q10, q11, d0[\i]
vmla.s16 q14, q15, d0[\i]
.else
vmla.s16 q10, q11, d1[\i-4]
vmla.s16 q14, q15, d1[\i-4]
.endif
.endr
subs \h, \h, #2
vrshr.s16 q10, q10, #2
vrshr.s16 q14, q14, #2
.ifc \type, put
vqrshrun.s16 d20, q10, #4
vqrshrun.s16 d28, q14, #4
vst1.8 {d20}, [\dst, :64], \d_strd
vst1.8 {d28}, [\ds2, :64], \d_strd
.else
vst1.16 {q10}, [\dst, :128], \d_strd
vst1.16 {q14}, [\ds2, :128], \d_strd
.endif
bgt 8b
pop {r4-r11,pc}
160:
320:
640:
1280: // 16xN, 32xN, ... h
// This could be done without touching q4-q6, by using only
// one temporary for vext in the loop. That's slower on A7 and A53
// (but surprisingly, marginally faster on A8 and A73).
vpush {q4-q6}
vld1.8 {d0}, [\mx]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
vmovl.s8 q0, d0
sub \s_strd, \s_strd, \w
sub \s_strd, \s_strd, #8
.ifc \type, put
lsl \d_strd, \d_strd, #1
sub \d_strd, \d_strd, \w
.endif
161:
vld1.8 {d16, d17, d18}, [\src]!
vld1.8 {d24, d25, d26}, [\sr2]!
mov \mx, \w
vmovl.u8 q10, d18
vmovl.u8 q9, d17
vmovl.u8 q8, d16
vmovl.u8 q14, d26
vmovl.u8 q13, d25
vmovl.u8 q12, d24
16:
vmul.s16 q1, q8, d0[0]
vmul.s16 q2, q9, d0[0]
vmul.s16 q3, q12, d0[0]
vmul.s16 q4, q13, d0[0]
.irpc i, 1234567
vext.8 q5, q8, q9, #(2*\i)
vext.8 q6, q9, q10, #(2*\i)
vext.8 q11, q12, q13, #(2*\i)
vext.8 q15, q13, q14, #(2*\i)
.if \i < 4
vmla.s16 q1, q5, d0[\i]
vmla.s16 q2, q6, d0[\i]
vmla.s16 q3, q11, d0[\i]
vmla.s16 q4, q15, d0[\i]
.else
vmla.s16 q1, q5, d1[\i-4]
vmla.s16 q2, q6, d1[\i-4]
vmla.s16 q3, q11, d1[\i-4]
vmla.s16 q4, q15, d1[\i-4]
.endif
.endr
vrshr.s16 q1, q1, #2
vrshr.s16 q2, q2, #2
vrshr.s16 q3, q3, #2
vrshr.s16 q4, q4, #2
subs \mx, \mx, #16
.ifc \type, put
vqrshrun.s16 d2, q1, #4
vqrshrun.s16 d3, q2, #4
vqrshrun.s16 d4, q3, #4
vqrshrun.s16 d5, q4, #4
vst1.8 {q1}, [\dst, :128]!
vst1.8 {q2}, [\ds2, :128]!
.else
vst1.16 {q1, q2}, [\dst, :128]!
vst1.16 {q3, q4}, [\ds2, :128]!
.endif
ble 9f
vmov q8, q10
vmov q12, q14
vld1.8 {d18, d19}, [\src]!
vld1.8 {d26, d27}, [\sr2]!
vmovl.u8 q10, d19
vmovl.u8 q9, d18
vmovl.u8 q14, d27
vmovl.u8 q13, d26
b 16b
9:
add \dst, \dst, \d_strd
add \ds2, \ds2, \d_strd
add \src, \src, \s_strd
add \sr2, \sr2, \s_strd
subs \h, \h, #2
bgt 161b
vpop {q4-q6}
pop {r4-r11,pc}
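// Vertical-only 8tap filtering (mx has no fractional part).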
L(\type\()_8tap_v):
cmp \h, #4
ubfx r9, \my, #7, #7
and \my, \my, #0x7f
it gt
movgt \my, r9
add \my, r10, \my, lsl #3
adr r9, L(\type\()_8tap_v_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(\type\()_8tap_v_tbl):
.word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
.word 640f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
.word 320f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
.word 160f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
.word 40f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
.word 20f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB
20: // 2xN v
.ifc \type, put
bgt 28f
cmp \h, #2
add \my, \my, #2
vld1.32 {d0[]}, [\my]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
// 2x2 v
load_16 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
interleave_1_16 d1, d2, d3, d4, d5
bgt 24f
vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
mul_mla_4 d6, d16, d18, d20, d22
vqrshrun_s16 6, q3, d6
st_16 \d_strd, d6, 2
pop {r4-r11,pc}
24: // 2x4 v
load_16 \sr2, \src, \s_strd, d6, d7
interleave_1_16 d5, d6, d7
vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5, q13, d6
vmov d17, d20
vmov d19, d22
vmov d21, d24
vmov d23, d26
mul_mla_4 q3, q8, q9, q10, q11
vqrshrun_s16 6, q3, d6
st_16 \d_strd, d6, 4
pop {r4-r11,pc}
28: // 2x8, 2x16 v
vpush {q4-q7}
vld1.8 {d0}, [\my]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
vmovl.s8 q0, d0
load_16 \src, \sr2, \s_strd, d2, d4, d6, d8, d10, d12, d14
interleave_1_16 d2, d4, d6, d8, d10
interleave_1_16 d10, d12, d14
vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q5, d10, q6, d12
vmov d3, d6
vmov d5, d8
vmov d7, d10
vmov d9, d12
216:
subs \h, \h, #8
load_16 \sr2, \src, \s_strd, d16, d18, d20, d22
load_16 \sr2, \src, \s_strd, d24, d26, d28, d30
interleave_1_16 d14, d16, d18, d20, d22
interleave_1_16 d22, d24, d26, d28, d30
vmovl_u8 q7, d14, q8, d16, q9, d18, q10, d20
vmovl_u8 q11, d22, q12, d24, q13, d26, q14, d28
vmov d11, d14
vmov d13, d16
vmov d15, d18
vmov d17, d20
vmov d19, d22
vmov d21, d24
vmov d23, d26
vmov d25, d28
mul_mla_8_4 q1, q2, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
vqrshrun_s16 6, q1, d2, q2, d4
st_16 \d_strd, d2, 4
st_16 \d_strd, d4, 4
ble 0f
vmov q1, q9
vmov q2, q10
vmov q3, q11
vmov q4, q12
vmov q5, q13
vmov q6, q14
vmov d14, d30
b 216b
0:
vpop {q4-q7}
pop {r4-r11,pc}
.endif
40:
bgt 480f
// 4x2, 4x4 v
cmp \h, #2
add \my, \my, #2
vld1.32 {d0[]}, [\my]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
load_32 \src, \sr2, \s_strd, d1, d2, d3, d4, d5
interleave_1_32 d1, d2, d3, d4, d5
vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4
mul_mla_4 q3, q8, q9, q10, q11
shift_store_4 \type, \d_strd, q3, d6, d7
ble 0f
load_32 \sr2, \src, \s_strd, d6, d7
interleave_1_32 d5, d6, d7
vmovl_u8 q12, d5, q13, d6
mul_mla_4 q3, q10, q11, q12, q13
shift_store_4 \type, \d_strd, q3, d6, d7
0:
pop {r4-r11,pc}
480: // 4x8, 4x16 v
vpush {q4}
vld1.8 {d0}, [\my]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
load_32 \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
interleave_1_32 d2, d4, d6
interleave_1_32 d6, d8, d16, d18, d20
vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18
48:
subs \h, \h, #4
load_32 \sr2, \src, \s_strd, d22, d24, d26, d28
interleave_1_32 d20, d22, d24, d26, d28
vmovl_u8 q10, d20, q11, d22, q12, d24, q13, d26
mul_mla_8_2 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12, q13
shift_store_4 \type, \d_strd, q1, d2, d3, q2, d4, d5
ble 0f
subs \h, \h, #4
load_32 \sr2, \src, \s_strd, d30, d2, d4, d6
interleave_1_32 d28, d30, d2, d4, d6
vmovl_u8 q14, d28, q15, d30, q1, d2, q2, d4
mul_mla_8_2 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1, q2
shift_store_4 \type, \d_strd, q8, d16, d17, q9, d18, d19
ble 0f
subs \h, \h, #4
load_32 \sr2, \src, \s_strd, d8, d16, d18, d20
interleave_1_32 d6, d8, d16, d18, d20
vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18
mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9
shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27
b 48b
0:
vpop {q4}
pop {r4-r11,pc}
80:
bgt 880f
// 8x2, 8x4 v
cmp \h, #2
add \my, \my, #2
vld1.32 {d0[]}, [\my]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
load_reg \src, \sr2, \s_strd, d1, d2, d3, d4, d5
vmovl_u8 q8, d1, q9, d2, q10, d3, q11, d4, q12, d5
mul_mla_4 q1, q8, q9, q10, q11
mul_mla_4 q2, q9, q10, q11, q12
shift_store_8 \type, \d_strd, q1, d2, q2, d4
ble 0f
load_reg \sr2, \src, \s_strd, d6, d7
vmovl_u8 q13, d6, q14, d7
mul_mla_4 q1, q10, q11, q12, q13
mul_mla_4 q2, q11, q12, q13, q14
shift_store_8 \type, \d_strd, q1, d2, q2, d4
0:
pop {r4-r11,pc}
880: // 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
1280:
vpush {q4}
vld1.8 {d0}, [\my]
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
vmovl.s8 q0, d0
mov \my, \h
168:
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
load_reg \src, \sr2, \s_strd, d2, d4, d6, d8, d16, d18, d20
vmovl_u8 q1, d2, q2, d4, q3, d6, q4, d8, q8, d16, q9, d18, q10, d20
88:
subs \h, \h, #2
load_reg \sr2, \src, \s_strd, d22, d24
vmovl_u8 q11, d22, q12, d24
mul_mla_8_1 q1, q2, q1, q2, q3, q4, q8, q9, q10, q11, q12
shift_store_8 \type, \d_strd, q1, d2, q2, d4
ble 9f
subs \h, \h, #2
load_reg \sr2, \src, \s_strd, d26, d28
vmovl_u8 q13, d26, q14, d28
mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14
shift_store_8 \type, \d_strd, q3, d6, q4, d8
ble 9f
subs \h, \h, #4
load_reg \sr2, \src, \s_strd, d30, d2, d4, d6
vmovl_u8 q15, d30, q1, d2, q2, d4, q3, d6
mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1
mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3
shift_store_8 \type, \d_strd, q8, d16, q9, d18, q10, d20, q11, d22
ble 9f
subs \h, \h, #4
load_reg \sr2, \src, \s_strd, d8, d16, d18, d20
vmovl_u8 q4, d8, q8, d16, q9, d18, q10, d20
mul_mla_8_1 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8
mul_mla_8_1 q14, q15, q14, q15, q1, q2, q3, q4, q8, q9, q10
shift_store_8 \type, \d_strd, q12, d24, q13, d26, q14, d28, q15, d30
bgt 88b
9:
subs \w, \w, #8
ble 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
mls \src, \s_strd, \my, \src
mls \dst, \d_strd, \my, \dst
sub \src, \src, \s_strd, lsl #3
mov \h, \my
add \src, \src, #8
.ifc \type, put
add \dst, \dst, #8
.else
add \dst, \dst, #16
.endif
b 168b
0:
vpop {q4}
pop {r4-r11,pc}
160:
bgt 1680b
// 16x2, 16x4 v
add \my, \my, #2
vld1.32 {d0[]}, [\my]
sub \src, \src, \s_strd
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
cmp \h, #2
load_reg \src, \sr2, \s_strd, q11, q12, q13, q14, q15
vmovl.u8 q1, d22
vmovl.u8 q2, d24
vmovl.u8 q3, d26
vmovl.u8 q8, d28
vmovl.u8 q9, d30
vmovl.u8 q11, d23
vmovl.u8 q12, d25
vmovl.u8 q13, d27
vmovl.u8 q14, d29
vmovl.u8 q15, d31
mul_mla_4 q1, q1, q2, q3, q8
mul_mla_4 q10, q2, q3, q8, q9
mul_mla_4 q2, q11, q12, q13, q14
mul_mla_4 q11, q12, q13, q14, q15
shift_store_16 \type, \d_strd, q1, d2, d3, q2, q10, d20, d21, q11
ble 0f
load_reg \sr2, \src, \s_strd, q10, q11
vmovl.u8 q1, d20
vmovl.u8 q10, d21
vmovl.u8 q12, d22
vmovl.u8 q11, d23
mul_mla_4 q2, q3, q8, q9, q1
mul_mla_4 q3, q13, q14, q15, q10
mul_mla_4 q13, q8, q9, q1, q12
mul_mla_4 q14, q14, q15, q10, q11
shift_store_16 \type, \d_strd, q2, d4, d5, q3, q13, d26, d27, q14
0:
pop {r4-r11,pc}
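// Combined horizontal + vertical filtering: rows are first filtered
// horizontally (with a >> 2 intermediate rounding shift), then run through
// the vertical filter at 32 bit and narrowed by \shift_hv.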
L(\type\()_8tap_hv):
cmp \h, #4
ubfx r9, \my, #7, #7
and \my, \my, #0x7f
it gt
movgt \my, r9
add \my, r10, \my, lsl #3
adr r9, L(\type\()_8tap_hv_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(\type\()_8tap_hv_tbl):
.word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
.word 640f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
.word 320f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
.word 160f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
.word 40f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
.word 20f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
20:
.ifc \type, put
add \mx, \mx, #2
vld1.32 {d0[]}, [\mx]
bgt 280f
add \my, \my, #2
vld1.32 {d2[]}, [\my]
// 2x2, 2x4 hv
sub \sr2, \src, #1
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
vmovl.s8 q1, d2
vld1.8 {d26}, [\src], \s_strd
vmovl.u8 q13, d26
vext.8 q14, q13, q13, #2
vmul.s16 d26, d26, d0
vmul.s16 d28, d28, d0
vpadd.s16 d26, d26, d28
vpadd.s16 d26, d26, d26
vrshr.s16 d16, d26, #2
bl L(\type\()_8tap_filter_2)
vext.8 d16, d16, d16, #4
vmov d17, d26
vext.8 d16, d16, d26, #4
2:
bl L(\type\()_8tap_filter_2)
vext.8 d18, d17, d26, #4
vmov d19, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
vmlal.s16 q2, d19, d2[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
subs \h, \h, #2
vst1.16 {d4[0]}, [\dst, :16], \d_strd
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
ble 0f
vmov d16, d18
vmov d17, d19
b 2b
280: // 2x8, 2x16, 2x32 hv
vld1.8 {d2}, [\my]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
vmovl.s8 q1, d2
vld1.8 {d26}, [\src], \s_strd
vmovl.u8 q13, d26
vext.8 q14, q13, q13, #2
vmul.s16 d26, d26, d0
vmul.s16 d28, d28, d0
vpadd.s16 d26, d26, d28
vpadd.s16 d26, d26, d26
vrshr.s16 d16, d26, #2
bl L(\type\()_8tap_filter_2)
vext.8 d16, d16, d16, #4
vmov d17, d26
vext.8 d16, d16, d26, #4
bl L(\type\()_8tap_filter_2)
vext.8 d18, d17, d26, #4
vmov d19, d26
bl L(\type\()_8tap_filter_2)
vext.8 d20, d19, d26, #4
vmov d21, d26
28:
bl L(\type\()_8tap_filter_2)
vext.8 d22, d21, d26, #4
vmov d23, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
vmlal.s16 q2, d19, d2[3]
vmlal.s16 q2, d20, d3[0]
vmlal.s16 q2, d21, d3[1]
vmlal.s16 q2, d22, d3[2]
vmlal.s16 q2, d23, d3[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
subs \h, \h, #2
vst1.16 {d4[0]}, [\dst, :16], \d_strd
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
ble 0f
vmov d16, d18
vmov d17, d19
vmov d18, d20
vmov d19, d21
vmov d20, d22
vmov d21, d23
b 28b
0:
pop {r4-r11,pc}
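// Applies the 4-tap horizontal filter to the next two input rows for the
// 2xN hv loops above; the filtered rows are returned in d26/d27.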
L(\type\()_8tap_filter_2):
vld1.8 {d28}, [\sr2], \s_strd
vld1.8 {d30}, [\src], \s_strd
vext.8 d29, d28, d28, #1
vext.8 d31, d30, d30, #1
vmovl.u8 q13, d28
vmovl.u8 q14, d29
vmov d27, d28
vmovl.u8 q14, d30
vmovl.u8 q15, d31
vtrn.32 d26, d28
vtrn.32 d27, d30
vmul.s16 d26, d26, d0[0]
vmla.s16 d26, d27, d0[1]
vmla.s16 d26, d28, d0[2]
vmla.s16 d26, d30, d0[3]
vrshr.s16 d26, d26, #2
vext.8 d27, d26, d26, #4
bx lr
.endif
40:
add \mx, \mx, #2
vld1.32 {d0[]}, [\mx]
bgt 480f
add \my, \my, #2
vld1.32 {d2[]}, [\my]
sub \sr2, \src, #1
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
vmovl.s8 q1, d2
// 4x2, 4x4 hv
vld1.8 {d30}, [\src], \s_strd
vmovl.u8 q14, d30
vext.8 d27, d28, d29, #2
vext.8 d30, d28, d29, #4
vext.8 d31, d28, d29, #6
vmul.s16 d26, d28, d0[0]
vmla.s16 d26, d27, d0[1]
vmla.s16 d26, d30, d0[2]
vmla.s16 d26, d31, d0[3]
vrshr.s16 d16, d26, #2
bl L(\type\()_8tap_filter_4)
vmov d17, d26
vmov d18, d27
4:
bl L(\type\()_8tap_filter_4)
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
vmlal.s16 q2, d26, d2[3]
vmull.s16 q3, d17, d2[0]
vmlal.s16 q3, d18, d2[1]
vmlal.s16 q3, d26, d2[2]
vmlal.s16 q3, d27, d2[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqrshrn.s32 d6, q3, #\shift_hv
subs \h, \h, #2
.ifc \type, put
vqmovun.s16 d4, q2
vqmovun.s16 d6, q3
vst1.32 {d4[0]}, [\dst, :32], \d_strd
vst1.32 {d6[0]}, [\ds2, :32], \d_strd
.else
vst1.16 {d4}, [\dst, :64], \d_strd
vst1.16 {d6}, [\ds2, :64], \d_strd
.endif
ble 0f
vmov d16, d18
vmov d17, d26
vmov d18, d27
b 4b
480: // 4x8, 4x16, 4x32 hv
vld1.8 {d2}, [\my]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vmovl.s8 q0, d0
vmovl.s8 q1, d2
vld1.8 {d30}, [\src], \s_strd
vmovl.u8 q14, d30
vext.8 d27, d28, d29, #2
vext.8 d30, d28, d29, #4
vext.8 d31, d28, d29, #6
vmul.s16 d26, d28, d0[0]
vmla.s16 d26, d27, d0[1]
vmla.s16 d26, d30, d0[2]
vmla.s16 d26, d31, d0[3]
vrshr.s16 d16, d26, #2
bl L(\type\()_8tap_filter_4)
vmov d17, d26
vmov d18, d27
bl L(\type\()_8tap_filter_4)
vmov d19, d26
vmov d20, d27
bl L(\type\()_8tap_filter_4)
vmov d21, d26
vmov d22, d27
48:
bl L(\type\()_8tap_filter_4)
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
vmlal.s16 q2, d19, d2[3]
vmlal.s16 q2, d20, d3[0]
vmlal.s16 q2, d21, d3[1]
vmlal.s16 q2, d22, d3[2]
vmlal.s16 q2, d26, d3[3]
vmull.s16 q3, d17, d2[0]
vmlal.s16 q3, d18, d2[1]
vmlal.s16 q3, d19, d2[2]
vmlal.s16 q3, d20, d2[3]
vmlal.s16 q3, d21, d3[0]
vmlal.s16 q3, d22, d3[1]
vmlal.s16 q3, d26, d3[2]
vmlal.s16 q3, d27, d3[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqrshrn.s32 d6, q3, #\shift_hv
subs \h, \h, #2
.ifc \type, put
vqmovun.s16 d4, q2
vqmovun.s16 d6, q3
vst1.32 {d4[0]}, [\dst, :32], \d_strd
vst1.32 {d6[0]}, [\ds2, :32], \d_strd
.else
vst1.16 {d4}, [\dst, :64], \d_strd
vst1.16 {d6}, [\ds2, :64], \d_strd
.endif
ble 0f
vmov d16, d18
vmov d17, d19
vmov d18, d20
vmov d19, d21
vmov d20, d22
vmov d21, d26
vmov d22, d27
b 48b
0:
pop {r4-r11,pc}
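// Applies the 4-tap horizontal filter to the next two input rows for the
// 4xN hv loops above, returning one filtered row each in d26 and d27.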
L(\type\()_8tap_filter_4):
vld1.8 {d30}, [\sr2], \s_strd
vld1.8 {d31}, [\src], \s_strd
vmovl.u8 q14, d30
vext.8 d27, d28, d29, #2
vext.8 d30, d28, d29, #4
vext.8 d1, d28, d29, #6
vmul.s16 d26, d28, d0[0]
vmla.s16 d26, d27, d0[1]
vmla.s16 d26, d30, d0[2]
vmla.s16 d26, d1, d0[3]
vmovl.u8 q14, d31
vext.8 d30, d28, d29, #2
vext.8 d31, d28, d29, #4
vext.8 d1, d28, d29, #6
vmul.s16 d27, d28, d0[0]
vmla.s16 d27, d30, d0[1]
vmla.s16 d27, d31, d0[2]
vmla.s16 d27, d1, d0[3]
vrshr.s16 d26, d26, #2
vrshr.s16 d27, d27, #2
bx lr
80:
160:
320:
bgt 880f
vpush {q4-q7}
add \my, \my, #2
vld1.8 {d0}, [\mx]
vld1.32 {d2[]}, [\my]
sub \src, \src, #3
sub \src, \src, \s_strd
vmovl.s8 q0, d0
vmovl.s8 q1, d2
mov \my, \h
164: // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
vld1.8 {q14}, [\src], \s_strd
vmovl.u8 q12, d28
vmovl.u8 q13, d29
vmul.s16 q10, q12, d0[0]
.irpc i, 123
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q10, q14, d0[\i]
.endr
.irpc i, 4567
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q10, q14, d1[\i-4]
.endr
vrshr.s16 q3, q10, #2
bl L(\type\()_8tap_filter_8)
vmov q4, q10
vmov q5, q11
8:
bl L(\type\()_8tap_filter_8)
vmull.s16 q12, d6, d2[0]
vmull.s16 q13, d7, d2[0]
vmull.s16 q14, d8, d2[0]
vmull.s16 q15, d9, d2[0]
vmlal.s16 q12, d8, d2[1]
vmlal.s16 q13, d9, d2[1]
vmlal.s16 q14, d10, d2[1]
vmlal.s16 q15, d11, d2[1]
vmlal.s16 q12, d10, d2[2]
vmlal.s16 q13, d11, d2[2]
vmlal.s16 q14, d20, d2[2]
vmlal.s16 q15, d21, d2[2]
vmlal.s16 q12, d20, d2[3]
vmlal.s16 q13, d21, d2[3]
vmlal.s16 q14, d22, d2[3]
vmlal.s16 q15, d23, d2[3]
vqrshrn.s32 d24, q12, #\shift_hv
vqrshrn.s32 d25, q13, #\shift_hv
vqrshrn.s32 d28, q14, #\shift_hv
vqrshrn.s32 d29, q15, #\shift_hv
subs \h, \h, #2
.ifc \type, put
vqmovun.s16 d24, q12
vqmovun.s16 d28, q14
vst1.8 {d24}, [\dst, :64], \d_strd
vst1.8 {d28}, [\ds2, :64], \d_strd
.else
vst1.16 {q12}, [\dst, :128], \d_strd
vst1.16 {q14}, [\ds2, :128], \d_strd
.endif
ble 9f
vmov q3, q5
vmov q4, q10
vmov q5, q11
b 8b
9:
subs \w, \w, #8
ble 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
mls \src, \s_strd, \my, \src
mls \dst, \d_strd, \my, \dst
sub \src, \src, \s_strd, lsl #2
mov \h, \my
add \src, \src, #8
.ifc \type, put
add \dst, \dst, #8
.else
add \dst, \dst, #16
.endif
b 164b
880: // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
vpush {q4-q7}
vld1.8 {d0}, [\mx]
vld1.8 {d2}, [\my]
sub \src, \src, #3
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
vmovl.s8 q0, d0
vmovl.s8 q1, d2
mov \my, \h
168:
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
vld1.8 {q14}, [\src], \s_strd
vmovl.u8 q12, d28
vmovl.u8 q13, d29
vmul.s16 q10, q12, d0[0]
.irpc i, 123
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q10, q14, d0[\i]
.endr
.irpc i, 4567
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q10, q14, d1[\i-4]
.endr
vrshr.s16 q3, q10, #2
bl L(\type\()_8tap_filter_8)
vmov q4, q10
vmov q5, q11
bl L(\type\()_8tap_filter_8)
vmov q6, q10
vmov q7, q11
bl L(\type\()_8tap_filter_8)
vmov q8, q10
vmov q9, q11
88:
bl L(\type\()_8tap_filter_8)
vmull.s16 q12, d6, d2[0]
vmull.s16 q13, d7, d2[0]
vmull.s16 q14, d8, d2[0]
vmull.s16 q15, d9, d2[0]
vmlal.s16 q12, d8, d2[1]
vmlal.s16 q13, d9, d2[1]
vmlal.s16 q14, d10, d2[1]
vmlal.s16 q15, d11, d2[1]
vmlal.s16 q12, d10, d2[2]
vmlal.s16 q13, d11, d2[2]
vmlal.s16 q14, d12, d2[2]
vmlal.s16 q15, d13, d2[2]
vmlal.s16 q12, d12, d2[3]
vmlal.s16 q13, d13, d2[3]
vmlal.s16 q14, d14, d2[3]
vmlal.s16 q15, d15, d2[3]
vmlal.s16 q12, d14, d3[0]
vmlal.s16 q13, d15, d3[0]
vmlal.s16 q14, d16, d3[0]
vmlal.s16 q15, d17, d3[0]
vmlal.s16 q12, d16, d3[1]
vmlal.s16 q13, d17, d3[1]
vmlal.s16 q14, d18, d3[1]
vmlal.s16 q15, d19, d3[1]
vmlal.s16 q12, d18, d3[2]
vmlal.s16 q13, d19, d3[2]
vmlal.s16 q14, d20, d3[2]
vmlal.s16 q15, d21, d3[2]
vmlal.s16 q12, d20, d3[3]
vmlal.s16 q13, d21, d3[3]
vmlal.s16 q14, d22, d3[3]
vmlal.s16 q15, d23, d3[3]
vqrshrn.s32 d24, q12, #\shift_hv
vqrshrn.s32 d25, q13, #\shift_hv
vqrshrn.s32 d28, q14, #\shift_hv
vqrshrn.s32 d29, q15, #\shift_hv
subs \h, \h, #2
.ifc \type, put
vqmovun.s16 d24, q12
vqmovun.s16 d28, q14
vst1.8 {d24}, [\dst, :64], \d_strd
vst1.8 {d28}, [\ds2, :64], \d_strd
.else
vst1.16 {q12}, [\dst, :128], \d_strd
vst1.16 {q14}, [\ds2, :128], \d_strd
.endif
ble 9f
vmov q3, q5
vmov q4, q6
vmov q5, q7
vmov q6, q8
vmov q7, q9
vmov q8, q10
vmov q9, q11
b 88b
9:
subs \w, \w, #8
ble 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
mls \src, \s_strd, \my, \src
mls \dst, \d_strd, \my, \dst
sub \src, \src, \s_strd, lsl #3
mov \h, \my
add \src, \src, #8
.ifc \type, put
add \dst, \dst, #8
.else
add \dst, \dst, #16
.endif
b 168b
0:
vpop {q4-q7}
pop {r4-r11,pc}
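// Applies the 8-tap horizontal filter to the next two 8-pixel rows for the
// hv loops above, returning one filtered row each in q10 and q11.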
L(\type\()_8tap_filter_8):
vld1.8 {q14}, [\sr2], \s_strd
vld1.8 {q15}, [\src], \s_strd
vmovl.u8 q12, d28
vmovl.u8 q13, d29
vmul.s16 q10, q12, d0[0]
.irpc i, 123
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q10, q14, d0[\i]
.endr
.irpc i, 4567
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q10, q14, d1[\i-4]
.endr
vmovl.u8 q12, d30
vmovl.u8 q13, d31
vmul.s16 q11, q12, d0[0]
.irpc i, 123
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q11, q14, d0[\i]
.endr
.irpc i, 4567
vext.8 q14, q12, q13, #(2*\i)
vmla.s16 q11, q14, d1[\i-4]
.endr
vrshr.s16 q10, q10, #2
vrshr.s16 q11, q11, #2
bx lr
endfunc
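// Bilinear (2-tap) filtering for put/prep: d0/d1 hold 16-mx/mx and d2/d3 hold
// 16-my/my as 8-bit weights; the hv path additionally widens the vertical
// weights into q2/q3.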
function \type\()_bilin_8bpc_neon, export=1
push {r4-r11,lr}
ldrd r4, r5, [sp, #36]
ldrd r6, r7, [sp, #44]
vdup.8 d1, \mx
vdup.8 d3, \my
rsb r8, \mx, #16
rsb r9, \my, #16
vdup.8 d0, r8
vdup.8 d2, r9
.ifc \type, prep
lsl \d_strd, \w, #1
.endif
clz r8, \w
cmp \mx, #0
sub r8, r8, #24
bne L(\type\()_bilin_h)
cmp \my, #0
bne L(\type\()_bilin_v)
b \type\()_neon
L(\type\()_bilin_h):
cmp \my, #0
bne L(\type\()_bilin_hv)
adr r9, L(\type\()_bilin_h_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(\type\()_bilin_h_tbl):
.word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
.word 640f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
.word 320f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
.word 160f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
.word 40f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
.word 20f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
20: // 2xN h
.ifc \type, put
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
2:
vld1.32 {d4[]}, [\src], \s_strd
vld1.32 {d6[]}, [\sr2], \s_strd
vext.8 d5, d4, d4, #1
vext.8 d7, d6, d6, #1
vtrn.16 q2, q3
subs \h, \h, #2
vmull.u8 q3, d4, d0
vmlal.u8 q3, d5, d1
vqrshrn.u16 d4, q3, #4
vst1.16 {d4[0]}, [\dst, :16], \d_strd
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
bgt 2b
pop {r4-r11,pc}
.endif
40: // 4xN h
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
4:
vld1.8 {d4}, [\src], \s_strd
vld1.8 {d6}, [\sr2], \s_strd
vext.8 d5, d4, d4, #1
vext.8 d7, d6, d6, #1
vtrn.32 q2, q3
subs \h, \h, #2
vmull.u8 q3, d4, d0
vmlal.u8 q3, d5, d1
.ifc \type, put
vqrshrn.u16 d4, q3, #4
vst1.32 {d4[0]}, [\dst, :32], \d_strd
vst1.32 {d4[1]}, [\ds2, :32], \d_strd
.else
vst1.16 {d6}, [\dst, :64], \d_strd
vst1.16 {d7}, [\ds2, :64], \d_strd
.endif
bgt 4b
pop {r4-r11,pc}
80: // 8xN h
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \d_strd, \d_strd, #1
lsl \s_strd, \s_strd, #1
8:
vld1.8 {q8}, [\src], \s_strd
vld1.8 {q10}, [\sr2], \s_strd
vext.8 q9, q8, q8, #1
vext.8 q11, q10, q10, #1
subs \h, \h, #2
vmull.u8 q8, d16, d0
vmull.u8 q10, d20, d0
vmlal.u8 q8, d18, d1
vmlal.u8 q10, d22, d1
.ifc \type, put
vqrshrn.u16 d16, q8, #4
vqrshrn.u16 d18, q10, #4
vst1.8 {d16}, [\dst, :64], \d_strd
vst1.8 {d18}, [\ds2, :64], \d_strd
.else
vst1.16 {q8}, [\dst, :128], \d_strd
vst1.16 {q10}, [\ds2, :128], \d_strd
.endif
bgt 8b
pop {r4-r11,pc}
160:
320:
640:
1280: // 16xN, 32xN, ... h
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
sub \s_strd, \s_strd, \w
sub \s_strd, \s_strd, #8
.ifc \type, put
lsl \d_strd, \d_strd, #1
sub \d_strd, \d_strd, \w
.endif
161:
vld1.8 {d16}, [\src]!
vld1.8 {d22}, [\sr2]!
mov \mx, \w
16:
vld1.8 {d17,d18}, [\src]!
vld1.8 {d23,d24}, [\sr2]!
vext.8 q10, q8, q9, #1
vext.8 q13, q11, q12, #1
vmull.u8 q2, d16, d0
vmull.u8 q3, d17, d0
vmull.u8 q14, d22, d0
vmull.u8 q15, d23, d0
vmlal.u8 q2, d20, d1
vmlal.u8 q3, d21, d1
vmlal.u8 q14, d26, d1
vmlal.u8 q15, d27, d1
subs \mx, \mx, #16
.ifc \type, put
vqrshrn.u16 d4, q2, #4
vqrshrn.u16 d5, q3, #4
vqrshrn.u16 d28, q14, #4
vqrshrn.u16 d29, q15, #4
vst1.8 {q2}, [\dst, :128]!
vst1.8 {q14}, [\ds2, :128]!
.else
vst1.16 {q2, q3}, [\dst, :128]!
vst1.16 {q14, q15}, [\ds2, :128]!
.endif
ble 9f
vmov d16, d18
vmov d22, d24
b 16b
9:
add \dst, \dst, \d_strd
add \ds2, \ds2, \d_strd
add \src, \src, \s_strd
add \sr2, \sr2, \s_strd
subs \h, \h, #2
bgt 161b
pop {r4-r11,pc}
L(\type\()_bilin_v):
cmp \h, #4
adr r9, L(\type\()_bilin_v_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(\type\()_bilin_v_tbl):
.word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
.word 640f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
.word 320f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
.word 160f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
.word 40f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
.word 20f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
20: // 2xN v
.ifc \type, put
cmp \h, #2
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
// 2x2 v
vld1.16 {d16[]}, [\src], \s_strd
bgt 24f
vld1.16 {d17[]}, [\sr2], \s_strd
vld1.16 {d18[]}, [\src], \s_strd
vext.8 d16, d16, d17, #6
vext.8 d17, d17, d18, #6
vmull.u8 q2, d16, d2
vmlal.u8 q2, d17, d3
vqrshrn.u16 d4, q2, #4
vst1.16 {d4[0]}, [\dst, :16]
vst1.16 {d4[1]}, [\ds2, :16]
pop {r4-r11,pc}
24: // 2x4, 2x8, ... v
vld1.16 {d17[]}, [\sr2], \s_strd
vld1.16 {d18[]}, [\src], \s_strd
vld1.16 {d19[]}, [\sr2], \s_strd
vld1.16 {d20[]}, [\src], \s_strd
vext.8 d16, d16, d17, #6
vext.8 d17, d17, d18, #6
vext.8 d18, d18, d19, #6
vext.8 d19, d19, d20, #6
vtrn.32 d16, d18
vtrn.32 d17, d19
vmull.u8 q2, d16, d2
vmlal.u8 q2, d17, d3
subs \h, \h, #4
vqrshrn.u16 d4, q2, #4
vst1.16 {d4[0]}, [\dst, :16], \d_strd
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
vst1.16 {d4[2]}, [\dst, :16], \d_strd
vst1.16 {d4[3]}, [\ds2, :16], \d_strd
ble 0f
vmov d16, d20
b 24b
0:
pop {r4-r11,pc}
.endif
40: // 4xN v
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vld1.32 {d16[]}, [\src], \s_strd
4:
vld1.32 {d17[]}, [\sr2], \s_strd
vld1.32 {d18[]}, [\src], \s_strd
vext.8 d16, d16, d17, #4
vext.8 d17, d17, d18, #4
vmull.u8 q2, d16, d2
vmlal.u8 q2, d17, d3
subs \h, \h, #2
.ifc \type, put
vqrshrn.u16 d4, q2, #4
vst1.32 {d4[0]}, [\dst, :32], \d_strd
vst1.32 {d4[1]}, [\ds2, :32], \d_strd
.else
vst1.16 {d4}, [\dst, :64], \d_strd
vst1.16 {d5}, [\ds2, :64], \d_strd
.endif
ble 0f
vmov d16, d18
b 4b
0:
pop {r4-r11,pc}
80: // 8xN v
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vld1.8 {d16}, [\src], \s_strd
8:
vld1.8 {d17}, [\sr2], \s_strd
vld1.8 {d18}, [\src], \s_strd
vmull.u8 q2, d16, d2
vmull.u8 q3, d17, d2
vmlal.u8 q2, d17, d3
vmlal.u8 q3, d18, d3
subs \h, \h, #2
.ifc \type, put
vqrshrn.u16 d4, q2, #4
vqrshrn.u16 d6, q3, #4
vst1.8 {d4}, [\dst, :64], \d_strd
vst1.8 {d6}, [\ds2, :64], \d_strd
.else
vst1.16 {q2}, [\dst, :128], \d_strd
vst1.16 {q3}, [\ds2, :128], \d_strd
.endif
ble 0f
vmov d16, d18
b 8b
0:
pop {r4-r11,pc}
160: // 16xN, 32xN, ...
320:
640:
1280:
mov \my, \h
1:
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vld1.8 {q8}, [\src], \s_strd
2:
vld1.8 {q9}, [\sr2], \s_strd
vld1.8 {q10}, [\src], \s_strd
vmull.u8 q12, d16, d2
vmull.u8 q13, d17, d2
vmull.u8 q14, d18, d2
vmull.u8 q15, d19, d2
vmlal.u8 q12, d18, d3
vmlal.u8 q13, d19, d3
vmlal.u8 q14, d20, d3
vmlal.u8 q15, d21, d3
subs \h, \h, #2
.ifc \type, put
vqrshrn.u16 d24, q12, #4
vqrshrn.u16 d25, q13, #4
vqrshrn.u16 d28, q14, #4
vqrshrn.u16 d29, q15, #4
vst1.8 {q12}, [\dst, :128], \d_strd
vst1.8 {q14}, [\ds2, :128], \d_strd
.else
vst1.16 {q12, q13}, [\dst, :128], \d_strd
vst1.16 {q14, q15}, [\ds2, :128], \d_strd
.endif
ble 9f
vmov q8, q10
b 2b
9:
subs \w, \w, #16
ble 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
mls \src, \s_strd, \my, \src
mls \dst, \d_strd, \my, \dst
sub \src, \src, \s_strd, lsl #1
mov \h, \my
add \src, \src, #16
.ifc \type, put
add \dst, \dst, #16
.else
add \dst, \dst, #32
.endif
b 1b
0:
pop {r4-r11,pc}
L(\type\()_bilin_hv):
vmovl.u8 q2, d2
vmovl.u8 q3, d3
adr r9, L(\type\()_bilin_hv_tbl)
ldr r8, [r9, r8, lsl #2]
add r9, r9, r8
bx r9
.align 2
L(\type\()_bilin_hv_tbl):
.word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
.word 640f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
.word 320f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
.word 160f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
.word 80f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
.word 40f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
.word 20f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
20: // 2xN hv
.ifc \type, put
add \sr2, \src, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vld1.32 {d28[]}, [\src], \s_strd
vext.8 d29, d28, d28, #1
vmull.u8 q8, d28, d0
vmlal.u8 q8, d29, d1
2:
vld1.32 {d28[]}, [\sr2], \s_strd
vld1.32 {d30[]}, [\src], \s_strd
vext.8 d29, d28, d28, #1
vext.8 d31, d30, d30, #1
vtrn.16 d28, d30
vtrn.16 d29, d31
vmull.u8 q9, d28, d0
vmlal.u8 q9, d29, d1
vtrn.32 d16, d18
vmul.u16 d20, d16, d4
vmla.u16 d20, d19, d6
vqrshrn.u16 d20, q10, #8
subs \h, \h, #2
vst1.16 {d20[0]}, [\dst, :16], \d_strd
vst1.16 {d20[1]}, [\ds2, :16], \d_strd
ble 0f
vtrn.32 d19, d16
b 2b
0:
pop {r4-r11,pc}
.endif
40: // 4xN hv
add \sr2, \src, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vld1.8 {d28}, [\src], \s_strd
vext.8 d29, d28, d28, #1
vmull.u8 q8, d28, d0
vmlal.u8 q8, d29, d1
4:
vld1.8 {d28}, [\sr2], \s_strd
vld1.8 {d30}, [\src], \s_strd
vext.8 d29, d28, d28, #1
vext.8 d31, d30, d30, #1
vtrn.32 d28, d30
vtrn.32 d29, d31
vmull.u8 q9, d28, d0
vmlal.u8 q9, d29, d1
vmov d17, d18
vmul.u16 q10, q8, q2
vmla.u16 q10, q9, q3
subs \h, \h, #2
.ifc \type, put
vqrshrn.u16 d20, q10, #8
vst1.32 {d20[0]}, [\dst, :32], \d_strd
vst1.32 {d20[1]}, [\ds2, :32], \d_strd
.else
vrshr.u16 q10, q10, #4
vst1.16 {d20}, [\dst, :64], \d_strd
vst1.16 {d21}, [\ds2, :64], \d_strd
.endif
ble 0f
vmov d16, d19
b 4b
0:
pop {r4-r11,pc}
80: // 8xN, 16xN, ... hv
160:
320:
640:
1280:
mov \my, \h
1:
add \sr2, \src, \s_strd
add \ds2, \dst, \d_strd
lsl \s_strd, \s_strd, #1
lsl \d_strd, \d_strd, #1
vld1.8 {q12}, [\src], \s_strd
vext.8 q13, q12, q12, #1
vmull.u8 q8, d24, d0
vmlal.u8 q8, d26, d1
2:
vld1.8 {q12}, [\sr2], \s_strd
vld1.8 {q14}, [\src], \s_strd
vext.8 q13, q12, q12, #1
vext.8 q15, q14, q14, #1
vmull.u8 q9, d24, d0
vmlal.u8 q9, d26, d1
vmull.u8 q10, d28, d0
vmlal.u8 q10, d30, d1
vmul.u16 q8, q8, q2
vmla.u16 q8, q9, q3
vmul.u16 q9, q9, q2
vmla.u16 q9, q10, q3
subs \h, \h, #2
.ifc \type, put
vqrshrn.u16 d16, q8, #8
vqrshrn.u16 d18, q9, #8
vst1.8 {d16}, [\dst, :64], \d_strd
vst1.8 {d18}, [\ds2, :64], \d_strd
.else
vrshr.u16 q8, q8, #4
vrshr.u16 q9, q9, #4
vst1.16 {q8}, [\dst, :128], \d_strd
vst1.16 {q9}, [\ds2, :128], \d_strd
.endif
ble 9f
vmov q8, q10
b 2b
9:
subs \w, \w, #8
ble 0f
asr \s_strd, \s_strd, #1
asr \d_strd, \d_strd, #1
mls \src, \s_strd, \my, \src
mls \dst, \d_strd, \my, \dst
sub \src, \src, \s_strd, lsl #1
mov \h, \my
add \src, \src, #8
.ifc \type, put
add \dst, \dst, #8
.else
add \dst, \dst, #16
.endif
b 1b
0:
pop {r4-r11,pc}
endfunc
.endm
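// put:  r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride,
//       r4 = w, r5 = h, r6 = mx, r7 = my
// prep: r0 = tmp, r1 = src, r2 = src_stride, r3 = w, r4 = h, r5 = mx, r6 = my,
//       with r7 used as the tmp stride (set to 2*w). The final argument is
//       the rounding shift applied in the hv paths.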
filter_fn put, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, 10
filter_fn prep, r0, r7, r1, r2, r3, r4, r5, r6, r8, r9, 6