ref: 40566f29f05e2ec8dced96f80dbb47ce23a4cff3
dir: /codec/processing/src/arm/down_sample_neon.S/
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"
// DyadicBilinearDownsampler_neon
//   2:1 bilinear down-sampling. Every pass of the loop consumes 32 bytes from
//   each of two vertically adjacent source rows and emits 16 output bytes.
//   Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
//   Stack arguments   : src_width, src_height (in that order).
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
    stmdb       sp!, {r4-r8, lr}

    // The stack arguments sit above the six registers pushed above (24 bytes).
    ldr         r4, [sp, #24]               // r4 = src_width
    ldr         r5, [sp, #28]               // r5 = src_height

    mov         r8, r0                      // r8 = start of the current dst row
    mov         r6, r2                      // r6 = start of the current upper src row
    lsr         r5, r5, #1                  // r5 = src_height / 2 = rows to produce
    mov         lr, #0                      // lr = src bytes consumed in the current row

    // Remember the 16 bytes located right after the last dst row; they are
    // written back once the loop has finished.
    mla         r7, r1, r5, r0              // r7 = dst + dst_stride * rows
    vld1.32     {q15}, [r7]

    add         r7, r2, r3                  // r7 = lower src row

dyadic_ds_half_loop:
    vld1.8      {q0, q1}, [r2]!             // 32 bytes of the upper row
    vld1.8      {q2, q3}, [r7]!             // 32 bytes of the lower row

    // Horizontal step: add neighbouring byte pairs, then halve with rounding.
    vpaddl.u8   q0, q0
    vpaddl.u8   q2, q2
    vpaddl.u8   q1, q1
    vpaddl.u8   q3, q3
    vrshr.u16   q0, q0, #1
    vrshr.u16   q2, q2, #1
    vrshr.u16   q1, q1, #1
    vrshr.u16   q3, q3, #1

    // Vertical step: rounded average of the two rows, narrowed back to bytes.
    vrhadd.u16  q0, q0, q2
    vrhadd.u16  q1, q1, q3
    vmovn.u16   d0, q0
    vmovn.u16   d1, q1
    vst1.32     {q0}, [r0]!

    add         lr, lr, #32
    cmp         lr, r4
    // Taken only when the row is complete (lr >= src_width, unsigned):
    // advance dst by one row, src by two rows, and restart the column count.
    addcs       r8, r8, r1
    movcs       r0, r8
    addcs       r6, r6, r3, lsl #1
    movcs       r2, r6
    addcs       r7, r2, r3
    movcs       lr, #0
    // Inside a row the flags still come from the cmp (lr != r4), so the branch
    // is taken; at a row end the decremented row counter decides.
    subscs      r5, r5, #1
    bne         dyadic_ds_half_loop

    // r0 now equals dst + dst_stride * rows: put the saved 16 bytes back.
    vst1.32     {q15}, [r0]

    ldmia       sp!, {r4-r8, lr}
WELS_ASM_FUNC_END
// comp_ds_bilinear_w_x8_neon
//   2:1 bilinear down-sampling, 8 source bytes (4 output bytes) per inner pass.
//   Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
//   Stack arguments   : src_width, src_height (in that order).
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
    stmdb       sp!, {r4-r7, lr}

    // Five registers were pushed, so the stack arguments start at sp + 20.
    ldr         r4, [sp, #20]               // r4 = src_width
    ldr         r5, [sp, #24]               // r5 = src_height

    lsr         r5, r5, #1                  // r5 = number of output rows
    sub         r1, r1, r4, lsr #1          // r1 = dst stride - bytes written per row
    sub         lr, r3, r4                  // lr = src stride - bytes read per row

ds_w_x8_next_row:
    add         r7, r2, r3                  // r7 = lower src row
    lsr         r6, r4, #3                  // r6 = 8-byte groups in one row

ds_w_x8_next_group:
    vld1.8      {d1}, [r7]!                 // lower row, 8 bytes
    vld1.8      {d0}, [r2]!                 // upper row, 8 bytes

    vpaddl.u8   q0, q0                      // pair sums: d0 = upper row, d1 = lower row
    vrshr.u16   q0, q0, #1                  // rounded horizontal average
    vrhadd.u16  d0, d0, d1                  // rounded vertical average (4 values)

    vmovn.u16   d0, q0                      // low 4 bytes of d0 hold the results
    vst1.32     {d0[0]}, [r0]!              // store only those 4 bytes
    subs        r6, r6, #1
    bne         ds_w_x8_next_group

    // r7 already advanced by one row width: adding the padding lands r2 two
    // source rows below its previous row start.
    add         r0, r0, r1
    add         r2, r7, lr
    subs        r5, r5, #1
    bne         ds_w_x8_next_row

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
// comp_ds_bilinear_w_x16_neon
//   2:1 bilinear down-sampling, 16 source bytes (8 output bytes) per inner pass.
//   Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
//   Stack arguments   : src_width, src_height (in that order).
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
    stmdb       sp!, {r4-r7, lr}

    // Five registers were pushed, so the stack arguments start at sp + 20.
    ldr         r4, [sp, #20]               // r4 = src_width
    ldr         r5, [sp, #24]               // r5 = src_height

    lsr         r5, r5, #1                  // r5 = number of output rows
    sub         r1, r1, r4, lsr #1          // r1 = dst stride - bytes written per row
    sub         lr, r3, r4                  // lr = src stride - bytes read per row

ds_w_x16_next_row:
    add         r7, r2, r3                  // r7 = lower src row
    lsr         r6, r4, #4                  // r6 = 16-byte groups in one row

ds_w_x16_next_group:
    vld1.8      {q0}, [r2]!                 // upper row, 16 bytes
    vld1.8      {q1}, [r7]!                 // lower row, 16 bytes

    // Horizontal: add neighbouring byte pairs and halve with rounding.
    vpaddl.u8   q0, q0
    vrshr.u16   q0, q0, #1
    vpaddl.u8   q1, q1
    vrshr.u16   q1, q1, #1

    // Vertical: rounded average of both rows, narrowed to 8 bytes.
    vrhadd.u16  q0, q0, q1
    vmovn.u16   d0, q0
    vst1.32     {d0}, [r0]!
    subs        r6, r6, #1
    bne         ds_w_x16_next_group

    // r7 already advanced by one row width: adding the padding lands r2 two
    // source rows below its previous row start.
    add         r0, r0, r1
    add         r2, r7, lr
    subs        r5, r5, #1
    bne         ds_w_x16_next_row

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
// DyadicBilinearDownsamplerWidthx32_neon
//   2:1 bilinear down-sampling, 32 source bytes (16 output bytes) per inner
//   pass.
//   Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
//   Stack arguments   : src_width, src_height (in that order).
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
    stmdb       sp!, {r4-r7, lr}

    // Five registers were pushed, so the stack arguments start at sp + 20.
    ldr         r4, [sp, #20]               // r4 = src_width
    ldr         r5, [sp, #24]               // r5 = src_height

    lsr         r5, r5, #1                  // r5 = number of output rows
    sub         r1, r1, r4, lsr #1          // r1 = dst stride - bytes written per row
    sub         lr, r3, r4                  // lr = src stride - bytes read per row

ds_w_x32_next_row:
    add         r7, r2, r3                  // r7 = lower src row
    lsr         r6, r4, #5                  // r6 = 32-byte groups in one row

ds_w_x32_next_group:
    vld1.8      {q0, q1}, [r2]!             // upper row, 32 bytes
    vld1.8      {q2, q3}, [r7]!             // lower row, 32 bytes

    // Horizontal: add neighbouring byte pairs and halve with rounding.
    vpaddl.u8   q0, q0
    vpaddl.u8   q2, q2
    vpaddl.u8   q1, q1
    vpaddl.u8   q3, q3
    vrshr.u16   q0, q0, #1
    vrshr.u16   q2, q2, #1
    vrshr.u16   q1, q1, #1
    vrshr.u16   q3, q3, #1

    // Vertical: rounded average of both rows, narrowed to 16 bytes.
    vrhadd.u16  q0, q0, q2
    vrhadd.u16  q1, q1, q3
    vmovn.u16   d0, q0
    vmovn.u16   d1, q1
    vst1.32     {q0}, [r0]!
    subs        r6, r6, #1
    bne         ds_w_x32_next_group

    // r7 already advanced by one row width: adding the padding lands r2 two
    // source rows below its previous row start.
    add         r0, r0, r1
    add         r2, r7, lr
    subs        r5, r5, #1
    bne         ds_w_x32_next_row

    ldmia       sp!, {r4-r7, lr}
WELS_ASM_FUNC_END
// GeneralBilinearAccurateDownsampler_neon
//   Arbitrary-ratio bilinear down-sampling with Q15 fixed-point positions.
//   Register arguments: r0 = dst, r1 = dst stride, r2 = dst width,
//                       r3 = dst height.
//   Stack arguments   : src address, src stride, scaleX, scaleY (in that order).
//   All output pixels except the last column and the last row are a weighted
//   sum of a 2x2 source neighbourhood; the last column of every row and the
//   whole last row are plain copies of the nearest source pixel.
//   A width or height of zero returns immediately, and a width or height of
//   one skips the corresponding interpolation loop (its counter would
//   otherwise start at zero and wrap around).
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
    stmdb       sp!, {r4-r12, lr}

    // Empty destination: nothing to write. The count-down loops below test
    // their counter only after the first pass, so they must not be entered.
    cmp         r2, #0
    beq         GENERAL_ACCURATE_DS_END
    cmp         r3, #0
    beq         GENERAL_ACCURATE_DS_END

    //Get the data from stack (ten registers pushed: arguments start at sp + 40)
    ldr         r4, [sp, #40]               //the addr of src
    ldr         r5, [sp, #44]               //the value of src_stride
    ldr         r6, [sp, #48]               //the value of scaleX
    ldr         r7, [sp, #52]               //the value of scaleY

    mov         r10, #32768
    sub         r10, #1                     // r10 = 0x7FFF, mask of the 15 fraction bits
    and         r8, r6, r10                 // r8 = uinc = scaleX & 0x7FFF (scaleX mod 32768)
    mov         r11, #-1
    mul         r11, r8                     // r11 = -uinc

    vdup.s16    d2, r8
    vdup.s16    d0, r11
    vzip.s16    d0, d2                      // d0 lanes 0..3 = -uinc, uinc, -uinc, uinc

    and         r9, r7, r10                 // r9 = vinc = scaleY & 0x7FFF (scaleY mod 32768)
    mov         r11, #-1
    mul         r11, r9                     // r11 = -vinc

    vdup.s16    d2, r9
    vdup.s16    d3, r11
    vext.8      d5, d3, d2, #4              // d5 lanes 0..3 = -vinc, -vinc, vinc, vinc

    mov         r11, #0x40000000
    mov         r12, #0x4000
    sub         r12, #1
    add         r11, r12                    // r11 = 0x40003FFF
    vdup.s32    d1, r11                     // d1 lanes 0..3 = 16383, 16384, 16383, 16384 (init u)

    mov         r11, #16384
    vdup.s16    d16, r11
    sub         r11, #1
    vdup.s16    d17, r11
    vext.8      d7, d17, d16, #4            // d7 lanes 0..3 = 16383, 16383, 16384, 16384 (init v)

    veor        q14, q14                    // pixel lanes are loaded bytewise; keep the rest zero
    sub         r1, r2                      // r1 = dst stride - dst width
    mov         r8, #16384                  // r8 = yInverse, starts at 0.5 in Q15

    // r3 = rows that need interpolation (height - 1). With a single output
    // row only the copy-only last row is left.
    subs        r3, r3, #1
    beq         LAST_ROW

_HEIGHT:
    ldr         r4, [sp, #40]               //the addr of src
    mov         r11, r8
    lsr         r11, #15
    mul         r11, r5
    add         r11, r4                     // r11 = src row at (yInverse >> 15)
    mov         r12, r11
    add         r12, r5                     // r12 = the src row below it

    mov         r9, #16384                  // r9 = xInverse, starts at 0.5 in Q15
    vmov.s16    d6, d1                      // reset the horizontal weights for this row

    // r10 = columns that need interpolation (width - 1). With a single
    // output column only the copy-only last column is left.
    subs        r10, r2, #1
    beq         WIDTH_END

_WIDTH:
    mov         lr, r9
    lsr         lr, #15                     // lr = integer source column
    add         r4, r11, lr
    vld2.8      {d28[0],d29[0]}, [r4]       // upper row: a -> d28 byte 0, b -> d29 byte 0
    add         r4, r12, lr
    vld2.8      {d28[4],d29[4]}, [r4]       // lower row: c -> d28 byte 4, d -> d29 byte 4
    vzip.32     d28, d29                    // 32-bit lanes: d28 = a, b   d29 = c, d

    vmull.u16   q13, d6, d7                 // four 32-bit weights: u lane * v lane
    vmull.u32   q12, d26, d28               // weights 0,1 times a, b
    vmlal.u32   q12, d27, d29               // plus weights 2,3 times c, d
    vqadd.u64   d24, d24, d25               // sum of the four products
    vrshr.u64   d24, #30                    // drop the Q15 * Q15 scale with rounding

    vst1.8      {d24[0]}, [r0]!
    add         r9, r6                      // xInverse += scaleX
    vadd.u16    d6, d0                      // inc u
    vshl.u16    d6, #1                      // shift out bit 15 ...
    vshr.u16    d6, #1                      // ... so each weight stays within 15 bits
    subs        r10, #1
    bne         _WIDTH

WIDTH_END:
    // Last column of the row: copy the pixel at (xInverse >> 15) of the upper row.
    lsr         r9, #15
    add         r4, r11, r9
    vld1.8      {d24[0]}, [r4]
    vst1.8      {d24[0]}, [r0]
    add         r0, #1
    add         r8, r7                      // yInverse += scaleY
    add         r0, r1                      // skip the dst row padding
    vadd.s16    d7, d5                      // inc v
    vshl.u16    d7, #1                      // shift out bit 15 ...
    vshr.u16    d7, #1                      // ... so each weight stays within 15 bits
    subs        r3, #1
    bne         _HEIGHT

LAST_ROW:
    // Last row: every pixel is a copy of the source pixel at (xInverse >> 15).
    ldr         r4, [sp, #40]               //the addr of src
    lsr         r8, #15
    mul         r8, r5
    add         r4, r8                      // r4 = src row at (yInverse >> 15)
    mov         r9, #16384                  // xInverse restarts at 0.5 in Q15

_LAST_ROW_WIDTH:
    mov         r11, r9
    lsr         r11, #15

    add         r3, r4, r11
    vld1.8      {d0[0]}, [r3]
    vst1.8      {d0[0]}, [r0]
    add         r0, #1
    add         r9, r6                      // xInverse += scaleX
    subs        r2, #1
    bne         _LAST_ROW_WIDTH

GENERAL_ACCURATE_DS_END:
    ldmia       sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
// DyadicBilinearOneThirdDownsampler_neon
//   3:1 down-sampling. Every pass of the loop reads 48 bytes from each of two
//   adjacent source rows and emits 16 output bytes; of each group of three
//   source columns only the first two take part in the average, and the
//   source advances three rows per output row.
//   Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
//   Stack arguments   : a width and a height (in that order).
WELS_ASM_FUNC_BEGIN DyadicBilinearOneThirdDownsampler_neon
    stmdb       sp!, {r4-r8, lr}

    // The stack arguments sit above the six registers pushed above (24 bytes).
    ldr         r4, [sp, #24]               // r4 = src_width
    // Named src_height by the original comment, yet used unscaled as the
    // number of output rows. NOTE(review): confirm what the caller passes.
    ldr         r5, [sp, #28]

    mov         r8, r0                      // r8 = start of the current dst row
    mov         r6, r2                      // r6 = start of the current upper src row
    mov         lr, #0                      // lr = src bytes consumed in the current row

    // Remember the 16 bytes located right after the last dst row; they are
    // written back once the loop has finished.
    mla         r7, r1, r5, r0              // r7 = dst + dst_stride * rows
    vld1.32     {q15}, [r7]

    add         r7, r2, r3                  // r7 = lower src row

ds_onethird_loop:
    // vld3.8 de-interleaves by three: the first register of each list gets
    // columns 0, 3, 6, ..., the second 1, 4, 7, ..., the third 2, 5, 8, ...
    vld3.8      {d0, d1, d2}, [r2]!         // upper row, bytes 0..23
    vld3.8      {d16, d17, d18}, [r7]!      // lower row, bytes 0..23
    vld3.8      {d3, d4, d5}, [r2]!         // upper row, bytes 24..47
    vld3.8      {d19, d20, d21}, [r7]!      // lower row, bytes 24..47

    // Horizontal: widen and add the first two columns of every triple, then
    // halve with rounding.
    vaddl.u8    q11, d0, d1
    vaddl.u8    q13, d16, d17
    vaddl.u8    q12, d3, d4
    vaddl.u8    q14, d19, d20
    vrshr.u16   q11, q11, #1
    vrshr.u16   q13, q13, #1
    vrshr.u16   q12, q12, #1
    vrshr.u16   q14, q14, #1

    // Vertical: rounded average of both rows, narrowed to 16 bytes.
    vrhadd.u16  q11, q11, q13
    vrhadd.u16  q12, q12, q14
    vmovn.u16   d0, q11
    vmovn.u16   d1, q12
    vst1.8      {q0}, [r0]!

    add         lr, lr, #48
    cmp         lr, r4
    // Taken only when the row is complete (lr >= src_width, unsigned):
    // advance dst by one row, src by three rows, and restart the column count.
    addcs       r8, r8, r1
    movcs       r0, r8
    addcs       r6, r6, r3, lsl #1
    addcs       r6, r6, r3
    movcs       r2, r6
    addcs       r7, r2, r3
    movcs       lr, #0
    // Inside a row the flags still come from the cmp (lr != r4), so the branch
    // is taken; at a row end the decremented row counter decides.
    subscs      r5, r5, #1
    bne         ds_onethird_loop

    // r0 now equals dst + dst_stride * rows: put the saved 16 bytes back.
    vst1.32     {q15}, [r0]

    ldmia       sp!, {r4-r8, lr}
WELS_ASM_FUNC_END
// DyadicBilinearQuarterDownsampler_neon
//   4:1 down-sampling. Every pass of the loop reads 64 bytes from each of two
//   adjacent source rows and emits 16 output bytes; of each group of four
//   source columns only the first two take part in the average, and the
//   source advances four rows per output row.
//   Register arguments: r0 = dst, r1 = dst stride, r2 = src, r3 = src stride.
//   Stack arguments   : src_width, src_height (in that order).
WELS_ASM_FUNC_BEGIN DyadicBilinearQuarterDownsampler_neon
    stmdb       sp!, {r4-r8, lr}

    // The stack arguments sit above the six registers pushed above (24 bytes).
    ldr         r4, [sp, #24]               // r4 = src_width
    ldr         r5, [sp, #28]               // r5 = src_height

    mov         r8, r0                      // r8 = start of the current dst row
    mov         r6, r2                      // r6 = start of the current upper src row
    lsr         r5, r5, #2                  // r5 = src_height / 4 = rows to produce
    mov         lr, #0                      // lr = src bytes consumed in the current row

    // Remember the 16 bytes located right after the last dst row; they are
    // written back once the loop has finished.
    mla         r7, r1, r5, r0              // r7 = dst + dst_stride * rows
    vld1.32     {q15}, [r7]

    add         r7, r2, r3                  // r7 = lower src row

ds_quarter_loop:
    // vld2.16 splits the data into even and odd 16-bit units. Only the even
    // units (q0, q2, q8, q10) are used below, i.e. the first two bytes of
    // every four.
    vld2.16     {q0, q1}, [r2]!             // upper row, bytes 0..31
    vld2.16     {q8, q9}, [r7]!             // lower row, bytes 0..31
    vld2.16     {q2, q3}, [r2]!             // upper row, bytes 32..63
    vld2.16     {q10, q11}, [r7]!           // lower row, bytes 32..63

    // Horizontal: add the two bytes of every kept unit, halve with rounding.
    vpaddl.u8   q0, q0
    vpaddl.u8   q8, q8
    vpaddl.u8   q2, q2
    vpaddl.u8   q10, q10
    vrshr.u16   q0, q0, #1
    vrshr.u16   q8, q8, #1
    vrshr.u16   q2, q2, #1
    vrshr.u16   q10, q10, #1

    // Vertical: rounded average of both rows, narrowed to 16 bytes.
    vrhadd.u16  q0, q0, q8
    vrhadd.u16  q2, q2, q10
    vmovn.u16   d0, q0
    vmovn.u16   d1, q2
    vst1.8      {q0}, [r0]!

    add         lr, lr, #64
    cmp         lr, r4
    // Taken only when the row is complete (lr >= src_width, unsigned):
    // advance dst by one row, src by four rows, and restart the column count.
    addcs       r8, r8, r1
    movcs       r0, r8
    addcs       r6, r6, r3, lsl #2
    movcs       r2, r6
    addcs       r7, r2, r3
    movcs       lr, #0
    // Inside a row the flags still come from the cmp (lr != r4), so the branch
    // is taken; at a row end the decremented row counter decides.
    subscs      r5, r5, #1
    bne         ds_quarter_loop

    // r0 now equals dst + dst_stride * rows: put the saved 16 bytes back.
    vst1.32     {q15}, [r0]

    ldmia       sp!, {r4-r8, lr}
WELS_ASM_FUNC_END
#endif