ref: 090b3b02c214c116fba440b75acfd52693989636
parent: e55e3f80319e9cae14b91b649bc861b0b5297c53
author: Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com>
date: Mon Jun 25 04:53:23 EDT 2018
Add New Neon Assemblies for Motion Compensation

Commit adds neon assemblies for motion compensation which show an
improvement over the existing neon code.

Performance Improvement -

Platform             Resolution   1 Thread   4 Threads
Nexus 6 @ 2.65 GHz   720p         12.16%     7.21%
                     1080p        18.00%     15.28%

Change-Id: Ic0b0412eeb01c8317642b20bb99092c2f5baba37
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm
@@ -1,0 +1,438 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers*****************************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_avg_horiz_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type1_neon| PROC
+
+    stmfd           sp!, {r4 - r12, r14}    ;stack stores the values of
+                                            ; the arguments
+    vpush           {d8 - d15}              ; stack offset by 64
+    mov             r4, r1
+    mov             r1, r2
+    mov             r2, r4
+
+start_loop_count
+    ldr             r4, [sp, #104]          ;loads pi1_coeff
+    ldr             r8, [sp, #108]          ;loads x0_q4
+    add             r4, r4, r8, lsl #4      ;r4 = filter[x0_q4]
+    ldr             r8, [sp, #128]          ;loads ht
+    ldr             r10, [sp, #124]         ;loads wd
+    vld2.8          {d0, d1}, [r4]          ;coeff = vld1_s8(pi1_coeff)
+    mov             r11, #1
+    subs            r14, r8, #0             ;checks for ht == 0
+    vabs.s8         d2, d0                  ;vabs_s8(coeff)
+    vdup.8          d24, d2[0]              ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12, r0, #3             ;pu1_src - 3
+    vdup.8          d25, d2[1]              ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4, r12, r2             ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26, d2[2]              ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9, r10, r2, lsl #1     ;2*src_strd - wd
+    vdup.8          d27, d2[3]              ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8, r10, r3, lsl #1     ;2*dst_strd - wd
+    vdup.8          d28, d2[4]              ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29, d2[5]              ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30, d2[6]              ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31, d2[7]              ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7, r1
+    cmp             r10, #4
+    ble             outer_loop_4
+
+    cmp             r10, #24
+    moveq           r10, #16
+    addeq           r8, #8
+    addeq           r9, #8
+    cmp             r10, #16
+    bge             outer_loop_16
+
+    cmp             r10, #12
+    addeq           r8, #4
+    addeq           r9, #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    mov             r14, #32
+    add             r1, #16
+    add             r12, #16
+    mov             r10, #8
+    add             r8, #8
+    add             r9, #8
+
+outer_loop_8
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7, #0xc000
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {d1}, [r12], r11
+    vdup.16         q5, r7
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    mov             r7, #0x4000
+    vld1.u32        {d4}, [r12], r11
+    vmlsl.u8        q4, d1, d25             ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5}, [r12], r11
+    vmlal.u8        q4, d3, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7}, [r12], r11
+    vmlal.u8        q4, d2, d26             ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlal.u8        q4, d4, d28             ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13}, [r4], r11
+    vmlal.u8        q4, d5, d29             ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14}, [r4], r11
+    vmlsl.u8        q4, d6, d30             ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15}, [r4], r11
+    vmlsl.u8        q4, d7, d31             ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16}, [r4], r11        ;vector load pu1_src + src_strd
+    vdup.16         q11, r7
+    vmlal.u8        q5, d15, d27            ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17}, [r4], r11
+    vmlal.u8        q5, d14, d26            ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {d18}, [r4], r11
+    vmlal.u8        q5, d16, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlal.u8        q5, d17, d29            ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u8         {d6}, [r1]
+    vqrshrun.s16    d20, q4, #6             ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q5, d18, d30            ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d19, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u8         {d7}, [r6]
+    vrhadd.u8       d20, d20, d6
+    vmlsl.u8        q5, d12, d24            ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5, d13, d25            ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20}, [r1]!            ;store the result pu1_dst
+    vhadd.s16       q5, q5, q11
+    subs            r5, r5, #8              ;decrement the wd loop
+    vqrshrun.s16    d8, q5, #6              ;right shift and saturating narrow
+                                            ; result 2
+    vrhadd.u8       d8, d8, d7
+    vst1.8          {d8}, [r6]!             ;store the result pu1_dst
+    cmp             r5, #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14, r14, #2            ;decrement the ht loop
+    add             r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10, [sp, #120]         ;loads wd
+    cmp             r10, #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0, [sp, #-4]!
+    str             r7, [sp, #-4]!
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    and             r0, r12, #31
+    mov             r7, #0xc000
+    sub             r5, r10, #0             ;checks wd
+    pld             [r4, r2, lsl #1]
+    pld             [r12, r2, lsl #1]
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {q1}, [r12], r11
+    vld1.u32        {q2}, [r12], r11
+    vld1.u32        {q3}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6}, [r12], r11
+    vmlsl.u8        q4, d2, d25             ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7}, [r12], r11
+    vmlal.u8        q4, d4, d26             ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8}, [r12], r11
+    vmlal.u8        q4, d6, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9}, [r12], r11
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4, d14, d29            ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10, r7
+    vmlsl.u8        q4, d16, d30            ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10, d1, d24
+    vdup.16         q5, r7
+    vmlsl.u8        q10, d3, d25
+    mov             r7, #0x4000
+    vdup.16         q11, r7
+    vmlal.u8        q10, d5, d26
+    vld1.u32        {q0}, [r4], r11         ;vector load pu1_src
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {q1}, [r4], r11
+    vmlal.u8        q10, d7, d27
+    add             r12, #8
+    subs            r5, r5, #16
+    vmlal.u8        q10, d13, d28
+    vld1.u32        {q2}, [r4], r11
+    vmlal.u8        q10, d15, d29
+    vld1.u32        {q3}, [r4], r11
+    vqrshrun.s16    d8, q4, #6              ;right shift and saturating narrow
+                                            ; result 1
+    vmlsl.u8        q10, d17, d30
+    vld1.u32        {q6}, [r4], r11
+    vmlsl.u8        q10, d19, d31
+    vld1.u32        {q7}, [r4], r11
+    add             r7, r1, #8
+    vmlsl.u8        q5, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlsl.u8        q5, d2, d25             ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8}, [r4], r11
+    vhadd.s16       q10, q10, q11
+    vld1.u32        {q9}, [r4], r11
+    vld1.u8         {d0}, [r1]
+    vmlal.u8        q5, d4, d26             ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u8         {d2}, [r7]
+    vmlal.u8        q5, d6, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4, #8
+    mov             r7, #0xc000
+    vmlal.u8        q5, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q5, d14, d29            ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9, q10, #6
+    vdup.16         q11, r7
+    vmlsl.u8        q5, d16, d30            ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7, #0x4000
+    vrhadd.u8       d8, d8, d0
+    vrhadd.u8       d9, d9, d2
+    vmlsl.u8        q11, d1, d24
+    vmlsl.u8        q11, d3, d25
+    vdup.16         q10, r7
+    vmlal.u8        q11, d5, d26
+    pld             [r12, r2, lsl #2]
+    pld             [r4, r2, lsl #2]
+    addeq           r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4, r12, r2             ;pu1_src + src_strd
+    vmlal.u8        q11, d7, d27
+    vmlal.u8        q11, d13, d28
+    vst1.8          {q4}, [r1]!             ;store the result pu1_dst
+    subeq           r14, r14, #2
+    vhadd.s16       q5, q5, q10
+    vmlal.u8        q11, d15, d29
+    addeq           r1, r1, r8
+    vmlsl.u8        q11, d17, d30
+    cmp             r14, #0
+    vmlsl.u8        q11, d19, d31
+    vqrshrun.s16    d10, q5, #6             ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    mov             r7, #0xc000
+    cmp             r5, #0
+    vld1.u32        {q1}, [r12], r11
+    vhadd.s16       q11, q11, q10
+    vld1.u32        {q2}, [r12], r11
+    vdup.16         q4, r7
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vdup.16         q10, r7
+    vld1.u32        {q3}, [r12], r11
+    add             r7, r6, #8
+    moveq           r5, r10
+    vld1.u8         {d0}, [r6]
+    vmlsl.u8        q4, d2, d25             ;mul_res = vmlsl_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u8         {d2}, [r7]
+    vqrshrun.s16    d11, q11, #6
+    vmlal.u8        q4, d4, d26             ;mul_res = vmlal_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q6}, [r12], r11
+    vrhadd.u8       d10, d10, d0
+    vld1.u32        {q7}, [r12], r11
+    vrhadd.u8       d11, d11, d2
+    vld1.u32        {q8}, [r12], r11
+    vmlal.u8        q4, d6, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9}, [r12], r11
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlal.u8        q4, d14, d29            ;mul_res = vmlal_u8(src[0_5],
+                                            ; coeffabs_5);
+    mov             r7, #0xc000
+    vmlsl.u8        q4, d16, d30            ;mul_res = vmlsl_u8(src[0_6],
+                                            ; coeffabs_6);
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6, r1, r3              ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7, #0x4000
+    ldr             r0, [sp], #4
+    ldr             r10, [sp, #120]
+    vdup.16         q10, r7
+    vhadd.s16       q11, q11, q10
+    vqrshrun.s16    d11, q11, #6
+    add             r7, r6, #8
+    vld1.u8         {d20}, [r6]
+    vld1.u8         {d21}, [r7]
+    vrhadd.u8       d10, d10, d20
+    vrhadd.u8       d11, d11, d21
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    ldr             r7, [sp], #4
+    cmp             r10, #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    add             r1, #8
+    mov             r10, #4
+    add             r12, #8
+    mov             r14, #16
+    add             r8, #4
+    add             r9, #4
+
+outer_loop_4
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vld1.u32        {d1}, [r12], r11
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    vld1.u32        {d4}, [r12], r11
+    vld1.u32        {d5}, [r12], r11
+    vld1.u32        {d6}, [r12], r11
+    vld1.u32        {d7}, [r12], r11
+    sub             r12, r12, #4
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vld1.u32        {d13}, [r4], r11
+    vzip.32         d0, d12                 ;vector zip the i iteration and ii
+                                            ; iteration in single register
+    vld1.u32        {d14}, [r4], r11
+    vzip.32         d1, d13
+    vld1.u32        {d15}, [r4], r11
+    vzip.32         d2, d14
+    vld1.u32        {d16}, [r4], r11
+    vzip.32         d3, d15
+    vld1.u32        {d17}, [r4], r11
+    vzip.32         d4, d16
+    vld1.u32        {d18}, [r4], r11
+    vzip.32         d5, d17
+    vld1.u32        {d19}, [r4], r11
+    mov             r7, #0xc000
+    vdup.16         q4, r7
+    sub             r4, r4, #4
+    vzip.32         d6, d18
+    vzip.32         d7, d19
+    vmlsl.u8        q4, d1, d25             ;arithmetic operations for ii
+                                            ; iteration at the same time
+    vmlsl.u8        q4, d0, d24
+    vmlal.u8        q4, d2, d26
+    vmlal.u8        q4, d3, d27
+    vmlal.u8        q4, d4, d28
+    vmlal.u8        q4, d5, d29
+    vmlsl.u8        q4, d6, d30
+    vmlsl.u8        q4, d7, d31
+    mov             r7, #0x4000
+    vdup.16         q10, r7
+    vhadd.s16       q4, q4, q10
+    vqrshrun.s16    d8, q4, #6
+    vld1.u32        {d10[0]}, [r1]
+    vld1.u32        {d10[1]}, [r6]
+    vrhadd.u8       d8, d8, d10
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5, r5, #4              ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14, r14, #2            ;decrement the ht by 2
+    add             r12, r12, r9            ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8 - d15}
+    ldmfd           sp!, {r4 - r12, r15}    ;reload the registers from sp
+
+    ENDP
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm
@@ -1,0 +1,439 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_avg_horiz_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_horiz_filter_type2_neon| PROC
+
+    stmfd           sp!, {r4 - r12, r14}    ;stack stores the values of
+                                            ; the arguments
+    vpush           {d8 - d15}              ; stack offset by 64
+    mov             r4, r1
+    mov             r1, r2
+    mov             r2, r4
+
+start_loop_count
+    ldr             r4, [sp, #104]          ;loads pi1_coeff
+    ldr             r8, [sp, #108]          ;loads x0_q4
+    add             r4, r4, r8, lsl #4      ;r4 = filter[x0_q4]
+    ldr             r8, [sp, #128]          ;loads ht
+    ldr             r10, [sp, #124]         ;loads wd
+    vld2.8          {d0, d1}, [r4]          ;coeff = vld1_s8(pi1_coeff)
+    mov             r11, #1
+    subs            r14, r8, #0             ;checks for ht == 0
+    vabs.s8         d2, d0                  ;vabs_s8(coeff)
+    vdup.8          d24, d2[0]              ;coeffabs_0 = vdup_lane_u8(coeffabs,
+                                            ; 0)
+    sub             r12, r0, #3             ;pu1_src - 3
+    vdup.8          d25, d2[1]              ;coeffabs_1 = vdup_lane_u8(coeffabs,
+                                            ; 1)
+    add             r4, r12, r2             ;pu1_src_tmp2_8 = pu1_src + src_strd
+    vdup.8          d26, d2[2]              ;coeffabs_2 = vdup_lane_u8(coeffabs,
+                                            ; 2)
+    rsb             r9, r10, r2, lsl #1     ;2*src_strd - wd
+    vdup.8          d27, d2[3]              ;coeffabs_3 = vdup_lane_u8(coeffabs,
+                                            ; 3)
+    rsb             r8, r10, r3, lsl #1     ;2*dst_strd - wd
+    vdup.8          d28, d2[4]              ;coeffabs_4 = vdup_lane_u8(coeffabs,
+                                            ; 4)
+    vdup.8          d29, d2[5]              ;coeffabs_5 = vdup_lane_u8(coeffabs,
+                                            ; 5)
+    vdup.8          d30, d2[6]              ;coeffabs_6 = vdup_lane_u8(coeffabs,
+                                            ; 6)
+    vdup.8          d31, d2[7]              ;coeffabs_7 = vdup_lane_u8(coeffabs,
+                                            ; 7)
+    mov             r7, r1
+    cmp             r10, #4
+    ble             outer_loop_4
+
+    cmp             r10, #24
+    moveq           r10, #16
+    addeq           r8, #8
+    addeq           r9, #8
+    cmp             r10, #16
+    bge             outer_loop_16
+
+    cmp             r10, #12
+    addeq           r8, #4
+    addeq           r9, #4
+    b               outer_loop_8
+
+outer_loop8_residual
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    mov             r14, #32
+    add             r1, #16
+    add             r12, #16
+    mov             r10, #8
+    add             r8, #8
+    add             r9, #8
+
+outer_loop_8
+
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_8
+
+inner_loop_8
+    mov             r7, #0xc000
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {d1}, [r12], r11
+    vdup.16         q5, r7
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    mov             r7, #0x4000
+    vld1.u32        {d4}, [r12], r11
+    vmlal.u8        q4, d1, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {d5}, [r12], r11
+    vmlal.u8        q4, d3, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d6}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {d7}, [r12], r11
+    vmlsl.u8        q4, d2, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlal.u8        q4, d4, d28             ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d13}, [r4], r11
+    vmlsl.u8        q4, d5, d29             ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u32        {d14}, [r4], r11
+    vmlal.u8        q4, d6, d30             ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vld1.u32        {d15}, [r4], r11
+    vmlsl.u8        q4, d7, d31             ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u32        {d16}, [r4], r11        ;vector load pu1_src + src_strd
+    vdup.16         q11, r7
+    vmlal.u8        q5, d15, d27            ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {d17}, [r4], r11
+    vmlsl.u8        q5, d14, d26            ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {d18}, [r4], r11
+    vmlal.u8        q5, d16, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vld1.u32        {d19}, [r4], r11        ;vector load pu1_src + src_strd
+    vmlsl.u8        q5, d17, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vld1.u8         {d6}, [r1]
+    vqrshrun.s16    d20, q4, #6             ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q5, d18, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d19, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    vld1.u8         {d7}, [r6]
+    vrhadd.u8       d20, d20, d6
+    vmlsl.u8        q5, d12, d24            ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5, d13, d25            ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vst1.8          {d20}, [r1]!            ;store the result pu1_dst
+    vhadd.s16       q5, q5, q11
+    subs            r5, r5, #8              ;decrement the wd loop
+    vqrshrun.s16    d8, q5, #6              ;right shift and saturating narrow
+                                            ; result 2
+    vrhadd.u8       d8, d8, d7
+    vst1.8          {d8}, [r6]!             ;store the result pu1_dst
+    cmp             r5, #4
+    bgt             inner_loop_8
+
+end_inner_loop_8
+    subs            r14, r14, #2            ;decrement the ht loop
+    add             r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the dst pointer by
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_8
+
+    ldr             r10, [sp, #120]         ;loads wd
+    cmp             r10, #12
+    beq             outer_loop4_residual
+
+end_loops
+    b               end_func
+
+outer_loop_16
+    str             r0, [sp, #-4]!
+    str             r7, [sp, #-4]!
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    and             r0, r12, #31
+    mov             r7, #0xc000
+    sub             r5, r10, #0             ;checks wd
+    pld             [r4, r2, lsl #1]
+    pld             [r12, r2, lsl #1]
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    vdup.16         q4, r7
+    vld1.u32        {q1}, [r12], r11
+    vld1.u32        {q2}, [r12], r11
+    vld1.u32        {q3}, [r12], r11
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vld1.u32        {q6}, [r12], r11
+    vmlal.u8        q4, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q7}, [r12], r11
+    vmlsl.u8        q4, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q8}, [r12], r11
+    vmlal.u8        q4, d6, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9}, [r12], r11
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vdup.16         q10, r7
+    vmlal.u8        q4, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+
+inner_loop_16
+    vmlsl.u8        q10, d1, d24
+    vdup.16         q5, r7
+    vmlal.u8        q10, d3, d25
+    mov             r7, #0x4000
+    vdup.16         q11, r7
+    vmlsl.u8        q10, d5, d26
+    vld1.u32        {q0}, [r4], r11         ;vector load pu1_src
+    vhadd.s16       q4, q4, q11
+    vld1.u32        {q1}, [r4], r11
+    vmlal.u8        q10, d7, d27
+    add             r12, #8
+    subs            r5, r5, #16
+    vmlal.u8        q10, d13, d28
+    vld1.u32        {q2}, [r4], r11
+    vmlsl.u8        q10, d15, d29
+    vld1.u32        {q3}, [r4], r11
+    vqrshrun.s16    d8, q4, #6              ;right shift and saturating narrow
+                                            ; result 1
+    vmlal.u8        q10, d17, d30
+    vld1.u32        {q6}, [r4], r11
+    vmlsl.u8        q10, d19, d31
+    vld1.u32        {q7}, [r4], r11
+    add             r7, r1, #8
+    vmlsl.u8        q5, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vmlal.u8        q5, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u32        {q8}, [r4], r11
+    vhadd.s16       q10, q10, q11
+    vld1.u32        {q9}, [r4], r11
+    vld1.u8         {d0}, [r1]
+    vmlsl.u8        q5, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u8         {d2}, [r7]
+    vmlal.u8        q5, d6, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    add             r4, #8
+    mov             r7, #0xc000
+    vmlal.u8        q5, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q5, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    vqrshrun.s16    d9, q10, #6
+    vdup.16         q11, r7
+    vmlal.u8        q5, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vmlsl.u8        q5, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    mov             r7, #0x4000
+    vrhadd.u8       d8, d8, d0
+    vrhadd.u8       d9, d9, d2
+    vmlsl.u8        q11, d1, d24
+    vmlal.u8        q11, d3, d25
+    vdup.16         q10, r7
+    vmlsl.u8        q11, d5, d26
+    pld             [r12, r2, lsl #2]
+    pld             [r4, r2, lsl #2]
+    addeq           r12, r12, r9            ;increment the src pointer by
+                                            ; 2*src_strd-wd
+    addeq           r4, r12, r2             ;pu1_src + src_strd
+    vmlal.u8        q11, d7, d27
+    vmlal.u8        q11, d13, d28
+    vst1.8          {q4}, [r1]!             ;store the result pu1_dst
+    subeq           r14, r14, #2
+    vhadd.s16       q5, q5, q10
+    vmlsl.u8        q11, d15, d29
+    addeq           r1, r1, r8
+    vmlal.u8        q11, d17, d30
+    cmp             r14, #0
+    vmlsl.u8        q11, d19, d31
+    vqrshrun.s16    d10, q5, #6             ;right shift and saturating narrow
+                                            ; result 2
+    beq             epilog_16
+
+    vld1.u32        {q0}, [r12], r11        ;vector load pu1_src
+    mov             r7, #0xc000
+    cmp             r5, #0
+    vld1.u32        {q1}, [r12], r11
+    vhadd.s16       q11, q11, q10
+    vld1.u32        {q2}, [r12], r11
+    vdup.16         q4, r7
+    vmlsl.u8        q4, d0, d24             ;mul_res = vmlsl_u8(src[0_0],
+                                            ; coeffabs_0);
+    vdup.16         q10, r7
+    vld1.u32        {q3}, [r12], r11
+    add             r7, r6, #8
+    moveq           r5, r10
+    vld1.u8         {d0}, [r6]
+    vmlal.u8        q4, d2, d25             ;mul_res = vmlal_u8(src[0_1],
+                                            ; coeffabs_1);
+    vld1.u8         {d2}, [r7]
+    vqrshrun.s16    d11, q11, #6
+    vmlsl.u8        q4, d4, d26             ;mul_res = vmlsl_u8(src[0_2],
+                                            ; coeffabs_2);
+    vld1.u32        {q6}, [r12], r11
+    vrhadd.u8       d10, d10, d0
+    vld1.u32        {q7}, [r12], r11
+    vrhadd.u8       d11, d11, d2
+    vld1.u32        {q8}, [r12], r11
+    vmlal.u8        q4, d6, d27             ;mul_res = vmlal_u8(src[0_3],
+                                            ; coeffabs_3);
+    vld1.u32        {q9}, [r12], r11
+    vmlal.u8        q4, d12, d28            ;mul_res = vmlal_u8(src[0_4],
+                                            ; coeffabs_4);
+    vmlsl.u8        q4, d14, d29            ;mul_res = vmlsl_u8(src[0_5],
+                                            ; coeffabs_5);
+    mov             r7, #0xc000
+    vmlal.u8        q4, d16, d30            ;mul_res = vmlal_u8(src[0_6],
+                                            ; coeffabs_6);
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    vmlsl.u8        q4, d18, d31            ;mul_res = vmlsl_u8(src[0_7],
+                                            ; coeffabs_7);
+    addeq           r6, r1, r3              ;pu1_dst + dst_strd
+    b               inner_loop_16
+
+epilog_16
+    mov             r7, #0x4000
+    ldr             r0, [sp], #4
+    ldr             r10, [sp, #120]
+    vdup.16         q10, r7
+    vhadd.s16       q11, q11, q10
+    vqrshrun.s16    d11, q11, #6
+    add             r7, r6, #8
+    vld1.u8         {d20}, [r6]
+    vld1.u8         {d21}, [r7]
+    vrhadd.u8       d10, d10, d20
+    vrhadd.u8       d11, d11, d21
+    vst1.8          {q5}, [r6]!             ;store the result pu1_dst
+    ldr             r7, [sp], #4
+    cmp             r10, #24
+    beq             outer_loop8_residual
+
+end_loops1
+    b               end_func
+
+outer_loop4_residual
+    sub             r12, r0, #3             ;pu1_src - 3
+    mov             r1, r7
+    add             r1, #8
+    mov             r10, #4
+    add             r12, #8
+    mov             r14, #16
+    add             r8, #4
+    add             r9, #4
+
+outer_loop_4
+    add             r6, r1, r3              ;pu1_dst + dst_strd
+    add             r4, r12, r2             ;pu1_src + src_strd
+    subs            r5, r10, #0             ;checks wd
+    ble             end_inner_loop_4
+
+inner_loop_4
+    vld1.u32        {d0}, [r12], r11        ;vector load pu1_src
+    vld1.u32        {d1}, [r12], r11
+    vld1.u32        {d2}, [r12], r11
+    vld1.u32        {d3}, [r12], r11
+    vld1.u32        {d4}, [r12], r11
+    vld1.u32        {d5}, [r12], r11
+    vld1.u32        {d6}, [r12], r11
+    vld1.u32        {d7}, [r12], r11
+    sub             r12, r12, #4
+    vld1.u32        {d12}, [r4], r11        ;vector load pu1_src + src_strd
+    vld1.u32        {d13}, [r4], r11
+    vzip.32         d0, d12                 ;vector zip the i iteration and ii
+                                            ; iteration in single register
+    vld1.u32        {d14}, [r4], r11
+    vzip.32         d1, d13
+    vld1.u32        {d15}, [r4], r11
+    vzip.32         d2, d14
+    vld1.u32        {d16}, [r4], r11
+    vzip.32         d3, d15
+    vld1.u32        {d17}, [r4], r11
+    vzip.32         d4, d16
+    vld1.u32        {d18}, [r4], r11
+    vzip.32         d5, d17
+    vld1.u32        {d19}, [r4], r11
+    mov             r7, #0xc000
+    vdup.16         q4, r7
+    sub             r4, r4, #4
+    vzip.32         d6, d18
+    vzip.32         d7, d19
+    vmlal.u8        q4, d1, d25             ;arithmetic operations for ii
+                                            ; iteration at the same time
+    vmlsl.u8        q4, d0, d24
+    vmlsl.u8        q4, d2, d26
+    vmlal.u8        q4, d3, d27
+    vmlal.u8        q4, d4, d28
+    vmlsl.u8        q4, d5, d29
+    vmlal.u8        q4, d6, d30
+    vmlsl.u8        q4, d7, d31
+    mov             r7, #0x4000
+    vdup.16         q10, r7
+    vhadd.s16       q4, q4, q10
+    vqrshrun.s16    d8, q4, #6
+    vld1.u32        {d10[0]}, [r1]
+    vld1.u32        {d10[1]}, [r6]
+    vrhadd.u8       d8, d8, d10
+    vst1.32         {d8[0]},[r1]!           ;store the i iteration result which
+                                            ; is in upper part of the register
+    vst1.32         {d8[1]},[r6]!           ;store the ii iteration result which
+                                            ; is in lower part of the register
+    subs            r5, r5, #4              ;decrement the wd by 4
+    bgt             inner_loop_4
+
+end_inner_loop_4
+    subs            r14, r14, #2            ;decrement the ht by 2
+    add             r12, r12, r9            ;increment the input pointer
+                                            ; 2*src_strd-wd
+    add             r1, r1, r8              ;increment the output pointer
+                                            ; 2*dst_strd-wd
+    bgt             outer_loop_4
+
+end_func
+    vpop            {d8 - d15}
+    ldmfd           sp!, {r4 - r12, r15}    ;reload the registers from sp
+
+    ENDP
+
+ END
--- a/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm
+++ /dev/null
@@ -1,295 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- ; These functions are only valid when:
- ; x_step_q4 == 16
- ; w%4 == 0
- ; h%4 == 0
- ; taps == 8
- ; VP9_FILTER_WEIGHT == 128
- ; VP9_FILTER_SHIFT == 7
-
- EXPORT |vpx_convolve8_avg_horiz_neon|
- EXPORT |vpx_convolve8_avg_vert_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Multiply and accumulate by q0
- MACRO
- MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
- vmull.s16 $dst, $src0, d0[0]
- vmlal.s16 $dst, $src1, d0[1]
- vmlal.s16 $dst, $src2, d0[2]
- vmlal.s16 $dst, $src3, d0[3]
- vmlal.s16 $dst, $src4, d1[0]
- vmlal.s16 $dst, $src5, d1[1]
- vmlal.s16 $dst, $src6, d1[2]
- vmlal.s16 $dst, $src7, d1[3]
- MEND
-
-; r0 const uint8_t *src
-; r1 int src_stride
-; r2 uint8_t *dst
-; r3 int dst_stride
-; sp[]const int16_t *filter
-; sp[]int x0_q4
-; sp[]int x_step_q4 ; unused
-; sp[]int y0_q4
-; sp[]int y_step_q4 ; unused
-; sp[]int w
-; sp[]int h
-
-|vpx_convolve8_avg_horiz_neon| PROC
- push {r4-r10, lr}
-
- sub r0, r0, #3 ; adjust for taps
-
- ldrd r4, r5, [sp, #32] ; filter, x0_q4
- add r4, r5, lsl #4
- ldrd r6, r7, [sp, #52] ; w, h
-
- vld1.s16 {q0}, [r4] ; filter
-
- sub r8, r1, r1, lsl #2 ; -src_stride * 3
- add r8, r8, #4 ; -src_stride * 3 + 4
-
- sub r4, r3, r3, lsl #2 ; -dst_stride * 3
- add r4, r4, #4 ; -dst_stride * 3 + 4
-
- rsb r9, r6, r1, lsl #2 ; reset src for outer loop
- sub r9, r9, #7
- rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
-
- mov r10, r6 ; w loop counter
-
-vpx_convolve8_avg_loop_horiz_v
- vld1.8 {d24}, [r0], r1
- vld1.8 {d25}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d27}, [r0], r8
-
- vtrn.16 q12, q13
- vtrn.8 d24, d25
- vtrn.8 d26, d27
-
- pld [r0, r1, lsl #2]
-
- vmovl.u8 q8, d24
- vmovl.u8 q9, d25
- vmovl.u8 q10, d26
- vmovl.u8 q11, d27
-
- ; save a few instructions in the inner loop
- vswp d17, d18
- vmov d23, d21
-
- add r0, r0, #3
-
-vpx_convolve8_avg_loop_horiz
- add r5, r0, #64
-
- vld1.32 {d28[]}, [r0], r1
- vld1.32 {d29[]}, [r0], r1
- vld1.32 {d31[]}, [r0], r1
- vld1.32 {d30[]}, [r0], r8
-
- pld [r5]
-
- vtrn.16 d28, d31
- vtrn.16 d29, d30
- vtrn.8 d28, d29
- vtrn.8 d31, d30
-
- pld [r5, r1]
-
- ; extract to s16
- vtrn.32 q14, q15
- vmovl.u8 q12, d28
- vmovl.u8 q13, d29
-
- pld [r5, r1, lsl #1]
-
- ; slightly out of order load to match the existing data
- vld1.u32 {d6[0]}, [r2], r3
- vld1.u32 {d7[0]}, [r2], r3
- vld1.u32 {d6[1]}, [r2], r3
- vld1.u32 {d7[1]}, [r2], r3
-
- sub r2, r2, r3, lsl #2 ; reset for store
-
- ; src[] * filter
- MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
- MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
- MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
- MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
-
- pld [r5, -r8]
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- ; transpose
- vtrn.16 d2, d3
- vtrn.32 d2, d3
- vtrn.8 d2, d3
-
- ; average the new value and the dst value
- vrhadd.u8 q1, q1, q3
-
- vst1.u32 {d2[0]}, [r2@32], r3
- vst1.u32 {d3[0]}, [r2@32], r3
- vst1.u32 {d2[1]}, [r2@32], r3
- vst1.u32 {d3[1]}, [r2@32], r4
-
- vmov q8, q9
- vmov d20, d23
- vmov q11, q12
- vmov q9, q13
-
- subs r6, r6, #4 ; w -= 4
- bgt vpx_convolve8_avg_loop_horiz
-
- ; outer loop
- mov r6, r10 ; restore w counter
- add r0, r0, r9 ; src += src_stride * 4 - w
- add r2, r2, r12 ; dst += dst_stride * 4 - w
- subs r7, r7, #4 ; h -= 4
- bgt vpx_convolve8_avg_loop_horiz_v
-
- pop {r4-r10, pc}
-
- ENDP
-
-|vpx_convolve8_avg_vert_neon| PROC
- push {r4-r8, lr}
-
- ; adjust for taps
- sub r0, r0, r1
- sub r0, r0, r1, lsl #1
-
- ldr r4, [sp, #24] ; filter
- ldr r5, [sp, #36] ; y0_q4
- add r4, r5, lsl #4
- ldr r6, [sp, #44] ; w
- ldr lr, [sp, #48] ; h
-
- vld1.s16 {q0}, [r4] ; filter
-
- lsl r1, r1, #1
- lsl r3, r3, #1
-
-vpx_convolve8_avg_loop_vert_h
- mov r4, r0
- add r7, r0, r1, asr #1
- mov r5, r2
- add r8, r2, r3, asr #1
- mov r12, lr ; h loop counter
-
- vld1.u32 {d16[0]}, [r4], r1
- vld1.u32 {d16[1]}, [r7], r1
- vld1.u32 {d18[0]}, [r4], r1
- vld1.u32 {d18[1]}, [r7], r1
- vld1.u32 {d20[0]}, [r4], r1
- vld1.u32 {d20[1]}, [r7], r1
- vld1.u32 {d22[0]}, [r4], r1
-
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
-
-vpx_convolve8_avg_loop_vert
- ; always process a 4x4 block at a time
- vld1.u32 {d24[0]}, [r7], r1
- vld1.u32 {d26[0]}, [r4], r1
- vld1.u32 {d26[1]}, [r7], r1
- vld1.u32 {d24[1]}, [r4], r1
-
- ; extract to s16
- vmovl.u8 q12, d24
- vmovl.u8 q13, d26
-
- vld1.u32 {d6[0]}, [r5@32], r3
- vld1.u32 {d6[1]}, [r8@32], r3
- vld1.u32 {d7[0]}, [r5@32], r3
- vld1.u32 {d7[1]}, [r8@32], r3
-
- pld [r7]
- pld [r4]
-
- ; src[] * filter
- MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
-
- pld [r7, r1]
- pld [r4, r1]
-
- MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
-
- pld [r5]
- pld [r8]
-
- MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
-
- pld [r5, r3]
- pld [r8, r3]
-
- MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- ; average the new value and the dst value
- vrhadd.u8 q1, q1, q3
-
- sub r5, r5, r3, lsl #1 ; reset for store
- sub r8, r8, r3, lsl #1
-
- vst1.u32 {d2[0]}, [r5@32], r3
- vst1.u32 {d2[1]}, [r8@32], r3
- vst1.u32 {d3[0]}, [r5@32], r3
- vst1.u32 {d3[1]}, [r8@32], r3
-
- vmov q8, q10
- vmov d18, d22
- vmov d19, d24
- vmov q10, q13
- vmov d22, d25
-
- subs r12, r12, #4 ; h -= 4
- bgt vpx_convolve8_avg_loop_vert
-
- ; outer loop
- add r0, r0, #4
- add r2, r2, #4
- subs r6, r6, #4 ; w -= 4
- bgt vpx_convolve8_avg_loop_vert_h
-
- pop {r4-r8, pc}
-
- ENDP
- END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm
@@ -1,0 +1,487 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => wd
+; r3 => ht
+
+ EXPORT |vpx_convolve8_avg_vert_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+ .syntax unified
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type1_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4 ;swap r1<->r2: r1 = dst, r2 = src_stride
+ vmov.i16 q15, #0x4000 ;bias added back by vhadd halving step
+ mov r11, #0xc000 ;acc seed (-0x4000 as s16), cancels q15 bias
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3 ;r6 = dst_stride
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+ sub r12, r2, r2, lsl #2 ;r12 = -3 * src_stride
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;pu1_src -= 3 * src_stride
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r3->ht
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r5 ->wd
+ rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r4, r2, lsl #2 ;r2->src_strd
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+ sub r7, #4 ;subtract by one for epilog
+
+prolog
+ and r10, r0, #31
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ pld [r3, r2]
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r3, r3, r2
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ add r14, r1, r6
+ vmlal.u8 q6, d7, d27
+ vmlsl.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9
+ vmlsl.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlal.u8 q7, d5, d24
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d6, d25
+ vrhadd.u8 d10, d10, d20
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vld1.u8 {d20}, [r14]
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ addle r0, r0, r8
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vrhadd.u8 d12, d12, d20
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ addle r1, r1, r9
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlsl.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ; 10*strd - 8+2
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ add r10, r10, r2 ; 11*strd
+ vmlal.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlal.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlsl.u8 q6, d16, d28
+ add r10, r10, r2 ;12*strd
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ subs r7, r7, #4
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vld1.u8 {d20}, [r14]
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vrhadd.u8 d12, d12, d20
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vst1.8 {d12}, [r14], r6
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vst1.8 {d14}, [r14], r6
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vmlal.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlal.u8 q6, d7, d27
+ add r14, r1, r6
+ vmlsl.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlal.u8 q7, d16, d27
+ vmlsl.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d12, d12, d20
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d14, d14, d20
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+ vpopeq {d8 - d15}
+ ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r5, r2, lsl #2 ;r2->src_strd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11
+ vmlsl.u8 q0, d5, d23 ;mul_res1 =
+ ; vmlsl_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlal.u8 q0, d6, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlsl.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlal.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlal.u8 q1, d5, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlal.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlsl.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vld1.u32 {d20[0]}, [r1]
+ vld1.u32 {d20[1]}, [r3]
+ vrhadd.u8 d0, d0, d20
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ mov r4, r3
+ vld1.u32 {d20[0]}, [r4], r6
+ vld1.u32 {d20[1]}, [r4]
+ vrhadd.u8 d8, d8, d20
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm
@@ -1,0 +1,488 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => wd
+; r3 => ht
+
+ EXPORT |vpx_convolve8_avg_vert_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+ .syntax unified
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_avg_vert_filter_type2_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1
+ mov r1, r2
+ mov r2, r4 ;swap r1<->r2: r1 = dst, r2 = src_stride
+ vmov.i16 q15, #0x4000 ;bias added back by vhadd halving step
+ mov r11, #0xc000 ;acc seed (-0x4000 as s16), cancels q15 bias
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3 ;r6 = dst_stride
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+ sub r12, r2, r2, lsl #2 ;r12 = -3 * src_stride
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;pu1_src -= 3 * src_stride
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r3->ht
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+
+ str r0, [sp, #-4]!
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r5 ->wd
+ rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r4, r2, lsl #2 ;r2->src_strd
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+ sub r7, #4 ;subtract by one for epilog
+
+prolog
+ and r10, r0, #31
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ addle r0, r0, r8
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ pld [r3, r2]
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r3, r3, r2
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ add r14, r1, r6
+ vmlsl.u8 q6, d7, d27
+ vmlal.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9
+ vmlal.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlsl.u8 q7, d5, d24
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d6, d25
+ vrhadd.u8 d10, d10, d20
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vld1.u8 {d20}, [r14]
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ addle r0, r0, r8
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vrhadd.u8 d12, d12, d20
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ addle r1, r1, r9
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlal.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ; 10*strd - 8+2
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ add r10, r10, r2 ; 11*strd
+ vmlsl.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlsl.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlal.u8 q6, d16, d28
+ add r10, r10, r2 ;12*strd
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ subs r7, r7, #4
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vld1.u8 {d20}, [r14]
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vrhadd.u8 d12, d12, d20
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vst1.8 {d12}, [r14], r6
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vrhadd.u8 d14, d14, d20
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vst1.8 {d14}, [r14], r6
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vld1.u8 {d20}, [r1]
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vrhadd.u8 d8, d8, d20
+ vmlsl.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlsl.u8 q6, d7, d27
+ add r14, r1, r6
+ vmlal.u8 q6, d16, d28
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vmlsl.u8 q6, d17, d29
+ vld1.u8 {d20}, [r14]
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vrhadd.u8 d10, d10, d20
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlsl.u8 q7, d16, d27
+ vmlal.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d12, d12, d20
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d20}, [r14]
+ vrhadd.u8 d14, d14, d20
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7
+ ldr r1, [sp], #4
+ ldr r0, [sp], #4
+ vpopeq {d8 - d15}
+ ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ mov r5, #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd
+ rsb r8, r5, r2, lsl #2 ;r2->src_strd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11
+ vmlal.u8 q0, d5, d23 ;mul_res1 =
+ ; vmlal_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlal.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlsl.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlsl.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlal.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vld1.u32 {d20[0]}, [r1]
+ vld1.u32 {d20[1]}, [r3]
+ vrhadd.u8 d0, d0, d20
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ mov r4, r3
+ vld1.u32 {d20[0]}, [r4], r6
+ vld1.u32 {d20[1]}, [r4]
+ vrhadd.u8 d8, d8, d20
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm
@@ -1,0 +1,415 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_horiz_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type1_neon| PROC
+
+ ; Horizontal 8-tap convolve, "type1" tap-sign layout: taps 0, 1, 6 and 7
+ ; are negative (see the vmlsl/vmlal pattern below). Taps are applied as
+ ; absolute values (vabs.s8) with each tap's sign folded into the choice
+ ; of vmlal (accumulate) vs vmlsl (subtract). Accumulators are pre-biased
+ ; with 0xc000 and averaged against 0x4000 (vhadd) before the rounding
+ ; narrow (vqrshrun #6).
+ ; NOTE(review): appears to assume x_step_q4 == 16 and wd/ht multiples
+ ; of 4 - confirm against the vpx_convolve8 contract.
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1 ;swap r1/r2 so that r1 = dst and
+ mov r1, r2 ; r2 = src_stride (matches the
+ mov r2, r4 ; register map in the file header)
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff)
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1 ;preserve original dst pointer
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ ; reached from epilog_16 when the reloaded width compares equal to 24
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
+ mov r7, #0xc000 ;accumulator bias
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000 ;rounding constant for vhadd
+ vld1.u32 {d4}, [r12], r11
+ vmlsl.u8 q4, d1, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlal.u8 q4, d2, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlal.u8 q4, d5, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlsl.u8 q4, d6, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlal.u8 q5, d14, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q5, d17, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vmlsl.u8 q5, d18, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d13, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ldr r10, [sp, #120] ;loads wd
+ ; NOTE(review): with the AAPCS layout used above (wd read at #124),
+ ; #120 is the y_step_q4 slot - confirm; only matters for the wd == 12
+ ; residual path.
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]! ;save src; restored in epilog_16
+ str r7, [sp, #-4]! ;save dst copy; restored in epilog_16
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlsl.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlal.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlal.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlsl.u8 q5, d2, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vmlal.u8 q5, d4, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vmlal.u8 q5, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlal.u8 q5, d14, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlsl.u8 q5, d16, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vmlsl.u8 q11, d1, d24
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ vmlsl.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlal.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ addeq r1, r1, r8
+ subeq r14, r14, #2
+ vmlal.u8 q11, d13, d28
+ vhadd.s16 q5, q5, q10
+ vmlal.u8 q11, d15, d29
+ vmlsl.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vld1.u32 {q7}, [r12], r11
+ vmlsl.u8 q4, d2, d25 ;mul_res = vmlsl_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d4, d26 ;mul_res = vmlal_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q9}, [r12], r11
+ vqrshrun.s16 d11, q11, #6
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ moveq r5, r10
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vdup.16 q10, r7
+ vmlal.u8 q4, d14, d29 ;mul_res = vmlal_u8(src[0_5],
+ ; coeffabs_5);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlsl.u8 q4, d16, d30 ;mul_res = vmlsl_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ ; NOTE(review): the two pops restore in push order, so r0/r7 receive
+ ; each other's saved values, and the [sp, #120] load above reads the
+ ; y0_q4 slot; consequences are limited to the wd == 24 branch below -
+ ; confirm whether that path is reachable in vpx.
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+ ; iteration in single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii
+ ; iteration in the same time
+ vmlsl.u8 q4, d0, d24
+ vmlal.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlal.u8 q4, d5, d29
+ vmlsl.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result which
+ ; is in upper part of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result which
+ ; is in lower part of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+ subs r14, r14, #2 ;decrement the ht by 2 (two rows
+ ; per pass)
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm
@@ -1,0 +1,415 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r3 => dst_stride
+; r4 => filter_x0
+; r8 => ht
+; r10 => wd
+
+ EXPORT |vpx_convolve8_horiz_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_horiz_filter_type2_neon| PROC
+
+ ; Horizontal 8-tap convolve, "type2" tap-sign layout: taps 0, 2, 5 and 7
+ ; are negative (see the vmlsl/vmlal pattern below). Taps are applied as
+ ; absolute values (vabs.s8) with each tap's sign folded into the choice
+ ; of vmlal (accumulate) vs vmlsl (subtract). Accumulators are pre-biased
+ ; with 0xc000 and averaged against 0x4000 (vhadd) before the rounding
+ ; narrow (vqrshrun #6).
+ ; NOTE(review): appears to assume x_step_q4 == 16 and wd/ht multiples
+ ; of 4 - confirm against the vpx_convolve8 contract.
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1 ;swap r1/r2 so that r1 = dst and
+ mov r1, r2 ; r2 = src_stride (matches the
+ mov r2, r4 ; register map in the file header)
+
+start_loop_count
+ ldr r4, [sp, #104] ;loads pi1_coeff
+ ldr r8, [sp, #108] ;loads x0_q4
+ add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4]
+ ldr r8, [sp, #128] ;loads ht
+ ldr r10, [sp, #124] ;loads wd
+ vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff)
+ mov r11, #1
+ subs r14, r8, #0 ;checks for ht == 0
+ vabs.s8 d2, d0 ;vabs_s8(coeff)
+ vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0)
+ sub r12, r0, #3 ;pu1_src - 3
+ vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1)
+ add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd
+ vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2)
+ rsb r9, r10, r2, lsl #1 ;2*src_strd - wd
+ vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3)
+ rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd
+ vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4)
+ vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5)
+ vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6)
+ vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7)
+ mov r7, r1 ;preserve original dst pointer
+ cmp r10, #4
+ ble outer_loop_4
+
+ cmp r10, #24
+ moveq r10, #16
+ addeq r8, #8
+ addeq r9, #8
+ cmp r10, #16
+ bge outer_loop_16
+
+ cmp r10, #12
+ addeq r8, #4
+ addeq r9, #4
+ b outer_loop_8
+
+outer_loop8_residual
+ ; reached from epilog_16 when the reloaded width compares equal to 24
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ mov r14, #32
+ add r1, #16
+ add r12, #16
+ mov r10, #8
+ add r8, #8
+ add r9, #8
+
+outer_loop_8
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_8
+
+inner_loop_8
+ mov r7, #0xc000 ;accumulator bias
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {d1}, [r12], r11
+ vdup.16 q5, r7
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ mov r7, #0x4000 ;rounding constant for vhadd
+ vld1.u32 {d4}, [r12], r11
+ vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {d5}, [r12], r11
+ vmlal.u8 q4, d3, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d6}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {d7}, [r12], r11
+ vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d13}, [r4], r11
+ vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vld1.u32 {d14}, [r4], r11
+ vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vld1.u32 {d15}, [r4], r11
+ vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd
+ vdup.16 q11, r7
+ vmlal.u8 q5, d15, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {d17}, [r4], r11
+ vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {d18}, [r4], r11
+ vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd
+ vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vst1.8 {d20}, [r1]! ;store the result pu1_dst
+ vhadd.s16 q5, q5, q11
+ subs r5, r5, #8 ;decrement the wd loop
+ vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ vst1.8 {d8}, [r6]! ;store the result pu1_dst
+ cmp r5, #4
+ bgt inner_loop_8
+
+end_inner_loop_8
+ subs r14, r14, #2 ;decrement the ht loop
+ add r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the dst pointer by
+ ; 2*dst_strd-wd
+ bgt outer_loop_8
+
+ ldr r10, [sp, #120] ;loads wd
+ ; NOTE(review): with the AAPCS layout used above (wd read at #124),
+ ; #120 is the y_step_q4 slot - confirm; only matters for the wd == 12
+ ; residual path.
+ cmp r10, #12
+ beq outer_loop4_residual
+
+end_loops
+ b end_func
+
+outer_loop_16
+ str r0, [sp, #-4]! ;save src; restored in epilog_16
+ str r7, [sp, #-4]! ;save dst copy; restored in epilog_16
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ and r0, r12, #31
+ mov r7, #0xc000
+ sub r5, r10, #0 ;checks wd
+ pld [r4, r2, lsl #1]
+ pld [r12, r2, lsl #1]
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ vdup.16 q4, r7
+ vld1.u32 {q1}, [r12], r11
+ vld1.u32 {q2}, [r12], r11
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q7}, [r12], r11
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q8}, [r12], r11
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ vld1.u32 {q9}, [r12], r11
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vdup.16 q10, r7
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+
+inner_loop_16
+ vmlsl.u8 q10, d1, d24
+ vdup.16 q5, r7
+ vmlal.u8 q10, d3, d25
+ mov r7, #0x4000
+ vdup.16 q11, r7
+ vmlsl.u8 q10, d5, d26
+ vld1.u32 {q0}, [r4], r11 ;vector load pu1_src
+ vhadd.s16 q4, q4, q11
+ vld1.u32 {q1}, [r4], r11
+ vmlal.u8 q10, d7, d27
+ add r12, #8
+ subs r5, r5, #16
+ vmlal.u8 q10, d13, d28
+ vld1.u32 {q2}, [r4], r11
+ vmlsl.u8 q10, d15, d29
+ vld1.u32 {q3}, [r4], r11
+ vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow
+ ; result 1
+ vmlal.u8 q10, d17, d30
+ vld1.u32 {q6}, [r4], r11
+ vmlsl.u8 q10, d19, d31
+ vld1.u32 {q7}, [r4], r11
+ vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r4], r11
+ vhadd.s16 q10, q10, q11
+ vld1.u32 {q9}, [r4], r11
+ vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vmlal.u8 q5, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ add r4, #8
+ mov r7, #0xc000
+ vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vqrshrun.s16 d9, q10, #6
+ vdup.16 q11, r7
+ vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ mov r7, #0x4000
+ vmlsl.u8 q11, d1, d24
+ vst1.8 {q4}, [r1]! ;store the result pu1_dst
+ vmlal.u8 q11, d3, d25
+ vdup.16 q10, r7
+ vmlsl.u8 q11, d5, d26
+ pld [r12, r2, lsl #2]
+ pld [r4, r2, lsl #2]
+ addeq r12, r12, r9 ;increment the src pointer by
+ ; 2*src_strd-wd
+ addeq r4, r12, r2 ;pu1_src + src_strd
+ vmlal.u8 q11, d7, d27
+ addeq r1, r1, r8
+ subeq r14, r14, #2
+ vmlal.u8 q11, d13, d28
+ vhadd.s16 q5, q5, q10
+ vmlsl.u8 q11, d15, d29
+ vmlal.u8 q11, d17, d30
+ cmp r14, #0
+ vmlsl.u8 q11, d19, d31
+ vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow
+ ; result 2
+ beq epilog_16
+
+ vld1.u32 {q0}, [r12], r11 ;vector load pu1_src
+ mov r7, #0xc000
+ cmp r5, #0
+ vld1.u32 {q1}, [r12], r11
+ vhadd.s16 q11, q11, q10
+ vld1.u32 {q2}, [r12], r11
+ vdup.16 q4, r7
+ vld1.u32 {q3}, [r12], r11
+ vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0],
+ ; coeffabs_0);
+ vld1.u32 {q6}, [r12], r11
+ vld1.u32 {q7}, [r12], r11
+ vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1],
+ ; coeffabs_1);
+ vld1.u32 {q8}, [r12], r11
+ vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2],
+ ; coeffabs_2);
+ vld1.u32 {q9}, [r12], r11
+ vqrshrun.s16 d11, q11, #6
+ vmlal.u8 q4, d6, d27 ;mul_res = vmlal_u8(src[0_3],
+ ; coeffabs_3);
+ moveq r5, r10
+ vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4],
+ ; coeffabs_4);
+ vdup.16 q10, r7
+ vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5],
+ ; coeffabs_5);
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6],
+ ; coeffabs_6);
+ vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7],
+ ; coeffabs_7);
+ addeq r6, r1, r3 ;pu1_dst + dst_strd
+ b inner_loop_16
+
+epilog_16
+ mov r7, #0x4000
+ ldr r0, [sp], #4
+ ldr r10, [sp, #120]
+ vdup.16 q10, r7
+ vhadd.s16 q11, q11, q10
+ vqrshrun.s16 d11, q11, #6
+ vst1.8 {q5}, [r6]! ;store the result pu1_dst
+ ldr r7, [sp], #4
+ ; NOTE(review): the two pops restore in push order, so r0/r7 receive
+ ; each other's saved values, and the [sp, #120] load above reads the
+ ; y0_q4 slot; consequences are limited to the wd == 24 branch below -
+ ; confirm whether that path is reachable in vpx.
+ cmp r10, #24
+ beq outer_loop8_residual
+
+end_loops1
+ b end_func
+
+outer_loop4_residual
+ sub r12, r0, #3 ;pu1_src - 3
+ mov r1, r7
+ add r1, #8
+ mov r10, #4
+ add r12, #8
+ mov r14, #16
+ add r8, #4
+ add r9, #4
+
+outer_loop_4
+ add r6, r1, r3 ;pu1_dst + dst_strd
+ add r4, r12, r2 ;pu1_src + src_strd
+ subs r5, r10, #0 ;checks wd
+ ble end_inner_loop_4
+
+inner_loop_4
+ vld1.u32 {d0}, [r12], r11 ;vector load pu1_src
+ vld1.u32 {d1}, [r12], r11
+ vld1.u32 {d2}, [r12], r11
+ vld1.u32 {d3}, [r12], r11
+ vld1.u32 {d4}, [r12], r11
+ vld1.u32 {d5}, [r12], r11
+ vld1.u32 {d6}, [r12], r11
+ vld1.u32 {d7}, [r12], r11
+ sub r12, r12, #4
+ vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd
+ vld1.u32 {d13}, [r4], r11
+ vzip.32 d0, d12 ;vector zip the i iteration and ii
+ ; iteration in single register
+ vld1.u32 {d14}, [r4], r11
+ vzip.32 d1, d13
+ vld1.u32 {d15}, [r4], r11
+ vzip.32 d2, d14
+ vld1.u32 {d16}, [r4], r11
+ vzip.32 d3, d15
+ vld1.u32 {d17}, [r4], r11
+ vzip.32 d4, d16
+ vld1.u32 {d18}, [r4], r11
+ vzip.32 d5, d17
+ vld1.u32 {d19}, [r4], r11
+ mov r7, #0xc000
+ vdup.16 q4, r7
+ sub r4, r4, #4
+ vzip.32 d6, d18
+ vzip.32 d7, d19
+ vmlal.u8 q4, d1, d25 ;arithmetic operations for ii
+ ; iteration in the same time
+ vmlsl.u8 q4, d0, d24
+ vmlsl.u8 q4, d2, d26
+ vmlal.u8 q4, d3, d27
+ vmlal.u8 q4, d4, d28
+ vmlsl.u8 q4, d5, d29
+ vmlal.u8 q4, d6, d30
+ vmlsl.u8 q4, d7, d31
+ mov r7, #0x4000
+ vdup.16 q10, r7
+ vhadd.s16 q4, q4, q10
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r1]! ;store the i iteration result which
+ ; is in upper part of the register
+ vst1.32 {d8[1]},[r6]! ;store the ii iteration result which
+ ; is in lower part of the register
+ subs r5, r5, #4 ;decrement the wd by 4
+ bgt inner_loop_4
+
+end_inner_loop_4
+ subs r14, r14, #2 ;decrement the ht by 2 (two rows
+ ; per pass)
+ add r12, r12, r9 ;increment the input pointer
+ ; 2*src_strd-wd
+ add r1, r1, r8 ;increment the output pointer
+ ; 2*dst_strd-wd
+ bgt outer_loop_4
+
+end_func
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
--- a/vpx_dsp/arm/vpx_convolve8_neon_asm.asm
+++ /dev/null
@@ -1,273 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- ; These functions are only valid when:
- ; x_step_q4 == 16
- ; w%4 == 0
- ; h%4 == 0
- ; taps == 8
- ; VP9_FILTER_WEIGHT == 128
- ; VP9_FILTER_SHIFT == 7
-
- EXPORT |vpx_convolve8_horiz_neon|
- EXPORT |vpx_convolve8_vert_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Multiply and accumulate by q0
- MACRO
- MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
- vmull.s16 $dst, $src0, d0[0]
- vmlal.s16 $dst, $src1, d0[1]
- vmlal.s16 $dst, $src2, d0[2]
- vmlal.s16 $dst, $src3, d0[3]
- vmlal.s16 $dst, $src4, d1[0]
- vmlal.s16 $dst, $src5, d1[1]
- vmlal.s16 $dst, $src6, d1[2]
- vmlal.s16 $dst, $src7, d1[3]
- MEND
-
-; r0 const uint8_t *src
-; r1 int src_stride
-; r2 uint8_t *dst
-; r3 int dst_stride
-; sp[]const int16_t *filter
-; sp[]int x0_q4
-; sp[]int x_step_q4 ; unused
-; sp[]int y0_q4
-; sp[]int y_step_q4 ; unused
-; sp[]int w
-; sp[]int h
-
-|vpx_convolve8_horiz_neon| PROC
- push {r4-r10, lr}
-
- sub r0, r0, #3 ; adjust for taps
-
- ldrd r4, r5, [sp, #32] ; filter, x0_q4
- add r4, r5, lsl #4
- ldrd r6, r7, [sp, #52] ; w, h
-
- vld1.s16 {q0}, [r4] ; filter
-
- sub r8, r1, r1, lsl #2 ; -src_stride * 3
- add r8, r8, #4 ; -src_stride * 3 + 4
-
- sub r4, r3, r3, lsl #2 ; -dst_stride * 3
- add r4, r4, #4 ; -dst_stride * 3 + 4
-
- rsb r9, r6, r1, lsl #2 ; reset src for outer loop
- sub r9, r9, #7
- rsb r12, r6, r3, lsl #2 ; reset dst for outer loop
-
- mov r10, r6 ; w loop counter
-
-vpx_convolve8_loop_horiz_v
- vld1.8 {d24}, [r0], r1
- vld1.8 {d25}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d27}, [r0], r8
-
- vtrn.16 q12, q13
- vtrn.8 d24, d25
- vtrn.8 d26, d27
-
- pld [r0, r1, lsl #2]
-
- vmovl.u8 q8, d24
- vmovl.u8 q9, d25
- vmovl.u8 q10, d26
- vmovl.u8 q11, d27
-
- ; save a few instructions in the inner loop
- vswp d17, d18
- vmov d23, d21
-
- add r0, r0, #3
-
-vpx_convolve8_loop_horiz
- add r5, r0, #64
-
- vld1.32 {d28[]}, [r0], r1
- vld1.32 {d29[]}, [r0], r1
- vld1.32 {d31[]}, [r0], r1
- vld1.32 {d30[]}, [r0], r8
-
- pld [r5]
-
- vtrn.16 d28, d31
- vtrn.16 d29, d30
- vtrn.8 d28, d29
- vtrn.8 d31, d30
-
- pld [r5, r1]
-
- ; extract to s16
- vtrn.32 q14, q15
- vmovl.u8 q12, d28
- vmovl.u8 q13, d29
-
- pld [r5, r1, lsl #1]
-
- ; src[] * filter
- MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24
- MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26
- MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27
- MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25
-
- pld [r5, -r8]
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- ; transpose
- vtrn.16 d2, d3
- vtrn.32 d2, d3
- vtrn.8 d2, d3
-
- vst1.u32 {d2[0]}, [r2@32], r3
- vst1.u32 {d3[0]}, [r2@32], r3
- vst1.u32 {d2[1]}, [r2@32], r3
- vst1.u32 {d3[1]}, [r2@32], r4
-
- vmov q8, q9
- vmov d20, d23
- vmov q11, q12
- vmov q9, q13
-
- subs r6, r6, #4 ; w -= 4
- bgt vpx_convolve8_loop_horiz
-
- ; outer loop
- mov r6, r10 ; restore w counter
- add r0, r0, r9 ; src += src_stride * 4 - w
- add r2, r2, r12 ; dst += dst_stride * 4 - w
- subs r7, r7, #4 ; h -= 4
- bgt vpx_convolve8_loop_horiz_v
-
- pop {r4-r10, pc}
-
- ENDP
-
-|vpx_convolve8_vert_neon| PROC
- push {r4-r8, lr}
-
- ; adjust for taps
- sub r0, r0, r1
- sub r0, r0, r1, lsl #1
-
- ldr r4, [sp, #24] ; filter
- ldr r5, [sp, #36] ; y0_q4
- add r4, r5, lsl #4
- ldr r6, [sp, #44] ; w
- ldr lr, [sp, #48] ; h
-
- vld1.s16 {q0}, [r4] ; filter
-
- lsl r1, r1, #1
- lsl r3, r3, #1
-
-vpx_convolve8_loop_vert_h
- mov r4, r0
- add r7, r0, r1, asr #1
- mov r5, r2
- add r8, r2, r3, asr #1
- mov r12, lr ; h loop counter
-
- vld1.u32 {d16[0]}, [r4], r1
- vld1.u32 {d16[1]}, [r7], r1
- vld1.u32 {d18[0]}, [r4], r1
- vld1.u32 {d18[1]}, [r7], r1
- vld1.u32 {d20[0]}, [r4], r1
- vld1.u32 {d20[1]}, [r7], r1
- vld1.u32 {d22[0]}, [r4], r1
-
- vmovl.u8 q8, d16
- vmovl.u8 q9, d18
- vmovl.u8 q10, d20
- vmovl.u8 q11, d22
-
-vpx_convolve8_loop_vert
- ; always process a 4x4 block at a time
- vld1.u32 {d24[0]}, [r7], r1
- vld1.u32 {d26[0]}, [r4], r1
- vld1.u32 {d26[1]}, [r7], r1
- vld1.u32 {d24[1]}, [r4], r1
-
- ; extract to s16
- vmovl.u8 q12, d24
- vmovl.u8 q13, d26
-
- pld [r5]
- pld [r8]
-
- ; src[] * filter
- MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24
-
- pld [r5, r3]
- pld [r8, r3]
-
- MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26
-
- pld [r7]
- pld [r4]
-
- MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27
-
- pld [r7, r1]
- pld [r4, r1]
-
- MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25
-
- ; += 64 >> 7
- vqrshrun.s32 d2, q1, #7
- vqrshrun.s32 d3, q2, #7
- vqrshrun.s32 d4, q14, #7
- vqrshrun.s32 d5, q15, #7
-
- ; saturate
- vqmovn.u16 d2, q1
- vqmovn.u16 d3, q2
-
- vst1.u32 {d2[0]}, [r5@32], r3
- vst1.u32 {d2[1]}, [r8@32], r3
- vst1.u32 {d3[0]}, [r5@32], r3
- vst1.u32 {d3[1]}, [r8@32], r3
-
- vmov q8, q10
- vmov d18, d22
- vmov d19, d24
- vmov q10, q13
- vmov d22, d25
-
- subs r12, r12, #4 ; h -= 4
- bgt vpx_convolve8_loop_vert
-
- ; outer loop
- add r0, r0, #4
- add r2, r2, #4
- subs r6, r6, #4 ; w -= 4
- bgt vpx_convolve8_loop_vert_h
-
- pop {r4-r8, pc}
-
- ENDP
- END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.c
@@ -1,0 +1,41 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h"
+
+/* Type1 and Type2 functions are called depending on the position of the
+ * negative and positive coefficients in the filter. In type1, the filter
+ * kernel used is sub_pel_filters_8lp, in which only the first two and the
+ * last two coefficients are negative. In type2, the negative coefficients
+ * are 0, 2, 5 & 7.
+ */
+
+/* Defines the public vpx_convolve8_<dir>_neon entry point for one direction.
+ * Dispatch is by pointer identity: callers pass one of the tables from
+ * vp9_filter_kernels, so comparing the address of vp9_filter_kernels[1]
+ * (the 8lp kernel per the comment above) selects the type1 assembly;
+ * every other kernel table goes to type2.
+ */
+#define DEFINE_FILTER(dir) \
+ void vpx_convolve8_##dir##_neon( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \
+ if (filter == vp9_filter_kernels[1]) { \
+ vpx_convolve8_##dir##_filter_type1_neon( \
+ src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h); \
+ } else { \
+ vpx_convolve8_##dir##_filter_type2_neon( \
+ src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \
+ y_step_q4, w, h); \
+ } \
+ }
+
+/* NOTE(review): each trailing ';' below becomes an extra top-level ';'
+ * after macro expansion (flagged by -pedantic ISO C90). */
+DEFINE_FILTER(horiz);
+DEFINE_FILTER(avg_horiz);
+DEFINE_FILTER(vert);
+DEFINE_FILTER(avg_vert);
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.h
@@ -1,0 +1,29 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+#define VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
+
+/* Declares the eight hand-written NEON assembly convolve kernels:
+ * {horiz, avg_horiz, vert, avg_vert} x {type1, type2}. Each uses the
+ * standard vpx convolve parameter list so vpx_convolve8_neon_asm.c can
+ * dispatch to them by filter kernel.
+ * NOTE(review): this header is not self-contained - it uses uint8_t,
+ * ptrdiff_t and InterpKernel without including their headers, so it must
+ * be included after vp9/common/vp9_filter.h (as vpx_convolve8_neon_asm.c
+ * does) - confirm whether includes should be added here.
+ */
+#define DECLARE_FILTER(dir, type) \
+ void vpx_convolve8_##dir##_filter_##type##_neon( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
+ int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+
+DECLARE_FILTER(horiz, type1);
+DECLARE_FILTER(avg_horiz, type1);
+DECLARE_FILTER(horiz, type2);
+DECLARE_FILTER(avg_horiz, type2);
+DECLARE_FILTER(vert, type1);
+DECLARE_FILTER(avg_vert, type1);
+DECLARE_FILTER(vert, type2);
+DECLARE_FILTER(avg_vert, type2);
+
+#endif /* VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ */
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm
@@ -1,0 +1,458 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => wd
+; r3 => ht
+
+ EXPORT |vpx_convolve8_vert_filter_type1_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+ ; NOTE(review): ".syntax unified" is a GNU-as directive while the
+ ; surrounding EXPORT/AREA/PROC directives are armasm syntax - confirm
+ ; the toolchain (or the ads2gas conversion step) accepts this mix.
+ .syntax unified
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type1_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1 ;swap r1 <-> r2: r1 = src_stride,
+ mov r1, r2 ; r2 holds dst for now (r1/r2 are
+ mov r2, r4 ; swapped back by use below)
+ vmov.i16 q15, #0x4000 ;+0x4000; vhadd adds this back and
+ ; halves, cancelling the bias below
+ mov r11, #0xc000 ;accumulator bias -0x4000 (as s16);
+ ; bias + vhadd(+0x4000) halves the MAC
+ ; sum without s16 overflow, then
+ ; vqrshrun #6 completes the rounded >>7
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3 ;r6 = dst_stride
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+ sub r12, r2, r2, lsl #2 ;r12 = -3 * src_stride
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;src -= 3 * src_stride (8-tap filter
+ ; reads 3 rows above the output row)
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r7 = ht; sets flags (ht == 0?)
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8 ;wd < 8 -> narrow (wd 4) path
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+
+ str r0, [sp, #-4]! ;save src/dst for the wd%8 tail
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r4 = wd & ~7 (col counter)
+ rsb r9, r4, r6, lsl #2 ;r9 = 4*dst_stride - (wd & ~7)
+ rsb r8, r4, r2, lsl #2 ;r8 = 4*src_stride - (wd & ~7)
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+ sub r7, #4 ;subtract by one for epilog
+
+prolog
+ and r10, r0, #31 ;NOTE(review): result appears unused
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11 ;seed accumulator with -0x4000
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ addle r0, r0, r8 ;row group done -> advance src
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15 ;(acc + 0x4000) >> 1 == mac_sum >> 1
+ vdup.16 q6, r11
+ pld [r3, r2]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ pld [r3, r2, lsl #1]
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r3, r3, r2
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ vmlal.u8 q6, d7, d27
+ vmlsl.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9 ;row group done -> advance dst
+ vmlsl.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ addle r0, r0, r8 ;row group done -> advance src
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ addle r1, r1, r9 ;row group done -> advance dst
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlsl.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ;r10 = r3 + 8 * src_stride
+ vmlsl.u8 q6, d2, d22
+ add r10, r10, r2 ;r10 += src_stride (prefetch base)
+ vmlal.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlal.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ vmlsl.u8 q6, d16, d28
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlsl.u8 q6, d17, d29
+ add r10, r10, r2 ;12*strd
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ subs r7, r7, #4
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vmlsl.u8 q4, d1, d23 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlal.u8 q4, d2, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlal.u8 q4, d5, d27 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d2, d23 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlal.u8 q5, d3, d24 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlal.u8 q5, d6, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vst1.8 {d14}, [r14], r6
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vmlal.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlal.u8 q6, d7, d27
+ vmlsl.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vmlal.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlal.u8 q7, d16, d27
+ vmlsl.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7 ;any leftover width (wd % 8)?
+ ldr r1, [sp], #4 ;restore saved dst
+ ldr r0, [sp], #4 ;restore saved src
+ vpopeq {d8 - d15} ;wd multiple of 8: restore regs and
+ ldmfdeq sp!, {r4 - r12, r15} ; return ("eq" makes this pair the
+ ; function epilogue)
+ mov r5, #4 ;else process trailing 4 columns
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r9 = 4*dst_stride - wd
+ rsb r8, r5, r2, lsl #2 ;r8 = 4*src_stride - wd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11 ;seed accumulator with -0x4000
+ vmlsl.u8 q0, d5, d23 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlal.u8 q0, d6, d24 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlsl.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlal.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlal.u8 q1, d5, d27 ;mul_res2 = vmlal_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlal.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlsl.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
--- /dev/null
+++ b/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm
@@ -1,0 +1,456 @@
+;
+; Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+;**************Variables Vs Registers***********************************
+; r0 => src
+; r1 => dst
+; r2 => src_stride
+; r6 => dst_stride
+; r12 => filter_y0
+; r5 => wd
+; r3 => ht
+
+ EXPORT |vpx_convolve8_vert_filter_type2_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+ ; NOTE(review): ".syntax unified" is a GNU-as directive while the
+ ; surrounding EXPORT/AREA/PROC directives are armasm syntax - confirm
+ ; the toolchain (or the ads2gas conversion step) accepts this mix.
+ .syntax unified
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+|vpx_convolve8_vert_filter_type2_neon| PROC
+
+ stmfd sp!, {r4 - r12, r14} ;stack stores the values of
+ ; the arguments
+ vpush {d8 - d15} ; stack offset by 64
+ mov r4, r1 ;swap r1 <-> r2: r1 = src_stride,
+ mov r1, r2 ; r2 holds dst for now (r1/r2 are
+ mov r2, r4 ; swapped back by use below)
+ vmov.i16 q15, #0x4000 ;+0x4000; vhadd adds this back and
+ ; halves, cancelling the bias below
+ mov r11, #0xc000 ;accumulator bias -0x4000 (as s16);
+ ; bias + vhadd(+0x4000) halves the MAC
+ ; sum without s16 overflow, then
+ ; vqrshrun #6 completes the rounded >>7
+ ldr r12, [sp, #104] ;load filter
+ ldr r6, [sp, #116] ;load y0_q4
+ add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4]
+ mov r6, r3 ;r6 = dst_stride
+ ldr r5, [sp, #124] ;load wd
+ vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff)
+ sub r12, r2, r2, lsl #2 ;r12 = -3 * src_stride
+ vabs.s8 d0, d0 ;vabs_s8(coeff)
+ add r0, r0, r12 ;src -= 3 * src_stride (8-tap filter
+ ; reads 3 rows above the output row)
+ ldr r3, [sp, #128] ;load ht
+ subs r7, r3, #0 ;r7 = ht; sets flags (ht == 0?)
+ vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs,
+ ; 0);
+ cmp r5, #8 ;wd < 8 -> narrow (wd 4) path
+ vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs,
+ ; 1);
+ vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs,
+ ; 2);
+ vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs,
+ ; 3);
+ vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs,
+ ; 4);
+ vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs,
+ ; 5);
+ vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs,
+ ; 6);
+ vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs,
+ ; 7);
+ blt core_loop_wd_4 ;core loop wd 4 jump
+
+ str r0, [sp, #-4]! ;save src/dst for the wd%8 tail
+ str r1, [sp, #-4]!
+ bic r4, r5, #7 ;r4 = wd & ~7 (col counter)
+ rsb r9, r4, r6, lsl #2 ;r9 = 4*dst_stride - (wd & ~7)
+ rsb r8, r4, r2, lsl #2 ;r8 = 4*src_stride - (wd & ~7)
+ mov r3, r5, lsr #3 ;divide by 8
+ mul r7, r3 ;multiply height by width
+ sub r7, #4 ;subtract by one for epilog
+
+prolog
+ and r10, r0, #31 ;NOTE(review): result appears unused
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vdup.16 q4, r11 ;seed accumulator with -0x4000
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ subs r4, r4, #8
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vdup.16 q5, r11
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ addle r0, r0, r8 ;row group done -> advance src
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ pld [r3]
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15 ;(acc + 0x4000) >> 1 == mac_sum >> 1
+ vdup.16 q6, r11
+ pld [r3, r2]
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r3, r3, r2
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ pld [r3, r2, lsl #1]
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+
+ vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d2, d22
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q6, d4, d24
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d5, d25
+ vmlal.u8 q6, d6, d26
+ vmlsl.u8 q6, d7, d27
+ vmlal.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ addle r1, r1, r9 ;row group done -> advance dst
+ vmlal.u8 q7, d4, d23
+ subs r7, r7, #4
+ vmlsl.u8 q7, d3, d22
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+ blt epilog_end ;jumps to epilog_end
+
+ beq epilog ;jumps to epilog
+
+main_loop_8
+ subs r4, r4, #8
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ addle r0, r0, r8 ;row group done -> advance src
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ bicle r4, r5, #7 ;r5 ->wd
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ add r3, r0, r2 ;pu1_src_tmp += src_strd;
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vst1.8 {d14}, [r14], r6
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ add r14, r1, #0
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ add r1, r1, #8
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ addle r1, r1, r9 ;row group done -> advance dst
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vmlal.u8 q6, d3, d23
+ add r10, r3, r2, lsl #3 ;r10 = r3 + 8 * src_stride
+ vmlsl.u8 q6, d2, d22
+ add r10, r10, r2 ;r10 += src_stride (prefetch base)
+ vmlsl.u8 q6, d4, d24
+ vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res);
+ pld [r10] ;11+ 0
+ vmlsl.u8 q6, d7, d27
+ pld [r10, r2] ;11+ 1*strd
+ vmlal.u8 q6, d16, d28
+ pld [r10, r2, lsl #1] ;11+ 2*strd
+ vmlsl.u8 q6, d17, d29
+ add r10, r10, r2 ;12*strd
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ pld [r10, r2, lsl #1] ;11+ 3*strd
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ subs r7, r7, #4
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vhadd.s16 q6, q6, q15
+ vdup.16 q4, r11
+ vmlal.u8 q7, d7, d26
+ vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d16, d27
+ vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d17, d28
+ vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlsl.u8 q7, d18, d29
+ vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp);
+ vqrshrun.s16 d12, q6, #6
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ bgt main_loop_8 ;jumps to main_loop_8
+
+epilog
+ vmlal.u8 q4, d1, d23 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp2, coeffabs_1);
+ vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp1, coeffabs_0);
+ vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp3, coeffabs_2);
+ vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp4, coeffabs_3);
+ vhadd.s16 q7, q7, q15
+ vdup.16 q5, r11
+ vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp1, coeffabs_4);
+ vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp2, coeffabs_5);
+ vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; src_tmp3, coeffabs_6);
+ vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; src_tmp4, coeffabs_7);
+ vst1.8 {d12}, [r14], r6
+ vqrshrun.s16 d14, q7, #6
+ vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q5, d2, d23 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp3, coeffabs_1);
+ vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp2, coeffabs_0);
+ vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp4, coeffabs_2);
+ vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp1, coeffabs_3);
+ vhadd.s16 q4, q4, q15
+ vdup.16 q6, r11
+ vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp2, coeffabs_4);
+ vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp3, coeffabs_5);
+ vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2,
+ ; src_tmp4, coeffabs_6);
+ vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; src_tmp1, coeffabs_7);
+ vst1.8 {d14}, [r14], r6
+ vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q6, d3, d23
+ vmlsl.u8 q6, d2, d22
+ vmlsl.u8 q6, d4, d24
+ vmlal.u8 q6, d5, d25
+ vhadd.s16 q5, q5, q15
+ vdup.16 q7, r11
+ vmlal.u8 q6, d6, d26
+ vmlsl.u8 q6, d7, d27
+ vmlal.u8 q6, d16, d28
+ vmlsl.u8 q6, d17, d29
+ add r14, r1, r6
+ vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res);
+ vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp);
+ vmlal.u8 q7, d4, d23
+ vmlsl.u8 q7, d3, d22
+ vmlsl.u8 q7, d5, d24
+ vmlal.u8 q7, d6, d25
+ vhadd.s16 q6, q6, q15
+ vmlal.u8 q7, d7, d26
+ vmlsl.u8 q7, d16, d27
+ vmlal.u8 q7, d17, d28
+ vmlsl.u8 q7, d18, d29
+ vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res);
+ vqrshrun.s16 d12, q6, #6
+
+epilog_end
+ vst1.8 {d12}, [r14], r6
+ vhadd.s16 q7, q7, q15
+ vqrshrun.s16 d14, q7, #6
+ vst1.8 {d14}, [r14], r6
+
+end_loops
+ tst r5, #7 ;any leftover width (wd % 8)?
+ ldr r1, [sp], #4 ;restore saved dst
+ ldr r0, [sp], #4 ;restore saved src
+ vpopeq {d8 - d15} ;wd multiple of 8: restore regs and
+ ldmfdeq sp!, {r4 - r12, r15} ; return ("eq" makes this the epilogue)
+ mov r5, #4 ;else process trailing 4 columns
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r7, #16
+
+core_loop_wd_4
+ rsb r9, r5, r6, lsl #2 ;r9 = 4*dst_stride - wd
+ rsb r8, r5, r2, lsl #2 ;r8 = 4*src_stride - wd
+ vmov.i8 d4, #0
+
+outer_loop_wd_4
+ subs r12, r5, #0
+ ble end_inner_loop_wd_4 ;outer loop jump
+
+inner_loop_wd_4
+ add r3, r0, r2
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ subs r12, r12, #4
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 0);
+ vdup.16 q0, r11 ;seed accumulator with -0x4000
+ vmlal.u8 q0, d5, d23 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_1);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ add r0, r0, #4
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_0);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_2);
+ vdup.16 q4, r11
+ vmlal.u8 q4, d7, d23
+ vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4,
+ ; 1);
+ vmull.u8 q1, d7, d25 ;mul_res2 =
+ ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3);
+ vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp1, 1);
+ vmlsl.u8 q4, d6, d22
+ vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp1), coeffabs_4);
+ vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1,
+ ; 1);
+ vmlsl.u8 q4, d4, d24
+ vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp2, 1);
+ vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp2), coeffabs_5);
+ vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2,
+ ; 1);
+ vmlal.u8 q4, d5, d25
+ vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp3, 1);
+ vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1,
+ ; vreinterpret_u8_u32(src_tmp3), coeffabs_6);
+ vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3,
+ ; 1);
+ vmlal.u8 q4, d6, d26
+ vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t
+ ; *)pu1_src_tmp, src_tmp4, 1);
+ vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2,
+ ; vreinterpret_u8_u32(src_tmp4), coeffabs_7);
+ vdup.u32 d4, d7[1]
+ vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1,
+ ; mul_res2);
+ vmlsl.u8 q4, d7, d27
+ vld1.u32 {d4[1]},[r3], r2
+ vmlal.u8 q4, d4, d28
+ vdup.u32 d5, d4[1]
+ vhadd.s16 q0, q0, q15
+ vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp);
+ vld1.u32 {d5[1]},[r3]
+ add r3, r1, r6
+ vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst,
+ ; vreinterpret_u32_u8(sto_res), 0);
+ vmlsl.u8 q4, d5, d29
+ vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t
+ ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
+ vhadd.s16 q4, q4, q15
+ vqrshrun.s16 d8, q4, #6
+ vst1.32 {d8[0]},[r3], r6
+ add r1, r1, #4
+ vst1.32 {d8[1]},[r3]
+ bgt inner_loop_wd_4
+
+end_inner_loop_wd_4
+ subs r7, r7, #4
+ add r1, r1, r9
+ add r0, r0, r8
+ bgt outer_loop_wd_4
+
+ vpop {d8 - d15}
+ ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp
+
+ ENDP
+
+ END
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -24,7 +24,8 @@
uint8_t temp[64 * 72];
// Account for the vertical phase needing 3 lines prior and 4 lines post
- const int intermediate_height = h + 7;
+ // (+ 1 to make it divisible by 4).
+ const int intermediate_height = h + 8;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
@@ -48,7 +49,7 @@
int x_step_q4, int y0_q4, int y_step_q4, int w,
int h) {
uint8_t temp[64 * 72];
- const int intermediate_height = h + 7;
+ const int intermediate_height = h + 8;
assert(y_step_q4 == 16);
assert(x_step_q4 == 16);
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -110,11 +110,20 @@
DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c
+
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM)
-DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM)
DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM)
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c
+DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h
DSP_SRCS-yes += arm/vpx_convolve_neon.c
else
ifeq ($(HAVE_NEON),yes)