ref: d4df861993010586fdf61794f12ae923891872ac
parent: b704a993f61b1b07b1f3ac478935992239383084
author: B Krishnan Iyer <krishnaniyer97@gmail.com>
date: Tue Jul 23 12:07:11 EDT 2019
arm: mc: neon: Reduce usage of general purpose registers in blend/blend_v functions
                          A73                 A53
                          Current   Earlier   Current   Earlier
blend_h_w2_8bpc_neon:     74.1      74.1      137.5     137.5
blend_h_w4_8bpc_neon:     65.8      65.8      147.1     147.1
blend_h_w8_8bpc_neon:     68.9      68.7      131.7     131.7
blend_h_w16_8bpc_neon:    86        85.6      190.3     190.4
blend_h_w32_8bpc_neon:    149.2     149.8     358       358.3
blend_h_w64_8bpc_neon:    263.1     264.1     629.8     630.3
blend_h_w128_8bpc_neon:   571       575.4     1404.5    1404.2
blend_v_w2_8bpc_neon:     118.7     120.1     195.3     196.4
blend_v_w4_8bpc_neon:     245.8     247.2     357.3     358.4
blend_v_w8_8bpc_neon:     202       204.2     357.2     358.4
blend_v_w16_8bpc_neon:    234.8     238.5     591.3     591.8
blend_v_w32_8bpc_neon:    344.4     347.2     994.7     997.2
blend_w4_8bpc_neon:       37.5      38.3      96.7      98.7
blend_w8_8bpc_neon:       53        54.8      123.3     125.3
blend_w16_8bpc_neon:      151       150.8     332.4     334.5
blend_w32_8bpc_neon:      370.9     361.6     908.4     910.7
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -451,15 +451,15 @@
function blend_8bpc_neon, export=1
- push {r4-r8,lr}
- ldr r4, [sp, #24]
- ldr r5, [sp, #28]
- clz r6, r3
- adr r7, L(blend_tbl)
- sub r6, r6, #26
- ldr r6, [r7, r6, lsl #2]
- add r7, r7, r6
- bx r7
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
+ ldr r5, [sp, #16]
+ clz lr, r3
+ adr r3, L(blend_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
.align 2
L(blend_tbl):
.word 320f - L(blend_tbl) + CONFIG_THUMB
@@ -486,7 +486,7 @@
vst1.32 {d20[0]}, [r0], r1
vst1.32 {d20[1]}, [r12], r1
bgt 4b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
80:
vmov.i8 d16, #64
add r12, r0, r1
@@ -510,7 +510,7 @@
vst1.u8 {d22}, [r0], r1
vst1.u8 {d23}, [r12], r1
bgt 8b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
160:
vmov.i8 q12, #64
add r12, r0, r1
@@ -540,8 +540,7 @@
vst1.u8 {q9}, [r0], r1
vst1.u8 {q10}, [r12], r1
bgt 16b
- pop {r4-r8,pc}
-
+ pop {r4-r5,pc}
320:
vmov.i8 q10, #64
32:
@@ -565,7 +564,7 @@
vrshrn.i16 d27, q14, #6
vst1.u8 {q12, q13}, [r0], r1
bgt 32b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
endfunc
function blend_h_8bpc_neon, export=1
@@ -719,16 +718,16 @@
endfunc
function blend_v_8bpc_neon, export=1
- push {r4-r8,lr}
- ldr r4, [sp, #24]
+ push {r4-r5,lr}
+ ldr r4, [sp, #12]
movrel r5, X(obmc_masks)
add r5, r5, r3
- clz r8, r3
- adr r7, L(blend_v_tbl)
- sub r8, r8, #26
- ldr r8, [r7, r8, lsl #2]
- add r7, r7, r8
- bx r7
+ clz lr, r3
+ adr r3, L(blend_v_tbl)
+ sub lr, lr, #26
+ ldr lr, [r3, lr, lsl #2]
+ add r3, r3, lr
+ bx r3
.align 2
L(blend_v_tbl):
.word 320f - L(blend_v_tbl) + CONFIG_THUMB
@@ -756,7 +755,7 @@
vst1.8 {d6[0]}, [r0], r1
vst1.8 {d6[1]}, [r12], r1
bgt 2b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
40:
vmov.i8 d22, #64
vld1.32 {d4[]}, [r5]
@@ -780,7 +779,7 @@
add r0, r0, r1
add r12, r12, r1
bgt 4b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
80:
vmov.i8 d16, #64
vld1.u8 {d2}, [r5]
@@ -807,7 +806,7 @@
add r0, r0, r1
add r12, r12, r1
bgt 8b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
160:
vmov.i8 q12, #64
vld1.u8 {q2}, [r5]
@@ -840,7 +839,7 @@
add r0, r0, r1
add r12, r12, r1
bgt 16b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
320:
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5]
@@ -861,7 +860,7 @@
vrshrn.i16 d2, q15, #6
vst1.u8 {d0, d1, d2}, [r0], r1
bgt 32b
- pop {r4-r8,pc}
+ pop {r4-r5,pc}
endfunc
--
⑨