ref: d00a0227d6ce86d262ee142e68d6794c9f6917df
parent: 74d5cf57599f423c3d38d091b9a95ae245d89235
author: Martin Storsjö <martin@martin.st>
date: Fri Jun 26 20:13:13 EDT 2020
arm32: ipred: Remove unnecessary operations in ipred_dc_w4

These came from matching some parts too closely to the arm64 version
(where the summation can be done efficiently with uaddlv by zeroing
the upper half of the register).

Before:                     Cortex A7     A8     A9    A53    A72    A73
intra_pred_dc_w4_8bpc_neon:     124.5   65.1   90.2  100.4   48.1   50.4
After:
intra_pred_dc_w4_8bpc_neon:     120.3   60.7   83.6   94.0   44.1   47.9
--- a/src/arm/32/ipred.S
+++ b/src/arm/32/ipred.S
@@ -568,7 +568,6 @@
clz r3, r3
clz r12, r4
vdup.16 q15, lr // width + height
- mov r6, #0
adr r5, L(ipred_dc_tbl)
rbit lr, lr // rbit(width + height)
sub r3, r3, #20 // 25 leading bits, minus table offset 5
@@ -606,10 +605,8 @@
L(ipred_dc_w4):
add r2, r2, #1
vld1.32 {d1[]}, [r2]
- vmov.32 d1[1], r6
vadd.s16 d0, d0, d30
vpaddl.u8 d1, d1
- vpadd.u16 d1, d1
vpadd.u16 d1, d1
cmp r4, #4
vadd.s16 d0, d0, d1