ref: b4291523d926658c68c29f7f5d2b270e19ed39c2
parent: 8fd0bc90ba4ce34e62eea90b9df7c105cccf2886
author: Martin Storsjö <martin@martin.st>
date: Fri Jun 26 20:37:52 EDT 2020
arm32: ipred: Optimize ipred_dc_w32. Do the horizontal summing in the same way as for other cases of 32 pixel summing. This doesn't seem to affect the runtime significantly (checkasm benchmarks vary by a couple of cycles), but it's 5 instructions shorter at least.
--- a/src/arm/32/ipred.S
+++ b/src/arm/32/ipred.S
@@ -718,16 +718,13 @@
add r2, r2, #1
vld1.8 {d2, d3, d4, d5}, [r2]
vadd.s16 d0, d0, d30
- vaddl.u8 q2, d4, d5
- vadd.u16 d4, d4, d5
vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vadd.u16 q1, q1, q2
vadd.u16 d2, d2, d3
- vpadd.u16 d4, d4
vpadd.u16 d2, d2
- vpadd.u16 d4, d4
vpadd.u16 d2, d2
cmp r4, #32
- vadd.s16 d0, d0, d4
vadd.s16 d0, d0, d2
vshl.u16 d4, d0, d28
beq 1f