ref: 458273ed9e407253c434bc131916305902c19a1e
parent: ea7e13e77efa4fc38597cc981cc755685143fb26
author: Martin Storsjö <martin@martin.st>
date: Tue Sep 1 08:31:30 EDT 2020
arm32: mc: Load 8tap filter coefficients with a :64 alignment hint where possible
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -1431,7 +1431,7 @@
pop {r4-r11,pc}
80: // 8xN h
- vld1.8 {d0}, [\mx]
+ vld1.8 {d0}, [\mx, :64]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1482,7 +1482,7 @@
// one temporary for vext in the loop. That's slower on A7 and A53,
// (but surprisingly, marginally faster on A8 and A73).
vpush {q4-q6}
- vld1.8 {d0}, [\mx]
+ vld1.8 {d0}, [\mx, :64]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1629,7 +1629,7 @@
28: // 2x8, 2x16 v
vpush {q4-q7}
- vld1.8 {d0}, [\my]
+ vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
@@ -1709,7 +1709,7 @@
480: // 4x8, 4x16 v
vpush {q4}
- vld1.8 {d0}, [\my]
+ vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
@@ -1782,7 +1782,7 @@
640:
1280:
vpush {q4}
- vld1.8 {d0}, [\my]
+ vld1.8 {d0}, [\my, :64]
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
vmovl.s8 q0, d0
@@ -1968,7 +1968,7 @@
b 2b
280: // 2x8, 2x16, 2x32 hv
- vld1.8 {d2}, [\my]
+ vld1.8 {d2}, [\my, :64]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
@@ -2108,7 +2108,7 @@
b 4b
480: // 4x8, 4x16, 4x32 hv
- vld1.8 {d2}, [\my]
+ vld1.8 {d2}, [\my, :64]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
@@ -2211,7 +2211,7 @@
bgt 880f
vpush {q4-q7}
add \my, \my, #2
- vld1.8 {d0}, [\mx]
+ vld1.8 {d0}, [\mx, :64]
vld1.32 {d2[]}, [\my]
sub \src, \src, #3
sub \src, \src, \s_strd
@@ -2301,8 +2301,8 @@
640:
1280:
vpush {q4-q7}
- vld1.8 {d0}, [\mx]
- vld1.8 {d2}, [\my]
+ vld1.8 {d0}, [\mx, :64]
+ vld1.8 {d2}, [\my, :64]
sub \src, \src, #3
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1