ref: 74d5cf57599f423c3d38d091b9a95ae245d89235
parent: 72db660742c4c31a1a39f470c14fc24fefce361a
author: Martin Storsjö <martin@martin.st>
date: Tue Jun 23 09:33:11 EDT 2020
arm32: ipred: Mark a few more loads as aligned

This speeds things up a bit on older cores. Also do a load that
duplicates the input over the whole register, instead of just loading
a single lane, in ipred_v_w4. This can be a bit faster on Cortex A8.

Before:                           Cortex A7      A8      A9     A53     A72     A73
intra_pred_v_w4_8bpc_neon:             54.0    38.4    46.4    47.7    20.4    18.1
intra_pred_h_w4_8bpc_neon:             66.3    43.1    55.0    57.0    27.9    22.2
intra_pred_h_w8_8bpc_neon:             81.0    60.2    76.7    66.5    31.1    30.1
intra_pred_dc_left_w4_8bpc_neon:       91.0    49.0    72.8    77.7    35.4    38.5
intra_pred_dc_left_w8_8bpc_neon:      103.8    73.5    90.2    84.7    42.8    47.1
intra_pred_dc_left_w16_8bpc_neon:     156.1   101.8   186.1   119.4    77.7    92.6
intra_pred_dc_left_w32_8bpc_neon:     270.5   200.5   381.6   191.7   152.6   170.3
intra_pred_dc_left_w64_8bpc_neon:     560.7   439.1   877.0   375.4   333.5   343.6
After:
intra_pred_v_w4_8bpc_neon:             53.9    38.0    46.4    47.7    19.8    19.2
intra_pred_h_w4_8bpc_neon:             66.5    39.2    52.6    57.0    27.7    22.2
intra_pred_h_w8_8bpc_neon:             80.5    55.8    72.9    66.5    31.4    30.1
intra_pred_dc_left_w4_8bpc_neon:       91.0    48.2    71.8    77.7    34.9    38.6
intra_pred_dc_left_w8_8bpc_neon:      103.8    69.6    89.2    84.7    43.2    47.3
intra_pred_dc_left_w16_8bpc_neon:     182.3    99.9   184.9   118.8    77.7    85.8
intra_pred_dc_left_w32_8bpc_neon:     355.4   198.9   380.1   190.6   152.9   161.0
intra_pred_dc_left_w64_8bpc_neon:     517.5   437.4   876.9   375.7   333.3   347.7
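For reference, the two NEON idioms involved, as a minimal standalone
sketch (register and pointer choices here are illustrative, not taken
from the patch):

    @ Alignment hints: ":32"/":64"/":128" promise that the address in
    @ r2 is 4/8/16 byte aligned. Older cores such as Cortex A8 can
    @ issue such loads faster, but they fault if the promise is broken.
    vld1.8  {d0},     [r2, :64]    @ 8 bytes from an 8-byte aligned address
    vld1.8  {d0, d1}, [r2, :128]   @ 16 bytes from a 16-byte aligned address

    @ Lane load vs duplicating load: "d0[0]" writes only lane 0 of d0,
    @ while "d0[]" replicates the loaded 32 bits across the whole
    @ register. The stores in ipred_v_w4 only read lane 0 either way,
    @ so both forms are correct there; the duplicating form can be a
    @ bit faster on Cortex A8.
    vld1.32 {d0[0]}, [r2]          @ single lane
    vld1.32 {d0[]},  [r2]          @ duplicated over the whole register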
--- a/src/arm/32/ipred.S
+++ b/src/arm/32/ipred.S
@@ -132,7 +132,7 @@
.word 80f - L(ipred_v_tbl) + CONFIG_THUMB
.word 40f - L(ipred_v_tbl) + CONFIG_THUMB
40:
- vld1.32 {d0[0]}, [r2]
+ vld1.32 {d0[]}, [r2]
4:
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[0]}, [r12, :32], r1
@@ -215,7 +215,7 @@
.word 8f - L(ipred_h_tbl) + CONFIG_THUMB
.word 4f - L(ipred_h_tbl) + CONFIG_THUMB
4:
- vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
vst1.32 {d3[0]}, [r0, :32], r1
vst1.32 {d2[0]}, [r12, :32], r1
subs r4, r4, #4
@@ -224,7 +224,7 @@
bgt 4b
pop {r4-r5, pc}
8:
- vld4.8 {d0[], d1[], d2[], d3[]}, [r2], lr
+ vld4.8 {d0[], d1[], d2[], d3[]}, [r2, :32], lr
vst1.8 {d3}, [r0, :64], r1
vst1.8 {d2}, [r12, :64], r1
subs r4, r4, #4
@@ -453,7 +453,7 @@
.word L(ipred_dc_left_w4) - L(ipred_dc_left_tbl) + CONFIG_THUMB
L(ipred_dc_left_h4):
- vld1.32 {d0[]}, [r2]
+ vld1.32 {d0[]}, [r2, :32]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vrshrn.u16 d0, q0, #2
@@ -468,7 +468,7 @@
bgt L(ipred_dc_left_w4)
pop {r4-r5, pc}
L(ipred_dc_left_h8):
- vld1.8 {d0}, [r2]
+ vld1.8 {d0}, [r2, :64]
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
@@ -484,7 +484,7 @@
bgt L(ipred_dc_left_w8)
pop {r4-r5, pc}
L(ipred_dc_left_h16):
- vld1.8 {d0, d1}, [r2]
+ vld1.8 {d0, d1}, [r2, :128]
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
@@ -501,7 +501,7 @@
bgt L(ipred_dc_left_w16)
pop {r4-r5, pc}
L(ipred_dc_left_h32):
- vld1.8 {d0, d1, d2, d3}, [r2]
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
@@ -522,8 +522,8 @@
bgt 1b
pop {r4-r5, pc}
L(ipred_dc_left_h64):
- vld1.8 {d0, d1, d2, d3}, [r2]!
- vld1.8 {d4, d5, d6, d7}, [r2]
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
@@ -599,13 +599,13 @@
.word L(ipred_dc_w4) - L(ipred_dc_tbl) + CONFIG_THUMB
L(ipred_dc_h4):
- vld1.32 {d0[0]}, [r2]!
+ vld1.32 {d0[]}, [r2, :32]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
bx r3
L(ipred_dc_w4):
add r2, r2, #1
- vld1.32 {d1[0]}, [r2]
+ vld1.32 {d1[]}, [r2]
vmov.32 d1[1], r6
vadd.s16 d0, d0, d30
vpaddl.u8 d1, d1
@@ -634,7 +634,7 @@
pop {r4-r6, pc}
L(ipred_dc_h8):
- vld1.8 {d0}, [r2]!
+ vld1.8 {d0}, [r2, :64]!
vpaddl.u8 d0, d0
vpadd.u16 d0, d0
vpadd.u16 d0, d0
@@ -669,7 +669,7 @@
pop {r4-r6, pc}
L(ipred_dc_h16):
- vld1.8 {d0, d1}, [r2]!
+ vld1.8 {d0, d1}, [r2, :128]!
vaddl.u8 q0, d0, d1
vadd.u16 d0, d0, d1
vpadd.u16 d0, d0
@@ -706,7 +706,7 @@
pop {r4-r6, pc}
L(ipred_dc_h32):
- vld1.8 {d0, d1, d2, d3}, [r2]!
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
vaddl.u8 q0, d0, d1
vaddl.u8 q1, d2, d3
vadd.u16 q0, q0, q1
@@ -751,9 +751,9 @@
pop {r4-r6, pc}
L(ipred_dc_h64):
- vld1.8 {d0, d1, d2, d3}, [r2]!
+ vld1.8 {d0, d1, d2, d3}, [r2, :128]!
vaddl.u8 q0, d0, d1
- vld1.8 {d4, d5, d6, d7}, [r2]!
+ vld1.8 {d4, d5, d6, d7}, [r2, :128]!
vaddl.u8 q1, d2, d3
vaddl.u8 q2, d4, d5
vaddl.u8 q3, d6, d7