ref: 52e9b4353f968fd27e2bd912b0e2302509063068
parent: a7f6fe32989ed5c13cc588f17db59b747d4a5fd5
author: Martin Storsjö <martin@martin.st>
date: Wed Mar 4 05:51:50 EST 2020
arm: mc: Optimize blend_v

Use a post-increment with a register on the last increment, avoiding
a separate increment.

Avoid processing the last 8 pixels in the w32 case when we only output
24 pixels.

Before:
ARM32:                Cortex A7      A8      A9     A53     A72     A73
blend_v_w4_8bpc_neon:     450.4   574.7   538.7   374.6   199.3   260.5
blend_v_w8_8bpc_neon:     559.6   351.3   552.5   357.6   214.8   204.3
blend_v_w16_8bpc_neon:    926.3   511.6   787.9   593.0   271.0   246.8
blend_v_w32_8bpc_neon:   1482.5   917.0  1149.5   991.9   354.0   368.9
ARM64:                      A53     A72     A73
blend_v_w4_8bpc_neon:     351.1   200.0   224.1
blend_v_w8_8bpc_neon:     333.0   212.4   203.8
blend_v_w16_8bpc_neon:    495.2   302.0   247.0
blend_v_w32_8bpc_neon:    840.0   557.8   514.0

After:
ARM32:                Cortex A7      A8      A9     A53     A72     A73
blend_v_w4_8bpc_neon:     435.5   575.8   537.6   356.2   198.3   259.5
blend_v_w8_8bpc_neon:     545.2   347.9   553.5   339.1   207.8   204.2
blend_v_w16_8bpc_neon:    913.7   511.0   788.1   573.7   275.4   243.3
blend_v_w32_8bpc_neon:   1445.3   951.2  1079.1   920.4   352.2   361.6
ARM64:                      A53     A72     A73
blend_v_w4_8bpc_neon:     333.0   191.3   225.9
blend_v_w8_8bpc_neon:     314.9   199.3   203.5
blend_v_w16_8bpc_neon:    476.9   301.3   241.1
blend_v_w32_8bpc_neon:    766.9   432.8   416.9
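As a condensed sketch of the addressing-mode change (taking the w4 AArch64
case from the diff below; x0 is the dst pointer and x1 the stride, which is
pre-adjusted before the loop):

    // Old: final lane store with an immediate post-increment, followed
    // by a separate add of the stride (pre-adjusted by 3).
    st1     {v5.b}[2], [x0], #1
    add     x0, x0, x1              // x1 = stride - 3

    // New: pre-adjust the stride by 2 instead and fold the row advance
    // into the last store via a register post-increment.
    st1     {v5.b}[2], [x0], x1     // x1 = stride - 2

For w32, each iteration only writes out 24 of the 32 blended pixels, so the
widening multiply-accumulates, narrowing and mask setup for the unused top
8 lanes are dropped entirely.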
--- a/src/arm/32/mc.S
+++ b/src/arm/32/mc.S
@@ -753,7 +753,7 @@
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d5, d22, d4
- sub r1, r1, #3
+ sub r1, r1, #2
4:
vld1.u8 {d2}, [r2, :64]!
vld1.32 {d0[]}, [r0, :32]
@@ -764,10 +764,8 @@
vrshrn.i16 d20, q3, #6
vst1.16 {d20[0]}, [r0, :16]!
vst1.16 {d20[2]}, [r12, :16]!
- vst1.8 {d20[2]}, [r0]!
- vst1.8 {d20[6]}, [r12]!
- add r0, r0, r1
- add r12, r12, r1
+ vst1.8 {d20[2]}, [r0], r1
+ vst1.8 {d20[6]}, [r12], r1
bgt 4b
pop {r4-r5,pc}
80:
@@ -776,7 +774,7 @@
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 d17, d16, d2
- sub r1, r1, #6
+ sub r1, r1, #4
8:
vld1.u8 {d4, d5}, [r2, :128]!
vld1.u8 {d0}, [r0, :64]
@@ -790,10 +788,8 @@
vrshrn.i16 d23, q10, #6
vst1.32 {d22[0]}, [r0, :32]!
vst1.32 {d23[0]}, [r12, :32]!
- vst1.16 {d22[2]}, [r0, :16]!
- vst1.16 {d23[2]}, [r12, :16]!
- add r0, r0, r1
- add r12, r12, r1
+ vst1.16 {d22[2]}, [r0, :16], r1
+ vst1.16 {d23[2]}, [r12, :16], r1
bgt 8b
pop {r4-r5,pc}
160:
@@ -802,7 +798,7 @@
add r12, r0, r1
lsl r1, r1, #1
vsub.i8 q11, q12, q14
- sub r1, r1, #12
+ sub r1, r1, #8
16:
vld1.u8 {q1, q2}, [r2, :128]!
vld1.u8 {q0}, [r0, :128]
@@ -822,10 +818,8 @@
vrshrn.i16 d21, q8, #6
vst1.u8 {d18}, [r0, :64]!
vst1.u8 {d20}, [r12, :64]!
- vst1.32 {d19[0]}, [r0, :32]!
- vst1.32 {d21[0]}, [r12, :32]!
- add r0, r0, r1
- add r12, r12, r1
+ vst1.32 {d19[0]}, [r0, :32], r1
+ vst1.32 {d21[0]}, [r12, :32], r1
bgt 16b
pop {r4-r5,pc}
320:
@@ -832,10 +826,10 @@
vmov.i8 q10, #64
vld1.u8 {q2, q3}, [r5, :128]
vsub.i8 q11, q10, q2
- vsub.i8 q12, q10, q3
+ vsub.i8 d24, d20, d6
32:
vld1.u8 {q8, q9}, [r2, :128]!
- vld1.u8 {q0, q1}, [r0, :128]
+ vld1.u8 {d0, d1, d2}, [r0, :64]
subs r4, r4, #1
vmull.u8 q15, d16, d4
vmlal.u8 q15, d0, d22
--- a/src/arm/64/mc.S
+++ b/src/arm/64/mc.S
@@ -709,8 +709,8 @@
ret
40:
ld1r {v0.2s}, [x5]
+ sub x1, x1, #2
sub v1.8b, v4.8b, v0.8b
- sub x1, x1, #3
4:
ld1 {v2.8b}, [x2], #8
ld1 {v3.s}[0], [x0]
@@ -721,16 +721,14 @@
rshrn v5.8b, v5.8h, #6
st1 {v5.h}[0], [x0], #2
st1 {v5.h}[2], [x8], #2
- st1 {v5.b}[2], [x0], #1
- st1 {v5.b}[6], [x8], #1
- add x0, x0, x1
- add x8, x8, x1
+ st1 {v5.b}[2], [x0], x1
+ st1 {v5.b}[6], [x8], x1
b.gt 4b
ret
80:
ld1r {v0.2d}, [x5]
+ sub x1, x1, #4
sub v1.16b, v4.16b, v0.16b
- sub x1, x1, #6
8:
ld1 {v2.16b}, [x2], #16
ld1 {v3.d}[0], [x0]
@@ -744,16 +742,14 @@
rshrn2 v7.16b, v6.8h, #6
st1 {v7.s}[0], [x0], #4
st1 {v7.s}[2], [x8], #4
- st1 {v7.h}[2], [x0], #2
- st1 {v7.h}[6], [x8], #2
- add x0, x0, x1
- add x8, x8, x1
+ st1 {v7.h}[2], [x0], x1
+ st1 {v7.h}[6], [x8], x1
b.gt 8b
ret
160:
ld1 {v0.16b}, [x5]
+ sub x1, x1, #8
sub v2.16b, v4.16b, v0.16b
- sub x1, x1, #12
16:
ld1 {v5.16b, v6.16b}, [x2], #32
ld1 {v7.16b}, [x0]
@@ -773,17 +769,15 @@
rshrn2 v22.16b, v21.8h, #6
st1 {v19.8b}, [x0], #8
st1 {v22.8b}, [x8], #8
- st1 {v19.s}[2], [x0], #4
- st1 {v22.s}[2], [x8], #4
- add x0, x0, x1
- add x8, x8, x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v22.s}[2], [x8], x1
b.gt 16b
ret
320:
ld1 {v0.16b, v1.16b}, [x5]
+ sub x1, x1, #16
sub v2.16b, v4.16b, v0.16b
- sub v3.16b, v4.16b, v1.16b
- sub x1, x1, #24
+ sub v3.8b, v4.8b, v1.8b
32:
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
ld1 {v5.16b, v6.16b}, [x0]
@@ -795,8 +789,6 @@
umlal2 v23.8h, v5.16b, v2.16b
umull v28.8h, v17.8b, v1.8b
umlal v28.8h, v6.8b, v3.8b
- umull2 v29.8h, v17.16b, v1.16b
- umlal2 v29.8h, v6.16b, v3.16b
umull v30.8h, v18.8b, v0.8b
umlal v30.8h, v20.8b, v2.8b
umull2 v31.8h, v18.16b, v0.16b
@@ -803,22 +795,16 @@
umlal2 v31.8h, v20.16b, v2.16b
umull v25.8h, v19.8b, v1.8b
umlal v25.8h, v21.8b, v3.8b
- umull2 v26.8h, v19.16b, v1.16b
- umlal2 v26.8h, v21.16b, v3.16b
rshrn v24.8b, v22.8h, #6
rshrn2 v24.16b, v23.8h, #6
rshrn v28.8b, v28.8h, #6
- rshrn2 v28.16b, v29.8h, #6
rshrn v30.8b, v30.8h, #6
rshrn2 v30.16b, v31.8h, #6
rshrn v27.8b, v25.8h, #6
- rshrn2 v27.16b, v26.8h, #6
st1 {v24.16b}, [x0], #16
st1 {v30.16b}, [x8], #16
- st1 {v28.8b}, [x0], #8
- st1 {v27.8b}, [x8], #8
- add x0, x0, x1
- add x8, x8, x1
+ st1 {v28.8b}, [x0], x1
+ st1 {v27.8b}, [x8], x1
b.gt 32b
ret
L(blend_v_tbl):