ref: 2e36a3be8519c7a9c883bba034ebbb231c489205
parent: a26882d221295129bb6471c489f311478b2200c3
author: Martin Storsjö <martin@martin.st>
date: Thu Jun 25 07:48:36 EDT 2020
arm64: ipred: Optimize the w16/w32 loop of pred_filter a bit Before: Cortex A53 A72 A73 intra_pred_filter_w16_8bpc_neon: 540.2 573.8 580.2 intra_pred_filter_w32_8bpc_neon: 1223.1 1364.1 1292.9 After: intra_pred_filter_w16_8bpc_neon: 531.4 559.8 565.4 intra_pred_filter_w32_8bpc_neon: 1243.0 1308.6 1270.9 This does give a minor slowdown for the w32 case on A53, but helps on w16 and quite notably in all cases on A72 and A73. Doing the same modification on ipred16.S doesn't give quite as clear gains (the gains on A72 and A73 are smaller, and the regression on A53 on w32 is a bit bigger), so not doing the same adjustment there.
--- a/src/arm/64/ipred.S
+++ b/src/arm/64/ipred.S
@@ -1460,12 +1460,14 @@
subs w3, w3, #16
sqrshrun v6.8b, v6.8h, #4
- ins v0.h[2], v2.h[7]
st4 {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
- ins v0.b[0], v6.b[7]
st4 {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
- ins v0.b[2], v6.b[3]
- b.gt 2b
+ b.le 8f
+ ins v0.h[2], v2.h[7]
+ ins v0.b[0], v6.b[7]
+ ins v0.b[2], v6.b[3]
+ b 2b
+8:
subs w4, w4, #2
b.le 9f
sub x8, x6, w9, uxtw