ref: ea4bb892aa149695b43e0b4a90bdb638b4b8842d
parent: 02e824d1253cdf8800c51fb39a359a062cbb0f45
author: Martin Storsjö <martin@martin.st>
date: Tue Apr 11 07:01:07 EDT 2017
Fix arm downsampler to add horizontally first. When rounding in between each step, the order of the additions matters. This fixes the test suite when running on arm and arm64, after a56b927135a61c4363a21d6e779f611de6304932.
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -57,10 +57,11 @@
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
- vrhadd.u8 q0, q0, q2
- vrhadd.u8 q1, q1, q3
vuzp.8 q0, q1
+ vuzp.8 q2, q3
vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q2, q2, q3
+ vrhadd.u8 q0, q0, q2
vst1.32 {q0}, [r0]!
add lr, #32
@@ -188,10 +189,11 @@
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
- vrhadd.u8 q0, q0, q2
- vrhadd.u8 q1, q1, q3
vuzp.8 q0, q1
+ vuzp.8 q2, q3
vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q2, q2, q3
+ vrhadd.u8 q0, q0, q2
vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -51,11 +51,13 @@
ld1 {v0.16b, v1.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x7], #32
- urhadd v0.16b, v0.16b, v2.16b
- urhadd v1.16b, v1.16b, v3.16b
- uzp1 v2.16b, v0.16b, v1.16b
- uzp2 v3.16b, v0.16b, v1.16b
- urhadd v2.16b, v2.16b, v3.16b
+ uzp1 v4.16b, v0.16b, v1.16b
+ uzp2 v5.16b, v0.16b, v1.16b
+ uzp1 v6.16b, v2.16b, v3.16b
+ uzp2 v7.16b, v2.16b, v3.16b
+ urhadd v0.16b, v4.16b, v5.16b
+ urhadd v1.16b, v6.16b, v7.16b
+ urhadd v2.16b, v0.16b, v1.16b
st1 {v2.16b}, [x0], #16
add w9, w9, #32
@@ -92,11 +94,13 @@
ld1 {v0.16b, v1.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x7], #32
- urhadd v0.16b, v0.16b, v2.16b
- urhadd v1.16b, v1.16b, v3.16b
- uzp1 v2.16b, v0.16b, v1.16b
- uzp2 v3.16b, v0.16b, v1.16b
- urhadd v2.16b, v2.16b, v3.16b
+ uzp1 v4.16b, v0.16b, v1.16b
+ uzp2 v5.16b, v0.16b, v1.16b
+ uzp1 v6.16b, v2.16b, v3.16b
+ uzp2 v7.16b, v2.16b, v3.16b
+ urhadd v0.16b, v4.16b, v5.16b
+ urhadd v1.16b, v6.16b, v7.16b
+ urhadd v2.16b, v0.16b, v1.16b
st1 {v2.16b}, [x0], #16
sub w6, w6, #1