shithub: openh264

Download patch

ref: ea4bb892aa149695b43e0b4a90bdb638b4b8842d
parent: 02e824d1253cdf8800c51fb39a359a062cbb0f45
author: Martin Storsjö <martin@martin.st>
date: Tue Apr 11 07:01:07 EDT 2017

Fix arm downsampler to add horizontally first

When rounding in between each step, the order of the additions matters.

This fixes the test suite when running on arm and arm64, after
commit a56b927135a61c4363a21d6e779f611de6304932.

--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -57,10 +57,11 @@
 
     vld1.8 {q0,q1}, [r2]!
     vld1.8 {q2,q3}, [r7]!
-    vrhadd.u8 q0, q0, q2
-    vrhadd.u8 q1, q1, q3
     vuzp.8 q0, q1
+    vuzp.8 q2, q3
     vrhadd.u8 q0, q0, q1
+    vrhadd.u8 q2, q2, q3
+    vrhadd.u8 q0, q0, q2
     vst1.32 {q0},   [r0]!
     add lr, #32
 
@@ -188,10 +189,11 @@
 
     vld1.8 {q0,q1}, [r2]!
     vld1.8 {q2,q3}, [r7]!
-    vrhadd.u8 q0, q0, q2
-    vrhadd.u8 q1, q1, q3
     vuzp.8 q0, q1
+    vuzp.8 q2, q3
     vrhadd.u8 q0, q0, q1
+    vrhadd.u8 q2, q2, q3
+    vrhadd.u8 q0, q0, q2
     vst1.32 {q0},   [r0]!
     subs r6, #1
     bne comp_ds_bilinear_w_x32_loop1
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -51,11 +51,13 @@
 
     ld1     {v0.16b, v1.16b}, [x2], #32
     ld1     {v2.16b, v3.16b}, [x7], #32
-    urhadd  v0.16b, v0.16b, v2.16b
-    urhadd  v1.16b, v1.16b, v3.16b
-    uzp1    v2.16b, v0.16b, v1.16b
-    uzp2    v3.16b, v0.16b, v1.16b
-    urhadd  v2.16b, v2.16b, v3.16b
+    uzp1    v4.16b, v0.16b, v1.16b
+    uzp2    v5.16b, v0.16b, v1.16b
+    uzp1    v6.16b, v2.16b, v3.16b
+    uzp2    v7.16b, v2.16b, v3.16b
+    urhadd  v0.16b, v4.16b, v5.16b
+    urhadd  v1.16b, v6.16b, v7.16b
+    urhadd  v2.16b, v0.16b, v1.16b
     st1     {v2.16b}, [x0], #16
     add     w9, w9, #32
 
@@ -92,11 +94,13 @@
 
     ld1     {v0.16b, v1.16b}, [x2], #32
     ld1     {v2.16b, v3.16b}, [x7], #32
-    urhadd  v0.16b, v0.16b, v2.16b
-    urhadd  v1.16b, v1.16b, v3.16b
-    uzp1    v2.16b, v0.16b, v1.16b
-    uzp2    v3.16b, v0.16b, v1.16b
-    urhadd  v2.16b, v2.16b, v3.16b
+    uzp1    v4.16b, v0.16b, v1.16b
+    uzp2    v5.16b, v0.16b, v1.16b
+    uzp1    v6.16b, v2.16b, v3.16b
+    uzp2    v7.16b, v2.16b, v3.16b
+    urhadd  v0.16b, v4.16b, v5.16b
+    urhadd  v1.16b, v6.16b, v7.16b
+    urhadd  v2.16b, v0.16b, v1.16b
     st1     {v2.16b}, [x0], #16
 
     sub     w6, w6, #1