shithub: openh264

Download patch

ref: a56b927135a61c4363a21d6e779f611de6304932
parent: d26ebeec2c8e98339f85bfb5e9823c90da1ffd53
author: Guangwei Wang <guangwwa@cisco.com>
date: Fri Apr 7 07:07:36 EDT 2017

[Processing][ARM asm] ARM assembly: DyadicBilinearDownsample optimizations, as Sindre Aamas did for the x86 platform

--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -57,18 +57,10 @@
 
     vld1.8 {q0,q1}, [r2]!
     vld1.8 {q2,q3}, [r7]!
-    vpaddl.u8   q0, q0
-    vpaddl.u8   q1, q1
-    vpaddl.u8   q2, q2
-    vpaddl.u8   q3, q3
-    vrshr.u16   q0, #1
-    vrshr.u16   q1, #1
-    vrshr.u16   q2, #1
-    vrshr.u16   q3, #1
-    vrhadd.u16 q0, q2
-    vrhadd.u16 q1, q3
-    vmovn.u16   d0, q0
-    vmovn.u16   d1, q1
+    vrhadd.u8 q0, q0, q2
+    vrhadd.u8 q1, q1, q3
+    vuzp.8 q0, q1
+    vrhadd.u8 q0, q0, q1
     vst1.32 {q0},   [r0]!
     add lr, #32
 
@@ -196,19 +188,10 @@
 
     vld1.8 {q0,q1}, [r2]!
     vld1.8 {q2,q3}, [r7]!
-    vpaddl.u8   q0, q0
-    vpaddl.u8   q1, q1
-    vpaddl.u8   q2, q2
-    vpaddl.u8   q3, q3
-    vrshr.u16   q0, #1
-    vrshr.u16   q1, #1
-    vrshr.u16   q2, #1
-    vrshr.u16   q3, #1
-    vrhadd.u16 q0, q2
-    vrhadd.u16 q1, q3
-
-    vmovn.u16   d0, q0
-    vmovn.u16   d1, q1
+    vrhadd.u8 q0, q0, q2
+    vrhadd.u8 q1, q1, q3
+    vuzp.8 q0, q1
+    vrhadd.u8 q0, q0, q1
     vst1.32 {q0},   [r0]!
     subs r6, #1
     bne comp_ds_bilinear_w_x32_loop1
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -51,19 +51,12 @@
 
     ld1     {v0.16b, v1.16b}, [x2], #32
     ld1     {v2.16b, v3.16b}, [x7], #32
-    uaddlp  v0.8h, v0.16b
-    uaddlp  v1.8h, v1.16b
-    uaddlp  v2.8h, v2.16b
-    uaddlp  v3.8h, v3.16b
-    urshr   v0.8h, v0.8h, #1
-    urshr   v1.8h, v1.8h, #1
-    urshr   v2.8h, v2.8h, #1
-    urshr   v3.8h, v3.8h, #1
-    urhadd  v0.8h, v0.8h, v2.8h
-    urhadd  v1.8h, v1.8h, v3.8h
-    xtn     v0.8b, v0.8h
-    xtn     v1.8b, v1.8h
-    st1     {v0.8b, v1.8b}, [x0], #16
+    urhadd  v0.16b, v0.16b, v2.16b
+    urhadd  v1.16b, v1.16b, v3.16b
+    uzp1    v2.16b, v0.16b, v1.16b
+    uzp2    v3.16b, v0.16b, v1.16b
+    urhadd  v2.16b, v2.16b, v3.16b
+    st1     {v2.16b}, [x0], #16
     add     w9, w9, #32
 
     cmp     w9, w4
@@ -99,19 +92,12 @@
 
     ld1     {v0.16b, v1.16b}, [x2], #32
     ld1     {v2.16b, v3.16b}, [x7], #32
-    uaddlp  v0.8h, v0.16b
-    uaddlp  v1.8h, v1.16b
-    uaddlp  v2.8h, v2.16b
-    uaddlp  v3.8h, v3.16b
-    urshr   v0.8h, v0.8h, #1
-    urshr   v1.8h, v1.8h, #1
-    urshr   v2.8h, v2.8h, #1
-    urshr   v3.8h, v3.8h, #1
-    urhadd  v0.8h, v0.8h, v2.8h
-    urhadd  v1.8h, v1.8h, v3.8h
-    xtn     v0.8b, v0.8h
-    xtn     v1.8b, v1.8h
-    st1     {v0.8b, v1.8b}, [x0], #16
+    urhadd  v0.16b, v0.16b, v2.16b
+    urhadd  v1.16b, v1.16b, v3.16b
+    uzp1    v2.16b, v0.16b, v1.16b
+    uzp2    v3.16b, v0.16b, v1.16b
+    urhadd  v2.16b, v2.16b, v3.16b
+    st1     {v2.16b}, [x0], #16
 
     sub     w6, w6, #1
     cbnz    w6, comp_ds_bilinear_w_x32_loop1