ref: a56b927135a61c4363a21d6e779f611de6304932
parent: d26ebeec2c8e98339f85bfb5e9823c90da1ffd53
author: Guangwei Wang <guangwwa@cisco.com>
date: Fri Apr 7 07:07:36 EDT 2017
[Processing][ARM asm] ARM assembly: DyadicBilinearDownsample optimizations, as Sindre Aamas did for the x86 platform
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -57,18 +57,10 @@
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vpaddl.u8 q2, q2
- vpaddl.u8 q3, q3
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrshr.u16 q2, #1
- vrshr.u16 q3, #1
- vrhadd.u16 q0, q2
- vrhadd.u16 q1, q3
- vmovn.u16 d0, q0
- vmovn.u16 d1, q1
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 q1, q1, q3
+ vuzp.8 q0, q1
+ vrhadd.u8 q0, q0, q1
vst1.32 {q0}, [r0]!
add lr, #32
@@ -196,19 +188,10 @@
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
- vpaddl.u8 q0, q0
- vpaddl.u8 q1, q1
- vpaddl.u8 q2, q2
- vpaddl.u8 q3, q3
- vrshr.u16 q0, #1
- vrshr.u16 q1, #1
- vrshr.u16 q2, #1
- vrshr.u16 q3, #1
- vrhadd.u16 q0, q2
- vrhadd.u16 q1, q3
-
- vmovn.u16 d0, q0
- vmovn.u16 d1, q1
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 q1, q1, q3
+ vuzp.8 q0, q1
+ vrhadd.u8 q0, q0, q1
vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1
--- a/codec/processing/src/arm64/down_sample_aarch64_neon.S
+++ b/codec/processing/src/arm64/down_sample_aarch64_neon.S
@@ -51,19 +51,12 @@
ld1 {v0.16b, v1.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x7], #32
- uaddlp v0.8h, v0.16b
- uaddlp v1.8h, v1.16b
- uaddlp v2.8h, v2.16b
- uaddlp v3.8h, v3.16b
- urshr v0.8h, v0.8h, #1
- urshr v1.8h, v1.8h, #1
- urshr v2.8h, v2.8h, #1
- urshr v3.8h, v3.8h, #1
- urhadd v0.8h, v0.8h, v2.8h
- urhadd v1.8h, v1.8h, v3.8h
- xtn v0.8b, v0.8h
- xtn v1.8b, v1.8h
- st1 {v0.8b, v1.8b}, [x0], #16
+ urhadd v0.16b, v0.16b, v2.16b
+ urhadd v1.16b, v1.16b, v3.16b
+ uzp1 v2.16b, v0.16b, v1.16b
+ uzp2 v3.16b, v0.16b, v1.16b
+ urhadd v2.16b, v2.16b, v3.16b
+ st1 {v2.16b}, [x0], #16
add w9, w9, #32
cmp w9, w4
@@ -99,19 +92,12 @@
ld1 {v0.16b, v1.16b}, [x2], #32
ld1 {v2.16b, v3.16b}, [x7], #32
- uaddlp v0.8h, v0.16b
- uaddlp v1.8h, v1.16b
- uaddlp v2.8h, v2.16b
- uaddlp v3.8h, v3.16b
- urshr v0.8h, v0.8h, #1
- urshr v1.8h, v1.8h, #1
- urshr v2.8h, v2.8h, #1
- urshr v3.8h, v3.8h, #1
- urhadd v0.8h, v0.8h, v2.8h
- urhadd v1.8h, v1.8h, v3.8h
- xtn v0.8b, v0.8h
- xtn v1.8b, v1.8h
- st1 {v0.8b, v1.8b}, [x0], #16
+ urhadd v0.16b, v0.16b, v2.16b
+ urhadd v1.16b, v1.16b, v3.16b
+ uzp1 v2.16b, v0.16b, v1.16b
+ uzp2 v3.16b, v0.16b, v1.16b
+ urhadd v2.16b, v2.16b, v3.16b
+ st1 {v2.16b}, [x0], #16
sub w6, w6, #1
cbnz w6, comp_ds_bilinear_w_x32_loop1