ref: 2772b45ac0523a5d15e686a7c896e05b1eaef26e
parent: 9114f0afdb23d98ac0704832db43a88b4ca4af01
author: James Yu <james.yu@linaro.org>
date: Thu Jan 30 07:26:44 EST 2014
VP9 common for ARMv8 by using NEON intrinsics 09 Add vp9_idct8x8_1_add_neon.c - vp9_idct8x8_1_add_neon Change-Id: I9d23e01fa96013febbf64db6c76c6c955f14e3ff Signed-off-by: James Yu <james.yu@linaro.org>
--- a/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
+++ /dev/null
@@ -1,88 +1,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp9_idct8x8_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride)
-
-|vp9_idct8x8_1_add_neon| PROC
- ldrsh r0, [r0]
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 5)
- add r0, r0, #16 ; + (1 <<((5) - 1))
- asr r0, r0, #5 ; >> 5
-
- vdup.s16 q0, r0 ; duplicate a1
-
- ; load destination data
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r2
- vld1.64 {d17}, [r1]
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r2
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r2
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r2
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r2
- vst1.64 {d31}, [r12], r2
-
- bx lr
- ENDP ; |vp9_idct8x8_1_add_neon|
-
- END
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
@@ -1,0 +1,62 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+void vp9_idct8x8_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d5u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ return;
+}
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
@@ -1,0 +1,88 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+
+ EXPORT |vp9_idct8x8_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_idct8x8_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0 int16_t input
+; r1 uint8_t *dest
+; r2 int dest_stride)
+
+|vp9_idct8x8_1_add_neon| PROC
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 5)
+ add r0, r0, #16 ; + (1 <<((5) - 1))
+ asr r0, r0, #5 ; >> 5
+
+ vdup.s16 q0, r0 ; duplicate a1
+
+ ; load destination data
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d16}, [r1], r2
+ vld1.64 {d17}, [r1]
+
+ vaddw.u8 q9, q0, d2 ; dest[x] + a1
+ vaddw.u8 q10, q0, d3 ; dest[x] + a1
+ vaddw.u8 q11, q0, d4 ; dest[x] + a1
+ vaddw.u8 q12, q0, d5 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ vaddw.u8 q9, q0, d6 ; dest[x] + a1
+ vaddw.u8 q10, q0, d7 ; dest[x] + a1
+ vaddw.u8 q11, q0, d16 ; dest[x] + a1
+ vaddw.u8 q12, q0, d17 ; dest[x] + a1
+ vqmovun.s16 d2, q9 ; clip_pixel
+ vqmovun.s16 d3, q10 ; clip_pixel
+ vqmovun.s16 d30, q11 ; clip_pixel
+ vqmovun.s16 d31, q12 ; clip_pixel
+ vst1.64 {d2}, [r12], r2
+ vst1.64 {d3}, [r12], r2
+ vst1.64 {d30}, [r12], r2
+ vst1.64 {d31}, [r12], r2
+
+ bx lr
+ ENDP ; |vp9_idct8x8_1_add_neon|
+
+ END
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -429,8 +429,7 @@
$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
- $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -135,7 +135,6 @@
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
@@ -156,6 +155,7 @@
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM)
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
else
@@ -166,6 +166,7 @@
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c
VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
endif # HAVE_NEON