shithub: libvpx

--- a/vpx_dsp/arm/idct4x4_add_neon.asm

+++ b/vpx_dsp/arm/idct4x4_add_neon.asm

@@ -15,6 +15,8 @@

     AREA ||.text||, CODE, READONLY, ALIGN=2

+    INCLUDE vpx_dsp/arm/idct_neon.asm.s

     AREA     Block, CODE, READONLY ; name this block of code

 ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)

@@ -33,7 +35,7 @@

     ; So, two passes of a transpose followed by a column transform.

     ; load the inputs into q8-q9, d16-d19

-    vld1.s16        {q8,q9}, [r0]!

+    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

     ; generate scalar constants

     ; cospi_8_64 = 15137

--- a/vpx_dsp/arm/idct4x4_add_neon.c

+++ b/vpx_dsp/arm/idct4x4_add_neon.c

@@ -11,6 +11,7 @@

 #include <arm_neon.h>

 #include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/arm/idct_neon.h"

 #include "vpx_dsp/txfm_common.h"

 void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,

@@ -28,8 +29,8 @@

   d26u32 = d27u32 = vdup_n_u32(0);

-  q8s16 = vld1q_s16(input);

-  q9s16 = vld1q_s16(input + 8);

+  q8s16 = load_tran_low_to_s16(input);

+  q9s16 = load_tran_low_to_s16(input + 8);

   d16s16 = vget_low_s16(q8s16);

   d17s16 = vget_high_s16(q8s16);

--- a/vpx_dsp/arm/idct8x8_add_neon.asm

+++ b/vpx_dsp/arm/idct8x8_add_neon.asm

@@ -16,6 +16,8 @@

     AREA ||.text||, CODE, READONLY, ALIGN=2

+    INCLUDE vpx_dsp/arm/idct_neon.asm.s

     ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are

     ; loaded in q8-q15. The output will be stored back into q8-q15 registers.

     ; This macro will touch q0-q7 registers and use them as buffer during

@@ -207,10 +209,10 @@

 |vpx_idct8x8_64_add_neon| PROC

     push            {r4-r9}

     vpush           {d8-d15}

-    vld1.s16        {q8,q9}, [r0]!

-    vld1.s16        {q10,q11}, [r0]!

-    vld1.s16        {q12,q13}, [r0]!

-    vld1.s16        {q14,q15}, [r0]!

+    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

+    LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0

+    LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0

+    LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0

     ; transpose the input data

     TRANSPOSE8X8

@@ -312,10 +314,10 @@

 |vpx_idct8x8_12_add_neon| PROC

     push            {r4-r9}

     vpush           {d8-d15}

-    vld1.s16        {q8,q9}, [r0]!

-    vld1.s16        {q10,q11}, [r0]!

-    vld1.s16        {q12,q13}, [r0]!

-    vld1.s16        {q14,q15}, [r0]!

+    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

+    LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0

+    LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0

+    LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0

     ; transpose the input data

     TRANSPOSE8X8

--- a/vpx_dsp/arm/idct8x8_add_neon.c

+++ b/vpx_dsp/arm/idct8x8_add_neon.c

@@ -12,6 +12,7 @@

 #include "./vpx_config.h"

 #include "./vpx_dsp_rtcd.h"

+#include "vpx_dsp/arm/idct_neon.h"

 #include "vpx_dsp/arm/transpose_neon.h"

 #include "vpx_dsp/txfm_common.h"

@@ -173,14 +174,14 @@

   int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

   uint16x8_t q8u16, q9u16, q10u16, q11u16;

-  q8s16 = vld1q_s16(input);

-  q9s16 = vld1q_s16(input + 8);

-  q10s16 = vld1q_s16(input + 16);

-  q11s16 = vld1q_s16(input + 24);

-  q12s16 = vld1q_s16(input + 32);

-  q13s16 = vld1q_s16(input + 40);

-  q14s16 = vld1q_s16(input + 48);

-  q15s16 = vld1q_s16(input + 56);

+  q8s16 = load_tran_low_to_s16(input);

+  q9s16 = load_tran_low_to_s16(input + 8);

+  q10s16 = load_tran_low_to_s16(input + 16);

+  q11s16 = load_tran_low_to_s16(input + 24);

+  q12s16 = load_tran_low_to_s16(input + 32);

+  q13s16 = load_tran_low_to_s16(input + 40);

+  q14s16 = load_tran_low_to_s16(input + 48);

+  q15s16 = load_tran_low_to_s16(input + 56);

   transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,

                     &q15s16);

@@ -279,14 +280,14 @@

   uint16x8_t q8u16, q9u16, q10u16, q11u16;

   int32x4_t q9s32, q10s32, q11s32, q12s32;

-  q8s16 = vld1q_s16(input);

-  q9s16 = vld1q_s16(input + 8);

-  q10s16 = vld1q_s16(input + 16);

-  q11s16 = vld1q_s16(input + 24);

-  q12s16 = vld1q_s16(input + 32);

-  q13s16 = vld1q_s16(input + 40);

-  q14s16 = vld1q_s16(input + 48);

-  q15s16 = vld1q_s16(input + 56);

+  q8s16 = load_tran_low_to_s16(input);

+  q9s16 = load_tran_low_to_s16(input + 8);

+  q10s16 = load_tran_low_to_s16(input + 16);

+  q11s16 = load_tran_low_to_s16(input + 24);

+  q12s16 = load_tran_low_to_s16(input + 32);

+  q13s16 = load_tran_low_to_s16(input + 40);

+  q14s16 = load_tran_low_to_s16(input + 48);

+  q15s16 = load_tran_low_to_s16(input + 56);

   transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,

                     &q15s16);

--- /dev/null

+++ b/vpx_dsp/arm/idct_neon.asm

@@ -1,0 +1,29 @@

+;

+;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+    INCLUDE ./vpx_config.asm

+    ; Helper macro used to load 16 tran_low_t values from [$src] into int16,
 
+    ; narrowing if necessary. $src is post-incremented past the loaded data.
 
+    ; $dst0..3 are d registers with the pairs assumed to be contiguous in
 
+    ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth.
 
+    MACRO
 
+    LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src
 
+    IF CONFIG_VP9_HIGHBITDEPTH
 
+    vld1.s32        {q0,q1}, [$src]!        ; 8 x 32-bit values, pointer advanced
 
+    vld1.s32        {q2,q3}, [$src]!        ; next 8 x 32-bit values
 
+    vmovn.i32       $dst0, q0               ; truncating narrow, no saturation
 
+    vmovn.i32       $dst1, q1
 
+    vmovn.i32       $dst2, q2
 
+    vmovn.i32       $dst3, q3
 
+    ELSE
 
+    vld1.s16        {$dst0-$dst1,$dst2-$dst3}, [$src]! ; 16 x 16-bit, direct load
 
+    ENDIF
 
+    MEND

--- /dev/null

+++ b/vpx_dsp/arm/idct_neon.h

@@ -1,0 +1,34 @@

+/*

+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VPX_DSP_ARM_IDCT_NEON_H_

+#define VPX_DSP_ARM_IDCT_NEON_H_

+#include <arm_neon.h>

+#include "./vpx_config.h"

+#include "vpx_dsp/vpx_dsp_common.h"

+//------------------------------------------------------------------------------

+// Loads 8 tran_low_t values from buf as int16, narrowing if necessary.
 
+static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) {
 
+#if CONFIG_VP9_HIGHBITDEPTH
 
+  const int32x4_t v0 = vld1q_s32(buf);      // buf[0..3] as 32-bit lanes
 
+  const int32x4_t v1 = vld1q_s32(buf + 4);  // buf[4..7] as 32-bit lanes
 
+  const int16x4_t s0 = vmovn_s32(v0);  // truncating narrow, no saturation
 
+  const int16x4_t s1 = vmovn_s32(v1);
 
+  return vcombine_s16(s0, s1);  // s0 is the low half, s1 the high half
 
+#else
 
+  return vld1q_s16(buf);  // buf is read directly as 8 x int16
 
+#endif
 
+}

+#endif  // VPX_DSP_ARM_IDCT_NEON_H_

--- a/vpx_dsp/vpx_dsp.mk

+++ b/vpx_dsp/vpx_dsp.mk

@@ -204,13 +204,9 @@

 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)

 ifeq ($(HAVE_NEON_ASM),yes)

-DSP_SRCS-yes  += arm/idct4x4_add_neon$(ASM)

-DSP_SRCS-yes  += arm/idct8x8_add_neon$(ASM)

 DSP_SRCS-yes  += arm/idct16x16_add_neon$(ASM)

 else

 ifeq ($(HAVE_NEON),yes)

-DSP_SRCS-yes  += arm/idct4x4_add_neon.c

-DSP_SRCS-yes  += arm/idct8x8_add_neon.c

 DSP_SRCS-yes  += arm/idct16x16_add_neon.c

 endif  # HAVE_NEON

 endif  # HAVE_NEON_ASM

@@ -233,14 +229,20 @@

 endif  # !CONFIG_VP9_HIGHBITDEPTH

 ifeq ($(HAVE_NEON_ASM),yes)

+DSP_SRCS-yes += arm/idct_neon$(ASM)

 DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)

+DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)

 DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)

+DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)

 DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)

 else

 DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c

+DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c

 DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c

+DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c

 DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c

 endif  # HAVE_NEON_ASM

+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h

 DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c

 endif  # CONFIG_VP9

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -644,16 +644,16 @@

     add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";

   } else {

     add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct4x4_16_add sse2/;

+    specialize qw/vpx_idct4x4_16_add neon sse2/;

     add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

     specialize qw/vpx_idct4x4_1_add neon sse2/;

     add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64";

+    specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";

     add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

-    specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64";

+    specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";

     add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";

     specialize qw/vpx_idct8x8_1_add neon sse2/;

--

⑨