shithub: libvpx

Download patch

ref: 8739a182c8250d2e44cd01a7da0a58773e537b3b
parent: 4a7424adba7a65766a92635dc67b6e7d94646693
parent: 1088b4f87ca5cd8e2b3242060cade0d8bbbcef7a
author: Johann Koenig <johannkoenig@google.com>
date: Mon May 15 14:15:27 EDT 2017

Merge "move neon load/stores to a new file"

--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -22,6 +22,7 @@
 #include "vp9/encoder/vp9_rd.h"
 
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -16,6 +16,7 @@
 
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 
 static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
   const uint32x4_t a = vpaddlq_u16(v_16x8);
--- a/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/vpx_dsp/arm/fwd_txfm_neon.c
@@ -14,6 +14,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 
 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
--- a/vpx_dsp/arm/hadamard_neon.c
+++ b/vpx_dsp/arm/hadamard_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 
 static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
--- a/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/vpx_dsp/arm/idct16x16_add_neon.c
@@ -12,6 +12,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
 static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
--- a/vpx_dsp/arm/idct32x32_135_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_135_add_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
--- a/vpx_dsp/arm/idct32x32_34_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_34_add_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
--- a/vpx_dsp/arm/idct4x4_1_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_1_add_neon.c
@@ -12,6 +12,7 @@
 #include <assert.h>
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/inv_txfm.h"
 
 static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -13,6 +13,7 @@
 
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
 void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
--- a/vpx_dsp/arm/idct8x8_add_neon.c
+++ b/vpx_dsp/arm/idct8x8_add_neon.c
@@ -13,6 +13,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 #include "vpx_dsp/txfm_common.h"
 
--- a/vpx_dsp/arm/idct_neon.h
+++ b/vpx_dsp/arm/idct_neon.h
@@ -41,58 +41,6 @@
 };
 
 //------------------------------------------------------------------------------
-// Helper functions used to load tran_low_t into int16, narrowing if necessary.
-
-static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4x2_t v0 = vld2q_s32(buf);
-  const int32x4x2_t v1 = vld2q_s32(buf + 8);
-  const int16x4_t s0 = vmovn_s32(v0.val[0]);
-  const int16x4_t s1 = vmovn_s32(v0.val[1]);
-  const int16x4_t s2 = vmovn_s32(v1.val[0]);
-  const int16x4_t s3 = vmovn_s32(v1.val[1]);
-  int16x8x2_t res;
-  res.val[0] = vcombine_s16(s0, s2);
-  res.val[1] = vcombine_s16(s1, s3);
-  return res;
-#else
-  return vld2q_s16(buf);
-#endif
-}
-
-static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vld1q_s32(buf);
-  const int32x4_t v1 = vld1q_s32(buf + 4);
-  const int16x4_t s0 = vmovn_s32(v0);
-  const int16x4_t s1 = vmovn_s32(v1);
-  return vcombine_s16(s0, s1);
-#else
-  return vld1q_s16(buf);
-#endif
-}
-
-static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vld1q_s32(buf);
-  return vmovn_s32(v0);
-#else
-  return vld1_s16(buf);
-#endif
-}
-
-static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
-#if CONFIG_VP9_HIGHBITDEPTH
-  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
-  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
-  vst1q_s32(buf, v0);
-  vst1q_s32(buf + 4, v1);
-#else
-  vst1q_s16(buf, a);
-#endif
-}
-
-//------------------------------------------------------------------------------
 // Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
 static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
 #if CONFIG_VP9_HIGHBITDEPTH
--- /dev/null
+++ b/vpx_dsp/arm/mem_neon.h
@@ -1,0 +1,71 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_MEM_NEON_H_
+#define VPX_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4x2_t v0 = vld2q_s32(buf);
+  const int32x4x2_t v1 = vld2q_s32(buf + 8);
+  const int16x4_t s0 = vmovn_s32(v0.val[0]);
+  const int16x4_t s1 = vmovn_s32(v0.val[1]);
+  const int16x4_t s2 = vmovn_s32(v1.val[0]);
+  const int16x4_t s3 = vmovn_s32(v1.val[1]);
+  int16x8x2_t res;
+  res.val[0] = vcombine_s16(s0, s2);
+  res.val[1] = vcombine_s16(s1, s3);
+  return res;
+#else
+  return vld2q_s16(buf);
+#endif
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  const int32x4_t v1 = vld1q_s32(buf + 4);
+  const int16x4_t s0 = vmovn_s32(v0);
+  const int16x4_t s1 = vmovn_s32(v1);
+  return vcombine_s16(s0, s1);
+#else
+  return vld1q_s16(buf);
+#endif
+}
+
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vld1q_s32(buf);
+  return vmovn_s32(v0);
+#else
+  return vld1_s16(buf);
+#endif
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+  const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+  vst1q_s32(buf, v0);
+  vst1q_s32(buf + 4, v1);
+#else
+  vst1q_s16(buf, a);
+#endif
+}
+#endif  // VPX_DSP_ARM_MEM_NEON_H_
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -352,6 +352,7 @@
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
 # Neon utilities
+DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h
 DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
 
 # PPC VSX utilities