ref: db26e381a45aadcd82851075b85e2466e7de77d2
parent: 72cc88dfddce319aeac075bd28eff791cd2b14d8
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Tue Nov 28 09:16:57 EST 2023
Trying to use fma instructions when possible Compilers sometimes replace vmlaq*() with fmul+fadd instead of fmla. Trying to use vfmaq*() instead when possible.
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -97,6 +97,14 @@
}
#else
+
+#if defined(__ARM_FEATURE_FMA) && defined(__ARM_ARCH_ISA_A64)
+/* If we can, force the compiler to use an FMA instruction rather than break
+ * vmlaq_f32() into fmul/fadd. */
+#define vmlaq_lane_f32(a,b,c,lane) vfmaq_lane_f32(a,b,c,lane)
+#endif
+
+
/*
* Function: xcorr_kernel_neon_float
* ---------------------------------
--- a/celt/arm/pitch_neon_intr.c
+++ b/celt/arm/pitch_neon_intr.c
@@ -130,6 +130,13 @@
/* ========================================================================== */
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+ vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
+
#ifdef OPUS_CHECK_ASM
/* This part of code simulates floating-point NEON operations. */
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -49,6 +49,12 @@
}
#endif
+#ifdef __ARM_FEATURE_FMA
+/* If we can, force the compiler to use an FMA instruction rather than break
+ vmlaq_f32() into fmul/fadd. */
+#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
+#endif
+
#ifndef LPCNET_TEST
static inline float32x4_t exp4_approx(float32x4_t x) {
int32x4_t i;
--
⑨