ref: 492ef9b362134693b5d2fafe5e2b2800cd02682a
parent: ddd38bd208e9d4e9463e225a7524ca6707adc094
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Tue Mar 19 23:03:44 EDT 2019
Neon implementation of the activation functions
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -30,81 +30,96 @@
#include <arm_neon.h>
#ifndef LPCNET_TEST
-static float celt_exp2(float x)
-{
- int integer;
- float frac;
- union {
- float f;
- opus_uint32 i;
- } res;
- integer = floor(x);
- if (integer < -50)
- return 0;
- frac = x-integer;
- /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
- res.f = 0.99992522f + frac * (0.69583354f
- + frac * (0.22606716f + 0.078024523f*frac));
- res.i = (res.i + (integer<<23)) & 0x7fffffff;
- return res.f;
-}
-#define celt_exp_neon(x) celt_exp2((x)*1.44269504f)
+static OPUS_INLINE float32x4_t exp4_approx(float32x4_t x) { /* approximates exp() independently on each of the four float lanes of x */
+ int32x4_t i;
+ float32x4_t xf;
+
+ x = vmaxq_f32(vminq_f32(x, vdupq_n_f32(88.f)), vdupq_n_f32(-88.f)); /* clamp the input to [-88, 88] */
+
+ /* express exp(x) as exp2(x/log(2)); 127 (the float32 exponent bias) is added now so the integer part can be used directly as the exponent field later */
+ x = vmlaq_f32(vdupq_n_f32(127.f), x, vdupq_n_f32(1.44269504f));
+
+ /* split into integer and fractional parts; x is positive after the clamp and the +127 offset, so the truncating conversion acts as floor */
+ i = vcvtq_s32_f32(x);
+ xf = vcvtq_f32_s32(i);
+ x = vsubq_f32(x, xf);
-static float tansig_approx(float x)
-{
- int i;
- float y, dy;
- float sign=1;
- /* Tests are reversed to catch NaNs */
- if (!(x<8))
- return 1;
- if (!(x>-8))
- return -1;
-#ifndef FIXED_POINT
- /* Another check in case of -ffast-math */
- if (celt_isnan(x))
- return 0;
-#endif
- if (x<0)
- {
- x=-x;
- sign=-1;
- }
- i = (int)floor(.5f+25*x);
- x -= .04f*i;
- y = tansig_table[i];
- dy = 1-y*y;
- y = y + x*dy*(1 - y*x);
- return sign*y;
+ float32x4_t K0 = vdupq_n_f32(0.99992522f);
+ float32x4_t K1 = vdupq_n_f32(0.69583354f);
+ float32x4_t K2 = vdupq_n_f32(0.22606716f);
+ float32x4_t K3 = vdupq_n_f32(0.078024523f);
+ float32x4_t Y = vmlaq_f32(K0, x, vmlaq_f32(K1, x, vmlaq_f32(K2, K3, x))); /* Y = K0 + x*(K1 + x*(K2 + K3*x)): cubic approximation of 2^x for the fractional part */
+
+ /* compute 2^i: i already contains the +127 bias, so shifting it left by 23 bits places it in the float32 exponent field */
+ float32x4_t exponent = vreinterpretq_f32_s32(vshlq_n_s32(i, 23));
+
+ Y = vmulq_f32(Y, exponent); /* combine the fractional and integer powers of two */
+ return Y;
}
-static OPUS_INLINE float sigmoid_approx(float x)
+static OPUS_INLINE float celt_exp(float x) /* scalar exp(x) approximation, computed with the four-lane exp4_approx() */
{
- return .5f + .5f*tansig_approx(.5f*x);
+ float out[4];
+ float32x4_t X, Y;
+ X = vdupq_n_f32(x); /* broadcast x to all four lanes */
+ Y = exp4_approx(X);
+ vst1q_f32(out, Y); /* spill the vector so one lane can be read back as a plain float */
+ return out[0]; /* all lanes received the same input; return the first */
}
static void softmax(float *y, const float *x, int N)
{
int i;
- for (i=0;i<N;i++)
- y[i] = celt_exp_neon(x[i]);
+ for (i=0;i<N-3;i+=4) /* vector path: y[i..i+3] = exp4_approx(x[i..i+3]), four elements per iteration */
+ {
+ float32x4_t X, Y;
+ X = vld1q_f32(&x[i]);
+ Y = exp4_approx(X);
+ vst1q_f32(&y[i], Y);
+ }
+ for (;i<N;i++) /* scalar tail for the remaining (fewer than four) elements */
+ y[i] = celt_exp(x[i]); /* NOTE(review): only the exponential is applied in this function; no division by the sum is visible here - presumably the caller normalises, confirm */
}
static void vec_tanh(float *y, const float *x, int N)
{
int i;
- for (i=0;i<N;i++)
+ for (i=0;i<N-3;i+=4) /* vector path: four elements per iteration */
{
- y[i] = tansig_approx(x[i]);
+ const float32x4_t two = vdupq_n_f32(2.f);
+ const float32x4_t one = vdupq_n_f32(1.f);
+ float32x4_t X, Y;
+ X = vld1q_f32(&x[i]);
+ X = vmulq_f32(X, two); /* 2*x */
+ Y = exp4_approx(X); /* Y ~= exp(2*x) */
+ Y = vmulq_f32(vsubq_f32(Y, one), vrecpeq_f32(vaddq_f32(Y, one))); /* tanh(x) = (exp(2x)-1)/(exp(2x)+1); NOTE(review): vrecpeq_f32 is only a reciprocal estimate, so these lanes may differ slightly from the exact division used in the tail loop below - confirm this is acceptable */
+ vst1q_f32(&y[i], Y);
}
+ for (;i<N;i++) /* scalar tail for the remaining (fewer than four) elements */
+ {
+ float ex2;
+ ex2 = celt_exp(2*x[i]);
+ y[i] = (ex2-1)/(ex2+1); /* same formula as the vector path, with an exact division */
+ }
}
static void vec_sigmoid(float *y, const float *x, int N)
{
int i;
- for (i=0;i<N;i++)
+ for (i=0;i<N-3;i+=4) /* vector path: four elements per iteration */
{
- y[i] = sigmoid_approx(x[i]);
+ const float32x4_t one = vdupq_n_f32(1.f);
+ float32x4_t X, Y;
+ X = vld1q_f32(&x[i]);
+ Y = exp4_approx(X); /* Y ~= exp(x) */
+ Y = vmulq_f32(Y, vrecpeq_f32(vaddq_f32(Y, one))); /* sigmoid(x) = exp(x)/(exp(x)+1); NOTE(review): vrecpeq_f32 is only a reciprocal estimate, so these lanes may differ slightly from the exact division used in the tail loop below - confirm this is acceptable */
+ vst1q_f32(&y[i], Y);
+ }
+ for (;i<N;i++) /* scalar tail for the remaining (fewer than four) elements */
+ {
+ float ex;
+ ex = celt_exp(x[i]);
+ y[i] = (ex)/(ex+1); /* same formula as the vector path, with an exact division */
}
}
#endif
--
⑨