ref: 7ee79b63df4cf176ec5c920d993e9b17ed8708fd
parent: d961d009a0b19896fe3f959929ea626309b6e40c
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Thu Nov 29 14:43:59 EST 2018
Add AVX versions of exp(), tanh() and sigmoid(). Now 3x faster than real-time.
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -39,9 +39,121 @@
#include "nnet.h"
#include "nnet_data.h"
-static OPUS_INLINE float tansig_approx(float x)
+#ifdef __AVX2__
+#include <immintrin.h>
+static __m256 exp8_approx(__m256 X) /* Approximates exp() on 8 packed floats as 2^(X*log2(e)); the fractional power uses a cubic polynomial. */
{
+ const __m256 K0 = _mm256_set1_ps(0.99992522f); /* K0..K3: coefficients of the cubic evaluated on the fractional part */
+ const __m256 K1 = _mm256_set1_ps(0.69583354f);
+ const __m256 K2 = _mm256_set1_ps(0.22606716f);
+ const __m256 K3 = _mm256_set1_ps(0.078024523f);
+ const __m256 log2_E = _mm256_set1_ps(1.44269504); /* log2(e): turns exp(x) into 2^(x*log2(e)) */
+ const __m256 max_in = _mm256_set1_ps(50.f); /* clamp bounds; applied after the log2(e) scaling */
+ const __m256 min_in = _mm256_set1_ps(-50.f);
+ const __m256i mask = _mm256_set1_epi32(0x7fffffff); /* every bit except the sign bit */
+ __m256 XF, Y;
+ __m256i I;
+ X = _mm256_mul_ps(X, log2_E);
+ X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X)); /* keep the base-2 exponent within [-50, 50] */
+ XF = _mm256_floor_ps(X); /* integer part of the exponent */
+ I = _mm256_cvtps_epi32(XF);
+ X = _mm256_sub_ps(X, XF); /* fractional part, in [0, 1) */
+ Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0); /* Horner form: K0 + X*(K1 + X*(K2 + X*K3)) */
+ I = _mm256_slli_epi32(I, 23); /* line the integer part up with the single-precision exponent field */
+ Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y)))); /* add I to the exponent bits of Y (scales by 2^I), then clear the sign bit */
+ return Y;
+}
+
+
+static float celt_exp(float x) /* Scalar exp() approximation implemented on top of exp8_approx(). */
+{
+ float out[8];
+ __m256 X, Y;
+ X = _mm256_set1_ps(x); /* broadcast x into all 8 lanes */
+ Y = exp8_approx(X);
+ _mm256_storeu_ps(out, Y);
+ return out[0]; /* every lane was computed from the same x; return the first */
+}
+
+static void softmax(float *y, const float *x, int N) /* Writes the approximate exp(x[i]) into y[i] for i in [0, N); this function does not normalize the result. */
+{
int i;
+ for (i=0;i<N-7;i+=8) /* process full groups of 8 floats */
+ {
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ Y = exp8_approx(X);
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++) /* leftover elements (fewer than 8), one at a time */
+ y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N) /* Writes an approximate tanh(x[i]) into y[i] for i in [0, N), computed as (e^(2x)-1)/(e^(2x)+1). */
+{
+ int i;
+ for (i=0;i<N-7;i+=8) /* process full groups of 8 floats */
+ {
+ const __m256 two = _mm256_set1_ps(2.f);
+ const __m256 one = _mm256_set1_ps(1.f);
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ X = _mm256_mul_ps(X, two);
+ Y = exp8_approx(X); /* Y ~= e^(2x) */
+ Y = _mm256_mul_ps(_mm256_sub_ps(Y, one), _mm256_rcp_ps(_mm256_add_ps(Y, one))); /* (Y-1) * rcp(Y+1); rcp is an approximate reciprocal, not an exact divide */
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++) /* leftover elements (fewer than 8); this path uses a true division */
+ {
+ float ex2;
+ ex2 = celt_exp(2*x[i]);
+ y[i] = (ex2-1)/(ex2+1);
+ }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N) /* Writes an approximate logistic sigmoid of x[i] into y[i] for i in [0, N), computed as e^x/(e^x+1). */
+{
+ int i;
+ for (i=0;i<N-7;i+=8) /* process full groups of 8 floats */
+ {
+ const __m256 one = _mm256_set1_ps(1.f);
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ Y = exp8_approx(X); /* Y ~= e^x */
+ Y = _mm256_mul_ps(Y, _mm256_rcp_ps(_mm256_add_ps(Y, one))); /* Y * rcp(Y+1); rcp is an approximate reciprocal, not an exact divide */
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++) /* leftover elements (fewer than 8); this path uses a true division */
+ {
+ float ex;
+ ex = celt_exp(x[i]);
+ y[i] = (ex)/(ex+1);
+ }
+}
+#else
+static float celt_exp2(float x) /* Scalar approximation of 2^x; returns 0 when floor(x) < -50. */
+{
+ int integer;
+ float frac;
+ union { /* lets the float result be adjusted through its 32-bit pattern */
+ float f;
+ opus_uint32 i;
+ } res;
+ integer = floor(x);
+ if (integer < -50) /* only the low end is guarded; NOTE(review): a large positive x overflows the exponent field below */
+ return 0;
+ frac = x-integer; /* fractional part, in [0, 1) */
+ /* Nominally K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2; the literals below differ slightly from those values */
+ res.f = 0.99992522f + frac * (0.69583354f
+ + frac * (0.22606716f + 0.078024523f*frac));
+ res.i = (res.i + ((opus_uint32)integer<<23)) & 0x7fffffff; /* shift as unsigned: integer can be negative and left-shifting a negative int is undefined; the modulo-2^32 wrap still adds integer to the exponent field */
+ return res.f;
+}
+#define celt_exp(x) celt_exp2((x)*1.44269504f) /* exp(x) = 2^(x*log2(e)) */
+
+static float tansig_approx(float x)
+{
+ int i;
float y, dy;
float sign=1;
/* Tests are reversed to catch NaNs */
@@ -72,6 +184,36 @@
return .5f + .5f*tansig_approx(.5f*x);
}
+static void softmax(float *y, const float *x, int N) /* Scalar fallback: writes celt_exp(x[i]) into y[i] for i in [0, N); this function does not normalize the result. */
+{
+ int i;
+ for (i=0;i<N;i++)
+ y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N) /* Scalar fallback: applies tansig_approx() to each of the N inputs. */
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = tansig_approx(x[i]);
+ }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N) /* Scalar fallback: applies sigmoid_approx() to each of the N inputs. */
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = sigmoid_approx(x[i]);
+ }
+}
+
+
+#endif
+
+
+
static OPUS_INLINE float relu(float x)
{
return x < 0 ? 0 : x;
@@ -191,18 +333,16 @@
{
int i;
if (activation == ACTIVATION_SIGMOID) {
- for (i=0;i<N;i++)
- output[i] = sigmoid_approx(input[i]);
+ vec_sigmoid(output, input, N);
} else if (activation == ACTIVATION_TANH) {
- for (i=0;i<N;i++)
- output[i] = tansig_approx(input[i]);
+ vec_tanh(output, input, N);
} else if (activation == ACTIVATION_RELU) {
for (i=0;i<N;i++)
output[i] = relu(input[i]);
} else if (activation == ACTIVATION_SOFTMAX) {
float sum = 0;
+ softmax(output, input, N);
for (i=0;i<N;i++) {
- output[i] = exp(input[i]);
sum += output[i];
}
sum = 1.f/(sum+1e-30);
--
⑨