ref: c395a68b7dc246617f6dc981add1cebffb87502c
parent: 05f4851dcded87b7edec0e309319d8727fc79377
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Fri Nov 30 05:46:32 EST 2018
moving code around: fold the AVX2 gemm_accum16()/sparse_gemm_accum16() kernels into the existing __AVX2__ block, keep the scalar fallbacks under the #else, and hoist relu() out of the #ifdef
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -132,7 +132,73 @@
y[i] = (ex)/(ex+1);
}
}
-#else
+
+static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float * restrict y;
+ __m256 vy0, vy8;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ vy8 = _mm256_loadu_ps(&y[8]);
+ for (j=0;j<cols;j++)
+ {
+ __m256 vxj;
+ __m256 vw;
+ vxj = _mm256_broadcast_ss(&x[j]);
+
+ vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
+ vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ _mm256_storeu_ps (&y[8], vy8);
+ }
+}
+static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float * restrict y;
+ int cols;
+ __m256 vy0, vy8;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ vy8 = _mm256_loadu_ps(&y[8]);
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int id;
+ __m256 vxj;
+ __m256 vw;
+ id = *idx++;
+ vxj = _mm256_broadcast_ss(&x[id]);
+
+ vw = _mm256_loadu_ps(&weights[0]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vw = _mm256_loadu_ps(&weights[8]);
+ vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+ weights += 16;
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ _mm256_storeu_ps (&y[8], vy8);
+ }
+}
+
+
+#else /* No AVX2/FMA support */
+
+
+#warning Compiling without any vectorization. This code will be very slow
+#warning Try adding -mavx2 -mfma
+
+
static float celt_exp2(float x)
{
int integer;
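The gemm_accum16() block added in the hunk above is a dense matrix-vector multiply-accumulate: weights are stored column-major with a stride of col_stride floats between columns, and each outer-loop iteration updates 16 consecutive outputs (two 8-wide AVX registers) with FMA. A minimal scalar sketch of the same computation, for reference only and not part of the patch (the _ref name is hypothetical, and rows is assumed to be a multiple of 16):

/* Reference-only scalar equivalent of the AVX2 gemm_accum16() above:
 * out[i+k] += weights[j*col_stride + i + k] * x[j] for k = 0..15. */
static void gemm_accum16_ref(float *out, const float *weights, int rows,
                             int cols, int col_stride, const float *x)
{
   int i, j, k;
   for (i=0;i<rows;i+=16)
   {
      for (j=0;j<cols;j++)
      {
         /* Column j contributes x[j] times 16 consecutive weights. */
         for (k=0;k<16;k++)
            out[i+k] += weights[j*col_stride + i + k]*x[j];
      }
   }
}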
@@ -211,88 +277,13 @@
}
}
-
-#endif
-
-
-
-static OPUS_INLINE float relu(float x)
-{
- return x < 0 ? 0 : x;
-}
-
-#ifdef __AVX2__
-#include <immintrin.h>
static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
{
- float * restrict y;
- __m256 vy0, vy8;
- y = &out[i];
- vy0 = _mm256_loadu_ps(&y[0]);
- vy8 = _mm256_loadu_ps(&y[8]);
for (j=0;j<cols;j++)
{
- __m256 vxj;
- __m256 vw;
- vxj = _mm256_broadcast_ss(&x[j]);
-
- vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
- vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
- vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
- vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
- }
- _mm256_storeu_ps (&y[0], vy0);
- _mm256_storeu_ps (&y[8], vy8);
- }
-}
-static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
-{
- int i, j;
- for (i=0;i<rows;i+=16)
- {
- float * restrict y;
- int cols;
- __m256 vy0, vy8;
- y = &out[i];
- vy0 = _mm256_loadu_ps(&y[0]);
- vy8 = _mm256_loadu_ps(&y[8]);
- cols = *idx++;
- for (j=0;j<cols;j++)
- {
- int id;
- __m256 vxj;
- __m256 vw;
- id = *idx++;
- vxj = _mm256_broadcast_ss(&x[id]);
-
- vw = _mm256_loadu_ps(&weights[0]);
- vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
- vw = _mm256_loadu_ps(&weights[8]);
- vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
- weights += 16;
- }
- _mm256_storeu_ps (&y[0], vy0);
- _mm256_storeu_ps (&y[8], vy8);
- }
-}
-
-#else
-
-#warning Compiling without any vectorization. This code will be very slow
-#warning Try adding -mavx2 -mfma
-
-static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
- int i, j;
- for (i=0;i<rows;i+=16)
- {
- for (j=0;j<cols;j++)
- {
const float * restrict w;
float * restrict y;
float xj;
@@ -353,6 +344,14 @@
}
}
#endif
+
+
+
+static OPUS_INLINE float relu(float x)
+{
+ return x < 0 ? 0 : x;
+}
+
static void gemm_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
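The sparse_gemm_accum16() kernel moved into the __AVX2__ block in the first hunk relies on a compact block-sparse layout that is easy to miss while reading the diff: for every group of 16 output rows, idx stores the number of non-zero input columns followed by their column indices, and weights keeps only the surviving 16-float blocks back to back. A scalar sketch of that layout, for reference only and not part of the patch (the _ref name is hypothetical, and rows is assumed to be a multiple of 16):

/* Reference-only scalar equivalent of the AVX2 sparse_gemm_accum16() above.
 * idx layout per 16-row block: [nb_nonzero, col_0, col_1, ...];
 * weights holds 16 floats for each (block, non-zero column) pair. */
static void sparse_gemm_accum16_ref(float *out, const float *weights, int rows,
                                    const int *idx, const float *x)
{
   int i, j, k;
   for (i=0;i<rows;i+=16)
   {
      int cols = *idx++;    /* non-zero columns for this 16-row block */
      for (j=0;j<cols;j++)
      {
         int id = *idx++;   /* input column used by the next 16 weights */
         for (k=0;k<16;k++)
            out[i+k] += weights[k]*x[id];
         weights += 16;     /* weight blocks are stored contiguously */
      }
   }
}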
--