shithub: opus

Download patch

ref: c395a68b7dc246617f6dc981add1cebffb87502c
parent: 05f4851dcded87b7edec0e309319d8727fc79377
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Fri Nov 30 05:46:32 EST 2018

moving code around

--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -132,7 +132,73 @@
         y[i] = (ex)/(ex+1);
     }
 }
-#else
+/* AVX2/FMA kernel: out[i] += sum over j of weights[j*col_stride + i]*x[j], 16 rows per pass (no tail handling; rows is presumably a multiple of 16 -- confirm with callers). */
+static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      float * restrict y;
+      __m256 vy0, vy8;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);   /* running sums for rows i..i+7, seeded from the current out values */
+      vy8 = _mm256_loadu_ps(&y[8]);   /* running sums for rows i+8..i+15 */
+      for (j=0;j<cols;j++)
+      {
+         __m256 vxj;
+         __m256 vw;
+         vxj = _mm256_broadcast_ss(&x[j]);   /* x[j] replicated into all 8 lanes */
+         /* Rows i..i+7: add the weights of column j scaled by x[j]. */
+         vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+         /* Rows i+8..i+15: same update for the second 8-float vector. */
+         vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+      }
+      _mm256_storeu_ps (&y[0], vy0);   /* write the 16 accumulated sums back to out */
+      _mm256_storeu_ps (&y[8], vy8);
+   }
+}
+static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+{  /* Sparse accumulate into out, 16 rows per pass: idx is consumed as [count, id, id, ...] per 16-row block and weights as one packed group of 16 floats per id. */
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      float * restrict y;
+      int cols;
+      __m256 vy0, vy8;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);   /* running sums for rows i..i+7, seeded from the current out values */
+      vy8 = _mm256_loadu_ps(&y[8]);   /* running sums for rows i+8..i+15 */
+      cols = *idx++;   /* number of index entries stored for this 16-row block */
+      for (j=0;j<cols;j++)
+      {
+         int id;
+         __m256 vxj;
+         __m256 vw;
+         id = *idx++;   /* position in x of the input this entry multiplies */
+         vxj = _mm256_broadcast_ss(&x[id]);
+         /* weights[0..7] feed rows i..i+7. */
+         vw = _mm256_loadu_ps(&weights[0]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+         /* weights[8..15] feed rows i+8..i+15. */
+         vw = _mm256_loadu_ps(&weights[8]);
+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+         weights += 16;   /* step to the next packed group of 16 weights */
+      }
+      _mm256_storeu_ps (&y[0], vy0);   /* write the 16 accumulated sums back to out */
+      _mm256_storeu_ps (&y[8], vy8);
+   }
+}
+
+
+#else /* No AVX2/FMA support */
+
+
+#warning Compiling without any vectorization. This code will be very slow
+#warning Try adding -mavx2 -mfma
+
+
 static float celt_exp2(float x)
 {
    int integer;
@@ -211,88 +277,13 @@
     }
 }
 
-
-#endif
-
-
-
-static OPUS_INLINE float relu(float x)
-{
-   return x < 0 ? 0 : x;
-}
-
-#ifdef __AVX2__
-#include <immintrin.h>
 static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=16)
    {
-      float * restrict y;
-      __m256 vy0, vy8;
-      y = &out[i];
-      vy0 = _mm256_loadu_ps(&y[0]);
-      vy8 = _mm256_loadu_ps(&y[8]);
       for (j=0;j<cols;j++)
       {
-         __m256 vxj;
-         __m256 vw;
-         vxj = _mm256_broadcast_ss(&x[j]);
-
-         vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
-         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
-         vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
-         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
-      }
-      _mm256_storeu_ps (&y[0], vy0);
-      _mm256_storeu_ps (&y[8], vy8);
-   }
-}
-static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      float * restrict y;
-      int cols;
-      __m256 vy0, vy8;
-      y = &out[i];
-      vy0 = _mm256_loadu_ps(&y[0]);
-      vy8 = _mm256_loadu_ps(&y[8]);
-      cols = *idx++;
-      for (j=0;j<cols;j++)
-      {
-         int id;
-         __m256 vxj;
-         __m256 vw;
-         id = *idx++;
-         vxj = _mm256_broadcast_ss(&x[id]);
-
-         vw = _mm256_loadu_ps(&weights[0]);
-         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
-         vw = _mm256_loadu_ps(&weights[8]);
-         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
-         weights += 16;
-      }
-      _mm256_storeu_ps (&y[0], vy0);
-      _mm256_storeu_ps (&y[8], vy8);
-   }
-}
-
-#else
-
-#warning Compiling without any vectorization. This code will be very slow
-#warning Try adding -mavx2 -mfma
-
-static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      for (j=0;j<cols;j++)
-      {
          const float * restrict w;
          float * restrict y;
          float xj;
@@ -353,6 +344,14 @@
    }
 }
 #endif
+
+
+/* Rectified linear unit: yields 0 when x is negative and x itself otherwise. */
+static OPUS_INLINE float relu(float x)
+{
+   return x < 0 ? 0 : x;
+}
+
 
 static void gemm_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
 {
--