ref: e695355ba5ee48eaab0c5c0f65879ac412210add
parent: 06489b42ddc25b2da0b52c78a43ec12ccd9c6bb1
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Sat Dec 26 21:00:36 EST 2020
cleanup: dump the non-DOT_PROD weights with the qweight dtype, move the qweight typedef next to the per-path DOT_PROD/USE_SU_BIAS defines, and drop the disabled scalar sparse_sgemv_accum8x4 variant
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -84,7 +84,7 @@
W = np.minimum(127, np.maximum(-128, np.round(W*128)))
printVector(f, W.astype('int'), name, dtype='qweight')
f.write('#else /*DOT_PROD*/\n')
- printVector(f, W0, name, dtype='nnet_weight')
+ printVector(f, W0, name, dtype='qweight')
f.write('#endif /*DOT_PROD*/\n')
#idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
printVector(f, idx, name + '_idx', dtype='int')
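The dtype argument becomes the C element type of the array written into the generated source, so with this change both branches declare their data as qweight and pick up the typedef from vec.h/vec_avx.h. A hypothetical excerpt of the generated output, assuming printVector emits a static const array of the given dtype; the array name and values are illustrative only:

    /* Hypothetical excerpt of the generated weight file (name and values are
     * made up).  qweight is the typedef from vec.h / vec_avx.h, so the same
     * declaration compiles whichever way DOT_PROD is set. */
    #ifdef DOT_PROD
    static const qweight gru_a_weights[] = {
       12, -37, 0, 5 /* int8 values: round(128*w), clamped to [-128,127] */
    };
    #else /*DOT_PROD*/
    static const qweight gru_a_weights[] = {
       0.09375f, -0.2890625f, 0.0f, 0.0390625f /* unquantized float weights (W0) */
    };
    #endif /*DOT_PROD*/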
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -34,11 +34,6 @@
#include <math.h>
#include "arch.h"
-#ifdef DOT_PROD
-typedef signed char qweight;
-#else
-typedef float qweight;
-#endif
#ifdef __AVX__
#include "vec_avx.h"
@@ -46,9 +41,18 @@
#include "vec_neon.h"
#else
-//#define USE_SU_BIAS
#define NO_OPTIMIZATIONS
+
+//#define DOT_PROD
+//#define USE_SU_BIAS
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
/* No AVX2/FMA support */
#ifndef LPCNET_TEST
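With the typedef now sitting next to the (commented-out) DOT_PROD/USE_SU_BIAS switches, the generic build keeps float weights by default and can opt into the 8-bit path by uncommenting them. Below is a minimal scalar sketch, not part of the library, of the arithmetic that signed-char qweights plus USE_SU_BIAS-style input biasing imply; the function name and dense layout are made up, and the constants mirror SCALE/SCALE_1 from vec_avx.h further down:

    #include <math.h>

    #define SKETCH_SCALE   (128.f*127.f)
    #define SKETCH_SCALE_1 (1.f/128.f/127.f)

    /* Dense variant for illustration only; the real kernels are sparse and
     * vectorized.  Weights are round(128*w) stored as signed char (qweight),
     * inputs are biased into unsigned 8-bit as 127 + round(127*x); the extra
     * +127*sum(w) term that the biasing introduces is assumed to be
     * compensated elsewhere (e.g. folded into the layer bias at dump time),
     * as in the scalar reference removed from vec_avx.h below. */
    static void dense_qgemv_accum_sketch(float *out, const signed char *w,
                                         int rows, int cols, const float *x)
    {
       int i, j;
       for (i=0;i<rows;i++)
       {
          int acc = (int)(SKETCH_SCALE*out[i]);   /* accumulate on top of out */
          for (j=0;j<cols;j++)
          {
             int xu = 127 + (int)floor(.5 + 127*x[j]);
             acc += w[i*cols + j]*xu;
          }
          out[i] = SKETCH_SCALE_1*acc;            /* back to float scale */
       }
    }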
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -34,6 +34,9 @@
#include <immintrin.h>
+#define DOT_PROD
+#define USE_SU_BIAS
+
#ifdef __AVX2__
static inline __m256 exp8_approx(__m256 X)
{
@@ -222,9 +225,15 @@
}
#ifdef DOT_PROD
-
#define USE_SU_BIAS
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+
#define MAX_INPUTS (2048)
#define MAX_OUTPUTS (8192)
@@ -232,7 +241,6 @@
#define SCALE (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)
-#if 1
static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
{
@@ -285,43 +293,7 @@
}
for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
}
-#else
-static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
-{
- int i, j;
- unsigned char x[MAX_INPUTS];
- int out[MAX_OUTPUTS];
- for (i=0;i<rows;i++) out[i] = SCALE*_out[i];
- for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
- for (i=0;i<rows;i+=8)
- {
- int * restrict y;
- int colblocks;
- colblocks = *idx++;
- y = &out[i];
- for (j=0;j<colblocks;j++)
- {
- int pos;
- int xj0, xj1, xj2, xj3;
- pos = 4 * (*idx++);
- xj0 = x[pos+0];
- xj1 = x[pos+1];
- xj2 = x[pos+2];
- xj3 = x[pos+3];
- y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
- y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
- y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
- y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
- y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
- y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
- y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
- y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
- w += 32;
- }
- }
- for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
-}
-#endif
+
#else /*DOT_PROD*/
static inline void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
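The scalar reference removed above also documents the index format these kernels share: for every group of 8 output rows, the idx stream holds a count of active 4-column blocks followed by each block's starting column in units of 4, and the weights are stored as consecutive 8x4 tiles of 32 qweights. A sketch of walking that layout with float weights, illustrative only and not the body of the float kernel whose signature appears above:

    /* Illustrative scalar walk of the 8x4 block-sparse layout; not the actual
     * float kernel from vec_avx.h.  idx: [colblocks, blockpos, blockpos, ...]
     * per 8-row group; w: 32 weights per block, 4 per output row. */
    static void sparse_sgemv_accum8x4_sketch(float *out, const float *w, int rows,
                                             const int *idx, const float *x)
    {
       int i, j, k;
       for (i=0;i<rows;i+=8)
       {
          int colblocks = *idx++;            /* number of non-zero 4-col blocks */
          for (j=0;j<colblocks;j++)
          {
             int pos = 4*(*idx++);           /* starting input column of the block */
             for (k=0;k<8;k++)               /* 8 output rows per block */
             {
                out[i+k] += w[0]*x[pos] + w[1]*x[pos+1]
                          + w[2]*x[pos+2] + w[3]*x[pos+3];
                w += 4;
             }
          }
       }
    }

With the vectorized DOT_PROD implementation kept as the only quantized path, the duplicate scalar variant deleted by this commit no longer needs to be maintained alongside it.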