ref: bce779886de72547db991c5d4dec1ae1fc1a7a80
parent: 11736ca9e3654397bfe472e51f2ec96433764efe
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Fri Dec 25 21:15:57 EST 2020
WIP: signed*unsigned arithmetic
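
sparse_sgemv_accum8x4() now takes the number of input columns so the
whole input vector can be quantized up front, and vec.h becomes the
single place that selects vec_avx.h, vec_neon.h or the plain C fallback
and defines the qweight type (include guards are added to vec.h,
vec_avx.h and tansig_table.h along the way). Under DOT_PROD there are
now two C variants of the kernel: the existing signed one and a
USE_SU_BIAS one that offsets the quantized inputs to unsigned
(127 + round(127*x)), presumably so the inner loop can later map onto
signed*unsigned multiply-accumulate instructions. The constant term
that the unsigned offset adds to every output is folded into a second
bias vector, subias, which dump_lpcnet.py now emits and SparseGRULayer
carries; the sparse GRU code in nnet.c loads it instead of the regular
bias when USE_SU_BIAS is defined.

Rough numpy sketch (not part of the patch; variable names are made up)
of why subtracting sum(clip(W, -1, 1)) from the bias compensates for
the unsigned input offset:

    import numpy as np
    rng = np.random.default_rng(0)
    W = np.clip(rng.standard_normal((16, 8)), -1, 1)  # float weights, already in [-1, 1]
    x = np.clip(rng.standard_normal(16), -1, 1)       # input activations in [-1, 1]
    bias = rng.standard_normal(8)
    qW = np.round(128*W)                              # quantized weights (scale 128)
    s = np.round(127*x)                               # signed quantized inputs
    u = 127 + s                                       # unsigned quantized inputs, 0..254
    signed_out = bias + (qW.T @ s)/(128.*127.)
    subias = bias - np.sum(W, axis=0)                 # same correction as dump_lpcnet.py
    unsigned_out = subias + (qW.T @ u)/(128.*127.)
    # The offset contributes 127*sum(qW)/(128*127) = sum(qW)/128 ~ sum(W),
    # so the two results agree up to weight-quantization error.
    print(np.max(np.abs(signed_out - unsigned_out)))
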
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -39,17 +39,14 @@
#include "nnet.h"
#include "nnet_data.h"
-#define SOFTMAX_HACK
-
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
+#ifdef NO_OPTIMIZATIONS
#warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
#endif
+
+#define SOFTMAX_HACK
+
+
static OPUS_INLINE float relu(float x)
{
return x < 0 ? 0 : x;
@@ -294,14 +291,19 @@
celt_assert(input != state);
celt_assert(gru->reset_after);
RNN_COPY(zrh, input, 3*N);
+#ifdef USE_SU_BIAS
for (i=0;i<3*N;i++)
- recur[i] = gru->bias[3*N + i];
+ recur[i] = gru->subias[3*N + i];
+#else
+ for (i=0;i<3*N;i++)
+ recur[i] = gru->bias[3*N + i];
+#endif
for (k=0;k<3;k++)
{
for (i=0;i<N;i++)
recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
}
- sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+ sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, 3*N, gru->idx, state);
for (i=0;i<2*N;i++)
zrh[i] += recur[i];
compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -28,6 +28,8 @@
#ifndef _NNET_H_
#define _NNET_H_
+#include "vec.h"
+
#define ACTIVATION_LINEAR 0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_TANH 2
@@ -34,12 +36,6 @@
#define ACTIVATION_RELU 3
#define ACTIVATION_SOFTMAX 4
-#ifdef DOT_PROD
-typedef signed char qweight;
-#else
-typedef float qweight;
-#endif
-
typedef struct {
const float *bias;
const float *input_weights;
@@ -70,6 +66,7 @@
typedef struct {
const float *bias;
+ const float *subias;
const float *diag_weights;
const qweight *recurrent_weights;
const int *idx;
--- a/dnn/tansig_table.h
+++ b/dnn/tansig_table.h
@@ -1,5 +1,8 @@
/* This file is auto-generated by gen_tables */
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
static const float tansig_table[201] = {
0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f,
};
+
+#endif /*TANSIG_TABLE_H*/
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -102,6 +102,9 @@
weights = self.get_weights()
printSparseVector(f, weights[1], name + '_recurrent_weights')
printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[1,:] = subias[1,:] - np.sum(np.clip(weights[1], -1, 1),axis=0)
+ printVector(f, subias, name + '_subias')
if hasattr(self, 'activation'):
activation = self.activation.__name__.upper()
else:
@@ -112,8 +115,8 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
- .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+ .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('extern const SparseGRULayer {};\n\n'.format(name));
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -26,11 +26,33 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include "nnet.h"
+#ifndef VEC_H
+#define VEC_H
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
+#else
+
+//#define USE_SU_BIAS
+
+#define NO_OPTIMIZATIONS
+
/* No AVX2/FMA support */
#ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
{
int integer;
float frac;
@@ -50,7 +72,7 @@
}
#define celt_exp(x) celt_exp2((x)*1.44269504f)
-static float tansig_approx(float x)
+static inline float tansig_approx(float x)
{
int i;
float y, dy;
@@ -69,12 +91,12 @@
return sign*y;
}
-static OPUS_INLINE float sigmoid_approx(float x)
+static inline float sigmoid_approx(float x)
{
return .5f + .5f*tansig_approx(.5f*x);
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -81,7 +103,7 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -90,7 +112,7 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -99,7 +121,7 @@
}
}
#endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -132,7 +154,7 @@
}
}
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -167,42 +189,90 @@
}
#ifdef DOT_PROD
+
+#define MAX_INPUTS (2048)
+
+
+#define SCALE (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+
+#ifdef USE_SU_BIAS
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
{
int i, j;
+ unsigned x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
for (i=0;i<rows;i+=8)
{
- int cols;
- cols = *idx++;
- for (j=0;j<cols;j++)
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
{
int pos;
float * restrict y;
int xj0, xj1, xj2, xj3;
pos = 4 * (*idx++);
- xj0 = floor(.5+127*x[pos+0]);
- xj1 = floor(.5+127*x[pos+1]);
- xj2 = floor(.5+127*x[pos+2]);
- xj3 = floor(.5+127*x[pos+3]);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
y = &out[i];
- y[0] += SCALE_1*(w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
- y[1] += SCALE_1*(w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
- y[2] += SCALE_1*(w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
- y[3] += SCALE_1*(w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
- y[4] += SCALE_1*(w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
- y[5] += SCALE_1*(w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
- y[6] += SCALE_1*(w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
- y[7] += SCALE_1*(w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
w += 32;
}
}
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
}
+#else /*USE_SU_BIAS*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif /*USE_SU_BIAS*/
-#else
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+#else /*DOT_PROD*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
{
int i, j;
+ (void)ignore;
for (i=0;i<rows;i+=8)
{
int cols;
@@ -257,4 +327,8 @@
}
}
}
-#endif
+#endif /*DOT_PROD*/
+
+
+#endif /*no optimizations*/
+#endif /*VEC_H*/
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -29,6 +29,9 @@
AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
*/
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
#include <immintrin.h>
#ifdef __AVX2__
@@ -246,9 +249,10 @@
}
#else
-static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
{
int i, j;
+ (void)ignore;
for (i=0;i<rows;i+=8)
{
float * restrict y;
@@ -286,3 +290,4 @@
}
#endif
+#endif /*VEC_AVX_H*/
--