shithub: opus

--- a/dnn/dump_lpcnet.py

+++ b/dnn/dump_lpcnet.py

@@ -41,10 +41,10 @@

 max_conv_inputs = 1

 max_mdense_tmp = 1

-def printVector(f, vector, name):

+def printVector(f, vector, name, dtype='float'):

     v = np.reshape(vector, (-1));

     #print('static const float ', name, '[', len(v), '] = \n', file=f)

-    f.write('static const float {}[{}] = {{\n   '.format(name, len(v)))

+    f.write('static const {} {}[{}] = {{\n   '.format(dtype, name, len(v)))

     for i in range(0, len(v)):

         f.write('{}'.format(v[i]))

         if (i!=len(v)-1):

@@ -59,11 +59,51 @@

     f.write('\n};\n\n')

     return;

+def printSparseVector(f, A, name):

+    """Write matrix A to f as three C arrays: name_diag, name and name_idx.
+
+    f    -- writable file object, handed straight to printVector.
+    A    -- 2-D numpy array; N is taken from A.shape[0]. The column slicing
+            below treats it as three side-by-side blocks of N columns
+            (presumably an N x 3N GRU recurrent matrix -- confirm with caller).
+            NOTE: the diagonal of each block is subtracted from A in place.
+    name -- base identifier used for the three emitted arrays.
+    """

+    N = A.shape[0]

+    W = np.zeros((0,))

+    # Main diagonals of the three column blocks, concatenated into one vector.
+    diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])

+    # Remove those diagonals from A itself; the slice assignments write into A.
+    A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))

+    A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))

+    A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))

+    printVector(f, diag, name + '_diag')

+    # Flatten the remainder in groups of 16 columns: for each group, append the
+    # 16 values of row 0, then row 1, ... up to row N-1. 3*N//16 rounds down,
+    # so columns past the last full group of 16 are not written.
+    for i in range(3*N//16):

+        for j in range(N):

+            W = np.concatenate([W, A[j, i*16:(i+1)*16]])

+    printVector(f, W, name)

+    # One index record per 16-column group: the count N followed by 0..N-1.
+    # Every row is listed, so this index does not skip any entry.
+    idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)

+    printVector(f, idx, name + '_idx', dtype='int')

+    return;

 def dump_layer_ignore(self, f, hf):

     print("ignoring layer " + self.name + " of type " + self.__class__.__name__)

     return False

 Layer.dump_layer = dump_layer_ignore

+def dump_sparse_gru(self, f, hf):

+    """Dump a layer as a C SparseGRULayer named 'sparse_' + self.name.
+
+    self -- layer object; .name and .get_weights() are read, plus .activation
+            and .reset_after when the layer has them.
+    f    -- file receiving the weight arrays and the struct initializer.
+    hf   -- file receiving the size #defines and the extern declaration.
+
+    Updates the module-level max_rnn_neurons and always returns True.
+    """

+    global max_rnn_neurons

+    name = 'sparse_' + self.name

+    print("printing layer " + name + " of type sparse " + self.__class__.__name__)

+    weights = self.get_weights()

+    # weights[1] is emitted as the recurrent matrix and weights[-1] as the bias;
+    # weights[0] is only consulted for its shape further down.
+    printSparseVector(f, weights[1], name + '_recurrent_weights')

+    printVector(f, weights[-1], name + '_bias')

+    # Fall back to TANH when the layer exposes no activation attribute.
+    if hasattr(self, 'activation'):

+        activation = self.activation.__name__.upper()

+    else:

+        activation = 'TANH'

+    # reset_after is 0 only when the attribute exists and is falsy; otherwise 1.
+    if hasattr(self, 'reset_after') and not self.reset_after:

+        reset_after = 0

+    else:

+        reset_after = 1

+    # Neuron count: a third of the column count of weights[0].
+    neurons = weights[0].shape[1]//3

+    max_rnn_neurons = max(max_rnn_neurons, neurons)

+    # Initializer order: bias, diag, weights, idx, neuron count, activation, reset_after.
+    f.write('const SparseGRULayer {} = {{\n   {}_bias,\n   {}_recurrent_weights_diag,\n   {}_recurrent_weights,\n   {}_recurrent_weights_idx,\n   {}, ACTIVATION_{}, {}\n}};\n\n'

+            .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))

+    # OUT_SIZE and STATE_SIZE are both written with the same neuron count.
+    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))

+    hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))

+    hf.write('extern const SparseGRULayer {};\n\n'.format(name));

+    return True

 def dump_gru_layer(self, f, hf):

     global max_rnn_neurons

     name = self.name

@@ -204,6 +244,8 @@

 for i, layer in enumerate(model.layers):

     if layer.dump_layer(f, hf):

         layer_list.append(layer.name)

+dump_sparse_gru(model.get_layer('gru_a'), f, hf)

 hf.write('#define MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))

 hf.write('#define MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))

--- a/dnn/lpcnet.c

+++ b/dnn/lpcnet.c

@@ -122,7 +122,8 @@

     accum_embedding(&gru_a_embed_sig, gru_a_input, last_sig);

     accum_embedding(&gru_a_embed_pred, gru_a_input, pred);

     accum_embedding(&gru_a_embed_exc, gru_a_input, last_exc);

-    compute_gru3(&gru_a, net->gru_a_state, gru_a_input);

+    /*compute_gru3(&gru_a, net->gru_a_state, gru_a_input);*/

+    compute_sparse_gru(&sparse_gru_a, net->gru_a_state, gru_a_input);

     RNN_COPY(in_b, net->gru_a_state, GRU_A_STATE_SIZE);

     RNN_COPY(&in_b[GRU_A_STATE_SIZE], condition, FEATURE_DENSE2_OUT_SIZE);

     compute_gru2(&gru_b, net->gru_b_state, in_b);

--- a/dnn/nnet.c

+++ b/dnn/nnet.c

@@ -105,6 +105,38 @@

       _mm256_storeu_ps (&y[8], vy8);

+/* Accumulate an indexed matrix-vector product into out using 256-bit
+   AVX/FMA intrinsics, 16 outputs at a time.
+   out     -- read and written in blocks of 16 floats; products are added to
+              the values already stored there.
+   weights -- consumed sequentially, 16 floats per index entry.
+   rows    -- number of outputs; the loop steps by 16 with no tail handling,
+              so this presumably must be a multiple of 16 (confirm with callers).
+   idx     -- for each block of 16 outputs: a count, followed by that many
+              indices into x.
+   x       -- input vector addressed through idx. */
+static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)

+{

+   int i, j;

+   for (i=0;i<rows;i+=16)

+   {

+      float * restrict y;

+      int cols;

+      __m256 vy0, vy8;

+      y = &out[i];

+      /* Start from the current contents of the 16 outputs. */
+      vy0 = _mm256_loadu_ps(&y[0]);

+      vy8 = _mm256_loadu_ps(&y[8]);

+      /* First idx entry of the block: how many index entries follow. */
+      cols = *idx++;

+      for (j=0;j<cols;j++)

+      {

+         int id;

+         __m256 vxj;

+         __m256 vw;

+         /* Position in x that the next 16 weights are multiplied by. */
+         id = *idx++;

+         vxj = _mm256_broadcast_ss(&x[id]);

+         vw = _mm256_loadu_ps(&weights[0]);

+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);

+         vw = _mm256_loadu_ps(&weights[8]);

+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);

+         /* Weights are stored back to back, so just advance past this group. */
+         weights += 16;

+      }

+      _mm256_storeu_ps (&y[0], vy0);

+      _mm256_storeu_ps (&y[8], vy8);

+   }

+}

 #else

 static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)

@@ -346,6 +378,43 @@

    for (i=0;i<3*N;i++)

       recur[i] = gru->bias[3*N + i];

    gemm_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);

+   for (i=0;i<2*N;i++)

+      zrh[i] += recur[i];

+   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);

+   for (i=0;i<N;i++)

+      h[i] += recur[2*N+i]*r[i];

+   compute_activation(h, h, N, gru->activation);

+   for (i=0;i<N;i++)

+      h[i] = z[i]*state[i] + (1-z[i])*h[i];

+   for (i=0;i<N;i++)

+      state[i] = h[i];

+}

+void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input)

+{

+   int i, k;

+   int N;

+   float zrh[3*MAX_RNN_NEURONS];

+   float recur[3*MAX_RNN_NEURONS];

+   float *z;

+   float *r;

+   float *h;

+   N = gru->nb_neurons;

+   z = zrh;

+   r = &zrh[N];

+   h = &zrh[2*N];

+   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);

+   celt_assert(input != state);

+   celt_assert(gru->reset_after);

+   RNN_COPY(zrh, input, 3*N);

+   for (i=0;i<3*N;i++)

+      recur[i] = gru->bias[3*N + i];

+   for (k=0;k<3;k++)

+   {

+      for (i=0;i<N;i++)

+         recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];

+   }

+   sparse_gemm_accum16(recur, gru->recurrent_weights, 3*N, gru->idx, state);

    for (i=0;i<2*N;i++)

       zrh[i] += recur[i];

    compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);

--- a/dnn/nnet.h

+++ b/dnn/nnet.h

@@ -64,6 +64,16 @@

 typedef struct {

   const float *bias;

+  const float *diag_weights;       /* 3*nb_neurons values; compute_sparse_gru multiplies entry k*N+i by state[i] */

+  const float *recurrent_weights;  /* passed to sparse_gemm_accum16, which reads 16 floats per idx entry */

+  const int *idx;                  /* per block of 16 outputs: a count, then that many state indices */

+  int nb_neurons;                  /* N in compute_sparse_gru; asserted to be <= MAX_RNN_NEURONS */

+  int activation;                  /* dump_lpcnet.py writes an ACTIVATION_* constant here */

+  int reset_after;                 /* compute_sparse_gru asserts that this is non-zero */

+} SparseGRULayer;

+typedef struct {

+  const float *bias;

   const float *input_weights;

   int nb_inputs;

   int kernel_size;

@@ -88,6 +98,8 @@

 void compute_gru2(const GRULayer *gru, float *state, const float *input);

 void compute_gru3(const GRULayer *gru, float *state, const float *input);

+void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input);

 void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input);

--

⑨