ref: a3ef5968222befeca0619c5ce7eda4a42332b88b
parent: d5b6087f4862e8001beaa39ccdb1a0562e9d3473
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Wed Oct 13 18:12:39 EDT 2021
auto-detect end-to-end models

dump_data previously selected the end-to-end excitation output at
compile time via the END2END define; make it a runtime -end2end
option instead. dump_lpcnet.py now detects end-to-end models from
the presence of the rc2lpc weights and writes the matching
#define END2END into the generated header.
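With this change, data for an end-to-end model can be dumped without
rebuilding. A sketch of the intended invocation (file names are
placeholders, not part of this patch):

    ./dump_data -end2end -train speech.s16 features.f32 pcm.s16

Without the option, dump_data keeps the previous default of writing
the computed excitation.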
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -75,7 +75,7 @@
}
-void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes) {
+void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes, int e2e) {
int i, k;
for (k=0;k<nframes;k++) {
unsigned char data[4*FRAME_SIZE];
@@ -92,11 +92,11 @@
/* Excitation in. */
data[4*i+2] = st->exc_mem;
/* Excitation out. */
-#ifdef END2END
- data[4*i+3] = lin2ulaw(pcm[k*FRAME_SIZE+i]);
-#else
- data[4*i+3] = e;
-#endif
+ if (e2e) {
+ data[4*i+3] = lin2ulaw(pcm[k*FRAME_SIZE+i]);
+ } else {
+ data[4*i+3] = e;
+ }
/* Simulate error on excitation. */
e += noise[k*FRAME_SIZE+i];
e = IMIN(255, IMAX(0, e));
@@ -118,6 +118,8 @@
int main(int argc, char **argv) {
int i;
+ char *argv0;
+ int e2e=0;
int count=0;
static const float a_hp[2] = {-1.99599, 0.99600};
static const float b_hp[2] = {-2, 1};
@@ -148,6 +150,12 @@
int quantize = 0;
srand(getpid());
st = lpcnet_encoder_create();
+ argv0=argv[0];
+ if (argc > 2 && strcmp(argv[1], "-end2end")==0) {
+ e2e = 1;
+ argv++;
+ argc--;
+ }
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
training = 1;
@@ -168,8 +176,8 @@
decode = 1;
}
if (training == -1) {
- fprintf(stderr, "usage: %s -train <speech> <features out> <pcm out>\n", argv[0]);
- fprintf(stderr, " or %s -test <speech> <features out>\n", argv[0]);
+ fprintf(stderr, "usage: %s -train <speech> <features out> <pcm out>\n", argv0);
+ fprintf(stderr, " or %s -test <speech> <features out>\n", argv0);
return 1;
}
f1 = fopen(argv[2], "r");
@@ -273,7 +281,7 @@
if (!quantize) {
process_single_frame(st, ffeat);
- if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1);
+ if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1, e2e);
}
st->pcount++;
/* Running on groups of 4 frames. */
@@ -281,7 +289,7 @@
if (quantize) {
unsigned char buf[8];
process_superframe(st, buf, ffeat, encode, quantize);
- if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4);
+ if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4, e2e);
}
st->pcount = 0;
}
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -250,6 +250,7 @@
units = min(f['model_weights']['gru_a']['gru_a']['recurrent_kernel:0'].shape)
units2 = min(f['model_weights']['gru_b']['gru_b']['recurrent_kernel:0'].shape)
cond_size = min(f['model_weights']['feature_dense1']['feature_dense1']['kernel:0'].shape)
+ e2e = 'rc2lpc' in f['model_weights']
model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=units, rnn_units2=units2, flag_e2e = flag_e2e, cond_size=cond_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
@@ -275,6 +276,13 @@
hf.write('/*This file is automatically generated from a Keras model*/\n\n')
hf.write('#ifndef RNN_DATA_H\n#define RNN_DATA_H\n\n#include "nnet.h"\n\n')
+
+if e2e:
+ hf.write('/* This is an end-to-end model */\n')
+ hf.write('#define END2END\n\n')
+else:
+ hf.write('/* This is *not* an end-to-end model */\n')
+ hf.write('/* #define END2END */\n\n')
embed_size = lpcnet.embed_size
--