ref: a3ef5968222befeca0619c5ce7eda4a42332b88b
parent: d5b6087f4862e8001beaa39ccdb1a0562e9d3473
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Wed Oct 13 18:12:39 EDT 2021
auto-detect end-to-end models

dump_data previously selected the end-to-end excitation output at
compile time via the END2END define; make it a runtime -end2end
option instead. dump_lpcnet.py now detects end-to-end models from
the presence of the rc2lpc weights and writes the matching
#define END2END into the generated header.
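With this change, data for an end-to-end model can be dumped without
rebuilding. A sketch of the intended invocation (file names are
placeholders, not part of this patch):

    ./dump_data -end2end -train speech.s16 features.f32 pcm.s16

Without the option, dump_data keeps the previous default of writing
the computed excitation.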
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -75,7 +75,7 @@
}
-void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes) {
+void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes, int e2e) {
int i, k;
for (k=0;k<nframes;k++) {
unsigned char data[4*FRAME_SIZE];
@@ -92,11 +92,11 @@
/* Excitation in. */
data[4*i+2] = st->exc_mem;
/* Excitation out. */
-#ifdef END2END
- data[4*i+3] = lin2ulaw(pcm[k*FRAME_SIZE+i]);
-#else
- data[4*i+3] = e;
-#endif
+ if (e2e) {
+ data[4*i+3] = lin2ulaw(pcm[k*FRAME_SIZE+i]);
+ } else {
+ data[4*i+3] = e;
+ }
/* Simulate error on excitation. */
e += noise[k*FRAME_SIZE+i];
e = IMIN(255, IMAX(0, e));
@@ -118,6 +118,8 @@
int main(int argc, char **argv) {
int i;
+ char *argv0;
+ int e2e=0;
int count=0;
static const float a_hp[2] = {-1.99599, 0.99600};
static const float b_hp[2] = {-2, 1};
@@ -148,6 +150,12 @@
int quantize = 0;
srand(getpid());
st = lpcnet_encoder_create();
+ argv0=argv[0];
+ if (argc > 2 && strcmp(argv[1], "-end2end")==0) {
+ e2e = 1;
+ argv++;
+ argc--;
+ }
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
training = 1;
@@ -168,8 +176,8 @@
decode = 1;
}
if (training == -1) {
- fprintf(stderr, "usage: %s -train <speech> <features out> <pcm out>\n", argv[0]);
- fprintf(stderr, " or %s -test <speech> <features out>\n", argv[0]);
+ fprintf(stderr, "usage: %s -train <speech> <features out> <pcm out>\n", argv0);
+ fprintf(stderr, " or %s -test <speech> <features out>\n", argv0);
return 1;
}
f1 = fopen(argv[2], "r");
@@ -273,7 +281,7 @@
if (!quantize) {
process_single_frame(st, ffeat);
- if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1);
+ if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1, e2e);
}
st->pcount++;
/* Running on groups of 4 frames. */
@@ -281,7 +289,7 @@
if (quantize) {
unsigned char buf[8];
process_superframe(st, buf, ffeat, encode, quantize);
- if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4);
+ if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4, e2e);
}
st->pcount = 0;
}
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -250,6 +250,7 @@
units = min(f['model_weights']['gru_a']['gru_a']['recurrent_kernel:0'].shape)
units2 = min(f['model_weights']['gru_b']['gru_b']['recurrent_kernel:0'].shape)
cond_size = min(f['model_weights']['feature_dense1']['feature_dense1']['kernel:0'].shape)
+ e2e = 'rc2lpc' in f['model_weights']
model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=units, rnn_units2=units2, flag_e2e = flag_e2e, cond_size=cond_size)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
@@ -275,6 +276,13 @@
hf.write('/*This file is automatically generated from a Keras model*/\n\n')
hf.write('#ifndef RNN_DATA_H\n#define RNN_DATA_H\n\n#include "nnet.h"\n\n')
+
+if e2e:
+ hf.write('/* This is an end-to-end model */\n')
+ hf.write('#define END2END\n\n')
+else:
+ hf.write('/* This is *not* an end-to-end model */\n')
+ hf.write('/* #define END2END */\n\n')
embed_size = lpcnet.embed_size
--