shithub: opus

--- a/dnn/dump_data.c

+++ b/dnn/dump_data.c

@@ -75,10 +75,10 @@

-void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes, int e2e) {

+void write_audio(LPCNetEncState *st, const short *pcm, const int *noise, FILE *file, int nframes) {

   int i, k;

   for (k=0;k<nframes;k++) {

-  unsigned char data[4*FRAME_SIZE];

+  short data[2*FRAME_SIZE];

   for (i=0;i<FRAME_SIZE;i++) {

     float p=0;

     float e;

@@ -85,18 +85,10 @@

     int j;

     for (j=0;j<LPC_ORDER;j++) p -= st->features[k][NB_BANDS+2+j]*st->sig_mem[j];

     e = lin2ulaw(pcm[k*FRAME_SIZE+i] - p);

-    /* Signal. */

-    data[4*i] = lin2ulaw(st->sig_mem[0]);

-    /* Prediction. */

-    data[4*i+1] = lin2ulaw(p);

-    /* Excitation in. */

-    data[4*i+2] = st->exc_mem;

-    /* Excitation out. */

-    if (e2e) {

-      data[4*i+3] = lin2ulaw(pcm[k*FRAME_SIZE+i]);

-    } else {

-      data[4*i+3] = e;

-    }

+    /* Signal in. */

+    data[2*i] = st->sig_mem[0];

+    /* Signal out. */

+    data[2*i+1] = pcm[k*FRAME_SIZE+i];

     /* Simulate error on excitation. */

     e += noise[k*FRAME_SIZE+i];

     e = IMIN(255, IMAX(0, e));

@@ -119,7 +111,6 @@

 int main(int argc, char **argv) {

   int i;

   char *argv0;

-  int e2e=0;

   int count=0;

   static const float a_hp[2] = {-1.99599, 0.99600};

   static const float b_hp[2] = {-2, 1};

@@ -151,11 +142,6 @@

   srand(getpid());

   st = lpcnet_encoder_create();

   argv0=argv[0];

-  if (argc > 2 && strcmp(argv[1], "-end2end")==0) {

-      e2e = 1;

-      argv++;

-      argc--;

-  }

   if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;

   if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {

       training = 1;

@@ -281,7 +267,7 @@

     if (!quantize) {

       process_single_frame(st, ffeat);

-      if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1, e2e);

+      if (fpcm) write_audio(st, pcm, &noisebuf[st->pcount*FRAME_SIZE], fpcm, 1);

     st->pcount++;

     /* Running on groups of 4 frames. */

@@ -289,7 +275,7 @@

       if (quantize) {

         unsigned char buf[8];

         process_superframe(st, buf, ffeat, encode, quantize);

-        if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4, e2e);

+        if (fpcm) write_audio(st, pcmbuf, noisebuf, fpcm, 4);

       st->pcount = 0;

--- a/dnn/training_tf2/dataloader.py

+++ b/dnn/training_tf2/dataloader.py

@@ -1,5 +1,6 @@

 import numpy as np

 from tensorflow.keras.utils import Sequence

+from ulaw import lin2ulaw

 def lpc2rc(lpc):

     #print("shape is = ", lpc.shape)

@@ -12,13 +13,13 @@

     return rc

 class LPCNetLoader(Sequence):

-    def __init__(self, data, features, periods, batch_size, lpc_out=False):

+    def __init__(self, data, features, periods, batch_size, e2e=False):

         self.batch_size = batch_size

         self.nb_batches = np.minimum(np.minimum(data.shape[0], features.shape[0]), periods.shape[0])//self.batch_size

         self.data = data[:self.nb_batches*self.batch_size, :]

         self.features = features[:self.nb_batches*self.batch_size, :]

         self.periods = periods[:self.nb_batches*self.batch_size, :]

-        self.lpc_out = lpc_out

+        self.e2e = e2e

         self.on_epoch_end()

     def on_epoch_end(self):

@@ -27,15 +28,18 @@

     def __getitem__(self, index):

         data = self.data[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :]

-        in_data = data[: , :, :3]

-        out_data = data[: , :, 3:4]

+        in_data = data[: , :, :1]

+        out_data = data[: , :, 1:]

         features = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :-16]

         periods = self.periods[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :]

         outputs = [out_data]

-        if self.lpc_out:

-            lpc = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], 2:-2, -16:]

+        inputs = [in_data, features, periods]

+        lpc = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], 2:-2, -16:]

+        if self.e2e:

             outputs.append(lpc2rc(lpc))

-        return ([in_data, features, periods], outputs)

+        else:

+            inputs.append(lpc)

+        return (inputs, outputs)

     def __len__(self):

         return self.nb_batches

--- a/dnn/training_tf2/dump_lpcnet.py

+++ b/dnn/training_tf2/dump_lpcnet.py

@@ -252,7 +252,7 @@

     cond_size = min(f['model_weights']['feature_dense1']['feature_dense1']['kernel:0'].shape)

     e2e = 'rc2lpc' in f['model_weights']

-model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=units, rnn_units2=units2, flag_e2e = flag_e2e, cond_size=cond_size)

+model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=units, rnn_units2=units2, flag_e2e = e2e, cond_size=cond_size)

 model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

 #model.summary()

--- a/dnn/training_tf2/lossfuncs.py

+++ b/dnn/training_tf2/lossfuncs.py

@@ -12,7 +12,7 @@

     def loss(y_true,y_pred):

         p = y_pred[:,:,0:1]

         model_out = y_pred[:,:,1:]

-        e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p))

+        e_gt = tf_l2u(y_true - p)

         e_gt = tf.round(e_gt)

         e_gt = tf.cast(e_gt,'int32')

         sparse_cel = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)(e_gt,model_out)

@@ -24,9 +24,10 @@

 # Also adds a probability compensation (to account for matching cross entropy in the linear domain), weighted by gamma

 def interp_mulaw(gamma = 1):

     def loss(y_true,y_pred):

+        y_true = tf.cast(y_true, 'float32')

         p = y_pred[:,:,0:1]

         model_out = y_pred[:,:,1:]

-        e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p))

+        e_gt = tf_l2u(y_true - p)

         prob_compensation = tf.squeeze((K.abs(e_gt - 128)/128.0)*K.log(256.0))

         alpha = e_gt - tf.math.floor(e_gt)

         alpha = tf.tile(alpha,[1,1,256])

@@ -42,7 +43,7 @@

 def metric_oginterploss(y_true,y_pred):

     p = y_pred[:,:,0:1]

     model_out = y_pred[:,:,1:]

-    e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p))

+    e_gt = tf_l2u(y_true - p)

     prob_compensation = tf.squeeze((K.abs(e_gt - 128)/128.0)*K.log(256.0))

     alpha = e_gt - tf.math.floor(e_gt)

     alpha = tf.tile(alpha,[1,1,256])

@@ -57,7 +58,7 @@

 def metric_icel(y_true, y_pred):

     p = y_pred[:,:,0:1]

     model_out = y_pred[:,:,1:]

-    e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p))

+    e_gt = tf_l2u(y_true - p)

     alpha = e_gt - tf.math.floor(e_gt)

     alpha = tf.tile(alpha,[1,1,256])

     e_gt = tf.cast(e_gt,'int32')

@@ -68,9 +69,10 @@

 # Non-interpolated (rounded) cross entropy loss metric

 def metric_cel(y_true, y_pred):

+    y_true = tf.cast(y_true, 'float32')

     p = y_pred[:,:,0:1]

     model_out = y_pred[:,:,1:]

-    e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p))

+    e_gt = tf_l2u(y_true - p)

     e_gt = tf.round(e_gt)

     e_gt = tf.cast(e_gt,'int32')

     e_gt = tf.clip_by_value(e_gt,0,255)

@@ -80,7 +82,7 @@

 # Variance metric of the output excitation

 def metric_exc_sd(y_true,y_pred):

     p = y_pred[:,:,0:1]

-    e_gt = tf_l2u(tf_u2l(y_true) - tf_u2l(p))

+    e_gt = tf_l2u(y_true - p)

     sd_egt = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)(e_gt,128)

     return sd_egt

--- a/dnn/training_tf2/lpcnet.py

+++ b/dnn/training_tf2/lpcnet.py

@@ -230,8 +230,9 @@

 constraint = WeightClip(0.992)

-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, flag_e2e = False, cond_size=128):

-    pcm = Input(shape=(None, 3), batch_size=batch_size)

+def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, flag_e2e = False, cond_size=128, lpc_order=16):

+    pcm = Input(shape=(None, 1), batch_size=batch_size)

+    dpcm = Input(shape=(None, 3), batch_size=batch_size)

     feat = Input(shape=(None, nb_used_features), batch_size=batch_size)

     pitch = Input(shape=(None, 1), batch_size=batch_size)

     dec_feat = Input(shape=(None, cond_size))

@@ -257,20 +258,19 @@

     cfeat = fdense2(fdense1(cfeat))

-    if not flag_e2e:

-        embed = Embedding(256, embed_size, embeddings_initializer=PCMInit(), name='embed_sig')

-        cpcm = Reshape((-1, embed_size*3))(embed(pcm))

-    else:

-        Input_extractor = Lambda(lambda x: K.expand_dims(x[0][:,:,x[1]],axis = -1))

-        error_calc = Lambda(lambda x: tf_l2u(tf_u2l(x[0]) - tf.roll(tf_u2l(x[1]),1,axis = 1)))

+    Input_extractor = Lambda(lambda x: K.expand_dims(x[0][:,:,x[1]],axis = -1))

+    error_calc = Lambda(lambda x: tf_l2u(x[0] - tf.roll(x[1],1,axis = 1)))

+    if flag_e2e:

         lpcoeffs = diff_rc2lpc(name = "rc2lpc")(cfeat)

-        tensor_preds = diff_pred(name = "lpc2preds")([Input_extractor([pcm,0]),lpcoeffs])

-        past_errors = error_calc([Input_extractor([pcm,0]),tensor_preds])

-        embed = diff_Embed(name='embed_sig',initializer = PCMInit())

-        cpcm = Concatenate()([Input_extractor([pcm,0]),tensor_preds,past_errors])

-        cpcm = Reshape((-1, embed_size*3))(embed(cpcm))

-        cpcm_decoder = Concatenate()([Input_extractor([pcm,0]),Input_extractor([pcm,1]),Input_extractor([pcm,2])])

-        cpcm_decoder = Reshape((-1, embed_size*3))(embed(cpcm_decoder))

+    else:

+        lpcoeffs = Input(shape=(None, lpc_order), batch_size=batch_size)

+    tensor_preds = diff_pred(name = "lpc2preds")([Input_extractor([pcm,0]),lpcoeffs])

+    past_errors = error_calc([Input_extractor([pcm,0]),tensor_preds])

+    embed = diff_Embed(name='embed_sig',initializer = PCMInit())

+    cpcm = Concatenate()([tf_l2u(Input_extractor([pcm,0])),tf_l2u(tensor_preds),past_errors])

+    cpcm = Reshape((-1, embed_size*3))(embed(cpcm))

+    cpcm_decoder = Concatenate()([Input_extractor([dpcm,0]),Input_extractor([dpcm,1]),Input_extractor([dpcm,2])])

+    cpcm_decoder = Reshape((-1, embed_size*3))(embed(cpcm_decoder))

     rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))

@@ -301,10 +301,10 @@

         md.trainable=False

         embed.Trainable=False

+    m_out = Concatenate(name='pdf')([tensor_preds,ulaw_prob])

     if not flag_e2e:

-        model = Model([pcm, feat, pitch], ulaw_prob)

+        model = Model([pcm, feat, pitch, lpcoeffs], m_out)

     else:

-        m_out = Concatenate(name='pdf')([tensor_preds,ulaw_prob])

         model = Model([pcm, feat, pitch], [m_out, cfeat])

     model.rnn_units1 = rnn_units1

     model.rnn_units2 = rnn_units2

@@ -321,5 +321,8 @@

     dec_gru_out2, state2 = rnn2(Concatenate()([dec_gru_out1, dec_feat]), initial_state=dec_state2)

     dec_ulaw_prob = Lambda(tree_to_pdf_infer)(md(dec_gru_out2))

-    decoder = Model([pcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])

+    if flag_e2e:

+        decoder = Model([dpcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])

+    else:

+        decoder = Model([pcm, dec_feat, dec_state1, dec_state2, lpcoeffs], [dec_ulaw_prob, state1, state2])

     return model, encoder, decoder

--- a/dnn/training_tf2/tf_funcs.py

+++ b/dnn/training_tf2/tf_funcs.py

@@ -30,7 +30,7 @@

 # The inputs xt and lpc conform with the shapes in lpcnet.py (the '2400' is coded keeping this in mind)

 class diff_pred(Layer):

     def call(self, inputs, lpcoeffs_N = 16, frame_size = 160):

-        xt = tf_u2l(inputs[0])

+        xt = inputs[0]

         lpc = inputs[1]

         rept = Lambda(lambda x: K.repeat_elements(x , frame_size, 1))

@@ -39,7 +39,7 @@

         pred = -Multiply()([rept(lpc),cX(zpX(xt))])

-        return tf_l2u(K.sum(pred,axis = 2,keepdims = True))

+        return K.sum(pred,axis = 2,keepdims = True)

 # Differentiable Transformations (RC <-> LPC) computed using the Levinson Durbin Recursion

 class diff_rc2lpc(Layer):

--- a/dnn/training_tf2/train_lpcnet.py

+++ b/dnn/training_tf2/train_lpcnet.py

@@ -125,7 +125,7 @@

 with strategy.scope():

     model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=args.grua_size, rnn_units2=args.grub_size, batch_size=batch_size, training=True, quantize=quantize, flag_e2e = flag_e2e, cond_size=args.cond_size)

     if not flag_e2e:

-        model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics='sparse_categorical_crossentropy')

+        model.compile(optimizer=opt, loss=metric_cel, metrics=metric_cel)

     else:

         model.compile(optimizer=opt, loss = [interp_mulaw(gamma=gamma), loss_matchlar()], loss_weights = [1.0, 2.0], metrics={'pdf':[metric_cel,metric_icel,metric_exc_sd,metric_oginterploss]})

     model.summary()

@@ -140,19 +140,17 @@

 # u for unquantised, load 16 bit PCM samples and convert to mu-law

-data = np.memmap(pcm_file, dtype='uint8', mode='r')

-nb_frames = (len(data)//(4*pcm_chunk_size)-1)//batch_size*batch_size

+data = np.memmap(pcm_file, dtype='int16', mode='r')

+nb_frames = (len(data)//(2*pcm_chunk_size)-1)//batch_size*batch_size

 features = np.memmap(feature_file, dtype='float32', mode='r')

 # limit to discrete number of frames

-data = data[4*2*frame_size:]

-data = data[:nb_frames*4*pcm_chunk_size]

+data = data[2*2*frame_size:]

+data = data[:nb_frames*2*pcm_chunk_size]

-data = np.reshape(data, (nb_frames, pcm_chunk_size, 4))

-#in_data = data[:,:,:3]

-#out_exc = data[:,:,3:4]

+data = np.reshape(data, (nb_frames, pcm_chunk_size, 2))

 #print("ulaw std = ", np.std(out_exc))

@@ -187,7 +185,7 @@

 model.save_weights('{}_{}_initial.h5'.format(args.output, args.grua_size))

-loader = LPCNetLoader(data, features, periods, batch_size, lpc_out=flag_e2e)

+loader = LPCNetLoader(data, features, periods, batch_size, e2e=flag_e2e)

 callbacks = [checkpoint, sparsify, grub_sparsify]

 if args.logdir is not None:

--

⑨