ref: 60450472a6c8cb97e35b56901878e6d15e9ed999
parent: 340ab3089b1fdd0a4767c71e694afc30aac49401
parent: c1da818f39af6d353b9467ce75b76e469de96953
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Tue Sep 6 20:38:55 EDT 2022
Merge branch 'plc_challenge' into master
--- a/dnn/Makefile.am
+++ b/dnn/Makefile.am
@@ -8,6 +8,7 @@
lib_LTLIBRARIES = liblpcnet.la
noinst_HEADERS = arch.h \
+ burg.h \
common.h \
freq.h \
_kiss_fft_guts.h \
@@ -16,6 +17,7 @@
lpcnet_private.h \
opus_types.h \
nnet_data.h \
+ plc_data.h \
nnet.h \
pitch.h \
tansig_table.h \
@@ -24,6 +26,7 @@
vec_neon.h
liblpcnet_la_SOURCES = \
+ burg.c \
common.c \
kiss99.c \
lpcnet.c \
@@ -31,6 +34,7 @@
lpcnet_enc.c \
nnet.c \
nnet_data.c \
+ plc_data.c \
ceps_codebooks.c \
pitch.c \
freq.c \
@@ -52,7 +56,7 @@
#dump_data_SOURCES = dump_data.c
#dump_data_LDADD = $(DUMP_OBJ) $(LIBM)
-dump_data_SOURCES = common.c dump_data.c freq.c kiss_fft.c pitch.c lpcnet_dec.c lpcnet_enc.c ceps_codebooks.c
+dump_data_SOURCES = common.c dump_data.c burg.c freq.c kiss_fft.c pitch.c lpcnet_dec.c lpcnet_enc.c ceps_codebooks.c
dump_data_LDADD = $(LIBM)
dump_data_CFLAGS = $(AM_CFLAGS)
--- a/dnn/README.md
+++ b/dnn/README.md
@@ -3,8 +3,14 @@
Low complexity implementation of the WaveRNN-based LPCNet algorithm, as described in:
- J.-M. Valin, J. Skoglund, [LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://jmvalin.ca/papers/lpcnet_icassp2019.pdf), *Proc. International Conference on Acoustics, Speech and Signal Processing (ICASSP)*, arXiv:1810.11846, 2019.
+- J.-M. Valin, U. Isik, P. Smaragdis, A. Krishnaswamy, [Neural Speech Synthesis on a Shoestring: Improving the Efficiency of LPCNet](https://jmvalin.ca/papers/improved_lpcnet.pdf), *Proc. ICASSP*, arxiv:2106.04129, 2022.
+- K. Subramani, J.-M. Valin, U. Isik, P. Smaragdis, A. Krishnaswamy, [End-to-end LPCNet: A Neural Vocoder With Fully-Differentiable LPC Estimation](https://jmvalin.ca/papers/lpcnet_end2end.pdf), *Proc. INTERSPEECH*, arxiv:2106.04129, 2022.
+
+For coding/PLC applications of LPCNet, see:
+
- J.-M. Valin, J. Skoglund, [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://jmvalin.ca/papers/lpcnet_codec.pdf), *Proc. INTERSPEECH*, arxiv:1903.12087, 2019.
- J. Skoglund, J.-M. Valin, [Improving Opus Low Bit Rate Quality with Neural Speech Synthesis](https://jmvalin.ca/papers/opusnet.pdf), *Proc. INTERSPEECH*, arxiv:1905.04628, 2020.
+- J.-M. Valin, A. Mustafa, C. Montgomery, T.B. Terriberry, M. Klingbeil, P. Smaragdis, A. Krishnaswamy, [Real-Time Packet Loss Concealment With Mixed Generative and Predictive Model](https://jmvalin.ca/papers/lpcnet_plc.pdf), *Proc. INTERSPEECH*, arxiv:2205.05785, 2022.
# Introduction
@@ -58,6 +64,22 @@
Alternatively, you can run the uncompressed analysis/synthesis using -features
instead of -encode and -synthesis instead of -decode.
The same functionality is available in the form of a library. See include/lpcnet.h for the API.
+
+To try packet loss concealment (PLC), you first need a PLC model, which you can get with:
+```
+./download_model.sh plc-3b1eab4
+```
+or (for the PLC challenge submission):
+```
+./download_model.sh plc_challenge
+```
+PLC can be tested with:
+```
+./lpcnet_demo -plc_file noncausal_dc error_pattern.txt input.pcm output.pcm
+```
+where error_pattern.txt is a text file with one entry per 20-ms packet, with 1 meaning "packet lost" and 0 meaning "packet not lost".
+noncausal_dc is the non-causal (5-ms look-ahead) with special handling for DC offsets. It's also possible to use "noncausal", "causal",
+or "causal_dc".
# Training a new model
--- /dev/null
+++ b/dnn/burg.c
@@ -1,0 +1,245 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <string.h>
+#include <assert.h>
+
+#include "burg.h"
+
+#define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384*/
+#define SILK_MAX_ORDER_LPC 16
+#define FIND_LPC_COND_FAC 1e-5f
+
+/* sum of squares of a silk_float array, with result as double */
+static double silk_energy_FLP(
+ const float *data,
+ int dataSize
+)
+{
+ int i;
+ double result;
+
+ /* 4x unrolled loop */
+ result = 0.0;
+ for( i = 0; i < dataSize - 3; i += 4 ) {
+ result += data[ i + 0 ] * (double)data[ i + 0 ] +
+ data[ i + 1 ] * (double)data[ i + 1 ] +
+ data[ i + 2 ] * (double)data[ i + 2 ] +
+ data[ i + 3 ] * (double)data[ i + 3 ];
+ }
+
+ /* add any remaining products */
+ for( ; i < dataSize; i++ ) {
+ result += data[ i ] * (double)data[ i ];
+ }
+
+ assert( result >= 0.0 );
+ return result;
+}
+
+/* inner product of two silk_float arrays, with result as double */
+static double silk_inner_product_FLP(
+ const float *data1,
+ const float *data2,
+ int dataSize
+)
+{
+ int i;
+ double result;
+
+ /* 4x unrolled loop */
+ result = 0.0;
+ for( i = 0; i < dataSize - 3; i += 4 ) {
+ result += data1[ i + 0 ] * (double)data2[ i + 0 ] +
+ data1[ i + 1 ] * (double)data2[ i + 1 ] +
+ data1[ i + 2 ] * (double)data2[ i + 2 ] +
+ data1[ i + 3 ] * (double)data2[ i + 3 ];
+ }
+
+ /* add any remaining products */
+ for( ; i < dataSize; i++ ) {
+ result += data1[ i ] * (double)data2[ i ];
+ }
+
+ return result;
+}
+
+
+/* Compute reflection coefficients from input signal */
+float silk_burg_analysis( /* O returns residual energy */
+ float A[], /* O prediction coefficients (length order) */
+ const float x[], /* I input signal, length: nb_subfr*(D+L_sub) */
+ const float minInvGain, /* I minimum inverse prediction gain */
+ const int subfr_length, /* I input signal subframe length (incl. D preceding samples) */
+ const int nb_subfr, /* I number of subframes stacked in x */
+ const int D /* I order */
+)
+{
+ int k, n, s, reached_max_gain;
+ double C0, invGain, num, nrg_f, nrg_b, rc, Atmp, tmp1, tmp2;
+ const float *x_ptr;
+ double C_first_row[ SILK_MAX_ORDER_LPC ], C_last_row[ SILK_MAX_ORDER_LPC ];
+ double CAf[ SILK_MAX_ORDER_LPC + 1 ], CAb[ SILK_MAX_ORDER_LPC + 1 ];
+ double Af[ SILK_MAX_ORDER_LPC ];
+
+ assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE );
+
+ /* Compute autocorrelations, added over subframes */
+ C0 = silk_energy_FLP( x, nb_subfr * subfr_length );
+ memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( double ) );
+ for( s = 0; s < nb_subfr; s++ ) {
+ x_ptr = x + s * subfr_length;
+ for( n = 1; n < D + 1; n++ ) {
+ C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n );
+ }
+ }
+ memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) );
+
+ /* Initialize */
+ CAb[ 0 ] = CAf[ 0 ] = C0 + FIND_LPC_COND_FAC * C0 + 1e-9f;
+ invGain = 1.0f;
+ reached_max_gain = 0;
+ for( n = 0; n < D; n++ ) {
+ /* Update first row of correlation matrix (without first element) */
+ /* Update last row of correlation matrix (without last element, stored in reversed order) */
+ /* Update C * Af */
+ /* Update C * flipud(Af) (stored in reversed order) */
+ for( s = 0; s < nb_subfr; s++ ) {
+ x_ptr = x + s * subfr_length;
+ tmp1 = x_ptr[ n ];
+ tmp2 = x_ptr[ subfr_length - n - 1 ];
+ for( k = 0; k < n; k++ ) {
+ C_first_row[ k ] -= x_ptr[ n ] * x_ptr[ n - k - 1 ];
+ C_last_row[ k ] -= x_ptr[ subfr_length - n - 1 ] * x_ptr[ subfr_length - n + k ];
+ Atmp = Af[ k ];
+ tmp1 += x_ptr[ n - k - 1 ] * Atmp;
+ tmp2 += x_ptr[ subfr_length - n + k ] * Atmp;
+ }
+ for( k = 0; k <= n; k++ ) {
+ CAf[ k ] -= tmp1 * x_ptr[ n - k ];
+ CAb[ k ] -= tmp2 * x_ptr[ subfr_length - n + k - 1 ];
+ }
+ }
+ tmp1 = C_first_row[ n ];
+ tmp2 = C_last_row[ n ];
+ for( k = 0; k < n; k++ ) {
+ Atmp = Af[ k ];
+ tmp1 += C_last_row[ n - k - 1 ] * Atmp;
+ tmp2 += C_first_row[ n - k - 1 ] * Atmp;
+ }
+ CAf[ n + 1 ] = tmp1;
+ CAb[ n + 1 ] = tmp2;
+
+ /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */
+ num = CAb[ n + 1 ];
+ nrg_b = CAb[ 0 ];
+ nrg_f = CAf[ 0 ];
+ for( k = 0; k < n; k++ ) {
+ Atmp = Af[ k ];
+ num += CAb[ n - k ] * Atmp;
+ nrg_b += CAb[ k + 1 ] * Atmp;
+ nrg_f += CAf[ k + 1 ] * Atmp;
+ }
+ assert( nrg_f > 0.0 );
+ assert( nrg_b > 0.0 );
+
+ /* Calculate the next order reflection (parcor) coefficient */
+ rc = -2.0 * num / ( nrg_f + nrg_b );
+ assert( rc > -1.0 && rc < 1.0 );
+
+ /* Update inverse prediction gain */
+ tmp1 = invGain * ( 1.0 - rc * rc );
+ if( tmp1 <= minInvGain ) {
+ /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
+ rc = sqrt( 1.0 - minInvGain / invGain );
+ if( num > 0 ) {
+ /* Ensure adjusted reflection coefficients has the original sign */
+ rc = -rc;
+ }
+ invGain = minInvGain;
+ reached_max_gain = 1;
+ } else {
+ invGain = tmp1;
+ }
+
+ /* Update the AR coefficients */
+ for( k = 0; k < (n + 1) >> 1; k++ ) {
+ tmp1 = Af[ k ];
+ tmp2 = Af[ n - k - 1 ];
+ Af[ k ] = tmp1 + rc * tmp2;
+ Af[ n - k - 1 ] = tmp2 + rc * tmp1;
+ }
+ Af[ n ] = rc;
+
+ if( reached_max_gain ) {
+ /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
+ for( k = n + 1; k < D; k++ ) {
+ Af[ k ] = 0.0;
+ }
+ break;
+ }
+
+ /* Update C * Af and C * Ab */
+ for( k = 0; k <= n + 1; k++ ) {
+ tmp1 = CAf[ k ];
+ CAf[ k ] += rc * CAb[ n - k + 1 ];
+ CAb[ n - k + 1 ] += rc * tmp1;
+ }
+ }
+
+ if( reached_max_gain ) {
+ /* Convert to float */
+ for( k = 0; k < D; k++ ) {
+ A[ k ] = (float)( -Af[ k ] );
+ }
+ /* Subtract energy of preceding samples from C0 */
+ for( s = 0; s < nb_subfr; s++ ) {
+ C0 -= silk_energy_FLP( x + s * subfr_length, D );
+ }
+ /* Approximate residual energy */
+ nrg_f = C0 * invGain;
+ } else {
+ /* Compute residual energy and store coefficients as float */
+ nrg_f = CAf[ 0 ];
+ tmp1 = 1.0;
+ for( k = 0; k < D; k++ ) {
+ Atmp = Af[ k ];
+ nrg_f += CAf[ k + 1 ] * Atmp;
+ tmp1 += Atmp * Atmp;
+ A[ k ] = (float)(-Atmp);
+ }
+ nrg_f -= FIND_LPC_COND_FAC * C0 * tmp1;
+ }
+
+ /* Return residual energy */
+ return (float)nrg_f;
+}
--- /dev/null
+++ b/dnn/burg.h
@@ -1,0 +1,36 @@
+/***********************************************************************
+Copyright (c) 2006-2011, Skype Limited. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+
+float silk_burg_analysis( /* O returns residual energy */
+ float A[], /* O prediction coefficients (length order) */
+ const float x[], /* I input signal, length: nb_subfr*(D+L_sub) */
+ const float minInvGain, /* I minimum inverse prediction gain */
+ const int subfr_length, /* I input signal subframe length (incl. D preceding samples) */
+ const int nb_subfr, /* I number of subframes stacked in x */
+ const int D /* I order */
+);
--- a/dnn/download_model.sh
+++ b/dnn/download_model.sh
@@ -9,4 +9,5 @@
fi
tar xvf $model
touch src/nnet_data.[ch]
+touch src/plc_data.[ch]
mv src/*.[ch] .
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -138,9 +138,18 @@
int encode = 0;
int decode = 0;
int quantize = 0;
+ int burg = 0;
srand(getpid());
st = lpcnet_encoder_create();
argv0=argv[0];
+ if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
+ burg = 1;
+ training = 1;
+ }
+ if (argc == 4 && strcmp(argv[1], "-btest")==0) {
+ burg = 1;
+ training = 0;
+ }
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
training = 1;
@@ -236,7 +245,8 @@
if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break;
if (training && ++gain_change_count > 2821) {
float tmp, tmp2;
- speech_gain = pow(10., (-20+(rand()%40))/20.);
+ speech_gain = pow(10., (-30+(rand()%40))/20.);
+ if (rand()&1) speech_gain = -speech_gain;
if (rand()%20==0) speech_gain *= .01;
if (rand()%100==0) speech_gain = 0;
gain_change_count = 0;
@@ -247,7 +257,6 @@
}
biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
- preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
for (i=0;i<FRAME_SIZE;i++) {
float g;
float f = (float)i/FRAME_SIZE;
@@ -254,6 +263,12 @@
g = f*speech_gain + (1-f)*old_speech_gain;
x[i] *= g;
}
+ if (burg) {
+ float ceps[2*NB_BANDS];
+ burg_cepstral_analysis(ceps, x);
+ fwrite(ceps, sizeof(float), 2*NB_BANDS, ffeat);
+ }
+ preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
--- a/dnn/freq.c
+++ b/dnn/freq.c
@@ -37,6 +37,7 @@
#include "freq.h"
#include "pitch.h"
#include "arch.h"
+#include "burg.h"
#include <assert.h>
#define SQUARE(x) ((x)*(x))
@@ -58,6 +59,32 @@
} CommonState;
+void compute_band_energy_inverse(float *bandE, const kiss_fft_cpx *X) {
+ int i;
+ float sum[NB_BANDS] = {0};
+ for (i=0;i<NB_BANDS-1;i++)
+ {
+ int j;
+ int band_size;
+ band_size = (eband5ms[i+1]-eband5ms[i])*WINDOW_SIZE_5MS;
+ for (j=0;j<band_size;j++) {
+ float tmp;
+ float frac = (float)j/band_size;
+ tmp = SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].r);
+ tmp += SQUARE(X[(eband5ms[i]*WINDOW_SIZE_5MS) + j].i);
+ tmp = 1.f/(tmp + 1e-9);
+ sum[i] += (1-frac)*tmp;
+ sum[i+1] += frac*tmp;
+ }
+ }
+ sum[0] *= 2;
+ sum[NB_BANDS-1] *= 2;
+ for (i=0;i<NB_BANDS;i++)
+ {
+ bandE[i] = sum[i];
+ }
+}
+
float _lpcnet_lpc(
opus_val16 *lpc, /* out: [0...p-1] LPC coefficients */
opus_val16 *rc,
@@ -125,6 +152,54 @@
for (i=0;i<NB_BANDS;i++)
{
bandE[i] = sum[i];
+ }
+}
+
+void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order) {
+ int i;
+ float burg_in[FRAME_SIZE];
+ float burg_lpc[LPC_ORDER];
+ float x[WINDOW_SIZE];
+ float Eburg[NB_BANDS];
+ float g;
+ float E;
+ kiss_fft_cpx LPC[FREQ_SIZE];
+ float Ly[NB_BANDS];
+ assert(order <= LPC_ORDER);
+ assert(len <= FRAME_SIZE);
+ for (i=0;i<len-1;i++) burg_in[i] = pcm[i+1] - PREEMPHASIS*pcm[i];
+ g = silk_burg_analysis(burg_lpc, burg_in, 1e-3, len-1, 1, order);
+ g /= len - 2*(order-1);
+ //printf("%g\n", g);
+ RNN_CLEAR(x, WINDOW_SIZE);
+ x[0] = 1;
+ for (i=0;i<order;i++) x[i+1] = -burg_lpc[i]*pow(.995, i+1);
+ forward_transform(LPC, x);
+ compute_band_energy_inverse(Eburg, LPC);
+ for (i=0;i<NB_BANDS;i++) Eburg[i] *= .45*g*(1.f/((float)WINDOW_SIZE*WINDOW_SIZE*WINDOW_SIZE));
+ float logMax = -2;
+ float follow = -2;
+ for (i=0;i<NB_BANDS;i++) {
+ Ly[i] = log10(1e-2+Eburg[i]);
+ Ly[i] = MAX16(logMax-8, MAX16(follow-2.5, Ly[i]));
+ logMax = MAX16(logMax, Ly[i]);
+ follow = MAX16(follow-2.5, Ly[i]);
+ E += Eburg[i];
+ }
+ dct(burg_cepstrum, Ly);
+ burg_cepstrum[0] += - 4;
+}
+
+void burg_cepstral_analysis(float *ceps, const float *x) {
+ int i;
+ compute_burg_cepstrum(x, &ceps[0 ], FRAME_SIZE/2, LPC_ORDER);
+ compute_burg_cepstrum(&x[FRAME_SIZE/2], &ceps[NB_BANDS], FRAME_SIZE/2, LPC_ORDER);
+ for (i=0;i<NB_BANDS;i++) {
+ float c0, c1;
+ c0 = ceps[i];
+ c1 = ceps[NB_BANDS+i];
+ ceps[i ] = .5*(c0+c1);
+ ceps[NB_BANDS+i] = (c0-c1);
}
}
--- a/dnn/freq.h
+++ b/dnn/freq.h
@@ -47,6 +47,8 @@
void compute_band_energy(float *bandE, const kiss_fft_cpx *X);
void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);
+void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order);
+void burg_cepstral_analysis(float *ceps, const float *x);
void apply_window(float *x);
void dct(float *out, const float *in);
--- a/dnn/include/lpcnet.h
+++ b/dnn/include/lpcnet.h
@@ -176,11 +176,18 @@
*/
LPCNET_EXPORT void lpcnet_synthesize(LPCNetState *st, const float *features, short *output, int N);
+
+#define LPCNET_PLC_CAUSAL 0
+#define LPCNET_PLC_NONCAUSAL 1
+#define LPCNET_PLC_CODEC 2
+
+#define LPCNET_PLC_DC_FILTER 4
+
LPCNET_EXPORT int lpcnet_plc_get_size(void);
-LPCNET_EXPORT void lpcnet_plc_init(LPCNetPLCState *st);
+LPCNET_EXPORT int lpcnet_plc_init(LPCNetPLCState *st, int options);
-LPCNET_EXPORT LPCNetPLCState *lpcnet_plc_create(void);
+LPCNET_EXPORT LPCNetPLCState *lpcnet_plc_create(int options);
LPCNET_EXPORT void lpcnet_plc_destroy(LPCNetPLCState *st);
--- a/dnn/lpcnet.c
+++ b/dnn/lpcnet.c
@@ -98,7 +98,6 @@
compute_conv1d(&feature_conv1, conv1_out, net->feature_conv1_state, in);
if (lpcnet->frame_count < FEATURE_CONV1_DELAY) RNN_CLEAR(conv1_out, FEATURE_CONV1_OUT_SIZE);
compute_conv1d(&feature_conv2, conv2_out, net->feature_conv2_state, conv1_out);
- celt_assert(FRAME_INPUT_SIZE == FEATURE_CONV2_OUT_SIZE);
if (lpcnet->frame_count < FEATURES_DELAY) RNN_CLEAR(conv2_out, FEATURE_CONV2_OUT_SIZE);
_lpcnet_compute_dense(&feature_dense1, dense1_out, conv2_out);
_lpcnet_compute_dense(&feature_dense2, condition, dense1_out);
@@ -196,8 +195,12 @@
last_sig_ulaw = lin2ulaw(lpcnet->last_sig[0]);
pred_ulaw = lin2ulaw(pred);
exc = run_sample_network(&lpcnet->nnet, lpcnet->gru_a_condition, lpcnet->gru_b_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw, lpcnet->sampling_logit_table, &lpcnet->rng);
- if (i < preload) exc = lin2ulaw(output[i]-PREEMPH*lpcnet->deemph_mem - pred);
- pcm = pred + ulaw2lin(exc);
+ if (i < preload) {
+ exc = lin2ulaw(output[i]-PREEMPH*lpcnet->deemph_mem - pred);
+ pcm = output[i]-PREEMPH*lpcnet->deemph_mem;
+ } else {
+ pcm = pred + ulaw2lin(exc);
+ }
RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
lpcnet->last_sig[0] = pcm;
lpcnet->last_exc = exc;
@@ -205,7 +208,7 @@
lpcnet->deemph_mem = pcm;
if (pcm<-32767) pcm = -32767;
if (pcm>32767) pcm = 32767;
- output[i] = (int)floor(.5 + pcm);
+ if (i >= preload) output[i] = (int)floor(.5 + pcm);
}
}
--- a/dnn/lpcnet_demo.c
+++ b/dnn/lpcnet_demo.c
@@ -40,19 +40,29 @@
#define MODE_SYNTHESIS 3
#define MODE_PLC 4
+void usage(void) {
+ fprintf(stderr, "usage: lpcnet_demo -encode <input.pcm> <compressed.lpcnet>\n");
+ fprintf(stderr, " lpcnet_demo -decode <compressed.lpcnet> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -features <input.pcm> <features.f32>\n");
+ fprintf(stderr, " lpcnet_demo -synthesis <features.f32> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -plc <plc_options> <percent> <input.pcm> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -plc_file <plc_options> <percent> <input.pcm> <output.pcm>\n\n");
+ fprintf(stderr, " plc_options:\n");
+ fprintf(stderr, " causal: normal (causal) PLC\n");
+ fprintf(stderr, " causal_dc: normal (causal) PLC with DC offset compensation\n");
+ fprintf(stderr, " noncausal: non-causal PLC\n");
+ fprintf(stderr, " noncausal_dc: non-causal PLC with DC offset compensation\n");
+ exit(1);
+}
+
int main(int argc, char **argv) {
int mode;
int plc_percent=0;
FILE *fin, *fout;
- if (argc != 4 && !(argc == 5 && strcmp(argv[1], "-plc") == 0))
- {
- fprintf(stderr, "usage: lpcnet_demo -encode <input.pcm> <compressed.lpcnet>\n");
- fprintf(stderr, " lpcnet_demo -decode <compressed.lpcnet> <output.pcm>\n");
- fprintf(stderr, " lpcnet_demo -features <input.pcm> <features.f32>\n");
- fprintf(stderr, " lpcnet_demo -synthesis <features.f32> <output.pcm>\n");
- fprintf(stderr, " lpcnet_demo -plc <percent> <input.pcm> <output.pcm>\n");
- return 0;
- }
+ FILE *plc_file = NULL;
+ const char *plc_options;
+ int plc_flags=-1;
+ if (argc < 4) usage();
if (strcmp(argv[1], "-encode") == 0) mode=MODE_ENCODE;
else if (strcmp(argv[1], "-decode") == 0) mode=MODE_DECODE;
else if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
@@ -59,21 +69,41 @@
else if (strcmp(argv[1], "-synthesis") == 0) mode=MODE_SYNTHESIS;
else if (strcmp(argv[1], "-plc") == 0) {
mode=MODE_PLC;
- plc_percent = atoi(argv[2]);
- argv++;
+ plc_options = argv[2];
+ plc_percent = atoi(argv[3]);
+ argv+=2;
+ argc-=2;
+ } else if (strcmp(argv[1], "-plc_file") == 0) {
+ mode=MODE_PLC;
+ plc_options = argv[2];
+ plc_file = fopen(argv[3], "r");
+ if (!plc_file) {
+ fprintf(stderr, "Can't open %s\n", argv[3]);
+ exit(1);
+ }
+ argv+=2;
+ argc-=2;
} else {
- exit(1);
+ usage();
}
+ if (mode == MODE_PLC) {
+ if (strcmp(plc_options, "causal")==0) plc_flags = LPCNET_PLC_CAUSAL;
+ else if (strcmp(plc_options, "causal_dc")==0) plc_flags = LPCNET_PLC_CAUSAL | LPCNET_PLC_DC_FILTER;
+ else if (strcmp(plc_options, "noncausal")==0) plc_flags = LPCNET_PLC_NONCAUSAL;
+ else if (strcmp(plc_options, "noncausal_dc")==0) plc_flags = LPCNET_PLC_NONCAUSAL | LPCNET_PLC_DC_FILTER;
+ else usage();
+ }
+ if (argc != 4) usage();
fin = fopen(argv[2], "rb");
if (fin == NULL) {
- fprintf(stderr, "Can't open %s\n", argv[2]);
- exit(1);
+ fprintf(stderr, "Can't open %s\n", argv[2]);
+ exit(1);
}
fout = fopen(argv[3], "wb");
if (fout == NULL) {
- fprintf(stderr, "Can't open %s\n", argv[3]);
- exit(1);
+ fprintf(stderr, "Can't open %s\n", argv[3]);
+ exit(1);
}
if (mode == MODE_ENCODE) {
@@ -131,20 +161,30 @@
}
lpcnet_destroy(net);
} else if (mode == MODE_PLC) {
+ short pcm[FRAME_SIZE];
int count=0;
int loss=0;
+ int skip=0, extra=0;
+ if ((plc_flags&0x3) == LPCNET_PLC_NONCAUSAL) skip=extra=80;
LPCNetPLCState *net;
- net = lpcnet_plc_create();
+ net = lpcnet_plc_create(plc_flags);
while (1) {
- short pcm[FRAME_SIZE];
size_t ret;
ret = fread(pcm, sizeof(pcm[0]), FRAME_SIZE, fin);
if (feof(fin) || ret != FRAME_SIZE) break;
- if (count % 2 == 0) loss = rand() < RAND_MAX*(float)plc_percent/100.f;
+ if (count % 2 == 0) {
+ if (plc_file != NULL) fscanf(plc_file, "%d", &loss);
+ else loss = rand() < RAND_MAX*(float)plc_percent/100.f;
+ }
if (loss) lpcnet_plc_conceal(net, pcm);
else lpcnet_plc_update(net, pcm);
- fwrite(pcm, sizeof(pcm[0]), FRAME_SIZE, fout);
+ fwrite(&pcm[skip], sizeof(pcm[0]), FRAME_SIZE-skip, fout);
+ skip = 0;
count++;
+ }
+ if (extra) {
+ lpcnet_plc_conceal(net, pcm);
+ fwrite(pcm, sizeof(pcm[0]), extra, fout);
}
lpcnet_plc_destroy(net);
} else {
--- a/dnn/lpcnet_enc.c
+++ b/dnn/lpcnet_enc.c
@@ -28,6 +28,10 @@
#include "config.h"
#endif
+#ifdef OPUS_BUILD
+#define celt_pitch_xcorr celt_pitch_xcorr_c
+#endif
+
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
@@ -540,7 +544,7 @@
ener = (1 + ener0 + celt_inner_prod(&st->exc_buf[i+off], &st->exc_buf[i+off], FRAME_SIZE/2));
st->xc[2+2*st->pcount+sub][i] = 2*xcorr[i] / ener;
}
- if (0) {
+ if (1) {
/* Upsample correlation by 3x and keep the max. */
float interpolated[PITCH_MAX_PERIOD]={0};
/* interp=sinc([-3:3]+1/3).*(.5+.5*cos(pi*[-3:3]/4.5)); interp=interp/sum(interp); */
--- a/dnn/lpcnet_plc.c
+++ b/dnn/lpcnet_plc.c
@@ -30,12 +30,19 @@
#include "lpcnet_private.h"
#include "lpcnet.h"
+#include "plc_data.h"
LPCNET_EXPORT int lpcnet_plc_get_size() {
return sizeof(LPCNetPLCState);
}
-LPCNET_EXPORT void lpcnet_plc_init(LPCNetPLCState *st) {
+LPCNET_EXPORT int lpcnet_plc_init(LPCNetPLCState *st, int options) {
+ if (FEATURES_DELAY != 0) {
+ fprintf(stderr, "PLC cannot work with non-zero FEATURES_DELAY\n");
+ fprintf(stderr, "Recompile with a no-lookahead model (see README.md)\n");
+ exit(1);
+ }
+ RNN_CLEAR(st, 1);
lpcnet_init(&st->lpcnet);
lpcnet_encoder_init(&st->enc);
RNN_CLEAR(st->pcm, PLC_BUF_SIZE);
@@ -42,12 +49,29 @@
st->pcm_fill = PLC_BUF_SIZE;
st->skip_analysis = 0;
st->blend = 0;
+ st->loss_count = 0;
+ st->dc_mem = 0;
+ st->queued_update = 0;
+ if ((options&0x3) == LPCNET_PLC_CAUSAL) {
+ st->enable_blending = 1;
+ st->non_causal = 0;
+ } else if ((options&0x3) == LPCNET_PLC_NONCAUSAL) {
+ st->enable_blending = 1;
+ st->non_causal = 1;
+ } else if ((options&0x3) == LPCNET_PLC_CODEC) {
+ st->enable_blending = 0;
+ st->non_causal = 0;
+ } else {
+ return -1;
+ }
+ st->remove_dc = !!(options&LPCNET_PLC_DC_FILTER);
+ return 0;
}
-LPCNET_EXPORT LPCNetPLCState *lpcnet_plc_create() {
+LPCNET_EXPORT LPCNetPLCState *lpcnet_plc_create(int options) {
LPCNetPLCState *st;
- st = malloc(sizeof(*st));
- lpcnet_plc_init(st);
+ st = calloc(sizeof(*st), 1);
+ lpcnet_plc_init(st, options);
return st;
}
@@ -55,20 +79,73 @@
free(st);
}
-LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm) {
+static void compute_plc_pred(PLCNetState *net, float *out, const float *in) {
+ float zeros[3*PLC_MAX_RNN_NEURONS] = {0};
+ float dense_out[PLC_DENSE1_OUT_SIZE];
+ _lpcnet_compute_dense(&plc_dense1, dense_out, in);
+ compute_gruB(&plc_gru1, zeros, net->plc_gru1_state, dense_out);
+ compute_gruB(&plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
+ _lpcnet_compute_dense(&plc_out, out, net->plc_gru2_state);
+ /* Artificially boost the correlation to make harmonics cleaner. */
+ out[19] = MIN16(.5f, out[19]+.1f);
+}
+
+void clear_state(LPCNetPLCState *st) {
+ RNN_CLEAR(st->lpcnet.last_sig, LPC_ORDER);
+ st->lpcnet.last_exc = lin2ulaw(0.f);;
+ st->lpcnet.deemph_mem = 0;
+ RNN_CLEAR(st->lpcnet.nnet.gru_a_state, GRU_A_STATE_SIZE);
+ RNN_CLEAR(st->lpcnet.nnet.gru_b_state, GRU_B_STATE_SIZE);
+}
+
+#define DC_CONST 0.003
+
+/* In this causal version of the code, the DNN model implemented by compute_plc_pred()
+ needs to generate two feature vectors to conceal the first lost packet.*/
+
+static int lpcnet_plc_update_causal(LPCNetPLCState *st, short *pcm) {
int i;
float x[FRAME_SIZE];
short output[FRAME_SIZE];
+ float plc_features[2*NB_BANDS+NB_FEATURES+1];
+ short lp[FRAME_SIZE]={0};
+ int delta = 0;
+ if (st->remove_dc) {
+ st->dc_mem += st->syn_dc;
+ delta = st->syn_dc;
+ st->syn_dc = 0;
+ for (i=0;i<FRAME_SIZE;i++) {
+ lp[i] = (int)floor(.5 + st->dc_mem);
+ st->dc_mem += DC_CONST*(pcm[i] - st->dc_mem);
+ pcm[i] -= lp[i];
+ }
+ }
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ burg_cepstral_analysis(plc_features, x);
st->enc.pcount = 0;
if (st->skip_analysis) {
/*fprintf(stderr, "skip update\n");*/
if (st->blend) {
short tmp[FRAME_SIZE-TRAINING_OFFSET];
- lpcnet_synthesize_tail_impl(&st->lpcnet, tmp, FRAME_SIZE-TRAINING_OFFSET, 0);
- for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) {
- float w;
- w = .5 - .5*cos(M_PI*i/(FRAME_SIZE-TRAINING_OFFSET));
- pcm[i] = (int)floor(.5 + w*pcm[i] + (1-w)*tmp[i]);
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
+ RNN_COPY(zeros, plc_features, 2*NB_BANDS);
+ zeros[2*NB_BANDS+NB_FEATURES] = 1;
+ st->plc_net = st->plc_copy;
+ compute_plc_pred(&st->plc_net, st->features, zeros);
+ if (st->enable_blending) {
+ LPCNetState copy;
+ copy = st->lpcnet;
+ lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], tmp, FRAME_SIZE-TRAINING_OFFSET, 0);
+ for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) {
+ float w;
+ w = .5 - .5*cos(M_PI*i/(FRAME_SIZE-TRAINING_OFFSET));
+ pcm[i] = (int)floor(.5 + w*pcm[i] + (1-w)*(tmp[i]-delta));
+ }
+ st->lpcnet = copy;
+ lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+ } else {
+ RNN_COPY(tmp, pcm, FRAME_SIZE-TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, tmp, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
}
st->blend = 0;
RNN_COPY(st->pcm, &pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
@@ -93,18 +170,28 @@
run_frame_network(&st->lpcnet, gru_a_condition, gru_b_condition, lpc, st->enc.features[0]);
st->skip_analysis--;
} else {
+ RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);
+ plc_features[2*NB_BANDS+NB_FEATURES] = 1;
+ compute_plc_pred(&st->plc_net, st->features, plc_features);
for (i=0;i<FRAME_SIZE;i++) st->pcm[PLC_BUF_SIZE+i] = pcm[i];
RNN_COPY(output, &st->pcm[0], FRAME_SIZE);
lpcnet_synthesize_impl(&st->lpcnet, st->enc.features[0], output, FRAME_SIZE, FRAME_SIZE);
-
RNN_MOVE(st->pcm, &st->pcm[FRAME_SIZE], PLC_BUF_SIZE);
}
- RNN_COPY(st->features, st->enc.features[0], NB_TOTAL_FEATURES);
+ st->loss_count = 0;
+ if (st->remove_dc) {
+ for (i=0;i<FRAME_SIZE;i++) {
+ pcm[i] += lp[i];
+ }
+ }
return 0;
}
-LPCNET_EXPORT int lpcnet_plc_conceal(LPCNetPLCState *st, short *pcm) {
+static const float att_table[10] = {0, 0, -.2, -.2, -.4, -.4, -.8, -.8, -1.6, -1.6};
+static int lpcnet_plc_conceal_causal(LPCNetPLCState *st, short *pcm) {
+ int i;
short output[FRAME_SIZE];
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
st->enc.pcount = 0;
/* If we concealed the previous frame, finish synthesizing the rest of the samples. */
/* FIXME: Copy/predict features. */
@@ -113,16 +200,20 @@
int update_count;
update_count = IMIN(st->pcm_fill, FRAME_SIZE);
RNN_COPY(output, &st->pcm[0], update_count);
-
+ compute_plc_pred(&st->plc_net, st->features, zeros);
lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], output, update_count, update_count);
RNN_MOVE(st->pcm, &st->pcm[FRAME_SIZE], PLC_BUF_SIZE);
st->pcm_fill -= update_count;
st->skip_analysis++;
}
+ st->plc_copy = st->plc_net;
lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, 0);
+ compute_plc_pred(&st->plc_net, st->features, zeros);
+ if (st->loss_count >= 10) st->features[0] = MAX16(-10, st->features[0]+att_table[9] - 2*(st->loss_count-9));
+ else st->features[0] = MAX16(-10, st->features[0]+att_table[st->loss_count]);
+ //if (st->loss_count > 4) st->features[NB_FEATURES-1] = MAX16(-.5, st->features[NB_FEATURES-1]-.1*(st->loss_count-4));
lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], &pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, 0);
{
- int i;
float x[FRAME_SIZE];
/* FIXME: Can we do better? */
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
@@ -130,6 +221,175 @@
compute_frame_features(&st->enc, x);
process_single_frame(&st->enc, NULL);
}
+ st->loss_count++;
st->blend = 1;
+ if (st->remove_dc) {
+ for (i=0;i<FRAME_SIZE;i++) {
+ st->syn_dc += DC_CONST*(pcm[i] - st->syn_dc);
+ pcm[i] += (int)floor(.5 + st->dc_mem);
+ }
+ }
return 0;
+}
+
+/* In this non-causal version of the code, the DNN model implemented by compute_plc_pred()
+ is always called once per frame. We process audio up to the current position minus TRAINING_OFFSET. */
+
+void process_queued_update(LPCNetPLCState *st) {
+ if (st->queued_update) {
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, st->queued_samples, FRAME_SIZE, FRAME_SIZE);
+ st->queued_update=0;
+ }
+}
+
+static int lpcnet_plc_update_non_causal(LPCNetPLCState *st, short *pcm) {
+ int i;
+ float x[FRAME_SIZE];
+ short pcm_save[FRAME_SIZE];
+ float plc_features[2*NB_BANDS+NB_FEATURES+1];
+ short lp[FRAME_SIZE]={0};
+ double mem_bak=0;
+ int delta = st->syn_dc;
+ process_queued_update(st);
+ if (st->remove_dc) {
+ st->dc_mem += st->syn_dc;
+ st->syn_dc = 0;
+ mem_bak = st->dc_mem;
+ for (i=0;i<FRAME_SIZE;i++) {
+ lp[i] = (int)floor(.5 + st->dc_mem);
+ st->dc_mem += DC_CONST*(pcm[i] - st->dc_mem);
+ pcm[i] -= lp[i];
+ }
+ }
+ RNN_COPY(pcm_save, pcm, FRAME_SIZE);
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ burg_cepstral_analysis(plc_features, x);
+ st->enc.pcount = 0;
+ if (st->loss_count > 0) {
+ LPCNetState copy;
+ /* Handle blending. */
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
+ RNN_COPY(zeros, plc_features, 2*NB_BANDS);
+ zeros[2*NB_BANDS+NB_FEATURES] = 1;
+ compute_plc_pred(&st->plc_net, st->features, zeros);
+ copy = st->lpcnet;
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, 0);
+ /* Undo initial DC offset removal so that we can take into account the last 5ms of synthesis. */
+ if (st->remove_dc) {
+ for (i=0;i<FRAME_SIZE;i++) pcm[i] += lp[i];
+ st->dc_mem = mem_bak;
+ for (i=0;i<TRAINING_OFFSET;i++) st->syn_dc += DC_CONST*(st->pcm[FRAME_SIZE-TRAINING_OFFSET+i] - st->syn_dc);
+ st->dc_mem += st->syn_dc;
+ delta += st->syn_dc;
+ st->syn_dc = 0;
+ for (i=0;i<FRAME_SIZE;i++) {
+ lp[i] = (int)floor(.5 + st->dc_mem);
+ st->dc_mem += DC_CONST*(pcm[i] - st->dc_mem);
+ pcm[i] -= lp[i];
+ }
+ RNN_COPY(pcm_save, pcm, FRAME_SIZE);
+ }
+ {
+ short rev[FRAME_SIZE];
+ for (i=0;i<FRAME_SIZE;i++) rev[i] = pcm[FRAME_SIZE-i-1];
+ clear_state(st);
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, rev, FRAME_SIZE, FRAME_SIZE);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, rev, TRAINING_OFFSET, 0);
+ for (i=0;i<TRAINING_OFFSET;i++) {
+ float w;
+ w = .5 - .5*cos(M_PI*i/(TRAINING_OFFSET));
+ st->pcm[FRAME_SIZE-1-i] = (int)floor(.5 + w*st->pcm[FRAME_SIZE-1-i] + (1-w)*(rev[i]+delta));
+ }
+
+ }
+ st->lpcnet = copy;
+#if 1
+ st->queued_update = 1;
+ RNN_COPY(&st->queued_samples[0], &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
+ RNN_COPY(&st->queued_samples[TRAINING_OFFSET], pcm, FRAME_SIZE-TRAINING_OFFSET);
+#else
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+#endif
+ for (i=0;i<FRAME_SIZE;i++) x[i] = st->pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+
+ }
+ for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+ if (st->loss_count == 0) {
+ RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);
+ plc_features[2*NB_BANDS+NB_FEATURES] = 1;
+ compute_plc_pred(&st->plc_net, st->features, plc_features);
+ lpcnet_synthesize_impl(&st->lpcnet, st->enc.features[0], &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+ }
+ RNN_COPY(&pcm[FRAME_SIZE-TRAINING_OFFSET], pcm, TRAINING_OFFSET);
+ RNN_COPY(pcm, &st->pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET);
+ RNN_COPY(st->pcm, pcm_save, FRAME_SIZE);
+ st->loss_count = 0;
+ if (st->remove_dc) {
+ for (i=0;i<TRAINING_OFFSET;i++) pcm[i] += st->dc_buf[i];
+ for (;i<FRAME_SIZE;i++) pcm[i] += lp[i-TRAINING_OFFSET];
+ for (i=0;i<TRAINING_OFFSET;i++) st->dc_buf[i] = lp[FRAME_SIZE-TRAINING_OFFSET+i];
+ }
+ return 0;
+}
+
+static int lpcnet_plc_conceal_non_causal(LPCNetPLCState *st, short *pcm) {
+ int i;
+ float x[FRAME_SIZE];
+ float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
+ process_queued_update(st);
+ st->enc.pcount = 0;
+
+ compute_plc_pred(&st->plc_net, st->features, zeros);
+ if (st->loss_count >= 10) st->features[0] = MAX16(-10, st->features[0]+att_table[9] - 2*(st->loss_count-9));
+ else st->features[0] = MAX16(-10, st->features[0]+att_table[st->loss_count]);
+ //if (st->loss_count > 4) st->features[NB_FEATURES-1] = MAX16(-.5, st->features[NB_FEATURES-1]-.1*(st->loss_count-4));
+
+ if (st->loss_count == 0) {
+ RNN_COPY(pcm, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, TRAINING_OFFSET);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, &pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET, 0);
+ } else {
+ lpcnet_synthesize_impl(&st->lpcnet, st->features, pcm, TRAINING_OFFSET, 0);
+ lpcnet_synthesize_tail_impl(&st->lpcnet, &pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET, 0);
+
+ RNN_COPY(&st->pcm[FRAME_SIZE-TRAINING_OFFSET], pcm, TRAINING_OFFSET);
+ for (i=0;i<FRAME_SIZE;i++) x[i] = st->pcm[i];
+ preemphasis(x, &st->enc.mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
+ compute_frame_features(&st->enc, x);
+ process_single_frame(&st->enc, NULL);
+ }
+ RNN_COPY(st->pcm, &pcm[TRAINING_OFFSET], FRAME_SIZE-TRAINING_OFFSET);
+
+ if (st->remove_dc) {
+ int dc = (int)floor(.5 + st->dc_mem);
+ if (st->loss_count == 0) {
+ for (i=TRAINING_OFFSET;i<FRAME_SIZE;i++) st->syn_dc += DC_CONST*(pcm[i] - st->syn_dc);
+ } else {
+ for (i=0;i<FRAME_SIZE;i++) st->syn_dc += DC_CONST*(pcm[i] - st->syn_dc);
+ }
+ for (i=0;i<TRAINING_OFFSET;i++) pcm[i] += st->dc_buf[i];
+ for (;i<FRAME_SIZE;i++) pcm[i] += dc;
+ for (i=0;i<TRAINING_OFFSET;i++) st->dc_buf[i] = dc;
+ }
+ st->loss_count++;
+ return 0;
+}
+
+
+LPCNET_EXPORT int lpcnet_plc_update(LPCNetPLCState *st, short *pcm) {
+ if (st->non_causal) return lpcnet_plc_update_non_causal(st, pcm);
+ else return lpcnet_plc_update_causal(st, pcm);
+}
+
+LPCNET_EXPORT int lpcnet_plc_conceal(LPCNetPLCState *st, short *pcm) {
+ if (st->non_causal) return lpcnet_plc_conceal_non_causal(st, pcm);
+ else return lpcnet_plc_conceal_causal(st, pcm);
}
--- a/dnn/lpcnet_private.h
+++ b/dnn/lpcnet_private.h
@@ -6,6 +6,7 @@
#include "freq.h"
#include "lpcnet.h"
#include "nnet_data.h"
+#include "plc_data.h"
#include "kiss99.h"
#define BITS_PER_CHAR 8
@@ -61,6 +62,7 @@
float features[4][NB_TOTAL_FEATURES];
float sig_mem[LPC_ORDER];
int exc_mem;
+ float burg_cepstrum[2*NB_BANDS];
};
#define PLC_BUF_SIZE (FEATURES_DELAY*FRAME_SIZE + TRAINING_OFFSET)
@@ -72,6 +74,18 @@
int skip_analysis;
int blend;
float features[NB_TOTAL_FEATURES];
+ int loss_count;
+ PLCNetState plc_net;
+ PLCNetState plc_copy;
+ int enable_blending;
+ int non_causal;
+ double dc_mem;
+ double syn_dc;
+ int remove_dc;
+
+ short dc_buf[TRAINING_OFFSET];
+ int queued_update;
+ short queued_samples[FRAME_SIZE];
};
extern float ceps_codebook1[];
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -38,6 +38,7 @@
#include "tansig_table.h"
#include "nnet.h"
#include "nnet_data.h"
+#include "plc_data.h"
#ifdef NO_OPTIMIZATIONS
#warning Compiling without any vectorization. This code will be very slow
@@ -315,13 +316,15 @@
state[i] = h[i];
}
+#define MAX_RNN_NEURONS_ALL IMAX(MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS)
+
void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input)
{
int i;
int N, M;
int stride;
- float zrh[3*MAX_RNN_NEURONS];
- float recur[3*MAX_RNN_NEURONS];
+ float zrh[3*MAX_RNN_NEURONS_ALL];
+ float recur[3*MAX_RNN_NEURONS_ALL];
float *z;
float *r;
float *h;
@@ -330,7 +333,7 @@
z = zrh;
r = &zrh[N];
h = &zrh[2*N];
- celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
+ celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS_ALL);
celt_assert(input != state);
celt_assert(gru->reset_after);
stride = 3*N;
--- /dev/null
+++ b/dnn/training_tf2/dump_plc.py
@@ -1,0 +1,265 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2017-2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import lpcnet_plc
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.layers import Layer, GRU, Dense, Conv1D, Embedding
+import h5py
+import re
+
+# Flag for dumping e2e (differentiable lpc) network weights
+flag_e2e = False
+
+max_rnn_neurons = 1
+max_conv_inputs = 1
+
+def printVector(f, vector, name, dtype='float', dotp=False):
+ if dotp:
+ vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+ vector = vector.transpose((2, 0, 3, 1))
+ v = np.reshape(vector, (-1));
+ #print('static const float ', name, '[', len(v), '] = \n', file=f)
+ f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
+ for i in range(0, len(v)):
+ f.write('{}'.format(v[i]))
+ if (i!=len(v)-1):
+ f.write(',')
+ else:
+ break;
+ if (i%8==7):
+ f.write("\n ")
+ else:
+ f.write(" ")
+ #print(v, file=f)
+ f.write('\n};\n\n')
+ return;
+
+def printSparseVector(f, A, name, have_diag=True):
+ N = A.shape[0]
+ M = A.shape[1]
+ W = np.zeros((0,), dtype='int')
+ W0 = np.zeros((0,))
+ if have_diag:
+ diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
+ A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
+ A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
+ A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
+ printVector(f, diag, name + '_diag')
+ AQ = np.minimum(127, np.maximum(-128, np.round(A*128))).astype('int')
+ idx = np.zeros((0,), dtype='int')
+ for i in range(M//8):
+ pos = idx.shape[0]
+ idx = np.append(idx, -1)
+ nb_nonzero = 0
+ for j in range(N//4):
+ block = A[j*4:(j+1)*4, i*8:(i+1)*8]
+ qblock = AQ[j*4:(j+1)*4, i*8:(i+1)*8]
+ if np.sum(np.abs(block)) > 1e-10:
+ nb_nonzero = nb_nonzero + 1
+ idx = np.append(idx, j*4)
+ vblock = qblock.transpose((1,0)).reshape((-1,))
+ W0 = np.concatenate([W0, block.reshape((-1,))])
+ W = np.concatenate([W, vblock])
+ idx[pos] = nb_nonzero
+ f.write('#ifdef DOT_PROD\n')
+ printVector(f, W, name, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, W0, name, dtype='qweight')
+ f.write('#endif /*DOT_PROD*/\n')
+ #idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
+ printVector(f, idx, name + '_idx', dtype='int')
+ return AQ
+
+def dump_layer_ignore(self, f, hf):
+ print("ignoring layer " + self.name + " of type " + self.__class__.__name__)
+ return False
+Layer.dump_layer = dump_layer_ignore
+
+def dump_sparse_gru(self, f, hf):
+ global max_rnn_neurons
+ name = 'sparse_' + self.name
+ print("printing layer " + name + " of type sparse " + self.__class__.__name__)
+ weights = self.get_weights()
+ qweights = printSparseVector(f, weights[1], name + '_recurrent_weights')
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[1,:] = subias[1,:] - np.sum(qweights*(1./128),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = max(max_rnn_neurons, neurons)
+ f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+ .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('extern const SparseGRULayer {};\n\n'.format(name));
+ return True
+
+def dump_gru_layer(self, f, hf):
+ global max_rnn_neurons
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ qweight = printSparseVector(f, weights[0], name + '_weights', have_diag=False)
+
+ f.write('#ifdef DOT_PROD\n')
+ qweight2 = np.clip(np.round(128.*weights[1]).astype('int'), -128, 127)
+ printVector(f, qweight2, name + '_recurrent_weights', dotp=True, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, weights[1], name + '_recurrent_weights')
+ f.write('#endif /*DOT_PROD*/\n')
+
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[0,:] = subias[0,:] - np.sum(qweight*(1./128.),axis=0)
+ subias[1,:] = subias[1,:] - np.sum(qweight2*(1./128.),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = max(max_rnn_neurons, neurons)
+ f.write('const GRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_weights,\n {}_weights_idx,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
+ .format(name, name, name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('extern const GRULayer {};\n\n'.format(name));
+ return True
+GRU.dump_layer = dump_gru_layer
+
+def dump_gru_layer_dummy(self, f, hf):
+ name = self.name
+ weights = self.get_weights()
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ return True;
+
+#GRU.dump_layer = dump_gru_layer_dummy
+
+def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
+ printVector(f, weights, name + '_weights')
+ printVector(f, bias, name + '_bias')
+ f.write('const DenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, ACTIVATION_{}\n}};\n\n'
+ .format(name, name, name, weights.shape[0], weights.shape[1], activation))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
+ hf.write('extern const DenseLayer {};\n\n'.format(name));
+
+def dump_dense_layer(self, f, hf):
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ activation = self.activation.__name__.upper()
+ dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
+ return False
+
+Dense.dump_layer = dump_dense_layer
+
+def dump_conv1d_layer(self, f, hf):
+ global max_conv_inputs
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ printVector(f, weights[0], name + '_weights')
+ printVector(f, weights[-1], name + '_bias')
+ activation = self.activation.__name__.upper()
+ max_conv_inputs = max(max_conv_inputs, weights[0].shape[1]*weights[0].shape[0])
+ f.write('const Conv1DLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, {}, ACTIVATION_{}\n}};\n\n'
+ .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
+ hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
+ hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
+ hf.write('extern const Conv1DLayer {};\n\n'.format(name));
+ return True
+Conv1D.dump_layer = dump_conv1d_layer
+
+
+
+filename = sys.argv[1]
+with h5py.File(filename, "r") as f:
+ units = min(f['model_weights']['plc_gru1']['plc_gru1']['recurrent_kernel:0'].shape)
+ units2 = min(f['model_weights']['plc_gru2']['plc_gru2']['recurrent_kernel:0'].shape)
+ cond_size = f['model_weights']['plc_dense1']['plc_dense1']['kernel:0'].shape[1]
+
+model = lpcnet_plc.new_lpcnet_plc_model(rnn_units=units, cond_size=cond_size)
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+#model.summary()
+
+model.load_weights(filename, by_name=True)
+
+if len(sys.argv) > 2:
+ cfile = sys.argv[2];
+ hfile = sys.argv[3];
+else:
+ cfile = 'plc_data.c'
+ hfile = 'plc_data.h'
+
+
+f = open(cfile, 'w')
+hf = open(hfile, 'w')
+
+
+f.write('/*This file is automatically generated from a Keras model*/\n')
+f.write('/*based on model {}*/\n\n'.format(sys.argv[1]))
+f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
+
+hf.write('/*This file is automatically generated from a Keras model*/\n\n')
+hf.write('#ifndef PLC_DATA_H\n#define PLC_DATA_H\n\n#include "nnet.h"\n\n')
+
+layer_list = []
+for i, layer in enumerate(model.layers):
+ if layer.dump_layer(f, hf):
+ layer_list.append(layer.name)
+
+#dump_sparse_gru(model.get_layer('gru_a'), f, hf)
+
+hf.write('#define PLC_MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
+#hf.write('#define PLC_MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
+
+hf.write('typedef struct {\n')
+for i, name in enumerate(layer_list):
+ hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
+hf.write('} PLCNetState;\n')
+
+hf.write('\n\n#endif\n')
+
+f.close()
+hf.close()
--- /dev/null
+++ b/dnn/training_tf2/lpcnet_plc.py
@@ -1,0 +1,101 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import math
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation, GaussianNoise
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
+from tensorflow.keras import backend as K
+from tensorflow.keras.constraints import Constraint
+from tensorflow.keras.initializers import Initializer
+from tensorflow.keras.callbacks import Callback
+import numpy as np
+
+def quant_regularizer(x):
+ Q = 128
+ Q_1 = 1./Q
+ #return .01 * tf.reduce_mean(1 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))
+ return .01 * tf.reduce_mean(K.sqrt(K.sqrt(1.0001 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))))
+
+
+class WeightClip(Constraint):
+ '''Clips the weights incident to each hidden unit to be inside a range
+ '''
+ def __init__(self, c=2):
+ self.c = c
+
+ def __call__(self, p):
+ # Ensure that abs of adjacent weights don't sum to more than 127. Otherwise there's a risk of
+ # saturation when implementing dot products with SSSE3 or AVX2.
+ return self.c*p/tf.maximum(self.c, tf.repeat(tf.abs(p[:, 1::2])+tf.abs(p[:, 0::2]), 2, axis=1))
+ #return K.clip(p, -self.c, self.c)
+
+ def get_config(self):
+ return {'name': self.__class__.__name__,
+ 'c': self.c}
+
+constraint = WeightClip(0.992)
+
+def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, nb_burg_features=36, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128):
+ feat = Input(shape=(None, nb_used_features+nb_burg_features), batch_size=batch_size)
+ lost = Input(shape=(None, 1), batch_size=batch_size)
+
+ fdense1 = Dense(cond_size, activation='tanh', name='plc_dense1')
+
+ cfeat = Concatenate()([feat, lost])
+ cfeat = fdense1(cfeat)
+ #cfeat = Conv1D(cond_size, 3, padding='causal', activation='tanh', name='plc_conv1')(cfeat)
+
+ quant = quant_regularizer if quantize else None
+
+ if training:
+ rnn = CuDNNGRU(rnn_units, return_sequences=True, return_state=True, name='plc_gru1', stateful=True,
+ kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
+ rnn2 = CuDNNGRU(rnn_units, return_sequences=True, return_state=True, name='plc_gru2', stateful=True,
+ kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
+ else:
+ rnn = GRU(rnn_units, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='plc_gru1', stateful=True,
+ kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
+ rnn2 = GRU(rnn_units, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='plc_gru2', stateful=True,
+ kernel_constraint=constraint, recurrent_constraint = constraint, kernel_regularizer=quant, recurrent_regularizer=quant)
+
+ gru_out1, _ = rnn(cfeat)
+ gru_out1 = GaussianNoise(.005)(gru_out1)
+ gru_out2, _ = rnn2(gru_out1)
+
+ out_dense = Dense(nb_used_features, activation='linear', name='plc_out')
+ plc_out = out_dense(gru_out2)
+
+ model = Model([feat, lost], plc_out)
+ model.rnn_units = rnn_units
+ model.cond_size = cond_size
+ model.nb_used_features = nb_used_features
+ model.nb_burg_features = nb_burg_features
+
+ return model
--- /dev/null
+++ b/dnn/training_tf2/plc_loader.py
@@ -1,0 +1,67 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import numpy as np
+from tensorflow.keras.utils import Sequence
+
+class PLCLoader(Sequence):
+ def __init__(self, features, lost, nb_burg_features, batch_size):
+ self.batch_size = batch_size
+ self.nb_batches = features.shape[0]//self.batch_size
+ self.features = features[:self.nb_batches*self.batch_size, :, :]
+ self.lost = lost.astype('float')
+ self.lost = self.lost[:(len(self.lost)//features.shape[1]-1)*features.shape[1]]
+ self.nb_burg_features = nb_burg_features
+ self.on_epoch_end()
+
+ def on_epoch_end(self):
+ self.indices = np.arange(self.nb_batches*self.batch_size)
+ np.random.shuffle(self.indices)
+ offset = np.random.randint(0, high=self.features.shape[1])
+ self.lost_offset = np.reshape(self.lost[offset:-self.features.shape[1]+offset], (-1, self.features.shape[1]))
+ self.lost_indices = np.random.randint(0, high=self.lost_offset.shape[0], size=self.nb_batches*self.batch_size)
+
+ def __getitem__(self, index):
+ features = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :]
+ #lost = (np.random.rand(features.shape[0], features.shape[1]) > .2).astype('float')
+ lost = self.lost_offset[self.lost_indices[index*self.batch_size:(index+1)*self.batch_size], :]
+ lost = np.reshape(lost, (features.shape[0], features.shape[1], 1))
+ lost_mask = np.tile(lost, (1,1,features.shape[2]))
+ in_features = features*lost_mask
+
+ #For the first frame after a loss, we don't have valid features, but the Burg estimate is valid.
+ in_features[:,1:,self.nb_burg_features:] = in_features[:,1:,self.nb_burg_features:]*lost_mask[:,:-1,self.nb_burg_features:]
+ out_lost = np.copy(lost)
+ out_lost[:,1:,:] = out_lost[:,1:,:]*out_lost[:,:-1,:]
+
+ out_features = np.concatenate([features[:,:,self.nb_burg_features:], 1.-out_lost], axis=-1)
+ inputs = [in_features*lost_mask, lost]
+ outputs = [out_features]
+ return (inputs, outputs)
+
+ def __len__(self):
+ return self.nb_batches
--- /dev/null
+++ b/dnn/training_tf2/test_plc.py
@@ -1,0 +1,92 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
+import argparse
+from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Test a PLC model')
+
+parser.add_argument('weights', metavar='<weights file>', help='weights file (.h5)')
+parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
+parser.add_argument('output', metavar='<output>', help='reconstructed file (float32)')
+parser.add_argument('--model', metavar='<model>', default='lpcnet_plc', help='PLC model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+
+parser.add_argument('--gru-size', metavar='<units>', default=256, type=int, help='number of units in GRU (default 256)')
+parser.add_argument('--cond-size', metavar='<units>', default=128, type=int, help='number of units in conditioning network (default 128)')
+
+
+args = parser.parse_args()
+
+import importlib
+lpcnet = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+# try:
+# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+# except RuntimeError as e:
+# print(e)
+
+model = lpcnet.new_lpcnet_plc_model(rnn_units=args.gru_size, batch_size=1, training=False, quantize=False, cond_size=args.cond_size)
+model.compile()
+
+lpc_order = 16
+
+feature_file = args.features
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+features = np.loadtxt(feature_file)
+print(features.shape)
+sequence_size = features.shape[0]
+lost = np.reshape(features[:,-1:], (1, sequence_size, 1))
+features = features[:,:nb_used_features]
+features = np.reshape(features, (1, sequence_size, nb_used_features))
+
+
+model.load_weights(args.weights)
+
+features = features*lost
+out = model.predict([features, lost])
+
+out = features + (1-lost)*out
+
+np.savetxt(args.output, out[0,:,:])
--- /dev/null
+++ b/dnn/training_tf2/train_plc.py
@@ -1,0 +1,197 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
+import argparse
+from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Train a PLC model')
+
+parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
+parser.add_argument('lost_file', metavar='<packet loss file>', help='packet loss traces (int8)')
+parser.add_argument('output', metavar='<output>', help='trained model file (.h5)')
+parser.add_argument('--model', metavar='<model>', default='lpcnet_plc', help='PLC model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--quantize', metavar='<input weights>', help='quantize model')
+group1.add_argument('--retrain', metavar='<input weights>', help='continue training model')
+parser.add_argument('--gru-size', metavar='<units>', default=256, type=int, help='number of units in GRU (default 256)')
+parser.add_argument('--cond-size', metavar='<units>', default=128, type=int, help='number of units in conditioning network (default 128)')
+parser.add_argument('--epochs', metavar='<epochs>', default=120, type=int, help='number of epochs to train for (default 120)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=128, type=int, help='batch size to use (default 128)')
+parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')
+parser.add_argument('--lr', metavar='<learning rate>', type=float, help='learning rate')
+parser.add_argument('--decay', metavar='<decay>', type=float, help='learning rate decay')
+parser.add_argument('--band-loss', metavar='<weight>', default=1.0, type=float, help='weight of band loss (default 1.0)')
+parser.add_argument('--loss-bias', metavar='<bias>', default=0.0, type=float, help='loss bias towards low energy (default 0.0)')
+parser.add_argument('--logdir', metavar='<log dir>', help='directory for tensorboard log files')
+
+
+args = parser.parse_args()
+
+import importlib
+lpcnet = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+# try:
+# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+# except RuntimeError as e:
+# print(e)
+
+nb_epochs = args.epochs
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+quantize = args.quantize is not None
+retrain = args.retrain is not None
+
+if quantize:
+ lr = 0.00003
+ decay = 0
+ input_model = args.quantize
+else:
+ lr = 0.001
+ decay = 2.5e-5
+
+if args.lr is not None:
+ lr = args.lr
+
+if args.decay is not None:
+ decay = args.decay
+
+if retrain:
+ input_model = args.retrain
+
+def plc_loss(alpha=1.0, bias=0.):
+ def loss(y_true,y_pred):
+ mask = y_true[:,:,-1:]
+ y_true = y_true[:,:,:-1]
+ e = (y_pred - y_true)*mask
+ e_bands = tf.signal.idct(e[:,:,:-2], norm='ortho')
+ bias_mask = K.minimum(1., K.maximum(0., 4*y_true[:,:,-1:]))
+ l1_loss = K.mean(K.abs(e)) + 0.1*K.mean(K.maximum(0., -e[:,:,-1:])) + alpha*K.mean(K.abs(e_bands) + bias*bias_mask*K.maximum(0., e_bands)) + K.mean(K.minimum(K.abs(e[:,:,18:19]),1.)) + 8*K.mean(K.minimum(K.abs(e[:,:,18:19]),.4))
+ return l1_loss
+ return loss
+
+def plc_l1_loss():
+ def L1_loss(y_true,y_pred):
+ mask = y_true[:,:,-1:]
+ y_true = y_true[:,:,:-1]
+ e = (y_pred - y_true)*mask
+ l1_loss = K.mean(K.abs(e))
+ return l1_loss
+ return L1_loss
+
+def plc_ceps_loss():
+ def ceps_loss(y_true,y_pred):
+ mask = y_true[:,:,-1:]
+ y_true = y_true[:,:,:-1]
+ e = (y_pred - y_true)*mask
+ l1_loss = K.mean(K.abs(e[:,:,:-2]))
+ return l1_loss
+ return ceps_loss
+
+def plc_band_loss():
+ def L1_band_loss(y_true,y_pred):
+ mask = y_true[:,:,-1:]
+ y_true = y_true[:,:,:-1]
+ e = (y_pred - y_true)*mask
+ e_bands = tf.signal.idct(e[:,:,:-2], norm='ortho')
+ l1_loss = K.mean(K.abs(e_bands))
+ return l1_loss
+ return L1_band_loss
+
+def plc_pitch_loss():
+ def pitch_loss(y_true,y_pred):
+ mask = y_true[:,:,-1:]
+ y_true = y_true[:,:,:-1]
+ e = (y_pred - y_true)*mask
+ l1_loss = K.mean(K.minimum(K.abs(e[:,:,18:19]),.4))
+ return l1_loss
+ return pitch_loss
+
+opt = Adam(lr, decay=decay, beta_2=0.99)
+strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+
+with strategy.scope():
+ model = lpcnet.new_lpcnet_plc_model(rnn_units=args.gru_size, batch_size=batch_size, training=True, quantize=quantize, cond_size=args.cond_size)
+ model.compile(optimizer=opt, loss=plc_loss(alpha=args.band_loss, bias=args.loss_bias), metrics=[plc_l1_loss(), plc_ceps_loss(), plc_band_loss(), plc_pitch_loss()])
+ model.summary()
+
+lpc_order = 16
+
+feature_file = args.features
+nb_features = model.nb_used_features + lpc_order + model.nb_burg_features
+nb_used_features = model.nb_used_features
+nb_burg_features = model.nb_burg_features
+sequence_size = args.seq_length
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+
+features = np.memmap(feature_file, dtype='float32', mode='r')
+nb_sequences = len(features)//(nb_features*sequence_size)//batch_size*batch_size
+features = features[:nb_sequences*sequence_size*nb_features]
+
+features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
+
+features = features[:, :, :nb_used_features+model.nb_burg_features]
+
+lost = np.memmap(args.lost_file, dtype='int8', mode='r')
+
+# dump models to disk as we go
+checkpoint = ModelCheckpoint('{}_{}_{}.h5'.format(args.output, args.gru_size, '{epoch:02d}'))
+
+if args.retrain is not None:
+ model.load_weights(args.retrain)
+
+if quantize or retrain:
+ #Adapting from an existing model
+ model.load_weights(input_model)
+
+model.save_weights('{}_{}_initial.h5'.format(args.output, args.gru_size))
+
+loader = PLCLoader(features, lost, nb_burg_features, batch_size)
+
+callbacks = [checkpoint]
+if args.logdir is not None:
+ logdir = '{}/{}_{}_logs'.format(args.logdir, args.output, args.gru_size)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
+ callbacks.append(tensorboard_callback)
+
+model.fit(loader, epochs=nb_epochs, validation_split=0.0, callbacks=callbacks)
--
⑨