shithub: opus

--- a/celt/arch.h

+++ b/celt/arch.h

@@ -101,6 +101,7 @@

 typedef opus_int16 opus_val16;

 typedef opus_int32 opus_val32;

+typedef opus_int64 opus_val64;

 typedef opus_val32 celt_sig;

 typedef opus_val16 celt_norm;

@@ -158,6 +159,7 @@

 typedef float opus_val16;

 typedef float opus_val32;

+typedef float opus_val64;

 typedef float celt_sig;

 typedef float celt_norm;

--- a/celt/celt.h

+++ b/celt/celt.h

@@ -57,6 +57,7 @@

    float noisiness;

    float activity;

    float music_prob;

+   float vad_prob;

    int   bandwidth;

    float activity_probability;

 } AnalysisInfo;

--- a/src/analysis.c

+++ b/src/analysis.c

@@ -42,6 +42,7 @@

 #include "analysis.h"

 #include "mlp.h"

 #include "stack_alloc.h"

+#include "float_cast.h"

 #ifndef M_PI

 #define M_PI 3.141592653

@@ -100,24 +101,118 @@

};

 static const int tbands[NB_TBANDS+1] = {

-       2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120

+      4, 8, 12, 16, 20, 24, 28, 32, 40, 48, 56, 64, 80, 96, 112, 136, 160, 192, 240

};

-static const int extra_bands[NB_TOT_BANDS+1] = {

-      1, 2,  4,  6,  8, 10, 12, 14, 16, 20, 24, 28, 32, 40, 48, 56, 68, 80, 96, 120, 160, 200

-};

+#define NB_TONAL_SKIP_BANDS 9

-/*static const float tweight[NB_TBANDS+1] = {

-      .3, .4, .5, .6, .7, .8, .9, 1., 1., 1., 1., 1., 1., 1., .8, .7, .6, .5

-};*/

+static opus_val32 silk_resampler_down2_hp(

+    opus_val32                  *S,                 /* I/O  State vector [ 3 ]                                          */

+    opus_val32                  *out,               /* O    Output signal [ floor(len/2) ]                              */

+    const opus_val32            *in,                /* I    Input signal [ len ]                                        */

+    int                         inLen               /* I    Number of input samples                                     */

+)

+{

+    int k, len2 = inLen/2; /* len2: number of output samples produced */

+    opus_val32 in32, out32, out32_hp, Y, X;

+    opus_val64 hp_ener = 0; /* running energy of the out32_hp branch; this is what gets returned */

+    /* Downsamples in[] by 2 using all-pass sections and, in parallel, measures the energy of the high-pass (out32_hp) branch. State and intermediates stay in the format of in[]; nothing is converted to Q10 here. */

+    for( k = 0; k < len2; k++ ) {

+        /* Even input sample (used as-is, no format conversion) */

+        in32 = in[ 2 * k ];

-#define NB_TONAL_SKIP_BANDS 9

+        /* All-pass section for even input sample */

+        Y      = SUB32( in32, S[ 0 ] );

+        X      = MULT16_32_Q15(QCONST16(0.6074371f, 15), Y);

+        out32  = ADD32( S[ 0 ], X );

+        S[ 0 ] = ADD32( in32, X );

+        out32_hp = out32; /* the high-pass branch starts from the same even-sample all-pass output */

+        /* Odd input sample (used as-is, no format conversion) */

+        in32 = in[ 2 * k + 1 ];

+        /* All-pass section for odd input sample, and add to output of previous section */

+        Y      = SUB32( in32, S[ 1 ] );

+        X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);

+        out32  = ADD32( out32, S[ 1 ] );

+        out32  = ADD32( out32, X );

+        S[ 1 ] = ADD32( in32, X );

-void tonality_analysis_init(TonalityAnalysisState *tonal)

+        Y      = SUB32( -in32, S[ 2 ] ); /* same odd-sample all-pass, run on the negated sample with its own state S[ 2 ], feeding out32_hp */

+        X      = MULT16_32_Q15(QCONST16(0.15063f, 15), Y);

+        out32_hp  = ADD32( out32_hp, S[ 2 ] );

+        out32_hp  = ADD32( out32_hp, X );

+        S[ 2 ] = ADD32( -in32, X );

+        hp_ener += out32_hp*(opus_val64)out32_hp; /* accumulate squared high-pass output in the wider opus_val64 type */

+        /* Apply HALF32 to the sum of the two all-pass branches and store to output */

+        out[ k ] = HALF32(out32);

+    }

+#ifdef FIXED_POINT

+    /* len2 can be up to 480, so we shift by 8 more to make it fit. */

+    hp_ener = hp_ener >> (2*SIG_SHIFT + 8);

+#endif

+    return hp_ener; /* narrowed to opus_val32 on return; in FIXED_POINT builds it has been shifted right by 2*SIG_SHIFT + 8 */

+}

+static opus_val32 downmix_and_resample(downmix_func downmix, const void *_x, opus_val32 *y, opus_val32 S[3], int subframe, int offset, int c1, int c2, int C, int Fs)

+   VARDECL(opus_val32, tmp);

+   opus_val32 scale;

+   int j;

+   opus_val32 ret = 0; /* high-pass energy; only assigned on the 48 kHz path below */

+   SAVE_STACK;

+   if (subframe==0) return 0; /* NOTE(review): returns after SAVE_STACK without RESTORE_STACK; nothing has been ALLOC'ed yet - confirm this is harmless in every stack_alloc mode */

+   if (Fs == 48000) /* presumably subframe/offset arrive in 24 kHz units and are converted back to input-rate sample counts here - confirm against caller */

+   {

+      subframe *= 2;

+      offset *= 2;

+   } else if (Fs == 16000) {

+      subframe = subframe*2/3;

+      offset = offset*2/3;

+   }

+   ALLOC(tmp, subframe, opus_val32);

+   downmix(_x, tmp, subframe, offset, c1, c2, C); /* fills tmp[0..subframe-1] at the input sampling rate */

+#ifdef FIXED_POINT

+   scale = (1<<SIG_SHIFT);

+#else

+   scale = 1.f/32768;

+#endif

+   if (c2==-2) /* presumably downmix() summed all C channels in this mode, hence the division by C - TODO confirm */

+      scale /= C;

+   else if (c2>-1) /* presumably two channels (c1 and c2) were summed - TODO confirm */

+      scale /= 2;

+   for (j=0;j<subframe;j++)

+      tmp[j] *= scale;

+   if (Fs == 48000)

+   {

+      ret = silk_resampler_down2_hp(S, y, tmp, subframe); /* 2:1 decimation; also yields the high-pass energy */

+   } else if (Fs == 24000) {

+      OPUS_COPY(y, tmp, subframe); /* no resampling on this path: samples are copied through */

+   } else if (Fs == 16000) {

+      VARDECL(opus_val32, tmp3x);

+      ALLOC(tmp3x, 3*subframe, opus_val32);

+      /* Don't do this at home! This resampler is horrible and it's only (barely)

+         usable for the purpose of the analysis because we don't care about all

+         the aliasing between 8 kHz and 12 kHz. */

+      for (j=0;j<subframe;j++)

+      {

+         tmp3x[3*j] = tmp[j];

+         tmp3x[3*j+1] = tmp[j];

+         tmp3x[3*j+2] = tmp[j];

+      }

+      silk_resampler_down2_hp(S, y, tmp3x, 3*subframe); /* return value is discarded, so ret stays 0 on this path - NOTE(review): confirm intended */

+   }

+   RESTORE_STACK;

+   return ret; /* 0 unless Fs == 48000 */

+}

+void tonality_analysis_init(TonalityAnalysisState *tonal, opus_int32 Fs)

+{

   /* Initialize reusable fields. */

   tonal->arch = opus_select_arch();

+  tonal->Fs = Fs;

   /* Clear remaining fields. */

   tonality_analysis_reset(tonal);

@@ -141,7 +236,8 @@

    if (curr_lookahead<0)

       curr_lookahead += DETECT_SIZE;

-   if (len > 480 && pos != tonal->write_pos)

+   /* On long frames, look at the second analysis window rather than the first. */

+   if (len > tonal->Fs/50 && pos != tonal->write_pos)

       pos++;

       if (pos==DETECT_SIZE)

@@ -152,18 +248,27 @@

    if (pos<0)

       pos = DETECT_SIZE-1;

    OPUS_COPY(info_out, &tonal->info[pos], 1);

-   tonal->read_subframe += len/120;

-   while (tonal->read_subframe>=4)

+   /* If possible, look ahead for a tone to compensate for the delay in the tone detector. */

+   for (i=0;i<3;i++)

-      tonal->read_subframe -= 4;

+      pos++;

+      if (pos==DETECT_SIZE)

+         pos = 0;

+      if (pos == tonal->write_pos)

+         break;

+      info_out->tonality = MAX32(0, -.03 + MAX32(info_out->tonality, tonal->info[pos].tonality-.05));

+   }

+   tonal->read_subframe += len/(tonal->Fs/400);

+   while (tonal->read_subframe>=8)

+   {

+      tonal->read_subframe -= 8;

       tonal->read_pos++;

    if (tonal->read_pos>=DETECT_SIZE)

       tonal->read_pos-=DETECT_SIZE;

-   /* Compensate for the delay in the features themselves.

-      FIXME: Need a better estimate the 10 I just made up */

-   curr_lookahead = IMAX(curr_lookahead-10, 0);

+   /* The -1 is to compensate for the delay in the features themselves. */

+   curr_lookahead = IMAX(curr_lookahead-1, 0);

    psum=0;

    /* Summing the probability of transition patterns that involve music at

@@ -173,7 +278,7 @@

    for (;i<DETECT_SIZE;i++)

       psum += tonal->pspeech[i];

    psum = psum*tonal->music_confidence + (1-psum)*tonal->speech_confidence;

-   /*printf("%f %f %f\n", psum, info_out->music_prob, info_out->tonality);*/

+   /*printf("%f %f %f %f %f\n", psum, info_out->music_prob, info_out->vad_prob, info_out->activity_probability, info_out->tonality);*/

    info_out->music_prob = psum;

@@ -216,19 +321,33 @@

     float noise_floor;

     int remaining;

     AnalysisInfo *info;

+    float hp_ener;

+    float tonality2[240];

+    float midE[8];

+    float spec_variability=0;

     SAVE_STACK;

-    tonal->last_transition++;

-    alpha = 1.f/IMIN(20, 1+tonal->count);

-    alphaE = 1.f/IMIN(50, 1+tonal->count);

-    alphaE2 = 1.f/IMIN(1000, 1+tonal->count);

+    alpha = 1.f/IMIN(10, 1+tonal->count);

+    alphaE = 1.f/IMIN(25, 1+tonal->count);

+    alphaE2 = 1.f/IMIN(500, 1+tonal->count);

+    if (tonal->Fs == 48000)

+    {

+       /* len and offset are now at 24 kHz. */

+       len/= 2;

+       offset /= 2;

+    } else if (tonal->Fs == 16000) {

+       len = 3*len/2;

+       offset = 3*offset/2;

+    }

     if (tonal->count<4)

        tonal->music_prob = .5;

     kfft = celt_mode->mdct.kfft[0];

     if (tonal->count==0)

        tonal->mem_fill = 240;

-    downmix(x, &tonal->inmem[tonal->mem_fill], IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C);

+    tonal->hp_ener_accum += downmix_and_resample(downmix, x, &tonal->inmem[tonal->mem_fill], tonal->downmix_state,

+          IMIN(len, ANALYSIS_BUF_SIZE-tonal->mem_fill), offset, c1, c2, C, tonal->Fs);

     if (tonal->mem_fill+len < ANALYSIS_BUF_SIZE)

        tonal->mem_fill += len;

@@ -236,6 +355,7 @@

        RESTORE_STACK;

        return;

+    hp_ener = tonal->hp_ener_accum;

     info = &tonal->info[tonal->write_pos++];

     if (tonal->write_pos>=DETECT_SIZE)

        tonal->write_pos-=DETECT_SIZE;

@@ -254,7 +374,8 @@

     OPUS_MOVE(tonal->inmem, tonal->inmem+ANALYSIS_BUF_SIZE-240, 240);

     remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);

-    downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);

+    tonal->hp_ener_accum = downmix_and_resample(downmix, x, &tonal->inmem[240], tonal->downmix_state,

+          remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);

     tonal->mem_fill = 240 + remaining;

     opus_fft(kfft, in, out, tonal->arch);

 #ifndef FIXED_POINT

@@ -286,24 +407,31 @@

        d_angle2 = angle2 - angle;

        d2_angle2 = d_angle2 - d_angle;

-       mod1 = d2_angle - (float)floor(.5+d2_angle);

+       mod1 = d2_angle - (float)float2int(d2_angle);

        noisiness[i] = ABS16(mod1);

        mod1 *= mod1;

        mod1 *= mod1;

-       mod2 = d2_angle2 - (float)floor(.5+d2_angle2);

+       mod2 = d2_angle2 - (float)float2int(d2_angle2);

        noisiness[i] += ABS16(mod2);

        mod2 *= mod2;

        mod2 *= mod2;

-       avg_mod = .25f*(d2A[i]+2.f*mod1+mod2);

+       avg_mod = .25f*(d2A[i]+mod1+2*mod2);

+       /* This introduces an extra delay of 2 frames in the detection. */

        tonality[i] = 1.f/(1.f+40.f*16.f*pi4*avg_mod)-.015f;

+       /* No delay on this detection, but it's less reliable. */

+       tonality2[i] = 1.f/(1.f+40.f*16.f*pi4*mod2)-.015f;

        A[i] = angle2;

        dA[i] = d_angle2;

        d2A[i] = mod2;

+    for (i=2;i<N2-1;i++)

+    {

+       float tt = MIN32(tonality2[i], MAX32(tonality2[i-1], tonality2[i+1]));

+       tonality[i] = .9*MAX32(tonality[i], tt-.1);

+    }

     frame_tonality = 0;

     max_frame_tonality = 0;

     /*tw_sum = 0;*/

@@ -334,7 +462,7 @@

           binE *= 5.55e-17f;

 #endif

           E += binE;

-          tE += binE*tonality[i];

+          tE += binE*MAX32(0, tonality[i]);

           nE += binE*2.f*(.5f-noisiness[i]);

 #ifndef FIXED_POINT

@@ -352,14 +480,26 @@

        frame_loudness += (float)sqrt(E+1e-10f);

        logE[b] = (float)log(E+1e-10f);

-       tonal->lowE[b] = MIN32(logE[b], tonal->lowE[b]+.01f);

-       tonal->highE[b] = MAX32(logE[b], tonal->highE[b]-.1f);

-       if (tonal->highE[b] < tonal->lowE[b]+1.f)

+       tonal->logE[tonal->E_count][b] = logE[b];

+       if (tonal->count==0)

+          tonal->highE[b] = tonal->lowE[b] = logE[b];

+       if (tonal->highE[b] > tonal->lowE[b] + 7.5)

-          tonal->highE[b]+=.5f;

-          tonal->lowE[b]-=.5f;

+          if (tonal->highE[b] - logE[b] > logE[b] - tonal->lowE[b])

+             tonal->highE[b] -= .01;

+          else

+             tonal->lowE[b] += .01;

-       relativeE += (logE[b]-tonal->lowE[b])/(1e-15f+tonal->highE[b]-tonal->lowE[b]);

+       if (logE[b] > tonal->highE[b])

+       {

+          tonal->highE[b] = logE[b];

+          tonal->lowE[b] = MAX32(tonal->highE[b]-15, tonal->lowE[b]);

+       } else if (logE[b] < tonal->lowE[b])

+       {

+          tonal->lowE[b] = logE[b];

+          tonal->highE[b] = MIN32(tonal->lowE[b]+15, tonal->highE[b]);

+       }

+       relativeE += (logE[b]-tonal->lowE[b])/(1e-15f + (tonal->highE[b]-tonal->lowE[b]));

        L1=L2=0;

        for (i=0;i<NB_FRAMES;i++)

@@ -391,6 +531,26 @@

        tonal->prev_band_tonality[b] = band_tonality[b];

+    for (i=0;i<NB_FRAMES;i++)

+    {

+       int j;

+       float mindist = 1e15;

+       for (j=0;j<NB_FRAMES;j++)

+       {

+          int k;

+          float dist=0;

+          for (k=0;k<NB_TBANDS;k++)

+          {

+             float tmp;

+             tmp = tonal->logE[i][k] - tonal->logE[j][k];

+             dist += tmp*tmp;

+          }

+          if (j!=i)

+             mindist = MIN32(mindist, dist);

+       }

+       spec_variability += mindist;

+    }

+    spec_variability = sqrt(spec_variability/NB_FRAMES/NB_TBANDS);

     bandwidth_mask = 0;

     bandwidth = 0;

     maxE = 0;

@@ -399,13 +559,13 @@

     noise_floor *= 1<<(15+SIG_SHIFT);

 #endif

     noise_floor *= noise_floor;

-    for (b=0;b<NB_TOT_BANDS;b++)

+    for (b=0;b<NB_TBANDS;b++)

        float E=0;

        int band_start, band_end;

        /* Keep a margin of 300 Hz for aliasing */

-       band_start = extra_bands[b];

-       band_end = extra_bands[b+1];

+       band_start = tbands[b];

+       band_end = tbands[b+1];

        for (i=band_start;i<band_end;i++)

           float binE = out[i].r*(float)out[i].r + out[N-i].r*(float)out[N-i].r

@@ -422,14 +582,31 @@

           2) less than 90 dB below the peak band (maximal masking possible considering

              both the ATH and the loudness-dependent slope of the spreading function)

           3) above the PCM quantization noise floor

+          We use b+1 because the first CELT band isn't included in tbands[]

*/

        if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*(band_end-band_start))

-          bandwidth = b;

+          bandwidth = b+1;

+    /* Special case for the last two bands, for which we don't have spectrum but only

+       the energy above 12 kHz. */

+    {

+       float E = hp_ener*(1./(240*240));

+#ifdef FIXED_POINT

+       /* silk_resampler_down2_hp() shifted right by an extra 8 bits. */

+       E *= ((opus_int32)1 << 2*SIG_SHIFT)*256.f;

+#endif

+       maxE = MAX32(maxE, E);

+       tonal->meanE[b] = MAX32((1-alphaE2)*tonal->meanE[b], E);

+       E = MAX32(E, tonal->meanE[b]);

+       /* Use a simple follower with 13 dB/Bark slope for spreading function */

+       bandwidth_mask = MAX32(.05f*bandwidth_mask, E);

+       if (E>.1*bandwidth_mask && E*1e9f > maxE && E > noise_floor*160)

+          bandwidth = 20;

+    }

     if (tonal->count<=2)

        bandwidth = 20;

     frame_loudness = 20*(float)log10(frame_loudness);

-    tonal->Etracker = MAX32(tonal->Etracker-.03f, frame_loudness);

+    tonal->Etracker = MAX32(tonal->Etracker-.003f, frame_loudness);

     tonal->lowECount *= (1-alphaE);

     if (frame_loudness < tonal->Etracker-30)

        tonal->lowECount += alphaE;

@@ -441,6 +618,13 @@

           sum += dct_table[i*16+b]*logE[b];

        BFCC[i] = sum;

+    for (i=0;i<8;i++)

+    {

+       float sum=0;

+       for (b=0;b<16;b++)

+          sum += dct_table[i*16+b]*.5*(tonal->highE[b]+tonal->lowE[b]);

+       midE[i] = sum;

+    }

     frame_stationarity /= NB_TBANDS;

     relativeE /= NB_TBANDS;

@@ -460,7 +644,7 @@

     info->tonality_slope = slope;

     tonal->E_count = (tonal->E_count+1)%NB_FRAMES;

-    tonal->count++;

+    tonal->count = IMIN(tonal->count+1, ANALYSIS_COUNT_MAX);

     info->tonality = frame_tonality;

     for (i=0;i<4;i++)

@@ -479,6 +663,8 @@

        for (i=0;i<9;i++)

           tonal->std[i] = (1-alpha)*tonal->std[i] + alpha*features[i]*features[i];

+    for (i=0;i<4;i++)

+       features[i] = BFCC[i]-midE[i];

     for (i=0;i<8;i++)

@@ -489,6 +675,7 @@

     for (i=0;i<9;i++)

        features[11+i] = (float)sqrt(tonal->std[i]) - std_feature_bias[i];

+    features[18] = spec_variability-.78;;

     features[20] = info->tonality - 0.154723;

     features[21] = info->activity - 0.724643;

     features[22] = frame_stationarity - 0.743717;

@@ -503,8 +690,6 @@

     /* Probability of active audio (as opposed to silence) */

     frame_probs[1] = .5f*frame_probs[1]+.5f;

     frame_probs[1] *= frame_probs[1];

-    /* Consider that silence has a 50-50 probability. */

-    frame_probs[0] = frame_probs[1]*frame_probs[0] + (1-frame_probs[1])*.5f;

     /* Probability of speech or music vs noise */

     info->activity_probability = frame_probs[1];

@@ -527,12 +712,32 @@

        float music0;

        float p, q;

+       /* More silence transitions for speech than for music. */

+       tau = .001f*tonal->music_prob + .01f*(1-tonal->music_prob);

+       p = MAX16(.05f,MIN16(.95f,frame_probs[1]));

+       q = MAX16(.05f,MIN16(.95f,tonal->vad_prob));

+       beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));

+       /* p0 and p1 are the probabilities of inactive and active audio at this frame

+          using only information from previous frame and applying the

+          state transition model */

+       p0 = (1-tonal->vad_prob)*(1-tau) +    tonal->vad_prob *tau;

+       p1 =    tonal->vad_prob *(1-tau) + (1-tonal->vad_prob)*tau;

+       /* We apply the current probability with exponent beta to work around

+          the fact that the probability estimates aren't independent. */

+       p0 *= (float)pow(1-frame_probs[1], beta);

+       p1 *= (float)pow(frame_probs[1], beta);

+       /* Normalise the probabilities to get the Markov probability of active audio. */

+       tonal->vad_prob = p1/(p0+p1);

+       info->vad_prob = tonal->vad_prob;

+       /* Consider that silence has a 50-50 probability of being speech or music. */

+       frame_probs[0] = tonal->vad_prob*frame_probs[0] + (1-tonal->vad_prob)*.5f;

        /* One transition every 3 minutes of active audio */

-       tau = .00005f*frame_probs[1];

+       tau = .0001f;

        /* Adapt beta based on how "unexpected" the new prob is */

        p = MAX16(.05f,MIN16(.95f,frame_probs[0]));

        q = MAX16(.05f,MIN16(.95f,tonal->music_prob));

-       beta = .01f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));

+       beta = .02f+.05f*ABS16(p-q)/(p*(1-q)+q*(1-p));

        /* p0 and p1 are the probabilities of speech and music at this frame

           using only information from previous frame and applying the

           state transition model */

@@ -546,6 +751,7 @@

        tonal->music_prob = p1/(p0+p1);

        info->music_prob = tonal->music_prob;

+       /*printf("%f %f %f %f\n", frame_probs[0], frame_probs[1], tonal->music_prob, tonal->vad_prob);*/

        /* This chunk of code deals with delayed decision. */

        psum=1e-20f;

        /* Instantaneous probability of speech and music, with beta pre-applied. */

@@ -611,15 +817,15 @@

              tonal->speech_confidence = .1f;

-    if (tonal->last_music != (tonal->music_prob>.5f))

-       tonal->last_transition=0;

     tonal->last_music = tonal->music_prob>.5f;

 #else

     info->music_prob = 0;

 #endif

-    /*for (i=0;i<25;i++)

+#ifdef MLP_TRAINING

+    for (i=0;i<25;i++)

        printf("%f ", features[i]);

-    printf("\n");*/

+    printf("\n");

+#endif

     info->bandwidth = bandwidth;

     /*printf("%d %d\n", info->bandwidth, info->opus_bandwidth);*/

@@ -635,17 +841,18 @@

    int offset;

    int pcm_len;

+   analysis_frame_size -= analysis_frame_size&1;

    if (analysis_pcm != NULL)

       /* Avoid overflow/wrap-around of the analysis buffer */

-      analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/100, analysis_frame_size);

+      analysis_frame_size = IMIN((DETECT_SIZE-5)*Fs/50, analysis_frame_size);

       pcm_len = analysis_frame_size - analysis->analysis_offset;

       offset = analysis->analysis_offset;

       while (pcm_len>0) {

-         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(480, pcm_len), offset, c1, c2, C, lsb_depth, downmix);

-         offset += 480;

-         pcm_len -= 480;

+         tonality_analysis(analysis, celt_mode, analysis_pcm, IMIN(Fs/50, pcm_len), offset, c1, c2, C, lsb_depth, downmix);

+         offset += Fs/50;

+         pcm_len -= Fs/50;

       analysis->analysis_offset = analysis_frame_size;

--- a/src/analysis.h

+++ b/src/analysis.h

@@ -33,13 +33,19 @@

 #define NB_FRAMES 8

 #define NB_TBANDS 18

-#define NB_TOT_BANDS 21

-#define ANALYSIS_BUF_SIZE 720 /* 15 ms at 48 kHz */

+#define ANALYSIS_BUF_SIZE 720 /* 30 ms at 24 kHz */

-#define DETECT_SIZE 200

+/* At that point we can stop counting frames because it no longer matters. */

+#define ANALYSIS_COUNT_MAX 10000

+#define DETECT_SIZE 100

+/* Uncomment this to print the MLP features on stdout. */

+/*#define MLP_TRAINING*/

 typedef struct {

    int arch;

+   opus_int32 Fs;

 #define TONALITY_ANALYSIS_RESET_START angle

    float angle[240];

    float d_angle[240];

@@ -49,18 +55,19 @@

    float prev_band_tonality[NB_TBANDS];

    float prev_tonality;

    float E[NB_FRAMES][NB_TBANDS];

+   float logE[NB_FRAMES][NB_TBANDS];

    float lowE[NB_TBANDS];

    float highE[NB_TBANDS];

-   float meanE[NB_TOT_BANDS];

+   float meanE[NB_TBANDS+1];

    float mem[32];

    float cmean[8];

    float std[9];

    float music_prob;

+   float vad_prob;

    float Etracker;

    float lowECount;

    int E_count;

    int last_music;

-   int last_transition;

    int count;

    int analysis_offset;

    /** Probability of having speech for time i to DETECT_SIZE-1 (and music before).

@@ -76,6 +83,8 @@

    int write_pos;

    int read_pos;

    int read_subframe;

+   float hp_ener_accum;

+   opus_val32 downmix_state[3];

    AnalysisInfo info[DETECT_SIZE];

 } TonalityAnalysisState;

@@ -85,7 +94,7 @@

  * not be repeated every analysis step. No allocated memory is retained

  * by the state struct, so no cleanup call is required.

*/

-void tonality_analysis_init(TonalityAnalysisState *analysis);

+void tonality_analysis_init(TonalityAnalysisState *analysis, opus_int32 Fs);

 /** Reset a TonalityAnalysisState stuct.

--- a/src/mlp_data.c

+++ b/src/mlp_data.c

@@ -4,104 +4,104 @@

 #include "mlp.h"

-/* RMS error was 0.230027, seed was 1452289367 */

-/* 0.009100 0.069938 (0.230027 0.230027) 1.24058e-07 5543 */

+/* RMS error was 0.280492, seed was 1480478173 */

+/* 0.005976 0.031821 (0.280494 0.280492) done */

 static const float weights[450] = {

 /* hidden layer */

--1.20927f, -0.0275523f, 0.0304442f, -0.071791f, -0.0897356f,

-0.100996f, -0.0492634f, 0.070213f, 0.0187071f, 0.0042668f,

-0.0644589f, -0.10967f, -0.119688f, -0.00888386f, 0.170952f,

-0.174562f, -0.265435f, -0.0635892f, -0.284755f, -1.06453f,

-0.202855f, 2.31084f, -2.763f, -0.420894f, 0.698811f,

-6.46418f, 0.0662341f, 0.0758173f, 0.0511722f, 0.0426484f,

-0.115711f, -0.263815f, -0.0113386f, -0.189737f, -0.0929912f,

--0.287827f, 0.0925463f, 0.0286792f, -0.0199793f, -0.193071f,

-0.258586f, 0.018504f, 0.116125f, 0.099269f, -0.00781962f,

--0.266017f, 0.283733f, 10.5488f, -0.658286f, 0.836758f,

-13.1168f, -5.02553f, -1.0969f, -0.0738116f, 0.0204736f,

-0.0110775f, -0.00198985f, 0.00426824f, 0.148998f, 0.0755275f,

-0.112213f, -0.0518501f, 0.028398f, 0.0240943f, -0.0503666f,

--0.149506f, -0.133575f, -0.137328f, 0.116275f, 0.238077f,

-0.080265f, 0.0387349f, 0.09185f, 4.04867f, 3.2435f,

--0.7155f, 8.14792f, -29.8969f, 1.1575f, -0.124794f,

-0.0226943f, -0.0470538f, -0.0334476f, 0.0360859f, 0.0447789f,

--0.00258532f, -0.0192054f, -0.113082f, 0.109513f, -0.0437787f,

-0.0382349f, -0.00994462f, -0.155653f, 0.171922f, -0.222151f,

--0.523565f, -0.0454432f, -0.556888f, 0.761537f, -2.70075f,

--0.883015f, 0.887168f, 0.746329f, -0.363477f, 0.360424f,

-0.034755f, -0.015404f, 0.00688472f, -0.00949269f, 0.0625642f,

--0.050711f, 0.0370223f, 0.0149561f, 0.060385f, -0.0709806f,

--0.036509f, 0.099007f, -0.0397276f, 0.285237f, 0.127836f,

--0.15154f, 0.265848f, -0.0832318f, 0.0520659f, 0.897805f,

-0.439215f, -3.00803f, 1.93755f, -0.408725f, 0.300142f,

--1.42001f, 0.118794f, -0.04621f, 0.050757f, -0.0239654f,

--0.0629488f, -0.0083243f, -0.108989f, -0.0326831f, 0.104277f,

--0.0667274f, 0.0475941f, 0.069182f, -0.0574944f, -0.137823f,

--0.206978f, -0.162035f, -0.208444f, 0.141751f, -0.289377f,

--0.7875f, 0.0911f, 0.174999f, -2.03406f, 3.06743f,

-1.22255f, 2.10659f, 0.0779022f, -0.220946f, 0.137124f,

--0.0625512f, -0.073468f, 0.174861f, -0.139417f, 0.0967417f,

-0.0830658f, -0.223662f, 0.103016f, -0.102317f, 0.225611f,

-0.154375f, 0.187856f, -0.00878193f, 0.128648f, -0.371477f,

--0.479037f, 0.156541f, 1.10304f, -1.26162f, 0.086939f,

--0.143269f, 2.18318f, -2.88831f, 0.101126f, -0.308315f,

-0.222068f, -0.227709f, -0.00855236f, 0.0107035f, 0.00774349f,

--0.0185316f, 0.0306039f, -0.233612f, 0.0807309f, -0.029933f,

-0.151942f, -0.267724f, 0.0484763f, 0.132192f, -0.230059f,

-0.357879f, 0.075414f, 0.110637f, -1.27818f, 3.3101f,

-0.831064f, -0.212367f, -20.704f, -1.1492f, 0.0312941f,

--0.0208507f, -0.00804196f, 0.0110407f, 0.027599f, 0.00193594f,

--0.0135057f, -0.00614977f, 0.0505432f, -0.0108098f, 0.000826042f,

--0.0243765f, -0.323055f, 0.0682748f, -0.55873f, -0.103042f,

-0.174935f, -0.126558f, -0.104518f, 0.422479f, -0.0683178f,

--1.44811f, 0.702109f, 0.712138f, -0.420112f, 2.59746f,

--0.0297689f, -0.0453044f, -0.0330312f, -0.0344518f, -0.0260442f,

--0.0610515f, 0.0916816f, 0.0256295f, -0.105187f, 0.0771212f,

--0.0898792f, -0.186163f, -0.321019f, -0.225689f, 0.175825f,

-0.252939f, 0.738898f, 2.41919f, 0.114505f, -0.314026f,

-0.607983f, 1.73201f, -2.09609f, -0.609339f, 1.18997f,

-0.113871f, -0.177673f, -0.0785783f, -0.348033f, -0.0949274f,

--0.0191062f, 0.335823f, -0.0578655f, 0.131259f, -0.118687f,

--0.132123f, -0.239624f, 0.000738732f, -0.185936f, -0.13077f,

--0.436439f, -0.141664f, 0.0353391f, -0.0536557f, -0.0964537f,

-0.221853f, 1.94264f, -1.78544f, 3.8254f, 3.74598f,

-2.37071f, -1.42709f, 0.0463179f, -0.0568602f, 0.0529534f,

--0.103245f, -0.340972f, 0.101934f, -0.810811f, 0.176158f,

-0.469658f, 0.0248864f, -0.10734f, -0.143827f, -0.0457131f,

-0.779219f, -0.142152f, 0.0394297f, 0.160772f, -0.707623f,

--0.608236f, 1.07106f, -1.27037f, 2.27722f, 6.3688f,

-0.519837f, -3.33262f, -0.126443f, -0.0943922f, 0.0265837f,

-0.0620709f, 0.0113266f, -0.255811f, -0.0735781f, -0.0638952f,

--0.09543f, -0.204965f, 0.00454999f, 0.0554974f, -0.16251f,

--0.573836f, 0.258764f, 0.19895f, 0.0219289f, -0.376757f,

--0.508578f, -0.0767061f, -0.654512f, 4.48901f, 3.38949f,

--2.34533f, -11.0766f, 4.35799f, 1.66794f, -0.0513934f,

--0.0685787f, -0.0112154f, 0.000464661f, -0.234848f, -0.338596f,

--0.142242f, -0.167476f, -0.140324f, -0.104829f, -0.104195f,

-0.0110351f, -0.112668f, 0.0872292f, -0.170777f, -0.0876985f,

-0.123348f, -0.156758f, 0.199038f, -0.056107f, 0.899269f,

-0.0820197f, -1.295f, 0.0295294f, 2.27577f, -0.940993f,

--0.0100104f, -0.111541f, -0.132193f, -0.11037f, 0.0371375f,

--0.0180172f, -0.0105591f, 0.0197043f, 0.04099f, -0.0538671f,

--0.102347f, -0.0470742f, 0.178034f, -0.267772f, -0.105789f,

--0.105376f, 0.0623262f, -0.042906f, 0.176528f, -0.160076f,

--2.28483f, -1.92619f, 0.218149f, 9.67107f, 3.30399f,

--1.75951f, 0.129671f, 0.118305f, 0.140766f, 0.0678099f,

-0.00313175f, -0.0144533f, -0.0310217f, -0.0245139f, 0.136948f,

-0.150137f, 0.112326f, -0.0755033f, -0.280984f, -0.249342f,

--0.681657f, 0.0315246f, 0.294968f, 0.0407062f, 0.282759f,

--0.344185f, -7.32828f, -0.220036f, -0.560418f, -1.87191f,

--7.10132f,

+-0.514624f, 0.0234227f, -0.14329f, -0.0878216f, -0.00187827f,

+-0.0257443f, 0.108524f, 0.00333881f, 0.00585017f, -0.0246132f,

+0.142723f, -0.00436494f, 0.0101354f, -0.11124f, -0.0809367f,

+-0.0750772f, 0.0295524f, 0.00823944f, 0.150392f, 0.0320876f,

+-0.0710564f, -1.43818f, 0.652076f, 0.0650744f, -1.54821f,

+0.168949f, -1.92724f, 0.0517976f, -0.0670737f, -0.0690121f,

+0.00247528f, -0.0522024f, 0.0631368f, 0.0532776f, 0.047751f,

+-0.011715f, 0.142374f, -0.0290885f, -0.279263f, -0.433499f,

+-0.0795174f, -0.380458f, -0.051263f, 0.218537f, -0.322478f,

+1.06667f, -0.104607f, -4.70108f, 0.312037f, 0.277397f,

+-2.71859f, 1.70037f, -0.141845f, 0.0115618f, 0.0629883f,

+0.0403871f, 0.0139428f, -0.00430733f, -0.0429038f, -0.0590318f,

+-0.0501526f, -0.0284802f, -0.0415686f, -0.0438999f, 0.0822666f,

+0.197194f, 0.0363275f, -0.0584307f, 0.0752364f, -0.0799796f,

+-0.146275f, 0.161661f, -0.184585f, 0.145568f, 0.442823f,

+1.61221f, 1.11162f, 2.62177f, -2.482f, -0.112599f,

+-0.110366f, -0.140794f, -0.181694f, 0.0648674f, 0.0842248f,

+0.0933993f, 0.150122f, 0.129171f, 0.176848f, 0.141758f,

+-0.271822f, 0.235113f, 0.0668579f, -0.433957f, 0.113633f,

+-0.169348f, -1.40091f, 0.62861f, -0.134236f, 0.402173f,

+1.86373f, 1.53998f, -4.32084f, 0.735343f, 0.800214f,

+-0.00968415f, 0.0425904f, 0.0196811f, -0.018426f, -0.000343953f,

+-0.00416389f, 0.00111558f, 0.0173069f, -0.00998596f, -0.025898f,

+0.00123764f, -0.00520373f, -0.0565033f, 0.0637394f, 0.0051213f,

+0.0221361f, 0.00819962f, -0.0467061f, -0.0548258f, -0.00314063f,

+-1.18332f, 1.88091f, -0.41148f, -2.95727f, -0.521449f,

+-0.271641f, 0.124946f, -0.0532936f, 0.101515f, 0.000208564f,

+-0.0488748f, 0.0642388f, -0.0383848f, 0.0135046f, -0.0413592f,

+-0.0326402f, -0.0137421f, -0.0225219f, -0.0917294f, -0.277759f,

+-0.185418f, 0.0471128f, -0.125879f, 0.262467f, -0.212794f,

+-0.112931f, -1.99885f, -0.404787f, 0.224402f, 0.637962f,

+-0.27808f, -0.0723953f, -0.0537655f, -0.0336359f, -0.0906601f,

+-0.0641309f, -0.0713542f, 0.0524317f, 0.00608819f, 0.0754101f,

+-0.0488401f, -0.00671865f, 0.0418239f, 0.0536284f, -0.132639f,

+0.0267648f, -0.248432f, -0.0104153f, 0.035544f, -0.212753f,

+-0.302895f, -0.0357854f, 0.376838f, 0.597025f, -0.664647f,

+0.268422f, -0.376772f, -1.05472f, 0.0144178f, 0.179122f,

+0.0360155f, 0.220262f, -0.0056381f, 0.0317197f, 0.0621066f,

+-0.00779298f, 0.00789378f, 0.00350605f, 0.0104809f, 0.0362871f,

+-0.157708f, -0.0659779f, -0.0926278f, 0.00770791f, 0.0631621f,

+0.0817343f, -0.424295f, -0.0437727f, -0.24251f, 0.711217f,

+-0.736455f, -2.194f, -0.107612f, -0.175156f, -0.0366573f,

+-0.0123156f, -0.0628516f, -0.0218977f, -0.00693699f, 0.00695185f,

+0.00507362f, 0.00359334f, 0.0052661f, 0.035561f, 0.0382701f,

+0.0342179f, -0.00790271f, -0.0170925f, 0.047029f, 0.0197362f,

+-0.0153435f, 0.0644152f, -0.36862f, -0.0674876f, -2.82672f,

+1.34122f, -0.0788029f, -3.47792f, 0.507246f, -0.816378f,

+-0.0142383f, -0.127349f, -0.106926f, -0.0359524f, 0.105045f,

+0.291554f, 0.195413f, 0.0866214f, -0.066577f, -0.102188f,

+0.0979466f, -0.12982f, 0.400181f, -0.409336f, -0.0593326f,

+-0.0656203f, -0.204474f, 0.179802f, 0.000509084f, 0.0995954f,

+-2.377f, -0.686359f, 0.934861f, 1.10261f, 1.3901f,

+-4.33616f, -0.00264017f, 0.00713045f, 0.106264f, 0.143726f,

+-0.0685305f, -0.054656f, -0.0176725f, -0.0772669f, -0.0264526f,

+-0.0103824f, -0.0269872f, -0.00687f, 0.225804f, 0.407751f,

+-0.0612611f, -0.0576863f, -0.180131f, -0.222772f, -0.461742f,

+0.335236f, 1.03399f, 4.24112f, -0.345796f, -0.594549f,

+-76.1407f, -0.265276f, 0.0507719f, 0.0643044f, 0.0384832f,

+0.0424459f, -0.0387817f, -0.0235996f, -0.0740556f, -0.0270029f,

+0.00882177f, -0.0552371f, -0.00485851f, 0.314295f, 0.360431f,

+-0.0787085f, 0.110355f, -0.415958f, -0.385088f, -0.272224f,

+-1.55108f, -0.141848f, 0.448877f, -0.563447f, -2.31403f,

+-0.120077f, -1.49918f, -0.817726f, -0.0495854f, -0.0230782f,

+-0.0224014f, 0.117076f, 0.0393216f, 0.051997f, 0.0330763f,

+-0.110796f, 0.0211117f, -0.0197258f, 0.0187461f, 0.0125183f,

+0.14876f, 0.0920565f, -0.342475f, 0.135272f, -0.168155f,

+-0.033423f, -0.0604611f, -0.128835f, 0.664947f, -0.144997f,

+2.27649f, 1.28663f, 0.841217f, -2.42807f, 0.0230471f,

+0.226709f, -0.0374803f, 0.155436f, 0.0400342f, -0.184686f,

+0.128488f, -0.0939518f, -0.0578559f, 0.0265967f, -0.0999322f,

+-0.0322768f, -0.322994f, -0.189371f, -0.738069f, -0.0754914f,

+0.214717f, -0.093728f, -0.695741f, 0.0899298f, -2.06188f,

+-0.273719f, -0.896977f, 0.130553f, 0.134638f, 1.29355f,

+0.00520749f, -0.0324224f, 0.00530451f, 0.0192385f, 0.00328708f,

+0.0250838f, 0.0053365f, -0.0177321f, 0.00618789f, 0.00525364f,

+0.00104596f, -0.0360459f, 0.0402403f, -0.0406351f, 0.0136883f,

+0.0880722f, -0.0197449f, 0.089938f, 0.0100456f, -0.0475638f,

+-0.73267f, 0.037433f, -0.146551f, -0.230221f, -3.06489f,

+-1.40194f, 0.0198483f, 0.0397953f, -0.0190239f, 0.0470715f,

+-0.131363f, -0.191721f, -0.0176224f, -0.0480352f, -0.221799f,

+-0.26794f, -0.0292615f, 0.0612127f, -0.129877f, 0.00628332f,

+-0.085918f, 0.0175379f, 0.0541011f, -0.0810874f, -0.380809f,

+-0.222056f, -0.508859f, -0.473369f, 0.484958f, -2.28411f,

+0.0139516f,

 /* output layer */

-8.55144, 2.0822, 0.240592, 1.26638, 0.0309585,

--1.09841, 0.861549, -1.53704, 1.07356, 4.39194,

--2.60476, 0.375094, 0.122941, 0.00326393, 0.777163,

--2.03171, -0.944556, 4.02958, -0.260741, 0.556385,

--0.220568, -1.77121, -0.858706, -1.52023, -0.784162,

-0.345948, -0.0488489, -0.323381, -0.752573, 0.517346,

-0.876475, -1.44056, -0.382276, -1.55409, };

+3.90017, 1.71789, -1.43372, -2.70839, 1.77107,

+5.48006, 1.44661, 2.01134, -1.88383, -3.64958,

+-1.26351, 0.779421, 2.11357, 3.10409, 1.68846,

+-4.46197, -1.61455, 3.59832, 2.43531, -1.26458,

+0.417941, 1.47437, 2.16635, -1.909, -0.828869,

+1.38805, -2.67975, -0.110044, 1.95596, 0.697931,

+-0.313226, -0.889315, 0.283236, 0.946102, };

 static const int topo[3] = {25, 16, 2};

--- a/src/mlp_train.c

+++ b/src/mlp_train.c

@@ -138,13 +138,16 @@

     for (s=0;s<nbSamples;s++)

         float *in, *out;

+        float inp[inDim];

         in = inputs+s*inDim;

         out = outputs + s*outDim;

+        for (j=0;j<inDim;j++)

+           inp[j] = in[j];

         for (i=0;i<hiddenDim;i++)

             double sum = W0[i*(inDim+1)];

             for (j=0;j<inDim;j++)

-                sum += W0[i*(inDim+1)+j+1]*in[j];

+                sum += W0[i*(inDim+1)+j+1]*inp[j];

             hidden[i] = tansig_approx(sum);

         for (i=0;i<outDim;i++)

@@ -156,7 +159,7 @@

             error[i] = out[i] - netOut[i];

             if (out[i] == 0) error[i] *= .0;

             error_rate[i] += fabs(error[i])>1;

-            if (i==0) error[i] *= 3;

+            if (i==0) error[i] *= 5;

             rms += error[i]*error[i];

             /*error[i] = error[i]/(1+fabs(error[i]));*/

@@ -163,7 +166,7 @@

         /* Back-propagate error */

         for (i=0;i<outDim;i++)

-            float grad = 1-netOut[i]*netOut[i];

+            double grad = 1-netOut[i]*netOut[i];

             W1_grad[i*(hiddenDim+1)] += error[i]*grad;

             for (j=0;j<hiddenDim;j++)

                 W1_grad[i*(hiddenDim+1)+j+1] += grad*error[i]*hidden[j];

@@ -177,7 +180,7 @@

             grad *= 1-hidden[i]*hidden[i];

             W0_grad[i*(inDim+1)] += grad;

             for (j=0;j<inDim;j++)

-                W0_grad[i*(inDim+1)+j+1] += grad*in[j];

+                W0_grad[i*(inDim+1)+j+1] += grad*inp[j];

     return rms;

@@ -232,8 +235,6 @@

     int inDim, outDim, hiddenDim;

     int *topo;

     double *W0, *W1, *best_W0, *best_W1;

-    double *W0_old, *W1_old;

-    double *W0_old2, *W1_old2;

     double *W0_grad, *W1_grad;

     double *W0_oldgrad, *W1_oldgrad;

     double *W0_rate, *W1_rate;

@@ -256,10 +257,6 @@

     W1 = net->weights[1];

     best_W0 = net->best_weights[0];

     best_W1 = net->best_weights[1];

-    W0_old = malloc(W0_size*sizeof(double));

-    W1_old = malloc(W1_size*sizeof(double));

-    W0_old2 = malloc(W0_size*sizeof(double));

-    W1_old2 = malloc(W1_size*sizeof(double));

     W0_grad = malloc(W0_size*sizeof(double));

     W1_grad = malloc(W1_size*sizeof(double));

     W0_oldgrad = malloc(W0_size*sizeof(double));

@@ -268,12 +265,8 @@

     W1_rate = malloc(W1_size*sizeof(double));

     best_W0_rate = malloc(W0_size*sizeof(double));

     best_W1_rate = malloc(W1_size*sizeof(double));

-    memcpy(W0_old, W0, W0_size*sizeof(double));

-    memcpy(W0_old2, W0, W0_size*sizeof(double));

     memset(W0_grad, 0, W0_size*sizeof(double));

     memset(W0_oldgrad, 0, W0_size*sizeof(double));

-    memcpy(W1_old, W1, W1_size*sizeof(double));

-    memcpy(W1_old2, W1, W1_size*sizeof(double));

     memset(W1_grad, 0, W1_size*sizeof(double));

     memset(W1_oldgrad, 0, W1_size*sizeof(double));

@@ -378,8 +371,6 @@

             /*if (W0_rate[i] > .01)

                 W0_rate[i] = .01;*/

             W0_oldgrad[i] = W0_grad[i];

-            W0_old2[i] = W0_old[i];

-            W0_old[i] = W0[i];

             W0[i] += W0_grad[i]*W0_rate[i];

         for (i=0;i<W1_size;i++)

@@ -394,8 +385,6 @@

             if (W1_rate[i] < 1e-15)

                 W1_rate[i] = 1e-15;

             W1_oldgrad[i] = W1_grad[i];

-            W1_old2[i] = W1_old[i];

-            W1_old[i] = W1[i];

             W1[i] += W1_grad[i]*W1_rate[i];

         mean_rate /= (topo[0]+1)*topo[1] + (topo[1]+1)*topo[2];

@@ -413,12 +402,14 @@

         pthread_join(thread[i], NULL);

         fprintf (stderr, "joined %d\n", i);

-    free(W0_old);

-    free(W1_old);

     free(W0_grad);

+    free(W0_oldgrad);

     free(W1_grad);

+    free(W1_oldgrad);

     free(W0_rate);

+    free(best_W0_rate);

     free(W1_rate);

+    free(best_W1_rate);

     return best_rms;

@@ -476,6 +467,9 @@

     fprintf (stderr, "Got %d samples\n", nbSamples);

     net = mlp_init(topo, 3, inputs, outputs, nbSamples);

     rms = mlp_train_backprop(net, inputs, outputs, nbSamples, nbEpoch, 1);

+    printf ("#ifdef HAVE_CONFIG_H\n");

+    printf ("#include \"config.h\"\n");

+    printf ("#endif\n\n");

     printf ("#include \"mlp.h\"\n\n");

     printf ("/* RMS error was %f, seed was %u */\n\n", rms, seed);

     printf ("static const float weights[%d] = {\n", (topo[0]+1)*topo[1] + (topo[1]+1)*topo[2]);

@@ -482,16 +476,20 @@

     printf ("\n/* hidden layer */\n");

     for (i=0;i<(topo[0]+1)*topo[1];i++)

-        printf ("%gf, ", net->weights[0][i]);

+        printf ("%gf,", net->weights[0][i]);

         if (i%5==4)

             printf("\n");

+        else

+            printf(" ");

     printf ("\n/* output layer */\n");

     for (i=0;i<(topo[1]+1)*topo[2];i++)

-        printf ("%g, ", net->weights[1][i]);

+        printf ("%g,", net->weights[1][i]);

         if (i%5==4)

             printf("\n");

+        else

+            printf(" ");

     printf ("};\n\n");

     printf ("static const int topo[3] = {%d, %d, %d};\n\n", topo[0], topo[1], topo[2]);

--- a/src/opus_encoder.c

+++ b/src/opus_encoder.c

@@ -263,7 +263,7 @@

     st->bandwidth = OPUS_BANDWIDTH_FULLBAND;

 #ifndef DISABLE_FLOAT_API

-    tonality_analysis_init(&st->analysis);

+    tonality_analysis_init(&st->analysis, st->Fs);

 #endif

     return OPUS_OK;

@@ -577,18 +577,19 @@

 #else

 #define PCM2VAL(x) SCALEIN(x)

 #endif

-void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C)

+void downmix_float(const void *_x, opus_val32 *y, int subframe, int offset, int c1, int c2, int C)

    const float *x;

-   opus_val32 scale;

    int j;

    x = (const float *)_x;

    for (j=0;j<subframe;j++)

-      sub[j] = PCM2VAL(x[(j+offset)*C+c1]);

+      y[j] = PCM2VAL(x[(j+offset)*C+c1]);

    if (c2>-1)

       for (j=0;j<subframe;j++)

-         sub[j] += PCM2VAL(x[(j+offset)*C+c2]);

+         y[j] += PCM2VAL(x[(j+offset)*C+c2]);

    } else if (c2==-2)

       int c;

@@ -595,35 +596,24 @@

       for (c=1;c<C;c++)

          for (j=0;j<subframe;j++)

-            sub[j] += PCM2VAL(x[(j+offset)*C+c]);

+            y[j] += PCM2VAL(x[(j+offset)*C+c]);

-#ifdef FIXED_POINT

-   scale = (1<<SIG_SHIFT);

-#else

-   scale = 1.f;

-#endif

-   if (c2==-2)

-      scale /= C;

-   else if (c2>-1)

-      scale /= 2;

-   for (j=0;j<subframe;j++)

-      sub[j] *= scale;

 #endif

-void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C)

+void downmix_int(const void *_x, opus_val32 *y, int subframe, int offset, int c1, int c2, int C)

    const opus_int16 *x;

-   opus_val32 scale;

    int j;

    x = (const opus_int16 *)_x;

    for (j=0;j<subframe;j++)

-      sub[j] = x[(j+offset)*C+c1];

+      y[j] = x[(j+offset)*C+c1];

    if (c2>-1)

       for (j=0;j<subframe;j++)

-         sub[j] += x[(j+offset)*C+c2];

+         y[j] += x[(j+offset)*C+c2];

    } else if (c2==-2)

       int c;

@@ -630,20 +620,9 @@

       for (c=1;c<C;c++)

          for (j=0;j<subframe;j++)

-            sub[j] += x[(j+offset)*C+c];

+            y[j] += x[(j+offset)*C+c];

-#ifdef FIXED_POINT

-   scale = (1<<SIG_SHIFT);

-#else

-   scale = 1.f/32768;

-#endif

-   if (c2==-2)

-      scale /= C;

-   else if (c2>-1)

-      scale /= 2;

-   for (j=0;j<subframe;j++)

-      sub[j] *= scale;

 opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs)

@@ -866,7 +845,9 @@

    int silence = 0;

    opus_val32 sample_max = 0;

+#ifdef MLP_TRAINING

+   return 0;

+#endif

    sample_max = celt_maxabs16(pcm, frame_size*channels);

 #ifdef FIXED_POINT

@@ -1131,9 +1112,9 @@

 #ifndef DISABLE_FLOAT_API

     analysis_info.valid = 0;

 #ifdef FIXED_POINT

-    if (st->silk_mode.complexity >= 10 && st->Fs==48000)

+    if (st->silk_mode.complexity >= 10 && st->Fs>=16000)

 #else

-    if (st->silk_mode.complexity >= 7 && st->Fs==48000)

+    if (st->silk_mode.complexity >= 7 && st->Fs>=16000)

 #endif

        if (is_digital_silence(pcm, frame_size, st->channels, lsb_depth))

--

⑨