ref: 843121b356685ff5a8c40211951f392f77f689cc
parent: 0619d0848520ce2cd45ec3c9fc3a2c9743b4608e
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Tue Feb 12 14:33:26 EST 2019
Fixes analysis buffering for silence and complexity changes The previous code would go out of sync in those cases.
--- a/src/analysis.c
+++ b/src/analysis.c
@@ -249,6 +249,15 @@
if (curr_lookahead<0)
curr_lookahead += DETECT_SIZE;
+ tonal->read_subframe += len/(tonal->Fs/400);
+ while (tonal->read_subframe>=8)
+ {
+ tonal->read_subframe -= 8;
+ tonal->read_pos++;
+ }
+ if (tonal->read_pos>=DETECT_SIZE)
+ tonal->read_pos-=DETECT_SIZE;
+
/* On long frames, look at the second analysis window rather than the first. */
if (len > tonal->Fs/50 && pos != tonal->write_pos)
{
@@ -262,6 +271,8 @@
pos = DETECT_SIZE-1;
pos0 = pos;
OPUS_COPY(info_out, &tonal->info[pos], 1);
+ if (!info_out->valid)
+ return;
tonality_max = tonality_avg = info_out->tonality;
tonality_count = 1;
/* Look at the neighbouring frames and pick largest bandwidth found (to be safe). */
@@ -393,14 +404,6 @@
info_out->music_prob_max = prob_max;
/* printf("%f %f %f %f %f\n", prob_min, prob_max, prob_avg/prob_count, vad_prob, info_out->music_prob); */
- tonal->read_subframe += len/(tonal->Fs/400);
- while (tonal->read_subframe>=8)
- {
- tonal->read_subframe -= 8;
- tonal->read_pos++;
- }
- if (tonal->read_pos>=DETECT_SIZE)
- tonal->read_pos-=DETECT_SIZE;
}
static const float std_feature_bias[9] = {
@@ -420,6 +423,24 @@
#define SCALE_ENER(e) (e)
#endif
+#ifdef FIXED_POINT
+static int is_digital_silence32(const opus_val32* pcm, int frame_size, int channels, int lsb_depth)
+{
+ int silence = 0;
+ opus_val32 sample_max = 0;
+#ifdef MLP_TRAINING
+ return 0;
+#endif
+ sample_max = celt_maxabs32(pcm, frame_size*channels);
+
+ silence = (sample_max == 0);
+ (void)lsb_depth;
+ return silence;
+}
+#else
+#define is_digital_silence32(pcm, frame_size, channels, lsb_depth) is_digital_silence(pcm, frame_size, channels, lsb_depth)
+#endif
+
static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt_mode, const void *x, int len, int offset, int c1, int c2, int C, int lsb_depth, downmix_func downmix)
{
int i, b;
@@ -464,8 +485,10 @@
float layer_out[MAX_NEURONS];
float below_max_pitch;
float above_max_pitch;
+ int is_silence;
SAVE_STACK;
+ tonal->initialized = 1;
alpha = 1.f/IMIN(10, 1+tonal->count);
alphaE = 1.f/IMIN(25, 1+tonal->count);
/* Noise floor related decay for bandwidth detection: -2.2 dB/second */
@@ -500,6 +523,8 @@
if (tonal->write_pos>=DETECT_SIZE)
tonal->write_pos-=DETECT_SIZE;
+ is_silence = is_digital_silence32(tonal->inmem, ANALYSIS_BUF_SIZE, 1, lsb_depth);
+
ALLOC(in, 480, kiss_fft_cpx);
ALLOC(out, 480, kiss_fft_cpx);
ALLOC(tonality, 240, float);
@@ -518,6 +543,12 @@
&tonal->inmem[240], tonal->downmix_state, remaining,
offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C, tonal->Fs);
tonal->mem_fill = 240 + remaining;
+ if (is_silence)
+ {
+ info->valid = 0;
+ RESTORE_STACK;
+ return;
+ }
opus_fft(kfft, in, out, tonal->arch);
#ifndef FIXED_POINT
/* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
@@ -938,7 +969,6 @@
analysis->analysis_offset -= frame_size;
}
- analysis_info->valid = 0;
tonality_get_info(analysis, analysis_info, frame_size);
}
--- a/src/analysis.h
+++ b/src/analysis.h
@@ -74,6 +74,7 @@
int read_pos;
int read_subframe;
float hp_ener_accum;
+ int initialized;
float rnn_state[MAX_NEURONS];
opus_val32 downmix_state[3];
AnalysisInfo info[DETECT_SIZE];
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -837,7 +837,7 @@
#ifndef DISABLE_FLOAT_API
-static int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth)
+int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth)
{
int silence = 0;
opus_val32 sample_max = 0;
@@ -1140,21 +1140,19 @@
if (st->silk_mode.complexity >= 7 && st->Fs>=16000)
#endif
{
- if (is_digital_silence(pcm, frame_size, st->channels, lsb_depth))
- {
- is_silence = 1;
- } else {
- analysis_read_pos_bak = st->analysis.read_pos;
- analysis_read_subframe_bak = st->analysis.read_subframe;
- run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size,
- c1, c2, analysis_channels, st->Fs,
- lsb_depth, downmix, &analysis_info);
- }
+ is_silence = is_digital_silence(pcm, frame_size, st->channels, lsb_depth);
+ analysis_read_pos_bak = st->analysis.read_pos;
+ analysis_read_subframe_bak = st->analysis.read_subframe;
+ run_analysis(&st->analysis, celt_mode, analysis_pcm, analysis_size, frame_size,
+ c1, c2, analysis_channels, st->Fs,
+ lsb_depth, downmix, &analysis_info);
/* Track the peak signal energy */
if (!is_silence && analysis_info.activity_probability > DTX_ACTIVITY_THRESHOLD)
st->peak_signal_energy = MAX32(MULT16_32_Q15(QCONST16(0.999f, 15), st->peak_signal_energy),
compute_frame_energy(pcm, frame_size, st->channels, st->arch));
+ } else if (st->analysis.initialized) {
+ tonality_analysis_reset(&st->analysis);
}
#else
(void)analysis_pcm;
--- a/src/opus_private.h
+++ b/src/opus_private.h
@@ -135,6 +135,7 @@
typedef void (*downmix_func)(const void *, opus_val32 *, int, int, int, int, int);
void downmix_float(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
+int is_digital_silence(const opus_val16* pcm, int frame_size, int channels, int lsb_depth);
int encode_size(int size, unsigned char *data);