shithub: opus

Download patch

ref: 3510404ad5a5287148d89ee523aa1edb7ef8e257
parent: 17bb81934ba2bf3123500c10b8aac6a8f2dfc11a
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Sat Jul 1 10:15:26 EDT 2023

Properly compute and use the DRED offset field

Also, don't code DRED that's redundant with the main packet

--- a/silk/dred_encoder.c
+++ b/silk/dred_encoder.c
@@ -174,9 +174,13 @@
     }
 }
 
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size)
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay)
 {
+    int curr_offset16k;
     int frame_size16k = frame_size * 16000 / enc->Fs;
+    curr_offset16k = 40 + extra_delay*16000/enc->Fs - enc->input_buffer_fill;
+    enc->dred_offset = (int)floor((curr_offset16k+20.f)/40.f);
+    enc->latent_offset = 0;
     while (frame_size16k > 0) {
         int process_size16k;
         int process_size;
@@ -186,9 +190,17 @@
         enc->input_buffer_fill += process_size16k;
         if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE)
         {
-          dred_process_frame(enc);
-          enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
-          OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
+            curr_offset16k += 320;
+            dred_process_frame(enc);
+            enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
+            OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
+            /* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */
+            if (enc->dred_offset < 6) {
+                enc->dred_offset += 8;
+                OPUS_COPY(enc->initial_state, enc->state_buffer, 24);
+            } else {
+                enc->latent_offset++;
+            }
         }
 
         pcm += process_size;
@@ -207,21 +219,19 @@
     int i;
     int offset;
     int ec_buffer_fill;
-    int dred_offset;
     int q0;
     int dQ;
 
     /* entropy coding of state and latents */
     ec_enc_init(&ec_encoder, buf, max_bytes);
-    dred_offset = 8; /* 20 ms */
     q0 = DRED_ENC_Q0;
     dQ = 3;
-    ec_enc_uint(&ec_encoder, dred_offset, 32);
+    ec_enc_uint(&ec_encoder, enc->dred_offset, 32);
     ec_enc_uint(&ec_encoder, q0, 16);
     ec_enc_uint(&ec_encoder, dQ, 8);
-    dred_encode_state(&ec_encoder, enc->state_buffer);
+    dred_encode_state(&ec_encoder, enc->initial_state);
 
-    for (i = 0; i < IMIN(2*max_chunks, enc->latents_buffer_fill-1); i += 2)
+    for (i = 0; i < IMIN(2*max_chunks, enc->latents_buffer_fill-enc->latent_offset-1); i += 2)
     {
         ec_enc ec_bak;
         ec_bak = ec_encoder;
@@ -231,7 +241,7 @@
 
         dred_encode_latents(
             &ec_encoder,
-            enc->latents_buffer + i * DRED_LATENT_DIM,
+            enc->latents_buffer + (i+enc->latent_offset) * DRED_LATENT_DIM,
             quant_scales + offset,
             dead_zone + offset,
             r + offset,
--- a/silk/dred_encoder.h
+++ b/silk/dred_encoder.h
@@ -46,9 +46,12 @@
 #define DREDENC_RESET_START input_buffer
     float input_buffer[2*DRED_DFRAME_SIZE];
     int input_buffer_fill;
+    int dred_offset;
+    int latent_offset;
     float latents_buffer[DRED_MAX_FRAMES * DRED_LATENT_DIM];
     int latents_buffer_fill;
     float state_buffer[24];
+    float initial_state[24];
     float resample_mem[RESAMPLING_ORDER + 1];
     LPCNetEncState lpcnet_enc_state;
     RDOVAEEncState rdovae_enc;
@@ -60,7 +63,7 @@
 
 void dred_deinit_encoder(DREDEnc *enc);
 
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size);
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay);
 
 int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);
 
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -665,15 +665,20 @@
       return OPUS_BAD_ARG;
 #ifdef ENABLE_NEURAL_FEC
    if (dred != NULL && dred->process_stage == 2) {
+      int F10;
       int features_per_frame;
       int needed_feature_frames;
+      int init_frames;
       lpcnet_plc_fec_clear(&st->lpcnet);
-      features_per_frame = IMAX(1, frame_size/(st->Fs/100));
-      needed_feature_frames = features_per_frame;
+      F10 = st->Fs/100;
       /* if blend==0, the last PLC call was "update" and we need to feed two extra 10-ms frames. */
-      if (st->lpcnet.blend == 0) needed_feature_frames+=2;
+      init_frames = (st->lpcnet.blend == 0) ? 2 : 0;
+      features_per_frame = IMAX(1, frame_size/F10);
+      needed_feature_frames = init_frames + features_per_frame;
       for (i=0;i<needed_feature_frames;i++) {
-         int feature_offset = (needed_feature_frames-i-1 + (dred_offset/(st->Fs/100)-1));
+         int feature_offset;
+         /* We floor instead of rounding because the 5-ms overlap compensates for the missing 0.5 rounding offset. */
+         feature_offset = init_frames - i - 2 + (int)floor(((float)dred_offset + dred->dred_offset*F10/4)/F10);
          if (feature_offset <= 4*dred->nb_latents-1 && feature_offset >= 0) {
            lpcnet_plc_fec_add(&st->lpcnet, dred->fec_features+feature_offset*DRED_NUM_FEATURES);
          } else {
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -1685,7 +1685,7 @@
 #ifdef ENABLE_NEURAL_FEC
     if ( st->dred_duration > 0 ) {
         /* DRED Encoder */
-        dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size );
+        dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer );
     } else {
         st->dred_encoder.latents_buffer_fill = 0;
     }
--