shithub: opus

Download patch

ref: c9b7efd274807043ab926d5067362f6b7fbf352b
parent: 6f8db9392902f156125442730cf93b7de5f9aec1
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Mon Mar 11 14:00:27 EDT 2019

dump as ASCII

--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -142,7 +142,7 @@
 }
 
 
-int quantize_3stage_mbest(float *x)
+int quantize_3stage_mbest(float *x, int entry[3])
 {
     int i, k;
     int id, id2, id3;
@@ -227,9 +227,9 @@
         }
       }
     }
-    id = index3[0][0];
-    id2 = index3[0][1];
-    id3 = index3[0][2];
+    entry[0] = id = index3[0][0];
+    entry[1] = id2 = index3[0][1];
+    entry[2] = id3 = index3[0][2];
     //printf("%f ", glob_dist[0]);
     for (i=0;i<NB_BANDS_1;i++) {
         x[i] -= ceps_codebook1[id*NB_BANDS_1 + i];
@@ -292,7 +292,7 @@
 }
 
 
-int quantize_diff(float *x, float *left, float *right, float *codebook, int bits, int sign)
+int quantize_diff(float *x, float *left, float *right, float *codebook, int bits, int sign, int *entry)
 {
     int i;
     int nb_entries;
@@ -309,6 +309,7 @@
     for (i=0;i<4*NB_BANDS;i++) target[i] = x[i%NB_BANDS] - pred[i];
 
     id = find_nearest_multi(codebook, nb_entries, target, NB_BANDS, NULL, sign);
+    *entry = id;
     if (id >= 1<<bits) {
       s = -1;
       id -= (1<<bits);
@@ -391,12 +392,11 @@
 
 int double_interp_search(const float features[4][NB_FEATURES], const float *mem) {
     int i, j;
-    int id0, id1;
     int best_id=0;
     float min_dist = 1e15;
     float dist[2][3];
-    id0 = interp_search(features[0], mem, features[1], dist[0]);
-    id1 = interp_search(features[2], features[1], features[3], dist[1]);
+    interp_search(features[0], mem, features[1], dist[0]);
+    interp_search(features[2], features[1], features[3], dist[1]);
     for (i=0;i<3;i++) {
         for (j=0;j<3;j++) {
             float d;
@@ -591,7 +591,7 @@
   }
 }
 
-static void process_superframe(DenoiseState *st, FILE *ffeat) {
+static void process_superframe(DenoiseState *st, FILE *ffeat, int encode) {
   int i;
   int sub;
   int best_i;
@@ -607,6 +607,10 @@
   float center_pitch;
   int main_pitch;
   int modulation;
+  int c0_id;
+  int vq_end[3];
+  int vq_mid;
+  int corr_id = 0;
   for(sub=0;sub<8;sub++) frame_weight_sum += st->frame_weight[2+sub];
   for(sub=0;sub<8;sub++) st->frame_weight[2+sub] *= (8.f/frame_weight_sum);
   for(sub=0;sub<8;sub++) {
@@ -650,6 +654,7 @@
     best_i = pitch_prev[sub][best_i];
   }
   frame_corr /= 8;
+  if (frame_corr < 0) frame_corr = 0;
   for (sub=0;sub<8;sub++) {
     //printf("%d %f\n", best[2+sub], frame_corr);
   }
@@ -671,10 +676,12 @@
     /* Allow a relative variation of up to 1/4 over 8 sub-frames. */
     max_a = mean_pitch/32;
     best_a = MIN16(max_a, MAX16(-max_a, best_a));
-    frame_corr = 0.3875f + .175f*floor((frame_corr-.3f)/.175f);
+    corr_id = (int)floor((frame_corr-.3f)/.175f);
+    frame_corr = 0.3875f + .175f*corr_id;
   } else {
     best_a = 0;
-    frame_corr = 0.0375f + .075f*floor(frame_corr/.075f);
+    corr_id = (int)floor(frame_corr/.075f);
+    frame_corr = 0.0375f + .075f*corr_id;
   }
   //best_b = (sxx*sy - sx*sxy)/(sw*sxx - sx*sx);
   best_b = (sy - best_a*sx)/sw;
@@ -705,10 +712,11 @@
   RNN_COPY(&st->xc[0][0], &st->xc[8][0], PITCH_MAX_PERIOD);
   RNN_COPY(&st->xc[1][0], &st->xc[9][0], PITCH_MAX_PERIOD);
   //printf("%f\n", st->features[3][0]);
-  st->features[3][0] = floor(.5 + st->features[3][0]*5)/5;
-  quantize_3stage_mbest(&st->features[3][1]);
+  c0_id = (int)floor(.5 + st->features[3][0]*5);
+  st->features[3][0] = c0_id/5.;
+  quantize_3stage_mbest(&st->features[3][1], vq_end);
   /*perform_interp_relaxation(st->features, vq_mem);*/
-  quantize_diff(&st->features[1][0], vq_mem, &st->features[3][0], ceps_codebook_diff4, 11, 1);
+  quantize_diff(&st->features[1][0], vq_mem, &st->features[3][0], ceps_codebook_diff4, 11, 1, &vq_mid);
 #if 0
   interp_diff(&st->features[0][0], vq_mem, &st->features[1][0], ceps_codebook_diff2, 6, 0);
   interp_diff(&st->features[2][0], &st->features[1][0], &st->features[3][0], ceps_codebook_diff2, 6, 0);
@@ -718,8 +726,12 @@
 #endif
   //printf("\n");
   RNN_COPY(vq_mem, &st->features[3][0], NB_BANDS);
-  for (i=0;i<4;i++) {
-    fwrite(st->features[i], sizeof(float), NB_FEATURES, ffeat);
+  if (encode) {
+    fprintf(ffeat, "%d %d %d %d %d %d %d %d %d\n", c0_id, main_pitch, voiced ? modulation : -4, corr_id, vq_end[0], vq_end[1], vq_end[2], vq_mid, interp_id);
+  } else {
+    for (i=0;i<4;i++) {
+      fwrite(st->features[i], sizeof(float), NB_FEATURES, ffeat);
+    }
   }
 }
 
@@ -811,9 +823,14 @@
   DenoiseState *st;
   float noise_std=0;
   int training = -1;
+  int encode = 0;
   st = rnnoise_create();
   if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
   if (argc == 4 && strcmp(argv[1], "-test")==0) training = 0;
+  if (argc == 4 && strcmp(argv[1], "-encode")==0) {
+      training = 0;
+      encode = 1;
+  }
   if (training == -1) {
     fprintf(stderr, "usage: %s -train <speech> <features out> <pcm out>\n", argv[0]);
     fprintf(stderr, "  or   %s -test <speech> <features out>\n", argv[0]);
@@ -890,7 +907,7 @@
     st->pcount++;
     /* Running on groups of 4 frames. */
     if (st->pcount == 4) {
-      process_superframe(st, ffeat);
+      process_superframe(st, ffeat, encode);
       st->pcount = 0;
     }
 
--