ref: be42c3b514b3e69b8647bd561172c84b98b83685
parent: ef12c29f14a2821760be1a04363bc46d96dd6ede
author: jbuethe <jbuethe@amazon.de>
date: Tue Sep 27 12:29:13 EDT 2022
added fec_encoder.py and corresponding fec_packets.[chpy]
--- /dev/null
+++ b/dnn/training_tf2/fec_encoder.py
@@ -1,0 +1,176 @@
+
+import os
+import subprocess
+import argparse
+
+
+import numpy as np
+from scipy.io import wavfile
+import tensorflow as tf
+
+from rdovae import new_rdovae_model, pvq_quantize, apply_dead_zone, sq_rate_metric
+from fec_packets import write_fec_packets, read_fec_packets
+
+
+# set to True to skip command line parsing and use the hard-coded settings below
+debug = False
+
+if debug:
+    args = type('dummy', (object,),
+    {
+        'input' : 'item1.wav',
+        'weights' : 'testout/rdovae_alignment_fix_1024_120.h5',
+        'enc_lambda' : 0.0007,
+        'output' : "test_0007.fec",
+        'cond_size' : 1024,
+        'num_redundancy_frames' : 64,
+        'extra_delay' : 0,
+        'dump_data' : './dump_data'
+    })()
+    os.environ['CUDA_VISIBLE_DEVICES']=""
+else:
+    parser = argparse.ArgumentParser(description='Encode redundancy for Opus neural FEC. Designed for use with voip application and 20ms frames')
+
+    parser.add_argument('input', metavar='<input signal>', help='audio input (.wav or .raw or .pcm as int16)')
+    parser.add_argument('weights', metavar='<weights>', help='trained model file (.h5)')
+    # required positional: argparse never applies a default here, so none is passed or advertised
+    parser.add_argument('enc_lambda', metavar='<lambda>', type=float, help='lambda for controlling encoder rate (e.g. 0.0007)')
+    parser.add_argument('output', type=str, help='output file (will be extended with .fec)')
+    parser.add_argument('--dump-data', type=str, default='./dump_data', help='path to dump data executable (default ./dump_data)')
+    parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+    parser.add_argument('--num-redundancy-frames', default=64, type=int, help='number of redundancy frames per packet (default 64)')
+    parser.add_argument('--extra-delay', default=0, type=int, help="last features in packet are calculated with the decoder aligned samples, use this option to add extra delay (in samples at 16kHz)")
+    args = parser.parse_args()
+
+model, encoder, decoder = new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=1, cond_size=args.cond_size)
+model.load_weights(args.weights)
+
+lpc_order = 16
+
+## prepare input signal
+# SILK frame size is 20ms and LPCNet subframes are 10ms
+subframe_size = 160
+frame_size = 2 * subframe_size
+
+# 91 samples delay to align with SILK decoded frames
+silk_delay = 91
+
+# prepend zeros to have enough history to produce the first packet
+zero_history = (args.num_redundancy_frames - 1) * frame_size
+total_delay = silk_delay + zero_history + args.extra_delay
+
+# load signal
+if args.input.endswith(('.raw', '.pcm')):
+    signal = np.fromfile(args.input, dtype='int16')
+elif args.input.endswith('.wav'):
+    fs, signal = wavfile.read(args.input)
+    # the framing constants above assume 16 kHz and the padded file is written as raw int16
+    if fs != 16000 or signal.ndim != 1 or signal.dtype != np.int16:
+        raise ValueError(f'expected 16 kHz mono int16 wav, got {fs} Hz, shape {signal.shape}, dtype {signal.dtype}: {args.input}')
+else:
+    raise ValueError(f'unknown input signal format: {args.input}')
+
+# prepend the delay and fill up the last frame with zeros
+padded_signal_length = len(signal) + total_delay
+tail = padded_signal_length % frame_size
+right_padding = (frame_size - tail) % frame_size
+signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
+
+# write signal and call dump_data to create features
+padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
+signal.tofile(padded_signal_file)
+feature_file = os.path.splitext(args.input)[0] + '_features.f32'
+
+# argument list without a shell: paths with spaces or shell metacharacters are passed verbatim
+command = [args.dump_data, '-test', padded_signal_file, feature_file]
+r = subprocess.run(command)
+if r.returncode != 0:
+    raise RuntimeError(f"command '{' '.join(command)}' failed with exit code {r.returncode}")
+
+# each subframe in the feature file holds nb_features floats; only the first nb_used_features are kept
+nb_used_features = model.nb_used_features
+nb_features = nb_used_features + lpc_order
+
+# load features and keep an even number of subframes (two subframes per frame)
+features = np.fromfile(feature_file, dtype='float32')
+num_subframes = len(features) // nb_features
+num_subframes = 2 * (num_subframes // 2)
+num_frames = num_subframes // 2
+
+features = np.reshape(features, (1, -1, nb_features))
+features = features[:, :num_subframes, :nb_used_features]
+
+# lambda and q_id: the log below needs a strictly positive, finite lambda (this also rejects NaN)
+if not (0 < args.enc_lambda < float('inf')):
+    raise ValueError(f'lambda must be a positive finite number, got {args.enc_lambda}')
+enc_lambda = args.enc_lambda * np.ones((1, num_frames, 1))
+quant_id = np.round(10*np.log(enc_lambda/.0007)).astype('int16')
+
+# run encoder
+print("running fec encoder...")
+symbols, quant_embed_dec, gru_state_dec = encoder.predict([features, quant_id, enc_lambda])
+
+# apply quantization
+nsymbols = 80
+dead_zone = tf.math.softplus(quant_embed_dec[:, :, nsymbols : 2 * nsymbols])
+symbols = apply_dead_zone([symbols, dead_zone]).numpy()
+qsymbols = np.round(symbols)  # NOTE(review): unused below, the decoder is fed the unrounded symbols -- confirm intent
+quant_gru_state_dec = pvq_quantize(gru_state_dec, 30)
+
+# rate estimate
+hard_distr_embed = tf.math.sigmoid(quant_embed_dec[:, :, 4 * nsymbols : ]).numpy()
+rate_input = np.concatenate((symbols, hard_distr_embed, enc_lambda), axis=-1)
+rates = sq_rate_metric(None, rate_input, reduce=False).numpy()
+
+# run decoder
+input_length = args.num_redundancy_frames // 2
+offset = args.num_redundancy_frames - 1
+# without this check an empty packet list fails inside write_fec_packets (packets[0]) and in the average below
+if num_frames <= offset:
+    raise ValueError(f'input yields {num_frames} frames, need more than {offset} to build a packet')
+
+packets = []
+packet_sizes = []
+for i in range(offset, num_frames):
+    print(f"processing frame {i - offset}...")
+    window = slice(i - 2 * input_length + 1, i + 1, 2)  # every second frame, ending with frame i
+    features = decoder.predict([symbols[:, window, :], quant_embed_dec[:, :input_length, :], quant_gru_state_dec[:, i, :]])
+    packets.append(features)
+    packet_size = 8 * int((np.sum(rates[:, window]) + 7) / 8) + 64
+    packet_sizes.append(packet_size)
+
+# write packets
+packet_file = args.output + '.fec' if not args.output.endswith('.fec') else args.output
+write_fec_packets(packet_file, packets, packet_sizes)
+print(f"average redundancy rate: {int(round(sum(packet_sizes) / len(packet_sizes) * 50 / 1000))} kbps")
+
+
+if False:
+ # disabled debugging aid: change the condition above to run the checks below
+ # sanity check: read the packet file back and compare with the in-memory packets
+ packets2 = read_fec_packets(packet_file)
+
+ print(f"{len(packets)=} {len(packets2)=}")
+
+ print(f"{packets[0][0, 0]=}")
+ print(f"{packets2[0][0, 0]=}")
+
+ # sanity checks
+ # 1. concatenate features at offset 0 (last two entries along axis 1 of every packet)
+
+ test_features_batch2 = np.concatenate([packet[:,-2:, :] for packet in packets], axis=1)
+ print(f"{test_features_batch2.shape=}")
+ # widen to nb_features columns; columns beyond nb_used_features stay zero
+ test_features_full_batch2 = np.zeros((test_features_batch2.shape[1], nb_features), dtype=np.float32)
+ test_features_full_batch2[:, :nb_used_features] = test_features_batch2[0, :, :]
+
+ test_features_full_batch2.tofile('test_features_batch2.f32')
+
+ # 2. concatenate in batches of 4 (last four entries along axis 1 of every second packet)
+ test_features_batch4 = np.concatenate([packet[:,-4:, :] for packet in packets[::2]], axis=1)
+ print(f"{test_features_batch4.shape=}")
+ # widen to nb_features columns; columns beyond nb_used_features stay zero
+ test_features_full_batch4 = np.zeros((test_features_batch4.shape[1], nb_features), dtype=np.float32)
+ test_features_full_batch4[:, :nb_used_features] = test_features_batch4[0, :, :]
+
+ test_features_full_batch4.tofile('test_features_batch4.f32')
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.c
@@ -1,0 +1,115 @@
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "fec_packets.h"
+
+/* Reads the features of one subframe of one packet from a fec packet file.
+ *
+ * features must provide room for num_features floats, a count taken from the file header.
+ * Returns 0 on success and 1 on failure: the file cannot be opened, a read comes up short,
+ * the seek fails, an index is out of bounds or the header announces a negative feature count.
+ */
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index)
+{
+    int16_t version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features;
+    long offset;
+    int ret = 1;
+    FILE *fid = fopen(filename, "rb");
+
+    /* fread and fclose must not be called on a NULL stream */
+    if (fid == NULL) return 1;
+
+    /* read header */
+    if (fread(&version, sizeof(version), 1, fid) != 1) goto done;
+    if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto done;
+    if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto done;
+    if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto done;
+    if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto done;
+    if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto done;
+    if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto done;
+
+    /* check if indices are valid (negative values included) */
+    if (packet_index < 0 || packet_index >= num_packets || subframe_index < 0 || subframe_index >= subframes_per_packet)
+    {
+        fprintf(stderr, "get_fec_frame: index out of bounds\n");
+        goto done;
+    }
+    if (num_features < 0) goto done; /* would turn into a huge element count below */
+
+    /* calculate offset in file (+ 2 is for rate) */
+    offset = header_size + packet_index * packet_size + 2 + subframe_index * subframe_size;
+    if (fseek(fid, offset, SEEK_SET) != 0) goto done;
+
+    /* read features */
+    if (fread(features, sizeof(*features), (size_t) num_features, fid) == (size_t) num_features) ret = 0;
+
+done:
+    fclose(fid);
+    return ret;
+}
+
+/* Reads the rate stored with one packet of a fec packet file.
+ *
+ * The rate is the 16-bit integer at the start of each packet record.
+ * Returns the rate on success and -1 on failure: the file cannot be opened,
+ * a read comes up short, the seek fails or packet_index is out of bounds.
+ */
+int get_fec_rate(const char * const filename, int packet_index)
+{
+    int16_t version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features;
+    int16_t rate;
+    long offset;
+    int ret = -1;
+    FILE *fid = fopen(filename, "rb");
+
+    /* fread and fclose must not be called on a NULL stream */
+    if (fid == NULL) return -1;
+
+    /* read header */
+    if (fread(&version, sizeof(version), 1, fid) != 1) goto done;
+    if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto done;
+    if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto done;
+    if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto done;
+    if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto done;
+    if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto done;
+    if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto done;
+
+    /* check if index is valid (negative values included) */
+    if (packet_index < 0 || packet_index >= num_packets)
+    {
+        fprintf(stderr, "get_fec_rate: index out of bounds\n");
+        goto done;
+    }
+
+    /* calculate offset in file (the rate is the first field of the packet) */
+    offset = header_size + packet_index * packet_size;
+    if (fseek(fid, offset, SEEK_SET) != 0) goto done;
+
+    /* read rate */
+    if (fread(&rate, sizeof(rate), 1, fid) == 1) ret = (int) rate;
+
+done:
+    fclose(fid);
+    return ret;
+}
+
+#if 0
+/* Disabled smoke test: prints subframe 127 of packet 0 of ../test.fec, then the rate of packet 0. */
+int main()
+{
+    float frame[20];
+    int k = 0;
+
+    /* a non-zero result signals failure */
+    if (get_fec_frame("../test.fec", frame, 0, 127) != 0)
+        return 1;
+
+    while (k < 20)
+    {
+        printf("%d %f\n", k, frame[k]);
+        k++;
+    }
+
+    printf("rate: %d\n", get_fec_rate("../test.fec", 0));
+}
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.h
@@ -1,0 +1,7 @@
+#ifndef FEC_PACKETS_H
+#define FEC_PACKETS_H
+/* copies one subframe of features of one packet into features; returns 0 on success, 1 on failure */
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index);
+/* returns the rate stored with the given packet, or -1 on failure */
+int get_fec_rate(const char * const filename, int packet_index);
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.py
@@ -1,0 +1,79 @@
+import numpy as np
+
+
+
+def write_fec_packets(filename, packets, rates=None):
+    """ writes packets in binary format
+
+    The file starts with a header of seven int16 values (version, header_size,
+    num_packets, packet_size, subframe_size, subframes_per_packet,
+    num_features). Each packet follows as an int16 rate and the features of the
+    packet as float32 with the second to last axis reversed.
+
+    filename: path of the file to write
+    packets:  non-empty sequence of arrays, sizes are taken from the last two axes of the first one
+    rates:    optional sequence with one rate per packet, 0 is stored when omitted
+
+    Raises ValueError if packets is empty or a header field does not fit into int16.
+    """
+
+    if len(packets) == 0:
+        raise ValueError('write_fec_packets: no packets to write')
+
+    # derive some sizes
+    num_packets = len(packets)
+    subframes_per_packet = packets[0].shape[-2]
+    num_features = packets[0].shape[-1]
+
+    # size of float is 4, two extra bytes per packet for the rate
+    subframe_size = num_features * 4
+    packet_size = subframe_size * subframes_per_packet + 2
+
+    # header: version, header_size (seven int16 fields = 14 bytes), then the sizes derived above
+    header = [1, 14, num_packets, packet_size, subframe_size, subframes_per_packet, num_features]
+    if max(header) > 32767:
+        raise ValueError(f'write_fec_packets: header field exceeds int16 range: {header}')
+
+    with open(filename, 'wb') as f:
+        f.write(np.array(header, dtype=np.int16).tobytes())
+
+        # packets
+        for i, packet in enumerate(packets):
+            rate = 0 if rates is None else rates[i]
+            f.write(np.int16(rate).tobytes())
+            # features are stored with the second to last axis reversed
+            f.write(np.flip(packet, axis=-2).astype(np.float32).tobytes())
+
+
+def read_fec_packets(filename):
+    """ reads packets from binary format
+
+    Parses the seven int16 header fields, then reads each packet as an int16
+    rate followed by float32 features and reverses the second to last axis of
+    the features.
+
+    Returns a list with one array of shape (1, subframes_per_packet, num_features)
+    per packet; the rates are parsed but not returned.
+
+    Raises ValueError if the file ends before the header or a packet is complete.
+    """
+
+    with open(filename, 'rb') as f:
+        # header: version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features
+        header = np.frombuffer(f.read(14), dtype=np.int16)
+        if header.size != 7:
+            raise ValueError(f'{filename}: truncated header')
+        _, header_size, num_packets, _, subframe_size, subframes_per_packet, num_features = header.tolist()
+
+        # honour the announced header size instead of assuming the packets start after the 14 bytes read
+        f.seek(header_size)
+
+        packets = []
+        for _ in range(num_packets):
+            rate = np.frombuffer(f.read(2), dtype=np.int16).item()  # parsed to step over it, not returned
+            features = np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32)
+            features = np.reshape(features, (1, subframes_per_packet, num_features))
+            # undo the axis reversal applied when the features were stored
+            packets.append(np.flip(features, axis=-2))
+
+    return packets
\ No newline at end of file
--- a/dnn/training_tf2/rdovae.py
+++ b/dnn/training_tf2/rdovae.py
@@ -135,7 +135,7 @@
rate = lambda_val*K.sum(rate, axis=-1)
return K.mean(rate)
-def sq_rate_metric(y_true,y_pred):
+def sq_rate_metric(y_true,y_pred, reduce=True):
lambda_val = y_pred[:,:,-1]
y_pred = y_pred[:,:,:-1]
log2_e = 1.4427
@@ -149,7 +149,9 @@
y0 = K.maximum(0., 1. - K.abs(y_pred))**2
rate = -y0*safelog2(p0*r**K.abs(y_pred)) - (1-y0)*safelog2(.5*(1-p0)*(1-r)*r**(K.abs(y_pred)-1))
rate = K.sum(rate, axis=-1)
- return K.mean(rate)
+ if reduce:
+ rate = K.mean(rate)
+ return rate
def pvq_quant_search(x, k):
x = x/tf.reduce_sum(tf.abs(x), axis=-1, keepdims=True)
--
⑨