ref: d1646a680a3c9e8a29261cac792d90f7c62101e1
parent: ea4d8f54c3f0bd269ebd2ee97214d84d62628d81
author: Jan Buethe <jbuethe@amazon.de>
date: Thu Oct 20 13:27:39 EDT 2022
added NFEC decoder C implementation
--- /dev/null
+++ b/dnn/nfec_dec.c
@@ -1,0 +1,118 @@
+#include "nfec_dec.h"
+
+//#define DEBUG
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+/* Fills the three GRU state buffers of the decoder from a single initial-state vector.
+ *
+ *   h              io: decoder state; dense2_state, dense4_state and dense6_state are overwritten
+ *   initial_state   i: vector passed as input to the state1, state2 and state3 dense layers
+ */
+void nfec_dec_init_states(NFECDecState *h, const float *initial_state)
+{
+    /* every layer reads the same input and writes its own state buffer */
+    compute_dense(&state1, h->dense2_state, initial_state);
+    compute_dense(&state2, h->dense4_state, initial_state);
+    compute_dense(&state3, h->dense6_state, initial_state);
+}
+
+/* Converts a quantized latent vector back to floating point by undoing the per-latent scaling.
+ *
+ *   z            o: unquantized latent vector (NFEC_STATS_NUM_LATENTS values)
+ *   zq           i: quantized latent vector (NFEC_STATS_NUM_LATENTS values)
+ *   quant_level  i: quantization level; selects one row of nfec_stats_quant_scales
+ */
+void nfec_dec_unquantize_latent_vector(float *z, const int *zq, int quant_level)
+{
+    /* the scales of one level are stored contiguously, NFEC_STATS_NUM_LATENTS per level */
+    const float *level_scales = &nfec_stats_quant_scales[quant_level * NFEC_STATS_NUM_LATENTS];
+    int k;
+
+    for (k = 0; k < NFEC_STATS_NUM_LATENTS; k++)
+    {
+        /* type conversion first, then inverse scaling */
+        z[k] = (float) zq[k] / level_scales[k];
+    }
+}
+
+void nfec_decode_qframe(
+ NFECDecState *dec_state, /* io: state buffer handle */
+ float *qframe, /* o: quadruple feature frame (four concatenated frames) */
+ const float *input /* i: latent vector */
+ )
+{
+ float buffer[DEC_DENSE1_OUT_SIZE + DEC_DENSE2_OUT_SIZE + DEC_DENSE3_OUT_SIZE + DEC_DENSE4_OUT_SIZE + DEC_DENSE5_OUT_SIZE + DEC_DENSE6_OUT_SIZE + DEC_DENSE7_OUT_SIZE + DEC_DENSE8_OUT_SIZE];
+ int output_index = 0;
+ int input_index = 0;
+#ifdef DEBUG
+ static FILE *fids[8] = {NULL};
+ int i;
+ char filename[256];
+
+ for (i=0; i < 8; i ++)
+ {
+ if (fids[i] == NULL)
+ {
+ sprintf(filename, "y%d.f32", i + 1);
+ fids[i] = fopen(filename, "wb");
+ }
+ }
+#endif
+
+ /* run encoder stack and concatenate output in buffer*/
+ compute_dense(&dec_dense1, &buffer[output_index], input);
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE1_OUT_SIZE, fids[0]);
+#endif
+ input_index = output_index;
+ output_index += DEC_DENSE1_OUT_SIZE;
+
+ compute_gru2(&dec_dense2, dec_state->dense2_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], dec_state->dense2_state, DEC_DENSE2_OUT_SIZE * sizeof(float));
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE2_OUT_SIZE, fids[1]);
+#endif
+ input_index = output_index;
+ output_index += DEC_DENSE2_OUT_SIZE;
+
+ compute_dense(&dec_dense3, &buffer[output_index], &buffer[input_index]);
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE3_OUT_SIZE, fids[2]);
+#endif
+ input_index = output_index;
+ output_index += DEC_DENSE3_OUT_SIZE;
+
+ compute_gru2(&dec_dense4, dec_state->dense4_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], dec_state->dense4_state, DEC_DENSE4_OUT_SIZE * sizeof(float));
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE4_OUT_SIZE, fids[3]);
+#endif
+ input_index = output_index;
+ output_index += DEC_DENSE4_OUT_SIZE;
+
+ compute_dense(&dec_dense5, &buffer[output_index], &buffer[input_index]);
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE5_OUT_SIZE, fids[4]);
+#endif
+ input_index = output_index;
+ output_index += DEC_DENSE5_OUT_SIZE;
+
+ compute_gru2(&dec_dense6, dec_state->dense6_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], dec_state->dense6_state, DEC_DENSE6_OUT_SIZE * sizeof(float));
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE6_OUT_SIZE, fids[5]);
+#endif
+ input_index = output_index;
+ output_index += DEC_DENSE6_OUT_SIZE;
+
+ compute_dense(&dec_dense7, &buffer[output_index], &buffer[input_index]);
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE7_OUT_SIZE, fids[6]);
+#endif
+ input_index = output_index;
+ output_index += DEC_DENSE7_OUT_SIZE;
+
+ compute_dense(&dec_dense8, &buffer[output_index], &buffer[input_index]);
+#ifdef DEBUG
+ fwrite(&buffer[output_index], sizeof(buffer[0]), DEC_DENSE8_OUT_SIZE, fids[7]);
+#endif
+ output_index += DEC_DENSE8_OUT_SIZE;
+
+ compute_dense(&dec_final, qframe, buffer);
+}
\ No newline at end of file
--- /dev/null
+++ b/dnn/nfec_dec.h
@@ -1,0 +1,17 @@
+#ifndef _NFEC_DEC_H
+#define _NFEC_DEC_H
+
+#include "nfec_dec_data.h"
+#include "nfec_stats_data.h"
+
+/* Persistent decoder state: one state vector per GRU layer (dec_dense2, dec_dense4, dec_dense6).
+ * NOTE(review): all three arrays are sized with DEC_DENSE2_STATE_SIZE, while the decoder copies
+ * DEC_DENSE4_OUT_SIZE and DEC_DENSE6_OUT_SIZE floats out of dense4_state and dense6_state.
+ * Confirm that the three GRUs share one state size, or size each array with its own macro. */
+typedef struct {
+ float dense2_state[DEC_DENSE2_STATE_SIZE];
+ float dense4_state[DEC_DENSE2_STATE_SIZE];
+ float dense6_state[DEC_DENSE2_STATE_SIZE];
+} NFECDecState;
+
+/* Initializes the GRU states in h from the given initial-state vector. */
+void nfec_dec_init_states(NFECDecState *h, const float * initial_state);
+/* Writes to z the latent vector zq converted to float and divided by the scales of quant_level. */
+void nfec_dec_unquantize_latent_vector(float *z, const int *zq, int quant_level);
+/* Decodes latent vector z into qframe (four concatenated feature frames), updating the state in h. */
+void nfec_decode_qframe(NFECDecState *h, float *qframe, const float * z);
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/nfec_dec_demo.c
@@ -1,0 +1,68 @@
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "nfec_dec.h"
+#include "nfec_enc.h"
+
+
+/* Prints the command-line synopsis to stderr and terminates the program with exit status 1.
+ * It is only called on failure paths, so the text goes to stderr rather than stdout. */
+void usage(void)
+{
+    fprintf(stderr, "nfec_dec_demo <input> <output>\n");
+    exit(1);
+}
+
+/* Decoder demo: nfec_dec_demo <input> <output>
+ *
+ * The input file holds raw floats: first the initial state (24 values), then
+ * one latent vector (80 values) per quadruple frame. For every latent vector
+ * the decoded quadruple frame (4 * NFEC_DEC_NUM_FEATURES floats) is appended to
+ * the output file. A trailing partial latent vector is ignored.
+ *
+ * Returns 0 on success and 1 on a read or write failure; argument and
+ * file-open errors terminate through usage().
+ */
+int main(int argc, char **argv)
+{
+    NFECDecState dec_state;
+    float qframe[4 * NFEC_DEC_NUM_FEATURES];
+    float latents[80];
+    float initial_state[24];
+    /* element counts are derived from the buffers so I/O sizes cannot drift from them */
+    const size_t num_state = sizeof(initial_state) / sizeof(initial_state[0]);
+    const size_t num_latents = sizeof(latents) / sizeof(latents[0]);
+    const size_t num_out = sizeof(qframe) / sizeof(qframe[0]);
+    FILE *in_fid, *out_fid;
+    int ret = 0;
+
+    memset(&dec_state, 0, sizeof(dec_state));
+
+    if (argc < 3) usage();
+
+    in_fid = fopen(argv[1], "rb");
+    if (in_fid == NULL)
+    {
+        perror("Could not open input file");
+        usage();
+    }
+
+    out_fid = fopen(argv[2], "wb");
+    if (out_fid == NULL)
+    {
+        perror("Could not open output file");
+        fclose(in_fid);
+        usage();
+    }
+
+    /* read initial state from input stream */
+    if (fread(initial_state, sizeof(float), num_state, in_fid) != num_state)
+    {
+        /* errno is only meaningful for a stream error, not for a short file */
+        if (ferror(in_fid))
+        {
+            perror("error while reading initial state");
+        }
+        else
+        {
+            fprintf(stderr, "error while reading initial state: unexpected end of file\n");
+        }
+        ret = 1;
+    }
+    else
+    {
+        /* initialize GRU states */
+        nfec_dec_init_states(&dec_state, initial_state);
+
+        /* start decoding */
+        while (fread(latents, sizeof(float), num_latents, in_fid) == num_latents)
+        {
+            nfec_decode_qframe(&dec_state, qframe, latents);
+            if (fwrite(qframe, sizeof(float), num_out, out_fid) != num_out)
+            {
+                perror("error while writing output");
+                ret = 1;
+                break;
+            }
+        }
+    }
+
+    /* both files are closed on the error paths as well */
+    fclose(in_fid);
+    fclose(out_fid);
+
+    return ret;
+}
+
+/* gcc -DDISABLE_DOT_PROD -DDISABLE_NEON nfec_dec_demo.c nfec_dec.c nnet.c nfec_dec_data.c nfec_stats_data.c kiss99.c -g -o nfec_dec_demo */
\ No newline at end of file
--- a/dnn/nfec_enc.c
+++ b/dnn/nfec_enc.c
@@ -1,6 +1,9 @@
+#include <math.h>
+
#include "nfec_enc.h"
#include "nnet.h"
#include "nfec_enc_data.h"
+#include "nfec_stats_data.h"
//#define DEBUG
@@ -8,7 +11,12 @@
#include <stdio.h>
#endif
-void nfec_encode_dframe(struct NFECEncState *enc_state, float *latents, float *initial_state, const float *input)
+void nfec_encode_dframe(
+ struct NFECEncState *enc_state, /* io: encoder state */
+ float *latents, /* o: latent vector */
+ float *initial_state, /* o: initial state */
+ const float *input /* i: double feature frame (concatenated) */
+ )
{
float buffer[ENC_DENSE1_OUT_SIZE + ENC_DENSE2_OUT_SIZE + ENC_DENSE3_OUT_SIZE + ENC_DENSE4_OUT_SIZE + ENC_DENSE5_OUT_SIZE + ENC_DENSE6_OUT_SIZE + ENC_DENSE7_OUT_SIZE + ENC_DENSE8_OUT_SIZE + GDENSE1_OUT_SIZE];
int output_index = 0;
@@ -105,4 +113,28 @@
input_index = output_index;
compute_dense(&gdense2, initial_state, &buffer[input_index]);
+}
+
+/* Quantizes a latent vector to integers for the given quantization level.
+ *
+ * Each latent is passed through the dead-zone transform, multiplied by its
+ * quantization scale and rounded to the nearest integer.
+ *
+ *   z_q          o: quantized latent vector (NFEC_LATENT_DIM values)
+ *   z            i: unquantized latent vector (NFEC_LATENT_DIM values)
+ *   quant_level  i: quantization level; selects one row of the statistics tables
+ */
+void nfec_quantize_latent_vector(int *z_q, const float *z, int quant_level)
+{
+    /* rows of the per-level tables, NFEC_LATENT_DIM entries per level */
+    const float *level_theta = &nfec_stats_dead_zone_theta[quant_level * NFEC_LATENT_DIM];
+    const float *level_scales = &nfec_stats_quant_scales[quant_level * NFEC_LATENT_DIM];
+    int k;
+
+    for (k = 0; k < NFEC_LATENT_DIM; k++)
+    {
+        /* dead-zone transform */
+        const float delta = level_theta[k] - .5f;
+        float value = z[k] - delta * tanhf(z[k] / (delta + 0.1f));
+
+        /* scaling */
+        value *= level_scales[k];
+
+        /* quantization by rounding (CAVE: no overflow check before the conversion to int) */
+        z_q[k] = (int) roundf(value);
+    }
}
\ No newline at end of file
--- a/dnn/nfec_enc.h
+++ b/dnn/nfec_enc.h
@@ -11,5 +11,6 @@
};
void nfec_encode_dframe(struct NFECEncState *enc_state, float *latents, float *initial_state, const float *input);
+void nfec_quantize_latent_vector(int *z_q, const float *z, int quant_level);
#endif
\ No newline at end of file
--- a/dnn/nfec_enc_demo.c
+++ b/dnn/nfec_enc_demo.c
@@ -16,8 +16,9 @@
float dframe[2 * NFEC_NUM_FEATURES];
float latents[80];
float initial_state[24];
+ int quantized_latents[NFEC_LATENT_DIM];
int index = 0;
- FILE *fid, *latents_fid, *states_fid;
+ FILE *fid, *latents_fid, *quantized_latents_fid, *states_fid;
memset(&enc_state, 0, sizeof(enc_state));
@@ -40,6 +41,16 @@
usage();
}
+ /* output name for the quantized latents: <argv[2]>.quantized.f32 */
+ char filename[256];
+ /* bounded formatting: an over-long argv[2] is truncated instead of overflowing filename */
+ snprintf(filename, sizeof(filename), "%s.quantized.f32", argv[2]);
+ quantized_latents_fid = fopen(filename, "wb");
+ /* check the handle that was just opened, not latents_fid */
+ if (quantized_latents_fid == NULL)
+ {
+     fprintf(stderr, "could not open latents file %s\n", filename);
+     usage();
+ }
+
states_fid = fopen(argv[3], "wb");
if (states_fid == NULL)
{
@@ -55,8 +66,10 @@
if (index == 2)
{
nfec_encode_dframe(&enc_state, latents, initial_state, dframe);
+ nfec_quantize_latent_vector(quantized_latents, latents, 0);
index = 0;
fwrite(latents, sizeof(float), NFEC_LATENT_DIM, latents_fid);
+ fwrite(quantized_latents, sizeof(int), NFEC_LATENT_DIM, quantized_latents_fid);
fwrite(initial_state, sizeof(float), GDENSE2_OUT_SIZE, states_fid);
}
}
@@ -64,6 +77,9 @@
fclose(fid);
fclose(states_fid);
fclose(latents_fid);
+ fclose(quantized_latents_fid);
+
+ return 0;
}
-/* gcc -DDISABLE_DOT_PROD -DDISABLE_NEON nfec_enc_demo.c nfec_enc.c nnet.c nfec_enc_data.c kiss99.c -g -o nfec_enc_demo */
\ No newline at end of file
+/* gcc -DDISABLE_DOT_PROD -DDISABLE_NEON nfec_enc_demo.c nfec_enc.c nnet.c nfec_enc_data.c nfec_stats_data.c kiss99.c -g -o nfec_enc_demo */
\ No newline at end of file
--- a/dnn/training_tf2/dump_nfec_model.py
+++ b/dnn/training_tf2/dump_nfec_model.py
@@ -1,6 +1,7 @@
import argparse
import os
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
parser = argparse.ArgumentParser()
@@ -59,17 +60,17 @@
r = 0.5 + 0.5 * tf.math.sigmoid(w[:, 4 * N : 5 * N]).numpy()
theta = tf.math.sigmoid(w[:, 5 * N : 6 * N]).numpy()
- printVector(f, quant_scales[:], 'nfec_stats_quant_scales')
- printVector(f, dead_zone_theta[:], 'nfec_stats_dead_zone_theta')
- printVector(f, r, 'nfec_stats_r')
- printVector(f, theta, 'nfec_stats_theta')
+ printVector(f, quant_scales[:], 'nfec_stats_quant_scales', static=False)
+ printVector(f, dead_zone_theta[:], 'nfec_stats_dead_zone_theta', static=False)
+ printVector(f, r, 'nfec_stats_r', static=False)
+ printVector(f, theta, 'nfec_stats_theta', static=False)
fh.write(
f"""
-extern float nfec_stats_quant_scales;
-extern float nfec_stats_dead_zone_theta;
-extern float nfec_stats_r;
-extern float nfec_stats_theta;
+extern const float nfec_stats_quant_scales[{levels * N}];
+extern const float nfec_stats_dead_zone_theta[{levels * N}];
+extern const float nfec_stats_r[{levels * N}];
+extern const float nfec_stats_theta[{levels * N}];
"""
)
@@ -159,6 +160,7 @@
header_fid.write(
f"""
#define NFEC_STATS_NUM_LEVELS {num_levels}
+#define NFEC_STATS_NUM_LATENTS {args.latent_dim}
"""
)
@@ -171,3 +173,60 @@
header_fid.close()
source_fid.close()
+ # decoder
+ decoder_dense_names = [
+ 'state1',
+ 'state2',
+ 'state3',
+ 'dec_dense1',
+ 'dec_dense3',
+ 'dec_dense5',
+ 'dec_dense7',
+ 'dec_dense8',
+ 'dec_final'
+ ]
+
+ decoder_gru_names = [
+ 'dec_dense2',
+ 'dec_dense4',
+ 'dec_dense6'
+ ]
+
+ source_fid = open("nfec_dec_data.c", 'w')
+ header_fid = open("nfec_dec_data.h", 'w')
+
+ start_header(header_fid, "nfec_dec_data.h")
+ start_source(source_fid, "nfec_dec_data.h", os.path.basename(args.weights))
+
+ # some global constants
+ header_fid.write(
+f"""
+#define NFEC_DEC_NUM_FEATURES 20
+
+#define NFEC_DEC_LATENT_DIM {args.latent_dim}
+
+#define NFEC_DEC_MAX_RNN_NEURONS {max_rnn_neurons}
+
+
+"""
+ )
+
+
+ # dump GRUs
+ max_rnn_neurons = max(
+ [
+ dump_gru_layer(decoder.get_layer(name), source_fid, header_fid)
+ for name in decoder_gru_names
+ ]
+ )
+
+ # dump Dense layers
+ for name in decoder_dense_names:
+ layer = decoder.get_layer(name)
+ dump_dense_layer(layer, source_fid, header_fid)
+
+ finish_header(header_fid)
+ finish_source(source_fid)
+
+ header_fid.close()
+ source_fid.close()
\ No newline at end of file
--- a/dnn/training_tf2/keraslayerdump.py
+++ b/dnn/training_tf2/keraslayerdump.py
@@ -3,13 +3,16 @@
import numpy as np
-def printVector(f, vector, name, dtype='float', dotp=False):
+def printVector(f, vector, name, dtype='float', dotp=False, static=True):
""" prints vector as one-dimensional C array """
if dotp:
vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
vector = vector.transpose((2, 0, 3, 1))
v = np.reshape(vector, (-1))
- f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
+ if static:
+ f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
+ else:
+ f.write('const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
for i in range(0, len(v)):
f.write('{}'.format(v[i]))
if (i!=len(v)-1):
--
⑨