ref: d3321008081396024ca83e9ec5a5af894776c0a0
parent: c151fc1853cbd4742943decf3f637b6b39431403
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Sat Jul 3 10:54:08 EDT 2021
Representing output pdf as binary probability tree

Saves on the MDense/softmax computation since we only need to compute 8
values instead of 256.
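The 256-way softmax is replaced by 255 sigmoid "nodes" arranged as a
heap-indexed binary tree (node i has children 2*i and 2*i+1); sampling
walks the tree from root to leaf, drawing one bit per level, so only 8
node probabilities are ever evaluated. As a minimal standalone sketch of
the traversal (the node_prob[] array holding P(bit=1) for each heap node
is hypothetical, for illustration only):

    #include <stdlib.h>

    /* Draw an 8-bit mu-law index from a binary probability tree.
     * node_prob[i] is P(bit = 1) at heap node i, i = 1..255. */
    static int sample_tree(const float *node_prob)
    {
        int b, val = 0;
        for (b = 0; b < 8; b++) {
            /* Node index at depth b: 2^b plus the bits sampled so far. */
            int i = (1 << b) | val;
            float u = (rand() + .5f) / (RAND_MAX + 1.f);
            val = (val << 1) | (u < node_prob[i]);
        }
        return val;
    }

On the training side, the dense pdf is recovered as the product of the
branch probabilities along each leaf's path (tree_to_pdf below), so the
model still trains with the usual categorical loss.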
--- a/dnn/lpcnet.c
+++ b/dnn/lpcnet.c
@@ -77,7 +77,7 @@
if (lpcnet->frame_count < 1000) lpcnet->frame_count++;
}
-void run_sample_network(NNetState *net, float *pdf, const float *condition, const float *gru_a_condition, int last_exc, int last_sig, int pred)
+int run_sample_network(NNetState *net, const float *condition, const float *gru_a_condition, int last_exc, int last_sig, int pred)
{
float gru_a_input[3*GRU_A_STATE_SIZE];
float in_b[GRU_A_STATE_SIZE+FEATURE_DENSE2_OUT_SIZE];
@@ -90,7 +90,7 @@
RNN_COPY(in_b, net->gru_a_state, GRU_A_STATE_SIZE);
RNN_COPY(&in_b[GRU_A_STATE_SIZE], condition, FEATURE_DENSE2_OUT_SIZE);
compute_gru2(&gru_b, net->gru_b_state, in_b);
- compute_mdense(&dual_fc, pdf, net->gru_b_state);
+ return sample_mdense(&dual_fc, net->gru_b_state);
}
LPCNET_EXPORT int lpcnet_get_size()
@@ -124,14 +124,11 @@
int i;
float condition[FEATURE_DENSE2_OUT_SIZE];
float lpc[LPC_ORDER];
- float pdf[DUAL_FC_OUT_SIZE];
float gru_a_condition[3*GRU_A_STATE_SIZE];
int pitch;
- float pitch_gain;
/* Matches the Python code -- the 0.1 avoids rounding issues. */
pitch = (int)floor(.1 + 50*features[36]+100);
pitch = IMIN(255, IMAX(33, pitch));
- pitch_gain = lpcnet->old_gain[FEATURES_DELAY-1];
memmove(&lpcnet->old_gain[1], &lpcnet->old_gain[0], (FEATURES_DELAY-1)*sizeof(lpcnet->old_gain[0]));
lpcnet->old_gain[0] = features[PITCH_GAIN_FEATURE];
run_frame_network(lpcnet, condition, gru_a_condition, features, pitch);
@@ -154,8 +151,7 @@
for (j=0;j<LPC_ORDER;j++) pred -= lpcnet->last_sig[j]*lpc[j];
last_sig_ulaw = lin2ulaw(lpcnet->last_sig[0]);
pred_ulaw = lin2ulaw(pred);
- run_sample_network(&lpcnet->nnet, pdf, condition, gru_a_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw);
- exc = sample_from_pdf(pdf, DUAL_FC_OUT_SIZE, MAX16(0, 1.5f*pitch_gain - .5f), PDF_FLOOR);
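+ /* The excitation is now sampled inside the network, one bit at a
+  * time, instead of from an explicit 256-entry pdf. */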
+ exc = run_sample_network(&lpcnet->nnet, condition, gru_a_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw);
pcm = pred + ulaw2lin(exc);
RNN_MOVE(&lpcnet->last_sig[1], &lpcnet->last_sig[0], LPC_ORDER-1);
lpcnet->last_sig[0] = pcm;
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -141,6 +141,46 @@
compute_activation(output, output, N, layer->activation);
}
+int sample_mdense(const MDenseLayer *layer, const float *input)
+{
+ int b, j, N, M, C, stride;
+ M = layer->nb_inputs;
+ N = layer->nb_neurons;
+ C = layer->nb_channels;
+ celt_assert(N*C <= MAX_MDENSE_TMP);
+ stride = M*C;
+
+ celt_assert(N <= DUAL_FC_OUT_SIZE);
+ int val=0;
+
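+ /* Walk the probability tree from the root down, sampling one bit per
+  * level: 8 node evaluations replace the full 256-output MDense/softmax. */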
+ for (b=0;b<8;b++)
+ {
+ int bit;
+ int i;
+ float sum1, sum2;
+
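+ /* Heap-style node index at depth b: 2^b plus the bits sampled so far. */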
+ i = (1<<b) | val;
+
+ sum1 = layer->bias[i];
+ sum2 = layer->bias[i + N];
+ for (j=0;j<M;j++) {
+ sum1 += layer->input_weights[i*stride + j]*input[j];
+ sum2 += layer->input_weights[i*stride + j + M]*input[j];
+ }
+ sum1 = layer->factor[i]*tanh_approx(sum1);
+ sum2 = layer->factor[N + i]*tanh_approx(sum2);
+ sum1 += sum2;
+ //sum1 = 1.f/(1 + exp(-sum1));
+ sum1 = sigmoid_approx(sum1);
+
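+ /* The variate is squeezed into (.025, .975), so branch probabilities
+  * near 0 or 1 become deterministic, much like the old PDF_FLOOR. */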
+ bit = .025+.95*((rand()+.5f)/(RAND_MAX+1.f)) < sum1;
+ val = (val << 1) | bit;
+ }
+ return val;
+
+}
+
+
#if 0
void compute_gru(const GRULayer *gru, float *state, const float *input)
{
@@ -362,59 +402,4 @@
{
output[i] += layer->embedding_weights[input*layer->dim + i];
}
-}
-
-int sample_from_pdf(const float *pdf, int N, float exp_boost, float pdf_floor)
-{
- int i;
- float sum, norm;
- float r;
- float tmp[DUAL_FC_OUT_SIZE];
- celt_assert(N <= DUAL_FC_OUT_SIZE);
- sum = 0;
-#ifdef SOFTMAX_HACK
- for (i=0;i<N;i++)
- {
- tmp[i] = pdf[i] * (1.f+exp_boost);
- }
- softmax(tmp, tmp, N);
- for (i=0;i<N;i++)
- {
- sum += tmp[i];
- }
-#else
- /* Decrease the temperature of the sampling. */
- for (i=0;i<N;i++)
- {
- tmp[i] = pow(pdf[i], 1.f+exp_boost);
- sum += tmp[i];
- }
-#endif
- norm = 1.f/sum;
- /* Convert tmp to a CDF while subtracting the floor */
- tmp[0] = MAX16(0, norm*tmp[0] - pdf_floor);
- for (i=1;i<N;i++)
- {
- tmp[i] = tmp[i-1] + MAX16(0, norm*tmp[i] - pdf_floor);
- }
- /* Do the sampling (from the cdf). */
- r = tmp[N-1] * ((rand()+.5f)/(RAND_MAX+1.f));
-#if 1 /* Bisection search in the CDF (faster than the equivalent linear one below). */
- {
- int start=-1;
- int end = N-1;
- while (end > start+1) {
- int mid = (start+end)>>1;
- if (r <= tmp[mid]) end = mid;
- else start = mid;
- }
- return end;
- }
-#else
- for (i=0;i<N-1;i++)
- {
- if (r <= tmp[i]) return i;
- }
- return N-1;
-#endif
}
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -97,6 +97,8 @@
void compute_mdense(const MDenseLayer *layer, float *output, const float *input);
+int sample_mdense(const MDenseLayer *layer, const float *input);
+
void compute_gru(const GRULayer *gru, float *state, const float *input);
void compute_gru2(const GRULayer *gru, float *state, const float *input);
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -183,7 +183,7 @@
name = self.name
print("printing layer " + name + " of type " + self.__class__.__name__)
weights = self.get_weights()
- printVector(f, np.transpose(weights[0], (1, 2, 0)), name + '_weights')
+ printVector(f, np.transpose(weights[0], (0, 2, 1)), name + '_weights')
printVector(f, np.transpose(weights[1], (1, 0)), name + '_bias')
printVector(f, np.transpose(weights[2], (1, 0)), name + '_factor')
activation = self.activation.__name__.upper()
--- a/dnn/training_tf2/lpcnet.py
+++ b/dnn/training_tf2/lpcnet.py
@@ -44,6 +44,15 @@
embed_size = 128
pcm_levels = 2**pcm_bits
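+# Tile each node's (1-p, p) branch pair so it aligns with the pcm
+# levels that pass through that node at one tree depth.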
+def interleave(p):
+ p2=tf.expand_dims(p, 3)
+ nb_repeats = pcm_levels//(2*p.shape[2])
+ p3 = tf.reshape(tf.repeat(tf.concat([1-p2, p2], 3), nb_repeats), (-1, 2400, pcm_levels))
+ return p3
+
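+# Leaf pdf = product over the 8 tree depths of the branch probability
+# on the root-to-leaf path; nodes are heap-indexed with the root at 1.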
+def tree_to_pdf(p):
+ return interleave(p[:,:,1:2]) * interleave(p[:,:,2:4]) * interleave(p[:,:,4:8]) * interleave(p[:,:,8:16]) * interleave(p[:,:,16:32]) * interleave(p[:,:,32:64]) * interleave(p[:,:,64:128]) * interleave(p[:,:,128:256])
+
def quant_regularizer(x):
Q = 128
Q_1 = 1./Q
@@ -185,10 +194,10 @@
kernel_constraint=constraint, kernel_regularizer=quant)
rnn_in = Concatenate()([cpcm, rep(cfeat)])
- md = MDense(pcm_levels, activation='softmax', name='dual_fc')
+ md = MDense(pcm_levels, activation='sigmoid', name='dual_fc')
gru_out1, _ = rnn(rnn_in)
gru_out2, _ = rnn2(Concatenate()([gru_out1, rep(cfeat)]))
- ulaw_prob = md(gru_out2)
+ ulaw_prob = Lambda(tree_to_pdf)(md(gru_out2))
if adaptation:
rnn.trainable=False
@@ -207,7 +216,7 @@
dec_rnn_in = Concatenate()([cpcm, dec_feat])
dec_gru_out1, state1 = rnn(dec_rnn_in, initial_state=dec_state1)
dec_gru_out2, state2 = rnn2(Concatenate()([dec_gru_out1, dec_feat]), initial_state=dec_state2)
- dec_ulaw_prob = md(dec_gru_out2)
+ dec_ulaw_prob = Lambda(tree_to_pdf)(md(dec_gru_out2))
decoder = Model([pcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])
return model, encoder, decoder
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -79,7 +79,7 @@
}
#define celt_exp(x) celt_exp2((x)*1.44269504f)
-static inline float tansig_approx(float x)
+static inline float tanh_approx(float x)
{
int i;
float y, dy;
@@ -100,7 +100,7 @@
static inline float sigmoid_approx(float x)
{
- return .5f + .5f*tansig_approx(.5f*x);
+ return .5f + .5f*tanh_approx(.5f*x);
}
static inline void softmax(float *y, const float *x, int N)
@@ -115,7 +115,7 @@
int i;
for (i=0;i<N;i++)
{
- y[i] = tansig_approx(x[i]);
+ y[i] = tanh_approx(x[i]);
}
}
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -214,6 +214,26 @@
return out[0];
}
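+/* Scalar wrappers: broadcast x, evaluate the 8-wide approximation,
+   and return lane 0. */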
+static inline float tanh_approx(float x)
+{
+ float out[8];
+ __m256 X, Y;
+ X = _mm256_set1_ps(x);
+ Y = tanh8_approx(X);
+ _mm256_storeu_ps(out, Y);
+ return out[0];
+}
+
+static inline float sigmoid_approx(float x)
+{
+ float out[8];
+ __m256 X, Y;
+ X = _mm256_set1_ps(x);
+ Y = sigmoid8_approx(X);
+ _mm256_storeu_ps(out, Y);
+ return out[0];
+}
+
static inline void softmax(float *y, const float *x, int N)
{
int i;
@@ -241,9 +261,7 @@
}
for (;i<N;i++)
{
- float ex2;
- ex2 = celt_exp(2*x[i]);
- y[i] = (ex2-1)/(ex2+1);
+ y[i] = tanh_approx(x[i]);
}
}
@@ -259,9 +277,7 @@
}
for (;i<N;i++)
{
- float ex;
- ex = celt_exp(x[i]);
- y[i] = (ex)/(ex+1);
+ y[i] = sigmoid_approx(x[i]);
}
}
#else
@@ -277,9 +293,7 @@
}
for (;i<N;i++)
{
- float ex2;
- ex2 = celt_exp(2*x[i]);
- y[i] = (ex2-1)/(ex2+1);
+ y[i] = tanh_approx(x[i]);
}
}
@@ -295,9 +309,7 @@
}
for (;i<N;i++)
{
- float ex;
- ex = celt_exp(x[i]);
- y[i] = (ex)/(ex+1);
+ y[i] = sigmoid_approx(x[i]);
}
}
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -64,6 +64,47 @@
return Y;
}
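+/* 4-wide rational approximations: odd/even polynomial ratios with the
+   output clamped to the valid range of tanh() and sigmoid(). */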
+static inline float32x4_t tanh4_approx(float32x4_t X)
+{
+ const float32x4_t N0 = vdupq_n_f32(952.52801514f);
+ const float32x4_t N1 = vdupq_n_f32(96.39235687f);
+ const float32x4_t N2 = vdupq_n_f32(0.60863042f);
+ const float32x4_t D0 = vdupq_n_f32(952.72399902f);
+ const float32x4_t D1 = vdupq_n_f32(413.36801147f);
+ const float32x4_t D2 = vdupq_n_f32(11.88600922f);
+ const float32x4_t max_out = vdupq_n_f32(1.f);
+ const float32x4_t min_out = vdupq_n_f32(-1.f);
+ float32x4_t X2, num, den;
+ X2 = vmulq_f32(X, X);
+ num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2));
+ den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2));
+ num = vmulq_f32(num, X);
+ den = vrecpeq_f32(den);
+ num = vmulq_f32(num, den);
+ return vmaxq_f32(min_out, vminq_f32(max_out, num));
+}
+
+static inline float32x4_t sigmoid4_approx(float32x4_t X)
+{
+ const float32x4_t N0 = vdupq_n_f32(238.13200378f);
+ const float32x4_t N1 = vdupq_n_f32(6.02452230f);
+ const float32x4_t N2 = vdupq_n_f32(0.00950985f);
+ const float32x4_t D0 = vdupq_n_f32(952.72399902f);
+ const float32x4_t D1 = vdupq_n_f32(103.34200287f);
+ const float32x4_t D2 = vdupq_n_f32(0.74287558f);
+ const float32x4_t half = vdupq_n_f32(0.5f);
+ const float32x4_t max_out = vdupq_n_f32(1.f);
+ const float32x4_t min_out = vdupq_n_f32(0.f);
+ float32x4_t X2, num, den;
+ X2 = vmulq_f32(X, X);
+ num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2));
+ den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2));
+ num = vmulq_f32(num, X);
+ den = vrecpeq_f32(den);
+ num = vmlaq_f32(half, num, den);
+ return vmaxq_f32(min_out, vminq_f32(max_out, num));
+}
+
static inline float celt_exp(float x)
{
float out[4];
@@ -74,6 +115,26 @@
return out[0];
}
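+/* Scalar wrappers around the 4-wide approximations above. */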
+static inline float tanh_approx(float x)
+{
+ float out[4];
+ float32x4_t X, Y;
+ X = vdupq_n_f32(x);
+ Y = tanh4_approx(X);
+ vst1q_f32(out, Y);
+ return out[0];
+}
+
+static inline float sigmoid_approx(float x)
+{
+ float out[4];
+ float32x4_t X, Y;
+ X = vdupq_n_f32(x);
+ Y = sigmoid4_approx(X);
+ vst1q_f32(out, Y);
+ return out[0];
+}
+
static inline void softmax(float *y, const float *x, int N)
{
int i;
@@ -93,13 +154,9 @@
int i;
for (i=0;i<N-3;i+=4)
{
- const float32x4_t two = vdupq_n_f32(2.f);
- const float32x4_t one = vdupq_n_f32(1.f);
float32x4_t X, Y;
X = vld1q_f32(&x[i]);
- X = vmulq_f32(X, two);
- Y = exp4_approx(X);
- Y = vmulq_f32(vsubq_f32(Y, one), vrecpeq_f32(vaddq_f32(Y, one)));
+ Y = tanh4_approx(X);
vst1q_f32(&y[i], Y);
}
for (;i<N;i++)
@@ -115,11 +172,9 @@
int i;
for (i=0;i<N-3;i+=4)
{
- const float32x4_t one = vdupq_n_f32(1.f);
float32x4_t X, Y;
X = vld1q_f32(&x[i]);
- Y = exp4_approx(X);
- Y = vmulq_f32(Y, vrecpeq_f32(vaddq_f32(Y, one)));
+ Y = sigmoid4_approx(X);
vst1q_f32(&y[i], Y);
}
for (;i<N;i++)
@@ -221,10 +276,16 @@
#define MAX_INPUTS 2048
#define MAX_OUTPUTS 8192
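+/* Use the ARMv8.2-A SDOT instruction where available; otherwise emulate
+   the signed 8-bit dot product with widening multiplies and pairwise adds. */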
+#if __ARM_FEATURE_DOTPROD
+static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b) {
+ return vdotq_s32(acc, a, b);
+}
+#else
static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b)
{
return vpadalq_s16(acc, vpaddq_s16(vmull_s8(vget_low_s8(a), vget_low_s8(b)), vmull_high_s8(a, b)));
}
+#endif
static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
{
--