ref: bce779886de72547db991c5d4dec1ae1fc1a7a80
parent: 11736ca9e3654397bfe472e51f2ec96433764efe
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Fri Dec 25 21:15:57 EST 2020
WIP: signed*unsigned arithmetic
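
sparse_sgemv_accum8x4() now takes the number of input columns so the
whole input vector can be quantized up front, and vec.h becomes the
single place that selects vec_avx.h, vec_neon.h or the plain C fallback
and defines the qweight type (include guards are added to vec.h,
vec_avx.h and tansig_table.h along the way). Under DOT_PROD there are
now two C variants of the kernel: the existing signed one and a
USE_SU_BIAS one that offsets the quantized inputs to unsigned
(127 + round(127*x)), presumably so the inner loop can later map onto
signed*unsigned multiply-accumulate instructions. The constant term
that the unsigned offset adds to every output is folded into a second
bias vector, subias, which dump_lpcnet.py now emits and SparseGRULayer
carries; the sparse GRU code in nnet.c loads it instead of the regular
bias when USE_SU_BIAS is defined.

Rough numpy sketch (not part of the patch; variable names are made up)
of why subtracting sum(clip(W, -1, 1)) from the bias compensates for
the unsigned input offset:

    import numpy as np
    rng = np.random.default_rng(0)
    W = np.clip(rng.standard_normal((16, 8)), -1, 1)  # float weights, already in [-1, 1]
    x = np.clip(rng.standard_normal(16), -1, 1)       # input activations in [-1, 1]
    bias = rng.standard_normal(8)
    qW = np.round(128*W)                              # quantized weights (scale 128)
    s = np.round(127*x)                               # signed quantized inputs
    u = 127 + s                                       # unsigned quantized inputs, 0..254
    signed_out = bias + (qW.T @ s)/(128.*127.)
    subias = bias - np.sum(W, axis=0)                 # same correction as dump_lpcnet.py
    unsigned_out = subias + (qW.T @ u)/(128.*127.)
    # The offset contributes 127*sum(qW)/(128*127) = sum(qW)/128 ~ sum(W),
    # so the two results agree up to weight-quantization error.
    print(np.max(np.abs(signed_out - unsigned_out)))
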
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -39,17 +39,14 @@
#include "nnet.h"
#include "nnet_data.h"
-#define SOFTMAX_HACK
-
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
+#ifdef NO_OPTIMIZATIONS
#warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
#endif
+
+#define SOFTMAX_HACK
+
+
static OPUS_INLINE float relu(float x)
{
return x < 0 ? 0 : x;
@@ -294,14 +291,19 @@
celt_assert(input != state);
celt_assert(gru->reset_after);
RNN_COPY(zrh, input, 3*N);
+#ifdef USE_SU_BIAS
for (i=0;i<3*N;i++)
- recur[i] = gru->bias[3*N + i];
+ recur[i] = gru->subias[3*N + i];
+#else
+ for (i=0;i<3*N;i++)
+ recur[i] = gru->bias[3*N + i];
+#endif
for (k=0;k<3;k++)
{
for (i=0;i<N;i++)
recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
}
- sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+ sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, 3*N, gru->idx, state);
for (i=0;i<2*N;i++)
zrh[i] += recur[i];
compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -28,6 +28,8 @@
#ifndef _NNET_H_
#define _NNET_H_
+#include "vec.h"
+
#define ACTIVATION_LINEAR 0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_TANH 2
@@ -34,12 +36,6 @@
#define ACTIVATION_RELU 3
#define ACTIVATION_SOFTMAX 4
-#ifdef DOT_PROD
-typedef signed char qweight;
-#else
-typedef float qweight;
-#endif
-
typedef struct {
const float *bias;
const float *input_weights;
@@ -70,6 +66,7 @@
typedef struct {
const float *bias;
+ const float *subias;
const float *diag_weights;
const qweight *recurrent_weights;
const int *idx;
--- a/dnn/tansig_table.h
+++ b/dnn/tansig_table.h
@@ -1,5 +1,8 @@
/* This file is auto-generated by gen_tables */
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
static const float tansig_table[201] = {
0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f,
};
+
+#endif /*TANSIG_TABLE_H*/
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -102,6 +102,9 @@
weights = self.get_weights()
printSparseVector(f, weights[1], name + '_recurrent_weights')
printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[1,:] = subias[1,:] - np.sum(np.clip(weights[1], -1, 1),axis=0)
+ printVector(f, subias, name + '_subias')
if hasattr(self, 'activation'):
activation = self.activation.__name__.upper()
else:
@@ -112,8 +115,8 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
- .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+ .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('extern const SparseGRULayer {};\n\n'.format(name));
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -26,11 +26,33 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#include "nnet.h"
+#ifndef VEC_H
+#define VEC_H
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
+#else
+
+//#define USE_SU_BIAS
+
+#define NO_OPTIMIZATIONS
+
/* No AVX2/FMA support */
#ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
{
int integer;
float frac;
@@ -50,7 +72,7 @@
}
#define celt_exp(x) celt_exp2((x)*1.44269504f)
-static float tansig_approx(float x)
+static inline float tansig_approx(float x)
{
int i;
float y, dy;
@@ -69,12 +91,12 @@
return sign*y;
}
-static OPUS_INLINE float sigmoid_approx(float x)
+static inline float sigmoid_approx(float x)
{
return .5f + .5f*tansig_approx(.5f*x);
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -81,7 +103,7 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -90,7 +112,7 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -99,7 +121,7 @@
}
}
#endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -132,7 +154,7 @@
}
}
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -167,42 +189,90 @@
}
#ifdef DOT_PROD
+
+#define MAX_INPUTS (2048)
+
+
+#define SCALE (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+
+#ifdef USE_SU_BIAS
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
{
int i, j;
+ unsigned x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
for (i=0;i<rows;i+=8)
{
- int cols;
- cols = *idx++;
- for (j=0;j<cols;j++)
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
{
int pos;
float * restrict y;
int xj0, xj1, xj2, xj3;
pos = 4 * (*idx++);
- xj0 = floor(.5+127*x[pos+0]);
- xj1 = floor(.5+127*x[pos+1]);
- xj2 = floor(.5+127*x[pos+2]);
- xj3 = floor(.5+127*x[pos+3]);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
y = &out[i];
- y[0] += SCALE_1*(w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
- y[1] += SCALE_1*(w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
- y[2] += SCALE_1*(w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
- y[3] += SCALE_1*(w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
- y[4] += SCALE_1*(w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
- y[5] += SCALE_1*(w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
- y[6] += SCALE_1*(w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
- y[7] += SCALE_1*(w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
w += 32;
}
}
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
}
+#else /*USE_SU_BIAS*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif /*USE_SU_BIAS*/
-#else
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+#else /*DOT_PROD*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
{
int i, j;
+ (void)ignore;
for (i=0;i<rows;i+=8)
{
int cols;
@@ -257,4 +327,8 @@
}
}
}
-#endif
+#endif /*DOT_PROD*/
+
+
+#endif /*no optimizations*/
+#endif /*VEC_H*/
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -29,6 +29,9 @@
AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
*/
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
#include <immintrin.h>
#ifdef __AVX2__
@@ -246,9 +249,10 @@
}
#else
-static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
{
int i, j;
+ (void)ignore;
for (i=0;i<rows;i+=8)
{
float * restrict y;
@@ -286,3 +290,4 @@
}
#endif
+#endif /*VEC_AVX_H*/
--