ref: 05f4851dcded87b7edec0e309319d8727fc79377
parent: 1956467d794ba7efe053e260ab60ccd516890b3c
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Fri Nov 30 05:32:04 EST 2018
Making the code work even without AVX2/FMA
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -282,6 +282,10 @@
}
#else
+
+#warning Compiling without any vectorization. This code will be very slow
+#warning Try adding -mavx2 -mfma
+
static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
@@ -311,6 +315,40 @@
y[13] += w[13]*xj;
y[14] += w[14]*xj;
y[15] += w[15]*xj;
+ }
+ }
+}
+
+/* Sparse matrix-vector multiply-accumulate: out += W*x for a block-sparse
+   matrix W stored in 16-row blocks.  For each block of 16 output rows,
+   idx first gives the number of retained columns, then that many column
+   indices into x; w holds the corresponding 16 weights per retained
+   column, blocks laid out contiguously.  Assumes rows is a multiple of
+   16 (TODO confirm against callers).  Scalar fallback used when no
+   AVX2/FMA vectorization is available. */
+static void sparse_gemm_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      int cols;
+      /* Per-block header: number of non-zero (retained) columns. */
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         float * restrict y;
+         float xj;
+         /* Gather the input element for this retained column. */
+         xj = x[*idx++];
+         y = &out[i];
+         /* Unrolled accumulation over the 16 rows of the block. */
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+         /* Advance to the weights of the next retained column. */
+         w += 16;
       }
    }
 }
--
⑨