ref: 05f4851dcded87b7edec0e309319d8727fc79377
parent: 1956467d794ba7efe053e260ab60ccd516890b3c
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Fri Nov 30 05:32:04 EST 2018
Making the code work even without AVX2/FMA
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -282,6 +282,10 @@
}
#else
+
+#warning Compiling without any vectorization. This code will be very slow
+#warning Try adding -mavx2 -mfma
+
static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
@@ -311,6 +315,40 @@
y[13] += w[13]*xj;
y[14] += w[14]*xj;
y[15] += w[15]*xj;
+ }
+ }
+}
+
+/* Sparse matrix-vector multiply-accumulate: out += W*x for a block-sparse
+   matrix W stored in 16-row blocks.  For each block of 16 output rows,
+   idx first gives the number of retained columns, then that many column
+   indices into x; w holds the corresponding 16 weights per retained
+   column, blocks laid out contiguously.  Assumes rows is a multiple of
+   16 (TODO confirm against callers).  Scalar fallback used when no
+   AVX2/FMA vectorization is available. */
+static void sparse_gemm_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      int cols;
+      /* Per-block header: number of non-zero (retained) columns. */
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         float * restrict y;
+         float xj;
+         /* Gather the input element for this retained column. */
+         xj = x[*idx++];
+         y = &out[i];
+         /* Unrolled accumulation over the 16 rows of the block. */
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+         /* Advance to the weights of the next retained column. */
+         w += 16;
       }
    }
 }
--
⑨