ref: c045702e51e04006f81f6f473ab5326eb24aace6
parent: 73a05f55c75505b3ef05f931dbb054002ba0e2ba
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Wed Dec 23 21:46:19 EST 2020
Add non-dot-product AVX code
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -218,3 +218,45 @@
}
}
+#ifdef DOT_PROD
+#else
+static void sparse_sgemv_accum8x4(float *out, const float *weights, int rows, const int *idx, const float *x)
+{ /* Adds a block-sparse matrix-vector product into out (out += W*x), using 8-row by 4-column weight blocks. */
+ int i, j;
+ for (i=0;i<rows;i+=8) /* one pass per band of 8 consecutive outputs, out[i..i+7] */
+ {
+ float * restrict y;
+ int cols; /* number of 8x4 weight blocks listed in idx for this band */
+ __m256 vy0; /* running sums for the 8 outputs of this band */
+ y = &out[i]; /* NOTE(review): 8 floats are loaded/stored per band even if fewer than 8 rows remain; presumably rows is a multiple of 8 - confirm with callers */
+ vy0 = _mm256_loadu_ps(&y[0]); /* start from the current contents of out so the result accumulates */
+ cols = *idx++; /* idx layout per band: a block count, followed by that many block indices */
+ for (j=0;j<cols;j++)
+ {
+ int id; /* block index: this block multiplies x[4*id] .. x[4*id+3] */
+ __m256 vxj; /* one element of x copied into all 8 lanes */
+ __m256 vw; /* 8 weights, one per output lane, for a single input element */
+ id = *idx++;
+ vxj = _mm256_broadcast_ss(&x[4*id]); /* 1st of the block's 4 inputs, paired with weights[0..7] */
+ vw = _mm256_loadu_ps(&weights[0]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0); /* vy0 = vw*vxj + vy0, lane by lane */
+
+ vxj = _mm256_broadcast_ss(&x[4*id+1]); /* 2nd input, paired with weights[8..15] */
+ vw = _mm256_loadu_ps(&weights[8]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+2]); /* 3rd input, paired with weights[16..23] */
+ vw = _mm256_loadu_ps(&weights[16]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+3]); /* 4th input, paired with weights[24..31] */
+ vw = _mm256_loadu_ps(&weights[24]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ weights += 32; /* a block is 4 groups of 8 floats stored back to back; step to the next block */
+ }
+ _mm256_storeu_ps (&y[0], vy0); /* write the 8 updated sums back to out[i..i+7] */
+ }
+}
+#endif
+
--
⑨