ref: 714380e71b969ba6b3eff3e1a37ca16792aba68d
parent: 006556036a4838a0243690354a0f375882d4c49e
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Tue Jul 6 14:26:58 EDT 2021
More manual unrolling
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -419,7 +419,40 @@
__m256i vy0;
y = &out[i];
vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
- for (j=0;j<cols;j+=4)
+ j=0;
+#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
+ for (;j<cols-12;j+=16)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ vxj = _mm256_set1_epi32(*(int*)&x[j]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[j+4]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[j+8]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ vxj = _mm256_set1_epi32(*(int*)&x[j+12]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+#endif
+ for (;j<cols;j+=4)
{
__m256i tmp;
__m256i vxj;
--
⑨