ref: 44fe0556826c23fc4273c55f159351577fa98a57
parent: 60d6eab63d70d6892eea86543f2c84227ec3ac9a
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Tue Jul 6 13:59:21 EDT 2021
Clean up float<->int conversions
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -474,9 +474,7 @@
__m256i ones;
int i, j;
unsigned char x[MAX_INPUTS];
- int out[MAX_OUTPUTS];
ones = _mm256_set1_epi16(1);
- for (i=0;i<rows;i++) out[i] = SCALE*_out[i];
//for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
__m256 const127 = _mm256_set1_ps(127.f);
for (i=0;i<cols;i+=8) {
@@ -496,12 +494,13 @@
}
for (i=0;i<rows;i+=8)
{
- int * restrict y;
int colblocks;
__m256i vy0;
+ __m256 vout;
colblocks = *idx++;
- y = &out[i];
- vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
+ vout = _mm256_loadu_ps(&_out[i]);
+ vout = _mm256_mul_ps(vout, _mm256_set1_ps(SCALE));
+ vy0 = _mm256_cvtps_epi32(vout);
j=0;
#if 1 /* Unrolling by 4 gives some gain, comment out if it does not. */
for (;j<colblocks-3;j+=4)
@@ -549,9 +548,10 @@
vy0 = _mm256_add_epi32(vy0, tmp);
w += 32;
}
- _mm256_storeu_si256 ((__m256i *)&y[0], vy0);
+ vout = _mm256_cvtepi32_ps(vy0);
+ vout = _mm256_mul_ps(vout, _mm256_set1_ps(SCALE_1));
+ _mm256_storeu_ps(&_out[i], vout);
}
- for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
}
--
⑨