ref: 4298f2f9e18202317d513a047ef76ee9484d7988
parent: 116bcb38fb7bddb75a52b2da52ef536aadd4f3e1
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Sat Jul 10 23:36:20 EDT 2021
Adding support for SSE2 and SSSE3
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -35,7 +35,7 @@
#include "arch.h"
-#if defined(__AVX__) || defined(__SSE4_1__)
+#if defined(__AVX__) || defined(__SSE2__)
#include "vec_avx.h"
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#include "vec_neon.h"
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -34,11 +34,28 @@
#include <immintrin.h>
-#ifndef DISABLE_DOT_PROD
+/* Use 8-bit dot products when SSSE3 or AVX2 is available, unless DISABLE_DOT_PROD is defined. */
+#if (defined(__AVX2__) || defined(__SSSE3__)) && !defined(DISABLE_DOT_PROD)
#define DOT_PROD
#define USE_SU_BIAS
+
+#elif !defined(__AVX2__) && !defined(__SSSE3__) /* warn only when the ISA is really missing, not when DISABLE_DOT_PROD was set */
+
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+
#endif
+
+#ifndef __SSE4_1__ /* _mm_floor_ps is an SSE4.1 intrinsic; emulate it on older targets. */
+static inline __m128 mm_floor_ps(__m128 x) { /* approximates floor(x): converts x - 0.5 to int32 with the current rounding mode, then back to float */
+ __m128 half = _mm_set1_ps(0.5);
+ return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(x, half))); /* NOTE(review): under round-to-nearest-even some exact integers come out one too low -- confirm callers tolerate it */
+}
+#undef _mm_floor_ps
+#define _mm_floor_ps(x) mm_floor_ps(x)
+#endif
+
+
/* If we don't have AVX available, emulate what we need with SSE up to 4.1. */
#ifndef __AVX__
@@ -135,6 +152,7 @@
static inline __m128 mm256_extractf128_ps(mm256_emu x, int i) {
return (i==0) ? x.lo : x.hi;
}
+#undef _mm256_extractf128_ps
#define _mm256_extractf128_ps(x,i) mm256_extractf128_ps(x,i)
static inline mm256_emu mm256_insertf128_ps(mm256_emu dst, __m128 src, int i) {
@@ -142,6 +160,7 @@
else dst.hi = src;
return dst;
}
+#undef _mm256_insertf128_ps
#define _mm256_insertf128_ps(dst,src,i) mm256_insertf128_ps(dst,src,i)
#endif /* __AVX__ */
--
⑨