ref: fd028ff8d6e2136712b78636cb1861a491d57908
parent: 6e2f0b72b5bf1e578f888e70442288fed3dfd224
author: shuanzhu <shuanzhu@cisco.com>
date: Wed Jul 2 13:42:26 EDT 2025
small fix for supporting neon for dnn in Windows ARM64 Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/cmake/OpusConfig.cmake
+++ b/cmake/OpusConfig.cmake
@@ -56,7 +56,7 @@
else()
set(OPUS_CPU_X86 1)
endif()
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64|ARM)")
set(OPUS_CPU_ARM 1)
endif()
--- a/cmake/OpusFunctions.cmake
+++ b/cmake/OpusFunctions.cmake
@@ -134,7 +134,7 @@
endfunction()
function(opus_detect_neon COMPILER_SUPPORT_NEON)
- if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64)")
+ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(arm|aarch64|ARM)")
message(STATUS "Check NEON support by compiler")
check_include_file(arm_neon.h HAVE_ARM_NEON_H)
if(HAVE_ARM_NEON_H)
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -402,12 +402,12 @@
for (;j<cols-4;j+=8)
{
int8x16_t vw0, vw1, vw2, vw3, vx0, vx1;
- vx0 = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j]);
+ vx0 = vreinterpretq_s8_s32(vld1q_dup_s32((int*)(void*)&x[j]));
vw0 = vld1q_s8(w);
vw1 = vld1q_s8(&w[16]);
acc0 = vdotprod(acc0, vw0, vx0);
acc1 = vdotprod(acc1, vw1, vx0);
- vx1 = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j+4]);
+ vx1 = vreinterpretq_s8_s32(vld1q_dup_s32((int*)(void*)&x[j+4]));
vw2 = vld1q_s8(&w[32]);
vw3 = vld1q_s8(&w[48]);
acc2 = vdotprod(acc2, vw2, vx1);
@@ -419,7 +419,7 @@
for (;j<cols;j+=4)
{
int8x16_t vw0, vw1, vx;
- vx = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j]);
+ vx = vreinterpretq_s8_s32(vld1q_dup_s32((int*)(void*)&x[j]));
vw0 = vld1q_s8(w);
vw1 = vld1q_s8(&w[16]);
acc0 = vdotprod(acc0, vw0, vx);
@@ -457,7 +457,7 @@
int pos;
pos = (*idx++);
int8x16_t vw0, vw1, vx;
- vx = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[pos]);
+ vx = vreinterpretq_s8_s32(vld1q_dup_s32((int*)(void*)&x[pos]));
vw0 = vld1q_s8(w);
vw1 = vld1q_s8(&w[16]);
acc0 = vdotprod(acc0, vw0, vx);
--
⑨