ref: ee90c140359eacbc950cc1ee4b140ecf9efcefea
parent: 03ef24ebe086c65ca3fcdf7a5808fc3bb702f2b6
author: Siarhei Volkau <lis8215@gmail.com>
date: Fri Aug 22 03:42:07 EDT 2025
MIPS: optimize MULT16_32_Q16 for MIPS without DSP. Its typical implementation requires the accumulator register to get a 48+ bit multiplication result, but getting the scaled result back from the accumulator requires 5 instructions, so GCC typically emits: # MULT16_32_Q16: mult a16, b32; mflo r1; mfhi r2; srl r1, r1, 16; sll r2, r2, 16; or result, r1, r2. But if we scale the 16-bit argument before the multiplication, we can get the result in one instruction (mfhi): # MULT16_32_Q16: sll a32, a16, 16; mult a32, b32; mfhi result. For MIPS32r6 it's even shorter: sll a32, a16, 16; muh result, a32, b32. MIPS64 avoids using the accumulator here and can scale the result in a general register with a single instruction, so no special trick is needed. Signed-off-by: Siarhei Volkau <lis8215@gmail.com> Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -200,7 +200,7 @@
/** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
#define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))
-#if defined(__mips_dsp) && __mips == 32
+#if defined(__mips)
#include "mips/fixed_generic_mipsr1.h"
#endif
--- a/celt/mips/fixed_generic_mipsr1.h
+++ b/celt/mips/fixed_generic_mipsr1.h
@@ -33,6 +33,7 @@
#ifndef CELT_FIXED_GENERIC_MIPSR1_H
#define CELT_FIXED_GENERIC_MIPSR1_H
+#if defined (__mips_dsp) && __mips == 32
#undef MULT16_32_Q16
static inline int MULT16_32_Q16(int a, int b)
@@ -74,5 +75,12 @@
int r = a * b;
return __builtin_mips_shra_r_w(r, 15);
}
+
+#elif __mips == 32
+
+#undef MULT16_32_Q16
+#define MULT16_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(SHL32((a), 16))*(b),32))
+
+#endif
#endif /* CELT_FIXED_GENERIC_MIPSR1_H */
--
⑨