shithub: opus

Download patch

ref: ee90c140359eacbc950cc1ee4b140ecf9efcefea
parent: 03ef24ebe086c65ca3fcdf7a5808fc3bb702f2b6
author: Siarhei Volkau <lis8215@gmail.com>
date: Fri Aug 22 03:42:07 EDT 2025

MIPS: optimize MULT16_32_Q16 for MIPS without DSP

Its typical implementation requires the accumulator register
to get a 48+ bit multiplication result, but getting the scaled result back
from the accumulator requires 5 instructions, so GCC typically emits:
# MULT16_32_Q16
 mult a16, b32
 mflo r1
 mfhi r2
 srl  r1, r1, 16
 sll  r2, r2, 16
 or   result, r1, r2

But if we scale the 16-bit argument before the multiplication,
we can get the result in one instruction (mfhi):
# MULT16_32_Q16
 sll  a32, a16, 16
 mult a32, b32
 mfhi result

for MIPS32r6 it's even shorter:
 sll  a32, a16, 16
 muh  result, a32, b32

MIPS64 avoids using the accumulator here and can scale
the result in a general register with a single instruction,
so no special trick is needed.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>

--- a/celt/fixed_generic.h
+++ b/celt/fixed_generic.h
@@ -200,7 +200,7 @@
 /** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */
 #define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))
 
-#if defined(__mips_dsp) && __mips == 32
+#if defined(__mips)
 #include "mips/fixed_generic_mipsr1.h"
 #endif
 
--- a/celt/mips/fixed_generic_mipsr1.h
+++ b/celt/mips/fixed_generic_mipsr1.h
@@ -33,6 +33,7 @@
 #ifndef CELT_FIXED_GENERIC_MIPSR1_H
 #define CELT_FIXED_GENERIC_MIPSR1_H
 
+#if defined (__mips_dsp) && __mips == 32
 
 #undef MULT16_32_Q16
 static inline int MULT16_32_Q16(int a, int b)
@@ -74,5 +75,12 @@
     int r = a * b;
     return __builtin_mips_shra_r_w(r, 15);
 }
+
+#elif __mips == 32
+
+#undef MULT16_32_Q16
+#define MULT16_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(SHL32((a), 16))*(b),32))
+
+#endif
 
 #endif /* CELT_FIXED_GENERIC_MIPSR1_H */
--