shithub: opus

MIPS: optimize MULT16_32_Q16 for MIPS without DSP. Its typical implementation requires the accumulator register to get a 48+ bit multiplication, but getting the scaled result back from the accumulator requires 5 instructions, so GCC typically emits: # MULT16_32_Q16: mult a16, b32; mflo r1; mfhi r2; srl r1, r1, 16; sll r2, r2, 16; or result, r1, r2. But if we scale the 16-bit argument before the multiplication, we can get the result in one instruction (mfhi): # MULT16_32_Q16: sll a32, a16, 16; mult a32, b32; mfhi result. For MIPS32r6 it's even shorter: sll a32, a16, 16; muh result, a32, b32. MIPS64 avoids using the accumulator here and can scale the result in a general register with a single instruction, so no special trick is needed. Signed-off-by: Siarhei Volkau <lis8215@gmail.com> Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>

--- a/celt/fixed_generic.h

+++ b/celt/fixed_generic.h

@@ -200,7 +200,7 @@

 /** Divide a 32-bit value by a 32-bit value. Result fits in 32 bits */

 #define DIV32(a,b) (((opus_val32)(a))/((opus_val32)(b)))

-#if defined(__mips_dsp) && __mips == 32

+#if defined(__mips)

 #include "mips/fixed_generic_mipsr1.h"

 #endif

--- a/celt/mips/fixed_generic_mipsr1.h

+++ b/celt/mips/fixed_generic_mipsr1.h

@@ -33,6 +33,7 @@

 #ifndef CELT_FIXED_GENERIC_MIPSR1_H

 #define CELT_FIXED_GENERIC_MIPSR1_H

+#if defined (__mips_dsp) && __mips == 32

 #undef MULT16_32_Q16

 static inline int MULT16_32_Q16(int a, int b)

@@ -74,5 +75,12 @@

     int r = a * b;

     return __builtin_mips_shra_r_w(r, 15);

+#elif __mips == 32

+#undef MULT16_32_Q16

+#define MULT16_32_Q16(a,b) ((opus_val32)SHR((opus_int64)(SHL32((a), 16))*(b),32))

+#endif

 #endif /* CELT_FIXED_GENERIC_MIPSR1_H */

--

⑨