shithub: opus

Download patch

ref: 2cd012096907fa4ac03b2e1665af95d02ee0d579
parent: d9df94947de8a754af8ab35b9ca1808d43a26f9e
author: Siarhei Volkau <lis8215@gmail.com>
date: Thu Aug 21 20:02:58 EDT 2025

MIPS: optimize fft and mdct for MIPS32 without DSP

MIPS32 has supported the multiply-accumulate pattern with 32x32=>64 bit
data since release 1, although in contrast to the DSP extension
it has only one accumulator register.

Another disadvantage is that extracting the scaled result back from the
accumulator requires 5 instructions, so for 16x16 multiply-accumulate
it's faster to use normal multiplication (32x32=>32) and addition
instructions.

GCC likes to move the instructions of a mult+madd pattern apart and then
reload the accumulator in between, so the default C implementation is far
from optimal.

GCC doesn't have builtin functions for the mult/madd/msub instructions,
so inline assembly is used here.

Regarding MIPS32r6 - it doesn't have an accumulator register at all.
Instead, it has a pair of instructions, MUL/MUH, which implement
32x32=>64 multiplication on general registers. The C implementation
matches that processor much better, so there is no special version
for it.

MIPS64 in turn has mult+madd instructions, but only for compatibility
with 32-bit binaries; taking the result back from the accumulator requires
the same 5 instructions, and they must be written in assembly.
Instead, it has 64x64=>64 multiplication on general registers, so
C code should be good enough with the typical dmul+daddu instructions for
multiply-accumulate. So there is no special version for it.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>

--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -102,7 +102,7 @@
 #if defined(OPUS_ARM_INLINE_EDSP)
 #include "arm/kiss_fft_armv5e.h"
 #endif
-#if defined(__mips_dsp) && __mips == 32
+#if defined(__mips)
 #include "mips/kiss_fft_mipsr1.h"
 #endif
 
--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -53,7 +53,7 @@
 #include "mathops.h"
 #include "stack_alloc.h"
 
-#if defined(FIXED_POINT) && defined(__mips_dsp) && __mips == 32
+#if defined(FIXED_POINT) && defined(__mips) && __mips == 32
 #include "mips/mdct_mipsr1.h"
 #endif
 
--- a/celt/mips/kiss_fft_mipsr1.h
+++ b/celt/mips/kiss_fft_mipsr1.h
@@ -32,10 +32,8 @@
 
 #ifdef FIXED_POINT
 
-#define S_MUL_ADD(a, b, c, d) (S_MUL(a,b)+S_MUL(c,d))
-#define S_MUL_SUB(a, b, c, d) (S_MUL(a,b)-S_MUL(c,d))
+#if __mips == 32 && defined (__mips_dsp)
 
-#undef S_MUL_ADD
 static inline int S_MUL_ADD(int a, int b, int c, int d) {
     long long acc = __builtin_mips_mult(a, b);
     acc = __builtin_mips_madd(acc, c, d);
@@ -42,7 +40,6 @@
     return __builtin_mips_extr_w(acc, 15);
 }
 
-#undef S_MUL_SUB
 static inline int S_MUL_SUB(int a, int b, int c, int d) {
     long long acc = __builtin_mips_mult(a, b);
     acc = __builtin_mips_msub(acc, c, d);
@@ -76,9 +73,66 @@
     return m;
 }
 
-#endif /* FIXED_POINT */
+#define OVERRIDE_kf_bfly5
 
+#elif __mips == 32 && defined(__mips_isa_rev) && __mips_isa_rev < 6
+
+static inline int S_MUL_ADD(int a, int b, int c, int d) { /* returns (a*b + c*d) >> 15, summed in a 64-bit accumulator */
+    long long acc; /* filled from the asm "x" output operand below */
+
+    asm volatile ( /* NOTE(review): plain "asm" needs GNU mode (-std=gnu*); confirm, else use __asm__ */
+            "mult %[a], %[b]  \n" /* accumulator  = a * b */
+            "madd %[c], %[d]  \n" /* accumulator += c * d */
+        : [acc] "=x"(acc)
+        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+        :
+    );
+    return (int)(acc >> 15); /* drop 15 fractional bits, then narrow to int */
+}
+
+static inline int S_MUL_SUB(int a, int b, int c, int d) { /* returns (a*b - c*d) >> 15, formed in a 64-bit accumulator */
+    long long acc; /* filled from the asm "x" output operand below */
+
+    asm volatile ( /* NOTE(review): plain "asm" needs GNU mode (-std=gnu*); confirm, else use __asm__ */
+            "mult %[a], %[b]  \n" /* accumulator  = a * b */
+            "msub %[c], %[d]  \n" /* accumulator -= c * d */
+        : [acc] "=x"(acc)
+        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+        :
+    );
+    return (int)(acc >> 15); /* drop 15 fractional bits, then narrow to int */
+}
+
+#undef C_MUL
+#   define C_MUL(m,a,b) (m=C_MUL_fun(a,b))
+static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) { /* complex product a*b; backs the C_MUL macro */
+    kiss_fft_cpx m;
+
+    m.r = S_MUL_SUB(a.r, b.r, a.i, b.i); /* real part: a.r*b.r - a.i*b.i, scaled by S_MUL_SUB */
+    m.i = S_MUL_ADD(a.r, b.i, a.i, b.r); /* imag part: a.r*b.i + a.i*b.r, scaled by S_MUL_ADD */
+
+    return m;
+}
+
+#undef C_MULC
+#   define C_MULC(m,a,b) (m=C_MULC_fun(a,b))
+static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) { /* a times the conjugate of b; backs the C_MULC macro */
+    kiss_fft_cpx m;
+
+    m.r = S_MUL_ADD(a.r, b.r, a.i, b.i); /* real part: a.r*b.r + a.i*b.i, scaled by S_MUL_ADD */
+    m.i = S_MUL_SUB(a.i, b.r, a.r, b.i); /* imag part: a.i*b.r - a.r*b.i, scaled by S_MUL_SUB */
+
+    return m;
+}
+
 #define OVERRIDE_kf_bfly5
+
+#endif
+
+#endif /* FIXED_POINT */
+
+#if defined(OVERRIDE_kf_bfly5)
+
 static void kf_bfly5(
                      kiss_fft_cpx * Fout,
                      const size_t fstride,
@@ -157,5 +211,6 @@
    }
 }
 
+#endif /* defined(OVERRIDE_kf_bfly5) */
 
 #endif /* KISS_FFT_MIPSR1_H */
--- a/celt/mips/mdct_mipsr1.h
+++ b/celt/mips/mdct_mipsr1.h
@@ -55,6 +55,7 @@
 #include "mathops.h"
 #include "stack_alloc.h"
 
+#if defined (__mips_dsp)
 static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
     long long acc = __builtin_mips_mult(a, b);
     acc = __builtin_mips_madd(acc, c, d);
@@ -67,8 +68,45 @@
     return __builtin_mips_extr_w(acc, 15+shift);
 }
 
-/* Forward MDCT trashes the input array */
 #define OVERRIDE_clt_mdct_forward
+#define OVERRIDE_clt_mdct_backward
+
+#elif defined(__mips_isa_rev) && __mips_isa_rev < 6
+
+static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) { /* returns (a*b + c*d) >> (15 + shift) */
+    long long acc; /* filled from the asm "x" output operand below */
+
+    asm volatile ( /* NOTE(review): plain "asm" needs GNU mode (-std=gnu*); confirm, else use __asm__ */
+            "mult %[a], %[b]  \n" /* accumulator  = a * b */
+            "madd %[c], %[d]  \n" /* accumulator += c * d */
+        : [acc] "=x"(acc)
+        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+        :
+    );
+    return (int)(acc >> (15 + shift)); /* caller-supplied extra shift on top of the fixed 15 bits */
+}
+
+static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) { /* returns (a*b - c*d) >> (15 + shift) */
+    long long acc; /* filled from the asm "x" output operand below */
+
+    asm volatile ( /* NOTE(review): plain "asm" needs GNU mode (-std=gnu*); confirm, else use __asm__ */
+            "mult %[a], %[b]  \n" /* accumulator  = a * b */
+            "msub %[c], %[d]  \n" /* accumulator -= c * d */
+        : [acc] "=x"(acc)
+        : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+        :
+    );
+    return (int)(acc >> (15 + shift)); /* caller-supplied extra shift on top of the fixed 15 bits */
+}
+
+#define OVERRIDE_clt_mdct_forward
+#define OVERRIDE_clt_mdct_backward
+
+#endif
+
+#if defined (OVERRIDE_clt_mdct_forward)
+
+/* Forward MDCT trashes the input array */
 void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
       const celt_coef *window, int overlap, int shift, int stride, int arch)
 {
@@ -213,7 +251,10 @@
    RESTORE_STACK;
 }
 
-#define OVERRIDE_clt_mdct_backward
+#endif /* OVERRIDE_clt_mdct_forward */
+
+#if defined(OVERRIDE_clt_mdct_backward)
+
 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
       const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
 {
@@ -336,4 +377,7 @@
       }
    }
 }
+
+#endif /* OVERRIDE_clt_mdct_backward */
+
 #endif /* MDCT_MIPSR1_H__ */
--