ref: 2cd012096907fa4ac03b2e1665af95d02ee0d579
parent: d9df94947de8a754af8ab35b9ca1808d43a26f9e
author: Siarhei Volkau <lis8215@gmail.com>
date: Thu Aug 21 20:02:58 EDT 2025
MIPS: optimize fft and mdct for MIPS32 without DSP MIPS32 since release 1 has support for the multiply-accumulate pattern with 32x32=>64 bit data, although in contrast to the DSP extension it has only one accumulator register. Another disadvantage is that extracting the scaled result back from the accumulator requires 5 instructions, so for 16x16 multiply-accumulate it's faster to use normal multiplication (32x32=>32) and addition instructions. GCC likes to shuffle mult+madd pattern instructions away and then reload the accumulator in between, so the default C implementation is far from optimal. GCC doesn't have builtin functions for the mult/madd/msub instructions, so inline assembly is used here. Regarding MIPS32r6 - it doesn't have an accumulator register at all. Instead, it has a pair of instructions MUL/MUH which implement 32x32=>64 multiplication on general registers. The C implementation matches that processor much better. So no special version for it. MIPS64 in turn has mult+madd instructions but only for compatibility with 32-bit binaries; taking the result back from the accumulator requires the same 5 instructions and they must be written in assembly. Instead, it has 64x64=>64 multiplication on general registers, so C code shall be good enough with typical dmul+daddu instructions for multiply-accumulate. So no special version for it. Signed-off-by: Siarhei Volkau <lis8215@gmail.com> Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -102,7 +102,7 @@
#if defined(OPUS_ARM_INLINE_EDSP)
#include "arm/kiss_fft_armv5e.h"
#endif
-#if defined(__mips_dsp) && __mips == 32
+#if defined(__mips)
#include "mips/kiss_fft_mipsr1.h"
#endif
--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -53,7 +53,7 @@
#include "mathops.h"
#include "stack_alloc.h"
-#if defined(FIXED_POINT) && defined(__mips_dsp) && __mips == 32
+#if defined(FIXED_POINT) && defined(__mips) && __mips == 32
#include "mips/mdct_mipsr1.h"
#endif
--- a/celt/mips/kiss_fft_mipsr1.h
+++ b/celt/mips/kiss_fft_mipsr1.h
@@ -32,10 +32,8 @@
#ifdef FIXED_POINT
-#define S_MUL_ADD(a, b, c, d) (S_MUL(a,b)+S_MUL(c,d))
-#define S_MUL_SUB(a, b, c, d) (S_MUL(a,b)-S_MUL(c,d))
+#if __mips == 32 && defined (__mips_dsp)
-#undef S_MUL_ADD
static inline int S_MUL_ADD(int a, int b, int c, int d) {
long long acc = __builtin_mips_mult(a, b);
acc = __builtin_mips_madd(acc, c, d);
@@ -42,7 +40,6 @@
return __builtin_mips_extr_w(acc, 15);
}
-#undef S_MUL_SUB
static inline int S_MUL_SUB(int a, int b, int c, int d) {
long long acc = __builtin_mips_mult(a, b);
acc = __builtin_mips_msub(acc, c, d);
@@ -76,9 +73,66 @@
return m;
}
-#endif /* FIXED_POINT */
+#define OVERRIDE_kf_bfly5
+#elif __mips == 32 && defined(__mips_isa_rev) && __mips_isa_rev < 6
+/* Returns (a*b + c*d) >> 15, summing both products in the 64-bit mult/madd accumulator. */
+static inline int S_MUL_ADD(int a, int b, int c, int d) {
+ long long acc;
+ /* mult and madd share one asm statement so the compiler cannot schedule anything between them. */
+ asm volatile (
+ "mult %[a], %[b] \n" /* accumulator = a * b */
+ "madd %[c], %[d] \n" /* accumulator += c * d */
+ : [acc] "=x"(acc) /* "x" constraint: 64-bit value read back from the accumulator */
+ : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+ :
+ );
+ return (int)(acc >> 15); /* drop the 15 scaling bits, then truncate to int */
+}
+/* Returns (a*b - c*d) >> 15, forming the difference in the 64-bit mult/msub accumulator. */
+static inline int S_MUL_SUB(int a, int b, int c, int d) {
+ long long acc;
+ /* mult and msub share one asm statement so the compiler cannot schedule anything between them. */
+ asm volatile (
+ "mult %[a], %[b] \n" /* accumulator = a * b */
+ "msub %[c], %[d] \n" /* accumulator -= c * d */
+ : [acc] "=x"(acc) /* "x" constraint: 64-bit value read back from the accumulator */
+ : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+ :
+ );
+ return (int)(acc >> 15); /* drop the 15 scaling bits, then truncate to int */
+}
+/* Replace the generic C_MUL with a version built on S_MUL_SUB/S_MUL_ADD: m = a * b (complex product). */
+#undef C_MUL
+# define C_MUL(m,a,b) (m=C_MUL_fun(a,b))
+static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+ kiss_fft_cpx m;
+ /* Each component is one multiply-accumulate pair, already shifted right by 15 in the helper. */
+ m.r = S_MUL_SUB(a.r, b.r, a.i, b.i); /* real: a.r*b.r - a.i*b.i */
+ m.i = S_MUL_ADD(a.r, b.i, a.i, b.r); /* imag: a.r*b.i + a.i*b.r */
+
+ return m;
+}
+/* Replace the generic C_MULC with a version built on S_MUL_ADD/S_MUL_SUB: m = a * conj(b). */
+#undef C_MULC
+# define C_MULC(m,a,b) (m=C_MULC_fun(a,b))
+static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
+ kiss_fft_cpx m;
+ /* Each component is one multiply-accumulate pair, already shifted right by 15 in the helper. */
+ m.r = S_MUL_ADD(a.r, b.r, a.i, b.i); /* real: a.r*b.r + a.i*b.i */
+ m.i = S_MUL_SUB(a.i, b.r, a.r, b.i); /* imag: a.i*b.r - a.r*b.i */
+
+ return m;
+}
+
#define OVERRIDE_kf_bfly5
+
+#endif
+
+#endif /* FIXED_POINT */
+
+#if defined(OVERRIDE_kf_bfly5)
+
static void kf_bfly5(
kiss_fft_cpx * Fout,
const size_t fstride,
@@ -157,5 +211,6 @@
}
}
+#endif /* defined(OVERRIDE_kf_bfly5) */
#endif /* KISS_FFT_MIPSR1_H */
--- a/celt/mips/mdct_mipsr1.h
+++ b/celt/mips/mdct_mipsr1.h
@@ -55,6 +55,7 @@
#include "mathops.h"
#include "stack_alloc.h"
+#if defined (__mips_dsp)
static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
long long acc = __builtin_mips_mult(a, b);
acc = __builtin_mips_madd(acc, c, d);
@@ -67,8 +68,45 @@
return __builtin_mips_extr_w(acc, 15+shift);
}
-/* Forward MDCT trashes the input array */
#define OVERRIDE_clt_mdct_forward
+#define OVERRIDE_clt_mdct_backward
+
+#elif defined(__mips_isa_rev) && __mips_isa_rev < 6
+/* Returns (a*b + c*d) >> (15 + shift), summing both products in the 64-bit mult/madd accumulator. */
+static inline int S_MUL_ADD_PSR(int a, int b, int c, int d, int shift) {
+ long long acc;
+ /* mult and madd share one asm statement so the compiler cannot schedule anything between them. */
+ asm volatile (
+ "mult %[a], %[b] \n" /* accumulator = a * b */
+ "madd %[c], %[d] \n" /* accumulator += c * d */
+ : [acc] "=x"(acc) /* "x" constraint: 64-bit value read back from the accumulator */
+ : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+ :
+ );
+ return (int)(acc >> (15 + shift)); /* 15 scaling bits plus the caller's extra shift, then truncate */
+}
+/* Returns (a*b - c*d) >> (15 + shift), forming the difference in the 64-bit mult/msub accumulator. */
+static inline int S_MUL_SUB_PSR(int a, int b, int c, int d, int shift) {
+ long long acc;
+ /* mult and msub share one asm statement so the compiler cannot schedule anything between them. */
+ asm volatile (
+ "mult %[a], %[b] \n" /* accumulator = a * b */
+ "msub %[c], %[d] \n" /* accumulator -= c * d */
+ : [acc] "=x"(acc) /* "x" constraint: 64-bit value read back from the accumulator */
+ : [a] "r"(a), [b] "r"(b), [c] "r"(c), [d] "r"(d)
+ :
+ );
+ return (int)(acc >> (15 + shift)); /* 15 scaling bits plus the caller's extra shift, then truncate */
+}
+
+#define OVERRIDE_clt_mdct_forward
+#define OVERRIDE_clt_mdct_backward
+
+#endif
+
+#if defined (OVERRIDE_clt_mdct_forward)
+
+/* Forward MDCT trashes the input array */
void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
const celt_coef *window, int overlap, int shift, int stride, int arch)
{
@@ -213,7 +251,10 @@
RESTORE_STACK;
}
-#define OVERRIDE_clt_mdct_backward
+#endif /* OVERRIDE_clt_mdct_forward */
+
+#if defined(OVERRIDE_clt_mdct_backward)
+
void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
const celt_coef * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
{
@@ -336,4 +377,7 @@
}
}
}
+
+#endif /* OVERRIDE_clt_mdct_backward */
+
#endif /* MDCT_MIPSR1_H__ */
--
⑨