ref: 81fc1497b1e5092e2a3df5945705c252b66bd74d
parent: ee90c140359eacbc950cc1ee4b140ecf9efcefea
author: Siarhei Volkau <lis8215@gmail.com>
date: Fri Aug 22 05:37:34 EDT 2025
MIPS: tune celt pitch for MIPS For non-DSP MIPS it's worth using the default MAC16_16 implementation, so there's no difference from the pure C implementation. The real difference comes from manually tuning the C code: - unroll the loop one more time for dual_inner_prod - replace the tail if-s with a switch in xcorr_kernel - use 32-bit accumulators for the non-DSP variant Why is the switch faster? Probably because the compiler doesn't have to track the j variable until the end of the loop and can replace the exit condition with something like x < &initial_x[N-3]. These changes increase overall opus_decode test execution speed by about 1% for both DSP and non-DSP versions. Measurements were done in QEMU by counting instructions executed. QEMU is not cycle accurate and the real effect might be lower due to pipeline stalls. Signed-off-by: Siarhei Volkau <lis8215@gmail.com> Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/celt/mips/pitch_mipsr1.h
+++ b/celt/mips/pitch_mipsr1.h
@@ -34,29 +34,64 @@
#ifndef PITCH_MIPSR1_H
#define PITCH_MIPSR1_H
+#if defined (__mips_dsp) && __mips == 32 /* 32-bit MIPS built with DSP support */
+
+#define accumulator_t opus_int64 /* accumulator type handed to the madd builtin below */
+#define MIPS_MAC(acc,a,b) \
+ __builtin_mips_madd((acc), (int)(a), (int)(b))
+
#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_XCORR_KERNEL
+
+#else /* any other MIPS */
+
+/* using madd is slower due to single accumulator */
+#define accumulator_t opus_int32 /* 32-bit accumulators for the non-DSP variant */
+#define MIPS_MAC MAC16_16 /* MAC16_16 is not defined in this header; presumably the generic fixed-point MAC macro */
+
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_XCORR_KERNEL
+
+#endif /* any other MIPS */
+
+
+#if defined(OVERRIDE_DUAL_INNER_PROD)
static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
int N, opus_val32 *xy1, opus_val32 *xy2, int arch)
{
int j;
- long long acc1 = 0;
- long long acc2 = 0;
+ accumulator_t acc1 = 0; /* accumulates the x[j]*y01[j] terms; stored to *xy1 */
+ accumulator_t acc2 = 0; /* accumulates the x[j]*y02[j] terms; stored to *xy2 */
(void)arch;
/* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
- for (j=0;j<N;j+=2)
+ for (j = 0; j < N - 3; j += 4) /* main loop: four samples per iteration, both accumulators updated together */
{
- acc1 = __builtin_mips_madd(acc1, (int)x[j], (int)y01[j]);
- acc2 = __builtin_mips_madd(acc2, (int)x[j], (int)y02[j]);
- acc1 = __builtin_mips_madd(acc1, (int)x[j+1], (int)y01[j+1]);
- acc2 = __builtin_mips_madd(acc2, (int)x[j+1], (int)y02[j+1]);
+ acc1 = MIPS_MAC(acc1, x[j], y01[j]);
+ acc2 = MIPS_MAC(acc2, x[j], y02[j]);
+ acc1 = MIPS_MAC(acc1, x[j+1], y01[j+1]);
+ acc2 = MIPS_MAC(acc2, x[j+1], y02[j+1]);
+ acc1 = MIPS_MAC(acc1, x[j+2], y01[j+2]);
+ acc2 = MIPS_MAC(acc2, x[j+2], y02[j+2]);
+ acc1 = MIPS_MAC(acc1, x[j+3], y01[j+3]);
+ acc2 = MIPS_MAC(acc2, x[j+3], y02[j+3]);
}
+ if (j < N) { /* tail: consumes exactly two more samples (j and j+1); NOTE(review): assumes N is even, as the replaced 2-step loop did -- confirm against callers */
+ acc1 = MIPS_MAC(acc1, x[j], y01[j]);
+ acc2 = MIPS_MAC(acc2, x[j], y02[j]);
+ acc1 = MIPS_MAC(acc1, x[j+1], y01[j+1]);
+ acc2 = MIPS_MAC(acc2, x[j+1], y02[j+1]);
+ }
+ /* Convert the accumulators to opus_val32 and hand them back through the output pointers. */
*xy1 = (opus_val32)acc1;
*xy2 = (opus_val32)acc2;
}
+#endif /* OVERRIDE_DUAL_INNER_PROD */
+#if defined(OVERRIDE_XCORR_KERNEL)
+
static inline void xcorr_kernel_mips(const opus_val16 * x,
const opus_val16 * y, opus_val32 sum[4], int len)
{
@@ -63,13 +98,12 @@
int j;
opus_val16 y_0, y_1, y_2, y_3;
- opus_int64 sum_0, sum_1, sum_2, sum_3;
- sum_0 = (opus_int64)sum[0];
- sum_1 = (opus_int64)sum[1];
- sum_2 = (opus_int64)sum[2];
- sum_3 = (opus_int64)sum[3];
+ accumulator_t sum_0, sum_1, sum_2, sum_3;
+ sum_0 = (accumulator_t)sum[0];
+ sum_1 = (accumulator_t)sum[1];
+ sum_2 = (accumulator_t)sum[2];
+ sum_3 = (accumulator_t)sum[3];
- y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
y_0=*y++;
y_1=*y++;
y_2=*y++;
@@ -79,69 +113,73 @@
tmp = *x++;
y_3=*y++;
- sum_0 = __builtin_mips_madd( sum_0, tmp, y_0);
- sum_1 = __builtin_mips_madd( sum_1, tmp, y_1);
- sum_2 = __builtin_mips_madd( sum_2, tmp, y_2);
- sum_3 = __builtin_mips_madd( sum_3, tmp, y_3);
+ sum_0 = MIPS_MAC(sum_0, tmp, y_0);
+ sum_1 = MIPS_MAC(sum_1, tmp, y_1);
+ sum_2 = MIPS_MAC(sum_2, tmp, y_2);
+ sum_3 = MIPS_MAC(sum_3, tmp, y_3);
tmp=*x++;
y_0=*y++;
- sum_0 = __builtin_mips_madd( sum_0, tmp, y_1 );
- sum_1 = __builtin_mips_madd( sum_1, tmp, y_2 );
- sum_2 = __builtin_mips_madd( sum_2, tmp, y_3);
- sum_3 = __builtin_mips_madd( sum_3, tmp, y_0);
+ sum_0 = MIPS_MAC(sum_0, tmp, y_1);
+ sum_1 = MIPS_MAC(sum_1, tmp, y_2);
+ sum_2 = MIPS_MAC(sum_2, tmp, y_3);
+ sum_3 = MIPS_MAC(sum_3, tmp, y_0);
tmp=*x++;
y_1=*y++;
- sum_0 = __builtin_mips_madd( sum_0, tmp, y_2 );
- sum_1 = __builtin_mips_madd( sum_1, tmp, y_3 );
- sum_2 = __builtin_mips_madd( sum_2, tmp, y_0);
- sum_3 = __builtin_mips_madd( sum_3, tmp, y_1);
+ sum_0 = MIPS_MAC(sum_0, tmp, y_2);
+ sum_1 = MIPS_MAC(sum_1, tmp, y_3);
+ sum_2 = MIPS_MAC(sum_2, tmp, y_0);
+ sum_3 = MIPS_MAC(sum_3, tmp, y_1);
tmp=*x++;
y_2=*y++;
- sum_0 = __builtin_mips_madd( sum_0, tmp, y_3 );
- sum_1 = __builtin_mips_madd( sum_1, tmp, y_0 );
- sum_2 = __builtin_mips_madd( sum_2, tmp, y_1);
- sum_3 = __builtin_mips_madd( sum_3, tmp, y_2);
-
+ sum_0 = MIPS_MAC(sum_0, tmp, y_3);
+ sum_1 = MIPS_MAC(sum_1, tmp, y_0);
+ sum_2 = MIPS_MAC(sum_2, tmp, y_1);
+ sum_3 = MIPS_MAC(sum_3, tmp, y_2);
}
- if (j++<len)
- {
- opus_val16 tmp = *x++;
- y_3=*y++;
- sum_0 = __builtin_mips_madd( sum_0, tmp, y_0 );
- sum_1 = __builtin_mips_madd( sum_1, tmp, y_1 );
- sum_2 = __builtin_mips_madd( sum_2, tmp, y_2);
- sum_3 = __builtin_mips_madd( sum_3, tmp, y_3);
- }
+ switch (len & 3) {
+ case 3:
+ sum_0 = MIPS_MAC(sum_0, x[2], y_2);
+ sum_1 = MIPS_MAC(sum_1, x[2], y[0]);
+ sum_2 = MIPS_MAC(sum_2, x[2], y[1]);
+ sum_3 = MIPS_MAC(sum_3, x[2], y[2]);
- if (j++<len)
- {
- opus_val16 tmp=*x++;
- y_0=*y++;
+ sum_0 = MIPS_MAC(sum_0, x[1], y_1);
+ sum_1 = MIPS_MAC(sum_1, x[1], y_2);
+ sum_2 = MIPS_MAC(sum_2, x[1], y[0]);
+ sum_3 = MIPS_MAC(sum_3, x[1], y[1]);
- sum_0 = __builtin_mips_madd( sum_0, tmp, y_1 );
- sum_1 = __builtin_mips_madd( sum_1, tmp, y_2 );
- sum_2 = __builtin_mips_madd( sum_2, tmp, y_3);
- sum_3 = __builtin_mips_madd( sum_3, tmp, y_0);
- }
+ sum_0 = MIPS_MAC(sum_0, x[0], y_0);
+ sum_1 = MIPS_MAC(sum_1, x[0], y_1);
+ sum_2 = MIPS_MAC(sum_2, x[0], y_2);
+ sum_3 = MIPS_MAC(sum_3, x[0], y[0]);
+ break;
+ case 2:
+ sum_0 = MIPS_MAC(sum_0, x[1], y_1);
+ sum_1 = MIPS_MAC(sum_1, x[1], y_2);
+ sum_2 = MIPS_MAC(sum_2, x[1], y[0]);
+ sum_3 = MIPS_MAC(sum_3, x[1], y[1]);
- if (j<len)
- {
- opus_val16 tmp=*x++;
- y_1=*y++;
-
- sum_0 = __builtin_mips_madd( sum_0, tmp, y_2 );
- sum_1 = __builtin_mips_madd( sum_1, tmp, y_3 );
- sum_2 = __builtin_mips_madd( sum_2, tmp, y_0);
- sum_3 = __builtin_mips_madd( sum_3, tmp, y_1);
-
+ sum_0 = MIPS_MAC(sum_0, x[0], y_0);
+ sum_1 = MIPS_MAC(sum_1, x[0], y_1);
+ sum_2 = MIPS_MAC(sum_2, x[0], y_2);
+ sum_3 = MIPS_MAC(sum_3, x[0], y[0]);
+ break;
+ case 1:
+ sum_0 = MIPS_MAC(sum_0, x[0], y_0);
+ sum_1 = MIPS_MAC(sum_1, x[0], y_1);
+ sum_2 = MIPS_MAC(sum_2, x[0], y_2);
+ sum_3 = MIPS_MAC(sum_3, x[0], y[0]);
+ break;
+ case 0:
+ break;
}
sum[0] = (opus_val32)sum_0;
@@ -150,8 +188,12 @@
sum[3] = (opus_val32)sum_3;
}
-#define OVERRIDE_XCORR_KERNEL
#define xcorr_kernel(x, y, sum, len, arch) \
((void)(arch), xcorr_kernel_mips(x, y, sum, len))
+
+#undef accumulator_t
+#undef MIPS_MAC
+
+#endif /* OVERRIDE_XCORR_KERNEL */
#endif /* PITCH_MIPSR1_H */
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -42,7 +42,7 @@
#include "x86/pitch_sse.h"
#endif
-#if defined(FIXED_POINT) && defined(__mips_dsp) && __mips == 32
+#if defined(FIXED_POINT) && defined(__mips)
#include "mips/pitch_mipsr1.h"
#endif
--
⑨