shithub: opus

Download patch

ref: 81fc1497b1e5092e2a3df5945705c252b66bd74d
parent: ee90c140359eacbc950cc1ee4b140ecf9efcefea
author: Siarhei Volkau <lis8215@gmail.com>
date: Fri Aug 22 05:37:34 EDT 2025

MIPS: tune celt pitch for MIPS

For non-DSP MIPS it is worth using the default MAC16_16 implementation,
so in that respect there is no difference from the pure C implementation.

The real difference comes from manually tuning the C code:
- unroll the loop one more time in dual_inner_prod
- replace the tail ifs with a switch in xcorr_kernel
- use 32-bit accumulators for the non-DSP variant

Why is the switch faster? Probably because the compiler doesn't have to
track the j variable until the end of the loop and can replace the exit
condition with something like x < &initial_x[N-3].

These changes increase overall opus_decode test execution speed by about
1% for both DSP and non-DSP versions.

Measurements were done in QEMU by counting executed instructions.
QEMU is not cycle-accurate, and the real effect might be lower due
to pipeline stalls.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>

--- a/celt/mips/pitch_mipsr1.h
+++ b/celt/mips/pitch_mipsr1.h
@@ -34,29 +34,64 @@
 #ifndef PITCH_MIPSR1_H
 #define PITCH_MIPSR1_H
 
+#if defined (__mips_dsp) && __mips == 32
+
+#define accumulator_t opus_int64
+#define MIPS_MAC(acc,a,b) \
+    __builtin_mips_madd((acc), (int)(a), (int)(b))
+
 #define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_XCORR_KERNEL
+
+#else /* any other MIPS */
+
+/* using madd is slower due to single accumulator */
+#define accumulator_t opus_int32
+#define MIPS_MAC MAC16_16
+
+#define OVERRIDE_DUAL_INNER_PROD
+#define OVERRIDE_XCORR_KERNEL
+
+#endif /* any other MIPS */
+
+
+#if defined(OVERRIDE_DUAL_INNER_PROD)
 static inline void dual_inner_prod(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
       int N, opus_val32 *xy1, opus_val32 *xy2, int arch)
 {
    int j;
-   long long acc1 = 0;
-   long long acc2 = 0;
+   accumulator_t acc1 = 0;
+   accumulator_t acc2 = 0;
 
    (void)arch;
 
    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-   for (j=0;j<N;j+=2)
+   for (j = 0; j < N - 3; j += 4)
    {
-       acc1 = __builtin_mips_madd(acc1, (int)x[j],   (int)y01[j]);
-       acc2 = __builtin_mips_madd(acc2, (int)x[j],   (int)y02[j]);
-       acc1 = __builtin_mips_madd(acc1, (int)x[j+1], (int)y01[j+1]);
-       acc2 = __builtin_mips_madd(acc2, (int)x[j+1], (int)y02[j+1]);
+      acc1 = MIPS_MAC(acc1, x[j],   y01[j]);
+      acc2 = MIPS_MAC(acc2, x[j],   y02[j]);
+      acc1 = MIPS_MAC(acc1, x[j+1], y01[j+1]);
+      acc2 = MIPS_MAC(acc2, x[j+1], y02[j+1]);
+      acc1 = MIPS_MAC(acc1, x[j+2], y01[j+2]);
+      acc2 = MIPS_MAC(acc2, x[j+2], y02[j+2]);
+      acc1 = MIPS_MAC(acc1, x[j+3], y01[j+3]);
+      acc2 = MIPS_MAC(acc2, x[j+3], y02[j+3]);
    }
 
+   if (j < N) {
+      acc1 = MIPS_MAC(acc1, x[j],   y01[j]);
+      acc2 = MIPS_MAC(acc2, x[j],   y02[j]);
+      acc1 = MIPS_MAC(acc1, x[j+1], y01[j+1]);
+      acc2 = MIPS_MAC(acc2, x[j+1], y02[j+1]);
+   }
+
    *xy1 = (opus_val32)acc1;
    *xy2 = (opus_val32)acc2;
 }
+#endif /* OVERRIDE_DUAL_INNER_PROD */
 
+#if defined(OVERRIDE_XCORR_KERNEL)
+
 static inline void xcorr_kernel_mips(const opus_val16 * x,
       const opus_val16 * y, opus_val32 sum[4], int len)
 {
@@ -63,13 +98,12 @@
    int j;
    opus_val16 y_0, y_1, y_2, y_3;
 
-    opus_int64 sum_0, sum_1, sum_2, sum_3;
-    sum_0 =  (opus_int64)sum[0];
-    sum_1 =  (opus_int64)sum[1];
-    sum_2 =  (opus_int64)sum[2];
-    sum_3 =  (opus_int64)sum[3];
+    accumulator_t sum_0, sum_1, sum_2, sum_3;
+    sum_0 =  (accumulator_t)sum[0];
+    sum_1 =  (accumulator_t)sum[1];
+    sum_2 =  (accumulator_t)sum[2];
+    sum_3 =  (accumulator_t)sum[3];
 
-    y_3=0; /* gcc doesn't realize that y_3 can't be used uninitialized */
     y_0=*y++;
     y_1=*y++;
     y_2=*y++;
@@ -79,69 +113,73 @@
         tmp = *x++;
         y_3=*y++;
 
-        sum_0 = __builtin_mips_madd( sum_0, tmp, y_0);
-        sum_1 = __builtin_mips_madd( sum_1, tmp, y_1);
-        sum_2 = __builtin_mips_madd( sum_2, tmp, y_2);
-        sum_3 = __builtin_mips_madd( sum_3, tmp, y_3);
+        sum_0 = MIPS_MAC(sum_0, tmp, y_0);
+        sum_1 = MIPS_MAC(sum_1, tmp, y_1);
+        sum_2 = MIPS_MAC(sum_2, tmp, y_2);
+        sum_3 = MIPS_MAC(sum_3, tmp, y_3);
 
         tmp=*x++;
         y_0=*y++;
 
-        sum_0 = __builtin_mips_madd( sum_0, tmp, y_1 );
-        sum_1 = __builtin_mips_madd( sum_1, tmp, y_2 );
-        sum_2 = __builtin_mips_madd( sum_2, tmp, y_3);
-        sum_3 = __builtin_mips_madd( sum_3, tmp, y_0);
+        sum_0 = MIPS_MAC(sum_0, tmp, y_1);
+        sum_1 = MIPS_MAC(sum_1, tmp, y_2);
+        sum_2 = MIPS_MAC(sum_2, tmp, y_3);
+        sum_3 = MIPS_MAC(sum_3, tmp, y_0);
 
        tmp=*x++;
        y_1=*y++;
 
-       sum_0 = __builtin_mips_madd( sum_0, tmp, y_2 );
-       sum_1 = __builtin_mips_madd( sum_1, tmp, y_3 );
-       sum_2 = __builtin_mips_madd( sum_2, tmp, y_0);
-       sum_3 = __builtin_mips_madd( sum_3, tmp, y_1);
+       sum_0 = MIPS_MAC(sum_0, tmp, y_2);
+       sum_1 = MIPS_MAC(sum_1, tmp, y_3);
+       sum_2 = MIPS_MAC(sum_2, tmp, y_0);
+       sum_3 = MIPS_MAC(sum_3, tmp, y_1);
 
 
       tmp=*x++;
       y_2=*y++;
 
-       sum_0 = __builtin_mips_madd( sum_0, tmp, y_3 );
-       sum_1 = __builtin_mips_madd( sum_1, tmp, y_0 );
-       sum_2 = __builtin_mips_madd( sum_2, tmp, y_1);
-       sum_3 = __builtin_mips_madd( sum_3, tmp, y_2);
-
+      sum_0 = MIPS_MAC(sum_0, tmp, y_3);
+      sum_1 = MIPS_MAC(sum_1, tmp, y_0);
+      sum_2 = MIPS_MAC(sum_2, tmp, y_1);
+      sum_3 = MIPS_MAC(sum_3, tmp, y_2);
    }
-   if (j++<len)
-   {
-      opus_val16 tmp = *x++;
-      y_3=*y++;
 
-       sum_0 = __builtin_mips_madd( sum_0, tmp, y_0 );
-       sum_1 = __builtin_mips_madd( sum_1, tmp, y_1 );
-       sum_2 = __builtin_mips_madd( sum_2, tmp, y_2);
-       sum_3 = __builtin_mips_madd( sum_3, tmp, y_3);
-   }
+   switch (len & 3) {
+   case 3:
+      sum_0 = MIPS_MAC(sum_0, x[2], y_2);
+      sum_1 = MIPS_MAC(sum_1, x[2], y[0]);
+      sum_2 = MIPS_MAC(sum_2, x[2], y[1]);
+      sum_3 = MIPS_MAC(sum_3, x[2], y[2]);
 
-   if (j++<len)
-   {
-      opus_val16 tmp=*x++;
-      y_0=*y++;
+      sum_0 = MIPS_MAC(sum_0, x[1], y_1);
+      sum_1 = MIPS_MAC(sum_1, x[1], y_2);
+      sum_2 = MIPS_MAC(sum_2, x[1], y[0]);
+      sum_3 = MIPS_MAC(sum_3, x[1], y[1]);
 
-      sum_0 = __builtin_mips_madd( sum_0, tmp, y_1 );
-      sum_1 = __builtin_mips_madd( sum_1, tmp, y_2 );
-      sum_2 = __builtin_mips_madd( sum_2, tmp, y_3);
-      sum_3 = __builtin_mips_madd( sum_3, tmp, y_0);
-   }
+      sum_0 = MIPS_MAC(sum_0, x[0], y_0);
+      sum_1 = MIPS_MAC(sum_1, x[0], y_1);
+      sum_2 = MIPS_MAC(sum_2, x[0], y_2);
+      sum_3 = MIPS_MAC(sum_3, x[0], y[0]);
+      break;
+   case 2:
+      sum_0 = MIPS_MAC(sum_0, x[1], y_1);
+      sum_1 = MIPS_MAC(sum_1, x[1], y_2);
+      sum_2 = MIPS_MAC(sum_2, x[1], y[0]);
+      sum_3 = MIPS_MAC(sum_3, x[1], y[1]);
 
-   if (j<len)
-   {
-      opus_val16 tmp=*x++;
-      y_1=*y++;
-
-       sum_0 = __builtin_mips_madd( sum_0, tmp, y_2 );
-       sum_1 = __builtin_mips_madd( sum_1, tmp, y_3 );
-       sum_2 = __builtin_mips_madd( sum_2, tmp, y_0);
-       sum_3 = __builtin_mips_madd( sum_3, tmp, y_1);
-
+      sum_0 = MIPS_MAC(sum_0, x[0], y_0);
+      sum_1 = MIPS_MAC(sum_1, x[0], y_1);
+      sum_2 = MIPS_MAC(sum_2, x[0], y_2);
+      sum_3 = MIPS_MAC(sum_3, x[0], y[0]);
+      break;
+   case 1:
+      sum_0 = MIPS_MAC(sum_0, x[0], y_0);
+      sum_1 = MIPS_MAC(sum_1, x[0], y_1);
+      sum_2 = MIPS_MAC(sum_2, x[0], y_2);
+      sum_3 = MIPS_MAC(sum_3, x[0], y[0]);
+      break;
+   case 0:
+      break;
    }
 
    sum[0] = (opus_val32)sum_0;
@@ -150,8 +188,12 @@
    sum[3] = (opus_val32)sum_3;
 }
 
-#define OVERRIDE_XCORR_KERNEL
 #define xcorr_kernel(x, y, sum, len, arch) \
     ((void)(arch), xcorr_kernel_mips(x, y, sum, len))
+
+#undef accumulator_t
+#undef MIPS_MAC
+
+#endif /* OVERRIDE_XCORR_KERNEL */
 
 #endif /* PITCH_MIPSR1_H */
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -42,7 +42,7 @@
 #include "x86/pitch_sse.h"
 #endif
 
-#if defined(FIXED_POINT) && defined(__mips_dsp) && __mips == 32
+#if defined(FIXED_POINT) && defined(__mips)
 #include "mips/pitch_mipsr1.h"
 #endif
 
--