shithub: opus

Download patch

ref: cb20faaaacef7e597d38329b703bb2a7a188fdc1
parent: 6ecee3715bdc51454e923ad9ba046b02836b4406
author: Siarhei Volkau <lis8215@gmail.com>
date: Sat Aug 23 13:18:28 EDT 2025

MIPS DSP: utilize dpaq_s.w.ph for celt_inner_prod

dpaq_s.w.ph does two 16x16=>Q31 multiplications with adding both results
to accumulator register.
For getting Q31, result of multiplication is shifted left by 1.
Also it does saturation: for two 0x8000 (-32768) inputs result is
0x7fffffff (maximal positive Q31).

This instruction is ideal candidate for celt/dual_inner_prod functions
although data alignment isn't always good to utilize it.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>

--- a/celt/mips/pitch_mipsr1.h
+++ b/celt/mips/pitch_mipsr1.h
@@ -34,6 +34,8 @@
 #ifndef PITCH_MIPSR1_H
 #define PITCH_MIPSR1_H
 
+#include "fixed_generic_mipsr1.h"
+
 #if defined (__mips_dsp) && __mips == 32
 
 #define accumulator_t opus_int64
@@ -40,6 +42,9 @@
 #define MIPS_MAC(acc,a,b) \
     __builtin_mips_madd((acc), (int)(a), (int)(b))
 
+#define MIPS_MAC16x16_2X(acc,a2x,b2x) \
+    __builtin_mips_dpaq_s_w_ph((acc), (a2x), (b2x))
+
 #define OVERRIDE_CELT_INNER_PROD
 #define OVERRIDE_DUAL_INNER_PROD
 #define OVERRIDE_XCORR_KERNEL
@@ -58,6 +63,7 @@
 
 
 #if defined(OVERRIDE_CELT_INNER_PROD)
+
 static OPUS_INLINE opus_val32 celt_inner_prod(const opus_val16 *x,
       const opus_val16 *y, int N, int arch)
 {
@@ -64,8 +70,65 @@
    int j;
    accumulator_t acc = 0;
 
-   (void)arch;
+#if defined (MIPS_MAC16x16_2X)
+   const v2i16 *x2x;
+   const v2i16 *y2x;
+   int loops;
 
+   /* misaligned */
+   if (((long)x | (long)y) & 3)
+       goto fallback;
+
+   x2x = __builtin_assume_aligned(x, 4);
+   y2x = __builtin_assume_aligned(y, 4);
+   loops = N / 8;
+   for (j = 0; j < loops; j++)
+   {
+      acc = MIPS_MAC16x16_2X(acc, x2x[0], y2x[0]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[1], y2x[1]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[2], y2x[2]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[3], y2x[3]);
+      x2x += 4; y2x += 4;
+   }
+
+   switch (N & 7) {
+   case 7:
+      acc = MIPS_MAC16x16_2X(acc, x2x[0], y2x[0]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[1], y2x[1]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[2], y2x[2]);
+      acc = MIPS_MAC(acc, x[N-1], y[N-1]);
+      break;
+   case 6:
+      acc = MIPS_MAC16x16_2X(acc, x2x[0], y2x[0]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[1], y2x[1]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[2], y2x[2]);
+      break;
+   case 5:
+      acc = MIPS_MAC16x16_2X(acc, x2x[0], y2x[0]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[1], y2x[1]);
+      acc = MIPS_MAC(acc, x[N-1], y[N-1]);
+      break;
+   case 4:
+      acc = MIPS_MAC16x16_2X(acc, x2x[0], y2x[0]);
+      acc = MIPS_MAC16x16_2X(acc, x2x[1], y2x[1]);
+      break;
+   case 3:
+      acc = MIPS_MAC16x16_2X(acc, x2x[0], y2x[0]);
+      acc = MIPS_MAC(acc, x[N-1], y[N-1]);
+      break;
+   case 2:
+      acc = MIPS_MAC16x16_2X(acc, x2x[0], y2x[0]);
+      break;
+   case 1:
+      acc = MIPS_MAC(acc, x[N-1], y[N-1]);
+      break;
+   case 0:
+      break;
+   }
+   return __builtin_mips_extr_w(acc, 1);
+
+fallback:
+#endif
    for (j = 0; j < N - 3; j += 4)
    {
       acc = MIPS_MAC(acc, x[j],   y[j]);
@@ -91,6 +154,8 @@
       break;
    }
 
+   (void)arch;
+
    return (opus_val32)acc;
 }
 #endif /* OVERRIDE_CELT_INNER_PROD */
@@ -103,8 +168,61 @@
    accumulator_t acc1 = 0;
    accumulator_t acc2 = 0;
 
-   (void)arch;
+#if defined (MIPS_MAC16x16_2X)
+   const v2i16 *x2x;
+   const v2i16 *y01_2x;
+   const v2i16 *y02_2x;
 
+   /* misaligned */
+   if (((long)x | (long)y01 | (long)y02) & 3)
+       goto fallback;
+
+   x2x = __builtin_assume_aligned(x, 4);
+   y01_2x = __builtin_assume_aligned(y01, 4);
+   y02_2x = __builtin_assume_aligned(y02, 4);
+   N /= 2;
+
+   for (j = 0; j < N - 3; j += 4)
+   {
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j],   y01_2x[j]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j],   y02_2x[j]);
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j+1], y01_2x[j+1]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j+1], y02_2x[j+1]);
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j+2], y01_2x[j+2]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j+2], y02_2x[j+2]);
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j+3], y01_2x[j+3]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j+3], y02_2x[j+3]);
+   }
+
+   switch (N & 3) {
+   case 3:
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j],   y01_2x[j]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j],   y02_2x[j]);
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j+1], y01_2x[j+1]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j+1], y02_2x[j+1]);
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j+2], y01_2x[j+2]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j+2], y02_2x[j+2]);
+      break;
+   case 2:
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j],   y01_2x[j]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j],   y02_2x[j]);
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j+1], y01_2x[j+1]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j+1], y02_2x[j+1]);
+      break;
+   case 1:
+      acc1 = MIPS_MAC16x16_2X(acc1, x2x[j],   y01_2x[j]);
+      acc2 = MIPS_MAC16x16_2X(acc2, x2x[j],   y02_2x[j]);
+      break;
+   case 0:
+      break;
+   }
+
+   *xy1 = __builtin_mips_extr_w(acc1, 1);
+   *xy2 = __builtin_mips_extr_w(acc2, 1);
+   return;
+
+fallback:
+#endif
    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
    for (j = 0; j < N - 3; j += 4)
    {
@@ -124,6 +242,8 @@
       acc1 = MIPS_MAC(acc1, x[j+1], y01[j+1]);
       acc2 = MIPS_MAC(acc2, x[j+1], y02[j+1]);
    }
+
+   (void)arch;
 
    *xy1 = (opus_val32)acc1;
    *xy2 = (opus_val32)acc2;
--