shithub: opus

Download patch

ref: 6ecee3715bdc51454e923ad9ba046b02836b4406
parent: 325585954d0875afe6c0cbb6d9cf2c95db3d22cb
author: Siarhei Volkau <lis8215@gmail.com>
date: Sat Aug 23 13:17:01 EDT 2025

MIPS DSP: utilize vector insns for celt_maxabs16

absq_s.ph does two ABS16 with saturation on a vector of two 16-bit
values.
cmp.lt.ph and pick.ph form conditional move for vector of two 16-bit
values.

Also unroll the loop 2x for better performance.

The original C version can return 0x8000 (positive), whereas this one
is limited to 0x7fff. Since the result of this function is used in an
ILOG2 context, this is important. As a quick fix, 1 is added to the
result in the hope of returning 0x8000 if saturation happens.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>

--- a/celt/mips/fixed_generic_mipsr1.h
+++ b/celt/mips/fixed_generic_mipsr1.h
@@ -35,6 +35,9 @@
 
 #if defined (__mips_dsp) && __mips == 32
 
+typedef short v2i16 __attribute__((vector_size(4))); /* 4-byte vector: two 16-bit lanes */
+typedef char  v2i8  __attribute__((vector_size(4))); /* 4-byte vector of char lanes; NOTE(review): that is four lanes, not two as "v2" suggests */
+
 #undef MULT16_32_Q16
 static inline int MULT16_32_Q16(int a, int b)
 {
@@ -74,6 +77,75 @@
 {
     int r = a * b;
     return __builtin_mips_shra_r_w(r, 15);
+}
+
+#define OVERRIDE_CELT_MAXABS16
+/* Largest absolute value among the len 16-bit samples at x (MIPS DSP).
+ *
+ * Samples are processed two per 32-bit word: absq_s.ph takes both absolute
+ * values with saturation (-32768 becomes 0x7fff) and cmp.lt.ph + pick.ph
+ * keep the per-lane running maximum.  A leading sample that is not 4-byte
+ * aligned and an odd trailing sample go through scalar code instead and
+ * are accumulated in 32 bits, so an ABS16() result of 0x8000 is kept.
+ *
+ * Returns max(scalar part, vector part + 1).  The +1 lets a saturated lane
+ * read as 0x8000 again; other vector maxima come out one too high, which
+ * is accepted because the result is only used in an ILOG2-like context to
+ * determine dynamic range.  Only bit 1 of the address is tested, so x is
+ * assumed to be at least 2-byte aligned.
+ */
+static OPUS_INLINE opus_val32 celt_maxabs16(const opus_val16 *x, int len)
+{
+   int i;
+   int loops;
+   v2i16 v2max = (v2i16){ 0, 0 };
+   v2i16 x01, x23;
+   const v2i16 *x2;
+   opus_val16 maxlo, maxhi;
+   opus_val32 edgemax = 0; /* scalar head/tail maximum, may reach 0x8000 */
+   opus_val32 vecmax;
+
+   /* Peel one sample so that the vector loads below are word aligned. */
+   if ((long)x & 2 && len > 0) {
+      edgemax = ABS16(*x);
+      x++;
+      len--;
+   }
+   x2 = __builtin_assume_aligned(x, 4);
+   loops = len / 4;
+
+   /* Unrolled twice: four samples per iteration. */
+   for (i = 0; i < loops; i++)
+   {
+       x01 = *x2++;
+       x23 = *x2++;
+       x01 = __builtin_mips_absq_s_ph(x01);
+       x23 = __builtin_mips_absq_s_ph(x23);
+       __builtin_mips_cmp_lt_ph(v2max, x01);
+       v2max = __builtin_mips_pick_ph(x01, v2max);
+       __builtin_mips_cmp_lt_ph(v2max, x23);
+       v2max = __builtin_mips_pick_ph(x23, v2max);
+   }
+
+   /* Two or three samples left: fold in one more aligned pair. */
+   if (len & 2) {
+       x01 = __builtin_mips_absq_s_ph(*x2);
+       __builtin_mips_cmp_lt_ph(v2max, x01);
+       v2max = __builtin_mips_pick_ph(x01, v2max);
+   }
+   /* Odd count: the very last sample is still pending. */
+   if (len & 1) {
+       opus_val32 last = ABS16(x[len - 1]);
+       if (last > edgemax)
+           edgemax = last;
+   }
+
+   /* Reduce the two lanes, then compensate for the saturating abs. */
+   maxlo = EXTRACT16((opus_val32)v2max);
+   maxhi = EXTRACT16((opus_val32)v2max >> 16);
+   vecmax = (opus_val32)MAX16(maxlo, maxhi) + 1;
+
+   return edgemax > vecmax ? edgemax : vecmax;
 }
 
 #elif __mips == 32
--