ref: 6ecee3715bdc51454e923ad9ba046b02836b4406
parent: 325585954d0875afe6c0cbb6d9cf2c95db3d22cb
author: Siarhei Volkau <lis8215@gmail.com>
date: Sat Aug 23 13:17:01 EDT 2025
MIPS DSP: utilize vector insns for celt_maxabs16. absq_s.ph does two ABS16 with saturation on a vector of two 16-bit values. cmp.lt.ph and pick.ph form a conditional move for a vector of two 16-bit values. Also 2x loop unroll for better performance. The original C version can return 0x8000 (positive) whereas this one is limited to 0x7fff; since the result of this function is used in an ILOG2 context, this is important. As a quick fix, 1 is added to the result in the hope of returning 0x8000 if saturation happens. Signed-off-by: Siarhei Volkau <lis8215@gmail.com> Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/celt/mips/fixed_generic_mipsr1.h
+++ b/celt/mips/fixed_generic_mipsr1.h
@@ -35,6 +35,9 @@
#if defined (__mips_dsp) && __mips == 32
+/* GCC vector types that are 4 bytes wide, i.e. one 32-bit register.
+ * v2i16 is the operand type used with the paired-halfword (".ph") DSP
+ * builtins in this file. v2i8 is also 4 bytes wide, so despite the "2" in
+ * its name it has room for four chars.
+ */
+typedef short v2i16 __attribute__((vector_size(4)));
+typedef char v2i8 __attribute__((vector_size(4)));
+
#undef MULT16_32_Q16
static inline int MULT16_32_Q16(int a, int b)
{
@@ -74,6 +77,75 @@
{
int r = a * b;
return __builtin_mips_shra_r_w(r, 15);
+}
+
+#define OVERRIDE_CELT_MAXABS16
+/* MIPS DSP override of celt_maxabs16(): returns an upper estimate of the
+ * largest absolute value among the len samples starting at x.
+ *
+ * Samples are consumed as packed pairs of 16-bit values: absq_s.ph takes a
+ * saturating absolute value of both lanes, and cmp.lt.ph + pick.ph act as
+ * a per-lane conditional move that keeps the running maximum. A leading
+ * sample that is only 2-byte aligned and a trailing odd sample are handled
+ * in scalar code with 32-bit arithmetic, so |-32768| is kept as 32768.
+ *
+ * Return value: the larger of
+ *   - the exact maximum |x[i]| over the scalar head/tail samples, and
+ *   - (maximum over the vector lanes) + 1.
+ * The + 1 compensates for the saturation in absq_s.ph: the vector path
+ * cannot exceed 0x7fff, while the plain C version can return 0x8000. The
+ * result is meant for ilog2-like dynamic range estimation, where this
+ * overestimate keeps the magnitude right when saturation happened. As a
+ * consequence the function never returns less than 1.
+ */
+static OPUS_INLINE opus_val32 celt_maxabs16(const opus_val16 *x, int len)
+{
+   int i;
+   int loops;
+   v2i16 v2max = (v2i16){ 0, 0 };
+   v2i16 x01, x23;
+   const v2i16 *x2;
+   opus_val32 lane_lo, lane_hi, vecmax;
+   opus_val32 sample;
+   opus_val32 scalarmax = 0;
+
+   /* Peel one sample so that the vector loads below are 4-byte aligned. */
+   if ((long)x & 2 && len > 0) {
+      /* Widen before negating: narrowing 32768 back into a 16-bit lane
+       * would wrap to -32768 and lose the sample in the signed compare. */
+      sample = *x;
+      scalarmax = sample < 0 ? -sample : sample;
+      x++;
+      len--;
+   }
+   x2 = __builtin_assume_aligned(x, 4);
+   loops = len / 4;
+
+   /* Main loop, unrolled 2x: four samples (two pairs) per iteration. */
+   for (i = 0; i < loops; i++)
+   {
+      x01 = *x2++;
+      x23 = *x2++;
+      x01 = __builtin_mips_absq_s_ph(x01);
+      x23 = __builtin_mips_absq_s_ph(x23);
+      /* cmp.lt.ph records the per-lane comparison result; pick.ph then
+       * selects x01/x23 where it was true and keeps v2max elsewhere. */
+      __builtin_mips_cmp_lt_ph(v2max, x01);
+      v2max = __builtin_mips_pick_ph(x01, v2max);
+      __builtin_mips_cmp_lt_ph(v2max, x23);
+      v2max = __builtin_mips_pick_ph(x23, v2max);
+   }
+
+   /* One more whole pair is left when bit 1 of the remaining count is set. */
+   if (len & 2) {
+      x01 = __builtin_mips_absq_s_ph(*x2);
+      __builtin_mips_cmp_lt_ph(v2max, x01);
+      v2max = __builtin_mips_pick_ph(x01, v2max);
+   }
+
+   /* A final odd sample is always the last element of the buffer. */
+   if (len & 1) {
+      sample = x[len - 1];
+      if (sample < 0)
+         sample = -sample;
+      if (sample > scalarmax)
+         scalarmax = sample;
+   }
+
+   /* Reduce the two lanes; both are non-negative because v2max starts at
+    * zero and is only ever replaced by saturated absolute values. */
+   lane_lo = EXTRACT16((opus_val32)v2max);
+   lane_hi = EXTRACT16((opus_val32)v2max >> 16);
+   vecmax = (lane_lo > lane_hi ? lane_lo : lane_hi) + 1;
+
+   return vecmax > scalarmax ? vecmax : scalarmax;
}
#elif __mips == 32
--
⑨