shithub: opus

Download patch

ref: 1dc14e901d75b7fa656704d323613a19ffe53d5b
parent: 688bb91ba83f7d57a2c5649e9751c5c667bea8ca
author: Siarhei Volkau <lis8215@gmail.com>
date: Sun Aug 17 14:49:41 EDT 2025

refactor: MIPS DSP: inline assembly

GCC supports all MIPS DSP and DSPr2 instructions in the form
of builtin functions; this is a more convenient approach than
inline assembly.

Moreover, performance on MIPS heavily depends on instruction scheduling.
GCC is unable to schedule inline assembly properly because it
doesn't know what exactly the asm routine does.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com>

--- a/celt/mips/celt_mipsr1.h
+++ b/celt/mips/celt_mipsr1.h
@@ -97,19 +97,18 @@
    {
       opus_val16 f;
       opus_val32 res;
+      long long acc;
       f = MULT16_16_Q15(window[i],window[i]);
       x0= x[i-T1+2];
 
-      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g00)), "r" ((int)x[i-T0]));
+      acc = __builtin_mips_mult((int)MULT16_16_Q15((Q15ONE-f),g00), (int)x[i-T0]);
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g01), (int)ADD32(x[i-T0-1],x[i-T0+1]));
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15((Q15ONE-f),g02), (int)ADD32(x[i-T0-2],x[i-T0+2]));
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g10), (int)x2);
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g11), (int)ADD32(x3,x1));
+      acc = __builtin_mips_madd(acc, (int)MULT16_16_Q15(f,g12), (int)ADD32(x4,x0));
+      res = __builtin_mips_extr_w(acc, 15);
 
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g01)), "r" ((int)ADD32(x[i-T0-1],x[i-T0+1])));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15((Q15ONE-f),g02)), "r" ((int)ADD32(x[i-T0-2],x[i-T0+2])));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g10)), "r" ((int)x2));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g11)), "r" ((int)ADD32(x3,x1)));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)MULT16_16_Q15(f,g12)), "r" ((int)ADD32(x4,x0)));
-
-      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
-
       y[i] = x[i] + res;
 
       x4=x3;
@@ -134,13 +133,14 @@
    for (i=overlap;i<N;i++)
    {
       opus_val32 res;
+      long long acc;
       x0=x[i-T1+2];
 
-      asm volatile("MULT $ac1, %0, %1" : : "r" ((int)g10), "r" ((int)x2));
+      acc = __builtin_mips_mult((int)g10, (int)x2);
+      acc = __builtin_mips_madd(acc, (int)g11, (int)ADD32(x3,x1));
+      acc = __builtin_mips_madd(acc, (int)g12, (int)ADD32(x4,x0));
+      res = __builtin_mips_extr_w(acc, 15);
 
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g11), "r" ((int)ADD32(x3,x1)));
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)g12), "r" ((int)ADD32(x4,x0)));
-      asm volatile("EXTR.W %0,$ac1, %1" : "=r" (res): "i" (15));
       y[i] = x[i] + res;
       x4=x3;
       x3=x2;
--- a/celt/mips/fixed_generic_mipsr1.h
+++ b/celt/mips/fixed_generic_mipsr1.h
@@ -35,38 +35,30 @@
 
 #undef MULT16_32_Q15_ADD
 static inline int MULT16_32_Q15_ADD(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_madd(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT16_32_Q15_SUB
 static inline int MULT16_32_Q15_SUB(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_msub(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT16_16_Q15_ADD
 static inline int MULT16_16_Q15_ADD(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_madd(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT16_16_Q15_SUB
 static inline int MULT16_16_Q15_SUB(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_msub(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 
@@ -73,54 +65,42 @@
 #undef MULT16_32_Q16
 static inline int MULT16_32_Q16(int a, int b)
 {
-    int c;
-    asm volatile("MULT $ac1,%0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (c): "i" (16));
-    return c;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_w(acc, 16);
 }
 
 #undef MULT16_32_P16
 static inline int MULT16_32_P16(int a, int b)
 {
-    int c;
-    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR_R.W %0,$ac1, %1" : "=r" (c): "i" (16));
-    return c;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_r_w(acc, 16);
 }
 
 #undef MULT16_32_Q15
 static inline int MULT16_32_Q15(int a, int b)
 {
-    int c;
-    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (c): "i" (15));
-    return c;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef MULT32_32_Q31
 static inline int MULT32_32_Q31(int a, int b)
 {
-    int r;
-    asm volatile("MULT $ac1, %0, %1" : : "r" (a), "r" (b));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (r): "i" (31));
-    return r;
+    long long acc = __builtin_mips_mult(a, b);
+    return __builtin_mips_extr_w(acc, 31);
 }
 
 #undef PSHR32
 static inline int PSHR32(int a, int shift)
 {
-    int r;
-    asm volatile ("SHRAV_R.W %0, %1, %2" :"=r" (r): "r" (a), "r" (shift));
-    return r;
+    return __builtin_mips_shra_r_w(a, shift);
 }
 
 #undef MULT16_16_P15
 static inline int MULT16_16_P15(int a, int b)
 {
-    int r;
-    asm volatile ("mul %0, %1, %2" :"=r" (r): "r" (a), "r" (b));
-    asm volatile ("SHRA_R.W %0, %1, %2" : "+r" (r):  "0" (r), "i"(15));
-    return r;
+    int r = a * b;
+    return __builtin_mips_shra_r_w(r, 15);
 }
 
 #endif /* CELT_FIXED_GENERIC_MIPSR1_H */
--- a/celt/mips/kiss_fft_mipsr1.h
+++ b/celt/mips/kiss_fft_mipsr1.h
@@ -37,20 +37,16 @@
 
 #undef S_MUL_ADD
 static inline int S_MUL_ADD(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_madd(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef S_MUL_SUB
 static inline int S_MUL_SUB(int a, int b, int c, int d) {
-    int m;
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a), "r" ((int)b));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)c), "r" ((int)d));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m): "i" (15));
-    return m;
+    long long acc = __builtin_mips_mult(a, b);
+    acc = __builtin_mips_msub(acc, c, d);
+    return __builtin_mips_extr_w(acc, 15);
 }
 
 #undef C_MUL
@@ -58,13 +54,12 @@
 static inline kiss_fft_cpx C_MUL_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
     kiss_fft_cpx m;
 
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
-
+    long long acc1 = __builtin_mips_mult((int)a.r, (int)b.r);
+    long long acc2 = __builtin_mips_mult((int)a.r, (int)b.i);
+    acc1 = __builtin_mips_msub(acc1, (int)a.i, (int)b.i);
+    acc2 = __builtin_mips_madd(acc2, (int)a.i, (int)b.r);
+    m.r = __builtin_mips_extr_w(acc1, 15);
+    m.i = __builtin_mips_extr_w(acc2, 15);
     return m;
 }
 #undef C_MULC
@@ -72,13 +67,12 @@
 static inline kiss_fft_cpx C_MULC_fun(kiss_fft_cpx a, kiss_twiddle_cpx b) {
     kiss_fft_cpx m;
 
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.r));
-    asm volatile("madd $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.i));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.r): "i" (15));
-    asm volatile("MULT $ac1, %0, %1" : : "r" ((int)a.i), "r" ((int)b.r));
-    asm volatile("msub $ac1, %0, %1" : : "r" ((int)a.r), "r" ((int)b.i));
-    asm volatile("EXTR.W %0,$ac1, %1" : "=r" (m.i): "i" (15));
-
+    long long acc1 = __builtin_mips_mult((int)a.r, (int)b.r);
+    long long acc2 = __builtin_mips_mult((int)a.i, (int)b.r);
+    acc1 = __builtin_mips_madd(acc1, (int)a.i, (int)b.i);
+    acc2 = __builtin_mips_msub(acc2, (int)a.r, (int)b.i);
+    m.r = __builtin_mips_extr_w(acc1, 15);
+    m.i = __builtin_mips_extr_w(acc2, 15);
     return m;
 }
 
--- a/celt/mips/pitch_mipsr1.h
+++ b/celt/mips/pitch_mipsr1.h
@@ -39,26 +39,22 @@
       int N, opus_val32 *xy1, opus_val32 *xy2, int arch)
 {
    int j;
-   opus_val32 xy01=0;
-   opus_val32 xy02=0;
+   long long acc1 = 0;
+   long long acc2 = 0;
 
    (void)arch;
 
-   asm volatile("MULT $ac1, $0, $0");
-   asm volatile("MULT $ac2, $0, $0");
    /* Compute the norm of X+Y and X-Y as |X|^2 + |Y|^2 +/- sum(xy) */
-   for (j=0;j<N;j++)
+   for (j=0;j<N;j+=2)
    {
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
-      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
-      ++j;
-      asm volatile("MADD $ac1, %0, %1" : : "r" ((int)x[j]), "r" ((int)y01[j]));
-      asm volatile("MADD $ac2, %0, %1" : : "r" ((int)x[j]), "r" ((int)y02[j]));
+       acc1 = __builtin_mips_madd(acc1, (int)x[j],   (int)y01[j]);
+       acc2 = __builtin_mips_madd(acc2, (int)x[j],   (int)y02[j]);
+       acc1 = __builtin_mips_madd(acc1, (int)x[j+1], (int)y01[j+1]);
+       acc2 = __builtin_mips_madd(acc2, (int)x[j+1], (int)y02[j+1]);
    }
-   asm volatile ("mflo %0, $ac1": "=r"(xy01));
-   asm volatile ("mflo %0, $ac2": "=r"(xy02));
-   *xy1 = xy01;
-   *xy2 = xy02;
+
+   *xy1 = (opus_val32)acc1;
+   *xy2 = (opus_val32)acc2;
 }
 
 static inline void xcorr_kernel_mips(const opus_val16 * x,
--- a/celt/mips/vq_mipsr1.h
+++ b/celt/mips/vq_mipsr1.h
@@ -70,7 +70,8 @@
 #ifdef FIXED_POINT
    int k;
 #endif
-   opus_val32 E = EPSILON;
+   long long acc = EPSILON;
+   opus_val32 E;
    opus_val16 g;
    opus_val32 t;
    celt_norm *xptr = X;
@@ -78,26 +79,23 @@
 
    (void)arch;
 
-   asm volatile("mult $ac1, $0, $0");
-   asm volatile("MTLO %0, $ac1" : :"r" (E));
    /*if(N %4)
        printf("error");*/
    for (i=0;i<N-2;i+=2)
    {
       X0 = (int)*xptr++;
-      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
-
       X1 = (int)*xptr++;
-      asm volatile("MADD $ac1, %0, %1" : : "r" (X1), "r" (X1));
+      acc = __builtin_mips_madd(acc, X0, X0);
+      acc = __builtin_mips_madd(acc, X1, X1);
    }
 
    for (;i<N;i++)
    {
       X0 = (int)*xptr++;
-      asm volatile("MADD $ac1, %0, %1" : : "r" (X0), "r" (X0));
+      acc = __builtin_mips_madd(acc, X0, X0);
    }
 
-   asm volatile("MFLO %0, $ac1" : "=r" (E));
+   E = (opus_val32)acc;
 #ifdef FIXED_POINT
    k = celt_ilog2(E)>>1;
 #endif
--