shithub: opus

Download patch

ref: 239d223d84c20d5c146ce6d5200b1b148ab85af6
parent: b93e4a149c1bb77e65f4e2c66249553d2a9c4428
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Mon Nov 20 21:56:04 EST 2023

Add run-time CPU detection (RTCD) for silk_inner_product_FLP()

--- a/silk/float/SigProc_FLP.h
+++ b/silk/float/SigProc_FLP.h
@@ -74,7 +74,8 @@
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 );
 
 opus_int silk_pitch_analysis_core_FLP(      /* O    Voicing estimate: 0 voiced, 1 unvoiced                      */
@@ -106,7 +107,8 @@
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 );
 
 /* multiply a vector by a constant */
@@ -132,7 +134,7 @@
 );
 
 #ifndef OVERRIDE_inner_product_FLP
-#define silk_inner_product_FLP(data1, data2, dataSize) silk_inner_product_FLP_c(data1, data2, dataSize)
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_c(data1, data2, dataSize))
 #endif
 
 
--- a/silk/float/autocorrelation_FLP.c
+++ b/silk/float/autocorrelation_FLP.c
@@ -37,7 +37,8 @@
     silk_float          *results,           /* O    result (length correlationCount)                            */
     const silk_float    *inputData,         /* I    input data to correlate                                     */
     opus_int            inputDataSize,      /* I    length of input                                             */
-    opus_int            correlationCount    /* I    number of correlation taps to compute                       */
+    opus_int            correlationCount,    /* I    number of correlation taps to compute                       */
+    int                 arch
 )
 {
     opus_int i;
@@ -47,6 +48,6 @@
     }
 
     for( i = 0; i < correlationCount; i++ ) {
-        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i );
+        results[ i ] =  (silk_float)silk_inner_product_FLP( inputData, inputData + i, inputDataSize - i, arch );
     }
 }
--- a/silk/float/burg_modified_FLP.c
+++ b/silk/float/burg_modified_FLP.c
@@ -42,7 +42,8 @@
     const silk_float    minInvGain,         /* I    minimum inverse prediction gain                             */
     const opus_int      subfr_length,       /* I    input signal subframe length (incl. D preceding samples)    */
     const opus_int      nb_subfr,           /* I    number of subframes stacked in x                            */
-    const opus_int      D                   /* I    order                                                       */
+    const opus_int      D,                  /* I    order                                                       */
+    int                 arch
 )
 {
     opus_int         k, n, s, reached_max_gain;
@@ -60,7 +61,7 @@
     for( s = 0; s < nb_subfr; s++ ) {
         x_ptr = x + s * subfr_length;
         for( n = 1; n < D + 1; n++ ) {
-            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n );
+            C_first_row[ n - 1 ] += silk_inner_product_FLP( x_ptr, x_ptr + n, subfr_length - n, arch );
         }
     }
     silk_memcpy( C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof( double ) );
--- a/silk/float/corrMatrix_FLP.c
+++ b/silk/float/corrMatrix_FLP.c
@@ -41,7 +41,8 @@
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 )
 {
     opus_int lag;
@@ -50,7 +51,7 @@
     ptr1 = &x[ Order - 1 ];                     /* Points to first sample of column 0 of X: X[:,0] */
     for( lag = 0; lag < Order; lag++ ) {
         /* Calculate X[:,lag]'*t */
-        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L );
+        Xt[ lag ] = (silk_float)silk_inner_product_FLP( ptr1, t, L, arch );
         ptr1--;                                 /* Next column of X */
     }
 }
@@ -60,7 +61,8 @@
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 )
 {
     opus_int j, lag;
@@ -79,7 +81,7 @@
     ptr2 = &x[ Order - 2 ];                     /* First sample of column 1 of X */
     for( lag = 1; lag < Order; lag++ ) {
         /* Calculate X[:,0]'*X[:,lag] */
-        energy = silk_inner_product_FLP( ptr1, ptr2, L );
+        energy = silk_inner_product_FLP( ptr1, ptr2, L, arch );
         matrix_ptr( XX, lag, 0, Order ) = ( silk_float )energy;
         matrix_ptr( XX, 0, lag, Order ) = ( silk_float )energy;
         /* Calculate X[:,j]'*X[:,j + lag] */
--- a/silk/float/find_LPC_FLP.c
+++ b/silk/float/find_LPC_FLP.c
@@ -38,7 +38,8 @@
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Inverse of max prediction gain              */
+    const silk_float                minInvGain,                         /* I    Inverse of max prediction gain              */
+    int                             arch
 )
 {
     opus_int    k, subfr_length;
@@ -56,12 +57,12 @@
     psEncC->indices.NLSFInterpCoef_Q2 = 4;
 
     /* Burg AR analysis for the full frame */
-    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder );
+    res_nrg = silk_burg_modified_FLP( a, x, minInvGain, subfr_length, psEncC->nb_subfr, psEncC->predictLPCOrder, arch );
 
     if( psEncC->useInterpolatedNLSFs && !psEncC->first_frame_after_reset && psEncC->nb_subfr == MAX_NB_SUBFR ) {
         /* Optimal solution for last 10 ms; subtract residual energy here, as that's easier than        */
         /* adding it to the residual energy of the first 10 ms in each iteration of the search below    */
-        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder );
+        res_nrg -= silk_burg_modified_FLP( a_tmp, x + ( MAX_NB_SUBFR / 2 ) * subfr_length, minInvGain, subfr_length, MAX_NB_SUBFR / 2, psEncC->predictLPCOrder, arch );
 
         /* Convert to NLSFs */
         silk_A2NLSF_FLP( NLSF_Q15, a_tmp, psEncC->predictLPCOrder );
--- a/silk/float/find_LTP_FLP.c
+++ b/silk/float/find_LTP_FLP.c
@@ -38,7 +38,8 @@
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[ MAX_NB_SUBFR ],                /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 )
 {
     opus_int   k;
@@ -50,8 +51,8 @@
     XX_ptr = XX;
     for( k = 0; k < nb_subfr; k++ ) {
         lag_ptr = r_ptr - ( lag[ k ] + LTP_ORDER / 2 );
-        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr );
-        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr );
+        silk_corrMatrix_FLP( lag_ptr, subfr_length, LTP_ORDER, XX_ptr, arch );
+        silk_corrVector_FLP( lag_ptr, r_ptr, subfr_length, LTP_ORDER, xX_ptr, arch );
         xx = ( silk_float )silk_energy_FLP( r_ptr, subfr_length + LTP_ORDER );
         temp = 1.0f / silk_max( xx, LTP_CORR_INV_MAX * 0.5f * ( XX_ptr[ 0 ] + XX_ptr[ 24 ] ) + 1.0f );
         silk_scale_vector_FLP( XX_ptr, temp, LTP_ORDER * LTP_ORDER );
--- a/silk/float/find_pitch_lags_FLP.c
+++ b/silk/float/find_pitch_lags_FLP.c
@@ -82,7 +82,7 @@
     silk_apply_sine_window_FLP( Wsig_ptr, x_buf_ptr, 2, psEnc->sCmn.la_pitch );
 
     /* Calculate autocorrelation sequence */
-    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1 );
+    silk_autocorrelation_FLP( auto_corr, Wsig, psEnc->sCmn.pitch_LPC_win_length, psEnc->sCmn.pitchEstimationLPCOrder + 1, arch );
 
     /* Add white noise, as a fraction of the energy */
     auto_corr[ 0 ] += auto_corr[ 0 ] * FIND_PITCH_WHITE_NOISE_FRACTION + 1;
--- a/silk/float/find_pred_coefs_FLP.c
+++ b/silk/float/find_pred_coefs_FLP.c
@@ -63,7 +63,7 @@
         celt_assert( psEnc->sCmn.ltp_mem_length - psEnc->sCmn.predictLPCOrder >= psEncCtrl->pitchL[ 0 ] + LTP_ORDER / 2 );
 
         /* LTP analysis */
-        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr );
+        silk_find_LTP_FLP( XXLTP, xXLTP, res_pitch, psEncCtrl->pitchL, psEnc->sCmn.subfr_length, psEnc->sCmn.nb_subfr, psEnc->sCmn.arch );
 
         /* Quantize LTP gain parameters */
         silk_quant_LTP_gains_FLP( psEncCtrl->LTPCoef, psEnc->sCmn.indices.LTPIndex, &psEnc->sCmn.indices.PERIndex,
@@ -102,7 +102,7 @@
     }
 
     /* LPC_in_pre contains the LTP-filtered input for voiced, and the unfiltered input for unvoiced */
-    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain );
+    silk_find_LPC_FLP( &psEnc->sCmn, NLSF_Q15, LPC_in_pre, minInvGain, psEnc->sCmn.arch );
 
     /* Quantize LSFs */
     silk_process_NLSFs_FLP( &psEnc->sCmn, psEncCtrl->PredCoef, NLSF_Q15, psEnc->sCmn.prev_NLSFq_Q15 );
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@@ -138,7 +138,8 @@
     silk_encoder_state              *psEncC,                            /* I/O  Encoder state                               */
     opus_int16                      NLSF_Q15[],                         /* O    NLSFs                                       */
     const silk_float                x[],                                /* I    Input signal                                */
-    const silk_float                minInvGain                          /* I    Prediction gain from LTP (dB)               */
+    const silk_float                minInvGain,                         /* I    Prediction gain from LTP (dB)               */
+    int                             arch
 );
 
 /* LTP analysis */
@@ -148,7 +149,8 @@
     const silk_float                r_ptr[],                            /* I    LPC residual                                */
     const opus_int                  lag[  MAX_NB_SUBFR ],               /* I    LTP lags                                    */
     const opus_int                  subfr_length,                       /* I    Subframe length                             */
-    const opus_int                  nb_subfr                            /* I    number of subframes                         */
+    const opus_int                  nb_subfr,                           /* I    number of subframes                         */
+    int                             arch
 );
 
 void silk_LTP_analysis_filter_FLP(
@@ -221,7 +223,8 @@
     const silk_float                *x,                                 /* I    x vector [ L+order-1 ] used to create X     */
     const opus_int                  L,                                  /* I    Length of vectors                           */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *XX                                 /* O    X'*X correlation matrix [order x order]     */
+    silk_float                      *XX,                                /* O    X'*X correlation matrix [order x order]     */
+    int                             arch
 );
 
 /* Calculates correlation vector X'*t */
@@ -230,7 +233,8 @@
     const silk_float                *t,                                 /* I    Target vector [L]                           */
     const opus_int                  L,                                  /* I    Length of vecors                            */
     const opus_int                  Order,                              /* I    Max lag for correlation                     */
-    silk_float                      *Xt                                 /* O    X'*t correlation vector [order]             */
+    silk_float                      *Xt,                                /* O    X'*t correlation vector [order]             */
+    int                             arch
 );
 
 /* Apply sine window to signal vector.  */
--- a/silk/float/noise_shape_analysis_FLP.c
+++ b/silk/float/noise_shape_analysis_FLP.c
@@ -255,7 +255,7 @@
                 psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder );
         } else {
             /* Calculate regular auto correlation */
-            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1 );
+            silk_autocorrelation_FLP( auto_corr, x_windowed, psEnc->sCmn.shapeWinLength, psEnc->sCmn.shapingLPCOrder + 1, psEnc->sCmn.arch );
         }
 
         /* Add white noise, as a fraction of energy */
--- a/silk/float/pitch_analysis_core_FLP.c
+++ b/silk/float/pitch_analysis_core_FLP.c
@@ -291,7 +291,7 @@
         for( j = 0; j < length_d_comp; j++ ) {
             d = d_comp[ j ];
             basis_ptr = target_ptr - d;
-            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz );
+            cross_corr = silk_inner_product_FLP( basis_ptr, target_ptr, sf_length_8kHz, arch );
             if( cross_corr > 0.0f ) {
                 energy = silk_energy_FLP( basis_ptr, sf_length_8kHz );
                 C[ k ][ d ] = (silk_float)( 2 * cross_corr / ( energy + energy_tmp ) );
--- a/silk/x86/main_sse.h
+++ b/silk/x86/main_sse.h
@@ -278,11 +278,18 @@
 #if defined (OPUS_X86_PRESUME_AVX2)
 
 #define OVERRIDE_inner_product_FLP
-#define silk_inner_product_FLP(data1, data2, dataSize) silk_inner_product_FLP_avx2(data1, data2, dataSize)
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,silk_inner_product_FLP_avx2(data1, data2, dataSize))
 
 #elif defined(OPUS_HAVE_RTCD) && defined(OPUS_X86_MAY_HAVE_AVX2)
 
-/*#define OVERRIDE_inner_product_FLP*/
+#define OVERRIDE_inner_product_FLP
+extern double (*const SILK_INNER_PRODUCT_FLP_IMPL[OPUS_ARCHMASK + 1])(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+);
+
+#define silk_inner_product_FLP(data1, data2, dataSize, arch) ((void)arch,(*SILK_INNER_PRODUCT_FLP_IMPL[(arch) & OPUS_ARCHMASK])(data1, data2, dataSize))
 
 #endif
 
--- a/silk/x86/x86_silk_map.c
+++ b/silk/x86/x86_silk_map.c
@@ -32,6 +32,7 @@
 #include "celt/x86/x86cpu.h"
 #include "structs.h"
 #include "SigProc_FIX.h"
+#include "SigProc_FLP.h"
 #include "pitch.h"
 #include "main.h"
 
@@ -156,4 +157,21 @@
 };
 
 #endif
+
+#ifndef FIXED_POINT
+
+double (*const SILK_INNER_PRODUCT_FLP_IMPL[ OPUS_ARCHMASK + 1 ] )(
+    const silk_float    *data1,
+    const silk_float    *data2,
+    opus_int            dataSize
+) = { /* indexed by ( arch & OPUS_ARCHMASK ) in the silk_inner_product_FLP() macro */
+  silk_inner_product_FLP_c,                  /* non-sse */
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c,
+  silk_inner_product_FLP_c, /* sse4.1 */
+  MAY_HAVE_AVX2( silk_inner_product_FLP )  /* avx2 */
+};
+
+#endif
+
 #endif
--