ref: 60eb7d88b4eace91395e553cf70fc4578a950353
parent: 1eaa67c0dce1f642af872cc26b7ff4b057a55bfb
author: Linfeng Zhang <linfengz@google.com>
date: Thu Sep 1 09:44:11 EDT 2016
Update silk_biquad_alt() Split to silk_biquad_alt_stride1() and silk_biquad_alt_stride2(), so that it can be optimized more efficiently when stride is 2. This change in C code is bit exact with the origin. Change-Id: Idaefe670397016ace2a489e3435ac61b7dbe79d5 Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/silk/LP_variable_cutoff.c
+++ b/silk/LP_variable_cutoff.c
@@ -130,6 +130,6 @@
/* ARMA low-pass filtering */
silk_assert( TRANSITION_NB == 3 && TRANSITION_NA == 2 );
- silk_biquad_alt( frame, B_Q28, A_Q28, psLP->In_LP_State, frame, frame_length, 1);
+ silk_biquad_alt_stride1( frame, B_Q28, A_Q28, psLP->In_LP_State, frame, frame_length);
}
}
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -100,14 +100,22 @@
* slower than biquad() but uses more precise coefficients
* can handle (slowly) varying coefficients
*/
-void silk_biquad_alt(
+void silk_biquad_alt_stride1(
const opus_int16 *in, /* I input signal */
const opus_int32 *B_Q28, /* I MA coefficients [3] */
const opus_int32 *A_Q28, /* I AR coefficients [2] */
opus_int32 *S, /* I/O State vector [2] */
opus_int16 *out, /* O output signal */
- const opus_int32 len, /* I signal length (must be even) */
- opus_int stride /* I Operate on interleaved signal if > 1 */
+ const opus_int32 len /* I signal length (must be even) */
+);
+
+void silk_biquad_alt_stride2(
+ const opus_int16 *in, /* I input signal */
+ const opus_int32 *B_Q28, /* I MA coefficients [3] */
+ const opus_int32 *A_Q28, /* I AR coefficients [2] */
+ opus_int32 *S, /* I/O State vector [4] */
+ opus_int16 *out, /* O output signal */
+ const opus_int32 len /* I signal length (must be even) */
);
/* Variable order MA prediction error filter. */
--- a/silk/biquad_alt.c
+++ b/silk/biquad_alt.c
@@ -39,14 +39,13 @@
#include "SigProc_FIX.h"
/* Second order ARMA filter, alternative implementation */
-void silk_biquad_alt(
+void silk_biquad_alt_stride1(
const opus_int16 *in, /* I input signal */
const opus_int32 *B_Q28, /* I MA coefficients [3] */
const opus_int32 *A_Q28, /* I AR coefficients [2] */
opus_int32 *S, /* I/O State vector [2] */
opus_int16 *out, /* O output signal */
- const opus_int32 len, /* I signal length (must be even) */
- opus_int stride /* I Operate on interleaved signal if > 1 */
+ const opus_int32 len /* I signal length (must be even) */
)
{
/* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
@@ -61,7 +60,7 @@
for( k = 0; k < len; k++ ) {
/* S[ 0 ], S[ 1 ]: Q12 */
- inval = in[ k * stride ];
+ inval = in[ k ];
out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], inval ), 2 );
S[ 0 ] = S[1] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A0_L_Q28 ), 14 );
@@ -73,6 +72,50 @@
S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], inval );
/* Scale back to Q0 and saturate */
- out[ k * stride ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + (1<<14) - 1, 14 ) );
+ out[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + (1<<14) - 1, 14 ) );
+ }
+}
+
+void silk_biquad_alt_stride2(
+ const opus_int16 *in, /* I input signal */
+ const opus_int32 *B_Q28, /* I MA coefficients [3] */
+ const opus_int32 *A_Q28, /* I AR coefficients [2] */
+ opus_int32 *S, /* I/O State vector [4] */
+ opus_int16 *out, /* O output signal */
+ const opus_int32 len /* I signal length (must be even) */
+)
+{
+ /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
+ opus_int k;
+ opus_int32 A0_U_Q28, A0_L_Q28, A1_U_Q28, A1_L_Q28, out32_Q14[ 2 ];
+
+ /* Negate A_Q28 values and split in two parts */
+ A0_L_Q28 = ( -A_Q28[ 0 ] ) & 0x00003FFF; /* lower part */
+ A0_U_Q28 = silk_RSHIFT( -A_Q28[ 0 ], 14 ); /* upper part */
+ A1_L_Q28 = ( -A_Q28[ 1 ] ) & 0x00003FFF; /* lower part */
+ A1_U_Q28 = silk_RSHIFT( -A_Q28[ 1 ], 14 ); /* upper part */
+
+ for( k = 0; k < len; k++ ) {
+ /* S[ 0 ], S[ 1 ], S[ 2 ], S[ 3 ]: Q12 */
+ out32_Q14[ 0 ] = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ 2 * k + 0 ] ), 2 );
+ out32_Q14[ 1 ] = silk_LSHIFT( silk_SMLAWB( S[ 2 ], B_Q28[ 0 ], in[ 2 * k + 1 ] ), 2 );
+
+ S[ 0 ] = S[ 1 ] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 0 ], A0_L_Q28 ), 14 );
+ S[ 2 ] = S[ 3 ] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 1 ], A0_L_Q28 ), 14 );
+ S[ 0 ] = silk_SMLAWB( S[ 0 ], out32_Q14[ 0 ], A0_U_Q28 );
+ S[ 2 ] = silk_SMLAWB( S[ 2 ], out32_Q14[ 1 ], A0_U_Q28 );
+ S[ 0 ] = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], in[ 2 * k + 0 ] );
+ S[ 2 ] = silk_SMLAWB( S[ 2 ], B_Q28[ 1 ], in[ 2 * k + 1 ] );
+
+ S[ 1 ] = silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 0 ], A1_L_Q28 ), 14 );
+ S[ 3 ] = silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14[ 1 ], A1_L_Q28 ), 14 );
+ S[ 1 ] = silk_SMLAWB( S[ 1 ], out32_Q14[ 0 ], A1_U_Q28 );
+ S[ 3 ] = silk_SMLAWB( S[ 3 ], out32_Q14[ 1 ], A1_U_Q28 );
+ S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], in[ 2 * k + 0 ] );
+ S[ 3 ] = silk_SMLAWB( S[ 3 ], B_Q28[ 2 ], in[ 2 * k + 1 ] );
+
+ /* Scale back to Q0 and saturate */
+ out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 0 ] + (1<<14) - 1, 14 ) );
+ out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 1 ] + (1<<14) - 1, 14 ) );
}
}
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -364,9 +364,10 @@
A_Q28[ 1 ] = silk_SMULWW( r_Q22, r_Q22 );
#ifdef FIXED_POINT
- silk_biquad_alt( in, B_Q28, A_Q28, hp_mem, out, len, channels );
- if( channels == 2 ) {
- silk_biquad_alt( in+1, B_Q28, A_Q28, hp_mem+2, out+1, len, channels );
+ if( channels == 1 ) {
+ silk_biquad_alt_stride1( in, B_Q28, A_Q28, hp_mem, out, len );
+ } else {
+ silk_biquad_alt_stride2( in, B_Q28, A_Q28, hp_mem, out, len );
}
#else
silk_biquad_float( in, B_Q28, A_Q28, hp_mem, out, len, channels );