ref: 4f3761b0199df7024b6e6b2004fc5eb7a6dbb28b
dir: /dnn/torch/testsuite/utils/warpq.py/
""" WARP-Q: Quality Prediction For Generative Neural Speech Codecs This is the WARP-Q version used in the ICASSP 2021 Paper: W. A. Jassim, J. Skoglund, M. Chinen, and A. Hines, “WARP-Q: Quality prediction for generative neural speech codecs,” paper accepted for presentation at the 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021). Date of acceptance: 30 Jan 2021. Preprint: https://arxiv.org/pdf/2102.10449 Run using python 3.x and include these package dependencies in your virtual environment: - pandas - librosa - numpy - pyvad - skimage - speechpy - soundfile - scipy (optional) - seaborn (optional, for plotting only) - multiprocessing (optional, for parallel computing mode only) - joblib (optional, for parallel computing mode only) Input: - The main_test function calls a csv file that contains paths of audio files. - The csv file cosists of four columns: - Ref_Wave: reference speech - Test_Wave: test speech - MOS: subjective score (optinal, for plotting only) - Codec: type of speech codec for the test speech (optinal, for plotting only) Output: - Code will compute the WARP-Q quality scores between Ref_Wave and Test_Wave, and will store the obrained results in a new column in the same csv file. Releases: Warning: While this code has been tested and commented giving invalid input files may cause unexpected results and will not be caught by robust exception handling or validation checking. It will just fail or give you the wrong answer. In this simple and basic demo, we compute WARP-Q scores for 8 speech samples only. More data should should be provided to have better score distributions. (c) Dr Wissam Jassim University College Dublin wissam.a.jassim@gmail.com wissam.jassim@ucd.ie November 28, 2020 """ # Load libraries import librosa, librosa.core, librosa.display import numpy as np from pyvad import vad from skimage.util.shape import view_as_windows import speechpy import soundfile as sf ################################ WARP-Q ####################################### def compute_WAPRQ(ref_path,test_path,sr=16000,n_mfcc=12,fmax=5000,patch_size=0.4, sigma=np.array([[1,1],[3,2],[1,3]])): # Inputs: # refPath: path of reference speech # disPath: path pf degraded speech # sr: sampling frequency, Hz # n_mfcc: number of MFCCs # fmax: cutoff frequency # patch_size: size of each patch in s # sigma: step size conditon for DTW # Output: # WARP-Q quality score between refPath and disPath ####################### Load speech files ################################# # Load Ref Speech if ref_path[-4:] == '.wav': speech_Ref, sr_Ref = librosa.load(ref_path,sr=sr) else: if ref_path[-4:] == '.SRC': #For ITUT database if applicable speech_Ref, sr_Ref = sf.read(ref_path, format='RAW', channels=1, samplerate=16000, subtype='PCM_16', endian='LITTLE') if sr_Ref != sr: speech_Ref = librosa.resample(speech_Ref, sr_Ref, sr) sr_Ref = sr # Load Coded Speech if test_path[-4:] == '.wav': speech_Coded, sr_Coded = librosa.load(test_path,sr=sr) else: if test_path[-4:] == '.OUT': #For ITUT database if applicable speech_Coded, sr_Coded = sf.read(test_path, format='RAW', channels=1, samplerate=16000, subtype='PCM_16', endian='LITTLE') if sr_Coded != sr: speech_Coded = librosa.resample(speech_Coded, sr_Coded, sr) sr_Coded = sr if sr_Ref != sr_Coded: raise ValueError("Reference and degraded signals should have same sampling rate!") # Make sure amplitudes are in the range of [-1, 1] otherwise clipping to -1 to 1 # after resampling (if applicable). 
    # We experienced this issue with the TCD-VOIP database only.
    speech_Ref[speech_Ref > 1] = 1.0
    speech_Ref[speech_Ref < -1] = -1.0
    speech_Coded[speech_Coded > 1] = 1.0
    speech_Coded[speech_Coded < -1] = -1.0

    ###########################################################################

    win_length = int(0.032 * sr)  # 32 ms frame
    hop_length = int(0.004 * sr)  # 4 ms overlap
    # hop_length = int(0.016 * sr)
    n_fft = 2 * win_length
    lifter = 3

    # DTW Parameters
    Metric = 'euclidean'

    # VAD Parameters
    hop_size_vad = 30
    sr_vad = sr
    aggressive = 0

    # VAD for Ref speech
    vact1 = vad(speech_Ref, sr, fs_vad=sr_vad, hop_length=hop_size_vad,
                vad_mode=aggressive)
    speech_Ref_vad = speech_Ref[vact1 == 1]

    # VAD for Coded speech
    vact2 = vad(speech_Coded, sr, fs_vad=sr_vad, hop_length=hop_size_vad,
                vad_mode=aggressive)
    speech_Coded_vad = speech_Coded[vact2 == 1]

    # Compute MFCC features for the two signals
    mfcc_Ref = librosa.feature.mfcc(y=speech_Ref_vad, sr=sr, n_mfcc=n_mfcc, fmax=fmax,
                                    n_fft=n_fft, win_length=win_length,
                                    hop_length=hop_length, lifter=lifter)
    mfcc_Coded = librosa.feature.mfcc(y=speech_Coded_vad, sr=sr, n_mfcc=n_mfcc, fmax=fmax,
                                      n_fft=n_fft, win_length=win_length,
                                      hop_length=hop_length, lifter=lifter)

    # Feature Normalisation using CMVNW method
    mfcc_Ref = speechpy.processing.cmvnw(mfcc_Ref.T, win_size=201,
                                         variance_normalization=True).T
    mfcc_Coded = speechpy.processing.cmvnw(mfcc_Coded.T, win_size=201,
                                           variance_normalization=True).T

    # Divide MFCC features of Coded speech into patches
    cols = int(patch_size / (hop_length / sr))
    window_shape = (np.size(mfcc_Ref, 0), cols)
    step = int(cols / 2)
    mfcc_Coded_patch = view_as_windows(mfcc_Coded, window_shape, step)

    Acc = []
    band_rad = 0.25
    weights_mul = np.array([1, 1, 1])

    # Compute alignment cost between each patch and the Ref MFCC
    for i in range(mfcc_Coded_patch.shape[1]):
        patch = mfcc_Coded_patch[0][i]
        D, P = librosa.sequence.dtw(X=patch, Y=mfcc_Ref, metric=Metric,
                                    step_sizes_sigma=sigma,
                                    weights_mul=weights_mul, band_rad=band_rad,
                                    subseq=True, backtrack=True)
        P_librosa = P[::-1, :]
        b_ast = P_librosa[-1, 1]
        Acc.append(D[-1, b_ast] / D.shape[0])

    # Final score: median of the per-patch subsequence DTW alignment costs
    return np.median(Acc).item()
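

# Minimal usage sketch for this module. The wav paths below are hypothetical
# placeholders; substitute your own reference/degraded pair. Since the returned
# score is a median of normalized DTW alignment costs, lower values correspond
# to higher speech quality.
if __name__ == "__main__":
    # Both files are assumed to be speech recordings; they are loaded/resampled
    # to the default 16 kHz sampling rate inside compute_WAPRQ.
    score = compute_WAPRQ(ref_path='ref.wav', test_path='test.wav')
    print(f"WARP-Q score: {score:.4f}")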