
"""
WARP-Q: Quality Prediction For Generative Neural Speech Codecs

This is the WARP-Q version used in the ICASSP 2021 Paper:

W. A. Jassim, J. Skoglund, M. Chinen, and A. Hines, “WARP-Q: Quality prediction
for generative neural speech codecs,” paper accepted for presentation at the 2021 IEEE
International Conference on Acoustics, Speech and Signal Processing (ICASSP 2021).
Date of acceptance: 30 Jan 2021. Preprint: https://arxiv.org/pdf/2102.10449

Run using Python 3.x with these package dependencies installed in your virtual
environment (an example install command is given below):
    - pandas
    - librosa
    - numpy
    - pyvad
    - scikit-image (imported as skimage)
    - speechpy
    - soundfile
    - scipy (optional)
    - seaborn (optional, for plotting only)
    - multiprocessing (standard library, for parallel computing mode only)
    - joblib (optional, for parallel computing mode only)
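
For example, assuming pip is used (the optional packages can be added as needed):

    pip install pandas librosa numpy pyvad scikit-image speechpy soundfile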

Input:
    - The main_test function reads a csv file that contains paths of audio files.
    - The csv file consists of four columns (an example layout is shown below):
        - Ref_Wave: reference speech
        - Test_Wave: test speech
        - MOS: subjective score (optional, for plotting only)
        - Codec: type of speech codec for the test speech (optional, for plotting only)
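
    A hypothetical csv file might look like this (illustrative paths and
    values, not part of this repository):

        Ref_Wave,Test_Wave,MOS,Codec
        audio/ref_01.wav,audio/coded_01.wav,4.2,opus_6kbps
        audio/ref_02.wav,audio/coded_02.wav,3.1,amr_475bps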

Output:
    - The code will compute the WARP-Q quality scores between Ref_Wave and Test_Wave,
      and will store the obtained results in a new column of the same csv file.


Warning: While this code has been tested and commented, giving it invalid input
files may cause unexpected results that will not be caught by robust exception
handling or validation checks. It will simply fail or give you the wrong answer.

In this simple and basic demo, we compute WARP-Q scores for 8 speech samples only.
More data should be provided to obtain better score distributions.


(c) Dr Wissam Jassim
    University College Dublin
    wissam.a.jassim@gmail.com
    wissam.jassim@ucd.ie
    November 28, 2020

"""

# Load libraries
import librosa
import numpy as np
from pyvad import vad
from skimage.util.shape import view_as_windows
import speechpy
import soundfile as sf

################################ WARP-Q #######################################
def compute_WAPRQ(ref_path,test_path,sr=16000,n_mfcc=12,fmax=5000,patch_size=0.4,
                  sigma=np.array([[1,1],[3,2],[1,3]])):

    # Inputs:
    # ref_path: path of reference speech
    # test_path: path of degraded (coded) speech
    # sr: sampling frequency, Hz
    # n_mfcc: number of MFCCs
    # fmax: cutoff frequency, Hz
    # patch_size: size of each patch, seconds
    # sigma: DTW step-size conditions; each row [p, q] allows a step of p frames
    #        along the patch axis and q frames along the reference axis

    # Output:
    # WARP-Q quality score between ref_path and test_path (a normalised DTW
    # alignment cost; lower values indicate quality closer to the reference)


    ####################### Load speech files #################################
    # Load Ref Speech
    if ref_path[-4:] == '.wav':
        speech_Ref, sr_Ref = librosa.load(ref_path, sr=sr)
    elif ref_path[-4:] == '.SRC': # For ITU-T database files, if applicable
        speech_Ref, sr_Ref = sf.read(ref_path, format='RAW', channels=1, samplerate=16000,
                                     subtype='PCM_16', endian='LITTLE')
        if sr_Ref != sr:
            speech_Ref = librosa.resample(speech_Ref, orig_sr=sr_Ref, target_sr=sr)
            sr_Ref = sr
    else:
        raise ValueError("Unsupported reference file format: " + ref_path)

    # Load Coded Speech
    if test_path[-4:] == '.wav':
        speech_Coded, sr_Coded = librosa.load(test_path, sr=sr)
    elif test_path[-4:] == '.OUT': # For ITU-T database files, if applicable
        speech_Coded, sr_Coded = sf.read(test_path, format='RAW', channels=1, samplerate=16000,
                                         subtype='PCM_16', endian='LITTLE')
        if sr_Coded != sr:
            speech_Coded = librosa.resample(speech_Coded, orig_sr=sr_Coded, target_sr=sr)
            sr_Coded = sr
    else:
        raise ValueError("Unsupported test file format: " + test_path)

    if sr_Ref != sr_Coded:
        raise ValueError("Reference and degraded signals should have same sampling rate!")

    # Clip amplitudes to the range [-1, 1]; resampling (if applicable) can push
    # samples slightly outside this range. We experienced this issue with the
    # TCD-VOIP database only.
    speech_Ref = np.clip(speech_Ref, -1.0, 1.0)
    speech_Coded = np.clip(speech_Coded, -1.0, 1.0)

    ###########################################################################

    win_length = int(0.032*sr) # 32 ms frame
    hop_length = int(0.004*sr) # 4 ms hop (consecutive frames overlap by 28 ms)

    n_fft = 2*win_length
    lifter = 3

    # DTW Parameters
    Metric = 'euclidean'

    # VAD Parameters
    hop_size_vad = 30
    sr_vad = sr
    aggressive = 0

    # VAD for Ref speech
    vact1 = vad(speech_Ref, sr, fs_vad=sr_vad, hop_length=hop_size_vad, vad_mode=aggressive)
    speech_Ref_vad = speech_Ref[vact1==1]

    # VAD for Coded speech
    vact2 = vad(speech_Coded, sr, fs_vad=sr_vad, hop_length=hop_size_vad, vad_mode=aggressive)
    speech_Coded_vad = speech_Coded[vact2==1]
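
    # Note: this assumes pyvad's vad() returns a per-sample 0/1 activity mask of
    # the same length as the input signal, so the indexing above keeps only the
    # samples flagged as voiced.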

    # Compute MFCC features for the two signals

    mfcc_Ref = librosa.feature.mfcc(y=speech_Ref_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax,
                                    n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter)
    mfcc_Coded = librosa.feature.mfcc(y=speech_Coded_vad,sr=sr,n_mfcc=n_mfcc,fmax=fmax,
                                    n_fft=n_fft,win_length=win_length,hop_length=hop_length,lifter=lifter)

    # Feature normalisation using the CMVNW method (sliding-window cepstral mean
    # and variance normalisation over a 201-frame window)
    mfcc_Ref = speechpy.processing.cmvnw(mfcc_Ref.T,win_size=201,variance_normalization=True).T
    mfcc_Coded = speechpy.processing.cmvnw(mfcc_Coded.T,win_size=201,variance_normalization=True).T

    # Divide MFCC features of Coded speech into overlapping patches
    cols = int(patch_size/(hop_length/sr)) # patch length in frames
    window_shape = (np.size(mfcc_Ref,0), cols)
    step  = int(cols/2) # 50% patch overlap

    mfcc_Coded_patch = view_as_windows(mfcc_Coded, window_shape, step)
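
    # view_as_windows returns an array of shape (1, n_patches, n_mfcc, cols):
    # the window spans the full MFCC axis, so only the time axis is windowed,
    # producing one row of half-overlapping patches.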

    Acc = []
    band_rad = 0.25
    weights_mul = np.array([1, 1, 1])

    # Compute alignment cost between each patch and the Ref MFCC matrix
    for i in range(mfcc_Coded_patch.shape[1]):

        patch = mfcc_Coded_patch[0][i]

        D, P = librosa.sequence.dtw(X=patch, Y=mfcc_Ref, metric=Metric,
                                    step_sizes_sigma=sigma, weights_mul=weights_mul,
                                    band_rad=band_rad, subseq=True, backtrack=True)

        P_librosa = P[::-1, :]
        b_ast = P_librosa[-1, 1]
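
        # D[-1, b_ast] is the cumulative cost of the best subsequence alignment
        # ending at reference frame b_ast; dividing by the number of patch
        # frames (D.shape[0]) normalises the cost by patch length.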

        Acc.append(D[-1, b_ast] / D.shape[0])

    # Final score: the median of the normalised patch alignment costs
    return np.median(Acc).item()
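
################################ Example usage ################################
# A minimal driver sketch, not part of the original testsuite (which reads file
# pairs from a csv file via main_test): run this module directly with two paths.
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        print('usage: python warpq.py <ref.wav> <test.wav>')
        sys.exit(1)
    score = compute_WAPRQ(sys.argv[1], sys.argv[2])
    print('WARP-Q score: {:.4f}'.format(score))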