shithub: flite

ref: 397265f5791291defa76487c2388e1898e2e433c
dir: /src/synth/cst_ssml.c/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                      Copyright (c) 2001-2011                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  June 2008                                        */
/*************************************************************************/
/*                                                                       */
/*  SSML support for flite ( http://www.w3.org/TR/speech-synthesis/ )    */
/*                                                                       */
/*  We don't use a full XML parser here for space and availability       */
/*  reasons, but this is adequate for SSML                               */
/*  This is based on some old SABLE support in flite that never got      */
/*  completed                                                            */
/*                                                                       */
/*  <ssml> </ssml>                                                       */
/*  <voice ...> </voice>                                                 */
/*     name or urls for voices                                           */
/*  <audio ...> </audio>                                                 */
/*  <!-- ... -->                                                         */
/*  <break .../>                                                         */
/*  <prosody ...> </prosody>  rate volume pitch range                    */
/*  <emphasis ...> </emphasis>                                           */
/*  <sub alias="World Wide Web Consortium">W3C</sub>                     */
/*  <phoneme ph="x x x"> </phoneme>                                      */
/*                                                                       */
/*  <...> ignore all others                                              */
/*                                                                       */
/*  Voice call backs (e.g. -pw and -ps) are not transferred when new     */
/*  voices are selected                                                  */
/*                                                                       */
/*************************************************************************/

#include "flite.h"
#include "cst_tokenstream.h"

/* Single-character symbols handed to set_charclasses() while reading
   ordinary text, i.e. outside a tag's attribute list */
static const char * const ssml_singlecharsymbols_general = "<>&/\";";
/* Single-character symbols used while ssml_get_attributes() is reading
   inside a tag; note '=' is included here and '<' and '&' are not */
static const char * const ssml_singlecharsymbols_inattr = "=>;/\"";

/* Set to 1 to have ssml_apply_tag() print each tag and its attributes */
#define SSML_DEBUG 0

/* Fetch the next token from TS via ts_get_quoted_token(), using '"' as */
/* the quote character and '\' as the escape character.  The returned   */
/* pointer is whatever ts_get_quoted_token() hands back, unmodified.    */
static const char *ts_get_quoted_remainder(cst_tokenstream *ts)
{
    return ts_get_quoted_token(ts,'"','\\');
}

/* Read the attribute list of a tag from TS, consuming tokens up to and  */
/* including the closing ">".                                            */
/*                                                                       */
/* The result is a newly created feature set the caller must delete:     */
/*   "_type"   "start" once any attribute has been seen, or "startend"   */
/*             if a "/" was seen (e.g. <break/>); left unset when the    */
/*             tag has no attributes at all                              */
/*   "_nameN"  attribute name, "_valN" its quoted value (when followed   */
/*             by "=").  N is 1 for volume, 2 for pitch, 3 for range,    */
/*             and 0 for every other attribute name, so only one         */
/*             "other" attribute per tag is retained.                    */
/*                                                                       */
/* Returns NULL (after printing a message to stderr) if the stream hits  */
/* EOF before the closing ">" is found.                                  */
/*                                                                       */
/* The stream is switched to the in-attribute character classes while    */
/* scanning and switched back to the general ones before returning, on   */
/* the EOF path as well as on the normal one.                            */
static cst_features *ssml_get_attributes(cst_tokenstream *ts)
{
    cst_features *a = new_features();
    const char *name, *val;
    const char *fnn, *vnn;

    set_charclasses(ts,
                    ts->p_whitespacesymbols,
                    ssml_singlecharsymbols_inattr,
                    ts->p_prepunctuationsymbols,
                    ts->p_postpunctuationsymbols);

    name = ts_get(ts);
    while (!cst_streq(">",name))
    {
        /* I want names and values to be const */
        fnn = "_name0";
        vnn = "_val0";
        /* Tags with more than one attribute need to have additional */
        /* attributes defined here. */
        if (cst_streq("volume", name))
        {
            fnn = "_name1"; vnn = "_val1";
        }
        else if (cst_streq("pitch", name))
        {
            fnn = "_name2"; vnn = "_val2";
        }
        else if (cst_streq("range", name))
        {
            fnn = "_name3"; vnn = "_val3";
        }

        if (cst_streq(name,"/"))
            feat_set_string(a,"_type","startend");
        else
        {
            feat_set_string(a,"_type","start");
            /* name is stored before the next ts_get() is issued */
            feat_set_string(a,fnn,name);
            if (cst_streq("=",ts_get(ts)))
            {
                val = ts_get_quoted_remainder(ts);
                feat_set_string(a,vnn,val);
            }
        }

        if (ts_eof(ts))
        {
            fprintf(stderr,"ssml: unexpected EOF\n");
            delete_features(a);
            a = NULL;
            break;  /* fall through so the char classes are restored */
        }
        name = ts_get(ts);
    }

    /* Back to the character classes used outside of tags */
    set_charclasses(ts,
                    ts->p_whitespacesymbols,
                    ssml_singlecharsymbols_general,
                    ts->p_prepunctuationsymbols,
                    ts->p_postpunctuationsymbols);

    return a;
}

/* Non-zero if ATTRIBUTES holds a value under KEY (e.g. "_val0"). */
static int ssml_attr_present(cst_features *attributes, const char *key)
{
    return get_param_string(attributes,key,NULL) != NULL;
}

/* Apply one parsed SSML tag.                                            */
/*                                                                       */
/*   tag         upper-cased element name; AUDIO, BREAK, PROSODY,        */
/*               PHONEME, SUB and VOICE are handled, all others ignored  */
/*   attributes  features describing the tag: "_type" ("start",          */
/*               "startend" or "end") plus "_nameN"/"_valN" pairs        */
/*   u           utterance under construction; only BREAK looks at it    */
/*               and it may be NULL                                      */
/*   word_feats  features the caller copies onto each following token;   */
/*               start tags add entries, end tags remove them            */
/*   feats       stream-wide state holding "current_voice" and           */
/*               "default_voice"                                         */
/*                                                                       */
/* Returns u, or NULL when the tag should end the current utterance      */
/* (AUDIO and VOICE tags).                                               */
/*                                                                       */
/* "_type" and the "_valN" entries are optional: a tag written with no   */
/* attributes, or an attribute written without ="value", leaves them     */
/* unset, so they are looked up with defaults / presence checks rather   */
/* than assumed to exist.                                                */
static cst_utterance *ssml_apply_tag(const char *tag,
                                     cst_features *attributes,
                                     cst_utterance *u,
                                     cst_features *word_feats,
                                     cst_features *feats)
{
    const char *type;
    const char *wavefilename;
    const char *vname;
    cst_voice *nvoice;
    cst_wave *wave;
    cst_item *t;
    cst_relation *r;
    float break_size;
    float rate;

#if SSML_DEBUG
    printf("SSML TAG %s\n",tag);
    cst_feat_print(stdout,attributes);
    printf("...\n");
#endif

    /* Empty string when the tag carried no attributes at all */
    type = get_param_string(attributes,"_type","");

    if (cst_streq("AUDIO",tag))
    {
        if (cst_streq("start",type) || cst_streq("startend",type))
        {
            wavefilename = get_param_string(attributes,"_val0",NULL);
            if (wavefilename != NULL)
            {
                wave = new_wave();
                if (cst_wave_load_riff(wave,wavefilename) == CST_OK_FORMAT)
                {
                    /* <audio ...> alt text </audio>: flag the enclosed */
                    /* tokens; <audio .../> has nothing enclosed        */
                    if (cst_streq("start",type))
                        feat_set_string(word_feats,"ssml_comment","1");
                    feat_set(word_feats,"ssml_play_audio",wave_val(wave));
                }
                else
                    delete_wave(wave);
            }
            return NULL; /* Cause eou */
        }
        else if (cst_streq("end",type))
        {
            feat_remove(word_feats,"ssml_comment");
            return NULL; /* Cause eou */
        }
    }
    else if (cst_streq("BREAK",tag))
    {
        /* Mark the most recent token, if there is one */
        if (u &&
            ((r = utt_relation(u,"Token")) != NULL) &&
            ((t = relation_tail(r)) != NULL))
        {
            item_set_string(t,"break","1");
            if (cst_streq("size",get_param_string(attributes,"_name0","")) &&
                ssml_attr_present(attributes,"_val0"))
            {
                break_size = feat_float(attributes,"_val0");
                item_set_float(t,"break_size",break_size);
            }
        }
    }
    else if (cst_streq("PROSODY",tag))
    {
        if (cst_streq("start",type))
        {
            /* Note SSML doesn't do stretch, it does reciprocal of stretch */
            if (cst_streq("rate",get_param_string(attributes,"_name0","")) &&
                ssml_attr_present(attributes,"_val0"))
            {
                rate = feat_float(attributes,"_val0");
                /* A zero or negative rate has no usable reciprocal; */
                /* leave the stretch untouched in that case          */
                if (rate > 0.0)
                    feat_set_float(word_feats,"local_duration_stretch",
                                   1.0/rate);
            }
            /* volume is stored in _name1; given as a percentage */
            if (cst_streq("volume",get_param_string(attributes,"_name1","")) &&
                ssml_attr_present(attributes,"_val1"))
                feat_set_float(word_feats,"local_gain",
                               feat_float(attributes,"_val1")/100.0);
            /* pitch is stored in _name2 */
            if (cst_streq("pitch",get_param_string(attributes,"_name2","")) &&
                ssml_attr_present(attributes,"_val2"))
                feat_set_float(word_feats,"local_f0_mean",
                               feat_float(attributes,"_val2"));
            /* range is stored in _name3 */
            if (cst_streq("range",get_param_string(attributes,"_name3","")) &&
                ssml_attr_present(attributes,"_val3"))
                feat_set_float(word_feats,"local_f0_range",
                               /* shift by + 1.0 to allow 0.0 to be passed. */
                               feat_float(attributes,"_val3") + 1.0);
        }
        else if (cst_streq("end",type))
        {
            feat_remove(word_feats,"local_duration_stretch");
            feat_remove(word_feats,"local_gain");
            feat_remove(word_feats,"local_f0_mean");
            feat_remove(word_feats,"local_f0_range");
        }
    }
    else if (cst_streq("PHONEME",tag))
    {
        if (cst_streq("start",type))
        {
            if (cst_streq("ph",get_param_string(attributes,"_name0","")) &&
                ssml_attr_present(attributes,"_val0"))
                feat_set_string(word_feats,"phones",
                                feat_string(attributes,"_val0"));
        }
        else if (cst_streq("end",type))
            feat_remove(word_feats,"phones");
    }
    else if (cst_streq("SUB",tag))
    {
        if (cst_streq("start",type))
        {
            if (cst_streq("alias",get_param_string(attributes,"_name0","")) &&
                ssml_attr_present(attributes,"_val0"))
                feat_set_string(word_feats,"ssml_alias",
                                feat_string(attributes,"_val0"));
        }
        else if (cst_streq("end",type))
            feat_remove(word_feats,"ssml_alias");
    }
    else if (cst_streq("VOICE",tag))
    {
        if (cst_streq("start",type))
        {
            vname = get_param_string(attributes,"_val0","");
            nvoice = flite_voice_select(vname);
            /* NOTE(review): assumes flite_voice_select() may return NULL */
            /* when the voice cannot be found or loaded.  The caller      */
            /* passes "current_voice" straight to synthesis, so keep the  */
            /* voice already in place rather than storing NULL.           */
            if (nvoice != NULL)
                feat_set(feats,"current_voice",userdata_val(nvoice));
            return NULL;  /* cause an utterance break */
        }
        else if (cst_streq("end",type))
        {
            /* Hmm we should really have a stack of these */
            nvoice =
                (cst_voice *)val_userdata(feat_val(feats,"default_voice"));
            feat_set(feats,"current_voice",userdata_val(nvoice));
            return NULL;
        }
    }

    /* Any other tag, or a recognised tag with an unrecognised type, */
    /* is ignored and the utterance carries on                       */
    return u;
}
			       
/* Synthesize everything readable from TS, interpreting SSML tags.       */
/*                                                                       */
/*   ts       token stream positioned at the start of the SSML text      */
/*   voice    initial voice; also recorded as "default_voice", which a   */
/*            </voice> end tag switches back to                          */
/*   outtype  "play", "none" or "stream"; anything else is used as the   */
/*            name of a wave file, which is first written out empty and  */
/*            is then passed to flite_process_output() per utterance     */
/*                                                                       */
/* Returns the sum of the values returned by flite_process_output().     */
/*                                                                       */
/* Tokens are gathered into an utterance until an end-of-utterance       */
/* condition is met (empty token, more than 500 tokens, a tag that       */
/* requested a break, or the voice's break function says so); the        */
/* utterance is then synthesized and a fresh one started.                */
static float flite_ssml_to_speech_ts(cst_tokenstream *ts,
                                     cst_voice *voice,
                                     const char *outtype)
{
    /* This is a very ugly function, that might be better written with gotos */
    /* This just doesn't seem to be properly functions -- perhaps a proper */
    /* consumer/producer threaded model might be better here -- but its */
    /* not clear.  There is so much have-to-be-done-now vs note-for-later */
    /* code, that the code is far from clear, and probably not right */
    cst_features *ssml_feats, *ssml_word_feats;
    cst_features *attributes;
    const char *token = "";
    char *tag=NULL;
    cst_utterance *utt;
    cst_relation *tokrel;
    int num_tokens;
    cst_breakfunc breakfunc = default_utt_break;
    cst_uttfunc utt_user_callback = 0;
    float durs = 0.0;
    cst_item *t;
    cst_voice *current_voice;
    int ssml_eou = 0;
    int is_end_tag;
    const cst_wave *wave;
    cst_wave *w;

    /* ssml_feats: stream-wide state; ssml_word_feats: copied to each token */
    ssml_feats = new_features();
    feat_set(ssml_feats,"current_voice",userdata_val(voice));
    feat_set(ssml_feats,"default_voice",userdata_val(voice));
    ssml_word_feats = new_features();
    set_charclasses(ts,
                    " \t\n\r",
                    ssml_singlecharsymbols_general,
                    get_param_string(voice->features,"text_prepunctuation",""),
                    get_param_string(voice->features,"text_postpunctuation","")
                    );

    if (feat_present(voice->features,"utt_break"))
        breakfunc = val_breakfunc(feat_val(voice->features,"utt_break"));

    if (feat_present(voice->features,"utt_user_callback"))
        utt_user_callback = val_uttfunc(feat_val(voice->features,"utt_user_callback"));

    /* If its a file to write to, create and save an empty wave file */
    /* as we are going to incrementally append to it                 */
    if (!cst_streq(outtype,"play") &&
        !cst_streq(outtype,"none") &&
        !cst_streq(outtype,"stream"))
    {
        w = new_wave();
        cst_wave_resize(w,0,1);
        cst_wave_set_sample_rate(w,16000);
        cst_wave_save_riff(w,outtype);  /* an empty wave */
        delete_wave(w);
    }

    num_tokens = 0;
    utt = new_utterance();

    tokrel = utt_relation_create(utt, "Token");
    while (!ts_eof(ts) || num_tokens > 0)
    {
        /* Re-read each time round: a VOICE tag may have changed it */
        current_voice =
            (cst_voice *)val_userdata(feat_val(ssml_feats,"current_voice"));
        /* printf("awb_debug prewhile %d %s\n",ssml_eou,token); */
        if (ssml_eou == 0)
            token = ts_get(ts);
        else
        {
            /* The previous tag forced an utterance break.  If the token */
            /* read after it is itself a "<", keep it so the tag loop    */
            /* below handles it; otherwise move on to the next token.    */
            if (!cst_streq("<",token))
                token = ts_get(ts);
            ssml_eou = 0;
        }
        while ((cst_streq("<",token)) && (ssml_eou == 0))
        {   /* A tag -- look ahead and process it to find out how to advance */
            tag = cst_upcase(ts_get(ts));
            /* printf("awb_debug tag is %s\n",tag); */
            is_end_tag = cst_streq("/",tag);
            if (is_end_tag) /* an end tag: the name is the next token */
            {
                cst_free(tag); tag=NULL;
                tag = cst_upcase(ts_get(ts));
            }
            attributes = ssml_get_attributes(ts);
            if (attributes == NULL)
            {
                /* The input ended inside the tag.  Drop the partial tag */
                /* and fall through with an empty token so that whatever */
                /* has been collected so far is still synthesized.       */
                cst_free(tag); tag=NULL;
                token = "";
                break;
            }
            if (is_end_tag)
                feat_set_string(attributes,"_type","end");
            token = ts_get(ts);  /* first token following the tag */
            if (ssml_apply_tag(tag,attributes,utt,ssml_word_feats,ssml_feats))
                ssml_eou = 0;
            else
                ssml_eou = 1;

            delete_features(attributes);
            cst_free(tag); tag=NULL;
        }

        if ((cst_strlen(token) == 0) ||
            (num_tokens > 500) ||  /* need an upper bound */
            (ssml_eou == 1) ||  /* ssml tag was utterance break */
            (relation_head(tokrel) &&
             breakfunc(ts,token,tokrel)))
        {
            /* An end of utt, so synthesize it */
            if (utt_user_callback)
                utt = (utt_user_callback)(utt);

            if (utt)
            {
                utt = flite_do_synth(utt,current_voice,utt_synth_tokens);
                /* NOTE(review): assumes flite_do_synth() can hand back */
                /* NULL on failure; its result is dereferenced just     */
                /* below, so stop here rather than touch a NULL pointer */
                if (utt == NULL)
                    break;
                if (feat_present(utt->features,"Interrupted"))
                {
                    delete_utterance(utt); utt = NULL;
                    break;
                }
                durs += flite_process_output(utt,outtype,TRUE);
                delete_utterance(utt); utt = NULL;
            }
            else
                break;

            if (ts_eof(ts)) break;

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;
        }

        if (feat_present(ssml_word_feats,"ssml_play_audio"))
        {
            /* An AUDIO tag loaded a waveform: output it as its own */
            /* utterance in place of the one being built            */
            wave = val_wave(feat_val(ssml_word_feats,"ssml_play_audio"));
            /* Should create an utterances with the waveform in it */
            /* Have to stream it if there is streaming */
            if (utt) delete_utterance(utt);
            utt = utt_synth_wave(copy_wave(wave),current_voice);
            if (utt_user_callback)
                utt = (utt_user_callback)(utt);
            /* The callback may have returned NULL, as allowed for above */
            if (utt)
            {
                durs += flite_process_output(utt,outtype,TRUE);
                delete_utterance(utt); utt = NULL;
            }

            utt = new_utterance();
            tokrel = utt_relation_create(utt, "Token");
            num_tokens = 0;

            feat_remove(ssml_word_feats,"ssml_play_audio");
        }
        else if (!cst_streq("<",token))
        {  /* wasn't an ssml tag */
            num_tokens++;

            t = relation_append(tokrel, NULL);
            item_set_string(t,"name",token);
            item_set_string(t,"whitespace",ts->whitespace);
            item_set_string(t,"prepunctuation",ts->prepunctuation);
            item_set_string(t,"punc",ts->postpunctuation);
            /* Mark it at the beginning of the token */
            item_set_int(t,"file_pos",
                 ts->file_pos-(1+ /* as we are already on the next char */
                               cst_strlen(token)+
                               cst_strlen(ts->prepunctuation)+
                               cst_strlen(ts->postpunctuation)));
            item_set_int(t,"line_number",ts->line_number);
            /* Attach whatever the enclosing tags have switched on */
            feat_copy_into(ssml_word_feats,item_feats(t));
        }
    }

    delete_utterance(utt);
    delete_features(ssml_feats);
    delete_features(ssml_word_feats);
    return durs;
}

/* Synthesize the SSML document stored in FILENAME with VOICE.           */
/*                                                                       */
/*   filename  file to read; opened with the voice's text_* character    */
/*             class features                                            */
/*   voice     voice to start with; if its "file_start_position"         */
/*             feature is greater than 0 the stream is moved to that     */
/*             position before reading                                   */
/*   outtype   passed through to flite_ssml_to_speech_ts(), which also   */
/*             creates the empty output wave file when this names a file */
/*                                                                       */
/* Returns the value from flite_ssml_to_speech_ts(), or 1 after          */
/* reporting an error if the file cannot be opened.                      */
float flite_ssml_file_to_speech(const char *filename,
                                cst_voice *voice,
                                const char *outtype)
{
    cst_tokenstream *ts;
    int fp;
    float d;

    ts = ts_open(filename,
                 get_param_string(voice->features,"text_whitespace",NULL),
                 get_param_string(voice->features,"text_singlecharsymbols",NULL),
                 get_param_string(voice->features,"text_prepunctuation",NULL),
                 get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
    {
        cst_errmsg("failed to open file \"%s\" for ssml reading\n",
                   filename);
        return 1;
    }

    fp = get_param_int(voice->features,"file_start_position",0);
    if (fp > 0)
        ts_set_stream_pos(ts,fp);

    /* No need to create the empty output wave here:               */
    /* flite_ssml_to_speech_ts() does that itself before it starts */
    d = flite_ssml_to_speech_ts(ts,voice,outtype);

    ts_close(ts);

    return d;
}

/* Synthesize the SSML document held in the string TEXT with VOICE.      */
/*                                                                       */
/*   text     SSML text; wrapped in a string token stream using the      */
/*            voice's text_* character class features                    */
/*   voice    voice to start with; if its "file_start_position"          */
/*            feature is greater than 0 the stream is moved to that      */
/*            position before reading                                    */
/*   outtype  passed through to flite_ssml_to_speech_ts(), which also    */
/*            creates the empty output wave file when this names a file  */
/*                                                                       */
/* Returns the value from flite_ssml_to_speech_ts(), or 1 if the token   */
/* stream cannot be created.                                             */
float flite_ssml_text_to_speech(const char *text,
                                cst_voice *voice,
                                const char *outtype)
{
    cst_tokenstream *ts;
    int fp;
    float d;

    ts = ts_open_string(text,
                 get_param_string(voice->features,"text_whitespace",NULL),
                 get_param_string(voice->features,"text_singlecharsymbols",NULL),
                 get_param_string(voice->features,"text_prepunctuation",NULL),
                 get_param_string(voice->features,"text_postpunctuation",NULL));
    if (ts == NULL)
        return 1;

    fp = get_param_int(voice->features,"file_start_position",0);
    if (fp > 0)
        ts_set_stream_pos(ts,fp);

    /* No need to create the empty output wave here:               */
    /* flite_ssml_to_speech_ts() does that itself before it starts */
    d = flite_ssml_to_speech_ts(ts,voice,outtype);

    ts_close(ts);

    return d;
}