ref: 51a1a378632b453d050c9be3afcf75fd2baa198c
dir: /lang/cmu_indic_lex/cmu_indic_lex.c/
/*************************************************************************/
/* */
/* Language Technologies Institute */
/* Carnegie Mellon University */
/* Copyright (c) 2013 */
/* All Rights Reserved. */
/* */
/* Permission is hereby granted, free of charge, to use and distribute */
/* this software and its documentation without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of this work, and to */
/* permit persons to whom this work is furnished to do so, subject to */
/* the following conditions: */
/* 1. The code must retain the above copyright notice, this list of */
/* conditions and the following disclaimer. */
/* 2. Any modifications must be clearly marked as such. */
/* 3. Original authors' names are not deleted. */
/* 4. The authors' names are not used to endorse or promote products */
/* derived from this software without specific prior written */
/* permission. */
/* */
/* CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK */
/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
/* SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE */
/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
/* THIS SOFTWARE. */
/* */
/*************************************************************************/
/* */
/* indic Lexical function */
/* */
/*************************************************************************/
#include "flite.h"
#include "cst_val.h"
#include "cst_voice.h"
#include "cst_lexicon.h"
#include "cst_ffeatures.h"
#include "cmu_indic_lex.h"
#include "cst_tokenstream.h"
#include "../cmu_indic_lang/cmu_indic_lang.h"
extern cst_lexicon cmu_lex;
int utf8_sequence_length(char c0);
/* Character class and default phoneme string for each position inside */
/* a 128-code-point script block.  The table is indexed by the value   */
/* returned from cmu_indic_lex_ord_to_offset() (code point minus the   */
/* base of its block); the hex number in the comment at the start of   */
/* each row is that index.  A phoneme string may hold more than one    */
/* phone separated by spaces (e.g. "A u m"); such strings are split    */
/* into individual phones in cmu_indic_lex_ord_to_phones().            */
static const struct cmu_indic_char cmu_indic_offset_char[128] = {
/*000*/ {IND_IGNORE, ""},
/*001*/ {IND_ANUSWAAR, "nX"},
/*002*/ {IND_ANUSWAAR, "nX"},
/*003*/ {IND_VISARGA, "h"},
/*004*/ {IND_INDEPENDENT_VOWEL, "A"},
/*005*/ {IND_INDEPENDENT_VOWEL, "A"},
/*006*/ {IND_INDEPENDENT_VOWEL, "A:"},
/*007*/ {IND_INDEPENDENT_VOWEL, "i"},
/*008*/ {IND_INDEPENDENT_VOWEL, "i:"},
/*009*/ {IND_INDEPENDENT_VOWEL, "u"},
/*00A*/ {IND_INDEPENDENT_VOWEL, "u:"},
/*00B*/ {IND_INDEPENDENT_VOWEL, "9r="},
/*00C*/ {IND_INDEPENDENT_VOWEL, "l="},
/*00D*/ {IND_INDEPENDENT_VOWEL, "ay"},
/*00E*/ {IND_INDEPENDENT_VOWEL, "e"},
/*00F*/ {IND_INDEPENDENT_VOWEL, "e"},
/*010*/ {IND_INDEPENDENT_VOWEL, "aI"},
/*011*/ {IND_INDEPENDENT_VOWEL, "ow"},
/*012*/ {IND_INDEPENDENT_VOWEL, "ow"},
/*013*/ {IND_INDEPENDENT_VOWEL, "o"},
/*014*/ {IND_INDEPENDENT_VOWEL, "aU"},
/*015*/ {IND_CONSONANT, "k"},
/*016*/ {IND_CONSONANT, "kh"},
/*017*/ {IND_CONSONANT, "g"},
/*018*/ {IND_CONSONANT, "gh"},
/*019*/ {IND_CONSONANT, "N"},
/*01A*/ {IND_CONSONANT, "c"},
/*01B*/ {IND_CONSONANT, "ch"},
/*01C*/ {IND_CONSONANT, "J"},
/*01D*/ {IND_CONSONANT, "Jh"},
/*01E*/ {IND_CONSONANT, "n~"},
/*01F*/ {IND_CONSONANT, "tr"},
/*020*/ {IND_CONSONANT, "tR"},
/*021*/ {IND_CONSONANT, "dr"},
/*022*/ {IND_CONSONANT, "dR"},
/*023*/ {IND_CONSONANT, "nr"},
/*024*/ {IND_CONSONANT, "tB"},
/*025*/ {IND_CONSONANT, "tBh"},
/*026*/ {IND_CONSONANT, "dB"},
/*027*/ {IND_CONSONANT, "dBh"},
/*028*/ {IND_CONSONANT, "nB"},
/*029*/ {IND_CONSONANT, "nB"},
/*02A*/ {IND_CONSONANT, "p"},
/*02B*/ {IND_CONSONANT, "ph"},
/*02C*/ {IND_CONSONANT, "b"},
/*02D*/ {IND_CONSONANT, "bh"},
/*02E*/ {IND_CONSONANT, "m"},
/*02F*/ {IND_CONSONANT, "j"},
/*030*/ {IND_CONSONANT, "9r"},
/*031*/ {IND_CONSONANT, "9r"},
/*032*/ {IND_CONSONANT, "l"},
/*033*/ {IND_CONSONANT, "lr"},
/*034*/ {IND_CONSONANT, "lr"},
/*035*/ {IND_CONSONANT, "v"},
/*036*/ {IND_CONSONANT, "c}"},
/*037*/ {IND_CONSONANT, "sr"},
/*038*/ {IND_CONSONANT, "s"},
/*039*/ {IND_CONSONANT, "hv"},
/*03A*/ {IND_VOWEL, "e"},
/*03B*/ {IND_VOWEL, "e"},
/*03C*/ {IND_NUKTA, ""},
/*03D*/ {IND_AVAGRAHA, ""},
/*03E*/ {IND_VOWEL, "A:"},
/*03F*/ {IND_VOWEL, "i"},
/*040*/ {IND_VOWEL, "i:"},
/*041*/ {IND_VOWEL, "u"},
/*042*/ {IND_VOWEL, "u:"},
/*043*/ {IND_VOWEL, "9r="},
/*044*/ {IND_VOWEL, "9r="},
/*045*/ {IND_VOWEL, "ay"},
/*046*/ {IND_VOWEL, "e"},
/*047*/ {IND_VOWEL, "e"},
/*048*/ {IND_VOWEL, "aI"},
/*049*/ {IND_VOWEL, "ow"},
/*04A*/ {IND_VOWEL, "o"},
/*04B*/ {IND_VOWEL, "o"},
/*04C*/ {IND_VOWEL, "aU"},
/*04D*/ {IND_HALANT, ""},
/*04E*/ {IND_IGNORE, ""},
/*04F*/ {IND_VOWEL, "ow"},
/*050*/ {IND_INDEPENDENT_VOWEL, "A u m"},
/*051*/ {IND_IGNORE, ""},
/*052*/ {IND_IGNORE, ""},
/*053*/ {IND_IGNORE, ""},
/*054*/ {IND_IGNORE, ""},
/*055*/ {IND_VOWEL, "e"},
/*056*/ {IND_VOWEL, "u e"},
/*057*/ {IND_VOWEL, "u: e"},
/*058*/ {IND_CONSONANT, "q"},
/*059*/ {IND_CONSONANT, "x"},
/*05A*/ {IND_CONSONANT, "G"},
/*05B*/ {IND_CONSONANT, "z"},
/*05C*/ {IND_CONSONANT, "rr"},
/*05D*/ {IND_CONSONANT, "rrh"},
/*05E*/ {IND_CONSONANT, "f"},
/*05F*/ {IND_CONSONANT, "j"},
/*060*/ {IND_INDEPENDENT_VOWEL, "9r="},
/*061*/ {IND_INDEPENDENT_VOWEL, "lr="},
/*062*/ {IND_VOWEL, "lr="},
/*063*/ {IND_VOWEL, "lr="},
/*064*/ {IND_PUNC, ""},
/*065*/ {IND_PUNC, ""},
/*066*/ {IND_DIGIT, ""},
/*067*/ {IND_DIGIT, ""},
/*068*/ {IND_DIGIT, ""},
/*069*/ {IND_DIGIT, ""},
/*06A*/ {IND_DIGIT, ""},
/*06B*/ {IND_DIGIT, ""},
/*06C*/ {IND_DIGIT, ""},
/*06D*/ {IND_DIGIT, ""},
/*06E*/ {IND_DIGIT, ""},
/*06F*/ {IND_DIGIT, ""},
/*070*/ {IND_ANUSWAAR, "nX"},
/*071*/ {IND_ADDAK, ""},
/*072*/ {IND_INDEPENDENT_VOWEL, "ay"},
/*073*/ {IND_INDEPENDENT_VOWEL, "u e"},
/*074*/ {IND_INDEPENDENT_VOWEL, "u: e"},
/*075*/ {IND_INDEPENDENT_VOWEL, "ow"},
/*076*/ {IND_INDEPENDENT_VOWEL, "u e"},
/*077*/ {IND_INDEPENDENT_VOWEL, "u: e"},
/*078*/ {IND_IGNORE, ""},
/*079*/ {IND_CONSONANT, "z"},
/*07A*/ {IND_CONSONANT, "j"},
/*07B*/ {IND_CONSONANT, "G"},
/*07C*/ {IND_CONSONANT, "z"},
/*07D*/ {IND_CONSONANT, ""},
/*07E*/ {IND_CONSONANT, "dr"},
/*07F*/ {IND_CONSONANT, "b"},
};
static void replace_car(const cst_val *it,const cst_val *newcar)
{
/* Destructively replace it's car with newcar, deleting old car */
/* This does this by casting out of consts -- something you shouldn't do */
cst_val *tbd;
tbd = (cst_val *)val_car(it);
set_car((cst_val *)it,newcar);
delete_val(tbd);
}
static void replace_cdr(const cst_val *it,const cst_val *newcdr)
{
/* Destructively replace it's car with newcar, deleting old cdr */
/* This does this by casting out of consts -- something you shouldn't do */
cst_val *tbd;
tbd = (cst_val *)val_cdr(it);
set_cdr((cst_val *)it,newcdr);
set_cdr((cst_val *)tbd,NULL);
delete_val(tbd);
}
static int cmu_indic_is_vowel(const char *p)
{
    /* Return non-zero when phone name P denotes a vowel, zero otherwise. */
    /* The decision is made on the first character only: this happens to  */
    /* work for the vowels in the indic phone set.                        */
    /* A NULL or empty name is never a vowel.  The explicit empty-string  */
    /* check matters: strchr() treats the terminating NUL of the search   */
    /* set as part of the string, so looking up '\0' would "succeed".     */
    return (p != NULL) &&
           (p[0] != '\0') &&
           (strchr("aeiouAEIOU",p[0]) != NULL);
}
static int cmu_indic_lex_ord_to_offset(const int indic_ord) {
    /* Convert a Unicode code point into its position (0..127) inside  */
    /* its 128-code-point script block.  The supported blocks are the  */
    /* nine consecutive ones whose bases are 0x0900, 0x0980, 0x0A00,   */
    /* 0x0A80, 0x0B00, 0x0B80, 0x0C00, 0x0C80 and 0x0D00, i.e. every   */
    /* code point from 0x0900 to 0x0D7F inclusive.                     */
    /* For any other code point an error message is emitted and 0 is   */
    /* returned.                                                       */
    if ((indic_ord < 0x0900) || (indic_ord > 0x0D7F)) {
        cst_errmsg("Indic language can not process character 0x%x\n", indic_ord);
        return 0;
    }
    /* Every block base is a multiple of 0x80, so subtracting the base */
    /* is the same as keeping the low seven bits.                      */
    return indic_ord & 0x7F;
}
static int cmu_indic_get_char_type(const cst_val *indic_char)
{
int c;
if (!indic_char) return IND_IGNORE;
c = val_int(indic_char);
if ((c < 0x0900) || (c > 0x0D7F))
return IND_IGNORE;
c = cmu_indic_lex_ord_to_offset(c);
return cmu_indic_offset_char[c].type;
}
static const char *cmu_indic_get_char_phoneme(const cst_val *indic_char)
{
int c;
if (!indic_char) return "";
c = val_int(indic_char);
/* Language-specific exceptions to the g2p mapping, like the
length distinction b/w e and e: */
/* e/e: & o/o: distinction: */
/* Kannada */
if ((c == 0x0C8F)||(c == 0x0CC7)) return "e:";
if (c == 0x0C92) return "o";
if ((c == 0x0C93)||(c == 0x0CCB)) return "o:";
/* Malayalam */
if ((c == 0x0D0F)||(c == 0x0D47)) return "e:";
if (c == 0x0D12) return "o";
if ((c == 0x0D13)||(c == 0x0D4B)) return "o:";
if (c == 0x0D34) return "zr"; /* Retroflex approximant */
if (c == 0x0D31) return "rr"; /* Retroflex flap */
/* Punjabi */
if (c == 0x0A33) return "l";
/* Tamil */
if ((c == 0x0B8F)||(c == 0x0BC7)) return "e:";
if (c == 0x0B92) return "o";
if ((c == 0x0B93)||(c == 0x0BCB)) return "o:";
if (c == 0x0BA9) return "n"; /* Tamil alveolar nasal */
if (c == 0x0BB1) return "rr";
if (c == 0x0BB4) return "zr";
/* Telugu */
if ((c == 0x0C0F)||(c == 0x0C47)) return "e:";
if (c == 0x0C12) return "o";
if ((c == 0x0C13)||(c == 0x0C4B)) return "o:";
/* Not a special case */
c = cmu_indic_lex_ord_to_offset(c);
/* printf("awb_debug get_char_phone %x %s\n",c,cmu_indic_offset_char[c].phoneme);*/
return cmu_indic_offset_char[c].phoneme;
}
static cst_val *cmu_indic_lex_remove_ignore_chars(const cst_val *indic_ords) {
cst_val *output = 0;
const cst_val *v;
for(v=indic_ords; v; v=val_cdr(v)) {
if (cmu_indic_get_char_type(val_car(v)) == IND_IGNORE)
continue;
output = cons_val(val_car(v), output);
}
return val_reverse(output);
}
static cst_val *cmu_indic_lex_map_nukta_chars(const cst_val *indic_ords) {
    /* Build a new list from INDIC_ORDS in which every (character,     */
    /* nukta) pair is collapsed into a single item.  When the pair has */
    /* a precomposed code point listed below, that code point is used; */
    /* otherwise the base character is kept and the nukta is dropped.  */
    /* Items not followed by a nukta are copied unchanged.  The input  */
    /* list itself is not modified.                                    */
    cst_val *output = 0;
    const cst_val *v;
    int i;
    int mapped_val;

    for(v=indic_ords; v; v=val_cdr(v)) {
        i = val_int(val_car(v));
        if (!(val_cdr(v) &&
              cmu_indic_get_char_type(val_car(val_cdr(v))) == IND_NUKTA)) {
            /* Copy current item into output */
            output = cons_val(val_car(v), output);
            continue;
        }

        switch (i) {
            /* Devanagari */
        case 2325: mapped_val=2392; break; /* U+0915 -> U+0958 */
        case 2326: mapped_val=2393; break; /* U+0916 -> U+0959 */
        case 2327: mapped_val=2394; break; /* U+0917 -> U+095A */
        case 2332: mapped_val=2395; break; /* U+091C -> U+095B */
        case 2337: mapped_val=2396; break; /* U+0921 -> U+095C */
        case 2338: mapped_val=2397; break; /* U+0922 -> U+095D */
        case 2347: mapped_val=2398; break; /* U+092B -> U+095E */
        case 2351: mapped_val=2399; break; /* U+092F -> U+095F */
            /* Bengali */
        case 2465: mapped_val=2524; break; /* U+09A1 -> U+09DC */
            /* Was 2566 (U+0A06, a Gurmukhi vowel), which can never   */
            /* be the base of U+09DD; the Bengali base is U+09A2.     */
        case 2466: mapped_val=2525; break; /* U+09A2 -> U+09DD */
        case 2479: mapped_val=2527; break; /* U+09AF -> U+09DF */
            /* Tamil */
        case 2962: mapped_val=2964; break; /* U+0B92 -> U+0B94 */
            /* Gurmukhi */
        case 2582: mapped_val=2649; break; /* U+0A16 -> U+0A59 */
        case 2583: mapped_val=2650; break; /* U+0A17 -> U+0A5A */
        case 2588: mapped_val=2651; break; /* U+0A1C -> U+0A5B */
        case 2603: mapped_val=2654; break; /* U+0A2B -> U+0A5E */
            /* Was mapped to itself (2610); the precomposed form of   */
            /* U+0A32 + nukta is U+0A33.                              */
        case 2610: mapped_val=2611; break; /* U+0A32 -> U+0A33 */
        case 2616: mapped_val=2614; break; /* U+0A38 -> U+0A36 */
        default:
            mapped_val=i;
        }
        output = cons_val(int_val(mapped_val), output);
        v = val_cdr(v);   /* the nukta has been consumed as well */
    }
    return val_reverse(output);
}
/* Convert a list of code points (ORDS) into a list of phone names.    */
/*   ords   - list of integer code points; not modified.               */
/*   cmu_indic_variant_deletes_word_final_schwa - when non-zero, no    */
/*            schwa ("A") is added after a word-final consonant,       */
/*            except in one-character words.                           */
/*   feats  - the "variant" string is read from here (default "none"); */
/*            "tam" selects the Tamil treatment of visarga.            */
/* Returns a newly built list of phone strings, or NULL (0) when       */
/* nothing is left after the ignorable characters are filtered out.    */
/* The intermediate lists built here are freed before returning.       */
/* NOTE(review): out_phone_strings is built by prepending, the         */
/* expansion loop at the end prepends again, and the result is then    */
/* passed through val_reverse().  Please confirm that the resulting    */
/* order (also inside multi-phone expansions such as "A u m") is what  */
/* the callers expect.                                                 */
cst_val *cmu_indic_lex_ord_to_phones(const cst_val *ords,
int cmu_indic_variant_deletes_word_final_schwa,
const cst_features *feats)
{
cst_val *in_ords = 0;
cst_val *t_ords = 0;
cst_val *out_phone_strings = 0;
cst_val *out_phones = 0;
const cst_val *prev_char = 0;
const cst_val *cur_char = 0;
const cst_val *next_char = 0;
const cst_val *remainder = 0;
const char *indic_variant = 0;
int cur_char_type = 0;
int next_char_type = 0;
int prev_char_type = 0;
indic_variant = get_param_string(feats, "variant", "none");
/* Ignore chars (filter) */
in_ords = cmu_indic_lex_remove_ignore_chars(ords);
if (!in_ords) {
return out_phones;
}
/* Map Nukta Chars (filter) */
t_ords = cmu_indic_lex_map_nukta_chars(in_ords);
delete_val(in_ords);
in_ords = t_ords;
/* Prime a three-item window (prev, cur, next) over the filtered list; */
/* remainder points at whatever follows next_char. */
prev_char = 0;
cur_char = val_car(in_ords);
if (val_cdr(in_ords)) {
next_char = val_car(val_cdr(in_ords));
remainder = val_cdr(val_cdr(in_ords));
}
else {
next_char = 0;
remainder = 0;
}
while (cur_char) {
/* printf("awb_debug out_phone_strings "); val_print(stdout,out_phone_strings); printf("\n"); */
cur_char_type = cmu_indic_get_char_type(cur_char);
if (prev_char)
prev_char_type = cmu_indic_get_char_type(prev_char);
if (next_char)
next_char_type = cmu_indic_get_char_type(next_char);
else
next_char_type = IND_IGNORE;
if (cur_char_type == IND_CONSONANT) {
/* Add consonant to the output list */
/* This part needs to be incorporated into Festvox code! */
/* In Tamil, if visarga comes right before p or J, it is not */
/* realized as "h" but instead maps p to f, or J to z. This */
/* takes care of that. */
if ((prev_char) &&
((prev_char_type == IND_VISARGA) &&
(((cst_streq("J", cmu_indic_get_char_phoneme(cur_char))) ||
(cst_streq("p", cmu_indic_get_char_phoneme(cur_char)))) &&
(cst_streq(indic_variant,"tam"))))) {
/* Don't add current character to out_phones; */
/* the correct mapped character is already added when */
/* cur_char is visarga */
} else {
out_phone_strings =
cons_val(string_val(cmu_indic_get_char_phoneme(cur_char)),
out_phone_strings);
/* If a consonant is followed by a combination vowel, a */
/* halant, a punctuation then don't add a schwa after */
/* it. Otherwise, insert a schwa. For end-of-word, check */
/* whether we should insert schwa in this language. */
if (!next_char) { /* We are in last char. Add schwa? */
if ((!prev_char) || /* Always add schwa for one-char words */
(!cmu_indic_variant_deletes_word_final_schwa)) {
out_phone_strings = cons_val(string_val("A"), out_phone_strings);
} else {
/* Schwa deletion should probably happen depending */
/* on whether there is a consonant cluster or not, */
/* at the end. But Adding that rule here seems to */
/* not have worked properly. Hence, we always */
/* delete the final schwa. */
}
} else { /* Not a final char */
if ( (next_char_type != IND_VOWEL) &&
(next_char_type != IND_PUNC) &&
(next_char_type != IND_HALANT) &&
(next_char_type != IND_IGNORE)) {
out_phone_strings = cons_val(string_val("A"), out_phone_strings);
}
}
}
} else if ((cur_char_type == IND_VOWEL) ||
(cur_char_type == IND_INDEPENDENT_VOWEL) ||
(cur_char_type == IND_DIGIT)) {
/* Add whatever the pronunciation is */
out_phone_strings =
cons_val(string_val(cmu_indic_get_char_phoneme(cur_char)),
out_phone_strings);
} else if (cur_char_type == IND_HALANT) {
/* Ignore */
} else if (cur_char_type == IND_AVAGRAHA) {
/* Lengthen previous vowel */
/* (done by emitting the previous dependent vowel's phoneme again) */
if (cmu_indic_get_char_type(prev_char) == IND_VOWEL) {
out_phone_strings =
cons_val(string_val(cmu_indic_get_char_phoneme(prev_char)),
out_phone_strings);
}
} else if (cur_char_type == IND_ANUSWAAR) {
/* The realization of anuswaar is context dependent: We */
/* only generate a placeholder symbol and let postlexical */
/* rules take care of this. */
out_phone_strings = cons_val(string_val("nX"),out_phone_strings);
} else if ((cur_char_type == IND_VISARGA) &&
(!cst_streq(indic_variant,"tam"))) {
/* If we are not in Tamil, add the glottal fricative ("h") */
out_phone_strings = cons_val(string_val("h"),out_phone_strings);
}
else if ((cur_char_type == IND_VISARGA) &&
(cst_streq(indic_variant,"tam"))) {
/* If there's a next character */
if (next_char) {
/* If next character is p or J, do mapping and add to output */
if (cst_streq("p", cmu_indic_get_char_phoneme(next_char))) {
out_phone_strings = cons_val(string_val("f"),
out_phone_strings);
} else if (cst_streq("J", cmu_indic_get_char_phoneme(next_char))) {
out_phone_strings = cons_val(string_val("z"),
out_phone_strings);
/* Otherwise it's "h" */
} else {
out_phone_strings = cons_val(string_val("h"),
out_phone_strings);
}
/* If this is the last character, it's "h" */
} else {
out_phone_strings = cons_val(string_val("h"),
out_phone_strings);
}
} else if (cur_char_type == IND_ADDAK) {
/* In Gurmukhi, this diacritic geminates the following consonant. */
/* (cmu_indic_get_char_phoneme() yields "" when next_char is NULL) */
out_phone_strings =
cons_val(string_val(cmu_indic_get_char_phoneme(next_char)),
out_phone_strings);
} else {
/* IDEALLY we should warn for unhandled characters! */
/* TODO? */
}
/* Slide the window one item forward */
prev_char = cur_char;
cur_char = next_char;
if (remainder) {
next_char = val_car(remainder);
remainder = val_cdr(remainder);
} else {
next_char = 0;
}
}
/* There may be multiple phones in the expansion, but they are just */
/* space separated tokens in the val, so we need to make them into */
/* an actual val list of phones */
const cst_val *v, *vx;
cst_val *ph;
for(v=out_phone_strings; v; v=val_cdr(v)) {
ph = val_readlist_string(val_string(val_car(v)));
for(vx=ph; vx; vx=val_cdr(vx)) {
out_phones = cons_val(val_car(vx), out_phones);
}
delete_val(ph);
}
/* Release the working lists; only out_phones is handed back */
delete_val(in_ords);
delete_val(out_phone_strings);
return val_reverse(out_phones);
}
static cst_val *cmu_indic_lex_kannada_spelling_postfixes(cst_val *in_phones)
{
    /* Correct spell-errors in place: the phone pair ( e u: ) becomes  */
    /* ( o ) and the pair ( e aI ) becomes ( aI ).  Returns IN_PHONES. */
    const cst_val *p;
    const char *next_ph;

    for (p=in_phones; p && val_cdr(p); p=val_cdr(p))
    {
        if (!cst_streq(val_string(val_car(p)),"e"))
            continue;

        /* next_ph points into the following cell, so it stays valid */
        /* while the car of p is being replaced below.               */
        next_ph = val_string(val_car(val_cdr(p)));
        if (cst_streq(next_ph,"u:"))
            replace_car(p,string_val("o"));
        else if (cst_streq(next_ph,"aI"))
            replace_car(p,string_val("aI"));
        else
            continue;

        /* Unlink the second phone of the pair.  replace_cdr() also  */
        /* frees the removed cell; a bare set_cdr() would leak it.   */
        replace_cdr(p,val_cdr(val_cdr(p)));
    }
    return in_phones;
}
cst_val *cmu_indic_lex_nasal_postfixes(cst_val *in_phones,
                                       const cst_features *feats)
{
    /* Resolve the contextual-nasal placeholder "nX" in IN_PHONES, in  */
    /* place, into a concrete phone chosen from its surroundings.      */
    /* The "variant" string is read from FEATS (default "none").       */
    /* Returns IN_PHONES.                                              */
    const char *variant = get_param_string(feats, "variant", "none");
    const cst_val *p;

    for (p=in_phones; p && val_cdr(p); p=val_cdr(p))
    {
        const cst_val *next = val_cdr(p);

        /* Case 1: a vowel followed by a word-final nX */
        if (cmu_indic_is_vowel(val_string(val_car(p))) &&
            cst_streq("nX", val_string(val_car(next))) &&
            (!val_cdr(next) || !val_car(val_cdr(next))))
        {
            if (cst_streq(variant,"kan") ||
                cst_streq(variant,"tel") || /* Dravidian languages don't nasalize */
                cst_streq("A", val_string(val_car(p))))
            {
                /* A schwa is not nasalized either: nX becomes m */
                replace_car(next,string_val("m"));
            }
            else
            {
                /* Fold the nasal into the vowel (name + "nas") and */
                /* drop the nX cell                                  */
                char *nasal_vowel = cst_strcat(val_string(val_car(p)),"nas");
                replace_car(p,string_val(nasal_vowel));
                cst_free(nasal_vowel);
                replace_cdr(p,val_cdr(next));
            }
            continue;
        }

        /* Case 2: nX itself, with some phone after it */
        if (!cst_streq("nX", val_string(val_car(p))))
            continue;

        {
            /* Pick the nasal from the "cplace" feature of the next phone */
            const char *place =
                val_string(phone_feature(&cmu_indic_phoneset,
                                         val_string(val_car(next)),
                                         "cplace"));
            const char *nasal;

            if (!place)
                continue;

            switch (place[0]) {
            case 'v': nasal = "N";  break;
            case 'p': nasal = "n~"; break;
            case 'a': nasal = "nr"; break;
            case 'd': nasal = "nB"; break;
            case 'l': nasal = "m";  break;
            default:  nasal = "nB"; break;
            }
            replace_car(p,string_val(nasal));
        }
    }
    return in_phones;
}
static cst_val *cmu_indic_lex_jnyan_replacement(cst_val *in_phones,
                                                const cst_features *feats)
{
    /* Rewrite, in place, every ( J n~ ) pair as ( g n~ ), or as       */
    /* ( g j ) when the "variant" feature is "hin".  Returns IN_PHONES.*/
    const char *variant = get_param_string(feats, "variant", "none");
    const cst_val *p = in_phones;

    while (p && val_cdr(p))
    {
        const cst_val *next = val_cdr(p);

        if (cst_streq(val_string(val_car(p)),"J") &&
            cst_streq(val_string(val_car(next)),"n~"))
        {
            replace_car(p,string_val("g"));
            /* Only Hindi pronounces this digraph as ( g j ), AFAIK -shyam */
            if (cst_streq(variant,"hin"))
                replace_car(next,string_val("j"));
            p = next;   /* step past the pair */
        }
        p = val_cdr(p);
    }
    return in_phones;
}
static cst_val *cmu_indic_lex_punjabi_vowel_postfixes(cst_val *in_phones)
{
    /* Punjabi vowel adjustments around "hv", applied in place.        */
    /* Returns IN_PHONES.                                              */
    /* Cells that drop out of the list are either reused or released   */
    /* through replace_cdr(); unlinking them with a bare set_cdr()     */
    /* would leak them.                                                */
    const cst_val *p;
    const cst_val *second;
    const cst_val *third;

    p=in_phones;
    /* Provide better approximates for 3rd person singular pronouns */
    /* Check for orthographic variant of ihn/uhn, written inh/unh */
    /* Equivalent to the punjabi_pronoun_postfixes function */
    /* Word-initial ( i/u nB hv ... ) => ( e/o hv nB ... ) */
    if (p && val_cdr(p) && val_cdr(val_cdr(p)) &&
        (cst_streq(val_string(val_car(val_cdr(p))),"nB")) &&
        (cst_streq(val_string(val_car(val_cdr(val_cdr(p)))),"hv")) &&
        ((cst_streq(val_string(val_car(p)),"i")) ||
         (cst_streq(val_string(val_car(p)),"u"))))
    {
        second = val_cdr(p);
        third = val_cdr(second);
        if (cst_streq(val_string(val_car(p)),"i"))
            replace_car(p,string_val("e"));
        else
            replace_car(p,string_val("o"));
        /* Swap the two consonants by rewriting both cells */
        replace_car(second,string_val("hv"));
        replace_car(third,string_val("nB"));
        return in_phones;
    }

    for ( ; p && val_cdr(p); p=val_cdr(p))
    {
        second = val_cdr(p);
        third = val_cdr(second);    /* NULL when p is the last-but-one cell */

        /* Change sequences ( A hv i/u ) => ( aI/aU hv ) */
        /* (only when something follows the i/u)         */
        if ((cst_streq(val_string(val_car(p)),"A")) &&
            (cst_streq(val_string(val_car(second)),"hv")) &&
            third && val_cdr(third) &&
            ((cst_streq(val_string(val_car(third)),"i")) ||
             (cst_streq(val_string(val_car(third)),"u"))))
        {
            if (cst_streq(val_string(val_car(third)),"i"))
                replace_car(p,string_val("aI"));
            else
                replace_car(p,string_val("aU"));
            /* Keep the existing hv cell; remove and free the i/u cell */
            replace_cdr(second,val_cdr(third));
            p = val_cdr(second); /* Skip over them */
        }
        /* Change sequences ( i/u hv ) => ( e/o hv ) */
        /* (only when something follows the hv)      */
        else if (((cst_streq(val_string(val_car(p)),"i")) ||
                  (cst_streq(val_string(val_car(p)),"u"))) &&
                 third &&
                 (cst_streq(val_string(val_car(second)),"hv")))
        {
            if (cst_streq(val_string(val_car(p)),"i"))
                replace_car(p,string_val("e"));
            else
                replace_car(p,string_val("o"));
            p = second; /* Skip over them */
        }
        /* Change sequences ( A: u/A ) => ( aU/A: ) */
        /* (only when something follows the u/A)    */
        else if ((cst_streq(val_string(val_car(p)),"A:")) &&
                 third &&
                 ((cst_streq(val_string(val_car(second)),"u")) ||
                  (cst_streq(val_string(val_car(second)),"A"))))
        {
            if (cst_streq(val_string(val_car(second)),"u"))
                replace_car(p,string_val("aU"));
            /* Remove and free the second vowel */
            replace_cdr(p,third);
            p = third; /* Skip over them */
        }
    }
    return in_phones;
}
static cst_val *cmu_indic_lex_punjabi_glide_postfixes(cst_val *in_phones)
{
    /* Inserts glides/semivowels corresponding to i/u next to other    */
    /* vowels, modifying IN_PHONES in place.  Returns IN_PHONES.       */
    const cst_val *p;

    for (p=in_phones; p && val_cdr(p); p=val_cdr(p))
    {
        const cst_val *next = val_cdr(p);
        const char *glide = NULL;       /* glide to produce, if any      */
        const char *shortened = NULL;   /* short form of a long i:/u:    */

        /* Is this a high vowel standing directly before a vowel? */
        if (cmu_indic_is_vowel(val_string(val_car(next))))
        {
            const char *cur = val_string(val_car(p));

            if (cst_streq(cur,"i"))
                glide = "j";
            else if (cst_streq(cur,"u"))
                glide = "v";
            else if (cst_streq(cur,"i:"))
            {
                shortened = "i";
                glide = "j";
            }
            else if (cst_streq(cur,"u:"))
            {
                shortened = "u";
                glide = "v";
            }
        }

        if (glide && shortened)
        {
            /* Change sequences ( i:/u: V ) => ( i/u j/v V ) */
            replace_car(p,string_val(shortened));
            set_cdr((cst_val *)p,cons_val(string_val(glide),val_cdr(p)));
            p = val_cdr(p); /* Skip over them */
        }
        else if (glide)
        {
            /* Change sequences ( i/u V ) => ( j/v V ) */
            replace_car(p,string_val(glide));
            p = val_cdr(p); /* Skip over them */
        }
        else if (cst_streq(val_string(val_car(next)),"i") &&
                 cmu_indic_is_vowel(val_string(val_car(p))))
        {
            /* Change sequences ( V i ) => ( V j ) */
            p = next;
            replace_car(p,string_val("j"));
        }
        else if (cst_streq(val_string(val_car(next)),"i:") &&
                 cmu_indic_is_vowel(val_string(val_car(p))) &&
                 val_cdr(next) &&
                 cmu_indic_is_vowel(val_string(val_car(val_cdr(next)))))
        {
            /* Change sequence ( V i: V ) => ( V j j V ) */
            p = next;
            replace_car(p,string_val("j"));
            set_cdr((cst_val *)p,cons_val(string_val("j"),val_cdr(p)));
            p = val_cdr(p); /* Skip over them */
        }
    }
    return in_phones;
}
static cst_val *cmu_indic_lex_tamil_tr_replacement(cst_val *in_phones)
{
    /* Rewrite, in place, every ( rr rr ) pair as ( tr tr rr ).        */
    /* Returns IN_PHONES.                                              */
    const cst_val *p = in_phones;

    while (p && val_cdr(p))
    {
        if (cst_streq(val_string(val_car(p)),"rr") &&
            cst_streq(val_string(val_car(val_cdr(p))),"rr"))
        {
            /* The first rr turns into tr, and a second tr is spliced */
            /* in right behind it                                     */
            replace_car(p,string_val("tr"));
            set_cdr((cst_val *)p,cons_val(string_val("tr"),val_cdr(p)));
            p = val_cdr(val_cdr(p));    /* now on the remaining rr */
        }
        p = val_cdr(p);
    }
    return in_phones;
}
static cst_val *cmu_indic_lex_tamil_nr_replacement(cst_val *in_phones)
{
    /* Rewrite, in place, every ( n rr ) pair as ( nr dr rr ).         */
    /* Returns IN_PHONES.                                              */
    const cst_val *p = in_phones;

    while (p && val_cdr(p))
    {
        if (cst_streq(val_string(val_car(p)),"n") &&
            cst_streq(val_string(val_car(val_cdr(p))),"rr"))
        {
            /* n turns into nr, and a dr is spliced in before the rr */
            replace_car(p,string_val("nr"));
            set_cdr((cst_val *)p,cons_val(string_val("dr"),val_cdr(p)));
            p = val_cdr(val_cdr(p));    /* now on the rr */
        }
        p = val_cdr(p);
    }
    return in_phones;
}
static cst_val *cmu_indic_lex_tamil_final_u(cst_val *in_phones)
{
    /* If the last phone of IN_PHONES is "u", change it to "uy" in     */
    /* place.  Returns IN_PHONES.                                      */
    const cst_val *last = in_phones;

    if (last)
    {
        /* Walk to the final cell */
        while (val_cdr(last))
            last = val_cdr(last);

        if (cst_streq(val_string(val_car(last)),"u"))
            replace_car(last,string_val("uy"));
    }
    return in_phones;
}
/* Adjust Tamil stops in PHONES, in place, according to context:       */
/*  - a word-initial "c" that is followed by a phone other than "c"    */
/*    becomes "s";                                                     */
/*  - a stop (k, c, tr, tB, p) directly after a phone whose "ctype"    */
/*    feature is "n" is replaced by its voiced counterpart;            */
/*  - a stop between a vowel (or a phone whose "ctype" is "r") and a   */
/*    following vowel is replaced by its lenited counterpart.          */
/* Returns PHONES.                                                     */
cst_val *cmu_indic_lex_tamil_voicing_postfixes(cst_val *phones)
{
/* Destructively modify voicing in list of phones */
const cst_val *p;
const char *next_phone, *this_phone;
const char *voice_ph, *len_ph;
p = phones;
/* word-initial c becomes s */
if (p && (cst_streq(val_string(val_car(p)),"c")) &&
val_cdr(p) && !cst_streq(val_string(val_car(val_cdr(p))),"c"))
{
replace_car(p,string_val("s"));
p = val_cdr(p);
}
for( ; p && val_cdr(p); p=val_cdr(p))
{
this_phone = val_string(val_car(p));
next_phone = val_string(val_car(val_cdr(p)));
/* Next phone is a stop that could be mapped. */
if ((cst_streq(next_phone,"k")) ||
(cst_streq(next_phone,"c")) ||
(cst_streq(next_phone,"tr")) ||
(cst_streq(next_phone,"tB")) ||
(cst_streq(next_phone,"p")))
{
/* voice_ph: voiced counterpart; len_ph: lenited counterpart */
if (cst_streq(next_phone,"k")) { voice_ph = "g"; len_ph = "G";
}
else if (cst_streq(next_phone,"c")) { voice_ph = "J"; len_ph =
"s"; }
else if (cst_streq(next_phone,"tr")) { voice_ph = "dr"; len_ph
= "rrh"; }
else if (cst_streq(next_phone,"tB")) { voice_ph = "dB"; len_ph
= "dh"; }
else if (cst_streq(next_phone,"p")) { voice_ph = "b"; len_ph =
"B"; }
/* Not reachable: the enclosing test admits only the five stops above */
else { voice_ph = next_phone; len_ph = next_phone; }
/* If current phone is a nasal/voiced stop, add voicing. */
if ((!cmu_indic_is_vowel(this_phone)) &&
(cst_streq(val_string(phone_feature(&cmu_indic_phoneset,
this_phone,"ctype")),"n")))
{
replace_car(val_cdr(p),string_val(voice_ph));
p=val_cdr(p); /* skip */
}
/* If current phone is a vowel/approximant and next.next is
also a vowel
then stop undergoes lenition */
else if ((cmu_indic_is_vowel(this_phone)) ||
(cst_streq(val_string(phone_feature(&cmu_indic_phoneset,
this_phone,"ctype")),"r")))
{
if ((val_cdr(val_cdr(p))) &&
(cmu_indic_is_vowel(val_string(val_car(val_cdr(val_cdr(p)))))))
{
replace_car(val_cdr(p),string_val(len_ph));
p=val_cdr(p); /* skip */
}
}
/* If current is vowel, but this is last syllable,
then leave voicing as it is. */
/* Not reachable: a vowel this_phone is already taken by the branch */
/* above, which leaves the stop alone when nothing follows it.      */
else if ((cmu_indic_is_vowel(this_phone)) &&
(!val_cdr(val_cdr(p))))
{
continue;
}
}
}
return phones;
}
/* Default English-to-Indic phone mapping, consulted by                */
/* map_english_to_indic_phones() for every variant other than "tam"    */
/* and "kan".  Each row is { English phone, first Indic phone, second  */
/* Indic phone or NULL }.  The English phone names carry no stress     */
/* digit.  The row whose first entry is NULL ends the table.           */
static const char * const eng_to_indic[99][3] =
{
{"aa", "A:", NULL },
{"ae", "aI", NULL },
{"ah", "A", NULL },
{"ao", "aU", NULL },
{"aw", "A:", "u" },
{"ax", "A", NULL },
{"axr", "A", "9r" },
{"ay", "A:", "i" },
{"b", "b", NULL },
{"ch", "c", NULL },
{"d", "dr", NULL },
{"dh", "dB", NULL },
{"eh", "E", NULL },
{"er", "A", "9r" },
{"ey", "e", NULL },
{"f", "ph", NULL },
{"g", "g", NULL },
{"hh", "hv", NULL },
{"ih", "i", NULL },
{"iy", "i:", NULL },
{"jh", "J", NULL },
{"k", "k", NULL },
{"l", "l", NULL },
{"m", "m", NULL },
{"n", "nB", NULL },
{"nx", "nB", NULL },
{"ng", "N", NULL },
{"ow", "o", NULL },
{"oy", "aU", "i" },
{"p", "p", NULL },
{"r", "9r", NULL },
{"s", "s", NULL },
{"sh", "c}", NULL },
{"t", "tr", NULL },
{"th", "tBh", NULL },
{"uh", "u", NULL },
{"uw", "u:", NULL },
{"v", "v", NULL },
{"w", "v", NULL },
{"y", "j", NULL },
{"z", "z", NULL },
{"zh", "c}", NULL },
{NULL, NULL, NULL }
};
/* Mapping for Tamil taking stress into consideration */
/* Shyam Krishna, 2018/03/06 */
/* Consulted by map_english_to_indic_phones() when the variant is      */
/* "tam".  Each row is { English phone, first Indic phone, second      */
/* Indic phone or NULL }.  Stress digits are part of the English phone */
/* name here (e.g. "aa0" vs "aa1"), so the name is matched as given.   */
/* The row whose first entry is NULL ends the table.                   */
static const char * const eng_to_tam_stress[99][3] =
{
{"aa0", "A", NULL },
{"aa1", "A:", NULL },
{"ae0", "A", NULL },
{"ae1", "e", NULL },
{"ah1", "A", NULL },
{"ao0", "A", NULL },
{"ao1", "o:", NULL }, /*TODO: resolve horse-hoarse merger */
{"aw0", "aU", NULL },
{"aw1", "aU", NULL },
{"ax", "A", NULL },
{"ax0", "A", NULL },
{"ay0", "aI", NULL },
{"ay1", "aI", NULL },
{"b", "b", NULL },
{"ch", "c", NULL },
{"d", "dr", NULL },
{"dh", "dB", NULL },
{"eh0", "e", NULL },
{"eh1", "e", NULL },
{"er", "A", "9r" },
{"er0", "A", "9r" },
{"er1", "A", "9r" },
{"ey0", "e", NULL },
{"ey1", "e:", NULL },
{"f", "p", NULL },
{"g", "g", NULL },
{"hh", "hv", NULL },
{"ih", "i", NULL },
{"ih0", "i", NULL },
{"ih1", "i", NULL },
{"iy0", "i", NULL },
{"iy1", "i:", NULL },
{"jh", "J", NULL },
{"k", "k", NULL },
{"l", "l", NULL },
{"m", "m", NULL },
{"n", "nB", NULL },
{"nx", "nB", NULL },
{"ng", "N", NULL },
{"ow0", "o", NULL },
{"ow1", "o:", NULL },
{"oy0", "o", "j" },
{"oy1", "o:", "j" },
{"p", "p", NULL },
{"r", "9r", NULL },
{"s", "s", NULL },
{"sh", "sr", NULL },
{"t", "tr", NULL },
{"th", "tB", NULL },
{"uh0", "u", NULL },
{"uh1", "u", NULL },
{"uw0", "u", NULL },
{"uw1", "u:", NULL },
{"v", "v", NULL },
{"w", "v", NULL },
{"y", "j", NULL },
{"z", "s", NULL },
{"zh", "sr", NULL },
{NULL, NULL, NULL }
};
/* Mapping for Kannada taking stress into consideration */
/* Shyam Krishna, 2018/04/06 */
/* Consulted by map_english_to_indic_phones() when the variant is      */
/* "kan".  Each row is { English phone, first Indic phone, second      */
/* Indic phone or NULL }.  Stress digits are part of the English phone */
/* name here (e.g. "aa0" vs "aa1"), so the name is matched as given.   */
/* The row whose first entry is NULL ends the table.                   */
static const char * const eng_to_kan_stress[99][3] =
{
{"aa0", "A", NULL },
{"aa1", "A:", NULL },
{"ae0", "A", NULL },
{"ae1", "e", NULL },
{"ah1", "A", NULL },
{"ao0", "A", NULL },
{"ao1", "o:", NULL }, /*TODO: resolve horse-hoarse merger */
{"aw0", "aU", NULL },
{"aw1", "aU", NULL },
{"ax", "A", NULL },
{"ax0", "A", NULL },
{"ay0", "aI", NULL },
{"ay1", "aI", NULL },
{"b", "b", NULL },
{"ch", "c", NULL },
{"d", "dr", NULL },
{"dh", "dB", NULL },
{"eh0", "e", NULL },
{"eh1", "e", NULL },
{"er", "A", "9r" },
{"er0", "A", "9r" },
{"er1", "A", "9r" },
{"ey0", "e", NULL },
{"ey1", "e:", NULL },
{"f", "ph", NULL },
{"g", "g", NULL },
{"hh", "hv", NULL },
{"ih", "i", NULL },
{"ih0", "i", NULL },
{"ih1", "i", NULL },
{"iy0", "i", NULL },
{"iy1", "i:", NULL },
{"jh", "J", NULL },
{"k", "k", NULL },
{"l", "l", NULL },
{"m", "m", NULL },
{"n", "nB", NULL },
{"nx", "nB", NULL },
{"ng", "N", NULL },
{"ow0", "o", NULL },
{"ow1", "o:", NULL },
{"oy0", "o", "j" },
{"oy1", "o:", "j" },
{"p", "p", NULL },
{"r", "9r", NULL },
{"s", "s", NULL },
{"sh", "c}", NULL },
{"t", "tr", NULL },
{"th", "tB", NULL },
{"uh0", "u", NULL },
{"uh1", "u", NULL },
{"uw0", "u", NULL },
{"uw1", "u:", NULL },
{"v", "v", NULL },
{"w", "v", NULL },
{"y", "j", NULL },
{"z", "s", NULL },
{"zh", "c}", NULL },
{NULL, NULL, NULL }
};
cst_val *map_english_to_indic_phones(const char *indic_variant,
                                     const cst_val *english_phones)
{
    /* Map English (radio) phones to their Indic equivalent.           */
    /*   indic_variant  - "tam" and "kan" select the stress-aware      */
    /*                    tables; any other value selects the generic  */
    /*                    table, for which a trailing stress digit     */
    /*                    ('0' or '1') is stripped before the lookup.  */
    /*   english_phones - list of phone name strings; not modified.    */
    /* Returns a newly built list of Indic phone strings.  An English  */
    /* phone may expand to one or two Indic phones; a phone with no    */
    /* table entry is dropped.                                         */
    const char * const (*table)[3];
    cst_val *ip = NULL;
    const cst_val *v;
    char *english_phone;
    int strip_stress;
    int len;
    int i;

    /* *** mapping table should be indic variant specific */
    if (cst_streq(indic_variant, "tam"))
    {
        table = eng_to_tam_stress;
        strip_stress = 0;
    }
    else if (cst_streq(indic_variant, "kan"))
    {
        table = eng_to_kan_stress;
        strip_stress = 0;
    }
    else
    {
        table = eng_to_indic;
        strip_stress = 1;
    }

    for (v=english_phones; v; v=val_cdr(v))
    {
        /* Work on a private copy: the stress digit may be cut off */
        english_phone = cst_strdup(val_string(val_car(v)));
        if (strip_stress)
        {
            len = (int)cst_strlen(english_phone);
            /* The length check keeps an empty name from indexing   */
            /* one element before the start of the buffer.          */
            if ((len > 0) &&
                ((english_phone[len-1] == '0') ||
                 (english_phone[len-1] == '1')))
                english_phone[len-1] = '\0';
        }

        for (i=0; table[i][0]; i++)
        {
            if (cst_streq(english_phone,table[i][0]))
            {
                ip = cons_val(string_val(table[i][1]),ip);
                if (table[i][2])
                    ip = cons_val(string_val(table[i][2]),ip);
                break;  /* table keys are unique: nothing more to find */
            }
        }
        /* if there is no mapping, we drop the phone */

        cst_free(english_phone);
    }

    /* Phones were pushed on the front, so undo that before returning */
    return val_reverse(ip);
}
/* Medial schwa deletion, after the technique of Narsimhan et al (2001). */
/*                                                                       */
/* rphones is walked from its head; per the original notes the input is  */
/* to be processed right to left, and the caller in this file passes a   */
/* reversed phone list.  Looking at five consecutive phones              */
/*        V  C  "A"  C  V                                                */
/* (vowel, non-vowel, schwa, non-vowel, vowel) the schwa cell is         */
/* unlinked and freed.                                                   */
/*                                                                       */
/* Known gaps, as recorded by the original author (AUP): the Hindi       */
/* phonotactic constraints and the morpheme-boundary exception are not   */
/* handled, so this will be wrong more often than the 11% reported in    */
/* that paper.                                                           */
/*                                                                       */
/* The list is edited in place and rphones itself is returned.           */
static cst_val *delete_medial_schwa(cst_val *rphones)
{
    const cst_val *cur;
    const cst_val *c1, *schwa, *c2, *far_vowel;
    cst_val *doomed;

    for (cur = rphones; cur && val_cdr(cur); cur = val_cdr(cur))
    {
        /* Need five cells starting at cur to see the whole context */
        if (val_length(cur) <= 4)
            continue;

        c1 = val_cdr(cur);
        schwa = val_cdr(c1);
        c2 = val_cdr(schwa);
        far_vowel = val_cdr(c2);

        if (!cst_streq(val_string(val_car(schwa)), "A"))
            continue;               /* no schwa coming up */

        if (cmu_indic_is_vowel(val_string(val_car(cur))) &&
            !cmu_indic_is_vowel(val_string(val_car(c1))) &&
            !cmu_indic_is_vowel(val_string(val_car(c2))) &&
            cmu_indic_is_vowel(val_string(val_car(far_vowel))))
        {
            /* Splice the schwa cell out, detach it from the rest of */
            /* the list, then free just that one cell.               */
            doomed = (cst_val *)schwa;
            set_cdr((cst_val *)c1, c2);
            set_cdr(doomed, NULL);
            delete_val(doomed);
        }
    }

    return rphones;
}
/* TODO */
/* Remove the second phone of the list when the list has at least four   */
/* phones and starts                                                     */
/*        <any>  "A"  "hv"  X                                            */
/* where X is either "A" or not a vowel.  The removed cell is unlinked   */
/* and freed; the list is edited in place and phones is returned in      */
/* every case.                                                           */
static cst_val *cmu_indic_hindi_schwa_fixes(cst_val *phones)
{
    const cst_val *second, *third, *fourth;
    const char *fourth_name;
    cst_val *dropped;

    if (val_length(phones) <= 3)
        return phones;

    second = val_cdr(phones);
    third = val_cdr(second);
    fourth = val_cdr(third);

    if (!cst_streq("A", val_string(val_car(second))))
        return phones;
    if (!cst_streq("hv", val_string(val_car(third))))
        return phones;

    /* A vowel other than "A" in fourth place blocks the deletion */
    fourth_name = val_string(val_car(fourth));
    if (!cst_streq("A", fourth_name) && cmu_indic_is_vowel(fourth_name))
        return phones;

    dropped = (cst_val *)(void *)second;
    set_cdr(phones, third);
    set_cdr(dropped, NULL);     /* detach so only this one cell is freed */
    delete_val(dropped);

    return phones;
}
/* Letter-to-sound function of the Indic lexicon: convert one word into  */
/* a list of phone names.                                                */
/*                                                                       */
/*   l     - lexicon structure (not used in this function)               */
/*   word  - the word; split into UTF-8 characters on the Indic path     */
/*   pos   - part of speech, passed on to the English lexicon lookup     */
/*   feats - features: "variant" selects the language, "eng_shared"      */
/*           selects how English words are returned                      */
/*                                                                       */
/* A word matching cst_rx_not_indic is looked up in cmu_lex.  With       */
/* "eng_shared" equal to "1" the English phones are returned as they     */
/* are, otherwise they are mapped with map_english_to_indic_phones().    */
/* Any other word goes through characters -> ordinals -> phones, then    */
/* through the variant specific fixes below.                             */
/*                                                                       */
/* An unrecognised variant is reported on stdout and processed without   */
/* word final schwa deletion.                                            */
cst_val *cmu_indic_lex_lts_function(const struct lexicon_struct *l,
                                    const char *word, const char *pos,
                                    const cst_features *feats)
{
    cst_val *utflets = 0;
    cst_val *ords = 0;
    cst_val *english_phones;
    cst_val *base_phones = NULL;
    const char *indic_variant = 0;
    const char *eng_bilingual_flag = 0;
    int cmu_indic_variant_deletes_word_final_schwa = 0;
    const cst_val *v;
    cst_val *tmpv;

    indic_variant = get_param_string(feats, "variant", "none");

    if (cst_streq(indic_variant, "hin") ||
        cst_streq(indic_variant, "mar") ||
        cst_streq(indic_variant, "ben") ||
        cst_streq(indic_variant, "raj") ||
        cst_streq(indic_variant, "guj") ||
        cst_streq(indic_variant, "asm") ||
        cst_streq(indic_variant, "pan"))
    {
        cmu_indic_variant_deletes_word_final_schwa = 1;
    }
    else if (cst_streq(indic_variant, "tel") ||
             cst_streq(indic_variant, "tam") ||
             cst_streq(indic_variant, "kan"))
    {
        cmu_indic_variant_deletes_word_final_schwa = 0;
    }
    else
    {
        cmu_indic_variant_deletes_word_final_schwa = 0;
        printf("Unknown indic variant: %s!\n", indic_variant);
    }

    if (cst_regex_match(cst_rx_not_indic, word))
    {   /* It looks like English, so use the English lexicon to find the */
        /* phones and (unless they are shared) map them to Indic phones  */
        /* printf("awb_debug cmu_indic_lex: English >%s<\n",word); */
        english_phones = lex_lookup(&cmu_lex, word, pos, feats);
        eng_bilingual_flag = get_param_string(feats, "eng_shared", "none");

        if (cst_streq(eng_bilingual_flag, "1"))
            return english_phones;   /* handed to the caller unchanged */

        base_phones =
            map_english_to_indic_phones(indic_variant, english_phones);
        /* The mapped list is built from fresh string values, so the    */
        /* looked-up English list is no longer needed; release it here  */
        /* instead of leaking it.                                       */
        delete_val(english_phones);
        return base_phones;
    }

    /* string to utf8 chars */
    utflets = cst_utf8_explode(word);

    /* chars to ord */
    for (v = utflets; v; v = val_cdr(v))
    {
        tmpv = cst_utf8_ord(val_car(v));
        ords = cons_val(tmpv, ords);
    }
    ords = val_reverse(ords);

    /* Ords to Phone List (Map) (with final schwa deletion) */
    base_phones =
        val_reverse(cmu_indic_lex_ord_to_phones(ords,
                               cmu_indic_variant_deletes_word_final_schwa,
                               feats));

    delete_val(utflets);
    delete_val(ords);

    if (cst_streq(indic_variant, "tam"))
    {
        /* NOTE(review): this writes to stdout, without a newline, for  */
        /* every Tamil word -- looks like leftover debugging; confirm   */
        /* before removing.                                             */
        printf("Tamil doesn't have anuswara");
    }
    else
        cmu_indic_lex_nasal_postfixes(base_phones, feats);

    base_phones = cmu_indic_lex_jnyan_replacement(base_phones, feats);

    /* Postfix Indic Nasals, Voicing, Medial Schwa deletion */
    if (cst_streq(indic_variant, "tam"))
    {
        /* Tamil voicing rules */
        base_phones = cmu_indic_lex_tamil_voicing_postfixes(base_phones);
        /* Tamil tr replacement */
        base_phones = cmu_indic_lex_tamil_tr_replacement(base_phones);
        /* Tamil nr replacement */
        base_phones = cmu_indic_lex_tamil_nr_replacement(base_phones);
        /* final u -> uy */
        base_phones = cmu_indic_lex_tamil_final_u(base_phones);
    }

    if (cst_streq(indic_variant, "pan"))
    {
        /* Punjabi vowel and pronoun rules */
        base_phones = cmu_indic_lex_punjabi_vowel_postfixes(base_phones);
        /* Punjabi glide rules */
        base_phones = cmu_indic_lex_punjabi_glide_postfixes(base_phones);
    }

    if (cst_streq(indic_variant, "kan"))
        cmu_indic_lex_kannada_spelling_postfixes(base_phones);

    //if (cst_streq(indic_variant,"san"))
    //  base_phones=val_reverse(delete_medial_schwa(val_reverse(base_phones)));

    if ((cst_streq(indic_variant, "hin")) || (cst_streq(indic_variant, "mar")) ||
        (cst_streq(indic_variant, "guj")) || (cst_streq(indic_variant, "raj")) ||
        (cst_streq(indic_variant, "pan")))
    {   /* Do medial schwa deletion; it works on the reversed list */
        base_phones = val_reverse(delete_medial_schwa(val_reverse(base_phones)));
        base_phones = cmu_indic_hindi_schwa_fixes(base_phones);
    }

    /* phone list to syls (done as post lexical rules) */

#if 0
    printf("cmu_indic_lex.c: indic word \"%s\" \n",word);
    val_print(stdout, base_phones);
    printf("\n");
#endif

    return base_phones;
}
/* TRUE when any phone name in the list r is a vowel according to        */
/* cmu_indic_is_vowel(), FALSE otherwise (including for an empty list).  */
static int cmu_indic_contains_vowel(const cst_val *r)
{
    const cst_val *cell = r;

    while (cell)
    {
        if (cmu_indic_is_vowel(val_string(val_car(cell))))
            return TRUE;
        cell = val_cdr(cell);
    }

    return FALSE;
}
/* Starting at item i and stepping backwards with item_prev(), report    */
/* TRUE as soon as an item whose "name" feature is a vowel is found;     */
/* FALSE when the chain runs out (or i is NULL).                         */
static int cmu_indic_has_vowel_in_syl(const cst_item *i)
{
    const cst_item *seg = i;

    while (seg)
    {
        if (cmu_indic_is_vowel(ffeature_string(seg,"name")))
            return TRUE;
        seg = item_prev(seg);
    }

    return FALSE;
}
/* Decide whether a syllable boundary falls after item i, given the      */
/* list of phone names still to come (rest).  Returns TRUE for a         */
/* boundary and FALSE otherwise.  The rules are tried in this order:     */
/*   1. nothing left                                   -> boundary       */
/*   2. no vowel left in rest                          -> no boundary    */
/*   3. no vowel at or before i                        -> no boundary    */
/*   4. only one phone left                            -> boundary       */
/*   5. next phone is "n" (and not a vowel)            -> no boundary    */
/*   6. i is a vowel and the next two are non-vowels   -> no boundary    */
/*   7. the next three phones are all non-vowels       -> no boundary    */
/*   8. the next two phones are identical              -> no boundary    */
/*   9. otherwise                                      -> boundary       */
int cmu_indic_syl_boundary(const cst_item *i,const cst_val *rest)
{
    const cst_val *after;       /* the cell following rest */

    if (!rest)
        return TRUE;
    if (!cmu_indic_contains_vowel(rest))
        return FALSE;
    if (!cmu_indic_has_vowel_in_syl(i))
        return FALSE;

    /* Every remaining rule needs at least two upcoming phones */
    after = val_cdr(rest);
    if (!after)
        return TRUE;

    /* NOTE(review): both tests look at the same phone (the head of     */
    /* rest); the second may have been meant for the following phone -- */
    /* confirm before changing.                                         */
    if (cst_streq("n",val_string(val_car(rest))) &&
        !cmu_indic_is_vowel(val_string(val_car(rest))))
        return FALSE;

    if (cmu_indic_is_vowel(ffeature_string(i,"name")) &&
        !cmu_indic_is_vowel(val_string(val_car(rest))) &&
        !cmu_indic_is_vowel(val_string(val_car(after))))
        return FALSE;

    if (val_cdr(after) &&
        !cmu_indic_is_vowel(val_string(val_car(rest))) &&
        !cmu_indic_is_vowel(val_string(val_car(after))) &&
        !cmu_indic_is_vowel(val_string(val_car(val_cdr(after)))))
        return FALSE;

    if (cst_streq(val_string(val_car(rest)),
                  val_string(val_car(after))))
        return FALSE;

    return TRUE;
}
#if 0
/* Disabled tracing wrapper for the syllable boundary decision.          */
/* It prints the name of item i, up to three upcoming phone names from   */
/* rest, and the decision, then returns that decision.                   */
/* NOTE(review): it defines cmu_indic_syl_boundary itself and calls      */
/* cmu_indic_syl_boundary_x, a name not defined in the visible part of   */
/* this file -- presumably the real function has to be renamed to that   */
/* before enabling this block; confirm.                                  */
int cmu_indic_syl_boundary(const cst_item *i,const cst_val *rest)
{
/* For debugging the syl boundary code */
int x;
printf("syl boundary %s | ",ffeature_string(i,"name"));
/* Show only as many upcoming phones as actually exist */
if (rest)
printf("%s ",val_string(val_car(rest)));
if (rest && val_cdr(rest))
printf("%s ",val_string(val_car(val_cdr(rest))));
if (rest && val_cdr(rest) && val_cdr(val_cdr(rest)))
printf("%s ",val_string(val_car(val_cdr(val_cdr(rest)))));
x = cmu_indic_syl_boundary_x(i,rest);
printf("is %d",x);
printf("\n");
return x;
}
#endif
cst_utterance *cmu_indic_assign_lex_stress(cst_utterance *u)
{
/* Assign stress to a lexical entry: from indic_lexicon.scm (aup) */
/* The stress is placed on the syllable with the highest weight. */
/* If there is a tie, the last-most syllable with highest weight */
/* is chosen. However, the last syllable of the word does not */
/* participate in tie-breaking. That is, it is stressed only when */
/* there are no ties. (Hussein 1997) */
/* not sure this code actually follows the above rules, but its */
/* the same as the festival code -- awb 20140606 */
const cst_item *syl, *w;
const char *x1, *x2, *x3;
int syl_weight, best_weight, sw;
const cst_item *stress_position;
for (w=relation_head(utt_relation(u,"SylStructure")); w; w=item_next(w))
{
for (syl=item_daughter(w); syl; syl=item_next(syl))
{ /* Assign weight to each syllable */
syl_weight = 0;
x1 = ffeature_string(syl,"R:SylStructure.daughtern.name");
x2 = ffeature_string(syl,"R:SylStructure.daughtern.p.name");
x3 = ffeature_string(syl,"R:SylStructure.daughtern.p.p.name");
if (cmu_indic_is_vowel(x1))
{ /* If syllable is open vowel */
if (cst_streq(x1,"A") || cst_streq(x1,"i") ||
cst_streq(x1,"u"))
syl_weight = 1;
else
syl_weight = 2;
}
else
{
if (cmu_indic_is_vowel(x2))
{
if (cst_streq(x2,"A") || cst_streq(x2,"i") ||
cst_streq(x2,"u"))
syl_weight = 2;
else
syl_weight = 3;
}
else if (cmu_indic_is_vowel(x3))
syl_weight = 3;
}
item_set_int(syl,"syl_weight",syl_weight);
}
}
for (w=relation_head(utt_relation(u,"SylStructure")); w; w=item_next(w))
{
best_weight = 0;
stress_position = NULL;
for (syl=item_daughter(w); syl; syl=item_next(syl))
{
sw = ffeature_int(syl,"syl_weight");
if (sw > best_weight)
{
best_weight = sw;
stress_position = syl;
}
else if ((sw == best_weight) && item_next(syl))
stress_position = syl;
}
if (stress_position)
item_set_string(stress_position,"stress","1");
}
return u;
}
/* Post lexical rules for the Indic lexicon.                             */
/* Reads the "variant" feature of the utterance and, for the variants    */
/* listed below, runs cmu_indic_assign_lex_stress() on it.  Other        */
/* variants are left untouched.  Returns u.                              */
cst_utterance *cmu_indic_postlex(cst_utterance *u)
{
    /* Variants that get lexical stress assigned */
    static const char * const stress_variants[] =
        { "hin", "mar", "pan", "raj", "asm", "ben", NULL };
    const char *indic_variant;
    int k;

    indic_variant = get_param_string(u->features, "variant", "none");

    for (k = 0; stress_variants[k]; k++)
    {
        if (cst_streq(indic_variant, stress_variants[k]))
        {
            cmu_indic_assign_lex_stress(u);
            break;
        }
    }

#if 0
    /* Print out words as 'festival' lexical entries */
    const cst_item *word, *syl, *seg;

    for (word=relation_head(utt_relation(u,"Word"));
         word; word=item_next(word))
    {
        printf("( \"%s\" nil (",ffeature_string(word,"name"));
        for (syl=item_daughter(item_as(word,"SylStructure"));
             syl;syl=item_next(syl))
        {
            printf("(( ");
            for (seg=item_daughter(item_as(syl,"SylStructure")); seg;
                 seg=item_next(seg))
            {
                printf("%s ",ffeature_string(seg,"name"));
            }
            printf(") %s %d) ",
                   ffeature_string(syl,"stress"),
                   ffeature_int(syl,"syl_weight"));
        }
        printf("))\n");
    }
#endif

    return u;
}
/* The single Indic lexicon object, filled in by cmu_indic_lex_init(). */
cst_lexicon cmu_indic_lex;

/* Return the Indic lexicon, setting up its name and its lts,            */
/* syllable-boundary and post-lexical hooks on the first call.  A set    */
/* lts_function pointer marks the lexicon as already initialised, in     */
/* which case it is returned as it is.                                   */
/* Open questions kept from the original notes: should this be a global  */
/* const or dynamic; lts_rules could be just a cart tree like others.    */
cst_lexicon *cmu_indic_lex_init(void)
{
    cst_lexicon *lex = &cmu_indic_lex;

    if (!lex->lts_function)
    {
        lex->name = "cmu_indic_lex";
        lex->lts_function = cmu_indic_lex_lts_function;
        lex->syl_boundary = cmu_indic_syl_boundary;
        lex->postlex = cmu_indic_postlex;
    }

    return lex;
}