ref: f15235c1d015f08c9278ac5f5fa6096da716a651
parent: 25477859abb169a6041bca8e318c2b7039f5189d
author: Alan W Black <awb@cs.cmu.edu>
date: Mon Nov 6 01:21:42 EST 2017
is_english flag and formatting
--- a/lang/cmu_indic_lang/cmu_indic_lang.c
+++ b/lang/cmu_indic_lang/cmu_indic_lang.c
@@ -373,6 +373,7 @@
return r;
}
+#if 0
static int indic_nump_old(const char *number)
{
/* True if all (unicode) characters are in num_table's digit table */
@@ -406,8 +407,8 @@
return flag;
}
+#endif
-
static int indic_nump(const char *number)
{
/* Check if non-empty string */
@@ -515,6 +516,7 @@
cst_utterance *utt;
/* printf("awb_debug token_name %s name %s\n",item_name(token),name); */
+ r = NULL;
if (item_feat_present(token,"phones"))
return cons_val(string_val(name),NULL);
@@ -562,58 +564,58 @@
else if (indic_nump(name))
{ /* Its script specific digits (commas/dots) */
- if (indic_nump(name) == 2)
- { /* All characters are digits */
- // printf("nump is 2\n");
- p = indic_num_normalize(name,num_table);
- if (val_length(p) <= 9)
- r = indic_number(p,num_table);
- else
- r = indic_number_indiv(p,num_table);
- delete_val(p);
- }
- else if (indic_nump(name) == 1)
- { /* Some characters are digits */
- int len = 1;
- int i = 0;
- char c0;
- char *aaa;
- char *bbb;
- while(name[i] != '\0')
- {
- /* Iterate over UTF-8 string */
- c0 = name[i];
- len = ts_utf8_sequence_length(c0);
- /* Check if char after this is comma */
- if (name[i+len] == ',')
- {
- /* Skip commas */
- i += len;
- c0 = name[i];
- len = ts_utf8_sequence_length(c0);
- i += len;
- continue;
- }
- /* Find where character type switches to or from digits */
- if(indic_text_splitable(name, i, len))
- break;
- i +=len;
- }
- aaa = cst_strdup(name);
- aaa[i+len] = '\0';
- bbb = cst_strdup(&name[i+len]);
- r = val_append(cmu_indic_tokentowords_one(token, aaa),
- cmu_indic_tokentowords_one(token, bbb));
- cst_free(aaa);
- cst_free(bbb);
- }
+ if (indic_nump(name) == 2)
+ { /* All characters are digits */
+ // printf("nump is 2\n");
+ p = indic_num_normalize(name,num_table);
+ if (val_length(p) <= 9)
+ r = indic_number(p,num_table);
+ else
+ r = indic_number_indiv(p,num_table);
+ delete_val(p);
+ }
+ else if (indic_nump(name) == 1)
+ { /* Some characters are digits */
+ int len = 1;
+ int i = 0;
+ char c0;
+ char *aaa;
+ char *bbb;
+ while(name[i] != '\0')
+ {
+ /* Iterate over UTF-8 string */
+ c0 = name[i];
+ len = ts_utf8_sequence_length(c0);
+ /* Check if char after this is comma */
+ if (name[i+len] == ',')
+ {
+ /* Skip commas */
+ i += len;
+ c0 = name[i];
+ len = ts_utf8_sequence_length(c0);
+ i += len;
+ continue;
+ }
+ /* Find where character type switches to or from digits */
+ if(indic_text_splitable(name, i, len))
+ break;
+ i +=len;
+ }
+ aaa = cst_strdup(name);
+ aaa[i+len] = '\0';
+ bbb = cst_strdup(&name[i+len]);
+ r = val_append(cmu_indic_tokentowords_one(token, aaa),
+ cmu_indic_tokentowords_one(token, bbb));
+ cst_free(aaa);
+ cst_free(bbb);
+ }
}
else if (indic_hyphenated(name))
{ /* For numbers seeparated by - / , */
- char *aaa;
- aaa = cst_strdup(&name[1]);
- r = cmu_indic_tokentowords_one(token, aaa);
- cst_free(aaa);
+ char *aaa;
+ aaa = cst_strdup(&name[1]);
+ r = cmu_indic_tokentowords_one(token, aaa);
+ cst_free(aaa);
}
else if (cst_regex_match(cst_rx_not_indic,name))
@@ -651,6 +653,18 @@
return FALSE;
}
+DEF_STATIC_CONST_VAL_STRING(val_string_zero,"0"); /* constant "0": its address is what is_english returns when the test fails */
+DEF_STATIC_CONST_VAL_STRING(val_string_one,"1"); /* constant "1": its address is what is_english returns when the test succeeds */
+
+const cst_val *is_english(const cst_item *p) /* feature function: "1" if p's "name" feature matches cst_rx_not_indic, otherwise "0" */
+{
+ if (p && cst_regex_match(cst_rx_not_indic, /* a NULL item skips the regex test and falls through to "0" */
+ flite_ffeature_string(p,"name")))
+ return (cst_val *)&val_string_one; /* name matched the not-indic regex */
+ else
+ return (cst_val *)&val_string_zero; /* NULL item, or name did not match */
+}
+
void cmu_indic_lang_init(cst_voice *v)
{
/* Set indic language stuff */
@@ -690,6 +704,9 @@
/* Default ffunctions (required) */
basic_ff_register(v->ffunctions);
+
+ /* Indic specific features */
+ ff_register(v->ffunctions, "lisp_is_english", is_english);
return;
}