ref: 91b74e4777120478cf658bad19323e1c147b0449
dir: /desereter.ml/
let entry_rgx = Str.regexp "\\([.a-z'-]+\\)(?[0-9]?)? \\(.*\\)" type wordset = {prefix: string; word: string; suffix: string} let is_uppercase = function | 'A' |'B' |'C' |'D' |'E' |'F' |'G' |'H' |'I' |'J' |'K' |'L' |'M' |'N' |'O' |'P' |'Q' |'R' |'S' |'T' |'U' |'V' |'W' |'X' |'Y' |'Z' -> true | _ -> false let get_vowel vowel = if String.length vowel != 3 then ("", "") else let trimmed = String.sub vowel 0 2 in match trimmed with | "IY" -> ("𐐀", "𐐨") | "EY" -> ("𐐁", "𐐩") | "AA" -> ("𐐂", "𐐪") | "AO" -> ("𐐉", "𐐱") | "OW" -> ("𐐄", "𐐬") | "UW" -> ("𐐅", "𐐭") | "IH" |"IX" -> ("𐐆", "𐐮") | "EH" -> ("𐐇", "𐐯") | "AE" -> ("𐐈", "𐐰") | "AX" -> ("𐐉", "𐐱") | "AH" -> ("𐐊", "𐐲") | "UH" -> ("𐐋", "𐐳") | "AY" -> ("𐐌", "𐐴") | "AW" -> ("𐐍", "𐐵") | "ER" -> ("𐐊𐐡", "𐐊𐑉") | _ -> ("", "") let get_char = function | "W" -> ("𐐎", "𐐶") | "Y" -> ("𐐏", "𐐷") | "H" |"HH" -> ("𐐐", "𐐸") | "P" -> ("𐐑", "𐐹") | "B" -> ("𐐒", "𐐺") | "T" -> ("𐐓", "𐐻") | "D" -> ("𐐔", "𐐼") | "CH" -> ("𐐕", "𐐽") | "JH" -> ("𐐖", "𐐾") | "K" -> ("𐐗", "𐐿") | "G" -> ("𐐘", "𐑀") | "F" -> ("𐐙", "𐑁") | "V" -> ("𐐚", "𐑂") | "TH" -> ("𐐛", "𐑃") | "DH" -> ("𐐜", "𐑄") | "S" -> ("𐐝", "𐑅") | "Z" -> ("𐐞", "𐑆") | "SH" -> ("𐐟", "𐑇") | "ZH" -> ("𐐠", "𐑈") | "R" -> ("𐐡", "𐑉") | "L" -> ("𐐢", "𐑊") | "M" -> ("𐐣", "𐑋") | "N" -> ("𐐤", "𐑌") | "NX" |"NG" -> ("𐐥", "𐑍") | v -> get_vowel v let rec parse_arpabet line des uppercase = match line with | hd :: tl -> let u, l = get_char hd in (if uppercase then u else l) ^ parse_arpabet tl des false | [] -> des let unquoted word = String.sub word 1 (String.length word - 2) let consider word = let wrd = ref (String.lowercase_ascii word) in let prefix = ref "" in let suffix = ref "" in ( try let pos = Str.search_forward (Str.regexp "[({\"]") !wrd 0 + 1 in wrd := String.sub word pos (String.length !wrd - pos) ; prefix := String.sub word 0 pos with Not_found -> () ) ; ( try let pos = Str.search_backward (Str.regexp "[})\"\\.,!;:]") !wrd (String.length !wrd) in suffix := String.sub !wrd pos (String.length !wrd - pos) ; wrd := String.sub !wrd 0 pos; with Not_found -> () ) ; {prefix= !prefix; word= !wrd; suffix= !suffix} let parse word dictionary = let uppercase = is_uppercase word.[0] in let wordparts = consider word in try let des = parse_arpabet (String.split_on_char ' ' (Hashtbl.find dictionary wordparts.word)) "" uppercase in wordparts.prefix ^ des ^ wordparts.suffix with Not_found -> word let sanitize line = Str.global_replace (Str.regexp "\\.\\.\\.") " ... " line let load_dictionary extra = let default = try let prefix = Unix.getenv "OPAM_SWITCH_PREFIX" in prefix ^ "/share/desereter/cmudict.dict" with Not_found -> "/lib/cmudict.dict" in let dictionaries = [default] @ String.split_on_char ';' extra in let dictionary = Hashtbl.create 150000 in let load file = if String.length file > 0 then let ic = open_in file in try while true do let entry = input_line ic in if Str.string_match entry_rgx entry 0 then let word = Str.matched_group 1 entry in let pronunciation = Str.matched_group 2 entry in Hashtbl.add dictionary word pronunciation done with End_of_file -> close_in ic in List.iter load dictionaries ; dictionary let translate dictionary line = let words = String.split_on_char ' ' (sanitize line) in let words = List.filter (fun x -> let w = String.trim x in String.length w > 0 ) words in print_endline (List.fold_left (fun acc word -> acc ^ parse word dictionary ^ " ") "" words) let () = let line = ref "" in let extra = ref "" in Arg.parse [(* ("-i", Arg.Set_string line, "input"); *) ("-d", Arg.Set_string extra, "dictionary")] (fun x -> line := x) (Sys.argv.(0) ^ " [-d dictionary] <-i input>") ; let dictionary = load_dictionary !extra in if String.length !line > 0 then translate dictionary !line else try while true do read_line () |> translate dictionary done with End_of_file -> ()