shithub: desereter

ref: 91b74e4777120478cf658bad19323e1c147b0449
dir: /desereter.ml/

View raw version
let entry_rgx = Str.regexp "\\([.a-z'-]+\\)(?[0-9]?)? \\(.*\\)"

type wordset = {prefix: string; word: string; suffix: string}

let is_uppercase = function
  | 'A'
   |'B'
   |'C'
   |'D'
   |'E'
   |'F'
   |'G'
   |'H'
   |'I'
   |'J'
   |'K'
   |'L'
   |'M'
   |'N'
   |'O'
   |'P'
   |'Q'
   |'R'
   |'S'
   |'T'
   |'U'
   |'V'
   |'W'
   |'X'
   |'Y'
   |'Z' ->
      true
  | _ -> false

let get_vowel vowel =
  if String.length vowel != 3 then ("", "")
  else
    let trimmed = String.sub vowel 0 2 in
    match trimmed with
    | "IY" -> ("𐐀", "𐐨")
    | "EY" -> ("𐐁", "𐐩")
    | "AA" -> ("𐐂", "𐐪")
    | "AO" -> ("𐐉", "𐐱")
    | "OW" -> ("𐐄", "𐐬")
    | "UW" -> ("𐐅", "𐐭")
    | "IH"
     |"IX" ->
        ("𐐆", "𐐮")
    | "EH" -> ("𐐇", "𐐯")
    | "AE" -> ("𐐈", "𐐰")
    | "AX" -> ("𐐉", "𐐱")
    | "AH" -> ("𐐊", "𐐲")
    | "UH" -> ("𐐋", "𐐳")
    | "AY" -> ("𐐌", "𐐴")
    | "AW" -> ("𐐍", "𐐵")
    | "ER" -> ("𐐊𐐡", "𐐊𐑉")
    | _ -> ("", "")

let get_char = function
  | "W" -> ("𐐎", "𐐶")
  | "Y" -> ("𐐏", "𐐷")
  | "H"
   |"HH" ->
      ("𐐐", "𐐸")
  | "P" -> ("𐐑", "𐐹")
  | "B" -> ("𐐒", "𐐺")
  | "T" -> ("𐐓", "𐐻")
  | "D" -> ("𐐔", "𐐼")
  | "CH" -> ("𐐕", "𐐽")
  | "JH" -> ("𐐖", "𐐾")
  | "K" -> ("𐐗", "𐐿")
  | "G" -> ("𐐘", "𐑀")
  | "F" -> ("𐐙", "𐑁")
  | "V" -> ("𐐚", "𐑂")
  | "TH" -> ("𐐛", "𐑃")
  | "DH" -> ("𐐜", "𐑄")
  | "S" -> ("𐐝", "𐑅")
  | "Z" -> ("𐐞", "𐑆")
  | "SH" -> ("𐐟", "𐑇")
  | "ZH" -> ("𐐠", "𐑈")
  | "R" -> ("𐐡", "𐑉")
  | "L" -> ("𐐢", "𐑊")
  | "M" -> ("𐐣", "𐑋")
  | "N" -> ("𐐤", "𐑌")
  | "NX"
   |"NG" ->
      ("𐐥", "𐑍")
  | v -> get_vowel v

let rec parse_arpabet line des uppercase =
  match line with
  | hd :: tl ->
      let u, l = get_char hd in
      (if uppercase then u else l) ^ parse_arpabet tl des false
  | [] -> des

let unquoted word = String.sub word 1 (String.length word - 2)

let consider word =
  let wrd = ref (String.lowercase_ascii word) in
  let prefix = ref "" in
  let suffix = ref "" in
  ( try
      let pos = Str.search_forward (Str.regexp "[({\"]") !wrd 0 + 1 in
      wrd := String.sub word pos (String.length !wrd - pos) ;
      prefix := String.sub word 0 pos
    with Not_found -> () ) ;
  ( try
      let pos = Str.search_backward (Str.regexp "[})\"\\.,!;:]") !wrd (String.length !wrd) in
      suffix := String.sub !wrd pos (String.length !wrd - pos) ;
      wrd := String.sub !wrd 0 pos;
    with Not_found -> () ) ;
  {prefix= !prefix; word= !wrd; suffix= !suffix}

let parse word dictionary =
  let uppercase = is_uppercase word.[0] in
  let wordparts = consider word in
  try
    let des = parse_arpabet (String.split_on_char ' ' (Hashtbl.find dictionary wordparts.word)) "" uppercase in
    wordparts.prefix ^ des ^ wordparts.suffix
  with Not_found -> word

let sanitize line = Str.global_replace (Str.regexp "\\.\\.\\.") " ... " line

let load_dictionary extra =
  let default =
		try
			let prefix = Unix.getenv "OPAM_SWITCH_PREFIX" in
			prefix ^ "/share/desereter/cmudict.dict"
		with Not_found -> "/lib/cmudict.dict"
  in
  let dictionaries = [default] @ String.split_on_char ';' extra in
  let dictionary = Hashtbl.create 150000 in
  let load file =
    if String.length file > 0 then
      let ic = open_in file in
      try
        while true do
          let entry = input_line ic in
          if Str.string_match entry_rgx entry 0 then
            let word = Str.matched_group 1 entry in
            let pronunciation = Str.matched_group 2 entry in
            Hashtbl.add dictionary word pronunciation
        done
      with End_of_file -> close_in ic in
  List.iter load dictionaries ; dictionary

let translate dictionary line =
  let words = String.split_on_char ' ' (sanitize line) in
  let words =
    List.filter
      (fun x ->
        let w = String.trim x in
        String.length w > 0 )
      words in
  print_endline (List.fold_left (fun acc word -> acc ^ parse word dictionary ^ " ") "" words)

let () =
  let line = ref "" in
  let extra = ref "" in
  Arg.parse
    [(* ("-i", Arg.Set_string line, "input"); *) ("-d", Arg.Set_string extra, "dictionary")]
    (fun x -> line := x)
    (Sys.argv.(0) ^ " [-d dictionary] <-i input>") ;
  let dictionary = load_dictionary !extra in
  if String.length !line > 0 then translate dictionary !line
  else
    try
      while true do
        read_line () |> translate dictionary
      done
    with End_of_file -> ()