shithub: desereter

ref: 489868c4e3e09955b41506806168e1283400f06c
dir: desereter/desereter.ml

View raw version
let entry_rgx = Str.regexp "\\([.a-z'-]+\\)(?[0-9]?)? \\(.*\\)"

type wordset = {prefix: string; word: string; suffix: string}
type runesets = DESERET | FUTHORC

let is_uppercase = function
  | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'J' | 'K' | 'L'
   |'M' | 'N' | 'O' | 'P' | 'Q' | 'R' | 'S' | 'T' | 'U' | 'V' | 'W' | 'X'
   |'Y' | 'Z' ->
      true
  | _ -> false

let get_vowel vowel vowels =
  if String.length vowel != 3 then ("", "")
  else
    let trimmed = String.sub vowel 0 2 in
    try Hashtbl.find vowels trimmed with Not_found -> ("", "")

let get_char c runes =
  let vowels, consonants =
    match runes with
    | DESERET -> (Deseret.vowels, Deseret.consonants)
    | FUTHORC -> (Futhorc.vowels, Futhorc.consonants) in
  try Hashtbl.find consonants c with Not_found -> get_vowel c vowels

let rec parse_arpabet line des uppercase runes =
  match line with
  | hd :: tl ->
      let u, l = get_char hd runes in
      (if uppercase then u else l) ^ parse_arpabet tl des false runes
  | [] -> des

let unquoted word = String.sub word 1 (String.length word - 2)

let consider word =
  let wrd = ref (String.lowercase_ascii word) in
  let prefix = ref "" in
  let suffix = ref "" in
  ( try
      let pos = Str.search_forward (Str.regexp "[({\"]") !wrd 0 + 1 in
      wrd := String.sub word pos (String.length !wrd - pos) ;
      prefix := String.sub word 0 pos
    with Not_found -> () ) ;
  ( try
      let pos =
        Str.search_backward
          (Str.regexp "[})\"\\.,!;:]")
          !wrd (String.length !wrd) in
      suffix := String.sub !wrd pos (String.length !wrd - pos) ;
      wrd := String.sub !wrd 0 pos
    with Not_found -> () ) ;
  {prefix= !prefix; word= !wrd; suffix= !suffix}

let parse word dictionary runes =
  let uppercase = is_uppercase word.[0] in
  let wordparts = consider word in
  try
    let des =
      parse_arpabet
        (String.split_on_char ' ' (Hashtbl.find dictionary wordparts.word))
        "" uppercase runes in
    wordparts.prefix ^ des ^ wordparts.suffix
  with Not_found -> word

let sanitize line = Str.global_replace (Str.regexp "\\.\\.\\.") " ... " line

let load_dictionary extra =
  let prefix = Unix.getenv "OPAM_SWITCH_PREFIX" in
  let dictionaries =
    [prefix ^ "/share/desereter/cmudict.dict"] @ String.split_on_char ';' extra
  in
  let dictionary = Hashtbl.create 150000 in
  let load file =
    if String.length file > 0 then
      let ic = open_in file in
      try
        while true do
          let entry = input_line ic in
          if Str.string_match entry_rgx entry 0 then
            let word = Str.matched_group 1 entry in
            let pronunciation = Str.matched_group 2 entry in
            Hashtbl.add dictionary word pronunciation
        done
      with End_of_file -> close_in ic in
  List.iter load dictionaries ;
  dictionary

let translate runes dictionary line =
  let words = String.split_on_char ' ' (sanitize line) in
  let words =
    List.filter
      (fun x ->
        let w = String.trim x in
        String.length w > 0 )
      words in
  print_endline
    (List.fold_left
       (fun acc word -> acc ^ parse word dictionary runes ^ " ")
       "" words )

let () =
  let line = ref "" in
  let extra = ref "" in
  let runes = ref "deseret" in
  Arg.parse
    [ (* ("-i", Arg.Set_string line, "input"); *)
      ("-d", Arg.Set_string extra, "dictionary")
    ; ("-r", Arg.Set_string runes, "runes (deseret|futhorc)") ]
    (fun x -> line := x)
    (Sys.argv.(0) ^ " [-d dictionary] <-i input>") ;
  let dictionary = load_dictionary !extra in
  let runeset =
    match !runes with
    | "deseret" -> DESERET
    | "futhorc" -> FUTHORC
    | _ -> raise Not_found in
  if String.length !line > 0 then translate runeset dictionary !line
  else
    try
      while true do
        read_line () |> translate runeset dictionary
      done
    with End_of_file -> ()