shithub: martian9

ref: 02b360fe6c10ef6ec280c44a7935df743e68e0de
dir: martian9/reader.ml

View raw version
module T = Types.Types

exception Syntax_error of string

let token_re =
  Str.regexp "~@\\|[][{}()'`~^@]\\|\"\\(\\\\.\\|[^\"]\\)*\"?\\|;.*\\|[^][  \n{}('\"`,;)]*"
;;

let string_re = Str.regexp "\"\\(\\\\.\\|[^\\\\\"]\\)*\""

type reader =
  { form : Types.m9type
  ; tokens : string list
  }

type list_reader =
  { list_form : Types.m9type list
  ; tokens : string list
  }

let tokenize str =
  List.map
    (function
      | Str.Delim x -> String.trim x (* move trim to regex for speed? *)
      | Str.Text x -> "tokenize botch")
    (List.filter
       (function
         | Str.Delim x -> true
         | Str.Text x -> false)
       (Str.full_split token_re str))
;;

(* copied verbatim - must needs grok *)
let gsub re f str =
  String.concat
    ""
    (List.map
       (function
         | Str.Delim x -> f x
         | Str.Text x -> x)
       (Str.full_split re str))
;;

let unescape_string token =
  if Str.string_match string_re token 0
  then (
    let without_quotes = String.sub token 1 (String.length token - 2) in
    gsub
      (Str.regexp "\\\\.")
      (function
        | "\\n" -> "\n"
        | x -> String.sub x 1 1)
      without_quotes)
  else (
    output_string stderr "expected '\"', got EOF\n";
    flush stderr;
    raise End_of_file)
;;

let read_atom token =
  match token with
  | "null" -> T.Nil
  | "#t" | "#true" -> T.Bool true
  | "#f" | "#false" -> T.Bool false
  | _ ->
    (match token.[0] with
    | '0' .. '9' -> Types.number (float_of_string token)
    | '#' ->
      (match token.[1], token.[2] with
      | '\\', '0' .. '9' | '\\', 'a' .. 'z' | '\\', 'A' .. 'Z' -> T.Char token.[2]
      | _ -> Types.symbol token)
    | '-' ->
      (match String.length token with
      | 1 -> Types.symbol token
      | _ ->
        (match token.[1] with
        | '0' .. '9' -> Types.number (float_of_string token)
        | _ -> Types.symbol token))
    | '"' -> T.String (unescape_string token)
    | _ -> Types.symbol token)
;;

let rec read_list eol list_reader =
  match list_reader.tokens with
  | [] -> raise (Syntax_error ("unterminated '" ^ eol ^ "'"))
  | token :: tokens ->
    if Str.string_match (Str.regexp eol) token 0
    then { list_form = list_reader.list_form; tokens }
    else (
      let reader = read_form list_reader.tokens in
      read_list
        eol
        { list_form = list_reader.list_form @ [ reader.form ]; tokens = reader.tokens })

and read_quote sym tokens =
  let reader = read_form tokens in
  { form = Types.list [ Types.symbol sym; reader.form ]; tokens = reader.tokens }

and read_vector all_tokens =
  match all_tokens with
  | [] -> raise End_of_file
  | token :: tokens ->
    (match token with
    | "(" ->
      let list_reader = read_list ")" { list_form = []; tokens } in
      { form = Types.vector list_reader.list_form; tokens = list_reader.tokens }
    | _ -> read_form tokens)

and read_form all_tokens =
  match all_tokens with
  | [] -> raise End_of_file
  | token :: tokens ->
    (match token with
    | "'" -> read_quote "quote" tokens
    | "`" -> read_quote "quasiquote" tokens
    | "#" -> read_vector tokens
    | "#|" ->
      let list_reader = read_list "|#" { list_form = []; tokens } in
      { form = T.Unspecified; tokens = list_reader.tokens }
    | "(" ->
      let list_reader = read_list ")" { list_form = []; tokens } in
      { form = Types.list list_reader.list_form; tokens = list_reader.tokens }
    | "" | "\t" | "\n" -> read_form tokens
    | _ ->
      if token.[0] = ';' then read_form tokens else { form = read_atom token; tokens })
;;

let slurp filename =
  let chan = open_in filename in
  let b = Buffer.create 27 in
  Buffer.add_channel b chan (in_channel_length chan);
  close_in chan;
  Buffer.contents b
;;

let read str = (read_form (tokenize str)).form