ref: 12c9d2fc728b51aa1eb9a70d0d331eb9464912d9
dir: /src/read.c/
#include "sl.h"
#include "cvalues.h"
#include "read.h"
#include "nan.h"
enum {
TOK_NONE, TOK_OPEN, TOK_CLOSE, TOK_DOT, TOK_QUOTE, TOK_SYM, TOK_NUM,
TOK_BQ, TOK_COMMA, TOK_COMMAAT, TOK_COMMADOT,
TOK_SHARPDOT, TOK_LABEL, TOK_BACKREF, TOK_SHARPQUOTE, TOK_SHARPOPEN,
TOK_OPENB, TOK_CLOSEB, TOK_SHARPSYM, TOK_GENSYM, TOK_DOUBLEQUOTE,
TOK_OPENC, TOK_CLOSEC,
};
#define PAtLoc "at %"PRIu32":%"PRIu32
typedef struct Rctx Rctx;
struct Rctx {
u32int toktype;
sl_v tokval;
sl_loc loc;
char buf[1024];
};
static sl_v do_read_sexpr(Rctx *ctx, sl_v label);
#define RS value2c(sl_ios*, sl.readstate->source)
bool
sl_read_numtok(const char *tok, sl_v *pval, int base)
{
char *end;
s64int i64;
double d;
if(*tok == '\0')
return false;
if(!((tok[0] == '0' && tok[1] == 'x') || (base >= 15)) && strpbrk(tok, ".eEpP")){
d = strtod(tok, &end);
if(*end == '\0'){
if(pval)
*pval = mk_double(d);
return true;
}
// floats can end in f or f0
if(end > tok && end[0] == 'f' &&
(end[1] == '\0' ||
(end[1] == '0' && end[2] == '\0'))){
if(pval)
*pval = mk_float((float)d);
return true;
}
}
if(tok[0] == '+'){
if(!strcmp(tok, "+NaN") || !strcasecmp(tok, "+nan.0")){
if(pval)
*pval = mk_double(D_PNAN);
return true;
}
if(!strcmp(tok, "+Inf") || !strcasecmp(tok, "+inf.0")){
if(pval)
*pval = mk_double(D_PINF);
return true;
}
}else if(tok[0] == '-'){
if(!strcmp(tok, "-NaN") || !strcasecmp(tok, "-nan.0")){
if(pval)
*pval = mk_double(D_NNAN);
return true;
}
if(!strcmp(tok, "-Inf") || !strcasecmp(tok, "-inf.0")){
if(pval)
*pval = mk_double(D_NINF);
return true;
}
}
i64 = strtoll(tok, &end, base);
if(*end != '\0')
return false;
if(pval != nil){
mpint *m;
if(fits_fixnum(i64))
*pval = fixnum(i64);
else if((m = strtomp(tok, &end, base, nil)) != nil)
*pval = mk_mp(m);
else
return false;
}
return true;
}
static char
nextchar(void)
{
int ch;
char c;
sl_ios *f = RS;
do{
ch = ios_getc(RS);
if(ch == IOS_EOF)
return 0;
c = (char)ch;
if(c == ';'){
// single-line comment
do{
ch = ios_getc(f);
if(ch == IOS_EOF)
return 0;
}while((char)ch != '\n');
c = (char)ch;
}
}while(c == ' ' || isspace(c));
return c;
}
static void
take(Rctx *ctx)
{
ctx->toktype = TOK_NONE;
}
static _Noreturn void sl_printfmt(2, 3)
parse_error(sl_loc *loc, const char *format, ...)
{
char msgbuf[512];
va_list args;
int n;
n = snprintf(msgbuf, sizeof(msgbuf), "%s:%"PRIu64":%"PRIu64": ",
loc->filename, (u64int)loc->lineno, (u64int)loc->colno);
if(n >= (int)sizeof(msgbuf))
n = 0;
va_start(args, format);
vsnprintf(msgbuf+n, sizeof(msgbuf)-n, format, args);
sl_v msg = string_from_cstr(msgbuf);
va_end(args);
sl_raise(mk_list2(sl_errparse, msg));
}
static void
accumchar(Rctx *ctx, char c, int *pi)
{
ctx->buf[(*pi)++] = c;
if(*pi >= (int)(sizeof(ctx->buf)-1))
parse_error(&ctx->loc, "token too long");
}
// return: 1 if escaped (forced to be symbol)
static bool
read_token(Rctx *ctx, char c, bool digits)
{
int i = 0, ch, nc = 0;
bool escaped = false, issym = false;
while(1){
if(nc != 0){
if(nc != 1)
ios_getc(RS);
ch = ios_peekc(RS);
if(ch == IOS_EOF)
goto terminate;
c = (char)ch;
}
if(c == '|'){
issym = true;
escaped = !escaped;
}else if(c == '\\'){
issym = true;
ios_getc(RS);
ch = ios_peekc(RS);
if(ch == IOS_EOF)
goto terminate;
accumchar(ctx, (char)ch, &i);
}else if(!escaped && !(symchar(c) && (!digits || isdigit(c)))){
break;
}else{
accumchar(ctx, c, &i);
}
nc++;
}
if(nc == 0)
ios_skip(RS, -1);
terminate:
ctx->buf[i++] = '\0';
return issym;
}
static int
isdigit_base(char c, int base)
{
if(base < 11)
return c >= '0' && c < '0'+base;
return (c >= '0' && c <= '9') || (c >= 'a' && c < 'a'+base-10) || (c >= 'A' && c < 'A'+base-10);
}
static u32int
peek(Rctx *ctx)
{
char c, *end;
sl_fx x;
int ch, base;
if(ctx->toktype != TOK_NONE)
return ctx->toktype;
c = nextchar();
ctx->loc = RS->loc;
if(ios_eof(RS))
return TOK_NONE;
if(c == '(')
ctx->toktype = TOK_OPEN;
else if(c == ')')
ctx->toktype = TOK_CLOSE;
else if(c == '[')
ctx->toktype = TOK_OPENB;
else if(c == ']')
ctx->toktype = TOK_CLOSEB;
else if(c == '{')
ctx->toktype = TOK_OPENC;
else if(c == '}')
ctx->toktype = TOK_CLOSEC;
else if(c == '\'')
ctx->toktype = TOK_QUOTE;
else if(c == '`')
ctx->toktype = TOK_BQ;
else if(c == '"')
ctx->toktype = TOK_DOUBLEQUOTE;
else if(c == '#'){
ch = ios_getc(RS); c = (char)ch;
if(ch == IOS_EOF)
parse_error(&ctx->loc, "invalid read macro");
if(c == '.')
ctx->toktype = TOK_SHARPDOT;
else if(c == '\'')
ctx->toktype = TOK_SHARPQUOTE;
else if(c == '\\'){
Rune cval;
if(ios_getutf8(RS, &cval) == IOS_EOF)
parse_error(&ctx->loc, "end of input in character constant");
if(cval == 'u' || cval == 'U' || cval == 'x'){
read_token(ctx, 'u', 0);
if(ctx->buf[1] != '\0'){ // not a solitary 'u','U','x'
if(!sl_read_numtok(&ctx->buf[1], &ctx->tokval, 16))
parse_error(&ctx->loc, "invalid hex character constant");
cval = numval(ctx->tokval);
}
}else if(cval >= 'a' && cval <= 'z'){
read_token(ctx, (char)cval, 0);
ctx->tokval = symbol(ctx->buf, true);
if(ctx->buf[1] == '\0') USED(cval); /* one character */
else if(ctx->tokval == sl_nulsym) cval = 0x00;
else if(ctx->tokval == sl_alarmsym) cval = 0x07;
else if(ctx->tokval == sl_backspacesym) cval = 0x08;
else if(ctx->tokval == sl_tabsym) cval = 0x09;
else if(ctx->tokval == sl_linefeedsym) cval = 0x0A;
else if(ctx->tokval == sl_newlinesym) cval = 0x0A;
else if(ctx->tokval == sl_vtabsym) cval = 0x0B;
else if(ctx->tokval == sl_pagesym) cval = 0x0C;
else if(ctx->tokval == sl_returnsym) cval = 0x0D;
else if(ctx->tokval == sl_escsym) cval = 0x1B;
else if(ctx->tokval == sl_spacesym) cval = 0x20;
else if(ctx->tokval == sl_deletesym) cval = 0x7F;
else
parse_error(&ctx->loc, "unknown character #\\%s", ctx->buf);
}
ctx->toktype = TOK_NUM;
ctx->tokval = mk_rune(cval);
}else if(c == '('){
ctx->toktype = TOK_SHARPOPEN;
}else if(c == '<'){
parse_error(&ctx->loc, "unreadable object");
}else if(isdigit(c)){
read_token(ctx, c, 1);
c = (char)ios_getc(RS);
if(c == '#')
ctx->toktype = TOK_BACKREF;
else if(c == '=')
ctx->toktype = TOK_LABEL;
else
parse_error(&ctx->loc, "invalid label");
x = strtoll(ctx->buf, &end, 10);
if(*end != '\0')
parse_error(&ctx->loc, "invalid label");
ctx->tokval = fixnum(x);
}else if(c == '!'){
// #! single line comment for shbang script support
do{
ch = ios_getc(RS);
}while(ch != IOS_EOF && (char)ch != '\n');
return peek(ctx);
}else if(c == '|'){
// multiline comment
int commentlevel = 1;
while(1){
ch = ios_getc(RS);
hashpipe_gotc:
if(ch == IOS_EOF)
parse_error(&ctx->loc, "eof within comment");
if((char)ch == '|'){
ch = ios_getc(RS);
if((char)ch == '#'){
commentlevel--;
if(commentlevel == 0)
break;
else
continue;
}
goto hashpipe_gotc;
}else if((char)ch == '#'){
ch = ios_getc(RS);
if((char)ch == '|')
commentlevel++;
else
goto hashpipe_gotc;
}
}
// this was whitespace, so keep peeking
return peek(ctx);
}else if(c == ';'){
// datum comment
(void)do_read_sexpr(ctx, UNBOUND); // skip
return peek(ctx);
}else if(c == ':'){
// gensym
ch = ios_getc(RS);
if((char)ch == 'g')
ch = ios_getc(RS);
read_token(ctx, (char)ch, 0);
x = strtol(ctx->buf, &end, 10);
if(*end != '\0' || ctx->buf[0] == '\0')
parse_error(&ctx->loc, "invalid gensym label");
ctx->toktype = TOK_GENSYM;
ctx->tokval = fixnum(x);
}else if(symchar(c)){
read_token(ctx, ch, 0);
if(((c == 'b' && (base = 2)) ||
(c == 'o' && (base = 8)) ||
(c == 'd' && (base = 10)) ||
(c == 'x' && (base = 16))) && (isdigit_base(ctx->buf[1], base) || ctx->buf[1] == '-')){
if(!sl_read_numtok(&ctx->buf[1], &ctx->tokval, base))
parse_error(&ctx->loc, "invalid base %d constant", base);
return (ctx->toktype = TOK_NUM);
}
ctx->toktype = TOK_SHARPSYM;
ctx->tokval = symbol(ctx->buf, true);
}else{
parse_error(&ctx->loc, "unknown read macro");
}
}else if(c == ','){
ctx->toktype = TOK_COMMA;
ch = ios_peekc(RS);
if(ch == IOS_EOF)
return ctx->toktype;
if((char)ch == '@')
ctx->toktype = TOK_COMMAAT;
else if((char)ch == '.')
ctx->toktype = TOK_COMMADOT;
else
return ctx->toktype;
ios_getc(RS);
}else{
bool ok = read_token(ctx, c, 0);
const char *s = ctx->buf;
if(!ok){
if(s[0] == '.' && s[1] == '\0')
return (ctx->toktype = TOK_DOT);
if(sl_read_numtok(s, &ctx->tokval, 0))
return (ctx->toktype = TOK_NUM);
}
ctx->toktype = TOK_SYM;
if(strcasecmp(s, "nil") == 0)
ctx->tokval = sl_nil;
else if(s[1] == 0 && (s[0] == 't' || s[0] == 'T'))
ctx->tokval = sl_t;
else if(strcmp(s, "λ") == 0 || strcmp(s, "lambda") == 0)
ctx->tokval = sl_lambda;
else{
ctx->tokval = symbol(s, true);
if(s[strlen(s)-1] == '#')
ctx->toktype = TOK_GENSYM;
}
}
return ctx->toktype;
}
// NOTE: this is NOT an efficient operation. it is only used by the
// reader, and requires at least 1 and up to 3 garbage collections!
static sl_v
vector_grow(sl_v v, bool rewrite_refs)
{
usize i, s = vector_size(v);
usize d = vector_grow_amt(s);
PUSH(v);
assert(s+d > s);
sl_v newv = alloc_vector(s+d, 1);
v = sl.sp[-1];
for(i = 0; i < s; i++)
vector_elt(newv, i) = vector_elt(v, i);
// use gc to rewrite references from the old vector to the new
sl.sp[-1] = newv;
if(s > 0 && rewrite_refs){
((usize*)ptr(v))[0] |= 0x1;
vector_elt(v, 0) = newv;
sl_gc(false);
}
return POP();
}
static sl_v
read_vector(Rctx *ctx, sl_v label, u32int closer)
{
sl_v v = sl_emptyvec, elt;
u32int i = 0;
PUSH(v);
if(label != UNBOUND)
ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)v);
while(peek(ctx) != closer){
if(ios_eof(RS))
parse_error(&ctx->loc, "unexpected end of input");
v = sl.sp[-1]; // reload after possible alloc in peek()
if(i >= vector_size(v)){
v = sl.sp[-1] = vector_grow(v, label != UNBOUND);
if(label != UNBOUND)
ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)v);
}
elt = do_read_sexpr(ctx, UNBOUND);
v = sl.sp[-1];
assert(i < vector_size(v));
vector_elt(v, i) = elt;
i++;
}
take(ctx);
if(i > 0)
vector_setsize(v, i);
return POP();
}
static sl_v
read_string(Rctx *ctx)
{
char *buf, *temp;
char eseq[10];
usize i = 0, j, sz, ndig;
int c;
sl_v s;
Rune r = 0;
sz = sizeof(ctx->buf);
buf = ctx->buf;
while(1){
if(i >= sz-UTFmax){ // -UTFmax: leaves room for longest utf8 sequence
sz *= 2;
if(buf == ctx->buf){
if((temp = MEM_ALLOC(sz)) != nil)
memcpy(temp, ctx->buf, i);
}else
temp = MEM_REALLOC(buf, sz);
if(temp == nil){
if(buf == ctx->buf)
MEM_FREE(buf);
parse_error(&ctx->loc, "out of memory reading string");
}
buf = temp;
}
c = ios_getc(RS);
if(c == IOS_EOF){
if(buf != ctx->buf)
MEM_FREE(buf);
parse_error(&ctx->loc, "unexpected end of input in string");
}
if(c == '"')
break;
else if(c == '\\'){
c = ios_getc(RS);
if(c == IOS_EOF){
if(buf != ctx->buf)
MEM_FREE(buf);
parse_error(&ctx->loc, "end of input in escape sequence");
}
j = 0;
if(octal_digit(c)){
while(1){
eseq[j++] = c;
c = ios_peekc(RS);
if(c == IOS_EOF || !octal_digit(c) || j >= 3)
break;
ios_getc(RS);
}
eseq[j] = '\0';
r = strtol(eseq, nil, 8);
// \DDD and \xXX read bytes, not characters
buf[i++] = (char)r;
}else if((c == 'x' && (ndig = 2)) || (c == 'u' && (ndig = 4)) || (c == 'U' && (ndig = 8))){
while(1){
c = ios_peekc(RS);
if(c == IOS_EOF || !hex_digit(c) || j >= ndig)
break;
eseq[j++] = c;
ios_getc(RS);
}
eseq[j] = '\0';
if(j)
r = strtol(eseq, nil, 16);
if(!j || r > Runemax){
if(buf != ctx->buf)
MEM_FREE(buf);
parse_error(&ctx->loc, "invalid escape sequence");
}
if(ndig == 2)
buf[i++] = (char)r;
else
i += runetochar(&buf[i], &r);
}else if(c == '\n'){
/* do nothing */
}else{
char esc = read_escape_control_char((char)c);
if(esc == (char)c && !strchr("\\'\"`", esc)){
if(buf != ctx->buf)
MEM_FREE(buf);
sl_loc *l = &RS->loc;
parse_error(
&ctx->loc,
"invalid escape sequence \\%c "PAtLoc,
(char)c,
l->lineno,
l->colno
);
}
buf[i++] = esc;
}
}else{
buf[i++] = c;
}
}
s = cvalue_string(i);
memcpy(cvalue_data(s), buf, i);
if(buf != ctx->buf)
MEM_FREE(buf);
return s;
}
// build a list of conses. this is complicated by the fact that all conses
// can move whenever a new cons is allocated. we have to refer to every cons
// through a handle to a relocatable pointer (i.e. a pointer on the stack).
static void
read_list(Rctx *ctx, sl_v label, u32int closer)
{
sl_v c, *pc, *pval, *ipval, *ipc;
u32int t;
sl_loc loc0;
loc0 = RS->loc;
loc0.colno--;
ipval = sl.sp-1;
PUSH(sl_nil);
ipc = sl.sp-1; // to keep track of current cons cell
t = peek(ctx);
while(t != closer){
if(ios_eof(RS))
parse_error(&loc0, "not closed: unexpected EOI "PAtLoc, ctx->loc.lineno, ctx->loc.colno);
c = alloc_cons(); car_(c) = cdr_(c) = sl_nil;
pc = ipc;
if(iscons(*pc))
cdr_(*pc) = c;
else{
pval = ipval;
*pval = c;
if(label != UNBOUND)
ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)c);
}
*pc = c;
c = do_read_sexpr(ctx, UNBOUND);
pc = ipc;
car_(*pc) = c;
t = peek(ctx);
if(t == TOK_DOT){
take(ctx);
c = do_read_sexpr(ctx, UNBOUND);
pc = ipc;
cdr_(*pc) = c;
t = peek(ctx);
if(ios_eof(RS))
parse_error(&ctx->loc, "unexpected end of input");
if(t != closer){
take(ctx);
parse_error(
&ctx->loc,
"expected '%c'",
closer == TOK_CLOSEB ? ']' : (closer == TOK_CLOSEC ? '}' : ')')
);
}
}
}
take(ctx);
c = POP();
USED(c);
}
// label is the backreference we'd like to fix up with this read
static sl_v
do_read_sexpr(Rctx *ctx, sl_v label)
{
sl_v v, sym, oldtokval, *head;
sl_v *pv;
u32int t;
char c;
t = peek(ctx);
take(ctx);
switch(t){
case TOK_OPEN:
PUSH(sl_nil);
read_list(ctx, label, TOK_CLOSE);
return POP();
case TOK_SYM:
case TOK_NUM:
return ctx->tokval;
case TOK_OPENB:
PUSH(sl_nil);
read_list(ctx, label, TOK_CLOSEB);
return POP();
case TOK_OPENC:
PUSH(sl_nil);
read_list(ctx, label, TOK_CLOSEC);
return POP();
case TOK_COMMA:
head = &sl_comma; goto listwith;
case TOK_COMMAAT:
head = &sl_commaat; goto listwith;
case TOK_COMMADOT:
head = &sl_commadot; goto listwith;
case TOK_BQ:
head = &sl_backquote; goto listwith;
case TOK_QUOTE:
head = &sl_quote;
listwith:
v = cons_reserve(2);
car_(v) = *head;
cdr_(v) = tagptr((sl_cons*)ptr(v)+1, TAG_CONS);
car_(cdr_(v)) = cdr_(cdr_(v)) = sl_nil;
PUSH(v);
if(label != UNBOUND)
ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)v);
v = do_read_sexpr(ctx, UNBOUND);
car_(cdr_(sl.sp[-1])) = v;
return POP();
case TOK_SHARPQUOTE:
// StreetLISP doesn't need symbol-function, so #' does nothing
return do_read_sexpr(ctx, label);
case TOK_SHARPSYM:
sym = ctx->tokval;
// constructor notation
c = nextchar();
ctx->loc = RS->loc;
if(c != '('){
take(ctx);
parse_error(&ctx->loc, "expected argument list for %s", symbol_name(ctx->tokval));
}
PUSH(sl_nil);
read_list(ctx, UNBOUND, TOK_CLOSE);
if(sym == sl_vu8sym){
sym = sl_arraysym;
sl.sp[-1] = mk_cons(sl_u8sym, sl.sp[-1]);
}else if(sym == sl_fnsym){
sym = sl_function;
}
v = symbol_value(sym);
if(v == UNBOUND)
unbound_error(sym);
return sl_apply(v, POP());
case TOK_SHARPOPEN:
return read_vector(ctx, label, TOK_CLOSE);
case TOK_SHARPDOT:
// eval-when-read
// evaluated expressions can refer to existing backreferences, but they
// cannot see pending labels. in other words:
// (... #2=#.#0# ... ) OK
// (... #2=#.(#2#) ... ) DO NOT WANT
sym = do_read_sexpr(ctx, UNBOUND);
if(issymbol(sym)){
v = symbol_value(sym);
if(v == UNBOUND)
unbound_error(sym);
return v;
}
return sl_toplevel_eval(sym);
case TOK_LABEL:
// create backreference label
if(ptrhash_has(&sl.readstate->backrefs, (void*)ctx->tokval))
parse_error(&ctx->loc, "label %"PRIdPTR" redefined", (intptr)numval(ctx->tokval));
oldtokval = ctx->tokval;
v = do_read_sexpr(ctx, ctx->tokval);
ptrhash_put(&sl.readstate->backrefs, (void*)oldtokval, (void*)v);
return v;
case TOK_BACKREF:
// look up backreference
v = (sl_v)ptrhash_get(&sl.readstate->backrefs, (void*)ctx->tokval);
if(v == (sl_v)HT_NOTFOUND)
parse_error(&ctx->loc, "undefined label %"PRIdPTR, (intptr)numval(ctx->tokval));
return v;
case TOK_GENSYM:
pv = (sl_v*)ptrhash_bp(&sl.readstate->gensyms, (void*)ctx->tokval);
if(*pv == (sl_v)HT_NOTFOUND)
*pv = gensym();
return *pv;
case TOK_DOUBLEQUOTE:
return read_string(ctx);
case TOK_CLOSE:
parse_error(&ctx->loc, "unexpected ')'");
case TOK_CLOSEB:
parse_error(&ctx->loc, "unexpected ']'");
case TOK_CLOSEC:
parse_error(&ctx->loc, "unexpected '}'");
case TOK_DOT:
parse_error(&ctx->loc, "unexpected '.'");
}
return sl_void;
}
sl_v
sl_read_sexpr(sl_v f)
{
sl_readstate state;
state.prev = sl.readstate;
htable_new(&state.backrefs, 8);
htable_new(&state.gensyms, 8);
state.source = f;
sl.readstate = &state;
Rctx ctx;
ctx.toktype = TOK_NONE;
sl_gc_handle(&ctx.tokval);
sl_v v = do_read_sexpr(&ctx, UNBOUND);
sl_free_gc_handles(1);
sl.readstate = state.prev;
free_readstate(&state);
return v;
}