ref: a70379d7e4b822f532fb0a8ccdd1624a90b64a68
dir: /src/read.c/
#include "sl.h" #include "cvalues.h" #include "read.h" #include "nan.h" enum { TOK_NONE, TOK_OPEN, TOK_CLOSE, TOK_DOT, TOK_QUOTE, TOK_SYM, TOK_NUM, TOK_BQ, TOK_COMMA, TOK_COMMAAT, TOK_COMMADOT, TOK_SHARPDOT, TOK_LABEL, TOK_BACKREF, TOK_SHARPQUOTE, TOK_SHARPOPEN, TOK_OPENB, TOK_CLOSEB, TOK_SHARPSYM, TOK_GENSYM, TOK_DOUBLEQUOTE, TOK_OPENC, TOK_CLOSEC, TOK_VERBATIM, }; #define PAtLoc "at %"PRIu32":%"PRIu32 typedef struct Rctx Rctx; struct Rctx { sl_v tokval; sl_loc loc; u32int toktype; bool ws; char buf[1024]; }; static sl_v do_read_sexpr(Rctx *ctx, sl_v label); #define RS value2c(sl_ios*, sl.readstate->source) bool sl_read_numtok(const char *tok, sl_v *pval, int base) { char *end; s64int i64; double d; if(*tok == '\0') return false; if(!((tok[0] == '0' && tok[1] == 'x') || (base >= 15)) && strpbrk(tok, ".eEpP")){ d = strtod(tok, &end); if(*end == '\0'){ if(pval) *pval = mk_double(d); return true; } // floats can end in f or f0 if(end > tok && end[0] == 'f' && (end[1] == '\0' || (end[1] == '0' && end[2] == '\0'))){ if(pval) *pval = mk_float((float)d); return true; } } if(tok[0] == '+'){ if(!strcmp(tok, "+NaN") || !strcasecmp(tok, "+nan.0")){ if(pval) *pval = mk_double(D_PNAN); return true; } if(!strcmp(tok, "+Inf") || !strcasecmp(tok, "+inf.0")){ if(pval) *pval = mk_double(D_PINF); return true; } }else if(tok[0] == '-'){ if(!strcmp(tok, "-NaN") || !strcasecmp(tok, "-nan.0")){ if(pval) *pval = mk_double(D_NNAN); return true; } if(!strcmp(tok, "-Inf") || !strcasecmp(tok, "-inf.0")){ if(pval) *pval = mk_double(D_NINF); return true; } } i64 = strtoll(tok, &end, base); if(*end != '\0') return false; if(pval != nil){ mpint *m; if(fits_fixnum(i64)) *pval = fixnum(i64); else if((m = strtomp(tok, &end, base, nil)) != nil) *pval = mk_mp(m); else return false; } return true; } static char nextchar(Rctx *ctx) { int c; sl_ios *f = RS; do{ c = ios_getc(RS); if(c == IOS_EOF) return 0; if(c == ';'){ // single-line comment do{ c = ios_getc(f); if(c == IOS_EOF) return 0; }while(c != '\n'); } if(c == '\n' && ctx->ws) return '\n'; }while(c == ' ' || isspace(c)); return c; } static void take(Rctx *ctx) { ctx->toktype = TOK_NONE; } static _Noreturn void sl_printfmt(2, 3) parse_error(sl_loc *loc, const char *format, ...) { char msgbuf[512]; va_list args; int n; n = snprintf(msgbuf, sizeof(msgbuf), "%s:%"PRIu64":%"PRIu64": ", loc->filename, (u64int)loc->lineno, (u64int)loc->colno); if(n >= (int)sizeof(msgbuf)) n = 0; va_start(args, format); vsnprintf(msgbuf+n, sizeof(msgbuf)-n, format, args); sl_v msg = str_from_cstr(msgbuf); va_end(args); sl_raise(mk_list2(sl_errparse, msg)); } static void accumchar(Rctx *ctx, char c, int *pi) { ctx->buf[(*pi)++] = c; if(*pi >= (int)(sizeof(ctx->buf)-1)) parse_error(&ctx->loc, "token too long"); } // return: 1 if escaped (forced to be symbol) static bool read_token(Rctx *ctx, char c, bool digits) { int i = 0, ch, nc = 0; bool escaped = false, issym = false; while(1){ if(nc != 0){ if(nc != 1) ios_getc(RS); ch = ios_peekc(RS); if(ch == IOS_EOF) goto terminate; c = ch; } if(c == '|'){ issym = true; escaped = !escaped; }else if(c == '\\'){ issym = true; ios_getc(RS); ch = ios_peekc(RS); if(ch == IOS_EOF) goto terminate; accumchar(ctx, ch, &i); }else if(!escaped && !(symchar(c) && (!digits || isdigit(c)))){ break; }else{ accumchar(ctx, c, &i); } nc++; } if(nc == 0) ios_skip(RS, -1); terminate: ctx->buf[i++] = '\0'; return issym; } static int isdigit_base(char c, int base) { if(base < 11) return c >= '0' && c < '0'+base; return (c >= '0' && c <= '9') || (c >= 'a' && c < 'a'+base-10) || (c >= 'A' && c < 'A'+base-10); } static u32int peek(Rctx *ctx) { u8int c; char *end; sl_fx x; int ch, base; if(ctx->toktype != TOK_NONE){ ctx->ws = false; return ctx->toktype; } c = nextchar(ctx); ctx->loc = RS->loc; if(ios_eof(RS) || isspace(c)){ ctx->ws = false; return TOK_NONE; } if(c == '(') ctx->toktype = TOK_OPEN; else if(c == ')') ctx->toktype = TOK_CLOSE; else if(c == '[') ctx->toktype = TOK_OPENB; else if(c == ']') ctx->toktype = TOK_CLOSEB; else if(c == '{') ctx->toktype = TOK_OPENC; else if(c == '}') ctx->toktype = TOK_CLOSEC; else if(c == '\'') ctx->toktype = TOK_QUOTE; else if(c == '`') ctx->toktype = TOK_BQ; else if(c == '"') ctx->toktype = TOK_DOUBLEQUOTE; else if(c == 0xc2 && ios_peekc(RS) == 0xab){ // « ctx->toktype = TOK_VERBATIM; ios_getc(RS); }else if(c == '#'){ c = ch = ios_getc(RS); if(ch == IOS_EOF) parse_error(&ctx->loc, "invalid read macro"); if(c == '.') ctx->toktype = TOK_SHARPDOT; else if(c == '\'') ctx->toktype = TOK_SHARPQUOTE; else if(c == '\\'){ Rune cval; if(ios_getrune(RS, &cval) == IOS_EOF) parse_error(&ctx->loc, "end of input in character constant"); if(cval == 'u' || cval == 'U' || cval == 'x'){ read_token(ctx, 'u', 0); if(ctx->buf[1] != '\0'){ // not a solitary 'u','U','x' if(!sl_read_numtok(&ctx->buf[1], &ctx->tokval, 16)) parse_error(&ctx->loc, "invalid hex character constant"); cval = numval(ctx->tokval); } }else if(cval >= 'a' && cval <= 'z'){ read_token(ctx, cval, 0); ctx->tokval = mk_sym(ctx->buf, true); if(ctx->buf[1] == '\0') USED(cval); /* one character */ else if(ctx->tokval == sl_nulsym) cval = 0x00; else if(ctx->tokval == sl_alarmsym) cval = 0x07; else if(ctx->tokval == sl_backspacesym) cval = 0x08; else if(ctx->tokval == sl_tabsym) cval = 0x09; else if(ctx->tokval == sl_linefeedsym) cval = 0x0A; else if(ctx->tokval == sl_newlinesym) cval = 0x0A; else if(ctx->tokval == sl_vtabsym) cval = 0x0B; else if(ctx->tokval == sl_pagesym) cval = 0x0C; else if(ctx->tokval == sl_returnsym) cval = 0x0D; else if(ctx->tokval == sl_escsym) cval = 0x1B; else if(ctx->tokval == sl_spacesym) cval = 0x20; else if(ctx->tokval == sl_deletesym) cval = 0x7F; else parse_error(&ctx->loc, "unknown character #\\%s", ctx->buf); } ctx->toktype = TOK_NUM; ctx->tokval = mk_rune(cval); }else if(c == '('){ ctx->toktype = TOK_SHARPOPEN; }else if(c == '<'){ parse_error(&ctx->loc, "unreadable object"); }else if(isdigit(c)){ read_token(ctx, c, 1); c = ios_getc(RS); if(c == '#') ctx->toktype = TOK_BACKREF; else if(c == '=') ctx->toktype = TOK_LABEL; else parse_error(&ctx->loc, "invalid label"); x = strtoll(ctx->buf, &end, 10); if(*end != '\0') parse_error(&ctx->loc, "invalid label"); ctx->tokval = fixnum(x); }else if(c == '!'){ // #! single line comment for shbang script support do{ ch = ios_getc(RS); }while(ch != IOS_EOF && ch != '\n'); return peek(ctx); }else if(c == '|'){ // multiline comment int commentlevel = 1; while(1){ ch = ios_getc(RS); hashpipe_gotc: if(ch == IOS_EOF) parse_error(&ctx->loc, "eof within comment"); if(ch == '|'){ ch = ios_getc(RS); if(ch == '#'){ commentlevel--; if(commentlevel == 0) break; else continue; } goto hashpipe_gotc; }else if(ch == '#'){ ch = ios_getc(RS); if(ch == '|') commentlevel++; else goto hashpipe_gotc; } } return peek(ctx); }else if(c == ';'){ // datum comment (void)do_read_sexpr(ctx, UNBOUND); // skip return peek(ctx); }else if(c == ':'){ // gensym ch = ios_getc(RS); if(ch == 'g') ch = ios_getc(RS); read_token(ctx, ch, 0); x = strtol(ctx->buf, &end, 10); if(*end != '\0' || ctx->buf[0] == '\0') parse_error(&ctx->loc, "invalid gensym label"); ctx->toktype = TOK_GENSYM; ctx->tokval = fixnum(x); }else if(symchar(c)){ read_token(ctx, ch, 0); if(((c == 'b' && (base = 2)) || (c == 'o' && (base = 8)) || (c == 'd' && (base = 10)) || (c == 'x' && (base = 16))) && (isdigit_base(ctx->buf[1], base) || ctx->buf[1] == '-')){ if(!sl_read_numtok(&ctx->buf[1], &ctx->tokval, base)) parse_error(&ctx->loc, "invalid base %d constant", base); ctx->ws = false; return (ctx->toktype = TOK_NUM); } ctx->toktype = TOK_SHARPSYM; ctx->tokval = mk_sym(ctx->buf, true); }else{ parse_error(&ctx->loc, "unknown read macro"); } }else if(c == ','){ ctx->toktype = TOK_COMMA; ch = ios_peekc(RS); if(ch == IOS_EOF) return ctx->toktype; if(ch == '@') ctx->toktype = TOK_COMMAAT; else if(ch == '.') ctx->toktype = TOK_COMMADOT; else{ ctx->ws = false; return ctx->toktype; } ios_getc(RS); }else{ bool ok = read_token(ctx, c, 0); const char *s = ctx->buf; if(!ok){ if(s[0] == '.' && s[1] == 0){ ctx->ws = false; return (ctx->toktype = TOK_DOT); } if(sl_read_numtok(s, &ctx->tokval, 0)){ ctx->ws = false; return (ctx->toktype = TOK_NUM); } } ctx->toktype = TOK_SYM; if(strcasecmp(s, "nil") == 0) ctx->tokval = sl_nil; else if(s[1] == 0 && (s[0] == 't' || s[0] == 'T')) ctx->tokval = sl_t; else{ ctx->tokval = mk_sym(s, true); if(s[strlen(s)-1] == '#') ctx->toktype = TOK_GENSYM; } } ctx->ws = false; return ctx->toktype; } // NOTE: this is NOT an efficient operation. it is only used by the // reader, and requires at least 1 and up to 3 garbage collections! static sl_v vec_grow(sl_v v, bool rewrite_refs) { usize i, s = vec_size(v); usize d = vec_grow_amt(s); PUSH(v); assert(s+d > s); sl_v newv = alloc_vec(s+d, 1); v = sl.sp[-1]; for(i = 0; i < s; i++) vec_elt(newv, i) = vec_elt(v, i); // use gc to rewrite references from the old vector to the new sl.sp[-1] = newv; if(s > 0 && rewrite_refs){ ((usize*)ptr(v))[0] |= 0x1; vec_elt(v, 0) = newv; sl_gc(false); } return POP(); } static sl_v read_vec(Rctx *ctx, sl_v label, u32int closer) { sl_v v = sl_emptyvec, elt; u32int i = 0; PUSH(v); if(label != UNBOUND) ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)v); while(peek(ctx) != closer){ if(ios_eof(RS)) parse_error(&ctx->loc, "unexpected end of input"); v = sl.sp[-1]; // reload after possible alloc in peek() if(i >= vec_size(v)){ v = sl.sp[-1] = vec_grow(v, label != UNBOUND); if(label != UNBOUND) ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)v); } elt = do_read_sexpr(ctx, UNBOUND); v = sl.sp[-1]; assert(i < vec_size(v)); vec_elt(v, i) = elt; i++; } take(ctx); if(i > 0) vec_setsize(v, i); return POP(); } static sl_v read_verbatim(Rctx *ctx) { char *temp; usize i = 0, sz = sizeof(ctx->buf); char *buf = ctx->buf; while(1){ if(i >= sz-UTFmax){ // -UTFmax: leaves room for longest utf8 sequence sz *= 2; if(buf == ctx->buf){ if((temp = MEM_ALLOC(sz)) != nil) memcpy(temp, ctx->buf, i); }else temp = MEM_REALLOC(buf, sz); if(temp == nil){ if(buf == ctx->buf) MEM_FREE(buf); parse_error(&ctx->loc, "out of memory reading verbatim string"); } buf = temp; } int c = ios_getc(RS); if(c == IOS_EOF){ if(buf != ctx->buf) MEM_FREE(buf); parse_error(&ctx->loc, "unexpected end of input in verbatim string"); } if(c == 0xc2 && ios_peekc(RS) == 0xbb){ // » ios_getc(RS); break; } buf[i++] = c; } sl_v s = cvalue_str(i); memcpy(cvalue_data(s), buf, i); if(buf != ctx->buf) MEM_FREE(buf); return s; } static sl_v read_str(Rctx *ctx) { char *buf, *temp; char eseq[10]; usize i = 0, j, sz, ndig; sl_v s; Rune r = 0; sz = sizeof(ctx->buf); buf = ctx->buf; while(1){ if(i >= sz-UTFmax){ // -UTFmax: leaves room for longest utf8 sequence sz *= 2; if(buf == ctx->buf){ if((temp = MEM_ALLOC(sz)) != nil) memcpy(temp, ctx->buf, i); }else temp = MEM_REALLOC(buf, sz); if(temp == nil){ if(buf == ctx->buf) MEM_FREE(buf); parse_error(&ctx->loc, "out of memory reading string"); } buf = temp; } int c = ios_getc(RS); if(c == IOS_EOF){ if(buf != ctx->buf) MEM_FREE(buf); parse_error(&ctx->loc, "unexpected end of input in string"); } if(c == '"') break; else if(c == '\\'){ c = ios_getc(RS); if(c == IOS_EOF){ if(buf != ctx->buf) MEM_FREE(buf); parse_error(&ctx->loc, "end of input in escape sequence"); } j = 0; if(octal_digit(c)){ while(1){ eseq[j++] = c; c = ios_peekc(RS); if(c == IOS_EOF || !octal_digit(c) || j >= 3) break; ios_getc(RS); } eseq[j] = '\0'; r = strtol(eseq, nil, 8); // \DDD and \xXX read bytes, not characters buf[i++] = r; }else if((c == 'x' && (ndig = 2)) || (c == 'u' && (ndig = 4)) || (c == 'U' && (ndig = 8))){ while(1){ c = ios_peekc(RS); if(c == IOS_EOF || !hex_digit(c) || j >= ndig) break; eseq[j++] = c; ios_getc(RS); } eseq[j] = '\0'; if(j) r = strtol(eseq, nil, 16); if(!j || r > Runemax){ if(buf != ctx->buf) MEM_FREE(buf); parse_error(&ctx->loc, "invalid escape sequence"); } if(ndig == 2) buf[i++] = r; else i += runetochar(&buf[i], &r); }else if(c == '\n'){ /* do nothing */ }else{ char esc = read_escape_control_char(c); if(esc == c && !strchr("\\'\"`", esc)){ if(buf != ctx->buf) MEM_FREE(buf); sl_loc *l = &RS->loc; parse_error( &ctx->loc, "invalid escape sequence \\%c "PAtLoc, c, l->lineno, l->colno ); } buf[i++] = esc; } }else{ buf[i++] = c; } } s = cvalue_str(i); memcpy(cvalue_data(s), buf, i); if(buf != ctx->buf) MEM_FREE(buf); return s; } // build a list of conses. this is complicated by the fact that all conses // can move whenever a new cons is allocated. we have to refer to every cons // through a handle to a relocatable pointer (i.e. a pointer on the stack). static void read_list(Rctx *ctx, sl_v label, u32int closer) { sl_v c, *pc, *pval, *ipval, *ipc; u32int t; sl_loc loc0; loc0 = RS->loc; loc0.colno--; ipval = sl.sp-1; PUSH(sl_nil); ipc = sl.sp-1; // to keep track of current cons cell t = peek(ctx); while(t != closer){ if(ios_eof(RS)) parse_error(&loc0, "not closed: unexpected EOI "PAtLoc, ctx->loc.lineno, ctx->loc.colno); c = alloc_cons(); car_(c) = cdr_(c) = sl_nil; pc = ipc; if(iscons(*pc)) cdr_(*pc) = c; else{ pval = ipval; *pval = c; if(label != UNBOUND) ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)c); } *pc = c; c = do_read_sexpr(ctx, UNBOUND); pc = ipc; car_(*pc) = c; t = peek(ctx); if(t == TOK_DOT){ take(ctx); c = do_read_sexpr(ctx, UNBOUND); pc = ipc; cdr_(*pc) = c; t = peek(ctx); if(ios_eof(RS)) parse_error(&ctx->loc, "unexpected end of input"); if(t != closer){ take(ctx); parse_error( &ctx->loc, "expected '%c'", closer == TOK_CLOSEB ? ']' : (closer == TOK_CLOSEC ? '}' : ')') ); } } } take(ctx); c = POP(); USED(c); } // label is the backreference we'd like to fix up with this read static sl_v do_read_sexpr(Rctx *ctx, sl_v label) { sl_v v, sym, oldtokval, *head; sl_v *pv; u32int t; char c; t = peek(ctx); take(ctx); switch(t){ case TOK_OPEN: PUSH(sl_nil); read_list(ctx, label, TOK_CLOSE); return POP(); case TOK_SYM: case TOK_NUM: return ctx->tokval; case TOK_OPENB: PUSH(sl_nil); read_list(ctx, label, TOK_CLOSEB); return POP(); case TOK_OPENC: PUSH(sl_nil); read_list(ctx, label, TOK_CLOSEC); return POP(); case TOK_COMMA: head = &sl_comma; goto listwith; case TOK_COMMAAT: head = &sl_commaat; goto listwith; case TOK_COMMADOT: head = &sl_commadot; goto listwith; case TOK_BQ: head = &sl_backquote; goto listwith; case TOK_QUOTE: head = &sl_quote; listwith: v = cons_reserve(2); car_(v) = *head; cdr_(v) = tagptr((sl_cons*)ptr(v)+1, TAG_CONS); car_(cdr_(v)) = cdr_(cdr_(v)) = sl_nil; PUSH(v); if(label != UNBOUND) ptrhash_put(&sl.readstate->backrefs, (void*)label, (void*)v); v = do_read_sexpr(ctx, UNBOUND); car_(cdr_(sl.sp[-1])) = v; return POP(); case TOK_SHARPQUOTE: // StreetLISP doesn't need symbol-function, so #' does nothing return do_read_sexpr(ctx, label); case TOK_SHARPSYM: sym = ctx->tokval; // constructor notation c = nextchar(ctx); ctx->loc = RS->loc; if(c != '('){ take(ctx); parse_error(&ctx->loc, "expected argument list for %s", sym_name(ctx->tokval)); } PUSH(sl_nil); read_list(ctx, UNBOUND, TOK_CLOSE); if(sym == sl_vu8sym){ sym = sl_arrsym; sl.sp[-1] = mk_cons(sl_u8sym, sl.sp[-1]); } v = sym_value(sym); if(v == UNBOUND) unbound_error(sym); return sl_apply(v, POP()); case TOK_SHARPOPEN: return read_vec(ctx, label, TOK_CLOSE); case TOK_SHARPDOT: // eval-when-read // evaluated expressions can refer to existing backreferences, but they // cannot see pending labels. in other words: // (... #2=#.#0# ... ) OK // (... #2=#.(#2#) ... ) DO NOT WANT sym = do_read_sexpr(ctx, UNBOUND); if(issym(sym)){ v = sym_value(sym); if(v == UNBOUND) unbound_error(sym); return v; } return sl_toplevel_eval(sym); case TOK_LABEL: // create backreference label if(ptrhash_has(&sl.readstate->backrefs, (void*)ctx->tokval)) parse_error(&ctx->loc, "label %"PRIdPTR" redefined", (intptr)numval(ctx->tokval)); oldtokval = ctx->tokval; v = do_read_sexpr(ctx, ctx->tokval); ptrhash_put(&sl.readstate->backrefs, (void*)oldtokval, (void*)v); return v; case TOK_BACKREF: // look up backreference v = (sl_v)ptrhash_get(&sl.readstate->backrefs, (void*)ctx->tokval); if(v == (sl_v)HT_NOTFOUND) parse_error(&ctx->loc, "undefined label %"PRIdPTR, (intptr)numval(ctx->tokval)); return v; case TOK_GENSYM: pv = (sl_v*)ptrhash_bp(&sl.readstate->gensyms, (void*)ctx->tokval); if(*pv == (sl_v)HT_NOTFOUND) *pv = mk_gensym(); return *pv; case TOK_DOUBLEQUOTE: return read_str(ctx); case TOK_VERBATIM: return read_verbatim(ctx); case TOK_CLOSE: parse_error(&ctx->loc, "unexpected ')'"); case TOK_CLOSEB: parse_error(&ctx->loc, "unexpected ']'"); case TOK_CLOSEC: parse_error(&ctx->loc, "unexpected '}'"); case TOK_DOT: parse_error(&ctx->loc, "unexpected '.'"); } return sl_void; } sl_v sl_read_sexpr(sl_v f, bool ws) { sl_readstate state; state.prev = sl.readstate; htable_new(&state.backrefs, 8); htable_new(&state.gensyms, 8); state.source = f; sl.readstate = &state; Rctx ctx; ctx.toktype = TOK_NONE; ctx.ws = ws; sl_gc_handle(&ctx.tokval); sl_v v = do_read_sexpr(&ctx, UNBOUND); sl_free_gc_handles(1); sl.readstate = state.prev; free_readstate(&state); return v; }