shithub: femtolisp

ref: a4f25a806f45195df93398688e7e2683c2b73890
dir: /read.c/

View raw version
#include "flisp.h"
#include "cvalues.h"
#include "read.h"

enum {
	TOK_NONE, TOK_OPEN, TOK_CLOSE, TOK_DOT, TOK_QUOTE, TOK_SYM, TOK_NUM,
	TOK_BQ, TOK_COMMA, TOK_COMMAAT, TOK_COMMADOT,
	TOK_SHARPDOT, TOK_LABEL, TOK_BACKREF, TOK_SHARPQUOTE, TOK_SHARPOPEN,
	TOK_OPENB, TOK_CLOSEB, TOK_SHARPSYM, TOK_GENSYM, TOK_DOUBLEQUOTE,
	TOK_OPENC, TOK_CLOSEC,
};

typedef struct Rctx Rctx;

struct Rctx {
	uint32_t toktype;
	value_t tokval;
	char buf[1024];
};

static value_t do_read_sexpr(Rctx *ctx, value_t label);

#define RS value2c(ios_t*, FL(readstate)->source)

bool
fl_read_numtok(const char *tok, value_t *pval, int base)
{
	char *end;
	int64_t i64;
	double d;
	if(*tok == '\0')
		return false;
	if(!((tok[0] == '0' && tok[1] == 'x') || (base >= 15)) && strpbrk(tok, ".eEpP")){
		d = strtod(tok, &end);
		if(*end == '\0'){
			if(pval)
				*pval = mk_double(d);
			return true;
		}
		// floats can end in f or f0
		if(end > tok && end[0] == 'f' &&
			(end[1] == '\0' ||
			 (end[1] == '0' && end[2] == '\0'))){
			if(pval)
				*pval = mk_float((float)d);
			return true;
		}
	}

	if(tok[0] == '+'){
		if(!strcmp(tok, "+NaN") || !strcasecmp(tok, "+nan.0")){
			if(pval)
				*pval = mk_double(D_PNAN);
			return true;
		}
		if(!strcmp(tok, "+Inf") || !strcasecmp(tok, "+inf.0")){
			if(pval)
				*pval = mk_double(D_PINF);
			return true;
		}
	}else if(tok[0] == '-'){
		if(!strcmp(tok, "-NaN") || !strcasecmp(tok, "-nan.0")){
			if(pval)
				*pval = mk_double(D_NNAN);
			return true;
		}
		if(!strcmp(tok, "-Inf") || !strcasecmp(tok, "-inf.0")){
			if(pval)
				*pval = mk_double(D_NINF);
			return true;
		}
	}
	i64 = strtoll(tok, &end, base);
	if(*end != '\0')
		return false;
	if(pval != nil)
		*pval = fits_fixnum(i64) ? fixnum(i64) : mk_mpint(strtomp(tok, &end, base, nil));
	return true;
}

static char
nextchar(void)
{
	int ch;
	char c;
	ios_t *f = RS;

	do{
		ch = ios_getc(RS);
		if(ch == IOS_EOF)
			return 0;
		c = (char)ch;
		if(c == ';'){
			// single-line comment
			do{
				ch = ios_getc(f);
				if(ch == IOS_EOF)
					return 0;
			}while((char)ch != '\n');
			c = (char)ch;
		}
	}while(c == ' ' || isspace(c));
	return c;
}

static void
take(Rctx *ctx)
{
	ctx->toktype = TOK_NONE;
}

static _Noreturn void
parse_error(const char *format, ...)
{
	char msgbuf[512];
	va_list args;
	int n;

	n = snprintf(msgbuf, sizeof(msgbuf), "%s:%"PRIu64":%"PRIu64": ",
		RS->filename, (uint64_t)RS->lineno, (uint64_t)RS->colno);
	if(n >= (int)sizeof(msgbuf))
		n = 0;
	va_start(args, format);
	vsnprintf(msgbuf+n, sizeof(msgbuf)-n, format, args);
	value_t msg = string_from_cstr(msgbuf);
	va_end(args);

	fl_raise(fl_list2(FL(ParseError), msg));
}

static void
accumchar(Rctx *ctx, char c, int *pi)
{
	ctx->buf[(*pi)++] = c;
	if(*pi >= (int)(sizeof(ctx->buf)-1))
		parse_error("token too long");
}

// return: 1 if escaped (forced to be symbol)
static bool
read_token(Rctx *ctx, char c, bool digits)
{
	int i = 0, ch, nc = 0;
	bool escaped = false, issym = false;

	while(1){
		if(nc != 0){
			if(nc != 1)
				ios_getc(RS);
			ch = ios_peekc(RS);
			if(ch == IOS_EOF)
				goto terminate;
			c = (char)ch;
		}
		if(c == '|'){
			issym = true;
			escaped = !escaped;
		}else if(c == '\\'){
			issym = true;
			ios_getc(RS);
			ch = ios_peekc(RS);
			if(ch == IOS_EOF)
				goto terminate;
			accumchar(ctx, (char)ch, &i);
		}else if(!escaped && !(symchar(c) && (!digits || isdigit(c)))){
			break;
		}else{
			accumchar(ctx, c, &i);
		}
		nc++;
	}
	if(nc == 0)
		ios_skip(RS, -1);
terminate:
	ctx->buf[i++] = '\0';
	return issym;
}

static int
isdigit_base(char c, int base)
{
	if(base < 11)
		return c >= '0' && c < '0'+base;
	return (c >= '0' && c <= '9') || (c >= 'a' && c < 'a'+base-10) || (c >= 'A' && c < 'A'+base-10);
}

static uint32_t
peek(Rctx *ctx)
{
	char c, *end;
	fixnum_t x;
	int ch, base;

	if(ctx->toktype != TOK_NONE)
		return ctx->toktype;
	c = nextchar();
	if(ios_eof(RS))
		return TOK_NONE;
	if(c == '(')
		ctx->toktype = TOK_OPEN;
	else if(c == ')')
		ctx->toktype = TOK_CLOSE;
	else if(c == '[')
		ctx->toktype = TOK_OPENB;
	else if(c == ']')
		ctx->toktype = TOK_CLOSEB;
	else if(c == '{')
		ctx->toktype = TOK_OPENC;
	else if(c == '}')
		ctx->toktype = TOK_CLOSEC;
	else if(c == '\'')
		ctx->toktype = TOK_QUOTE;
	else if(c == '`')
		ctx->toktype = TOK_BQ;
	else if(c == '"')
		ctx->toktype = TOK_DOUBLEQUOTE;
	else if(c == '#'){
		ch = ios_getc(RS); c = (char)ch;
		if(ch == IOS_EOF)
			parse_error("invalid read macro");
		if(c == '.')
			ctx->toktype = TOK_SHARPDOT;
		else if(c == '\'')
			ctx->toktype = TOK_SHARPQUOTE;
		else if(c == '\\'){
			Rune cval;
			if(ios_getutf8(RS, &cval) == IOS_EOF)
				parse_error("end of input in character constant");
			if(cval == 'u' || cval == 'U' || cval == 'x'){
				read_token(ctx, 'u', 0);
				if(ctx->buf[1] != '\0'){ // not a solitary 'u','U','x'
					if(!fl_read_numtok(&ctx->buf[1], &ctx->tokval, 16))
						parse_error("invalid hex character constant");
					cval = numval(ctx->tokval);
				}
			}else if(cval >= 'a' && cval <= 'z'){
				read_token(ctx, (char)cval, 0);
				ctx->tokval = symbol(ctx->buf, true);
				if(ctx->buf[1] == '\0') USED(cval); /* one character */
				else if(ctx->tokval == FL(nulsym))       cval = 0x00;
				else if(ctx->tokval == FL(alarmsym))     cval = 0x07;
				else if(ctx->tokval == FL(backspacesym)) cval = 0x08;
				else if(ctx->tokval == FL(tabsym))       cval = 0x09;
				else if(ctx->tokval == FL(linefeedsym))  cval = 0x0A;
				else if(ctx->tokval == FL(newlinesym))   cval = 0x0A;
				else if(ctx->tokval == FL(vtabsym))      cval = 0x0B;
				else if(ctx->tokval == FL(pagesym))      cval = 0x0C;
				else if(ctx->tokval == FL(returnsym))    cval = 0x0D;
				else if(ctx->tokval == FL(escsym))       cval = 0x1B;
				else if(ctx->tokval == FL(spacesym))     cval = 0x20;
				else if(ctx->tokval == FL(deletesym))    cval = 0x7F;
				else
					parse_error("unknown character #\\%s", ctx->buf);
			}
			ctx->toktype = TOK_NUM;
			ctx->tokval = mk_rune(cval);
		}else if(c == '('){
			ctx->toktype = TOK_SHARPOPEN;
		}else if(c == '<'){
			parse_error("unreadable object");
		}else if(isdigit(c)){
			read_token(ctx, c, 1);
			c = (char)ios_getc(RS);
			if(c == '#')
				ctx->toktype = TOK_BACKREF;
			else if(c == '=')
				ctx->toktype = TOK_LABEL;
			else
				parse_error("invalid label");
			x = strtoll(ctx->buf, &end, 10);
			if(*end != '\0')
				parse_error("invalid label");
			ctx->tokval = fixnum(x);
		}else if(c == '!'){
			// #! single line comment for shbang script support
			do{
				ch = ios_getc(RS);
			}while(ch != IOS_EOF && (char)ch != '\n');
			return peek(ctx);
		}else if(c == '|'){
			// multiline comment
			int commentlevel = 1;
			while(1){
				ch = ios_getc(RS);
			hashpipe_gotc:
				if(ch == IOS_EOF)
					parse_error("eof within comment");
				if((char)ch == '|'){
					ch = ios_getc(RS);
					if((char)ch == '#'){
						commentlevel--;
						if(commentlevel == 0)
							break;
						else
							continue;
					}
					goto hashpipe_gotc;
				}else if((char)ch == '#'){
					ch = ios_getc(RS);
					if((char)ch == '|')
						commentlevel++;
					else
						goto hashpipe_gotc;
				}
			}
			// this was whitespace, so keep peeking
			return peek(ctx);
		}else if(c == ';'){
			// datum comment
			(void)do_read_sexpr(ctx, UNBOUND); // skip
			return peek(ctx);
		}else if(c == ':'){
			// gensym
			ch = ios_getc(RS);
			if((char)ch == 'g')
				ch = ios_getc(RS);
			read_token(ctx, (char)ch, 0);
			x = strtol(ctx->buf, &end, 10);
			if(*end != '\0' || ctx->buf[0] == '\0')
				parse_error("invalid gensym label");
			ctx->toktype = TOK_GENSYM;
			ctx->tokval = fixnum(x);
		}else if(symchar(c)){
			read_token(ctx, ch, 0);

			if(((c == 'b' && (base = 2)) ||
			    (c == 'o' && (base = 8)) ||
			    (c == 'd' && (base = 10)) ||
			    (c == 'x' && (base = 16))) && (isdigit_base(ctx->buf[1], base) || ctx->buf[1] == '-')){
				if(!fl_read_numtok(&ctx->buf[1], &ctx->tokval, base))
					parse_error("invalid base %d constant", base);
				return (ctx->toktype = TOK_NUM);
			}

			ctx->toktype = TOK_SHARPSYM;
			ctx->tokval = symbol(ctx->buf, true);
		}else{
			parse_error("unknown read macro");
		}
	}else if(c == ','){
		ctx->toktype = TOK_COMMA;
		ch = ios_peekc(RS);
		if(ch == IOS_EOF)
			return ctx->toktype;
		if((char)ch == '@')
			ctx->toktype = TOK_COMMAAT;
		else if((char)ch == '.')
			ctx->toktype = TOK_COMMADOT;
		else
			return ctx->toktype;
		ios_getc(RS);
	}else{
		if(!read_token(ctx, c, 0)){
			if(ctx->buf[0] == '.' && ctx->buf[1] == '\0')
				return (ctx->toktype = TOK_DOT);
			if(fl_read_numtok(ctx->buf, &ctx->tokval, 0))
				return (ctx->toktype = TOK_NUM);
		}
		ctx->toktype = TOK_SYM;
		char *name = (strcmp(ctx->buf, "lambda") == 0 || strcmp(ctx->buf, "λ") == 0) ? "λ" : ctx->buf;
		ctx->tokval = symbol(name, name == ctx->buf);
	}
	return ctx->toktype;
}

// NOTE: this is NOT an efficient operation. it is only used by the
// reader, and requires at least 1 and up to 3 garbage collections!
static value_t
vector_grow(value_t v, bool rewrite_refs)
{
	size_t i, s = vector_size(v);
	size_t d = vector_grow_amt(s);
	PUSH(v);
	assert(s+d > s);
	value_t newv = alloc_vector(s+d, 1);
	v = FL(stack)[FL(sp)-1];
	for(i = 0; i < s; i++)
		vector_elt(newv, i) = vector_elt(v, i);
	// use gc to rewrite references from the old vector to the new
	FL(stack)[FL(sp)-1] = newv;
	if(s > 0 && rewrite_refs){
		((size_t*)ptr(v))[0] |= 0x1;
		vector_elt(v, 0) = newv;
		gc(0);
	}
	return POP();
}

static value_t
read_vector(Rctx *ctx, value_t label, uint32_t closer)
{
	value_t v = FL(the_empty_vector), elt;
	uint32_t i = 0;
	PUSH(v);
	if(label != UNBOUND)
		ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)v);
	while(peek(ctx) != closer){
		if(ios_eof(RS))
			parse_error("unexpected end of input");
		v = FL(stack)[FL(sp)-1]; // reload after possible alloc in peek()
		if(i >= vector_size(v)){
			v = FL(stack)[FL(sp)-1] = vector_grow(v, label != UNBOUND);
			if(label != UNBOUND)
				ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)v);
		}
		elt = do_read_sexpr(ctx, UNBOUND);
		v = FL(stack)[FL(sp)-1];
		assert(i < vector_size(v));
		vector_elt(v, i) = elt;
		i++;
	}
	take(ctx);
	if(i > 0)
		vector_setsize(v, i);
	return POP();
}

static value_t
read_string(Rctx *ctx)
{
	char *buf, *temp;
	char eseq[10];
	size_t i = 0, j, sz, ndig;
	int c;
	value_t s;
	Rune r = 0;

	sz = sizeof(ctx->buf);
	buf = ctx->buf;
	while(1){
		if(i >= sz-UTFmax){ // -UTFmax: leaves room for longest utf8 sequence
			sz *= 2;
			if(buf == ctx->buf){
				if((temp = MEM_ALLOC(sz)) != nil)
					memcpy(temp, ctx->buf, i);
			}else
				temp = MEM_REALLOC(buf, sz);
			if(temp == nil){
				if(buf == ctx->buf)
					MEM_FREE(buf);
				parse_error("out of memory reading string");
			}
			buf = temp;
		}
		c = ios_getc(RS);
		if(c == IOS_EOF){
			if(buf != ctx->buf)
				MEM_FREE(buf);
			parse_error("unexpected end of input in string");
		}
		if(c == '"')
			break;
		else if(c == '\\'){
			c = ios_getc(RS);
			if(c == IOS_EOF){
				if(buf != ctx->buf)
					MEM_FREE(buf);
				parse_error("end of input in escape sequence");
			}
			j = 0;
			if(octal_digit(c)){
				while(1){
					eseq[j++] = c;
					c = ios_peekc(RS);
					if(c == IOS_EOF || !octal_digit(c) || j >= 3)
						break;
					ios_getc(RS);
				}
				eseq[j] = '\0';
				r = strtol(eseq, nil, 8);
				// \DDD and \xXX read bytes, not characters
				buf[i++] = (char)r;
			}else if((c == 'x' && (ndig = 2)) || (c == 'u' && (ndig = 4)) || (c == 'U' && (ndig = 8))){
				while(1){
					c = ios_peekc(RS);
					if(c == IOS_EOF || !hex_digit(c) || j >= ndig)
						break;
					eseq[j++] = c;
					ios_getc(RS);
				}
				eseq[j] = '\0';
				if(j)
					r = strtol(eseq, nil, 16);
				if(!j || r > Runemax){
					if(buf != ctx->buf)
						MEM_FREE(buf);
					parse_error("invalid escape sequence");
				}
				if(ndig == 2)
					buf[i++] = (char)r;
				else
					i += runetochar(&buf[i], &r);
			}else if(c == '\n'){
				/* do nothing */
			}else{
				char esc = read_escape_control_char((char)c);
				if(esc == (char)c && !strchr("\\'\"`", esc)){
					if(buf != ctx->buf)
						MEM_FREE(buf);
					parse_error("invalid escape sequence: \\%c", (char)c);
				}
				buf[i++] = esc;
			}
		}else{
			buf[i++] = c;
		}
	}
	s = cvalue_string(i);
	memcpy(cvalue_data(s), buf, i);
	if(buf != ctx->buf)
		MEM_FREE(buf);
	return s;
}

// build a list of conses. this is complicated by the fact that all conses
// can move whenever a new cons is allocated. we have to refer to every cons
// through a handle to a relocatable pointer (i.e. a pointer on the stack).
static void
read_list(Rctx *ctx, value_t *pval, value_t label, uint32_t closer)
{
	value_t c, *pc;
	uint32_t t;
	uint64_t lineno0, colno0;

	lineno0 = RS->lineno;
	colno0 = RS->colno - 1;
	PUSH(FL(Nil));
	pc = &FL(stack)[FL(sp)-1];  // to keep track of current cons cell
	t = peek(ctx);
	while(t != closer){
		if(ios_eof(RS))
			parse_error("unexpected end of input: %"PRIu64":%"PRIu64" not closed", lineno0, colno0);
		c = mk_cons(); car_(c) = cdr_(c) = FL(Nil);
		if(iscons(*pc))
			cdr_(*pc) = c;
		else{
			*pval = c;
			if(label != UNBOUND)
				ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)c);
		}
		*pc = c;
		c = do_read_sexpr(ctx, UNBOUND); // must be on separate lines due to
		car_(*pc) = c;			  // undefined evaluation order

		t = peek(ctx);
		if(t == TOK_DOT){
			take(ctx);
			c = do_read_sexpr(ctx, UNBOUND);
			cdr_(*pc) = c;
			t = peek(ctx);
			if(ios_eof(RS))
				parse_error("unexpected end of input");
			if(t != closer){
				take(ctx);
				parse_error(
					"expected '%c'",
					closer == TOK_CLOSEB ? ']' : (closer == TOK_CLOSEC ? '}' : ')')
				);
			}
		}
	}
	take(ctx);
	c = POP();
	USED(c);
}

// label is the backreference we'd like to fix up with this read
static value_t
do_read_sexpr(Rctx *ctx, value_t label)
{
	value_t v, sym, oldtokval, *head;
	value_t *pv;
	uint32_t t;
	char c;

	t = peek(ctx);
	take(ctx);
	switch(t){
	case TOK_OPEN:
		PUSH(FL(Nil));
		read_list(ctx, &FL(stack)[FL(sp)-1], label, TOK_CLOSE);
		return POP();
	case TOK_SYM:
	case TOK_NUM:
		return ctx->tokval;
	case TOK_OPENB:
		PUSH(FL(Nil));
		read_list(ctx, &FL(stack)[FL(sp)-1], label, TOK_CLOSEB);
		return POP();
	case TOK_OPENC:
		PUSH(FL(Nil));
		read_list(ctx, &FL(stack)[FL(sp)-1], label, TOK_CLOSEC);
		return POP();
	case TOK_CLOSE:
		parse_error("unexpected ')'");
	case TOK_CLOSEB:
		parse_error("unexpected ']'");
	case TOK_CLOSEC:
		parse_error("unexpected '}'");
	case TOK_DOT:
		parse_error("unexpected '.'");
	case TOK_COMMA:
		head = &FL(comma); goto listwith;
	case TOK_COMMAAT:
		head = &FL(commaat); goto listwith;
	case TOK_COMMADOT:
		head = &FL(commadot); goto listwith;
	case TOK_BQ:
		head = &FL(backquote); goto listwith;
	case TOK_QUOTE:
		head = &FL(quote);
	listwith:
		v = cons_reserve(2);
		car_(v) = *head;
		cdr_(v) = tagptr(((cons_t*)ptr(v))+1, TAG_CONS);
		car_(cdr_(v)) = cdr_(cdr_(v)) = FL(Nil);
		PUSH(v);
		if(label != UNBOUND)
			ptrhash_put(&FL(readstate)->backrefs, (void*)label, (void*)v);
		v = do_read_sexpr(ctx, UNBOUND);
		car_(cdr_(FL(stack)[FL(sp)-1])) = v;
		return POP();
	case TOK_SHARPQUOTE:
		// femtoLisp doesn't need symbol-function, so #' does nothing
		return do_read_sexpr(ctx, label);
	case TOK_SHARPSYM:
		sym = ctx->tokval;
		if(sym == FL(tsym) || sym == FL(Tsym))
			return FL(t);
		if(sym == FL(fsym) || sym == FL(Fsym))
			return FL(f);
		// constructor notation
		c = nextchar();
		if(c != '('){
			take(ctx);
			parse_error("expected argument list for %s", symbol_name(ctx->tokval));
		}
		PUSH(FL(Nil));
		read_list(ctx, &FL(stack)[FL(sp)-1], UNBOUND, TOK_CLOSE);
		if(sym == FL(vu8sym)){
			sym = FL(arraysym);
			FL(stack)[FL(sp)-1] = fl_cons(FL(uint8sym), FL(stack)[FL(sp)-1]);
		}else if(sym == FL(fnsym)){
			sym = FL(function);
		}
		v = symbol_value(sym);
		if(v == UNBOUND)
			unbound_error(sym);
		return fl_apply(v, POP());
	case TOK_SHARPOPEN:
		return read_vector(ctx, label, TOK_CLOSE);
	case TOK_SHARPDOT:
		// eval-when-read
		// evaluated expressions can refer to existing backreferences, but they
		// cannot see pending labels. in other words:
		// (... #2=#.#0# ... )	OK
		// (... #2=#.(#2#) ... )  DO NOT WANT
		sym = do_read_sexpr(ctx, UNBOUND);
		if(issymbol(sym)){
			v = symbol_value(sym);
			if(v == UNBOUND)
				unbound_error(sym);
			return v;
		}
		return fl_toplevel_eval(sym);
	case TOK_LABEL:
		// create backreference label
		if(ptrhash_has(&FL(readstate)->backrefs, (void*)ctx->tokval))
			parse_error("label %"PRIdPTR" redefined", numval(ctx->tokval));
		oldtokval = ctx->tokval;
		v = do_read_sexpr(ctx, ctx->tokval);
		ptrhash_put(&FL(readstate)->backrefs, (void*)oldtokval, (void*)v);
		return v;
	case TOK_BACKREF:
		// look up backreference
		v = (value_t)ptrhash_get(&FL(readstate)->backrefs, (void*)ctx->tokval);
		if(v == (value_t)HT_NOTFOUND)
			parse_error("undefined label %"PRIdPTR, numval(ctx->tokval));
		return v;
	case TOK_GENSYM:
		pv = (value_t*)ptrhash_bp(&FL(readstate)->gensyms, (void*)ctx->tokval);
		if(*pv == (value_t)HT_NOTFOUND)
			*pv = gensym();
		return *pv;
	case TOK_DOUBLEQUOTE:
		return read_string(ctx);
	}
	return FL(unspecified);
}

value_t
fl_read_sexpr(value_t f)
{
	fl_readstate_t state;
	state.prev = FL(readstate);
	htable_new(&state.backrefs, 8);
	htable_new(&state.gensyms, 8);
	state.source = f;
	FL(readstate) = &state;
	Rctx ctx;
	ctx.toktype = TOK_NONE;
	fl_gc_handle(&ctx.tokval);

	value_t v = do_read_sexpr(&ctx, UNBOUND);

	fl_free_gc_handles(1);
	FL(readstate) = state.prev;
	free_readstate(&state);
	return v;
}