shithub: pprolog

ref: 8dd4f85c85aa9ab7a5f1219efd694e6707f2718c
dir: /parser.c/

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>

#include "dat.h"
#include "fns.h"

#define PrecedenceLevels 1200

typedef struct Token Token;
typedef struct Operator Operator;
typedef struct OpInfo OpInfo;
struct Token
{
	int	tag;
	Rune	*text;
	double	dval;
	vlong	ival;
};

struct Operator
{
	int type;
	int level;
	Rune *spelling;
	Operator *next;
};

struct OpInfo
{
	int level;
	int type;
};

enum {
	Xf	= 1<<0, /* 1 */
	Yf	= 1<<1, /* 2 */
	Xfx	= 1<<2, /* 4 */
	Xfy	= 1<<3, /* 8 */
	Yfx	= 1<<4, /* 16 */
	Fy	= 1<<5, /* 32 */
	Fx	= 1<<6, /* 64 */
};

enum {
	AtomTok			= 1<<0, /* 1 */
	FunctorTok		= 1<<1, /* 2 */
	VarTok			= 1<<2, /* 4 */
	StringTok		= 1<<3, /* 8 */
	IntTok			= 1<<4, /* 16 */
	FloatTok		= 1<<5, /* 32 */
	ParenLeftTok		= 1<<6, /* 64 */
	ParenRightTok		= 1<<7, /* 128 */
	SquareBracketLeftTok	= 1<<8, /* 256 */
	SquareBracketRightTok	= 1<<9, /* 512 */
	CurlyBracketLeftTok	= 1<<10, /* 1024 */
	CurlyBracketRightTok	= 1<<11, /* 2048 */
	CommaTok		= 1<<12, /* 4096 */
	PipeTok			= 1<<13, /* 8192 */
	EofTok			= 1<<14, /* 16384 */
};

static Biobuf *parsein;
static Token lookahead;
static Operator *operators[PrecedenceLevels];

void initoperators(void);
void addoperator(int, int, Rune *);
Operator *getoperator(Rune *);
void nexttoken(void);
Term *fullterm(int, Rune *, Term *);
Term *term(void);
Term *listterm(int);
Term *curlybracketterm(void);
Term *compound(void);
Term *parseoperators(Term *);
void match(int);
void syntaxerror_parser(char *);
Term *prologtext(int);

Term *
parse(int fd, int querymode)
{
	fd = dup(fd, -1);
	parsein = Bfdopen(fd, OREAD);
	if(parsein == nil){
		print("Could not open file\n");
		return nil;
	}
	initgoals = nil;
	initoperators();
	nexttoken();

	Term *result = prologtext(querymode);
	if(querymode){
		uvlong id = 1;
		result = copyterm(result, &id);
	}
	Bterm(parsein);
	return result;
}

Term *
prologtext(int querymode)
{
	if(lookahead.tag == EofTok)
		return nil;

	Term *t = fullterm(AtomTok, L".", nil);
	if(lookahead.tag == AtomTok && runestrcmp(lookahead.text, L".") == 0){
		if(!querymode)
			match(AtomTok);
	}else
		syntaxerror_parser("prologtext");

	if(querymode)
		return t;
	
	if(t->tag == CompoundTerm && runestrcmp(t->text, L":-") == 0 && t->arity == 1){
		Term *body = t->children;
		print("Got directive: %S\n", prettyprint(body));
		if(body->tag == CompoundTerm && body->arity == 1 && runestrcmp(body->text, L"initialization") == 0){
			Term *tmp = initgoals;
			initgoals = body->children;
			initgoals->next = tmp;
		}
		t = prologtext(querymode);
	}else if(t->tag == CompoundTerm && runestrcmp(t->text, L":-") == 0 && t->arity == 2){
		t->next = prologtext(querymode);
	}else if(t->tag == AtomTerm || t->tag == CompoundTerm){
		t->next = prologtext(querymode);
	}else{
		print("Expected directive or clause as toplevel\n");
		syntaxerror_parser("prologtext");
	}

	return t;
}

Term *
fullterm(int stoptag, Rune *stoptext, Term *list)
{
	if((lookahead.tag & stoptag) && (stoptext == nil || runestrcmp(stoptext, lookahead.text) == 0))
		return parseoperators(list);

	Term *t = term();
	list = appendterm(list, t);
	return fullterm(stoptag, stoptext, list);
}

Term *
term(void)
{
	Term *result;

	switch(lookahead.tag){
	case AtomTok:
		result = mkatom(lookahead.text);
		match(AtomTok);
		break;
	case FunctorTok:
		result = compound();
		break;
	case VarTok:
		result = mkvariable(lookahead.text);
		match(VarTok);
		break;
	case IntTok:
		result = mknumber(NumberInt, lookahead.ival, 0);
		match(IntTok);
		break;
	case FloatTok:
		result = mknumber(NumberFloat, 0, lookahead.dval);
		match(FloatTok);
		break;
	case CommaTok:
		result = mkatom(L",");
		match(CommaTok);
		break;
	case SquareBracketLeftTok:
		result = listterm(1);
		break;
	case CurlyBracketLeftTok:
		result = curlybracketterm();
		break;
	case ParenLeftTok:
		match(ParenLeftTok);
		result = fullterm(ParenRightTok, nil, nil);
		match(ParenRightTok);
		break;
	case StringTok:
		result = mkstring(lookahead.text);
		match(StringTok);
		break;
	default:
		print("Can't parse term of token type %d\n", lookahead.tag);
		syntaxerror_parser("term");
		result = nil;
	}

	return result;
}

Term *
listterm(int start)
{
	if(start)
		match(SquareBracketLeftTok);

	if(lookahead.tag == SquareBracketRightTok){
		match(SquareBracketRightTok);
		return mkatom(L"[]");
	}else if(lookahead.tag == PipeTok){
		match(PipeTok);
		Term *t = fullterm(SquareBracketRightTok, nil, nil);
		match(SquareBracketRightTok);
		return t;
	}else{
		Term *t = fullterm(CommaTok|PipeTok|SquareBracketRightTok, nil, nil);
		if(lookahead.tag == CommaTok)
			match(CommaTok);
		t->next = listterm(0);
		return mkcompound(L".", 2, t);
	}
}

Term *
curlybracketterm(void)
{
	match(CurlyBracketLeftTok);
	Term *result = fullterm(CurlyBracketRightTok, nil, nil);
	match(CurlyBracketRightTok);
	return mkcompound(L"{}", 1, result);
}

Term *
compound(void)
{
	Rune *name = lookahead.text;
	Term *args = nil;
	int arity = 0;

	match(FunctorTok);
	match(ParenLeftTok);

	while(lookahead.tag != ParenRightTok){
		Term *t = fullterm(CommaTok|ParenRightTok, nil, nil);
		if(lookahead.tag == CommaTok)
			match(CommaTok);
		args = appendterm(args, t);
		arity++;
	}

	match(ParenRightTok);
	return mkcompound(name, arity, args);
}

Term *
parseoperators(Term *list)
{
	int i;
	int length = termslength(list);
	Term *t;
	Term **terms = malloc(sizeof(Term *) * length);
	OpInfo *infos = malloc(sizeof(OpInfo) * length);

	for(i = 0, t = list; i < length; i++){
		Operator *op = getoperator(t->text);
		if(op && t->tag == AtomTerm){
			infos[i].type = op->type;
			infos[i].level = op->level;
		}else{
			infos[i].type = 0;
			infos[i].level = 0;
		}
		terms[i] = t;
		Term *tmp = t;
		t = t->next;
		tmp->next = nil;
	}

	while(length > 1){
		int index = -1;
		int lowest = PrecedenceLevels + 1;
		for(i = 0; i < length; i++){
			if(infos[i].type == 0)
				continue;
			if(infos[i].level == lowest && infos[i].type == Xfy)
				index = i;
			else if(infos[i].level < lowest){
				index = i;
				lowest = infos[i].level;
			}
		}

		if(index == -1){
			print("Can't parse, list of length %d contains no operators: ", length);
			for(i = 0; i < length; i++)
				print("%S(%d) ", prettyprint(terms[i]), infos[i].level);
			print("\n");
			syntaxerror_parser("parseoperators");
		}

		int infixlevel = infos[index].type & (Xfx|Xfy|Yfx);
		int prefixlevel = infos[index].type & (Fx|Fy);
		int postfixlevel = infos[index].type & (Xf|Yf);

		if(infixlevel && index != 0 && index != length-1 && infos[index-1].type == 0 && infos[index-1].type == 0){
			infos[index-1].type = 0;
			infos[index-1].level = infos[index].level;
			Term *args = terms[index-1];
			args->next = terms[index+1];
			terms[index-1] = mkcompound(terms[index]->text, 2, args);
			length -= 2;
			for(i = index; i < length; i++){
				infos[i].level = infos[i+2].level;
				infos[i].type = infos[i+2].type;
				terms[i] = terms[i+2];
			}
		}else if(prefixlevel && index != length-1 && infos[index+1].type == 0){
			infos[index].type = 0;
			terms[index] = mkcompound(terms[index]->text, 1, terms[index+1]);
			length--;
			for(i = index+1; i < length; i++){
				infos[i].level = infos[i+1].level;
				infos[i].type = infos[i+1].type;
				terms[i] = terms[i+1];
			}
		}else if(postfixlevel && index != 0 && infos[index-1].type == 0){
			infos[index-1].type = 0;
			infos[index-1].level = infos[index].level;
			terms[index-1] = mkcompound(terms[index]->text, 1, terms[index-1]);
			length--;
			for(i = index; i < length; i++){
				infos[i].level = infos[i+1].level;
				infos[i].type = infos[i+1].type;
				terms[i] = terms[i+1];
			}
		}else{
			print("Parse error when parsing operator %S (prefix=%d, postfix=%d, infix=%d level=%d)\n", prettyprint(terms[index]), prefixlevel, postfixlevel, infixlevel, infos[index].level);
			syntaxerror_parser("parseoperators");
		}
	}

	Term *result = terms[0];
	free(infos);
	free(terms);
	return result;
}

void
initoperators(void)
{
	Operator *op;
	int i;
	for(i = 0; i < PrecedenceLevels; i++){
		while(operators[i]){
			op = operators[i];
			operators[i] = op->next;
			free(op);
		}
	}

	addoperator(1200, Xfx, L":-");
	addoperator(1200, Xfx, L"-->");
	addoperator(1200, Fx,  L":-");
	addoperator(1200, Fx,  L"?-");
	addoperator(1100, Xfy, L";");
	addoperator(1050, Xfy, L"->");
	addoperator(1000, Xfy, L",");
	addoperator(900,  Fy,  L"\\+");
	addoperator(700,  Xfx, L"=");
	addoperator(700,  Xfx, L"\\=");
	addoperator(700,  Xfx, L"==");
	addoperator(700,  Xfx, L"\\==");
	addoperator(700,  Xfx, L"@<");
	addoperator(700,  Xfx, L"@=<");
	addoperator(700,  Xfx, L"@>");
	addoperator(700,  Xfx, L"@>=");
	addoperator(700,  Xfx, L"is");
	addoperator(700,  Xfx, L"=:=");
	addoperator(700,  Xfx, L"=\=");
	addoperator(700,  Xfx, L"<");
	addoperator(700,  Xfx, L"=<");
	addoperator(700,  Xfx, L">");
	addoperator(700,  Xfx, L">=");
	addoperator(700,  Xfx, L"=..");
	addoperator(500,  Yfx, L"+");
	addoperator(400,  Yfx, L"*");
	addoperator(400,  Yfx, L"/");
}

void
addoperator(int level, int type, Rune *spelling)
{
	Operator *op = malloc(sizeof(Operator));
	op->type = type;
	op->level = level;
	op->spelling = spelling;
	op->next = operators[level-1];
	operators[level-1] = op;
}

Operator *
getoperator(Rune *spelling)
{
	Operator *op = nil;
	int level;

	if(spelling == nil)
		return nil;

	for(level = 0; level < PrecedenceLevels && op == nil; level++){
		Operator *tmp;
		for(tmp = operators[level]; tmp != nil; tmp = tmp->next){
			if(runestrcmp(tmp->spelling, spelling) == 0){
				if(op == nil){
					op = malloc(sizeof(Operator));
					memcpy(op, tmp, sizeof(Operator));
				}else
					op->type |= tmp->type;
			}
		}
	}

	return op;
}

void
nexttoken(void)
{
	Rune buf[1024];
	Rune peek;
	int i = 0;
	double numD;
	vlong numI = 0;
	int negative = 0;
	static long replaypeek = -1;

	if(replaypeek == -1)
		peek = Bgetrune(parsein);
	else{
		peek = replaypeek;
		replaypeek = -1;
	}

SkipWhite:
	/* Skip whitespace */
	while(isspacerune(peek))
		peek = Bgetrune(parsein);

	/* Skip single line comments */
	if(peek == L'%'){
		while(peek != L'\n')
			peek = Bgetrune(parsein);
		goto SkipWhite;
	}

	/* Variables */
	if(peek == L'_' || isupperrune(peek)){
		while(peek == L'_' || isalpharune(peek) || isdigitrune(peek)){
			/* FIXME Should handle input length, also elsewhere in this function */
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		Bungetrune(parsein);

		lookahead.tag = VarTok;
		lookahead.text = runestrdup(buf);
		return;
	}

	/* Strings */
	if(peek == L'"'){
		peek = Bgetrune(parsein);
		while(peek != L'"'){
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		lookahead.tag = StringTok;
		lookahead.text = runestrdup(buf);
		return;
	}

	/* Numbers */
	if(peek == L'-'){
		peek = Bgetrune(parsein);
		if(isdigitrune(peek))
			negative = 1;
		else{
			Bungetrune(parsein);
			peek = L'-';
		}
	}
	if(isdigitrune(peek)){
		if(peek == L'0'){
			peek = Bgetrune(parsein);
			switch(peek){
			case L'o': /* Octal */
			case L'x': /* Hexadecimal */
			case L'b': /* Binary */
				print("Cant lex numbers in other bases than base 10 right now\n");
				exits("lexer");
			}
		}
		while(isdigitrune(peek)){
			numI *= 10;
			numI += peek - L'0';
			peek = Bgetrune(parsein);
		}
		if(peek == L'.'){
			int place = 1;
			numD = numI;
			peek = Bgetrune(parsein);
			if(!isdigitrune(peek)){
				Bungetrune(parsein);
				replaypeek = L'.';
				goto Integer;
			}
			while(isdigitrune(peek)){
				numD += (peek - L'0') / (10 * place);
				peek = Bgetrune(parsein);
			}
			Bungetrune(parsein);
			/* Should also lex 123.45E10 */
			lookahead.tag = FloatTok;
			lookahead.dval = negative ? -numD : numD;
		}else{
			Bungetrune(parsein);
Integer:
			lookahead.tag = IntTok;
			lookahead.ival = negative ? -numI : numI;
		}
		return;
	}

	/* Atoms */
	/* Atoms in single quotes */
	if(peek == L'\''){
		peek = Bgetrune(parsein);
		while(peek != L'\''){
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		lookahead.tag = AtomTok;
		lookahead.text = runestrdup(buf);
		goto Functor;
	}

	/* Special atom [] */
	if(peek == L'['){
		peek = Bgetrune(parsein);
		if(peek == L']'){
			lookahead.tag = AtomTok;
			lookahead.text = runestrdup(L"[]");
			goto Functor;
		}else{
			Bungetrune(parsein);
			lookahead.tag = SquareBracketLeftTok;
			return;
		}
	}

	/* Special atom {} */
	if(peek == L'{'){
		peek = Bgetrune(parsein);
		if(peek == L'}'){
			lookahead.tag = AtomTok;
			lookahead.text = runestrdup(L"{}");
			goto Functor;
		}else{
			Bungetrune(parsein);
			lookahead.tag = CurlyBracketLeftTok;
			return;
		}
	}

	/* Graphic atom */
	Rune *graphics = L"#$&*+-./:<=>?@^~\\";
	if(runestrchr(graphics, peek)){
		while(runestrchr(graphics, peek)){
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		Bungetrune(parsein);
		lookahead.tag = AtomTok;
		lookahead.text = runestrdup(buf);
		goto Functor;
	}

	/* simple atom */
	if(islowerrune(peek)){
		while(isalpharune(peek) || isdigitrune(peek) || peek == L'_'){
			buf[i++] = peek;
			peek = Bgetrune(parsein);
		}
		buf[i] = '\0';
		Bungetrune(parsein);
		lookahead.tag = AtomTok;
		lookahead.text = runestrdup(buf);
		goto Functor;
	}

	/* Other */
	if(runestrchr(L",.()]}|!;", peek)){
		switch(peek){
		case L',': lookahead.tag = CommaTok; break;
		case L'(': lookahead.tag = ParenLeftTok; break;
		case L')': lookahead.tag = ParenRightTok; break;
		case L']': lookahead.tag = SquareBracketRightTok; break;
		case L'}': lookahead.tag = CurlyBracketRightTok; break;
		case L'|': lookahead.tag = PipeTok; break;
		case L'!': lookahead.tag = AtomTok; lookahead.text = runestrdup(L"!"); break;
		case L';': lookahead.tag = AtomTok; lookahead.text = runestrdup(L";"); break;
		}
		return;
	}

	/* EOF */
	if(peek == Beof){
		lookahead.tag = EofTok;
		return;
	}

	print("Could not lex rune %C (%ud)\n", peek, peek);
	exits("lexer");

Functor:
	if(peek == Beof){
		Bgetrune(parsein);
		return;
	}

	peek = Bgetrune(parsein);
	if(peek == L'(')
		lookahead.tag = FunctorTok;
	Bungetrune(parsein);
	return;
}

void
match(int tag)
{
	if(lookahead.tag == tag)
		nexttoken();
	else
		syntaxerror_parser("match");
}

void
syntaxerror_parser(char *where)
{
	print("Syntax error: Unexpected %d (%S) token in %s\n", lookahead.tag, lookahead.text, where);
	exits("syntax error");
}