ref: 02145f06ac007d730bc16930185fe18fa3e76c68
parent: 0b36426d023e45d6acbc7672c7083a91d10913a8
author: Peter Mikkelsen <peter@pmikkelsen.com>
date: Tue Jun 29 11:51:04 EDT 2021
Add a term parser.
--- a/dat.h
+++ b/dat.h
@@ -1,1 +1,28 @@
+typedef struct Term Term;
+struct Term
+{
+ int tag;
+
+ Rune *text;
+ int arity;
+ Term *next;
+ Term *children;
+ int numbertype;
+ vlong ival;
+ double dval;
+};
+
+enum {
+ CompoundTerm,
+ AtomTerm,
+ VariableTerm,
+ NumberTerm,
+ StringTerm,
+};
+
+enum {
+ NumberInt,
+ NumberFloat,
+};
+
int debug;
\ No newline at end of file
--- /dev/null
+++ b/example.pl
@@ -1,0 +1,5 @@
+:- dynamic(math/4).
+
+math(A,B,C,D) :- D is A + B + C * A.
+
+true :- 1 = 1.
--- a/fns.h
+++ b/fns.h
@@ -1,0 +1,13 @@
+/* parser.c */
+void parse(int);
+
+/* prettyprint.c */
+Rune *prettyprint(Term *);
+
+/* misc.c */
+Term *appendterm(Term *, Term *);
+int termslength(Term *);
+Term *mkatom(Rune *);
+Term *mkvariable(Rune *);
+Term *mkcompound(Rune *, int, Term *);
+Term *mknumber(int, vlong, double);
\ No newline at end of file
--- a/main.c
+++ b/main.c
@@ -9,10 +9,15 @@
void
main(int argc, char *argv[])
{
+ char *parsetestfile = nil;
+
ARGBEGIN{
case 'd':
debug = 1;
break;
+ case 'f':
+ parsetestfile = EARGF(usage());
+ break;
default:
usage();
}ARGEND
@@ -19,6 +24,13 @@
if(argc != 0)
usage();
+
+ if(parsetestfile){
+ int fd = open(parsetestfile, OREAD);
+ if(fd < 0)
+ exits("open");
+ parse(fd);
+ }
exits(nil);
}
--- /dev/null
+++ b/misc.c
@@ -1,0 +1,72 @@
+#include <u.h>
+#include <libc.h>
+
+#include "dat.h"
+#include "fns.h"
+
+Term *
+appendterm(Term *a, Term *b)
+{
+ if(a == nil)
+ return b;
+
+ Term *tmp;
+ for(tmp = a; tmp->next != nil; tmp = tmp->next);
+ tmp->next = b;
+ return a;
+}
+
+int
+termslength(Term *list)
+{
+ int len;
+ for(len = 0; list != nil; len++, list = list->next);
+ return len;
+}
+
+Term *
+mkterm(int tag)
+{
+ Term *t = malloc(sizeof(Term));
+ t->tag = tag;
+ t->next = nil;
+ t->children = nil;
+ t->text = nil;
+ return t;
+}
+
+Term *
+mkatom(Rune *name)
+{
+ Term *t = mkterm(AtomTerm);
+ t->text = name;
+ return t;
+}
+
+Term *
+mkvariable(Rune *name)
+{
+ Term *t = mkterm(VariableTerm);
+ t->text = name;
+ return t;
+}
+
+Term *
+mkcompound(Rune *name, int arity, Term *args)
+{
+ Term *t = mkterm(CompoundTerm);
+ t->text = name;
+ t->arity = arity;
+ t->children = args;
+ return t;
+}
+
+Term *
+mknumber(int type, vlong ival, double dval)
+{
+ Term *t = mkterm(NumberTerm);
+ t->numbertype = type;
+ t->ival = ival;
+ t->dval = dval;
+ return t;
+}
--- a/mkfile
+++ b/mkfile
@@ -2,7 +2,7 @@
TARG=pprolog
-OFILES=main.$O parser.$O
+OFILES=main.$O parser.$O prettyprint.$O misc.$O
HFILES=dat.h fns.h
--- a/parser.c
+++ b/parser.c
@@ -1,5 +1,561 @@
#include <u.h>
#include <libc.h>
+#include <bio.h>
#include "dat.h"
#include "fns.h"
+
+#define PrecedenceLevels 1200
+
+/* The parser doesn't produce an ast for now, it only acts like a recognizer,
+ printing errors if it fails */
+
+typedef struct Token Token;
+typedef struct Operator Operator;
+typedef struct OpInfo OpInfo;
+struct Token
+{
+ int tag;
+ Rune *text;
+ double dval;
+ vlong ival;
+};
+
+struct Operator
+{
+ int type;
+ int level;
+ Rune *spelling;
+ Operator *next;
+};
+
+struct OpInfo
+{
+ int level;
+ int type;
+};
+
+enum {
+ Xf = 1<<0, /* 1 */
+ Yf = 1<<1, /* 2 */
+ Xfx = 1<<2, /* 4 */
+ Xfy = 1<<3, /* 8 */
+ Yfx = 1<<4, /* 16 */
+ Fy = 1<<5, /* 32 */
+ Fx = 1<<6, /* 64 */
+};
+
+enum {
+ AtomTok = 1<<0, /* 1 */
+ FunctorTok = 1<<1, /* 2 */
+ VarTok = 1<<2, /* 4 */
+ StringTok = 1<<3, /* 8 */
+ IntTok = 1<<4, /* 16 */
+ FloatTok = 1<<5, /* 32 */
+ ParenLeftTok = 1<<6, /* 64 */
+ ParenRightTok = 1<<7, /* 128 */
+ SquareBracketLeftTok = 1<<8, /* 256 */
+ SquareBracketRightTok = 1<<9, /* 512 */
+ CurlyBracketLeftTok = 1<<10, /* 1024 */
+ CurlyBracketRightTok = 1<<11, /* 2048 */
+ CommaTok = 1<<12, /* 4096 */
+ EofTok = 1<<13, /* 8192 */
+};
+
+static Biobuf *parsein;
+static Token lookahead;
+static Operator *operators[PrecedenceLevels];
+
+void initoperators(void);
+void addoperator(int, int, Rune *);
+Operator *getoperator(Rune *);
+void nexttoken(void);
+Term *fullterm(int, Rune *, Term *);
+Term *term(void);
+Term *compound(void);
+Term *parseoperators(Term *);
+void match(int);
+void syntaxerror(char *);
+void prologtext(void);
+
+void
+parse(int fd)
+{
+ parsein = Bfdopen(fd, OREAD);
+ if(parsein == nil){
+ print("Could not open file\n");
+ return;
+ }
+ initoperators();
+ nexttoken();
+
+ prologtext();
+}
+
+void
+prologtext(void)
+{
+ if(lookahead.tag == EofTok)
+ return;
+
+ Term *t = fullterm(AtomTok, L".", nil);
+ if(t->tag != CompoundTerm || runestrcmp(t->text, L":-") != 0 || !(t->arity == 1 || t->arity == 2)){
+ print("Expected directive or clause as toplevel\n");
+ print("Got %S\n", prettyprint(t));
+ print("Tag=%d arity=%d text=\"%S\"\n", t->tag, t->arity, t->text);
+ syntaxerror("prologtext");
+ }
+
+ if(t->arity == 1){
+ /* A directive */
+ print("Got directive: %S\n", prettyprint(t));
+ }else{
+ /* A clause */
+ print("Got clause: %S\n", prettyprint(t));
+ }
+
+ if(lookahead.tag == AtomTok && runestrcmp(lookahead.text, L".") == 0)
+ match(AtomTok);
+ else
+ syntaxerror("prologtext");
+
+ prologtext();
+}
+
+Term *
+fullterm(int stoptag, Rune *stoptext, Term *list)
+{
+ if((lookahead.tag & stoptag) && (stoptext == nil || runestrcmp(stoptext, lookahead.text) == 0))
+ return parseoperators(list);
+
+ Term *t = term();
+ list = appendterm(list, t);
+ return fullterm(stoptag, stoptext, list);
+}
+
+Term *
+term(void)
+{
+ Term *result;
+
+ switch(lookahead.tag){
+ case AtomTok:
+ result = mkatom(lookahead.text);
+ match(AtomTok);
+ break;
+ case FunctorTok:
+ result = compound();
+ break;
+ case VarTok:
+ result = mkvariable(lookahead.text);
+ match(VarTok);
+ break;
+ case IntTok:
+ result = mknumber(NumberInt, lookahead.ival, 0);
+ match(IntTok);
+ break;
+ case FloatTok:
+ result = mknumber(NumberFloat, 0, lookahead.dval);
+ match(FloatTok);
+ break;
+ default:
+ print("Cant parse term of token type %d\n", lookahead.tag);
+ syntaxerror("term");
+ result = nil;
+ }
+
+ return result;
+}
+
+Term *
+compound(void)
+{
+ Rune *name = lookahead.text;
+ Term *args = nil;
+ int arity = 0;
+
+ match(FunctorTok);
+ match(ParenLeftTok);
+
+ while(lookahead.tag != ParenRightTok){
+ Term *t = fullterm(CommaTok|ParenRightTok, nil, nil);
+ if(lookahead.tag == CommaTok)
+ match(CommaTok);
+ args = appendterm(args, t);
+ arity++;
+ }
+
+ match(ParenRightTok);
+ return mkcompound(name, arity, args);
+}
+
+Term *
+parseoperators(Term *list)
+{
+ int i;
+ int length = termslength(list);
+ Term *t;
+ Term **terms = malloc(sizeof(Term *) * length);
+ OpInfo *infos = malloc(sizeof(OpInfo) * length);
+
+ for(i = 0, t = list; i < length; i++){
+ Operator *op = getoperator(t->text);
+ if(op){
+ infos[i].type = op->type;
+ infos[i].level = op->level;
+ }else{
+ infos[i].type = 0;
+ infos[i].level = 0;
+ }
+ terms[i] = t;
+ Term *tmp = t;
+ t = t->next;
+ tmp->next = nil;
+ }
+
+ while(length > 1){
+ int index = -1;
+ int lowest = PrecedenceLevels + 1;
+ for(i = 0; i < length; i++){
+ if(infos[i].type == 0)
+ continue;
+ if(infos[i].level == lowest && infos[i].type == Xfy && i == index+1)
+ index = i;
+ else if(infos[i].level < lowest){
+ index = i;
+ lowest = infos[i].level;
+ }
+ }
+
+ if(index == -1){
+ print("Can't parse, list contains no operators");
+ syntaxerror("parseoperators");
+ }
+
+ int infixlevel = infos[index].level & (Xfx|Xfy|Yfx);
+ int prefixlevel = infos[index].level & (Fx|Fy);
+ int postfixlevel = infos[index].level & (Xf|Yf);
+
+ if(infixlevel && index != 0 && index != length-1 && infos[index-1].type == 0 && infos[index-1].type == 0){
+ infos[index-1].type = 0;
+ infos[index-1].level = infos[index].level;
+ Term *args = terms[index-1];
+ args->next = terms[index+1];
+ terms[index-1] = mkcompound(terms[index]->text, 2, args);
+ length -= 2;
+ for(i = index; i < length; i++){
+ infos[i].level = infos[i+2].level;
+ infos[i].type = infos[i+2].type;
+ terms[i] = terms[i+2];
+ }
+ }else if(prefixlevel && index != length-1 && infos[index+1].type == 0){
+ infos[index].type = 0;
+ terms[index] = mkcompound(terms[index]->text, 1, terms[index+1]);
+ length--;
+ for(i = index+1; i < length; i++){
+ infos[i].level = infos[i+1].level;
+ infos[i].type = infos[i+1].type;
+ terms[i] = terms[i+1];
+ }
+ }else if(postfixlevel && index != 0 && infos[index-1].type == 0){
+ infos[index-1].type = 0;
+ infos[index-1].level = infos[index].level;
+ terms[index-1] = mkcompound(terms[index]->text, 1, terms[index-1]);
+ length--;
+ for(i = index; i < length; i++){
+ infos[i].level = infos[i+1].level;
+ infos[i].type = infos[i+1].type;
+ terms[i] = terms[i+1];
+ }
+ }else{
+ print("Parse error when parsing operators\n");
+ syntaxerror("parseoperators");
+ }
+ }
+
+ Term *result = terms[0];
+ free(infos);
+ free(terms);
+ return result;
+}
+
+void
+initoperators(void)
+{
+ Operator *op;
+ int i;
+ for(i = 0; i < PrecedenceLevels; i++){
+ while(operators[i]){
+ op = operators[i];
+ operators[i] = op->next;
+ free(op);
+ }
+ }
+
+ addoperator(1200, Xfx, L":-");
+ addoperator(1200, Fx, L":-");
+ addoperator(1100, Xfy, L";");
+ addoperator(1000, Xfy, L",");
+ addoperator(700, Xfx, L"=");
+ addoperator(700, Xfx, L"is");
+ addoperator(500, Yfx, L"+");
+ addoperator(400, Yfx, L"*");
+ addoperator(400, Yfx, L"/");
+}
+
+void
+addoperator(int level, int type, Rune *spelling)
+{
+ Operator *op = malloc(sizeof(Operator));
+ op->type = type;
+ op->level = level;
+ op->spelling = spelling;
+ op->next = operators[level-1];
+ operators[level-1] = op;
+}
+
+Operator *
+getoperator(Rune *spelling)
+{
+ Operator *op = nil;
+ int level;
+
+ if(spelling == nil)
+ return nil;
+
+ for(level = 0; level < PrecedenceLevels && op == nil; level++){
+ Operator *tmp;
+ for(tmp = operators[level]; tmp != nil; tmp = tmp->next){
+ if(runestrcmp(tmp->spelling, spelling) == 0){
+ if(op == nil){
+ op = malloc(sizeof(Operator));
+ memcpy(op, tmp, sizeof(Operator));
+ }else
+ op->type |= tmp->type;
+ }
+ }
+ }
+ return op;
+}
+
+void
+nexttoken(void)
+{
+ Rune buf[1024];
+ Rune peek;
+ int i = 0;
+ double numD;
+ vlong numI = 0;
+ int negative = 0;
+ static long replaypeek = -1;
+
+ if(replaypeek == -1)
+ peek = Bgetrune(parsein);
+ else{
+ peek = replaypeek;
+ replaypeek = -1;
+ }
+
+ /* Skip whitespace */
+ while(isspacerune(peek))
+ peek = Bgetrune(parsein);
+
+ /* Skip single line comments */
+ if(peek == L'%'){
+ while(peek != L'\n')
+ peek = Bgetrune(parsein);
+ peek = Bgetrune(parsein);
+ }
+
+ /* Variables */
+ if(peek == L'_' || isupperrune(peek)){
+ while(peek == L'_' || isalpharune(peek) || isdigitrune(peek)){
+ /* FIXME Should handle input length, also elsewhere in this function */
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ Bungetrune(parsein);
+
+ lookahead.tag = VarTok;
+ lookahead.text = runestrdup(buf);
+ return;
+ }
+
+ /* Strings */
+ if(peek == L'"'){
+ peek = Bgetrune(parsein);
+ while(peek != L'"'){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ lookahead.tag = StringTok;
+ lookahead.text = runestrdup(buf);
+ return;
+ }
+
+ /* Numbers */
+ if(peek == L'-'){
+ peek = Bgetrune(parsein);
+ if(isdigitrune(peek))
+ negative = 1;
+ else{
+ Bungetrune(parsein);
+ peek = L'-';
+ }
+ }
+ if(isdigitrune(peek)){
+ if(peek == L'0'){
+ peek = Bgetrune(parsein);
+ switch(peek){
+ case L'o': /* Octal */
+ case L'x': /* Hexadecimal */
+ case L'b': /* Binary */
+ print("Cant lex numbers in other bases than base 10 right now\n");
+ exits("lexer");
+ }
+ }
+ while(isdigitrune(peek)){
+ numI *= 10;
+ numI += peek - L'0';
+ peek = Bgetrune(parsein);
+ }
+ if(peek == L'.'){
+ int place = 1;
+ numD = numI;
+ peek = Bgetrune(parsein);
+ if(!isdigitrune(peek)){
+ Bungetrune(parsein);
+ replaypeek = L'.';
+ goto Integer;
+ }
+ while(isdigitrune(peek)){
+ numD += (peek - L'0') / (10 * place);
+ peek = Bgetrune(parsein);
+ }
+ /* Should also lex 123.45E10 */
+ lookahead.tag = FloatTok;
+ lookahead.dval = negative ? -numD : numD;
+ }else{
+ Bungetrune(parsein);
+Integer:
+ lookahead.tag = IntTok;
+ lookahead.ival = negative ? -numI : numI;
+ }
+ return;
+ }
+
+ /* Atoms */
+ /* Atoms in single quotes */
+ if(peek == L'\''){
+ peek = Bgetrune(parsein);
+ while(peek != L'\''){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(buf);
+ goto Functor;
+ }
+
+ /* Special atom [] */
+ if(peek == L'['){
+ peek = Bgetrune(parsein);
+ if(peek == L']'){
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(L"[]");
+ goto Functor;
+ }else{
+ Bungetrune(parsein);
+ lookahead.tag = SquareBracketLeftTok;
+ return;
+ }
+ }
+
+ /* Special atom {} */
+ if(peek == L'{'){
+ peek = Bgetrune(parsein);
+ if(peek == L'}'){
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(L"{}");
+ goto Functor;
+ }else{
+ Bungetrune(parsein);
+ lookahead.tag = CurlyBracketLeftTok;
+ return;
+ }
+ }
+
+ /* Graphic atom */
+ Rune *graphics = L"#$&*+-./:<=>?@^~\\";
+ if(runestrchr(graphics, peek)){
+ while(runestrchr(graphics, peek)){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ Bungetrune(parsein);
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(buf);
+ goto Functor;
+ }
+
+ /* simple atom */
+ if(islowerrune(peek)){
+ while(isalpharune(peek) || isdigitrune(peek) || peek == L'_'){
+ buf[i++] = peek;
+ peek = Bgetrune(parsein);
+ }
+ buf[i] = '\0';
+ Bungetrune(parsein);
+ lookahead.tag = AtomTok;
+ lookahead.text = runestrdup(buf);
+ goto Functor;
+ }
+
+ /* Other */
+ if(runestrchr(L",.()]}", peek)){
+ switch(peek){
+ case L',': lookahead.tag = CommaTok; break;
+ case L'(': lookahead.tag = ParenLeftTok; break;
+ case L')': lookahead.tag = ParenRightTok; break;
+ case L']': lookahead.tag = SquareBracketRightTok; break;
+ case L'}': lookahead.tag = CurlyBracketRightTok; break;
+ }
+ return;
+ }
+
+ /* EOF */
+ if(peek == Beof){
+ lookahead.tag = EofTok;
+ return;
+ }
+
+ print("Could not lex rune %C (%ud)\n", peek, peek);
+ exits("lexer");
+
+Functor:
+ peek = Bgetrune(parsein);
+ if(peek == L'(')
+ lookahead.tag = FunctorTok;
+ Bungetrune(parsein);
+ return;
+}
+
+void
+match(int tag)
+{
+ if(lookahead.tag == tag)
+ nexttoken();
+ else
+ syntaxerror("match");
+}
+
+void
+syntaxerror(char *where)
+{
+ print("Syntax error: Unexpected %d token in %s\n", lookahead.tag, where);
+ exits("syntax error");
+}
\ No newline at end of file
--- /dev/null
+++ b/prettyprint.c
@@ -1,0 +1,61 @@
+#include <u.h>
+#include <libc.h>
+
+#include "dat.h"
+#include "fns.h"
+
+Rune *prettyprintlist(Term *, Rune *, int);
+
+Rune *
+prettyprint(Term *t)
+{
+ Rune *result;
+ Rune *args;
+
+ switch(t->tag){
+ case CompoundTerm:
+ args = prettyprintlist(t->children, L", ", 0);
+ result = runesmprint("%S(%S)", t->text, args);
+ free(args);
+ break;
+ case AtomTerm:
+ case VariableTerm:
+ result = runesmprint("%S", t->text);
+ break;
+ case NumberTerm:
+ if(t->numbertype == NumberInt)
+ result = runesmprint("%lld", t->ival);
+ else
+ result = runesmprint("%f", t->dval);
+ break;
+ default:
+ result = runesmprint("cant print term with tag %d", t->tag);
+ break;
+ }
+
+ return result;
+}
+
+Rune *
+prettyprintlist(Term *t, Rune *sep, int end)
+{
+ if(t == nil){
+ if(end)
+ return runesmprint("%S", sep);
+ else
+ return runesmprint("");
+ }
+
+ Rune *str = prettyprint(t);
+ Rune *rest = prettyprintlist(t->next, sep, end);
+ Rune *result;
+
+ if(t->next != nil)
+ result = runesmprint("%S%S%S", str, sep, rest);
+ else
+ result = runesmprint("%S%S", str, end ? rest : L"");
+
+ free(str);
+ free(rest);
+ return result;
+}
\ No newline at end of file