ref: a60e9f279604c56fc4c74054bc01f93f3e6debef
dir: /sam/regexp.c/
#include "sam.h" Rangeset sel; String lastregexp; /* * Machine Information */ typedef struct Inst Inst; struct Inst { long type; /* <= Runemax ==> literal, otherwise action */ union { int rsid; int rsubid; int class; struct Inst *rother; struct Inst *rright; } r; union{ struct Inst *lleft; struct Inst *lnext; } l; }; #define sid r.rsid #define subid r.rsubid #define rclass r.class #define other r.rother #define right r.rright #define left l.lleft #define next l.lnext #define NPROG 1024 Inst program[NPROG]; Inst *progp; Inst *startinst; /* First inst. of program; might not be program[0] */ Inst *bstartinst; /* same for backwards machine */ typedef struct Ilist Ilist; struct Ilist { Inst *inst; /* Instruction of the thread */ Rangeset se; Posn startp; /* first char of match */ }; #define NLIST 127 Ilist *tl, *nl; /* This list, next list */ Ilist list[2][NLIST+1]; /* +1 for trailing null */ static Rangeset sempty; /* * Actions and Tokens * * 0x100xx are operators, value == precedence * 0x200xx are tokens, i.e. operands for operators */ enum { OPERATOR = Runemask+1, /* Bitmask of all operators */ START = OPERATOR, /* Start, used for marker on stack */ RBRA, /* Right bracket, ) */ LBRA, /* Left bracket, ( */ OR, /* Alternation, | */ CAT, /* Concatentation, implicit operator */ STAR, /* Closure, * */ PLUS, /* a+ == aa* */ QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */ ANY = OPERATOR<<1, /* Any character but newline, . */ NOP, /* No operation, internal use only */ BOL, /* Beginning of line, ^ */ EOL, /* End of line, $ */ CCLASS, /* Character class, [] */ NCCLASS, /* Negated character class, [^] */ END, /* Terminate: match found */ ISATOR = OPERATOR, ISAND = OPERATOR<<1, }; /* * Parser Information */ typedef struct Node Node; struct Node { Inst *first; Inst *last; }; #define NSTACK 20 Node andstack[NSTACK]; Node *andp; int atorstack[NSTACK]; int *atorp; int lastwasand; /* Last token was operand */ int cursubid; int subidstack[NSTACK]; int *subidp; int backwards; int nbra; Rune *exprp; /* pointer to next character in source expression */ #define DCLASS 10 /* allocation increment */ int nclass; /* number active */ int Nclass; /* high water mark */ Rune **class; int negateclass; int addinst(Ilist *l, Inst *inst, Rangeset *sep); void newmatch(Rangeset*); void bnewmatch(Rangeset*); void pushand(Inst*, Inst*); void pushator(int); Node *popand(int); int popator(void); void startlex(Rune*); int lex(void); void operator(int); void operand(int); void evaluntil(int); void optimize(Inst*); void bldcclass(void); void regerror(Err e) { Strzero(&lastregexp); error(e); } void regerror_c(Err e, int c) { Strzero(&lastregexp); error_c(e, c); } Inst * newinst(int t) { if(progp >= &program[NPROG]) regerror(Etoolong); progp->type = t; progp->left = 0; progp->right = 0; return progp++; } Inst * realcompile(Rune *s) { int token; startlex(s); atorp = atorstack; andp = andstack; subidp = subidstack; cursubid = 0; lastwasand = FALSE; /* Start with a low priority operator to prime parser */ pushator(START-1); while((token=lex()) != END){ if((token&ISATOR) == OPERATOR) operator(token); else operand(token); } /* Close with a low priority operator */ evaluntil(START); /* Force END */ operand(END); evaluntil(START); if(nbra) regerror(Eleftpar); --andp; /* points to first and only operand */ return andp->first; } void compile(String *s) { int i; Inst *oprogp; if(Strcmp(s, &lastregexp)==0) return; for(i=0; i<nclass; i++) free(class[i]); nclass = 0; progp = program; backwards = FALSE; startinst = realcompile(s->s); optimize(program); oprogp = progp; backwards = TRUE; bstartinst = realcompile(s->s); optimize(oprogp); Strduplstr(&lastregexp, s); } void operand(int t) { Inst *i; if(lastwasand) operator(CAT); /* catenate is implicit */ i = newinst(t); if(t == CCLASS){ if(negateclass) i->type = NCCLASS; /* UGH */ i->rclass = nclass-1; /* UGH */ } pushand(i, i); lastwasand = TRUE; } void operator(int t) { if(t==RBRA && --nbra<0) regerror(Erightpar); if(t==LBRA){ /* * if(++cursubid >= NSUBEXP) * regerror(Esubexp); */ cursubid++; /* silently ignored */ nbra++; if(lastwasand) operator(CAT); }else evaluntil(t); if(t!=RBRA) pushator(t); lastwasand = FALSE; if(t==STAR || t==QUEST || t==PLUS || t==RBRA) lastwasand = TRUE; /* these look like operands */ } void cant(char *s) { char buf[100]; sprint(buf, "regexp: can't happen: %s", s); panic(buf); } void pushand(Inst *f, Inst *l) { if(andp >= &andstack[NSTACK]) cant("operand stack overflow"); andp->first = f; andp->last = l; andp++; } void pushator(int t) { if(atorp >= &atorstack[NSTACK]) cant("operator stack overflow"); *atorp++=t; if(cursubid >= NSUBEXP) *subidp++= -1; else *subidp++=cursubid; } Node * popand(int op) { if(andp <= &andstack[0]) if(op) regerror_c(Emissop, op); else regerror(Ebadregexp); return --andp; } int popator(void) { if(atorp <= &atorstack[0]) cant("operator stack underflow"); --subidp; return *--atorp; } void evaluntil(int pri) { Node *op1, *op2, *t; Inst *inst1, *inst2; while(pri==RBRA || atorp[-1]>=pri){ switch(popator()){ case LBRA: op1 = popand('('); inst2 = newinst(RBRA); inst2->subid = *subidp; op1->last->next = inst2; inst1 = newinst(LBRA); inst1->subid = *subidp; inst1->next = op1->first; pushand(inst1, inst2); return; /* must have been RBRA */ default: panic("unknown regexp operator"); break; case OR: op2 = popand('|'); op1 = popand('|'); inst2 = newinst(NOP); op2->last->next = inst2; op1->last->next = inst2; inst1 = newinst(OR); inst1->right = op1->first; inst1->left = op2->first; pushand(inst1, inst2); break; case CAT: op2 = popand(0); op1 = popand(0); if(backwards && op2->first->type!=END) t = op1, op1 = op2, op2 = t; op1->last->next = op2->first; pushand(op1->first, op2->last); break; case STAR: op2 = popand('*'); inst1 = newinst(OR); op2->last->next = inst1; inst1->right = op2->first; pushand(inst1, inst1); break; case PLUS: op2 = popand('+'); inst1 = newinst(OR); op2->last->next = inst1; inst1->right = op2->first; pushand(op2->first, inst1); break; case QUEST: op2 = popand('?'); inst1 = newinst(OR); inst2 = newinst(NOP); inst1->left = inst2; inst1->right = op2->first; op2->last->next = inst2; pushand(inst1, inst2); break; } } } void optimize(Inst *start) { Inst *inst, *target; for(inst=start; inst->type!=END; inst++){ target = inst->next; while(target->type == NOP) target = target->next; inst->next = target; } } #ifdef DEBUG void dumpstack(void){ Node *stk; int *ip; dprint("operators\n"); for(ip = atorstack; ip<atorp; ip++) dprint("0%o\n", *ip); dprint("operands\n"); for(stk = andstack; stk<andp; stk++) dprint("0%o\t0%o\n", stk->first->type, stk->last->type); } void dump(void){ Inst *l; l = program; do{ dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, l->left-program, l->right-program); }while(l++->type); } #endif void startlex(Rune *s) { exprp = s; nbra = 0; } int lex(void){ int c= *exprp++; switch(c){ case '\\': if(*exprp) if((c= *exprp++)=='n') c='\n'; break; case 0: c = END; --exprp; /* In case we come here again */ break; case '*': c = STAR; break; case '?': c = QUEST; break; case '+': c = PLUS; break; case '|': c = OR; break; case '.': c = ANY; break; case '(': c = LBRA; break; case ')': c = RBRA; break; case '^': c = BOL; break; case '$': c = EOL; break; case '[': c = CCLASS; bldcclass(); break; } return c; } long nextrec(void){ if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) regerror(Ebadclass); if(exprp[0] == '\\'){ exprp++; if(*exprp=='n'){ exprp++; return '\n'; } return *exprp++|(Runemask+1); } return *exprp++; } void bldcclass(void) { long c1, c2, n, na; Rune *classp; classp = emalloc(DCLASS*RUNESIZE); n = 0; na = DCLASS; /* we have already seen the '[' */ if(*exprp == '^'){ classp[n++] = '\n'; /* don't match newline in negate case */ negateclass = TRUE; exprp++; }else negateclass = FALSE; while((c1 = nextrec()) != ']'){ if(c1 == '-'){ Error: free(classp); regerror(Ebadclass); } if(n+4 >= na){ /* 3 runes plus NUL */ na += DCLASS; classp = erealloc(classp, na*RUNESIZE); } if(*exprp == '-'){ exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; classp[n+0] = Runemax; classp[n+1] = c1 & Runemask; classp[n+2] = c2 & Runemask; n += 3; }else classp[n++] = c1 & Runemask; } classp[n] = 0; if(nclass == Nclass){ Nclass += DCLASS; class = erealloc(class, Nclass*sizeof(Rune*)); } class[nclass++] = classp; } int classmatch(int classno, int c, int negate) { Rune *p; p = class[classno]; while(*p){ if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; }else if(*p++ == c) return !negate; } return negate; } /* * Note optimization in addinst: * *l must be pending when addinst called; if *l has been looked * at already, the optimization is a bug. */ int addinst(Ilist *l, Inst *inst, Rangeset *sep) { Ilist *p; for(p = l; p->inst; p++){ if(p->inst==inst){ if((sep)->p[0].p1 < p->se.p[0].p1) p->se= *sep; /* this would be bug */ return 0; /* It's already there */ } } p->inst = inst; p->se= *sep; (p+1)->inst = 0; return 1; } int execute(File *f, Posn startp, Posn eof) { int flag = 0; Inst *inst; Ilist *tlp; Posn p = startp; int nnl = 0, ntl; int c; int wrapped = 0; int startchar = startinst->type<OPERATOR? startinst->type : 0; list[0][0].inst = list[1][0].inst = 0; sel.p[0].p1 = -1; /* Execute machine once for each character */ for(;;p++){ doloop: c = filereadc(f, p); if(p>=eof || c<0){ switch(wrapped++){ case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to beginning */ if(sel.p[0].p1>=0 || eof!=INFINITY) goto Return; list[0][0].inst = list[1][0].inst = 0; p = 0; goto doloop; default: goto Return; } }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0) break; /* fast check for first char */ if(startchar && nnl==0 && c!=startchar) continue; tl = list[flag]; nl = list[flag^=1]; nl->inst = 0; ntl = nnl; nnl = 0; if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){ /* Add first instruction to this list */ sempty.p[0].p1 = p; if(addinst(tl, startinst, &sempty)) if(++ntl >= NLIST) Overflow: error(Eoverflow); } /* Execute machine until this list is empty */ for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ Switchstmt: switch(inst->type){ default: /* regular character */ if(inst->type==c){ Addinst: if(addinst(nl, inst->next, &tlp->se)) if(++nnl >= NLIST) goto Overflow; } break; case LBRA: if(inst->subid>=0) tlp->se.p[inst->subid].p1 = p; inst = inst->next; goto Switchstmt; case RBRA: if(inst->subid>=0) tlp->se.p[inst->subid].p2 = p; inst = inst->next; goto Switchstmt; case ANY: if(c!='\n') goto Addinst; break; case BOL: if(p==0 || filereadc(f, p - 1)=='\n'){ Step: inst = inst->next; goto Switchstmt; } break; case EOL: if(c == '\n') goto Step; break; case CCLASS: if(c>=0 && classmatch(inst->rclass, c, 0)) goto Addinst; break; case NCCLASS: if(c>=0 && classmatch(inst->rclass, c, 1)) goto Addinst; break; case OR: /* evaluate right choice later */ if(addinst(tlp, inst->right, &tlp->se)) if(++ntl >= NLIST) goto Overflow; /* efficiency: advance and re-evaluate */ inst = inst->left; goto Switchstmt; case END: /* Match! */ tlp->se.p[0].p2 = p; newmatch(&tlp->se); break; } } } Return: return sel.p[0].p1>=0; } void newmatch(Rangeset *sp) { int i; if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 || (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2)) for(i = 0; i<NSUBEXP; i++) sel.p[i] = sp->p[i]; } int bexecute(File *f, Posn startp) { int flag = 0; Inst *inst; Ilist *tlp; Posn p = startp; int nnl = 0, ntl; int c; int wrapped = 0; int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0; list[0][0].inst = list[1][0].inst = 0; sel.p[0].p1= -1; /* Execute machine once for each character, including terminal NUL */ for(;;--p){ doloop: if((c = filereadc(f, p - 1))==-1){ switch(wrapped++){ case 0: /* let loop run one more click */ case 2: break; case 1: /* expired; wrap to end */ if(sel.p[0].p1>=0) case 3: goto Return; list[0][0].inst = list[1][0].inst = 0; p = f->nc; goto doloop; default: goto Return; } }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0) break; /* fast check for first char */ if(startchar && nnl==0 && c!=startchar) continue; tl = list[flag]; nl = list[flag^=1]; nl->inst = 0; ntl = nnl; nnl = 0; if(sel.p[0].p1<0 && (!wrapped || p>startp)){ /* Add first instruction to this list */ /* the minus is so the optimizations in addinst work */ sempty.p[0].p1 = -p; if(addinst(tl, bstartinst, &sempty)) if(++ntl >= NLIST) Overflow: error(Eoverflow); } /* Execute machine until this list is empty */ for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ Switchstmt: switch(inst->type){ default: /* regular character */ if(inst->type == c){ Addinst: if(addinst(nl, inst->next, &tlp->se)) if(++nnl >= NLIST) goto Overflow; } break; case LBRA: if(inst->subid>=0) tlp->se.p[inst->subid].p1 = p; inst = inst->next; goto Switchstmt; case RBRA: if(inst->subid >= 0) tlp->se.p[inst->subid].p2 = p; inst = inst->next; goto Switchstmt; case ANY: if(c != '\n') goto Addinst; break; case BOL: if(c=='\n' || p==0){ Step: inst = inst->next; goto Switchstmt; } break; case EOL: if(p==f->nc || filereadc(f, p)=='\n') goto Step; break; case CCLASS: if(c>=0 && classmatch(inst->rclass, c, 0)) goto Addinst; break; case NCCLASS: if(c>=0 && classmatch(inst->rclass, c, 1)) goto Addinst; break; case OR: /* evaluate right choice later */ if(addinst(tl, inst->right, &tlp->se)) if(++ntl >= NLIST) goto Overflow; /* efficiency: advance and re-evaluate */ inst = inst->left; goto Switchstmt; case END: /* Match! */ tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */ tlp->se.p[0].p2 = p; bnewmatch(&tlp->se); break; } } } Return: return sel.p[0].p1>=0; } void bnewmatch(Rangeset *sp) { int i; if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1)) for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */ sel.p[i].p1 = sp->p[i].p2; sel.p[i].p2 = sp->p[i].p1; } }