shithub: ham

ref: 9c0a6bb7ee2543794594f3a678f58aa4c702c011
dir: ham/sam/regexp.c

View raw version
#include "sam.h"

Rangeset	sel;
String		lastregexp;
/*
 * Machine Information
 */
typedef struct Inst Inst;

struct Inst
{
	long	type;	/* <= Runemax ==> literal, otherwise action */
	union {
		int rsid;
		int rsubid;
		int class;
		struct Inst *rother;
		struct Inst *rright;
	} r;
	union{
		struct Inst *lleft;
		struct Inst *lnext;
	} l;
};
#define	sid	r.rsid
#define	subid	r.rsubid
#define	rclass	r.class
#define	other	r.rother
#define	right	r.rright
#define	left	l.lleft
#define	next	l.lnext

#define	NPROG	1024
Inst	program[NPROG];
Inst	*progp;
Inst	*startinst;	/* First inst. of program; might not be program[0] */
Inst	*bstartinst;	/* same for backwards machine */

typedef struct Ilist Ilist;
struct Ilist
{
	Inst	*inst;		/* Instruction of the thread */
	Rangeset se;
	Posn	startp;		/* first char of match */
};

#define	NLIST	127

Ilist	*tl, *nl;		/* This list, next list */
Ilist	list[2][NLIST+1];	/* +1 for trailing null */
static	Rangeset sempty;

/*
 * Actions and Tokens
 *
 *	0x100xx are operators, value == precedence
 *	0x200xx are tokens, i.e. operands for operators
 */
enum {
	OPERATOR = Runemask+1,	/* Bitmask of all operators */
	START	= OPERATOR,	/* Start, used for marker on stack */
	RBRA,			/* Right bracket, ) */
	LBRA,			/* Left bracket, ( */
	OR,			/* Alternation, | */
	CAT,			/* Concatentation, implicit operator */
	STAR,			/* Closure, * */
	PLUS,			/* a+ == aa* */
	QUEST,			/* a? == a|nothing, i.e. 0 or 1 a's */

	ANY	= OPERATOR<<1,	/* Any character but newline, . */
	NOP,			/* No operation, internal use only */
	BOL,			/* Beginning of line, ^ */
	EOL,			/* End of line, $ */
	CCLASS,			/* Character class, [] */
	NCCLASS,		/* Negated character class, [^] */
	END,			/* Terminate: match found */

	ISATOR	= OPERATOR,
	ISAND	= OPERATOR<<1,
};

/*
 * Parser Information
 */
typedef struct Node Node;
struct Node
{
	Inst	*first;
	Inst	*last;
};

#define	NSTACK	20
Node	andstack[NSTACK];
Node	*andp;
int	atorstack[NSTACK];
int	*atorp;
int	lastwasand;	/* Last token was operand */
int	cursubid;
int	subidstack[NSTACK];
int	*subidp;
int	backwards;
int	nbra;
Rune	*exprp;		/* pointer to next character in source expression */
#define	DCLASS	10	/* allocation increment */
int	nclass;		/* number active */
int	Nclass;		/* high water mark */
Rune	**class;
int	negateclass;

int	addinst(Ilist *l, Inst *inst, Rangeset *sep);
void	newmatch(Rangeset*);
void	bnewmatch(Rangeset*);
void	pushand(Inst*, Inst*);
void	pushator(int);
Node	*popand(int);
int	popator(void);
void	startlex(Rune*);
int	lex(void);
void	operator(int);
void	operand(int);
void	evaluntil(int);
void	optimize(Inst*);
void	bldcclass(void);

void
regerror(Err e)
{
	Strzero(&lastregexp);
	error(e);
}

void
regerror_c(Err e, int c)
{
	Strzero(&lastregexp);
	error_c(e, c);
}

Inst *
newinst(int t)
{
	if(progp >= &program[NPROG])
		regerror(Etoolong);
	progp->type = t;
	progp->left = 0;
	progp->right = 0;
	return progp++;
}

Inst *
realcompile(Rune *s)
{
	int token;

	startlex(s);
	atorp = atorstack;
	andp = andstack;
	subidp = subidstack;
	cursubid = 0;
	lastwasand = FALSE;
	/* Start with a low priority operator to prime parser */
	pushator(START-1);
	while((token=lex()) != END){
		if((token&ISATOR) == OPERATOR)
			operator(token);
		else
			operand(token);
	}
	/* Close with a low priority operator */
	evaluntil(START);
	/* Force END */
	operand(END);
	evaluntil(START);
	if(nbra)
		regerror(Eleftpar);
	--andp;	/* points to first and only operand */
	return andp->first;
}

void
compile(String *s)
{
	int i;
	Inst *oprogp;

	if(Strcmp(s, &lastregexp)==0)
		return;
	for(i=0; i<nclass; i++)
		free(class[i]);
	nclass = 0;
	progp = program;
	backwards = FALSE;
	startinst = realcompile(s->s);
	optimize(program);
	oprogp = progp;
	backwards = TRUE;
	bstartinst = realcompile(s->s);
	optimize(oprogp);
	Strduplstr(&lastregexp, s);
}

void
operand(int t)
{
	Inst *i;
	if(lastwasand)
		operator(CAT);	/* catenate is implicit */
	i = newinst(t);
	if(t == CCLASS){
		if(negateclass)
			i->type = NCCLASS;	/* UGH */
		i->rclass = nclass-1;		/* UGH */
	}
	pushand(i, i);
	lastwasand = TRUE;
}

void
operator(int t)
{
	if(t==RBRA && --nbra<0)
		regerror(Erightpar);
	if(t==LBRA){
/*
 *		if(++cursubid >= NSUBEXP)
 *			regerror(Esubexp);
 */
		cursubid++;	/* silently ignored */
		nbra++;
		if(lastwasand)
			operator(CAT);
	}else
		evaluntil(t);
	if(t!=RBRA)
		pushator(t);
	lastwasand = FALSE;
	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
		lastwasand = TRUE;	/* these look like operands */
}

void
cant(char *s)
{
	char buf[100];

	sprint(buf, "regexp: can't happen: %s", s);
	panic(buf);
}

void
pushand(Inst *f, Inst *l)
{
	if(andp >= &andstack[NSTACK])
		cant("operand stack overflow");
	andp->first = f;
	andp->last = l;
	andp++;
}

void
pushator(int t)
{
	if(atorp >= &atorstack[NSTACK])
		cant("operator stack overflow");
	*atorp++=t;
	if(cursubid >= NSUBEXP)
		*subidp++= -1;
	else
		*subidp++=cursubid;
}

Node *
popand(int op)
{
	if(andp <= &andstack[0])
		if(op)
			regerror_c(Emissop, op);
		else
			regerror(Ebadregexp);
	return --andp;
}

int
popator(void)
{
	if(atorp <= &atorstack[0])
		cant("operator stack underflow");
	--subidp;
	return *--atorp;
}

void
evaluntil(int pri)
{
	Node *op1, *op2, *t;
	Inst *inst1, *inst2;

	while(pri==RBRA || atorp[-1]>=pri){
		switch(popator()){
		case LBRA:
			op1 = popand('(');
			inst2 = newinst(RBRA);
			inst2->subid = *subidp;
			op1->last->next = inst2;
			inst1 = newinst(LBRA);
			inst1->subid = *subidp;
			inst1->next = op1->first;
			pushand(inst1, inst2);
			return;		/* must have been RBRA */
		default:
			panic("unknown regexp operator");
			break;
		case OR:
			op2 = popand('|');
			op1 = popand('|');
			inst2 = newinst(NOP);
			op2->last->next = inst2;
			op1->last->next = inst2;
			inst1 = newinst(OR);
			inst1->right = op1->first;
			inst1->left = op2->first;
			pushand(inst1, inst2);
			break;
		case CAT:
			op2 = popand(0);
			op1 = popand(0);
			if(backwards && op2->first->type!=END)
				t = op1, op1 = op2, op2 = t;
			op1->last->next = op2->first;
			pushand(op1->first, op2->last);
			break;
		case STAR:
			op2 = popand('*');
			inst1 = newinst(OR);
			op2->last->next = inst1;
			inst1->right = op2->first;
			pushand(inst1, inst1);
			break;
		case PLUS:
			op2 = popand('+');
			inst1 = newinst(OR);
			op2->last->next = inst1;
			inst1->right = op2->first;
			pushand(op2->first, inst1);
			break;
		case QUEST:
			op2 = popand('?');
			inst1 = newinst(OR);
			inst2 = newinst(NOP);
			inst1->left = inst2;
			inst1->right = op2->first;
			op2->last->next = inst2;
			pushand(inst1, inst2);
			break;
		}
	}
}


void
optimize(Inst *start)
{
	Inst *inst, *target;

	for(inst=start; inst->type!=END; inst++){
		target = inst->next;
		while(target->type == NOP)
			target = target->next;
		inst->next = target;
	}
}

#ifdef	DEBUG
void
dumpstack(void){
	Node *stk;
	int *ip;

	dprint("operators\n");
	for(ip = atorstack; ip<atorp; ip++)
		dprint("0%o\n", *ip);
	dprint("operands\n");
	for(stk = andstack; stk<andp; stk++)
		dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
}
void
dump(void){
	Inst *l;

	l = program;
	do{
		dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
			l->left-program, l->right-program);
	}while(l++->type);
}
#endif

void
startlex(Rune *s)
{
	exprp = s;
	nbra = 0;
}


int
lex(void){
	int c= *exprp++;

	switch(c){
	case '\\':
		if(*exprp)
			if((c= *exprp++)=='n')
				c='\n';
		break;
	case 0:
		c = END;
		--exprp;	/* In case we come here again */
		break;
	case '*':
		c = STAR;
		break;
	case '?':
		c = QUEST;
		break;
	case '+':
		c = PLUS;
		break;
	case '|':
		c = OR;
		break;
	case '.':
		c = ANY;
		break;
	case '(':
		c = LBRA;
		break;
	case ')':
		c = RBRA;
		break;
	case '^':
		c = BOL;
		break;
	case '$':
		c = EOL;
		break;
	case '[':
		c = CCLASS;
		bldcclass();
		break;
	}
	return c;
}

long
nextrec(void){
	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
		regerror(Ebadclass);
	if(exprp[0] == '\\'){
		exprp++;
		if(*exprp=='n'){
			exprp++;
			return '\n';
		}
		return *exprp++|(Runemask+1);
	}
	return *exprp++;
}

void
bldcclass(void)
{
	long c1, c2, n, na;
	Rune *classp;

	classp = emalloc(DCLASS*RUNESIZE);
	n = 0;
	na = DCLASS;
	/* we have already seen the '[' */
	if(*exprp == '^'){
		classp[n++] = '\n';	/* don't match newline in negate case */
		negateclass = TRUE;
		exprp++;
	}else
		negateclass = FALSE;
	while((c1 = nextrec()) != ']'){
		if(c1 == '-'){
    Error:
			free(classp);
			regerror(Ebadclass);
		}
		if(n+4 >= na){		/* 3 runes plus NUL */
			na += DCLASS;
			classp = erealloc(classp, na*RUNESIZE);
		}
		if(*exprp == '-'){
			exprp++;	/* eat '-' */
			if((c2 = nextrec()) == ']')
				goto Error;
			classp[n+0] = Runemax;
			classp[n+1] = c1 & Runemask;
			classp[n+2] = c2 & Runemask;
			n += 3;
		}else
			classp[n++] = c1 & Runemask;
	}
	classp[n] = 0;
	if(nclass == Nclass){
		Nclass += DCLASS;
		class = erealloc(class, Nclass*sizeof(Rune*));
	}
	class[nclass++] = classp;
}

int
classmatch(int classno, int c, int negate)
{
	Rune *p;

	p = class[classno];
	while(*p){
		if(*p == Runemax){
			if(p[1]<=c && c<=p[2])
				return !negate;
			p += 3;
		}else if(*p++ == c)
			return !negate;
	}
	return negate;
}

/*
 * Note optimization in addinst:
 * 	*l must be pending when addinst called; if *l has been looked
 *		at already, the optimization is a bug.
 */
int
addinst(Ilist *l, Inst *inst, Rangeset *sep)
{
	Ilist *p;

	for(p = l; p->inst; p++){
		if(p->inst==inst){
			if((sep)->p[0].p1 < p->se.p[0].p1)
				p->se= *sep;	/* this would be bug */
			return 0;	/* It's already there */
		}
	}
	p->inst = inst;
	p->se= *sep;
	(p+1)->inst = 0;
	return 1;
}

int
execute(File *f, Posn startp, Posn eof)
{
	int flag = 0;
	Inst *inst;
	Ilist *tlp;
	Posn p = startp;
	int nnl = 0, ntl;
	int c;
	int wrapped = 0;
	int startchar = startinst->type<OPERATOR? startinst->type : 0;

	list[0][0].inst = list[1][0].inst = 0;
	sel.p[0].p1 = -1;
	/* Execute machine once for each character */
	for(;;p++){
	doloop:
		c = filereadc(f, p);
		if(p>=eof || c<0){
			switch(wrapped++){
			case 0:		/* let loop run one more click */
			case 2:
				break;
			case 1:		/* expired; wrap to beginning */
				if(sel.p[0].p1>=0 || eof!=INFINITY)
					goto Return;
				list[0][0].inst = list[1][0].inst = 0;
				p = 0;
				goto doloop;
			default:
				goto Return;
			}
		}else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
			break;
		/* fast check for first char */
		if(startchar && nnl==0 && c!=startchar)
			continue;
		tl = list[flag];
		nl = list[flag^=1];
		nl->inst = 0;
		ntl = nnl;
		nnl = 0;
		if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
			/* Add first instruction to this list */
			sempty.p[0].p1 = p;
			if(addinst(tl, startinst, &sempty))
			if(++ntl >= NLIST)
	Overflow:
				error(Eoverflow);
		}
		/* Execute machine until this list is empty */
		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
	Switchstmt:
			switch(inst->type){
			default:	/* regular character */
				if(inst->type==c){
	Addinst:
					if(addinst(nl, inst->next, &tlp->se))
					if(++nnl >= NLIST)
						goto Overflow;
				}
				break;
			case LBRA:
				if(inst->subid>=0)
					tlp->se.p[inst->subid].p1 = p;
				inst = inst->next;
				goto Switchstmt;
			case RBRA:
				if(inst->subid>=0)
					tlp->se.p[inst->subid].p2 = p;
				inst = inst->next;
				goto Switchstmt;
			case ANY:
				if(c!='\n')
					goto Addinst;
				break;
			case BOL:
				if(p==0 || filereadc(f, p - 1)=='\n'){
	Step:
					inst = inst->next;
					goto Switchstmt;
				}
				break;
			case EOL:
				if(c == '\n')
					goto Step;
				break;
			case CCLASS:
				if(c>=0 && classmatch(inst->rclass, c, 0))
					goto Addinst;
				break;
			case NCCLASS:
				if(c>=0 && classmatch(inst->rclass, c, 1))
					goto Addinst;
				break;
			case OR:
				/* evaluate right choice later */
				if(addinst(tlp, inst->right, &tlp->se))
				if(++ntl >= NLIST)
					goto Overflow;
				/* efficiency: advance and re-evaluate */
				inst = inst->left;
				goto Switchstmt;
			case END:	/* Match! */
				tlp->se.p[0].p2 = p;
				newmatch(&tlp->se);
				break;
			}
		}
	}
    Return:
	return sel.p[0].p1>=0;
}

void
newmatch(Rangeset *sp)
{
	int i;

	if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
	   (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
		for(i = 0; i<NSUBEXP; i++)
			sel.p[i] = sp->p[i];
}

int
bexecute(File *f, Posn startp)
{
	int flag = 0;
	Inst *inst;
	Ilist *tlp;
	Posn p = startp;
	int nnl = 0, ntl;
	int c;
	int wrapped = 0;
	int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;

	list[0][0].inst = list[1][0].inst = 0;
	sel.p[0].p1= -1;
	/* Execute machine once for each character, including terminal NUL */
	for(;;--p){
	doloop:
		if((c = filereadc(f, p - 1))==-1){
			switch(wrapped++){
			case 0:		/* let loop run one more click */
			case 2:
				break;
			case 1:		/* expired; wrap to end */
				if(sel.p[0].p1>=0)
			case 3:
					goto Return;
				list[0][0].inst = list[1][0].inst = 0;
				p = f->nc;
				goto doloop;
			default:
				goto Return;
			}
		}else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
			break;
		/* fast check for first char */
		if(startchar && nnl==0 && c!=startchar)
			continue;
		tl = list[flag];
		nl = list[flag^=1];
		nl->inst = 0;
		ntl = nnl;
		nnl = 0;
		if(sel.p[0].p1<0 && (!wrapped || p>startp)){
			/* Add first instruction to this list */
			/* the minus is so the optimizations in addinst work */
			sempty.p[0].p1 = -p;
			if(addinst(tl, bstartinst, &sempty))
			if(++ntl >= NLIST)
	Overflow:
				error(Eoverflow);
		}
		/* Execute machine until this list is empty */
		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
	Switchstmt:
			switch(inst->type){
			default:	/* regular character */
				if(inst->type == c){
	Addinst:
					if(addinst(nl, inst->next, &tlp->se))
					if(++nnl >= NLIST)
						goto Overflow;
				}
				break;
			case LBRA:
				if(inst->subid>=0)
					tlp->se.p[inst->subid].p1 = p;
				inst = inst->next;
				goto Switchstmt;
			case RBRA:
				if(inst->subid >= 0)
					tlp->se.p[inst->subid].p2 = p;
				inst = inst->next;
				goto Switchstmt;
			case ANY:
				if(c != '\n')
					goto Addinst;
				break;
			case BOL:
				if(c=='\n' || p==0){
	Step:
					inst = inst->next;
					goto Switchstmt;
				}
				break;
			case EOL:
				if(p==f->nc || filereadc(f, p)=='\n')
					goto Step;
				break;
			case CCLASS:
				if(c>=0 && classmatch(inst->rclass, c, 0))
					goto Addinst;
				break;
			case NCCLASS:
				if(c>=0 && classmatch(inst->rclass, c, 1))
					goto Addinst;
				break;
			case OR:
				/* evaluate right choice later */
				if(addinst(tl, inst->right, &tlp->se))
				if(++ntl >= NLIST)
					goto Overflow;
				/* efficiency: advance and re-evaluate */
				inst = inst->left;
				goto Switchstmt;
			case END:	/* Match! */
				tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
				tlp->se.p[0].p2 = p;
				bnewmatch(&tlp->se);
				break;
			}
		}
	}
    Return:
	return sel.p[0].p1>=0;
}

void
bnewmatch(Rangeset *sp)
{
        int  i;
        if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
                for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
                        sel.p[i].p1 = sp->p[i].p2;
                        sel.p[i].p2 = sp->p[i].p1;
                }
}