shithub: purgatorio

ref: e047274e8dedebb9588a82e48e6cd9d7e50cae79
dir: /appl/lib/regex.b/

View raw version
implement Regex;

include "regex.m";

# syntax

# RE	ALT		regular expression
#	NUL
# ALT	CAT		alternation
# 	CAT | ALT
#
# CAT	DUP		catenation
# 	DUP CAT
#
# DUP	PRIM		possibly duplicated primary
# 	PCLO
# 	CLO
# 	OPT
#
# PCLO	PRIM +		1 or more
# CLO	PRIM *		0 or more
# OPT	PRIM ?		0 or 1
#
# PRIM	( RE )
#	()
# 	DOT		any character
# 	CHAR		a single character
#	ESC		escape sequence
# 	[ SET ]		character set
# 	NUL		null string
# 	HAT		beginning of string
# 	DOL		end of string
#

NIL : con -1;		# a refRex constant
NONE: con -2;		# ditto, for an un-set value
BAD: con 1<<16;		# a non-character 
HUGE: con (1<<31) - 1;

# the data structures of re.m would like to be ref-linked, but are
# circular (see fn walk), thus instead of pointers we use indexes
# into an array (arena) of nodes of the syntax tree of a regular expression.
# from a storage-allocation standpoint, this replaces many small
# allocations of one size with one big one of variable size.

ReStr: adt {
	s : string;
	i : int;	# cursor postion
	n : int;	# number of chars left; -1 on error
	peek : fn(s: self ref ReStr): int;
	next : fn(s: self ref ReStr): int;
};

ReStr.peek(s: self ref ReStr): int
{
	if(s.n <= 0)
		return BAD;
	return s.s[s.i];
}

ReStr.next(s: self ref ReStr): int
{
	if(s.n <= 0)
		return BAD;
	s.n--;
	return s.s[s.i++];
}

newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int): refRex
{
	ar.rex[ar.ptr] = Rex(kind, left, right, set, pno);
	return ar.ptr++;
}

# parse a regex by recursive descent to get a syntax tree

re(s: ref ReStr, ar: ref Arena): refRex
{
	left := cat(s, ar);
	if(left==NIL || s.peek()!='|')
		return left;
	s.next();
	right := re(s, ar);
	if(right == NIL)
		return NIL;
	return newRe(ALT, left, right, nil, ar, 0);
}

cat(s: ref ReStr, ar: ref Arena): refRex
{
	left := dup(s, ar);
	if(left == NIL)
		return left;
	right := cat(s, ar);
	if(right == NIL)
		return left;
	return newRe(CAT, left, right, nil, ar, 0);
}

dup(s: ref ReStr, ar: ref Arena): refRex
{
	case s.peek() {
	BAD or ')' or ']' or '|' or '?' or '*' or '+' =>
		return NIL;
	}
	prim: refRex;
	case kind:=s.next() {
	'(' =>	if(ar.pno < 0) {
			if(s.peek() == ')') {
				s.next();
				prim = newRe(NUL, NONE, NONE, nil, ar, 0);
			} else {
				prim = re(s, ar);
				if(prim==NIL || s.next()!=')')
					s.n = -1;
			}
		} else {
			pno := ++ar.pno;
			lp := newRe(LPN, NONE, NONE, nil, ar, pno);
			rp := newRe(RPN, NONE, NONE, nil, ar, pno);
			if(s.peek() == ')') {
				s.next();
				prim = newRe(CAT, lp, rp, nil, ar, 0);
				
			} else {
				prim = re(s, ar);
				if(prim==NIL || s.next()!=')')
					s.n = -1;
				else {
					prim = newRe(CAT, prim, rp, nil, ar, 0);
					prim = newRe(CAT, lp, prim, nil, ar, 0);
				}
			}
		}
	'[' =>	prim = newRe(SET, NONE, NONE, newSet(s), ar, 0);
	* =>	case kind {
		'.' =>	kind = DOT;
		'^' =>	kind = HAT;
		'$' =>	kind = DOL;
		}
		prim = newRe(esc(s, kind), NONE, NONE, nil, ar, 0);
	}
	case s.peek() {
	'*' =>	kind = CLO;
	'+' =>	kind = PCLO;
	'?' =>	kind = OPT;
	* =>	return prim;
	}
	s.next();
	return newRe(kind, prim, NONE, nil, ar, 0);
}

esc(s: ref ReStr, char: int): int
{
	if(char == '\\') {
		char = s.next();
		case char {
		BAD =>	s.n = -1;
		'n' =>	char = '\n';
		}
	}
	return char;
}

# walk the tree adjusting pointers to refer to 
# next state of the finite state machine

walk(r: refRex, succ: refRex, ar: ref Arena)
{
	if(r==NONE)
		return;
	rex := ar.rex[r];
	case rex.kind {
	ALT =>	walk(rex.left, succ, ar);
		walk(rex.right, succ, ar);
		return;
	CAT =>	walk(rex.left, rex.right, ar);
		walk(rex.right, succ, ar);
		ar.rex[r] = ar.rex[rex.left];	# optimization
		return;
	CLO or PCLO =>
		end := newRe(OPT, r, succ, nil, ar, 0); # here's the circularity
		walk(rex.left, end, ar);
	OPT =>	walk(rex.left, succ, ar);
	}
	ar.rex[r].right = succ;
}

compile(e: string, flag: int): (Re, string)
{
	if(e == nil)
		return (nil, "missing expression");	
	s := ref ReStr(e, 0, len e);
	ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1);
	start := ar.start = re(s, ar);
	if(start==NIL || s.n!=0)
		return (nil, "invalid regular expression");
	walk(start, NIL, ar);
	if(ar.pno < 0)
		ar.pno = 0;
	return (ar, nil);
}

# todo1, todo2: queues for epsilon and advancing transitions
Gaz: adt {
	pno: int;
	beg: int;
	end: int;
};
Trace: adt {
	cre: refRex;		# cursor in Re
	beg: int;		# where this trace began;
	gaz: list of Gaz;
};
Queue: adt {
	ptr: int;
	q: array of Trace;
};

execute(re: Re, s: string): array of (int, int)
{
	return executese(re, s, (-1,-1), 1, 1);
}

executese(re: Re, s: string, range: (int, int), bol: int, eol: int): array of (int,int)
{
	if(re==nil)
		return nil;
	(s0, s1) := range;
	if(s0 < 0)
		s0 = 0;
	if(s1 < 0)
		s1 = len s;
	gaz : list of Gaz;
	(beg, end) := (-1, -1);
	todo1 := ref Queue(0, array[re.ptr] of Trace);
	todo2 := ref Queue(0, array[re.ptr] of Trace);
	for(i:=s0; i<=s1; i++) {
		small2 := HUGE;		# earliest possible match if advance
		if(beg == -1)		# no leftmost match yet
			todo1.q[todo1.ptr++] = Trace(re.start, i, nil);
		for(k:=0; k<todo1.ptr; k++) {
			q := todo1.q[k];
			rex := re.rex[q.cre];
			next1 := next2 := NONE;
			case rex.kind {
			NUL =>
				next1 = rex.right;
			DOT =>
				if(i<len s && s[i]!='\n')
					next2 = rex.right;
			HAT =>
				if(i == s0 && bol)
					next1 = rex.right;
			DOL =>
				if(i == s1 && eol)
					next1 = rex.right;
			SET =>
				if(i<len s && member(s[i], rex.set))
					next2 = rex.right;
			CAT or
			PCLO =>
				next1 = rex.left;
			ALT or 
			CLO or 
			OPT =>
				next1 = rex.right;
				k = insert(rex.left, q.beg, q.gaz, todo1, k);
			LPN =>
				next1 = rex.right;
				q.gaz = Gaz(rex.pno,i,-1)::q.gaz;
			RPN =>
				next1 = rex.right;
				for(r:=q.gaz; ; r=tl r) {
					(pno,beg1,end1) := hd r;
					if(rex.pno==pno && end1==-1) {
						q.gaz = Gaz(pno,beg1,i)::q.gaz;
						break;
					}
				}
			* =>
				if(i<len s && rex.kind==s[i])
					next2 = rex.right;
			}
			if(next1 != NONE) {
				if(next1 != NIL)
					k =insert(next1, q.beg, q.gaz, todo1, k);
				else if(better(q.beg, i, beg, end))
					(gaz, beg, end) = (q.gaz, q.beg, i);
			}
			if(next2 != NONE) {
				if(next2 != NIL) {
					if(q.beg < small2)
						small2 = q.beg;
					insert(next2, q.beg, q.gaz, todo2, 0);
				 } else if(better(q.beg, i+1, beg, end))
					(gaz, beg, end) = (q.gaz, q.beg, i+1);
			}
			
		}
		if(beg!=-1 && beg<small2)	# nothing better possible
			break;
		(todo1,todo2) = (todo2, todo1);
		todo2.ptr = 0;
	}
	if(beg == -1)
		return nil;
	result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) };
	for( ; gaz!=nil; gaz=tl gaz) {
		(pno, beg1, end1) := hd gaz;
		(rbeg, nil) := result[pno];
		if(rbeg==-1 && (beg1|end1)!=-1)
			result[pno] = (beg1,end1);
	}
	return result;
}

better(newbeg, newend, oldbeg, oldend: int): int
{
	return oldbeg==-1 || newbeg<oldbeg ||
	       newbeg==oldbeg && newend>oldend;
}

insert(next: refRex, tbeg: int, tgaz: list of Gaz, todo: ref Queue, k: int): int
{
	for(j:=0; j<todo.ptr; j++)
		if(todo.q[j].cre == next)
			if(todo.q[j].beg <= tbeg)
			 	return k;
			else
				break;
	if(j < k)
		k--;
	if(j < todo.ptr)
		todo.ptr--;
	for( ; j<todo.ptr; j++)
		todo.q[j] = todo.q[j+1];
	todo.q[todo.ptr++] = Trace(next, tbeg, tgaz);
	return k;
}

ASCII : con 128;
WORD : con 32;

member(char: int, set: ref Set): int
{
	if(char < 128)
		return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg;
	for(l:=set.unicode; l!=nil; l=tl l) {
		(beg, end) := hd l;
		if(char>=beg && char<=end)
			return !set.neg;
	}
	return set.neg;
}

newSet(s: ref ReStr): ref Set
{
	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil);
	if(s.peek() == '^') {
		set.neg = 1;
		s.next();
	}
	while(s.n > 0) {
		char1 := s.next();
		if(char1 == ']')
			return set;
		char1 = esc(s, char1);
		char2 := char1;
		if(s.peek() == '-') {
			s.next();
			char2 = s.next();
			if(char2 == ']')
				break;
			char2 = esc(s, char2);
			if(char2 < char1)
				break;
		}
		for( ; char1<=char2; char1++)
			if(char1 < ASCII)
				set.ascii[char1/WORD] |= 1<<char1%WORD;
			else {
				set.unicode = (char1,char2)::set.unicode;
				break;
			}
	}
	s.n = -1;
	return nil;
}