shithub: purgatorio

ref: a411870ee4640241e3c494367d922847da84f972
dir: purgatorio/appl/lib/ecmascript/regexp.b

View raw version
strhas(s: string, c: int): ref Val
{
	for(i := 0; i < len s; i++)
		if(s[i] == c)
			return true;
	return false;
}

rsplit(r: string): (string, string)
{
	esc := 0;
	i := 1;	# skip '/'
	for(;;){
		c := r[i++];
		if(!esc && c == '/')
			break;
		esc = !esc && c == '\\';
	}
	return (r[1: i-1], r[i: ]);
}
		
badflags(f: string): int
{
	g := i := m := 0;
	for(j := 0; j < len f; j++){
		case(f[j]){
			'g' =>
				g++;
			'i' =>
				i++;
			'm' =>
				m++;
			* =>
				return 1;
		}
	}
	return g > 1 || i > 1 || m > 1;
}

regexpvals(ex: ref Exec, v: ref Val, o: ref Ecmascript->Obj): (string, string, int)
{
	if(v != nil){
		if(v.ty == TRegExp)
			return (v.rev.p, v.rev.f, v.rev.i);
		o = v.obj;
	}
	p := toString(ex, esget(ex, o, "source", 0));
	f := "";
	if(toBoolean(ex, esget(ex, o, "global", 0)) == true)
		f += "g";
	if(toBoolean(ex, esget(ex, o, "ignoreCase", 0)) == true)
		f += "i";
	if(toBoolean(ex, esget(ex, o, "multiline", 0)) == true)
		f += "m";
	i := toInt32(ex, esget(ex, o, "lastIndex", 0));
	return (p, f, i);
}

nregexp(ex: ref Exec, nil: ref Ecmascript->Obj, args: array of ref Val): ref Ecmascript->Obj
{
	pat := biarg(args, 0);
	flags := biarg(args, 1);
	(p, f) := ("", "");
	if(isregexp(pat)){
		if(flags == undefined)
			(p, f, nil) = regexpvals(ex, pat, nil);
		else
			runtime(ex, TypeError, "flags defined");
	}
	else{
		if(pat == undefined)
			p = "";
		else
			p = toString(ex, pat);
		if(flags == undefined)
			f = "";
		else
			f = toString(ex, flags);
	}
	o := nobj(ex, nil, array[] of { regexpval(p, f, 0) });
	if(badflags(f))
		runtime(ex, SyntaxError, "bad regexp flags");
	regex = ex;
	(re, err) := compile(p, 1);
	if(re == nil || err != nil)
		runtime(ex, SyntaxError, "bad regexp pattern");
	o.re = re;
	return o;
}

cregexp(ex: ref Exec, f, nil: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	pat := biarg(args, 0);
	flags := biarg(args, 1);
	if(isregexp(pat) && flags == undefined)
		return pat;
	return objval(nregexp(ex, f, args));
}

cregexpprotoexec(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	m: array of (int, int);

	regexpcheck(ex, this, f);
	s := toString(ex, biarg(args, 0));
	l := len s;
	i := toInt32(ex, esget(ex, this, "lastIndex", 0));
	e := 0;
	glob := esget(ex, this, "global", 0);
	multiline := esget(ex, this, "multiline", 0);
	ignorecase := esget(ex, this, "ignoreCase", 0);
	if(glob == false)
		i = 0;
	for(;;){
		if(i < 0 || i >= l){
			esput(ex, this, "lastIndex", numval(real 0), 0);
			return null;
		}
		regex = ex;
		m = executese(this.re, s, (i, len s), i == 0, 1, multiline == true, ignorecase == true);
		if(m != nil)
			break;
		i++;
		i = -1;	# no need to loop with executese
	}
	(i, e) = m[0];
	if(glob == true)
		esput(ex, this, "lastIndex", numval(real e), 0);
	n := len m;
	av := array[n] of ref Val;
	for(j := 0; j < n; j++){
		(a, b) := m[j];
		if(a < 0)
			av[j] = undefined;
		else
			av[j] = strval(s[a: b]);
	}
	a := narray(ex, nil, av);
	esput(ex, a, "index", numval(real i), 0);
	esput(ex, a, "input", strval(s), 0);
	return objval(a);
}

cregexpprototest(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	regexpcheck(ex, this, f);
	v := cregexpprotoexec(ex, f, this, args);
	if(!isnull(v))
		return true;
	return false;
}

cregexpprototoString(ex: ref Exec, f, this: ref Ecmascript->Obj, nil: array of ref Val): ref Val
{
	regexpcheck(ex, this, f);
	(p, fl, nil) := regexpvals(ex, nil, this);
	return strval("/" + p + "/" + fl);
}

regexpcheck(ex: ref Exec, o: ref Ecmascript->Obj, f: ref Obj)
{
	if(f == nil)
		s := "exec";
	else
		s = f.val.str;
	if(!isregexpobj(o))
		runtime(ex, TypeError, "RegExp.prototype." + s + " called on non-RegExp object");
}

cstrprotomatch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	v := biarg(args, 0);
	if(!isregexp(v))
		re := nregexp(ex, nil, args);
	else if(v.ty == TObj)
		re = v.obj;
	else
		re = nobj(ex, nil, args);
	s := toString(ex, this.val);
	glob := esget(ex, re, "global", 0);
	av := array[1] of ref Val;
	av[0] = strval(s);
	if(glob == false)
		return cregexpprotoexec(ex, nil, re, av);
	li := 0;
	esput(ex, re, "lastIndex", numval(real li), 0);
	ms: list of ref Val;
	for(;;){
		v = cregexpprotoexec(ex, nil, re, av);
		if(isnull(v))
			break;
		ms = esget(ex, v.obj, "0", 0) :: ms;
		ni := int toUint32(ex, esget(ex, re, "lastIndex", 0));
		if(ni == li)
			esput(ex, re, "lastIndex", numval(real ++li), 0);
		else
			li = ni;
	}	
	n := len ms;
	av = array[n] of ref Val;
	for(j := n-1; j >= 0; j--){
		av[j] = hd ms;
		ms = tl ms;
	}
	return objval(narray(ex, nil, av));
}

cstrprotoreplace(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	re: ref Ecmascript->Obj;

	v := biarg(args, 0);
	rege := isregexp(v);
	if(!rege){
		if(args == nil)
			re = nregexp(ex, nil, args);
		else
			re = nregexp(ex, nil, args[0:1]);
	}
	else if(v.ty == TObj)
		re = v.obj;
	else
		re = nobj(ex, nil, args);
	s := toString(ex, this.val);
	if(rege)
		glob := esget(ex, re, "global", 0);
	else
		glob = false;
	av := array[1] of ref Val;
	av[0] = strval(s);
	ms: list of ref Val;
	li := 0;
	if(glob == true)
		esput(ex, re, "lastIndex", numval(real li), 0);
	for(;;){
		v = cregexpprotoexec(ex, nil, re, av);
		if(!isnull(v))
			ms = v :: ms;
		if(isnull(v) || glob == false)
			break;
		ni := int toUint32(ex, esget(ex, re, "lastIndex", 0));
		if(ni == li)
			esput(ex, re, "lastIndex", numval(real ++li), 0);
		else
			li = ni;
	}
	if(ms == nil)
		return strval(s);
	ms = rev(ms);
	if(rege)
		lcp := int toUint32(ex, esget(ex, (hd ms).obj, "length", 0))-1;
	else
		lcp = 0;
	v = biarg(args, 1);
	if(isobj(v) && isfuncobj(v.obj)){
		ns := s;
		n := len ms;
		args = array[lcp+3] of ref Val;
		o := inc := 0;
		for(i := 0; i < n; i++){
			a := (hd ms).obj;
			ms = tl ms;
			for(j := 0; j <= lcp; j++)
				args[j] = esget(ex, a, string j, 0);
			ss := toString(ex, args[0]);
			o = offset(ss, s, o);
			args[lcp+1] = numval(real o);
			args[lcp+2] = strval(s);
			rs := toString(ex, getValue(ex, escall(ex, v.obj, nil, args, 0)));
			ns = repl(ns, o+inc, o+inc+len ss, rs);
			o += len ss;
			inc += len rs - len ss;
		}
		return strval(ns);
	}
	else{
		ps := toString(ex, v);
		lps := len ps;
		ns := s;
		n := len ms;
		o := inc := 0;
		for(i := 0; i < n; i++){
			a := (hd ms).obj;
			ms = tl ms;
			ss := toString(ex, esget(ex, a, "0", 0));
			o = offset(ss, s, o);
			rs := "";
			for(j := 0; j < lps; j++){
				if(ps[j] == '$' && j < lps-1){
					j++;
					case(c := ps[j]){
						'$' =>
							rs += "$";
						'&' =>
							rs += ss;
						'`' =>
							rs += s[0: o];
						''' =>
							rs += s[o+len ss: ];
						'0' to '9' =>
							if(j < lps-1 && isdigit(ps[j+1]))
								c = 10*(c-'0')+ps[++j]-'0';
							else
								c = c-'0';
							if(c >= 1 && c <= lcp)
								rs += toString(ex, esget(ex, a, string c, 0));
					}
				}
				else
					rs += ps[j: j+1];
			}
			ns = repl(ns, o+inc, o+inc+len ss, rs);
			o += len ss;
			inc += len rs - len ss;
		}
		return strval(ns);
	}
}

cstrprotosearch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	v := biarg(args, 0);
	if(!isregexp(v))
		re := nregexp(ex, nil, args);
	else if(v.ty == TObj)
		re = v.obj;
	else
		re = nobj(ex, nil, args);
	s := toString(ex, this.val);
	glob := esget(ex, re, "global", 0);
	esput(ex, re, "global", false, 0);
	av := array[1] of ref Val;
	av[0] = strval(s);
	v = cregexpprotoexec(ex, nil, re, av);
	if(isnull(v))
		r := -1;
	else{
		ss := toString(ex, esget(ex, v.obj, "0", 0));
		r = offset(ss, s, 0);
	}
	esput(ex, re, "global", glob, 0);
	return numval(real r);
}

offset(ss: string, s: string, m: int): int
{
	nn := len ss;
	n := len s;
	for(i := m; i <= n-nn; i++){
		if(s[i: i+nn] == ss)
			return i;
	}
	return -1;
}

repl(s: string, a: int, b: int, ns: string): string
{
	return s[0: a] + ns + s[b: ];
}

rev(ls: list of ref Val): list of ref Val
{
	ns: list of ref Val;

	for( ; ls != nil; ls = tl ls)
		ns = hd ls :: ns;
	return ns;
}

#########################################################################
# regex.b originally

# normally imported identifiers

# internal identifiers, not normally imported

ALT, CAT, DOT, SET, HAT, DOL, NUL, PCLO, CLO, OPT, LPN, RPN, LPN0, RPN0, LPN1, RPN1, LPN2, RPN2, BEET, BEEF, MNCLO, LCP, IDLE: con (1<<16)+iota;

# syntax

# RE	ALT		regular expression
#	NUL
# ALT	CAT		alternation
# 	CAT | ALT
#
# CAT	DUP		catenation
# 	DUP CAT
#
# DUP	PRIM		possibly duplicated primary
# 	PCLO
# 	CLO
# 	OPT
#
# PCLO	PRIM +		1 or more
# CLO	PRIM *		0 or more
# OPT	PRIM ?		0 or 1
#
# PRIM	( RE )
#	()
# 	DOT		any character
# 	CHAR		a single character
#	ESC		escape sequence
# 	[ SET ]		character set
# 	NUL		null string
# 	HAT		beginning of string
# 	DOL		end of string
#

regex: ref Exec;

NIL : con -1;		# a refRex constant
NONE: con -2;		# ditto, for an un-set value
BAD: con 1<<16;		# a non-character 
HUGE: con (1<<31) - 1;

# the data structures of re.m would like to be ref-linked, but are
# circular (see fn walk), thus instead of pointers we use indexes
# into an array (arena) of nodes of the syntax tree of a regular expression.
# from a storage-allocation standpoint, this replaces many small
# allocations of one size with one big one of variable size.

ReStr: adt {
	s : string;
	i : int;	# cursor postion
	n : int;	# number of chars left; -1 on error
	peek : fn(s: self ref ReStr): int;
	next : fn(s: self ref ReStr): int;
	unput: fn(s: self ref ReStr);
};

ReStr.peek(s: self ref ReStr): int
{
	if(s.n <= 0)
		return BAD;
	return s.s[s.i];
}

ReStr.next(s: self ref ReStr): int
{
	if(s.n <= 0)
		syntax("bad regular expression");
	s.n--;
	return s.s[s.i++];
}

ReStr.unput(s: self ref ReStr)
{
	s.n++;
	s.i--;
}

newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int, greedy: int): refRex
{
	ar.rex[ar.ptr] = Rex(kind, left, right, set, pno, greedy, nil);
	return ar.ptr++;
}

# parse a regex by recursive descent to get a syntax tree

re(s: ref ReStr, ar: ref Arena): refRex
{
	left := cat(s, ar);
	if(left==NIL || s.peek()!='|')
		return left;
	s.next();
	right := re(s, ar);
	if(right == NIL)
		return NIL;
	return newRe(ALT, left, right, nil, ar, 0, 0);
}

cat(s: ref ReStr, ar: ref Arena): refRex
{
	left := dup(s, ar);
	if(left == NIL)
		return left;
	right := cat(s, ar);
	if(right == NIL)
		return left;
	return newRe(CAT, left, right, nil, ar, 0, 0);
}

dup(s: ref ReStr, ar: ref Arena): refRex
{
	n1, n2: int;

	case s.peek() {
	BAD or ')' or ']' or '|' or '?' or '*' or '+' =>
		return NIL;
	}
	prim: refRex;
	case kind:=s.next() {
	'(' =>	if(ar.pno < 0) {
			if(s.peek() == ')') {
				s.next();
				prim = newRe(NUL, NONE, NONE, nil, ar, 0, 0);
			} else {
				prim = re(s, ar);
				if(prim==NIL || s.next()!=')')
					syntax("( with no )");
			}
		} else {
			pno := ++ar.pno;
			lp := newRe(LPN, NONE, NONE, nil, ar, pno, 0);
			rp := newRe(RPN, NONE, NONE, nil, ar, pno, 0);
			if(s.peek() == ')') {
				s.next();
				prim = newRe(CAT, lp, rp, nil, ar, 0, 0);
			} else {
				if(s.peek() == '?'){
					s.next();
					case s.next(){
						':' => ar.rex[lp].kind = LPN0;
							ar.rex[rp].kind = RPN0;
						'=' => ar.rex[lp].kind = LPN1;
							ar.rex[rp].kind = RPN1;
						'!' => ar.rex[lp].kind = LPN2;
							ar.rex[rp].kind = RPN2;
						* => syntax("bad char after ?");
					}
				}
				prim = re(s, ar);
				if(prim==NIL || s.next()!=')')
					syntax("( with no )");
				else {
					prim = newRe(CAT, prim, rp, nil, ar, 0, 0);
					prim = newRe(CAT, lp, prim, nil, ar, 0, 0);
				}
			}
		}
	'[' =>	prim = newRe(SET, NONE, NONE, newSet(s), ar, 0, 0);
	* =>	case kind {
		'.' =>		kind = DOT;
		'^' =>	kind = HAT;
		'$' =>	kind = DOL;
		}
		(c, set, op) := esc(s, kind, 0);
		if(set != nil)
			prim = newRe(SET, NONE, NONE, set, ar, 0, 0);
		else if(op == LCP){
			if(c > ar.pno)
				syntax("\num too big");
			prim = newRe(LCP, NONE, NONE, nil, ar, 0, 0);
			ar.rex[prim].ns = ref Nstate(c, c);
		}
		else
			prim = newRe(c, NONE, NONE, nil, ar, 0, 0);
	}
	case s.peek() {
	'*' =>	kind = CLO;
	'+' =>	kind = PCLO;
	'?' =>	kind = OPT;
	'{' =>	s.next();
		(n1, n2) = drange(s);
		kind = MNCLO;
		if(s.peek() != '}')
			syntax("{ with no }");
	* =>	return prim;
	}
	s.next();
	greedy := 1;
	if(s.peek() == '?'){
		# non-greedy op
		greedy = 0;
		s.next();
	}
	prim = newRe(kind, prim, NONE, nil, ar, 0, greedy);
	if(kind == MNCLO)
		ns := ar.rex[prim].ns = ref Nstate(n1, n2);
	return prim;
}

esc(s: ref ReStr, char: int, inset: int): (int, ref Set, int)
{
	set: ref Set;

	op := 0;
	if(char == '\\') {
		char = s.next();
		case char {
		'b' =>
				if(inset)
					char = '\b';
				else
					char = BEET;
		'B' =>	if(inset)
					syntax("\\B in set");
				else
					char = BEEF;
		'f' =>		char = '\u000c';
		'n' =>	char = '\n';
		'r' =>		char = '\r';
		't' =>		char = '\t';
		'v' =>	char = '\v';
		'0' to '9' =>
				s.unput();
				char = digits(s);
				if(char == 0)
					char = '\0';
				else if(inset)
					syntax("\num in set");
				else
					op = LCP;
		'x' =>	char = hexdigits(s, 2);
		'u' =>	char = hexdigits(s, 4);
		'c' =>	char = s.next()%32;
		'd' or 'D' =>
				set = newset('0', '9');
				if(char == 'D')
					set.neg = 1;
		's' or 'S' =>
				set = newset(' ', ' ');
				addsets(set, "\t\v\u000c\u00a0\n\r\u2028\u2029");
				if(char == 'S')
					set.neg = 1;
		'w' or 'W' =>
				set = newset('0', '9');
				addset(set, 'a', 'z');
				addset(set, 'A', 'Z');
				addset(set, '_', '_');
				if(char == 'W')
					set.neg = 1;
		* =>
				;
		}
	}
	if(char == -1){
		if(inset)
			syntax("bad set");
		else
			syntax("bad character");
	}
	return (char, set, op);
}

isdigit(c: int): int
{
	return c >= '0' && c <= '9';
}

islower(c: int): int
{
	return c >= 'a' && c <= 'z';
}

isupper(c: int): int
{
	return c >= 'A' && c <= 'Z';
}

isalpha(c: int): int
{
	return islower(c) || isupper(c);
}

hexdigit(c: int): int
{
	if(isdigit(c))
		return c-'0';
	if('a' <= c && c <= 'f')
		return c-'a'+10;
	if('A' <= c && c <= 'F')
		return c-'A'+10;
	return -1;
}

digits(s: ref ReStr): int
{
	n := 0;
	while(isdigit(s.peek()))
		n = 10*n + s.next() -'0';
	return n;
}

hexdigits(s: ref ReStr, n: int): int
{
	x := 0;
	for(i := 0; i < n; i++){
		v := hexdigit(s.next());
		if(v < 0)
			return -1;
		x = 16*x+v;
	}
	return x;
}

drange(s: ref ReStr): (int, int)
{
	n1 := n2 := -1;
	if(isdigit(s.peek()))
		n1 = digits(s);
	if(s.peek() == ','){
		s.next();
		if(isdigit(s.peek()))
			n2 = digits(s);
		else
			n2 = HUGE;
	}
	else
		n2 = n1;
	if(n1 < 0 || n1 > n2)
		syntax("bad number range");
	return (n1, n2);
}

# walk the tree adjusting pointers to refer to 
# next state of the finite state machine

walk(r: refRex, succ: refRex, ar: ref Arena)
{
	if(r==NONE)
		return;
	rex := ar.rex[r];
	case rex.kind {
	ALT =>	walk(rex.left, succ, ar);
		walk(rex.right, succ, ar);
		return;
	CAT =>	walk(rex.left, rex.right, ar);
		walk(rex.right, succ, ar);
		ar.rex[r] = ar.rex[rex.left];	# optimization
		return;
	CLO or PCLO =>
		end := newRe(OPT, r, succ, nil, ar, 0, rex.greedy); # here's the circularity
		walk(rex.left, end, ar);
	OPT =>	walk(rex.left, succ, ar);
	MNCLO =>
		ar.ptr++;
		walk(rex.left, r, ar);
	LCP =>
		ar.rex[r].left = newRe(IDLE, NONE, succ, nil, ar, 0, 0);
	}
	ar.rex[r].right = succ;
}

prtree(r: refRex, ar: ref Arena, done: list of int, ind: string): list of int
{
	sys->print("%s", ind);
	if(r==NIL){
		sys->print("NIL\n");
		return done;
	}
	if(r==NONE){
		sys->print("NONE\n");
		return done;
	}
	printed := 0;
	for(li := done; li != nil; li = tl li){
		if(hd li == r){
			printed = 1;
			break;
		}
	}
	rex := ar.rex[r];
	op := "";
	z := "Z";
	case rex.kind{
		ALT => op = "|";
		CAT => op = "and";
		DOT => op = ".";
		SET => op = "[]";
		HAT => op = "^";
		DOL => op = "$";
		NUL => op = "NUL";
		PCLO => op = "+";
		CLO => op = "*";
		OPT => op = "?";
		LPN => op = "(";
		RPN => op = ")";
		LPN0 => op = "?:";
		RPN0 => op = ":?";
		LPN1 => op = "?=";
		RPN1 => op = "=?";
		LPN2 => op = "?!";
		RPN2 => op = "!?";
		BEET => op = "\\b";
		BEEF => op = "\\B";
		MNCLO => op = "{}";
		LCP => op = "n";
		IDLE => op = "i";
		* => z[0] = rex.kind; op = z;
	}
	if(printed){
		sys->print("node %d (%d)\n", r, r);
		return done;
	}
	else{
		if(rex.ns != nil)
			sys->print("%s [%d-%d] (%d)\n", op, rex.ns.m, rex.ns.n, r);
		else
			sys->print("%s (%d)\n", op, r);
		done = r :: done;
		ind += "  ";
		done = prtree(rex.left, ar, done, ind);
		done  = prtree(rex.right, ar, done, ind);
		return done;
	}
}

compile(e: string, flag: int): (Re, string)
{
	if(e == nil)
		return (nil, "missing expression");	
	s := ref ReStr(e, 0, len e);
	ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1);
	start := ar.start = re(s, ar);
	if(start==NIL || s.n!=0)
		syntax("invalid regular expression");
	walk(start, NIL, ar);
	# prtree(start, ar, nil, "");
	if(ar.pno < 0)
		ar.pno = 0;
	return (ar, nil);
}

# todo: queue for epsilon and advancing transitions

Num: adt{
	ns: ref Nstate;
	m: int;
	n: int;
};
Gaz: adt {
	pno: int;
	beg: int;
	end: int;
};
Trace: adt {
	cre: refRex;		# cursor in Re
	trans: int;		# 0 epsilon transition, 1 advancing transition
	beg: int;		# where this trace began;
	end: int;		# where this trace ended if success (-1 by default)
	gaz: list of Gaz;
	ns: list of ref Num;
};
Queue: adt {
	ptr: int;
	q: array of Trace;
};

execute(re: Re, s: string): array of (int, int)
{
	return executese(re, s, (-1,-1), 1, 1, 1, 0);
}

executese(re: Re, s: string, range: (int, int), bol: int, eol: int, multiline: int, ignorecase: int): array of (int,int)
{
	if(re==nil)
		return nil;
	(s0, s1) := range;
	if(s0 < 0)
		s0 = 0;
	if(s1 < 0)
		s1 = len s;
	match := 0;
	todo := ref Queue(0, array[2*re.ptr] of Trace);
	for(i:=s0; i<=s1; i++) {
		if(!match)		# no leftmost match yet
			todo.q[todo.ptr++] = Trace(re.start, 0, i, -1, nil, nil);
		for(k:=0; k<todo.ptr; k++) {
			q := todo.q[k];
			if(q.trans)
				continue;
			rex := re.rex[q.cre];
			next0 := next1 := next2 := NONE;
			case rex.kind {
			NUL =>
				next1 = rex.right;
			DOT =>
				if(i<len s && !islt(s[i]))
					next2 = rex.right;
			HAT =>
				if(i == s0 && bol)
					next1 = rex.right;
				else if(multiline && i > 0 && islt(s[i-1]))
					next1 = rex.right;
			DOL =>
				if(i == s1 && eol)
					next1 = rex.right;
				else if(multiline && i < s1 && islt(s[i]))
					next1 = rex.right;
			SET =>
				if(i<len s && member(s[i], rex.set, ignorecase))
					next2 = rex.right;
			CAT or
			PCLO =>
				next1 = rex.left;
			ALT or 
			CLO or 
			OPT =>
				if(rex.kind == ALT || rex.greedy){
					next0 = rex.left;
					next1 = rex.right;
				}
				else{
					next0 = rex.right;
					next1 = rex.left;
				}
			LPN =>
				next1 = rex.right;
				q.gaz = Gaz(rex.pno,i,-1)::q.gaz;
			RPN =>
				next1 = rex.right;
				for(r:=q.gaz; ; r=tl r) {
					(pno,beg1,end1) := hd r;
					if(rex.pno==pno && end1==-1) {
						q.gaz = Gaz(pno,beg1,i)::q.gaz;
						break;
					}
				}
			LPN0 or RPN0 or RPN1 or RPN2 =>
				next1 = rex.right;
			LPN1 =>
				(rpn, nxt, nre) := storetree(q.cre, re);
				m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase);
				if(m != nil && m[0].t0 == i){
					next1 = nxt;
					for(j := 1; j < len m; j++)
						if(m[j].t0 >= 0)
							q.gaz = Gaz(j, m[j].t0, m[j].t1)::q.gaz;	
				}
				restoretree(LPN1, rpn, nxt, nre);
			LPN2 =>
				(rpn, nxt, nre) := storetree(q.cre, re);
				m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase);
				if(m == nil || m[0].t0 != i)
					next1 = nxt;
				restoretree(LPN2, rpn, nxt, nre);
			MNCLO =>
				num: ref Num;

				(q.ns, num) = nextn(q.cre, q.ns, rex.ns.m, rex.ns.n, re);
				if(num.m > 0)
					next1 = rex.left;
				else if(num.n > 0){
					if(rex.greedy){
						next0 = rex.left;
						next1 = rex.right;
					}
					else{
						next0 = rex.right;
						next1 = rex.left;	
					}
				}
				else{
					next1 = rex.right;
					(num.m, num.n) = (-1, -1);
				}
			LCP =>
				pno := rex.ns.m;
				(beg1, end1) := lcpar(q.gaz, pno);
				l := end1-beg1;
				if(beg1 < 0)	# undefined so succeeds
					next1 = rex.right;
				else if(i+l <= s1 && eqstr(s[beg1: end1], s[i: i+l], ignorecase)){
					(q.ns, nil) = nextn(rex.left, q.ns, l, l, re);
					next1 = rex.left;	# idle
				}
			IDLE =>
				num: ref Num;

				(q.ns, num) = nextn(q.cre, q.ns, -1, -1, re);
				if(num.m >= 0)
					next2 = q.cre;
				else{
					next1 = rex.right;
					(num.m, num.n) = (-1, -1);
				}
			BEET =>
				if(iswordc(s, i-1) != iswordc(s, i))
					next1 = rex.right;
			BEEF =>
				if(iswordc(s, i-1) == iswordc(s, i))
					next1 = rex.right;
			* =>
				if(i<len s && (rex.kind==s[i] || (ignorecase && eqcase(rex.kind, s[i]))))
					next2 = rex.right;
			}
			l := k;
			if(next0 != NONE) {
				if(next0 != NIL)
					(k, l) = insert(next0, 0, q.beg, -1, q.gaz, q.ns, todo, k, l);
				else{
					match = 1;
					(k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l);
				}
			}
			if(next1 != NONE) {
				if(next1 != NIL)
					(k, l) = insert(next1, 0, q.beg, -1, q.gaz, q.ns, todo, k, l);
				else{
					match = 1;
					(k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l);
				}
			}
			if(next2 != NONE) {
				if(next2 != NIL)
					(k, l) = insert(next2, 1, q.beg, -1, q.gaz, q.ns, todo, k, l);
				else{
					match = 1;
					(k, l) = insert(NIL, 2, q.beg, i+1, q.gaz, nil, todo, k, l);
				}
			}
		}
		if(!atoe(todo) && match)
			break;
	}
	if(todo.ptr == 0)
		return nil;
	if(todo.ptr > 1)
		rfatal(sys->sprint("todo.ptr = %d", todo.ptr));
	if(todo.q[0].trans != 2)
		rfatal(sys->sprint("trans = %d", todo.q[0].trans));
	if(todo.q[0].cre != NIL)
		rfatal(sys->sprint("cre = %d", todo.q[0].cre));
	beg := todo.q[0].beg;
	end := todo.q[0].end;
	gaz := todo.q[0].gaz;
	if(beg == -1)
		return nil;
	result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) };
	for( ; gaz!=nil; gaz=tl gaz) {
		(pno, beg1, end1) := hd gaz;
		(rbeg, nil) := result[pno];
		if(rbeg==-1 && (beg1|end1)!=-1)
			result[pno] = (beg1,end1);
	}
	return result;
}

better(newbeg, newend, oldbeg, oldend: int): int
{
	return oldbeg==-1 || newbeg<oldbeg ||
	       newbeg==oldbeg && newend>oldend;
}

insert(next: refRex, trans: int, tbeg: int, tend: int, tgaz: list of Gaz, tns: list of ref Num, todo: ref Queue, k: int, l: int): (int, int)
{
# sys->print("insert %d eps=%d beg=%d end=%d (k, l) = (%d %d) => ", next, trans, tbeg, tend, k, l);
	for(j:=0; j<todo.ptr; j++){
		if(todo.q[j].trans == trans){
			if(todo.q[j].cre == next){
				if(better(todo.q[j].beg, todo.q[j].end, tbeg, tend))
					return (k, l);
				else if(better(tbeg, tend, todo.q[j].beg, todo.q[j].end))
					break;
				else if(j < k)
					return (k, l);
				else
					break;
			}
		}
	}
	if(j < k){
		k--;
		l--;
	}
	if(j < todo.ptr){
		todo.q[j: ] = todo.q[j+1: todo.ptr];
		todo.ptr--;
	}
	todo.q[l+2: ] = todo.q[l+1: todo.ptr];
	todo.ptr++;
	todo.q[l+1] = Trace(next, trans, tbeg, tend, tgaz, tns);
# for(j=0; j < todo.ptr; j++) sys->print("%d(%d) ", todo.q[j].cre, todo.q[j].trans); sys->print("\n");
	return (k, l+1);
}

# remove epsilon transitions and move advancing transitions to epsilon ones
atoe(todo: ref Queue): int
{
	n := 0;
	for(j := 0; j < todo.ptr; j++){
		if(todo.q[j].trans){
			if(todo.q[j].trans == 1){
				todo.q[j].trans = 0;
				n++;
			}
		}
		else{
			todo.q[j: ] = todo.q[j+1: todo.ptr];
			todo.ptr--;
			j--;
		}
	}
	return n;
}

nextn(re: int, ln: list of ref Num, m: int, n: int, ar: ref Arena): (list of ref Num, ref Num)
{
	num: ref Num;

	ns := ar.rex[re].ns;
	for(l := ln; l != nil; l = tl l){
		if((hd l).ns == ns){
			num = hd l;
			break;
		}
	}
	if(num == nil)
		ln = (num = ref Num(ns, -1, -1)) :: ln;
	if(num.m == -1 && num.n == -1)
		(num.m, num.n) = (m, n);
	else
		(nil, nil) = (--num.m, --num.n);
	return (ln, num);
}

ASCII : con 128;
WORD : con 32;

mem(c: int, set: ref Set): int
{
	return (set.ascii[c/WORD]>>c%WORD)&1;
}

member(char: int, set: ref Set, ignorecase: int): int
{
	if(set.subset != nil){
		for(l := set.subset; l != nil; l = tl l)
			if(member(char, hd l, ignorecase))
				return !set.neg;
	}
	if(char < 128){
		if(ignorecase)
			return (mem(tolower(char), set) || mem(toupper(char), set))^set.neg;
		else
			return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg;
	}
	for(l:=set.unicode; l!=nil; l=tl l) {
		(beg, end) := hd l;
		if(char>=beg && char<=end)
			return !set.neg;
	}
	return set.neg;
}

newSet(s: ref ReStr): ref Set
{
	op: int;
	set0: ref Set;

	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil);
	if(s.peek() == '^') {
		set.neg = 1;
		s.next();
	}
	while(s.n > 0) {
		char1 := s.next();
		if(char1 == ']')
			return set;
		(char1, set0, op) = esc(s, char1, 1);
		if(set0 != nil)
			mergeset(set, set0);
		char2 := char1;
		if(s.peek() == '-') {
			if(set0 != nil)
				syntax("set in range");
			s.next();
			char2 = s.next();
			if(char2 == ']')
				break;
			(char2, set0, op) = esc(s, char2, 1);
			if(set0 != nil)
				syntax("set in range");
			if(char2 < char1)
				break;
		}
		addset(set, char1, char2);
	}
	syntax("bad set");
	return nil;
}

addset(set: ref Set, c1: int, c2: int)
{
	for(c := c1; c <= c2; c++){
		if(c < ASCII)
			set.ascii[c/WORD] |= 1<<c%WORD;
		else{
			set.unicode = (c, c2) :: set.unicode;
			break;
		}
	}
}

addsets(set: ref Set, s: string)
{
	for(i := 0; i < len s; i++)
		addset(set, s[i], s[i]);
}

mergeset(set: ref Set, set0: ref Set)
{
	if(!set0.neg){
		for(i := 0; i < ASCII/WORD; i++)
			set.ascii[i] |= set0.ascii[i];
		for(l := set0.unicode; l != nil; l = tl l)
			set.unicode = hd l :: set.unicode;
	}
	else
		set.subset = set0 :: set.subset;
}
		
newset(c1: int, c2: int): ref Set
{
	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil);
	addset(set, c1, c2);
	return set;
}

storetree(lpn: int, re: ref Arena): (int, int, ref Arena)
{
	rpn: int;

	rex := re.rex[lpn];
	k := rex.kind;
	l := 1;
	for(;;){
		rpn = rex.right;
		rex = re.rex[rpn];
		if(rex.kind == k)
			l++;
		else if(rex.kind == k+1 && --l == 0)
			break;
	}
	re.rex[lpn].kind = LPN;
	re.rex[rpn].kind = RPN;
	nxt := re.rex[rpn].right;
	re.rex[rpn].right = NIL;
	nre := ref *re;
	nre.start = lpn;
	return (rpn, nxt, nre);
}

restoretree(lop: int, rpn: int, nxt: int, re: ref Arena)
{
	lpn := re.start;
	re.rex[lpn].kind = lop;
	re.rex[rpn].kind = lop+1;
	re.rex[rpn].right = nxt;
}

iswordc(s: string, i: int): int
{
	if(i < 0 || i >= len s)
		return 0;
	c := s[i];
	return isdigit(c) || isalpha(c) || c == '_';
}

lcpar(gaz: list of Gaz, pno: int): (int, int)
{
	for(r := gaz; r != nil; r = tl r) {
		(pno1, beg1, end1) := hd r;
		if(pno == pno1)
			return (beg1, end1);
	}
	return (-1, -1);
}

eqstr(s: string, t: string, ic: int): int
{
	if(!ic)
		return s == t;
	if(len s != len t)
		return 0;
	for(i := 0; i < len s; i++)
		if(!eqcase(s[i], t[i]))
			return 0;
	return 1;
}

eqcase(c1: int, c2: int): int
{
	return toupper(c1) == toupper(c2);
}
	
syntax(s: string)
{
	runtime(regex, SyntaxError, s);
}

rfatal(s: string)
{
	runtime(regex, InternalError, s);
}