ref: 00c219c7d9c2b9f60c2db0e1ba7289b2301209a7
dir: /appl/lib/regex.b/
implement Regex; include "regex.m"; # syntax # RE ALT regular expression # NUL # ALT CAT alternation # CAT | ALT # # CAT DUP catenation # DUP CAT # # DUP PRIM possibly duplicated primary # PCLO # CLO # OPT # # PCLO PRIM + 1 or more # CLO PRIM * 0 or more # OPT PRIM ? 0 or 1 # # PRIM ( RE ) # () # DOT any character # CHAR a single character # ESC escape sequence # [ SET ] character set # NUL null string # HAT beginning of string # DOL end of string # NIL : con -1; # a refRex constant NONE: con -2; # ditto, for an un-set value BAD: con 1<<16; # a non-character HUGE: con (1<<31) - 1; # the data structures of re.m would like to be ref-linked, but are # circular (see fn walk), thus instead of pointers we use indexes # into an array (arena) of nodes of the syntax tree of a regular expression. # from a storage-allocation standpoint, this replaces many small # allocations of one size with one big one of variable size. ReStr: adt { s : string; i : int; # cursor postion n : int; # number of chars left; -1 on error peek : fn(s: self ref ReStr): int; next : fn(s: self ref ReStr): int; }; ReStr.peek(s: self ref ReStr): int { if(s.n <= 0) return BAD; return s.s[s.i]; } ReStr.next(s: self ref ReStr): int { if(s.n <= 0) return BAD; s.n--; return s.s[s.i++]; } newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int): refRex { ar.rex[ar.ptr] = Rex(kind, left, right, set, pno); return ar.ptr++; } # parse a regex by recursive descent to get a syntax tree re(s: ref ReStr, ar: ref Arena): refRex { left := cat(s, ar); if(left==NIL || s.peek()!='|') return left; s.next(); right := re(s, ar); if(right == NIL) return NIL; return newRe(ALT, left, right, nil, ar, 0); } cat(s: ref ReStr, ar: ref Arena): refRex { left := dup(s, ar); if(left == NIL) return left; right := cat(s, ar); if(right == NIL) return left; return newRe(CAT, left, right, nil, ar, 0); } dup(s: ref ReStr, ar: ref Arena): refRex { case s.peek() { BAD or ')' or ']' or '|' or '?' or '*' or '+' => return NIL; } prim: refRex; case kind:=s.next() { '(' => if(ar.pno < 0) { if(s.peek() == ')') { s.next(); prim = newRe(NUL, NONE, NONE, nil, ar, 0); } else { prim = re(s, ar); if(prim==NIL || s.next()!=')') s.n = -1; } } else { pno := ++ar.pno; lp := newRe(LPN, NONE, NONE, nil, ar, pno); rp := newRe(RPN, NONE, NONE, nil, ar, pno); if(s.peek() == ')') { s.next(); prim = newRe(CAT, lp, rp, nil, ar, 0); } else { prim = re(s, ar); if(prim==NIL || s.next()!=')') s.n = -1; else { prim = newRe(CAT, prim, rp, nil, ar, 0); prim = newRe(CAT, lp, prim, nil, ar, 0); } } } '[' => prim = newRe(SET, NONE, NONE, newSet(s), ar, 0); * => case kind { '.' => kind = DOT; '^' => kind = HAT; '$' => kind = DOL; } prim = newRe(esc(s, kind), NONE, NONE, nil, ar, 0); } case s.peek() { '*' => kind = CLO; '+' => kind = PCLO; '?' => kind = OPT; * => return prim; } s.next(); return newRe(kind, prim, NONE, nil, ar, 0); } esc(s: ref ReStr, char: int): int { if(char == '\\') { char = s.next(); case char { BAD => s.n = -1; 'n' => char = '\n'; } } return char; } # walk the tree adjusting pointers to refer to # next state of the finite state machine walk(r: refRex, succ: refRex, ar: ref Arena) { if(r==NONE) return; rex := ar.rex[r]; case rex.kind { ALT => walk(rex.left, succ, ar); walk(rex.right, succ, ar); return; CAT => walk(rex.left, rex.right, ar); walk(rex.right, succ, ar); ar.rex[r] = ar.rex[rex.left]; # optimization return; CLO or PCLO => end := newRe(OPT, r, succ, nil, ar, 0); # here's the circularity walk(rex.left, end, ar); OPT => walk(rex.left, succ, ar); } ar.rex[r].right = succ; } compile(e: string, flag: int): (Re, string) { if(e == nil) return (nil, "missing expression"); s := ref ReStr(e, 0, len e); ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1); start := ar.start = re(s, ar); if(start==NIL || s.n!=0) return (nil, "invalid regular expression"); walk(start, NIL, ar); if(ar.pno < 0) ar.pno = 0; return (ar, nil); } # todo1, todo2: queues for epsilon and advancing transitions Gaz: adt { pno: int; beg: int; end: int; }; Trace: adt { cre: refRex; # cursor in Re beg: int; # where this trace began; gaz: list of Gaz; }; Queue: adt { ptr: int; q: array of Trace; }; execute(re: Re, s: string): array of (int, int) { return executese(re, s, (-1,-1), 1, 1); } executese(re: Re, s: string, range: (int, int), bol: int, eol: int): array of (int,int) { if(re==nil) return nil; (s0, s1) := range; if(s0 < 0) s0 = 0; if(s1 < 0) s1 = len s; gaz : list of Gaz; (beg, end) := (-1, -1); todo1 := ref Queue(0, array[re.ptr] of Trace); todo2 := ref Queue(0, array[re.ptr] of Trace); for(i:=s0; i<=s1; i++) { small2 := HUGE; # earliest possible match if advance if(beg == -1) # no leftmost match yet todo1.q[todo1.ptr++] = Trace(re.start, i, nil); for(k:=0; k<todo1.ptr; k++) { q := todo1.q[k]; rex := re.rex[q.cre]; next1 := next2 := NONE; case rex.kind { NUL => next1 = rex.right; DOT => if(i<len s && s[i]!='\n') next2 = rex.right; HAT => if(i == s0 && bol) next1 = rex.right; DOL => if(i == s1 && eol) next1 = rex.right; SET => if(i<len s && member(s[i], rex.set)) next2 = rex.right; CAT or PCLO => next1 = rex.left; ALT or CLO or OPT => next1 = rex.right; k = insert(rex.left, q.beg, q.gaz, todo1, k); LPN => next1 = rex.right; q.gaz = Gaz(rex.pno,i,-1)::q.gaz; RPN => next1 = rex.right; for(r:=q.gaz; ; r=tl r) { (pno,beg1,end1) := hd r; if(rex.pno==pno && end1==-1) { q.gaz = Gaz(pno,beg1,i)::q.gaz; break; } } * => if(i<len s && rex.kind==s[i]) next2 = rex.right; } if(next1 != NONE) { if(next1 != NIL) k =insert(next1, q.beg, q.gaz, todo1, k); else if(better(q.beg, i, beg, end)) (gaz, beg, end) = (q.gaz, q.beg, i); } if(next2 != NONE) { if(next2 != NIL) { if(q.beg < small2) small2 = q.beg; insert(next2, q.beg, q.gaz, todo2, 0); } else if(better(q.beg, i+1, beg, end)) (gaz, beg, end) = (q.gaz, q.beg, i+1); } } if(beg!=-1 && beg<small2) # nothing better possible break; (todo1,todo2) = (todo2, todo1); todo2.ptr = 0; } if(beg == -1) return nil; result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) }; for( ; gaz!=nil; gaz=tl gaz) { (pno, beg1, end1) := hd gaz; (rbeg, nil) := result[pno]; if(rbeg==-1 && (beg1|end1)!=-1) result[pno] = (beg1,end1); } return result; } better(newbeg, newend, oldbeg, oldend: int): int { return oldbeg==-1 || newbeg<oldbeg || newbeg==oldbeg && newend>oldend; } insert(next: refRex, tbeg: int, tgaz: list of Gaz, todo: ref Queue, k: int): int { for(j:=0; j<todo.ptr; j++) if(todo.q[j].cre == next) if(todo.q[j].beg <= tbeg) return k; else break; if(j < k) k--; if(j < todo.ptr) todo.ptr--; for( ; j<todo.ptr; j++) todo.q[j] = todo.q[j+1]; todo.q[todo.ptr++] = Trace(next, tbeg, tgaz); return k; } ASCII : con 128; WORD : con 32; member(char: int, set: ref Set): int { if(char < 128) return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg; for(l:=set.unicode; l!=nil; l=tl l) { (beg, end) := hd l; if(char>=beg && char<=end) return !set.neg; } return set.neg; } newSet(s: ref ReStr): ref Set { set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil); if(s.peek() == '^') { set.neg = 1; s.next(); } while(s.n > 0) { char1 := s.next(); if(char1 == ']') return set; char1 = esc(s, char1); char2 := char1; if(s.peek() == '-') { s.next(); char2 = s.next(); if(char2 == ']') break; char2 = esc(s, char2); if(char2 < char1) break; } for( ; char1<=char2; char1++) if(char1 < ASCII) set.ascii[char1/WORD] |= 1<<char1%WORD; else { set.unicode = (char1,char2)::set.unicode; break; } } s.n = -1; return nil; }