ref: 8a788aea84aad3bfbd5b39d78c1925654f3b9e14
dir: /appl/lib/ecmascript/regexp.b/
# Report whether character c occurs in s, as an ECMAScript boolean value.
strhas(s: string, c: int): ref Val
{
	for(i := 0; i < len s; i++)
		if(s[i] == c)
			return true;
	return false;
}

# Split a regular expression literal of the form /pattern/flags into
# (pattern, flags).  A '/' preceded by an unescaped backslash does not
# terminate the pattern.
rsplit(r: string): (string, string)
{
	esc := 0;
	i := 1;	# skip '/'
	for(;;){
		c := r[i++];
		if(!esc && c == '/')
			break;
		esc = !esc && c == '\\';
	}
	return (r[1: i-1], r[i: ]);
}

# Return non-zero if the flag string f contains anything other than
# 'g', 'i' and 'm', or contains any of those more than once.
badflags(f: string): int
{
	g := i := m := 0;
	for(j := 0; j < len f; j++){
		case(f[j]){
		'g' =>
			g++;
		'i' =>
			i++;
		'm' =>
			m++;
		* =>
			return 1;
		}
	}
	return g > 1 || i > 1 || m > 1;
}

# Return (pattern, flags, lastIndex) for a regular expression.
# If v is a TRegExp value its stored fields are returned directly;
# if v is any other non-nil value its object replaces o.  Otherwise the
# values are rebuilt from the "source", "global", "ignoreCase",
# "multiline" and "lastIndex" properties of o.
regexpvals(ex: ref Exec, v: ref Val, o: ref Ecmascript->Obj): (string, string, int)
{
	if(v != nil){
		if(v.ty == TRegExp)
			return (v.rev.p, v.rev.f, v.rev.i);
		o = v.obj;
	}
	p := toString(ex, esget(ex, o, "source", 0));
	f := "";
	if(toBoolean(ex, esget(ex, o, "global", 0)) == true)
		f += "g";
	if(toBoolean(ex, esget(ex, o, "ignoreCase", 0)) == true)
		f += "i";
	if(toBoolean(ex, esget(ex, o, "multiline", 0)) == true)
		f += "m";
	i := toInt32(ex, esget(ex, o, "lastIndex", 0));
	return (p, f, i);
}

# Build a new RegExp object from args (pattern, flags).
# A RegExp pattern argument is copied only when flags is undefined;
# otherwise a TypeError is raised.  Bad flags or a pattern that does not
# compile raise a SyntaxError.
nregexp(ex: ref Exec, nil: ref Ecmascript->Obj, args: array of ref Val): ref Ecmascript->Obj
{
	pat := biarg(args, 0);
	flags := biarg(args, 1);
	(p, f) := ("", "");
	if(isregexp(pat)){
		if(flags == undefined)
			(p, f, nil) = regexpvals(ex, pat, nil);
		else
			runtime(ex, TypeError, "flags defined");
	}
	else{
		if(pat == undefined)
			p = "";
		else
			p = toString(ex, pat);
		if(flags == undefined)
			f = "";
		else
			f = toString(ex, flags);
	}
	o := nobj(ex, nil, array[] of { regexpval(p, f, 0) });
	if(badflags(f))
		runtime(ex, SyntaxError, "bad regexp flags");
	regex = ex;	# lets syntax()/rfatal() report against this Exec
	(re, err) := compile(p, 1);
	if(re == nil || err != nil)
		runtime(ex, SyntaxError, "bad regexp pattern");
	o.re = re;
	return o;
}

# RegExp called as a function: a RegExp argument with undefined flags is
# returned unchanged, anything else goes through nregexp.
cregexp(ex: ref Exec, f, nil: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	pat := biarg(args, 0);
	flags := biarg(args, 1);
	if(isregexp(pat) && flags == undefined)
		return pat;
	return objval(nregexp(ex, f, args));
}

# RegExp.prototype.exec: match this against the string in args[0].
# Matching starts at lastIndex for a global expression, at 0 otherwise.
# Returns null (and resets lastIndex to 0) on failure; on success returns
# an array of the captures with "index" and "input" properties set, and
# for a global expression stores the end of the match in lastIndex.
cregexpprotoexec(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	m: array of (int, int);

	regexpcheck(ex, this, f);
	s := toString(ex, biarg(args, 0));
	l := len s;
	i := toInt32(ex, esget(ex, this, "lastIndex", 0));
	e := 0;
	glob := esget(ex, this, "global", 0);
	multiline := esget(ex, this, "multiline", 0);
	ignorecase := esget(ex, this, "ignoreCase", 0);
	if(glob == false)
		i = 0;
	for(;;){
		# Only an index strictly beyond the end is a failure: position l is
		# still a legal place for an empty match (e.g. /^$/ against "").
		if(i < 0 || i > l){
			esput(ex, this, "lastIndex", numval(real 0), 0);
			return null;
		}
		regex = ex;
		m = executese(this.re, s, (i, len s), i == 0, 1, multiline == true, ignorecase == true);
		if(m != nil)
			break;
		i = -1;	# no need to loop with executese
	}
	(i, e) = m[0];
	if(glob == true)
		esput(ex, this, "lastIndex", numval(real e), 0);
	n := len m;
	av := array[n] of ref Val;
	for(j := 0; j < n; j++){
		(a, b) := m[j];
		if(a < 0)
			av[j] = undefined;
		else
			av[j] = strval(s[a: b]);
	}
	a := narray(ex, nil, av);
	esput(ex, a, "index", numval(real i), 0);
	esput(ex, a, "input", strval(s), 0);
	return objval(a);
}

# RegExp.prototype.test: true if exec finds a match, false otherwise.
cregexpprototest(ex: ref Exec, f, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	regexpcheck(ex, this, f);
	v := cregexpprotoexec(ex, f, this, args);
	if(!isnull(v))
		return true;
	return false;
}

# RegExp.prototype.toString: "/" + source + "/" + flags.
cregexpprototoString(ex: ref Exec, f, this: ref Ecmascript->Obj, nil: array of ref Val): ref Val
{
	regexpcheck(ex, this, f);
	(p, fl, nil) := regexpvals(ex, nil, this);
	return strval("/" + p + "/" + fl);
}

# Raise a TypeError unless o is a RegExp object.  f supplies the method
# name for the message; a nil f is reported as "exec".
regexpcheck(ex: ref Exec, o: ref Ecmascript->Obj, f: ref Obj)
{
	if(f == nil)
		s := "exec";
	else
		s = f.val.str;
	if(!isregexpobj(o))
		runtime(ex, TypeError, "RegExp.prototype." + s + " called on non-RegExp object");
}

# String.prototype.match.  A non-RegExp argument is compiled with nregexp.
# For a non-global expression this is a single exec; for a global one it
# collects element "0" of every successive match into an array.
cstrprotomatch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	v := biarg(args, 0);
	if(!isregexp(v))
		re := nregexp(ex, nil, args);
	else if(v.ty == TObj)
		re = v.obj;
	else
		re = nobj(ex, nil, args);
	s := toString(ex, this.val);
	glob := esget(ex, re, "global", 0);
	av := array[1] of ref Val;
	av[0] = strval(s);
	if(glob == false)
		return cregexpprotoexec(ex, nil, re, av);
	li := 0;
	esput(ex, re, "lastIndex", numval(real li), 0);
	ms: list of ref Val;
	for(;;){
		v = cregexpprotoexec(ex, nil, re, av);
		if(isnull(v))
			break;
		ms = esget(ex, v.obj, "0", 0) :: ms;
		ni := int toUint32(ex, esget(ex, re, "lastIndex", 0));
		# an empty match leaves lastIndex where it was: step past it
		if(ni == li)
			esput(ex, re, "lastIndex", numval(real ++li), 0);
		else
			li = ni;
	}
	n := len ms;
	av = array[n] of ref Val;
	# ms was built in reverse, so fill the array from the back
	for(j := n-1; j >= 0; j--){
		av[j] = hd ms;
		ms = tl ms;
	}
	return objval(narray(ex, nil, av));
}

# String.prototype.replace.  args[0] is the pattern (a non-RegExp value is
# compiled with nregexp), args[1] is either a function called for each
# match or a replacement string in which $$, $&, $`, $' and $n/$nn are
# expanded.  Only a global RegExp replaces more than the first match.
cstrprotoreplace(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	re: ref Ecmascript->Obj;

	v := biarg(args, 0);
	rege := isregexp(v);
	if(!rege){
		if(args == nil)
			re = nregexp(ex, nil, args);
		else
			re = nregexp(ex, nil, args[0:1]);
	}
	else if(v.ty == TObj)
		re = v.obj;
	else
		re = nobj(ex, nil, args);
	s := toString(ex, this.val);
	if(rege)
		glob := esget(ex, re, "global", 0);
	else
		glob = false;
	av := array[1] of ref Val;
	av[0] = strval(s);
	ms: list of ref Val;
	li := 0;
	if(glob == true)
		esput(ex, re, "lastIndex", numval(real li), 0);
	for(;;){
		v = cregexpprotoexec(ex, nil, re, av);
		if(!isnull(v))
			ms = v :: ms;
		if(isnull(v) || glob == false)
			break;
		ni := int toUint32(ex, esget(ex, re, "lastIndex", 0));
		# an empty match leaves lastIndex where it was: step past it
		if(ni == li)
			esput(ex, re, "lastIndex", numval(real ++li), 0);
		else
			li = ni;
	}
	if(ms == nil)
		return strval(s);
	ms = rev(ms);
	# lcp is the number of capture groups (match array length minus one)
	if(rege)
		lcp := int toUint32(ex, esget(ex, (hd ms).obj, "length", 0))-1;
	else
		lcp = 0;
	v = biarg(args, 1);
	if(isobj(v) && isfuncobj(v.obj)){
		ns := s;
		n := len ms;
		args = array[lcp+3] of ref Val;
		o := inc := 0;
		for(i := 0; i < n; i++){
			a := (hd ms).obj;
			ms = tl ms;
			for(j := 0; j <= lcp; j++)
				args[j] = esget(ex, a, string j, 0);
			ss := toString(ex, args[0]);
			# Use the position recorded by exec.  Searching s for the
			# matched text can land on an earlier, identical substring
			# when the match depends on context (anchors, lookahead).
			o = toInt32(ex, esget(ex, a, "index", 0));
			args[lcp+1] = numval(real o);
			args[lcp+2] = strval(s);
			rs := toString(ex, getValue(ex, escall(ex, v.obj, nil, args, 0)));
			# inc is how much ns has grown relative to s so far
			ns = repl(ns, o+inc, o+inc+len ss, rs);
			inc += len rs - len ss;
		}
		return strval(ns);
	}
	else{
		ps := toString(ex, v);
		lps := len ps;
		ns := s;
		n := len ms;
		o := inc := 0;
		for(i := 0; i < n; i++){
			a := (hd ms).obj;
			ms = tl ms;
			ss := toString(ex, esget(ex, a, "0", 0));
			# position recorded by exec; see the function-replacement branch
			o = toInt32(ex, esget(ex, a, "index", 0));
			rs := "";
			for(j := 0; j < lps; j++){
				if(ps[j] == '$' && j < lps-1){
					j++;
					case(c := ps[j]){
					'$' =>
						rs += "$";
					'&' =>
						rs += ss;
					'`' =>
						rs += s[0: o];
					''' =>
						rs += s[o+len ss: ];
					'0' to '9' =>
						if(j < lps-1 && isdigit(ps[j+1]))
							c = 10*(c-'0')+ps[++j]-'0';
						else
							c = c-'0';
						if(c >= 1 && c <= lcp)
							rs += toString(ex, esget(ex, a, string c, 0));
					}
				}
				else
					rs += ps[j: j+1];
			}
			# inc is how much ns has grown relative to s so far
			ns = repl(ns, o+inc, o+inc+len ss, rs);
			inc += len rs - len ss;
		}
		return strval(ns);
	}
}

# String.prototype.search: index of the first match, or -1.
# "global" is cleared for the duration of the exec so that matching
# starts at 0, then restored.
cstrprotosearch(ex: ref Exec, nil, this: ref Ecmascript->Obj, args: array of ref Val): ref Val
{
	v := biarg(args, 0);
	if(!isregexp(v))
		re := nregexp(ex, nil, args);
	else if(v.ty == TObj)
		re = v.obj;
	else
		re = nobj(ex, nil, args);
	s := toString(ex, this.val);
	glob := esget(ex, re, "global", 0);
	esput(ex, re, "global", false, 0);
	av := array[1] of ref Val;
	av[0] = strval(s);
	v = cregexpprotoexec(ex, nil, re, av);
	if(isnull(v))
		r := -1;
	else{
		# Take the position recorded by exec rather than searching s for
		# the matched text, which may occur earlier in s than the match.
		r = toInt32(ex, esget(ex, v.obj, "index", 0));
	}
	esput(ex, re, "global", glob, 0);
	return numval(real r);
}

# Index of the first occurrence of ss in s at or after position m, or -1.
offset(ss: string, s: string, m: int): int
{
	nn := len ss;
	n := len s;
	for(i := m; i <= n-nn; i++){
		if(s[i: i+nn] == ss)
			return i;
	}
	return -1;
}

# Return s with the slice s[a:b] replaced by ns.
repl(s: string, a: int, b: int, ns: string): string
{
	return s[0: a] + ns + s[b: ];
}

# Return a reversed copy of the list ls.
rev(ls: list of ref Val): list of ref Val
{
	ns: list of ref Val;

	for( ; ls != nil; ls = tl ls)
		ns = hd ls :: ns;
	return ns;
}
#########################################################################
# regex.b originally

# normally imported identifiers

# internal identifiers, not normally imported

# Node kinds of the syntax tree, numbered from 1<<16 upwards.  A literal
# character node stores the character code itself as its kind (see dup and
# the default arm of the case in executese), so these values presumably
# need to stay outside the character range.
# storetree and restoretree rely on each RPNx directly following its LPNx
# (they use kind+1), so the order of this list matters.
ALT, CAT, DOT, SET, HAT, DOL, NUL, PCLO, CLO, OPT, LPN, RPN, LPN0, RPN0, LPN1, RPN1, LPN2, RPN2, BEET, BEEF, MNCLO, LCP, IDLE: con (1<<16)+iota;

# syntax

# RE	ALT		regular expression
#	NUL
# ALT	CAT		alternation
#	CAT | ALT
#
# CAT	DUP		catenation
#	DUP CAT
#
# DUP	PRIM		possibly duplicated primary
#	PCLO
#	CLO
#	OPT
#
# PCLO	PRIM +		1 or more
# CLO	PRIM *		0 or more
# OPT	PRIM ?		0 or 1
#
# PRIM	( RE )
#	()
#	DOT		any character
#	CHAR		a single character
#	ESC		escape sequence
#	[ SET ]		character set
#	NUL		null string
#	HAT		beginning of string
#	DOL		end of string
#

# Exec that syntax() and rfatal() report errors against; assigned by the
# callers immediately before compile/executese.
regex: ref Exec;

NIL : con -1;		# a refRex constant
NONE: con -2;		# ditto, for an un-set value
BAD: con 1<<16;		# a non-character
HUGE: con (1<<31) - 1;	# upper bound used by drange for an open range {n,}

# the data structures of re.m would like to be ref-linked, but are
# circular (see fn walk), thus instead of pointers we use indexes
# into an array (arena) of nodes of the syntax tree of a regular expression.
# from a storage-allocation standpoint, this replaces many small
# allocations of one size with one big one of variable size.
# Cursor over the text of a regular expression being parsed.
ReStr: adt {
	s : string;
	i : int;	# cursor position
	n : int;	# number of chars left; -1 on error
	peek : fn(s: self ref ReStr): int;
	next : fn(s: self ref ReStr): int;
	unput: fn(s: self ref ReStr);
};

# Return the next character without consuming it, or BAD at the end.
ReStr.peek(s: self ref ReStr): int
{
	if(s.n <= 0)
		return BAD;
	return s.s[s.i];
}

# Consume and return the next character; reports a syntax error at the end.
ReStr.next(s: self ref ReStr): int
{
	if(s.n <= 0)
		syntax("bad regular expression");
	s.n--;
	return s.s[s.i++];
}

# Push the most recently consumed character back.
ReStr.unput(s: self ref ReStr)
{
	s.n++;
	s.i--;
}

# Allocate the next node of the arena and return its index.
newRe(kind: int, left, right: refRex, set: ref Set, ar: ref Arena, pno: int, greedy: int): refRex
{
	ar.rex[ar.ptr] = Rex(kind, left, right, set, pno, greedy, nil);
	return ar.ptr++;
}

# parse a regex by recursive descent to get a syntax tree

# RE: an alternation of catenations separated by '|'.
re(s: ref ReStr, ar: ref Arena): refRex
{
	left := cat(s, ar);
	if(left==NIL || s.peek()!='|')
		return left;
	s.next();
	right := re(s, ar);
	if(right == NIL)
		return NIL;
	return newRe(ALT, left, right, nil, ar, 0, 0);
}

# CAT: one or more possibly duplicated primaries in sequence.
cat(s: ref ReStr, ar: ref Arena): refRex
{
	left := dup(s, ar);
	if(left == NIL)
		return left;
	right := cat(s, ar);
	if(right == NIL)
		return left;
	return newRe(CAT, left, right, nil, ar, 0, 0);
}

# DUP: a primary optionally followed by *, +, ? or {m,n}, each of which
# may carry a trailing '?' to make it non-greedy.  Returns NIL when the
# next character cannot start a primary.
dup(s: ref ReStr, ar: ref Arena): refRex
{
	n1, n2: int;

	case s.peek() {
	BAD or ')' or ']' or '|' or '?' or '*' or '+' =>
		return NIL;
	}
	prim: refRex;
	case kind:=s.next() {
	'(' =>
		if(ar.pno < 0) {
			# parentheses are not being numbered
			if(s.peek() == ')') {
				s.next();
				prim = newRe(NUL, NONE, NONE, nil, ar, 0, 0);
			} else {
				prim = re(s, ar);
				if(prim==NIL || s.next()!=')')
					syntax("( with no )");
			}
		} else {
			pno := ++ar.pno;
			lp := newRe(LPN, NONE, NONE, nil, ar, pno, 0);
			rp := newRe(RPN, NONE, NONE, nil, ar, pno, 0);
			if(s.peek() == ')') {
				s.next();
				prim = newRe(CAT, lp, rp, nil, ar, 0, 0);
			} else {
				if(s.peek() == '?'){
					# (?: (?= (?! change the kind of both parentheses
					s.next();
					case s.next(){
					':' =>
						ar.rex[lp].kind = LPN0;
						ar.rex[rp].kind = RPN0;
					'=' =>
						ar.rex[lp].kind = LPN1;
						ar.rex[rp].kind = RPN1;
					'!' =>
						ar.rex[lp].kind = LPN2;
						ar.rex[rp].kind = RPN2;
					* =>
						syntax("bad char after ?");
					}
				}
				prim = re(s, ar);
				if(prim==NIL || s.next()!=')')
					syntax("( with no )");
				else {
					prim = newRe(CAT, prim, rp, nil, ar, 0, 0);
					prim = newRe(CAT, lp, prim, nil, ar, 0, 0);
				}
			}
		}
	'[' =>
		prim = newRe(SET, NONE, NONE, newSet(s), ar, 0, 0);
	* =>
		case kind {
		'.' =>
			kind = DOT;
		'^' =>
			kind = HAT;
		'$' =>
			kind = DOL;
		}
		(c, set, op) := esc(s, kind, 0);
		if(set != nil)
			prim = newRe(SET, NONE, NONE, set, ar, 0, 0);
		else if(op == LCP){
			# back reference; "\\n" so the message shows a backslash
			# rather than containing a newline
			if(c > ar.pno)
				syntax("\\num too big");
			prim = newRe(LCP, NONE, NONE, nil, ar, 0, 0);
			ar.rex[prim].ns = ref Nstate(c, c);
		}
		else
			prim = newRe(c, NONE, NONE, nil, ar, 0, 0);
	}
	case s.peek() {
	'*' =>
		kind = CLO;
	'+' =>
		kind = PCLO;
	'?' =>
		kind = OPT;
	'{' =>
		s.next();
		(n1, n2) = drange(s);
		kind = MNCLO;
		if(s.peek() != '}')
			syntax("{ with no }");
	* =>
		return prim;
	}
	s.next();
	greedy := 1;
	if(s.peek() == '?'){
		# non-greedy op
		greedy = 0;
		s.next();
	}
	prim = newRe(kind, prim, NONE, nil, ar, 0, greedy);
	if(kind == MNCLO)
		ar.rex[prim].ns = ref Nstate(n1, n2);
	return prim;
}

# Interpret char, reading an escape sequence from s when char is a
# backslash.  inset is non-zero inside [...].  Returns (character, set,
# op): set is non-nil for the class escapes \d \D \s \S \w \W, and op is
# LCP when the character value is a back-reference number.
esc(s: ref ReStr, char: int, inset: int): (int, ref Set, int)
{
	set: ref Set;

	op := 0;
	if(char == '\\') {
		char = s.next();
		case char {
		'b' =>
			if(inset)
				char = '\b';
			else
				char = BEET;
		'B' =>
			if(inset)
				syntax("\\B in set");
			else
				char = BEEF;
		'f' =>
			char = '\u000c';
		'n' =>
			char = '\n';
		'r' =>
			char = '\r';
		't' =>
			char = '\t';
		'v' =>
			char = '\v';
		'0' to '9' =>
			s.unput();
			char = digits(s);
			if(char == 0)
				char = '\0';
			else if(inset)
				syntax("\\num in set");
			else
				op = LCP;
		'x' =>
			char = hexdigits(s, 2);
		'u' =>
			char = hexdigits(s, 4);
		'c' =>
			char = s.next()%32;
		'd' or 'D' =>
			set = newset('0', '9');
			if(char == 'D')
				set.neg = 1;
		's' or 'S' =>
			set = newset(' ', ' ');
			addsets(set, "\t\v\u000c\u00a0\n\r\u2028\u2029");
			if(char == 'S')
				set.neg = 1;
		'w' or 'W' =>
			set = newset('0', '9');
			addset(set, 'a', 'z');
			addset(set, 'A', 'Z');
			addset(set, '_', '_');
			if(char == 'W')
				set.neg = 1;
		* =>
			;
		}
	}
	# hexdigits returns -1 for a malformed \x or \u sequence
	if(char == -1){
		if(inset)
			syntax("bad set");
		else
			syntax("bad character");
	}
	return (char, set, op);
}

isdigit(c: int): int
{
	return c >= '0' && c <= '9';
}

islower(c: int): int
{
	return c >= 'a' && c <= 'z';
}

isupper(c: int): int
{
	return c >= 'A' && c <= 'Z';
}

isalpha(c: int): int
{
	return islower(c) || isupper(c);
}

# Value of hexadecimal digit c, or -1 if c is not one.
hexdigit(c: int): int
{
	if(isdigit(c))
		return c-'0';
	if('a' <= c && c <= 'f')
		return c-'a'+10;
	if('A' <= c && c <= 'F')
		return c-'A'+10;
	return -1;
}

# Consume a run of decimal digits and return its value (0 if there is none).
digits(s: ref ReStr): int
{
	n := 0;
	while(isdigit(s.peek()))
		n = 10*n + s.next() -'0';
	return n;
}

# Consume exactly n hexadecimal digits and return their value, or -1 as
# soon as a non-hexadecimal character is read.
hexdigits(s: ref ReStr, n: int): int
{
	x := 0;
	for(i := 0; i < n; i++){
		v := hexdigit(s.next());
		if(v < 0)
			return -1;
		x = 16*x+v;
	}
	return x;
}

# Parse the inside of {n}, {n,} or {n,m} and return (min, max); an open
# upper bound is returned as HUGE.  The closing '}' is left unread.
drange(s: ref ReStr): (int, int)
{
	n1 := n2 := -1;
	if(isdigit(s.peek()))
		n1 = digits(s);
	if(s.peek() == ','){
		s.next();
		if(isdigit(s.peek()))
			n2 = digits(s);
		else
			n2 = HUGE;
	}
	else
		n2 = n1;
	if(n1 < 0 || n1 > n2)
		syntax("bad number range");
	return (n1, n2);
}

# walk the tree adjusting pointers to refer to
# next state of the finite state machine
walk(r: refRex, succ: refRex, ar: ref Arena)
{
	if(r==NONE)
		return;
	rex := ar.rex[r];
	case rex.kind {
	ALT =>
		walk(rex.left, succ, ar);
		walk(rex.right, succ, ar);
		return;
	CAT =>
		walk(rex.left, rex.right, ar);
		walk(rex.right, succ, ar);
		ar.rex[r] = ar.rex[rex.left];	# optimization
		return;
	CLO or PCLO =>
		end := newRe(OPT, r, succ, nil, ar, 0, rex.greedy);	# here's the circularity
		walk(rex.left, end, ar);
	OPT =>
		walk(rex.left, succ, ar);
	MNCLO =>
		ar.ptr++;
		walk(rex.left, r, ar);
	LCP =>
		ar.rex[r].left = newRe(IDLE, NONE, succ, nil, ar, 0, 0);
	}
	ar.rex[r].right = succ;
}

# Debugging aid: print the tree rooted at r, indented by ind.  done lists
# the nodes already printed so that shared or circular nodes are printed
# once; the updated list is returned.
prtree(r: refRex, ar: ref Arena, done: list of int, ind: string): list of int
{
	sys->print("%s", ind);
	if(r==NIL){
		sys->print("NIL\n");
		return done;
	}
	if(r==NONE){
		sys->print("NONE\n");
		return done;
	}
	printed := 0;
	for(li := done; li != nil; li = tl li){
		if(hd li == r){
			printed = 1;
			break;
		}
	}
	rex := ar.rex[r];
	op := "";
	z := "Z";
	case rex.kind{
	ALT =>
		op = "|";
	CAT =>
		op = "and";
	DOT =>
		op = ".";
	SET =>
		op = "[]";
	HAT =>
		op = "^";
	DOL =>
		op = "$";
	NUL =>
		op = "NUL";
	PCLO =>
		op = "+";
	CLO =>
		op = "*";
	OPT =>
		op = "?";
	LPN =>
		op = "(";
	RPN =>
		op = ")";
	LPN0 =>
		op = "?:";
	RPN0 =>
		op = ":?";
	LPN1 =>
		op = "?=";
	RPN1 =>
		op = "=?";
	LPN2 =>
		op = "?!";
	RPN2 =>
		op = "!?";
	BEET =>
		op = "\\b";
	BEEF =>
		op = "\\B";
	MNCLO =>
		op = "{}";
	LCP =>
		op = "n";
	IDLE =>
		op = "i";
	* =>
		# a literal character node: its kind is the character
		z[0] = rex.kind;
		op = z;
	}
	if(printed){
		sys->print("node %d (%d)\n", r, r);
		return done;
	}
	else{
		if(rex.ns != nil)
			sys->print("%s [%d-%d] (%d)\n", op, rex.ns.m, rex.ns.n, r);
		else
			sys->print("%s (%d)\n", op, r);
		done = r :: done;
		ind += " ";
		done = prtree(rex.left, ar, done, ind);
		done = prtree(rex.right, ar, done, ind);
		return done;
	}
}

# Compile expression e.  Bit 0 of flag selects numbering of parentheses
# (ar.pno starts at 0 when set, -1 when clear).  Returns (arena, nil) on
# success, or (nil, message) when e is nil; malformed expressions are
# reported through syntax().
compile(e: string, flag: int): (Re, string)
{
	if(e == nil)
		return (nil, "missing expression");
	s := ref ReStr(e, 0, len e);
	ar := ref Arena(array[2*s.n] of Rex, 0, 0, (flag&1)-1);
	start := ar.start = re(s, ar);
	if(start==NIL || s.n!=0)
		syntax("invalid regular expression");
	walk(start, NIL, ar);
	# prtree(start, ar, nil, "");
	if(ar.pno < 0)
		ar.pno = 0;
	return (ar, nil);
}

# todo: queue for epsilon and advancing transitions

# Remaining repetition counts for one counted node (MNCLO, LCP/IDLE),
# identified by that node's Nstate.
Num: adt{
	ns: ref Nstate;
	m: int;
	n: int;
};

# One parenthesis record: group number and the span seen so far
# (end is -1 while the group is still open).
Gaz: adt {
	pno: int;
	beg: int;
	end: int;
};

Trace: adt {
	cre: refRex;		# cursor in Re
	trans: int;		# 0 epsilon transition, 1 advancing transition
	beg: int;		# where this trace began;
	end: int;		# where this trace ended if success (-1 by default)
	gaz: list of Gaz;
	ns: list of ref Num;
};

Queue: adt {
	ptr: int;
	q: array of Trace;
};

# Match re against the whole of s with default options.
execute(re: Re, s: string): array of (int, int)
{
	return executese(re, s, (-1,-1), 1, 1, 1, 0);
}

# Match re against s within range (a negative bound means start/end of s).
# bol and eol say whether the ends of the range count as beginning and end
# of line for ^ and $.  Returns nil if there is no match, otherwise an
# array whose element 0 is the (begin, end) of the match and element n the
# span of parenthesis n, or (-1,-1) if that group did not take part.
executese(re: Re, s: string, range: (int, int), bol: int, eol: int, multiline: int, ignorecase: int): array of (int,int)
{
	if(re==nil)
		return nil;
	(s0, s1) := range;
	if(s0 < 0)
		s0 = 0;
	if(s1 < 0)
		s1 = len s;
	match := 0;
	todo := ref Queue(0, array[2*re.ptr] of Trace);
	for(i:=s0; i<=s1; i++) {
		if(!match)	# no leftmost match yet
			todo.q[todo.ptr++] = Trace(re.start, 0, i, -1, nil, nil);
		for(k:=0; k<todo.ptr; k++) {
			q := todo.q[k];
			if(q.trans)
				continue;
			rex := re.rex[q.cre];
			# next0 and next1 are followed without consuming input
			# (next0 first); next2 is followed after consuming s[i]
			next0 := next1 := next2 := NONE;
			case rex.kind {
			NUL =>
				next1 = rex.right;
			DOT =>
				if(i<len s && !islt(s[i]))
					next2 = rex.right;
			HAT =>
				if(i == s0 && bol)
					next1 = rex.right;
				else if(multiline && i > 0 && islt(s[i-1]))
					next1 = rex.right;
			DOL =>
				if(i == s1 && eol)
					next1 = rex.right;
				else if(multiline && i < s1 && islt(s[i]))
					next1 = rex.right;
			SET =>
				if(i<len s && member(s[i], rex.set, ignorecase))
					next2 = rex.right;
			CAT or PCLO =>
				next1 = rex.left;
			ALT or CLO or OPT =>
				if(rex.kind == ALT || rex.greedy){
					next0 = rex.left;
					next1 = rex.right;
				}
				else{
					next0 = rex.right;
					next1 = rex.left;
				}
			LPN =>
				next1 = rex.right;
				q.gaz = Gaz(rex.pno,i,-1)::q.gaz;
			RPN =>
				next1 = rex.right;
				# close the most recent open record of this group
				for(r:=q.gaz; ; r=tl r) {
					(pno,beg1,end1) := hd r;
					if(rex.pno==pno && end1==-1) {
						q.gaz = Gaz(pno,beg1,i)::q.gaz;
						break;
					}
				}
			LPN0 or RPN0 or RPN1 or RPN2 =>
				next1 = rex.right;
			LPN1 =>
				# (?= : run the group as a sub-expression anchored at i
				(rpn, nxt, nre) := storetree(q.cre, re);
				m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase);
				if(m != nil && m[0].t0 == i){
					next1 = nxt;
					for(j := 1; j < len m; j++)
						if(m[j].t0 >= 0)
							q.gaz = Gaz(j, m[j].t0, m[j].t1)::q.gaz;
				}
				restoretree(LPN1, rpn, nxt, nre);
			LPN2 =>
				# (?! : continue only if the group does not match at i
				(rpn, nxt, nre) := storetree(q.cre, re);
				m := executese(nre, s, (i, -1), bol, eol, multiline, ignorecase);
				if(m == nil || m[0].t0 != i)
					next1 = nxt;
				restoretree(LPN2, rpn, nxt, nre);
			MNCLO =>
				num: ref Num;

				(q.ns, num) = nextn(q.cre, q.ns, rex.ns.m, rex.ns.n, re);
				if(num.m > 0)
					next1 = rex.left;
				else if(num.n > 0){
					if(rex.greedy){
						next0 = rex.left;
						next1 = rex.right;
					}
					else{
						next0 = rex.right;
						next1 = rex.left;
					}
				}
				else{
					next1 = rex.right;
					(num.m, num.n) = (-1, -1);
				}
			LCP =>
				pno := rex.ns.m;
				(beg1, end1) := lcpar(q.gaz, pno);
				l := end1-beg1;
				if(beg1 < 0)	# undefined so succeeds
					next1 = rex.right;
				else if(i+l <= s1 && eqstr(s[beg1: end1], s[i: i+l], ignorecase)){
					(q.ns, nil) = nextn(rex.left, q.ns, l, l, re);
					next1 = rex.left;	# idle
				}
			IDLE =>
				num: ref Num;

				(q.ns, num) = nextn(q.cre, q.ns, -1, -1, re);
				if(num.m >= 0)
					next2 = q.cre;
				else{
					next1 = rex.right;
					(num.m, num.n) = (-1, -1);
				}
			BEET =>
				if(iswordc(s, i-1) != iswordc(s, i))
					next1 = rex.right;
			BEEF =>
				if(iswordc(s, i-1) == iswordc(s, i))
					next1 = rex.right;
			* =>
				if(i<len s && (rex.kind==s[i] || (ignorecase && eqcase(rex.kind, s[i]))))
					next2 = rex.right;
			}
			l := k;
			if(next0 != NONE) {
				if(next0 != NIL)
					(k, l) = insert(next0, 0, q.beg, -1, q.gaz, q.ns, todo, k, l);
				else{
					match = 1;
					(k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l);
				}
			}
			if(next1 != NONE) {
				if(next1 != NIL)
					(k, l) = insert(next1, 0, q.beg, -1, q.gaz, q.ns, todo, k, l);
				else{
					match = 1;
					(k, l) = insert(NIL, 2, q.beg, i, q.gaz, nil, todo, k, l);
				}
			}
			if(next2 != NONE) {
				if(next2 != NIL)
					(k, l) = insert(next2, 1, q.beg, -1, q.gaz, q.ns, todo, k, l);
				else{
					match = 1;
					(k, l) = insert(NIL, 2, q.beg, i+1, q.gaz, nil, todo, k, l);
				}
			}
		}
		if(!atoe(todo) && match)
			break;
	}
	if(todo.ptr == 0)
		return nil;
	if(todo.ptr > 1)
		rfatal(sys->sprint("todo.ptr = %d", todo.ptr));
	if(todo.q[0].trans != 2)
		rfatal(sys->sprint("trans = %d", todo.q[0].trans));
	if(todo.q[0].cre != NIL)
		rfatal(sys->sprint("cre = %d", todo.q[0].cre));
	beg := todo.q[0].beg;
	end := todo.q[0].end;
	gaz := todo.q[0].gaz;
	if(beg == -1)
		return nil;
	result := array[re.pno+1] of { 0 => (beg,end), * => (-1,-1) };
	# gaz is newest first, so the first usable record of a group wins
	for( ; gaz!=nil; gaz=tl gaz) {
		(pno, beg1, end1) := hd gaz;
		(rbeg, nil) := result[pno];
		if(rbeg==-1 && (beg1|end1)!=-1)
			result[pno] = (beg1,end1);
	}
	return result;
}

# True if the new span should replace the old one: there is no old span,
# or the new one starts earlier, or starts at the same place and is longer.
better(newbeg, newend, oldbeg, oldend: int): int
{
	return oldbeg==-1 || newbeg<oldbeg || newbeg==oldbeg && newend>oldend;
}

# Add a trace for state next to todo, just after position l, unless an
# entry for the same state and transition type already there is at least
# as good; a worse existing entry is removed first.  k is the index of the
# trace being expanded and l the index of the last trace inserted for it;
# the adjusted (k, l) are returned.
insert(next: refRex, trans: int, tbeg: int, tend: int, tgaz: list of Gaz, tns: list of ref Num, todo: ref Queue, k: int, l: int): (int, int)
{
	# sys->print("insert %d eps=%d beg=%d end=%d (k, l) = (%d %d) => ", next, trans, tbeg, tend, k, l);
	for(j:=0; j<todo.ptr; j++){
		if(todo.q[j].trans == trans){
			if(todo.q[j].cre == next){
				if(better(todo.q[j].beg, todo.q[j].end, tbeg, tend))
					return (k, l);
				else if(better(tbeg, tend, todo.q[j].beg, todo.q[j].end))
					break;
				else if(j < k)
					return (k, l);
				else
					break;
			}
		}
	}
	# removing an entry before k shifts the current positions down by one
	if(j < k){
		k--;
		l--;
	}
	if(j < todo.ptr){
		todo.q[j: ] = todo.q[j+1: todo.ptr];
		todo.ptr--;
	}
	todo.q[l+2: ] = todo.q[l+1: todo.ptr];
	todo.ptr++;
	todo.q[l+1] = Trace(next, trans, tbeg, tend, tgaz, tns);
	# for(j=0; j < todo.ptr; j++) sys->print("%d(%d) ", todo.q[j].cre, todo.q[j].trans); sys->print("\n");
	return (k, l+1);
}

# remove epsilon transitions and move advancing transitions to epsilon ones
# Returns the number of advancing transitions converted.
atoe(todo: ref Queue): int
{
	n := 0;
	for(j := 0; j < todo.ptr; j++){
		if(todo.q[j].trans){
			if(todo.q[j].trans == 1){
				todo.q[j].trans = 0;
				n++;
			}
		}
		else{
			todo.q[j: ] = todo.q[j+1: todo.ptr];
			todo.ptr--;
			j--;
		}
	}
	return n;
}

# Find (or add to ln) the counter belonging to node re.  An unset counter
# (-1, -1) is initialised to (m, n); otherwise both fields are decremented.
# Returns the possibly extended list and the counter.
nextn(re: int, ln: list of ref Num, m: int, n: int, ar: ref Arena): (list of ref Num, ref Num)
{
	num: ref Num;

	ns := ar.rex[re].ns;
	for(l := ln; l != nil; l = tl l){
		if((hd l).ns == ns){
			num = hd l;
			break;
		}
	}
	if(num == nil)
		ln = (num = ref Num(ns, -1, -1)) :: ln;
	if(num.m == -1 && num.n == -1)
		(num.m, num.n) = (m, n);
	else
		(nil, nil) = (--num.m, --num.n);
	return (ln, num);
}

ASCII : con 128;	# characters below this are kept in the Set.ascii bitmap
WORD : con 32;		# bits per element of that bitmap

# Test the bit for character c (c < ASCII) in the bitmap of set.
mem(c: int, set: ref Set): int
{
	return (set.ascii[c/WORD]>>c%WORD)&1;
}

# Return non-zero if char belongs to set, taking set.neg into account.
# With ignorecase set, characters below 128 are tested in both cases.
member(char: int, set: ref Set, ignorecase: int): int
{
	if(set.subset != nil){
		for(l := set.subset; l != nil; l = tl l)
			if(member(char, hd l, ignorecase))
				return !set.neg;
	}
	if(char < 128){
		if(ignorecase)
			return (mem(tolower(char), set) || mem(toupper(char), set))^set.neg;
		else
			return ((set.ascii[char/WORD]>>char%WORD)&1)^set.neg;
	}
	for(l:=set.unicode; l!=nil; l=tl l) {
		(beg, end) := hd l;
		if(char>=beg && char<=end)
			return !set.neg;
	}
	return set.neg;
}

# Parse the body of a [...] set; the opening '[' has already been read.
# Returns the set once the closing ']' is consumed; a malformed or
# unterminated set is reported through syntax("bad set").
newSet(s: ref ReStr): ref Set
{
	op: int;
	set0: ref Set;

	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil);
	if(s.peek() == '^') {
		set.neg = 1;
		s.next();
	}
	while(s.n > 0) {
		char1 := s.next();
		if(char1 == ']')
			return set;
		(char1, set0, op) = esc(s, char1, 1);
		if(set0 != nil){
			# A class escape such as \d contributes only its class.  esc
			# hands back the escape letter in char1, which must not be
			# added as a literal (otherwise [\d] would also match 'd').
			mergeset(set, set0);
			if(s.peek() == '-')
				syntax("set in range");
			continue;
		}
		char2 := char1;
		if(s.peek() == '-') {
			s.next();
			char2 = s.next();
			if(char2 == ']')
				break;
			(char2, set0, op) = esc(s, char2, 1);
			if(set0 != nil)
				syntax("set in range");
			if(char2 < char1)
				break;
		}
		addset(set, char1, char2);
	}
	syntax("bad set");
	return nil;
}

# Add the inclusive range c1..c2 to set: characters below ASCII go into the
# bitmap, the remainder of the range is recorded as one unicode interval.
addset(set: ref Set, c1: int, c2: int)
{
	for(c := c1; c <= c2; c++){
		if(c < ASCII)
			set.ascii[c/WORD] |= 1<<c%WORD;
		else{
			set.unicode = (c, c2) :: set.unicode;
			break;
		}
	}
}

# Add every character of s to set.
addsets(set: ref Set, s: string)
{
	for(i := 0; i < len s; i++)
		addset(set, s[i], s[i]);
}

# Fold set0 into set.  A plain set is merged member by member; a negated
# one is kept whole on set.subset so member() can consult it.
mergeset(set: ref Set, set0: ref Set)
{
	if(!set0.neg){
		for(i := 0; i < ASCII/WORD; i++)
			set.ascii[i] |= set0.ascii[i];
		for(l := set0.unicode; l != nil; l = tl l)
			set.unicode = hd l :: set.unicode;
	}
	else
		set.subset = set0 :: set.subset;
}

# Create a non-negated set holding the range c1..c2.
newset(c1: int, c2: int): ref Set
{
	set := ref Set(0, array[ASCII/WORD] of {* => 0}, nil, nil);
	addset(set, c1, c2);
	return set;
}

# Temporarily turn the lookahead group that opens at node lpn into an
# ordinary LPN ... RPN group ending in NIL, so that it can be run on its
# own.  Returns the closing node, its saved successor, and a copy of the
# arena header starting at lpn (the node array itself is shared).
# Undo with restoretree.
storetree(lpn: int, re: ref Arena): (int, int, ref Arena)
{
	rpn: int;

	rex := re.rex[lpn];
	k := rex.kind;
	l := 1;
	# follow right links to the closing node that balances lpn;
	# the closing kind is always the opening kind plus one
	for(;;){
		rpn = rex.right;
		rex = re.rex[rpn];
		if(rex.kind == k)
			l++;
		else if(rex.kind == k+1 && --l == 0)
			break;
	}
	re.rex[lpn].kind = LPN;
	re.rex[rpn].kind = RPN;
	nxt := re.rex[rpn].right;
	re.rex[rpn].right = NIL;
	nre := ref *re;
	nre.start = lpn;
	return (rpn, nxt, nre);
}

# Undo storetree: restore the lookahead kinds (lop and lop+1) and the
# successor of the closing node.
restoretree(lop: int, rpn: int, nxt: int, re: ref Arena)
{
	lpn := re.start;
	re.rex[lpn].kind = lop;
	re.rex[rpn].kind = lop+1;
	re.rex[rpn].right = nxt;
}

# True if s[i] exists and is a digit, an ASCII letter or '_'.
iswordc(s: string, i: int): int
{
	if(i < 0 || i >= len s)
		return 0;
	c := s[i];
	return isdigit(c) || isalpha(c) || c == '_';
}

# Return the span in the first record for group pno in gaz, or (-1, -1).
lcpar(gaz: list of Gaz, pno: int): (int, int)
{
	for(r := gaz; r != nil; r = tl r) {
		(pno1, beg1, end1) := hd r;
		if(pno == pno1)
			return (beg1, end1);
	}
	return (-1, -1);
}

# Compare s and t, ignoring case when ic is non-zero.
eqstr(s: string, t: string, ic: int): int
{
	if(!ic)
		return s == t;
	if(len s != len t)
		return 0;
	for(i := 0; i < len s; i++)
		if(!eqcase(s[i], t[i]))
			return 0;
	return 1;
}

# Compare two characters ignoring case.
eqcase(c1: int, c2: int): int
{
	return toupper(c1) == toupper(c2);
}

# Report a regular expression syntax error against the current Exec.
syntax(s: string)
{
	runtime(regex, SyntaxError, s);
}

# Report an internal matcher error against the current Exec.
rfatal(s: string)
{
	runtime(regex, InternalError, s);
}