ref: 4e4f36fcf5177b0c51c4bcb5a203c5f86edf6e00
dir: /lib/regex/interp.myr/
use std use "types" pkg regex = /* regex execution */ const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:])) const search : (re : regex#, str : byte[:] -> std.option(byte[:][:])) /* regex execution returning indexes */ const iexec : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:])) const isearch : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:])) /* substitution */ const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:])) const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool) const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:]) const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void) const matchfree : (pat : byte[:][:] -> void) ;; /* Ugly: for performance. std.option() should be used instead when unions get faster. */ const Zthr = (0 : rethread#) const exec = {re, str var thr, m thr = run(re, str, 0, true) m = getmatches(re, thr) cleanup(re) -> m } const iexec = {re, str var thr, m thr = run(re, str, 0, true) m = getidxmatches(re, thr) cleanup(re) -> m } const search = {re, str var thr var m m = `std.None for var i = 0; i < str.len; i++ thr = run(re, str[i:], 0, false) m = getmatches(re, thr) match m | `std.Some _: break | `std.None: /* nothing */ ;; cleanup(re) ;; -> m } const isearch = {re, str var thr var m m = `std.None for var i = 0; i < str.len; i++ thr = run(re, str[i:], 0, false) m = getidxmatches(re, thr) match m | `std.Some _: break | `std.None: /* nothing */ ;; cleanup(re) ;; -> m } const sub = {re, str, subst var sb sb = std.mksb() if !sbsub(sb, re, str, subst) -> `std.None else -> `std.Some std.sbfin(sb) ;; } const sbsub = {sb, re, str, subst var thr, m /* we always have m[0] as the full match */ if re.nmatch != subst.len + 1 -> false ;; thr = run(re, str, 0, true) if thr == Zthr m = false else m = dosubst(sb, re, thr, str, subst) ;; cleanup(re) -> m } const suball = {re, str, subst var sb sb = std.mksb() sbsuball(sb, re, str, subst) -> std.sbfin(sb) } const sbsuball = {sb, re, str, subst var thr, len, s, i /* we always have m[0] as the full match */ if re.nmatch != subst.len + 1 -> void ;; i = 0 while i < str.len thr = run(re, str[i:], 0, false) if thr == Zthr std.sbputb(sb, str[i]) i++ else len = thr.mend[0] s = str[i:len + i] dosubst(sb, re, thr, s, subst) i += len ;; cleanup(re) ;; } const dosubst = {sb, re, thr, str, subst var off off = 0 for var i = 1; i < re.nmatch; i++ if thr.mstart[i] != -1 && thr.mend[i] != -1 std.sbputs(sb, str[off:thr.mstart[i]]) std.sbputs(sb, subst[i - 1]) off = thr.mend[i] ;; ;; std.sbputs(sb, str[off:]) thrfree(re, thr) -> true } const cleanup = {re var thr, next for thr = re.runq; thr != Zthr; thr = next next = thr.next thrfree(re, thr) ;; for thr = re.expired; thr != Zthr; thr = next next = thr.next thrfree(re, thr) ;; re.nexttid = 0 } const matchfree = {m std.slfree(m) } const getmatches = {re, thr var ret if thr == Zthr -> `std.None ;; ret = std.slalloc(re.nmatch) for var i = 0; i < re.nmatch; i++ if thr.mstart[i] != -1 && thr.mend[i] != -1 ret[i] = re.str[thr.mstart[i]:thr.mend[i]] else ret[i] = [][:] ;; ;; thrfree(re, thr) -> `std.Some ret } const getidxmatches = {re, thr var ret if thr == Zthr -> `std.None ;; ret = std.slalloc(re.nmatch) for var i = 0; i < re.nmatch; i++ if thr.mstart[i] != -1 && thr.mend[i] != -1 ret[i] = (thr.mstart[i], thr.mend[i]) else ret[i] = (-1, -1) ;; ;; thrfree(re, thr) -> `std.Some ret } /* returns a matching thread, or Zthr if no threads matched */ const run = {re, str, idx, wholestr var bestmatch var consumed var states var thr var ip re.str = str re.strp = 0 bestmatch = Zthr states = std.mkbs() re.runq = mkthread(re, 0) if re.debug /* The last run could have left things here, since we need this info after the run */ for bs : re.traces std.bsfree(bs) ;; std.slfree(re.traces) re.traces = [][:] std.slpush(&re.traces, std.mkbs()) ;; re.runq.mstart = std.slalloc(re.nmatch) re.runq.mend = std.slalloc(re.nmatch) for var i = 0; i < re.nmatch; i++ re.runq.mstart[i] = -1 re.runq.mend[i] = -1 ;; while re.nthr > 0 while re.runq != Zthr /* set up the next thread */ thr = re.runq re.runq = thr.next trace(re, thr, "\nrunning tid={}, ip={}, s[{}]={}\n", \ thr.tid, thr.ip, re.strp, std.decode(re.str[re.strp:])) ip = thr.ip consumed = step(re, thr, -1) while !consumed consumed = step(re, thr, ip) ;; if std.bshas(states, thr.ip) die(re, thr, "there can be only one") ;; if thr.dead thrfree(re, thr) elif thr.matched trace(re, thr, "new bestmatch\n") if bestmatch != Zthr thrfree(re, bestmatch) ;; if re.strp == re.str.len bestmatch = thr goto done elif !wholestr bestmatch = thr ;; elif !thr.matched std.bsput(states, thr.ip) if re.expired == Zthr re.expired = thr ;; if re.expiredtail != Zthr re.expiredtail.next = thr ;; re.expiredtail = thr thr.next = Zthr ;; ;; std.bsclear(states) trace(re, Zthr, "switch\n") re.runq = re.expired re.expired = Zthr re.expiredtail = Zthr re.strp++ ;; :done std.bsfree(states) -> bestmatch } /* Steps forward one instruction. Returns true if a byte of input was consumed, false otherwise. */ const step = {re, thr, curip var str var mstart var mend str = re.str match re.prog[thr.ip] /* Char matching. Consume exactly one byte from the string. */ | `Ibyte b: trace(re, thr, "\t{}:\tByte {} ({})\n", thr.ip, b, (b : char)) if !within(re, str) die(re, thr, "end of string") elif b != str[re.strp] die(re, thr, "not right char") else hit(re, thr) thr.ip++ trace(re, thr, "\t\tmatched {} with {}\n", b, str[re.strp]) ;; | `Irange (start, end): trace(re, thr, "\t{}:\tRange ({}, {}) /* {} - {} */\n", thr.ip, start, end, (start : char), (end : char)) if !within(re, str) || start > str[re.strp] || end < str[re.strp] die(re, thr, "bad range") else hit(re, thr) thr.ip++ ;; /* Non-consuming. All of these return false, and expect step to be called again until exactly one byte is consumed from the string. */ | `Ibol: trace(re, thr, "\t{}:\tBol\n", thr.ip) if re.strp == 0 || str[re.strp - 1] == ('\n' : byte) hit(re, thr) thr.ip++ -> false else die(re, thr, "not beginning of line") ;; | `Ieol: trace(re, thr, "\t{}:\tEol\n", thr.ip) if re.strp == str.len || str[re.strp] == ('\n' : byte) hit(re, thr) thr.ip++ -> false else die(re, thr, "not end of line") ;; /* check for word characters */ | `Ibow: trace(re, thr, "\t{}:\tBow\n", thr.ip) if iswordchar(str[re.strp:]) && (re.strp == 0 || !iswordchar(prevchar(str, re.strp))) hit(re, thr) thr.ip++ -> false else die(re, thr, "not beginning of word") ;; | `Ieow: trace(re, thr, "\t{}:\tEow\n", thr.ip) if re.strp == str.len && iswordchar(prevchar(str, re.strp)) hit(re, thr) thr.ip++ -> false elif re.strp > 0 && !iswordchar(str[re.strp:]) && iswordchar(prevchar(str, re.strp)) hit(re, thr) thr.ip++ -> false else die(re, thr, "not end of word") ;; | `Ilbra m: trace(re, thr, "\t{}:\tLbra {}\n", thr.ip, m) trace(re, thr, "\t\tmatch start = {}\n", re.strp) thr.mstart[m] = re.strp hit(re, thr) thr.ip++ -> false | `Irbra m: trace(re, thr, "\t{}:\tRbra {}\n", thr.ip, m) thr.mend[m] = re.strp hit(re, thr) thr.ip++ -> false | `Ifork (lip, rip): trace(re, thr, "\t{}:\tFork ({}, {})\n", thr.ip, lip, rip) mstart = std.sldup(thr.mstart) mend = std.sldup(thr.mend) hit(re, thr) fork(re, rip, curip, mstart, mend) if re.debug std.slpush(&re.traces, std.bsdup(re.traces[thr.tid])) ;; thr.ip = lip -> false | `Ijmp ip: trace(re, thr, "\t{}:\tJmp {}\n", thr.ip, ip) hit(re, thr) thr.ip = ip -> false | `Imatch id: trace(re, thr, "\t{}:\tMatch\n", thr.ip) re.lastthr = thr.tid finish(re, thr) -> true ;; -> true } const fork = {re, ip, curip, mstart, mend var thr if ip == curip /* loop detection */ -> void ;; thr = mkthread(re, ip) thr.next = re.runq thr.mstart = mstart thr.mend = mend re.runq = thr } const die = {re, thr, msg /* we can have die called on a thread multiple times, eg, if it has a bad range *and* end in a state that another thread is in. We should only decrement the number of threads for that once. */ trace(re, thr, "\t\tdie {}: {}\n", thr.tid, msg) if !thr.dead re.nthr-- ;; re.lastip = thr.ip re.lastthr = thr.tid thr.dead = true } const finish = {re, thr trace(re, thr, "finish {}\n", thr.tid) thr.matched = true re.nthr-- } const mkthread = {re, ip var thr : rethread# thr = std.alloc() thr.next = Zthr thr.ip = ip thr.tid = re.nexttid++ thr.dead = false thr.matched = false thr.mstart = [][:] thr.mend = [][:] re.nthr++ -> thr } const thrfree = {re, thr trace(re, thr, "\t\tcleanup {}\n", thr.tid) std.slfree(thr.mstart) std.slfree(thr.mend) std.free(thr) } const within = {re, str -> re.strp < str.len } const trace : (re : regex#, thr : rethread#, msg : byte[:], args : ... -> void) = {re, thr, msg, args var ap if re.trace && thr != Zthr ap = std.vastart(&args) std.putv(msg, &ap) std.put("\t{}\n", re.pat) std.put("\t") for var i = 0; i < re.pcidx[thr.ip] - 1; i++ std.put(" ") ;; std.put("^\n") ;; } const hit = {re, thr if re.debug std.bsput(re.traces[thr.tid], thr.ip) ;; } /* must be called with i >= 1 */ const prevchar = {s, i std.assert(i != 0, "prevchar must be called with i >= 1\n") i-- while i != 0 && s[i] >= 0x80 i-- ;; -> s[i:] } const iswordchar = {s var c c = std.decode(s) -> std.isalpha(c) || std.isdigit(c) || c == '_' }