ref: cefdbe00dfad4086e2f3ba7cd0007d729e77e137
parent: 830f28c844022a71f6c7ad1caf1bcfb7ca9397dc
author: Ori Bernstein <ori@eigenstate.org>
date: Wed Jun 8 08:09:30 EDT 2016
Add matching that returns indexes.
--- a/lib/regex/interp.myr
+++ b/lib/regex/interp.myr
@@ -3,12 +3,20 @@
use "types"
pkg regex =
+ /* regex execution */
const exec : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
const search : (re : regex#, str : byte[:] -> std.option(byte[:][:]))
+
+ /*
+  regex execution returning indexes: on a match the result holds one
+  (start, end) pair per capture slot of the regex; a slot whose start
+  or end was never recorded is reported as (-1, -1). `std.None means
+  that no thread matched.
+  */
+ const iexec : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:]))
+ const isearch : (re : regex#, str : byte[:] -> std.option((std.size, std.size)[:]))
+
+ /* substitution */
const sub : (re : regex#, str : byte[:], subst : byte[:][:] -> std.option(byte[:]))
const sbsub : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> bool)
const suball : (re : regex#, str : byte[:], subst : byte[:][:] -> byte[:])
const sbsuball : (sb : std.strbuf#, re : regex#, str : byte[:], subst : byte[:][:] -> void)
+
const matchfree : (pat : byte[:][:] -> void)
;;
@@ -16,17 +24,23 @@
const Zthr = (0 : rethread#)
+/*
+ runs `re` over `str` with run()'s wholestr flag set, converts the
+ winning thread (or Zthr, if none) into the getmatches() result, and
+ calls cleanup(re) before returning that result.
+ */
const exec = {re, str
- var thr
- var m
+ var thr, m
- re.str = str
- re.strp = 0
- thr = run(re, true)
+ thr = run(re, str, 0, true)
m = getmatches(re, thr)
cleanup(re)
-> m
}
+/*
+ index-returning counterpart of exec: runs `re` over `str` with the
+ wholestr flag set and reports the result of getidxmatches() for the
+ thread that run() handed back. cleanup(re) is called after the
+ indexes have been collected.
+ */
+const iexec = {re, str
+	var found
+
+	found = getidxmatches(re, run(re, str, 0, true))
+	cleanup(re)
+	-> found
+}
+
const search = {re, str
var thr
var m
@@ -33,9 +47,7 @@
m = `std.None
for var i = 0; i < str.len; i++
- re.str = str[i:]
- re.strp = 0
- thr = run(re, false)
+ thr = run(re, str[i:], 0, false)
m = getmatches(re, thr)
match m
| `std.Some _: break
@@ -46,6 +58,23 @@
-> m
}
+/*
+ index-returning counterpart of search: tries a match at each start
+ offset `i` of `str` in turn and stops at the first offset where
+ run() yields a matching thread.
+
+ Each attempt runs over the suffix str[i:], so the offsets that
+ getidxmatches() reports are relative to that suffix. They are
+ rebased by `i` here so that the returned (start, end) pairs index
+ into the `str` the caller passed in. (-1, -1) pairs are sentinels
+ for slots without a recorded start or end, and are left untouched.
+
+ Returns `std.None if no offset produced a match.
+ */
+const isearch = {re, str
+	var thr
+	var m
+	var lo, hi
+
+	m = `std.None
+	for var i = 0; i < str.len; i++
+		thr = run(re, str[i:], 0, false)
+		m = getidxmatches(re, thr)
+		match m
+		| `std.Some idx:
+			/* idx shares its backing store with the slice held in m */
+			for var j = 0; j < idx.len; j++
+				(lo, hi) = idx[j]
+				if lo != -1 && hi != -1
+					idx[j] = (lo + i, hi + i)
+				;;
+			;;
+			break
+		| `std.None:	/* nothing */
+		;;
+		cleanup(re)
+	;;
+	-> m
+}
+
const sub = {re, str, subst
var sb
@@ -65,9 +94,7 @@
-> false
;;
- re.str = str
- re.strp = 0
- thr = run(re, true)
+ thr = run(re, str, 0, true)
if thr == Zthr
m = false
else
@@ -95,9 +122,7 @@
i = 0
while i < str.len
- re.str = str[i:]
- re.strp = 0
- thr = run(re, false)
+ thr = run(re, str[i:], 0, false)
if thr == Zthr
std.sbputb(sb, str[i])
i++
@@ -164,14 +189,34 @@
-> `std.Some ret
}
+/*
+ turns a thread returned by run() into a slice of (start, end) index
+ pairs, one per re.nmatch slot. A slot is reported as (-1, -1) when
+ either its start or its end is -1. The thread is released with
+ thrfree() before returning. Yields `std.None when thr is Zthr.
+ */
+const getidxmatches = {re, thr
+	var spans
+
+	if thr == Zthr
+		-> `std.None
+	;;
+
+	spans = std.slalloc(re.nmatch)
+	for var k = 0; k < re.nmatch; k++
+		if thr.mstart[k] == -1 || thr.mend[k] == -1
+			spans[k] = (-1, -1)
+		else
+			spans[k] = (thr.mstart[k], thr.mend[k])
+		;;
+	;;
+	thrfree(re, thr)
+	-> `std.Some spans
+}
+
/* returns a matching thread, or Zthr if no threads matched */
-const run = {re, wholestr
+const run = {re, str, idx, wholestr
var bestmatch
var consumed
var states
var thr
var ip
+
+ re.str = str
+ re.strp = 0
bestmatch = Zthr
states = std.mkbs()
--- a/lib/regex/test/bld.sub
+++ b/lib/regex/test/bld.sub
@@ -5,6 +5,15 @@
lib @/lib/sys:sys
lib @/lib/regex:regex
;;
+
+# test for the index-returning match entry points; idxmatch.myr drives
+# them through the testidxmatch helper from testmatch.myr
+test idxmatch =
+ idxmatch.myr
+ testmatch.myr
+ lib @/lib/std:std
+ lib @/lib/sys:sys
+ lib @/lib/regex:regex
+;;
+
test boundaries =
boundaries.myr
testmatch.myr
--- /dev/null
+++ b/lib/regex/test/idxmatch.myr
@@ -1,0 +1,48 @@
+use std
+
+use "testmatch"
+
+/*
+ checks the (start, end) index pairs reported for a handful of
+ patterns, via testidxmatch.
+ */
+const main = {
+	var longrun : byte[:]
+
+	/* twelve chunks of 'a' joined without a separator; the expectation below assumes 408 bytes */
+	longrun = std.strjoin([
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
+	][:], "")
+
+	/* slot 0 is the whole match; later slots are the capture groups */
+	testidxmatch(".*bc", "Abc", `std.Some [(0, 3)][:])
+	testidxmatch("(a*)*", "a", `std.Some [(0,1), (0, 1)][:])
+	testidxmatch("(aa|aab?)*", longrun, `std.Some [(0, 408), (406, 408)][:])
+
+	/* greedy matches */
+	testidxmatch("(<.*>).*", "<a foo> blah <bar>", `std.Some [(0, 18), (0, 18)][:])
+	testidxmatch("(<.+>).*", "<a foo> blah <bar>", `std.Some [(0, 18), (0, 18)][:])
+
+	/* reluctant matches */
+	testidxmatch("(<.*?>).*", "<a foo> blah <bar>", `std.Some [(0, 18), (0, 7)][:])
+	testidxmatch("(<.+?>).*", "<a foo> blah <bar>", `std.Some [(0, 18), (0, 7)][:])
+	testidxmatch(".*(<b.+?>).*", "<a foo> blah <bar>", `std.Some [(0, 18), (13, 18)][:])
+}
--- a/lib/regex/test/testmatch.myr
+++ b/lib/regex/test/testmatch.myr
@@ -8,6 +8,12 @@
expected : std.option(byte[:][:]) \
-> void)
+ /*
+  compiles `pat`, runs it over `text` with regex.iexec, and aborts
+  through std.fatal unless the resulting (start, end) pairs agree
+  with `expected`.
+  */
+ const testidxmatch : (\
+ pat : byte[:], \
+ text : byte[:], \
+ expected : std.option((std.size, std.size)[:]) \
+ -> void)
+
const testsearch : ( \
pat : byte[:], \
text : byte[:], \
@@ -39,6 +45,10 @@
run(regex.compile(pat), pat, text, expected, false)
}
+/*
+ compiles `pat` and hands the compile result to runidx with the search
+ flag off, so the regex.iexec path is the one being checked.
+ */
+const testidxmatch = {pat, text, expected
+	var compiled
+
+	compiled = regex.compile(pat)
+	runidx(compiled, pat, text, expected, false)
+}
+
const testsearch = {pat, text, expected
run(regex.compile(pat), pat, text, expected, true)
}
@@ -84,6 +94,51 @@
;;
}
+/*
+ shared driver for the index-match tests.
+
+ regex:    the result of regex.compile; unwrapped with std.try.
+ pat/text: the pattern source and the subject, used for matching and
+           for the failure messages.
+ expected: the (start, end) pairs that must come back, or `std.None
+           if the pattern is expected not to match.
+ search:   selects regex.isearch when true, regex.iexec otherwise.
+
+ Any disagreement between the actual and the expected result is
+ printed with std.put and then ends the test through std.fatal.
+ */
+const runidx = {regex, pat, text, expected : std.option((std.size, std.size)[:]), search
+	var re, r
+	var lo, elo, hi, ehi
+
+	re = std.try(regex)
+	if search
+		r = regex.isearch(re, text)
+	else
+		r = regex.iexec(re, text)
+	;;
+	match r
+	| `std.Some res:
+		match expected
+		| `std.None:
+			/*
+			 list the unexpected matches first: std.fatal ends the
+			 program, so nothing placed after it would be printed.
+			 */
+			std.put("expected no match, got:\n")
+			for var i = 0; i < res.len; i++
+				std.put("\t{}: {}\n", i, res[i])
+			;;
+			std.fatal("failed matching {} over {}\n", pat, text)
+		| `std.Some exp:
+			if res.len != exp.len
+				std.put("mismatch: expected {} matches, got {}\n", exp.len, res.len)
+				std.fatal("failed matching {} over {}\n", pat, text)
+			;;
+			for var i = 0; i < exp.len; i++
+				(elo, ehi) = exp[i]
+				(lo, hi) = res[i]
+				if lo != elo || hi != ehi
+					std.put("mismatch on {}: expected {}, got {}\n", i, exp[i], res[i])
+					std.fatal("failed matching {} over {}\n", pat, text)
+				;;
+			;;
+		;;
+		/* the index slice was allocated by the regex library for us */
+		std.slfree(res)
+	| `std.None:
+		match expected
+		| `std.None:	/* expected failure */
+		| `std.Some matches:
+			std.put("expected matches:\n")
+			for var i = 0; i < matches.len; i++
+				std.put("\t{}: {}\n", i, matches[i])
+			;;
+			std.fatal("no match found\n")
+		;;
+	;;
+	regex.free(re)
+}
const run = {regex, pat, text, expected, search
var i, re, r