ref: 25bfea9b4ab857720f4d184fb2f7e1c42c3fdae5
dir: /compile.myr/
use std
use "types.use"
pkg regex =
const compile : (re : byte[:] -> std.error(regex#, status))
;;
type tree = union
/* basic string building */
`Alt [tree#, tree#]
`Cat [tree#, tree#]
/* repetition */
`Star tree#
`Plus tree#
`Quest tree#
/* end matches */
`Byte byte
`Chr char
`Class [char, char]
/* meta */
`Cap tree#
`Bol /* beginning of line */
`Eol /* end of line */
;;
type parseresult = union
`Some tree#
`None
`Fail status
;;
const compile = {pat
var re : regex#
re = std.zalloc()
re.pat = pat
re.nmatch = 1 /* whole match */
match parse(re)
`None: -> `std.Failure (`Earlystop);;
`Fail f: -> `std.Failure f;;
`Some t:
dump(t, 0)
append(re, `Ilbra 0)
gen(re, t)
append(re, `Irbra 0)
append(re, `Imatch)
idump(re)
-> `std.Success re
;;
;;
-> `std.Failure (`Noimpl)
}
const gen = {re, t
var m
match t#
`Alt (a, b): genalt(re, a, b);;
`Cat (a, b): gen(re, a); gen(re, b);;
/* repetition */
`Star a: genstar(re, a);;
`Plus a: gen(re, a); genstar(re, a);;
`Quest a: genquest(re, a);;
/* end matches */
`Byte b: append(re, `Ibyte b);;
`Chr c: genchar(re, c);;
`Class (a, b): genrange(re, a, b);;
/* meta */
`Bol:
append(re, `Ibol)
;;
`Eol:
append(re, `Ibol)
;;
`Cap a:
m = re.nmatch++
append(re, `Ilbra m)
gen(re, a)
append(re, `Irbra m)
;;
;;
-> re.proglen
}
const genrange = {re, lo, hi
var charrng = [
0,
0x80,
0x800,
0x10000,
0x200000,
-1
]
var lbuf : byte[4]
var hbuf : byte[4]
var lsz
var hsz
var end
var sz
var d
var i
var j
lsz = std.charlen(lo)
hsz = std.charlen(hi)
charrng[lsz - 1] = lo
charrng[hsz] = hi
if lsz == 1 && hsz == 1
append(re, `Irange (lo castto(byte), hi castto(byte)))
else
for i = hsz; i > lsz; i--
std.put("i = %z\n", i - 2)
d = re.proglen + i - 1
append(re, `Ifork (re.proglen + 1, jmpdist(i) + d))
;;
end = re.proglen + jmpdist(hsz + 1);
for i = 0; i < hsz; i++
std.put("lo[%z] = %i\n", i, charrng[i] castto(int))
std.put("hi[%z] = %i\n", i, (charrng[i + 1] - 1) castto(int))
sz = std.encode(lbuf[:], charrng[i])
std.encode(hbuf[:], charrng[i + 1] - 1)
for j = 0; j < sz; j++
append(re, `Irange (lbuf[j], hbuf[j]))
;;
append(re, `Ijmp (end))
;;
;;
-> re.proglen
}
const jmpdist = {n
var d
var i
d = n - 1
for i = n - 1; i > 0; i--
d += i
;;
-> d
}
const genalt = {re, l, r
var alt
var jmp
var l0
var l1
var l2
alt = re.proglen
l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
gen(re, l)
jmp = re.proglen
l1 = append(re, `Ijmp -1) /* needs to be replaced */
l2 = gen(re, r)
re.prog[alt] = `Ifork(l0, l1)
re.prog[jmp] = `Ijmp l2
-> re.proglen
}
const genstar = {re, rep
var alt
var jmp
var l0
var l1
var l2
l0 = re.proglen
alt = re.proglen
l1 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
jmp = gen(re, rep)
l2 = append(re, `Ijmp -1)
re.prog[alt] = `Ifork (l1, l2)
re.prog[jmp] = `Ijmp l0
-> re.proglen
}
const genquest = {re, q
var alt
var l0
var l1
alt = re.proglen
l0 = append(re, `Ifork (-1, -1)) /* needs to be replaced */
l1 = gen(re, q)
re.prog[alt] = `Ifork (l0, l1)
-> re.proglen
}
const genchar = {re, c
var b : byte[4]
var n
var i
n = std.encode(b[:], c)
for i = 0; i < n; i++
append(re, `Ibyte b[i])
;;
-> re.proglen
}
const append = {re, insn
if re.proglen == re.prog.len
re.prog = std.slgrow(re.prog, std.max(1, 2*re.proglen))
;;
re.prog[re.proglen] = insn
re.proglen++
-> re.proglen
}
const idump = {re
var i
for i = 0; i < re.proglen; i++
std.put("%i:\t", i)
match re.prog[i]
/* Char matching. Consume exactly one byte from the string. */
`Ibyte b: std.put("`Ibyte %b (%c)\n", b, b castto(char)) ;;
`Irange (start, end): std.put("`Irange (%b,%b)\n", start, end) ;;
`Idot: std.put("`Idot\n") ;;
/* capture groups */
`Ilbra m: std.put("`Ilbra %z\n", m) ;;
`Irbra m: std.put("`Irbra %z\n", m) ;;
/* anchors */
`Ibol: std.put("`Ibol\n");;
`Ieol: std.put("`Ieol\n");;
/* control flow */
`Ifork (lip, rip): std.put("`Ifork (%z,%z)\n", lip, rip) ;;
`Ijmp ip: std.put("`Ijmp %z\n", ip) ;;
`Imatch: std.put("`Imatch\n") ;;
;;
;;
}
const dump = {t, indent
var i
for i = 0; i < indent; i++
std.put(" ")
;;
match t#
`Alt (a, b):
std.put("Alt\n")
dump(a, indent + 1)
dump(b, indent + 1)
;;
`Cat (a, b):
std.put("Cat\n")
dump(a, indent + 1)
dump(b, indent + 1)
;;
/* repetition */
`Star a:
std.put("Star\n")
dump(a, indent + 1)
;;
`Plus a:
std.put("Plus\n")
dump(a, indent + 1)
;;
`Quest a:
std.put("Quest\n")
dump(a, indent + 1)
;;
`Bol:
std.put("Bol\n")
;;
`Eol:
std.put("Eol\n")
;;
/* end matches */
`Byte b:
std.put("Byte %b\n", b)
;;
`Chr c:
std.put("Char %c\n", c)
;;
`Class (a, b):
std.put("Class (%c-%c)\n", a, b)
;;
/* meta */
`Cap a:
std.put("Cap\n")
dump(a, indent + 1)
;;
;;
}
const parse = {re
match altexpr(re)
`Some t:
if re.pat.len == 0
-> `Some t
else
free(t)
-> `Fail (`Earlystop)
;;
;;
`None:
-> `None
;;
;;
}
const altexpr = {re
var ret : tree#
match catexpr(re)
`Some t:
ret = t
if matchc(re, '|')
match altexpr(re)
`Some rhs:
ret = mk(`Alt (ret, rhs))
;;
`None:
free(ret)
-> `Fail (`Earlystop)
;;
`Fail f:
-> `Fail f
;;
;;
;;
;;
other: -> other;;
;;
-> `Some ret
}
const catexpr = {re
var ret
match repexpr(re)
`Some t:
ret = t
match catexpr(re)
`Some rhs:
ret = mk(`Cat (t, rhs))
;;
`Fail f: -> `Fail f;;
`None: /* nothing */;;
;;
;;
other: -> other;;
;;
-> `Some ret
}
const repexpr = {re
var ret
match baseexpr(re)
`Some t:
if matchc(re, '*')
ret = mk(`Star t)
elif matchc(re, '+')
ret = mk(`Plus t)
elif matchc(re, '?')
ret = mk(`Quest t)
else
ret = t
;;
;;
other: -> other;;
;;
-> `Some ret
}
const baseexpr = {re
var ret
if re.pat.len == 0
-> `None
;;
match peekc(re)
/* lower prec operators */
'|': -> `None;;
')': -> `None;;
'*': -> `Fail (`Badrep);;
'+': -> `Fail (`Badrep);;
'?': -> `Fail (`Badrep);;
'[': -> chrclass(re);;
'.': getc(re); ret = mk(`Class (0, std.Maxcharval));;
'^': getc(re); ret = mk(`Bol);;
'$': getc(re); ret = mk(`Eol);;
'(':
getc(re)
match altexpr(re)
`Some s: ret = mk(`Cap s);;
`None: -> `Fail (`Emptyparen);;
;;
if !matchc(re, ')')
free(ret)
-> `Fail (`Unbalanced)
;;
;;
c:
getc(re)
if c == '\\'
if re.pat.len == 0
-> `Fail (`Earlystop)
;;
c = getc(re)
;;
ret = mk(`Chr c)
;;
;;
dump(ret, 0)
-> `Some ret
}
const chrclass = {re
var t
matchc(re, '[')
if !matchc(re, ']')
-> `Fail (`Earlystop)
;;
t = std.alloc()
t# = `Class (0, 1)
-> `Some t
}
const matchc = {re, c
var str
var chr
(chr, str) = std.striter(re.pat)
if chr != c
-> false
;;
re.pat = str
-> true
}
const getc = {re
var c
(c, re.pat) = std.striter(re.pat)
-> c
}
const peekc = {re
var c
var _
(c, _) = std.striter(re.pat)
-> c
}
const mk = {v
var t
t = std.alloc()
t# = v
-> t
}
const free = {t
match t#
`Alt (a, b): free(a); free(b);;
`Cat (a, b): free(a); free(b);;
/* repetition */
`Star a: free(a);;
`Plus a: free(a);;
`Quest a: free(a);;
/* end matches */
`Byte b: ;;
`Chr c: ;;
`Class (a, b): ;;
/* meta */
`Cap a: free(a);;
;;
std.free(t)
}