shithub: libmujs

Download patch

ref: eed76191580004d5461bdf8accd8ca23e8541b57
parent: c1ad1ba1e482e7d01743e3f4f9517572bebf99ac
author: Tor Andersson <tor@ccxvii.net>
date: Fri Aug 14 08:05:09 EDT 2015

Rename our internal regex.h to not collide with system regex.h.

--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-SRCS := $(wildcard js*.c utf*.c regex.c)
-HDRS := $(wildcard js*.h mujs.h utf.h regex.h)
+SRCS := $(wildcard js*.c utf*.c regexp.c)
+HDRS := $(wildcard js*.h mujs.h utf.h regexp.h)
 OBJS := $(SRCS:%.c=build/%.o)
 
 prefix ?= /usr/local
--- a/jsgc.c
+++ b/jsgc.c
@@ -3,7 +3,7 @@
 #include "jsvalue.h"
 #include "jsrun.h"
 
-#include "regex.h"
+#include "regexp.h"
 
 static void jsG_markobject(js_State *J, int mark, js_Object *obj);
 
--- a/jsregexp.c
+++ b/jsregexp.c
@@ -1,7 +1,7 @@
 #include "jsi.h"
 #include "jsvalue.h"
 #include "jsbuiltin.h"
-#include "regex.h"
+#include "regexp.h"
 
 void js_newregexp(js_State *J, const char *pattern, int flags)
 {
--- a/jsstring.c
+++ b/jsstring.c
@@ -2,7 +2,7 @@
 #include "jsvalue.h"
 #include "jsbuiltin.h"
 #include "utf.h"
-#include "regex.h"
+#include "regexp.h"
 
 int js_runeat(js_State *J, const char *s, int i)
 {
--- a/one.c
+++ b/one.c
@@ -21,6 +21,6 @@
 #include "jsstate.c"
 #include "jsstring.c"
 #include "jsvalue.c"
-#include "regex.c"
+#include "regexp.c"
 #include "utf.c"
 #include "utftype.c"
--- a/regex.c
+++ /dev/null
@@ -1,1143 +1,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <setjmp.h>
-#include <limits.h>
-
-#include "regex.h"
-#include "utf.h"
-
-#define emit regemit
-#define next regnext
-#define accept regaccept
-
-#define nelem(a) (sizeof (a) / sizeof (a)[0])
-
-#define REPINF 255
-#define MAXTHREAD 1000
-#define MAXSUB REG_MAXSUB
-
-typedef struct Reclass Reclass;
-typedef struct Renode Renode;
-typedef struct Reinst Reinst;
-typedef struct Rethread Rethread;
-
-struct Reclass {
-	Rune *end;
-	Rune spans[64];
-};
-
-struct Reprog {
-	Reinst *start, *end;
-	int flags;
-	unsigned int nsub;
-	Reclass cclass[16];
-};
-
-struct cstate {
-	Reprog *prog;
-	Renode *pstart, *pend;
-
-	const char *source;
-	unsigned int ncclass;
-	unsigned int nsub;
-	Renode *sub[MAXSUB];
-
-	int lookahead;
-	Rune yychar;
-	Reclass *yycc;
-	int yymin, yymax;
-
-	const char *error;
-	jmp_buf kaboom;
-};
-
-static void die(struct cstate *g, const char *message)
-{
-	g->error = message;
-	longjmp(g->kaboom, 1);
-}
-
-static int canon(Rune c)
-{
-	Rune u = toupperrune(c);
-	if (c >= 128 && u < 128)
-		return c;
-	return u;
-}
-
-/* Scan */
-
-enum {
-	L_CHAR = 256,
-	L_CCLASS,	/* character class */
-	L_NCCLASS,	/* negative character class */
-	L_NC,		/* "(?:" no capture */
-	L_PLA,		/* "(?=" positive lookahead */
-	L_NLA,		/* "(?!" negative lookahead */
-	L_WORD,		/* "\b" word boundary */
-	L_NWORD,	/* "\B" non-word boundary */
-	L_REF,		/* "\1" back-reference */
-	L_COUNT,	/* {M,N} */
-};
-
-static int hex(struct cstate *g, int c)
-{
-	if (c >= '0' && c <= '9') return c - '0';
-	if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
-	if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
-	die(g, "invalid escape sequence");
-	return 0;
-}
-
-static int dec(struct cstate *g, int c)
-{
-	if (c >= '0' && c <= '9') return c - '0';
-	die(g, "invalid quantifier");
-	return 0;
-}
-
-#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
-
-static int isunicodeletter(int c)
-{
-	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
-}
-
-static int nextrune(struct cstate *g)
-{
-	g->source += chartorune(&g->yychar, g->source);
-	if (g->yychar == '\\') {
-		g->source += chartorune(&g->yychar, g->source);
-		switch (g->yychar) {
-		case 0: die(g, "unterminated escape sequence");
-		case 'f': g->yychar = '\f'; return 0;
-		case 'n': g->yychar = '\n'; return 0;
-		case 'r': g->yychar = '\r'; return 0;
-		case 't': g->yychar = '\t'; return 0;
-		case 'v': g->yychar = '\v'; return 0;
-		case 'c':
-			g->yychar = (*g->source++) & 31;
-			return 0;
-		case 'x':
-			g->yychar = hex(g, *g->source++) << 4;
-			g->yychar += hex(g, *g->source++);
-			if (g->yychar == 0) {
-				g->yychar = '0';
-				return 1;
-			}
-			return 0;
-		case 'u':
-			g->yychar = hex(g, *g->source++) << 12;
-			g->yychar += hex(g, *g->source++) << 8;
-			g->yychar += hex(g, *g->source++) << 4;
-			g->yychar += hex(g, *g->source++);
-			if (g->yychar == 0) {
-				g->yychar = '0';
-				return 1;
-			}
-			return 0;
-		}
-		if (strchr(ESCAPES, g->yychar))
-			return 1;
-		if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
-			die(g, "invalid escape character");
-		return 0;
-	}
-	return 0;
-}
-
-static int lexcount(struct cstate *g)
-{
-	g->yychar = *g->source++;
-
-	g->yymin = dec(g, g->yychar);
-	g->yychar = *g->source++;
-	while (g->yychar != ',' && g->yychar != '}') {
-		g->yymin = g->yymin * 10 + dec(g, g->yychar);
-		g->yychar = *g->source++;
-	}
-	if (g->yymin >= REPINF)
-		die(g, "numeric overflow");
-
-	if (g->yychar == ',') {
-		g->yychar = *g->source++;
-		if (g->yychar == '}') {
-			g->yymax = REPINF;
-		} else {
-			g->yymax = dec(g, g->yychar);
-			g->yychar = *g->source++;
-			while (g->yychar != '}') {
-				g->yymax = g->yymax * 10 + dec(g, g->yychar);
-				g->yychar = *g->source++;
-			}
-			if (g->yymax >= REPINF)
-				die(g, "numeric overflow");
-		}
-	} else {
-		g->yymax = g->yymin;
-	}
-
-	return L_COUNT;
-}
-
-static void newcclass(struct cstate *g)
-{
-	if (g->ncclass >= nelem(g->prog->cclass))
-		die(g, "too many character classes");
-	g->yycc = g->prog->cclass + g->ncclass++;
-	g->yycc->end = g->yycc->spans;
-}
-
-static void addrange(struct cstate *g, Rune a, Rune b)
-{
-	if (a > b)
-		die(g, "invalid character class range");
-	if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
-		die(g, "too many character class ranges");
-	*g->yycc->end++ = a;
-	*g->yycc->end++ = b;
-}
-
-static void addranges_d(struct cstate *g)
-{
-	addrange(g, '0', '9');
-}
-
-static void addranges_D(struct cstate *g)
-{
-	addrange(g, 0, '0'-1);
-	addrange(g, '9'+1, 0xFFFF);
-}
-
-static void addranges_s(struct cstate *g)
-{
-	addrange(g, 0x9, 0x9);
-	addrange(g, 0xA, 0xD);
-	addrange(g, 0x20, 0x20);
-	addrange(g, 0xA0, 0xA0);
-	addrange(g, 0x2028, 0x2029);
-	addrange(g, 0xFEFF, 0xFEFF);
-}
-
-static void addranges_S(struct cstate *g)
-{
-	addrange(g, 0, 0x9-1);
-	addrange(g, 0x9+1, 0xA-1);
-	addrange(g, 0xD+1, 0x20-1);
-	addrange(g, 0x20+1, 0xA0-1);
-	addrange(g, 0xA0+1, 0x2028-1);
-	addrange(g, 0x2029+1, 0xFEFF-1);
-	addrange(g, 0xFEFF+1, 0xFFFF);
-}
-
-static void addranges_w(struct cstate *g)
-{
-	addrange(g, '0', '9');
-	addrange(g, 'A', 'Z');
-	addrange(g, '_', '_');
-	addrange(g, 'a', 'z');
-}
-
-static void addranges_W(struct cstate *g)
-{
-	addrange(g, 0, '0'-1);
-	addrange(g, '9'+1, 'A'-1);
-	addrange(g, 'Z'+1, '_'-1);
-	addrange(g, '_'+1, 'a'-1);
-	addrange(g, 'z'+1, 0xFFFF);
-}
-
-static int lexclass(struct cstate *g)
-{
-	int type = L_CCLASS;
-	int quoted, havesave, havedash;
-	Rune save;
-
-	newcclass(g);
-
-	quoted = nextrune(g);
-	if (!quoted && g->yychar == '^') {
-		type = L_NCCLASS;
-		quoted = nextrune(g);
-	}
-
-	havesave = havedash = 0;
-	for (;;) {
-		if (g->yychar == 0)
-			die(g, "unterminated character class");
-		if (!quoted && g->yychar == ']')
-			break;
-
-		if (!quoted && g->yychar == '-') {
-			if (havesave) {
-				if (havedash) {
-					addrange(g, save, '-');
-					havesave = havedash = 0;
-				} else {
-					havedash = 1;
-				}
-			} else {
-				save = '-';
-				havesave = 1;
-			}
-		} else if (quoted && strchr("DSWdsw", g->yychar)) {
-			if (havesave) {
-				addrange(g, save, save);
-				if (havedash)
-					addrange(g, '-', '-');
-			}
-			switch (g->yychar) {
-			case 'd': addranges_d(g); break;
-			case 's': addranges_s(g); break;
-			case 'w': addranges_w(g); break;
-			case 'D': addranges_D(g); break;
-			case 'S': addranges_S(g); break;
-			case 'W': addranges_W(g); break;
-			}
-			havesave = havedash = 0;
-		} else {
-			if (quoted) {
-				if (g->yychar == 'b')
-					g->yychar = '\b';
-				else if (g->yychar == '0')
-					g->yychar = 0;
-				/* else identity escape */
-			}
-			if (havesave) {
-				if (havedash) {
-					addrange(g, save, g->yychar);
-					havesave = havedash = 0;
-				} else {
-					addrange(g, save, save);
-					save = g->yychar;
-				}
-			} else {
-				save = g->yychar;
-				havesave = 1;
-			}
-		}
-
-		quoted = nextrune(g);
-	}
-
-	if (havesave) {
-		addrange(g, save, save);
-		if (havedash)
-			addrange(g, '-', '-');
-	}
-
-	return type;
-}
-
-static int lex(struct cstate *g)
-{
-	int quoted = nextrune(g);
-	if (quoted) {
-		switch (g->yychar) {
-		case 'b': return L_WORD;
-		case 'B': return L_NWORD;
-		case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
-		case 's': newcclass(g); addranges_s(g); return L_CCLASS;
-		case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
-		case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
-		case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
-		case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
-		case '0': g->yychar = 0; return L_CHAR;
-		}
-		if (g->yychar >= '0' && g->yychar <= '9') {
-			g->yychar -= '0';
-			if (*g->source >= '0' && *g->source <= '9')
-				g->yychar = g->yychar * 10 + *g->source++ - '0';
-			return L_REF;
-		}
-		return L_CHAR;
-	}
-
-	switch (g->yychar) {
-	case 0:
-	case '$': case ')': case '*': case '+':
-	case '.': case '?': case '^': case '|':
-		return g->yychar;
-	}
-
-	if (g->yychar == '{')
-		return lexcount(g);
-	if (g->yychar == '[')
-		return lexclass(g);
-	if (g->yychar == '(') {
-		if (g->source[0] == '?') {
-			if (g->source[1] == ':') {
-				g->source += 2;
-				return L_NC;
-			}
-			if (g->source[1] == '=') {
-				g->source += 2;
-				return L_PLA;
-			}
-			if (g->source[1] == '!') {
-				g->source += 2;
-				return L_NLA;
-			}
-		}
-		return '(';
-	}
-
-	return L_CHAR;
-}
-
-/* Parse */
-
-enum {
-	P_CAT, P_ALT, P_REP,
-	P_BOL, P_EOL, P_WORD, P_NWORD,
-	P_PAR, P_PLA, P_NLA,
-	P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
-	P_REF,
-};
-
-struct Renode {
-	unsigned char type;
-	unsigned char ng, m, n;
-	Rune c;
-	Reclass *cc;
-	Renode *x;
-	Renode *y;
-};
-
-static Renode *newnode(struct cstate *g, int type)
-{
-	Renode *node = g->pend++;
-	node->type = type;
-	node->cc = NULL;
-	node->c = 0;
-	node->ng = 0;
-	node->m = 0;
-	node->n = 0;
-	node->x = node->y = NULL;
-	return node;
-}
-
-static int empty(Renode *node)
-{
-	if (!node) return 1;
-	switch (node->type) {
-	default: return 1;
-	case P_CAT: return empty(node->x) && empty(node->y);
-	case P_ALT: return empty(node->x) || empty(node->y);
-	case P_REP: return empty(node->x) || node->m == 0;
-	case P_PAR: return empty(node->x);
-	case P_REF: return empty(node->x);
-	case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
-	}
-}
-
-static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
-{
-	Renode *rep = newnode(g, P_REP);
-	if (max == REPINF && empty(atom))
-		die(g, "infinite loop matching the empty string");
-	rep->ng = ng;
-	rep->m = min;
-	rep->n = max;
-	rep->x = atom;
-	return rep;
-}
-
-static void next(struct cstate *g)
-{
-	g->lookahead = lex(g);
-}
-
-static int accept(struct cstate *g, int t)
-{
-	if (g->lookahead == t) {
-		next(g);
-		return 1;
-	}
-	return 0;
-}
-
-static Renode *parsealt(struct cstate *g);
-
-static Renode *parseatom(struct cstate *g)
-{
-	Renode *atom;
-	if (g->lookahead == L_CHAR) {
-		atom = newnode(g, P_CHAR);
-		atom->c = g->yychar;
-		next(g);
-		return atom;
-	}
-	if (g->lookahead == L_CCLASS) {
-		atom = newnode(g, P_CCLASS);
-		atom->cc = g->yycc;
-		next(g);
-		return atom;
-	}
-	if (g->lookahead == L_NCCLASS) {
-		atom = newnode(g, P_NCCLASS);
-		atom->cc = g->yycc;
-		next(g);
-		return atom;
-	}
-	if (g->lookahead == L_REF) {
-		atom = newnode(g, P_REF);
-		if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
-			die(g, "invalid back-reference");
-		atom->n = g->yychar;
-		atom->x = g->sub[g->yychar];
-		next(g);
-		return atom;
-	}
-	if (accept(g, '.'))
-		return newnode(g, P_ANY);
-	if (accept(g, '(')) {
-		atom = newnode(g, P_PAR);
-		if (g->nsub == MAXSUB)
-			die(g, "too many captures");
-		atom->n = g->nsub++;
-		atom->x = parsealt(g);
-		g->sub[atom->n] = atom;
-		if (!accept(g, ')'))
-			die(g, "unmatched '('");
-		return atom;
-	}
-	if (accept(g, L_NC)) {
-		atom = parsealt(g);
-		if (!accept(g, ')'))
-			die(g, "unmatched '('");
-		return atom;
-	}
-	if (accept(g, L_PLA)) {
-		atom = newnode(g, P_PLA);
-		atom->x = parsealt(g);
-		if (!accept(g, ')'))
-			die(g, "unmatched '('");
-		return atom;
-	}
-	if (accept(g, L_NLA)) {
-		atom = newnode(g, P_NLA);
-		atom->x = parsealt(g);
-		if (!accept(g, ')'))
-			die(g, "unmatched '('");
-		return atom;
-	}
-	die(g, "syntax error");
-	return NULL;
-}
-
-static Renode *parserep(struct cstate *g)
-{
-	Renode *atom;
-
-	if (accept(g, '^')) return newnode(g, P_BOL);
-	if (accept(g, '$')) return newnode(g, P_EOL);
-	if (accept(g, L_WORD)) return newnode(g, P_WORD);
-	if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
-
-	atom = parseatom(g);
-	if (g->lookahead == L_COUNT) {
-		int min = g->yymin, max = g->yymax;
-		next(g);
-		if (max < min)
-			die(g, "invalid quantifier");
-		return newrep(g, atom, accept(g, '?'), min, max);
-	}
-	if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
-	if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
-	if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
-	return atom;
-}
-
-static Renode *parsecat(struct cstate *g)
-{
-	Renode *cat, *x;
-	if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
-		cat = parserep(g);
-		while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
-			x = cat;
-			cat = newnode(g, P_CAT);
-			cat->x = x;
-			cat->y = parserep(g);
-		}
-		return cat;
-	}
-	return NULL;
-}
-
-static Renode *parsealt(struct cstate *g)
-{
-	Renode *alt, *x;
-	alt = parsecat(g);
-	while (accept(g, '|')) {
-		x = alt;
-		alt = newnode(g, P_ALT);
-		alt->x = x;
-		alt->y = parsecat(g);
-	}
-	return alt;
-}
-
-/* Compile */
-
-enum {
-	I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
-	I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
-	I_BOL, I_EOL, I_WORD, I_NWORD,
-	I_LPAR, I_RPAR
-};
-
-struct Reinst {
-	unsigned char opcode;
-	unsigned char n;
-	Rune c;
-	Reclass *cc;
-	Reinst *x;
-	Reinst *y;
-};
-
-static unsigned int count(Renode *node)
-{
-	unsigned int min, max;
-	if (!node) return 0;
-	switch (node->type) {
-	default: return 1;
-	case P_CAT: return count(node->x) + count(node->y);
-	case P_ALT: return count(node->x) + count(node->y) + 2;
-	case P_REP:
-		min = node->m;
-		max = node->n;
-		if (min == max) return count(node->x) * min;
-		if (max < REPINF) return count(node->x) * max + (max - min);
-		return count(node->x) * (min + 1) + 2;
-	case P_PAR: return count(node->x) + 2;
-	case P_PLA: return count(node->x) + 2;
-	case P_NLA: return count(node->x) + 2;
-	}
-}
-
-static Reinst *emit(Reprog *prog, int opcode)
-{
-	Reinst *inst = prog->end++;
-	inst->opcode = opcode;
-	inst->n = 0;
-	inst->c = 0;
-	inst->cc = NULL;
-	inst->x = inst->y = NULL;
-	return inst;
-}
-
-static void compile(Reprog *prog, Renode *node)
-{
-	Reinst *inst, *split, *jump;
-	unsigned int i;
-
-	if (!node)
-		return;
-
-	switch (node->type) {
-	case P_CAT:
-		compile(prog, node->x);
-		compile(prog, node->y);
-		break;
-
-	case P_ALT:
-		split = emit(prog, I_SPLIT);
-		compile(prog, node->x);
-		jump = emit(prog, I_JUMP);
-		compile(prog, node->y);
-		split->x = split + 1;
-		split->y = jump + 1;
-		jump->x = prog->end;
-		break;
-
-	case P_REP:
-		for (i = 0; i < node->m; ++i) {
-			inst = prog->end;
-			compile(prog, node->x);
-		}
-		if (node->m == node->n)
-			break;
-		if (node->n < REPINF) {
-			for (i = node->m; i < node->n; ++i) {
-				split = emit(prog, I_SPLIT);
-				compile(prog, node->x);
-				if (node->ng) {
-					split->y = split + 1;
-					split->x = prog->end;
-				} else {
-					split->x = split + 1;
-					split->y = prog->end;
-				}
-			}
-		} else if (node->m == 0) {
-			split = emit(prog, I_SPLIT);
-			compile(prog, node->x);
-			jump = emit(prog, I_JUMP);
-			if (node->ng) {
-				split->y = split + 1;
-				split->x = prog->end;
-			} else {
-				split->x = split + 1;
-				split->y = prog->end;
-			}
-			jump->x = split;
-		} else {
-			split = emit(prog, I_SPLIT);
-			if (node->ng) {
-				split->y = inst;
-				split->x = prog->end;
-			} else {
-				split->x = inst;
-				split->y = prog->end;
-			}
-		}
-		break;
-
-	case P_BOL: emit(prog, I_BOL); break;
-	case P_EOL: emit(prog, I_EOL); break;
-	case P_WORD: emit(prog, I_WORD); break;
-	case P_NWORD: emit(prog, I_NWORD); break;
-
-	case P_PAR:
-		inst = emit(prog, I_LPAR);
-		inst->n = node->n;
-		compile(prog, node->x);
-		inst = emit(prog, I_RPAR);
-		inst->n = node->n;
-		break;
-	case P_PLA:
-		split = emit(prog, I_PLA);
-		compile(prog, node->x);
-		emit(prog, I_END);
-		split->x = split + 1;
-		split->y = prog->end;
-		break;
-	case P_NLA:
-		split = emit(prog, I_NLA);
-		compile(prog, node->x);
-		emit(prog, I_END);
-		split->x = split + 1;
-		split->y = prog->end;
-		break;
-
-	case P_ANY:
-		emit(prog, I_ANY);
-		break;
-	case P_CHAR:
-		inst = emit(prog, I_CHAR);
-		inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
-		break;
-	case P_CCLASS:
-		inst = emit(prog, I_CCLASS);
-		inst->cc = node->cc;
-		break;
-	case P_NCCLASS:
-		inst = emit(prog, I_NCCLASS);
-		inst->cc = node->cc;
-		break;
-	case P_REF:
-		inst = emit(prog, I_REF);
-		inst->n = node->n;
-		break;
-	}
-}
-
-#ifdef TEST
-static void dumpnode(Renode *node)
-{
-	Rune *p;
-	if (!node) { printf("Empty"); return; }
-	switch (node->type) {
-	case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
-	case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
-	case P_REP:
-		printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
-		dumpnode(node->x);
-		printf(")");
-		break;
-	case P_BOL: printf("Bol"); break;
-	case P_EOL: printf("Eol"); break;
-	case P_WORD: printf("Word"); break;
-	case P_NWORD: printf("NotWord"); break;
-	case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
-	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
-	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
-	case P_ANY: printf("Any"); break;
-	case P_CHAR: printf("Char(%c)", node->c); break;
-	case P_CCLASS:
-		printf("Class(");
-		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
-		printf(")");
-		break;
-	case P_NCCLASS:
-		printf("NotClass(");
-		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
-		printf(")");
-		break;
-	case P_REF: printf("Ref(%d)", node->n); break;
-	}
-}
-
-static void dumpprog(Reprog *prog)
-{
-	Reinst *inst;
-	int i;
-	for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
-		printf("% 5d: ", i);
-		switch (inst->opcode) {
-		case I_END: puts("end"); break;
-		case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
-		case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
-		case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
-		case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
-		case I_ANY: puts("any"); break;
-		case I_ANYNL: puts("anynl"); break;
-		case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
-		case I_CCLASS: puts("cclass"); break;
-		case I_NCCLASS: puts("ncclass"); break;
-		case I_REF: printf("ref %d\n", inst->n); break;
-		case I_BOL: puts("bol"); break;
-		case I_EOL: puts("eol"); break;
-		case I_WORD: puts("word"); break;
-		case I_NWORD: puts("nword"); break;
-		case I_LPAR: printf("lpar %d\n", inst->n); break;
-		case I_RPAR: printf("rpar %d\n", inst->n); break;
-		}
-	}
-}
-#endif
-
-Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
-{
-	struct cstate g;
-	Renode *node;
-	Reinst *split, *jump;
-	int i;
-
-	g.prog = malloc(sizeof (Reprog));
-	g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
-
-	if (setjmp(g.kaboom)) {
-		if (errorp) *errorp = g.error;
-		free(g.pstart);
-		free(g.prog);
-		return NULL;
-	}
-
-	g.source = pattern;
-	g.ncclass = 0;
-	g.nsub = 1;
-	for (i = 0; i < MAXSUB; ++i)
-		g.sub[i] = 0;
-
-	g.prog->flags = cflags;
-
-	next(&g);
-	node = parsealt(&g);
-	if (g.lookahead == ')')
-		die(&g, "unmatched ')'");
-	if (g.lookahead != 0)
-		die(&g, "syntax error");
-
-	g.prog->nsub = g.nsub;
-	g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
-
-	split = emit(g.prog, I_SPLIT);
-	split->x = split + 3;
-	split->y = split + 1;
-	emit(g.prog, I_ANYNL);
-	jump = emit(g.prog, I_JUMP);
-	jump->x = split;
-	emit(g.prog, I_LPAR);
-	compile(g.prog, node);
-	emit(g.prog, I_RPAR);
-	emit(g.prog, I_END);
-
-#ifdef TEST
-	dumpnode(node);
-	putchar('\n');
-	dumpprog(g.prog);
-#endif
-
-	free(g.pstart);
-
-	if (errorp) *errorp = NULL;
-	return g.prog;
-}
-
-void regfree(Reprog *prog)
-{
-	if (prog) {
-		free(prog->start);
-		free(prog);
-	}
-}
-
-/* Match */
-
-static int isnewline(int c)
-{
-	return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
-}
-
-static int iswordchar(int c)
-{
-	return c == '_' ||
-		(c >= 'a' && c <= 'z') ||
-		(c >= 'A' && c <= 'Z') ||
-		(c >= '0' && c <= '9');
-}
-
-static int incclass(Reclass *cc, Rune c)
-{
-	Rune *p;
-	for (p = cc->spans; p < cc->end; p += 2)
-		if (p[0] <= c && c <= p[1])
-			return 1;
-	return 0;
-}
-
-static int incclasscanon(Reclass *cc, Rune c)
-{
-	Rune *p, r;
-	for (p = cc->spans; p < cc->end; p += 2)
-		for (r = p[0]; r <= p[1]; ++r)
-			if (c == canon(r))
-				return 1;
-	return 0;
-}
-
-static int strncmpcanon(const char *a, const char *b, unsigned int n)
-{
-	Rune ra, rb;
-	int c;
-	while (n--) {
-		if (!*a) return -1;
-		if (!*b) return 1;
-		a += chartorune(&ra, a);
-		b += chartorune(&rb, b);
-		c = canon(ra) - canon(rb);
-		if (c)
-			return c;
-	}
-	return 0;
-}
-
-struct Rethread {
-	Reinst *pc;
-	const char *sp;
-	Resub sub;
-};
-
-static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
-{
-	t->pc = pc;
-	t->sp = sp;
-	memcpy(&t->sub, sub, sizeof t->sub);
-}
-
-static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
-{
-	Rethread ready[MAXTHREAD];
-	Resub scratch;
-	Resub sub;
-	Rune c;
-	unsigned int nready;
-	int i;
-
-	/* queue initial thread */
-	spawn(ready + 0, pc, sp, out);
-	nready = 1;
-
-	/* run threads in stack order */
-	while (nready > 0) {
-		--nready;
-		pc = ready[nready].pc;
-		sp = ready[nready].sp;
-		memcpy(&sub, &ready[nready].sub, sizeof sub);
-		for (;;) {
-			switch (pc->opcode) {
-			case I_END:
-				for (i = 0; i < MAXSUB; ++i) {
-					out->sub[i].sp = sub.sub[i].sp;
-					out->sub[i].ep = sub.sub[i].ep;
-				}
-				return 1;
-			case I_JUMP:
-				pc = pc->x;
-				continue;
-			case I_SPLIT:
-				if (nready >= MAXTHREAD) {
-					fprintf(stderr, "regexec: backtrack overflow!\n");
-					return 0;
-				}
-				spawn(&ready[nready++], pc->y, sp, &sub);
-				pc = pc->x;
-				continue;
-
-			case I_PLA:
-				if (!match(pc->x, sp, bol, flags, &sub))
-					goto dead;
-				pc = pc->y;
-				continue;
-			case I_NLA:
-				memcpy(&scratch, &sub, sizeof scratch);
-				if (match(pc->x, sp, bol, flags, &scratch))
-					goto dead;
-				pc = pc->y;
-				continue;
-
-			case I_ANYNL:
-				sp += chartorune(&c, sp);
-				if (c == 0)
-					goto dead;
-				break;
-			case I_ANY:
-				sp += chartorune(&c, sp);
-				if (c == 0)
-					goto dead;
-				if (isnewline(c))
-					goto dead;
-				break;
-			case I_CHAR:
-				sp += chartorune(&c, sp);
-				if (c == 0)
-					goto dead;
-				if (flags & REG_ICASE)
-					c = canon(c);
-				if (c != pc->c)
-					goto dead;
-				break;
-			case I_CCLASS:
-				sp += chartorune(&c, sp);
-				if (c == 0)
-					goto dead;
-				if (flags & REG_ICASE) {
-					if (!incclasscanon(pc->cc, canon(c)))
-						goto dead;
-				} else {
-					if (!incclass(pc->cc, c))
-						goto dead;
-				}
-				break;
-			case I_NCCLASS:
-				sp += chartorune(&c, sp);
-				if (c == 0)
-					goto dead;
-				if (flags & REG_ICASE) {
-					if (incclasscanon(pc->cc, canon(c)))
-						goto dead;
-				} else {
-					if (incclass(pc->cc, c))
-						goto dead;
-				}
-				break;
-			case I_REF:
-				i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
-				if (flags & REG_ICASE) {
-					if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
-						goto dead;
-				} else {
-					if (strncmp(sp, sub.sub[pc->n].sp, i))
-						goto dead;
-				}
-				if (i > 0)
-					sp += i;
-				break;
-
-			case I_BOL:
-				if (sp == bol && !(flags & REG_NOTBOL))
-					break;
-				if (flags & REG_NEWLINE)
-					if (sp > bol && isnewline(sp[-1]))
-						break;
-				goto dead;
-			case I_EOL:
-				if (*sp == 0)
-					break;
-				if (flags & REG_NEWLINE)
-					if (isnewline(*sp))
-						break;
-				goto dead;
-			case I_WORD:
-				i = sp > bol && iswordchar(sp[-1]);
-				i ^= iswordchar(sp[0]);
-				if (i)
-					break;
-				goto dead;
-			case I_NWORD:
-				i = sp > bol && iswordchar(sp[-1]);
-				i ^= iswordchar(sp[0]);
-				if (!i)
-					break;
-				goto dead;
-
-			case I_LPAR:
-				sub.sub[pc->n].sp = sp;
-				break;
-			case I_RPAR:
-				sub.sub[pc->n].ep = sp;
-				break;
-			default:
-				goto dead;
-			}
-			pc = pc + 1;
-		}
-dead: ;
-	}
-	return 0;
-}
-
-int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
-{
-	Resub scratch;
-	int i;
-
-	if (!sub)
-		sub = &scratch;
-
-	sub->nsub = prog->nsub;
-	for (i = 0; i < MAXSUB; ++i)
-		sub->sub[i].sp = sub->sub[i].ep = NULL;
-
-	return !match(prog->start, sp, sp, prog->flags | eflags, sub);
-}
-
-#ifdef TEST
-int main(int argc, char **argv)
-{
-	const char *error;
-	const char *s;
-	Reprog *p;
-	Resub m;
-	unsigned int i;
-
-	if (argc > 1) {
-		p = regcomp(argv[1], 0, &error);
-		if (!p) {
-			fprintf(stderr, "regcomp: %s\n", error);
-			return 1;
-		}
-
-		if (argc > 2) {
-			s = argv[2];
-			printf("nsub = %d\n", p->nsub);
-			if (!regexec(p, s, &m, 0)) {
-				for (i = 0; i < m.nsub; ++i) {
-					int n = m.sub[i].ep - m.sub[i].sp;
-					if (n > 0)
-						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
-					else
-						printf("match %d: n=0 ''\n", i);
-				}
-			} else {
-				printf("no match\n");
-			}
-		}
-	}
-
-	return 0;
-}
-#endif
--- a/regex.h
+++ /dev/null
@@ -1,35 +1,0 @@
-#ifndef regex_h
-#define regex_h
-
-#define regcomp js_regcomp
-#define regexec js_regexec
-#define regfree js_regfree
-
-typedef struct Reprog Reprog;
-typedef struct Resub Resub;
-
-Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
-int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
-void regfree(Reprog *prog);
-
-enum {
-	/* regcomp flags */
-	REG_ICASE = 1,
-	REG_NEWLINE = 2,
-
-	/* regexec flags */
-	REG_NOTBOL = 4,
-
-	/* limits */
-	REG_MAXSUB = 16
-};
-
-struct Resub {
-	unsigned int nsub;
-	struct {
-		const char *sp;
-		const char *ep;
-	} sub[REG_MAXSUB];
-};
-
-#endif
--- /dev/null
+++ b/regexp.c
@@ -1,0 +1,1143 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <setjmp.h>
+#include <limits.h>
+
+#include "regexp.h"
+#include "utf.h"
+
+#define emit regemit
+#define next regnext
+#define accept regaccept
+
+#define nelem(a) (sizeof (a) / sizeof (a)[0])
+
+#define REPINF 255
+#define MAXTHREAD 1000
+#define MAXSUB REG_MAXSUB
+
+typedef struct Reclass Reclass;
+typedef struct Renode Renode;
+typedef struct Reinst Reinst;
+typedef struct Rethread Rethread;
+
+struct Reclass {
+	Rune *end;
+	Rune spans[64];
+};
+
+struct Reprog {
+	Reinst *start, *end;
+	int flags;
+	unsigned int nsub;
+	Reclass cclass[16];
+};
+
+struct cstate {
+	Reprog *prog;
+	Renode *pstart, *pend;
+
+	const char *source;
+	unsigned int ncclass;
+	unsigned int nsub;
+	Renode *sub[MAXSUB];
+
+	int lookahead;
+	Rune yychar;
+	Reclass *yycc;
+	int yymin, yymax;
+
+	const char *error;
+	jmp_buf kaboom;
+};
+
+static void die(struct cstate *g, const char *message)
+{
+	g->error = message;
+	longjmp(g->kaboom, 1);
+}
+
+static int canon(Rune c)
+{
+	Rune u = toupperrune(c);
+	if (c >= 128 && u < 128)
+		return c;
+	return u;
+}
+
+/* Scan */
+
+enum {
+	L_CHAR = 256,
+	L_CCLASS,	/* character class */
+	L_NCCLASS,	/* negative character class */
+	L_NC,		/* "(?:" no capture */
+	L_PLA,		/* "(?=" positive lookahead */
+	L_NLA,		/* "(?!" negative lookahead */
+	L_WORD,		/* "\b" word boundary */
+	L_NWORD,	/* "\B" non-word boundary */
+	L_REF,		/* "\1" back-reference */
+	L_COUNT,	/* {M,N} */
+};
+
+static int hex(struct cstate *g, int c)
+{
+	if (c >= '0' && c <= '9') return c - '0';
+	if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
+	if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
+	die(g, "invalid escape sequence");
+	return 0;
+}
+
+static int dec(struct cstate *g, int c)
+{
+	if (c >= '0' && c <= '9') return c - '0';
+	die(g, "invalid quantifier");
+	return 0;
+}
+
+#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
+
+static int isunicodeletter(int c)
+{
+	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
+}
+
+static int nextrune(struct cstate *g)
+{
+	g->source += chartorune(&g->yychar, g->source);
+	if (g->yychar == '\\') {
+		g->source += chartorune(&g->yychar, g->source);
+		switch (g->yychar) {
+		case 0: die(g, "unterminated escape sequence");
+		case 'f': g->yychar = '\f'; return 0;
+		case 'n': g->yychar = '\n'; return 0;
+		case 'r': g->yychar = '\r'; return 0;
+		case 't': g->yychar = '\t'; return 0;
+		case 'v': g->yychar = '\v'; return 0;
+		case 'c':
+			g->yychar = (*g->source++) & 31;
+			return 0;
+		case 'x':
+			g->yychar = hex(g, *g->source++) << 4;
+			g->yychar += hex(g, *g->source++);
+			if (g->yychar == 0) {
+				g->yychar = '0';
+				return 1;
+			}
+			return 0;
+		case 'u':
+			g->yychar = hex(g, *g->source++) << 12;
+			g->yychar += hex(g, *g->source++) << 8;
+			g->yychar += hex(g, *g->source++) << 4;
+			g->yychar += hex(g, *g->source++);
+			if (g->yychar == 0) {
+				g->yychar = '0';
+				return 1;
+			}
+			return 0;
+		}
+		if (strchr(ESCAPES, g->yychar))
+			return 1;
+		if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
+			die(g, "invalid escape character");
+		return 0;
+	}
+	return 0;
+}
+
+static int lexcount(struct cstate *g)
+{
+	g->yychar = *g->source++;
+
+	g->yymin = dec(g, g->yychar);
+	g->yychar = *g->source++;
+	while (g->yychar != ',' && g->yychar != '}') {
+		g->yymin = g->yymin * 10 + dec(g, g->yychar);
+		g->yychar = *g->source++;
+	}
+	if (g->yymin >= REPINF)
+		die(g, "numeric overflow");
+
+	if (g->yychar == ',') {
+		g->yychar = *g->source++;
+		if (g->yychar == '}') {
+			g->yymax = REPINF;
+		} else {
+			g->yymax = dec(g, g->yychar);
+			g->yychar = *g->source++;
+			while (g->yychar != '}') {
+				g->yymax = g->yymax * 10 + dec(g, g->yychar);
+				g->yychar = *g->source++;
+			}
+			if (g->yymax >= REPINF)
+				die(g, "numeric overflow");
+		}
+	} else {
+		g->yymax = g->yymin;
+	}
+
+	return L_COUNT;
+}
+
+static void newcclass(struct cstate *g)
+{
+	if (g->ncclass >= nelem(g->prog->cclass))
+		die(g, "too many character classes");
+	g->yycc = g->prog->cclass + g->ncclass++;
+	g->yycc->end = g->yycc->spans;
+}
+
+static void addrange(struct cstate *g, Rune a, Rune b)
+{
+	if (a > b)
+		die(g, "invalid character class range");
+	if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
+		die(g, "too many character class ranges");
+	*g->yycc->end++ = a;
+	*g->yycc->end++ = b;
+}
+
+static void addranges_d(struct cstate *g)
+{
+	addrange(g, '0', '9');
+}
+
+static void addranges_D(struct cstate *g)
+{
+	addrange(g, 0, '0'-1);
+	addrange(g, '9'+1, 0xFFFF);
+}
+
+static void addranges_s(struct cstate *g)
+{
+	addrange(g, 0x9, 0x9);
+	addrange(g, 0xA, 0xD);
+	addrange(g, 0x20, 0x20);
+	addrange(g, 0xA0, 0xA0);
+	addrange(g, 0x2028, 0x2029);
+	addrange(g, 0xFEFF, 0xFEFF);
+}
+
+static void addranges_S(struct cstate *g)
+{
+	addrange(g, 0, 0x9-1);
+	addrange(g, 0x9+1, 0xA-1);
+	addrange(g, 0xD+1, 0x20-1);
+	addrange(g, 0x20+1, 0xA0-1);
+	addrange(g, 0xA0+1, 0x2028-1);
+	addrange(g, 0x2029+1, 0xFEFF-1);
+	addrange(g, 0xFEFF+1, 0xFFFF);
+}
+
+static void addranges_w(struct cstate *g)
+{
+	addrange(g, '0', '9');
+	addrange(g, 'A', 'Z');
+	addrange(g, '_', '_');
+	addrange(g, 'a', 'z');
+}
+
+static void addranges_W(struct cstate *g)
+{
+	addrange(g, 0, '0'-1);
+	addrange(g, '9'+1, 'A'-1);
+	addrange(g, 'Z'+1, '_'-1);
+	addrange(g, '_'+1, 'a'-1);
+	addrange(g, 'z'+1, 0xFFFF);
+}
+
+static int lexclass(struct cstate *g)
+{
+	int type = L_CCLASS;
+	int quoted, havesave, havedash;
+	Rune save;
+
+	newcclass(g);
+
+	quoted = nextrune(g);
+	if (!quoted && g->yychar == '^') {
+		type = L_NCCLASS;
+		quoted = nextrune(g);
+	}
+
+	havesave = havedash = 0;
+	for (;;) {
+		if (g->yychar == 0)
+			die(g, "unterminated character class");
+		if (!quoted && g->yychar == ']')
+			break;
+
+		if (!quoted && g->yychar == '-') {
+			if (havesave) {
+				if (havedash) {
+					addrange(g, save, '-');
+					havesave = havedash = 0;
+				} else {
+					havedash = 1;
+				}
+			} else {
+				save = '-';
+				havesave = 1;
+			}
+		} else if (quoted && strchr("DSWdsw", g->yychar)) {
+			if (havesave) {
+				addrange(g, save, save);
+				if (havedash)
+					addrange(g, '-', '-');
+			}
+			switch (g->yychar) {
+			case 'd': addranges_d(g); break;
+			case 's': addranges_s(g); break;
+			case 'w': addranges_w(g); break;
+			case 'D': addranges_D(g); break;
+			case 'S': addranges_S(g); break;
+			case 'W': addranges_W(g); break;
+			}
+			havesave = havedash = 0;
+		} else {
+			if (quoted) {
+				if (g->yychar == 'b')
+					g->yychar = '\b';
+				else if (g->yychar == '0')
+					g->yychar = 0;
+				/* else identity escape */
+			}
+			if (havesave) {
+				if (havedash) {
+					addrange(g, save, g->yychar);
+					havesave = havedash = 0;
+				} else {
+					addrange(g, save, save);
+					save = g->yychar;
+				}
+			} else {
+				save = g->yychar;
+				havesave = 1;
+			}
+		}
+
+		quoted = nextrune(g);
+	}
+
+	if (havesave) {
+		addrange(g, save, save);
+		if (havedash)
+			addrange(g, '-', '-');
+	}
+
+	return type;
+}
+
+static int lex(struct cstate *g)
+{
+	int quoted = nextrune(g);
+	if (quoted) {
+		switch (g->yychar) {
+		case 'b': return L_WORD;
+		case 'B': return L_NWORD;
+		case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
+		case 's': newcclass(g); addranges_s(g); return L_CCLASS;
+		case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
+		case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
+		case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
+		case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
+		case '0': g->yychar = 0; return L_CHAR;
+		}
+		if (g->yychar >= '0' && g->yychar <= '9') {
+			g->yychar -= '0';
+			if (*g->source >= '0' && *g->source <= '9')
+				g->yychar = g->yychar * 10 + *g->source++ - '0';
+			return L_REF;
+		}
+		return L_CHAR;
+	}
+
+	switch (g->yychar) {
+	case 0:
+	case '$': case ')': case '*': case '+':
+	case '.': case '?': case '^': case '|':
+		return g->yychar;
+	}
+
+	if (g->yychar == '{')
+		return lexcount(g);
+	if (g->yychar == '[')
+		return lexclass(g);
+	if (g->yychar == '(') {
+		if (g->source[0] == '?') {
+			if (g->source[1] == ':') {
+				g->source += 2;
+				return L_NC;
+			}
+			if (g->source[1] == '=') {
+				g->source += 2;
+				return L_PLA;
+			}
+			if (g->source[1] == '!') {
+				g->source += 2;
+				return L_NLA;
+			}
+		}
+		return '(';
+	}
+
+	return L_CHAR;
+}
+
+/* Parse */
+
+enum {
+	P_CAT, P_ALT, P_REP,
+	P_BOL, P_EOL, P_WORD, P_NWORD,
+	P_PAR, P_PLA, P_NLA,
+	P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
+	P_REF,
+};
+
+struct Renode {
+	unsigned char type;
+	unsigned char ng, m, n;
+	Rune c;
+	Reclass *cc;
+	Renode *x;
+	Renode *y;
+};
+
+static Renode *newnode(struct cstate *g, int type)
+{
+	Renode *node = g->pend++;
+	node->type = type;
+	node->cc = NULL;
+	node->c = 0;
+	node->ng = 0;
+	node->m = 0;
+	node->n = 0;
+	node->x = node->y = NULL;
+	return node;
+}
+
+static int empty(Renode *node)
+{
+	if (!node) return 1;
+	switch (node->type) {
+	default: return 1;
+	case P_CAT: return empty(node->x) && empty(node->y);
+	case P_ALT: return empty(node->x) || empty(node->y);
+	case P_REP: return empty(node->x) || node->m == 0;
+	case P_PAR: return empty(node->x);
+	case P_REF: return empty(node->x);
+	case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
+	}
+}
+
+static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
+{
+	Renode *rep = newnode(g, P_REP);
+	if (max == REPINF && empty(atom))
+		die(g, "infinite loop matching the empty string");
+	rep->ng = ng;
+	rep->m = min;
+	rep->n = max;
+	rep->x = atom;
+	return rep;
+}
+
+static void next(struct cstate *g)
+{
+	g->lookahead = lex(g);
+}
+
+static int accept(struct cstate *g, int t)
+{
+	if (g->lookahead == t) {
+		next(g);
+		return 1;
+	}
+	return 0;
+}
+
+static Renode *parsealt(struct cstate *g);
+
+static Renode *parseatom(struct cstate *g)
+{
+	Renode *atom;
+	if (g->lookahead == L_CHAR) {
+		atom = newnode(g, P_CHAR);
+		atom->c = g->yychar;
+		next(g);
+		return atom;
+	}
+	if (g->lookahead == L_CCLASS) {
+		atom = newnode(g, P_CCLASS);
+		atom->cc = g->yycc;
+		next(g);
+		return atom;
+	}
+	if (g->lookahead == L_NCCLASS) {
+		atom = newnode(g, P_NCCLASS);
+		atom->cc = g->yycc;
+		next(g);
+		return atom;
+	}
+	if (g->lookahead == L_REF) {
+		atom = newnode(g, P_REF);
+		if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
+			die(g, "invalid back-reference");
+		atom->n = g->yychar;
+		atom->x = g->sub[g->yychar];
+		next(g);
+		return atom;
+	}
+	if (accept(g, '.'))
+		return newnode(g, P_ANY);
+	if (accept(g, '(')) {
+		atom = newnode(g, P_PAR);
+		if (g->nsub == MAXSUB)
+			die(g, "too many captures");
+		atom->n = g->nsub++;
+		atom->x = parsealt(g);
+		g->sub[atom->n] = atom;
+		if (!accept(g, ')'))
+			die(g, "unmatched '('");
+		return atom;
+	}
+	if (accept(g, L_NC)) {
+		atom = parsealt(g);
+		if (!accept(g, ')'))
+			die(g, "unmatched '('");
+		return atom;
+	}
+	if (accept(g, L_PLA)) {
+		atom = newnode(g, P_PLA);
+		atom->x = parsealt(g);
+		if (!accept(g, ')'))
+			die(g, "unmatched '('");
+		return atom;
+	}
+	if (accept(g, L_NLA)) {
+		atom = newnode(g, P_NLA);
+		atom->x = parsealt(g);
+		if (!accept(g, ')'))
+			die(g, "unmatched '('");
+		return atom;
+	}
+	die(g, "syntax error");
+	return NULL;
+}
+
+static Renode *parserep(struct cstate *g)
+{
+	Renode *atom;
+
+	if (accept(g, '^')) return newnode(g, P_BOL);
+	if (accept(g, '$')) return newnode(g, P_EOL);
+	if (accept(g, L_WORD)) return newnode(g, P_WORD);
+	if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
+
+	atom = parseatom(g);
+	if (g->lookahead == L_COUNT) {
+		int min = g->yymin, max = g->yymax;
+		next(g);
+		if (max < min)
+			die(g, "invalid quantifier");
+		return newrep(g, atom, accept(g, '?'), min, max);
+	}
+	if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
+	if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
+	if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
+	return atom;
+}
+
+static Renode *parsecat(struct cstate *g)
+{
+	Renode *cat, *x;
+	if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+		cat = parserep(g);
+		while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+			x = cat;
+			cat = newnode(g, P_CAT);
+			cat->x = x;
+			cat->y = parserep(g);
+		}
+		return cat;
+	}
+	return NULL;
+}
+
+static Renode *parsealt(struct cstate *g)
+{
+	Renode *alt, *x;
+	alt = parsecat(g);
+	while (accept(g, '|')) {
+		x = alt;
+		alt = newnode(g, P_ALT);
+		alt->x = x;
+		alt->y = parsecat(g);
+	}
+	return alt;
+}
+
+/* Compile */
+
+enum {
+	I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
+	I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
+	I_BOL, I_EOL, I_WORD, I_NWORD,
+	I_LPAR, I_RPAR
+};
+
+struct Reinst {
+	unsigned char opcode;
+	unsigned char n;
+	Rune c;
+	Reclass *cc;
+	Reinst *x;
+	Reinst *y;
+};
+
+static unsigned int count(Renode *node)
+{
+	unsigned int min, max;
+	if (!node) return 0;
+	switch (node->type) {
+	default: return 1;
+	case P_CAT: return count(node->x) + count(node->y);
+	case P_ALT: return count(node->x) + count(node->y) + 2;
+	case P_REP:
+		min = node->m;
+		max = node->n;
+		if (min == max) return count(node->x) * min;
+		if (max < REPINF) return count(node->x) * max + (max - min);
+		return count(node->x) * (min + 1) + 2;
+	case P_PAR: return count(node->x) + 2;
+	case P_PLA: return count(node->x) + 2;
+	case P_NLA: return count(node->x) + 2;
+	}
+}
+
+static Reinst *emit(Reprog *prog, int opcode)
+{
+	Reinst *inst = prog->end++;
+	inst->opcode = opcode;
+	inst->n = 0;
+	inst->c = 0;
+	inst->cc = NULL;
+	inst->x = inst->y = NULL;
+	return inst;
+}
+
+static void compile(Reprog *prog, Renode *node)
+{
+	Reinst *inst, *split, *jump;
+	unsigned int i;
+
+	if (!node)
+		return;
+
+	switch (node->type) {
+	case P_CAT:
+		compile(prog, node->x);
+		compile(prog, node->y);
+		break;
+
+	case P_ALT:
+		split = emit(prog, I_SPLIT);
+		compile(prog, node->x);
+		jump = emit(prog, I_JUMP);
+		compile(prog, node->y);
+		split->x = split + 1;
+		split->y = jump + 1;
+		jump->x = prog->end;
+		break;
+
+	case P_REP:
+		for (i = 0; i < node->m; ++i) {
+			inst = prog->end;
+			compile(prog, node->x);
+		}
+		if (node->m == node->n)
+			break;
+		if (node->n < REPINF) {
+			for (i = node->m; i < node->n; ++i) {
+				split = emit(prog, I_SPLIT);
+				compile(prog, node->x);
+				if (node->ng) {
+					split->y = split + 1;
+					split->x = prog->end;
+				} else {
+					split->x = split + 1;
+					split->y = prog->end;
+				}
+			}
+		} else if (node->m == 0) {
+			split = emit(prog, I_SPLIT);
+			compile(prog, node->x);
+			jump = emit(prog, I_JUMP);
+			if (node->ng) {
+				split->y = split + 1;
+				split->x = prog->end;
+			} else {
+				split->x = split + 1;
+				split->y = prog->end;
+			}
+			jump->x = split;
+		} else {
+			split = emit(prog, I_SPLIT);
+			if (node->ng) {
+				split->y = inst;
+				split->x = prog->end;
+			} else {
+				split->x = inst;
+				split->y = prog->end;
+			}
+		}
+		break;
+
+	case P_BOL: emit(prog, I_BOL); break;
+	case P_EOL: emit(prog, I_EOL); break;
+	case P_WORD: emit(prog, I_WORD); break;
+	case P_NWORD: emit(prog, I_NWORD); break;
+
+	case P_PAR:
+		inst = emit(prog, I_LPAR);
+		inst->n = node->n;
+		compile(prog, node->x);
+		inst = emit(prog, I_RPAR);
+		inst->n = node->n;
+		break;
+	case P_PLA:
+		split = emit(prog, I_PLA);
+		compile(prog, node->x);
+		emit(prog, I_END);
+		split->x = split + 1;
+		split->y = prog->end;
+		break;
+	case P_NLA:
+		split = emit(prog, I_NLA);
+		compile(prog, node->x);
+		emit(prog, I_END);
+		split->x = split + 1;
+		split->y = prog->end;
+		break;
+
+	case P_ANY:
+		emit(prog, I_ANY);
+		break;
+	case P_CHAR:
+		inst = emit(prog, I_CHAR);
+		inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
+		break;
+	case P_CCLASS:
+		inst = emit(prog, I_CCLASS);
+		inst->cc = node->cc;
+		break;
+	case P_NCCLASS:
+		inst = emit(prog, I_NCCLASS);
+		inst->cc = node->cc;
+		break;
+	case P_REF:
+		inst = emit(prog, I_REF);
+		inst->n = node->n;
+		break;
+	}
+}
+
+#ifdef TEST
+static void dumpnode(Renode *node)
+{
+	Rune *p;
+	if (!node) { printf("Empty"); return; }
+	switch (node->type) {
+	case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
+	case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
+	case P_REP:
+		printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
+		dumpnode(node->x);
+		printf(")");
+		break;
+	case P_BOL: printf("Bol"); break;
+	case P_EOL: printf("Eol"); break;
+	case P_WORD: printf("Word"); break;
+	case P_NWORD: printf("NotWord"); break;
+	case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
+	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
+	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
+	case P_ANY: printf("Any"); break;
+	case P_CHAR: printf("Char(%c)", node->c); break;
+	case P_CCLASS:
+		printf("Class(");
+		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+		printf(")");
+		break;
+	case P_NCCLASS:
+		printf("NotClass(");
+		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+		printf(")");
+		break;
+	case P_REF: printf("Ref(%d)", node->n); break;
+	}
+}
+
+static void dumpprog(Reprog *prog)
+{
+	Reinst *inst;
+	int i;
+	for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
+		printf("% 5d: ", i);
+		switch (inst->opcode) {
+		case I_END: puts("end"); break;
+		case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
+		case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+		case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+		case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+		case I_ANY: puts("any"); break;
+		case I_ANYNL: puts("anynl"); break;
+		case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
+		case I_CCLASS: puts("cclass"); break;
+		case I_NCCLASS: puts("ncclass"); break;
+		case I_REF: printf("ref %d\n", inst->n); break;
+		case I_BOL: puts("bol"); break;
+		case I_EOL: puts("eol"); break;
+		case I_WORD: puts("word"); break;
+		case I_NWORD: puts("nword"); break;
+		case I_LPAR: printf("lpar %d\n", inst->n); break;
+		case I_RPAR: printf("rpar %d\n", inst->n); break;
+		}
+	}
+}
+#endif
+
+Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
+{
+	struct cstate g;
+	Renode *node;
+	Reinst *split, *jump;
+	int i;
+
+	g.prog = malloc(sizeof (Reprog));
+	g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
+
+	if (setjmp(g.kaboom)) {
+		if (errorp) *errorp = g.error;
+		free(g.pstart);
+		free(g.prog);
+		return NULL;
+	}
+
+	g.source = pattern;
+	g.ncclass = 0;
+	g.nsub = 1;
+	for (i = 0; i < MAXSUB; ++i)
+		g.sub[i] = 0;
+
+	g.prog->flags = cflags;
+
+	next(&g);
+	node = parsealt(&g);
+	if (g.lookahead == ')')
+		die(&g, "unmatched ')'");
+	if (g.lookahead != 0)
+		die(&g, "syntax error");
+
+	g.prog->nsub = g.nsub;
+	g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
+
+	split = emit(g.prog, I_SPLIT);
+	split->x = split + 3;
+	split->y = split + 1;
+	emit(g.prog, I_ANYNL);
+	jump = emit(g.prog, I_JUMP);
+	jump->x = split;
+	emit(g.prog, I_LPAR);
+	compile(g.prog, node);
+	emit(g.prog, I_RPAR);
+	emit(g.prog, I_END);
+
+#ifdef TEST
+	dumpnode(node);
+	putchar('\n');
+	dumpprog(g.prog);
+#endif
+
+	free(g.pstart);
+
+	if (errorp) *errorp = NULL;
+	return g.prog;
+}
+
+void regfree(Reprog *prog)
+{
+	if (prog) {
+		free(prog->start);
+		free(prog);
+	}
+}
+
+/* Match */
+
+static int isnewline(int c)
+{
+	return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
+}
+
+static int iswordchar(int c)
+{
+	return c == '_' ||
+		(c >= 'a' && c <= 'z') ||
+		(c >= 'A' && c <= 'Z') ||
+		(c >= '0' && c <= '9');
+}
+
+static int incclass(Reclass *cc, Rune c)
+{
+	Rune *p;
+	for (p = cc->spans; p < cc->end; p += 2)
+		if (p[0] <= c && c <= p[1])
+			return 1;
+	return 0;
+}
+
+static int incclasscanon(Reclass *cc, Rune c)
+{
+	Rune *p, r;
+	for (p = cc->spans; p < cc->end; p += 2)
+		for (r = p[0]; r <= p[1]; ++r)
+			if (c == canon(r))
+				return 1;
+	return 0;
+}
+
+static int strncmpcanon(const char *a, const char *b, unsigned int n)
+{
+	Rune ra, rb;
+	int c;
+	while (n--) {
+		if (!*a) return -1;
+		if (!*b) return 1;
+		a += chartorune(&ra, a);
+		b += chartorune(&rb, b);
+		c = canon(ra) - canon(rb);
+		if (c)
+			return c;
+	}
+	return 0;
+}
+
+struct Rethread {
+	Reinst *pc;
+	const char *sp;
+	Resub sub;
+};
+
+static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
+{
+	t->pc = pc;
+	t->sp = sp;
+	memcpy(&t->sub, sub, sizeof t->sub);
+}
+
+static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
+{
+	Rethread ready[MAXTHREAD];
+	Resub scratch;
+	Resub sub;
+	Rune c;
+	unsigned int nready;
+	int i;
+
+	/* queue initial thread */
+	spawn(ready + 0, pc, sp, out);
+	nready = 1;
+
+	/* run threads in stack order */
+	while (nready > 0) {
+		--nready;
+		pc = ready[nready].pc;
+		sp = ready[nready].sp;
+		memcpy(&sub, &ready[nready].sub, sizeof sub);
+		for (;;) {
+			switch (pc->opcode) {
+			case I_END:
+				for (i = 0; i < MAXSUB; ++i) {
+					out->sub[i].sp = sub.sub[i].sp;
+					out->sub[i].ep = sub.sub[i].ep;
+				}
+				return 1;
+			case I_JUMP:
+				pc = pc->x;
+				continue;
+			case I_SPLIT:
+				if (nready >= MAXTHREAD) {
+					fprintf(stderr, "regexec: backtrack overflow!\n");
+					return 0;
+				}
+				spawn(&ready[nready++], pc->y, sp, &sub);
+				pc = pc->x;
+				continue;
+
+			case I_PLA:
+				if (!match(pc->x, sp, bol, flags, &sub))
+					goto dead;
+				pc = pc->y;
+				continue;
+			case I_NLA:
+				memcpy(&scratch, &sub, sizeof scratch);
+				if (match(pc->x, sp, bol, flags, &scratch))
+					goto dead;
+				pc = pc->y;
+				continue;
+
+			case I_ANYNL:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				break;
+			case I_ANY:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (isnewline(c))
+					goto dead;
+				break;
+			case I_CHAR:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (flags & REG_ICASE)
+					c = canon(c);
+				if (c != pc->c)
+					goto dead;
+				break;
+			case I_CCLASS:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (flags & REG_ICASE) {
+					if (!incclasscanon(pc->cc, canon(c)))
+						goto dead;
+				} else {
+					if (!incclass(pc->cc, c))
+						goto dead;
+				}
+				break;
+			case I_NCCLASS:
+				sp += chartorune(&c, sp);
+				if (c == 0)
+					goto dead;
+				if (flags & REG_ICASE) {
+					if (incclasscanon(pc->cc, canon(c)))
+						goto dead;
+				} else {
+					if (incclass(pc->cc, c))
+						goto dead;
+				}
+				break;
+			case I_REF:
+				i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
+				if (flags & REG_ICASE) {
+					if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
+						goto dead;
+				} else {
+					if (strncmp(sp, sub.sub[pc->n].sp, i))
+						goto dead;
+				}
+				if (i > 0)
+					sp += i;
+				break;
+
+			case I_BOL:
+				if (sp == bol && !(flags & REG_NOTBOL))
+					break;
+				if (flags & REG_NEWLINE)
+					if (sp > bol && isnewline(sp[-1]))
+						break;
+				goto dead;
+			case I_EOL:
+				if (*sp == 0)
+					break;
+				if (flags & REG_NEWLINE)
+					if (isnewline(*sp))
+						break;
+				goto dead;
+			case I_WORD:
+				i = sp > bol && iswordchar(sp[-1]);
+				i ^= iswordchar(sp[0]);
+				if (i)
+					break;
+				goto dead;
+			case I_NWORD:
+				i = sp > bol && iswordchar(sp[-1]);
+				i ^= iswordchar(sp[0]);
+				if (!i)
+					break;
+				goto dead;
+
+			case I_LPAR:
+				sub.sub[pc->n].sp = sp;
+				break;
+			case I_RPAR:
+				sub.sub[pc->n].ep = sp;
+				break;
+			default:
+				goto dead;
+			}
+			pc = pc + 1;
+		}
+dead: ;
+	}
+	return 0;
+}
+
+int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
+{
+	Resub scratch;
+	int i;
+
+	if (!sub)
+		sub = &scratch;
+
+	sub->nsub = prog->nsub;
+	for (i = 0; i < MAXSUB; ++i)
+		sub->sub[i].sp = sub->sub[i].ep = NULL;
+
+	return !match(prog->start, sp, sp, prog->flags | eflags, sub);
+}
+
+#ifdef TEST
+int main(int argc, char **argv)
+{
+	const char *error;
+	const char *s;
+	Reprog *p;
+	Resub m;
+	unsigned int i;
+
+	if (argc > 1) {
+		p = regcomp(argv[1], 0, &error);
+		if (!p) {
+			fprintf(stderr, "regcomp: %s\n", error);
+			return 1;
+		}
+
+		if (argc > 2) {
+			s = argv[2];
+			printf("nsub = %d\n", p->nsub);
+			if (!regexec(p, s, &m, 0)) {
+				for (i = 0; i < m.nsub; ++i) {
+					int n = m.sub[i].ep - m.sub[i].sp;
+					if (n > 0)
+						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
+					else
+						printf("match %d: n=0 ''\n", i);
+				}
+			} else {
+				printf("no match\n");
+			}
+		}
+	}
+
+	return 0;
+}
+#endif
--- /dev/null
+++ b/regexp.h
@@ -1,0 +1,35 @@
+#ifndef regexp_h
+#define regexp_h
+
+#define regcomp js_regcomp
+#define regexec js_regexec
+#define regfree js_regfree
+
+typedef struct Reprog Reprog;
+typedef struct Resub Resub;
+
+Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
+int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
+void regfree(Reprog *prog);
+
+enum {
+	/* regcomp flags */
+	REG_ICASE = 1,
+	REG_NEWLINE = 2,
+
+	/* regexec flags */
+	REG_NOTBOL = 4,
+
+	/* limits */
+	REG_MAXSUB = 16
+};
+
+struct Resub {
+	unsigned int nsub;
+	struct {
+		const char *sp;
+		const char *ep;
+	} sub[REG_MAXSUB];
+};
+
+#endif