shithub: libmujs

--- a/Makefile

+++ b/Makefile

@@ -26,6 +26,9 @@

 js: build/main.o build/libjs.a

 	$(CC) -o $@ $^ -lm

+re: regex.c utf.c utftype.c

+	$(CC) $(CFLAGS) -DTEST -o $@ $^

 libjs.c : $(SRCS)

 	ls $(SRCS) | awk '{print "#include \""$$1"\""}' > $@

--- a/jsregexp.c

+++ b/jsregexp.c

@@ -14,7 +14,7 @@

 	obj = jsV_newobject(J, JS_CREGEXP, J->RegExp_prototype);

-	opts = REG_EXTENDED;

+	opts = 0;

 	if (flags & JS_REGEXP_I) opts |= REG_ICASE;

 	if (flags & JS_REGEXP_M) opts |= REG_NEWLINE;

--- a/regex.c

+++ b/regex.c

@@ -1,45 +1,1023 @@

 #include <stdlib.h>

+#include <stdio.h>

+#include <string.h>

+#include <setjmp.h>

+#include <limits.h>

 #include "regex.h"

+#include "utf.h"

-Reprog *js_regcomp(const char *pattern, int cflags, const char **errorp)

+#define nelem(a) (sizeof (a) / sizeof (a)[0])

+typedef struct Reclass Reclass;

+typedef struct Renode Renode;

+typedef struct Reinst Reinst;

+struct Reclass {

+	Rune *end;

+	Rune spans[64];

+};

+struct Reprog {

+	Reinst *start, *end;

+	int icase, newline;

+	Reclass cclass[16];

+};

+struct cstate {

+	Reprog *prog;

+	Renode *pstart, *pend;

+	int flags;

+	const char *source;

+	int ncclass;

+	int ncap;

+	int nref[10];

+	int lookahead;

+	Rune yychar;

+	Reclass *yycclass;

+	int yymin, yymax;

+	const char *error;

+	jmp_buf kaboom;

+};

+static void die(struct cstate *g, const char *message)

-	static char msg[256];

-	regex_t *prog = malloc(sizeof *prog);

-	int status = regcomp(prog, pattern, cflags);

-	if (status) {

-		free(prog);

-		if (errorp) {

-			regerror(status, prog, msg, sizeof msg);

-			*errorp = msg;

+	g->error = message;

+	longjmp(g->kaboom, 1);

+}

+static inline int canon(Rune c)

+{

+	Rune u = toupperrune(c);

+	if (c >= 128 && u < 128)

+		return c;

+	return u;

+}

+/* Scan */

+enum {

+	L_CHAR = 256,

+	L_CCLASS,	/* character class */

+	L_NCCLASS,	/* negative character class */

+	L_NC,		/* "(?:" no capture */

+	L_PLA,		/* "(?=" positive lookahead */

+	L_NLA,		/* "(?!" negative lookahead */

+	L_WORD,		/* "\b" word boundary */

+	L_NWORD,	/* "\B" non-word boundary */

+	L_REF,		/* "\0" back-reference */

+	L_COUNT,	/* {M,N} */

+};

+static Reclass class_d = {

+	class_d.spans + 2, {

+		'0', '9',

+	}

+};

+static Reclass class_s = {

+	class_s.spans + 12, {

+		0x9, 0x9,

+		0xA, 0xD,

+		0x20, 0x20,

+		0xA0, 0xA0,

+		0x2028, 0x2029,

+		0xFEFF, 0xFEFF,

+	}

+};

+static Reclass class_w = {

+	class_w.spans + 8, {

+		'0', '9',

+		'A', 'Z',

+		'_', '_',

+		'a', 'z',

+	}

+};

+static Reclass *newclass(struct cstate *g)

+{

+	if (g->ncclass >= nelem(g->prog->cclass))

+		die(g, "too many character classes");

+	return &g->prog->cclass[g->ncclass++];

+}

+static int hex(struct cstate *g, int c)

+{

+	if (c >= '0' && c <= '9') return c - '0';

+	if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;

+	if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;

+	die(g, "invalid escape sequence");

+	return 0;

+}

+static int dec(struct cstate *g, int c)

+{

+	if (c >= '0' && c <= '9') return c - '0';

+	die(g, "invalid quantifier");

+	return 0;

+}

+static int nextrune(struct cstate *g)

+{

+	g->source += chartorune(&g->yychar, g->source);

+	if (g->yychar == '\\') {

+		g->source += chartorune(&g->yychar, g->source);

+		switch (g->yychar) {

+		case 0: die(g, "unterminated escape sequence");

+		// case 'b': g->yychar = '\b'; return 0;

+		case 'f': g->yychar = '\f'; return 0;

+		case 'n': g->yychar = '\n'; return 0;

+		case 'r': g->yychar = '\r'; return 0;

+		case 't': g->yychar = '\t'; return 0;

+		case 'v': g->yychar = '\v'; return 0;

+		case 'c':

+			g->yychar = (*g->source++) & 31;

+			return 0;

+		case 'x':

+			g->yychar = hex(g, *g->source++) << 4;

+			g->yychar += hex(g, *g->source++);

+			return 0;

+		case 'u':

+			g->yychar = hex(g, *g->source++) << 12;

+			g->yychar += hex(g, *g->source++) << 8;

+			g->yychar += hex(g, *g->source++) << 4;

+			g->yychar += hex(g, *g->source++);

+			return 0;

-		return NULL;

+		return 1;

-	if (errorp)

-		*errorp = NULL;

-	return (Reprog*)prog;

+	return 0;

-int js_regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags)

+static int lexcount(struct cstate *g)

-	regmatch_t m[10];

-	int i, status;

-	status = regexec((regex_t*)prog, string, 10, m, eflags);

-	for (i = 0; i < nmatch; ++i) {

-		if (m[i].rm_so >= 0) {

-			pmatch[i].sp = string + m[i].rm_so;

-			pmatch[i].ep = string + m[i].rm_eo;

+	nextrune(g);

+	g->yymin = dec(g, g->yychar);

+	nextrune(g);

+	while (g->yychar != ',' && g->yychar != '}') {

+		g->yymin = g->yymin * 10 + dec(g, g->yychar);

+		nextrune(g);

+	}

+	if (g->yymin >= USHRT_MAX)

+		die(g, "numeric overflow");

+	if (g->yychar == ',') {

+		nextrune(g);

+		if (g->yychar == '}') {

+			g->yymax = USHRT_MAX;

 		} else {

-			pmatch[i].sp = NULL;

-			pmatch[i].ep = NULL;

+			g->yymax = dec(g, g->yychar);

+			nextrune(g);

+			while (g->yychar != '}') {

+				g->yymax = g->yymax * 10 + dec(g, g->yychar);

+				nextrune(g);

+			}

+			if (g->yymax >= USHRT_MAX)

+				die(g, "numeric overflow");

+	} else {

+		g->yymax = g->yymin;

-	return status;

+	return L_COUNT;

-void js_regfree(Reprog *prog)

+static int lexclass(struct cstate *g)

+	int type = L_CCLASS;

+	int quoted;

+	Rune *p, *ep;

+	Rune save;

+	g->yycclass = newclass(g);

+	p = g->yycclass->spans;

+	ep = p + nelem(g->yycclass->spans);

+	quoted = nextrune(g);

+	if (!quoted && g->yychar == '^') {

+		type = L_NCCLASS;

+		quoted = nextrune(g);

+	}

+	while (p < ep) {

+		if (g->yychar == 0)

+			die(g, "unterminated character class");

+		if (!quoted && g->yychar == ']')

+			break;

+		save = g->yychar;

+		quoted = nextrune(g);

+		// TODO: \d \D \s \S \w \W

+		if (!quoted && g->yychar == '-') {

+			quoted = nextrune(g);

+			if (g->yychar == 0)

+				die(g, "unterminated character class");

+			if (!quoted && g->yychar == ']') {

+				*p++ = save;

+				*p++ = save;

+				if (p == ep)

+					die(g, "too many character classes");

+				*p++ = '-';

+				*p++ = '-';

+				break;

+			}

+			if (g->yychar < save)

+				die(g, "invalid character class range");

+			*p++ = save;

+			*p++ = g->yychar;

+			quoted = nextrune(g);

+		} else {

+			*p++ = save;

+			*p++ = save;

+		}

+	}

+	if (p == ep)

+		die(g, "too many character classes");

+	g->yycclass->end = p;

+	return type;

+}

+static int lex(struct cstate *g)

+{

+	int quoted = nextrune(g);

+	if (quoted) {

+		switch (g->yychar) {

+		case 'b': return L_WORD;

+		case 'B': return L_NWORD;

+		case 'd': g->yycclass = &class_d; return L_CCLASS;

+		case 's': g->yycclass = &class_s; return L_CCLASS;

+		case 'w': g->yycclass = &class_w; return L_CCLASS;

+		case 'D': g->yycclass = &class_d; return L_NCCLASS;

+		case 'S': g->yycclass = &class_s; return L_NCCLASS;

+		case 'W': g->yycclass = &class_w; return L_NCCLASS;

+		}

+		if (g->yychar >= '0' && g->yychar <= '9') {

+			g->yychar -= '0';

+			return L_REF;

+		}

+		return L_CHAR;

+	}

+	switch (g->yychar) {

+	case '*': case '+': case '?': case '|':

+	case ')': case '.': case '^': case '$':

+	case 0:

+		return g->yychar;

+	}

+	if (g->yychar == '{')

+		return lexcount(g);

+	if (g->yychar == '[')

+		return lexclass(g);

+	if (g->yychar == '(') {

+		if (g->source[0] == '?') {

+			if (g->source[1] == ':') {

+				g->source += 2;

+				return L_NC;

+			}

+			if (g->source[1] == '=') {

+				g->source += 2;

+				return L_PLA;

+			}

+			if (g->source[1] == '!') {

+				g->source += 2;

+				return L_NLA;

+			}

+		}

+		return '(';

+	}

+	return L_CHAR;

+}

+/* Parse */

+enum {

+	P_CAT, P_ALT, P_REP,

+	P_BOL, P_EOL, P_WORD, P_NWORD,

+	P_PAR, P_PLA, P_NLA,

+	P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,

+	P_REF,

+};

+struct Renode {

+	int type;

+	Reclass *cc;

+	Rune c;

+	unsigned char ng;

+	unsigned short m, n;

+	Renode *x;

+	Renode *y;

+};

+static Renode *newnode(struct cstate *g, int type)

+{

+	Renode *node = g->pend++;

+	node->type = type;

+	node->cc = NULL;

+	node->c = 0;

+	node->ng = 0;

+	node->m = 0;

+	node->n = 0;

+	node->x = node->y = NULL;

+	return node;

+}

+static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)

+{

+	Renode *rep = newnode(g, P_REP);

+	rep->ng = ng;

+	rep->m = min;

+	rep->n = max;

+	rep->x = atom;

+	return rep;

+}

+static inline void next(struct cstate *g)

+{

+	g->lookahead = lex(g);

+}

+static inline int accept(struct cstate *g, int t)

+{

+	if (g->lookahead == t) {

+		next(g);

+		return 1;

+	}

+	return 0;

+}

+static Renode *parsealt(struct cstate *g);

+static Renode *parsecap(struct cstate *g, int type)

+{

+	Renode *atom = newnode(g, type);

+	if (++g->ncap == 10)

+		die(g, "too many captures");

+	atom->n = g->ncap;

+	g->nref[atom->n] = 0;

+	atom->x = parsealt(g);

+	g->nref[atom->n] = 1;

+	if (!accept(g, ')'))

+		die(g, "unmatched '('");

+	return atom;

+}

+static Renode *parseatom(struct cstate *g)

+{

+	Renode *atom;

+	if (g->lookahead == L_CHAR) {

+		atom = newnode(g, P_CHAR);

+		atom->c = g->yychar;

+		next(g);

+		return atom;

+	}

+	if (g->lookahead == L_CCLASS) {

+		atom = newnode(g, P_CCLASS);

+		atom->cc = g->yycclass;

+		next(g);

+		return atom;

+	}

+	if (g->lookahead == L_NCCLASS) {

+		atom = newnode(g, P_NCCLASS);

+		atom->cc = g->yycclass;

+		next(g);

+		return atom;

+	}

+	if (g->lookahead == L_REF) {

+		atom = newnode(g, P_REF);

+		if (g->yychar == 0 || g->yychar > g->ncap || !g->nref[g->yychar])

+			die(g, "invalid back-reference");

+		atom->n = g->yychar;

+		next(g);

+		return atom;

+	}

+	if (accept(g, '.'))

+		return newnode(g, P_ANY);

+	if (accept(g, L_NC)) {

+		atom = parsealt(g);

+		if (!accept(g, ')'))

+			die(g, "unmatched '('");

+		return atom;

+	}

+	if (accept(g, '('))

+		return parsecap(g, P_PAR);

+	if (accept(g, L_PLA))

+		return parsecap(g, P_PLA);

+	if (accept(g, L_NLA))

+		return parsecap(g, P_NLA);

+	die(g, "syntax error");

+	return NULL;

+}

+static Renode *parserep(struct cstate *g)

+{

+	Renode *atom;

+	if (accept(g, '^')) return newnode(g, P_BOL);

+	if (accept(g, '$')) return newnode(g, P_EOL);

+	if (accept(g, L_WORD)) return newnode(g, P_WORD);

+	if (accept(g, L_NWORD)) return newnode(g, P_NWORD);

+	atom = parseatom(g);

+	if (g->lookahead == L_COUNT) {

+		int min = g->yymin, max = g->yymax;

+		next(g);

+		if (max < min)

+			die(g, "invalid quantifier");

+		return newrep(g, atom, accept(g, '?'), min, max);

+	}

+	if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, USHRT_MAX);

+	if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, USHRT_MAX);

+	if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);

+	return atom;

+}

+static Renode *parsecat(struct cstate *g)

+{

+	Renode *cat, *x;

+	if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {

+		cat = parserep(g);

+		while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {

+			x = cat;

+			cat = newnode(g, P_CAT);

+			cat->x = x;

+			cat->y = parserep(g);

+		}

+		return cat;

+	}

+	return NULL;

+}

+static Renode *parsealt(struct cstate *g)

+{

+	Renode *alt, *x;

+	alt = parsecat(g);

+	while (accept(g, '|')) {

+		x = alt;

+		alt = newnode(g, P_ALT);

+		alt->x = x;

+		alt->y = parsecat(g);

+	}

+	return alt;

+}

+/* Compile */

+enum {

+	I_MATCH, I_JUMP, I_SPLIT, I_PLA, I_NLA,

+	I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,

+	I_BOL, I_EOL, I_WORD, I_NWORD,

+	I_PAR, I_ENDPAR

+};

+struct Reinst {

+	int opcode;

+	Reclass *cc;

+	Rune c;

+	unsigned char n;

+	Reinst *x;

+	Reinst *y;

+	const char *p;

+};

+static int count(Renode *node)

+{

+	int min, max;

+	if (!node) return 0;

+	switch (node->type) {

+	default: return 1;

+	case P_CAT: return count(node->x) + count(node->y);

+	case P_ALT: return count(node->x) + count(node->y) + 2;

+	case P_REP:

+		min = node->m;

+		max = node->n;

+		if (min == max) return count(node->x) * min;

+		if (max < USHRT_MAX) return count(node->x) * max + (max - min);

+		return count(node->x) * (min + 1) + 2;

+	case P_PAR: return count(node->x) + 2;

+	case P_PLA: return count(node->x) + 4;

+	case P_NLA: return count(node->x) + 4;

+	}

+}

+static Reinst *emit(Reprog *prog, int opcode)

+{

+	Reinst *inst = prog->end++;

+	inst->opcode = opcode;

+	inst->cc = NULL;

+	inst->c = 0;

+	inst->n = 0;

+	inst->x = inst->y = NULL;

+	inst->p = 0;

+	return inst;

+}

+static void compile(Reprog *prog, Renode *node)

+{

+	Reinst *inst, *split, *jump;

+	int i;

+	if (!node)

+		return;

+	switch (node->type) {

+	case P_CAT:

+		compile(prog, node->x);

+		compile(prog, node->y);

+		break;

+	case P_ALT:

+		split = emit(prog, I_SPLIT);

+		compile(prog, node->x);

+		jump = emit(prog, I_JUMP);

+		compile(prog, node->y);

+		split->x = split + 1;

+		split->y = jump + 1;

+		jump->x = prog->end;

+		break;

+	case P_REP:

+		for (i = 0; i < node->m; ++i) {

+			inst = prog->end;

+			compile(prog, node->x);

+		}

+		if (node->m == node->n)

+			break;

+		if (node->n < USHRT_MAX) {

+			for (i = node->m; i < node->n; ++i) {

+				split = emit(prog, I_SPLIT);

+				compile(prog, node->x);

+				if (node->ng) {

+					split->y = split + 1;

+					split->x = prog->end;

+				} else {

+					split->x = split + 1;

+					split->y = prog->end;

+				}

+			}

+		} else if (node->m == 0) {

+			split = emit(prog, I_SPLIT);

+			compile(prog, node->x);

+			jump = emit(prog, I_JUMP);

+			if (node->ng) {

+				split->y = split + 1;

+				split->x = prog->end;

+			} else {

+				split->x = split + 1;

+				split->y = prog->end;

+			}

+			jump->x = split;

+		} else {

+			split = emit(prog, I_SPLIT);

+			if (node->ng) {

+				split->y = inst;

+				split->x = prog->end;

+			} else {

+				split->x = inst;

+				split->y = prog->end;

+			}

+		}

+		break;

+	case P_BOL: emit(prog, I_BOL); break;

+	case P_EOL: emit(prog, I_EOL); break;

+	case P_WORD: emit(prog, I_WORD); break;

+	case P_NWORD: emit(prog, I_NWORD); break;

+	case P_PAR:

+		inst = emit(prog, I_PAR);

+		inst->n = node->n;

+		compile(prog, node->x);

+		inst = emit(prog, I_ENDPAR);

+		inst->n = node->n;

+		break;

+	case P_PLA:

+		split = emit(prog, I_PLA);

+		inst = emit(prog, I_PAR);

+		inst->n = node->n;

+		compile(prog, node->x);

+		inst = emit(prog, I_ENDPAR);

+		inst->n = node->n;

+		emit(prog, I_MATCH);

+		split->x = split + 1;

+		split->y = prog->end;

+		break;

+	case P_NLA:

+		split = emit(prog, I_NLA);

+		inst = emit(prog, I_PAR);

+		inst->n = node->n;

+		compile(prog, node->x);

+		inst = emit(prog, I_ENDPAR);

+		inst->n = node->n;

+		emit(prog, I_MATCH);

+		split->x = split + 1;

+		split->y = prog->end;

+		break;

+	case P_ANY:

+		emit(prog, I_ANY);

+		break;

+	case P_CHAR:

+		inst = emit(prog, I_CHAR);

+		inst->c = prog->icase ? canon(node->c) : node->c;

+		break;

+	case P_CCLASS:

+		inst = emit(prog, I_CCLASS);

+		inst->cc = node->cc;

+		break;

+	case P_NCCLASS:

+		inst = emit(prog, I_NCCLASS);

+		inst->cc = node->cc;

+		break;

+	case P_REF:

+		inst = emit(prog, I_REF);

+		inst->n = node->n;

+		break;

+	}

+}

+#ifdef TEST

+static void dumpnode(Renode *node)

+{

+	Rune *p;

+	if (!node) { printf("Empty"); return; }

+	switch (node->type) {

+	case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;

+	case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;

+	case P_REP:

+		printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);

+		dumpnode(node->x);

+		printf(")");

+		break;

+	case P_BOL: printf("Bol"); break;

+	case P_EOL: printf("Eol"); break;

+	case P_WORD: printf("Word"); break;

+	case P_NWORD: printf("NotWord"); break;

+	case P_PAR: printf("Par("); dumpnode(node->x); printf(")"); break;

+	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;

+	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;

+	case P_ANY: printf("Any"); break;

+	case P_CHAR: printf("Char(%c)", node->c); break;

+	case P_CCLASS:

+		printf("Class(");

+		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);

+		printf(")");

+		break;

+	case P_NCCLASS:

+		printf("NotClass(");

+		for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);

+		printf(")");

+		break;

+	case P_REF: printf("Ref(%d)", node->n); break;

+	}

+}

+static void dumpprog(Reprog *prog)

+{

+	Reinst *inst;

+	int i;

+	for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {

+		printf("% 5d: ", i);

+		switch (inst->opcode) {

+		case I_MATCH: puts("match"); break;

+		case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;

+		case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;

+		case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;

+		case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;

+		case I_ANY: puts("any"); break;

+		case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;

+		case I_CCLASS: puts("cclass"); break;

+		case I_NCCLASS: puts("ncclass"); break;

+		case I_REF: printf("ref %d\n", inst->n); break;

+		case I_BOL: puts("bol"); break;

+		case I_EOL: puts("eol"); break;

+		case I_WORD: puts("word"); break;

+		case I_NWORD: puts("nword"); break;

+		case I_PAR: printf("par %d\n", inst->n); break;

+		case I_ENDPAR: printf("endpar %d\n", inst->n); break;

+		}

+	}

+}

+#endif

+Reprog *regcomp(const char *pattern, int cflags, const char **errorp)

+{

+	struct cstate g;

+	Renode *node;

+	int i;

+	g.prog = malloc(sizeof (Reprog));

+	g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);

+	if (setjmp(g.kaboom)) {

+		if (errorp) *errorp = g.error;

+		free(g.pstart);

+		free(g.prog);

+		return NULL;

+	}

+	g.source = pattern;

+	g.ncclass = 0;

+	g.ncap = 0;

+	for (i = 0; i < 10; ++i)

+		g.nref[i] = 0;

+	g.prog->icase = cflags & REG_ICASE;

+	g.prog->newline = cflags & REG_NEWLINE;

+	next(&g);

+	node = parsealt(&g);

+	if (g.lookahead == ')')

+		die(&g, "unmatched ')'");

+	if (g.lookahead != 0)

+		die(&g, "syntax error");

+	g.prog->start = g.prog->end = malloc((count(node) + 3) * sizeof (Reinst));

+	emit(g.prog, I_PAR);

+	compile(g.prog, node);

+	emit(g.prog, I_ENDPAR);

+	emit(g.prog, I_MATCH);

+#ifdef TEST

+	dumpnode(node);

+	putchar('\n');

+	dumpprog(g.prog);

+#endif

+	free(g.pstart);

+	if (errorp) *errorp = NULL;

+	return g.prog;

+}

+void regfree(Reprog *prog)

+{

 	if (prog) {

-		regfree((regex_t*)prog);

+		free(prog->start);

 		free(prog);

+/* Match */

+struct estate {

+	int icase, newline;

+	const char *bol;

+	Resub *m;

+};

+static inline int isnewline(int c)

+{

+	return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;

+}

+static inline int iswordchar(int c)

+{

+	return c == '_' ||

+		(c >= 'a' && c <= 'z') ||

+		(c >= 'A' && c <= 'Z') ||

+		(c >= '0' && c <= '9');

+}

+static inline int incclass(Reclass *cc, Rune c)

+{

+	Rune *p;

+	for (p = cc->spans; p < cc->end; p += 2)

+		if (p[0] <= c && c <= p[1])

+			return 1;

+	return 0;

+}

+static inline int incclasscanon(Reclass *cc, Rune c)

+{

+	Rune *p, r;

+	for (p = cc->spans; p < cc->end; p += 2)

+		for (r = p[0]; r <= p[1]; ++r)

+			if (c == canon(r))

+				return 1;

+	return 0;

+}

+static int strncmpcanon(const char *a, const char *b, int n)

+{

+	Rune ra, rb;

+	int c;

+	while (n--) {

+		if (!*a) return -1;

+		if (!*b) return 1;

+		a += chartorune(&ra, a);

+		b += chartorune(&rb, b);

+		c = canon(ra) - canon(rb);

+		if (c)

+			return c;

+	}

+	return 0;

+}

+static int match(struct estate *g, Reinst *pc, const char *s)

+{

+	const char *p;

+	Rune c;

+	int n;

+	for (;;) {

+		switch (pc->opcode) {

+		case I_MATCH:

+			return 1;

+		case I_JUMP:

+			pc = pc->x;

+			continue;

+		case I_SPLIT:

+			// TODO: use I_PROGRESS instead?

+#if 0

+			if (match(g, pc->x, s))

+				return 1;

+			pc = pc->y;

+			continue;

+#else

+			if (pc->p == s) return 0;

+			p = pc->p;

+			pc->p = s;

+			if (match(g, pc->x, s) || match(g, pc->y, s)) {

+				pc->p = p;

+				return 1;

+			}

+			pc->p = p;

+			return 0;

+#endif

+		case I_PLA:

+			if (!match(g, pc->x, s))

+				return 0;

+			pc = pc->y;

+			continue;

+		case I_NLA:

+			if (match(g, pc->x, s))

+				return 0;

+			pc = pc->y;

+			continue;

+		case I_ANY:

+			s += chartorune(&c, s);

+			if (c == 0)

+				return 0;

+			if (isnewline(c))

+				return 0;

+			break;

+		case I_CHAR:

+			s += chartorune(&c, s);

+			if (c == 0)

+				return 0;

+			if (g->icase)

+				c = canon(c);

+			if (c != pc->c)

+				return 0;

+			break;

+		case I_CCLASS:

+			s += chartorune(&c, s);

+			if (c == 0)

+				return 0;

+			if (g->icase) {

+				if (!incclasscanon(pc->cc, canon(c)))

+					return 0;

+			} else {

+				if (!incclass(pc->cc, c))

+					return 0;

+			}

+			break;

+		case I_NCCLASS:

+			s += chartorune(&c, s);

+			if (c == 0)

+				return 0;

+			if (g->icase) {

+				if (incclasscanon(pc->cc, canon(c)))

+					return 0;

+			} else {

+				if (incclass(pc->cc, c))

+					return 0;

+			}

+			break;

+		case I_REF:

+			n = (int)(g->m[pc->n].ep - g->m[pc->n].sp);

+			if (g->icase) {

+				if (strncmpcanon(s, g->m[pc->n].sp, n))

+					return 0;

+			} else {

+				if (strncmp(s, g->m[pc->n].sp, n))

+					return 0;

+			}

+			if (n > 0)

+				s += n;

+			break;

+		case I_BOL:

+			if (s == g->bol)

+				break;

+			if (g->newline)

+				if (isnewline(s[-1]))

+					break;

+			return 0;

+		case I_EOL:

+			if (*s == 0)

+				break;

+			if (g->newline)

+				if (isnewline(*s))

+					break;

+			return 0;

+		case I_WORD:

+			n = !(s == g->bol) && iswordchar(s[-1]);

+			n ^= iswordchar(s[0]);

+			if (n)

+				break;

+			return 0;

+		case I_NWORD:

+			n = !(s == g->bol) && iswordchar(s[-1]);

+			n ^= iswordchar(s[0]);

+			if (!n)

+				break;

+			return 0;

+		case I_PAR:

+			p = g->m[pc->n].sp;

+			g->m[pc->n].sp = s;

+			if (match(g, pc + 1, s))

+				return 1;

+			g->m[pc->n].sp = p;

+			return 0;

+		case I_ENDPAR:

+			p = g->m[pc->n].ep;

+			g->m[pc->n].ep = s;

+			if (match(g, pc + 1, s))

+				return 1;

+			g->m[pc->n].ep = p;

+			return 0;

+		default:

+			return 0;

+		}

+		pc = pc + 1;

+	}

+}

+int regexec(Reprog *prog, const char *s, int n, Resub *m, int eflags)

+{

+	struct estate g;

+	Resub gm[10];

+	Rune c;

+	int i;

+	g.icase = prog->icase;

+	g.newline = prog->newline;

+	g.bol = eflags & REG_NOTBOL ? NULL : s;

+	g.m = m ? m : gm;

+	for (i = 0; i < n; ++i)

+		g.m[i].sp = g.m[i].ep = NULL;

+	do {

+		if (match(&g, prog->start, s))

+			return 0;

+		s += chartorune(&c, s);

+	} while (c);

+	return 1;

+}

+#ifdef TEST

+int main(int argc, char **argv)

+{

+	const char *error;

+	const char *s;

+	Reprog *p;

+	Resub m[10];

+	int i;

+	if (argc > 1) {

+		p = regcomp(argv[1], 0, &error);

+		if (!p) {

+			fprintf(stderr, "regcomp: %s\n", error);

+			return 1;

+		}

+		if (argc > 2) {

+			s = argv[2];

+			if (!regexec(p, s, 10, m, 0)) {

+				for (i = 0; i < 10; ++i)

+					if (m[i].sp) {

+						int n = m[i].ep - m[i].sp;

+						printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp);

+					}

+			} else {

+				printf("no match\n");

+			}

+		}

+	}

+	return 0;

+}

+#endif

--- a/regex.h

+++ b/regex.h

@@ -1,16 +1,31 @@

 #ifndef regex_h

 #define regex_h

-#include <regex.h>

+#define regcomp js_regcomp

+#define regexec js_regexec

+#define regfree js_regfree

 typedef struct Reprog Reprog;

-typedef struct {

+typedef struct Resub Resub;

+struct Resub {

 	const char *sp;

 	const char *ep;

-} Resub;

+};

-Reprog *js_regcomp(const char *pattern, int cflags, const char **errorp);

-int js_regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);

-void js_regfree(Reprog *prog);

+Reprog *regcomp(const char *pattern, int cflags, const char **errorp);

+int regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);

+void regfree(Reprog *prog);

+enum {

+	/* regcomp flags */

+	REG_ICASE = 1,

+	REG_NEWLINE = 2

+};

+enum {

+	/* regexec flags */

+	REG_NOTBOL = 1,

+	REG_NOTEOL = 2

+};

 #endif

--

⑨