ref: 8898715c8319aa25d0bd316fedfaddbd42587bf4
parent: 2ac601d07e226f0243125f2079c7af041b74fa6f
author: Tor Andersson <tor@ccxvii.net>
date: Thu Feb 20 21:48:52 EST 2014
Regular expression implementation. Does not support [\w\D] type character ranges yet. Has bugs in lookahead captures.
--- a/Makefile
+++ b/Makefile
@@ -26,6 +26,9 @@
js: build/main.o build/libjs.a
$(CC) -o $@ $^ -lm
+re: regex.c utf.c utftype.c
+ $(CC) $(CFLAGS) -DTEST -o $@ $^
+
libjs.c : $(SRCS)
ls $(SRCS) | awk '{print "#include \""$$1"\""}' > $@
--- a/jsregexp.c
+++ b/jsregexp.c
@@ -14,7 +14,7 @@
obj = jsV_newobject(J, JS_CREGEXP, J->RegExp_prototype);
- opts = REG_EXTENDED;
+ opts = 0;
if (flags & JS_REGEXP_I) opts |= REG_ICASE;
if (flags & JS_REGEXP_M) opts |= REG_NEWLINE;
--- a/regex.c
+++ b/regex.c
@@ -1,45 +1,1023 @@
#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <setjmp.h>
+#include <limits.h>
+
#include "regex.h"
+#include "utf.h"
-Reprog *js_regcomp(const char *pattern, int cflags, const char **errorp)
+#define nelem(a) (sizeof (a) / sizeof (a)[0])
+
+typedef struct Reclass Reclass;
+typedef struct Renode Renode;
+typedef struct Reinst Reinst;
+
+struct Reclass {
+ Rune *end;
+ Rune spans[64];
+};
+
+struct Reprog {
+ Reinst *start, *end;
+ int icase, newline;
+ Reclass cclass[16];
+};
+
+struct cstate {
+ Reprog *prog;
+ Renode *pstart, *pend;
+ int flags;
+
+ const char *source;
+ int ncclass;
+ int ncap;
+ int nref[10];
+
+ int lookahead;
+ Rune yychar;
+ Reclass *yycclass;
+ int yymin, yymax;
+
+ const char *error;
+ jmp_buf kaboom;
+};
+
+static void die(struct cstate *g, const char *message)
{
- static char msg[256];
- regex_t *prog = malloc(sizeof *prog);
- int status = regcomp(prog, pattern, cflags);
- if (status) {
- free(prog);
- if (errorp) {
- regerror(status, prog, msg, sizeof msg);
- *errorp = msg;
+ g->error = message;
+ longjmp(g->kaboom, 1);
+}
+
+static inline int canon(Rune c)
+{
+ Rune u = toupperrune(c);
+ if (c >= 128 && u < 128)
+ return c;
+ return u;
+}
+
+/* Scan */
+
+enum {
+ L_CHAR = 256,
+ L_CCLASS, /* character class */
+ L_NCCLASS, /* negative character class */
+ L_NC, /* "(?:" no capture */
+ L_PLA, /* "(?=" positive lookahead */
+ L_NLA, /* "(?!" negative lookahead */
+ L_WORD, /* "\b" word boundary */
+ L_NWORD, /* "\B" non-word boundary */
+ L_REF, /* "\0" back-reference */
+ L_COUNT, /* {M,N} */
+};
+
+static Reclass class_d = {
+ class_d.spans + 2, {
+ '0', '9',
+ }
+};
+
+static Reclass class_s = {
+ class_s.spans + 12, {
+ 0x9, 0x9,
+ 0xA, 0xD,
+ 0x20, 0x20,
+ 0xA0, 0xA0,
+ 0x2028, 0x2029,
+ 0xFEFF, 0xFEFF,
+ }
+};
+
+static Reclass class_w = {
+ class_w.spans + 8, {
+ '0', '9',
+ 'A', 'Z',
+ '_', '_',
+ 'a', 'z',
+ }
+};
+
+static Reclass *newclass(struct cstate *g)
+{
+ if (g->ncclass >= nelem(g->prog->cclass))
+ die(g, "too many character classes");
+ return &g->prog->cclass[g->ncclass++];
+}
+
+static int hex(struct cstate *g, int c)
+{
+ if (c >= '0' && c <= '9') return c - '0';
+ if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
+ if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
+ die(g, "invalid escape sequence");
+ return 0;
+}
+
+static int dec(struct cstate *g, int c)
+{
+ if (c >= '0' && c <= '9') return c - '0';
+ die(g, "invalid quantifier");
+ return 0;
+}
+
+static int nextrune(struct cstate *g)
+{
+ g->source += chartorune(&g->yychar, g->source);
+ if (g->yychar == '\\') {
+ g->source += chartorune(&g->yychar, g->source);
+ switch (g->yychar) {
+ case 0: die(g, "unterminated escape sequence");
+ // case 'b': g->yychar = '\b'; return 0;
+ case 'f': g->yychar = '\f'; return 0;
+ case 'n': g->yychar = '\n'; return 0;
+ case 'r': g->yychar = '\r'; return 0;
+ case 't': g->yychar = '\t'; return 0;
+ case 'v': g->yychar = '\v'; return 0;
+ case 'c':
+ g->yychar = (*g->source++) & 31;
+ return 0;
+ case 'x':
+ g->yychar = hex(g, *g->source++) << 4;
+ g->yychar += hex(g, *g->source++);
+ return 0;
+ case 'u':
+ g->yychar = hex(g, *g->source++) << 12;
+ g->yychar += hex(g, *g->source++) << 8;
+ g->yychar += hex(g, *g->source++) << 4;
+ g->yychar += hex(g, *g->source++);
+ return 0;
}
- return NULL;
+ return 1;
}
- if (errorp)
- *errorp = NULL;
- return (Reprog*)prog;
+ return 0;
}
-int js_regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags)
+static int lexcount(struct cstate *g)
{
- regmatch_t m[10];
- int i, status;
- status = regexec((regex_t*)prog, string, 10, m, eflags);
- for (i = 0; i < nmatch; ++i) {
- if (m[i].rm_so >= 0) {
- pmatch[i].sp = string + m[i].rm_so;
- pmatch[i].ep = string + m[i].rm_eo;
+ nextrune(g);
+
+ g->yymin = dec(g, g->yychar);
+ nextrune(g);
+ while (g->yychar != ',' && g->yychar != '}') {
+ g->yymin = g->yymin * 10 + dec(g, g->yychar);
+ nextrune(g);
+ }
+ if (g->yymin >= USHRT_MAX)
+ die(g, "numeric overflow");
+
+ if (g->yychar == ',') {
+ nextrune(g);
+ if (g->yychar == '}') {
+ g->yymax = USHRT_MAX;
} else {
- pmatch[i].sp = NULL;
- pmatch[i].ep = NULL;
+ g->yymax = dec(g, g->yychar);
+ nextrune(g);
+ while (g->yychar != '}') {
+ g->yymax = g->yymax * 10 + dec(g, g->yychar);
+ nextrune(g);
+ }
+ if (g->yymax >= USHRT_MAX)
+ die(g, "numeric overflow");
}
+ } else {
+ g->yymax = g->yymin;
}
- return status;
+
+ return L_COUNT;
}
-void js_regfree(Reprog *prog)
+static int lexclass(struct cstate *g)
{
+ int type = L_CCLASS;
+ int quoted;
+ Rune *p, *ep;
+ Rune save;
+
+ g->yycclass = newclass(g);
+ p = g->yycclass->spans;
+ ep = p + nelem(g->yycclass->spans);
+
+ quoted = nextrune(g);
+ if (!quoted && g->yychar == '^') {
+ type = L_NCCLASS;
+ quoted = nextrune(g);
+ }
+
+ while (p < ep) {
+ if (g->yychar == 0)
+ die(g, "unterminated character class");
+ if (!quoted && g->yychar == ']')
+ break;
+
+ save = g->yychar;
+ quoted = nextrune(g);
+
+ // TODO: \d \D \s \S \w \W
+
+ if (!quoted && g->yychar == '-') {
+ quoted = nextrune(g);
+ if (g->yychar == 0)
+ die(g, "unterminated character class");
+ if (!quoted && g->yychar == ']') {
+ *p++ = save;
+ *p++ = save;
+ if (p == ep)
+ die(g, "too many character classes");
+ *p++ = '-';
+ *p++ = '-';
+ break;
+ }
+
+ if (g->yychar < save)
+ die(g, "invalid character class range");
+ *p++ = save;
+ *p++ = g->yychar;
+ quoted = nextrune(g);
+ } else {
+ *p++ = save;
+ *p++ = save;
+ }
+ }
+ if (p == ep)
+ die(g, "too many character classes");
+
+ g->yycclass->end = p;
+
+ return type;
+}
+
+static int lex(struct cstate *g)
+{
+ int quoted = nextrune(g);
+ if (quoted) {
+ switch (g->yychar) {
+ case 'b': return L_WORD;
+ case 'B': return L_NWORD;
+ case 'd': g->yycclass = &class_d; return L_CCLASS;
+ case 's': g->yycclass = &class_s; return L_CCLASS;
+ case 'w': g->yycclass = &class_w; return L_CCLASS;
+ case 'D': g->yycclass = &class_d; return L_NCCLASS;
+ case 'S': g->yycclass = &class_s; return L_NCCLASS;
+ case 'W': g->yycclass = &class_w; return L_NCCLASS;
+ }
+ if (g->yychar >= '0' && g->yychar <= '9') {
+ g->yychar -= '0';
+ return L_REF;
+ }
+ return L_CHAR;
+ }
+
+ switch (g->yychar) {
+ case '*': case '+': case '?': case '|':
+ case ')': case '.': case '^': case '$':
+ case 0:
+ return g->yychar;
+ }
+
+ if (g->yychar == '{')
+ return lexcount(g);
+ if (g->yychar == '[')
+ return lexclass(g);
+ if (g->yychar == '(') {
+ if (g->source[0] == '?') {
+ if (g->source[1] == ':') {
+ g->source += 2;
+ return L_NC;
+ }
+ if (g->source[1] == '=') {
+ g->source += 2;
+ return L_PLA;
+ }
+ if (g->source[1] == '!') {
+ g->source += 2;
+ return L_NLA;
+ }
+ }
+ return '(';
+ }
+
+ return L_CHAR;
+}
+
+/* Parse */
+
+enum {
+ P_CAT, P_ALT, P_REP,
+ P_BOL, P_EOL, P_WORD, P_NWORD,
+ P_PAR, P_PLA, P_NLA,
+ P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
+ P_REF,
+};
+
+struct Renode {
+ int type;
+ Reclass *cc;
+ Rune c;
+ unsigned char ng;
+ unsigned short m, n;
+ Renode *x;
+ Renode *y;
+};
+
+static Renode *newnode(struct cstate *g, int type)
+{
+ Renode *node = g->pend++;
+ node->type = type;
+ node->cc = NULL;
+ node->c = 0;
+ node->ng = 0;
+ node->m = 0;
+ node->n = 0;
+ node->x = node->y = NULL;
+ return node;
+}
+
+static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
+{
+ Renode *rep = newnode(g, P_REP);
+ rep->ng = ng;
+ rep->m = min;
+ rep->n = max;
+ rep->x = atom;
+ return rep;
+}
+
+static inline void next(struct cstate *g)
+{
+ g->lookahead = lex(g);
+}
+
+static inline int accept(struct cstate *g, int t)
+{
+ if (g->lookahead == t) {
+ next(g);
+ return 1;
+ }
+ return 0;
+}
+
+static Renode *parsealt(struct cstate *g);
+
+static Renode *parsecap(struct cstate *g, int type)
+{
+ Renode *atom = newnode(g, type);
+ if (++g->ncap == 10)
+ die(g, "too many captures");
+ atom->n = g->ncap;
+ g->nref[atom->n] = 0;
+ atom->x = parsealt(g);
+ g->nref[atom->n] = 1;
+ if (!accept(g, ')'))
+ die(g, "unmatched '('");
+ return atom;
+}
+
+static Renode *parseatom(struct cstate *g)
+{
+ Renode *atom;
+ if (g->lookahead == L_CHAR) {
+ atom = newnode(g, P_CHAR);
+ atom->c = g->yychar;
+ next(g);
+ return atom;
+ }
+ if (g->lookahead == L_CCLASS) {
+ atom = newnode(g, P_CCLASS);
+ atom->cc = g->yycclass;
+ next(g);
+ return atom;
+ }
+ if (g->lookahead == L_NCCLASS) {
+ atom = newnode(g, P_NCCLASS);
+ atom->cc = g->yycclass;
+ next(g);
+ return atom;
+ }
+ if (g->lookahead == L_REF) {
+ atom = newnode(g, P_REF);
+ if (g->yychar == 0 || g->yychar > g->ncap || !g->nref[g->yychar])
+ die(g, "invalid back-reference");
+ atom->n = g->yychar;
+ next(g);
+ return atom;
+ }
+ if (accept(g, '.'))
+ return newnode(g, P_ANY);
+ if (accept(g, L_NC)) {
+ atom = parsealt(g);
+ if (!accept(g, ')'))
+ die(g, "unmatched '('");
+ return atom;
+ }
+ if (accept(g, '('))
+ return parsecap(g, P_PAR);
+ if (accept(g, L_PLA))
+ return parsecap(g, P_PLA);
+ if (accept(g, L_NLA))
+ return parsecap(g, P_NLA);
+ die(g, "syntax error");
+ return NULL;
+}
+
+static Renode *parserep(struct cstate *g)
+{
+ Renode *atom;
+
+ if (accept(g, '^')) return newnode(g, P_BOL);
+ if (accept(g, '$')) return newnode(g, P_EOL);
+ if (accept(g, L_WORD)) return newnode(g, P_WORD);
+ if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
+
+ atom = parseatom(g);
+ if (g->lookahead == L_COUNT) {
+ int min = g->yymin, max = g->yymax;
+ next(g);
+ if (max < min)
+ die(g, "invalid quantifier");
+ return newrep(g, atom, accept(g, '?'), min, max);
+ }
+ if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, USHRT_MAX);
+ if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, USHRT_MAX);
+ if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
+ return atom;
+}
+
+static Renode *parsecat(struct cstate *g)
+{
+ Renode *cat, *x;
+ if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+ cat = parserep(g);
+ while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+ x = cat;
+ cat = newnode(g, P_CAT);
+ cat->x = x;
+ cat->y = parserep(g);
+ }
+ return cat;
+ }
+ return NULL;
+}
+
+static Renode *parsealt(struct cstate *g)
+{
+ Renode *alt, *x;
+ alt = parsecat(g);
+ while (accept(g, '|')) {
+ x = alt;
+ alt = newnode(g, P_ALT);
+ alt->x = x;
+ alt->y = parsecat(g);
+ }
+ return alt;
+}
+
+/* Compile */
+
+enum {
+ I_MATCH, I_JUMP, I_SPLIT, I_PLA, I_NLA,
+ I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
+ I_BOL, I_EOL, I_WORD, I_NWORD,
+ I_PAR, I_ENDPAR
+};
+
+struct Reinst {
+ int opcode;
+ Reclass *cc;
+ Rune c;
+ unsigned char n;
+ Reinst *x;
+ Reinst *y;
+ const char *p;
+};
+
+static int count(Renode *node)
+{
+ int min, max;
+ if (!node) return 0;
+ switch (node->type) {
+ default: return 1;
+ case P_CAT: return count(node->x) + count(node->y);
+ case P_ALT: return count(node->x) + count(node->y) + 2;
+ case P_REP:
+ min = node->m;
+ max = node->n;
+ if (min == max) return count(node->x) * min;
+ if (max < USHRT_MAX) return count(node->x) * max + (max - min);
+ return count(node->x) * (min + 1) + 2;
+ case P_PAR: return count(node->x) + 2;
+ case P_PLA: return count(node->x) + 4;
+ case P_NLA: return count(node->x) + 4;
+ }
+}
+
+static Reinst *emit(Reprog *prog, int opcode)
+{
+ Reinst *inst = prog->end++;
+ inst->opcode = opcode;
+ inst->cc = NULL;
+ inst->c = 0;
+ inst->n = 0;
+ inst->x = inst->y = NULL;
+ inst->p = 0;
+ return inst;
+}
+
+static void compile(Reprog *prog, Renode *node)
+{
+ Reinst *inst, *split, *jump;
+ int i;
+
+ if (!node)
+ return;
+
+ switch (node->type) {
+ case P_CAT:
+ compile(prog, node->x);
+ compile(prog, node->y);
+ break;
+
+ case P_ALT:
+ split = emit(prog, I_SPLIT);
+ compile(prog, node->x);
+ jump = emit(prog, I_JUMP);
+ compile(prog, node->y);
+ split->x = split + 1;
+ split->y = jump + 1;
+ jump->x = prog->end;
+ break;
+
+ case P_REP:
+ for (i = 0; i < node->m; ++i) {
+ inst = prog->end;
+ compile(prog, node->x);
+ }
+ if (node->m == node->n)
+ break;
+ if (node->n < USHRT_MAX) {
+ for (i = node->m; i < node->n; ++i) {
+ split = emit(prog, I_SPLIT);
+ compile(prog, node->x);
+ if (node->ng) {
+ split->y = split + 1;
+ split->x = prog->end;
+ } else {
+ split->x = split + 1;
+ split->y = prog->end;
+ }
+ }
+ } else if (node->m == 0) {
+ split = emit(prog, I_SPLIT);
+ compile(prog, node->x);
+ jump = emit(prog, I_JUMP);
+ if (node->ng) {
+ split->y = split + 1;
+ split->x = prog->end;
+ } else {
+ split->x = split + 1;
+ split->y = prog->end;
+ }
+ jump->x = split;
+ } else {
+ split = emit(prog, I_SPLIT);
+ if (node->ng) {
+ split->y = inst;
+ split->x = prog->end;
+ } else {
+ split->x = inst;
+ split->y = prog->end;
+ }
+ }
+ break;
+
+ case P_BOL: emit(prog, I_BOL); break;
+ case P_EOL: emit(prog, I_EOL); break;
+ case P_WORD: emit(prog, I_WORD); break;
+ case P_NWORD: emit(prog, I_NWORD); break;
+
+ case P_PAR:
+ inst = emit(prog, I_PAR);
+ inst->n = node->n;
+ compile(prog, node->x);
+ inst = emit(prog, I_ENDPAR);
+ inst->n = node->n;
+ break;
+ case P_PLA:
+ split = emit(prog, I_PLA);
+ inst = emit(prog, I_PAR);
+ inst->n = node->n;
+ compile(prog, node->x);
+ inst = emit(prog, I_ENDPAR);
+ inst->n = node->n;
+ emit(prog, I_MATCH);
+ split->x = split + 1;
+ split->y = prog->end;
+ break;
+ case P_NLA:
+ split = emit(prog, I_NLA);
+ inst = emit(prog, I_PAR);
+ inst->n = node->n;
+ compile(prog, node->x);
+ inst = emit(prog, I_ENDPAR);
+ inst->n = node->n;
+ emit(prog, I_MATCH);
+ split->x = split + 1;
+ split->y = prog->end;
+ break;
+
+ case P_ANY:
+ emit(prog, I_ANY);
+ break;
+ case P_CHAR:
+ inst = emit(prog, I_CHAR);
+ inst->c = prog->icase ? canon(node->c) : node->c;
+ break;
+ case P_CCLASS:
+ inst = emit(prog, I_CCLASS);
+ inst->cc = node->cc;
+ break;
+ case P_NCCLASS:
+ inst = emit(prog, I_NCCLASS);
+ inst->cc = node->cc;
+ break;
+ case P_REF:
+ inst = emit(prog, I_REF);
+ inst->n = node->n;
+ break;
+ }
+}
+
+#ifdef TEST
+static void dumpnode(Renode *node)
+{
+ Rune *p;
+ if (!node) { printf("Empty"); return; }
+ switch (node->type) {
+ case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
+ case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
+ case P_REP:
+ printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
+ dumpnode(node->x);
+ printf(")");
+ break;
+ case P_BOL: printf("Bol"); break;
+ case P_EOL: printf("Eol"); break;
+ case P_WORD: printf("Word"); break;
+ case P_NWORD: printf("NotWord"); break;
+ case P_PAR: printf("Par("); dumpnode(node->x); printf(")"); break;
+ case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
+ case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
+ case P_ANY: printf("Any"); break;
+ case P_CHAR: printf("Char(%c)", node->c); break;
+ case P_CCLASS:
+ printf("Class(");
+ for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+ printf(")");
+ break;
+ case P_NCCLASS:
+ printf("NotClass(");
+ for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+ printf(")");
+ break;
+ case P_REF: printf("Ref(%d)", node->n); break;
+ }
+}
+
+static void dumpprog(Reprog *prog)
+{
+ Reinst *inst;
+ int i;
+ for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
+ printf("% 5d: ", i);
+ switch (inst->opcode) {
+ case I_MATCH: puts("match"); break;
+ case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
+ case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+ case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+ case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+ case I_ANY: puts("any"); break;
+ case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
+ case I_CCLASS: puts("cclass"); break;
+ case I_NCCLASS: puts("ncclass"); break;
+ case I_REF: printf("ref %d\n", inst->n); break;
+ case I_BOL: puts("bol"); break;
+ case I_EOL: puts("eol"); break;
+ case I_WORD: puts("word"); break;
+ case I_NWORD: puts("nword"); break;
+ case I_PAR: printf("par %d\n", inst->n); break;
+ case I_ENDPAR: printf("endpar %d\n", inst->n); break;
+ }
+ }
+}
+#endif
+
+Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
+{
+ struct cstate g;
+ Renode *node;
+ int i;
+
+ g.prog = malloc(sizeof (Reprog));
+ g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
+
+ if (setjmp(g.kaboom)) {
+ if (errorp) *errorp = g.error;
+ free(g.pstart);
+ free(g.prog);
+ return NULL;
+ }
+
+ g.source = pattern;
+ g.ncclass = 0;
+ g.ncap = 0;
+ for (i = 0; i < 10; ++i)
+ g.nref[i] = 0;
+
+ g.prog->icase = cflags & REG_ICASE;
+ g.prog->newline = cflags & REG_NEWLINE;
+
+ next(&g);
+ node = parsealt(&g);
+ if (g.lookahead == ')')
+ die(&g, "unmatched ')'");
+ if (g.lookahead != 0)
+ die(&g, "syntax error");
+
+ g.prog->start = g.prog->end = malloc((count(node) + 3) * sizeof (Reinst));
+ emit(g.prog, I_PAR);
+ compile(g.prog, node);
+ emit(g.prog, I_ENDPAR);
+ emit(g.prog, I_MATCH);
+
+#ifdef TEST
+ dumpnode(node);
+ putchar('\n');
+ dumpprog(g.prog);
+#endif
+
+ free(g.pstart);
+
+ if (errorp) *errorp = NULL;
+ return g.prog;
+}
+
+void regfree(Reprog *prog)
+{
if (prog) {
- regfree((regex_t*)prog);
+ free(prog->start);
free(prog);
}
}
+
+/* Match */
+
+struct estate {
+ int icase, newline;
+ const char *bol;
+ Resub *m;
+};
+
+static inline int isnewline(int c)
+{
+ return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
+}
+
+static inline int iswordchar(int c)
+{
+ return c == '_' ||
+ (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9');
+}
+
+static inline int incclass(Reclass *cc, Rune c)
+{
+ Rune *p;
+ for (p = cc->spans; p < cc->end; p += 2)
+ if (p[0] <= c && c <= p[1])
+ return 1;
+ return 0;
+}
+
+static inline int incclasscanon(Reclass *cc, Rune c)
+{
+ Rune *p, r;
+ for (p = cc->spans; p < cc->end; p += 2)
+ for (r = p[0]; r <= p[1]; ++r)
+ if (c == canon(r))
+ return 1;
+ return 0;
+}
+
+static int strncmpcanon(const char *a, const char *b, int n)
+{
+ Rune ra, rb;
+ int c;
+ while (n--) {
+ if (!*a) return -1;
+ if (!*b) return 1;
+ a += chartorune(&ra, a);
+ b += chartorune(&rb, b);
+ c = canon(ra) - canon(rb);
+ if (c)
+ return c;
+ }
+ return 0;
+}
+
+static int match(struct estate *g, Reinst *pc, const char *s)
+{
+ const char *p;
+ Rune c;
+ int n;
+
+ for (;;) {
+ switch (pc->opcode) {
+ case I_MATCH:
+ return 1;
+ case I_JUMP:
+ pc = pc->x;
+ continue;
+ case I_SPLIT:
+ // TODO: use I_PROGRESS instead?
+#if 0
+ if (match(g, pc->x, s))
+ return 1;
+ pc = pc->y;
+ continue;
+#else
+ if (pc->p == s) return 0;
+ p = pc->p;
+ pc->p = s;
+ if (match(g, pc->x, s) || match(g, pc->y, s)) {
+ pc->p = p;
+ return 1;
+ }
+ pc->p = p;
+ return 0;
+#endif
+ case I_PLA:
+ if (!match(g, pc->x, s))
+ return 0;
+ pc = pc->y;
+ continue;
+ case I_NLA:
+ if (match(g, pc->x, s))
+ return 0;
+ pc = pc->y;
+ continue;
+ case I_ANY:
+ s += chartorune(&c, s);
+ if (c == 0)
+ return 0;
+ if (isnewline(c))
+ return 0;
+ break;
+ case I_CHAR:
+ s += chartorune(&c, s);
+ if (c == 0)
+ return 0;
+ if (g->icase)
+ c = canon(c);
+ if (c != pc->c)
+ return 0;
+ break;
+ case I_CCLASS:
+ s += chartorune(&c, s);
+ if (c == 0)
+ return 0;
+ if (g->icase) {
+ if (!incclasscanon(pc->cc, canon(c)))
+ return 0;
+ } else {
+ if (!incclass(pc->cc, c))
+ return 0;
+ }
+ break;
+ case I_NCCLASS:
+ s += chartorune(&c, s);
+ if (c == 0)
+ return 0;
+ if (g->icase) {
+ if (incclasscanon(pc->cc, canon(c)))
+ return 0;
+ } else {
+ if (incclass(pc->cc, c))
+ return 0;
+ }
+ break;
+ case I_REF:
+ n = (int)(g->m[pc->n].ep - g->m[pc->n].sp);
+ if (g->icase) {
+ if (strncmpcanon(s, g->m[pc->n].sp, n))
+ return 0;
+ } else {
+ if (strncmp(s, g->m[pc->n].sp, n))
+ return 0;
+ }
+ if (n > 0)
+ s += n;
+ break;
+ case I_BOL:
+ if (s == g->bol)
+ break;
+ if (g->newline)
+ if (isnewline(s[-1]))
+ break;
+ return 0;
+ case I_EOL:
+ if (*s == 0)
+ break;
+ if (g->newline)
+ if (isnewline(*s))
+ break;
+ return 0;
+ case I_WORD:
+ n = !(s == g->bol) && iswordchar(s[-1]);
+ n ^= iswordchar(s[0]);
+ if (n)
+ break;
+ return 0;
+ case I_NWORD:
+ n = !(s == g->bol) && iswordchar(s[-1]);
+ n ^= iswordchar(s[0]);
+ if (!n)
+ break;
+ return 0;
+ case I_PAR:
+ p = g->m[pc->n].sp;
+ g->m[pc->n].sp = s;
+ if (match(g, pc + 1, s))
+ return 1;
+ g->m[pc->n].sp = p;
+ return 0;
+ case I_ENDPAR:
+ p = g->m[pc->n].ep;
+ g->m[pc->n].ep = s;
+ if (match(g, pc + 1, s))
+ return 1;
+ g->m[pc->n].ep = p;
+ return 0;
+ default:
+ return 0;
+ }
+ pc = pc + 1;
+ }
+}
+
+int regexec(Reprog *prog, const char *s, int n, Resub *m, int eflags)
+{
+ struct estate g;
+ Resub gm[10];
+ Rune c;
+ int i;
+
+ g.icase = prog->icase;
+ g.newline = prog->newline;
+ g.bol = eflags & REG_NOTBOL ? NULL : s;
+ g.m = m ? m : gm;
+ for (i = 0; i < n; ++i)
+ g.m[i].sp = g.m[i].ep = NULL;
+
+ do {
+ if (match(&g, prog->start, s))
+ return 0;
+ s += chartorune(&c, s);
+ } while (c);
+
+ return 1;
+}
+
+#ifdef TEST
+int main(int argc, char **argv)
+{
+ const char *error;
+ const char *s;
+ Reprog *p;
+ Resub m[10];
+ int i;
+
+ if (argc > 1) {
+ p = regcomp(argv[1], 0, &error);
+ if (!p) {
+ fprintf(stderr, "regcomp: %s\n", error);
+ return 1;
+ }
+
+ if (argc > 2) {
+ s = argv[2];
+ if (!regexec(p, s, 10, m, 0)) {
+ for (i = 0; i < 10; ++i)
+ if (m[i].sp) {
+ int n = m[i].ep - m[i].sp;
+ printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m[i].sp - s), (int)(m[i].ep - s), n, n, m[i].sp);
+ }
+ } else {
+ printf("no match\n");
+ }
+ }
+ }
+
+ return 0;
+}
+#endif
--- a/regex.h
+++ b/regex.h
@@ -1,16 +1,31 @@
#ifndef regex_h
#define regex_h
-#include <regex.h>
+#define regcomp js_regcomp
+#define regexec js_regexec
+#define regfree js_regfree
typedef struct Reprog Reprog;
-typedef struct {
+typedef struct Resub Resub;
+struct Resub {
const char *sp;
const char *ep;
-} Resub;
+};
-Reprog *js_regcomp(const char *pattern, int cflags, const char **errorp);
-int js_regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);
-void js_regfree(Reprog *prog);
+Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
+int regexec(Reprog *prog, const char *string, int nmatch, Resub *pmatch, int eflags);
+void regfree(Reprog *prog);
+
+enum {
+ /* regcomp flags */
+ REG_ICASE = 1,
+ REG_NEWLINE = 2
+};
+
+enum {
+ /* regexec flags */
+ REG_NOTBOL = 1,
+ REG_NOTEOL = 2
+};
#endif