ref: eed76191580004d5461bdf8accd8ca23e8541b57
parent: c1ad1ba1e482e7d01743e3f4f9517572bebf99ac
author: Tor Andersson <tor@ccxvii.net>
date: Fri Aug 14 08:05:09 EDT 2015
Rename our internal regex.h to not collide with system regex.h.
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-SRCS := $(wildcard js*.c utf*.c regex.c)
-HDRS := $(wildcard js*.h mujs.h utf.h regex.h)
+SRCS := $(wildcard js*.c utf*.c regexp.c)
+HDRS := $(wildcard js*.h mujs.h utf.h regexp.h)
OBJS := $(SRCS:%.c=build/%.o)
prefix ?= /usr/local
--- a/jsgc.c
+++ b/jsgc.c
@@ -3,7 +3,7 @@
#include "jsvalue.h"
#include "jsrun.h"
-#include "regex.h"
+#include "regexp.h"
static void jsG_markobject(js_State *J, int mark, js_Object *obj);
--- a/jsregexp.c
+++ b/jsregexp.c
@@ -1,7 +1,7 @@
#include "jsi.h"
#include "jsvalue.h"
#include "jsbuiltin.h"
-#include "regex.h"
+#include "regexp.h"
void js_newregexp(js_State *J, const char *pattern, int flags)
{
--- a/jsstring.c
+++ b/jsstring.c
@@ -2,7 +2,7 @@
#include "jsvalue.h"
#include "jsbuiltin.h"
#include "utf.h"
-#include "regex.h"
+#include "regexp.h"
int js_runeat(js_State *J, const char *s, int i)
{
--- a/one.c
+++ b/one.c
@@ -21,6 +21,6 @@
#include "jsstate.c"
#include "jsstring.c"
#include "jsvalue.c"
-#include "regex.c"
+#include "regexp.c"
#include "utf.c"
#include "utftype.c"
--- a/regex.c
+++ /dev/null
@@ -1,1143 +1,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <setjmp.h>
-#include <limits.h>
-
-#include "regex.h"
-#include "utf.h"
-
-#define emit regemit
-#define next regnext
-#define accept regaccept
-
-#define nelem(a) (sizeof (a) / sizeof (a)[0])
-
-#define REPINF 255
-#define MAXTHREAD 1000
-#define MAXSUB REG_MAXSUB
-
-typedef struct Reclass Reclass;
-typedef struct Renode Renode;
-typedef struct Reinst Reinst;
-typedef struct Rethread Rethread;
-
-struct Reclass {
- Rune *end;
- Rune spans[64];
-};
-
-struct Reprog {
- Reinst *start, *end;
- int flags;
- unsigned int nsub;
- Reclass cclass[16];
-};
-
-struct cstate {
- Reprog *prog;
- Renode *pstart, *pend;
-
- const char *source;
- unsigned int ncclass;
- unsigned int nsub;
- Renode *sub[MAXSUB];
-
- int lookahead;
- Rune yychar;
- Reclass *yycc;
- int yymin, yymax;
-
- const char *error;
- jmp_buf kaboom;
-};
-
-static void die(struct cstate *g, const char *message)
-{
- g->error = message;
- longjmp(g->kaboom, 1);
-}
-
-static int canon(Rune c)
-{
- Rune u = toupperrune(c);
- if (c >= 128 && u < 128)
- return c;
- return u;
-}
-
-/* Scan */
-
-enum {
- L_CHAR = 256,
- L_CCLASS, /* character class */
- L_NCCLASS, /* negative character class */
- L_NC, /* "(?:" no capture */
- L_PLA, /* "(?=" positive lookahead */
- L_NLA, /* "(?!" negative lookahead */
- L_WORD, /* "\b" word boundary */
- L_NWORD, /* "\B" non-word boundary */
- L_REF, /* "\1" back-reference */
- L_COUNT, /* {M,N} */
-};
-
-static int hex(struct cstate *g, int c)
-{
- if (c >= '0' && c <= '9') return c - '0';
- if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
- if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
- die(g, "invalid escape sequence");
- return 0;
-}
-
-static int dec(struct cstate *g, int c)
-{
- if (c >= '0' && c <= '9') return c - '0';
- die(g, "invalid quantifier");
- return 0;
-}
-
-#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
-
-static int isunicodeletter(int c)
-{
- return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
-}
-
-static int nextrune(struct cstate *g)
-{
- g->source += chartorune(&g->yychar, g->source);
- if (g->yychar == '\\') {
- g->source += chartorune(&g->yychar, g->source);
- switch (g->yychar) {
- case 0: die(g, "unterminated escape sequence");
- case 'f': g->yychar = '\f'; return 0;
- case 'n': g->yychar = '\n'; return 0;
- case 'r': g->yychar = '\r'; return 0;
- case 't': g->yychar = '\t'; return 0;
- case 'v': g->yychar = '\v'; return 0;
- case 'c':
- g->yychar = (*g->source++) & 31;
- return 0;
- case 'x':
- g->yychar = hex(g, *g->source++) << 4;
- g->yychar += hex(g, *g->source++);
- if (g->yychar == 0) {
- g->yychar = '0';
- return 1;
- }
- return 0;
- case 'u':
- g->yychar = hex(g, *g->source++) << 12;
- g->yychar += hex(g, *g->source++) << 8;
- g->yychar += hex(g, *g->source++) << 4;
- g->yychar += hex(g, *g->source++);
- if (g->yychar == 0) {
- g->yychar = '0';
- return 1;
- }
- return 0;
- }
- if (strchr(ESCAPES, g->yychar))
- return 1;
- if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
- die(g, "invalid escape character");
- return 0;
- }
- return 0;
-}
-
-static int lexcount(struct cstate *g)
-{
- g->yychar = *g->source++;
-
- g->yymin = dec(g, g->yychar);
- g->yychar = *g->source++;
- while (g->yychar != ',' && g->yychar != '}') {
- g->yymin = g->yymin * 10 + dec(g, g->yychar);
- g->yychar = *g->source++;
- }
- if (g->yymin >= REPINF)
- die(g, "numeric overflow");
-
- if (g->yychar == ',') {
- g->yychar = *g->source++;
- if (g->yychar == '}') {
- g->yymax = REPINF;
- } else {
- g->yymax = dec(g, g->yychar);
- g->yychar = *g->source++;
- while (g->yychar != '}') {
- g->yymax = g->yymax * 10 + dec(g, g->yychar);
- g->yychar = *g->source++;
- }
- if (g->yymax >= REPINF)
- die(g, "numeric overflow");
- }
- } else {
- g->yymax = g->yymin;
- }
-
- return L_COUNT;
-}
-
-static void newcclass(struct cstate *g)
-{
- if (g->ncclass >= nelem(g->prog->cclass))
- die(g, "too many character classes");
- g->yycc = g->prog->cclass + g->ncclass++;
- g->yycc->end = g->yycc->spans;
-}
-
-static void addrange(struct cstate *g, Rune a, Rune b)
-{
- if (a > b)
- die(g, "invalid character class range");
- if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
- die(g, "too many character class ranges");
- *g->yycc->end++ = a;
- *g->yycc->end++ = b;
-}
-
-static void addranges_d(struct cstate *g)
-{
- addrange(g, '0', '9');
-}
-
-static void addranges_D(struct cstate *g)
-{
- addrange(g, 0, '0'-1);
- addrange(g, '9'+1, 0xFFFF);
-}
-
-static void addranges_s(struct cstate *g)
-{
- addrange(g, 0x9, 0x9);
- addrange(g, 0xA, 0xD);
- addrange(g, 0x20, 0x20);
- addrange(g, 0xA0, 0xA0);
- addrange(g, 0x2028, 0x2029);
- addrange(g, 0xFEFF, 0xFEFF);
-}
-
-static void addranges_S(struct cstate *g)
-{
- addrange(g, 0, 0x9-1);
- addrange(g, 0x9+1, 0xA-1);
- addrange(g, 0xD+1, 0x20-1);
- addrange(g, 0x20+1, 0xA0-1);
- addrange(g, 0xA0+1, 0x2028-1);
- addrange(g, 0x2029+1, 0xFEFF-1);
- addrange(g, 0xFEFF+1, 0xFFFF);
-}
-
-static void addranges_w(struct cstate *g)
-{
- addrange(g, '0', '9');
- addrange(g, 'A', 'Z');
- addrange(g, '_', '_');
- addrange(g, 'a', 'z');
-}
-
-static void addranges_W(struct cstate *g)
-{
- addrange(g, 0, '0'-1);
- addrange(g, '9'+1, 'A'-1);
- addrange(g, 'Z'+1, '_'-1);
- addrange(g, '_'+1, 'a'-1);
- addrange(g, 'z'+1, 0xFFFF);
-}
-
-static int lexclass(struct cstate *g)
-{
- int type = L_CCLASS;
- int quoted, havesave, havedash;
- Rune save;
-
- newcclass(g);
-
- quoted = nextrune(g);
- if (!quoted && g->yychar == '^') {
- type = L_NCCLASS;
- quoted = nextrune(g);
- }
-
- havesave = havedash = 0;
- for (;;) {
- if (g->yychar == 0)
- die(g, "unterminated character class");
- if (!quoted && g->yychar == ']')
- break;
-
- if (!quoted && g->yychar == '-') {
- if (havesave) {
- if (havedash) {
- addrange(g, save, '-');
- havesave = havedash = 0;
- } else {
- havedash = 1;
- }
- } else {
- save = '-';
- havesave = 1;
- }
- } else if (quoted && strchr("DSWdsw", g->yychar)) {
- if (havesave) {
- addrange(g, save, save);
- if (havedash)
- addrange(g, '-', '-');
- }
- switch (g->yychar) {
- case 'd': addranges_d(g); break;
- case 's': addranges_s(g); break;
- case 'w': addranges_w(g); break;
- case 'D': addranges_D(g); break;
- case 'S': addranges_S(g); break;
- case 'W': addranges_W(g); break;
- }
- havesave = havedash = 0;
- } else {
- if (quoted) {
- if (g->yychar == 'b')
- g->yychar = '\b';
- else if (g->yychar == '0')
- g->yychar = 0;
- /* else identity escape */
- }
- if (havesave) {
- if (havedash) {
- addrange(g, save, g->yychar);
- havesave = havedash = 0;
- } else {
- addrange(g, save, save);
- save = g->yychar;
- }
- } else {
- save = g->yychar;
- havesave = 1;
- }
- }
-
- quoted = nextrune(g);
- }
-
- if (havesave) {
- addrange(g, save, save);
- if (havedash)
- addrange(g, '-', '-');
- }
-
- return type;
-}
-
-static int lex(struct cstate *g)
-{
- int quoted = nextrune(g);
- if (quoted) {
- switch (g->yychar) {
- case 'b': return L_WORD;
- case 'B': return L_NWORD;
- case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
- case 's': newcclass(g); addranges_s(g); return L_CCLASS;
- case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
- case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
- case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
- case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
- case '0': g->yychar = 0; return L_CHAR;
- }
- if (g->yychar >= '0' && g->yychar <= '9') {
- g->yychar -= '0';
- if (*g->source >= '0' && *g->source <= '9')
- g->yychar = g->yychar * 10 + *g->source++ - '0';
- return L_REF;
- }
- return L_CHAR;
- }
-
- switch (g->yychar) {
- case 0:
- case '$': case ')': case '*': case '+':
- case '.': case '?': case '^': case '|':
- return g->yychar;
- }
-
- if (g->yychar == '{')
- return lexcount(g);
- if (g->yychar == '[')
- return lexclass(g);
- if (g->yychar == '(') {
- if (g->source[0] == '?') {
- if (g->source[1] == ':') {
- g->source += 2;
- return L_NC;
- }
- if (g->source[1] == '=') {
- g->source += 2;
- return L_PLA;
- }
- if (g->source[1] == '!') {
- g->source += 2;
- return L_NLA;
- }
- }
- return '(';
- }
-
- return L_CHAR;
-}
-
-/* Parse */
-
-enum {
- P_CAT, P_ALT, P_REP,
- P_BOL, P_EOL, P_WORD, P_NWORD,
- P_PAR, P_PLA, P_NLA,
- P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
- P_REF,
-};
-
-struct Renode {
- unsigned char type;
- unsigned char ng, m, n;
- Rune c;
- Reclass *cc;
- Renode *x;
- Renode *y;
-};
-
-static Renode *newnode(struct cstate *g, int type)
-{
- Renode *node = g->pend++;
- node->type = type;
- node->cc = NULL;
- node->c = 0;
- node->ng = 0;
- node->m = 0;
- node->n = 0;
- node->x = node->y = NULL;
- return node;
-}
-
-static int empty(Renode *node)
-{
- if (!node) return 1;
- switch (node->type) {
- default: return 1;
- case P_CAT: return empty(node->x) && empty(node->y);
- case P_ALT: return empty(node->x) || empty(node->y);
- case P_REP: return empty(node->x) || node->m == 0;
- case P_PAR: return empty(node->x);
- case P_REF: return empty(node->x);
- case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
- }
-}
-
-static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
-{
- Renode *rep = newnode(g, P_REP);
- if (max == REPINF && empty(atom))
- die(g, "infinite loop matching the empty string");
- rep->ng = ng;
- rep->m = min;
- rep->n = max;
- rep->x = atom;
- return rep;
-}
-
-static void next(struct cstate *g)
-{
- g->lookahead = lex(g);
-}
-
-static int accept(struct cstate *g, int t)
-{
- if (g->lookahead == t) {
- next(g);
- return 1;
- }
- return 0;
-}
-
-static Renode *parsealt(struct cstate *g);
-
-static Renode *parseatom(struct cstate *g)
-{
- Renode *atom;
- if (g->lookahead == L_CHAR) {
- atom = newnode(g, P_CHAR);
- atom->c = g->yychar;
- next(g);
- return atom;
- }
- if (g->lookahead == L_CCLASS) {
- atom = newnode(g, P_CCLASS);
- atom->cc = g->yycc;
- next(g);
- return atom;
- }
- if (g->lookahead == L_NCCLASS) {
- atom = newnode(g, P_NCCLASS);
- atom->cc = g->yycc;
- next(g);
- return atom;
- }
- if (g->lookahead == L_REF) {
- atom = newnode(g, P_REF);
- if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
- die(g, "invalid back-reference");
- atom->n = g->yychar;
- atom->x = g->sub[g->yychar];
- next(g);
- return atom;
- }
- if (accept(g, '.'))
- return newnode(g, P_ANY);
- if (accept(g, '(')) {
- atom = newnode(g, P_PAR);
- if (g->nsub == MAXSUB)
- die(g, "too many captures");
- atom->n = g->nsub++;
- atom->x = parsealt(g);
- g->sub[atom->n] = atom;
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- if (accept(g, L_NC)) {
- atom = parsealt(g);
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- if (accept(g, L_PLA)) {
- atom = newnode(g, P_PLA);
- atom->x = parsealt(g);
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- if (accept(g, L_NLA)) {
- atom = newnode(g, P_NLA);
- atom->x = parsealt(g);
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- die(g, "syntax error");
- return NULL;
-}
-
-static Renode *parserep(struct cstate *g)
-{
- Renode *atom;
-
- if (accept(g, '^')) return newnode(g, P_BOL);
- if (accept(g, '$')) return newnode(g, P_EOL);
- if (accept(g, L_WORD)) return newnode(g, P_WORD);
- if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
-
- atom = parseatom(g);
- if (g->lookahead == L_COUNT) {
- int min = g->yymin, max = g->yymax;
- next(g);
- if (max < min)
- die(g, "invalid quantifier");
- return newrep(g, atom, accept(g, '?'), min, max);
- }
- if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
- if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
- if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
- return atom;
-}
-
-static Renode *parsecat(struct cstate *g)
-{
- Renode *cat, *x;
- if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
- cat = parserep(g);
- while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
- x = cat;
- cat = newnode(g, P_CAT);
- cat->x = x;
- cat->y = parserep(g);
- }
- return cat;
- }
- return NULL;
-}
-
-static Renode *parsealt(struct cstate *g)
-{
- Renode *alt, *x;
- alt = parsecat(g);
- while (accept(g, '|')) {
- x = alt;
- alt = newnode(g, P_ALT);
- alt->x = x;
- alt->y = parsecat(g);
- }
- return alt;
-}
-
-/* Compile */
-
-enum {
- I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
- I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
- I_BOL, I_EOL, I_WORD, I_NWORD,
- I_LPAR, I_RPAR
-};
-
-struct Reinst {
- unsigned char opcode;
- unsigned char n;
- Rune c;
- Reclass *cc;
- Reinst *x;
- Reinst *y;
-};
-
-static unsigned int count(Renode *node)
-{
- unsigned int min, max;
- if (!node) return 0;
- switch (node->type) {
- default: return 1;
- case P_CAT: return count(node->x) + count(node->y);
- case P_ALT: return count(node->x) + count(node->y) + 2;
- case P_REP:
- min = node->m;
- max = node->n;
- if (min == max) return count(node->x) * min;
- if (max < REPINF) return count(node->x) * max + (max - min);
- return count(node->x) * (min + 1) + 2;
- case P_PAR: return count(node->x) + 2;
- case P_PLA: return count(node->x) + 2;
- case P_NLA: return count(node->x) + 2;
- }
-}
-
-static Reinst *emit(Reprog *prog, int opcode)
-{
- Reinst *inst = prog->end++;
- inst->opcode = opcode;
- inst->n = 0;
- inst->c = 0;
- inst->cc = NULL;
- inst->x = inst->y = NULL;
- return inst;
-}
-
-static void compile(Reprog *prog, Renode *node)
-{
- Reinst *inst, *split, *jump;
- unsigned int i;
-
- if (!node)
- return;
-
- switch (node->type) {
- case P_CAT:
- compile(prog, node->x);
- compile(prog, node->y);
- break;
-
- case P_ALT:
- split = emit(prog, I_SPLIT);
- compile(prog, node->x);
- jump = emit(prog, I_JUMP);
- compile(prog, node->y);
- split->x = split + 1;
- split->y = jump + 1;
- jump->x = prog->end;
- break;
-
- case P_REP:
- for (i = 0; i < node->m; ++i) {
- inst = prog->end;
- compile(prog, node->x);
- }
- if (node->m == node->n)
- break;
- if (node->n < REPINF) {
- for (i = node->m; i < node->n; ++i) {
- split = emit(prog, I_SPLIT);
- compile(prog, node->x);
- if (node->ng) {
- split->y = split + 1;
- split->x = prog->end;
- } else {
- split->x = split + 1;
- split->y = prog->end;
- }
- }
- } else if (node->m == 0) {
- split = emit(prog, I_SPLIT);
- compile(prog, node->x);
- jump = emit(prog, I_JUMP);
- if (node->ng) {
- split->y = split + 1;
- split->x = prog->end;
- } else {
- split->x = split + 1;
- split->y = prog->end;
- }
- jump->x = split;
- } else {
- split = emit(prog, I_SPLIT);
- if (node->ng) {
- split->y = inst;
- split->x = prog->end;
- } else {
- split->x = inst;
- split->y = prog->end;
- }
- }
- break;
-
- case P_BOL: emit(prog, I_BOL); break;
- case P_EOL: emit(prog, I_EOL); break;
- case P_WORD: emit(prog, I_WORD); break;
- case P_NWORD: emit(prog, I_NWORD); break;
-
- case P_PAR:
- inst = emit(prog, I_LPAR);
- inst->n = node->n;
- compile(prog, node->x);
- inst = emit(prog, I_RPAR);
- inst->n = node->n;
- break;
- case P_PLA:
- split = emit(prog, I_PLA);
- compile(prog, node->x);
- emit(prog, I_END);
- split->x = split + 1;
- split->y = prog->end;
- break;
- case P_NLA:
- split = emit(prog, I_NLA);
- compile(prog, node->x);
- emit(prog, I_END);
- split->x = split + 1;
- split->y = prog->end;
- break;
-
- case P_ANY:
- emit(prog, I_ANY);
- break;
- case P_CHAR:
- inst = emit(prog, I_CHAR);
- inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
- break;
- case P_CCLASS:
- inst = emit(prog, I_CCLASS);
- inst->cc = node->cc;
- break;
- case P_NCCLASS:
- inst = emit(prog, I_NCCLASS);
- inst->cc = node->cc;
- break;
- case P_REF:
- inst = emit(prog, I_REF);
- inst->n = node->n;
- break;
- }
-}
-
-#ifdef TEST
-static void dumpnode(Renode *node)
-{
- Rune *p;
- if (!node) { printf("Empty"); return; }
- switch (node->type) {
- case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
- case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
- case P_REP:
- printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
- dumpnode(node->x);
- printf(")");
- break;
- case P_BOL: printf("Bol"); break;
- case P_EOL: printf("Eol"); break;
- case P_WORD: printf("Word"); break;
- case P_NWORD: printf("NotWord"); break;
- case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
- case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
- case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
- case P_ANY: printf("Any"); break;
- case P_CHAR: printf("Char(%c)", node->c); break;
- case P_CCLASS:
- printf("Class(");
- for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
- printf(")");
- break;
- case P_NCCLASS:
- printf("NotClass(");
- for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
- printf(")");
- break;
- case P_REF: printf("Ref(%d)", node->n); break;
- }
-}
-
-static void dumpprog(Reprog *prog)
-{
- Reinst *inst;
- int i;
- for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
- printf("% 5d: ", i);
- switch (inst->opcode) {
- case I_END: puts("end"); break;
- case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
- case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
- case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
- case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
- case I_ANY: puts("any"); break;
- case I_ANYNL: puts("anynl"); break;
- case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
- case I_CCLASS: puts("cclass"); break;
- case I_NCCLASS: puts("ncclass"); break;
- case I_REF: printf("ref %d\n", inst->n); break;
- case I_BOL: puts("bol"); break;
- case I_EOL: puts("eol"); break;
- case I_WORD: puts("word"); break;
- case I_NWORD: puts("nword"); break;
- case I_LPAR: printf("lpar %d\n", inst->n); break;
- case I_RPAR: printf("rpar %d\n", inst->n); break;
- }
- }
-}
-#endif
-
-Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
-{
- struct cstate g;
- Renode *node;
- Reinst *split, *jump;
- int i;
-
- g.prog = malloc(sizeof (Reprog));
- g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
-
- if (setjmp(g.kaboom)) {
- if (errorp) *errorp = g.error;
- free(g.pstart);
- free(g.prog);
- return NULL;
- }
-
- g.source = pattern;
- g.ncclass = 0;
- g.nsub = 1;
- for (i = 0; i < MAXSUB; ++i)
- g.sub[i] = 0;
-
- g.prog->flags = cflags;
-
- next(&g);
- node = parsealt(&g);
- if (g.lookahead == ')')
- die(&g, "unmatched ')'");
- if (g.lookahead != 0)
- die(&g, "syntax error");
-
- g.prog->nsub = g.nsub;
- g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
-
- split = emit(g.prog, I_SPLIT);
- split->x = split + 3;
- split->y = split + 1;
- emit(g.prog, I_ANYNL);
- jump = emit(g.prog, I_JUMP);
- jump->x = split;
- emit(g.prog, I_LPAR);
- compile(g.prog, node);
- emit(g.prog, I_RPAR);
- emit(g.prog, I_END);
-
-#ifdef TEST
- dumpnode(node);
- putchar('\n');
- dumpprog(g.prog);
-#endif
-
- free(g.pstart);
-
- if (errorp) *errorp = NULL;
- return g.prog;
-}
-
-void regfree(Reprog *prog)
-{
- if (prog) {
- free(prog->start);
- free(prog);
- }
-}
-
-/* Match */
-
-static int isnewline(int c)
-{
- return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
-}
-
-static int iswordchar(int c)
-{
- return c == '_' ||
- (c >= 'a' && c <= 'z') ||
- (c >= 'A' && c <= 'Z') ||
- (c >= '0' && c <= '9');
-}
-
-static int incclass(Reclass *cc, Rune c)
-{
- Rune *p;
- for (p = cc->spans; p < cc->end; p += 2)
- if (p[0] <= c && c <= p[1])
- return 1;
- return 0;
-}
-
-static int incclasscanon(Reclass *cc, Rune c)
-{
- Rune *p, r;
- for (p = cc->spans; p < cc->end; p += 2)
- for (r = p[0]; r <= p[1]; ++r)
- if (c == canon(r))
- return 1;
- return 0;
-}
-
-static int strncmpcanon(const char *a, const char *b, unsigned int n)
-{
- Rune ra, rb;
- int c;
- while (n--) {
- if (!*a) return -1;
- if (!*b) return 1;
- a += chartorune(&ra, a);
- b += chartorune(&rb, b);
- c = canon(ra) - canon(rb);
- if (c)
- return c;
- }
- return 0;
-}
-
-struct Rethread {
- Reinst *pc;
- const char *sp;
- Resub sub;
-};
-
-static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
-{
- t->pc = pc;
- t->sp = sp;
- memcpy(&t->sub, sub, sizeof t->sub);
-}
-
-static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
-{
- Rethread ready[MAXTHREAD];
- Resub scratch;
- Resub sub;
- Rune c;
- unsigned int nready;
- int i;
-
- /* queue initial thread */
- spawn(ready + 0, pc, sp, out);
- nready = 1;
-
- /* run threads in stack order */
- while (nready > 0) {
- --nready;
- pc = ready[nready].pc;
- sp = ready[nready].sp;
- memcpy(&sub, &ready[nready].sub, sizeof sub);
- for (;;) {
- switch (pc->opcode) {
- case I_END:
- for (i = 0; i < MAXSUB; ++i) {
- out->sub[i].sp = sub.sub[i].sp;
- out->sub[i].ep = sub.sub[i].ep;
- }
- return 1;
- case I_JUMP:
- pc = pc->x;
- continue;
- case I_SPLIT:
- if (nready >= MAXTHREAD) {
- fprintf(stderr, "regexec: backtrack overflow!\n");
- return 0;
- }
- spawn(&ready[nready++], pc->y, sp, &sub);
- pc = pc->x;
- continue;
-
- case I_PLA:
- if (!match(pc->x, sp, bol, flags, &sub))
- goto dead;
- pc = pc->y;
- continue;
- case I_NLA:
- memcpy(&scratch, &sub, sizeof scratch);
- if (match(pc->x, sp, bol, flags, &scratch))
- goto dead;
- pc = pc->y;
- continue;
-
- case I_ANYNL:
- sp += chartorune(&c, sp);
- if (c == 0)
- goto dead;
- break;
- case I_ANY:
- sp += chartorune(&c, sp);
- if (c == 0)
- goto dead;
- if (isnewline(c))
- goto dead;
- break;
- case I_CHAR:
- sp += chartorune(&c, sp);
- if (c == 0)
- goto dead;
- if (flags & REG_ICASE)
- c = canon(c);
- if (c != pc->c)
- goto dead;
- break;
- case I_CCLASS:
- sp += chartorune(&c, sp);
- if (c == 0)
- goto dead;
- if (flags & REG_ICASE) {
- if (!incclasscanon(pc->cc, canon(c)))
- goto dead;
- } else {
- if (!incclass(pc->cc, c))
- goto dead;
- }
- break;
- case I_NCCLASS:
- sp += chartorune(&c, sp);
- if (c == 0)
- goto dead;
- if (flags & REG_ICASE) {
- if (incclasscanon(pc->cc, canon(c)))
- goto dead;
- } else {
- if (incclass(pc->cc, c))
- goto dead;
- }
- break;
- case I_REF:
- i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
- if (flags & REG_ICASE) {
- if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
- goto dead;
- } else {
- if (strncmp(sp, sub.sub[pc->n].sp, i))
- goto dead;
- }
- if (i > 0)
- sp += i;
- break;
-
- case I_BOL:
- if (sp == bol && !(flags & REG_NOTBOL))
- break;
- if (flags & REG_NEWLINE)
- if (sp > bol && isnewline(sp[-1]))
- break;
- goto dead;
- case I_EOL:
- if (*sp == 0)
- break;
- if (flags & REG_NEWLINE)
- if (isnewline(*sp))
- break;
- goto dead;
- case I_WORD:
- i = sp > bol && iswordchar(sp[-1]);
- i ^= iswordchar(sp[0]);
- if (i)
- break;
- goto dead;
- case I_NWORD:
- i = sp > bol && iswordchar(sp[-1]);
- i ^= iswordchar(sp[0]);
- if (!i)
- break;
- goto dead;
-
- case I_LPAR:
- sub.sub[pc->n].sp = sp;
- break;
- case I_RPAR:
- sub.sub[pc->n].ep = sp;
- break;
- default:
- goto dead;
- }
- pc = pc + 1;
- }
-dead: ;
- }
- return 0;
-}
-
-int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
-{
- Resub scratch;
- int i;
-
- if (!sub)
- sub = &scratch;
-
- sub->nsub = prog->nsub;
- for (i = 0; i < MAXSUB; ++i)
- sub->sub[i].sp = sub->sub[i].ep = NULL;
-
- return !match(prog->start, sp, sp, prog->flags | eflags, sub);
-}
-
-#ifdef TEST
-int main(int argc, char **argv)
-{
- const char *error;
- const char *s;
- Reprog *p;
- Resub m;
- unsigned int i;
-
- if (argc > 1) {
- p = regcomp(argv[1], 0, &error);
- if (!p) {
- fprintf(stderr, "regcomp: %s\n", error);
- return 1;
- }
-
- if (argc > 2) {
- s = argv[2];
- printf("nsub = %d\n", p->nsub);
- if (!regexec(p, s, &m, 0)) {
- for (i = 0; i < m.nsub; ++i) {
- int n = m.sub[i].ep - m.sub[i].sp;
- if (n > 0)
- printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
- else
- printf("match %d: n=0 ''\n", i);
- }
- } else {
- printf("no match\n");
- }
- }
- }
-
- return 0;
-}
-#endif
--- a/regex.h
+++ /dev/null
@@ -1,35 +1,0 @@
-#ifndef regex_h
-#define regex_h
-
-#define regcomp js_regcomp
-#define regexec js_regexec
-#define regfree js_regfree
-
-typedef struct Reprog Reprog;
-typedef struct Resub Resub;
-
-Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
-int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
-void regfree(Reprog *prog);
-
-enum {
- /* regcomp flags */
- REG_ICASE = 1,
- REG_NEWLINE = 2,
-
- /* regexec flags */
- REG_NOTBOL = 4,
-
- /* limits */
- REG_MAXSUB = 16
-};
-
-struct Resub {
- unsigned int nsub;
- struct {
- const char *sp;
- const char *ep;
- } sub[REG_MAXSUB];
-};
-
-#endif
--- /dev/null
+++ b/regexp.c
@@ -1,0 +1,1143 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <setjmp.h>
+#include <limits.h>
+
+#include "regexp.h"
+#include "utf.h"
+
+#define emit regemit
+#define next regnext
+#define accept regaccept
+
+#define nelem(a) (sizeof (a) / sizeof (a)[0])
+
+#define REPINF 255
+#define MAXTHREAD 1000
+#define MAXSUB REG_MAXSUB
+
+typedef struct Reclass Reclass;
+typedef struct Renode Renode;
+typedef struct Reinst Reinst;
+typedef struct Rethread Rethread;
+
+struct Reclass {
+ Rune *end;
+ Rune spans[64];
+};
+
+struct Reprog {
+ Reinst *start, *end;
+ int flags;
+ unsigned int nsub;
+ Reclass cclass[16];
+};
+
+struct cstate {
+ Reprog *prog;
+ Renode *pstart, *pend;
+
+ const char *source;
+ unsigned int ncclass;
+ unsigned int nsub;
+ Renode *sub[MAXSUB];
+
+ int lookahead;
+ Rune yychar;
+ Reclass *yycc;
+ int yymin, yymax;
+
+ const char *error;
+ jmp_buf kaboom;
+};
+
+static void die(struct cstate *g, const char *message)
+{
+ g->error = message;
+ longjmp(g->kaboom, 1);
+}
+
+static int canon(Rune c)
+{
+ Rune u = toupperrune(c);
+ if (c >= 128 && u < 128)
+ return c;
+ return u;
+}
+
+/* Scan */
+
+enum {
+ L_CHAR = 256,
+ L_CCLASS, /* character class */
+ L_NCCLASS, /* negative character class */
+ L_NC, /* "(?:" no capture */
+ L_PLA, /* "(?=" positive lookahead */
+ L_NLA, /* "(?!" negative lookahead */
+ L_WORD, /* "\b" word boundary */
+ L_NWORD, /* "\B" non-word boundary */
+ L_REF, /* "\1" back-reference */
+ L_COUNT, /* {M,N} */
+};
+
+static int hex(struct cstate *g, int c)
+{
+ if (c >= '0' && c <= '9') return c - '0';
+ if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
+ if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
+ die(g, "invalid escape sequence");
+ return 0;
+}
+
+static int dec(struct cstate *g, int c)
+{
+ if (c >= '0' && c <= '9') return c - '0';
+ die(g, "invalid quantifier");
+ return 0;
+}
+
+#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|0123456789"
+
+static int isunicodeletter(int c)
+{
+ return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
+}
+
+static int nextrune(struct cstate *g)
+{
+ g->source += chartorune(&g->yychar, g->source);
+ if (g->yychar == '\\') {
+ g->source += chartorune(&g->yychar, g->source);
+ switch (g->yychar) {
+ case 0: die(g, "unterminated escape sequence");
+ case 'f': g->yychar = '\f'; return 0;
+ case 'n': g->yychar = '\n'; return 0;
+ case 'r': g->yychar = '\r'; return 0;
+ case 't': g->yychar = '\t'; return 0;
+ case 'v': g->yychar = '\v'; return 0;
+ case 'c':
+ g->yychar = (*g->source++) & 31;
+ return 0;
+ case 'x':
+ g->yychar = hex(g, *g->source++) << 4;
+ g->yychar += hex(g, *g->source++);
+ if (g->yychar == 0) {
+ g->yychar = '0';
+ return 1;
+ }
+ return 0;
+ case 'u':
+ g->yychar = hex(g, *g->source++) << 12;
+ g->yychar += hex(g, *g->source++) << 8;
+ g->yychar += hex(g, *g->source++) << 4;
+ g->yychar += hex(g, *g->source++);
+ if (g->yychar == 0) {
+ g->yychar = '0';
+ return 1;
+ }
+ return 0;
+ }
+ if (strchr(ESCAPES, g->yychar))
+ return 1;
+ if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
+ die(g, "invalid escape character");
+ return 0;
+ }
+ return 0;
+}
+
+static int lexcount(struct cstate *g)
+{
+ g->yychar = *g->source++;
+
+ g->yymin = dec(g, g->yychar);
+ g->yychar = *g->source++;
+ while (g->yychar != ',' && g->yychar != '}') {
+ g->yymin = g->yymin * 10 + dec(g, g->yychar);
+ g->yychar = *g->source++;
+ }
+ if (g->yymin >= REPINF)
+ die(g, "numeric overflow");
+
+ if (g->yychar == ',') {
+ g->yychar = *g->source++;
+ if (g->yychar == '}') {
+ g->yymax = REPINF;
+ } else {
+ g->yymax = dec(g, g->yychar);
+ g->yychar = *g->source++;
+ while (g->yychar != '}') {
+ g->yymax = g->yymax * 10 + dec(g, g->yychar);
+ g->yychar = *g->source++;
+ }
+ if (g->yymax >= REPINF)
+ die(g, "numeric overflow");
+ }
+ } else {
+ g->yymax = g->yymin;
+ }
+
+ return L_COUNT;
+}
+
+static void newcclass(struct cstate *g)
+{
+ if (g->ncclass >= nelem(g->prog->cclass))
+ die(g, "too many character classes");
+ g->yycc = g->prog->cclass + g->ncclass++;
+ g->yycc->end = g->yycc->spans;
+}
+
+static void addrange(struct cstate *g, Rune a, Rune b)
+{
+ if (a > b)
+ die(g, "invalid character class range");
+ if (g->yycc->end + 2 == g->yycc->spans + nelem(g->yycc->spans))
+ die(g, "too many character class ranges");
+ *g->yycc->end++ = a;
+ *g->yycc->end++ = b;
+}
+
+static void addranges_d(struct cstate *g)
+{
+ addrange(g, '0', '9');
+}
+
+static void addranges_D(struct cstate *g)
+{
+ addrange(g, 0, '0'-1);
+ addrange(g, '9'+1, 0xFFFF);
+}
+
+static void addranges_s(struct cstate *g)
+{
+ addrange(g, 0x9, 0x9);
+ addrange(g, 0xA, 0xD);
+ addrange(g, 0x20, 0x20);
+ addrange(g, 0xA0, 0xA0);
+ addrange(g, 0x2028, 0x2029);
+ addrange(g, 0xFEFF, 0xFEFF);
+}
+
+static void addranges_S(struct cstate *g)
+{
+ addrange(g, 0, 0x9-1);
+ addrange(g, 0x9+1, 0xA-1);
+ addrange(g, 0xD+1, 0x20-1);
+ addrange(g, 0x20+1, 0xA0-1);
+ addrange(g, 0xA0+1, 0x2028-1);
+ addrange(g, 0x2029+1, 0xFEFF-1);
+ addrange(g, 0xFEFF+1, 0xFFFF);
+}
+
+static void addranges_w(struct cstate *g)
+{
+ addrange(g, '0', '9');
+ addrange(g, 'A', 'Z');
+ addrange(g, '_', '_');
+ addrange(g, 'a', 'z');
+}
+
+static void addranges_W(struct cstate *g)
+{
+ addrange(g, 0, '0'-1);
+ addrange(g, '9'+1, 'A'-1);
+ addrange(g, 'Z'+1, '_'-1);
+ addrange(g, '_'+1, 'a'-1);
+ addrange(g, 'z'+1, 0xFFFF);
+}
+
+static int lexclass(struct cstate *g)
+{
+ int type = L_CCLASS;
+ int quoted, havesave, havedash;
+ Rune save;
+
+ newcclass(g);
+
+ quoted = nextrune(g);
+ if (!quoted && g->yychar == '^') {
+ type = L_NCCLASS;
+ quoted = nextrune(g);
+ }
+
+ havesave = havedash = 0;
+ for (;;) {
+ if (g->yychar == 0)
+ die(g, "unterminated character class");
+ if (!quoted && g->yychar == ']')
+ break;
+
+ if (!quoted && g->yychar == '-') {
+ if (havesave) {
+ if (havedash) {
+ addrange(g, save, '-');
+ havesave = havedash = 0;
+ } else {
+ havedash = 1;
+ }
+ } else {
+ save = '-';
+ havesave = 1;
+ }
+ } else if (quoted && strchr("DSWdsw", g->yychar)) {
+ if (havesave) {
+ addrange(g, save, save);
+ if (havedash)
+ addrange(g, '-', '-');
+ }
+ switch (g->yychar) {
+ case 'd': addranges_d(g); break;
+ case 's': addranges_s(g); break;
+ case 'w': addranges_w(g); break;
+ case 'D': addranges_D(g); break;
+ case 'S': addranges_S(g); break;
+ case 'W': addranges_W(g); break;
+ }
+ havesave = havedash = 0;
+ } else {
+ if (quoted) {
+ if (g->yychar == 'b')
+ g->yychar = '\b';
+ else if (g->yychar == '0')
+ g->yychar = 0;
+ /* else identity escape */
+ }
+ if (havesave) {
+ if (havedash) {
+ addrange(g, save, g->yychar);
+ havesave = havedash = 0;
+ } else {
+ addrange(g, save, save);
+ save = g->yychar;
+ }
+ } else {
+ save = g->yychar;
+ havesave = 1;
+ }
+ }
+
+ quoted = nextrune(g);
+ }
+
+ if (havesave) {
+ addrange(g, save, save);
+ if (havedash)
+ addrange(g, '-', '-');
+ }
+
+ return type;
+}
+
+static int lex(struct cstate *g)
+{
+ int quoted = nextrune(g);
+ if (quoted) {
+ switch (g->yychar) {
+ case 'b': return L_WORD;
+ case 'B': return L_NWORD;
+ case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
+ case 's': newcclass(g); addranges_s(g); return L_CCLASS;
+ case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
+ case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
+ case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
+ case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
+ case '0': g->yychar = 0; return L_CHAR;
+ }
+ if (g->yychar >= '0' && g->yychar <= '9') {
+ g->yychar -= '0';
+ if (*g->source >= '0' && *g->source <= '9')
+ g->yychar = g->yychar * 10 + *g->source++ - '0';
+ return L_REF;
+ }
+ return L_CHAR;
+ }
+
+ switch (g->yychar) {
+ case 0:
+ case '$': case ')': case '*': case '+':
+ case '.': case '?': case '^': case '|':
+ return g->yychar;
+ }
+
+ if (g->yychar == '{')
+ return lexcount(g);
+ if (g->yychar == '[')
+ return lexclass(g);
+ if (g->yychar == '(') {
+ if (g->source[0] == '?') {
+ if (g->source[1] == ':') {
+ g->source += 2;
+ return L_NC;
+ }
+ if (g->source[1] == '=') {
+ g->source += 2;
+ return L_PLA;
+ }
+ if (g->source[1] == '!') {
+ g->source += 2;
+ return L_NLA;
+ }
+ }
+ return '(';
+ }
+
+ return L_CHAR;
+}
+
+/* Parse */
+
+enum {
+ P_CAT, P_ALT, P_REP,
+ P_BOL, P_EOL, P_WORD, P_NWORD,
+ P_PAR, P_PLA, P_NLA,
+ P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
+ P_REF,
+};
+
+struct Renode {
+ unsigned char type;
+ unsigned char ng, m, n;
+ Rune c;
+ Reclass *cc;
+ Renode *x;
+ Renode *y;
+};
+
+static Renode *newnode(struct cstate *g, int type)
+{
+ Renode *node = g->pend++;
+ node->type = type;
+ node->cc = NULL;
+ node->c = 0;
+ node->ng = 0;
+ node->m = 0;
+ node->n = 0;
+ node->x = node->y = NULL;
+ return node;
+}
+
+static int empty(Renode *node)
+{
+ if (!node) return 1;
+ switch (node->type) {
+ default: return 1;
+ case P_CAT: return empty(node->x) && empty(node->y);
+ case P_ALT: return empty(node->x) || empty(node->y);
+ case P_REP: return empty(node->x) || node->m == 0;
+ case P_PAR: return empty(node->x);
+ case P_REF: return empty(node->x);
+ case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
+ }
+}
+
+static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
+{
+ Renode *rep = newnode(g, P_REP);
+ if (max == REPINF && empty(atom))
+ die(g, "infinite loop matching the empty string");
+ rep->ng = ng;
+ rep->m = min;
+ rep->n = max;
+ rep->x = atom;
+ return rep;
+}
+
+static void next(struct cstate *g)
+{
+ g->lookahead = lex(g);
+}
+
+static int accept(struct cstate *g, int t)
+{
+ if (g->lookahead == t) {
+ next(g);
+ return 1;
+ }
+ return 0;
+}
+
+static Renode *parsealt(struct cstate *g);
+
+static Renode *parseatom(struct cstate *g)
+{
+ Renode *atom;
+ if (g->lookahead == L_CHAR) {
+ atom = newnode(g, P_CHAR);
+ atom->c = g->yychar;
+ next(g);
+ return atom;
+ }
+ if (g->lookahead == L_CCLASS) {
+ atom = newnode(g, P_CCLASS);
+ atom->cc = g->yycc;
+ next(g);
+ return atom;
+ }
+ if (g->lookahead == L_NCCLASS) {
+ atom = newnode(g, P_NCCLASS);
+ atom->cc = g->yycc;
+ next(g);
+ return atom;
+ }
+ if (g->lookahead == L_REF) {
+ atom = newnode(g, P_REF);
+ if (g->yychar == 0 || g->yychar > g->nsub || !g->sub[g->yychar])
+ die(g, "invalid back-reference");
+ atom->n = g->yychar;
+ atom->x = g->sub[g->yychar];
+ next(g);
+ return atom;
+ }
+ if (accept(g, '.'))
+ return newnode(g, P_ANY);
+ if (accept(g, '(')) {
+ atom = newnode(g, P_PAR);
+ if (g->nsub == MAXSUB)
+ die(g, "too many captures");
+ atom->n = g->nsub++;
+ atom->x = parsealt(g);
+ g->sub[atom->n] = atom;
+ if (!accept(g, ')'))
+ die(g, "unmatched '('");
+ return atom;
+ }
+ if (accept(g, L_NC)) {
+ atom = parsealt(g);
+ if (!accept(g, ')'))
+ die(g, "unmatched '('");
+ return atom;
+ }
+ if (accept(g, L_PLA)) {
+ atom = newnode(g, P_PLA);
+ atom->x = parsealt(g);
+ if (!accept(g, ')'))
+ die(g, "unmatched '('");
+ return atom;
+ }
+ if (accept(g, L_NLA)) {
+ atom = newnode(g, P_NLA);
+ atom->x = parsealt(g);
+ if (!accept(g, ')'))
+ die(g, "unmatched '('");
+ return atom;
+ }
+ die(g, "syntax error");
+ return NULL;
+}
+
+static Renode *parserep(struct cstate *g)
+{
+ Renode *atom;
+
+ if (accept(g, '^')) return newnode(g, P_BOL);
+ if (accept(g, '$')) return newnode(g, P_EOL);
+ if (accept(g, L_WORD)) return newnode(g, P_WORD);
+ if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
+
+ atom = parseatom(g);
+ if (g->lookahead == L_COUNT) {
+ int min = g->yymin, max = g->yymax;
+ next(g);
+ if (max < min)
+ die(g, "invalid quantifier");
+ return newrep(g, atom, accept(g, '?'), min, max);
+ }
+ if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
+ if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
+ if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
+ return atom;
+}
+
+static Renode *parsecat(struct cstate *g)
+{
+ Renode *cat, *x;
+ if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+ cat = parserep(g);
+ while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+ x = cat;
+ cat = newnode(g, P_CAT);
+ cat->x = x;
+ cat->y = parserep(g);
+ }
+ return cat;
+ }
+ return NULL;
+}
+
+static Renode *parsealt(struct cstate *g)
+{
+ Renode *alt, *x;
+ alt = parsecat(g);
+ while (accept(g, '|')) {
+ x = alt;
+ alt = newnode(g, P_ALT);
+ alt->x = x;
+ alt->y = parsecat(g);
+ }
+ return alt;
+}
+
+/* Compile */
+
+enum {
+ I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
+ I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
+ I_BOL, I_EOL, I_WORD, I_NWORD,
+ I_LPAR, I_RPAR
+};
+
+struct Reinst {
+ unsigned char opcode;
+ unsigned char n;
+ Rune c;
+ Reclass *cc;
+ Reinst *x;
+ Reinst *y;
+};
+
+static unsigned int count(Renode *node)
+{
+ unsigned int min, max;
+ if (!node) return 0;
+ switch (node->type) {
+ default: return 1;
+ case P_CAT: return count(node->x) + count(node->y);
+ case P_ALT: return count(node->x) + count(node->y) + 2;
+ case P_REP:
+ min = node->m;
+ max = node->n;
+ if (min == max) return count(node->x) * min;
+ if (max < REPINF) return count(node->x) * max + (max - min);
+ return count(node->x) * (min + 1) + 2;
+ case P_PAR: return count(node->x) + 2;
+ case P_PLA: return count(node->x) + 2;
+ case P_NLA: return count(node->x) + 2;
+ }
+}
+
+static Reinst *emit(Reprog *prog, int opcode)
+{
+ Reinst *inst = prog->end++;
+ inst->opcode = opcode;
+ inst->n = 0;
+ inst->c = 0;
+ inst->cc = NULL;
+ inst->x = inst->y = NULL;
+ return inst;
+}
+
+static void compile(Reprog *prog, Renode *node)
+{
+ Reinst *inst, *split, *jump;
+ unsigned int i;
+
+ if (!node)
+ return;
+
+ switch (node->type) {
+ case P_CAT:
+ compile(prog, node->x);
+ compile(prog, node->y);
+ break;
+
+ case P_ALT:
+ split = emit(prog, I_SPLIT);
+ compile(prog, node->x);
+ jump = emit(prog, I_JUMP);
+ compile(prog, node->y);
+ split->x = split + 1;
+ split->y = jump + 1;
+ jump->x = prog->end;
+ break;
+
+ case P_REP:
+ for (i = 0; i < node->m; ++i) {
+ inst = prog->end;
+ compile(prog, node->x);
+ }
+ if (node->m == node->n)
+ break;
+ if (node->n < REPINF) {
+ for (i = node->m; i < node->n; ++i) {
+ split = emit(prog, I_SPLIT);
+ compile(prog, node->x);
+ if (node->ng) {
+ split->y = split + 1;
+ split->x = prog->end;
+ } else {
+ split->x = split + 1;
+ split->y = prog->end;
+ }
+ }
+ } else if (node->m == 0) {
+ split = emit(prog, I_SPLIT);
+ compile(prog, node->x);
+ jump = emit(prog, I_JUMP);
+ if (node->ng) {
+ split->y = split + 1;
+ split->x = prog->end;
+ } else {
+ split->x = split + 1;
+ split->y = prog->end;
+ }
+ jump->x = split;
+ } else {
+ split = emit(prog, I_SPLIT);
+ if (node->ng) {
+ split->y = inst;
+ split->x = prog->end;
+ } else {
+ split->x = inst;
+ split->y = prog->end;
+ }
+ }
+ break;
+
+ case P_BOL: emit(prog, I_BOL); break;
+ case P_EOL: emit(prog, I_EOL); break;
+ case P_WORD: emit(prog, I_WORD); break;
+ case P_NWORD: emit(prog, I_NWORD); break;
+
+ case P_PAR:
+ inst = emit(prog, I_LPAR);
+ inst->n = node->n;
+ compile(prog, node->x);
+ inst = emit(prog, I_RPAR);
+ inst->n = node->n;
+ break;
+ case P_PLA:
+ split = emit(prog, I_PLA);
+ compile(prog, node->x);
+ emit(prog, I_END);
+ split->x = split + 1;
+ split->y = prog->end;
+ break;
+ case P_NLA:
+ split = emit(prog, I_NLA);
+ compile(prog, node->x);
+ emit(prog, I_END);
+ split->x = split + 1;
+ split->y = prog->end;
+ break;
+
+ case P_ANY:
+ emit(prog, I_ANY);
+ break;
+ case P_CHAR:
+ inst = emit(prog, I_CHAR);
+ inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
+ break;
+ case P_CCLASS:
+ inst = emit(prog, I_CCLASS);
+ inst->cc = node->cc;
+ break;
+ case P_NCCLASS:
+ inst = emit(prog, I_NCCLASS);
+ inst->cc = node->cc;
+ break;
+ case P_REF:
+ inst = emit(prog, I_REF);
+ inst->n = node->n;
+ break;
+ }
+}
+
+#ifdef TEST
+static void dumpnode(Renode *node)
+{
+ Rune *p;
+ if (!node) { printf("Empty"); return; }
+ switch (node->type) {
+ case P_CAT: printf("Cat("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
+ case P_ALT: printf("Alt("); dumpnode(node->x); printf(", "); dumpnode(node->y); printf(")"); break;
+ case P_REP:
+ printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
+ dumpnode(node->x);
+ printf(")");
+ break;
+ case P_BOL: printf("Bol"); break;
+ case P_EOL: printf("Eol"); break;
+ case P_WORD: printf("Word"); break;
+ case P_NWORD: printf("NotWord"); break;
+ case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
+ case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
+ case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
+ case P_ANY: printf("Any"); break;
+ case P_CHAR: printf("Char(%c)", node->c); break;
+ case P_CCLASS:
+ printf("Class(");
+ for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+ printf(")");
+ break;
+ case P_NCCLASS:
+ printf("NotClass(");
+ for (p = node->cc->spans; p < node->cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
+ printf(")");
+ break;
+ case P_REF: printf("Ref(%d)", node->n); break;
+ }
+}
+
+static void dumpprog(Reprog *prog)
+{
+ Reinst *inst;
+ int i;
+ for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
+ printf("% 5d: ", i);
+ switch (inst->opcode) {
+ case I_END: puts("end"); break;
+ case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
+ case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+ case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+ case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
+ case I_ANY: puts("any"); break;
+ case I_ANYNL: puts("anynl"); break;
+ case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
+ case I_CCLASS: puts("cclass"); break;
+ case I_NCCLASS: puts("ncclass"); break;
+ case I_REF: printf("ref %d\n", inst->n); break;
+ case I_BOL: puts("bol"); break;
+ case I_EOL: puts("eol"); break;
+ case I_WORD: puts("word"); break;
+ case I_NWORD: puts("nword"); break;
+ case I_LPAR: printf("lpar %d\n", inst->n); break;
+ case I_RPAR: printf("rpar %d\n", inst->n); break;
+ }
+ }
+}
+#endif
+
+Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
+{
+ struct cstate g;
+ Renode *node;
+ Reinst *split, *jump;
+ int i;
+
+ g.prog = malloc(sizeof (Reprog));
+ g.pstart = g.pend = malloc(sizeof (Renode) * strlen(pattern) * 2);
+
+ if (setjmp(g.kaboom)) {
+ if (errorp) *errorp = g.error;
+ free(g.pstart);
+ free(g.prog);
+ return NULL;
+ }
+
+ g.source = pattern;
+ g.ncclass = 0;
+ g.nsub = 1;
+ for (i = 0; i < MAXSUB; ++i)
+ g.sub[i] = 0;
+
+ g.prog->flags = cflags;
+
+ next(&g);
+ node = parsealt(&g);
+ if (g.lookahead == ')')
+ die(&g, "unmatched ')'");
+ if (g.lookahead != 0)
+ die(&g, "syntax error");
+
+ g.prog->nsub = g.nsub;
+ g.prog->start = g.prog->end = malloc((count(node) + 6) * sizeof (Reinst));
+
+ split = emit(g.prog, I_SPLIT);
+ split->x = split + 3;
+ split->y = split + 1;
+ emit(g.prog, I_ANYNL);
+ jump = emit(g.prog, I_JUMP);
+ jump->x = split;
+ emit(g.prog, I_LPAR);
+ compile(g.prog, node);
+ emit(g.prog, I_RPAR);
+ emit(g.prog, I_END);
+
+#ifdef TEST
+ dumpnode(node);
+ putchar('\n');
+ dumpprog(g.prog);
+#endif
+
+ free(g.pstart);
+
+ if (errorp) *errorp = NULL;
+ return g.prog;
+}
+
+void regfree(Reprog *prog)
+{
+ if (prog) {
+ free(prog->start);
+ free(prog);
+ }
+}
+
+/* Match */
+
+static int isnewline(int c)
+{
+ return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
+}
+
+static int iswordchar(int c)
+{
+ return c == '_' ||
+ (c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9');
+}
+
+static int incclass(Reclass *cc, Rune c)
+{
+ Rune *p;
+ for (p = cc->spans; p < cc->end; p += 2)
+ if (p[0] <= c && c <= p[1])
+ return 1;
+ return 0;
+}
+
+static int incclasscanon(Reclass *cc, Rune c)
+{
+ Rune *p, r;
+ for (p = cc->spans; p < cc->end; p += 2)
+ for (r = p[0]; r <= p[1]; ++r)
+ if (c == canon(r))
+ return 1;
+ return 0;
+}
+
+static int strncmpcanon(const char *a, const char *b, unsigned int n)
+{
+ Rune ra, rb;
+ int c;
+ while (n--) {
+ if (!*a) return -1;
+ if (!*b) return 1;
+ a += chartorune(&ra, a);
+ b += chartorune(&rb, b);
+ c = canon(ra) - canon(rb);
+ if (c)
+ return c;
+ }
+ return 0;
+}
+
+struct Rethread {
+ Reinst *pc;
+ const char *sp;
+ Resub sub;
+};
+
+static void spawn(Rethread *t, Reinst *pc, const char *sp, Resub *sub)
+{
+ t->pc = pc;
+ t->sp = sp;
+ memcpy(&t->sub, sub, sizeof t->sub);
+}
+
+static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out)
+{
+ Rethread ready[MAXTHREAD];
+ Resub scratch;
+ Resub sub;
+ Rune c;
+ unsigned int nready;
+ int i;
+
+ /* queue initial thread */
+ spawn(ready + 0, pc, sp, out);
+ nready = 1;
+
+ /* run threads in stack order */
+ while (nready > 0) {
+ --nready;
+ pc = ready[nready].pc;
+ sp = ready[nready].sp;
+ memcpy(&sub, &ready[nready].sub, sizeof sub);
+ for (;;) {
+ switch (pc->opcode) {
+ case I_END:
+ for (i = 0; i < MAXSUB; ++i) {
+ out->sub[i].sp = sub.sub[i].sp;
+ out->sub[i].ep = sub.sub[i].ep;
+ }
+ return 1;
+ case I_JUMP:
+ pc = pc->x;
+ continue;
+ case I_SPLIT:
+ if (nready >= MAXTHREAD) {
+ fprintf(stderr, "regexec: backtrack overflow!\n");
+ return 0;
+ }
+ spawn(&ready[nready++], pc->y, sp, &sub);
+ pc = pc->x;
+ continue;
+
+ case I_PLA:
+ if (!match(pc->x, sp, bol, flags, &sub))
+ goto dead;
+ pc = pc->y;
+ continue;
+ case I_NLA:
+ memcpy(&scratch, &sub, sizeof scratch);
+ if (match(pc->x, sp, bol, flags, &scratch))
+ goto dead;
+ pc = pc->y;
+ continue;
+
+ case I_ANYNL:
+ sp += chartorune(&c, sp);
+ if (c == 0)
+ goto dead;
+ break;
+ case I_ANY:
+ sp += chartorune(&c, sp);
+ if (c == 0)
+ goto dead;
+ if (isnewline(c))
+ goto dead;
+ break;
+ case I_CHAR:
+ sp += chartorune(&c, sp);
+ if (c == 0)
+ goto dead;
+ if (flags & REG_ICASE)
+ c = canon(c);
+ if (c != pc->c)
+ goto dead;
+ break;
+ case I_CCLASS:
+ sp += chartorune(&c, sp);
+ if (c == 0)
+ goto dead;
+ if (flags & REG_ICASE) {
+ if (!incclasscanon(pc->cc, canon(c)))
+ goto dead;
+ } else {
+ if (!incclass(pc->cc, c))
+ goto dead;
+ }
+ break;
+ case I_NCCLASS:
+ sp += chartorune(&c, sp);
+ if (c == 0)
+ goto dead;
+ if (flags & REG_ICASE) {
+ if (incclasscanon(pc->cc, canon(c)))
+ goto dead;
+ } else {
+ if (incclass(pc->cc, c))
+ goto dead;
+ }
+ break;
+ case I_REF:
+ i = sub.sub[pc->n].ep - sub.sub[pc->n].sp;
+ if (flags & REG_ICASE) {
+ if (strncmpcanon(sp, sub.sub[pc->n].sp, i))
+ goto dead;
+ } else {
+ if (strncmp(sp, sub.sub[pc->n].sp, i))
+ goto dead;
+ }
+ if (i > 0)
+ sp += i;
+ break;
+
+ case I_BOL:
+ if (sp == bol && !(flags & REG_NOTBOL))
+ break;
+ if (flags & REG_NEWLINE)
+ if (sp > bol && isnewline(sp[-1]))
+ break;
+ goto dead;
+ case I_EOL:
+ if (*sp == 0)
+ break;
+ if (flags & REG_NEWLINE)
+ if (isnewline(*sp))
+ break;
+ goto dead;
+ case I_WORD:
+ i = sp > bol && iswordchar(sp[-1]);
+ i ^= iswordchar(sp[0]);
+ if (i)
+ break;
+ goto dead;
+ case I_NWORD:
+ i = sp > bol && iswordchar(sp[-1]);
+ i ^= iswordchar(sp[0]);
+ if (!i)
+ break;
+ goto dead;
+
+ case I_LPAR:
+ sub.sub[pc->n].sp = sp;
+ break;
+ case I_RPAR:
+ sub.sub[pc->n].ep = sp;
+ break;
+ default:
+ goto dead;
+ }
+ pc = pc + 1;
+ }
+dead: ;
+ }
+ return 0;
+}
+
+int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
+{
+ Resub scratch;
+ int i;
+
+ if (!sub)
+ sub = &scratch;
+
+ sub->nsub = prog->nsub;
+ for (i = 0; i < MAXSUB; ++i)
+ sub->sub[i].sp = sub->sub[i].ep = NULL;
+
+ return !match(prog->start, sp, sp, prog->flags | eflags, sub);
+}
+
+#ifdef TEST
+int main(int argc, char **argv)
+{
+ const char *error;
+ const char *s;
+ Reprog *p;
+ Resub m;
+ unsigned int i;
+
+ if (argc > 1) {
+ p = regcomp(argv[1], 0, &error);
+ if (!p) {
+ fprintf(stderr, "regcomp: %s\n", error);
+ return 1;
+ }
+
+ if (argc > 2) {
+ s = argv[2];
+ printf("nsub = %d\n", p->nsub);
+ if (!regexec(p, s, &m, 0)) {
+ for (i = 0; i < m.nsub; ++i) {
+ int n = m.sub[i].ep - m.sub[i].sp;
+ if (n > 0)
+ printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
+ else
+ printf("match %d: n=0 ''\n", i);
+ }
+ } else {
+ printf("no match\n");
+ }
+ }
+ }
+
+ return 0;
+}
+#endif
--- /dev/null
+++ b/regexp.h
@@ -1,0 +1,35 @@
+#ifndef regexp_h
+#define regexp_h
+
+#define regcomp js_regcomp
+#define regexec js_regexec
+#define regfree js_regfree
+
+typedef struct Reprog Reprog;
+typedef struct Resub Resub;
+
+Reprog *regcomp(const char *pattern, int cflags, const char **errorp);
+int regexec(Reprog *prog, const char *string, Resub *sub, int eflags);
+void regfree(Reprog *prog);
+
+enum {
+ /* regcomp flags */
+ REG_ICASE = 1,
+ REG_NEWLINE = 2,
+
+ /* regexec flags */
+ REG_NOTBOL = 4,
+
+ /* limits */
+ REG_MAXSUB = 16
+};
+
+struct Resub {
+ unsigned int nsub;
+ struct {
+ const char *sp;
+ const char *ep;
+ } sub[REG_MAXSUB];
+};
+
+#endif