ref: 3585abad419a5eb53ebb58d6c37bbeeff44ce65e
parent: d68a6f342486ebf65e78e26fc45089c9fe96df1d
author: Tor Andersson <tor@ccxvii.net>
date: Fri Feb 21 14:46:12 EST 2014
More robust character class parser.
--- a/regex.c
+++ b/regex.c
@@ -36,7 +36,7 @@
int lookahead;
Rune yychar;
- Reclass *yycclass;
+ Reclass *yycc;
int yymin, yymax;
const char *error;
@@ -72,39 +72,6 @@
L_COUNT, /* {M,N} */
};
-static Reclass class_d = {
- class_d.spans + 2, {
- '0', '9',
- }
-};
-
-static Reclass class_s = {
- class_s.spans + 12, {
- 0x9, 0x9,
- 0xA, 0xD,
- 0x20, 0x20,
- 0xA0, 0xA0,
- 0x2028, 0x2029,
- 0xFEFF, 0xFEFF,
- }
-};
-
-static Reclass class_w = {
- class_w.spans + 8, {
- '0', '9',
- 'A', 'Z',
- '_', '_',
- 'a', 'z',
- }
-};
-
-static Reclass *newclass(struct cstate *g)
-{
- if (g->ncclass >= nelem(g->prog->cclass))
- die(g, "too many character classes");
- return &g->prog->cclass[g->ncclass++];
-}
-
static int hex(struct cstate *g, int c)
{
if (c >= '0' && c <= '9') return c - '0';
@@ -128,7 +95,6 @@
g->source += chartorune(&g->yychar, g->source);
switch (g->yychar) {
case 0: die(g, "unterminated escape sequence");
- // case 'b': g->yychar = '\b'; return 0;
case 'f': g->yychar = '\f'; return 0;
case 'n': g->yychar = '\n'; return 0;
case 'r': g->yychar = '\r'; return 0;
@@ -187,16 +153,80 @@
return L_COUNT;
}
+/*
+ * Claim the next unused Reclass slot in g->prog->cclass, make it the
+ * class currently being built (g->yycc), and reset it to hold no ranges.
+ * Dies with "too many character classes" when every slot is taken.
+ */
+static void newcclass(struct cstate *g)
+{
+	Reclass *cc;
+
+	if (g->ncclass >= nelem(g->prog->cclass))
+		die(g, "too many character classes");
+
+	cc = &g->prog->cclass[g->ncclass];
+	g->ncclass++;
+
+	/* end == spans means "no ranges yet"; addrange appends at end. */
+	cc->end = cc->spans;
+	g->yycc = cc;
+}
+
+/*
+ * Append the inclusive range [a, b] to the character class currently
+ * being built (g->yycc).
+ *
+ * Dies with "invalid character class range" if a > b, and with
+ * "too many character class ranges" if the span array is out of room.
+ */
+static void addrange(struct cstate *g, Rune a, Rune b)
+{
+	if (a > b)
+		die(g, "invalid character class range");
+	/*
+	 * Compare the number of remaining slots with <= instead of testing
+	 * end + 2 for equality with the array limit: an equality test is only
+	 * hit when end + 2 lands exactly on the limit, so with an odd capacity
+	 * it would never fire and the stores below would run past spans.
+	 * As with the equality test, the last pair of slots is never filled.
+	 */
+	if (g->yycc->spans + nelem(g->yycc->spans) - g->yycc->end <= 2)
+		die(g, "too many character class ranges");
+	*g->yycc->end++ = a;
+	*g->yycc->end++ = b;
+}
+
+/* Add the digit range '0'..'9' to the current character class. */
+static void addranges_d(struct cstate *g)
+{
+	const Rune first = '0';
+	const Rune last = '9';
+
+	addrange(g, first, last);
+}
+
+/*
+ * Add everything outside '0'..'9' to the current character class:
+ * 0 up to the rune before '0', and the rune after '9' up to 0xFFFF.
+ */
+static void addranges_D(struct cstate *g)
+{
+	/* Inclusive (low, high) pairs, added in this order. */
+	static const Rune pairs[] = {
+		0, '0'-1,
+		'9'+1, 0xFFFF,
+	};
+	const Rune *p;
+
+	for (p = pairs; p < pairs + sizeof pairs / sizeof pairs[0]; p += 2)
+		addrange(g, p[0], p[1]);
+}
+
+/*
+ * Add the whitespace ranges to the current character class:
+ * 0x9, 0xA-0xD, 0x20, 0xA0, 0x2028-0x2029 and 0xFEFF.
+ */
+static void addranges_s(struct cstate *g)
+{
+	/* Inclusive (low, high) pairs, added in this order. */
+	static const Rune pairs[] = {
+		0x9, 0x9,
+		0xA, 0xD,
+		0x20, 0x20,
+		0xA0, 0xA0,
+		0x2028, 0x2029,
+		0xFEFF, 0xFEFF,
+	};
+	const Rune *p;
+
+	for (p = pairs; p < pairs + sizeof pairs / sizeof pairs[0]; p += 2)
+		addrange(g, p[0], p[1]);
+}
+
+/*
+ * Add the complement of the whitespace ranges (see addranges_s) to the
+ * current character class: every gap between 0x9, 0xA-0xD, 0x20, 0xA0,
+ * 0x2028-0x2029 and 0xFEFF, from 0 up to 0xFFFF.
+ */
+static void addranges_S(struct cstate *g)
+{
+	addrange(g, 0, 0x9-1);
+	/*
+	 * 0x9 and 0xA-0xD are adjacent, so there is no gap between them.
+	 * A call such as addrange(g, 0x9+1, 0xA-1) would be the inverted
+	 * range 0xA..0x9, which addrange rejects with "invalid character
+	 * class range" -- so no range is added here.
+	 */
+	addrange(g, 0xD+1, 0x20-1);
+	addrange(g, 0x20+1, 0xA0-1);
+	addrange(g, 0xA0+1, 0x2028-1);
+	addrange(g, 0x2029+1, 0xFEFF-1);
+	addrange(g, 0xFEFF+1, 0xFFFF);
+}
+
+/*
+ * Add the word-character ranges to the current character class:
+ * '0'-'9', 'A'-'Z', '_' and 'a'-'z'.
+ */
+static void addranges_w(struct cstate *g)
+{
+	/* Inclusive (low, high) pairs, added in this order. */
+	static const Rune pairs[] = {
+		'0', '9',
+		'A', 'Z',
+		'_', '_',
+		'a', 'z',
+	};
+	const Rune *p;
+
+	for (p = pairs; p < pairs + sizeof pairs / sizeof pairs[0]; p += 2)
+		addrange(g, p[0], p[1]);
+}
+
+/*
+ * Add everything outside the word-character ranges to the current
+ * character class: the gaps around '0'-'9', 'A'-'Z', '_' and 'a'-'z',
+ * from 0 up to 0xFFFF.
+ */
+static void addranges_W(struct cstate *g)
+{
+	/* Inclusive (low, high) pairs, added in this order. */
+	static const Rune pairs[] = {
+		0, '0'-1,
+		'9'+1, 'A'-1,
+		'Z'+1, '_'-1,
+		'_'+1, 'a'-1,
+		'z'+1, 0xFFFF,
+	};
+	const Rune *p;
+
+	for (p = pairs; p < pairs + sizeof pairs / sizeof pairs[0]; p += 2)
+		addrange(g, p[0], p[1]);
+}
+
static int lexclass(struct cstate *g)
{
int type = L_CCLASS;
- int quoted;
- Rune *p, *ep;
+ int quoted, havesave, havedash;
Rune save;
- g->yycclass = newclass(g);
- p = g->yycclass->spans;
- ep = p + nelem(g->yycclass->spans);
+ newcclass(g);
quoted = nextrune(g);
if (!quoted && g->yychar == '^') {
@@ -204,45 +234,65 @@
quoted = nextrune(g);
}
- while (p < ep) {
+ havesave = havedash = 0;
+ for (;;) {
if (g->yychar == 0)
die(g, "unterminated character class");
if (!quoted && g->yychar == ']')
break;
- save = g->yychar;
- quoted = nextrune(g);
-
- // TODO: \d \D \s \S \w \W
-
if (!quoted && g->yychar == '-') {
- quoted = nextrune(g);
- if (g->yychar == 0)
- die(g, "unterminated character class");
- if (!quoted && g->yychar == ']') {
- *p++ = save;
- *p++ = save;
- if (p == ep)
- die(g, "too many character classes");
- *p++ = '-';
- *p++ = '-';
- break;
+ if (havesave) {
+ if (havedash) {
+ addrange(g, save, '-');
+ havesave = havedash = 0;
+ } else {
+ havedash = 1;
+ }
+ } else {
+ save = '-';
+ havesave = 1;
}
-
- if (g->yychar < save)
- die(g, "invalid character class range");
- *p++ = save;
- *p++ = g->yychar;
- quoted = nextrune(g);
+ } else if (quoted && strchr("DSWdsw", g->yychar)) {
+ if (havesave) {
+ addrange(g, save, save);
+ if (havedash)
+ addrange(g, '-', '-');
+ }
+ switch (g->yychar) {
+ case 'd': addranges_d(g); break;
+ case 's': addranges_s(g); break;
+ case 'w': addranges_w(g); break;
+ case 'D': addranges_D(g); break;
+ case 'S': addranges_S(g); break;
+ case 'W': addranges_W(g); break;
+ }
+ havesave = havedash = 0;
} else {
- *p++ = save;
- *p++ = save;
+ if (quoted && g->yychar == 'b')
+ g->yychar = '\b';
+ if (havesave) {
+ if (havedash) {
+ addrange(g, save, g->yychar);
+ havesave = havedash = 0;
+ } else {
+ addrange(g, save, save);
+ save = g->yychar;
+ }
+ } else {
+ save = g->yychar;
+ havesave = 1;
+ }
}
+
+ quoted = nextrune(g);
}
- if (p == ep)
- die(g, "too many character classes");
- g->yycclass->end = p;
+ if (havesave) {
+ addrange(g, save, save);
+ if (havedash)
+ addrange(g, '-', '-');
+ }
return type;
}
@@ -254,12 +304,12 @@
switch (g->yychar) {
case 'b': return L_WORD;
case 'B': return L_NWORD;
- case 'd': g->yycclass = &class_d; return L_CCLASS;
- case 's': g->yycclass = &class_s; return L_CCLASS;
- case 'w': g->yycclass = &class_w; return L_CCLASS;
- case 'D': g->yycclass = &class_d; return L_NCCLASS;
- case 'S': g->yycclass = &class_s; return L_NCCLASS;
- case 'W': g->yycclass = &class_w; return L_NCCLASS;
+ case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
+ case 's': newcclass(g); addranges_s(g); return L_CCLASS;
+ case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
+ case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
+ case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
+ case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
}
if (g->yychar >= '0' && g->yychar <= '9') {
g->yychar -= '0';
@@ -384,13 +434,13 @@
}
if (g->lookahead == L_CCLASS) {
atom = newnode(g, P_CCLASS);
- atom->cc = g->yycclass;
+ atom->cc = g->yycc;
next(g);
return atom;
}
if (g->lookahead == L_NCCLASS) {
atom = newnode(g, P_NCCLASS);
- atom->cc = g->yycclass;
+ atom->cc = g->yycc;
next(g);
return atom;
}