shithub: libmujs

Download patch

ref: d4f7b160e14aec453919c4c13328a92d59ccbb78
parent: 55ad332621a4761674ef84fc9a78e890b8cdc8dd
author: Tor Andersson <tor@ccxvii.net>
date: Mon Feb 24 11:14:43 EST 2014

Fix lookahead captures.

--- a/regex.c
+++ b/regex.c
@@ -25,6 +25,7 @@
 struct Reprog {
 	Reinst *start, *end;
 	int icase, newline;
+	int ncap;
 	Reclass cclass[16];
 };
 
@@ -413,20 +414,6 @@
 
 static Renode *parsealt(struct cstate *g);
 
-static Renode *parsecap(struct cstate *g, int type)
-{
-	Renode *atom = newnode(g, type);
-	if (++g->ncap == 10)
-		die(g, "too many captures");
-	atom->n = g->ncap;
-	g->nref[atom->n] = 0;
-	atom->x = parsealt(g);
-	g->nref[atom->n] = 1;
-	if (!accept(g, ')'))
-		die(g, "unmatched '('");
-	return atom;
-}
-
 static Renode *parseatom(struct cstate *g)
 {
 	Renode *atom;
@@ -458,6 +445,18 @@
 	}
 	if (accept(g, '.'))
 		return newnode(g, P_ANY);
+	if (accept(g, '(')) {
+		atom = newnode(g, P_PAR);
+		if (++g->ncap == 10)
+			die(g, "too many captures");
+		atom->n = g->ncap;
+		g->nref[atom->n] = 0;
+		atom->x = parsealt(g);
+		g->nref[atom->n] = 1;
+		if (!accept(g, ')'))
+			die(g, "unmatched '('");
+		return atom;
+	}
 	if (accept(g, L_NC)) {
 		atom = parsealt(g);
 		if (!accept(g, ')'))
@@ -464,12 +463,20 @@
 			die(g, "unmatched '('");
 		return atom;
 	}
-	if (accept(g, '('))
-		return parsecap(g, P_PAR);
-	if (accept(g, L_PLA))
-		return parsecap(g, P_PLA);
-	if (accept(g, L_NLA))
-		return parsecap(g, P_NLA);
+	if (accept(g, L_PLA)) {
+		atom = newnode(g, P_PLA);
+		atom->x = parsealt(g);
+		if (!accept(g, ')'))
+			die(g, "unmatched '('");
+		return atom;
+	}
+	if (accept(g, L_NLA)) {
+		atom = newnode(g, P_NLA);
+		atom->x = parsealt(g);
+		if (!accept(g, ')'))
+			die(g, "unmatched '('");
+		return atom;
+	}
 	die(g, "syntax error");
 	return NULL;
 }
@@ -560,8 +567,8 @@
 		if (max < USHRT_MAX) return count(node->x) * max + (max - min);
 		return count(node->x) * (min + 1) + 2;
 	case P_PAR: return count(node->x) + 2;
-	case P_PLA: return count(node->x) + 4;
-	case P_NLA: return count(node->x) + 4;
+	case P_PLA: return count(node->x) + 2;
+	case P_NLA: return count(node->x) + 2;
 	}
 }
 
@@ -658,11 +665,7 @@
 		break;
 	case P_PLA:
 		split = emit(prog, I_PLA);
-		inst = emit(prog, I_LPAR);
-		inst->n = node->n;
 		compile(prog, node->x);
-		inst = emit(prog, I_RPAR);
-		inst->n = node->n;
 		emit(prog, I_END);
 		split->x = split + 1;
 		split->y = prog->end;
@@ -669,11 +672,7 @@
 		break;
 	case P_NLA:
 		split = emit(prog, I_NLA);
-		inst = emit(prog, I_LPAR);
-		inst->n = node->n;
 		compile(prog, node->x);
-		inst = emit(prog, I_RPAR);
-		inst->n = node->n;
 		emit(prog, I_END);
 		split->x = split + 1;
 		split->y = prog->end;
@@ -719,8 +718,8 @@
 	case P_WORD: printf("Word"); break;
 	case P_NWORD: printf("NotWord"); break;
 	case P_PAR: printf("Par(%d,", node->n); dumpnode(node->x); printf(")"); break;
-	case P_PLA: printf("PLA(%d,", node->n); dumpnode(node->x); printf(")"); break;
-	case P_NLA: printf("NLA(%d,", node->n); dumpnode(node->x); printf(")"); break;
+	case P_PLA: printf("PLA("); dumpnode(node->x); printf(")"); break;
+	case P_NLA: printf("NLA("); dumpnode(node->x); printf(")"); break;
 	case P_ANY: printf("Any"); break;
 	case P_CHAR: printf("Char(%c)", node->c); break;
 	case P_CCLASS:
@@ -797,6 +796,7 @@
 	if (g.lookahead != 0)
 		die(&g, "syntax error");
 
+	g.prog->ncap = g.ncap;
 	g.prog->start = g.prog->end = malloc((count(node) + 3) * sizeof (Reinst));
 	emit(g.prog, I_LPAR);
 	compile(g.prog, node);
@@ -827,6 +827,7 @@
 
 struct estate {
 	int icase, newline, notbol;
+	int nla;
 	const char *bol;
 	Resub *m;
 };
@@ -916,7 +917,10 @@
 			pc = pc->y;
 			continue;
 		case I_NLA:
-			if (match(g, pc->x, s))
+			++g->nla;
+			n = match(g, pc->x, s);
+			--g->nla;
+			if (n)
 				return 0;
 			pc = pc->y;
 			continue;
@@ -1001,15 +1005,19 @@
 		case I_LPAR:
 			p = g->m[pc->n].sp;
 			g->m[pc->n].sp = s;
-			if (match(g, pc + 1, s))
+			if (match(g, pc + 1, s)) {
+				if (g->nla) g->m[pc->n].sp = p;
 				return 1;
+			}
 			g->m[pc->n].sp = p;
 			return 0;
 		case I_RPAR:
 			p = g->m[pc->n].ep;
 			g->m[pc->n].ep = s;
-			if (match(g, pc + 1, s))
+			if (match(g, pc + 1, s)) {
+				if (g->nla) g->m[pc->n].ep = p;
 				return 1;
+			}
 			g->m[pc->n].ep = p;
 			return 0;
 		default:
@@ -1030,9 +1038,10 @@
 	g.newline = prog->newline;
 	g.notbol = eflags & REG_NOTBOL;
 	g.bol = s;
+	g.nla = 0;
 	g.m = m ? m : gm;
-	for (i = 0; i < n; ++i)
-		g.m[i].sp = g.m[i].ep = NULL;
+	for (i = 0; i < 10; ++i)
+		g.m[i].sp = g.m[i].ep = i <= prog->ncap ? s : NULL;
 
 	do {
 		if (match(&g, prog->start, s))
@@ -1061,6 +1070,7 @@
 
 		if (argc > 2) {
 			s = argv[2];
+			printf("ncap = %d\n", p->ncap);
 			if (!regexec(p, s, 10, m, 0)) {
 				for (i = 0; i < 10; ++i)
 					if (m[i].sp) {