shithub: mc

Download patch

ref: 16b894430a5b71f4956ee305229bbbffd13605d0
parent: e9b59bee4efb648e323dfb6269209b59c575806f
author: Mura Li <github@ctli.io>
date: Wed Oct 23 11:14:24 EDT 2019

Merge decision tree nodes when possible

(Tested on Linux/AMD64)
Sample count: 506
Dtree Refcnt    avg: 5.38       95th percentile: 3.00    maximum: 100
Dtree Size      avg: 5.23       95th percentile: 3.00    maximum: 84
Dtree Height    avg: 1.39       95th percentile: 1.00    maximum: 12

References:
Mikael Pettersson. A term pattern-match compiler inspired by finite automata
theory. (p.6 "Step 3: Optimizing the DFA")

--- a/mi/match.c
+++ b/mi/match.c
@@ -14,11 +14,12 @@
 #include "parse.h"
 #include "mi.h"
 
-Dtree *gendtree(Node *m, Node *val, Node **lbl, size_t nlbl, int startid);
+Dtree *gendtree(Node *m, Node *val, Node **lbl, size_t nlbl);
 void dtreedump(FILE *fd, Dtree *dt);
 
 
-static int ndtree;
+static size_t ndtree;
+static Dtree **dtree;
 
 /* Path is a integer sequence that labels a subtree of a subject tree.
  * For example,
@@ -178,10 +179,82 @@
 	t = zalloc(sizeof(Dtree));
 	t->lbl = lbl;
 	t->loc = loc;
-	t->id = ndtree++;
+	t->id = ndtree;
+	lappend(&dtree, &ndtree, t);
 	return t;
 }
 
+static int
+loadeq(Node *a, Node *b)
+{
+	if (a == b)
+		return 1;
+
+	if (exprop(a) != exprop(b))
+		return 0;
+
+	switch (exprop(a)) {
+	case Outag:
+	case Oudata:
+	case Oderef:
+		return loadeq(a->expr.args[0], b->expr.args[0]);
+	case Omemb:
+		return loadeq(a->expr.args[0], b->expr.args[0]) && nameeq(a->expr.args[1], b->expr.args[1]);
+	case Otupget:
+	case Oidx:
+		return loadeq(a->expr.args[0], b->expr.args[0]) && liteq(a->expr.args[1]->expr.args[0], b->expr.args[1]->expr.args[0]);
+	case Ovar:
+		return a == b;
+	default:
+		die("unreachable");
+	}
+}
+
+static int
+pateq(Node *a, Node *b)
+{
+	if (exprop(a) != exprop(b))
+		return 0;
+
+	switch (exprop(a)) {
+	case Olit:
+		return liteq(a->expr.args[0], b->expr.args[0]);
+	case Ogap:
+	case Ovar:
+		return 1;
+	default:
+		die("unreachable");
+	}
+	return 0;
+}
+
+static int
+dtreusable(Dtree *dt, Node *load, Node **pat, size_t npat, Dtree **next, size_t nnext, Dtree *any)
+{
+	size_t i;
+
+	if (!loadeq(dt->load, load))
+		return 0;
+
+	if (dt->npat != npat)
+		return 0;
+
+	for (i = 0; i < npat; i++) {
+		if (dt->next[i]->id != next[i]->id)
+			return 0;
+		if (!pateq(dt->pat[i], pat[i]))
+			return 0;
+	}
+
+	if (dt->any == NULL && any == NULL)
+		return 1;
+
+	if (dt->any->id != any->id)
+		return 0;
+
+	return 1;
+}
+
 void
 dtreedumplit(FILE *fd, Dtree *dt, Node *n, size_t depth)
 {
@@ -260,24 +333,6 @@
 	return 1;
 }
 
-static int
-pateq(Node *a, Node *b)
-{
-	if (exprop(a) != exprop(b))
-		return 0;
-
-	switch (exprop(a)) {
-	case Olit:
-		return liteq(a->expr.args[0], b->expr.args[0]);
-	case Ogap:
-	case Ovar:
-		return 1;
-	default:
-		die("unreachable");
-	}
-	return 0;
-}
-
 char *
 pathfmt(Path *p)
 {
@@ -683,7 +738,24 @@
 		any = NULL;
 
 	/* construct the result dtree */
-	out = mkdtree(slot->pat->loc, genlbl(slot->pat->loc));
+	out = NULL;
+
+	/* TODO
+	 * use a hash table to avoid the quadratic complexity
+	 * when we have a large N and the bottleneck becomes obvious.
+	 */
+	for (i = 0; i < ndtree; i++) {
+		if (!dtree[i]->accept && dtreusable(dtree[i], slot->load, pat, npat, edge, nedge, any)) {
+			out = dtree[i];
+			out->refcnt++;
+			break;
+		}
+	}
+	if (out == NULL) {
+		out = mkdtree(slot->pat->loc, genlbl(slot->pat->loc));
+		out->refcnt++;
+	}
+
 	out->load = slot->load;
 	out->npat = npat,
 	out->pat = pat,
@@ -749,8 +821,49 @@
 	return m+1;
 }
 
+static size_t
+refsum(Dtree *dt)
+{
+	size_t i;
+	size_t sum;
+
+	if (dt == NULL)
+		return 0;
+
+	dt->emitted = 1;
+
+	/* NOTE
+	 * MATCH nodes are always pre-allocated and shared,
+	 * so counted as 1 for the size measurement.
+	 */
+	if (dt->accept)
+		return 1;
+
+	sum = 0;
+	for (i = 0; i < dt->nnext; i++)
+		if (!dt->next[i]->emitted)
+			sum += refsum(dt->next[i]);
+	if (dt->any && !dt->any->emitted)
+		sum += refsum(dt->any);
+
+	return dt->refcnt + sum;
+}
+
+static void
+clearemit(Dtree *dt)
+{
+	size_t i;
+
+	if (dt == NULL)
+		return;
+	for (i = 0; i < dt->nnext; i++)
+		clearemit(dt->next[i]);
+	clearemit(dt->any);
+	dt->emitted = 0;
+}
+
 Dtree *
-gendtree(Node *m, Node *val, Node **lbl, size_t nlbl, int startid)
+gendtree(Node *m, Node *val, Node **lbl, size_t nlbl)
 {
 	Dtree *root;
 	Node **pat;
@@ -762,7 +875,8 @@
 	char *dbgloc, *dbgfn, *dbgln;
 
 
-	ndtree = startid;
+	ndtree = 0;
+	dtree = NULL;
 
 	pat = m->matchstmt.matches;
 	npat = m->matchstmt.nmatches;
@@ -799,8 +913,11 @@
 	if (getenv("MATCH_STATS")) {
 		csv = fopen("match.csv", "a");
 		assert(csv != NULL);
-		fprintf(csv, "%s@L%d, %d, %ld\n", fname(m->loc), lnum(m->loc), ndtree, dtheight(root));
+		fprintf(csv, "%s@L%d, %ld, %zd, %zd\n", fname(m->loc), lnum(m->loc), refsum(root), ndtree, dtheight(root));
 		fclose(csv);
+
+		/* clear 'emitted' so it can be reused by genmatchcode. */
+		clearemit(root);
 	}
 
 	return root;
@@ -887,7 +1004,7 @@
 
 
 	endlbl = genlbl(m->loc);
-	dt = gendtree(m, val, lbl, nlbl, 0);
+	dt = gendtree(m, val, lbl, nlbl);
 	genmatchcode(dt, out, nout);
 
 	for (i = 0; i < npat; i++) {
--- a/mi/match_test.c
+++ b/mi/match_test.c
@@ -26,7 +26,7 @@
 char debugopt[128];
 
 typedef struct Dtree Dtree;
-extern Dtree *gendtree(Node *m, Node *val, Node **lbl, size_t nlbl, int startid);
+extern Dtree *gendtree(Node *m, Node *val, Node **lbl, size_t nlbl);
 extern void dtreedump(FILE *fd, Dtree *dt);
 
 
@@ -250,7 +250,7 @@
 			lappend(&lbl, &nlbl, genlbl(pat[i]->match.block->loc));
 		}
 
-		dt = gendtree(m, v, lbl, nlbl, 0);
+		dt = gendtree(m, v, lbl, nlbl);
 		if (getenv("d")) {
 			fprintf(stderr, "dtree >>\n");
 			dtreedump(stderr, dt);
--- a/support/matchstats.myr
+++ b/support/matchstats.myr
@@ -60,7 +60,7 @@
 }
 
 const main = {args : byte[:][:]
-	var f, locs, sizes, heights, count
+	var f, locs, refcnts, sizes, heights, count
 
 	if args.len < 2
 		std.put("need input file\n")
@@ -73,6 +73,7 @@
 	;;
 
 	locs = [][:]
+	refcnts = [][:]
 	sizes = [][:]
 	heights = [][:]
 	count = 0
@@ -85,6 +86,11 @@
 		;;
 
 		match bio.readto(f, ",")
+		| `std.Ok refcnt: std.slpush(&refcnts, atoi(std.strstrip(refcnt)))
+		| `std.Err e: std.fatal("error read refcnt: {}\n", e)
+		;;
+
+		match bio.readto(f, ",")
 		| `std.Ok size: std.slpush(&sizes, atoi(std.strstrip(size)))
 		| `std.Err e: std.fatal("error read size: {}\n", e)
 		;;
@@ -97,6 +103,7 @@
 	;;
 
 	std.put("Sample count: {}\n", count)
+	std.put("Dtree Refcnt\tavg: {s=3}\t95th percentile: {s=3}\t maximum: {}\n", avg(refcnts), percentile(95, refcnts), maximum(refcnts))
 	std.put("Dtree Size\tavg: {s=3}\t95th percentile: {s=3}\t maximum: {}\n", avg(sizes), percentile(95, sizes), maximum(sizes))
 	std.put("Dtree Height\tavg: {s=3}\t95th percentile: {s=3}\t maximum: {}\n", avg(heights), percentile(95, heights), maximum(heights))