shithub: gefs

Download patch

ref: f2fb2cbdd516cfa3a27bd1ede173ec79aaaf80d8
parent: 344034ea3fd4be80713a50989a03e331fd609dc9
author: Ori Bernstein <ori@eigenstate.org>
date: Fri Apr 7 11:55:27 EDT 2023

snap.c: rewrite to use deadlists properly.

--- a/TODO
+++ b/TODO
@@ -1,20 +1,17 @@
 *** pending disk format changes ***
 - deadlists in snap tree, rather than tree blob
-- hashes stored out-of-line
-- remove generic header from blocks, type header
-  for arenas should start with 'ge', type header
-  for other blocks should be some other mnemonic
 - packarena should have next gen
 - small file optimizations
 	- inline data
 	- block fragmentation
 
-*** major issues, need to fix ***
+*** issues, need to fix ***
 - live alloc log recompression
 - Reserve blocks for deletion
 - reclaiming data from deleted files is very delayed
 - transient exec snapshots
 - testing, debugging, bugfixes
+- make empty snapshot explicit
 
 *** nice to have, can go without ***
 - add missing management commands in console
--- a/blk.c
+++ b/blk.c
@@ -17,7 +17,7 @@
 static vlong	blkalloc_lk(Arena*, int);
 static vlong	blkalloc(int);
 static int	blkdealloc_lk(vlong);
-static Blk*	initblk(Blk*, vlong, int);
+static Blk*	initblk(Blk*, vlong, vlong, int);
 static int	logop(Arena *, vlong, vlong, int);
 
 int
@@ -97,7 +97,6 @@
 	b->nbuf = 0;
 	b->bufsz = 0;
 	b->logsz = 0;
-	b->lognxt = 0;
 
 	switch(b->type){
 	default:
@@ -108,11 +107,14 @@
 	case Tarena:
 		b->data = b->buf;
 		break;
+	case Tdlist:
+		b->deadsz = UNPACK16(b->buf+2);
+		b->deadp = unpackbp(b->buf+4, Blksz-4);
+		b->data = b->buf + Dlhdsz;
+		break;
 	case Tlog:
-	case Tdead:
 		b->data = b->buf + Loghdsz;
 		break;
-		break;
 	case Tpivot:
 		b->data = b->buf + Pivhdsz;
 		b->nval = UNPACK16(b->buf+2);
@@ -249,7 +251,7 @@
 	assert(op == LogAlloc || op == LogFree);
 	o = -1;
 	lb = *tl;
-	dprint("logop %llx+%llx@%llx: %s\n", off, len, lb->logsz, (op == LogAlloc) ? "Alloc" : "Free");
+	dprint("logop %llx+%llx@%x: %s\n", off, len, lb->logsz, (op == LogAlloc) ? "Alloc" : "Free");
 	/*
 	 * move to the next block when we have
 	 * 40 bytes in the log:
@@ -264,7 +266,7 @@
 			return -1;
 		if((lb = cachepluck()) == nil)
 			return -1;
-		initblk(lb, o, Tlog);
+		initblk(lb, o, -1, Tlog);
 
 		lb->logsz = Loghashsz;
 		p = lb->data + lb->logsz;
@@ -427,7 +429,7 @@
 		return -1;
 	if((b = cachepluck()) == nil)
 		return -1;
-	initblk(b, ba, Tlog);
+	initblk(b, ba, -1, Tlog);
 	b->logsz = Loghashsz;
 
 	p = b->data + b->logsz;
@@ -472,7 +474,7 @@
 		free(log);
 		return -1;
 	}
-	initblk(b, ba, Tlog);
+	initblk(b, ba, -1, Tlog);
 	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
 		if(n == sz){
 			sz *= 2;
@@ -640,8 +642,21 @@
 	return b;
 }
 
+int
+blkdealloc(vlong b)
+{
+	Arena *a;
+	int r;
+
+	a = getarena(b);
+	lock(a);
+	r = blkdealloc_lk(b);
+	unlock(a);
+	return r;
+}
+
 static Blk*
-initblk(Blk *b, vlong bp, int t)
+initblk(Blk *b, vlong bp, vlong gen, int ty)
 {
 	Blk *ob;
 
@@ -651,17 +666,21 @@
 			ob, ob->bp, ob->alloced, ob->freed, ob->lasthold, ob->lastdrop);
 		abort();
 	}
-	b->type = t;
+	b->type = ty;
 	b->bp.addr = bp;
 	b->bp.hash = -1;
-	b->bp.gen = fs->nextgen;
-	switch(t){
+	b->bp.gen = gen;
+	switch(ty){
 	case Tdat:
 	case Tarena:
 		b->data = b->buf;
 		break;
-	case Tdead:
+	case Tdlist:
+		b->deadsz = 0;
+		b->data = b->buf + Dlhdsz;
+		break;
 	case Tlog:
+		b->logsz = 0;
 		b->data = b->buf + Loghdsz;
 		break;
 	case Tpivot:
@@ -679,7 +698,6 @@
 	b->nbuf = 0;
 	b->bufsz = 0;
 	b->logsz = 0;
-	b->lognxt = 0;
 	b->alloced = getcallerpc(&b);
 
 	return b;
@@ -686,26 +704,26 @@
 }
 
 Blk*
-newblk(int t)
+newblk(Tree *t, int ty)
 {
 	vlong bp;
 	Blk *b;
 
-	if((bp = blkalloc(t)) == -1)
+	if((bp = blkalloc(ty)) == -1)
 		return nil;
 	if((b = cachepluck()) == nil)
 		return nil;
-	initblk(b, bp, t);
+	initblk(b, bp, t->gen, ty);
 	b->alloced = getcallerpc(&t);
 	return b;
 }
 
 Blk*
-dupblk(Blk *b)
+dupblk(Tree *t, Blk *b)
 {
 	Blk *r;
 
-	if((r = newblk(b->type)) == nil)
+	if((r = newblk(t, b->type)) == nil)
 		return nil;
 
 	setflag(r, Bdirty);
@@ -715,7 +733,6 @@
 	r->nbuf = b->nbuf;
 	r->bufsz = b->bufsz;
 	r->logsz = b->logsz;
-	r->lognxt = b->lognxt;
 	r->alloced = getcallerpc(&b);
 	memcpy(r->buf, b->buf, sizeof(r->buf));
 	return r;
@@ -736,27 +753,26 @@
 		PACK16(b->buf+4, b->valsz);
 		PACK16(b->buf+6, b->nbuf);
 		PACK16(b->buf+8, b->bufsz);
-		b->bp.hash = blkhash(b);
 		break;
 	case Tleaf:
 		PACK16(b->buf+2, b->nval);
 		PACK16(b->buf+4, b->valsz);
-		b->bp.hash = blkhash(b);
 		break;
+	case Tdlist:
+		PACK16(b->buf+2, b->deadsz);
+		packbp(b->buf+4, Ptrsz, &b->deadp);
+		break;
 	case Tlog:
-	case Tdead:
 		h = siphash(b->data + Loghashsz, Logspc-Loghashsz);
 		PACK64(b->data, h);
-		b->bp.hash = blkhash(b);
 		break;
 	case Tdat:
-		b->bp.hash = blkhash(b);
-		break;
 	case Tmagic:
 	case Tarena:
 		break;
 	}
 
+	b->bp.hash = blkhash(b);
 	setflag(b, Bfinal);
 	cacheins(b);
 }
@@ -847,7 +863,7 @@
 	Bfree *f;
 	ulong ge;
 
-	if(t != nil && t != &fs->snap && bp.gen <= t->gen){
+	if(t != nil && t != &fs->snap && bp.gen <= t->prev){
 		killblk(t, bp);
 		return;
 	}
@@ -1056,6 +1072,7 @@
 	int i;
 
 	qlock(&fs->synclk);
+	flushdlcache(0);
 	fs->syncing = fs->nsyncers;
 	for(i = 0; i < fs->nsyncers; i++){
 		b = cachepluck();
--- a/cache.c
+++ b/cache.c
@@ -84,7 +84,7 @@
 	qlock(&fs->lrulk);
 	assert(b->magic == Magic);
 	h = ihash(b->bp.addr);
-	bkt = &fs->cache[h % fs->cmax];
+	bkt = &fs->bcache[h % fs->cmax];
 	lock(bkt);
 	if(checkflag(b, Bcached)){
 		unlock(bkt);
@@ -113,7 +113,7 @@
 		return;
 
 	h = ihash(addr);
-	bkt = &fs->cache[h % fs->cmax];
+	bkt = &fs->bcache[h % fs->cmax];
 	lock(bkt);
 	p = &bkt->b;
 	for(b = bkt->b; b != nil; b = b->hnext){
@@ -140,7 +140,7 @@
 
 	h = ihash(off);
 
-	bkt = &fs->cache[h % fs->cmax];
+	bkt = &fs->bcache[h % fs->cmax];
 
 	qlock(&fs->lrulk);
 	lock(bkt);
--- a/check.c
+++ b/check.c
@@ -209,7 +209,7 @@
 			break;
 		memcpy(name, s.kv.k+1, s.kv.nk-1);
 		name[s.kv.nk-1] = 0;
-		if((t = openlabel(name)) == nil){
+		if((t = opensnap(name)) == nil){
 			fprint(2, "invalid snap label %s\n", name);
 			ok = 0;
 			break;
--- a/cons.c
+++ b/cons.c
@@ -48,6 +48,7 @@
 syncfs(int fd, char **, int)
 {
 	sendsync(fd, 0);
+	fprint(fd, "synced\n");
 }
 
 static void
@@ -54,6 +55,7 @@
 haltfs(int fd, char **, int)
 {
 	sendsync(fd, 1);
+	fprint(fd, "gefs: ending...\n");
 }
 
 static void
@@ -72,8 +74,13 @@
 		fprint(fd, "not a new snap: %s\n", ap[1]);
 		goto Error;
 	}
-	strecpy(a->old, a->old+sizeof(a->old), ap[0]);
-	strecpy(a->new, a->new+sizeof(a->new), ap[1]);
+	if(strcmp(ap[0], "-d") == 0){
+		strecpy(a->old, a->old+sizeof(a->old), ap[1]);
+		a->new[0] = 0;
+	}else{
+		strecpy(a->old, a->old+sizeof(a->old), ap[0]);
+		strecpy(a->new, a->new+sizeof(a->new), ap[1]);
+	}
 	a->op = AOsnap;
 	a->fd = fd;
 	m->a = a;
@@ -102,7 +109,7 @@
 	Tree *t;
 
 	l = (na == 0) ? "main" : ap[0];
-	if((t = openlabel(l)) == nil){
+	if((t = opensnap(l)) == nil){
 		fprint(fd, "load users: no label %s\n", l);
 		return;
 	}
@@ -185,7 +192,7 @@
 	vlong pqid;
 	Scan s;
 
-	if((t = openlabel("main")) == nil){
+	if((t = opensnap("main")) == nil){
 		fprint(fd, "could not open main snap\n");
 		return;
 	}
@@ -226,26 +233,6 @@
 }
 
 static void
-showblkdump(int fd, char **ap, int na)
-{
-	Bptr bp;
-	Blk *b;
-
-	if(na == 0){
-		for(b = fs->blks; b != fs->blks+fs->cmax; b++){
-			fprint(fd, "%#p %B:\t%#lx %#llx %#llx\n", b, b->bp, b->flag, b->alloced, b->freed);
-			b->magic = Magic;
-			lrutop(b);
-		}
-	}else{
-		bp.addr = strtoll(ap[0], nil, 16);
-		bp.hash = -1;
-		bp.gen = -1;
-		showbp(fd, bp, 0);
-	}
-}
-
-static void
 help(int fd, char**, int)
 {
 	char *msg =
@@ -292,6 +279,7 @@
 
 	/* debugging */
 	{.name="show",	.sub="cache",	.minarg=0, .maxarg=0, .fn=showcache},
+	{.name="show",	.sub="dlist",	.minarg=0, .maxarg=0, .fn=showdlist},
 	{.name="show",	.sub="ent",	.minarg=1, .maxarg=2, .fn=showent},
 	{.name="show",	.sub="fid",	.minarg=0, .maxarg=0, .fn=showfid},
 	{.name="show",	.sub="free",	.minarg=0, .maxarg=0, .fn=showfree},
@@ -298,8 +286,6 @@
 	{.name="show",	.sub="snap",	.minarg=0, .maxarg=1, .fn=showsnap},
 	{.name="show",	.sub="tree",	.minarg=0, .maxarg=1, .fn=showtree},
 	{.name="show",	.sub="users",	.minarg=0, .maxarg=0, .fn=showusers},
-	{.name="show",	.sub="blk",	.minarg=0, .maxarg=1, .fn=showblkdump},
-	{.name="show",	.sub="blks",	.minarg=1, .maxarg=1, .fn=showblkdump},
 	{.name="debug",	.sub=nil,	.minarg=0, .maxarg=1, .fn=setdbg},
 
 	{.name=nil, .sub=nil},
--- a/dat.h
+++ b/dat.h
@@ -56,19 +56,22 @@
 	Fillsz	= 2,			/* block fill count */
 	Offksz	= 17,			/* type, qid, off */
 	Snapsz	= 9,			/* tag, snapid */
-	Dpfxsz	= 9,
-	Ndead	= 8,			/* number of deadlist heads */
-	Deadsz	= 8+8+8,		/* prev, head, hash */
-	Treesz	= 4+4+8+Ptrsz+Ndead*Deadsz,	/* ref, height, gen, root, deadlist */
+	Dpfxsz	= 9,			/* directory prefix */
+	Dlksz	= 1+8+8,		/* tag, death, birth */
+	Dlvsz	= Ptrsz+Ptrsz,		/* hd,tl of deadlist */
+	Dlkvpsz	= Dlksz+Dlvsz,		/* full size of dlist kvp */
+	Linksz	= 1+8+8,		/* gen, prev of snap */
+	Treesz	= 4+4+4+8+8+Ptrsz,	/* ref, height, gen, prev, root */
 	Kvmax	= Keymax + Inlmax,	/* Key and value */
 	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
 	Wstatmax = 4+8+8+8,		/* mode, size, atime, mtime */
 	
-
 	Pivhdsz		= 10,
 	Leafhdsz	= 6,
 	Loghdsz		= 2,
 	Loghashsz	= 8,
+	Dlhdsz		= 2+2+Ptrsz,	/* type, size, chain */
+	Dlspc		= Blksz - Dlhdsz,
 	Rootsz		= 4+Ptrsz,	/* root pointer */
 	Pivsz		= Blksz - Pivhdsz,
 	Bufspc		= (Blksz - Pivhdsz) / 2,
@@ -85,15 +88,21 @@
 enum {
 	/*
 	 * dent: pqid[8] qid[8] -- a directory entry key.
-	 * ptr:  off[8] hash[8] -- a key for an Dir block.
+	 * ptr:  off[8] hash[8] gen[8] -- a key for an Dir block.
 	 * dir:  serialized Xdir
 	 */
-	Kdat,	/* qid[8] off[8] => ptr[16]:	pointer to data page */
-	Kent,	/* pqid[8] name[n] => dir[n]:	serialized Dir */
-	Klabel,	/* name[] => snapid[]:		snapshot label */
-	Ktref,	/* tag[8] = snapid[]		scratch snapshot label */
-	Ksnap,	/* sid[8] => ref[8], tree[52]:	snapshot root */
-	Ksuper,	/* qid[8] => Kent:		parent dir */
+
+	/* fs keys */
+	Kdat,	/* qid[8] off[8] => ptr:		pointer to data page */
+	Kent,	/* pqid[8] name[n] => dir[n]:		serialized Dir */
+	Ksuper,	/* qid[8] => Kent:			parent dir */
+
+	/* snapshot keys */
+	Klabel,	/* name[] => snapid[]:			snapshot label */
+	Ktref,	/* tag[8] = snapid[]			scratch snapshot label */
+	Ksnap,	/* sid[8] => ref[8], tree[52]:		snapshot root */
+	Kdlist,	/* snap[8] gen[8] => hd[ptr],tl[ptr]	deadlist  */
+	Kslink,	/* snap[8] next[8] => []		snap successor list */
 };
 
 enum {
@@ -107,6 +116,8 @@
 	Qdump = 1ULL << 63,
 };
 
+#define Zb (Bptr){-1, -1, -1}
+
 /* internal errors */
 #define Efs	(abort(), "fs broke")
 extern char Eimpl[];
@@ -127,6 +138,7 @@
 extern char Enomem[];
 extern char Eattach[];
 extern char Enosnap[];
+extern char Esnap[];
 extern char Edir[];
 extern char Esyntax[];
 extern char Enouser[];
@@ -161,7 +173,7 @@
  * and refcount blocks.
  *
  * The superblock has this layout:
- *	version[8]	always "gefs0001"
+ *	version[8]	always "gefs0002"
  *	blksz[4]	block size in bytes
  *	bufsz[4]	portion of leaf nodes
  *			allocated to buffers,
@@ -224,7 +236,7 @@
 	Tpivot,
 	Tleaf,
 	Tlog,
-	Tdead,
+	Tdlist,
 	Tmagic,
 	Tarena = 0x6765,	/* 'ge' bigendian */
 };
@@ -245,7 +257,8 @@
 	Oinsert,	/* new kvp */
 	Odelete,	/* delete kvp */
 	Oclearb,	/* free block ptr if exists */
-	Owstat,		/* kvp dirent */
+	Owstat,		/* update kvp dirent */
+	Orefsnap,	/* increment snap refcount by delta */
 	Nmsgtype,	/* maximum message type */
 };
 
@@ -285,17 +298,7 @@
 	LogFree,	/* free a range */
 };
 
-/*
- * Operations for the deadlist log
- */
 enum {
-	DlEnd,		/* no data */
-	DlChain,	/* [8]addr, [8]hash */
-	DlGraft,	/* [8]addr, [8]hash */
-	DlKill,		/* [8]addr, [8]gen */
-};
-
-enum {
 	AOnone,
 	AOsnap,
 	AOsync,
@@ -328,9 +331,15 @@
 };
 
 struct Dlist {
-	vlong	prev;	/* previous generation */
-	Bptr	head;	/* first flushed block */
-	Blk	*ins;	/* inserted block */
+	Dlist	*cnext;	/* cache next entry */
+	Dlist	*cprev;	/* cache prev entry */
+	Dlist	*chain;	/* hash table chain */
+	Blk	*ins;	/* loaded head */
+
+	vlong	gen;	/* deadlist gen */
+	vlong	bgen;	/* birth gen */
+	Bptr	hd;	/* deadlist head */
+	Bptr	tl;	/* deadlist tail */
 };
 
 struct Arange {
@@ -368,16 +377,18 @@
 };
 
 struct Tree {
+	/* in-memory */
 	Lock	lk;
-	Tree	*snext;
-
 	long	memref;	/* number of in-memory references to this */
 	int	dirty;
-	int	ref;	/* number of on-disk references to this */
-	int	ht;
-	Bptr	bp;
-	vlong	gen;
-	Dlist	dead[Ndead];
+
+	/* on-disk */
+	int	nsucc;	/* number snapshots after us */
+	int	nlbl;	/* number of labels referring to us */
+	int	ht;	/* height of the tree */
+	Bptr	bp;	/* block pointer of root */
+	vlong	gen;	/* generation */
+	vlong	prev;	/* previous snapshot */
 };
 
 struct Bfree {
@@ -434,8 +445,6 @@
 	QLock	synclk;
 	Rendez	syncrz;
 
-	QLock	snaplk;	/* snapshot lock */
-	Tree	*opensnap;
 	Lock	mountlk;
 	Mount	*mounts;
 	Mount	*snapmnt;
@@ -465,18 +474,24 @@
 	User	*users;
 	int	nusers;
 
-	/* dent hash table */
+	/* open directory entries */
 	Lock	dtablk;
 	Dent	*dtab[Ndtab];
 
 	/* slow block io */
 	QLock	blklk[32];
+	
+	/* deadlist cache */
+	Dlist	**dlcache;
+	Dlist	*dlhead;
+	Dlist	*dltail;
+	int	dlcount;
+	int	dlcmax;
 
-	/* protected by lrulk */
+	/* block lru */
 	QLock	lrulk;
 	Rendez	lrurz;
-	Bucket	*cache;
-	Blk	*blks;	/* all blocks for debugging */
+	Bucket	*bcache;
 	Blk	*chead;
 	Blk	*ctail;
 	usize	ccount;
@@ -616,13 +631,21 @@
 
 	/* serialized to disk in header */
 	short	type;	/* @0, for all */
-	short	nval;	/* @2, for Leaf, Pivot: data[0:2] */
-	short	valsz;	/* @4, for Leaf, Pivot: data[2:4] */
-	short   nbuf;	/* @6, for Pivot */
-	short   bufsz;	/* @8, for Pivot */
-
-	vlong	logsz;	/* for allocation log */
-	vlong	lognxt;	/* for allocation log */
+	union {
+		struct {
+			short	nval;	/* @2, for Leaf, Pivot: data[0:2] */
+			short	valsz;	/* @4, for Leaf, Pivot: data[2:4] */
+			short   nbuf;	/* @6, for Pivot */
+			short   bufsz;	/* @8, for Pivot */
+		};
+		struct {
+			int	logsz;	/* @2 for allocation log */
+		};
+		struct {
+			int	deadsz;	/* @2 size of deadlist */
+			Bptr	deadp;	/* @4 next deadlist chain */
+		};
+	};
 
 	/* debug */
 	uintptr lasthold;
--- a/dump.c
+++ b/dump.c
@@ -32,13 +32,19 @@
 		n = fmtprint(fmt, "label name:\"%.*s\"", k->nk-1, k->k+1);
 		break;
 	case Ksnap:	/* name[n] => tree[24]:	snapshot root */
-		n = fmtprint(fmt, "snap id:\"%llx\"", UNPACK64(k->k+1));
+		n = fmtprint(fmt, "snap id:%lld", UNPACK64(k->k+1));
 		break;
 	case Ksuper:	/* qid[8] => pqid[8]:		parent dir */
 		n = fmtprint(fmt, "up dir:%llx", UNPACK64(k->k+1));
 		break;
+	case Kslink:
+		n = fmtprint(fmt, "slink gen:%llx, succ:%llx", UNPACK64(k->k+1), UNPACK64(k->k+9));
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "dlist gen:%llx, bgen:%llx", UNPACK64(k->k+1), UNPACK64(k->k+9));
+		break;
 	default:
-		n = fmtprint(fmt, "%.*H", k->nk, k->k);
+		n = fmtprint(fmt, "??? %.*H", k->nk, k->k);
 		break;
 	}
 	return n;
@@ -129,17 +135,24 @@
 		break;
 	case Ksnap:	/* name[n] => dent[16] ptr[16]:	snapshot root */
 		if(unpacktree(&t, v->v, v->nv) == nil)
-			return fmtprint(fmt, "corrupt tree");
-		n = fmtprint(fmt, "ref: %d, ht: %d, bp: %B, prev=%lld", t.ref, t.ht, t.bp, t.dead[0].prev);
+			n = fmtprint(fmt, "corrupt tree");
+		else
+			n = fmtprint(fmt, "<tree>");
 		break;
 	case Klabel:
-		n = fmtprint(fmt, "snap id:\"%llx\"", UNPACK64(v->v+1));
+		n = fmtprint(fmt, "snap id:\"%lld\"", UNPACK64(v->v+1));
 		break;
 	case Ksuper:	/* qid[8] => pqid[8]:		parent dir */
 		n = fmtprint(fmt, "super dir:%llx, name:\"%.*s\")", UNPACK64(v->v+1), v->nv-11, v->v+11);
 		break;
+	case Kslink:
+		n = fmtprint(fmt, "()");
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "hd:%B, tl:%B", unpackbp(v->v, v->nv), unpackbp(v->v+Ptrsz, v->nv-Ptrsz));
+		break;
 	default:
-		n = fmtprint(fmt, "%.*H", v->nk, v->k);
+		n = fmtprint(fmt, "??? %.*H", v->nk, v->k);
 		break;
 	}
 	return n;
@@ -281,8 +294,8 @@
 	case Tlog:
 		fprint(fd, "log -- ");
 		goto Show;
-	case Tdead:
-		fprint(fd, "dead -- ");
+	case Tdlist:
+		fprint(fd, "dlist -- ");
 		goto Show;
 	case Tdat:
 		fprint(fd, "dat -- ");
@@ -318,7 +331,7 @@
 		name = ap[0];
 	if(strcmp(name, "snap") == 0)
 		t = &fs->snap;
-	else if((t = openlabel(name)) == nil){
+	else if((t = opensnap(name)) == nil){
 		fprint(fd, "open %s: %r\n", name);
 		return;
 	}
@@ -349,23 +362,12 @@
 void
 showtreeroot(int fd, Tree *t)
 {
-	Dlist *dl;
-	int i;
-
 	fprint(fd, "\tgen:\t%lld\n", t->gen);
-	fprint(fd, "\tref:\t%d\n", t->ref);
+	fprint(fd, "\tprev:\t%lld\n", t->prev);
+	fprint(fd, "\tnsucc:\t%d\n", t->nsucc);
+	fprint(fd, "\tnlbl:\t%d\n", t->nlbl);
 	fprint(fd, "\tht:\t%d\n", t->ht);
 	fprint(fd, "\tbp:\t%B\n", t->bp);
-	for(i = 0; i < Ndead; i++){
-		dl = &t->dead[i];
-		fprint(fd, "	deadlist[%d]:\n", i);
-		fprint(fd, "		prev:	%llx\n", dl->prev);
-		fprint(fd, "		fhead:	%B\n", dl->head);
-		if(dl->ins != nil)
-			fprint(fd, "		open:	%B\n", dl->ins->bp);
-		fprint(fd, "		dead[%d]: (%B)\n", i, dl->head);
-		scandead(&t->dead[i], 0, showdeadbp, &fd);
-	}
 }
 
 void
@@ -378,6 +380,36 @@
 	int sz;
 	Tree t;
 
+	/* dump the labels */
+	if(na == 0){
+		lock(&fs->mountlk);
+		fprint(fd, "open:");
+		for(mnt = fs->mounts; mnt != nil; mnt = mnt->next)
+			fprint(fd, " %s\n", mnt->name);
+		fprint(fd, "\n");
+		unlock(&fs->mountlk);
+
+		pfx[0] = Klabel;
+		sz = 1;
+		btnewscan(&s, pfx, sz);
+		if((e = btenter(&fs->snap, &s)) != nil){
+			fprint(fd, "scan: %s\n", e);
+			btexit(&fs->snap, &s);
+			return;
+		}
+		while(1){
+			if((e = btnext(&fs->snap, &s, &s.kv)) != nil){
+				fprint(fd, "scan: %s\n", e);
+				break;
+			}
+			if(s.done)
+				break;
+			fprint(fd, "label: %P\n", &s.kv);
+		}
+		btexit(&fs->snap, &s);
+	}
+
+	/* dump the snapshots */
 	pfx[0] = Ksnap;
 	sz = 1;
 	if(na != 0){
@@ -406,12 +438,55 @@
 		showtreeroot(fd, &t);
 	}
 	btexit(&fs->snap, &s);
-	qlock(&fs->snaplk);
-	for(mnt = fs->mounts; mnt != nil; mnt = mnt->next){
-		fprint(fd, "open: %s\n", mnt->name);
-		showtreeroot(fd, mnt->root);
+}
+
+void
+showdlist(int fd, char** ap, int na)
+{
+	char *p, *e, *err, pfx[Kvmax];
+	Dlist dl;
+	Bptr hd;
+	Scan s;
+	Blk *b;
+	vlong id;
+	int sz;
+
+	pfx[0] = Kdlist;
+	sz = 1;
+	if(na != 0){
+		sz = 9;
+		id = atoll(ap[0]);
+		PACK64(pfx+1, id);
 	}
-	qunlock(&fs->snaplk);
+	btnewscan(&s, pfx, sz);
+	if((err = btenter(&fs->snap, &s)) != nil){
+		fprint(fd, "scan: %s\n", err);
+		btexit(&fs->snap, &s);
+		return;
+	}
+	while(1){
+		if((e = btnext(&fs->snap, &s, &s.kv)) != nil){
+			fprint(fd, "scan: %s\n", e);
+			break;
+		}
+		if(s.done)
+			break;
+		fprint(fd, "dlist: %P\n", &s.kv);
+		kv2dlist(&s.kv, &dl);
+		hd = dl.hd;
+		while(hd.addr != -1){
+			if((b = getblk(hd, 0)) == nil){
+				fprint(fd, "broken: %B\n", hd);
+				break;
+			}
+			fprint(fd, "deadsz: %xm deadp=%B\n", b->deadsz, b->deadp);
+			e = b->data + b->deadsz;
+			for(p = b->data; p != e; p += 8)
+				fprint(fd, "\tdead: %llx\n", UNPACK64(p));
+			hd = b->deadp;
+		}
+	}
+	btexit(&fs->snap, &s);
 }
 
 void
@@ -422,7 +497,7 @@
 	int i;
 
 	for(i = 0; i < fs->cmax; i++){
-		bkt = &fs->cache[i];
+		bkt = &fs->bcache[i];
 		lock(bkt);
 		if(bkt->b != nil)
 			fprint(fd, "bkt%d\n", i);
--- a/fns.h
+++ b/fns.h
@@ -26,8 +26,8 @@
 #define	PACK64(p,v)	do{(p)[0]=(v)>>56;(p)[1]=(v)>>48;(p)[2]=(v)>>40;(p)[3]=(v)>>32;\
 			   (p)[4]=(v)>>24;(p)[5]=(v)>>16;(p)[6]=(v)>>8;(p)[7]=(v);}while(0)
 
-Blk*	newblk(int type);
-Blk*	dupblk(Blk*);
+Blk*	newblk(Tree *t, int type);
+Blk*	dupblk(Tree *t, Blk*);
 Blk*	getroot(Tree*, int*);
 Blk*	getblk(Bptr, int);
 Blk*	holdblk(Blk*);
@@ -53,19 +53,17 @@
 void	freeblk(Tree*, Blk*);
 void	freebp(Tree*, Bptr);
 int	killblk(Tree*, Bptr);
-void	reclaimblk(Bptr);
+int	blkdealloc(vlong);
 ushort	blkfill(Blk*);
 uvlong	blkhash(Blk*);
 u32int	ihash(uvlong);
 void	finalize(Blk*);
-Tree*	newsnap(Tree*);
-char*	freesnap(Tree*, Tree*);
-char*	labelsnap(char*, vlong);
-char*	unlabelsnap(vlong, char*);
-char*	refsnap(vlong);
-char*	unrefsnap(vlong, vlong);
-Tree*	openlabel(char*);
-Tree*	opensnap(vlong);
+
+char*	updatesnap(Tree**, Tree*, char*);
+char*	labelsnap(Tree*, char*);
+char*	delsnap(Tree*, char*);
+Tree*	opensnap(char*);
+
 void	closesnap(Tree*);
 uvlong	siphash(void*, usize);
 void	reamfs(char*);
@@ -76,6 +74,7 @@
 int	scandead(Dlist*, int, void(*)(Bptr, void*), void*);
 int	endfs(void);
 int	compresslog(Arena*);
+char*	flushdlcache(int);
 void	setval(Blk*, Kvp*);
 
 Conn*	newconn(int, int);
@@ -110,6 +109,7 @@
 void	showblk(int, Blk*, char*, int);
 void	showbp(int, Bptr, int);
 void	showtreeroot(int, Tree*);
+void	showdlist(int, char**, int);
 void	showtree(int, char**, int);
 void	showsnap(int, char**, int);
 void	showfid(int, char**, int);
@@ -138,7 +138,13 @@
 int	kv2statbuf(Kvp*, char*, int);
 int	dir2statbuf(Xdir*, char*, int);
 int	kv2dir(Kvp*, Xdir*);
-int	kv2qid(Kvp*, Qid*);
+void	kv2qid(Kvp*, Qid*);
+void	kv2dlist(Kvp*, Dlist*);
+void	dlist2kv(Dlist*, Kvp*, char*, int);
+void	tree2kv(Tree*, Kvp*, char*, int);
+void	lbl2kv(char*, vlong, Kvp*, char*, int);
+void	link2kv(vlong, vlong, Kvp*, char*, int);
+void	kv2link(Kvp*, vlong*, vlong*);
 
 char*	packbp(char*, int, Bptr*);
 Bptr	unpackbp(char*, int);
--- a/fs.c
+++ b/fs.c
@@ -11,63 +11,55 @@
 
 static char*	clearb(Fid*, vlong, vlong);
 
-static char*
-updatemount(Mount *mnt)
-{
-	Tree *t, *n;
-	char *e;
-
-	t = mnt->root;
-	if(!t->dirty)
-		return nil;
-	if((n = newsnap(t)) == nil){
-		fprint(2, "snap: save %s: %s\n", mnt->name, "create snap");
-		abort();
-	}
-	if((e = labelsnap(mnt->name, t->gen)) != nil){
-		fprint(2, "snap: save %s: %s\n", mnt->name, e);
-		abort();
-	}
-	if(t->dead[0].prev != -1){
-		if((e = unrefsnap(t->dead[0].prev, t->gen)) != nil){
-			fprint(2, "snap: unref old: %s\n", e);
-			abort();
-		}
-	}
-	lock(mnt);
-	mnt->root = n;
-	unlock(mnt);
-	closesnap(t);
-	return nil;
-}
-
 static void
 snapfs(int fd, char *old, char *new)
 {
-	Tree *t, *u;
+	Mount *mnt;
+	Tree *t, *s;
 	char *e;
 
-	u = openlabel(new);
-	if((t = openlabel(old)) == nil){
-		fprint(fd, "snap: open %s: does not exist\n", old);
+	lock(&fs->mountlk);
+	t = nil;
+	for(mnt = fs->mounts; mnt != nil; mnt = mnt->next){
+		if(strcmp(old, mnt->name) == 0){
+			t = mnt->root;
+			ainc(&t->memref);
+		}
+	}
+	if(strlen(new) == 0 && t != nil) {
+		fprint(fd, "snap: open snap '%s'\n", old);
+		unlock(&fs->mountlk);
 		return;
 	}
-	if((e = labelsnap(new, t->gen)) != nil){
-		fprint(fd, "snap: label %s: %s\n", new, e);
+	if(t == nil && (t = opensnap(old)) == nil){
+		fprint(fd, "snap: open '%s': does not exist\n", old);
+		unlock(&fs->mountlk);
 		return;
 	}
-	if(u != nil){
-		if((e = unrefsnap(u->gen, -1)) != nil){
-			fprint(fd, "snap: unref %s: %s\n", new, e);
+	if(strlen(new) == 0){
+		if((e = delsnap(t, old)) != nil){
+			fprint(fd, "snap: error deleting '%s': %s\n", new, e);
+			unlock(&fs->mountlk);
 			return;
 		}
+	}else{
+		if((s = opensnap(new)) != nil){
+			fprint(fd, "snap: already exists '%s'\n", new);
+			closesnap(s);
+			unlock(&fs->mountlk);
+			return;
+		}
+		if((e = labelsnap(t, new)) != nil){
+			fprint(fd, "snap: error creating '%s': %s\n", new, e);
+			unlock(&fs->mountlk);
+			return;
+		}
 	}
-	if(u != nil)
-		closesnap(u);
 	closesnap(t);
+	unlock(&fs->mountlk);
 	/* we probably want explicit snapshots to get synced */
 	sync();
-	fprint(fd, "snap taken: %s\n", new);
+	fprint(fd, "created: %s\n", new);
 }
 
 static void
@@ -238,6 +230,23 @@
 	return e;
 }
 
+int
+frozen(Tree *t)
+{
+	return t->nlbl != 1 || t->nsucc != 0;
+}
+
+static char*
+upsert(Mount *mnt, Msg *m, int nm)
+{
+	char *e;
+
+	if(frozen(mnt->root))
+		if((e = updatesnap(&mnt->root, mnt->root, mnt->name)) != nil)
+			return e;
+	return btupsert(mnt->root, m, nm);
+}
+
 /*
  * Clears all blocks in that intersect with
  * the range listed.
@@ -258,7 +267,7 @@
 		PACK64(m.k+9, o);
 		m.v = nil;
 		m.nv = 0;
-		if((e = btupsert(f->mnt->root, &m, 1)) != nil)
+		if((e = upsert(f->mnt, &m, 1)) != nil)
 			return e;
 	}
 	return nil;
@@ -323,7 +332,10 @@
 	PACK64(m->k+9, fb);
 
 
-	b = newblk(Tdat);
+	if(frozen(f->mnt->root))
+		if(updatesnap(&f->mnt->root, f->mnt->root, f->mnt->name) != nil)
+			return -1;
+	b = newblk(f->mnt->root, Tdat);
 	if(b == nil)
 		return -1;
 	t = nil;
@@ -421,7 +433,7 @@
 		mnt = nil;
 		goto Out;
 	}
-	if((t = openlabel(name)) == nil){
+	if((t = opensnap(name)) == nil){
 		werrstr("%s", Enosnap);
 		free(mnt->name);
 		free(mnt);
@@ -428,15 +440,7 @@
 		mnt = nil;
 		goto Out;
 	}
-	mnt->root = newsnap(t);
-	closesnap(t);
-	if(mnt->root == nil){
-		free(mnt->name);
-		free(mnt);
-		mnt = nil;
-		goto Out;
-	}
-
+	mnt->root = t;
 	mnt->next = fs->mounts;
 	fs->mounts = mnt;
 
@@ -1289,7 +1293,7 @@
 		mb[nm].nv = p - opbuf;
 		nm++;
 	}
-	if((e = btupsert(f->mnt->root, mb, nm)) != nil){
+	if((e = upsert(f->mnt, mb, nm)) != nil){
 		rerror(m, e);
 		goto Out;
 	}
@@ -1409,7 +1413,7 @@
 		mb[nm].nv = p - upvbuf;
 		nm++;
 	}
-	if((e = btupsert(f->mnt->root, mb, nm)) != nil){
+	if((e = upsert(f->mnt, mb, nm)) != nil){
 		rerror(m, e);
 		putfid(f);
 		return;
@@ -1504,7 +1508,7 @@
 	mb.k = f->dent->k;
 	mb.nk = f->dent->nk;
 	mb.nv = 0;
-	if((e = btupsert(f->mnt->root, &mb, 1)) != nil)
+	if((e = upsert(f->mnt, &mb, 1)) != nil)
 		goto Error;
 	if(f->dent->qid.type == QTFILE){
 		e = clearb(f, 0, f->dent->length);
@@ -1578,7 +1582,6 @@
 //	if(!fs->rdonly && (m->mode == OEXEC)){
 //		lock(&fs->root.lk);
 //		f->root = fs->root;
-//		refblk(fs->root.bp);
 //		unlock(&fs->root.lk);
 //	}
 	if(m->mode & OTRUNC){
@@ -1597,7 +1600,7 @@
 		mb.v = buf;
 		mb.nv = p - buf;
 		clearb(f, 0, f->dent->length);
-		if((e = btupsert(f->mnt->root, &mb, 1)) != nil){
+		if((e = upsert(f->mnt, &mb, 1)) != nil){
 			wunlock(f->dent);
 			rerror(m, e);
 			putfid(f);
@@ -1934,7 +1937,7 @@
 
 	kv[i].v = sbuf;
 	kv[i].nv = p - sbuf;
-	if((e = btupsert(f->mnt->root, kv, i+1)) != nil){
+	if((e = upsert(f->mnt, kv, i+1)) != nil){
 		rerror(m, e);
 		putfid(f);
 		abort();
@@ -2086,7 +2089,7 @@
 			if(m->a->halt)
 				ainc(&fs->rdonly);
 			for(mnt = fs->mounts; mnt != nil; mnt = mnt->next)
-				updatemount(mnt);
+				updatesnap(&mnt->root, mnt->root, mnt->name);
 			sync();
 			freemsg(m);
 			break;
--- a/load.c
+++ b/load.c
@@ -62,7 +62,6 @@
 	mnt->gen = -1;
 	mnt->root = &fs->snap;
 
-	fs->opensnap = nil;
 	fs->snapmnt = mnt;
 	fs->gotinfo = 0;
 	fs->narena = 1;
@@ -93,13 +92,6 @@
 		if(compresslog(a) == -1)
 			sysfatal("compress log: %r");
 	}
-	for(i = 0; i < Ndead; i++){
-		fs->snap.dead[i].prev = -1;
-		fs->snap.dead[i].head.addr = -1;
-		fs->snap.dead[i].head.hash = -1;
-		fs->snap.dead[i].head.gen = -1;
-		fs->snap.dead[i].ins = nil;
-	}
 
 	fprint(2, "load %s:\n", dev);
 	fprint(2, "\tsnaptree:\t%B\n", fs->snap.bp);
@@ -109,7 +101,7 @@
 	fprint(2, "\tnextgen:\t%lld\n", fs->nextgen);
 	fprint(2, "\tblocksize:\t%lld\n", Blksz);
 	fprint(2, "\tcachesz:\t%lld MiB\n", fs->cmax*Blksz/MiB);
-	if((t = openlabel("main")) == nil)
+	if((t = opensnap("main")) == nil)
 		sysfatal("load users: no main label");
 	if((e = loadusers(2, t)) != nil)
 		sysfatal("load users: %s\n", e);
--- a/main.c
+++ b/main.c
@@ -32,8 +32,15 @@
 	fs->cmax = cachesz/Blksz;
 	if(fs->cmax > (1<<30))
 		sysfatal("cache too big");
-	if((fs->cache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
+	if((fs->bcache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
 		sysfatal("malloc: %r");
+	fs->dlcmax = fs->cmax/10;
+	if(fs->dlcmax < 4)
+		fs->dlcmax = 4;
+	if(fs->dlcmax > 512)
+		fs->dlcmax = 512;
+	if((fs->dlcache = mallocz(fs->dlcmax*sizeof(Dlist*), 1)) == nil)
+		sysfatal("malloc: %r");
 
 	/* leave room for corruption check magic */
 	buf = sbrk(fs->cmax * sizeof(Blk));
@@ -45,7 +52,6 @@
 		b->magic = Magic;
 		lrutop(b);
 	}
-	fs->blks = buf;
 }
 
 static void
@@ -214,9 +220,9 @@
 		fs->arenas[i].sync = &fs->syncq[i%fs->nsyncers];
 	srvfd = postfd(srvname, "");
 	ctlfd = postfd(srvname, ".cmd");
-	launch(runtasks, -1, nil, "tasks");
 	launch(runcons, fs->nworker++, (void*)ctlfd, "ctl");
 	launch(runwrite, fs->nworker++, nil, "mutate");
+	launch(runtasks, -1, nil, "tasks");
 	for(i = 0; i < nproc/2; i++)
 		launch(runread, fs->nworker++, nil, "readio");
 	for(i = 0; i < fs->nsyncers; i++)
--- a/mkfile
+++ b/mkfile
@@ -1,4 +1,5 @@
 </$objtype/mkfile
+</sys/doc/fonts
 
 TARG=gefs
 BIN=/$objtype/bin
--- a/pack.c
+++ b/pack.c
@@ -320,30 +320,129 @@
 {
 	Xdir d;
 
-	if(kv2dir(kv, &d) == -1)
-		return -1;
+	kv2dir(kv, &d);
 	return dir2statbuf(&d, buf, nbuf);
 }
 
-int
+void
 kv2qid(Kvp *kv, Qid *q)
 {
-	char *v, *ev;
-	int err;
+	char *v, *e;
 
-	err = 0;
 	v = kv->v;
-	ev = v + kv->nv;
-	v = unpack64(&err, v, ev, &q->path);
-	v = unpack32(&err, v, ev, &q->vers);
-	unpack8(&err, v, ev, &q->type);
-	if(err){
-		werrstr("kv too small");
-		return -1;
-	}
-	return 0;
+	e = v + kv->nv;
+	q->path = UNPACK64(v);	v += 8;
+	q->vers = UNPACK64(v);	v += 8;
+	assert(v <= e);
+}
+
+void
+kv2dlist(Kvp *kv, Dlist *dl)
+{
+	char *p, *e;
+
+	p = kv->k;
+	e = p + kv->nk;
+	p++;
+	dl->gen = UNPACK64(p);	p += 8;
+	dl->bgen = UNPACK64(p);	p += 8;
+	assert(p <= e);
+	
+	p = kv->v;
+	e = p + kv->nv;
+	dl->hd = unpackbp(p, e-p);	p += Ptrsz;
+	dl->tl = unpackbp(p, e-p);	p += Ptrsz;
+	assert(p <= e);
+}
+
+void
+dlist2kv(Dlist *dl, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *e;
+
+	assert(nbuf >= Dlkvpsz);
+	p = buf;
+	e = buf+nbuf;
+
+	kv->k = p;
+	*p++ = Kdlist;
+	PACK64(p, dl->gen);	p += 8;
+	PACK64(p, dl->bgen);	p += 8;
+	kv->nk = (p - kv->k);
+	
+	kv->v = p;
+	p = packbp(p, e-p, &dl->hd);
+	p = packbp(p, e-p, &dl->tl);
+	kv->nv = (p - kv->v);
+}
+
+void
+tree2kv(Tree *t, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *e;
+
+	p = buf;
+	e = buf+nbuf;
+
+	kv->k = p;
+	if((p = packsnap(p, e-p, t->gen)) == nil)
+		abort();
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	if((p = packtree(p, e-p, t)) == nil)
+		abort();
+	kv->nv = p - kv->v;
+}
+
+void
+link2kv(vlong gen, vlong succ, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+
+	assert(nbuf >= Linksz);
+
+	p = buf;
+	kv->k = p;
+	*p++ = Kslink;
+	PACK64(p, gen);		p += 8;
+	PACK64(p, succ);	p += 8;
+	kv->nk = (p - kv->k);
+	kv->v = p;
+	kv->nv = 0;
+}
+
+void
+kv2link(Kvp *kv, vlong *gen, vlong *succ)
+{
+	char *p;
+
+	assert(kv->nk >= Linksz);
+	assert(kv->nv == 0);
+
+	p = kv->k+1;
+	*gen = UNPACK64(p);	p += 8;
+	*succ = UNPACK64(p);	//p += 8;
 }
 
+void
+lbl2kv(char *lbl, vlong gen, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+
+	assert(nbuf >= strlen(lbl) + 9);
+
+	p = buf;
+	kv->k = p;
+	p = packlabel(buf, nbuf, lbl);
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	if((p = packsnap(p, nbuf-kv->nk, gen)) == nil)
+		abort();
+	kv->nv = p - kv->v;
+}
+
 char*
 packlabel(char *p, int sz, char *name)
 {
@@ -390,27 +489,17 @@
 Tree*
 unpacktree(Tree *t, char *p, int sz)
 {
-	Dlist *dl;
-	Bptr *bp;
-	int i;
-
 	assert(sz >= Treesz);
 	memset(t, 0, sizeof(Tree));
-	t->ref = UNPACK32(p);		p += 4;
+	t->nsucc = UNPACK32(p);		p += 4;
+	t->nlbl = UNPACK32(p);		p += 4;
 	t->ht = UNPACK32(p);		p += 4;
 	t->gen = UNPACK64(p);		p += 8;
-	t->bp.addr = UNPACK64(p);		p += 8;
-	t->bp.hash = UNPACK64(p);		p += 8;
-	t->bp.gen = UNPACK64(p);		p += 8;
-	for(i = 0; i < Ndead; i++){
-		dl = &t->dead[i];
-		bp = &dl->head;
-		dl->prev = UNPACK64(p);	p += 8;
-		bp->addr = UNPACK64(p);	p += 8;
-		bp->hash = UNPACK64(p);	p += 8;
-		bp->gen = -1;
-		t->dead[i].ins	= nil;	/* loaded on demand */
-	}
+	t->prev = UNPACK64(p);		p += 8;
+	t->bp.addr = UNPACK64(p);	p += 8;
+	t->bp.hash = UNPACK64(p);	p += 8;
+	t->bp.gen = UNPACK64(p);	//p += 8;
+
 	return t;
 }
 
@@ -417,28 +506,15 @@
 char*
 packtree(char *p, int sz, Tree *t)
 {
-	Dlist *dl;
-	Bptr bp;
-	int i;
-
 	assert(sz >= Treesz);
-	PACK32(p, t->ref);	p += 4;
+	PACK32(p, t->nsucc);	p += 4;
+	PACK32(p, t->nlbl);	p += 4;
 	PACK32(p, t->ht);	p += 4;
 	PACK64(p, t->gen);	p += 8;
+	PACK64(p, t->prev);	p += 8;
 	PACK64(p, t->bp.addr);	p += 8;
 	PACK64(p, t->bp.hash);	p += 8;
 	PACK64(p, t->bp.gen);	p += 8;
-	for(i = 0; i < Ndead; i++){
-		dl = &t->dead[i];
-		bp = dl->head;
-		if(dl->ins != nil){
-			assert(checkflag(dl->ins, Bfinal));
-			bp = dl->ins->bp;
-		}
-		PACK64(p, dl->prev);	p += 8;
-		PACK64(p, bp.addr);	p += 8;
-		PACK64(p, bp.hash);	p += 8;
-	}
 	return p;
 }
 
@@ -446,7 +522,7 @@
 packarena(char *p, int sz, Arena *a, Fshdr *fi)
 {
 	assert(sz == Blksz);
-	memcpy(p, "gefs0001", 8);	p += 8;
+	memcpy(p, "gefs0002", 8);	p += 8;
 	PACK32(p, Blksz);		p += 4;
 	PACK32(p, Bufspc);		p += 4;
 	PACK32(p, fi->snap.ht);		p += 4;
@@ -469,7 +545,7 @@
 	assert(sz == Blksz);
 	memset(a, 0, sizeof(*a));
 	memset(fi, 0, sizeof(*fi));
-	if(memcmp(p, "gefs0001", 8) != 0){
+	if(memcmp(p, "gefs0002", 8) != 0){
 		werrstr("wrong block header %.8s\n", p);
 		return nil;
 	}
--- a/ream.c
+++ b/ream.c
@@ -46,8 +46,14 @@
 	char *p, kbuf[Keymax], vbuf[Treesz];
 	Tree t;
 	Kvp kv;
-	int i;
 
+	p = packlabel(kbuf, sizeof(kbuf), "empty");
+	kv.k = kbuf;
+	kv.nk = p - kbuf;
+	p = packsnap(vbuf, sizeof(vbuf), 0);
+	kv.v = vbuf;
+	kv.nv = p - vbuf;
+	setval(s, &kv);
 	p = packlabel(kbuf, sizeof(kbuf), "main");
 	kv.k = kbuf;
 	kv.nk = p - kbuf;
@@ -61,17 +67,12 @@
 	kv.nk = p - kbuf;
 
 	memset(&t, 0, sizeof(Tree));
-	t.ref = 2;
+	t.nsucc = 0;
+	t.nlbl = 2;
 	t.ht = 1;
 	t.gen = fs->nextgen++;
+	t.prev = -1ULL;
 	t.bp = r->bp;
-	for(i = 0; i < Ndead; i++){
-		t.dead[i].prev = -1;
-		t.dead[i].head.addr = -1;
-		t.dead[i].head.hash = -1;
-		t.dead[i].head.gen = -1;
-		t.dead[i].ins = nil;
-	}
 	p = packtree(vbuf, sizeof(vbuf), &t);
 	kv.v = vbuf;
 	kv.nv = p - vbuf;
@@ -173,12 +174,6 @@
 		initarena(&fs->arenas[i], fs, off, asz);
 		off += asz;
 	}
-	for(i = 0; i < Ndead; i++){
-		fs->snap.dead[i].prev = -1;
-		fs->snap.dead[i].head.addr = -1;
-		fs->snap.dead[i].head.hash = -1;
-		fs->snap.dead[i].head.gen = -1;
-	}
 	
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
@@ -189,7 +184,7 @@
 		if(compresslog(a) == -1)
 			sysfatal("compress log: %r");
 	}
-	if((tb = newblk(Tleaf)) == nil)
+	if((tb = newblk(mnt->root, Tleaf)) == nil)
 		sysfatal("ream: allocate root: %r");
 	holdblk(tb);
 	initroot(tb);
@@ -204,7 +199,7 @@
 	 * a single snap block that the tree will insert
 	 * into, and take a snapshot as the initial state.
 	 */
-	if((rb = newblk(Tleaf)) == nil)
+	if((rb = newblk(mnt->root, Tleaf)) == nil)
 		sysfatal("ream: allocate snaps: %r");
 	holdblk(rb);
 	initsnap(rb, tb);
--- a/snap.c
+++ b/snap.c
@@ -7,416 +7,552 @@
 #include "fns.h"
 #include "atomic.h"
 
-int
-scandead(Dlist *l, int lblk, void (*fn)(Bptr, void*), void *dat)
+static char*
+dlsync(Dlist *dl)
 {
-	char *p;
-	int i, op;
-	Dlist t;
-	Bptr bp;
-	Blk *b;
+	char kvbuf[512];
+	Msg m;
 
-	if(l->ins != nil)
-		b = l->ins;
-	else if(l->head.addr != -1)
-		b = getblk(l->head, 0);
-	else
-		return 0;
-	if(b == nil)
-		return -1;
-Nextblk:
-	for(i = Loghashsz; i < Logspc; i += 16){
-		p = b->data + i;
-		op = UNPACK64(p) & 0xff;
-		switch(op){
-		case DlEnd:
-			return 0;
-		case DlChain:
-			bp.addr = UNPACK64(p);	p += 8;
-			bp.addr &= ~0xffULL;
-			bp.hash = UNPACK64(p);
-			bp.gen = -1;
-			if(lblk)
-				fn(b->bp, dat);
-			if((b = getblk(bp, 0)) == nil)
-				return -1;
-			goto Nextblk;
-		case DlGraft:
-			t.head.addr = UNPACK64(p);	p += 8;
-			t.head.addr &= ~0xffULL;
-			t.head.hash = UNPACK64(p);
-			t.head.gen = -1;
-			t.ins = nil;
-			scandead(&t, lblk, fn, dat);
-			break;
-		case DlKill:
-			bp.addr = UNPACK64(p);	p += 8;
-			bp.hash = -1;
-			bp.gen = UNPACK64(p);
-			bp.addr &= ~0xffULL;
-			fn(bp, dat);
-			break;
-		default:
-			fprint(2, "bad op=%d\n", op);
-			abort();
-		}
-	}
-	return 0;
+	if(dl->ins == nil)
+		return nil;
+	if(checkflag(dl->ins, Bdirty))
+		enqueue(dl->ins);
+
+	dl->hd = dl->ins->bp;
+	if(dl->hd.addr == dl->tl.addr)
+		dl->tl = dl->hd;
+	m.op = Oinsert;
+	dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+	return btupsert(&fs->snap, &m, 1);
 }
 
-/*
- * Insert into a deadlist. Because the only
- * operations are chaining deadlist pages
- * and killing blocks, we don't preserve the
- * order of kills.
- */
-static int
-dlinsert(Dlist *dl, vlong v1, vlong v2)
+static void
+dlcachedel(Dlist *dl, int hdel)
 {
-	Blk *lb, *pb;
-	vlong end, hash;
-	char *p;
+	uint h;
+	Dlist *d, **p;
 
-	lb = dl->ins;
-	if(lb == nil && dl->head.addr != -1)
-		lb = getblk(dl->head, 0);
-	/*
-	 * move to the next block when we have
-	 * 32 bytes in the log:
-	 * We're appending up to 16 bytes as
-	 * part of the operation, followed by
-	 * 16 bytes of chaining.
-	 */
-	if(lb == nil || lb->logsz >= Logspc - 40){
-		pb = lb;
-		if((lb = newblk(Tdead)) == nil)
-			return -1;
-		if(pb != nil){
-			finalize(pb);
-			if(syncblk(pb) == -1)
-				return -1;
-			dl->head = pb->bp;
+	h = ihash(dl->gen) ^ ihash(dl->bgen);
+	if(hdel){
+		p = &fs->dlcache[h % fs->dlcmax];
+		for(d = *p; d != nil; d = d->chain){
+			if(d->gen == dl->gen && d->bgen == dl->bgen)
+				break;
+			p = &d->chain;
 		}
-		lb->logsz = Loghashsz;
-		dl->ins = lb;
-		dropblk(pb);
+		if(d != nil)
+			*p  = d->chain;
 	}
-	p = lb->data + lb->logsz;
-	PACK64(p, v1);	p += 8;
-	PACK64(p, v2);	p += 8;
-	if(dl->head.addr == -1){
-		end = DlEnd;
-		hash = -1;
-	}else{
-		end = dl->head.addr|DlChain;
-		hash = dl->head.hash;
-	}
-	PACK64(p+0, end);
-	PACK64(p+8, hash);
-	lb->logsz = (p - lb->data);
-	return 0;
+	if(dl == fs->dlhead)
+		fs->dlhead = dl->cnext;
+	if(dl == fs->dltail)
+		fs->dltail = dl->cprev;
+	if(dl->cnext != nil)
+		dl->cnext->cprev = dl->cprev;
+	if(dl->cprev != nil)
+		dl->cprev->cnext = dl->cnext;
+	dl->cnext = nil;
+	dl->cprev = nil;
 }
 
-static int
-graft(Dlist *dst, Dlist *src)
+static Dlist*
+dlcacheget(vlong gen, vlong bgen)
 {
-	if(src->ins != nil){
-		finalize(src->ins);
-		if(syncblk(src->ins) == -1)
-			return -1;
-		src->head = src->ins->bp;
-		src->ins = nil;
-	}
-	if(src->head.addr == -1)
-		return 0;
-	return dlinsert(dst, src->head.addr|DlGraft, src->head.hash);
-}
-
-int
-killblk(Tree *t, Bptr bp)
-{
 	Dlist *dl;
-	int i;
+	uint h;
 
-	dl = &t->dead[0];
-	for(i = 0; i < Ndead; i++){
-		if(t->dead[i].prev <= bp.gen)
+	h = ihash(gen) ^ ihash(bgen);
+	for(dl = fs->dlcache[h % nelem(fs->dlcache)]; dl != nil; dl = dl->chain)
+		if(dl->gen == gen && dl->bgen == bgen)
 			break;
-		dl = &t->dead[i];
-	}
-	dlinsert(dl, bp.addr|DlKill, bp.gen);
-	return 0;
+	if(dl != nil)
+		dlcachedel(dl, 0);
+	return dl;
 }
 
-Tree*
-openlabel(char *name)
+static Dlist*
+getdl(vlong gen, vlong bgen)
 {
-	char *p, buf[Kvmax];
+	char kbuf[Dlksz], kvbuf[Dlkvpsz];
+	Dlist *dl, **p;
+	uint h;
+	Msg m;
 	Kvp kv;
 	Key k;
 
-	if((p = packlabel(buf, sizeof(buf), name)) == nil)
+	if((dl = dlcacheget(gen, bgen)) != nil)
+		return dl;
+	if((dl = mallocz(sizeof(Dlist), 1)) == nil)
 		return nil;
-	k.k = buf;
-	k.nk = p - buf;
-	if(btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)) != nil)
-		return nil;
-	if(kv.nv != Snapsz)
-		return nil;
-	return opensnap(UNPACK64(kv.v + 1));
-}
+	kbuf[0] = Kdlist;
+	PACK64(kbuf+1, gen);
+	PACK64(kbuf+9, bgen);
+	k.k = kbuf;
+	k.nk = sizeof(kbuf);
 
-Tree*
-opensnap(vlong id)
-{
-	char *p, buf[Kvmax];
-	Tree *t;
-	Kvp kv;
-	Key k;
-
-	qlock(&fs->snaplk);
-	for(t = fs->opensnap; t != nil; t = t->snext){
-		if(t->gen == id){
-			ainc(&t->memref);
-			qunlock(&fs->snaplk);
-			return t;
+	/* load up existing dlist */
+	if(btlookup(&fs->snap, &k, &kv, kvbuf, sizeof(kvbuf)) == nil){
+		kv2dlist(&kv, dl);
+		if(dl->hd.addr != -1 && (dl->ins = getblk(dl->hd, 0)) == nil){
+			free(dl);
+			return nil;
 		}
+		goto Found;
 	}
-	if((t = mallocz(sizeof(Tree), 1)) == nil)
-		goto Error;
-	memset(&t->lk, 0, sizeof(t->lk));
 
-	if((p = packsnap(buf, sizeof(buf), id)) == nil)
-		goto Error;
-	k.k = buf;
-	k.nk = p - buf;
-	if(btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)) != nil)
-		goto Error;
-	if(unpacktree(t, kv.v, kv.nv) == nil)
-		goto Error;
-	t->memref = 1;
-	t->snext = fs->opensnap;
-	fs->opensnap = t;
-	qunlock(&fs->snaplk);
-	return t;
+	/* create a new one if it didn't exist */
+	dl->gen = gen;
+	dl->bgen = bgen;
+	dl->hd.addr = -1;
+	dl->tl.addr = -1;
+	dl->ins = nil;
 
-Error:
-	qunlock(&fs->snaplk);
-	return nil;
+	m.op = Oinsert;
+	dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+	if(btupsert(&fs->snap, &m, 1) != nil){
+		free(dl);
+		return nil;
+	}
+Found:
+	h = ihash(gen) ^ ihash(bgen);
+	p = &fs->dlcache[h % nelem(fs->dlcache)];
+	dl->chain = *p;
+	*p = dl;
+	return dl;
 }
 
 void
-closesnap(Tree *t)
+putdl(Dlist *dl)
 {
-	Tree *te, **p;
-	Blk *ins;
-	int i;
+	Dlist *dt;
 
-	if(adec(&t->memref) != 0)
-		return;
+	dlcachedel(dl, 0);
+	dl->cprev = nil;
+	dl->cnext = fs->dlhead;
+	fs->dlhead = dl;
 
-	for(i = Ndead-1; i >= 0; i--){
-		ins = t->dead[i].ins;
-		if(ins == nil)
-			continue;
-		finalize(ins);
-		syncblk(ins);
-		t->dead[i].head = ins->bp;
-//FIXME: 	putblk(ins);
+	while(fs->ctail != nil && fs->dlcount >= fs->dlcmax){
+		dt = fs->dltail;
+		dlsync(dt);
+		dlcachedel(dt, 1);
+		dropblk(dt->ins);
+		free(dt);
 	}
-
-	p = &fs->opensnap;
-	for(te = fs->opensnap; te != nil; te = te->snext){
-		if(te == t)
-			break;
-		p = &te->snext;
-	}
-	assert(te != nil);
-	*p = te->snext;
-	free(te);
 }
 
 static char*
-modifysnap(int op, Tree *t)
+freedl(Dlist *dl, int docontents)
 {
-	char kbuf[Snapsz], vbuf[Treesz];
-	char *p, *e;
-	Msg m;
-	int i;
+	Bptr bp;
+	Blk *b;
+	char *p;
 
-	for(i = 0; i < Ndead; i++){
-		if(t->dead[i].ins != nil){
-			finalize(t->dead[i].ins);
-			syncblk(t->dead[i].ins);
+	bp = dl->hd;
+	while(bp.addr != -1){
+		/* ugly: when we merge deadlists, we change their hash */
+		if((b = getblk(bp, GBnochk)) == nil)
+			return Efs;
+		if(docontents){
+			for(p = b->data; p != b->data+b->deadsz; p += 8){
+				if(blkdealloc(UNPACK64(p)) == -1){
+					dropblk(b);
+					return Efs;
+				}
+			}
 		}
+		bp = b->deadp;
+		freeblk(&fs->snap, b);
+		dropblk(b);
 	}
-	m.op = op;
-	if((p = packsnap(kbuf, sizeof(kbuf), t->gen)) == nil)
-		return Elength;
-	m.k = kbuf;
-	m.nk = p - kbuf;
-	if(op == Oinsert){
-		if((p = packtree(vbuf, sizeof(vbuf), t)) == nil)
-			return Elength;
-		m.v = vbuf;
-		m.nv = p - vbuf;
-	}else{
-		m.v = nil;
-		m.nv = 0;
-	}
-	if((e = btupsert(&fs->snap, &m, 1)) != nil)
-		return e;
 	return nil;
 }
 
 static char*
-modifylabel(int op, char *name, vlong id)
+mergedl(vlong succ, vlong gen, vlong bgen)
 {
-	char *p, *e, kbuf[Keymax], vbuf[Snapsz];
-	Msg m;
+	char *err, buf[2][Kvmax];
+	Dlist *d, *m;
+	Msg msg[2];
+	Blk *b;
+//	vlong x;
 
-	if(strcmp(name, "dump") == 0)
-		return Ename;
-	if(op == Oinsert)
-		e = refsnap(id);
-	else
-		e = unrefsnap(id, -1);
-	if(e != nil)
-		return e;
+	err = nil;
+	d = getdl(gen, bgen);
+	m = getdl(succ, bgen);
+	if(d == nil || m == nil){
+		err = Efs;
+		goto Out;
+	}
+	if(m->ins == nil){
+		m->hd = d->hd;
+		m->tl = d->tl;
+		m->ins = d->ins;
+	}else{
+		m->hd = d->hd;
+		/* ugly: when we merge deadlists, we change their hash */
+		if((b = getblk(d->tl, GBnochk)) == nil)
+			goto Out;
+		msg[0].op = Odelete;
+		dlist2kv(d, &msg[0], buf[0], sizeof(buf[0]));
+		msg[1].op = Oinsert;
+		dlist2kv(m, &msg[1], buf[1], sizeof(buf[1]));
+		if((err = btupsert(&fs->snap, msg, 2)) != nil)
+			sysfatal("merge deadlists: %s", err);
 
-	m.op = op;
-	if((p = packlabel(kbuf, sizeof(kbuf), name)) == nil)
-		return Elength;
-	m.k = kbuf;
-	m.nk = p - kbuf;
-	if((p = packsnap(vbuf, sizeof(vbuf), id)) == nil)
-		return Elength;
-	m.v = vbuf;
-	m.nv = p - vbuf;
+		packbp(b->buf+2, Blksz, &m->hd);
+		enqueue(b);
+		dropblk(b);
 
-	if((e = btupsert(&fs->snap, &m, 1)) != nil)
-		return e;
-	return nil;
+// TODO: merge
+//		if(m->ins->deadsz + d->ins->deadsz < Dlspc){
+//			p = d->ins->data;
+//			q = m->ins->data + m->ins->deadsz;
+//			for(i = 0; i < d->ins->deadsz; i += 8){
+//				m->ins->deadsz += 8;
+//				x = UNPACK64(p);
+//				PACK64(q, x);
+//				p += 8;
+//				q += 8;
+//			}
+//		}
+	}
+Out:
+	if(d != nil)
+		putdl(d);
+	if(m != nil)
+		putdl(m);
+	return err;
 }
 
-char*
-labelsnap(char *name, vlong gen)
+static vlong
+successor(vlong gen)
 {
-	return modifylabel(Oinsert, name, gen);
-}
+	char *e, pfx[9];
+	vlong id, succ;
+	Scan s;
 
-char*
-unlabelsnap(vlong gen, char *name)
-{
-	return modifylabel(Odelete, name, gen);
+	pfx[0] = Kslink;
+	succ = -1;
+	PACK64(pfx+1, gen);
+	btnewscan(&s, pfx, sizeof(pfx));
+	if((e = btenter(&fs->snap, &s)) != nil)
+		goto Err;
+	if((e = btnext(&fs->snap, &s, &s.kv)) != nil)
+		goto Err;
+	if(!s.done)
+		kv2link(&s.kv, &id, &succ);
+	if((e = btnext(&fs->snap, &s, &s.kv)) != nil)
+		goto Err;
+	if(!s.done)
+		sysfatal("multiple successors in cleanup");
+	btexit(&fs->snap, &s);
+	return succ;
+Err:
+	fprint(2, "error getting snap successor: %s", e);
+	btexit(&fs->snap, &s);
+	return -1;
 }
 
-char*
-refsnap(vlong id)
+static char*
+reclaimblocks(vlong gen, vlong prev)
 {
-	Tree *t;
-	char *e;
+	char *e, pfx[9];
+	vlong succ;
+	Dlist dl;
+	Scan s;
+	Msg m;
 
-	t = opensnap(id);
-	t->ref++;
-	if((e = modifysnap(Oinsert, t)) != nil)
+	succ = successor(gen);
+
+	pfx[0] = Kdlist;
+	if(succ == -1)
+		PACK64(pfx+1, gen);
+	else
+		PACK64(pfx+1, succ);
+	btnewscan(&s, pfx, sizeof(pfx));
+	if((e = btenter(&fs->snap, &s)) != nil)
 		return e;
-	closesnap(t);
-	return nil;
+	while(1){
+		if((e = btnext(&fs->snap, &s, &s.kv)) != nil)
+			break;
+		if(s.done)
+			break;
+		kv2dlist(&s.kv, &dl);
+
+		/*
+		 * delete the dlist before processing it; it's 
+		 * better to leak blocks than to double free.
+		 */
+		m.op = Odelete;
+		m.Kvp = s.kv;
+		if((e = btupsert(&fs->snap, &m, 1)) != nil)
+			break;
+		if(succ == -1)
+			e = freedl(&dl, 1);
+		else if(dl.bgen > prev)
+			e = freedl(&dl, 0);
+		else
+			e = mergedl(succ, dl.gen, dl.bgen);
+		if(e != nil)
+			break;
+	}
+	btexit(&fs->snap, &s);
+	return e;
 }
 
+/*
+ * Removes a label from a snapshot, allowing
+ * it to be reclaimed if it is not a direct
+ * predecessor of more than one other snapshot.
+ *
+ * If it has one successor and no label, then
+ * it will be merged with that successor.
+ */
 char*
-unrefsnap(vlong id, vlong succ)
+delsnap(Tree *t, char *name)
 {
-	Tree *t, *u;
-	char *e;
+	char buf[2][Kvmax], *e;
+	Msg m[2];
+	int nm;
 
-	if((t = opensnap(id)) == nil)
-		return Eexist;
-	if(--t->ref == 0){
-		if((u = opensnap(succ)) == nil)
-			return Eexist;
-		if((e = freesnap(t, u)) != nil)
-			return e;
-		if((e = modifysnap(Odelete, t)) != nil)
-			return e;
-		if((e = modifysnap(Oinsert, u)) != nil)
-			return e;
-		closesnap(u);
-		closesnap(t);
+	if(strcmp(name, "dump") == 0 || strcmp(name, "empty") == 0)
+		return Ename;
+
+	nm = 0;
+	m[nm].op = Odelete;
+	m[nm].k = buf[nm];
+	if((e = packlabel(buf[nm], sizeof(buf[nm]), name)) == nil)
+		return Ename;
+	m[nm].nk = e - m[nm].k;
+	m[nm].v = nil;
+	m[nm].nv = 0;
+	nm++;
+ 
+	t->nlbl--;
+	if(t->nlbl == 0 && t->nsucc <= 1){
+		m[nm].op = Odelete;
+		m[nm].k = buf[nm];
+		if((e = packsnap(buf[nm], sizeof(buf[nm]), t->gen)) == nil)
+			return Ename;
+		m[nm].nk = e - m[nm].k;
+		m[nm].v = nil;
+		m[nm].nv = 0;
+		nm++;
 	}else{
-		if((e = modifysnap(Oinsert, t)) != nil)
-			return e;
-		closesnap(t);
+		m[nm].op = Oinsert;
+		tree2kv(t, &m[nm], buf[nm], sizeof(buf[nm]));
+		nm++;
 	}
+	if((e = flushdlcache(1)) != nil)
+		return e;
+	if((e = btupsert(&fs->snap, m, nm)) != nil)
+		return e;
+	if((e = reclaimblocks(t->gen, t->prev)) != nil)
+		return e;
 	return nil;
 }
 
-static void
-freedead(Bptr bp, void *)
+/*
+ * Attaches a label to a tree, incrementing
+ * its reference count. This labelled snapshot
+ * will show up in the dump.
+ */
+char*
+labelsnap(Tree *t, char *name)
 {
-	freebp(nil, bp);
-}
+	char buf[2][Kvmax];
+	Msg m[2];
 
-static void
-redeadlist(Bptr bp, void *pt)
-{
-	killblk(pt, bp);
+	if(strcmp(name, "dump") == 0 || strcmp(name, "empty") == 0)
+		return Ename;
+	t->nlbl++;
+	m[0].op = Oinsert;
+	tree2kv(t, &m[0], buf[0], sizeof(buf[0]));
+	m[1].op = Oinsert;
+	lbl2kv(name, t->gen, &m[1], buf[1], sizeof(buf[1]));
+
+	return btupsert(&fs->snap, m, 2);
 }
 
+/*
+ * Updates a snapshot; keeps the generation the same if possible,
+ * otherwise moves to a new generation. A snapshot may only stay
+ * at the same generation as long as it is at the tip of a snapshot
+ * list; once it's observable by a derived snapshot it must be
+ * immutable.
+ */
 char*
-freesnap(Tree *snap, Tree *next)
+updatesnap(Tree **r, Tree *o, char *lbl)
 {
-	Dlist dl;
-	int i;
+	char buf[4][Kvmax], *e;
+	Msg m[4];
+	Tree *t;
 
-	assert(snap->gen != next->gen);
-	assert(next->dead[0].prev == snap->gen);
+	if(!o->dirty)
+		return nil;
+	/* this snap can be modified in place, so do that */
+	if(o->nlbl == 1 && o->nsucc == 0){
+		m[0].op = Oinsert;
+		tree2kv(o, &m[0], buf[0], sizeof(buf[0]));
+		if((e = btupsert(&fs->snap, &m[0], 1)) != nil)
+			return e;
+		o->dirty = 0;
+		return nil;
+	}
 
-	dl = next->dead[Ndead-1];
-	scandead(&next->dead[0], 1, freedead, nil);
-	for(i = 0; i < Ndead-2; i++){
-		if(graft(&snap->dead[i], &next->dead[i+1]) == -1)
-			return Efs;
-		next->dead[i] = snap->dead[i];
+	/* update the old kvp */
+	o->nsucc++;
+	o->nlbl--;
+	m[0].op = Oinsert;
+	tree2kv(o, &m[0], buf[0], sizeof(buf[0]));
+
+	/* create the new one */
+	if((t = mallocz(sizeof(Tree), 1)) == nil)
+		return Enomem;
+	t->memref = 1;
+	t->dirty = 0;
+
+	t->nlbl = 1;
+	t->nsucc = 0;
+	t->ht = o->ht;
+	t->bp = o->bp;
+	t->prev = o->gen;
+	t->gen = aincv(&fs->nextgen, 1);
+
+	m[1].op = Oinsert;
+	tree2kv(t, &m[1], buf[1], sizeof(buf[1]));
+	m[2].op = Oinsert;
+	link2kv(t->prev, t->gen, &m[2], buf[2], sizeof(buf[2]));
+	m[3].op = Oinsert;
+	lbl2kv(lbl, t->gen, &m[3], buf[3], sizeof(buf[3]));
+	if((e = btupsert(&fs->snap, m, 4)) != nil){
+		free(t);
+		return e;
 	}
-	for(; i < Ndead; i++)
-		next->dead[i] = snap->dead[i];
-	scandead(&dl, 0, redeadlist, next);
 
+	/* only update the dirty status after we sync */
+	o->dirty = 0;
+	closesnap(o);
+	*r = t;
 	return nil;
 }
 
+/*
+ * open snapshot by label, returning a tree.
+ */
 Tree*
-newsnap(Tree *t)
+opensnap(char *label)
 {
+	char *p, buf[Kvmax];
+	Tree *t;
 	vlong gen;
-	Tree *r;
-	int i;
+	Kvp kv;
+	Key k;
 
-	if(t->dirty && modifysnap(Oinsert, t) != nil)
+	/* Klabel{"name"} => Ksnap{id} */
+	if((p = packlabel(buf, sizeof(buf), label)) == nil)
 		return nil;
-
-	if((r = calloc(sizeof(Tree), 1)) == nil)
+	k.k = buf;
+	k.nk = p - buf;
+	if(btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)) != nil)
 		return nil;
-	gen = aincv(&fs->nextgen, 1);
-	memset(&r->lk, 0, sizeof(r->lk));
-	r->snext = fs->opensnap;
-	r->memref = 1;
-	r->ref = 0;
-	r->ht = t->ht;
-	r->bp = t->bp;
-	r->gen = gen;
-	r->dirty = 0;
-	/* shift deadlist down */
-	for(i = Ndead-1; i >= 0; i--){
-		r->dead[i].prev = (i == 0) ? t->gen : t->dead[i-1].prev;
-		r->dead[i].head.addr = -1;
-		r->dead[i].head.hash = -1;
-		r->dead[i].head.gen = -1;
-		r->dead[i].ins = nil;
+	if(kv.nv != Snapsz)
+		abort();
+	gen = UNPACK64(kv.v + 1);
+
+	if((t = mallocz(sizeof(Tree), 1)) == nil)
+		goto Error;
+	if((p = packsnap(buf, sizeof(buf), gen)) == nil)
+		goto Error;
+	k.k = buf;
+	k.nk = p - buf;
+	if(btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)) != nil)
+		abort();
+	if(unpacktree(t, kv.v, kv.nv) == nil)
+		abort();
+	ainc(&t->memref);
+	return t;
+Error:
+	free(t);
+	return nil;
+}
+
+/*
+ * close snapshot, flushing and freeing in-memory
+ * representation.
+ */
+void
+closesnap(Tree *t)
+{
+	if(t == nil || adec(&t->memref) != 0)
+		return;
+	assert(!t->dirty);
+	free(t);
+}
+
+char*
+flushdlcache(int clear)
+{
+	Dlist *dl, *n;
+	char *e;
+
+	for(dl = fs->dlhead; dl != nil; dl = n){
+		n = dl->cnext;
+		if((e = dlsync(dl)) != nil)
+			return e;
+		if(clear){
+			if(dl->ins != nil)
+				dropblk(dl->ins);
+			free(dl);
+		}
 	}
-	fs->opensnap = r;
+	if(clear){
+		fs->dlhead = nil;
+		fs->dltail = nil;
+		memset(fs->dlcache, 0, fs->dlcmax*sizeof(Dlist*));
+	}
+	return nil;
+}
 
-	return r;
+/*
+ * Marks a block as killed by the tree
+ * t, which means that it will be free
+ * for use after t is reclaimed.
+ *
+ * t must be an active snapshot with
+ * no successors.
+ */
+int
+killblk(Tree *t, Bptr bp)
+{
+	Dlist *dl;
+	Blk *b;
+	char *p;
+
+	if((dl = getdl(t->gen, bp.gen)) == nil)
+		return -1;
+	if(dl->ins == nil || dl->ins->deadsz == Dlspc){
+		if((b = newblk(&fs->snap, Tdlist)) == nil){
+			putdl(dl);
+			return -1;
+		}
+		if(dl->ins != nil){
+			enqueue(dl->ins);
+			/* enqueuing updates the hashes */
+			if(dl->ins->bp.addr == dl->tl.addr)
+				dl->tl = dl->ins->bp;
+			b->deadp = dl->ins->bp;
+		}else{
+			dl->tl = b->bp;
+			b->deadp = (Bptr){-1, -1, -1};
+		}
+		dl->hd = b->bp;
+		dl->ins = b;
+	}
+	p = dl->ins->data + dl->ins->deadsz;
+	dl->ins->flag |=  Bdirty;
+	dl->ins->deadsz += 8;
+	PACK64(p, bp.addr);
+	putdl(dl);
+	return 0;
 }
--- a/tree.c
+++ b/tree.c
@@ -463,7 +463,7 @@
 	 */
 	full = 0;
 	spc = Leafspc - blkfill(b);
-	if((n = newblk(b->type)) == nil)
+	if((n = newblk(t, b->type)) == nil)
 		return -1;
 	while(i < b->nval){
 		ok = 1;
@@ -479,7 +479,7 @@
 			 * an insertion, mutations come after.
 			 */
 			if(m.op != Oinsert){
-				print("%d(/%d), %d: %M not insert\n", i, b->nval, j, &m);
+				fprint(2, "%d(/%d), %d: %M not insert\n", i, b->nval, j, &m);
 				abort();
 			}
 			cpkvp(&v, &m, buf, sizeof(buf));
@@ -513,7 +513,7 @@
 		cpkvp(&v, &m, buf, sizeof(buf));
 		p->pullsz += msgsz(&m);
 		if(m.op != Oinsert){
-			print("%d(/%d), %d: %M not insert\n", i, b->nval, j, &m);
+			fprint(2, "%d(/%d), %d: %M not insert\n", i, b->nval, j, &m);
 			showblk(2, up->b, "parent", 0);
 			showblk(2, p->b, "current", 0);
 			abort();
@@ -540,7 +540,7 @@
  * When pidx != -1, 
  */
 static int
-updatepiv(Tree *, Path *up, Path *p, Path *pp)
+updatepiv(Tree *t, Path *up, Path *p, Path *pp)
 {
 	char buf[Msgmax];
 	int i, j, sz, full, spc;
@@ -548,7 +548,7 @@
 	Msg m, u;
 
 	b = p->b;
-	if((n = newblk(b->type)) == nil)
+	if((n = newblk(t, b->type)) == nil)
 		return -1;
 	for(i = 0; i < b->nval; i++){
 		if(pp != nil && i == p->midx){
@@ -626,8 +626,8 @@
 	 * so we want to make a new block.
 	 */
 	b = p->b;
-	l = newblk(b->type);
-	r = newblk(b->type);
+	l = newblk(t, b->type);
+	r = newblk(t, b->type);
 	if(l == nil || r == nil){
 		freeblk(t, l);
 		freeblk(t, r);
@@ -672,7 +672,7 @@
 			 * an insertion, mutations come after.
 			 */
 			if(m.op != Oinsert){
-				print("%d(/%d), %d: %M not insert\n", i, b->nval, j, &m);
+				fprint(2, "%d(/%d), %d: %M not insert\n", i, b->nval, j, &m);
 				abort();
 			}
 			cpkvp(&v, &m, buf, sizeof(buf));
@@ -723,8 +723,8 @@
 	 * so we want to make a new bp->lock.
 	 */
 	b = p->b;
-	l = newblk(b->type);
-	r = newblk(b->type);
+	l = newblk(t, b->type);
+	r = newblk(t, b->type);
 	if(l == nil || r == nil){
 		freeblk(t, l);
 		freeblk(t, r);
@@ -773,13 +773,13 @@
 }
 
 static int
-merge(Path *p, Path *pp, int idx, Blk *a, Blk *b)
+merge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
 {
 	Blk *d;
 	Msg m;
 	int i;
 
-	if((d = newblk(a->type)) == nil)
+	if((d = newblk(t, a->type)) == nil)
 		return -1;
 	for(i = 0; i < a->nval; i++){
 		getval(a, i, &m);
@@ -858,8 +858,8 @@
 	Blk *d, *l, *r;
 	Msg m;
 
-	l = newblk(a->type);
-	r = newblk(a->type);
+	l = newblk(t, a->type);
+	r = newblk(t, a->type);
 	if(l == nil || r == nil){
 		freeblk(t, l);
 		freeblk(t, r);
@@ -939,7 +939,7 @@
 		imbalance *= -1;
 	/* works for leaf, because 0 always < Bufspc */
 	if(na + nb < (Pivspc - 4*Msgmax) && ma + mb < Bufspc)
-		return merge(p, pp, idx, a, b);
+		return merge(t, p, pp, idx, a, b);
 	else if(imbalance > 4*Msgmax)
 		return rotate(t, p, pp, idx, a, b, (na + nb)/2);
 	else
@@ -1056,7 +1056,7 @@
 	}
 	if(pp->nl != nil && pp->nr != nil){
 		rp = &path[0];
-		rp->nl = newblk(Tpivot);
+		rp->nl = newblk(t, Tpivot);
 		if(rp->nl == nil)
 			goto Error;
 		rp->npull = pp->npull;
@@ -1140,7 +1140,7 @@
 	char *p;
 	Blk *r;
 
-	if((r = dupblk(b)) == nil)
+	if((r = dupblk(t, b)) == nil)
 		return Enomem;
 	
 	nbuf = r->nbuf;
@@ -1336,7 +1336,7 @@
 		if(!(ok || m.op == Oinsert)){
 			fprint(2, "lookup %K << %M missing insert\n", k, &m);
 			for(int j = 0; j < h; j++){
-				print("busted %d\n",j);
+				fprint(2, "busted %d\n",j);
 				if(p[j] != nil)
 					showblk(2, p[j], "busted insert", 0);
 			}