shithub: gefs

Download patch

ref: 2a941967c8f06ee6f6eb63bf5399c0d2ba4b89df
parent: f3b4a20db85543154fdde5b8a6b88bb10bbfb5a5
author: Ori Bernstein <ori@eigenstate.org>
date: Wed Dec 4 11:12:28 EST 2024

sync

--- a/atomic-386.s
+++ b/atomic-386.s
@@ -1,12 +1,3 @@
-#define CMPXCHG	/* (CX) */\
-	BYTE $0x0F; BYTE $0xB1; BYTE $0x11
-#define CMPXCHG64 /* (DI) */\
-	BYTE $0x0F; BYTE $0xC7; BYTE $0x0F
-#define XADDL /* BX, (AX) */ \
-	BYTE $0x0F; BYTE $0xC1; BYTE $0x03
-#define XADDLSP /* AX, (SP) */ \
-	BYTE $0x0F; BYTE $0xC1; BYTE $0x04; BYTE $0x24
-
 /*  get variants */
 TEXT ageti+0(SB),1,$0
 TEXT agetl+0(SB),1,$0
@@ -38,7 +29,7 @@
 	MOVL	0(DI), AX
 	MOVL	4(DI), DX
 loop:
-	LOCK;	CMPXCHG64
+	LOCK;	CMPXCHG8B (DI)
         JNE     loop
 	MOVL	p+0(FP),DI
 	MOVL	AX, 0(DI)
@@ -52,7 +43,7 @@
 	MOVL	p+0(FP), BX
 	MOVL	v+4(FP), CX
 	MOVL	CX, AX
-	LOCK; XADDL
+	LOCK; XADDL AX, (BX)
 	ADDL	CX, AX
 	RET
 
@@ -65,7 +56,7 @@
 	MOVL	DX, CX
 	ADDL	v+8(FP), BX
 	ADCL	v+12(FP), CX
-	LOCK; CMPXCHG64
+	LOCK; CMPXCHG8B (DI)
 	JNE	retry
 	MOVL	r+0(FP), DI
 	MOVL	BX, 0x0(DI)
@@ -79,7 +70,7 @@
 	MOVL	p+0(FP), CX
 	MOVL	ov+4(FP), AX
 	MOVL	nv+8(FP), DX
-	LOCK; CMPXCHG
+	LOCK; CMPXCHGL DX, (CX)
 	JNE	fail32
 	MOVL	$1,AX
 	RET
@@ -93,7 +84,7 @@
 	MOVL	ov+8(FP), DX
 	MOVL	nv+12(FP), BX
 	MOVL	nv+16(FP), CX
-	LOCK; CMPXCHG64
+	LOCK; CMPXCHG8B (DI)
 	JNE	fail64
 	MOVL	$1,AX
 	RET
@@ -105,5 +96,5 @@
 TEXT coherence+0(SB),1,$0
 	/* this is essentially mfence but that requires sse2 */
 	XORL	AX, AX
-	LOCK; XADDLSP
+	LOCK; XADDL AX, (SP)
 	RET
--- a/blk.c
+++ b/blk.c
@@ -7,28 +7,29 @@
 #include "fns.h"
 #include "atomic.h"
 
-static vlong	blkalloc_lk(Arena*);
-static vlong	blkalloc(int, uint);
+static vlong	blkalloc_lk(Arena*, int);
+static vlong	blkalloc(int, uint, int);
 static void	blkdealloc_lk(Arena*, vlong);
 static Blk*	initblk(Blk*, vlong, vlong, int);
+static void	readblk(Blk*, Bptr, int);
 
 int
-checkflag(Blk *b, int f)
+checkflag(Blk *b, int set, int clr)	/* nonzero iff every bit in set is on and every bit in clr is off */
 {
 	long v;
 
 	v = agetl(&b->flag);
-	return (v & f) == f;
+	return (v & (set|clr)) == set;	/* one masked compare covers both conditions */
 }
 
 void
-setflag(Blk *b, int f)
+setflag(Blk *b, int set, int clr)	/* turn off the clr bits and turn on the set bits of b->flag */
 {
 	long ov, nv;
 
 	while(1){
 		ov = agetl(&b->flag);
-		nv = ov | f;
+		nv = (ov & ~clr) | set;	/* a bit present in both masks ends up set */
 		if(acasl(&b->flag, ov, nv))
 			break;
 	}
@@ -35,39 +36,23 @@
 }
 
 void
-clrflag(Blk *b, int f)
-{
-	long ov, nv;
-
-	while(1){
-		ov = agetl(&b->flag);
-		nv = ov & ~f;
-		if(acasl(&b->flag, ov, nv))
-			break;
-	}
-}
-
-void
 syncblk(Blk *b)
 {
-	assert(checkflag(b, Bfinal));
+	assert(checkflag(b, Bfinal, 0));	/* the block must carry Bfinal before it is written */
 	assert(b->bp.addr >= 0);
-	clrflag(b, Bdirty);
+	tracex("syncblk", b->bp, b->type, -1);
 	if(pwrite(fs->fd, b->buf, Blksz, b->bp.addr) == -1)
 		broke("%B %s: %r", b->bp, Eio);
+	setflag(b, 0, Bdirty);	/* Bdirty is now cleared after the pwrite instead of before it */
 }
 
-static Blk*
-readblk(vlong bp, int flg)
+static void
+readblk(Blk *b, Bptr bp, int flg)
 {
-	vlong off, rem, n;
+	vlong off, xh, ck, rem, n;
 	char *p;
-	Blk *b;
 
-	assert(bp != -1);
-	b = cachepluck();
-	b->alloced = getcallerpc(&bp);
-	off = bp;
+	off = bp.addr;
 	rem = Blksz;
 	while(rem != 0){
 		n = pread(fs->fd, b->buf, rem, off);
@@ -79,12 +64,10 @@
 	b->cnext = nil;
 	b->cprev = nil;
 	b->hnext = nil;
-	b->flag = 0;
 
-	b->bp.addr = bp;
+	b->bp.addr = bp.addr;
 	b->bp.hash = -1;
 	b->bp.gen = -1;
-	b->fnext = nil;
 
 	b->nval = 0;
 	b->valsz = 0;
@@ -128,21 +111,33 @@
 		b->data = p;
 		break;
 	}
+	if(b->type == Tlog || b->type == Tdlist){	/* these types are checked against their own logh field */
+		xh = b->logh;
+		ck = bufhash(b->data, b->logsz);
+	}else{	/* every other type is checked against the hash in the block pointer */
+		xh = bp.hash;
+		ck = blkhash(b);
+	}
+	if(!(flg&GBnochk) && ck != xh){	/* '!' binds tighter than '&', so the flag test needs its own parentheses */
+		if(!(flg&GBsoftchk))
+			broke("%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+		fprint(2, "%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+		error(Ecorrupt);
+	}
 	assert(b->magic == Magic);
-	return b;
 }
 
 static Arena*
 pickarena(uint ty, uint hint, int tries)
 {
-	uint n;
+	uint n, r;
 
-	n = hint + tries + ainc(&fs->roundrobin)/1024;
+	r = ainc(&fs->roundrobin)/2048;	/* base index derived from the shared roundrobin counter */
 	if(ty == Tdat)
-		n++;
-	if(hint % fs->narena == 0)
-		n++;
-	return &fs->arenas[n%fs->narena];
+		n = hint % (fs->narena - 1) + r + 1;	/* NOTE(review): modulus is zero when narena == 1 -- confirm narena >= 2 always holds */
+	else
+		n = r;	/* non-data blocks ignore the hint */
+	return &fs->arenas[(n + tries) % fs->narena];	/* each retry shifts the choice by one arena */
 }
 
 Arena*
@@ -154,8 +149,10 @@
 
 	lo = 0;
 	hi = fs->narena;
-	if(b == 0)
+	if(b == fs->sb0->bp.addr)
 		return &fs->arenas[0];
+	if(b == fs->sb1->bp.addr)
+		return &fs->arenas[hi-1];
 	while(1){
 		mid = (hi + lo)/2;
 		a = &fs->arenas[mid];
@@ -243,13 +240,17 @@
 {
 	Blk *lb;
 
-	lb = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
-	if(lb->bp.addr != -1)
-		cachedel(lb->bp.addr);
+assert(!canqlock(a));
+	lb = a->logbuf[0];
+	if(lb == a->logtl)
+		lb = a->logbuf[1];
+	assert(lb->ref == 1);
+	lb->flag = Bstatic;
 	initblk(lb, o, -1, Tlog);
-	finalize(lb);
-	syncblk(lb);
-	traceb("logblk" , lb->bp);
+	tracex("newlogb" , lb->bp, -1, getcallerpc(&a));
+	lb->lasthold0 = lb->lasthold;
+	lb = holdblk(lb);
+	lb->lasthold = getcallerpc(&a);
 	return lb;
 }
 
@@ -263,27 +264,26 @@
 logappend(Arena *a, vlong off, vlong len, int op)
 {
 	vlong o, start, end;
-	Blk *nl, *lb;
-	char *p, *name;
+	Blk *lb;
+	char *p;
 
-	lb = a->logtl;
 	assert((off & 0xff) == 0);
 	assert(op == LogAlloc || op == LogFree || op == LogSync);
 	if(op != LogSync){
 		start = a->h0->bp.addr;
 		end = start + a->size + 2*Blksz;
-		assert(lb == nil || lb->type == Tlog);
 		assert(off >= start);
-		assert(off <= end);
+		assert(off < end);
 	}
-	switch(op){
-	case LogAlloc:	name = "alloc";	break;
-	case LogFree:	name = "free";	break;
-	case LogSync:	name = "sync";	break;
-	default:	name = "???";	break;
-	}
-	assert(lb == nil || lb->logsz >= 0);
-	dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, name);
+	lb = a->logtl;
+	assert(lb->ref > 0);
+	assert(lb->type == Tlog);
+	assert(lb->logsz >= 0);
+	dprint("logop %d: %llx+%llx@%x\n", op, off, len, lb->logsz);
+
+	if(checkflag(lb, 0, Bdirty))
+		setflag(lb, Bdirty, Bfinal);
+
 	/*
 	 * move to the next block when we have
 	 * too little room in the log:
@@ -292,23 +292,18 @@
 	 * 16 bytes of new log entry allocation
 	 * and chaining.
 	 */
-	if(lb == nil || lb->logsz >= Logspc - Logslop){
-		o = blkalloc_lk(a);
+	if(lb->logsz >= Logspc - Logslop){
+		o = blkalloc_lk(a, 0);
 		if(o == -1)
 			error(Efull);
-		nl = mklogblk(a, o);
 		p = lb->data + lb->logsz;
 		PACK64(p, o|LogAlloc1);
 		lb->logsz += 8;
-		lb->logp = nl->bp;
-		finalize(lb);
-		syncblk(lb);
-		a->logtl = nl;
-		a->nlog++;
-		lb = nl;
+		lb->logp = (Bptr){o, -1, -1};
+tracex("logchain1", lb->bp, o, a - fs->arenas);
+		lb = mklogblk(a, o);
+tracex("logchain2", lb->bp, getcallerpc(&a), -1);
 	}
-
-	setflag(lb, Bdirty);
 	if(len == Blksz){
 		if(op == LogAlloc)
 			op = LogAlloc1;
@@ -323,6 +318,18 @@
 		PACK64(p+8, len);
 		lb->logsz += 8;
 	}
+	if(lb != a->logtl) {
+traceb("logstep1", a->logtl->logp);
+traceb("logstep2", a->logtl->bp);
+		finalize(lb);
+		syncblk(lb);
+
+		finalize(a->logtl);
+		syncblk(a->logtl);
+		dropblk(a->logtl);
+		a->logtl = lb;
+		a->nlog++;
+	}
 }
 
 void
@@ -336,10 +343,13 @@
 
 	dprint("loadlog %B\n", bp);
 	traceb("loadlog", bp);
+	b = a->logbuf[0];
 	while(1){
-		b = getblk(bp, 0);
+		assert(checkflag(b, Bstatic, Bcached));
+		holdblk(b);
+		readblk(b, bp, 0);
 		dprint("\tload %B chain %B\n", bp, b->logp);
-		/* the hash covers the log and offset */
+		a->nlog++;
 		for(i = 0; i < b->logsz; i += n){
 			d = b->data + i;
 			ent = UNPACK64(d);
@@ -353,7 +363,9 @@
 				if(gen >= fs->qgen){
 					if(a->logtl == nil){
 						b->logsz = i;
-						a->logtl = holdblk(b);
+						a->logtl = b;
+						cachedel(b->bp.addr);
+						setflag(b, Bdirty, 0);
 						return;
 					}
 					dropblk(b);
@@ -391,21 +403,25 @@
 }
 
 void
-compresslog(Arena *a)
+flushlog(Arena *a)	/* finalize and write the arena's log tail block */
 {
+	if(checkflag(a->logtl, 0, Bdirty|Bstatic))	/* skipped only when neither Bdirty nor Bstatic is set */
+		return;
+	finalize(a->logtl);
+	syncblk(a->logtl);
+}
 
-	int i, nr, nblks;
+void
+compresslog(Arena *a)
+{
+	int i, nr, nblks, nlog;
 	vlong sz, *blks;
-	Blk *b, *nb;
+	Blk *b;
 	Arange *r;
-	Bptr hd;
 	char *p;
 
-	tracem("compresslog");
-	if(a->logtl != nil){
-		finalize(a->logtl);
-		syncblk(a->logtl);
-	}
+	flushlog(a);
+tracex("compress", a->loghd, getcallerpc(&a), -1);
 	/*
 	 * Prepare what we're writing back.
 	 * Arenas must be sized so that we can
@@ -414,7 +430,7 @@
 	 */
 	sz = 0;
 	nr = 0;
-	a->nlog = 0;
+	nlog = 0;
 	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
 		sz += 16;
 		nr++;
@@ -437,31 +453,22 @@
 		nexterror();
 	}
 	for(i = 0; i < nblks; i++){
-		blks[i] = blkalloc_lk(a);
+		blks[i] = blkalloc_lk(a, 1);
 		if(blks[i] == -1)
 			error(Efull);
 	}
+
 	/* fill up the log with the ranges from the tree */
 	i = 0;
-	hd = (Bptr){blks[0], -1, -1};
-	b = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
-	a->logbuf[a->lbidx % nelem(a->logbuf)]->bp = Zb;
-	if(b->bp.addr != -1)
-		cachedel(b->bp.addr);
-	initblk(b, blks[i++], -1, Tlog);
-	finalize(b);
+	b = mklogblk(a, blks[i++]);
 	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
 		if(b->logsz >= Logspc - Logslop){
-			a->nlog++;
-			nb = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
-			if(nb->bp.addr != -1)
-				cachedel(nb->bp.addr);
-			initblk(nb, blks[i++], -1, Tlog);
-			b->logp = nb->bp;
-			setflag(b, Bdirty);
+			b->logp = (Bptr){blks[i], -1, -1};
 			finalize(b);
 			syncblk(b);
-			b = nb;
+			dropblk(b);
+			nlog++;
+			b = mklogblk(a, blks[i++]);
 		}
 		p = b->data + b->logsz;
 		PACK64(p+0, r->off|LogFree);
@@ -468,20 +475,25 @@
 		PACK64(p+8, r->len);
 		b->logsz += 16;
 	}
-	finalize(b);
-	syncblk(b);
 
 	/*
 	 * now we have a valid freelist, and we can start
 	 * appending stuff to it. Clean up the eagerly
 	 * allocated extra blocks.
+	 *
+	 * Note that we need to drop the reference to the
+	 * old logtl before we free the old blocks, because
+	 * deallocating a block may require another block.
 	 */
-	a->loghd = hd;
-	a->logtl = b;
-	for(; i < nblks; i++){
-		cachedel(b->bp.addr);
+	dropblk(a->logtl);
+	a->loghd = (Bptr){blks[0], -1, -1};
+	a->logtl = b;	/* written back by sync() later */
+	a->nlog = nlog;
+	a->lastlogsz = nlog;
+
+	/* May add blocks to new log */
+	for(; i < nblks; i++)
 		blkdealloc_lk(a, blks[i]);
-	}
 	poperror();
 	free(blks);
 }
@@ -490,8 +502,6 @@
 logbarrier(Arena *a, vlong gen)
 {
 	logappend(a, gen<<8, 0, LogSync);
-	if(a->loghd.addr == -1)
-		a->loghd = a->logtl->bp;
 	return 0;
 }
 
@@ -502,14 +512,15 @@
  * the alloc log.
  */
 static vlong
-blkalloc_lk(Arena *a)
+blkalloc_lk(Arena *a, int seq)
 {
-	Avltree *t;
 	Arange *r;
 	vlong b;
 
-	t = a->free;
-	r = (Arange*)t->root;
+	if(seq)
+		r = (Arange*)avlmin(a->free);
+	else
+		r = (Arange*)avlmax(a->free);
 	if(!usereserve && a->size - a->used <= a->reserve)
 		return -1;
 	if(r == nil)
@@ -522,11 +533,16 @@
 	 * the sort order because the tree
 	 * covers disjoint ranges
 	 */
-	b = r->off;
-	r->len -= Blksz;
-	r->off += Blksz;
+	if(seq){
+		b = r->off;
+		r->len -= Blksz;
+		r->off += Blksz;
+	}else{
+		r->len -= Blksz;
+		b = r->off + r->len;
+	}
 	if(r->len == 0){
-		avldelete(t, r);
+		avldelete(a->free, r);
 		free(r);
 	}
 	a->used += Blksz;
@@ -536,26 +552,14 @@
 static void
 blkdealloc_lk(Arena *a, vlong b)
 {
+	cachedel(b);	/* remove any cached block at this address before logging the free */
 	logappend(a, b, Blksz, LogFree);
-	if(a->loghd.addr == -1)
-		a->loghd = a->logtl->bp;
 	freerange(a->free, b, Blksz);
 	a->used -= Blksz;
 }
 
-void
-blkdealloc(vlong b)
-{
-	Arena *a;
-
-	a = getarena(b);
- 	qlock(a);
-	blkdealloc_lk(a, b);
-	qunlock(a);
-}
-
 static vlong
-blkalloc(int ty, uint hint)
+blkalloc(int ty, uint hint, int seq)
 {
 	Arena *a;
 	vlong b;
@@ -582,7 +586,7 @@
 		qunlock(a);
 		nexterror();
 	}
-	b = blkalloc_lk(a);
+	b = blkalloc_lk(a, seq);
 	if(b == -1){
 		qunlock(a);
 		poperror();
@@ -589,8 +593,6 @@
 		goto Again;
 	}
 	logappend(a, b, Blksz, LogAlloc);
-	if(a->loghd.addr == -1)
-		a->loghd = a->logtl->bp;
 	qunlock(a);
 	poperror();
 	return b;
@@ -628,9 +630,7 @@
 		b->data = b->buf + Leafhdsz;
 		break;
 	}
-	b->fnext = nil;
-
-	setflag(b, Bdirty);
+	setflag(b, Bdirty, 0);
 	b->nval = 0;
 	b->valsz = 0;
 	b->nbuf = 0;
@@ -642,16 +642,31 @@
 }
 
 Blk*
-newblk(Tree *t, int ty, vlong hint)
+newdblk(Tree *t, vlong hint, int seq)	/* allocate and initialize a Tdat block; hint and seq are passed through to blkalloc */
 {
 	vlong bp;
 	Blk *b;
 
-	bp = blkalloc(ty, hint);
+	bp = blkalloc(Tdat, hint, seq);
 	b = cachepluck();
+	initblk(b, bp, t->memgen, Tdat);
+	b->alloced = getcallerpc(&t);	/* record the caller's pc on the block */
+	tracex("newdblk" , b->bp, Tdat, getcallerpc(&t));
+	return b;
+
+}
+
+Blk*
+newblk(Tree *t, int ty)	/* allocate and initialize a block of type ty */
+{
+	vlong bp;
+	Blk *b;
+
+	bp = blkalloc(ty, 0, 0);	/* always passes hint 0 and seq 0 */
+	b = cachepluck();
 	initblk(b, bp, t->memgen, ty);
 	b->alloced = getcallerpc(&t);
-	tracex("newblk" , b->bp, ty, -1);
+	tracex("newblk" , b->bp, ty, getcallerpc(&t));
 	return b;
 }
 
@@ -660,11 +675,10 @@
 {
 	Blk *r;
 
-	if((r = newblk(t, b->type, 0)) == nil)
+	if((r = newblk(t, b->type)) == nil)
 		return nil;
 
 	tracex("dup" , b->bp, b->type, t->gen);
-	setflag(r, Bdirty);
 	r->bp.hash = -1;
 	r->nval = b->nval;
 	r->valsz = b->valsz;
@@ -710,20 +724,16 @@
 	}
 
 	b->bp.hash = blkhash(b);
-	setflag(b, Bfinal);
-	cacheins(b);
-	b->cached = getcallerpc(&b);
+	setflag(b, Bdirty|Bfinal, 0);
 }
 
 Blk*
 getblk(Bptr bp, int flg)
 {
-	uvlong xh, ck;
 	Blk *b;
 	int i;
 
 	i = ihash(bp.addr) % nelem(fs->blklk);
-	tracex("get" , bp, getcallerpc(&bp), -1);
 	qlock(&fs->blklk[i]);
 	if(waserror()){
 		qunlock(&fs->blklk[i]);
@@ -730,31 +740,16 @@
 		nexterror();
 	}
 	if((b = cacheget(bp.addr)) != nil){
+		assert(checkflag(b, 0, Bfreed));
 		b->lasthold = getcallerpc(&bp);
 		qunlock(&fs->blklk[i]);
 		poperror();
 		return b;
 	}
-	b = readblk(bp.addr, flg);
+	b = cachepluck();
 	b->alloced = getcallerpc(&bp);
-	b->bp.hash = blkhash(b);
-	if((flg&GBnochk) == 0){
-		if(b->type == Tlog || b->type == Tdlist){
-			xh = b->logh;
-			ck = bufhash(b->data, b->logsz);
-		}else{
-			xh = bp.hash;
-			ck = b->bp.hash;
-		}
-		if(ck != xh){
-			if(flg & GBsoftchk){
-				fprint(2, "%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
-				error(Ecorrupt);
-			}else{
-				broke("%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
-			}
-		}
-	}
+	b->alloced = getcallerpc(&bp);
+	readblk(b, bp, flg);
 	b->bp.gen = bp.gen;
 	b->lasthold = getcallerpc(&bp);
 	cacheins(b);
@@ -776,15 +771,16 @@
 void
 dropblk(Blk *b)
 {
-	assert(b == nil || b->ref > 0);
-	if(b == nil || adec(&b->ref) != 0)
+	if(b == nil)
 		return;
 	b->lastdrop = getcallerpc(&b);
+	if(adec(&b->ref) != 0)
+		return;
 	/*
 	 * freed blocks go to the LRU bottom
 	 * for early reuse.
 	 */
-	if(checkflag(b, Bfreed))
+	if(checkflag(b, Bfreed, 0))
 		lrubot(b);
 	else
 		lrutop(b);
@@ -805,17 +801,18 @@
 }
 
 void
-limbo(Bfree *f)
+limbo(int op, Limbo *l)	/* tag l with op and push it on the limbo list for the current epoch */
 {
-	Bfree *p;
+	Limbo *p;
 	ulong ge;
 
+	l->op = op;	/* the op tag is now stored here rather than by each caller */
 	while(1){
 		ge = agetl(&fs->epoch);
 		p = agetp(&fs->limbo[ge]);
-		f->next = p;
-		if(acasp(&fs->limbo[ge], p, f)){
-			aincl(&fs->nlimbo, 1);
+		l->next = p;
+		if(acasp(&fs->limbo[ge], p, l)){	/* retry from the top if the list head changed underneath us */
+			ainc(&fs->nlimbo);
 			break;
 		}
 	}
@@ -822,27 +819,42 @@
 }
 
 void
-freeblk(Tree *t, Blk *b, Bptr bp)
+freeblk(Tree *t, Blk *b)	/* release an in-memory block: kill it via the tree, or park it on the limbo list */
 {
+	if(t == &fs->snap || (t != nil && b->bp.gen < t->memgen)){	/* snap tree, or block generation older than t->memgen */
+		tracex("killb", b->bp, t->memgen, getcallerpc(&t));
+		killblk(t, b->bp);
+		return;
+	}
+	b->freed = getcallerpc(&t);
+	tracex("freeb", b->bp, getcallerpc(&t), -1);
+	setflag(b, Blimbo, 0);
+	holdblk(b);	/* extra reference kept while the block sits on the limbo list */
+	assert(b->ref > 1);
+	limbo(DFblk, b);	/* b is passed as a Limbo*; presumably Blk embeds a Limbo header -- verify in the struct definition */
+}
+
+void
+freebp(Tree *t, Bptr bp)	/* release a block known only by pointer: kill it via the tree, or queue a Bfree record on the limbo list */
+{
 	Bfree *f;
 
 	if(t == &fs->snap || (t != nil && bp.gen < t->memgen)){
-		tracex("killb", bp, getcallerpc(&t), -1);
+		tracex("killbp", bp, t->memgen, getcallerpc(&t));
 		killblk(t, bp);
 		return;
 	}
+	tracex("freebp", bp, getcallerpc(&t), -1);
 
-	tracex("freeb", bp, getcallerpc(&t), -1);
-	f = emalloc(sizeof(Bfree), 0);
-	f->op = DFblk;
+	qlock(&fs->bfreelk);
+	while(fs->bfree == nil)	/* sleep until a Bfree record is on the fs->bfree list */
+		rsleep(&fs->bfreerz);
+	f = fs->bfree;	/* records come from the fs->bfree list instead of emalloc */
+	fs->bfree = (Bfree*)f->next;
+	qunlock(&fs->bfreelk);
+
 	f->bp = bp;
-	f->b = nil;
-	if(b != nil){
-		setflag(b, Blimbo);
-		b->freed = getcallerpc(&t);
-		f->b = holdblk(b);
-	}
-	limbo(f);
+	limbo(DFbp, f);
 }
 
 void
@@ -875,7 +887,7 @@
 	for(i = 0; i < fs->nworker; i++){
 		e = agetl(&fs->lepoch[i]);
 		if((e & Eactive) && e != (ge | Eactive)){
-			if(delay < 100)
+			if(delay < 1000)
 				delay++;
 			else
 				fprint(2, "stalled epoch %lx [worker %d]\n", e, i);
@@ -889,7 +901,9 @@
 epochclean(void)
 {
 	ulong c, e, ge;
-	Bfree *p, *n;
+	Limbo *p, *n;
+	Blk *b;
+	Bfree *f;
 	Arena *a;
 	Qent qe;
 	int i;
@@ -912,28 +926,42 @@
 		n = p->next;
 		switch(p->op){
 		case DFtree:
-			free(p->t);
+			free(p);
 			break;
 		case DFmnt:
-			free(p->m);
+			free(p);
 			break;
+		case DFbp:
+			f = (Bfree*)p;
+			a = getarena(f->bp.addr);
+			if((b = cacheget(f->bp.addr)) != nil){
+				setflag(b, Bfreed, Bdirty|Blimbo);
+				dropblk(b);
+			}
+			qe.op = Qfree;
+			qe.bp = f->bp;
+			qe.b = nil;
+			qput(a->sync, qe);
+			qlock(&fs->bfreelk);
+			f->next = fs->bfree;
+			fs->bfree = f;
+			rwakeup(&fs->bfreerz);
+			qunlock(&fs->bfreelk);
+			break;
 		case DFblk:
-			a = getarena(p->bp.addr);
+			b = (Blk*)p;
 			qe.op = Qfree;
-			qe.bp = p->bp;
+			qe.bp = b->bp;
 			qe.b = nil;
+			setflag(b, Bfreed, Bdirty|Blimbo);
+			a = getarena(b->bp.addr);
+			dropblk(b);
 			qput(a->sync, qe);
-			if(p->b != nil){
-				clrflag(p->b, Blimbo);
-				setflag(p->b, Bfreed);
-				dropblk(p->b);
-			}
 			break;
 		default:
 			abort();
 		}
-		aincl(&fs->nlimbo, -1);
-		free(p);
+		adec(&fs->nlimbo);
 	}
 }
 
@@ -943,16 +971,18 @@
 	Arena *a;
 	Qent qe;
 
-	assert(checkflag(b, Bdirty));
+	assert(checkflag(b, Bdirty, Bqueued|Bstatic));
 	assert(b->bp.addr >= 0);
+	finalize(b);
+	if(checkflag(b, 0, Bcached)){
+		cacheins(b);
+		b->cached = getcallerpc(&b);
+	}
+	holdblk(b);
 
 	b->enqueued = getcallerpc(&b);
-	a = getarena(b->bp.addr);
-	holdblk(b);
-	finalize(b);
 	traceb("queueb", b->bp);
-	setflag(b, Bqueued);
-	b->queued = getcallerpc(&b);
+	a = getarena(b->bp.addr);
 	qe.op = Qwrite;
 	qe.bp = b->bp;
 	qe.b = b;
@@ -967,10 +997,9 @@
 	q->nheap = 0;
 	q->heapsz = fs->cmax;
 	q->heap = emalloc(q->heapsz*sizeof(Qent), 1);
-
 }
 
-int
+static int
 qcmp(Qent *a, Qent *b)
 {
 	if(a->qgen != b->qgen)
@@ -988,11 +1017,13 @@
 	int i;
 
 	if(qe.op == Qfree || qe.op == Qwrite)
-		assert(qe.bp.addr != 0 && (qe.bp.addr & (Blksz-1)) == 0);
+		assert((qe.bp.addr & (Blksz-1)) == 0);
 	else if(qe.op == Qfence)
 		assert(fs->syncing > 0);
 	else
 		abort();
+	if(qe.b != nil)
+		assert(qe.b->ref > 0);
 	qlock(&q->lk);
 	qe.qgen = agetv(&fs->qgen);
 	while(q->nheap == q->heapsz)
@@ -1042,7 +1073,7 @@
 	rwakeup(&q->fullrz);
 	qunlock(&q->lk);
 	if(e.b != nil){
-		clrflag(e.b, Bqueued);
+		setflag(e.b, 0, Bqueued);
 		e.b->queued = 0;
 	}
 	return e;
@@ -1057,7 +1088,7 @@
 
 	q = p;
 	if(waserror()){
-		aincl(&fs->rdonly, 1);
+		ainc(&fs->rdonly);
 		fprint(2, "error syncing: %s\n", errmsg());
 		return;
 	}
@@ -1066,12 +1097,15 @@
 		switch(qe.op){
 		case Qfree:
 			tracex("qfreeb", qe.bp, qe.qgen, -1);
+			/*
+			 * we shouldn't have a block in a free op,
+			 * the frees go into the queue just to ensure
+			 * write/reuse ordering.
+			 */
+			assert(qe.b == nil);
 			a = getarena(qe.bp.addr);
 			qlock(a);
-			cachedel(qe.bp.addr);
 			blkdealloc_lk(a, qe.bp.addr);
-			if(qe.b != nil)
-				dropblk(qe.b);
 			qunlock(a);
 			break;
 		case Qfence:
@@ -1083,7 +1117,7 @@
 			break;
 		case Qwrite:
 			tracex("qsyncb", qe.bp, qe.qgen, -1);
-			if(checkflag(qe.b, Bfreed) == 0)
+			if(checkflag(qe.b, Bfreed, Bstatic) == 0)
 				syncblk(qe.b);
 			dropblk(qe.b);
 			break;
--- a/cache.c
+++ b/cache.c
@@ -32,6 +32,7 @@
 	 * its now in use.
 	 */
 	assert(b->magic == Magic);
+	assert(checkflag(b, 0, Bstatic));
 	if(b->ref != 0){
 		qunlock(&fs->lrulk);
 		return;
@@ -58,6 +59,7 @@
 	 * its now in use.
 	 */
 	assert(b->magic == Magic);
+	assert(checkflag(b, 0, Bstatic));
 	if(b->ref != 0){
 		qunlock(&fs->lrulk);
 		return;
@@ -83,25 +85,18 @@
 	h = ihash(b->bp.addr);
 	bkt = &fs->bcache[h % fs->cmax];
 	qlock(&fs->lrulk);
-	traceb("cache", b->bp);
-	lock(bkt);
-	if(checkflag(b, Bcached)){
-		unlock(bkt);
-		qunlock(&fs->lrulk);
-		return;
-	}
+	assert(checkflag(b, 0, Bstatic|Bcached));
+	setflag(b, Bcached, 0);
 	assert(b->hnext == nil);
 	for(Blk *bb = bkt->b; bb != nil; bb = bb->hnext)
-		assert(b != bb);
-	setflag(b, Bcached);
+		assert(b != bb && b->bp.addr != bb->bp.addr);
 	b->cached = getcallerpc(&b);
 	b->hnext = bkt->b;
 	bkt->b = b;
-	unlock(bkt);
 	qunlock(&fs->lrulk);
 }
 
-void
+static void
 cachedel_lk(vlong addr)
 {
 	Bucket *bkt;
@@ -111,28 +106,26 @@
 	if(addr == -1)
 		return;
 
-	tracex("uncache", Zb, addr, getcallerpc(&addr));
 	h = ihash(addr);
 	bkt = &fs->bcache[h % fs->cmax];
-	lock(bkt);
 	p = &bkt->b;
 	for(b = bkt->b; b != nil; b = b->hnext){
 		if(b->bp.addr == addr){
+			/* FIXME: Until we clean up snap.c, we can have dirty blocks in cache */
+			assert(checkflag(b, Bcached, Bstatic)); //Bdirty));
 			*p = b->hnext;
-			clrflag(b, Bcached);
 			b->uncached = getcallerpc(&addr);
 			b->hnext = nil;
+			setflag(b, 0, Bcached);
 			break;
 		}
 		p = &b->hnext;
 	}
-	unlock(bkt);
 }
 void
 cachedel(vlong addr)
 {
 	qlock(&fs->lrulk);
-	tracex("uncachelk", Zb, addr, getcallerpc(&addr));
 	cachedel_lk(addr);
 	qunlock(&fs->lrulk);
 }
@@ -147,7 +140,6 @@
 	h = ihash(addr);
 	bkt = &fs->bcache[h % fs->cmax];
 	qlock(&fs->lrulk);
-	lock(bkt);
 	for(b = bkt->b; b != nil; b = b->hnext){
 		if(b->bp.addr == addr){
 			holdblk(b);
@@ -156,7 +148,6 @@
 			break;
 		}
 	}
-	unlock(bkt);
 	qunlock(&fs->lrulk);
 
 	return b;
@@ -177,12 +168,12 @@
 	b = fs->ctail;
 	assert(b->magic == Magic);
 	assert(b->ref == 0);
-	if(checkflag(b, Bcached))
+	if(checkflag(b, Bcached, 0))
 		cachedel_lk(b->bp.addr);
-	if(checkflag(b, Bcached))
+	if(checkflag(b, Bcached, 0))
 		fprint(2, "%B cached %#p freed %#p\n", b->bp, b->cached, b->freed);
+	assert(checkflag(b, 0, Bcached));
 	lrudel(b);
-	assert(!checkflag(b, Bcached));
 	b->flag = 0;
 	b->lasthold = 0;
 	b->lastdrop = 0;
--- a/check.c
+++ b/check.c
@@ -158,6 +158,7 @@
 			fprint(fd, "error loading %B\n", bp);
 			return 0;
 		}
+traceb("chklg", bp);
 		b = getblk(bp, 0);
 		nb = b->logp;
 		dropblk(b);
@@ -254,7 +255,7 @@
 	Blk *b;
 
 	ok = 1;
-	aincl(&fs->rdonly, 1);
+	ainc(&fs->rdonly);
 	epochwait();
 	if(waserror()){
 		fprint(fd, "error checking %s\n", errmsg());
@@ -299,7 +300,7 @@
 		poperror();
 	}
 	btexit(&s);
-	aincl(&fs->rdonly, -1);
+	adec(&fs->rdonly);
 	poperror();
 	return ok;
 }
--- a/cons.c
+++ b/cons.c
@@ -8,6 +8,7 @@
 #include "fns.h"
 
 typedef struct Cmd	Cmd;
+typedef struct Sizes	Sizes;
 
 struct Cmd {
 	char	*name;
@@ -14,9 +15,33 @@
 	char	*sub;
 	int	minarg;
 	int	maxarg;
+	int	epoch;
 	void	(*fn)(int, char**, int);
 };
 
+struct Sizes {	/* byte totals accumulated by marktree */
+	vlong	datasz;	/* Blksz per Kdat block pointer whose gen is above the tree's pred */
+	vlong	metasz;	/* Blksz per tree block visited */
+	vlong	delqsz;	/* Blksz per buffered Odelete message on a Kdat key */
+	vlong	clobsz;	/* Blksz per buffered Oclobber or Oclearb message on a Kdat key */
+};
+
+
+static double	/* returns sz scaled down by powers of 1024 for human-readable printing */
+hscaled(vlong sz, char **unit)	/* *unit receives the suffix from units[] that matches the returned value */
+{
+	static char *units[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", nil};
+	double hsz;
+	int u;
+
+	hsz = sz;
+	for(u = 0; u < nelem(units)-1 && hsz >= 500 ; u++)	/* bound also admits the nil slot; presumably unreachable if vlong is 64 bits -- confirm */
+		hsz /= 1024;
+	*unit = units[u];
+	return hsz;
+}
+
+
 static void
 setdbg(int fd, char **ap, int na)
 {
@@ -209,14 +234,297 @@
 }
 
 static void
-showdf(int fd, char**, int)
+countlog(int fd, Dlist *dl)	/* walk a deadlist's block chain and print the tally for it */
 {
-	char *units[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", nil};
+	Bptr bp, nb;
+	Blk *b;
+	int n;
+
+	n = 0;
+	for(bp = dl->hd; bp.addr != -1; bp = nb){
+		if(waserror()){
+			fprint(fd, "error loading %B\n", bp);
+			return;
+		}
+		b = getblk(bp, 0);
+		nb = b->logp;
+		n += b->logsz/8;	/* read logsz while we still hold the block */
+		dropblk(b);	/* b must not be touched after this point */
+		poperror();
+		n++;	/* count the chain block itself */
+	}
+	fprint(fd, "\tDl(%lld, %lld): %d blocks\n", dl->gen, dl->bgen, n);
+}
+
+
+static void
+prleak(int fd, uvlong *marks)	/* compare the mark bitmap against each arena's free tree and print mismatches */
+{
+	vlong a0, a1, ba, bi, leaksz;
+	Arena *a;
+	Arange *r;
+
+	if(marks == nil)
+		return;
+	leaksz = 0;
+	for(a = &fs->arenas[0]; a < &fs->arenas[fs->narena]; a++){	/* NOTE(review): a->free is walked without qlock(a) here -- confirm callers make that safe */
+		r = (Arange*)avlmin(a->free);
+		a0 = a->h0->bp.addr + 2*Blksz;	/* scan starts two blocks past the arena header address */
+		a1 = a->h0->bp.addr + a->size - 2*Blksz;
+		for(ba = a0; ba < a1; ba += Blksz){
+			if(r != nil && ba == r->off){	/* at the start of the next free range */
+				for(; ba < r->off+r->len; ba += Blksz){
+					bi = ba/Blksz;
+					if(marks[bi/64] & 1ULL<<(bi%64))	/* free block that is also marked as referenced */
+						fprint(fd, "uaf %#llx\n", ba);
+				}
+				r = (Arange*)avlnext(r);
+			}
+			if(ba >= a1)	/* the free range may have carried ba to the end of the scan window */
+				break;
+			bi = ba/Blksz;
+			if((marks[bi/64] & 1ULL<<(bi%64)) == 0){	/* not in a matched free range and not marked */
+				leaksz += Blksz;
+				fprint(fd, "leak %#llx\n", ba);
+			}
+		}
+	}
+	fprint(fd, "total bytes leaked: %lld (%f MiB)\n", leaksz, (double)leaksz/MiB);
+}
+
+static void
+marktree(Tree *t, Blk *b, Sizes *ts, uvlong *marks)	/* recursively mark and size the subtree rooted at b */
+{
+	int i, fill;
+	vlong bn;
+	Bptr bp;
+	Blk *c;
+	Msg m;
+
+	bn = b->bp.addr/Blksz;
+	if(marks != nil)	/* marks may be nil; then only the sizes are gathered */
+		marks[bn/64] |= 1ULL<<(bn%64);
+	ts->metasz += Blksz;
+	switch(b->type){
+	case Tleaf:
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &m);
+			if(m.k[0] != Kdat)
+				continue;
+			bp = unpackbp(m.v, m.nv);
+			bn = bp.addr/Blksz;
+			if(marks != nil)
+				marks[bn/64] |= 1ULL<<(bn%64);
+			if(bp.gen > t->pred)	/* only data newer than pred is charged to this tree */
+				ts->datasz += Blksz;
+		}
+		break;
+	case Tpivot:
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &m);
+			bp = getptr(&m, &fill);
+			if(bp.gen <= t->pred)
+				continue;
+			c = getblk(bp, 0);
+			marktree(t, c, ts, marks);
+			dropblk(c);	/* release the child obtained from getblk; it was previously leaked */
+		}
+		for(i = 0; i < b->nbuf; i++){
+			getmsg(b, i, &m);
+			if(m.k[0] != Kdat)
+				continue;
+			switch(m.op){
+			case Odelete:	ts->delqsz += Blksz;	break;
+			case Oclobber:	ts->clobsz += Blksz;	break;
+			case Oclearb:	ts->clobsz += Blksz;	break;
+			case Oinsert:
+				bp = unpackbp(m.v, m.nv);
+				bn = bp.addr/Blksz;
+				if(marks != nil)
+					marks[bn/64] |= 1ULL<<(bn%64);
+				if(bp.gen > t->pred)
+					ts->datasz += Blksz;
+				break;
+			}
+		}
+		break;
+	}
+}
+
+static int
+marklog(int arena, Bptr hd, uvlong *marks)	/* mark every block on the log chain starting at hd; always returns 1 */
+{
+	Bptr bp, nb;
+	vlong bn;
+	Blk *b;
+
+	bp = (Bptr){-1, -1, -1};	/* NOTE(review): overwritten by the for-init on the next line */
+	for(bp = hd; bp.addr != -1; bp = nb){
+tracex("marklog", bp, arena, -1);
+		b = getblk(bp, 0);
+		bn = b->bp.addr/Blksz;
+		marks[bn/64] |= 1ULL<<(bn%64);	/* marks is dereferenced unconditionally, so it must not be nil here */
+		nb = b->logp;	/* save the chain pointer before the block is dropped */
+		dropblk(b);
+	}
+	return 1;
+}
+
+static int
+markdlist(Bptr hd, uvlong *marks)	/* mark each block on a deadlist chain and every address it records; always returns 1 */
+{
+	Bptr bp, nb;
+	vlong bn;
+	char *p;
+	Blk *b;
+
+	bp = (Bptr){-1, -1, -1};	/* NOTE(review): overwritten by the for-init on the next line */
+	for(bp = hd; bp.addr != -1; bp = nb){
+		b = getblk(bp, 0);
+		bn = b->bp.addr/Blksz;
+		marks[bn/64] |= 1ULL<<(bn%64);	/* marks is dereferenced unconditionally, so it must not be nil here */
+		for(p = b->data; p != b->data+b->logsz; p += 8){	/* NOTE(review): '!=' only terminates if logsz is a multiple of 8 -- confirm */
+			bn = UNPACK64(p);
+			bn /= Blksz;
+			marks[bn/64] |= 1ULL<<(bn%64);
+		}
+		nb = b->logp;
+		dropblk(b);
+	}
+	return 1;
+}
+
+static int
+markdlists(uvlong *marks)	/* mark fs->snapdl plus every deadlist found under the Kdlist prefix; always returns 0 */
+{
+	char pfx[1];
+	Dlist dl;
+	Scan s;
+
+	markdlist(fs->snapdl.hd, marks);
+	pfx[0] = Kdlist;
+	btnewscan(&s, pfx, 1);	/* scan the snap tree for keys starting with Kdlist */
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+		markdlist(dl.hd, marks);
+	}
+	btexit(&s);
+	return 0;
+}
+
+static void
+showsnapsz(int fd)	/* print per-snapshot usage, then hand the mark bitmap to prleak */
+{
+	char pfx[1], name[Keymax+1], *u;
+	int i, h, ndone;
+	uvlong *marks;
+	vlong ba, bn, used, total, *done;
+	double sz;
+	Limbo *l;
+	Sizes ts;
+	Tree *t;
+	Scan s;
+	Blk *b;
+
+	done = nil;
+	ndone = 0;
+	total = 0;
+	ba = fs->sb1->bp.addr/Blksz;
+	marks = mallocz(sizeof(vlong)*(ba/64 + 1), 1);	/* one bit per block up to the sb1 address */
+	if(marks == nil)
+		fprint(2, "not enough memory for leak detection\n");
+
+	/* RACY, may crash */
+	for(i = 0; marks != nil && i < 3; i++){	/* without a bitmap there is nothing to mark */
+		for(l = fs->limbo[i]; l != nil; l = l->next){
+			if(l->op == DFbp){
+				bn = ((Bfree*)l)->bp.addr/Blksz;
+				marks[bn/64] |= 1ULL<<(bn%64);
+			}else if(l->op == DFblk){
+				bn = ((Blk*)l)->bp.addr/Blksz;
+				marks[bn/64] |= 1ULL<<(bn%64);
+			}
+		}
+	}
+
+	b = getroot(&fs->snap, &h);
+	memset(&ts, 0, sizeof(Sizes));
+	marktree(&fs->snap, b, &ts, marks);
+	dropblk(b);
+
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		if(waserror()){	/* NOTE(review): t and b from the failed iteration are not released on this path */
+			fprint(fd, "moving on: %s\n", errmsg());
+			continue;
+		}
+		memcpy(name, s.kv.k+1, s.kv.nk-1);
+		name[s.kv.nk-1] = 0;
+		if((t = opensnap(name, nil)) == nil){
+			fprint(2, "invalid snap label %s\n", name);
+			poperror();	/* leave the error stack balanced before breaking out */
+			break;
+		}
+		fprint(fd, "snap %s [gen %lld..%lld]:\n", name, t->pred+1, t->gen);
+		for(i = 0; i < ndone; i++){
+			if(done[i] == t->gen){	/* several labels can name the same generation */
+				fprint(fd, "\tdup\n");
+				goto Next;
+			}
+		}
+		done = realloc(done, (ndone+1)*sizeof(vlong));	/* NOTE(review): result is not checked for nil */
+		done[ndone++] = t->gen;
+
+		b = getroot(t, &h);
+		memset(&ts, 0, sizeof(Sizes));
+		marktree(t, b, &ts, marks);
+
+		used = ts.datasz + ts.metasz;
+		sz = hscaled(used, &u);
+		fprint(fd, "\tused %lld (%.2f %s)\n", used, sz, u);
+		sz = hscaled(ts.datasz, &u);
+		fprint(fd, "\tdata %lld (%.2f %s)\n", ts.datasz, sz, u);
+		sz = hscaled(ts.metasz, &u);
+		fprint(fd, "\tmeta %lld (%.2f %s)\n", ts.metasz, sz, u);
+		sz = hscaled(ts.delqsz, &u);
+		fprint(fd, "\tdelq %lld (%.2f %s)\n", ts.delqsz, sz, u);
+		sz = hscaled(ts.clobsz, &u);
+		fprint(fd, "\tclob %lld (%.2f %s)\n", ts.clobsz, sz, u);
+		dropblk(b);
+		total += used;
+Next:
+		closesnap(t);
+		poperror();
+	}
+	btexit(&s);
+	if(marks != nil){
+		for(i = 0; i < fs->narena; i++)
+			marklog(i, fs->arenas[i].loghd, marks);
+		markdlists(marks);
+	}
+	sz = hscaled(total, &u);
+	fprint(fd, "total used: %lld (%.2f %s)\n", total, sz, u);
+	prleak(fd, marks);
+	free(marks);
+	free(done);	/* the generation list was previously leaked */
+}
+
+static void
+showdf(int fd, char **ap, int na)
+{
 	vlong size, used, free;
 	double hsize, hused, hfree;
+	char *us, *uu, *uf;
 	double pct;
 	Arena *a;
-	int i, us, uu, uf;
+	int i;
 
 	size = 0;
 	used = 0;
@@ -229,20 +537,16 @@
 		fprint(fd, "arena %d: %llx/%llx (%.2f%%)\n", i, a->used, a->size, 100*(double)a->used/(double)a->size);
 	}
 	free = size - used;
-	hsize = size;
-	hused = used;
-	hfree = free;
-	for(us = 0; us < nelem(units)-1 && hsize >= 500 ; us++)
-		hsize /= 1024;
-	for(uu = 0; uu < nelem(units)-1 && hused >= 500 ; uu++)
-		hused /= 1024;
-	for(uf = 0; uf < nelem(units)-1 && hfree >= 500 ; uf++)
-		hfree /= 1024;
+	hsize = hscaled(size, &us);
+	hused = hscaled(used, &uu);
+	hfree = hscaled(free, &uf);
 	pct = 100.0*(double)used/(double)size;
 	fprint(fd, "fill:\t%.2f%%\n", pct);
-	fprint(fd, "used:\t%lld (%.2f %s)\n", used, hused, units[uu]);
-	fprint(fd, "size:\t%lld (%.2f %s)\n", size, hsize, units[us]);
-	fprint(fd, "free:\t%lld (%.2f %s)\n", free, hfree, units[uf]);
+	fprint(fd, "used:\t%lld (%.2f %s)\n", used, hused, uu);
+	fprint(fd, "size:\t%lld (%.2f %s)\n", size, hsize, us);
+	fprint(fd, "free:\t%lld (%.2f %s)\n", free, hfree, uf);
+	if(na == 1 && strcmp(ap[0], "verbose") == 0)
+		showsnapsz(fd);
 }
 
 void
@@ -253,13 +557,14 @@
 	Conn *c;
 
 	for(c = fs->conns; c != nil; c = c->next){
-		fprint(fd, "fids:\n");
+		fprint(fd, "-- conn %p: fids --\n", c);
 		for(i = 0; i < Nfidtab; i++){
 			lock(&c->fidtablk[i]);
 			for(f = c->fidtab[i]; f != nil; f = f->next){
 				rlock(f->dent);
-				fprint(fd, "\tfid[%d] from %#zx: %d [refs=%ld, k=%K, qid=%Q]\n",
-					i, getmalloctag(f), f->fid, f->dent->ref, &f->dent->Key, f->dent->qid);
+				fprint(fd, "\tfid[%d] from %#zx: %d [refs=%ld, k=%K, qid=%Q m=%d, dmode:%d duid: %d, dgid: %d]\n",
+					i, getmalloctag(f), f->fid, f->dent->ref, &f->dent->Key, f->dent->qid,
+					f->mode, f->dmode, f->duid, f->dgid);
 				runlock(f->dent);
 			}
 			unlock(&c->fidtablk[i]);
@@ -338,6 +643,24 @@
 }
 
 static void
+showfree(int fd, char **, int)	/* console command: dump every arena's free ranges to fd */
+{
+	Arange *r;
+	Arena *a;
+	int i;
+
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);	/* the arena stays locked while its free tree is walked and printed */
+		fprint(fd, "arena %d %llx+%llx{\n", i, a->h0->bp.addr, a->size);
+		for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r))
+			fprint(fd, "\t%llx..%llx (%llx)\n", r->off, r->off+r->len, r->len);	/* start..end (length), all in hex */
+		fprint(fd, "}\n");
+		qunlock(a);
+	}
+}
+
+static void
 unreserve(int fd, char **ap, int)
 {
 	if(strcmp(ap[0], "on") == 0)
@@ -350,6 +673,20 @@
 }
 
 static void
+showbptr(int fd, char **ap, int na)	/* console command: call showbp on each address argument */
+{
+	Bptr bp;
+	int i;
+
+	for(i = 0; i < na; i++){
+		bp.addr = strtoll(ap[i], nil, 0);	/* base 0 lets the prefix pick the radix; the input is not validated */
+		bp.hash = -1;	/* only the address is known, so hash and gen are set to -1 */
+		bp.gen = -1;
+		showbp(fd, bp, 0);
+	}
+}
+
+static void
 help(int fd, char**, int)
 {
 	char *msg =
@@ -372,8 +709,8 @@
 
 Cmd cmdtab[] = {
 	/* admin */
-	{.name="check",		.sub=nil,	.minarg=0, .maxarg=0, .fn=fsckfs},
-	{.name="df",		.sub=nil, 	.minarg=0, .maxarg=0, .fn=showdf},
+	{.name="check",		.sub=nil,	.minarg=0, .maxarg=0, .fn=fsckfs, .epoch=1},
+	{.name="df",		.sub=nil, 	.minarg=0, .maxarg=1, .fn=showdf, .epoch=1},
 	{.name="halt",		.sub=nil,	.minarg=0, .maxarg=0, .fn=haltfs},
 	{.name="help",		.sub=nil,	.minarg=0, .maxarg=0, .fn=help},
 	{.name="permit",	.sub=nil,	.minarg=1, .maxarg=1, .fn=permflip},
@@ -383,10 +720,12 @@
 	{.name="users",		.sub=nil,	.minarg=0, .maxarg=1, .fn=refreshusers},
 
 	/* debugging */
+	{.name="show",		.sub="bp", 	.minarg=1, .maxarg=1, .fn=showbptr},
 	{.name="show",		.sub="fid",	.minarg=0, .maxarg=0, .fn=showfid},
-	{.name="show",		.sub="tree",	.minarg=0, .maxarg=1, .fn=showtree},
+	{.name="show",		.sub="tree",	.minarg=0, .maxarg=1, .fn=showtree, .epoch=1},
 	{.name="show",		.sub="users",	.minarg=0, .maxarg=0, .fn=showusers},
-	{.name="show",		.sub="bstate",	.minarg=0, .maxarg=0, .fn=showbstate},
+	{.name="show",		.sub="bstate",	.minarg=0, .maxarg=0, .fn=showbstate, .epoch=1},
+	{.name="show",		.sub="free",	.minarg=0, .maxarg=0, .fn=showfree},
 	{.name="debug",		.sub=nil,	.minarg=0, .maxarg=1, .fn=setdbg},
 	{.name="save",		.sub="trace",	.minarg=0, .maxarg=1, .fn=savetrace},
 	{.name=nil, .sub=nil},
@@ -404,11 +743,10 @@
 		fprint(fd, "gefs# ");
 		if((n = read(fd, buf, sizeof(buf)-1)) == -1)
 			break;
-		epochstart(tid);
 		buf[n] = 0;
 		nf = tokenize(buf, f, nelem(f));
 		if(nf == 0 || strlen(f[0]) == 0)
-			goto Next;
+			continue;
 		for(c = cmdtab; c->name != nil; c++){
 			ap = f;
 			na = nf;
@@ -424,7 +762,15 @@
 			}
 			if(na < c->minarg || na > c->maxarg)
 				continue;
-			c->fn(fd, ap, na);
+			if(c->epoch)
+				epochstart(tid);
+			if(!waserror()){
+				c->fn(fd, ap, na);
+				poperror();
+			}else
+				fprint(fd, "%s: %s\n", f[0], errmsg());
+			if(c->epoch)
+				epochend(tid);
 			break;
 		}
 		if(c->name == nil){
@@ -433,7 +779,5 @@
 				fprint(fd, " %s", f[i]);
 			fprint(fd, "'\n");
 		}
-Next:
-		epochend(tid);
 	}
 }
--- a/dat.h
+++ b/dat.h
@@ -10,6 +10,7 @@
 typedef struct Kvp	Kvp;
 typedef struct Xdir	Xdir;
 typedef struct Bptr	Bptr;
+typedef struct Limbo	Limbo;
 typedef struct Bfree	Bfree;
 typedef struct Scan	Scan;
 typedef struct Dent	Dent;
@@ -114,6 +115,7 @@
 	Bcached	= 1 << 3,
 	Bqueued	= 1 << 4,
 	Blimbo	= 1 << 5,
+	Bstatic	= 1 << 6,
 };
 
 enum {
@@ -325,6 +327,18 @@
 	AOrclose,
 };
 
+enum {	/* op codes passed to limbo(); presumably what Limbo.op holds -- confirm */
+	DFblk,	/* presumably a deferred free of a Blk -- confirm */
+	DFbp,	/* presumably a deferred free of a Bptr (struct Bfree) -- confirm */
+	DFmnt,	/* used by clunkmount() when a Mount's last reference is dropped */
+	DFtree,	/* presumably a deferred free of a Tree -- confirm */
+};
+
+struct Limbo {	/* header embedded as the first member of Tree, Bfree, Mount and Blk */
+	Limbo	*next;	/* link to another Limbo; presumably chains the fs->limbo[] lists -- confirm */
+	int	op;	/* presumably one of the DF* codes above -- confirm */
+};
+
 struct Bptr {
 	vlong	addr;
 	uvlong	hash;
@@ -377,7 +391,6 @@
 };
 
 struct Bucket {
-	Lock;
 	Blk	*b;
 };
 
@@ -413,6 +426,8 @@
 };
 
 struct Tree {
+	Limbo;
+
 	/* in-memory */
 	Lock	lk;
 	long	memref;	/* number of in-memory references to this */
@@ -431,19 +446,9 @@
 	vlong	base;	/* base snapshot */
 };
 
-enum {
-	DFblk,
-	DFmnt,
-	DFtree,
-};
-
 struct Bfree {
-	Bfree	*next;
-	int	op;
-	Mount	*m;
-	Tree	*t;
-	Blk	*b;
-	Bptr	bp;
+	Limbo;	/* embedded limbo header; replaces the old next/op members */
+	Bptr bp;	/* the block pointer this record refers to; presumably what freebp() defers -- confirm */
 };
 
 struct User {
@@ -517,6 +522,7 @@
 	QLock	synclk;
 	Rendez	syncrz;
 
+	QLock	mountlk;
 	Mount	*mounts;
 	Mount	*snapmnt;
 	Lock	connlk;
@@ -530,12 +536,11 @@
 	long	nworker;
 	long	epoch;
 	long	lepoch[32];
-	Bfree	*limbo[3];
+	Limbo	*limbo[3];
 	long	nlimbo;
 
 	Syncq	syncq[32];
 
-
 	int	fd;
 	long	rdonly;
 	int	noauth;
@@ -545,10 +550,6 @@
 	User	*users;
 	int	nusers;
 
-	/* open directory entries */
-	Lock	dtablk;
-	Dent	*dtab[Ndtab];
-
 	/* slow block io */
 	QLock	blklk[32];
 	
@@ -568,6 +569,11 @@
 	usize	ccount;
 	usize	cmax;
 
+	/* preallocated deferred frees */
+	QLock	bfreelk;
+	Rendez	bfreerz;
+	Bfree	*bfree;
+
 	RWLock	flushq[Nflushtab];
 	int	flushop[Nflushtab];
 
@@ -581,7 +587,6 @@
 	Avltree *free;
 	Blk	**queue;
 	int	nqueue;
-	int	lbidx;
 	Blk	*logbuf[2];	/* preallocated log pages */
 	Blk	*h0;		/* arena header */
 	Blk	*h1;		/* arena footer */
@@ -591,7 +596,8 @@
 	vlong	used;
 	vlong	reserve;
 	/* allocation log */
-	vlong	nlog;		/* logged since last copression */
+	vlong	lastlogsz;	/* size after last compression */
+	vlong	nlog;		/* number of blocks in log */
 	Bptr	loghd;		/* allocation log */
 	Blk	*logtl;		/* end of the log, open for writing */
 	Syncq	*sync;
@@ -623,10 +629,14 @@
 	char	gone;
 	char	trunc;
 
-	char	buf[Maxent];
+	union {
+		char	buf[Maxent];
+		void	*auth;
+	};
 };
 
 struct Mount {
+	Limbo;
 	Lock;
 	Mount	*next;
 	long	ref;
@@ -636,6 +646,10 @@
 
 	int	flag;
 
+	/* open directory entries */
+	Lock	dtablk;
+	Dent	*dtab[Ndtab];
+
 	/* snapshot history */
 	char	minutely[60][128];
 	char	hourly[24][128];
@@ -643,12 +657,19 @@
 
 struct Conn {
 	Conn	*next;
+
 	QLock	wrlk;
+
 	int	rfd;
 	int	wfd;
+	int	cfd;
 	int	iounit;
 	int	versioned;
+	int	authok;
+	int	hangup;
 
+	long	ref;
+
 	/* fid hash table */
 	Lock	fidtablk[Nfidtab];
 	Fid	*fidtab[Nfidtab];
@@ -655,7 +676,7 @@
 };
 
 struct Fid {
-	Lock;
+	RWLock;
 	Fid	*next;
 	/*
 	 * if opened with OEXEC, we want to use a snapshot,
@@ -664,8 +685,9 @@
 	 */
 	Mount	*mnt;
 	Scan	*scan;	/* in progres scan */
-	Dent	*dent;	/* (pqid, name) ref, modified on rename */	
-	void	*auth;
+	Dent	*dent;	/* (pqid, name) ref, modified on rename */
+	Dent	*dir;
+	Amsg	*rclose;	
 
 	u32int	fid;
 	vlong	qpath;
@@ -680,7 +702,7 @@
 	int	dmode;
 
 	char	permit;
-	char	rclose;
+	char	fromdump;
 };
 
 enum {
@@ -711,16 +733,12 @@
 };
 
 struct Blk {
+	Limbo;
 	/* cache entry */
 	Blk	*cnext;
 	Blk	*cprev;
 	Blk	*hnext;
 
-	/* Freelist entry */
-	Blk	*fnext;
-
-	long	flag;
-
 	/* serialized to disk in header */
 	short	type;	/* @0, for all */
 	union {
@@ -740,6 +758,7 @@
 	/* debug */
 	uintptr queued;
 	uintptr lasthold;
+	uintptr lasthold0;
 	uintptr lastdrop;
 	uintptr	enqueued;
 	uintptr cached;
@@ -749,6 +768,7 @@
 
 	Bptr	bp;
 	long	ref;
+	long	flag;
 	char	*data;
 	char	buf[Blksz];
 	vlong	magic;
--- a/dump.c
+++ b/dump.c
@@ -89,9 +89,10 @@
 		case Onop:
 		case Oinsert:
 			kv2dir(v, &d);
-			n = fmtprint(fmt, "[qid=(%llux,%lud,%d), %luo, t=%lld,%lld, l=%lld]",
-				d.qid.path, d.qid.vers, d.qid.type,
-				d.mode, d.atime, d.mtime, d.length);
+			n = fmtprint(fmt, "[qid=(%llux,%lud,%d), p=%luo, f=%llux, t=%lld,%lld, l=%lld, o=%d, g=%d m=%d]",
+				d.qid.path, d.qid.vers, d.qid.type, d.mode,
+				d.flag, d.atime, d.mtime, d.length,
+				d.uid, d.gid, d.muid);
 			break;
 		case Odelete:
 			n = fmtprint(fmt, "delete");
@@ -306,9 +307,11 @@
 		goto Show;
 	case Tlog:
 		fprint(fd, "log -- ");
+		fprint(fd, "logsz: %d, logh: %lld, logp: %B\n", b->logsz, b->logh, b->logp);
 		goto Show;
 	case Tdlist:
 		fprint(fd, "dlist -- ");
+		fprint(fd, "logsz: %d, logh: %lld, logp: %B\n", b->logsz, b->logh, b->logp);
 		goto Show;
 	case Tdat:
 		fprint(fd, "dat -- ");
--- /dev/null
+++ b/env.rc
@@ -1,0 +1,108 @@
+fn r{mk all && 6.out -dA -m 64 -r $user -f $testdev}	# rebuild, then run 6.out on $testdev with -r $user
+fn t{mk all && 6.out -A -m 512 -f $testdev}	# rebuild, then run 6.out on $testdev
+fn d{mk all && 6.out -dA -m 512 -f $testdev}	# same as t, with -d added
+fn k{kill 6.out | rc}	# kill prints the commands; piping them to rc runs them
+fn leak { g `{echo $1 | sed s/0x//g}  /tmp/trace | grep -v syncblk | tail -n 3}	# last 3 non-syncblk trace lines matching address $1 (0x stripped)
+fn m{mount -c /srv/gefs /n/gefs $*}	# mount gefs on /n/gefs; extra arguments are passed to mount
+fn s{mount -c /srv/gefs /n/gefs dump}	# mount with the attach name dump
+fn tf{	# touch files named 1..N under /n/gefs; N defaults to 100
+	arg=100
+	if(! ~ $#* 0)
+		arg=$*
+	touch /n/gefs/`{seq $arg}
+}
+fn cf{	# write /n/gefs/test from /dev/zero with dd
+	dd -if /dev/zero -of /n/gefs/test -bs 1kk -count 100
+}
+fn rf{	# remove files named 1..N under /n/gefs, echoing each; N defaults to 100
+	arg=100
+	if(! ~ $#* 0)
+		arg=$*
+	for(f in `{seq $arg}) {
+		echo $f
+		rm /n/gefs/$f
+	}
+}
+
+fn iob{	# compile and link iobench.c, then run it on test.fs
+	6c iobench.c && 6l -o 6.iobench iobench.6 && 6.iobench -o rand test.fs
+}
+
+fn a{	# build fs.acid and start acid with it loaded
+	mk fs.acid && acid -l fs.acid $*
+}
+
+fn rps {	# typeset gefs.ms into gefs.ps and view it with page
+	mac=(-ms)
+	if(~ gefs comp utf 9 contents) mac=(-ms -mnihongo)
+	{ echo .FP lucidasans; cat gefs.ms } | pic | tbl | eqn | 
+		troff $mac | lp -dstdout > gefs.ps
+	cleanps gefs.ps
+	page -p150 gefs.ps
+}
+fn tt {	# kill 6.out, rebuild, then run mkgefs.rc and the fsbench program
+	kill 6.out|rc
+	@ {
+		cd /usr/ori/src/gefs/ && mk all
+	} && @ {
+		cd /usr/ori/src/gefs/test && ./mkgefs.rc /dev/sdO0/data && 6c fsbench.c && 6l fsbench.6 && 6.out /n/gefs
+	}
+}
+
+fn tg {@{	# in a subshell: unpack the go bootstrap into /n/gefs and run all.rc under alarm 1200
+	GOROOT=/n/gefs/go
+	GOROOT_BOOTSTRAP=/n/gefs/go-plan9-amd64-bootstrap
+	rfork ne
+	m
+	cd /n/gefs
+	mkdir go
+	mkdir tmp
+	bunzip2 -c /tmp/go1.17.13-plan9-amd64-bootstrap.tbz | tar x
+	dircp go-plan9-amd64-bootstrap go
+	bind -c tmp /tmp
+	cd go/src
+	alarm 1200 ./all.rc
+}}
+
+
+fn t9 {@{	# in a subshell: clone plan9front into /n/gefs if missing and build it, logging to /tmp/log
+	rfork ne
+	m
+	cd /n/gefs
+	. /sys/lib/rootstub
+	if(! test -e plan9front)
+		git/clone /dist/plan9front
+	bind -c $objtype/lib /$objtype/lib
+	bind -c plan9front/sys/include /sys/include
+	bind -c tmp /tmp
+	cd plan9front/sys/src
+	mk clean >> /tmp/log
+	mk all >> /tmp/log
+}}
+
+fn tsl {@{	# in a subshell: clone gefs into /n/gefs if missing, then 1000 rounds of mk clean, mk all, sleep 15
+	rfork ne
+	m
+	cd /n/gefs
+	if(! test -e gefs)
+		git/clone $home/src/gefs
+	cd gefs
+	for(i in `{seq 1000}){
+		echo @@ $i
+		mk clean > /dev/null
+		mk all > /dev/null
+		sleep 15
+	}
+}}
+
+fn tb {@{	# in a subshell: 1000 rounds of removing x and rewriting it with dd, one second apart
+	rfork ne
+	m
+	cd /n/gefs
+	for(i in `{seq 1000}){
+		echo @@ $i
+		rm -f x
+		dd -if /dev/zero -of x -bs 15k -count 1
+		sleep 1
+	}
+}}
--- a/fns.h
+++ b/fns.h
@@ -35,8 +35,9 @@
 
 void*	emalloc(usize, int);
 
-Blk*	newblk(Tree *, int, vlong);
-Blk*	dupblk(Tree *, Blk*);
+Blk*	newdblk(Tree*, vlong, int);
+Blk*	newblk(Tree*, int);
+Blk*	dupblk(Tree*, Blk*);
 Blk*	getroot(Tree*, int*);
 Blk*	getblk(Bptr, int);
 Blk*	holdblk(Blk*);
@@ -59,12 +60,12 @@
 void	epochend(int);
 void	epochwait(void);
 void	epochclean(void);
-void	limbo(Bfree*);
-void	freeblk(Tree*, Blk*, Bptr);
+void	limbo(int op, Limbo*);
+void	freeblk(Tree*, Blk*);
+void	freebp(Tree*, Bptr);
 int	logbarrier(Arena *, vlong);
 void	dlappend(Dlist *dl, Bptr);
 void	killblk(Tree*, Bptr);
-void	blkdealloc(vlong);
 ushort	blkfill(Blk*);
 uvlong	blkhash(Blk*);
 uvlong	bufhash(void*, usize);
@@ -86,6 +87,7 @@
 void	loadarena(Arena*, Bptr);
 void	loadfs(char*);
 void	loadlog(Arena*, Bptr);
+void	flushlog(Arena*);
 int	scandead(Dlist*, int, void(*)(Bptr, void*), void*);
 int	endfs(void);
 void	compresslog(Arena*);
@@ -92,7 +94,8 @@
 void	dlsync(void);
 void	setval(Blk*, Kvp*);
 
-Conn*	newconn(int, int);
+Conn*	newconn(int, int, int);
+void	putconn(Conn*);
 
 int	walk1(Tree*, vlong, char*, Qid*, vlong*);
 void	loadusers(int, Tree*);
@@ -106,9 +109,8 @@
 int	btnext(Scan*, Kvp*);
 void	btexit(Scan*);
 
-int	checkflag(Blk *b, int);
-void	setflag(Blk *b, int);
-void	clrflag(Blk *b, int);
+int	checkflag(Blk *b, int, int);
+void	setflag(Blk *b, int, int);
 
 char*	estrdup(char*);
 
--- a/fs.c
+++ b/fs.c
@@ -12,6 +12,8 @@
 static void	rerror(Fmsg*, char*, ...);
 static void	clunkfid(Conn*, Fid*, Amsg**);
 
+static void	authfree(AuthRpc*);
+
 int
 walk1(Tree *t, vlong up, char *name, Qid *qid, vlong *len)
 {
@@ -36,13 +38,33 @@
 }
 
 static void
+touch(Dent *de, Msg *msg)
+{
+	msg->op = Owstat;
+	msg->v = "\0";	/* one zero byte; presumably an empty Owstat flag set -- confirm */
+	msg->nv = 1;
+	wlock(de);
+	de->qid.vers += 1;
+	msg->k = de->k;
+	msg->nk = de->nk;
+	wunlock(de);
+}
+
+static void
 wrbarrier(void)
 {
+	tracev("barrier", fs->qgen);	/* the trace sees the generation before it is bumped */
+	aincv(&fs->qgen, 1);	/* add one to fs->qgen; presumably an atomic increment -- confirm */
+}
+
+static void
+wrwait(void)
+{
 	Qent qe;
 	int i;
-	
+
+	tracev("wrwait", fs->qgen);
 	aincv(&fs->qgen, 1);
-	tracev("barrier", fs->qgen);
 	fs->syncing = fs->nsyncers;
 	for(i = 0; i < fs->nsyncers; i++){
 		qe.op = Qfence;
@@ -66,7 +88,6 @@
 	Dlist dl;
 	int i;
 
-
 	qlock(&fs->synclk);
 	if(waserror()){
 		fprint(2, "failed to sync: %s\n", errmsg());
@@ -111,16 +132,13 @@
 		 * block out synchronously, or it may
 		 * get reused.
 		 */
-		logbarrier(a, fs->qgen);
-		finalize(a->logtl);
-		syncblk(a->logtl);
+		logbarrier(a, agetv(&fs->qgen));
+		flushlog(a);
 
 		packarena(a->h0->data, Blksz, a);
 		packarena(a->h1->data, Blksz, a);
 		finalize(a->h0);
 		finalize(a->h1);
-		setflag(a->h0, Bdirty);
-		setflag(a->h1, Bdirty);
 		fs->arenabp[i] = a->h0->bp;
 		qunlock(a);
 	}
@@ -150,10 +168,10 @@
 	 * get synced after so that we can use them next
 	 * time around.
          */
-	qlock(&fs->mutlk);
 	tracem("supers");
-	syncblk(fs->sb0);
-	syncblk(fs->sb1);
+	enqueue(fs->sb0);
+	enqueue(fs->sb1);
+	wrbarrier();
 
 	/*
 	 * pass 3: sync block footers; if we crash here,
@@ -165,11 +183,13 @@
 		enqueue(fs->arenas[i].h1);
 
 	/*
-	 * Pass 4: clean up the old snap tree's deadlist
+	 * Pass 4: clean up the old snap tree's deadlist.
+	 * we need to wait for all the new data to hit disk
+	 * before we can free anything, otherwise it gets
+	 * clobbered.
 	 */
 	tracem("snapdl");
-	wrbarrier();
-	qunlock(&fs->mutlk);
+	wrwait();
 	freedl(&dl, 1);
 	qunlock(&fs->synclk);
 	tracem("synced");
@@ -210,7 +230,7 @@
 			return;
 		}
 		if(t->nlbl == 1 && t->nref <= 1 && t->succ == -1){
-			aincl(&t->memref, 1);
+			ainc(&t->memref);
 			*tp = t;
 		}
 		delsnap(t, t->succ, a->old);
@@ -244,7 +264,7 @@
 	d->qid.path = Qdump;
 	d->qid.vers = fs->nextgen;
 	d->qid.type = QTDIR;
-	d->mode = 0555;
+	d->mode = DMDIR|0555;
 	d->atime = 0;
 	d->mtime = 0;
 	d->length = 0;
@@ -326,38 +346,23 @@
 {
 	char buf[ERRMAX];
 	va_list ap;
-	Amsg *a;
-	Fid *f;
-	int i;
 
+	c->hangup = 1;
+
 	va_start(ap, fmt);
 	vsnprint(buf, sizeof(buf), fmt, ap);
 	va_end(ap);
+
 	fprint(2, "hangup: %s\n", buf);
-	close(c->rfd);
-	close(c->wfd);
-	for(i = 0; i < Nfidtab; i++){
-		lock(&c->fidtablk[i]);
-		for(f = c->fidtab[i]; f != nil; f = f->next){
-			lock(f);
-			if(waserror()){
-				unlock(f);
-				continue;
-			}
-			a = nil;
-			clunkfid(c, f, &a);
-			unlock(f);
-			if(a != nil)
-				chsend(fs->admchan, a);
-			nexterror();
-		}
-		unlock(&c->fidtablk[i]);
-	}
+
+	if(c->cfd >= 0)
+		hangup(c->cfd);
 }
 
 static void
 respond(Fmsg *m, Fcall *r)
 {
+	Conn *c;
 	RWLock *lk;
 	uchar buf[Max9p+IOHDRSZ];
 	int w, n;
@@ -367,11 +372,12 @@
 	assert(m->type+1 == r->type || r->type == Rerror);
 	if((n = convS2M(r, buf, sizeof(buf))) == 0)
 		abort();
-	qlock(&m->conn->wrlk);
-	w = write(m->conn->wfd, buf, n);
-	qunlock(&m->conn->wrlk);
+	c = m->conn;
+	qlock(&c->wrlk);
+	w = c->hangup? n: write(c->wfd, buf, n);
+	qunlock(&c->wrlk);
 	if(w != n)
-		fshangup(m->conn, Eio);
+		fshangup(c, Eio);
 	if(m->type == Tflush){
 		lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
 		wunlock(lk);
@@ -380,6 +386,7 @@
 		runlock(lk);
 	}
 	free(m);
+	putconn(c);
 }
 
 static void
@@ -432,7 +439,7 @@
 static int
 readb(Tree *t, Fid *f, char *d, vlong o, vlong n, vlong sz)
 {
-	char buf[17], kvbuf[17+32];
+	char buf[Offksz], kvbuf[Offksz+32];
 	vlong fb, fo;
 	Bptr bp;
 	Blk *b;
@@ -471,6 +478,7 @@
 	char buf[Kvmax];
 	vlong fb, fo;
 	Blk *b, *t;
+	int seq;
 	Tree *r;
 	Bptr bp;
 	Kvp kv;
@@ -482,9 +490,14 @@
 	PACK64(m->k+1, f->qpath);
 	PACK64(m->k+9, fb);
 
-	b = newblk(f->mnt->root, Tdat, f->qpath);
+	if(fo+n >= Blksz)
+		seq = 1;
+	else
+		seq = 0;
+	b = newdblk(f->mnt->root, f->qpath, seq);
 	t = nil;
 	r = f->mnt->root;
+tracex("writeb", b->bp, f->qpath, o);
 	if(btlookup(r, m, &kv, buf, sizeof(buf))){
 		bp = unpackbp(kv.v, kv.nv);
 		if(fb < sz && (fo != 0 || n != Blksz)){
@@ -511,7 +524,7 @@
 }
 
 static Dent*
-getdent(vlong pqid, Xdir *d)
+getdent(Mount *mnt, vlong pqid, Xdir *d)
 {
 	Dent *de;
 	char *e;
@@ -518,8 +531,8 @@
 	u32int h;
 
 	h = ihash(d->qid.path) % Ndtab;
-	lock(&fs->dtablk);
-	for(de = fs->dtab[h]; de != nil; de = de->next){
+	lock(&mnt->dtablk);
+	for(de = mnt->dtab[h]; de != nil; de = de->next){
 		if(de->qid.path == d->qid.path){
 			ainc(&de->ref);
 			goto Out;
@@ -542,11 +555,11 @@
 	de->k = de->buf;
 	de->nk = e - de->buf;
 	de->name = de->buf + 11;
-	de->next = fs->dtab[h];
-	fs->dtab[h] = de;
+	de->next = mnt->dtab[h];
+	mnt->dtab[h] = de;
 
 Out:
-	unlock(&fs->dtablk);
+	unlock(&mnt->dtablk);
 	return de;
 }
 
@@ -607,6 +620,7 @@
 		return fs->snapmnt;
 	}
 
+	qlock(&fs->mountlk);
 	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
 		if(strcmp(name, mnt->name) == 0){
 			ainc(&mnt->ref);
@@ -617,6 +631,7 @@
 	if((mnt = mallocz(sizeof(*mnt), 1)) == nil)
 		error(Enomem);
 	if(waserror()){
+		qunlock(&fs->mountlk);
 		free(mnt);
 		nexterror();
 	}
@@ -632,6 +647,7 @@
 	poperror();
 
 Out:
+	qunlock(&fs->mountlk);
 	return mnt;
 }
 
@@ -639,26 +655,24 @@
 clunkmount(Mount *mnt)
 {
 	Mount *me, **p;
-	Bfree *f;
 
 	if(mnt == nil)
 		return;
 	if(adec(&mnt->ref) == 0){
+		qlock(&fs->mountlk);
 		for(p = &fs->mounts; (me = *p) != nil; p = &me->next){
 			if(me == mnt)
 				break;
 		}
 		assert(me != nil);
-		f = emalloc(sizeof(Bfree), 0);
-		f->op = DFmnt;
-		f->m = mnt;
 		*p = me->next;
-		limbo(f);
+		limbo(DFmnt, me);
+		qunlock(&fs->mountlk);
 	}
 }
 
 static void
-clunkdent(Dent *de)
+clunkdent(Mount *mnt, Dent *de)
 {
 	Dent *e, **pe;
 	u32int h;
@@ -665,16 +679,19 @@
 
 	if(de == nil)
 		return;
-	if(de->qid.type & QTAUTH && adec(&de->ref) == 0){
-		free(de);
+	if(de->qid.type & QTAUTH){
+		if(adec(&de->ref) == 0){
+			authfree(de->auth);
+			free(de);
+		}
 		return;
 	}
-	lock(&fs->dtablk);
+	lock(&mnt->dtablk);
 	if(adec(&de->ref) != 0)
 		goto Out;
 	h = ihash(de->qid.path) % Ndtab;
-	pe = &fs->dtab[h];
-	for(e = fs->dtab[h]; e != nil; e = e->next){
+	pe = &mnt->dtab[h];
+	for(e = mnt->dtab[h]; e != nil; e = e->next){
 		if(e == de)
 			break;
 		pe = &e->next;
@@ -683,7 +700,7 @@
 	*pe = e->next;
 	free(de);
 Out:
-	unlock(&fs->dtablk);
+	unlock(&mnt->dtablk);
 }
 
 static Fid*
@@ -708,8 +725,9 @@
 {
 	if(adec(&f->ref) != 0)
 		return;
+	clunkdent(f->mnt, f->dent);
+	clunkdent(f->mnt, f->dir);
 	clunkmount(f->mnt);
-	clunkdent(f->dent);
 	free(f);
 }
 
@@ -724,6 +742,7 @@
 		return nil;
 
 	*n = *f;
+	memset(&n->RWLock, 0, sizeof(RWLock));
 	n->fid = new;
 	n->ref = 2; /* one for dup, one for clunk */
 	n->mode = -1;
@@ -747,10 +766,16 @@
 	if(n->mnt != nil)
 		ainc(&n->mnt->ref);
 	ainc(&n->dent->ref);
+	ainc(&n->dir->ref);
 	setmalloctag(n, getcallerpc(&c));
 	return n;
 }
 
+/*
+ * clunkfid() removes a fid from the
+ * connection fid tab and drops reference.
+ * Fid must be locked.
+ */
 static void
 clunkfid(Conn *c, Fid *fid, Amsg **ao)
 {
@@ -757,6 +782,8 @@
 	Fid *f, **pf;
 	u32int h;
 
+	assert(!canwlock(fid));
+
 	h = ihash(fid->fid) % Nfidtab;
 	lock(&c->fidtablk[h]);
 	pf = &c->fidtab[h];
@@ -768,21 +795,27 @@
 		}
 		pf = &f->next;
 	}
+	unlock(&c->fidtablk[h]);
+
 	assert(f != nil);
 	if(f->scan != nil){
 		free(f->scan);
 		f->scan = nil;
 	}
-	if(f->rclose){
+
+	if((*ao = f->rclose) != nil){
+		f->rclose = nil;
+
 		qlock(&f->dent->trunclk);
 		f->dent->trunc = 1;
 		qunlock(&f->dent->trunclk);
+
 		wlock(f->dent);
 		f->dent->gone = 1;
 		wunlock(f->dent);
-		*ao = emalloc(sizeof(Amsg), 1);
-		aincl(&f->dent->ref, 1);
-		aincl(&f->mnt->ref, 1);
+
+		ainc(&f->dent->ref);
+		ainc(&f->mnt->ref);
 		(*ao)->op = AOrclose;
 		(*ao)->mnt = f->mnt;
 		(*ao)->qpath = f->qpath;
@@ -790,9 +823,23 @@
 		(*ao)->end = f->dent->length;
 		(*ao)->dent = f->dent;
 	}
-	unlock(&c->fidtablk[h]);
 }
 
+static void
+freeamsg(Amsg *a)
+{
+	if(a != nil){
+		/*
+		 * rclose and clear messages reference a dent and a mount: drop both.
+		 */
+		if(a->op == AOrclose || a->op == AOclear){
+			clunkdent(a->mnt, a->dent);
+			clunkmount(a->mnt);
+		}
+		free(a);
+	}
+}
+
 static int
 readmsg(Conn *c, Fmsg **pm)
 {
@@ -821,6 +868,7 @@
 		free(m);
 		return -1;
 	}
+	ainc(&c->ref);
 	m->conn = c;
 	m->sz = sz;
 	PBIT32(m->buf, sz);
@@ -853,7 +901,7 @@
 	respond(m, &r);
 }
 
-void
+static void
 authfree(AuthRpc *auth)
 {
 	AuthRpc *rpc;
@@ -894,7 +942,7 @@
 	AuthRpc *rpc;
 	User *u;
 
-	if((rpc = f->auth) == nil)
+	if((f->dir->qid.type & QTAUTH) == 0 || (rpc = f->dir->auth) == nil)
 		error(Etype);
 
 	switch(auth_rpc(rpc, "read", nil, 0)){
@@ -930,7 +978,7 @@
 {
 	AuthRpc *rpc;
 
-	if((rpc = f->auth) == nil)
+	if((f->dir->qid.type & QTAUTH) == 0 || (rpc = f->dir->auth) == nil)
 		error(Etype);
 	if(auth_rpc(rpc, "write", data, count) != ARok)
 		error(Ebotch);
@@ -944,7 +992,7 @@
 {
 	Dent *de;
 	Fcall r;
-	Fid f;
+	Fid f, *nf;
 
 	if(fs->noauth){
 		rerror(m, Eauth);
@@ -959,6 +1007,11 @@
 		return;
 	}
 	memset(de, 0, sizeof(Dent));
+	de->auth = authnew();
+	if(de->auth == nil){
+		rerror(m, errmsg());
+		return;
+	}
 	de->ref = 0;
 	de->qid.type = QTAUTH;
 	de->qid.path = aincv(&fs->nextqid, 1);
@@ -975,13 +1028,15 @@
 	f.mode = -1;
 	f.iounit = m->conn->iounit;
 	f.dent = de;
+	f.dir = de;
 	f.uid = -1;
 	f.duid = -1;
 	f.dgid = -1;
 	f.dmode = 0600;
-	f.auth = authnew();
-	if(dupfid(m->conn, m->afid, &f) == nil){
+	nf = dupfid(m->conn, m->afid, &f);
+	if(nf == nil){
 		rerror(m, Efid);
+		authfree(de->auth);
 		free(de);
 		return;
 	}
@@ -988,6 +1043,7 @@
 	r.type = Rauth;
 	r.aqid = de->qid;
 	respond(m, &r);
+	putfid(nf);
 }
 
 static int
@@ -1065,7 +1121,7 @@
 			if((m & (fmode>>3)) == m)
 				return 0;
 	}
-	if(m & fmode) {
+	if((m & fmode) == m) {
 		if((fmode & DMDIR) && (m == DMEXEC))
 			return 0;
 		if(!ingroup(f->uid, nogroupid))
@@ -1087,7 +1143,7 @@
 	Xdir d;
 	Kvp kv;
 	Key dk;
-	Fid f, *af;
+	Fid f, *af, *nf;
 	int uid;
 
 	de = nil;
@@ -1128,10 +1184,15 @@
 		putfid(af);
 		if(af->uid != uid)
 			error(Ebadu);
-	}else if(!fs->noauth && strcmp(m->uname, "none") != 0)
-		error(Ebadu);
+		m->conn->authok = 1;	/* none attach allowed now */
+	}else if(!fs->noauth){
+		if(uid != noneid || !m->conn->authok)
+			error(Ebadu);
+	}
 
 	if(strcmp(m->aname, "dump") == 0){
+		if(uid == noneid)
+			error(Eperm);
 		memset(&d, 0, sizeof(d));
 		filldumpdir(&d);
 	}else{
@@ -1144,7 +1205,7 @@
 			error(Enosnap);
 		kv2dir(&kv, &d);
 	}
-	de = getdent(-1, &d);
+	de = getdent(mnt, -1, &d);
 	memset(&f, 0, sizeof(Fid));
 	f.fid = NOFID;
 	f.mnt = mnt;
@@ -1153,6 +1214,7 @@
 	f.mode = -1;
 	f.iounit = m->conn->iounit;
 	f.dent = de;
+	f.dir = de;
 	f.uid = uid;
 	f.duid = d.uid;
 	f.dgid = d.gid;
@@ -1162,44 +1224,58 @@
 			error(Eperm);
 		f.permit = 1;
 	}
-	if(dupfid(m->conn, m->fid, &f) == nil)
+	if(strcmp(aname, "dump") == 0)
+		f.fromdump = 1;
+	nf = dupfid(m->conn, m->fid, &f);
+	if(nf == nil)
 		error(Efid);
-
 	r.type = Rattach;
 	r.qid = d.qid;
 	respond(m, &r);
+	putfid(nf);
 	poperror();
 
 
-Err:	clunkdent(de);
+Err:	clunkdent(mnt, de);
 	clunkmount(mnt);
 }
 
 static int
-findparent(Tree *t, Fid *f, vlong *qpath, char **name, char *buf, int nbuf)
+findparent(Tree *t, vlong up, vlong *qpath, char **name, char *buf, int nbuf)
 {
 	char *p, kbuf[Keymax];
 	Kvp kv;
 	Key k;
 
-	p = packsuper(kbuf, sizeof(kbuf), f->pqpath);
+	p = packsuper(kbuf, sizeof(kbuf), up);	/* the key is built from up directly rather than from a Fid */
 	k.k = kbuf;
 	k.nk = p - kbuf;
 	if(!btlookup(t, &k, &kv, buf, nbuf))
-		return 0;
+		error(Esrch);	/* a failed lookup now raises; the function only ever returns 1 */
 	*name = unpackdkey(kv.v, kv.nv, qpath);
 	return 1;
 }
 
 static void
+dkey(Key *k, vlong up, char *name, char *buf, int nbuf)
+{
+	/*
+	 * pack the key for (up, name) into buf; its length runs to packdkey's return.
+	 */
+	k->nk = packdkey(buf, nbuf, up, name) - buf;
+	k->k = buf;
+}
+
+static void
 fswalk(Fmsg *m)
 {
-	char *p, *name, kbuf[Maxent], kvbuf[Kvmax];
-	int duid, dgid, dmode;
-	vlong up, prev;
+	char *name, kbuf[Maxent], kvbuf[Kvmax];
+	int duid, dgid, dmode, duped;
+	vlong up, upup, prev;
+	Dent *dent, *dir;
 	Fid *o, *f;
-	Dent *dent;
 	Mount *mnt;
+	Amsg *ao;
 	Tree *t;
 	Fcall r;
 	Xdir d;
@@ -1211,8 +1287,10 @@
 		rerror(m, Enofid);
 		return;
 	}
+	rlock(o);
 	if(waserror()){
 		rerror(m, errmsg());
+		runlock(o);
 		putfid(o);
 		return;
 	}
@@ -1220,7 +1298,7 @@
 		error(Einuse);
 	t = o->mnt->root;
 	mnt = o->mnt;
-	up = o->qpath;
+	up = o->pqpath;
 	prev = o->qpath;
 	rlock(o->dent);
 	d = *o->dent;
@@ -1234,42 +1312,31 @@
 		if(strlen(name) > Maxname)
 			error(Elength);
 		if(fsaccess(o, d.mode, d.uid, d.gid, DMEXEC) != 0)
-			error(Eperm);
-		if(d.qid.path == Qdump){
-			if((mnt = getmount(m->wname[i])) == nil)
-				error(Esrch);
-			if(waserror()){
-				clunkmount(mnt);
-				nexterror();
+			break;
+		if(strcmp(name, "..") == 0){
+			if(up == -1 && o->fromdump){
+				mnt = fs->snapmnt;
+				filldumpdir(&d);
+				prev = -1ULL;
+				up = -1ULL;
+				r.wqid[i] = d.qid;
+				continue;
 			}
+			findparent(t, up, &prev, &name, kbuf, sizeof(kbuf));
+		}else if(d.qid.path == Qdump){
+			mnt = getmount(m->wname[i]);
+			name = "";
+			prev = -1ULL;
 			t = mnt->root;
-			p = packdkey(kbuf, sizeof(kbuf), -1ULL, "");
-			poperror();
-		}else{
-			if(strcmp(m->wname[i], "..") == 0){
-				if(o->pqpath == Qdump){
-					mnt = fs->snapmnt;
-					filldumpdir(&d);
-					duid = d.uid;
-					dgid = d.gid;
-					dmode = d.mode;
-					goto Found;
-				}
-				if(!findparent(t, o, &prev, &name, kbuf, sizeof(kbuf)))
-					error(Esrch);
-			}
-			p = packdkey(kbuf, sizeof(kbuf), prev, name);
 		}
+		up = prev;
 		duid = d.uid;
 		dgid = d.gid;
 		dmode = d.mode;
-		k.k = kbuf;
-		k.nk = p - kbuf;
+		dkey(&k, prev, name, kbuf, sizeof(kbuf));
 		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
 			break;
 		kv2dir(&kv, &d);
-Found:
-		up = prev;
 		prev = d.qid.path;
 		r.wqid[i] = d.qid;
 	}
@@ -1277,41 +1344,59 @@
 	if(i == 0 && m->nwname != 0)
 		error(Esrch);
 	f = o;
+	duped = 0;
 	if(m->fid != m->newfid && i == m->nwname){
 		if((f = dupfid(m->conn, m->newfid, o)) == nil)
 			error(Efid);
-		putfid(o);
+		duped = 1;
 	}
+	runlock(o);
+
 	if(i > 0 && i == m->nwname){
-		lock(f);
+		wlock(f);
+		ao = nil;
 		if(waserror()){
-			if(f != o)
-				clunkfid(m->conn, f, nil);
-			unlock(f);
+			if(duped)
+				clunkfid(m->conn, f, &ao);
+			assert(ao == nil);
+			wunlock(f);
 			nexterror();
 		}
-		if(up == Qdump)
-			dent = getdent(-1ULL, &d);
-		else
-			dent = getdent(up, &d);
+		if(up == -1ULL){
+			/* the root contains itself, I guess */
+			dent = getdent(mnt, up, &d);
+			dir = getdent(mnt, up, &d);
+		}else{
+			dent = getdent(mnt, up, &d);
+			findparent(t, up, &upup, &name, kbuf, sizeof(kbuf));
+			dkey(&k, upup, name, kbuf, sizeof(kbuf));
+			if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+				broke("missing parent");
+			kv2dir(&kv, &d);
+			dir = getdent(mnt, upup, &d);
+		}
+		clunkdent(f->mnt, f->dent);
+		clunkdent(f->mnt, f->dir);
 		if(mnt != f->mnt){
 			clunkmount(f->mnt);
 			ainc(&mnt->ref);
 			f->mnt = mnt;
 		}
-		clunkdent(f->dent);
 		f->qpath = r.wqid[i-1].path;
 		f->pqpath = up;
 		f->dent = dent;
+		f->dir = dir;
 		f->duid = duid;
 		f->dgid = dgid;
 		f->dmode = dmode;
 		poperror();
-		unlock(f);
+		wunlock(f);
 	}
+	if(duped)
+		putfid(o);
+	putfid(f);
 	respond(m, &r);
 	poperror();
-	putfid(f);
 }
 
 static void
@@ -1353,7 +1438,7 @@
 	Qid old;
 	Fcall r;
 	Dent *de;
-	Msg mb[3];
+	Msg mb[4];
 	Xdir n;
 	Dir d;
 	Tree *t;
@@ -1372,7 +1457,7 @@
 	wlock(de);
 	if(waserror()){
 		rerror(m, errmsg());
-		free(*ao);
+		freeamsg(*ao);
 		*ao = nil;
 		goto Err;
 	}
@@ -1418,8 +1503,8 @@
 				qlock(&de->trunclk);
 				de->trunc = 1;
 				qunlock(&de->trunclk);
-				aincl(&de->ref, 1);
-				aincl(&f->mnt->ref, 1);
+				ainc(&de->ref);
+				ainc(&f->mnt->ref);
 				(*ao)->op = AOclear;
 				(*ao)->mnt = f->mnt;
 				(*ao)->qpath = f->qpath;
@@ -1536,6 +1621,7 @@
 			mb[nm].nv = mb[nm-1].nk;
 			nm++;
 		}
+		touch(f->dir, &mb[nm++]);
 	}else{
 		opbuf[0] = op;
 		mb[nm].op = Owstat;
@@ -1570,9 +1656,9 @@
 		rerror(m, Enofid);
 		return;
 	}
-	lock(f);
+	wlock(f);
 	clunkfid(m->conn, f, ao);
-	unlock(f);
+	wunlock(f);
 	r.type = Rclunk;
 	respond(m, &r);
 	putfid(f);
@@ -1582,14 +1668,14 @@
 fscreate(Fmsg *m)
 {
 	char *p, *e, buf[Kvmax], upkbuf[Keymax], upvbuf[Inlmax];
+	int nm, duid, dgid, dmode;
 	Dent *de;
 	vlong oldlen;
 	Qid old;
 	Fcall r;
-	Msg mb[2];
+	Msg mb[3];
 	Fid *f;
 	Xdir d;
-	int nm;
 
 	if((e = okname(m->name)) != nil){
 		rerror(m, e);
@@ -1603,7 +1689,7 @@
 		rerror(m, Enofid);
 		return;
 	}
-	lock(f);
+	wlock(f);
 
 	if(waserror()){
 		rerror(m, errmsg());
@@ -1626,8 +1712,9 @@
 		runlock(de);
 		goto Out;
 	}
-
-	d.gid = de->gid;
+	duid = de->uid;
+	dgid = de->gid;
+	dmode = de->mode;
 	runlock(de);
 
 	nm = 0;
@@ -1652,6 +1739,7 @@
 	d.mtime = d.atime;
 	d.length = 0;
 	d.uid = f->uid;
+	d.gid = dgid;
 	d.muid = f->uid;
 
 	mb[nm].op = Oinsert;
@@ -1670,16 +1758,21 @@
 		mb[nm].nv = p - upvbuf;
 		nm++;
 	}
+	touch(f->dent, &mb[nm++]);
+	assert(nm <= nelem(mb));
 	upsert(f->mnt, mb, nm);
 
-	de = getdent(f->qpath, &d);
-	clunkdent(f->dent);
+	de = getdent(f->mnt, f->qpath, &d);
+	clunkdent(f->mnt, f->dent);
 	f->mode = mode2bits(m->mode);
 	f->pqpath = f->qpath;
 	f->qpath = d.qid.path;
 	f->dent = de;
+	f->duid = duid;
+	f->dgid = dgid;
+	f->dmode = dmode;
 	if(m->mode & ORCLOSE)
-		f->rclose = 1;
+		f->rclose = emalloc(sizeof(Amsg), 1);
 
 	r.type = Rcreate;
 	r.qid = d.qid;
@@ -1686,7 +1779,7 @@
 	r.iounit = f->iounit;
 	respond(m, &r);
 Out:	poperror();
-Err:	unlock(f);
+Err:	wunlock(f);
 	putfid(f);
 	return;
 }
@@ -1718,7 +1811,8 @@
 {
 	char *e, buf[Kvmax];
 	Fcall r;
-	Msg mb[2];
+	int nm;
+	Msg mb[3];
 	Tree *t;
 	Kvp kv;
 	Fid *f;
@@ -1728,17 +1822,18 @@
 		return;
 	}
 	t = f->mnt->root;
-	clunkfid(m->conn, f, nil);
-
+	nm = 0;
+	wlock(f);
+	clunkfid(m->conn, f, ao);
 	truncwait(f->dent, id);
 	wlock(f->dent);
-	*ao = nil;
 	if(waserror()){
 		rerror(m, errmsg());
-		free(*ao);
+		freeamsg(*ao);
 		*ao = nil;
 		goto Err;
 	}
+tracex("removef", Zb, f->qpath, -1);
 	if(f->dent->gone)
 		error(Ephase);
 	/*
@@ -1756,21 +1851,26 @@
 		error(e);
 	if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
 		error(Eperm);
-	mb[0].op = Odelete;
-	mb[0].k = f->dent->k;
-	mb[0].nk = f->dent->nk;
-	mb[0].nv = 0;
 
+	freeamsg(*ao);
+	*ao = nil;
+
+	mb[nm].op = Odelete;
+	mb[nm].k = f->dent->k;
+	mb[nm].nk = f->dent->nk;
+	mb[nm].v = "\0";
+	mb[nm].nv = 1;
+	nm++;
 	if(f->dent->qid.type & QTDIR){
 		packsuper(buf, sizeof(buf), f->qpath);
-		mb[1].op = Oclobber;
-		mb[1].k = buf;
-		mb[1].nk = Upksz;
-		mb[1].nv = 0;
-		upsert(f->mnt, mb, 2);
+		mb[nm].op = Oclobber;
+		mb[nm].k = buf;
+		mb[nm].nk = Upksz;
+		mb[nm].nv = 0;
+		nm++;
 	}else{
 		*ao = emalloc(sizeof(Amsg), 1);
-		aincl(&f->mnt->ref, 1);
+		ainc(&f->mnt->ref);
 		(*ao)->op = AOclear;
 		(*ao)->mnt = f->mnt;
 		(*ao)->qpath = f->qpath;
@@ -1777,8 +1877,10 @@
 		(*ao)->off = 0;
 		(*ao)->end = f->dent->length;
 		(*ao)->dent = nil;
-		upsert(f->mnt, mb, 1);
 	}
+	touch(f->dir, &mb[nm++]);
+	assert(nm <= nelem(mb));
+	upsert(f->mnt, mb, nm);
 	f->dent->gone = 1;
 	r.type = Rremove;
 	respond(m, &r);
@@ -1785,6 +1887,7 @@
 	poperror();
 Err:
 	wunlock(f->dent);
+	wunlock(f);
 	putfid(f);
 	return;
 }
@@ -1843,9 +1946,9 @@
 	r.qid = d.qid;
 	r.iounit = f->iounit;
 
-	lock(f);
+	wlock(f);
 	if(f->mode != -1){
-		unlock(f);
+		wunlock(f);
 		error(Einuse);
 	}
 	if((m->mode & OTRUNC) && !(f->dent->mode & DMAPPEND)){
@@ -1853,7 +1956,7 @@
 
 		if(waserror()){
 			wunlock(f->dent);
-			free(*ao);
+			freeamsg(*ao);
 			*ao = nil;
 			nexterror();
 		}
@@ -1861,8 +1964,8 @@
 		qlock(&f->dent->trunclk);
 		f->dent->trunc = 1;
 		qunlock(&f->dent->trunclk);
-		aincl(&f->dent->ref, 1);
-		aincl(&f->mnt->ref, 1);
+		ainc(&f->dent->ref);
+		ainc(&f->mnt->ref);
 		(*ao)->op = AOclear;
 		(*ao)->mnt = f->mnt;
 		(*ao)->qpath = f->qpath;
@@ -1890,8 +1993,8 @@
 	}
 	f->mode = mode2bits(m->mode);
 	if(m->mode & ORCLOSE)
-		f->rclose = 1;
-	unlock(f);
+		f->rclose = emalloc(sizeof(Amsg), 1);
+	wunlock(f);
 	poperror();
 	respond(m, &r);
 	putfid(f);
@@ -1905,6 +2008,12 @@
 	Scan *s;
 	Xdir d;
 
+	/* mutates scan */
+	wlock(f);
+	if(waserror()){
+		wunlock(f);
+		nexterror();
+	}
 	s = f->scan;
 	if(s != nil && s->offset != 0 && s->offset != m->offset)
 		error(Edscan);
@@ -1912,12 +2021,10 @@
 		s = emalloc(sizeof(Scan), 1);
 		pfx[0] = Klabel;
 		btnewscan(s, pfx, 1);
-		lock(f);
 		if(f->scan != nil){
 			free(f->scan);
 		}
 		f->scan = s;
-		unlock(f);
 	}
 	if(s->donescan){
 		r->count = 0;
@@ -1925,7 +2032,7 @@
 	}
 	p = r->data;
 	n = m->count;
-	d = f->dent->Xdir;
+	filldumpdir(&d);
 	if(s->overflow){
 		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
 		d.name[s->kv.nk-1] = 0;
@@ -1953,6 +2060,8 @@
 		n -= ns;
 	}
 	btexit(s);
+	poperror();
+	wunlock(f);
 	r->count = p - r->data;
 	return;
 }
@@ -1965,6 +2074,12 @@
 	Tree *t;
 	Scan *s;
 
+	/* mutates scan */
+	wlock(f);
+	if(waserror()){
+		wunlock(f);
+		nexterror();
+	}
 	s = f->scan;
 	t = agetp(&f->mnt->root);
 	if(s != nil && s->offset != 0 && s->offset != m->offset)
@@ -1973,22 +2088,21 @@
 		s = emalloc(sizeof(Scan), 1);
 		packdkey(pfx, sizeof(pfx), f->qpath, nil);
 		btnewscan(s, pfx, sizeof(pfx));
-		lock(f);
 		if(f->scan != nil)
 			free(f->scan);
 		f->scan = s;
-		unlock(f);
 	}
 	if(s->donescan){
 		r->count = 0;
-		return;
+		goto Out;
 	}
 	p = r->data;
 	n = m->count;
 	if(s->overflow){
+		/* someone picked an iounit too small for a dir */
 		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
 			r->count = 0;
-			return;
+			error(Ebotch);
 		}
 		s->overflow = 0;
 		p += ns;
@@ -2005,8 +2119,11 @@
 		p += ns;
 		n -= ns;
 	}
-	btexit(s);
 	r->count = p - r->data;
+	btexit(s);
+Out:
+	poperror();
+	wunlock(f);
 }
 
 static void
@@ -2017,10 +2134,12 @@
 	Dent *e;
 	Tree *t;
 
+	rlock(f);
 	e = f->dent;
 	rlock(e);
 	if(m->offset > e->length){
 		runlock(e);
+		runlock(f);
 		return;
 	}
 	p = r->data;
@@ -2039,6 +2158,7 @@
 		c -= n;
 	}
 	runlock(e);
+	runlock(f);
 }
 
 static void
@@ -2092,19 +2212,18 @@
 		rerror(m, Enofid);
 		return;
 	}
-	if(!(f->mode & DMWRITE)){
-		rerror(m, Einuse);
-		putfid(f);
-		return;
-	}
+	wlock(f);
 	truncwait(f->dent, id);
 	wlock(f->dent);
 	if(waserror()){
 		rerror(m, errmsg());
 		wunlock(f->dent);
+		wunlock(f);
 		putfid(f);
 		return;
 	}
+	if(!(f->mode & DMWRITE))
+		error(Einuse);
 	if(f->dent->gone)
 		error(Ephase);
 	if(f->dent->qid.type & QTAUTH){
@@ -2119,7 +2238,8 @@
 	if(f->dent->mode & DMAPPEND)
 		o = f->dent->length;
 	t = agetp(&f->mnt->root);
-	for(i = 0; i < nelem(kv)-1 && c != 0; i++){
+	for(i = 0; c != 0; i++){
+		assert(i < nelem(kv));
 		assert(i == 0 || o%Blksz == 0);
 		kv[i].op = Oinsert;
 		kv[i].k = kbuf[i];
@@ -2129,7 +2249,7 @@
 		if(waserror()){
 			if(!fs->rdonly)
 				for(j = 0; j < i; j++)
-					freeblk(t, nil, bp[j]);
+					freebp(t, bp[j]);
 			nexterror();
 		}
 		n = writeb(f, &kv[i], &bp[i], p, o, c, f->dent->length);
@@ -2149,7 +2269,7 @@
 		sbuf[0] |= Owsize;
 		PACK64(p, o);
 		p += 8;
-		f->dent->length = m->offset+m->count;
+		f->dent->length = o;
 	}
 	sbuf[0] |= Owmtime;
 	f->dent->mtime = nsec();
@@ -2169,6 +2289,7 @@
 	poperror();
  	respond(m, &r);
 	wunlock(f->dent);
+	wunlock(f);
 	putfid(f);	
 }
 
@@ -2182,23 +2303,79 @@
 }
 
 Conn *
-newconn(int rfd, int wfd)
+newconn(int rfd, int wfd, int cfd)
 {
 	Conn *c;
 
 	if((c = mallocz(sizeof(*c), 1)) == nil)
 		return nil;
+	/* record the descriptors; putconn() only closes cfd when it is >= 0 */
 	c->rfd = rfd;
 	c->wfd = wfd;
+	c->cfd = cfd;
+
 	c->iounit = Max9p;
-	c->next = fs->conns;
+	/* start with one reference; putconn() tears the Conn down when the count reaches 0 */
+	c->ref = 1;
+	/* push onto the head of fs->conns, reading and writing the list only under connlk */
 	lock(&fs->connlk);
+	c->next = fs->conns;
 	fs->conns = c;
 	unlock(&fs->connlk);
+
 	return c;
 }
 
 void
+putconn(Conn *c)
+{
+	Conn **pp;
+	Amsg *a;
+	Fid *f;
+	int i;
+	/* drop one reference; only the call for which adec() yields 0 tears c down */
+	if(adec(&c->ref) != 0)
+		return;
+	/* unlink c from the fs->conns list while holding connlk */
+	lock(&fs->connlk);
+	for(pp = &fs->conns; *pp != nil; pp = &((*pp)->next)){
+		if(*pp == c){
+			*pp = c->next;
+			break;
+		}
+	}
+	unlock(&fs->connlk);
+	/* close the descriptors: wfd only if distinct from rfd, cfd only if set (>= 0) */
+	close(c->rfd);
+	if(c->rfd != c->wfd)
+		close(c->wfd);
+	if(c->cfd >= 0)
+		close(c->cfd);
+	/* clunk whatever fids remain; NOTE(review): presumably clunkfid unlinks f from fidtab, else the inner loop cannot end */
+	for(i = 0; i < Nfidtab; i++){
+		for(;;){
+			lock(&c->fidtablk[i]);
+			f = c->fidtab[i];
+			if(f == nil){
+				unlock(&c->fidtablk[i]);
+				break;
+			}
+			ainc(&f->ref);
+			unlock(&c->fidtablk[i]);
+			/* the ref taken above is ours; clunk under the fid's write lock, then drop it */
+			wlock(f);
+			clunkfid(c, f, &a);
+			wunlock(f);
+			putfid(f);
+			/* a non-nil Amsg handed back through a is forwarded to the admin channel */
+			if(a != nil)
+				chsend(fs->admchan, a);
+		}
+	}
+	free(c);
+}
+
+void
 runfs(int, void *pc)
 {
 	char err[128];
@@ -2210,20 +2387,20 @@
 	u32int h;
 
 	c = pc;
-	while(1){
+	while(!c->hangup){
 		if(readmsg(c, &m) < 0){
 			fshangup(c, "read message: %r");
-			return;
+			break;
 		}
 		if(m == nil)
 			break;
 		if(convM2S(m->buf, m->sz, m) == 0){
 			fshangup(c, "invalid message: %r");
-			return;
+			break;
 		}
 		if(m->type != Tversion && !c->versioned){
 			fshangup(c, "version required");
-			return;
+			break;
 		}
 		dprint("← %F\n", &m->Fcall);
 
@@ -2276,6 +2453,7 @@
 		if(a != nil)
 			chsend(fs->admchan, a);
 	}
+	putconn(c);
 }
 
 void
@@ -2298,8 +2476,11 @@
 					rerror(m, Enofid);
 					continue;
 				}
-				clunkfid(m->conn, f, nil);
+				wlock(f);
+				clunkfid(m->conn, f, &a);
+				wunlock(f);
 				putfid(f);
+				freeamsg(a);
 			}
 			rerror(m, Erdonly);
 			continue;
@@ -2318,8 +2499,8 @@
 		}
 		assert(estacksz() == 0);
 		epochend(id);
-		epochclean();
 		qunlock(&fs->mutlk);
+		epochclean();
 
 		if(a != nil)
 			chsend(fs->admchan, a);
@@ -2361,12 +2542,12 @@
 			bp = unpackbp(kv.v, kv.nv);
 			freetree(bp, pred);
 			qlock(&fs->mutlk);
-			epochclean();
 			qunlock(&fs->mutlk);
+			epochclean();
 		}
 	}
 	if(rb.gen > pred)
-		freeblk(nil, nil, rb);
+		freebp(nil, rb);
 	dropblk(b);
 }
 
@@ -2395,10 +2576,10 @@
 			break;
 		bp = unpackbp(s.kv.v, s.kv.nv);
 		if(bp.gen > t->pred)
-			freeblk(nil, nil, bp);
+			freebp(nil, bp);
 		qlock(&fs->mutlk);
-		epochclean();
 		qunlock(&fs->mutlk);
+		epochclean();
 	}
 	btexit(&s);
 	freetree(t->bp, t->pred);
@@ -2408,28 +2589,24 @@
 runsweep(int id, void*)
 {
 	char buf[Kvmax];
+	Msg mb[Kvmax/Offksz];
 	Bptr bp, nb, *oldhd;
+	int i, nm;
 	vlong off;
 	Tree *t;
 	Arena *a;
 	Amsg *am;
 	Blk *b;
-	Msg m, mb[2];
-	int i, nm;
 
 	if((oldhd = calloc(fs->narena, sizeof(Bptr))) == nil)
 		sysfatal("malloc log heads");
 	while(1){
 		am = chrecv(fs->admchan);
-		if(agetl(&fs->rdonly)){
-			fprint(2, "spurious adm message\n");
-			break;
-		}
 		switch(am->op){
 		case AOsync:
 			tracem("syncreq");
 			if(!fs->snap.dirty && !am->halt)
-				continue;
+				goto Next;
 			if(agetl(&fs->rdonly))
 				goto Justhalt;
 			if(waserror()){
@@ -2440,31 +2617,31 @@
 
 			if(am->halt)
 				ainc(&fs->rdonly);
-			qlock(&fs->mutlk);
 			for(i = 0; i < fs->narena; i++){
 				a = &fs->arenas[i];
+				oldhd[i].addr = -1;
+				oldhd[i].hash = -1;
+				oldhd[i].gen = -1;
 				qlock(a);
-				if(a->nlog < a->reserve/(10*Blksz)){
-					oldhd[i].addr = -1;
-					oldhd[i].hash = -1;
-					oldhd[i].gen = -1;
-					qunlock(a);
-					continue;
-				}
-				if(waserror()){
-					qunlock(&fs->mutlk);
-					qunlock(a);
-					nexterror();
-				}
-				oldhd[i] = a->loghd;
-				epochstart(id);
-				compresslog(a);
+				/*
+				 * arbitrary heuristic -- try compressing
+				 * when the log doubles in size.
+				 */
+//				if(a->nlog >= 2*a->lastlogsz){
+//					oldhd[i] = a->loghd;
+//					epochstart(id);
+//					if(waserror()){
+//						epochend(id);
+//						qunlock(a);
+//						nexterror();
+//					}
+//					compresslog(a);
+//					epochend(id);
+//					poperror();
+//				}
 				qunlock(a);
-				epochend(id);
 				epochclean();
-				poperror();
 			}
-			qunlock(&fs->mutlk);
 			sync();
 
 			for(i = 0; i < fs->narena; i++){
@@ -2473,11 +2650,11 @@
 					epochstart(id);
 					b = getblk(bp, 0);
 					nb = b->logp;
-					freeblk(nil, b, b->bp);
+					freeblk(nil, b);
 					dropblk(b);
 					epochend(id);
-					epochclean();
 					qunlock(&fs->mutlk);
+					epochclean();
 				}
 			}
 
@@ -2494,8 +2671,8 @@
 		case AOsnap:
 			tracem("snapreq");
 			if(agetl(&fs->rdonly)){
-				fprint(2, "read only fs");
-				continue;
+				fprint(2, "snap on read only fs\n");	/* newline-terminated, like the "taking snap" diagnostic below */
+				goto Next;
 			}
 			if(waserror()){
 				fprint(2, "taking snap: %s\n", errmsg());
@@ -2525,6 +2702,10 @@
 			break;
 
 		case AOrclose:
+			if(agetl(&fs->rdonly)){
+				fprint(2, "rclose on read only fs\n");	/* newline keeps the next stderr message on its own line */
+				goto Next;
+			}
 			nm = 0;
 			mb[nm].op = Odelete;
 			mb[nm].k = am->dent->k;
@@ -2531,6 +2712,7 @@
 			mb[nm].nk = am->dent->nk;
 			mb[nm].nv = 0;
 			nm++;
+tracex("rclose", Zb, am->qpath, -1);
 			if(am->dent->qid.type & QTDIR){
 				packsuper(buf, sizeof(buf), am->qpath);
 				mb[nm].op = Oclobber;
@@ -2539,9 +2721,15 @@
 				mb[nm].nv = 0;
 				nm++;
 			}
+			qlock(&fs->mutlk);
 			upsert(am->mnt, mb, nm);
+			qunlock(&fs->mutlk);
 			/* fallthrough */
 		case AOclear:
+			if(agetl(&fs->rdonly)){
+				fprint(2, "clear on read only fs\n");	/* newline-terminated, like the "clear file" diagnostic below */
+				goto Next;
+			}
 			tracem("bgclear");
 			if(waserror()){
 				fprint(2, "clear file %llx: %s\n", am->qpath, errmsg());
@@ -2551,39 +2739,43 @@
 			if(am->dent != nil)
 				qlock(&am->dent->trunclk);
 			fs->snap.dirty = 1;
+			nm = 0;
 			for(off = am->off; off < am->end; off += Blksz){
-				qlock(&fs->mutlk);
-				if(waserror()){
+tracex("clearb", Zb, am->qpath, off);
+				mb[nm].op = Oclearb;
+				mb[nm].k = buf + Offksz * nm;
+				mb[nm].nk = Offksz;
+				mb[nm].k[0] = Kdat;
+				PACK64(mb[nm].k+1, am->qpath);
+				PACK64(mb[nm].k+9, off);
+				mb[nm].v = nil;
+				mb[nm].nv = 0;
+				if(++nm >= nelem(mb) || off + Blksz >= am->end){
+					qlock(&fs->mutlk);
+					if(waserror()){
+						qunlock(&fs->mutlk);
+						nexterror();
+					}
+					epochstart(id);
+					upsert(am->mnt, mb, nm);
+					epochend(id);
 					qunlock(&fs->mutlk);
-					nexterror();
+					epochclean();
+					poperror();
+					nm = 0;
 				}
-				epochstart(id);
-				m.k = buf;
-				m.nk = sizeof(buf);
-				m.op = Oclearb;
-				m.k[0] = Kdat;
-				PACK64(m.k+1, am->qpath);
-				PACK64(m.k+9, off);
-				m.v = nil;
-				m.nv = 0;
-				upsert(am->mnt, &m, 1);
-				epochend(id);
-				epochclean();
-				qunlock(&fs->mutlk);
-				poperror();
 			}
 			if(am->dent != nil){
 				am->dent->trunc = 0;
 				rwakeup(&am->dent->truncrz);
 				qunlock(&am->dent->trunclk);
-				clunkdent(am->dent);
 			}
-			clunkmount(am->mnt);
 			poperror();
 			break;
 		}
+Next:
 		assert(estacksz() == 0);
-		free(am);
+		freeamsg(am);
 	}
 }
 
@@ -2631,6 +2823,7 @@
 		a->fd = -1;
 		chsend(fs->admchan, a);
 
+if(0){
 		tmnow(&now, nil);
 		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
 			if(!(mnt->flag & Ltsnap))
@@ -2638,7 +2831,7 @@
 			if(now.yday != then.yday){
 				snprint(buf, sizeof(buf),
 					"%s@day.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
-				snapmsg("main", buf, Lauto);
+				snapmsg(mnt->name, buf, Lauto);
 			}
 			if(now.hour != then.hour){
 				if(mnt->hourly[h][0] != 0)
@@ -2645,7 +2838,7 @@
 					snapmsg(mnt->hourly[h], nil, 0);
 				snprint(mnt->hourly[h], sizeof(mnt->hourly[h]),
 					"%s@hour.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
-				snapmsg("main", mnt->hourly[h], Lauto);
+				snapmsg(mnt->name, mnt->hourly[h], Lauto);
 			}
 			if(now.min != then.min){
 				if(mnt->minutely[m][0] != 0)
@@ -2652,7 +2845,7 @@
 					snapmsg(mnt->minutely[m], nil, 0);
 				snprint(mnt->minutely[m], sizeof(mnt->minutely[m]),
 					"%s@minute.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
-				snapmsg("main", mnt->minutely[m], Lauto);
+				snapmsg(mnt->name, mnt->minutely[m], Lauto);
 			}
 		}
 		if(now.hour != then.hour)
@@ -2659,6 +2852,7 @@
 			h = (h+1)%24;
 		if(now.min != then.min)
 			m = (m+1)%60;
+}
 		then = now;
 		poperror();
 	}
--- a/load.c
+++ b/load.c
@@ -54,6 +54,12 @@
 	unpackarena(a, b->data, Arenasz);
 	if((a->free = avlcreate(rangecmp)) == nil)
 		error(Enomem);
+	a->logbuf[0] = cachepluck();
+	a->logbuf[1] = cachepluck();
+	a->logbuf[0]->bp = (Bptr){-1, -1, -1};
+	a->logbuf[1]->bp = (Bptr){-1, -1, -1};
+	setflag(a->logbuf[0], Bstatic, 0);
+	setflag(a->logbuf[1], Bstatic, 0);
 	a->h0 = h0;
 	a->h1 = h1;
 	a->used = a->size;
@@ -117,10 +123,6 @@
 	}
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
-		a->logbuf[0] = cachepluck();
-		a->logbuf[1] = cachepluck();
-		a->logbuf[0]->bp = (Bptr){-1, -1, -1};
-		a->logbuf[1]->bp = (Bptr){-1, -1, -1};
 		loadlog(a, a->loghd);
 	}
 
--- a/main.c
+++ b/main.c
@@ -21,7 +21,7 @@
 int	checkonly;
 char	*reamuser;
 char	*dev;
-vlong	tracesz		= 16*MiB;
+vlong	tracesz		= 1024*MiB;
 vlong	cachesz 	= 512*MiB;
 char	*srvname 	= "gefs";
 int	noneid		= 0;
@@ -28,6 +28,7 @@
 int	nogroupid	= 9999;
 int	admid		= -1;
 Blk	*blkbuf;
+Bfree	*bfbuf;
 Errctx	**errctx;
 
 void
@@ -46,23 +47,6 @@
 	t->v1 = v1;
 }
 
-static void
-nokill(void)
-{
-	char buf[128];
-	int fd;
-
-	snprint(buf, sizeof(buf), "/proc/%d/ctl", getpid());
-	if((fd = open(buf, OWRITE)) == -1){
-		fprint(2, "nokill: open %s: %r", buf);
-		return;
-	}
-	if(fprint(fd, "noswap\n") == -1){
-		fprint(2, "nokill: write %s: %r", buf);
-		return;
-	}
-}
-
 static uvlong
 memsize(void)
 {
@@ -118,7 +102,7 @@
 {
 	va_list ap;
 
-	aincl(&fs->rdonly, 1);
+	ainc(&fs->rdonly);
 	va_start(ap, fmt);
 	errorv(fmt, ap, 1);
 }
@@ -156,6 +140,7 @@
 static void
 initfs(vlong cachesz)
 {
+	Bfree *f, *g;
 	Blk *b;
 
 	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
@@ -167,6 +152,7 @@
 	}
 	fs->lrurz.l = &fs->lrulk;
 	fs->syncrz.l = &fs->synclk;
+	fs->bfreerz.l = &fs->bfreelk;
 	fs->noauth = noauth;
 	fs->cmax = cachesz/Blksz;
 	if(fs->cmax > (1<<30))
@@ -181,12 +167,23 @@
 	if((fs->dlcache = mallocz(fs->dlcmax*sizeof(Dlist*), 1)) == nil)
 		sysfatal("malloc: %r");
 
+	bfbuf = sbrk(fs->cmax * sizeof(Bfree));
+	if(bfbuf == (void*)-1)
+		sysfatal("sbrk: %r");
+
+	g = nil;
+	for(f = bfbuf; f != bfbuf+fs->cmax; f++){
+		f->bp = Zb;
+		f->next = g;
+		g = f;
+	}
+	fs->bfree = g;
+
 	blkbuf = sbrk(fs->cmax * sizeof(Blk));
 	if(blkbuf == (void*)-1)
 		sysfatal("sbrk: %r");
 	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
-		b->bp.addr = -1;
-		b->bp.hash = -1;
+		b->bp = Zb;
 		b->magic = Magic;
 		lrutop(b);
 	}
@@ -202,7 +199,6 @@
 	if (pid < 0)
 		sysfatal("can't fork: %r");
 	if (pid == 0) {
-		nokill();
 		id = aincl(&fs->nworker, 1);
 		if((*errctx = mallocz(sizeof(Errctx), 1)) == nil)
 			sysfatal("malloc: %r");
@@ -243,21 +239,22 @@
 		sysfatal("announce %s: %r", ann);
 	while(1){
 		if((lctl = listen(adir, ldir)) < 0){
-			fprint(2, "listen %s: %r", adir);
+			fprint(2, "listen %s: %r\n", adir);
 			break;
 		}
 		fd = accept(lctl, ldir);
-		close(lctl);
 		if(fd < 0){
-			fprint(2, "accept %s: %r", ldir);
+			fprint(2, "accept %s: %r\n", ldir);
+			close(lctl);
 			continue;
 		}
-		if(!(c = newconn(fd, fd))){
+		c = newconn(fd, fd, lctl);
+		if(c == nil){
+			fprint(2, "newconn: %r\n");
+			close(lctl);
 			close(fd);
-			fprint(2, "%r");
 			continue;
 		}
-
 		launch(runfs, c, "netio");
 	}
 	close(actl);
@@ -393,11 +390,14 @@
 	}
 
 	rfork(RFNOTEG);
-	nokill();
 	loadfs(dev);
 	fs->wrchan = mkchan(32);
 	fs->admchan = mkchan(32);
-	fs->nsyncers = nproc/2;
+	/*
+	 * for spinning disks, parallel sync tanks performance
+	 * for ssds, it doesn't help much.
+	 */
+	fs->nsyncers = 1;
 	fs->nreaders = nproc/2;
 	if(fs->nsyncers > fs->narena)
 		fs->nsyncers = fs->narena;
@@ -422,12 +422,12 @@
 	for(i = 0; i < nann; i++)
 		launch(runannounce, ann[i], "announce");
 	if(srvfd != -1){
-		if((c = newconn(srvfd, srvfd)) == nil)
+		if((c = newconn(srvfd, srvfd, -1)) == nil)
 			sysfatal("%r");
 		launch(runfs, c, "srvio");
 	}
 	if(stdio){
-		if((c = newconn(0, 1)) == nil)
+		if((c = newconn(0, 1, -1)) == nil)
 			sysfatal("%r");
 		launch(runfs, c, "stdio");
 	}
--- a/pack.c
+++ b/pack.c
@@ -402,6 +402,7 @@
 
 	assert(sz >= Arenasz);
 	e = p + Arenasz;
+tracex("loghd", a->loghd, a - fs->arenas, -1);
 	PACK64(p, a->loghd.addr);	p += 8;	/* freelist addr */
 	PACK64(p, a->loghd.hash);	p += 8;	/* freelist hash */
 	PACK64(p, a->size);		p += 8;	/* arena size */
--- a/ream.c
+++ b/ream.c
@@ -54,7 +54,7 @@
 	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
 	setval(r, &kv);
 
-	p = packsuper(kbuf, sizeof(kbuf), 0);
+	p = packsuper(kbuf, sizeof(kbuf), Qadmroot);
 	kv.k = kbuf;
 	kv.nk = p - kbuf;
 	p = packdkey(vbuf, sizeof(vbuf), -1, "");
@@ -75,7 +75,7 @@
 	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
 	setval(r, &kv);
 
-	p = packsuper(kbuf, sizeof(kbuf), 0);
+	p = packsuper(kbuf, sizeof(kbuf), Qmainroot);
 	kv.k = kbuf;
 	kv.nk = p - kbuf;
 	p = packdkey(vbuf, sizeof(vbuf), -1, "");
@@ -168,6 +168,7 @@
 	char *p;
 
 	b = cachepluck();
+
 	addr = hdaddr+2*Blksz;	/* leave room for arena hdr */
 
 	a->loghd.addr = -1;
@@ -180,7 +181,7 @@
 	b->logsz = 0;
 	b->logp = (Bptr){-1, -1, -1};
 	b->data = b->buf + Loghdsz;
-	setflag(b, Bdirty);
+	setflag(b, Bdirty, 0);
 
 	p = b->buf + Loghdsz;
 	b->logp = (Bptr){-1, -1, -1};
@@ -206,21 +207,18 @@
 	h0->type = Tarena;
 	h0->bp.addr = hdaddr;
 	h0->data = h0->buf+2;
+	packarena(h0->data, Arenasz, a);
 	finalize(h0);
+	syncblk(h0);
+	a->h0 = h0;
 
 	memset(h1->buf, 0, sizeof(h1->buf));
 	h1->type = Tarena;
 	h1->bp.addr = hdaddr+Blksz;
 	h1->data = h1->buf+2;
-	finalize(h1);
-
-	packarena(h0->data, Arenasz, a);
 	packarena(h1->data, Arenasz, a);
-	finalize(h0);
 	finalize(h1);
-	syncblk(h0);
 	syncblk(h1);
-	a->h0 = h0;
 	a->h1 = h1;
 }
 
@@ -286,7 +284,7 @@
 		loadlog(a, a->loghd);
 	}
 
-	if((mb = newblk(mnt->root, Tleaf, 0)) == nil)
+	if((mb = newblk(mnt->root, Tleaf)) == nil)
 		sysfatal("ream: allocate root: %r");
 	holdblk(mb);
 	initroot(mb);
@@ -296,9 +294,9 @@
 	mnt->root->ht = 1;
 	mnt->root->bp = mb->bp;
 
-	if((ab = newblk(adm->root, Tleaf, 0)) == nil)
+	if((ab = newblk(adm->root, Tleaf)) == nil)
 		sysfatal("ream: allocate root: %r");
-	if((ub = newblk(adm->root, Tdat, 0)) == nil)
+	if((ub = newdblk(adm->root, 0, 1)) == nil)
 		sysfatal("ream: allocate root: %r");
 	holdblk(ab);
 	holdblk(ub);
@@ -322,7 +320,7 @@
 	 * a single snap block that the tree will insert
 	 * into, and take a snapshot as the initial state.
 	 */
-	if((tb = newblk(mnt->root, Tleaf, 0)) == nil)
+	if((tb = newblk(mnt->root, Tleaf)) == nil)
 		sysfatal("ream: allocate snaps: %r");
 	holdblk(tb);
 	initsnap(tb, mb, ab);
--- a/snap.c
+++ b/snap.c
@@ -189,10 +189,11 @@
 		bp = b->logp;
 		qe.op = Qfree;
 		qe.bp = b->bp;
-		qe.b = b;
+		qe.b = nil;
 		a = getarena(qe.bp.addr);
 		qput(a->sync, qe);
 		traceb("dlfreeb", qe.bp);
+		dropblk(b);
 	}
 }
 
@@ -377,12 +378,12 @@
 
 	i = 0;
 	n = nil;
-	if(waserror()){
-		free(n);
-		nexterror();
-	}
 	if(flg & Lmut){
 		n = emalloc(sizeof(Tree), 1);
+		if(waserror()){
+			free(n);
+			nexterror();
+		}
 		n->memref = 1;
 		n->dirty = 0;
 		n->nlbl = 1;
@@ -405,6 +406,7 @@
 		m[i].op = Oinsert;
 		tree2kv(n, &m[i], buf[i], sizeof(buf[i]));
 		i++;
+		poperror();
 	}else{
 		t->nlbl++;
 		m[i].op = Orelink;
@@ -418,7 +420,6 @@
 		i++;
 	}
 	btupsert(&fs->snap, m, i);
-	poperror();
 	free(n);
 }
 
@@ -542,14 +543,9 @@
 void
 closesnap(Tree *t)
 {
-	Bfree *f;
-
 	if(t == nil || adec(&t->memref) != 0)
 		return;
-	f = malloc(sizeof(Bfree));
-	f->op = DFtree;
-	f->t = t;
-	limbo(f);
+	limbo(DFtree, t);	/* memref dropped to 0: pass t to limbo() tagged DFtree; no Bfree is malloc'd here any more */
 }
 
 void
@@ -585,10 +581,14 @@
 	 * are the responsibility of the other chain; in this chain, we
 	 * leak it and let the last reference in the other chain clean up
 	 */
-	if(t == &fs->snap)
+	if(t == &fs->snap){
+traceb("killsnap", bp);
 		dl = &fs->snapdl;
-	else if(bp.gen > t->base)
+}
+	else if(bp.gen > t->base){
+traceb("killdl", bp);
 		dl = getdl(t->memgen, bp.gen);
+}
 	else
 		return;
 	if(waserror()){
@@ -596,7 +596,7 @@
 		nexterror();
 	}
 	if(dl->ins == nil || Logspc - dl->ins->logsz < Logslop){
-		b = newblk(&fs->snap, Tdlist, 0);
+		b = newblk(&fs->snap, Tdlist);
 		if(dl->ins != nil){
 			enqueue(dl->ins);
 			dropblk(dl->ins);
@@ -610,7 +610,7 @@
 	}
 	p = dl->ins->data + dl->ins->logsz;
 	dl->ins->logsz += 8;
-	setflag(dl->ins, Bdirty);
+	setflag(dl->ins, Bdirty, 0);
 	PACK64(p, bp.addr);
 	poperror();
 	putdl(dl);
--- a/test/freplay.c
+++ b/test/freplay.c
@@ -171,8 +171,9 @@
 		sysfatal("open %s: %r", argv[0]);
 	if((d = dirfstat(fd)) == nil)
 		sysfatal("failed to stat file: %r");
-	if((membuf = sbrk(d->length)) == nil)
+	if((membuf = sbrk(d->length)) == (void*)-1)
 		sysfatal("failed to allocate buffer: %r");
+	d->length -= (d->length % IOUNIT);
 	memset(membuf, 0, d->length);
 	for(off = 0; off < d->length; off += n)
 		if((n = read(fd, membuf+off, IOUNIT)) <= 0)
--- a/test/fsbench.c
+++ b/test/fsbench.c
@@ -3,13 +3,13 @@
 #include <libsec.h>
 #include <thread.h>
 
-int mainstacksize = 2*1024*1024;
+int mainstacksize = 64*1024*1024;
 typedef struct Bench Bench;
 enum {
 	KiB	= 1024ULL,
 	MiB	= 1024ULL*KiB,
 	GiB	= 1024ULL*MiB,
-	Bufsz	= IOUNIT,
+	Bufsz	= 128*IOUNIT,
 };
 
 enum {
--- a/test/mkfile
+++ b/test/mkfile
@@ -3,8 +3,9 @@
 TESTS=\
 	basic\
 	build\
+	files\
 
-all:V: 6.freplay 6.fsbench
+all:V: 6.freplay 6.fsbench 6.files
 
 test:VQ:
 	@{cd .. && mk 6.out}
--- a/test/run.rc
+++ b/test/run.rc
@@ -1,6 +1,6 @@
 #!/bin/rc
 
-rfork ne
+rfork e
 
 dev=$testdev
 if(~ $#testdev 0)
@@ -26,11 +26,11 @@
 }
 
 fn ge_ream {
-	$O.out -m 512 -r $user -f $1
+	gefs -m 512 -r $user -f $1
 }
 
 fn ge_start {
-	$O.out -m 512 -A -f $1 -n gefs.test
+	gefs -m 512 -A -f $1 -n gefs.test
 	while(! test -e /srv/gefs.test)
 		sleep 0.1
 	mount -c /srv/gefs.test /n/gefs
@@ -37,7 +37,7 @@
 }
 
 fn ge_kill {
-	kill $O.out | rc
+	kill gefs | rc
 	while(test -e /srv/gefs.test)
 		sleep 0.1
 }
@@ -48,7 +48,7 @@
 	ge_ream $dev
 	log preparing replay...
 	rm -f replay.log
-	test/6.freplay -l replay.log $dev
+	$O.freplay -l replay.log $dev
 	ge_start /mnt/replay/data
 	$*
 	echo save trace /tmp/trace >> /srv/gefs.test.cmd
@@ -63,9 +63,9 @@
 
 	# check blockwise consistency
 	log starting replay...
-	test/6.freplay -c 1 -r replay.log $dev
+	$O.freplay -c 1 -r replay.log $dev
 	for(i in `{seq 2 $count}){
-		$O.out -c -f /mnt/replay/data >[2]/tmp/log || die 'broken'
+		gefs -c -f /mnt/replay/data >[2]/tmp/log || die 'broken'
 		log stepping $i...
 		echo step > /mnt/replay/ctl
 	}
@@ -80,12 +80,12 @@
 	log reaming...
 	ge_ream $dev
 	log preparing build-and-verify...
-	test/6.freplay -l replay.log $dev
+	$O.freplay -l replay.log $dev
 	ge_start $dev
 	$*
 	echo save trace /tmp/trace >> /srv/gefs.test.cmd
 	ge_kill
-	$O.out -c -f $dev
+	gefs -c -f $dev
 }}
 
 fn buildsys{@{
--- a/tree.c
+++ b/tree.c
@@ -29,7 +29,7 @@
 
 #define efreeblk(t, b) do { \
 	if(b != nil) \
-		freeblk(t, b, b->bp); \
+		freeblk(t, b); \
 	} while(0)
 
 static void
@@ -326,7 +326,7 @@
 	 * delete messages, so we need to check if
 	 * there's anything in it to copy up.
 	 */
-	if(pp->nl->nval > 0){
+	if(pp->nl != nil){
 		getval(pp->nl, 0, &kv);
 		if(pp->nl->nbuf > 0){
 			getmsg(pp->nl, 0, &m);
@@ -337,7 +337,7 @@
 		if(nbytes != nil)
 			*nbytes += valsz(&kv);
 	}
-	if(pp->nr != nil && pp->nr->nval > 0){
+	if(pp->nr != nil){
 		getval(pp->nr, 0, &kv);
 		if(pp->nr->nbuf > 0){
 			getmsg(pp->nr, 0, &m);
@@ -405,11 +405,12 @@
 	Tree t;
 
 	switch(m->op){
-	case Oclearb:
 	case Odelete:
-	case Oclobber:
 		assert(keycmp(kv, m) == 0);
 		return 0;
+	case Oclearb:
+	case Oclobber:
+		return 0;
 	case Oinsert:
 		cpkvp(kv, m, buf, nbuf);
 		return 1;
@@ -435,6 +436,19 @@
 	return 0;
 }
 
+static Blk*
+setb(Tree *t, Blk *b)
+{
+	if(b->nval == 0){	/* b holds no values: free it and give the caller nil */
+		freeblk(t, b);
+		return nil;
+	}else{			/* b has values: enqueue it and return it unchanged */
+		enqueue(b);
+		return b;
+	}
+}
+		
+
 static int
 pullmsg(Path *p, int i, Kvp *v, Msg *m, int *full, int spc)
 {
@@ -482,7 +496,7 @@
 	 */
 	full = 0;
 	spc = Leafspc - blkfill(b);
-	n = newblk(t, b->type, 0);
+	n = newblk(t, b->type);
 	assert(i >= 0 && j >= 0);
 	while(i < b->nval || j < up->hi){
 		if(i >= b->nval)
@@ -517,7 +531,7 @@
 			|| m.op == Oinsert
 			|| m.op == Odelete){
 				bp = unpackbp(v.v, v.nv);
-				freeblk(t, nil, bp);
+				freebp(t, bp);
 			}
 			ok = apply(&v, &m, buf, sizeof(buf));
 			goto Copyloop;
@@ -541,7 +555,7 @@
 				|| m.op == Oinsert
 				|| m.op == Odelete){
 					bp = unpackbp(v.v, v.nv);
-					freeblk(t, nil, bp);
+					freebp(t, bp);
 				}
 				p->pullsz += msgsz(&m);
 				ok = apply(&v, &m, buf, sizeof(buf));
@@ -553,7 +567,8 @@
 		}
 	}
 	p->npull = (j - up->lo);
-	p->nl = n;
+	p->op = POmod;
+	p->nl = setb(t, n);
 }
 
 /*
@@ -573,7 +588,7 @@
 	Msg m, u;
 
 	b = p->b;
-	n = newblk(t, b->type, 0);
+	n = newblk(t, b->type);
 	for(i = 0; i < b->nval; i++){
 		if(pp != nil && i == p->midx){
 			copyup(n, pp, nil);
@@ -625,7 +640,8 @@
 		j++;
 	}
 	p->npull = (j - up->lo);
-	p->nl = n;
+	p->op = POmod;
+	p->nl = setb(t, n);
 }
 
 /*
@@ -657,8 +673,8 @@
 		efreeblk(t, r);
 		nexterror();
 	}
-	l = newblk(t, b->type, 0);
-	r = newblk(t, b->type, 0);
+	l = newblk(t, b->type);
+	r = newblk(t, b->type);
 
 	d = l;
 	i = 0;
@@ -701,7 +717,7 @@
 			|| m.op == Oinsert
 			|| m.op == Odelete){
 				bp = unpackbp(v.v, v.nv);
-				freeblk(t, nil, bp);
+				freebp(t, bp);
 			}
 			ok = apply(&v, &m, buf, sizeof(buf));
 			goto Copyloop;
@@ -725,7 +741,7 @@
 				|| m.op == Oinsert
 				|| m.op == Odelete){
 					bp = unpackbp(v.v, v.nv);
-					freeblk(t, nil, bp);
+					freebp(t, bp);
 				}
 				p->pullsz += msgsz(&m);
 				ok = apply(&v, &m, buf, sizeof(buf));
@@ -738,8 +754,8 @@
 	}
 	p->npull = (j - up->lo);
 	p->op = POsplit;
-	p->nl = l;
-	p->nr = r;
+	p->nl = setb(t, l);
+	p->nr = setb(t, r);
 	poperror();
 }
 
@@ -770,8 +786,8 @@
 		efreeblk(t, r);
 		nexterror();
 	}
-	l = newblk(t, b->type, 0);
-	r = newblk(t, b->type, 0);
+	l = newblk(t, b->type);
+	r = newblk(t, b->type);
 	d = l;
 	copied = 0;
 	halfsz = (2*b->nval + b->valsz)/2;
@@ -808,8 +824,8 @@
 		setmsg(d, &m);
 	}
 	p->op = POsplit;
-	p->nl = l;
-	p->nr = r;
+	p->nl = setb(t, l);
+	p->nr = setb(t, r);
 	poperror();
 }
 
@@ -820,7 +836,7 @@
 	Msg m;
 	int i;
 
-	d = newblk(t, a->type, 0);
+	d = newblk(t, a->type);
 	for(i = 0; i < a->nval; i++){
 		getval(a, i, &m);
 		setval(d, &m);
@@ -839,11 +855,9 @@
 			setmsg(d, &m);
 		}
 	}
-	enqueue(d);
 	p->midx = idx;
-	pp->nl = d;
 	pp->op = POmerge;
-	pp->nr = nil;
+	pp->nl = setb(t, d);
 }
 
 /*
@@ -904,8 +918,8 @@
 		efreeblk(t, r);
 		nexterror();
 	}
-	l = newblk(t, a->type, 0);
-	r = newblk(t, a->type, 0);
+	l = newblk(t, a->type);
+	r = newblk(t, a->type);
 	d = l;
 	cp = 0;
 	sp = -1;
@@ -950,12 +964,10 @@
 			o++;
 		}
 	}
-	enqueue(l);
-	enqueue(r);
 	p->midx = midx;
 	pp->op = POrot;
-	pp->nl = l;
-	pp->nr = r;
+	pp->nl = setb(t, l);
+	pp->nr = setb(t, r);
 	poperror();
 }
 
@@ -1054,12 +1066,9 @@
 	if(p->b->type == Tleaf){
 		if(!filledleaf(p->b, up->sz)){
 			updateleaf(t, p-1, p);
-			enqueue(p->nl);
 			rp = p;
 		}else{
 			splitleaf(t, up, p, &mid);
-			enqueue(p->nl);
-			enqueue(p->nr);
 		}
 		p->midx = -1;
 		pp = p;
@@ -1075,12 +1084,9 @@
 				goto Out;
 			}
 			updatepiv(t, up, p, pp);
-			enqueue(p->nl);
 			rp = p;
 		}else{
 			splitpiv(t, up, p, pp, &mid);
-			enqueue(p->nl);
-			enqueue(p->nr);
 		}
 		pp = p;
 		up--;
@@ -1088,7 +1094,7 @@
 	}
 	if(pp->nl != nil && pp->nr != nil){
 		rp = &path[0];
-		rp->nl = newblk(t, Tpivot, 0);
+		rp->nl = newblk(t, Tpivot);
 		rp->npull = pp->npull;
 		rp->pullsz = pp->pullsz;
 		copyup(rp->nl, pp, nil);
@@ -1105,9 +1111,9 @@
 
 	for(p = path; p != path + npath; p++){
 		if(p->b != nil)
-			freeblk(t, p->b, p->b->bp);
+			freeblk(t, p->b);
 		if(p->m != nil)
-			freeblk(t, p->b, p->m->bp);
+			freeblk(t, p->m);	/* free the block just tested; p->b was already freed above */
 		dropblk(p->b);
 		dropblk(p->nl);
 		dropblk(p->nr);
@@ -1211,7 +1217,7 @@
 	t->dirty = 1;
 	unlock(&t->lk);
 
-	freeblk(t, b, b->bp);
+	freeblk(t, b);
 	dropblk(b);
 	dropblk(r);
 }
@@ -1226,6 +1232,7 @@
 	Kvp sep;
 	Bptr bp;
 
+	assert(!canqlock(&fs->mutlk));
 	sz = 0;
 	stablesort(msg, nmsg);
 	for(i = 0; i < nmsg; i++)
@@ -1358,9 +1365,10 @@
 		j = bufsearch(p[i], k, &m, &same);
 		if(j < 0 || !same)
 			continue;
-		if(!(ok || m.op == Oinsert || m.op == Oclearb))
+		if(ok || m.op == Oinsert)
+			ok = apply(r, &m, buf, nbuf);
+		else if(m.op != Oclearb && m.op != Oclobber)
 			fatal("lookup %K << %M missing insert\n", k, &m);
-		ok = apply(r, &m, buf, nbuf);
 		for(j++; j < p[i]->nbuf; j++){
 			getmsg(p[i], j, &m);
 			if(keycmp(k, &m) != 0)