shithub: gefs

Download patch

ref: 6b02773727cc3b9c434ef1ce5bbd94dd6b706aef
parent: e0fffa8ae9627363f518cc7077e5e8e204135db5
author: Ori Bernstein <ori@eigenstate.org>
date: Fri Mar 19 12:08:17 EDT 2021

fs: allocate blocks correctly, group insertions, snapshot

--- a/blk.c
+++ b/blk.c
@@ -1,66 +1,862 @@
 #include <u.h>
 #include <libc.h>
+#include <fcall.h>
+#include <avl.h>
 
 #include "dat.h"
 #include "fns.h"
 
+static vlong	blkalloc_lk(Arena*);
+static Blk	*cacheblk(Blk*);
+static Blk	*lookupblk(vlong);
+
 Blk*
+readblk(vlong bp, int flg)
+{
+	Blk *b;
+	vlong off, rem, n;
+
+	assert(bp != -1);
+	if((b = malloc(sizeof(Blk))) == nil)
+		return nil;
+	off = bp;
+	rem = Blksz;
+	while(rem != 0){
+		n = pread(fs->fd, b->buf, rem, off);
+		if(n <= 0){
+			free(b);
+			return nil;
+		}
+		off += n;
+		rem -= n;
+	}
+	memset(&b->RWLock, 0, sizeof(RWLock));
+	b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0);
+	b->off = bp;
+	b->cnext = nil;
+	b->cprev = nil;
+	b->hnext = nil;
+	b->data = b->buf + 10;
+	switch(b->type){
+	default:
+		if(flg&GBraw)
+			break;
+		fprint(2, "invalid block @%llx\n", bp);
+		abort();
+		break;
+	case Tarena:
+	case Traw:
+	case Tsuper:
+	case Tlog:
+		break;
+	case Tpivot:
+		b->nval = GBIT16(b->buf+2);
+		b->valsz = GBIT16(b->buf+4);
+		b->nbuf = GBIT16(b->buf+6);
+		b->bufsz = GBIT16(b->buf+8);
+		break;
+	case Tleaf:
+		b->nval = GBIT16(b->buf+2);
+		b->valsz = GBIT16(b->buf+4);
+		break;
+	}
+	return b;
+}
+
+static Arena*
+pickarena(vlong hint)
+{
+	long n;
+
+	n = -1; /* shut up, ken */
+	if(hint > 0 || hint < fs->narena)
+		n = hint / fs->arenasz;
+	else if(hint == -1)
+		n = ainc(&fs->nextarena) % fs->narena;
+	else
+		abort();
+	return &fs->arenas[n];
+}
+
+static Arena*
+getarena(vlong b)
+{
+	int i;
+
+	i = b / fs->arenasz;
+	if(i < 0 || i >= fs->narena){
+		werrstr("out of range block %lld", b);
+		abort();
+//		return nil;
+	}
+	return &fs->arenas[i];
+}
+
+static int
+freerange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s, q;
+
+	assert(len % Blksz == 0);
+
+	q.off = off;
+	q.len = len;
+	r = (Arange*)avllookup(t, &q.Avl, -1);
+	if(r == nil){
+		if((r = calloc(1, sizeof(Arange))) == nil)
+			return -1;
+		r->off = off;
+		r->len = len;
+		avlinsert(t, r);
+	}else if(off+len == r->off){
+		r->len += len;
+		r->off -= len;
+	}else if(off == r->off + r->len){
+		r->len += len;
+	}else{
+		r = (Arange*)avlnext(r);
+		if(r != nil && off+len == r->off){
+			r->off -= len;
+			r->len += len;
+		} else {
+			if((r = calloc(1, sizeof(Arange))) == nil)
+				return -1;
+			r->off = off;
+			r->len = len;
+			avlinsert(t, r);
+		}
+	}
+
+	/* merge with previous */
+	s = (Arange*)avlprev(r);
+	if(s != nil && s->off + s->len == r->off){
+		s->off += r->len;
+		avldelete(t, r);
+	}
+	/* merge with next */
+	s = (Arange*)avlnext(r);
+	if(s != nil && r->off + r->len == s->off){
+		r->off += r->len;
+		avldelete(t, s);
+	}
+	return 0;
+}
+
+int
+grabrange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s, q;
+
+	assert(len % Blksz == 0);
+	q.off = off;
+	q.len = len;
+	r = (Arange*)avllookup(t, &q.Avl, -1);
+	if(r == nil || off + len > r->off + r->len){
+		fprint(2, "no blocks: %llx+%llx in:\n", off, len);
+		for(r = (Arange*)avlmin(t); r != nil; r = (Arange*)avlnext(r))
+			fprint(2, "\t%llx+%llx\n", r->off, r->len);
+		abort();
+		return -1;
+	}
+	if(off == r->off)
+		r->off += len;
+	else if(off == r->off + r->len)
+		r->len -= len;
+	else if(off + len > r->off && off > r->off + r->len){
+		if((s = malloc(sizeof(Arange))) == nil)
+			return -1;
+		s->off = off;
+		s->len = r->len - (off - r->off) - len;;
+		r->len = off - r->off;
+		avlinsert(t, s);
+	}
+	if(r->len == 0){
+		avldelete(t, r);
+		free(r);
+	}
+	return 0;
+}
+
+int
+synclog(Blk *b, vlong link, vlong sz)
+{
+
+	if(sz == -1)
+		sz = b->logsz;
+	PBIT64(b->data+8, link);
+	PBIT64(b->data+16, sz);
+	finalize(b);
+fprint(2, "synclog: @%llx\n", b->off);
+showfree("synclog");
+	return pwrite(fs->fd, b->buf, Blksz, b->off);
+}
+
+Blk*
+logappend(Arena *a, Blk *lb, vlong off, vlong len, int op, vlong graft)
+{
+	Blk *pb;
+	vlong o, n;
+	char *p;
+
+	assert(off % Blksz == 0);
+	if(lb == nil || lb->logsz+16+24 > Blkspc){
+		pb = lb;
+		if((o = blkalloc_lk(a)) == -1)
+			return nil;
+		if((lb = mallocz(sizeof(Blk), 1)) == nil)
+			return nil;
+		lb->data = lb->buf + Hdrsz;
+		lb->flag |= Bdirty;
+		lb->type = Tlog;
+		lb->off = o;
+		lb->logsz = 0;
+		if(synclog(lb, graft, 0) == -1)
+			return nil;
+
+		a->logtl = lb;
+		if(pb != nil)
+			if(synclog(pb, lb->off, -1) == -1)
+				return nil;
+	}
+	n = 8;
+	p = lb->data + 24 + lb->logsz;
+	if(len == Blksz && op == LgAlloc)
+		op = LgAlloc1;
+	off |= op;
+	PBIT64(p, off);
+	if(op == LgAlloc || op == LgFree){
+		PBIT64(p+8, len);
+		n += 8;
+	}
+	lb->logsz += n;
+	return lb;
+}
+
+/*
+ * Logs an allocation. Must be called
+ * with arena lock held. Duplicates some/c
+ * of the work in allocblk to prevent
+ * recursion.
+ */
+int
+logalloc(Arena *a, vlong off, int op)
+{
+	Blk *b;
+
+	if((b = logappend(a, a->logtl, off, Blksz, op, -1)) == nil)
+		return -1;
+	if(a->logtl == nil){
+		a->log = b->off;
+		a->logtl = b;
+	}
+	return 0;
+}
+
+int
+loadlog(Arena *a)
+{
+	Blk *b;
+	vlong bp, ent, off, len;
+	uvlong bh;
+	char *p, *d;
+	int op, i, n;
+
+	bp = a->log;
+	while(bp != -1){
+		dprint("log block: %llx\n", bp);
+		if((b = readblk(bp, 0)) == nil)
+			return -1;
+		p = b->data;
+		bh = GBIT64(p + 0);
+		bp = GBIT64(p + 8);
+		b->logsz = GBIT64(p + 16);
+		/* the hash covers the log and offset */
+		if(bh != siphash(p+8, Blkspc-8)){
+			werrstr("corrupt log");
+			return -1;
+		}
+		for(i = 0; i < b->logsz; i += n){
+			d = p + i + 24;
+			ent = GBIT64(d);
+			op = ent & 0xff;
+			off = ent & ~0xff;
+			switch(op){
+			case LgFlush:
+				dprint("log@%d: flush: %llx\n", i, off>>8);
+				n = 8;
+				lock(&fs->genlk);
+				fs->gen = off >> 8;
+				unlock(&fs->genlk);
+				break;
+			case LgAlloc1:
+				n = 8;
+				dprint("log@%d alloc1: %llx\n", i, off);
+				if(grabrange(a->free, off & ~0xff, Blksz) == -1)
+					return -1;
+				break;
+			case LgAlloc:
+				n = 16;
+				len = GBIT64(d+8);
+				dprint("log@%d alloc: %llx+%llx\n", i, off, len);
+				if(grabrange(a->free, off & ~0xff, len) == -1)
+					return -1;
+				break;
+			case LgFree:
+				n = 16;
+				len = GBIT64(d+8);
+				dprint("log@%d free: %llx+%llx\n", i, off, len);
+				if(freerange(a->free, off & ~0xff, len) == -1)
+					return -1;
+				break;
+			case LgRef1:
+			case LgUnref1:
+				n = 8;
+				fprint(2, "unimplemented ref op at log@%d: log op %d\n", i, op);
+				break;
+			default:
+				n = 0;
+				dprint("log@%d: log op %d\n", i, op);
+				abort();
+				break;
+			}
+		}
+	}
+	return 0;
+}
+
+int
+compresslog(Arena *a)
+{
+	Arange *r;
+	vlong *log, *p, bp, hd, hh, graft;
+	int i, n, sz;
+	Blk *pb, *ab, *b;
+
+showfree("precompress");
+fprint(2, "compress start\n");
+	/*
+	 * Sync the current log to disk. While
+	 * compressing the log, nothing else is
+	 * using this arena, so any allocs come
+	 * from the log compression.
+	 *
+	 * A bit of copy paste from newblk,
+	 * because otherwise we have bootstrap
+	 * issues.
+	 *
+	 * Set up the head of the new log as
+	 * an empty block.
+	 */
+	if((bp = blkalloc_lk(a)) == -1)
+		return -1;
+	if((b = mallocz(sizeof(Blk), 1)) == nil)
+		return -1;
+	b->type = Tlog;
+	b->flag = Bdirty;
+	b->off = bp;
+	b->ref = 1;
+	b->data = b->buf + Hdrsz;
+checkfs();
+	if(synclog(b, -1, 0) == -1)
+		return -1;
+checkfs();
+
+	/*
+	 * When reaming or loading, we may not
+	 * have set up the log.
+	 */
+checkfs();
+	if(a->logtl != nil)
+		synclog(a->logtl, b->off, -1);
+checkfs();
+	graft = b->off;
+	a->logtl = b;
+
+	/*
+	 * Prepare what we're writing back.
+	 * Arenas must be sized so that we can
+	 * keep the merged log in memory for
+	 * a rewrite.
+	 */
+	n = 0;
+	sz = 512;
+checkfs();
+	if((log = malloc(2*sz*sizeof(vlong))) == nil)
+		return -1;
+checkfs();
+	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+		if(n == sz){
+			sz *= 2;
+			if((p = realloc(log, 2*sz*sizeof(vlong))) == nil){
+				free(log);
+				return -1;
+			}
+			log = p;
+		}
+		log[2*n+0] = r->off;
+		log[2*n+1] = r->len;
+		n++;
+	}
+	if((b = newblk(Tlog)) == nil){
+		free(log);
+		return -1;
+	}
+checkfs();
+	pb = b;
+	hd = b->off;
+	hh = -1;
+	PBIT64(b->data +  8, graft);	/* link */
+	PBIT64(b->data + 16, 0ULL);	/* count */
+checkfs();
+
+	for(i = 0; i < n; i++){
+		if((b = logappend(a, b, log[2*i], log[2*i+1], LgFree, graft)) == nil)
+			return -1;
+checkfs();
+		if(b != pb){
+checkfs();
+			synclog(pb, b->off, -1);
+checkfs();
+			if(pb->off == hd)
+				hh = blkhash(pb);
+			b = pb;
+		}
+	}
+	free(log);
+checkfs();
+	if(synclog(b, graft, -1) == -1)
+		return -1;
+checkfs();
+	if(pb->off == hd)
+		hh = blkhash(pb);
+checkfs();
+
+	a->log = hd;
+	a->logh = hh;
+	ab = a->b;
+	PBIT64(ab->data + 0, hd);
+	PBIT64(ab->data + 8, hh);
+checkfs();
+	pwrite(fs->fd, ab->buf, Blksz, ab->off);
+checkfs();
+fprint(2, "compress done\n");
+	return 0;
+}
+/*
+ * Allocate from an arena, with lock
+ * held. May be called recursively, to
+ * alloc space for the alloc log.
+ */
+static vlong
+blkalloc_lk(Arena *a)
+{
+	Avltree *t;
+	Arange *r;
+	vlong b;
+
+	t = a->free;
+	r = (Arange*)t->root;
+	if(r == nil){
+		unlock(a);
+		return -1;
+	}
+
+	/*
+	 * A bit of sleight of hand here:
+	 * while we're changing the sorting
+	 * key, but we know it won't change
+	 * the sort order because the tree
+	 * covers disjoint ranges
+	 */
+	b = r->off;
+	r->len -= Blksz;
+	r->off += Blksz;
+	if(r->len == 0){
+		avldelete(t, r);
+		free(r);
+	}
+	return b;
+}
+
+int
+blkrelease(vlong b)
+{
+	Arena *a;
+	int r;
+
+	r = -1;
+	a = getarena(b);
+	lock(a);
+	if(freerange(a->free, b, Blksz) == -1)
+		goto out;
+	if(logalloc(a, b, LgFree) == -1)
+		goto out;
+	r = 0;
+out:
+	unlock(a);
+	return r;
+}
+
+vlong
+blkalloc(vlong hint)
+{
+	Arena *a;
+	vlong b;
+	int tries;
+
+	tries = 0;
+again:
+	a = pickarena(hint);
+	if(a == nil || tries == fs->narena){
+		werrstr("no empty arenas");
+		return -1;
+	}
+	lock(a);
+	/*
+	 * TODO: there's an extreme edge case
+	 * here.
+	 *
+	 * If the file system has room to alloc
+	 * a data block but no log block, then
+	 * we end up with it in a stuck state.
+	 * The fix is to reserve alloc blocks,
+	 * so that we're guaranteed to be able
+	 * to log an alloc if the disk is working
+	 * correctly.
+	 */
+	tries++;
+	if((b = blkalloc_lk(a)) == -1){
+		unlock(a);
+		goto again;
+	}
+	if(logalloc(a, b, LgAlloc) == -1){
+		unlock(a);
+		return -1;
+	}
+	unlock(a);
+	return b;
+}
+
+Blk*
 newblk(int t)
 {
 	Blk *b;
+	vlong bp;
 
-	b = emalloc(sizeof(Blk));
-	b->flag |= Bdirty;
+	if((bp = blkalloc(-1)) == -1)
+		return nil;
+	if((b = mallocz(sizeof(Blk), 1)) == nil)
+		return nil;
 	b->type = t;
-	return b;	
+	b->flag = Bdirty;
+	b->off = bp;
+	b->ref = 1;
+	b->data = b->buf + Hdrsz;
+	return cacheblk(b);
 }
 
-Blk*
-getblk(uvlong bp, uvlong bh)
+static Blk*
+lookupblk(vlong off)
 {
+	Bucket *bkt;
+	u32int h;
 	Blk *b;
 
-	b = (Blk*)bp;
-	if(blkhash(b) != bh){
-		werrstr("corrupt block %llx\n", bp);
-		fprint(2, "corrupt block %llx\n", bp);
-//		abort();
+	h = ihash(off);
+
+	bkt = &fs->cache[h % fs->cmax];
+	lock(bkt);
+	for(b = bkt->b; b != nil; b = b->hnext)
+		if(b->off == off)
+			break;
+	if(b != nil)
+		pinblk(b);
+	unlock(bkt);
+	return b;
+}
+
+static Blk*
+cacheblk(Blk *b)
+{
+	Bucket *bkt;
+	Blk *e, *c;
+	u32int h;
+
+	/* FIXME: better hash. */
+	assert(b->off != 0);
+	h = ihash(b->off);
+//	dprint("cache %lld (h=%xm, bkt=%d) => %p\n", b->off, h%fs->cmax, h, b);
+	ainc(&b->ref);
+	bkt = &fs->cache[h % fs->cmax];
+	lock(bkt);
+	for(e = bkt->b; e != nil; e = e->hnext){
+		if(b == e)
+			goto found;
+		assert(b->off != e->off);
 	}
+	bkt->b = b;
+found:
+	ainc(&b->ref);
+	unlock(bkt);
+
+	lock(&fs->lrulk);
+	if(b == fs->chead)
+		goto Cached;
+	if(b == fs->ctail)
+		fs->ctail = b->cprev;
+
+	if(b->cnext != nil)
+		b->cnext->cprev = b->cprev;
+	if(b->cprev != nil)
+		b->cprev->cnext = b->cnext;
+	if(fs->ctail == nil)
+		fs->ctail = b;
+	if(fs->chead != nil)
+		fs->chead->cprev = b;
+	if(fs->ctail == nil)
+		fs->ctail = b;
+	b->cnext = fs->chead;
+	b->cprev = nil;
+	fs->chead = b;
+	if((b->flag&Bcache) == 0){
+		b->flag |= Bcache;
+		fs->ccount++;
+		ainc(&b->ref);
+	}
+	c=0;
+	USED(c);
+/*
+	for(c = fs->ctail; c != nil && fs->ccount >= fs->cmax; c = fs->ctail){
+		fs->ctail = c->cprev;
+		fs->ccount--; 
+		putblk(c);
+	}
+*/
+Cached:
+	unlock(&fs->lrulk);
 	return b;
 }
 
+static void
+cachedel(Blk *del)
+{
+	Bucket *bkt;
+	Blk *b, **p;
+	u32int h;
+
+	/* FIXME: better hash. */
+	h = ihash(del->off);
+
+	bkt = &fs->cache[h % fs->cmax];
+	lock(bkt);
+	p = &bkt->b;
+	for(b = bkt->b; b != nil; b = b->hnext){
+		if(b == del){
+			*p = del->hnext;
+			break;
+		}
+		p = &b->hnext;
+	}
+	unlock(bkt);
+
+	lock(&fs->lrulk);
+	if(b->cnext != nil)
+		b->cnext->cprev = b->cprev;
+	if(b->cprev != nil)
+		b->cprev->cnext = b->cnext;
+	if(fs->ctail == b)
+		fs->ctail = b->cprev;
+	if(fs->chead == nil)
+		fs->chead = b;
+	unlock(&fs->lrulk);
+}
+
 void
-freeblk(Blk *b)
+enqueue(Blk *b)
 {
-	b->refs--;
-	if(b->refs == 0)
-		free(b);
+	if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1){
+		ainc(&fs->broken);
+		fprint(2, "write: %r");
+		return;
+	}
+	wlock(b);
+	b->flag &= ~(Bqueued|Bdirty|Bfinal);
+	wunlock(b);
+
 }
 
-uvlong
-blkhash(Blk *b)
+void
+fillsuper(Blk *b)
 {
-	return siphash(b->data, Blksz);
+	char *p;
+
+	assert(b->type == Tsuper);
+	p = b->data;
+	memcpy(p +  0, "gefs0001", 8);
+	PBIT32(p +  8, 0); /* dirty */
+	PBIT32(p + 12, Blksz);
+	PBIT32(p + 16, Bufspc);
+	PBIT32(p + 20, Hdrsz);
+	PBIT32(p + 24, fs->height);
+	PBIT64(p + 32, fs->rootb);
+	PBIT64(p + 40, fs->rooth);
+	PBIT32(p + 48, fs->narena);
+	PBIT64(p + 56, fs->arenasz);
+	PBIT64(p + 64, fs->gen);
+	PBIT64(p + 72, fs->nextqid);
 }
 
+void
+finalize(Blk *b)
+{
+	vlong h;
+
+//	assert((b->flag & Bfinal) == 0);
+	b->flag |= Bfinal;
+	PBIT16(b->buf, b->type);
+	switch(b->type){
+	default:
+	case Tnone:
+		abort();
+		break;
+	case Tpivot:
+		PBIT16(b->buf+2, b->nval);
+		PBIT16(b->buf+4, b->valsz);
+		PBIT16(b->buf+6, b->nbuf);
+		PBIT16(b->buf+8, b->bufsz);
+		break;
+	case Tleaf:
+		PBIT16(b->buf+2, b->nval);
+		PBIT16(b->buf+4, b->valsz);
+		break;
+	case Tlog:
+		h = siphash(b->data + 8, Blkspc-8);
+		PBIT64(b->data, h);
+	case Tsuper:
+	case Tarena:
+	case Traw:
+		break;
+	}
+}
+
+Blk*
+getblk(vlong bp, uvlong bh)
+{
+	Blk *b;
+
+	if((b = lookupblk(bp)) == nil){
+		if((b = readblk(bp, 0)) == nil)
+			return nil;
+		if(siphash(b->buf, Blksz) != bh){
+			werrstr("corrupt block %llx", bp);
+			return nil;
+		}
+	}
+	return cacheblk(b);
+}
+
+Blk*
+dupblk(vlong bp, uvlong bh)
+{
+	USED(bp, bh);
+	return nil;
+}
+
+Blk*
+pinblk(Blk *b)
+{
+	ainc(&b->ref);
+	return b;
+}
+
+int
+refblk(Blk *b)
+{
+	Arena *a;
+	int r;
+
+	a = getarena(b->off);
+	lock(a);
+	r = logalloc(a, b->off, LgRef1);
+	unlock(a);
+	return r;
+}
+
+int
+unrefblk(Blk *b)
+{
+	Arena *a;
+	int r;
+
+	a = getarena(b->off);
+	lock(a);
+	r = logalloc(a, b->off, LgUnref1);
+	unlock(a);
+	return r;
+}
+
 ushort
 blkfill(Blk *b)
 {
 	switch(b->type){
-	case Pivot:
-		return 2*b->nmsg + b->bufsz +  2*b->nent + b->valsz;
-	case Leaf:
-		return 2*b->nent + b->valsz;
+	case Tpivot:
+		return 2*b->nbuf + b->bufsz +  2*b->nval + b->valsz;
+	case Tleaf:
+		return 2*b->nval + b->valsz;
 	default:
+		fprint(2, "invalid block @%lld\n", b->off);
 		abort();
-		return 0; // shut up kencc
 	}
+	return 0; // shut up kencc
 }
 
-int
+void
 putblk(Blk *b)
 {
-	USED(b);
-	/* TODO: unref. */
-	return 0;
+	if(b == nil)
+		return;
+	if((b->flag & (Bdirty|Bqueued)) == Bdirty)
+		enqueue(b);
+	if(adec(&b->ref) == 0){
+		cachedel(b);
+		free(b);
+	}
+}
+
+void
+freeblk(Blk *b)
+{
+	Arena *a;
+
+	assert(b->ref == 1 && b->flag & (Bdirty|Bqueued) == Bdirty);
+	a = getarena(b->off);
+	lock(a);
+	/*
+	 * TODO: what to do if we fail to log a free here??
+	 * This is already an error path!
+	 */
+	logalloc(a, b->off, LgRef1);
+	unlock(a);
+	free(b);
+}
+
+int
+sync(void)
+{
+	int i, r;
+	Blk *b;
+
+	dprint("syncing\n");
+	r = 0;
+	for(i = 0; i < fs->narena; i++)
+		if(synclog(fs->arenas[i].logtl, -1, -1) == -1)
+			r = -1;
+	for(b = fs->chead; b != nil; b = b->cnext){
+//		dprint("sync %p\n", b);
+		if(!(b->flag & Bdirty))
+			continue;
+		if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1)
+			r = -1;
+	}
+	return r;
 }
--- a/check.c
+++ b/check.c
@@ -1,14 +1,15 @@
 #include <u.h>
 #include <libc.h>
-#include <bio.h>
+#include <fcall.h>
+#include <avl.h>
+
 #include "dat.h"
 #include "fns.h"
 
 char	spc[128];
-int	debug;
 
 static int
-invalidblk(Blk *b, int h, Kvp *lo, Kvp *hi)
+badblk(Blk *b, int h, Kvp *lo, Kvp *hi)
 {
 	Kvp x, y;
 	int i, r;
@@ -16,17 +17,17 @@
 	int fail;
 
 	fail = 0;
-	if(b->type == Leaf){
-		if(h != fs->height){
+	if(b->type == Tleaf){
+		if(h != 0){
 			fprint(2, "unbalanced leaf\n");
 			fail++;
 		}
-		if(h != 1 && b->nent < 2){
+		if(h != 0 && b->nval < 2){
 			fprint(2, "underfilled leaf\n");
 			fail++;
 		}
 	}
-	if(b->type == Pivot && b->nent < 2){
+	if(b->type == Tpivot && b->nval < 2){
 		fprint(2, "underfilled  pivot\n");
 		fail++;
 	}
@@ -33,19 +34,23 @@
 	getval(b, 0, &x);
 	if(lo && keycmp(lo, &x) != 0)
 		fprint(2, "out of range keys %P != %P\n", lo, &x);
-	for(i = 1; i < b->nent; i++){
+	for(i = 1; i < b->nval; i++){
 		getval(b, i, &y);
 		if(hi && keycmp(&y, hi) >= 0){
 			fprint(2, "out of range keys %P >= %P\n", &y, hi);
 			fail++;
 		}
-		if(b->type == Pivot){
-			c = getblk(x.bp, x.bh);
+		if(b->type == Tpivot){
+			if((c = getblk(x.bp, x.bh)) == nil){
+				fprint(2, "corrupt block: %r\n");
+				fail++;
+				continue;
+			}
 			if(blkfill(c) != x.fill){
-				fprint(2, "mismatched block fill");
+				fprint(2, "mismatched block fill\n");
 				fail++;
 			}
-			if(invalidblk(c, h + 1, &x, &y))
+			if(badblk(c, h - 1, &x, &y))
 				fail++;
 		}
 		r = keycmp(&x, &y);
@@ -65,17 +70,25 @@
 	}
 	return fail;
 }
-	
 
 /* TODO: this will grow into fsck. */
 int
 checkfs(void)
 {
-	return invalidblk(fs->root, 1, nil, nil) == 0;
+	int ok, height;
+	Blk *b;
+
+	ok = 1;
+	if((b = getroot(&height)) != nil){
+		if(badblk(b, height-1, nil, 0))
+			ok = 0;
+		putblk(b);
+	}
+	return ok;
 }
 
-static void
-rshowblk(Blk *b, int indent)
+void
+rshowblk(int fd, Blk *b, int indent, int recurse)
 {
 	Blk *c;
 	int i;
@@ -85,22 +98,24 @@
 	if(indent > sizeof(spc)/4)
 		indent = sizeof(spc)/4;
 	if(b == nil){
-		print("NIL\n");
+		fprint(fd, "NIL\n");
 		return;
 	}
-	if(b->type == Pivot){
-		for(i = 0; i < b->nmsg; i++){
+	if(b->type == Tpivot){
+		for(i = 0; i < b->nbuf; i++){
 			getmsg(b, i, &m);
-			print("%.*s|%M\n", 4*indent, spc, &m);
+			fprint(fd, "%.*s|%M\n", 4*indent, spc, &m);
 		}
 	}
-	for(i = 0; i < b->nent; i++){
+	for(i = 0; i < b->nval; i++){
 		getval(b, i, &kv);
-		print("%.*s|%P\n", 4*indent, spc, &kv);
-		if(b->type == Pivot){
+		fprint(fd, "%.*s|%P\n", 4*indent, spc, &kv);
+		if(b->type == Tpivot){
 			if((c = getblk(kv.bp, kv.bh)) == nil)
-				sysfatal("falied load: %r");
-			rshowblk(c, indent + 1);
+				sysfatal("failed load: %r");
+			if(recurse)
+				rshowblk(fd, c, indent + 1, 1);
+			putblk(c);
 		}
 	}
 }
@@ -117,22 +132,29 @@
 }
 
 void
-showblk(Blk *b, char *m)
+showblk(Blk *b, char *m, int recurse)
 {
-	if(m != nil)
-		print("=== %s\n", m);
-	rshowblk(b, 0);
+	fprint(2, "=== %s\n", m);
+	rshowblk(2, b, 0, recurse);
 }
 
 void
+fshowfs(int fd, char *m)
+{
+	int h;
+
+	fprint(fd, "=== %s\n", m);
+	fprint(fd, "fs->height: %d\n", fs->height);
+	rshowblk(fd, getroot(&h), 0, 1);
+}
+
+void
 showfs(char *m)
 {
-	if(m != nil)
-		print("=== %s\n", m);
-	rshowblk(fs->root, 0);
+	if(debug)
+		fshowfs(2, m);
 }
 
-
 void
 showpath(Path *p, int np)
 {
@@ -142,5 +164,21 @@
 		print("==> b=%p, n=%p, idx=%d\n", p[i].b, p[i].n, p[i].idx);
 		print("\tclear=(%d, %d):%d\n", p[i].lo, p[i].hi, p[i].sz);
 		print("\tl=%p, r=%p, n=%p, split=%d\n", p[i].l, p[i].r, p[i].n, p[i].split);
+	}
+}
+
+void
+showfree(char *m)
+{
+	Arange *r;
+	int i;
+
+	if(!debug)
+		return;
+	print("=== %s\n", m);
+	for(i = 0; i < fs->narena; i++){
+		print("allocs[%d]:\n", i);
+		for(r = (Arange*)avlmin(fs->arenas[i].free); r != nil; r = (Arange*)avlnext(r))
+			print("\t%llx+%llx\n", r->off, r->len);
 	}
 }
--- a/dat.h
+++ b/dat.h
@@ -1,35 +1,80 @@
 typedef struct Blk	Blk;
 typedef struct Gefs	Gefs;
+typedef struct Fmsg	Fmsg;
+typedef struct Fid	Fid;
 typedef struct Msg	Msg;
 typedef struct Key	Key;
 typedef struct Val	Val;
 typedef struct Kvp	Kvp;
+typedef struct Bptr	Bptr;
 typedef struct Path	Path;
+typedef struct Scan	Scan;
+typedef struct Dent	Dent;
+typedef struct Scanp	Scanp;
+typedef struct Arena	Arena;
+typedef struct Arange	Arange;
+typedef struct Bucket	Bucket;
+typedef struct Chan	Chan;
 
 enum {
-	// buffer for the whole block
-	Blksz	= 256,
-	// will store what type of block
-	Hdrsz	= 16,
+	KiB	= 1024ULL,
+	MiB	= 1024ULL*KiB,
+	GiB	= 1024ULL*MiB,
+	TiB	= 1024ULL*GiB,
+
+	Lgblk	= 9,
+	Blksz	= (1ULL<<Lgblk),
+
+	Nrefbuf	= 1024,			/* number of ref incs before syncing */
+	Nfidtab	= 1024,			/* number of fit hash entries */
+	Ndtab	= 1024,			/* number of dir tab entries */
+	Max9p	= 16*KiB,		/* biggest message size we're willing to negotiate */
+	Nsec	= 1000*1000*1000,	/* nanoseconds to the second */
+	Maxname	= 256,			/* maximum size of a name element */
+	Maxent	= 9+Maxname+1,		/* maximum size of ent key, with terminator */
+
+	/*
+	 * Kpmax must be no more than 1/4 of pivspc, or
+	 * there is no way to get a valid split of a
+	 * maximally filled tree.
+	 */
+	Keymax	= 32,			/* key data limit */
+	Inlmax	= 128,			/* inline data limit */
+	Ptrsz	= 18,			/* off, hash, fill */
+	Kvmax	= Keymax + Inlmax,	/* Key and value */
+	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
+
+	Hdrsz	= 10,
 	Blkspc	= Blksz - Hdrsz,
-	// space for message buffer
 	Bufspc  = Blkspc / 2,
-	// space for pivot keys and offsets
 	Pivspc	= Blkspc - Bufspc,
 	Leafspc = Blkspc,
-	Keymax	= 16,
-	Inlmax	= 64,
-	Ptrsz	= 18,
-	Kvmax	= Keymax + Inlmax,	/* Key and value */
-	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
 	Msgmax  = 1 + (Kvmax > Kpmax ? Kvmax : Kpmax)
 };
 
 enum {
-	Bdirty = 1 << 0,
+	/*
+	 * dent: pqid[8] qid[8] -- a directory entry key.
+	 * ptr:  off[8] hash[8] -- a key for an Dir block.
+	 * dir:  fixed statbuf header, user ids
+	 */
+	Kref,	/* off[8] => ptr[16]:		pointer to refcount page */
+	Kdat,	/* qid[8] off[8] => ptr[16]:	pointer to data page */
+	Kent,	/* pqid[8] name[n] => dir[n]:	serialized Dir */
+	Ksnap,	/* name[n] => dent[16] ptr[16]:	snapshot root */
+	Ksuper,	/* qid[8] => pqid[8]:		parent dir */
 };
 
+enum {
+	Bdirty	= 1 << 0,
+	Bqueued	= 1 << 1,
+	Bfinal	= 1 << 2,
+	Bcache	= 1 << 3,
+};
+
 #define Efs	"i will not buy this fs, it is scratched"
+#define Efid	"bad fid"
+#define Edscan	"invalid dir scan offset"
 #define Eexist	"does not exist"
 #define Ebotch	"protocol botch"
 #define Emode	"unknown mode"
@@ -37,20 +82,93 @@
 #define Eauth	"authentication failed"
 #define Elength	"name too long"
 #define Eperm	"permission denied"
+#define Einuse	"resource in use"
+#define Ebadf	"invalid file"
+#define Emem	"out of memory"
+#define Ename	"invalid file name"
+#define Enomem	"out of memory"
 
 /*
- * The type of block. Pivot nodes are internal to the tree,
- * while leaves inhabit the edges. Pivots are split in half,
- * containing a buffer for the data, and keys to direct the
- * searches. Their buffers contain messages en-route to the
- * leaves.
+ * All metadata blocks share a common header:
+ * 
+ *	type[2]
  *
- * Leaves contain the key, and some chunk of data as the
- * value.
+ * The None type is reserved for file data blocks
+ * and refcount blocks.
+ *
+ * The superblock has this layout:
+ *	version[8]	always "gefs0001"
+ *	flags[4]	status flags:
+ *				dirty=1<<0,
+ *	blksz[4]	block size in bytes
+ *	bufsz[4]	portion of leaf nodes
+ *			allocated to buffers,
+ *			in bytes
+ *	hdrsz[4]	size of tree node header,
+ *			in bytes.
+ *	height[4]	tree height of root node
+ *	rootb[8]	address of root in last
+ *			snapshot.
+ *	rooth[8]	hash of root node
+ *	narena[4]	number of arenas in tree
+ *	arenasz[8]	maximum size of arenas;
+ *			they may be smaller.
+ *	gen[8]		The flush generation
+ *
+ * The arena zone blocks have this layout, and
+ * are overwritten in place:
+ *
+ *	log[8]		The head of the alloc log
+ *	logh[8]		The hash of the alloc log
+ *
+ * The log blocks have this layout, and are one of
+ * two types of blocks that get overwritten in place:
+ *
+ *	hash[8]		The hash of the rest of the block
+ *	link[8]		The next block, or -1 if nil.
+ *	entsz[8]	number of bytes of log entries.
+ *
+ *	The remainder of the block is filled with log
+ *	entries. Each log entry has at least 8 bytes
+ *	of entry. Some are longer. The opcode is or'ed
+ *	into the low order bits of the first vlong.
+ *	These ops take the following form:
+ *
+ *	Alloc, Free:
+ *		off[8] len[8]
+ *	Alloc1, Free1:
+ *		off[8]
+ *	Ref:
+ *		off[8]
+ *	Flush:	
+ *		gen[8]
+ *
+ * Pivots have the following layout:
+ *
+ *	nval[2]
+ *	valsz[2]
+ *	nbuf[2]
+ *	bufsz[2]
+ *
+ * Leaves have the following layout:
+ *
+ *	nval[2]
+ *	valsz[2]
+ *	_pad[4]
+ *
+ * Within these nodes, pointers have the following
+ * layout:
+ *
+ *	off[8] hash[8] fill[2]
  */
 enum {
-	Pivot,
-	Leaf,
+	Tnone,
+	Traw,
+	Tsuper,
+	Tarena,
+	Tlog,
+	Tpivot,
+	Tleaf,
 };
 
 enum {
@@ -59,13 +177,53 @@
 };
 
 enum {
-	Ocreate,
+	GBraw	= 1<<0,
+	GBwrite	= 1<<1,
+};
+
+enum {
+	Oinsert,
 	Odelete,
-	Owrite,
 	Owstat,
+	/* wstat flags */
+	Owsize	= 1<<4,
+	Owname	= 1<<5,
+	Owmode	= 1<<6,
+	Owmtime	= 1<<7,
 };
 
 /*
+ * Operations for the allocation log.
+ */
+enum {
+	LgFree,		/* free a range: 16 byte */
+	LgAlloc,	/* alloc a range: 16 byte */
+	LgAlloc1,	/* alloc a block */
+	LgRef1,		/* ref a block */
+	LgUnref1,	/* free a block */
+	LgFlush,	/* flush log, bump gen */
+};
+
+struct Arange {
+	Avl;
+	vlong	off;
+	vlong	len;
+};
+
+struct Bucket {
+	Lock;
+	Blk	*b;
+};
+
+struct Fmsg {
+	Fcall;
+	int	fd;	/* the fd to repsond on */
+	QLock	*wrlk;	/* write lock on fd */
+	int	sz;	/* the size of the message buf */
+	uchar	buf[];
+};
+
+/*
  * Overall state of the file sytem.
  * Shadows the superblock contents.
  */
@@ -75,11 +233,59 @@
 	int	pivsz;
 	int	hdrsz;
 
+	QLock	snaplk;
+	Blk*	super;
+
+	Chan	*wrchan;
+	Chan	*rdchan;
+
+	int	fd;
+	long	broken;
+
+	Lock	rootlk;
+	/* protected by rootlk */
+	vlong	rootb;
+	vlong	rooth;
 	int	height;
-	Blk	*root;
-	
+
+	Lock	genlk;
+	vlong	gen;
+	Lock	qidlk;
+	vlong	nextqid;
+
+	Arena	*arenas;
+	int	narena;
+	long	nextarena;
+	vlong	arenasz;
+
+	Lock	fidtablk;
+	Fid	*fidtab[Nfidtab];
+	Lock	dtablk;
+	Dent	*dtab[Ndtab];
+
+	Lock	lrulk;
+	/* protected by lrulk */
+	Bucket	*cache;
+	Blk	*chead;
+	Blk	*ctail;
+	int	ccount;
+	int	cmax;
 };
 
+struct Arena {
+	Lock;
+	Avltree *free;
+	Avltree *partial;
+
+	vlong	log;	/* log head */
+	vlong	logh;	/* log head hash */
+	Blk	*logtl;	/* tail block open for writing */
+	Blk	*b;	/* arena block */
+
+	Blk	**q;	/* write queue */
+	vlong	nq;
+};
+
 struct Key{
 	char	*k;
 	int	nk;
@@ -112,6 +318,39 @@
 	Kvp;
 };
 
+struct Dent {
+	RWLock;
+	Key;
+	Dent	*next;
+	long	ref;
+
+	Qid	qid;
+	vlong	length;
+	vlong	rootb;
+	char	buf[Maxent];
+};
+
+struct Fid {
+	Lock;
+	Fid	*next;
+	/*
+	 * if opened with OEXEC, we want to use a snapshot,
+	 * instead of the most recent root, to prevent
+	 * paging in the wrong executable.
+	 */
+	vlong	rootb;
+	vlong	rooth;
+	int	height;
+
+	u32int	fid;
+	vlong	qpath;
+	int	mode;
+	int	iounit;
+
+	Scan	*scan;	/* in progres scan */
+	Dent	*dent;	/* (pqid, name) ref, modified on rename */
+};
+
 struct Path {
 	/* Flowing down for flush */
 	Blk	*b;	/* insertion */
@@ -130,21 +369,65 @@
 	 * and midx points at the left of
 	 * the two nodes.
 	 */
-	Blk	*ml;	/* merged/rotated left */
-	Blk	*mr;	/* merged/rotated right */
+	Blk	*nl;	/* merged/rotated left */
+	Blk	*nr;	/* merged/rotated right */
 	int	midx;	/* merged index */
 	char	split;	/* did we split? */
 	char	merge;	/* did we merge? */
 };
 
+struct Scanp {
+	int	bi;
+	int	vi;
+	Blk	*b;
+};
+
+struct Scan {
+	vlong	offset;	/* last read offset */
+	vlong	rootb;
+	vlong	rooth;
+	int	height;
+
+	int	done;
+	Kvp	kv;
+	Key	pfx;
+	char	kvbuf[Kvmax];
+	char	pfxbuf[Keymax];
+	Scanp	*path;
+};
+
 struct Blk {
-	char	type;
-	char	flag;
-	short	nent;
-	short	valsz;
-	short   nmsg;
-	short   bufsz;
+	RWLock;
+
+	/* cache entry */
+	Blk	*cnext;
+	Blk	*cprev;
+	Blk	*hnext;
+
+	short	flag;
+
+	/* serialized to disk in header */
+	short	type;	/* @0, for all */
+	short	nval;	/* @2, for Leaf, Pivot: data[0:2] */
+	short	valsz;	/* @4, for Leaf, Pivot: data[2:4] */
+	short   nbuf;	/* @6, for Pivot */
+	short   bufsz;	/* @8, for Pivot */
+
+	vlong	logsz;	/* for allocation log */
+	vlong	lognxt;	/* for allocation log */
+
 	vlong	off;	/* -1 for unallocated */
-	int	refs;	/* TODO: move out */
-	char	data[Blksz];
+	long	ref;	/* TODO: move out */
+	char	*data;
+	char	buf[Blksz];
+};
+
+struct Chan {
+	int	size;	/* size of queue */
+	long	count;	/* how many in queue (semaphore) */
+	long	avail;	/* how many available to send (semaphore) */
+	Lock	rl, wl;	/* circular pointers */
+	void	**rp;
+	void	**wp;
+	void*	args[];	/* list of saved pointers, [->size] */
 };
--- a/fns.h
+++ b/fns.h
@@ -3,26 +3,44 @@
 #pragma varargck type "K"	Key*
 #pragma varargck type "V"	Val*
 #pragma varargck type "B"	Blk*
+#pragma varargck type "R"	Arange*
+#pragma varargck type "X"	char*
 
 extern Gefs	*fs;
 extern int	debug;
 
-void	initfs(void);
-
 Blk*	newblk(int type);
 Blk*	shadow(Blk*, Path*, Path*);
-int	putblk(Blk*);
-Blk*	getblk(uvlong bp, uvlong bh);
-void	freeblk(Blk *b);
+Blk*	getroot(int*);
+Blk*	getblk(vlong, uvlong);
+Blk*	pinblk(Blk*);
+Blk*	readblk(vlong, int);
+void	putblk(Blk*);
+void	enqueue(Blk*);
+void	freeblk(Blk*);
 ushort	blkfill(Blk*);
 uvlong	blkhash(Blk*);
+u32int	ihash(vlong);
+void	finalize(Blk*);
+void	fillsuper(Blk*);
+int	snapshot(void);
 uvlong	siphash(void*, usize);
+void	reamfs(char*);
+int	loadarena(Arena*, vlong);
+void	loadfs(char*);
+int	sync(void);
+int	loadlog(Arena *a);
+int	synclog(Blk*, vlong, vlong);
+int	endfs(void);
+int	compresslog(Arena *a);
 
-int	fsupsert(Msg*);
-char	*fswalk1(Key*, Kvp*);
+int	btupsert(Msg*, int);
+char	*btlookup(Key*, Kvp*, Blk**);
+char	*btlookupat(Blk*, Key*, Kvp*, Blk**);
+char	*btscan(Scan*, Key*, vlong, vlong, int);
+char	*btnext(Scan*, Kvp*, int*);
+void	btdone(Scan*);
 
-void*	emalloc(usize);
-void*	erealloc(void*, usize);
 char*	estrdup(char*);
 
 int	keycmp(Key *, Key *);
@@ -32,14 +50,48 @@
 void	getmsg(Blk *, int, Msg *);
 
 void	initshow(void);
-void	showblk(Blk*, char*);
+void	showblk(Blk*, char*, int);
 void	showpath(Path*, int);
 void	showfs(char*);
+void	fshowfs(int, char*);
+void	showfree(char*);
 int	checkfs(void);
 
+#define dprint(...) \
+	do{ \
+		if(1) fprint(2, __VA_ARGS__); \
+	}while(0)
+
+char	*pack8(int*, char*, char*, uchar);
+char	*pack16(int*, char*, char*, ushort);
+char	*pack32(int*, char*, char*, uint);
+char	*pack64(int*, char*, char*, uvlong);
+char	*packstr(int*, char*, char*, char*);
+
+/* void* is a bit hacky, but we want both signed and unsigned to work */
+char	*unpack8(int*, char*, char*, void*);
+char	*unpack16(int*, char*, char*, void*);
+char	*unpack32(int*, char*, char*, void*);
+char	*unpack64(int*, char*, char*, void*);
+char	*unpackstr(int*, char*, char*, char**);
+int	dir2kv(vlong, Dir*, Kvp*, char*, int);
+int	kv2statbuf(Kvp*, char*, int);
+int	kv2dir(Kvp*, Dir*);
+int	kv2qid(Kvp*, Qid*);
+
 /* scratch */
 void	setmsg(Blk *, int, Msg *);
 void	bufinsert(Blk *, Msg *);
-void victim(Blk *b, Path *p);
-void blkinsert(Blk *b, Kvp *kv);
+void	victim(Blk *b, Path *p);
+void	blkinsert(Blk *b, Kvp *kv);
 
+Chan	*mkchan(int);
+Fmsg	*chrecv(Chan*);
+void	chsend(Chan*, Fmsg*);
+void	runfs(void*);
+void	runwrite(void*);
+void	runread(void*);
+void	runctl(void*);
+
+/* it's in libc... */
+extern int cas(long *, long, long);
--- /dev/null
+++ b/fs.c
@@ -1,0 +1,1103 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+int
+okname(char *name)
+{
+	int i;
+
+	if(name[0] == 0)
+		return -1;
+	if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+		return -1;
+	for(i = 0; i < Maxname; i++){
+		if(name[i] == 0)
+			return 0;
+		if((name[i]&0xff) < 0x20 || name[i] == '/')
+			return -1;
+	}
+	return -1;
+}
+
+static vlong
+nextqid(void)
+{
+	vlong q;
+
+	lock(&fs->qidlk);
+	q = fs->nextqid++;
+	unlock(&fs->qidlk);
+	return q;
+}
+
+static char*
+fslookup(Fid *f, Key *k, Kvp *kv, Blk **bp, int lk)
+{
+	char *e;
+	Blk *b;
+
+	if(f->rootb == -1)
+		b = getroot(nil);
+	else
+		b = getblk(f->rootb, f->rooth);
+	if(b == nil)
+		return Efs;
+	if(lk)
+		rlock(f->dent);
+	e = btlookupat(b, k, kv, bp);
+	if(lk)
+		runlock(f->dent);
+	return e;
+}
+
+static Dent*
+getdent(vlong root, vlong pqid, Dir *d)
+{
+	Dent *e;
+	char *ek, *eb;
+	u32int h;
+	int err;
+
+	h = (ihash(d->qid.path) ^ ihash(root)) % Ndtab;
+	dprint("hash: %d\n", h);
+	lock(&fs->dtablk);
+	for(e = fs->dtab[h]; e != nil; e = e->next){
+		if(e->qid.path == d->qid.path && e->rootb == root){
+			dprint("found %p\n", e);
+			ainc(&e->ref);
+			unlock(&fs->dtablk);
+			return e;
+		}
+	}
+
+	err = 0;
+	if((e = mallocz(sizeof(Dent), 1)) == nil)
+		return nil;
+	e->ref = 1;
+	e->qid = d->qid;
+	e->rootb = root;
+	e->k = e->buf;
+	e->nk = 9 + strlen(d->name) + 1;
+
+	ek = e->buf;
+	eb = ek + sizeof(e->buf);
+	ek = pack8(&err, ek, eb, Kent);
+	ek = pack64(&err, ek, eb, pqid);
+	ek = packstr(&err, ek, eb, d->name);
+	e->nk = ek - e->buf;
+	e->next = fs->dtab[h];
+	fs->dtab[h] = e;
+
+	unlock(&fs->dtablk);
+	return e;
+}
+
+static void
+clunkdent(Dent *de)
+{
+	Dent *e, **pe;
+	u32int h;
+
+	if(adec(&de->ref) == 0){
+		h = (ihash(de->qid.path) ^ ihash(de->rootb)) % Ndtab;
+dprint("freeing dent, hash=%d, p=0x%p\n", h, de);
+		lock(&fs->dtablk);
+		pe = &fs->dtab[h];
+		for(e = fs->dtab[h]; e != nil; e = e->next){
+			if(e == de){
+				*pe = e->next;
+				unlock(&fs->dtablk);
+				free(de);
+				return;
+			}
+			pe = &e->next;
+		}
+		abort();
+	}
+}
+
+void
+showfids(void)
+{
+	int i;
+	Fid *f;
+
+	if(!debug)
+		return;
+	fprint(2, "fids:---\n");
+	for(i = 0; i < Nfidtab; i++)
+		for(f = fs->fidtab[i]; f != nil; f = f->next){
+			rlock(f->dent);
+			fprint(2, "\tfid[%d]: %d [refs=%ld, k=%K]\n", i, f->fid, f->dent->ref, &f->dent->Key);
+			runlock(f->dent);
+		}		
+}
+
+Fid*
+getfid(u32int fid)
+{
+	u32int h;
+	Fid *f;
+
+	h = ihash(fid) % Nfidtab;
+	lock(&fs->fidtablk);
+	for(f = fs->fidtab[h]; f != nil; f = f->next)
+		if(f->fid == fid)
+			break;
+	unlock(&fs->fidtablk);
+	return f;
+}
+
+Fid*
+dupfid(int new, Fid *f)
+{
+	Fid *n, *o;
+	u32int h;
+
+	h = ihash(new) % Nfidtab;
+	if((n = malloc(sizeof(Fid))) == nil)
+		return nil;
+
+	*n = *f;
+	n->fid = new;
+	n->mode = -1;
+	n->next = nil;
+
+	lock(&fs->fidtablk);
+	ainc(&n->dent->ref);
+	for(o = fs->fidtab[h]; o != nil; o = o->next)
+		if(o->fid == new)
+			break;
+	if(o == nil){
+		n->next = fs->fidtab[h];
+		fs->fidtab[h] = n;
+	}
+	unlock(&fs->fidtablk);
+
+	if(o != nil){
+		werrstr("fid in use: %d == %d", o->fid, new);
+		abort();
+		free(n);
+		return nil;
+	}
+	return n;
+}
+
+void
+clunkfid(Fid *fid)
+{
+	Fid *f, **pf;
+	u32int h;
+
+	lock(&fs->fidtablk);
+	h = ihash(fid->fid) % Nfidtab;
+	pf = &fs->fidtab[h];
+	for(f = fs->fidtab[h]; f != nil; f = f->next){
+		if(f == fid){
+			*pf = f->next;
+			goto Found;
+		}
+		pf = &f->next;
+	}
+	abort();
+Found:
+	clunkdent(fid->dent);
+	unlock(&fs->fidtablk);
+	free(fid);
+}
+
+void
+fshangup(int fd, char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vfprint(2, fmt, ap);
+	va_end(ap);
+	close(fd);
+	abort();
+}
+
+Fmsg*
+readmsg(int fd, int max)
+{
+	char szbuf[4];
+	int sz;
+	Fmsg *m;
+
+	if(readn(fd, szbuf, 4) != 4)
+		return nil;
+	sz = GBIT32(szbuf);
+	if(sz > max)
+		return nil;
+	if((m = malloc(sizeof(Fmsg)+sz)) == nil)
+		return nil;
+	if(readn(fd, m->buf+4, sz-4) != sz-4){
+		werrstr("short read: %r");
+		free(m);
+		return nil;
+	}
+	m->fd = fd;
+	m->sz = sz;
+	PBIT32(m->buf, sz);
+	return m;
+}
+
+void
+respond(Fmsg *m, Fcall *r)
+{
+	uchar buf[Max9p];
+	int w, n;
+
+	r->tag = m->tag;
+	dprint("→ %F\n", r);
+	if((n = convS2M(r, buf, sizeof(buf))) == 0)
+		abort();
+	qlock(m->wrlk);
+	w = write(m->fd, buf, n);
+	qunlock(m->wrlk);
+	if(w != n)
+		fshangup(m->fd, "failed write");
+	free(m);
+}
+
+void
+rerror(Fmsg *m, char *fmt, ...)
+{
+	char buf[128];
+	va_list ap;
+	Fcall r;
+
+	va_start(ap, fmt);
+	vsnprint(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	r.type = Rerror;
+	r.ename = buf;
+	respond(m, &r);
+}
+
+Chan*
+mkchan(int size)
+{
+	Chan *c;
+
+	if((c = mallocz(sizeof(Chan) + size*sizeof(void*), 1)) == nil)
+		sysfatal("create channel");
+	c->size = size;
+	c->avail = size;
+	c->count = 0;
+	c->rp = c->args;
+	c->wp = c->args;
+	return c;
+
+}
+
+Fmsg*
+chrecv(Chan *c)
+{
+	void *a;
+	long v;
+
+	v = c->count;
+	if(v == 0 || cas(&c->count, v, v-1) == 0)
+		semacquire(&c->count, 1);
+	lock(&c->rl);
+	a = *c->rp;
+	if(++c->rp >= &c->args[c->size])
+		c->rp = c->args;
+	unlock(&c->rl);
+	semrelease(&c->avail, 1);
+	return a;
+}
+
+void
+chsend(Chan *c, Fmsg *m)
+{
+	long v;
+
+	v = c->avail;
+	if(v == 0 || cas(&c->avail, v, v-1) == 0)
+		semacquire(&c->avail, 1);
+	lock(&c->wl);
+	*c->wp = m;
+	if(++c->wp >= &c->args[c->size])
+		c->wp = c->args;
+	unlock(&c->wl);
+	semrelease(&c->count, 1);
+
+}
+
+void
+fsversion(Fmsg *m, int *msz)
+{
+	Fcall r;
+
+	memset(&r, 0, sizeof(Fcall));
+	if(strcmp(m->version, "9P2000") == 0){
+		if(m->msize < *msz)
+			*msz = m->msize;
+		r.type = Rversion;
+		r.msize = *msz;
+	}else{
+		r.type = Rversion;
+		r.version = "unknown";
+	}
+	respond(m, &r);
+}
+
+void
+fsauth(Fmsg *m)
+{
+	Fcall r;
+
+	showfids();
+	r.type = Rerror;
+	r.ename = "unimplemented auth";
+	respond(m, &r);
+}
+
+void
+fsattach(Fmsg *m, int iounit)
+{
+	char *p, *ep, buf[128];
+	int err;
+	Dent *e;
+	Fcall r;
+	Kvp kv;
+	Key dk;
+	Blk *b;
+	Fid f;
+	Dir d;
+
+	err = 0;
+	p = buf;
+	ep = buf + sizeof(buf);
+	p = pack8(&err, p, ep, Kent);
+	p = pack64(&err, p, ep, -1ULL);
+	p = packstr(&err, p, ep, "");
+	dk.k = buf;
+	dk.nk = p - buf;
+showfs("attach");
+	if(btlookup(&dk, &kv, &b) != nil){
+		rerror(m, Efs);
+		return;
+	}
+	r.type = Rattach;
+	if(kv2dir(&kv, &d) == -1){
+		rerror(m, Efs);
+		putblk(b);
+		return;
+	}
+	if((e = getdent(-1, -1, &d)) == nil){
+		rerror(m, Efs);
+		putblk(b);
+		return;
+	}
+	putblk(b);
+
+	/*
+	 * A bit of a hack; we're duping a fid
+	 * that doesn't exist, so we'd be off
+	 * by one on the refcount. Adjust to
+	 * compensate for the dup.
+	 */
+	adec(&e->ref);
+
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.qpath = d.qid.path;
+	f.mode = -1;
+	f.rooth = -1;
+	f.rootb = -1;
+	f.iounit = iounit;
+	f.dent = e;
+showfids();
+	if(dupfid(m->fid, &f) == nil){
+		rerror(m, Enomem);
+		return;
+	}
+checkfs();
+showfids();
+	r.qid = d.qid;
+	respond(m, &r);
+	return;
+}
+
+void
+fswalk(Fmsg *m)
+{
+	char *p, *e, *estr, kbuf[Maxent];
+	int i, err;
+	vlong up, prev;
+	Fid *o, *f;
+	Dent *dent;
+	Fcall r;
+	Blk *b;
+	Kvp kv;
+	Key k;
+	Dir d;
+
+	if((o = getfid(m->fid)) == nil){
+		rerror(m, Efid);
+		return;
+	}
+	if(o->mode != -1){
+		rerror(m, Einuse);
+		return;
+	}
+	err = 0;
+	estr = nil;
+	up = o->qpath;
+	prev = o->qpath;
+	r.type = Rwalk;
+	r.nwqid = 0;
+	for(i = 0; i < m->nwname; i++){
+		up = prev;
+		p = kbuf;
+		e = p + sizeof(kbuf);
+		p = pack8(&err, p, e, Kent);
+		p = pack64(&err, p, e, up);
+		p = packstr(&err, p, e, m->wname[i]);
+		if(err){
+			rerror(m, "bad walk: %r");
+			return;
+		}
+		k.k = kbuf;
+		k.nk = p - kbuf;
+//showfs("walking");
+dprint("looking up %K\n", &k);
+		if((estr = fslookup(o, &k, &kv, &b, 0)) != nil)
+			break;
+		if(kv2dir(&kv, &d) == -1){
+			rerror(m, Efs);
+			putblk(b);
+			return;
+		}
+		prev = d.qid.path;
+		putblk(b);
+		r.wqid[r.nwqid] = d.qid;
+		r.nwqid++;
+	}
+	if(i == 0 && m->nwname != 0){
+		rerror(m, estr);
+		return;
+	}
+	f = o;
+	if(m->fid != m->newfid){
+		if((f = dupfid(m->newfid, o)) == nil){
+			rerror(m, "%r");
+			return;
+		}
+	}
+	if(i > 0){
+		d.name = m->wname[i-1];
+		dent = getdent(f->rootb, up, &d);
+		if(dent == nil){
+			if(m->fid != m->newfid)
+				clunkfid(f);
+			rerror(m, Enomem);
+			return;
+		}
+		f->qpath = r.wqid[r.nwqid-1].path;
+		f->dent = dent;
+	}
+	respond(m, &r);
+}
+
+void
+fsstat(Fmsg *m)
+{
+	char buf[STATMAX];
+	Fcall r;
+	Fid *f;
+	Kvp kv;
+	Blk *b;
+	int n;
+
+	showfids();
+	if((f = getfid(m->fid)) == nil){
+		rerror(m, "no such fid");
+		return;
+	}
+	if(btlookup(f->dent, &kv, &b) != nil){
+		rerror(m, Eexist);
+		return;
+	}
+	if((n = kv2statbuf(&kv, buf, sizeof(buf))) == -1){
+		rerror(m, "stat: %r");
+		return;
+	}
+	r.type = Rstat;
+	r.stat = (uchar*)buf;
+	r.nstat = n;
+	respond(m, &r);
+	putblk(b);
+}
+
+void
+fswstat(Fmsg *m)
+{
+	USED(m);
+	rerror(m, "wstat unimplemented");
+}
+
+void
+fsclunk(Fmsg *m)
+{
+	Fcall r;
+	Fid *f;
+
+	if((f = getfid(m->fid)) == nil){
+		rerror(m, "no such fid");
+		return;
+	}
+	clunkfid(f);
+	r.type = Rclunk;
+	respond(m, &r);
+}
+
+void
+fscreate(Fmsg *m)
+{
+	char buf[Kvmax];
+	Dent *dent;
+	Fcall r;
+	Msg mb;
+	Fid *f;
+	Dir d;
+
+	if(okname(m->name) == -1){
+		rerror(m, Ename);
+		return;
+	}
+	if((f = getfid(m->fid)) == nil){
+		rerror(m, "no such fid");
+		return;
+	}
+	if(m->perm & (DMMOUNT|DMAUTH)){
+		rerror(m, "unknown permission");
+		return;
+	}
+	d.qid.type = 0;
+	if(m->perm & DMDIR)
+		d.qid.type |= QTDIR;
+	if(m->perm & DMAPPEND)
+		d.qid.type |= QTAPPEND;
+	if(m->perm & DMEXCL)
+		d.qid.type |= QTEXCL;
+	if(m->perm & DMTMP)
+		d.qid.type |= QTTMP;
+	d.qid.path = nextqid();
+	d.qid.vers = 0;
+	d.mode = m->perm;
+	d.name = m->name;
+	d.atime = nsec();
+	d.mtime = d.atime;
+	d.length = 0;
+	d.uid = "glenda";
+	d.gid = "glenda";
+	d.muid = "glenda";
+	mb.op = Oinsert;
+	if(dir2kv(f->qpath, &d, &mb, buf, sizeof(buf)) == -1){
+		rerror(m, "%r");
+		return;
+	}
+//showfs("precreate");
+	if(btupsert(&mb, 1) == -1){
+		rerror(m, "%r");
+		return;
+	}
+//showfs("postcreate");
+	dent = getdent(f->rootb, f->qpath, &d);
+	if(dent == nil){
+		if(m->fid != m->newfid)
+			clunkfid(f);
+		rerror(m, Enomem);
+		return;
+	}
+
+	lock(f);
+	if(f->mode != -1){
+		unlock(f);
+		clunkdent(dent);
+		rerror(m, Einuse);
+		return;
+	}
+	f->mode = m->mode;
+	f->qpath = d.qid.path;
+	f->dent = dent;
+	unlock(f);
+
+	r.type = Rcreate;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+	respond(m, &r);
+		
+}
+
+int
+fsaccess(Dir*, int)
+{
+	/* all is permitted */
+	return 0;
+}
+
+void
+fsopen(Fmsg *m)
+{
+	Fcall r;
+	char *e;
+	Dir d;
+	Fid *f;
+	Blk *b;
+	Kvp kv;
+
+	if((f = getfid(m->fid)) == nil){
+		rerror(m, Efid);
+		return;
+	}
+	if((e = fslookup(f, f->dent, &kv, &b, 0)) != nil){
+		rerror(m, e);
+		return;
+	}
+	if(kv2dir(&kv, &d) == -1){
+		rerror(m, Efs);
+		putblk(b);
+	}
+	if(fsaccess(&d, m->mode) == -1){
+		rerror(m, Eperm);
+		putblk(b);
+		return;
+	}
+	wlock(f->dent);
+	f->dent->length = d.length;
+	wunlock(f->dent);
+	r.type = Ropen;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+	putblk(b);
+
+	lock(f);
+	if(f->mode != -1){
+		rerror(m, Einuse);
+		unlock(f);
+		return;
+	}
+	f->mode = m->mode;
+	if((f->mode & 0x7) == OEXEC){
+		lock(&fs->rootlk);
+		f->rootb = fs->rootb;
+		f->rooth = fs->rooth;
+		f->height = fs->height;
+//		refblk(fs->rootb);
+		unlock(&fs->rootlk);
+	}
+	unlock(f);
+	respond(m, &r);
+}
+
+char*
+fsreaddir(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[9], *p, *e;
+	int n, ns, done;
+	Scan *s;
+	Kvp kv;
+
+	s = f->scan;
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		return Edscan;
+	if(s == nil || m->offset == 0){
+		pfx[0] = Kent;
+		PBIT64(pfx+1, f->qpath);
+		kv.k = pfx;
+		kv.nk = sizeof(pfx);
+
+		print("scan starting\n");
+		if((s = mallocz(sizeof(Scan), 1)) == nil)
+			return "out of memory";
+		if((e = btscan(s, &kv, f->rootb, f->rooth, f->height)) != nil){
+			free(r->data);
+			btdone(s);
+			return e;
+		}
+
+		lock(f);
+		if(f->scan != nil)
+			btdone(f->scan);
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->done){
+		fprint(2, "early done...\n");
+		r->count = 0;
+		return nil;
+	}
+	p = r->data;
+	n = m->count;
+	while(1){
+		if((e = btnext(s, &kv, &done)) != nil)
+			return e;
+		if(done)
+			break;
+		if((ns = kv2statbuf(&kv, p, n)) == -1){
+			fprint(2, "** could not fill buf: %r\n");
+			break;
+		}
+		fprint(2, "*** nscan: %d\n", ns);
+		p += ns;
+		n -= ns;
+	}
+	r->count = p - r->data;
+	return nil;
+}
+
+int
+readb(Fid *f, char *d, vlong o, vlong n, int sz)
+{
+	char *e, buf[17];
+	vlong bp, bh, bo;
+	Blk *b;
+	Key k;
+	Kvp kv;
+
+	if(o >= sz)
+		return 0;
+
+	bp = o & ~(Blksz-1);
+	bo = o & (Blksz-1);
+
+	k.k = buf;
+	k.nk = sizeof(buf);
+	k.k[0] = Kdat;
+	PBIT64(k.k+1, f->qpath);
+	PBIT64(k.k+9, bp);
+
+	e = fslookup(f, &k, &kv, &b, 0);
+	if(e != nil && e != Eexist){
+		fprint(2, "!!! error: %s", e);
+		werrstr(e);
+		return -1;
+	}
+	fprint(2, "\treadb: key=%K, val=%P\n", &k, &kv);
+	bp = GBIT64(kv.v+0);
+	bh = GBIT64(kv.v+8);
+	putblk(b);
+
+	if((b = getblk(bp, bh)) == nil)
+		return -1;
+	fprint(2, "\treading from %llx (%llx) %s %s\n", bp, b->off, b->buf, b->data);
+	if(bo+n > Blksz)
+		n = Blksz-bo;
+	if(b != nil){
+		fprint(2, "\tcopying %lld to resp %p\n", n, d);
+		memcpy(d, b->buf, n);
+		putblk(b);
+	}else
+		memset(d, 0, n);
+	return n;
+}
+
+char*
+fsreadfile(Fmsg *m, Fid *f, Fcall *r)
+{
+	vlong n, c, o;
+	char *p;
+	Dent *e;
+
+	r->type = Rread;
+	r->count = 0;
+	e = f->dent;
+	rlock(e);
+	if(m->offset > e->length){
+		runlock(e);
+		return nil;
+	}
+	if((r->data = malloc(m->count)) == nil){
+		runlock(e);
+		return Enomem;
+	}
+	p = r->data;
+	c = m->count;
+	o = m->offset;
+	if(m->offset + m->count > e->length)
+		c = e->length - m->offset;
+//showfs("pre-readb");
+	while(c != 0){
+		n = readb(f, p, o, c, e->length);
+		if(n == -1){
+			runlock(e);
+			return Efs;
+		}
+		r->count += n;
+		if(n == 0)
+			break;
+		p += n;
+		o += n;
+		c -= n;
+	}
+	runlock(e);
+	return nil;
+}
+
+void
+fsread(Fmsg *m)
+{
+	char *e;
+	Fcall r;
+	Fid *f;
+
+	if((f = getfid(m->fid)) == nil){
+		rerror(m, Efid);
+		return;
+	}
+//showfs("fs contents before read");
+	fprint(2, "\n");
+	r.type = Rread;
+	r.count = 0;
+	if((r.data = malloc(m->count)) == nil){
+		rerror(m, Emem);
+		return;
+	}
+	fprint(2, "\nread{{{{\n");
+	if(f->dent->qid.type & QTDIR)
+		e = fsreaddir(m, f, &r);
+	else
+		e = fsreadfile(m, f, &r);
+	if(e != nil){
+		rerror(m, e);
+		return;
+	}else{
+		respond(m, &r);
+		free(r.data);
+	}
+	fprint(2, "\n}}}read\n");
+}
+
+int
+writeb(Fid *f, Msg *m, char *s, vlong o, vlong n, int sz)
+{
+	vlong fb, fo, bp, bh;
+	Blk *b, *t;
+	Kvp kv;
+
+	fb = o & ~(Blksz-1);
+	fo = o & (Blksz-1);
+
+	m->k[0] = Kdat;
+//	dprint("offset: %llx\n", fb);
+	PBIT64(m->k+1, f->qpath);
+	PBIT64(m->k+9, fb);
+
+	b = newblk(Traw);
+	if(b == nil)
+		return -1;
+	if(o < sz){
+		fprint(2, "\tappending to block %llx\n", b->off);
+		if(fslookup(f, m, &kv, &t, 0) != nil)
+			goto new;
+		bp = GBIT64(kv.v+0);
+		bh = GBIT64(kv.v+8);
+		putblk(t);
+
+		if((t = getblk(bp, bh)) == nil)
+			return -1;
+		memcpy(b->buf, t->buf, Blksz);
+		putblk(t);
+	}
+new:
+	if(fo+n > Blksz)
+		n = Blksz-fo;
+	memcpy(b->buf, s, n);
+	putblk(b);
+	fprint(2, "\twrote to new blk %llx at offset %lld\n", b->off, o);
+	bh = blkhash(b);
+	PBIT64(m->v+0, b->off);
+	PBIT64(m->v+8, bh);
+	fprint(2, "\tkv: %M", m);
+	return n;
+}
+
+void
+fswrite(Fmsg *m)
+{
+	char sbuf[8], offbuf[4][13+16], *p;
+	vlong n, o, c;
+	Msg kv[4];
+	Fcall r;
+	Fid *f;
+	int i;
+
+	
+	if((f = getfid(m->fid)) == nil){
+		rerror(m, Efid);
+		return;
+	}
+	if((f->mode&0x7) != OWRITE){
+		dprint("f->mode: %x\n", f->mode);
+		rerror(m, Einuse);
+		return;
+	}
+
+	p = m->data;
+	o = m->offset;
+	c = m->count;
+	for(i = 0; i < nelem(kv)-1 && c != 0; i++){
+		kv[i].op = Oinsert;
+		kv[i].k = offbuf[i];
+		kv[i].nk = 17;
+		kv[i].v = offbuf[i]+17;
+		kv[i].nv = 16;
+		n = writeb(f, &kv[i], p, o, c, f->dent->length);
+		if(n == -1){
+			// badwrite(f, i);
+			// FIXME: free pages
+			rerror(m, "%r");
+		}
+		p += n;
+		o += n;
+		c -= n;
+	}
+
+	wlock(f->dent);
+	kv[i].op = Owstat;
+	kv[i].k = f->dent->k;
+	kv[i].nk = f->dent->nk;
+	kv[i].v = sbuf;
+	kv[i].nv = 0;
+	if(m->offset+m->count > f->dent->length){
+		fprint(2, "bumping size...\n");
+		kv[i].op |= Owsize;
+		kv[i].nv += 8;
+		PBIT64(kv[i].v, m->offset+m->count);
+		f->dent->length = m->offset+m->count;
+	}
+	btupsert(kv, i+1);
+	wunlock(f->dent);
+
+	r.type = Rwrite;
+	r.count = m->count;
+	respond(m, &r);
+}
+
+void
+runfs(void *pfd)
+{
+	int fd, msgmax;
+	char err[128];
+	QLock *wrlk;
+	Fcall r;
+	Fmsg *m;
+
+	fd = (uintptr)pfd;
+	msgmax = Max9p;
+	if((wrlk = mallocz(sizeof(QLock), 1)) == nil)
+		fshangup(fd, "alloc wrlk: %r");
+	while(1){
+		if((m = readmsg(fd, msgmax)) == nil){
+			fshangup(fd, "truncated message: %r");
+			return;
+		}
+		if(convM2S(m->buf, m->sz, m) == 0){
+			fshangup(fd, "invalid message: %r");
+			return;
+		}
+/*
+		if(m->type != Tversion && !versioned){
+			fshangup(fd, "version required");
+			return;
+		}
+*/
+		m->wrlk = wrlk;
+		dprint("← %F\n", &m->Fcall);
+		switch(m->type){
+		/* sync setup */
+		case Tversion:	fsversion(m, &msgmax);	break;
+		case Tauth:	fsauth(m);		break;
+		case Tclunk:	fsclunk(m);		break;
+		case Topen:	fsopen(m);		break;
+		case Tattach:	fsattach(m, msgmax);	break;
+
+		/* mutators */
+		case Tflush:	chsend(fs->wrchan, m);	break;
+		case Tcreate:	chsend(fs->wrchan, m);	break;
+		case Twrite:	chsend(fs->wrchan, m);	break;
+		case Twstat:	chsend(fs->wrchan, m);	break;
+		case Tremove:	chsend(fs->wrchan, m);	break;
+
+		/* reads */
+		case Twalk:	chsend(fs->rdchan, m);	break;
+		case Tread:	chsend(fs->rdchan, m);	break;
+		case Tstat:	chsend(fs->rdchan, m);	break;
+		default:
+			fprint(2, "unknown message %F\n", &m->Fcall);
+			snprint(err, sizeof(err), "unknown message: %F", &m->Fcall);
+			r.type = Rerror;
+			r.ename = err;
+			respond(m, &r);
+			break;
+		}
+	}
+}
+
+void
+runwrite(void *)
+{
+	Fmsg *m;
+
+	while(1){
+		m = chrecv(fs->wrchan);
+		switch(m->type){
+		case Tflush:	rerror(m, "unimplemented flush");	break;
+		case Tcreate:	fscreate(m);	break;
+		case Twrite:	fswrite(m);	break;
+		case Twstat:	fswstat(m);	break;
+		case Tremove:	rerror(m, "unimplemented remove");	break;
+		}
+	}
+}
+
+void
+runread(void *)
+{
+	Fmsg *m;
+
+	while(1){
+		m = chrecv(fs->rdchan);
+		switch(m->type){
+		case Twalk:	fswalk(m);	break;
+		case Tread:	fsread(m);	break;
+		case Tstat:	fsstat(m);	break;
+		}
+	}
+}
+
+void
+runctl(void *pfd)
+{
+	char buf[128];
+	int fd, n;
+
+	fd = (uintptr)pfd;
+	while(1){
+		if((n = read(fd, buf, sizeof(buf)-1)) == -1)
+			break;
+		buf[n--] = 0;
+		while(buf[n] == ' ' || buf[n] == '\t' || buf[n] == '\n')
+			buf[n--] = 0;
+		fprint(2, "ctl message: %s\n", buf);
+		fprint(fd, "ctl message: %s\n", buf);
+		if(strcmp(buf, "show") == 0)
+			fshowfs(fd, "show");
+		else if(strcmp(buf, "check") == 0)
+			checkfs();
+		else
+			fprint(fd, "unknown command %s", buf);
+	}
+}
--- a/hash.c
+++ b/hash.c
@@ -30,7 +30,11 @@
 #include <u.h>
 #include <libc.h>
 #include <fcall.h>
+#include <avl.h>
 
+#include "dat.h"
+#include "fns.h"
+
 #define _le64toh(x) \
 	GBIT64((char*)&x)
 
@@ -96,4 +100,19 @@
 {
 	char key[16] = "gefsgefsgefsgefs";
 	return siphash24(src, len, key);
+}
+
+uvlong
+blkhash(Blk *b)
+{
+	return siphash(b->buf, Blksz);
+}
+
+u32int
+ihash(vlong x)
+{
+    x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
+    x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
+    x = x ^ (x >> 31);
+    return x;
 }
--- /dev/null
+++ b/load.c
@@ -1,0 +1,91 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+static int
+rangecmp(Avl *a, Avl *b)
+{
+	return ((Arange*)a)->off - ((Arange*)b)->off;
+}
+
+int
+loadarena(Arena *a, vlong o)
+{
+	Blk *b;
+
+	if((a->free = avlcreate(rangecmp)) == nil)
+		return -1;
+	if((b = readblk(o, 0)) == nil)
+		return -1;
+	a->b = b;
+	a->log = GBIT64(b->data+0);
+	a->logh = GBIT64(b->data+8);
+	if(loadlog(a) == -1)
+		return -1;
+	if(compresslog(a) == -1)
+		return -1;
+	return 0;
+}
+
+void
+loadfs(char *dev)
+{
+	vlong sb;
+	char *p;
+	Blk *b;
+	Dir *d;
+	int i, dirty;
+
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+	sb = d->length - (d->length % Blksz) - Blksz;
+print("superblock @%llx\n", sb);
+	free(d);
+
+	if((b = readblk(sb, 0)) == nil)
+		sysfatal("read superblock: %r");
+	if(b->type != Tsuper)
+		sysfatal("corrupt superblock: bad type");
+	p = b->data;
+	if(memcmp(p, "gefs0001", 8) != 0)
+		sysfatal("corrupt superblock: bad magic");
+	dirty = GBIT32(p +  8);
+	if(GBIT32(p + 12) != Blksz)
+		sysfatal("fs uses different block size");
+	if(GBIT32(p + 16) != Bufspc)
+		sysfatal("fs uses different buffer size");
+	if(GBIT32(p + 20) != Hdrsz)
+		sysfatal("fs uses different buffer size");
+	fs->height = GBIT32(p + 24);
+	fs->rootb = GBIT64(p + 32);
+	fs->rooth = GBIT64(p + 40);
+	fs->narena = GBIT32(p + 48);
+	fs->arenasz = GBIT64(p + 56);
+	fs->arenasz = GBIT64(p + 56);
+	fs->gen = GBIT64(p + 64);
+	fs->nextqid = GBIT64(p + 72);
+	fs->super = b;
+	fprint(2, "load: %8s\n", p);
+	fprint(2, "\theight:\t%d\n", fs->height);
+	fprint(2, "\trootb:\t%llx\n", fs->rootb);
+	fprint(2, "\trooth:\t%llx\n", fs->rooth);
+	fprint(2, "\tarenas:\t%d\n", fs->narena);
+	fprint(2, "\tarenasz:\t%lld\n", fs->arenasz);
+	fprint(2, "\trootgen:\t%lld\n", fs->gen);
+	fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++)
+		if((loadarena(&fs->arenas[i], i*fs->arenasz)) == -1)
+			sysfatal("loadfs: %r");
+	if(dirty){
+		fprint(2, "file system was not unmounted cleanly");
+		/* TODO: start gc pass */
+	}
+}
--- a/main.c
+++ b/main.c
@@ -1,11 +1,19 @@
 #include <u.h>
 #include <libc.h>
 #include <bio.h>
+#include <avl.h>
+#include <fcall.h>
+#include <ctype.h>
+
 #include "dat.h"
 #include "fns.h"
 
 Gefs *fs;
 
+int	ream;
+int	debug;
+char	*srvname = "gefs";
+
 static int
 Bconv(Fmt *fmt)
 {
@@ -14,22 +22,40 @@
 	b = va_arg(fmt->args, Blk*);
 	if(b == nil)
 		return fmtprint(fmt, "Blk(nil)");
-	return fmtprint(fmt, "Blk(%c)", (b->type == Pivot) ? 'P' : 'L');
+	return fmtprint(fmt, "Blk(%c)", (b->type == Tpivot) ? 'P' : 'L');
 }
 
+void
+launch(void (*f)(void *), void *arg, char *text)
+{
+	int pid;
+
+
+	pid = rfork(RFPROC|RFMEM|RFNOWAIT);
+	if (pid < 0)
+		sysfatal("can't fork: %r");
+	if (pid == 0) {
+		procsetname("%s", text);
+		(*f)(arg);
+		exits("child returned");
+	}
+}
+
 static int
 Mconv(Fmt *fmt)
 {
 	char *opname[] = {
-	[Ocreate]	"Ocreate",
+	[Oinsert]	"Oinsert",
 	[Odelete]	"Odelete",
-	[Owrite]	"Owrite",
 	[Owstat]	"Owstat",
 	};
 	Msg *m;
 
 	m = va_arg(fmt->args, Msg*);
-	return fmtprint(fmt, "Msg(%s, %.*s,%.*s)", opname[m->op], m->nk, m->k, m->nv, m->v);
+	return fmtprint(fmt, "Msg(%s, [%d]%.*X,[%d]%.*X)",
+		opname[m->op&0xf],
+		m->nk, m->nk, m->k,
+		m->nv, m->nv, m->v);
 }
 
 static int
@@ -39,126 +65,159 @@
 
 	kv = va_arg(fmt->args, Kvp*);
 	if(kv->type == Vinl)
-		return fmtprint(fmt, "Kvp(%.*s,%.*s)", kv->nk, kv->k, kv->nv, kv->v);
+		return fmtprint(fmt, "Kvp([%d]%.*X,[%d]%.*X)",
+			kv->nk, kv->nk, kv->k,
+			kv->nv, kv->nv, kv->v);
 	else
-		return fmtprint(fmt, "Kvp(%.*s,(%llux,%llux,%ud))",
-			kv->nk, kv->k, kv->bp, kv->bh, kv->fill);
+		return fmtprint(fmt, "Kvp([%d]%.*X,(%llux,%llux,%ud))",
+			kv->nk, kv->nk, kv->k,
+			kv->bp, kv->bh, kv->fill);
 }
 
 static int
+Rconv(Fmt *fmt)
+{
+	Arange *r;
+
+	r = va_arg(fmt->args, Arange*);
+	if(r == nil)
+		return fmtprint(fmt, "<Arange:nil>");
+	else
+		return fmtprint(fmt, "Arange(%lld+%lld)", r->off, r->len);
+}
+
+static int
 Kconv(Fmt *fmt)
 {
 	Key *k;
 
 	k = va_arg(fmt->args, Key*);
-	return fmtprint(fmt, "Key(%.*s)", k->nk, k->k);
+	return fmtprint(fmt, "Key([%d]%.*X)", k->nk, k->nk, k->k);
 }
 
-static void
-init(void)
+static int
+Xconv(Fmt *fmt)
 {
-	initshow();
-	quotefmtinstall();
-	fmtinstall('B', Bconv);
-	fmtinstall('M', Mconv);
-	fmtinstall('P', Pconv);
-	fmtinstall('K', Kconv);
-	fs = emalloc(sizeof(Gefs));
-	fs->root = newblk(Leaf);
-	fs->height = 1;
+	char *s, *e;
+	int n, i;
+
+	n = 0;
+	i = fmt->prec;
+	s = va_arg(fmt->args, char*);
+	e = s + fmt->prec;
+	for(; s != e; s++){
+		if(i % 4 == 0 && i != 0)
+			n += fmtprint(fmt, ":");
+		i--;
+		if(isalnum(*s))
+			n += fmtrune(fmt, *s);
+		else
+			n += fmtprint(fmt, "%02x", *s&0xff);
+	}
+	return n;
 }
 
-int
-test(char *path)
+void
+initfs(vlong cachesz)
 {
-	Biobuf *bfd;
-	char *e, *ln, *f[3];
-	int nf;
-	Msg m;
-	Kvp r;
-	Key k;
+	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
+		sysfatal("malloc: %r");
 
-	if((bfd = Bopen(path, OREAD)) == nil)
-		sysfatal("open %s: %r", path);
-	while((ln = Brdstr(bfd, '\n', 1)) != nil){
-		memset(f, 0, sizeof(f));
-		nf = tokenize(ln, f, nelem(f));
-		if(nf < 1 || strlen(f[0]) != 1)
-			sysfatal("malformed test file");
-		switch(*f[0]){
-		case '#':
-			break;
-		case 'I':
-			if(nf != 3)
-				sysfatal("malformed insert");
-			m.type = Vinl;
-			m.k = f[1];
-			m.v = f[2];
-			m.op = Ocreate;
-			m.nk = strlen(f[1]);
-			m.nv = strlen(f[2]);
-			print("insert (%s, %s)\n", m.k, m.v);
-			if(fsupsert(&m) == -1){
-				print("failed insert (%s, %s): %r\n", f[1], f[2]);
-				return -1;
-			}
-			break;
-		case 'D':
-			if(nf != 2)
-				sysfatal("malformed delete");
-			m.type = Vinl;
-			m.op = Odelete;
-			m.k = f[1];
-			m.v = nil;
-			m.nk = strlen(f[1]);
-			m.nv = 0;
-			print("delete %s\n", f[1]);
-			if(fsupsert(&m) == -1){
-				print("failed delete (%s): %r\n", f[1]);
-				return -1;
-			}
-			break;
-		case 'G':
-			k.k = f[1];
-			k.nk = strlen(f[1]);
-			e = fswalk1(&k, &r);
-			if(e != nil){
-				print("failed lookup on (%s): %s\n", f[1], e);
-				return -1;
-			}
-			break;
-		case 'S':
-			showfs("fs");
-			print("\n\n");
-			break;
-		case 'C':
-			checkfs();
-			break;
-		case 'V':
-			debug++;
-			break;
-		case 'X':
-			exits(f[1]);
-			break;
-		}
-//		if(!checkfs())
-//			abort();
-		free(ln);
-	}
-	return 0;
+	fs->cmax = cachesz/Blksz;
+	if(fs->cmax >= (2*GiB)/sizeof(Bucket))
+		sysfatal("cache too big");
+	if((fs->cache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
+		sysfatal("malloc: %r");
+}
+
+int
+postfd(char *name, char *suff)
+{
+	char buf[80];
+	int fd[2];
+	int cfd;
+
+	if(pipe(fd) < 0)
+		sysfatal("can't make a pipe");
+	snprint(buf, sizeof buf, "/srv/%s%s", name, suff);
+	if((cfd = create(buf, OWRITE|ORCLOSE|OCEXEC, 0600)) == -1)
+		sysfatal("create %s: %r", buf);
+	if(fprint(cfd, "%d", fd[0]) == -1)
+		sysfatal("write %s: %r", buf);
+	close(fd[0]);
+	return fd[1];
 }
 
 void
+usage(void)
+{
+	fprint(2, "usage: %s [-r] dev\n", argv0);
+	exits("usage");
+}
+
+void
 main(int argc, char **argv)
 {
-	int i;
+	int srvfd, ctlfd;
+	vlong cachesz;
 
+	cachesz = 16*MiB;
 	ARGBEGIN{
+	case 'r':
+		ream = 1;
+		break;
+	case 'c':
+		cachesz = strtoll(EARGF(usage()), nil, 0)*MiB;
+		break;
+	case 'd':
+		debug++;
+		break;
+	case 's':
+		srvname = EARGF(usage());
+		break;
+	default:
+		usage();
+		break;
 	}ARGEND;
+	if(argc == 0)
+		usage();
 
-	init();
-	for(i = 0; i < argc; i++)
-		if(test(argv[0]) == -1)
-			sysfatal("test %s: %r\n", argv[i]);
-	exits(nil);
+	/*
+	 * sanity checks -- I've tuned these to stupid
+	 * values in the past.
+	 */
+//	assert(4*Kpmax < Pivspc);
+//	assert(2*Msgmax < Bufspc);
+
+	initfs(cachesz);
+	initshow();
+	quotefmtinstall();
+	fmtinstall('B', Bconv);
+	fmtinstall('M', Mconv);
+	fmtinstall('P', Pconv);
+	fmtinstall('K', Kconv);
+	fmtinstall('R', Rconv);
+	fmtinstall('X', Xconv);
+	fmtinstall('F', fcallfmt);
+	if(ream){
+		reamfs(argv[0]);
+		exits(nil);
+	}else{
+		fs->rdchan = mkchan(128);
+		fs->wrchan = mkchan(128);
+		srvfd = postfd(srvname, "");
+		ctlfd = postfd(srvname, ".ctl");
+		loadfs(argv[0]);
+		launch(runctl, (void*)ctlfd, "ctl");
+		launch(runwrite, nil, "writeio");
+		launch(runread, nil, "readio");
+//		launch(runfs, (void*)srvfd, "fs");
+		runfs((void*)srvfd);
+//		launch(syncproc, nil, "sync");
+//		launch(flushproc, &fs->flushev, "flush");
+//		for(i = 1; i < argc; i++)
+//			if(test(argv[i]) == -1)
+//				sysfatal("test %s: %r\n", argv[i]);
+		exits(nil);
+	}
 }
--- a/mkfile
+++ b/mkfile
@@ -4,8 +4,12 @@
 OFILES=\
 	blk.$O\
 	check.$O\
+	fs.$O\
 	hash.$O\
+	load.$O\
 	main.$O\
+	pack.$O\
+	ream.$O\
 	tree.$O\
 	util.$O\
 
--- /dev/null
+++ b/pack.c
@@ -1,0 +1,266 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+char*
+unpack8(int *err, char *p, char *e, void *v)
+{
+	if (e - p < 1 || *err){
+		*err = 1;
+		return p;
+	}
+	*(uchar*)v = p[0];
+	return p+1;
+}
+
+char*
+unpack16(int *err, char *p, char *e, void *v)
+{
+	if (e - p < 2 || *err){
+		*err = 1;
+		return p;
+	}
+	*(ushort*)v = GBIT16(p);
+	return p+2;
+}
+
+char*
+unpack32(int *err, char *p, char *e, void *v)
+{
+	if (e - p < 4 || *err){
+		*err = 1;
+		return p;
+	}
+	*(uint*)v = GBIT32(p);
+	return p+4;
+}
+
+char*
+unpack64(int *err, char *p, char *e, void *v)
+{
+	if (e - p < 8 || *err){
+		*err = 1;
+		return p;
+	}
+	*(uvlong*)v = GBIT64(p);
+	return p+8;
+}
+
+/* Terminated so we can use them directly in C */
+char*
+unpackstr(int *err, char *p, char *e, char **s)
+{
+	int n;
+
+	if (e - p < 3 || *err){
+		*err = 1;
+		return p;
+	}
+	n = GBIT16(p);
+	if(e - p < n + 3 || p[n+2] != 0){
+		*err = 1;
+		return p;
+	}
+	*s = p+2;
+	return p+3+n;
+}
+
+char*
+pack8(int *err, char *p, char *e, uchar v)
+{
+	if (e - p < 1 || *err){
+		*err = 1;
+		return p;
+	}
+	p[0] = v;
+	return p+1;
+}
+
+char*
+pack16(int *err, char *p, char *e, ushort v)
+{
+	if (e - p < 2 || *err){
+		*err = 1;
+		return p;
+	}
+	PBIT16(p, v);
+	return p+2;
+}
+
+char*
+pack32(int *err, char *p, char *e, uint v)
+{
+	if (e - p < 4 || *err){
+		*err = 1;
+		return p;
+	}
+	PBIT32(p, v);
+	return p+4;
+}
+
+char*
+pack64(int *err, char *p, char *e, uvlong v)
+{
+	if (e - p < 8 || *err){
+		*err = 1;
+		return p;
+	}
+	PBIT64(p, v);
+	return p+8;
+}
+
+/* Terminated so we can use them directly in C */
+char*
+packstr(int *err, char *p, char *e, char *s)
+{
+	int n;
+
+	n = strlen(s);
+	if (e - p < n+3 || *err){
+		*err = 1;
+		return p;
+	}
+	PBIT16(p+0, n);
+	memcpy(p+2, s, n);
+	p[2+n] = 0;
+	return p+3+n;
+}
+		
+int
+dir2kv(vlong up, Dir *d, Kvp *kv, char *buf, int nbuf)
+{
+	char *k, *ek, *v, *ev, *eb;
+	int err;
+
+	err = 0;
+	k = buf;
+	ek = buf;
+	eb = buf + nbuf;
+	ek = pack8(&err, ek, eb, Kent);
+	ek = pack64(&err, ek, eb, up);
+	ek = packstr(&err, ek, eb, d->name);
+
+	v = ek;
+	ev = ek;
+	ev = pack64(&err, ev, eb, d->qid.path);
+	ev = pack32(&err, ev, eb, d->qid.vers);
+	ev = pack8(&err, ev, eb, d->qid.type);
+	ev = pack32(&err, ev, eb, d->mode);
+	ev = pack64(&err, ev, eb, d->atime*Nsec);
+	ev = pack64(&err, ev, eb, d->mtime*Nsec);
+	ev = pack64(&err, ev, eb, d->length);
+	ev = packstr(&err, ev, eb, d->uid);
+	ev = packstr(&err, ev, eb, d->gid);
+	ev = packstr(&err, ev, eb, d->muid);
+	if(err){
+		werrstr("stat too big: %.*s...", 32, d->name);
+		return -1;
+	}
+	kv->type = Vinl;
+	kv->k = k;
+	kv->nk = ek - k;
+	kv->v = v;
+	kv->nv = ev - v;
+	return 0;
+}
+
+int
+name2dkey(vlong up, char *name, Key *k, char *buf, int nbuf)
+{
+	char *ek, *eb;
+	int err;
+
+	err = 0;
+	ek = buf;
+	eb = buf + nbuf;
+	ek = pack8(&err, ek, eb, Kent);
+	ek = pack64(&err, ek, eb, up);
+	ek = packstr(&err, ek, eb, name);
+	if(err)
+		return -1;
+	k->k = buf;
+	k->nk = ek - buf;
+	return k->nk;
+}
+
+int
+kv2dir(Kvp *kv, Dir *d)
+{
+	char *k, *ek, *v, *ev;
+	vlong atime, mtime;
+	int err;
+
+	err = 0;
+	k = kv->k + 9;
+	ek = kv->k + kv->nk;
+dprint("unpacking... [%d %d]\n", k[0], k[1]);
+	k = unpackstr(&err, k, ek, &d->name);
+
+	v = kv->v;
+	ev = v + kv->nv;
+	v = unpack64(&err, v, ev, &d->qid.path);
+	v = unpack32(&err, v, ev, &d->qid.vers);
+	v = unpack8(&err, v, ev, &d->qid.type);
+	v = unpack32(&err, v, ev, &d->mode);
+	v = unpack64(&err, v, ev, &atime);
+	v = unpack64(&err, v, ev, &mtime);
+	v = unpack64(&err, v, ev, &d->length);
+	v = unpackstr(&err, v, ev, &d->uid);
+	v = unpackstr(&err, v, ev, &d->gid);
+	v = unpackstr(&err, v, ev, &d->muid);
+	if(err){
+		werrstr("kv too small");
+		return -1;
+	}
+	if(k != ek){
+		werrstr("invalid path");
+		return -1;
+	}
+	if(v != ev){
+		werrstr("stat full of fuck");
+		return -1;
+	}
+	d->atime = (atime+Nsec/2)/Nsec;
+	d->mtime = (mtime+Nsec/2)/Nsec;
+	return 0;
+}
+
+int
+kv2statbuf(Kvp *kv, char *buf, int nbuf)
+{
+	Dir d;
+	int n;
+
+	if(kv2dir(kv, &d) == -1)
+		return -1;
+	dprint("have %d bytes to pack into\n", nbuf);
+	if((n = convD2M(&d, (uchar*)buf, nbuf)) <= BIT16SZ){
+		fprint(2, "here...failed convert??, needed %d\n", GBIT16(buf));
+		return -1;
+	}
+	return n;	
+}
+
+int
+kv2qid(Kvp *kv, Qid *q)
+{
+	char *v, *ev;
+	int err;
+
+	err = 0;
+	v = kv->v;
+	ev = v + kv->nv;
+	v = unpack64(&err, v, ev, &q->path);
+	v = unpack32(&err, v, ev, &q->vers);
+	unpack8(&err, v, ev, &q->type);
+	if(err){
+		werrstr("kv too small");
+		return -1;
+	}
+	return 0;
+}
--- /dev/null
+++ b/ream.c
@@ -1,0 +1,157 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+
+static void
+initroot(Blk *r)
+{
+	char buf[512];
+	Kvp kv;
+	Dir d;
+
+	memset(&d, 0, sizeof(Dir));
+	d.qid = (Qid){fs->nextqid++, 0, QTDIR};
+	d.mode = 0755;
+	d.atime = 0;
+	d.mtime = 0;
+	d.length = 0;
+	d.name = "";
+	d.uid = "glenda";
+	d.gid = "glenda";
+	d.muid = "glenda";
+	if(dir2kv(-1, &d, &kv, buf, sizeof(buf)) == -1)
+		sysfatal("ream: pack root: %r");
+	blkinsert(r, &kv);
+
+	kv.k = buf;
+	kv.nk = 9;
+	kv.v = buf+9;
+	kv.nv = 8;
+	buf[0] = Ksuper;
+	PBIT64(buf+1, 0);
+	PBIT64(buf+9, 0);
+	blkinsert(r, &kv);
+}
+
+static void
+reamarena(Arena *a, vlong start, vlong asz)
+{
+	vlong off, bo, bh;
+	char *p;
+	Blk *b;
+
+	off = start;
+	if((b = mallocz(sizeof(Blk), 1)) == nil)
+		sysfatal("ream: %r");
+	off += Blksz;	/* arena header */
+
+	a->log = -1;
+	memset(b, 0, sizeof(Blk));
+	b->type = Tlog;
+	b->off = off;
+	b->logsz = 32;
+	b->data = b->buf + Hdrsz;
+	b->flag |= Bdirty;
+
+	p = b->data;
+	PBIT64(p+24, off|LgFree);		/* off */
+	PBIT64(p+32, asz);			/* len */
+	PBIT64(p+40, b->off|LgAlloc);		/* off */
+	PBIT64(p+48, Blksz);			/* len */
+	finalize(b);
+	synclog(b, -1, 32);
+
+	bh = blkhash(b);
+	bo = b->off;
+
+	memset(b, 0, sizeof(Blk));
+	b->type = Tarena;
+	b->off = start;
+	p = b->buf + Hdrsz;
+	print("b->off: %llx\n", b->off);
+	PBIT64(p+0, bo);
+	PBIT64(p+8, bh);
+	finalize(b);
+	if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1)
+		sysfatal("ream: write arena: %r");
+}
+
+void
+reamfs(char *dev)
+{
+	vlong sz, asz, off;
+	Blk *s, *r;
+	Dir *d;
+	int i;
+
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+	if(d->length < 64*MiB)
+		sysfatal("ream: disk too small");
+	if((s = mallocz(sizeof(Blk), 1)) == nil)
+		sysfatal("ream: %r");
+
+
+	sz = d->length;
+	sz = sz - (sz % Blksz) - Blksz;
+	fs->narena = sz / (128*GiB);
+	if(fs->narena < 1)
+		fs->narena = 1;
+	if(fs->narena >= 128)
+		fs->narena = 128;
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	free(d);
+
+	asz = sz/fs->narena;
+	asz = asz - (asz % Blksz) - Blksz;
+	fs->arenasz = asz;
+	off = 0;
+	fprint(2, "reaming %d arenas:\n", fs->narena);
+
+	for(i = 0; i < fs->narena; i++){
+		print("\tarena %d: %lld blocks at %lld\n", i, asz/Blksz, off);
+		reamarena(&fs->arenas[i], off, asz);
+		asz += off;
+	}
+	
+	s->type = Tsuper;
+	s->off = sz;
+	s->data = s->buf + Hdrsz;
+	fillsuper(s);
+	finalize(s);
+
+print("superblock @%llx\n", s->off);
+	for(i = 0; i < fs->narena; i++)
+		if((loadarena(&fs->arenas[i], i*asz)) == -1)
+			sysfatal("ream: loadarena: %r");
+
+	/*
+	 * Now that we have a completely empty fs, give it
+	 * a single root block that the tree will insert
+	 * into, and take a snapshot as the initial state.
+	 */
+	if((r = newblk(Tleaf)) == nil)
+		sysfatal("ream: allocate root: %r");
+	initroot(r);
+	finalize(r);
+
+	fs->super = s;
+	fs->rootb = r->off;
+	fs->rooth = blkhash(r);
+	fs->height = 1;
+	snapshot();
+
+	putblk(s);
+	putblk(r);
+	if(sync() == -1)
+		sysfatal("ream: sync: %r");
+}
--- a/tree.c
+++ b/tree.c
@@ -1,23 +1,44 @@
 #include <u.h>
 #include <libc.h>
 #include <fcall.h>
+#include <avl.h>
 
 #include "dat.h"
 #include "fns.h"
 
-int lookup;
 void
-dprint(char *fmt, ...)
+cpkey(Key *dst, Key *src, char *buf, int nbuf)
 {
-	va_list ap;
+	assert(src->nk <= nbuf);
+	dst->k = buf;
+	dst->nk = src->nk;
+	memcpy(dst->k, src->k, src->nk);
+}
 
-	if(!debug)
-		return;
-	va_start(ap, fmt);
-	vfprint(2, fmt, ap);
-	va_end(ap);
+void
+cpkvp(Kvp *dst, Kvp *src, char *buf, int nbuf)
+{
+	assert(src->nk+src->nv <= nbuf);
+	cpkey(dst, src, buf, nbuf);
+	dst->type = src->type;
+	if(src->type == Vinl){
+		dst->v = buf+src->nk;
+		dst->nv = src->nv;
+	}else{
+		dst->bp = src->bp;
+		dst->bh = src->bh;
+		dst->fill = src->fill;
+	}
+	memcpy(dst->v, src->v, src->nv);
 }
 
+void
+cpmsg(Msg *dst, Msg *src, char *buf, int nbuf)
+{
+	dst->op = src->op;
+	cpkvp(dst, src, buf, nbuf);
+}
+
 int
 keycmp(Key *a, Key *b)
 {
@@ -54,9 +75,9 @@
 {
 	int o;
 
-	assert(i >= 0 && i < b->nent);
+	assert(i >= 0 && i < b->nval);
 	o = GBIT16(b->data + 2*i);
-	if(b->type == Pivot){
+	if(b->type == Tpivot){
 		kv->type = Vref;
 		kv->nk = GBIT16(b->data + o);
 		kv->k = b->data + o + 2;
@@ -78,16 +99,16 @@
 	int o, nk, nv, ksz, vsz, spc;
 	char *p;
 
-	assert(i >= 0 && i <= b->nent);
-	spc = (b->type == Leaf) ? Leafspc : Pivspc;
+	assert(i >= 0 && i <= b->nval);
+	spc = (b->type == Tleaf) ? Leafspc : Pivspc;
 	p = b->data + 2*i;
 	nk = 2 + kv->nk;
 	nv = (kv->type == Vref) ? Ptrsz : 2 + kv->nv;
 	if (i < 0)
 		i = 0;
-	if(!replace || b->nent == i){
-		memmove(p + 2, p, 2*(b->nent - i));
-		b->nent++;
+	if(!replace || b->nval == i){
+		memmove(p + 2, p, 2*(b->nval - i));
+		b->nval++;
 		b->valsz += nk + nv;
 		o = spc - b->valsz;
 	}else{
@@ -99,7 +120,7 @@
 		 */
 		o = GBIT16(b->data + 2*i);
 		ksz = 2 + GBIT16(b->data + o);
-		if(b->type == Leaf)
+		if(b->type == Tleaf)
 			vsz = 2 + GBIT16(b->data + o + ksz);
 		else
 			vsz = 16;
@@ -109,13 +130,16 @@
 		}
 	}
 
-	if(2*b->nent + b->valsz > spc)
-		showblk(b, "setval overflow");
-	assert(2*b->nent + b->valsz <= spc);
-	assert(2*b->nent < o);
+	if(2*b->nval + b->valsz > spc){
+		dprint("2*%d + %d > %d [ksz: %d, vsz: %d]\n",
+			2*b->nval, b->valsz, spc, kv->nk, kv->nv);
+		showblk(b, "setval overflow", 1);
+		abort();
+	}
+	assert(2*b->nval + b->valsz <= spc);
+	assert(2*b->nval <= o);
 	p = b->data + o;
-	if(b->type == Pivot){
-		assert(kv->type == Vref);
+	if(b->type == Tpivot){
 		PBIT16(b->data + 2*i, o);
 		PBIT16(p +  0, kv->nk);
 		memcpy(p +  2, kv->k, kv->nk);
@@ -123,7 +147,6 @@
 		PBIT64(p + kv->nk + 10, kv->bh);
 		PBIT16(p + kv->nk + 18, kv->fill);
 	} else {
-		assert(kv->type == Vinl);
 		PBIT16(b->data + 2*i, o);
 		PBIT16(p +  0, kv->nk);
 		memcpy(p +  2, kv->k, kv->nk);
@@ -137,10 +160,10 @@
 {
 	char *p;
 
-	assert(i >= 0 && i <= b->nent);
-	b->nent--;
+	assert(i >= 0 && i <= b->nval);
+	b->nval--;
 	p = b->data + 2*i;
-	memmove(p, p + 2, 2*(b->nent - i));
+	memmove(p, p + 2, 2*(b->nval - i));
 	return 0;
 }
 
@@ -150,15 +173,15 @@
 	char *p;
 	int o;
 
-	assert(b->type == Pivot);
-	assert(i >= 0 && i <= b->nmsg);
-	b->nmsg++;
+	assert(b->type == Tpivot);
+	assert(i >= 0 && i <= b->nbuf);
+	b->nbuf++;
 	b->bufsz += msgsz(m);
-	assert(2*b->nent + b->bufsz <= Bufspc);
+	assert(2*b->nbuf + b->bufsz <= Bufspc);
 
 	p = b->data + Pivspc;
 	o = Pivspc - b->bufsz;
-	memmove(p + 2*(i+1), p+2*i, 2*(b->nmsg - i));
+	memmove(p + 2*(i+1), p+2*i, 2*(b->nbuf - i));
 	PBIT16(p + 2*i, o);
 
 	p = b->data + Bufspc + o;
@@ -175,8 +198,8 @@
 	char *p;
 	int o;
 
-	assert(b->type == Pivot);
-	assert(i >= 0 && i < b->nmsg);
+	assert(b->type == Tpivot);
+	assert(i >= 0 && i < b->nbuf);
 	o = GBIT16(b->data + Pivspc + 2*i);
 
 	p = b->data + Pivspc + o;
@@ -196,7 +219,7 @@
 
 	r = -1;
 	lo = 0;
-	hi = b->nent;
+	hi = b->nval;
 	while(lo < hi){
 		mid = (hi + lo) / 2;
 		getval(b, mid, &cmp);
@@ -206,7 +229,7 @@
 		else
 			lo = mid + 1;
 	}
-	if(lo < b->nent){
+	if(lo < b->nval){
 		getval(b, lo, &cmp);
 		r = keycmp(kv, &cmp);
 	}
@@ -220,12 +243,12 @@
 	Msg cmp;
 
 	lo = 0;
-	hi = b->nmsg;
+	hi = b->nbuf;
 	while(lo < hi){
 		mid = (hi + lo) / 2;
 		getmsg(b, mid, &cmp);
 		r = keycmp(m, &cmp);
-		if(r <= 0)
+		if(r < 0)
 			hi = mid;
 		else
 			lo = mid + 1;
@@ -234,26 +257,30 @@
 }
 
 static int
-bufsearch(Blk *b, Key *k, Msg *m, int *idx)
+bufsearch(Blk *b, Key *k, Msg *m, int *same)
 {
 	int lo, hi, mid, r;
 
+	r = -1;
 	lo = 0;
-	hi = b->nmsg - 1;
+	hi = b->nbuf - 1;
 	while(lo <= hi){
 		mid = (hi + lo) / 2;
 		getmsg(b, mid, m);
 		r = keycmp(k, m);
-		if(r == 0){
-			*idx = mid;
-			return 0;
-		}
 		if(r < 0)
 			hi = mid - 1;
 		else
 			lo = mid + 1;
 	}
-	return -1;
+	lo--;
+	if(lo >= 0){
+		getmsg(b, lo, m);
+		r = keycmp(k, m);
+	}
+	if(same != nil)
+		*same = (r == 0);
+	return lo;
 }
 
 int
@@ -263,7 +290,7 @@
 	Kvp cmp;
 
 	lo = 0;
-	hi = b->nent - 1;
+	hi = b->nval - 1;
 	while(lo <= hi){
 		mid = (hi + lo) / 2;
 		getval(b, mid, &cmp);
@@ -279,7 +306,7 @@
 }
 
 int
-blksearch(Blk *b, Key *k, Kvp *rp, int *idx, int *same)
+blksearch(Blk *b, Key *k, Kvp *rp, int *same)
 {
 	int lo, hi, mid, r;
 	Kvp cmp;
@@ -286,7 +313,7 @@
 
 	r = -1;
 	lo = 0;
-	hi = b->nent;
+	hi = b->nval;
 	while(lo < hi){
 		mid = (hi + lo) / 2;
 		getval(b, mid, &cmp);
@@ -303,16 +330,14 @@
 	}
 	if(same != nil)
 		*same = (r == 0);
-	if(idx != nil)
-		*idx = lo;
-	return 0;
+	return lo;
 }
 
 int
 filledbuf(Blk *b, int needed)
 {
-	assert(b->type == Pivot);
-	return 2*b->nmsg + b->bufsz > Bufspc - needed;
+	assert(b->type == Tpivot);
+	return 2*(b->nbuf+1) + b->bufsz + needed > Bufspc;
 }
 
 
@@ -319,8 +344,8 @@
 int
 filledleaf(Blk *b, int needed)
 {
-	assert(b->type == Leaf);
-	return 2*b->nent + b->valsz > Leafspc - needed;
+	assert(b->type == Tleaf);
+	return 2*(b->nval+1) + b->valsz + needed > Leafspc;
 }
 
 int
@@ -329,10 +354,10 @@
 	/* 
 	 * We need to guarantee there's room for one message
 	 * at all times, so that splits along the whole path
-	 * have somewhere to go.
+	 * have somewhere to go as they propagate up.
 	 */
-	assert(b->type == Pivot);
-	return 2*b->nent + b->valsz > Pivspc - reserve*Kpmax;
+	assert(b->type == Tpivot);
+	return 2*(b->nval+1) + b->valsz + reserve*Kpmax > Pivspc;
 }
 
 int
@@ -344,7 +369,7 @@
 	if(pp->split){
 		getval(pp->l, 0, &kv);
 		kv.type = Vref;
-		kv.bp = (uintptr)pp->l;
+		kv.bp = pp->l->off;
 		kv.bh = blkhash(pp->l);
 		kv.fill = blkfill(pp->l);
 		setval(n, i++, &kv, 0);
@@ -353,7 +378,7 @@
 
 		getval(pp->r, 0, &kv);
 		kv.type = Vref;
-		kv.bp = (uintptr)pp->r;
+		kv.bp = pp->r->off;
 		kv.bh = blkhash(pp->r);
 		kv.fill = blkfill(pp->r);
 		setval(n, i++, &kv, 0);
@@ -362,7 +387,7 @@
 	}else{
 		getval(pp->n, 0, &kv);
 		kv.type = Vref;
-		kv.bp = (uintptr)pp->n;
+		kv.bp = pp->n->off;
 		kv.bh = blkhash(pp->n);
 		kv.fill = blkfill(pp->n);
 		setval(n, i++, &kv, 1);
@@ -395,15 +420,15 @@
 	midx = (p != nil && p->merge) ? p->midx : -1;
 	if((n = newblk(b->type)) == nil)
 		return -1;
-	for(i = 0; i < b->nent; i++){
+	for(i = 0; i < b->nval; i++){
 		if(i == pidx){
 			j = copyup(n, j, pp, nil);
 			continue;
 		}else if(i == midx){
-			getval(p->ml, 0, &m);
+			getval(p->nl, 0, &m);
 			setval(n, j++, &m, 0);
-			if(p->mr){
-				getval(p->mr, 0, &m);
+			if(p->nr){
+				getval(p->nr, 0, &m);
 				setval(n, j++, &m, 0);
 			}
 			continue;
@@ -411,18 +436,18 @@
 		getval(b, i, &m);
 		setval(n, j++, &m, 0);
 	}
-	if(b->type == Pivot){
+	if(b->type == Tpivot){
 		i = 0;
 		j = 0;
-		while(i < b->nmsg){
+		while(i < b->nbuf){
 			if(i == lo)
 				i = hi;
-			if(i == b->nmsg)
+			if(i == b->nbuf)
 				break;
 			getmsg(b, i++, &m);
 			setmsg(n, j++, &m);
 		}
-		b->nmsg = j;
+		b->nbuf = j;
 	}
 	p->n = n;
 	return 0;
@@ -462,8 +487,20 @@
 
 	j = 0;
 	copied = 0;
-	halfsz = (2*b->nent + b->valsz)/2;
-	for(i = 0; copied < halfsz; i++){
+	halfsz = (2*b->nval + b->valsz)/2;
+	assert(b->nval >= 4);
+	for(i = 0; i < b->nval; i++){
+		/*
+		 * We're trying to balance size,
+		 * but we need at least 2 nodes
+		 * in each half of the split if
+		 * we want a valid tree.
+		 */
+		if(i == b->nval-2)
+			break;
+		if(i >= 2 && copied >= halfsz)
+			break;
+
 		if(i == pidx){
 			j = copyup(l, j, pp, &copied);
 			continue;
@@ -474,7 +511,7 @@
 	}
 	j = 0;
 	getval(b, i, mid);
-	for(; i < b->nent; i++){
+	for(; i < b->nval; i++){
 		if(i == pidx){
 			j = copyup(r, j, pp, nil);
 			continue;
@@ -482,12 +519,12 @@
 		getval(b, i, &t);
 		setval(r, j++, &t, 0);
 	}
-	if(b->type == Pivot){
+	if(b->type == Tpivot){
 		i = 0;
-		for(j = 0; i < b->nmsg; i++, j++){
+		for(j = 0; i < b->nbuf; i++, j++){
 			if(i == lo)
 				i = hi;
-			if(i == b->nmsg)
+			if(i == b->nbuf)
 				break;
 			getmsg(b, i, &m);
 			if(keycmp(&m, mid) >= 0)
@@ -494,10 +531,10 @@
 				break;
 			setmsg(l, j, &m);
 		}
-		for(j = 0; i < b->nmsg; i++, j++){
+		for(j = 0; i < b->nbuf; i++, j++){
 			if(i == lo)
 				i = hi;
-			if(i == b->nmsg)
+			if(i == b->nbuf)
 				break;
 			getmsg(b, i, &m);
 			setmsg(r, j, &m);
@@ -512,23 +549,55 @@
 int
 apply(Blk *b, Msg *m)
 {
-	assert(b->type == Leaf);
+	int idx, same;
+	vlong v;
+	char *p;
+	Kvp kv;
+
+	assert(b->type == Tleaf);
 	assert(b->flag & Bdirty);
-	switch(m->op){
-	case Ocreate:
+	switch(m->op&0xf){
+	case Oinsert:
 		blkinsert(b, m);
 		break;
 	case Odelete:
 		blkdelete(b, m);
 		break;
-	case Owrite:
-		werrstr("unimplemented");
-		goto error;
+	case Owstat:
+		p = m->v;
+		idx = blksearch(b, m, &kv, &same);
+		if(idx == -1 || !same)
+			abort();
+		/* bump version */
+		v = GBIT32(kv.v+8);
+		PBIT32(kv.v+8, v+1);
+		if(m->op & Owmtime){
+			v = GBIT64(p);
+			p += 8;
+			PBIT32(kv.v+25, v);
+		}
+		if(m->op & Owsize){
+			fprint(2, "wstat: incrementing size");
+			v = GBIT64(p);
+			p += 8;
+			PBIT64(kv.v+33, v);
+		}
+		if(m->op & Owmode){
+			v = GBIT32(p);
+			p += 4;
+			PBIT32(kv.v+33, v);
+		}
+		if(m->op & Owname){
+			fprint(2, "renames not yet supported");
+			abort();
+		}
+		if(p != m->v + m->nv)
+			fprint(2, "malformed wstat message");
+		break;
+	default:
+		abort();
 	}
 	return 0;
-error:
-	werrstr("invalid upsert");
-	return -1;
 }
 
 int
@@ -542,20 +611,21 @@
 	if((d = newblk(a->type)) == nil)
 		return -1;
 	o = 0;
-	for(i = 0; i < a->nent; i++){
+	for(i = 0; i < a->nval; i++){
 		getval(a, i, &m);
 		setval(d, o++, &m, 0);
 	}
-	for(i = 0; i < b->nent; i++){
+	for(i = 0; i < b->nval; i++){
 		getval(b, i, &m);
 		setval(d, o++, &m, 0);
 	}
-	if(a->type == Pivot){
-		for(i = 0; i < a->nmsg; i++){
+	if(a->type == Tpivot){
+		o = 0;
+		for(i = 0; i < a->nbuf; i++){
 			getmsg(a, i, &m);
 			setmsg(d, o++, &m);
 		}
-		for(i = 0; i < b->nmsg; i++){
+		for(i = 0; i < b->nbuf; i++){
 			getmsg(b, i, &m);
 			setmsg(d, o++, &m);
 		}
@@ -562,42 +632,61 @@
 	}
 	p->merge = 1;
 	p->midx = idx;
-	p->ml = d;
-	p->mr = nil;
+	p->nl = d;
+	p->nr = nil;
 	return 0;
 }
 
 /*
- * Returns whether the keys in b between
- * idx and m would spill out of the buffer
- * of d.
+ * Scan a single block for the split offset;
+ * returns 1 if we'd spill out of the buffer,
+ * updates *idx and returns 0 otherwise.
  */
 int
-spillsbuf(Blk *d, Blk *b, Msg *m, int *idx)
+spillscan(Blk *d, Blk *b, Msg *m, int *idx, int o)
 {
 	int i, used;
 	Msg n;
 
-	if(b->type == Leaf)
-		return 0;
-	used = 2*d->nmsg + d->bufsz;
-	for(i = *idx; i < b->nmsg; i++){
+	used = 2*d->nbuf + d->bufsz;
+	for(i = 0; i < b->nbuf; i++){
 		getmsg(b, i, &n);
-		if(keycmp(&n, m) > 0)
-			break;
+		print("check %P before %P: %d\n", &n.Kvp, &m->Kvp, keycmp(&n, m));
+		if(keycmp(m, &n) <= 0){
+			*idx = i + o;
+			return 0;
+		}
+		print("would copy %P before %P: %d\n", &n.Kvp, &m->Kvp, keycmp(&n, m));
 		used += 2 + msgsz(&n);
-		if(used >= Bufspc)
+		if(used > Bufspc)
 			return 1;
 	}
-	print("does not spill");
-	*idx = i;
+	*idx = b->nbuf;
 	return 0;
 }
 
+/*
+ * Returns whether the keys in b between
+ * idx and m would spill out of the buffer
+ * of d.
+ */
 int
-rotate(Path *p, int idx, Blk *a, Blk *b, int halfbuf)
+spillsbuf(Blk *d, Blk *l, Blk *r, Msg *m, int *idx)
 {
-	int i, o, copied, split, ovfidx;
+	if(l->type == Tleaf)
+		return 0;
+
+	if(*idx < l->nbuf && spillscan(d, l, m, idx, 0))
+		return 1;
+	if(*idx >= l->nbuf && spillscan(d, r, m, idx, l->nbuf))
+		return 1;
+	return 0;
+}
+
+int
+rotate(Path *p, int midx, Blk *a, Blk *b, int halfpiv)
+{
+	int i, o, cp, sp, idx;
 	Blk *d, *l, *r;
 	Msg m;
 
@@ -610,34 +699,35 @@
 	}
 	o = 0;
 	d = l;
-	split = -1;
-	ovfidx = 0;
-	copied = 0;
-	for(i = 0; i < a->nent; i++){
+	cp = 0;
+	sp = -1;
+	idx = 0;
+	for(i = 0; i < a->nval; i++){
 		getval(a, i, &m);
-		if(d == l && (copied >= halfbuf || spillsbuf(d, a, &m, &ovfidx))){
-			split = i;
-			o = 0;
+		
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
 			d = r;
+			o = 0;
 		}
 		setval(d, o++, &m, 0);
-		copied += valsz(&m);
+		cp += valsz(&m);
 	}
-	for(i = 0; i < b->nent; i++){
-		if(d == l && (copied >= halfbuf || spillsbuf(d, b, &m, &ovfidx))){
-			split = i;
-			o = 0;
+	for(i = 0; i < b->nval; i++){
+		getval(b, i, &m);
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
 			d = r;
+			o = 0;
 		}
-		getval(b, i, &m);
 		setval(d, o++, &m, 0);
-		copied += valsz(&m);
+		cp += valsz(&m);
 	}
-	if(a->type == Pivot){
+	if(a->type == Tpivot){
 		d = l;
 		o = 0;
-		for(i = 0; i < a->nmsg; i++){
-			if(i == split){
+		for(i = 0; i < a->nbuf; i++){
+			if(o == sp){
 				d = r;
 				o = 0;
 			}
@@ -644,8 +734,8 @@
 			getmsg(a, i, &m);
 			setmsg(d, o++, &m);
 		}
-		for(i = 0; i < b->nmsg; i++){
-			if(i == split){
+		for(i = 0; i < b->nbuf; i++){
+			if(o == sp){
 				d = r;
 				o = 0;
 			}
@@ -654,9 +744,9 @@
 		}
 	}
 	p->merge = 1;
-	p->midx = idx;
-	p->ml = l;
-	p->mr = r;
+	p->midx = midx;
+	p->nl = l;
+	p->nr = r;
 	return 0;
 }
 
@@ -667,14 +757,14 @@
 
 	assert(a->type == b->type);
 
-	na = 2*a->nent + a->valsz;
-	nb = 2*b->nent + b->valsz;
-	if(a->type == Leaf){
+	na = 2*a->nval + a->valsz;
+	nb = 2*b->nval + b->valsz;
+	if(a->type == Tleaf){
 		ma = 0;
 		mb = 0;
 	}else{
-		ma = 2*a->nmsg + a->bufsz;
-		mb = 2*b->nmsg + b->bufsz;
+		ma = 2*a->nbuf + a->bufsz;
+		mb = 2*b->nbuf + b->bufsz;
 	}
 	imbalance = na - nb;
 	if(imbalance < 0)
@@ -682,7 +772,7 @@
 	/* works for leaf, because 0 always < Bufspc */
 	if(na + nb < Pivspc && ma + mb < Bufspc)
 		return merge(p, idx, a, b);
-	else if(imbalance < 4*Msgmax)
+	else if(imbalance > 4*Msgmax)
 		return rotate(p, idx, a, b, (na + nb)/2);
 	else
 		return 0;
@@ -709,7 +799,7 @@
 	}
 	/* Try merging left */
 	getval(p->b, idx, &km);
-	if(idx > 0){
+	if(idx-1 >= 0){
 		getval(p->b, idx-1, &kl);
 		if(kl.fill + km.fill >= Blkspc)
 			goto next;
@@ -720,7 +810,7 @@
 		goto done;
 	}
 next:
-	if(idx < p->b->nent){
+	if(idx+1 < p->b->nval){
 		getval(p->b, idx+1, &kr);
 		if(kr.fill + km.fill >= Blkspc)
 			goto done;
@@ -741,17 +831,15 @@
 }
 
 static int
-flush(Path *path, int npath, Msg *ins, int *redo)
+flush(Path *path, int npath)
 {
-	static int nins;
 
 	Path *p, *pp;
 	Blk *b;
 	Kvp mid;
 	Msg m;
-	int i, ret;
+	int i;
 
-	ret = -1;
 	/*
 	 * The path must contain at minimum two elements:
 	 * we must have 1 node we're inserting into, and
@@ -759,10 +847,9 @@
 	 * we put the new root into if the root gets split.
 	 */
 	assert(npath >= 2);
-	*redo = 0;
 	pp = nil;
 	p = &path[npath - 1];
-	if(p->b->type == Leaf){
+	if(p->b->type == Tleaf){
 		if(!filledleaf(p->b, p[-1].sz)){
 			if(update(p, pp) == -1)
 				return -1;
@@ -772,6 +859,7 @@
 					break;
 				apply(p->n, &m);
 			}
+			finalize(p->n);
 		}else{
 			if(split(p->b, p, pp, &mid) == -1)
 				goto error;
@@ -785,6 +873,8 @@
 				if(apply(b, &m) == -1)
 					goto error;
 			}
+			finalize(p->l);
+			finalize(p->r);
 		}
 		assert(p->n || (p->l && p->r));
 		p->midx = -1;
@@ -793,11 +883,17 @@
 	}
 	for(; p > path; p--){
 		if(!filledpiv(p->b, 1)){
+#ifdef NOPE
 			if(trymerge(p, pp, p->idx) == -1)
 				goto error;
+#endif
 			/* If we merged the root node, break out. */
-			if(p[-1].b == nil && p[0].merge && p[0].mr == nil && p[0].b->nent == 2)
+			if(p[-1].b == nil && p[0].merge && p[0].nr == nil && p[0].b->nval == 2){
+				/* FIXME: shouldn't p[1].n already be right? */
+				p[1].n = p[0].nl;
+				p[0].n = nil;
 				break;
+			}
 			if(update(p, pp) == -1)
 				goto error;
 			for(i = p[-1].lo; i < p[-1].hi; i++){
@@ -806,7 +902,9 @@
 					break;
 				bufinsert(p->n, &m);
 			}
+			finalize(p->n);
 		}else{
+			dprint("-- split\n");
 			if(split(p->b, p, pp, &mid) == -1)
 				goto error;
 			b = p->l;
@@ -818,24 +916,28 @@
 					continue;
 				bufinsert(b, &m);
 			}
+			finalize(p->l);
+			Blk *x = getblk(p->l->off, -1);
+			assert(p->l == x);
+			finalize(p->r);
 		}
 		pp = p;
 	}
 	if(path[1].split){
-		if((path[0].n = newblk(Pivot)) == nil)
+		if((path[0].n = newblk(Tpivot)) == nil)
 			goto error;
 		copyup(path[0].n, 0, &path[1], nil);
-		bufinsert(path[0].n, ins);
-	}else{
-		if(path[1].b->type == Leaf && !filledleaf(path[1].b, msgsz(ins)))
-			apply(path[1].n, ins);
-		else if(!filledbuf(path[1].n, msgsz(ins)))
-			bufinsert(path[1].n, ins);
-		else
-			*redo = 1;
 	}
-	ret = 0;
+	return 0;
 error:
+	return -1;
+}
+
+void
+freepath(Path *path, int npath)
+{
+	Path *p;
+
 	for(p = path; p != path + npath; p++){
 		if(p->b != nil)
 			putblk(p->b);
@@ -846,7 +948,6 @@
 		if(p->r != nil)
 			putblk(p->r);
 	}
-	return ret;
 }
 
 /*
@@ -869,13 +970,13 @@
 	 * go to the first node. Stop *after* the last entry,
 	 * because entries >= the last entry all go into it.
 	 */
-	for(i = 1; i <= b->nent; i++){
-		if(i < b->nent)
+	for(i = 1; i <= b->nval; i++){
+		if(i < b->nval)
 			getval(b, i, &kv);
 		cursz = 0;
-		for(; j < b->nmsg; j++){
+		for(; j < b->nbuf; j++){
 			getmsg(b, j, &m);
-			if(i < b->nent && keycmp(&m, &kv) >= 0)
+			if(i < b->nval && keycmp(&m, &kv) >= 0)
 				break;
 			/* 2 bytes for offset, plus message size in buffer */
 			cursz += 2 + msgsz(&m);
@@ -892,36 +993,59 @@
 }
 
 int
-fsupsert(Msg *m)
+btupsert(Msg *msg, int nmsg)
 {
-	Blk *b, *n, *s;
-	int npath, redo;
+	int i, npath, redo, dh, nm, height;
+	vlong rh;
 	Path *path;
+	Blk *b, *rb;
 	Kvp sep;
 
-	if(m->nk + 2 > Keymax){
-		werrstr("overlong key");
+	nm = 0;
+	for(i = 0; i < nmsg; i++){
+		if(msg[i].nk + 2 > Keymax){
+			werrstr("overlong key");
+			return -1;
+		}
+		nm += msgsz(&msg[i]);
+	}
+
+again:
+	if((b = getroot(&height)) == nil){
+		werrstr("get root: %r");
 		return -1;
 	}
+
 	/*
+	 * Fast path: the root of the tree has room,
+	 * and nobody else has observed the node, so
+	 * we can modify it with gleeful abandon.
+	 */
+	if(b->type == Tpivot && canwlock(b)) {
+		if((b->flag & Bdirty) && !filledbuf(b, nm)){
+			for(i = 0; i < nmsg; i++)
+				bufinsert(b, &msg[i]);
+			wunlock(b);
+			return 0;
+		}
+	}
+
+	/*
 	 * The tree can grow in height by 1 when we
 	 * split, so we allocate room for one extra
 	 * node in the path.
 	 */
-again:
-	n = nil;
-	s = nil;
 	redo = 0;
 	npath = 0;
-	path = emalloc((fs->height + 2)*sizeof(Path));
+	if((path = calloc((height + 2), sizeof(Path))) == nil)
+		return -1;
 	path[npath].b = nil;
 	path[npath].idx = -1;
 	path[npath].midx = -1;
 	npath++;
 
-	b = fs->root;
-	path[0].sz = msgsz(m);
-	while(b->type == Pivot){
+	path[0].sz = nm;
+	while(b->type == Tpivot){
 		if(!filledbuf(b, path[npath - 1].sz))
 			break;
 		victim(b, &path[npath]);
@@ -936,75 +1060,352 @@
 	path[npath].lo = 0;
 	path[npath].hi = 0;
 	npath++;
-	if(flush(path, npath, m, &redo) == -1)
+
+	dh = -1;
+	rb = nil;
+//	showfs("flushing");
+	if(flush(path, npath) == -1)
 		goto error;
+
 	if(path[0].n != nil){
-		fs->height++;
-		fs->root = path[0].n;
+		dh = 1;
+		rb = path[0].n;
 	}else if(path[1].n != nil){
-		fs->root = path[1].n;
+		dh = 0;
+		rb = path[1].n;
 	}else if(npath >2 && path[2].n != nil){
-		fs->height--;
-		fs->root = path[2].n;
+		dh = -1;
+		rb = path[2].n;
 	}else
 		abort();
+
+	if(rb->type == Tleaf && !filledleaf(rb, nm))
+		for(i = 0; i < nmsg; i++)
+			apply(rb, &msg[i]);
+	else if(rb->type == Tpivot && !filledbuf(rb, nm))
+		for(i = 0; i < nmsg; i++)
+			bufinsert(rb, &msg[i]);
+	else
+		redo = 1;
+	finalize(rb);
+
+	assert(rb->off != 0);
+	rh = blkhash(rb);
+	lock(&fs->rootlk);
+	fs->height += dh;
+	fs->rootb = rb->off;
+	fs->rooth = rh;
+	unlock(&fs->rootlk);
+
+	freepath(path, npath);
 	free(path);
+	if(!checkfs()){
+		showfs("broken");
+		abort();
+	}
 	if(redo)
 		goto again;
+//	showfs("fs");
+	snapshot();
 	return 0;
 error:
-	if(n != nil) freeblk(n);
-	if(s != nil) freeblk(s);
+	freepath(path, npath);
 	free(path);
 	return -1;
 }
 
+Blk*
+getroot(int *h)
+{
+	vlong bp, bh;
+
+	lock(&fs->rootlk);
+	bp = fs->rootb;
+	bh = fs->rooth;
+	if(h != nil)
+		*h = fs->height;
+	unlock(&fs->rootlk);
+	return getblk(bp, bh);
+}
+
 static char*
 collect(Blk *b, Key *k, Kvp *r, int *done)
 {
-	int i, idx;
+	int i, idx, same;
+	char *err;
 	Msg m;
 
 	*done = 0;
-	if(bufsearch(b, k, &m, &idx) != 0)
+dprint("collecting %K\n", k);
+//showblk(b, "collecting from", 0);
+	idx = bufsearch(b, k, &m, &same);
+dprint("idx=%d, same=%d\n", idx, same);
+	if(!same)
 		return nil;
-	for(i = idx; i < b->nmsg; i++){
+	err = Eexist;
+	for(i = idx; i < b->nbuf; i++){
 		getmsg(b, i, &m);
+dprint("msg=%M\n", &m);
+dprint("cmp=%d\n", keycmp(&m, k));
 		if(keycmp(&m, k) != 0)
 			break;
 		switch(m.op){
-		case Ocreate:
+		case Oinsert:
 			*r = m.Kvp;
 			*done = 1;
-			return nil;
+			err = nil;
+			break;
 		case Odelete:
 			*done = 1;
-			return Eexist;
-		/* The others don't affect walks */
+			err = Eexist;
+			break;
+		default:
+			return Efs;
 		}
 	}
-	return nil;
+	return err;
 }
 
 char*
-fswalk1(Key *k, Kvp *r)
+btlookupat(Blk *b0, Key *k, Kvp *r, Blk **bp)
 {
 	int idx, same, done;
 	char *ret;
 	Blk *b;
 
-	b = fs->root;
-	while(b->type == Pivot){
+	*bp = nil;
+	b = pinblk(b0);
+	assert(k != r);
+	while(b->type == Tpivot){
 		ret = collect(b, k, r, &done);
 		if(done)
 			return ret;
-		if(blksearch(b, k, r, &idx, nil) == -1 || idx == -1)
+		idx = blksearch(b, k, r, nil);
+		if(idx == -1)
 			return Eexist;
+		putblk(b);
 		if((b = getblk(r->bp, r->bh)) == nil)
 			return Efs;
 	}
-	assert(b->type == Leaf);
-	if(blksearch(b, k, r, nil, &same) == -1 || !same)
+	assert(b->type == Tleaf);
+	blksearch(b, k, r, &same);
+	if(!same){
+		putblk(b);
 		return Eexist;
+	}
+	*bp = b;
 	return nil;
+}
+
+char*
+btlookup(Key *k, Kvp *r, Blk **bp)
+{
+	char *ret;
+	Blk *b;
+
+	*bp = nil;
+	if((b = getroot(nil)) == nil)
+		return Efs;
+	ret = btlookupat(b, k, r, bp);
+	putblk(b);
+
+	return ret;
+}
+
+char*
+btscan(Scan *s, Key *pfx, vlong rootb, vlong rooth, int height)
+{
+	int i, same;
+	Scanp *p;
+	Msg m;
+	Kvp v;
+	Blk *b;
+
+	s->done = 0;
+	s->offset = 0;
+	s->height = height;
+	cpkey(&s->pfx, pfx, s->pfxbuf, sizeof(s->pfxbuf));
+
+	s->kv.v = s->kvbuf+pfx->nk;
+	s->kv.nv = 0;
+	cpkey(&s->kv, pfx, s->kvbuf, sizeof(s->kvbuf));
+
+	if(rootb != -1){
+		s->rootb = rootb;
+		s->rooth = rooth;
+		s->height = height;
+	}else{
+		lock(&fs->rootlk);
+		s->rootb = fs->rootb;
+		s->rooth = fs->rooth;
+		s->height = fs->height;
+dprint("height %d\n", s->height);
+		unlock(&fs->rootlk);
+	}
+	if((s->path = calloc(s->height, sizeof(Scanp))) == nil){
+		free(s);
+		return nil;
+	}
+
+	p = s->path;
+	if((b = getblk(s->rootb, s->rooth)) == nil)
+		return "error reading block";
+	p[0].b = b;
+	for(i = 0; i < s->height; i++){
+		p[i].vi = blksearch(b, &s->kv, &v, &same);
+		if(p[i].vi == -1 || (!same && b->type == Tleaf))
+			getval(b, ++p[i].vi, &v);
+		if(b->type == Tpivot){
+			p[i].bi = bufsearch(b, &s->kv, &m, &same);
+			if(p[i].bi == -1 || !same)
+				p[i].bi++;
+			if((b = getblk(v.bp, v.bh)) == nil)
+				return "error readivg block";
+			p[i+1].b = b;
+		}else{
+			assert(i == s->height-1);
+		}
+	}
+dprint("inited\n");
+for(i = 0; i < s->height; i++){
+dprint("\t%p", p[i].b);
+dprint(" (%d %d)\n", p[i].vi, p[i].bi);
+}
+	return nil;
+}
+
+char *
+btnext(Scan *s, Kvp *r, int *done)
+{
+	Scanp *p;
+	int i, j, h, start;
+	Msg m, n, t;
+	Kvp kv;
+
+	p = s->path;
+	h = s->height;
+	*done = 0;
+	start = h;
+	for(i = h-1; i > 0; i--){
+dprint("advancing (i=%d)\n", i);
+for(j = 0; j < h; j++){
+dprint("\t%p", p[j].b);
+dprint(" (%d %d)\n", p[j].vi, p[j].bi);
+}
+		if(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf)
+			break;
+		if(i == 0){
+			*done = 1;
+			return nil;
+		}
+		start = i;
+		p[i-1].vi++;
+		p[i].vi = 0;
+		p[i].bi = 0;
+	}
+	for(i = start; i < h; i++){
+		getval(p[i-1].b, p[i-1].vi, &kv);
+		if((p[i].b = getblk(kv.bp, kv.bh)) == nil)
+			return "error reading block";
+	}
+	getval(p[h-1].b, p[h-1].vi, &m);
+	for(i = h-2; i >= 0; i--){
+		if(p[i].bi == p[i].b->nbuf)
+			continue;
+		getmsg(p[i].b, p[i].bi, &n);
+		if(keycmp(&m, &n) >= 0)
+			m = n;
+	}
+	if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
+		*done = 1;
+		return nil;
+	}
+	cpkvp(r, &m, s->kvbuf, sizeof(s->kvbuf));
+	getval(p[h-1].b, p[h-1].vi, &t);
+	if(keycmp(&m, &t) == 0)
+		p[h-1].vi++;
+	for(i = h-2; i >= 0; i--){
+		for(j = p[i].bi; j < p[i].b->nbuf; j++){
+			getmsg(p[i].b, j, &t);
+			if(keycmp(&m, &t) != 0)
+				break;
+			p[i].bi++;
+		}
+	}
+	return nil;
+}
+
+void
+btdone(Scan *s)
+{
+	int i;
+
+	for(i = 0; i < s->height; i++)
+		if(s->path[i].b != nil)
+			putblk(s->path[i].b);
+	free(s->path);
+}
+
+int
+snapshot(void)
+{
+	Blk *s;
+	int r;
+
+	s = fs->super;
+	qlock(&fs->snaplk);
+	lock(&fs->rootlk);
+	fillsuper(s);
+	finalize(s);
+	unlock(&fs->rootlk);
+print("snap superblock @%llx\n", s->off);
+showfree("snap");
+	r = pwrite(fs->fd, s->buf, Blksz, s->off);
+	qunlock(&fs->snaplk);
+	return r;
+}
+
+int
+getrefpg(vlong off, Blk **p)
+{
+	char kbuf[9], *e;
+	vlong bp, bh;
+	Blk *b;
+	Kvp kv;
+
+	*p = nil;
+	kbuf[0] = Kref;
+	PBIT64(kbuf+1, off & ~(Blksz-1));
+	kv.k = kbuf;
+	kv.nk = sizeof(kbuf);
+	e = btlookup(&kv, &kv, &b);
+	if(e == Eexist)
+		return 0;
+	if(e != nil || kv.nv != 16)
+		return -1;
+	bp = GBIT64(kv.v+0);
+	bh = GBIT64(kv.v+8);
+	putblk(b);
+	if((*p = getblk(bp, bh)) == nil)
+		return -1;
+	return 0;
+}
+
+int
+setrefpg(vlong pg, vlong bp, vlong bh)
+{
+	char kbuf[9], vbuf[16];
+	Msg m;
+
+	kbuf[0] = Kref;
+	PBIT64(kbuf+1, pg & ~(Blksz-1));
+	PBIT64(vbuf+0, bp);
+	PBIT64(vbuf+8, bh);
+
+	m.op = Oinsert;
+	m.k = kbuf;
+	m.nk = sizeof(kbuf);
+	m.v = vbuf;
+	m.nv = sizeof(vbuf);
+	return btupsert(&m, 1);
 }
--- a/util.c
+++ b/util.c
@@ -1,5 +1,8 @@
 #include <u.h>
 #include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
 #include "dat.h"
 #include "fns.h"