shithub: gefs

Download patch

ref: 32325a04016b615ffbdbe07d3f401dc2dd43d957
parent: f0563cbdbce1777d5672fee507acac7ab0d6dbc0
author: Ori Bernstein <ori@eigenstate.org>
date: Thu Sep 16 17:54:18 EDT 2021

tree: refactor to allow multiple parallel trees

--- a/blk.c
+++ b/blk.c
@@ -180,7 +180,7 @@
 	char *p;
 
 	assert(off % Blksz == 0);
-	assert(op == LgAlloc || op == LgFree);
+	assert(op == LogAlloc || op == LogFree);
 	if(lb == nil || lb->logsz > Logspc - 8){
 		pb = lb;
 		if((o = blkalloc_lk(a)) == -1)
@@ -193,7 +193,7 @@
 		lb->off = o;
 		lb->logsz = Loghdsz;
 		p = lb->data + lb->logsz;
-		PBIT64(p + 0, (uvlong)LgEnd);
+		PBIT64(p + 0, (uvlong)LogEnd);
 		finalize(lb);
 		if(syncblk(lb) == -1){
 			free(lb);
@@ -203,7 +203,7 @@
 		a->logtl = lb;
 		if(pb != nil){
 			p = pb->data + pb->logsz;
-			PBIT64(p + 0, lb->off|LgChain);
+			PBIT64(p + 0, lb->off|LogChain);
 			finalize(pb);
 			if(syncblk(pb) == -1)
 				return nil;
@@ -212,7 +212,7 @@
 
 	p = lb->data + lb->logsz;
 	if(len == Blksz){
-		off |= (op & ~Lg2w);
+		off |= (op & ~Log2w);
 		PBIT64(p, off);
 		lb->logsz += 8;
 	}else{
@@ -223,7 +223,7 @@
 	}
 	/* this gets overwritten by the next append */
 	p = lb->data + lb->logsz;
-	PBIT64(p, (uvlong)LgEnd);
+	PBIT64(p, (uvlong)LogEnd);
 	return lb;
 }
 
@@ -234,7 +234,7 @@
  * recursion.
  */
 int
-logalloc(Arena *a, vlong off, int op)
+logop(Arena *a, vlong off, int op)
 {
 	Blk *b;
 
@@ -275,9 +275,9 @@
 		ent = GBIT64(d);
 		op = ent & 0xff;
 		off = ent & ~0xff;
-		n = (op & Lg2w) ? 16 : 8;
+		n = (op & Log2w) ? 16 : 8;
 		switch(op){
-		case LgEnd:
+		case LogEnd:
 			dprint("log@%d: end\n", i);
 			/*
 			 * since we want the next insertion to overwrite
@@ -285,7 +285,7 @@
 			 */
 			b->logsz = i;
 			return 0;
-		case LgChain:
+		case LogChain:
 			bp = off & ~0xff;
 			dprint("log@%d: chain %llx\n", i, bp);
 			b->logsz = i+n;
@@ -292,30 +292,26 @@
 			goto Nextblk;
 			break;
 
-		case LgFlush:
+		case LogFlush:
 			dprint("log@%d: flush: %llx\n", i, off>>8);
 			lock(&fs->genlk);
 			fs->gen = off >> 8;
 			unlock(&fs->genlk);
 			break;
-		case LgAlloc:
-		case LgAlloc1:
-			len = (op & Lg2w) ? GBIT64(d+8) : Blksz;
+		case LogAlloc:
+		case LogAlloc1:
+			len = (op & Log2w) ? GBIT64(d+8) : Blksz;
 			dprint("log@%d alloc: %llx+%llx\n", i, off, len);
 			if(grabrange(a->free, off & ~0xff, len) == -1)
 				return -1;
 			break;
-		case LgFree:
-		case LgFree1:
-			len = (op & Lg2w) ? GBIT64(d+8) : Blksz;
+		case LogFree:
+		case LogFree1:
+			len = (op & Log2w) ? GBIT64(d+8) : Blksz;
 			dprint("log@%d free: %llx+%llx\n", i, off, len);
 			if(freerange(a->free, off & ~0xff, len) == -1)
 				return -1;
 			break;
-		case LgRef:
-		case LgUnref:
-			fprint(2, "unimplemented ref op at log@%d: log op %d\n", i, op);
-			break;
 		default:
 			n = 0;
 			dprint("log@%d: log op %d\n", i, op);
@@ -363,7 +359,7 @@
 	b->data = b->buf + Hdrsz;
 	b->logsz = Loghdsz;
 
-	PBIT64(b->data+b->logsz, (uvlong)LgEnd);
+	PBIT64(b->data+b->logsz, (uvlong)LogEnd);
 	finalize(b);
 	if(syncblk(b) == -1){
 		free(b);
@@ -411,10 +407,10 @@
 	hd = b;
 	b->logsz = Loghdsz;
 	for(i = 0; i < n; i++)
-		if((b = logappend(a, b, log[i].off, log[i].len, LgFree)) == nil)
+		if((b = logappend(a, b, log[i].off, log[i].len, LogFree)) == nil)
 			return -1;
 	p = b->data + b->logsz;
-	PBIT64(p, LgChain|graft);
+	PBIT64(p, LogChain|graft);
 	free(log);
 	finalize(b);
 	if(syncblk(b) == -1)
@@ -426,6 +422,7 @@
 	ab = a->b;
 	PBIT64(ab->data + 0, a->log);
 	PBIT64(ab->data + 8, a->logh);
+	finalize(ab);
 	if(syncblk(ab) == -1)
 		return -1;
 checkfs();
@@ -438,11 +435,11 @@
 			for(i = Loghdsz; i < Logspc; i += n){
 				p = b->data + i;
 				v = GBIT64(p);
-				n = (v & Lg2w) ? 16 : 8;
-				if((v&0xff) == LgChain){
+				n = (v & Log2w) ? 16 : 8;
+				if((v&0xff) == LogChain){
 					nb = v & ~0xff;
 					break;
-				}else if((v&0xff) == LgEnd){
+				}else if((v&0xff) == LogEnd){
 					nb = -1;
 					break;
 				}
@@ -507,7 +504,7 @@
 	cachedel(b);
 	if(freerange(a->free, b, Blksz) == -1)
 		goto out;
-	if(logalloc(a, b, LgFree) == -1)
+	if(logop(a, b, LogFree) == -1)
 		goto out;
 	r = 0;
 out:
@@ -523,7 +520,7 @@
 	int tries;
 
 	tries = 0;
-again:
+Again:
 	a = pickarena(hint);
 	if(a == nil || tries == fs->narena){
 		werrstr("no empty arenas");
@@ -545,9 +542,9 @@
 	tries++;
 	if((b = blkalloc_lk(a)) == -1){
 		unlock(a);
-		goto again;
+		goto Again;
 	}
-	if(logalloc(a, b, LgAlloc) == -1){
+	if(logop(a, b, LogAlloc) == -1){
 		unlock(a);
 		return -1;
 	}
@@ -694,7 +691,8 @@
 int
 syncblk(Blk *b)
 {
-	dprint("\tsyncblk: %llx+%llx\n", b->off, Blksz);
+	assert(b->flag & Bfinal);
+	print("\tsyncblk: %llx+%llx\n", b->off, Blksz);
 	return pwrite(fs->fd, b->buf, Blksz, b->off);
 }
 
@@ -702,6 +700,8 @@
 void
 enqueue(Blk *b)
 {
+	print("sync %llx\n", b->off);
+	assert(b->flag & Bfinal);
 	if(syncblk(b) == -1){
 		ainc(&fs->broken);
 		fprint(2, "write: %r");
@@ -708,7 +708,7 @@
 		return;
 	}
 	wlock(b);
-	b->flag &= ~(Bqueued|Bdirty|Bfinal);
+	b->flag &= ~(Bqueued|Bdirty);
 	wunlock(b);
 
 }
@@ -725,9 +725,9 @@
 	PBIT32(p + 12, Blksz);
 	PBIT32(p + 16, Bufspc);
 	PBIT32(p + 20, Hdrsz);
-	PBIT32(p + 24, fs->height);
-	PBIT64(p + 32, fs->rootb);
-	PBIT64(p + 40, fs->rooth);
+	PBIT32(p + 24, fs->root.ht);
+	PBIT64(p + 32, fs->root.bp);
+	PBIT64(p + 40, fs->root.bh);
 	PBIT32(p + 48, fs->narena);
 	PBIT64(p + 56, fs->arenasz);
 	PBIT64(p + 64, fs->gen);
@@ -768,12 +768,12 @@
 }
 
 Blk*
-getblk(vlong bp, uvlong bh)
+getblk(vlong bp, uvlong bh, int flg)
 {
 	Blk *b;
 
 	if((b = lookupblk(bp)) == nil){
-		if((b = readblk(bp, 0)) == nil)
+		if((b = readblk(bp, flg)) == nil)
 			return nil;
 		if(siphash(b->buf, Blksz) != bh){
 			werrstr("corrupt block %llx", bp);
@@ -797,32 +797,6 @@
 	return b;
 }
 
-int
-refblk(Blk *b)
-{
-	Arena *a;
-	int r;
-
-	a = getarena(b->off);
-	lock(a);
-	r = logalloc(a, b->off, LgRef);
-	unlock(a);
-	return r;
-}
-
-int
-unrefblk(Blk *b)
-{
-	Arena *a;
-	int r;
-
-	a = getarena(b->off);
-	lock(a);
-	r = logalloc(a, b->off, LgUnref);
-	unlock(a);
-	return r;
-}
-
 ushort
 blkfill(Blk *b)
 {
@@ -859,11 +833,7 @@
 	assert(b->ref == 1 && b->flag & (Bdirty|Bqueued) == Bdirty);
 	a = getarena(b->off);
 	lock(a);
-	/*
-	 * TODO: what to do if we fail to log a free here??
-	 * This is already an error path!
-	 */
-	logalloc(a, b->off, LgUnref);
+	logop(a, b->off, LogFree);
 	blkdealloc(b->off);
 	unlock(a);
 	free(b);
--- a/check.c
+++ b/check.c
@@ -61,7 +61,7 @@
 				fprint(2, "freed block in use: %llx\n", x.bp);
 				fail++;
 			}
-			if((c = getblk(x.bp, x.bh)) == nil){
+			if((c = getblk(x.bp, x.bh, 0)) == nil){
 				fprint(2, "corrupt block: %r\n");
 				fail++;
 				continue;
@@ -126,7 +126,7 @@
 	ok = 1;
 	if(badfree())
 		ok = 0;
-	if((b = getroot(&height)) != nil){
+	if((b = getroot(&fs->root, &height)) != nil){
 		if(badblk(b, height-1, nil, 0))
 			ok = 0;
 		putblk(b);
@@ -158,7 +158,7 @@
 		getval(b, i, &kv);
 		fprint(fd, "%.*s|%P\n", 4*indent, spc, &kv);
 		if(b->type == Tpivot){
-			if((c = getblk(kv.bp, kv.bh)) == nil)
+			if((c = getblk(kv.bp, kv.bh, 0)) == nil)
 				sysfatal("failed load: %r");
 			if(recurse)
 				rshowblk(fd, c, indent + 1, 1);
@@ -191,8 +191,8 @@
 	int h;
 
 	fprint(fd, "=== %s\n", m);
-	fprint(fd, "fs->height: %d\n", fs->height);
-	rshowblk(fd, getroot(&h), 0, 1);
+	fprint(fd, "\tht: %d\n", fs->root.ht);
+	rshowblk(fd, getroot(&fs->root, &h), 0, 1);
 }
 
 void
@@ -207,10 +207,13 @@
 {
 	int i;
 
+	print("path:\n");
+#define A(b) (b ? b->off : -1)
 	for(i = 0; i < np; i++){
-		print("==> b=%p, n=%p, idx=%d\n", p[i].b, p[i].n, p[i].idx);
-		print("\tclear=(%d, %d):%d\n", p[i].lo, p[i].hi, p[i].sz);
-		print("\tl=%p, r=%p, n=%p, split=%d\n", p[i].l, p[i].r, p[i].n, p[i].split);
+		print("\t[%d] ==> b(%p)=%llx, n(%p)=%llx, nl(%p)=%llx, nr(%p)=%llx idx=%d\n",
+			i, p[i].b, A(p[i].b), p[i].n, A(p[i].n), p[i].nl, A(p[i].nl), p[i].nr, A(p[i].nr), p[i].idx);
+		print("\t\tclear=(%d, %d):%d\n", p[i].lo, p[i].hi, p[i].sz);
+		print("\t\tl=%p, r=%p, n=%p, split=%d\n", p[i].l, p[i].r, p[i].n, p[i].split);
 	}
 }
 
--- a/dat.h
+++ b/dat.h
@@ -15,6 +15,7 @@
 typedef struct Arange	Arange;
 typedef struct Bucket	Bucket;
 typedef struct Chan	Chan;
+typedef struct Tree	Tree;
 
 enum {
 	KiB	= 1024ULL,
@@ -22,7 +23,7 @@
 	GiB	= 1024ULL*MiB,
 	TiB	= 1024ULL*GiB,
 
-	Lgblk	= 9,
+	Lgblk	= 13,
 	Blksz	= (1ULL<<Lgblk),
 
 	Nrefbuf	= 1024,			/* number of ref incs before syncing */
@@ -198,18 +199,17 @@
  */
 enum {
 	/* 1-wide entries */
-	LgAlloc1,	/* alloc a block */
-	LgFree1,	/* alloc a block */
-	LgRef,		/* ref a block */
-	LgUnref,	/* free a block */
-	LgFlush,	/* flush log, bump gen */
-	LgChain,	/* point to next log block */
-	LgEnd,		/* last entry in log */	
+	LogAlloc1,	/* alloc a block */
+	LogFree1,	/* alloc a block */
+	LogDead1,	/* free a block */
+	LogFlush,	/* flush log, bump gen */
+	LogChain,	/* point to next log block */
+	LogEnd,		/* last entry in log */	
 
 	/* 2-wide entries */
-	Lg2w	= 1<<5,
-	LgAlloc = LgAlloc1|Lg2w,	/* alloc a range */
-	LgFree	= LgFree1|Lg2w,		/* free a range */
+	Log2w	= 1<<5,
+	LogAlloc = LogAlloc1|Log2w,	/* alloc a range */
+	LogFree	= LogFree1|Log2w,	/* free a range */
 };
 
 struct Arange {
@@ -231,6 +231,13 @@
 	uchar	buf[];
 };
 
+struct Tree {
+	Lock	lk;
+	vlong	bp;
+	vlong	bh;
+	int	ht;
+};
+
 /*
  * Overall state of the file sytem.
  * Shadows the superblock contents.
@@ -250,11 +257,8 @@
 	int	fd;
 	long	broken;
 
-	Lock	rootlk;
 	/* protected by rootlk */
-	vlong	rootb;
-	vlong	rooth;
-	int	height;
+	Tree	root;
 
 	Lock	genlk;
 	vlong	gen;
@@ -346,9 +350,7 @@
 	 * instead of the most recent root, to prevent
 	 * paging in the wrong executable.
 	 */
-	vlong	rootb;
-	vlong	rooth;
-	int	height;
+	Tree	root;
 
 	u32int	fid;
 	vlong	qpath;
@@ -392,9 +394,7 @@
 
 struct Scan {
 	vlong	offset;	/* last read offset */
-	vlong	rootb;
-	vlong	rooth;
-	int	height;
+	Tree	root;
 
 	int	done;
 	Kvp	kv;
--- a/fns.h
+++ b/fns.h
@@ -11,8 +11,8 @@
 
 Blk*	newblk(int type);
 Blk*	shadow(Blk*, Path*, Path*);
-Blk*	getroot(int*);
-Blk*	getblk(vlong, uvlong);
+Blk*	getroot(Tree*, int*);
+Blk*	getblk(vlong, uvlong, int);
 Blk*	pinblk(Blk*);
 Blk*	readblk(vlong, int);
 Arena*	getarena(vlong);
@@ -35,10 +35,10 @@
 int	endfs(void);
 int	compresslog(Arena *a);
 
-int	btupsert(Msg*, int);
-char	*btlookup(Key*, Kvp*, Blk**);
+int	btupsert(Tree*, Msg*, int);
+char	*btlookup(Tree*, Key*, Kvp*, Blk**);
 char	*btlookupat(Blk*, Key*, Kvp*, Blk**);
-char	*btscan(Scan*, Key*, vlong, vlong, int);
+char	*btscan(Tree*, Scan*, Key*);
 char	*btnext(Scan*, Kvp*, int*);
 void	btdone(Scan*);
 
--- a/fs.c
+++ b/fs.c
@@ -42,10 +42,10 @@
 	char *e;
 	Blk *b;
 
-	if(f->rootb == -1)
-		b = getroot(nil);
+	if(f->root.bp == -1)
+		b = getroot(&fs->root, nil);
 	else
-		b = getblk(f->rootb, f->rooth);
+		b = getblk(f->root.bp, f->root.bh, 0);
 	if(b == nil)
 		return Efs;
 	if(lk)
@@ -256,7 +256,7 @@
 	int w, n;
 
 	r->tag = m->tag;
-//	dprint("→ %F\n", r);
+	dprint("→ %F\n", r);
 	if((n = convS2M(r, buf, sizeof(buf))) == 0)
 		abort();
 	qlock(m->wrlk);
@@ -384,7 +384,7 @@
 	dk.k = buf;
 	dk.nk = p - buf;
 showfs("attach");
-	if(btlookup(&dk, &kv, &b) != nil){
+	if(btlookup(&fs->root, &dk, &kv, &b) != nil){
 		rerror(m, Efs);
 		return;
 	}
@@ -413,8 +413,8 @@
 	f.fid = NOFID;
 	f.qpath = d.qid.path;
 	f.mode = -1;
-	f.rooth = -1;
-	f.rootb = -1;
+	f.root.bh = -1;
+	f.root.bp = -1;
 	f.iounit = iounit;
 	f.dent = e;
 showfids();
@@ -497,7 +497,7 @@
 	}
 	if(i > 0){
 		d.name = m->wname[i-1];
-		dent = getdent(f->rootb, up, &d);
+		dent = getdent(f->root.bp, up, &d);
 		if(dent == nil){
 			if(m->fid != m->newfid)
 				clunkfid(f);
@@ -525,7 +525,7 @@
 		rerror(m, "no such fid");
 		return;
 	}
-	if(btlookup(f->dent, &kv, &b) != nil){
+	if(btlookup(&fs->root, f->dent, &kv, &b) != nil){
 		rerror(m, Eexist);
 		return;
 	}
@@ -609,12 +609,12 @@
 		return;
 	}
 //showfs("precreate");
-	if(btupsert(&mb, 1) == -1){
+	if(btupsert(&fs->root, &mb, 1) == -1){
 		rerror(m, "%r");
 		return;
 	}
 //showfs("postcreate");
-	dent = getdent(f->rootb, f->qpath, &d);
+	dent = getdent(f->root.bp, f->qpath, &d);
 	if(dent == nil){
 		if(m->fid != m->newfid)
 			clunkfid(f);
@@ -641,6 +641,39 @@
 		
 }
 
+void
+fsremove(Fmsg *m)
+{
+	Fcall r;
+	Msg mb;
+	Fid *f;
+
+	if(okname(m->name) == -1){
+		rerror(m, Ename);
+		return;
+	}
+	if((f = getfid(m->fid)) == nil){
+		rerror(m, "no such fid");
+		return;
+	}
+
+	rlock(f->dent);
+	mb.op = Odelete;
+	mb.k = f->dent->k;
+	mb.nk = f->dent->nk;
+	mb.nv = 0;
+	if(btupsert(&fs->root, &mb, 1) == -1){
+		runlock(f->dent);
+		rerror(m, "remove: %r");
+		return;
+	}
+	runlock(f->dent);
+	clunkfid(f);
+
+	r.type = Rremove;
+	respond(m, &r);
+}
+
 int
 fsaccess(Dir*, int)
 {
@@ -691,12 +724,10 @@
 	}
 	f->mode = m->mode;
 	if((f->mode & 0x7) == OEXEC){
-		lock(&fs->rootlk);
-		f->rootb = fs->rootb;
-		f->rooth = fs->rooth;
-		f->height = fs->height;
-//		refblk(fs->rootb);
-		unlock(&fs->rootlk);
+		lock(&fs->root.lk);
+		f->root = fs->root;
+//		refblk(fs->root.bp);
+		unlock(&fs->root.lk);
 	}
 	unlock(f);
 	respond(m, &r);
@@ -707,6 +738,7 @@
 {
 	char pfx[9], *p, *e;
 	int n, ns, done;
+	Tree *t;
 	Scan *s;
 	Kvp kv;
 
@@ -722,7 +754,8 @@
 		print("scan starting\n");
 		if((s = mallocz(sizeof(Scan), 1)) == nil)
 			return "out of memory";
-		if((e = btscan(s, &kv, f->rootb, f->rooth, f->height)) != nil){
+		t = (f->root.bp != -1) ? &f->root : &fs->root;
+		if((e = btscan(t, s, &kv)) != nil){
 			free(r->data);
 			btdone(s);
 			return e;
@@ -790,7 +823,7 @@
 	bh = GBIT64(kv.v+8);
 	putblk(b);
 
-	if((b = getblk(bp, bh)) == nil)
+	if((b = getblk(bp, bh, GBraw)) == nil)
 		return -1;
 	fprint(2, "\treading from %llx (%llx) %s %s\n", bp, b->off, b->buf, b->data);
 	if(bo+n > Blksz)
@@ -906,7 +939,7 @@
 		bh = GBIT64(kv.v+8);
 		putblk(t);
 
-		if((t = getblk(bp, bh)) == nil)
+		if((t = getblk(bp, bh, GBraw)) == nil)
 			return -1;
 		memcpy(b->buf, t->buf, Blksz);
 		putblk(t);
@@ -978,7 +1011,7 @@
 		PBIT64(kv[i].v, m->offset+m->count);
 		f->dent->length = m->offset+m->count;
 	}
-	btupsert(kv, i+1);
+	btupsert(&fs->root, kv, i+1);
 	wunlock(f->dent);
 
 	r.type = Rwrite;
@@ -1015,7 +1048,7 @@
 		}
 */
 		m->wrlk = wrlk;
-//		dprint("← %F\n", &m->Fcall);
+		dprint("← %F\n", &m->Fcall);
 		switch(m->type){
 		/* sync setup */
 		case Tversion:	fsversion(m, &msgmax);	break;
@@ -1058,7 +1091,7 @@
 		case Tcreate:	fscreate(m);	break;
 		case Twrite:	fswrite(m);	break;
 		case Twstat:	fswstat(m);	break;
-		case Tremove:	rerror(m, "unimplemented remove");	break;
+		case Tremove:	fsremove(m);	break;
 		}
 	}
 }
--- a/load.c
+++ b/load.c
@@ -62,9 +62,9 @@
 		sysfatal("fs uses different buffer size");
 	if(GBIT32(p + 20) != Hdrsz)
 		sysfatal("fs uses different buffer size");
-	fs->height = GBIT32(p + 24);
-	fs->rootb = GBIT64(p + 32);
-	fs->rooth = GBIT64(p + 40);
+	fs->root.ht = GBIT32(p + 24);
+	fs->root.bp = GBIT64(p + 32);
+	fs->root.bh = GBIT64(p + 40);
 	fs->narena = GBIT32(p + 48);
 	fs->arenasz = GBIT64(p + 56);
 	fs->arenasz = GBIT64(p + 56);
@@ -72,9 +72,9 @@
 	fs->nextqid = GBIT64(p + 72);
 	fs->super = b;
 	fprint(2, "load: %8s\n", p);
-	fprint(2, "\theight:\t%d\n", fs->height);
-	fprint(2, "\trootb:\t%llx\n", fs->rootb);
-	fprint(2, "\trooth:\t%llx\n", fs->rooth);
+	fprint(2, "\theight:\t%d\n", fs->root.ht);
+	fprint(2, "\trootb:\t%llx\n", fs->root.bp);
+	fprint(2, "\trooth:\t%llx\n", fs->root.bh);
 	fprint(2, "\tarenas:\t%d\n", fs->narena);
 	fprint(2, "\tarenasz:\t%lld\n", fs->arenasz);
 	fprint(2, "\trootgen:\t%lld\n", fs->gen);
--- a/ream.c
+++ b/ream.c
@@ -60,11 +60,11 @@
 	b->flag |= Bdirty;
 
 	p = b->data+Loghdsz;
-	PBIT64(p+ 0, off|LgFree);		/* off */
+	PBIT64(p+ 0, off|LogFree);		/* off */
 	PBIT64(p+ 8, asz);			/* len */
-	PBIT64(p+16, b->off|LgAlloc);		/* off */
+	PBIT64(p+16, b->off|LogAlloc);		/* off */
 	PBIT64(p+24, Blksz);			/* len */
-	PBIT64(p+32, (uvlong)LgEnd);		/* done */
+	PBIT64(p+32, (uvlong)LogEnd);		/* done */
 	finalize(b);
 	if(syncblk(b) == -1)
 		sysfatal("ream: init log");
@@ -147,9 +147,9 @@
 	finalize(r);
 
 	fs->super = s;
-	fs->rootb = r->off;
-	fs->rooth = blkhash(r);
-	fs->height = 1;
+	fs->root.bp = r->off;
+	fs->root.bh = blkhash(r);
+	fs->root.ht = 1;
 	snapshot();
 
 	putblk(s);
--- a/tree.c
+++ b/tree.c
@@ -418,6 +418,7 @@
 	hi = (p != nil) ? p->hi : -1;
 	pidx = (p != nil) ? p->idx : -1;
 	midx = (p != nil && p->merge) ? p->midx : -1;
+if(p->merge) showblk(b, "preupdate", 0);
 	if((n = newblk(b->type)) == nil)
 		return -1;
 	for(i = 0; i < b->nval; i++){
@@ -426,10 +427,19 @@
 			continue;
 		}else if(i == midx){
 			getval(p->nl, 0, &m);
+			m.type = Vref;
+			m.bp = p->nl->off;
+			m.bh = blkhash(p->nl);
+			m.fill = blkfill(p->nl);
 			setval(n, j++, &m, 0);
 			if(p->nr){
 				getval(p->nr, 0, &m);
+				m.type = Vref;
+				m.bp = p->nr->off;
+				m.bh = blkhash(p->nr);
+				m.fill = blkfill(p->nr);
 				setval(n, j++, &m, 0);
+				i++;
 			}
 			continue;
 		}
@@ -630,6 +640,8 @@
 			setmsg(d, o++, &m);
 		}
 	}
+	finalize(d);
+	enqueue(d);
 	p->merge = 1;
 	p->midx = idx;
 	p->nl = d;
@@ -690,6 +702,7 @@
 	Blk *d, *l, *r;
 	Msg m;
 
+	print("a->type: %d\n", a->type);
 	l = newblk(a->type);
 	r = newblk(a->type);
 	if(l == nil || r == nil){
@@ -743,6 +756,10 @@
 			setmsg(d, o++, &m);
 		}
 	}
+	finalize(l);
+	finalize(r);
+	enqueue(l);
+	enqueue(r);
 	p->merge = 1;
 	p->midx = midx;
 	p->nl = l;
@@ -794,7 +811,7 @@
 		if((m = pp->n) == nil)
 			return 0;
 	}else{
-		if((m = getblk(km.bp, km.bh)) == nil)
+		if((m = getblk(km.bp, km.bh, 0)) == nil)
 			return -1;
 	}
 	/* Try merging left */
@@ -803,7 +820,7 @@
 		getval(p->b, idx-1, &kl);
 		if(kl.fill + km.fill >= Blkspc)
 			goto next;
-		if((l = getblk(kl.bp, kl.bh)) == nil)
+		if((l = getblk(kl.bp, kl.bh, 0)) == nil)
 			goto out;
 		if(rotmerge(p, idx-1, l, m) == -1)
 			goto out;
@@ -814,7 +831,7 @@
 		getval(p->b, idx+1, &kr);
 		if(kr.fill + km.fill >= Blkspc)
 			goto done;
-		if((r = getblk(kr.bp, kr.bh)) == nil)
+		if((r = getblk(kr.bp, kr.bh, 0)) == nil)
 			goto out;
 		if(rotmerge(p, idx, m, r) == -1)
 			goto out;
@@ -883,10 +900,8 @@
 	}
 	for(; p > path; p--){
 		if(!filledpiv(p->b, 1)){
-#ifdef NOPE
 			if(trymerge(p, pp, p->idx) == -1)
 				goto error;
-#endif
 			/* If we merged the root node, break out. */
 			if(p[-1].b == nil && p[0].merge && p[0].nr == nil && p[0].b->nval == 2){
 				/* FIXME: shouldn't p[1].n already be right? */
@@ -917,7 +932,7 @@
 				bufinsert(b, &m);
 			}
 			finalize(p->l);
-			Blk *x = getblk(p->l->off, -1);
+			Blk *x = getblk(p->l->off, -1, 0);
 			assert(p->l == x);
 			finalize(p->r);
 		}
@@ -993,7 +1008,7 @@
 }
 
 int
-btupsert(Msg *msg, int nmsg)
+btupsert(Tree *t, Msg *msg, int nmsg)
 {
 	int i, npath, redo, dh, nm, height;
 	vlong rh;
@@ -1011,26 +1026,12 @@
 	}
 
 again:
-	if((b = getroot(&height)) == nil){
+	if((b = getroot(t, &height)) == nil){
 		werrstr("get root: %r");
 		return -1;
 	}
 
 	/*
-	 * Fast path: the root of the tree has room,
-	 * and nobody else has observed the node, so
-	 * we can modify it with gleeful abandon.
-	 */
-	if(b->type == Tpivot && canwlock(b)) {
-		if((b->flag & Bdirty) && !filledbuf(b, nm)){
-			for(i = 0; i < nmsg; i++)
-				bufinsert(b, &msg[i]);
-			wunlock(b);
-			return 0;
-		}
-	}
-
-	/*
 	 * The tree can grow in height by 1 when we
 	 * split, so we allocate room for one extra
 	 * node in the path.
@@ -1050,7 +1051,7 @@
 			break;
 		victim(b, &path[npath]);
 		getval(b, path[npath].idx, &sep);
-		b = getblk(sep.bp, sep.bh);
+		b = getblk(sep.bp, sep.bh, 0);
 		if(b == nil)
 			goto error;
 		npath++;
@@ -1091,11 +1092,11 @@
 
 	assert(rb->off != 0);
 	rh = blkhash(rb);
-	lock(&fs->rootlk);
-	fs->height += dh;
-	fs->rootb = rb->off;
-	fs->rooth = rh;
-	unlock(&fs->rootlk);
+	lock(&t->lk);
+	t->ht += dh;
+	t->bp = rb->off;
+	t->bh = rh;
+	unlock(&t->lk);
 
 	freepath(path, npath);
 	free(path);
@@ -1115,17 +1116,17 @@
 }
 
 Blk*
-getroot(int *h)
+getroot(Tree *t, int *h)
 {
 	vlong bp, bh;
 
-	lock(&fs->rootlk);
-	bp = fs->rootb;
-	bh = fs->rooth;
+	lock(&t->lk);
+	bp = t->bp;
+	bh = t->bh;
 	if(h != nil)
-		*h = fs->height;
-	unlock(&fs->rootlk);
-	return getblk(bp, bh);
+		*h = t->ht;
+	unlock(&t->lk);
+	return getblk(bp, bh, 0);
 }
 
 static char*
@@ -1179,7 +1180,7 @@
 		if(idx == -1)
 			return Eexist;
 		putblk(b);
-		if((b = getblk(r->bp, r->bh)) == nil)
+		if((b = getblk(r->bp, r->bh, 0)) == nil)
 			return Efs;
 	}
 	assert(b->type == Tleaf);
@@ -1193,13 +1194,13 @@
 }
 
 char*
-btlookup(Key *k, Kvp *r, Blk **bp)
+btlookup(Tree *t, Key *k, Kvp *r, Blk **bp)
 {
 	char *ret;
 	Blk *b;
 
 	*bp = nil;
-	if((b = getroot(nil)) == nil)
+	if((b = getroot(t, nil)) == nil)
 		return Efs;
 	ret = btlookupat(b, k, r, bp);
 	putblk(b);
@@ -1208,7 +1209,7 @@
 }
 
 char*
-btscan(Scan *s, Key *pfx, vlong rootb, vlong rooth, int height)
+btscan(Tree *t, Scan *s, Key *pfx)
 {
 	int i, same;
 	Scanp *p;
@@ -1218,7 +1219,6 @@
 
 	s->done = 0;
 	s->offset = 0;
-	s->height = height;
 	cpkey(&s->pfx, pfx, s->pfxbuf, sizeof(s->pfxbuf));
 
 	s->kv.v = s->kvbuf+pfx->nk;
@@ -1225,28 +1225,20 @@
 	s->kv.nv = 0;
 	cpkey(&s->kv, pfx, s->kvbuf, sizeof(s->kvbuf));
 
-	if(rootb != -1){
-		s->rootb = rootb;
-		s->rooth = rooth;
-		s->height = height;
-	}else{
-		lock(&fs->rootlk);
-		s->rootb = fs->rootb;
-		s->rooth = fs->rooth;
-		s->height = fs->height;
-dprint("height %d\n", s->height);
-		unlock(&fs->rootlk);
-	}
-	if((s->path = calloc(s->height, sizeof(Scanp))) == nil){
+	lock(&t->lk);
+	s->root = *t;
+//dprint("height %d\n", s->root.ht);
+	unlock(&t->lk);
+	if((s->path = calloc(s->root.ht, sizeof(Scanp))) == nil){
 		free(s);
 		return nil;
 	}
 
 	p = s->path;
-	if((b = getblk(s->rootb, s->rooth)) == nil)
+	if((b = getblk(s->root.bp, s->root.bh, 0)) == nil)
 		return "error reading block";
 	p[0].b = b;
-	for(i = 0; i < s->height; i++){
+	for(i = 0; i < s->root.ht; i++){
 		p[i].vi = blksearch(b, &s->kv, &v, &same);
 		if(p[i].vi == -1 || (!same && b->type == Tleaf))
 			getval(b, ++p[i].vi, &v);
@@ -1254,18 +1246,18 @@
 			p[i].bi = bufsearch(b, &s->kv, &m, &same);
 			if(p[i].bi == -1 || !same)
 				p[i].bi++;
-			if((b = getblk(v.bp, v.bh)) == nil)
+			if((b = getblk(v.bp, v.bh, 0)) == nil)
 				return "error readivg block";
 			p[i+1].b = b;
 		}else{
-			assert(i == s->height-1);
+			assert(i == s->root.ht-1);
 		}
 	}
-dprint("inited\n");
-for(i = 0; i < s->height; i++){
-dprint("\t%p", p[i].b);
-dprint(" (%d %d)\n", p[i].vi, p[i].bi);
-}
+//dprint("inited\n");
+//for(i = 0; i < s->root.ht; i++){
+//dprint("\t%p", p[i].b);
+//dprint(" (%d %d)\n", p[i].vi, p[i].bi);
+//}
 	return nil;
 }
 
@@ -1278,15 +1270,15 @@
 	Kvp kv;
 
 	p = s->path;
-	h = s->height;
+	h = s->root.ht;
 	*done = 0;
 	start = h;
 	for(i = h-1; i > 0; i--){
-dprint("advancing (i=%d)\n", i);
-for(j = 0; j < h; j++){
-dprint("\t%p", p[j].b);
-dprint(" (%d %d)\n", p[j].vi, p[j].bi);
-}
+//dprint("advancing (i=%d)\n", i);
+//for(j = 0; j < h; j++){
+//dprint("\t%p", p[j].b);
+//dprint(" (%d %d)\n", p[j].vi, p[j].bi);
+//}
 		if(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf)
 			break;
 		if(i == 0){
@@ -1300,7 +1292,7 @@
 	}
 	for(i = start; i < h; i++){
 		getval(p[i-1].b, p[i-1].vi, &kv);
-		if((p[i].b = getblk(kv.bp, kv.bh)) == nil)
+		if((p[i].b = getblk(kv.bp, kv.bh, 0)) == nil)
 			return "error reading block";
 	}
 	getval(p[h-1].b, p[h-1].vi, &m);
@@ -1335,7 +1327,7 @@
 {
 	int i;
 
-	for(i = 0; i < s->height; i++)
+	for(i = 0; i < s->root.ht; i++)
 		if(s->path[i].b != nil)
 			putblk(s->path[i].b);
 	free(s->path);
@@ -1352,10 +1344,10 @@
 	s = fs->super;
 
 	qlock(&fs->snaplk);
-	lock(&fs->rootlk);
+	lock(&fs->root.lk);
 	fillsuper(s);
 	finalize(s);
-	unlock(&fs->rootlk);
+	unlock(&fs->root.lk);
 
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
@@ -1367,49 +1359,4 @@
 		r = syncblk(s);
 	qunlock(&fs->snaplk);
 	return r;
-}
-
-int
-getrefpg(vlong off, Blk **p)
-{
-	char kbuf[9], *e;
-	vlong bp, bh;
-	Blk *b;
-	Kvp kv;
-
-	*p = nil;
-	kbuf[0] = Kref;
-	PBIT64(kbuf+1, off & ~(Blksz-1));
-	kv.k = kbuf;
-	kv.nk = sizeof(kbuf);
-	e = btlookup(&kv, &kv, &b);
-	if(e == Eexist)
-		return 0;
-	if(e != nil || kv.nv != 16)
-		return -1;
-	bp = GBIT64(kv.v+0);
-	bh = GBIT64(kv.v+8);
-	putblk(b);
-	if((*p = getblk(bp, bh)) == nil)
-		return -1;
-	return 0;
-}
-
-int
-setrefpg(vlong pg, vlong bp, vlong bh)
-{
-	char kbuf[9], vbuf[16];
-	Msg m;
-
-	kbuf[0] = Kref;
-	PBIT64(kbuf+1, pg & ~(Blksz-1));
-	PBIT64(vbuf+0, bp);
-	PBIT64(vbuf+8, bh);
-
-	m.op = Oinsert;
-	m.k = kbuf;
-	m.nk = sizeof(kbuf);
-	m.v = vbuf;
-	m.nv = sizeof(vbuf);
-	return btupsert(&m, 1);
 }