shithub: gefs

Download patch

ref: eed6e68174a13a4b7c0b39acdcfa911a302f96c2
parent: 4a29d3fe0aada31421c2c48f167049ab373a96bf
author: Ori Bernstein <ori@eigenstate.org>
date: Thu Jan 20 16:08:27 EST 2022

blk, snap, fs: many many fixes

Roll up a bunch of fixes, the largest of
which is split deadlist and freelists.

They're similar, but not similar enough,
and the recursive nature of the freelist
adds some subtle complexity.

On top of that, fix several bits of FS
semantics, implementing some of the funky
flags.

--- a/blk.c
+++ b/blk.c
@@ -13,7 +13,7 @@
 };
 
 static vlong	blkalloc_lk(Arena*);
-static vlong	blkalloc(void);
+static vlong	blkalloc(int);
 static int	blkdealloc_lk(vlong);
 static Blk*	initblk(vlong, int);
 
@@ -63,7 +63,7 @@
 	}
 }
 
-Blk*
+static Blk*
 readblk(vlong bp, int flg)
 {
 	Blk *b;
@@ -70,7 +70,7 @@
 	vlong off, rem, n;
 
 	assert(bp != -1);
-	if((b = mallocz(sizeof(Blk), 1)) == nil)
+	if((b = malloc(sizeof(Blk))) == nil)
 		return nil;
 	off = bp;
 	rem = Blksz;
@@ -83,15 +83,26 @@
 		off += n;
 		rem -= n;
 	}
-	b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0);
-	b->bp.addr = bp;
-	b->bp.hash = -1;
-	b->bp.gen = -1;
 	b->ref = 1;
 	b->cnext = nil;
 	b->cprev = nil;
 	b->hnext = nil;
-	b->data = b->buf + 10;
+	b->flag = 0;
+
+	b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0);
+	b->bp.addr = bp;
+	b->bp.hash = -1;
+	b->bp.gen = -1;
+	b->data = b->buf + Hdrsz;
+	b->fnext = nil;
+
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;
+	b->lognxt = 0;
+
 	switch(b->type){
 	default:
 		if(flg&GBraw)
@@ -103,6 +114,7 @@
 	case Traw:
 	case Tsuper:
 	case Tlog:
+	case Tdead:
 		break;
 	case Tpivot:
 		b->nval = GBIT16(b->buf+2);
@@ -119,12 +131,12 @@
 }
 
 static Arena*
-pickarena(void)
+pickarena(int hint)
 {
-	long n;
+	int n;
 
-	n = (ainc(&fs->roundrobin)/1024) % fs->narena;
-	return &fs->arenas[n];
+	n = hint+ainc(&fs->roundrobin)/1024;
+	return &fs->arenas[n%fs->narena];
 }
 
 Arena*
@@ -176,14 +188,14 @@
 	return 0;
 }
 
-int
+static int
 syncarena(Arena *a)
 {
 	char *p;
 
 	p = a->b->data;
-	PBIT64(p, a->log.head.addr);	p += 8;	/* freelist addr */
-	PBIT64(p, a->log.head.hash);	p += 8;	/* freelist hash */
+	PBIT64(p, a->head.addr);	p += 8;	/* freelist addr */
+	PBIT64(p, a->head.hash);	p += 8;	/* freelist hash */
 	PBIT64(p, a->size);		p += 8;	/* arena size */
 	PBIT64(p, a->used);			/* arena used */
 	finalize(a->b);
@@ -190,7 +202,7 @@
 	return syncblk(a->b);
 }
 
-int
+static int
 grabrange(Avltree *t, vlong off, vlong len)
 {
 	Arange *r, *s, q;
@@ -226,24 +238,35 @@
 	return 0;
 }
 
-int
-logappend(Oplog *ol, Arena *a, vlong off, vlong len, vlong val, int op)
+/*
+ * Logs an allocation. Must be called
+ * with arena lock held. Duplicates some
+ * of the work in allocblk to prevent
+ * recursion.
+ */
+static int
+logappend(Arena *_a, vlong off, vlong len, int op, Blk **tl)
 {
 	Blk *pb, *lb;
 	vlong o;
 	char *p;
-	int r;
 
 	assert(off % Blksz == 0);
-	assert(op == LogAlloc || op == LogFree || op == LogDead);
-	lb = ol->tail;
-	if(lb == nil || lb->logsz > Logspc - 8){
+	assert(op == LogAlloc || op == LogFree);
+	o = -1;
+	lb = *tl;
+	dprint("logop %llx+%llx@%llx: %s\n", off, len, lb->logsz, (op == LogAlloc) ? "Alloc" : "Free");
+	/*
+	 * move to the next block when we have
+	 * 40 bytes in the log:
+	 * We're appending up to 16 bytes as
+	 * part of the operation, followed by
+	 * 16 bytes of new log entry allocation
+	 * and chaining.
+	 */
+	if(lb == nil || lb->logsz >= Logspc - 40){
 		pb = lb;
-		if(a == nil)
-			o = blkalloc();
-		else
-			o = blkalloc_lk(a);
-		if(o == -1)
+		if((o = blkalloc_lk(_a)) == -1)
 			return -1;
 		if((lb = initblk(o, Tlog)) == nil)
 			return -1;
@@ -256,17 +279,18 @@
 			putblk(lb);
 			return -1;
 		}
-		ol->tail = lb;
 
 		if(pb != nil){
 			p = pb->data + pb->logsz;
-			PBIT64(p + 0, lb->bp.addr|LogChain);
+			PBIT64(p, lb->bp.addr|LogChain);
 			finalize(pb);
-			r = syncblk(pb);
-			putblk(pb);
-			if(r == -1)
+			if(syncblk(pb) == -1){
+				putblk(pb);
 				return -1;
+			}
+			putblk(pb);
 		}
+		*tl = lb;
 	}
 
 	if(len == Blksz){
@@ -280,146 +304,36 @@
 	PBIT64(p, off);
 	lb->logsz += 8;
 	if(op >= Log2wide){
-		PBIT64(p+8, val);
+		PBIT64(p+8, len);
 		lb->logsz += 8;
 	}
+	/*
+	 * The newly allocated log block needs
+	 * to be recorded.
+	 */
+	if(o != -1){
+		p = lb->data + lb->logsz;
+		PBIT64(p, o|LogAlloc1);
+		lb->logsz += 8;
+	}
 	/* this gets overwritten by the next append */
 	p = lb->data + lb->logsz;
 	PBIT64(p, (uvlong)LogEnd);
-	if(ol->head.addr == -1)
-		ol->head = lb->bp;
-	if(ol->tail != lb){
-		putblk(ol->tail);
-		ol->tail = lb;
-	}
 	return 0;
-}
 
-static void
-showdeadbp(Bptr bp, void *)
-{
-	fprint(2, "\tdead: %B\n", bp);
 }
 
-void
-showlst(char *m, Oplog *l)
+static int
+logop(Arena *a, vlong off, vlong len, int op)
 {
-	fprint(2, "showlst: %s\n", m);
-	scandead(l, showdeadbp, nil);
-}
-
-int
-graft(Oplog *a, Oplog *b)
-{
-	char *p;
-	vlong o;
-
-
-	if(b->head.addr == -1)
-		return 0;
-	if(a->head.addr == -1){
-		a->head = b->head;
-		a->tail = b->tail;
-		return 0;
-	}
-
-	o = b->head.addr|LogChain;
-	p = a->tail->data + a->tail->logsz;
-	PBIT64(p, o);
-	finalize(a->tail);
-	if(a->tail->bp.addr == a->head.addr)
-		a->head = a->tail->bp;
-	if(syncblk(a->tail) == -1)
+	if(logappend(a, off, len, op, &a->tail) == -1)
 		return -1;
-	putblk(a->tail);
-	a->tail = b->tail;
+	if(a->head.addr == -1)
+		a->head = a->tail->bp;
 	return 0;
 }
 
 int
-killblk(Tree *t, Bptr bp)
-{
-	Oplog *l;
-	int i;
-
-	l = nil;
-	for(i = 0; i < Ndead; i++){
-		l = &t->dead[i];
-		if(bp.gen > t->prev[i])
-			break;
-	}
-	if(logappend(l, nil, bp.addr, Blksz, bp.gen, LogDead) == -1)
-		return -1;
-	return 0;
-}
-
-/*
- * Logs an allocation. Must be called
- * with arena lock held. Duplicates some/c
- * of the work in allocblk to prevent
- * recursion.
- */
-int
-logop(Arena *a, vlong off, int op)
-{
-	cachedel(off);
-	if(logappend(&a->log, a, off, Blksz, Blksz, op) == -1)
-		return -1;
-	return 0;
-}
-
-int
-scandead(Oplog *l, void (*fn)(Bptr, void*), void *dat)
-{
-	vlong ent, off;
-	int op, i, n;
-	Bptr dead;
-	char *d;
-	Bptr bp;
-	Blk *b;
-
-	bp = l->head;
-	if(bp.addr == -1)
-		return 0;
-Nextblk:
-	if((b = getblk(bp, GBnochk)) == nil)
-		return -1;
-// TODO: hash checks for these chains
-	for(i = Loghdsz; i < Logspc; i += n){
-		d = b->data + i;
-		ent = GBIT64(d);
-		op = ent & 0xff;
-		off = ent & ~0xff;
-		n = (op >= Log2wide) ? 16 : 8;
-		switch(op){
-		case LogEnd:
-			dprint("log@%d: end\n", i);
-			return 0;
-		case LogDead:
-			dead.addr = off;
-			dead.hash = -1;
-			dead.gen = GBIT64(d+8);
-			dprint("log@%d: dead %B\n", i, dead);
-			fn(dead, dat);
-			break;
-		case LogChain:
-			bp.addr = off & ~0xff;
-			bp.hash = -1;
-			bp.gen = -1;
-			dprint("log@%d: chain %B\n", i, bp);
-			goto Nextblk;
-			break;
-		default:
-			n = 0;
-			fprint(2, "log@%d: log op %d\n", i, op);
-			abort();
-			break;
-		}
-	}
-	return 0;
-}
-
-int
 loadlog(Arena *a)
 {
 	vlong ent, off, len;
@@ -430,7 +344,7 @@
 	Blk *b;
 
 
-	bp = a->log.head;
+	bp = a->head;
 Nextblk:
 	if((b = getblk(bp, GBnochk)) == nil)
 		return -1;
@@ -490,11 +404,11 @@
 	Range *log, *nlog;
 	vlong v, ba, na, graft, oldhd;
 	int i, n, sz;
-	Blk *hd, *b;
-	Oplog ol;
-	char *p;
+	Blk *b, *hd, *tl;
 	Bptr bp;
+	char *p;
 
+//	Oplog ol;
 	/*
 	 * Sync the current log to disk, and
 	 * set up a new block log tail.  While
@@ -514,7 +428,8 @@
 	setflag(b, Bdirty);
 	b->logsz = Loghdsz;
 
-	PBIT64(b->data+b->logsz, (uvlong)LogEnd);
+	p = b->data + b->logsz;
+	PBIT64(p, (uvlong)LogEnd);
 	finalize(b);
 	if(syncblk(b) == -1){
 		free(b);
@@ -522,14 +437,14 @@
 	}
 
 	graft = b->bp.addr;
-	if(a->log.tail != nil){
-		finalize(a->log.tail);
-		if(syncblk(a->log.tail) == -1){
+	if(a->tail != nil){
+		finalize(a->tail);
+		if(syncblk(a->tail) == -1){
 			free(b);
 			return -1;
 		}
 	}
-	a->log.tail = b;
+	a->tail = b;
 
 	/*
 	 * Prepare what we're writing back.
@@ -541,6 +456,20 @@
 	sz = 512;
 	if((log = malloc(sz*sizeof(Range))) == nil)
 		return -1;
+	/*
+	 * we need to allocate this block before
+	 * we prepare the ranges we write back,
+	 * so we don't record this block as
+	 * available when we compress the log.
+	 */
+	if((ba = blkalloc_lk(a)) == -1){
+		free(log);
+		return -1;
+	}
+	if((b = initblk(ba, Tlog)) == nil){
+		free(log);
+		return -1;
+	}
 	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
 		if(n == sz){
 			sz *= 2;
@@ -554,17 +483,13 @@
 		log[n].len = r->len;
 		n++;
 	}
-	if((b = newblk(Tlog)) == nil){
-		free(log);
-		return -1;
-	}
 	hd = b;
-	ol.head = b->bp;
-	ol.tail = b;
+	tl = b;
 	b->logsz = Loghdsz;
 	for(i = 0; i < n; i++)
-		if(logappend(&ol, a, log[i].off, log[i].len, log[i].len, LogFree) == -1)
+		if(logappend(a, log[i].off, log[i].len, LogFree, &tl) == -1)
 			return -1;
+
 	p = b->data + b->logsz;
 	PBIT64(p, LogChain|graft);
 	free(log);
@@ -572,10 +497,10 @@
 	if(syncblk(b) == -1)
 		return -1;
 
-	oldhd = a->log.head.addr;
-	a->log.head.addr = hd->bp.addr;
-	a->log.head.hash = blkhash(hd);
-	a->log.head.gen = -1;
+	oldhd = a->head.addr;
+	a->head.addr = hd->bp.addr;
+	a->head.hash = blkhash(hd);
+	a->head.gen = -1;
 	if(syncarena(a) == -1)
 		return -1;
 	if(oldhd != -1){
@@ -606,8 +531,8 @@
 			unlock(a);
 		}
 	}
-	finalize(a->log.tail);
-	if(syncblk(a->log.tail) == -1)
+	finalize(a->tail);
+	if(syncblk(a->tail) == -1)
 		return -1;
 	return 0;
 }
@@ -660,7 +585,7 @@
 	a = getarena(b);
 	if(freerange(a->free, b, Blksz) == -1)
 		goto out;
-	if(logop(a, b, LogFree) == -1)
+	if(logop(a, b, Blksz, LogFree) == -1)
 		goto out;
 	a->used -= Blksz;
 	r = 0;
@@ -669,7 +594,7 @@
 }
 
 static vlong
-blkalloc(void)
+blkalloc(int hint)
 {
 	Arena *a;
 	vlong b;
@@ -677,7 +602,7 @@
 
 	tries = 0;
 Again:
-	a = pickarena();
+	a = pickarena(hint);
 	if(a == nil || tries == fs->narena){
 		werrstr("no empty arenas");
 		return -1;
@@ -700,7 +625,7 @@
 		unlock(a);
 		goto Again;
 	}
-	if(logop(a, b, LogAlloc) == -1){
+	if(logop(a, b, Blksz, LogAlloc) == -1){
 		unlock(a);
 		return -1;
 	}
@@ -714,7 +639,7 @@
 	Blk *b;
 
 	if((b = lookupblk(bp)) == nil){
-		if((b = mallocz(sizeof(Blk), 1)) == nil)
+		if((b = malloc(sizeof(Blk))) == nil)
 			return nil;
 		setmalloctag(b, getcallerpc(&bp));
 		/*
@@ -728,6 +653,7 @@
 		b->cnext = nil;
 		b->cprev = nil;
 		b->hnext = nil;
+		b->flag = 0;
 	}
 
 	b->type = t;
@@ -754,7 +680,7 @@
 	vlong bp;
 	Blk *b;
 
-	if((bp = blkalloc()) == -1)
+	if((bp = blkalloc(t)) == -1)
 		return nil;
 	if((b = initblk(bp, t)) == nil)
 		return nil;
@@ -810,6 +736,7 @@
 		b->bp.hash = blkhash(b);
 		break;
 	case Tlog:
+	case Tdead:
 		h = siphash(b->data + 8, Blkspc-8);
 		PBIT64(b->data, h);
 		b->bp.hash = blkhash(b);
@@ -859,13 +786,6 @@
 }
 
 Blk*
-dupblk(vlong bp, uvlong bh)
-{
-	USED(bp, bh);
-	return nil;
-}
-
-Blk*
 refblk(Blk *b)
 {
 	ainc(&b->ref);
@@ -903,7 +823,7 @@
 	Bfree *f;
 
 	dprint("[%s] free blk %B\n", (t == &fs->snap) ? "snap" : "data", bp);
-	if(bp.gen <= t->gen){
+	if(t != nil && bp.gen <= t->gen){
 		killblk(t, bp);
 		return;
 	}
@@ -994,8 +914,8 @@
 	qlock(&fs->snaplk);
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
-		finalize(a->log.tail);
-		if(syncblk(a->log.tail) == -1)
+		finalize(a->tail);
+		if(syncblk(a->tail) == -1)
 			r = -1;
 		if(syncarena(a) == -1)
 			r = -1;
--- a/cache.c
+++ b/cache.c
@@ -6,7 +6,7 @@
 #include "dat.h"
 #include "fns.h"
 
-void
+static void
 cachedel_lk(vlong del)
 {
 	Bucket *bkt;
@@ -112,10 +112,12 @@
 
 	h = ihash(off);
 
+	inc64(&fs->stats.cachelook, 1);
 	bkt = &fs->cache[h % fs->cmax];
 	lock(bkt);
 	for(b = bkt->b; b != nil; b = b->hnext)
 		if(b->bp.addr == off){
+			inc64(&fs->stats.cachehit, 1);
  			refblk(b);
 			break;
 		}
--- a/cons.c
+++ b/cons.c
@@ -77,6 +77,7 @@
 	a->op = AOsnap;
 	a->fd = fd;
 	m->a = a;
+	sendsync(fd, 0);
 	chsend(fs->wrchan, m);
 	return;	
 Error:
--- a/dat.h
+++ b/dat.h
@@ -19,7 +19,7 @@
 typedef struct Bucket	Bucket;
 typedef struct Chan	Chan;
 typedef struct Tree	Tree;
-typedef struct Oplog	Oplog;
+typedef struct Dlist	Dlist;
 typedef struct Mount	Mount;
 typedef struct User	User;
 typedef struct Stats	Stats;
@@ -57,7 +57,7 @@
 	Snapsz	= 9,			/* tag, snapid */
 	Dpfxsz	= 9,
 	Ndead	= 8,			/* number of deadlist heads */
-	Deadsz	= 8+8+8+8+8,		/* prev, head, head hash, tail, tail hash */
+	Deadsz	= 8+8+8,		/* prev, head, hash */
 	Treesz	= 4+4+8+Ptrsz+Ndead*Deadsz,	/* ref, height, gen, root, deadlist */
 	Kvmax	= Keymax + Inlmax,	/* Key and value */
 	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
@@ -237,11 +237,12 @@
 enum {
 	Tnone,
 	Traw,
+	Tpivot,
+	Tleaf,
 	Tsuper,
 	Tarena,
 	Tlog,
-	Tpivot,
-	Tleaf,
+	Tdead,
 };
 
 enum {
@@ -294,10 +295,19 @@
 #define	Log2wide	LogAlloc
 	LogAlloc,	/* alloc a range */
 	LogFree,	/* free a range */
-	LogDead	,	/* deadlist a block */
 };
 
+/*
+ * Operations for the deadlist log
+ */
 enum {
+	DlEnd,		/* no data */
+	DlChain,	/* [8]addr, [8]hash */
+	DlGraft,	/* [8]addr, [8]hash */
+	DlKill,		/* [8]addr, [8]gen */
+};
+
+enum {
 	AOnone,
 	AOsnap,
 	AOsync,
@@ -309,9 +319,10 @@
 	vlong	gen;
 };
 
-struct Oplog {
-	Bptr	head;
-	Blk	*tail;	/* tail block open for writing */
+struct Dlist {
+	vlong	prev;	/* previous generation */
+	Bptr	head;	/* first flushed block */
+	Blk	*ins;	/* inserted block */
 };
 
 struct Arange {
@@ -357,8 +368,7 @@
 	int	ht;
 	Bptr	bp;
 	vlong	gen;
-	vlong	prev[Ndead];
-	Oplog	dead[Ndead];
+	Dlist	dead[Ndead];
 };
 
 struct Bfree {
@@ -413,8 +423,8 @@
 	/* root snapshot tree */
 	Tree	snap;
 
-	uvlong	nextqid;
-	uvlong	nextgen;
+	vlong	nextqid;
+	vlong	nextgen;
 
 	/* arena allocation */
 	Arena	*arenas;
@@ -459,7 +469,9 @@
 	vlong	nq;
 	vlong	size;
 	vlong	used;
-	Oplog	log;
+	/* freelist */
+	Bptr	head;
+	Blk	*tail;	/* tail held open for writing */
 };
 
 struct Key{
@@ -506,6 +518,7 @@
 };
 
 struct Mount {
+	Lock;
 	Mount	*next;
 	long	ref;
 	vlong	gen;
--- a/dump.c
+++ b/dump.c
@@ -57,6 +57,10 @@
 		n = fmtprint(fmt, "(%B,%d)", unpackbp(v->v, v->nv), GBIT16(v->v+Ptrsz));
 		return n;
 	}
+	if(op == Odelete || op == Oclearb){
+		n = fmtprint(fmt, "delete");
+		return n;
+	}
 	switch(v->k[0]){
 	case Kdat:	/* qid[8] off[8] => ptr[16]:	pointer to data page */
 		switch(op){
@@ -69,6 +73,7 @@
 			n = fmtprint(fmt, "ptr:%B", unpackbp(v->v, v->nv));
 			break;
 		}
+		break;
 	case Kent:	/* pqid[8] name[n] => dir[n]:	serialized Dir */
 		switch(op){
 		case Onop:
@@ -124,7 +129,7 @@
 	case Ksnap:	/* name[n] => dent[16] ptr[16]:	snapshot root */
 		if(unpacktree(&t, v->v, v->nv) == nil)
 			return fmtprint(fmt, "corrupt tree");
-		n = fmtprint(fmt, "ref: %d, ht: %d, bp: %B, prev=%lld", t.ref, t.ht, t.bp, t.prev[0]);
+		n = fmtprint(fmt, "ref: %d, ht: %d, bp: %B, prev=%lld", t.ref, t.ht, t.bp, t.dead[0].prev);
 		break;
 	case Klabel:
 		n = fmtprint(fmt, "snap id:\"%llx\"", GBIT64(v->v+1));
@@ -227,7 +232,7 @@
 	return fmtprint(fmt, "(%llx %ld %d)", q.path, q.vers, q.type);
 }
 
-void
+static void
 rshowblk(int fd, Blk *b, int indent, int recurse)
 {
 	Blk *c;
@@ -315,6 +320,7 @@
 void
 showtreeroot(int fd, Tree *t)
 {
+	Dlist *dl;
 	int i;
 
 	fprint(fd, "\tgen:\t%lld\n", t->gen);
@@ -322,18 +328,14 @@
 	fprint(fd, "\tht:\t%d\n", t->ht);
 	fprint(fd, "\tbp:\t%B\n", t->bp);
 	for(i = 0; i < Ndead; i++){
-		fprint(fd, "\tdeadlist[%d]: prev=%llx\n", i, t->prev[i]);
-		fprint(fd, "\t\tprev:\t%llx\n", t->prev[i]);
-		fprint(fd, "\t\tfhead:\t%B\n", t->dead[i].head);
-		if(t->dead[i].tail != nil){
-			fprint(fd, "\t\tftailp:%llx\n", t->dead[i].tail->bp.addr);
-			fprint(fd, "\t\tftailh:%llx\n", t->dead[i].tail->bp.hash);
-		}else{
-			fprint(fd, "\t\tftailp:\t-1\n");
-			fprint(fd, "\t\tftailh:\t-1\n");
-		}
-		fprint(fd, "\t\tdead[%d]: (%B)\n", i, t->dead[i].head);
-		scandead(&t->dead[i], showdeadbp, &fd);
+		dl = &t->dead[i];
+		fprint(fd, "	deadlist[%d]:\n", i);
+		fprint(fd, "		prev:	%llx\n", dl->prev);
+		fprint(fd, "		fhead:	%B\n", dl->head);
+		if(dl->ins != nil)
+			fprint(fd, "		open:	%B\n", dl->ins->bp);
+		fprint(fd, "		dead[%d]: (%B)\n", i, dl->head);
+		scandead(&t->dead[i], 0, showdeadbp, &fd);
 	}
 }
 
@@ -342,6 +344,7 @@
 {
 	char *e, pfx[Snapsz];
 	int sz, done;
+	Mount *mnt;
 	vlong id;
 	Scan *s;
 	Tree t;
@@ -376,6 +379,12 @@
 		}
 		showtreeroot(fd, &t);
 	}
+	qlock(&fs->snaplk);
+	for(mnt = fs->mounts; mnt != nil; mnt = mnt->next){
+		fprint(fd, "open: %s\n", mnt->name);
+		showtreeroot(fd, mnt->root);
+	}
+	qunlock(&fs->snaplk);
 }
 
 void
--- a/fns.h
+++ b/fns.h
@@ -25,7 +25,6 @@
 void	freeblk(Tree*, Blk*);
 void	freebp(Tree*, Bptr);
 int	killblk(Tree*, Bptr);
-int	graft(Oplog*, Oplog*);
 void	reclaimblk(Bptr);
 ushort	blkfill(Blk*);
 uvlong	blkhash(Blk*);
@@ -47,7 +46,7 @@
 void	loadfs(char*);
 int	sync(void);
 int	loadlog(Arena*);
-int	scandead(Oplog*, void(*)(Bptr, void*), void*);
+int	scandead(Dlist*, int, void(*)(Bptr, void*), void*);
 int	endfs(void);
 int	compresslog(Arena*);
 void	setval(Blk*, int, Kvp*);
@@ -68,6 +67,8 @@
 char*	estrdup(char*);
 
 int	keycmp(Key *, Key *);
+void	cpkey(Key*, Key*, char*, int);
+void	cpkvp(Kvp*, Kvp*, char*, int);
 
 /* for dumping */
 void	getval(Blk*, int, Kvp*);
@@ -128,11 +129,6 @@
 int	Kconv(Fmt*);
 int	Qconv(Fmt*);
 
-/* scratch */
-void	setmsg(Blk *, int, Msg *);
-void	bufinsert(Blk *, Msg *);
-void	victim(Blk *b, Path *p);
-
 Chan*	mkchan(int);
 Fmsg*	chrecv(Chan*);
 void	chsend(Chan*, Fmsg*);
@@ -144,6 +140,6 @@
 
 /* it's in libc... */
 extern int cas(long*, long, long);
-extern int fas32(int*, int);
+extern int fasp(void***, void*);
 extern int cas64(u64int*, u64int, u64int);
-uvlong	inc64(uvlong*, uvlong);
+vlong	inc64(vlong*, vlong);
--- a/fs.c
+++ b/fs.c
@@ -27,13 +27,15 @@
 		fprint(2, "snap: save %s: %s\n", mnt->name, e);
 		abort();
 	}
-	if(t->prev[0] != -1){
-		if((e = unrefsnap(t->prev[0], t->gen)) != nil){
+	if(t->dead[0].prev != -1){
+		if((e = unrefsnap(t->dead[0].prev, t->gen)) != nil){
 			fprint(2, "snap: unref old: %s\n", e);
 			abort();
 		}
 	}
+	lock(mnt);
 	mnt->root = n;
+	unlock(mnt);
 	closesnap(t);
 	return nil;
 }
@@ -67,7 +69,7 @@
 	fprint(fd, "snap taken: %s\n", new);
 }
 
-void
+static void
 freemsg(Fmsg *m)
 {
 	free(m->a);
@@ -143,7 +145,7 @@
 
 }
 
-void
+static void
 fshangup(int fd, char *fmt, ...)
 {
 	va_list ap;
@@ -191,12 +193,16 @@
 lookup(Fid *f, Key *k, Kvp *kv, char *buf, int nbuf, int lk)
 {
 	char *e;
+	Tree *r;
 
 	if(f->mnt == nil)
 		return Eattach;
 	if(lk)
 		rlock(f->dent);
-	e = btlookup(f->mnt->root, k, kv, buf, nbuf);
+	lock(f->mnt);
+	r = f->mnt->root;
+	unlock(f->mnt);
+	e = btlookup(r, k, kv, buf, nbuf);
 	if(lk)
 		runlock(f->dent);
 	return e;
@@ -533,7 +539,7 @@
 	respond(m, &r);
 }
 
-int
+static int
 ingroup(int uid, int gid)
 {
 	User *u, *g;
@@ -551,7 +557,7 @@
 	return in;
 }
 
-int
+static int
 mode2bits(int req)
 {
 	int m;
@@ -568,7 +574,7 @@
 	return m;
 }
 
-int
+static int
 fsaccess(Mount *mnt, int fmode, int fuid, int fgid, int m)
 {
 	/* uid none gets only other permissions */
@@ -693,7 +699,7 @@
 	return nil;
 }
 
-void
+static void
 fswalk(Fmsg *m)
 {
 	char *p, *e, *name, kbuf[Maxent], kvbuf[Kvmax];
@@ -790,7 +796,7 @@
 	putfid(f);
 }
 
-void
+static void
 fsstat(Fmsg *m)
 {
 	char *err, buf[STATMAX], kvbuf[Kvmax];
@@ -820,11 +826,11 @@
 	putfid(f);
 }
 
-void
+static void
 fswstat(Fmsg *m)
 {
 	char *p, *e, strs[65535], rnbuf[Kvmax], opbuf[Kvmax], kvbuf[Kvmax];
-	int op, nm, sync;
+	int op, nm, sync, rename;
 	vlong up;
 	Fcall r;
 	Dent *de;
@@ -837,10 +843,14 @@
 
 	nm = 0;
 	sync = 1;
+	rename = 0;
 	if((f = getfid(m->fid)) == nil){
 		rerror(m, Efid);
 		return;
 	}
+	de = f->dent;
+	k = f->dent->Key;
+	wlock(de);
 	if(f->dent->qid.type != QTDIR && f->dent->qid.type != QTFILE){
 		rerror(m, Efid);
 		goto Out;
@@ -849,13 +859,9 @@
 		rerror(m, Edir);
 		goto Out;
 	}
-	de = f->dent;
-	k = f->dent->Key;
-	rlock(de);
 	if(fsaccess(f->mnt, de->mode, de->uid, de->gid, DMWRITE) == -1){
 		rerror(m, Eperm);
-		runlock(de);
-		return;
+		goto Out;
 	}
 
 	/* A nop qid change is allowed. */
@@ -884,6 +890,8 @@
 		/* renaming to the same name is a nop. */
 		mb[nm].op = Odelete;
 		mb[nm].Key = f->dent->Key;
+		mb[nm].v = nil;
+		mb[nm].nv = 0;
 		nm++;
 		if((e = btlookup(f->mnt->root, de, &kv, kvbuf, sizeof(kvbuf))) != nil){
 			rerror(m, e);
@@ -900,13 +908,12 @@
 			rerror(m, Efs);
 			goto Out;
 		}
-		sync = 0;
 		k = mb[nm].Key;
+		rename = 1;
+		sync = 0;
 		nm++;
 	}
-	runlock(de);
 
-	wlock(de);
 	p = opbuf+1;
 	op = 0;
 	mb[nm].Key = k;
@@ -937,7 +944,6 @@
 	de->muid = f->mnt->uid;
 	PBIT32(p, f->mnt->uid);
 	p += 4;
-	wunlock(de);
 
 	opbuf[0] = op;
 	mb[nm].v = opbuf;
@@ -954,13 +960,16 @@
 		r.type = Rwstat;
 		respond(m, &r);
 	}
+	if(rename)
+		cpkey(f->dent, &k, f->dent->buf, sizeof(f->dent->buf));
 
 Out:
+	wunlock(de);
 	putfid(f);
 }
 
 
-void
+static void
 fsclunk(Fmsg *m)
 {
 	Fcall r;
@@ -984,7 +993,7 @@
 	putfid(f);
 }
 
-void
+static void
 fscreate(Fmsg *m)
 {
 	char *p, *e, buf[Kvmax], upkbuf[Keymax], upvbuf[Inlmax];
@@ -1105,7 +1114,7 @@
 	putfid(f);
 }
 
-char*
+static char*
 candelete(Fid *f)
 {
 	char *e, pfx[Dpfxsz];
@@ -1133,7 +1142,7 @@
 	return Enempty;
 }
 
-void
+static void
 fsremove(Fmsg *m)
 {
 	Fcall r;
@@ -1183,7 +1192,7 @@
 	clunkfid(f);
 }
 
-void
+static void
 fsopen(Fmsg *m)
 {
 	char *p, *e, buf[Kvmax];
@@ -1263,7 +1272,7 @@
 	putfid(f);
 }
 
-char*
+static char*
 fsreaddir(Fmsg *m, Fid *f, Fcall *r)
 {
 	char pfx[Dpfxsz], *p, *e;
@@ -1321,7 +1330,7 @@
 	return nil;
 }
 
-char*
+static char*
 fsreadfile(Fmsg *m, Fid *f, Fcall *r)
 {
 	vlong n, c, o;
@@ -1357,7 +1366,7 @@
 	return nil;
 }
 
-void
+static void
 fsread(Fmsg *m)
 {
 	char *e;
@@ -1388,7 +1397,7 @@
 }
 
 
-void
+static void
 fswrite(Fmsg *m)
 {
 	char sbuf[Wstatmax], kbuf[4][Offksz], vbuf[4][Ptrsz];
@@ -1621,6 +1630,6 @@
 		a->halt = 0;
 		a->fd = -1;
 		m->a = a;
-		chsend(fs->wrchan, m);		
+		chsend(fs->wrchan, m);	
 	}
 }
--- a/hash.c
+++ b/hash.c
@@ -54,7 +54,7 @@
 	HALF_ROUND(v2,v1,v0,v3,17,21);
 
 
-u64int
+static u64int
 siphash24(const void *src, unsigned long src_sz, const char key[16]) {
 	const u64int *_key = (u64int *)key;
 	u64int k0 = _le64toh(_key[0]);
--- a/load.c
+++ b/load.c
@@ -28,11 +28,12 @@
 		return -1;
 	p = b->data;
 	a->b = b;
-	a->log.head.addr = GBIT64(p);	p += 8;
-	a->log.head.hash = GBIT64(p);	p += 8;
-	a->log.head.gen = -1;
+	a->head.addr = GBIT64(p);	p += 8;
+	a->head.hash = GBIT64(p);	p += 8;
+	a->head.gen = -1;
 	a->size = GBIT64(p);	p += 8;
 	a->used = GBIT64(p);
+	a->tail = nil;
 	if(loadlog(a) == -1)
 		return -1;
 	if(compresslog(a) == -1)
@@ -83,7 +84,7 @@
 	fs->super = b;
 	fs->nextgen = fs->snap.bp.gen + 1;
 	for(i = 0; i < Ndead; i++){
-		fs->snap.prev[i] = -1;
+		fs->snap.dead[i].prev = -1;
 		fs->snap.dead[i].head.addr = -1;
 		fs->snap.dead[i].head.hash = -1;
 		fs->snap.dead[i].head.gen = -1;
--- a/main.c
+++ b/main.c
@@ -13,7 +13,20 @@
 int	debug;
 char	*srvname = "gefs";
 
-void
+vlong
+inc64(vlong *v, vlong dv)
+{
+	vlong ov, nv;
+
+	while(1){
+		ov = *v;
+		nv = ov + dv;
+		if(cas64((u64int*)v, ov, nv))
+			return ov;
+	}
+}
+
+static void
 initfs(vlong cachesz)
 {
 	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
@@ -20,13 +33,13 @@
 		sysfatal("malloc: %r");
 
 	fs->cmax = cachesz/Blksz;
-	if(fs->cmax >= (2*GiB)/sizeof(Bucket))
+	if(fs->cmax >= (2ULL*GiB)/sizeof(Bucket))
 		sysfatal("cache too big");
 	if((fs->cache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
 		sysfatal("malloc: %r");
 }
 
-void
+static void
 launch(void (*f)(int, void *), int wid, void *arg, char *text)
 {
 	int pid;
@@ -42,7 +55,7 @@
 	}
 }
 
-int
+static int
 postfd(char *name, char *suff)
 {
 	char buf[80];
@@ -60,7 +73,7 @@
 	return fd[1];
 }
 
-void
+static void
 usage(void)
 {
 	fprint(2, "usage: %s [-r] dev\n", argv0);
--- a/pack.c
+++ b/pack.c
@@ -385,9 +385,9 @@
 Tree*
 unpacktree(Tree *t, char *p, int sz)
 {
-	int i, j;
-	Bptr bp, head;
-	Blk *b;
+	Dlist *dl;
+	Bptr *bp;
+	int i;
 
 	assert(sz >= Treesz);
 	memset(t, 0, sizeof(Tree));
@@ -398,24 +398,13 @@
 	t->bp.hash = GBIT64(p);		p += 8;
 	t->bp.gen = GBIT64(p);		p += 8;
 	for(i = 0; i < Ndead; i++){
-		t->prev[i] = GBIT64(p);	p += 8;
-		head.addr = GBIT64(p);	p += 8;
-		head.hash = GBIT64(p);	p += 8;
-		head.gen = -1;
-		t->dead[i].head = head;
-		bp.addr = GBIT64(p);	p += 8;
-		bp.hash = GBIT64(p);	p += 8;
-		bp.gen = -1;
-		if(bp.addr == -1){
-			t->dead[i].tail	= nil;
-			continue;
-		}
-		if((b = getblk(bp, 0)) == nil){
-			for(j = 0; j < i; j++)
-				putblk(t->dead[j].tail);
-			return nil;
-		}
-		t->dead[i].tail	= b;
+		dl = &t->dead[i];
+		bp = &dl->head;
+		dl->prev = GBIT64(p);	p += 8;
+		bp->addr = GBIT64(p);	p += 8;
+		bp->hash = GBIT64(p);	p += 8;
+		bp->gen = -1;
+		t->dead[i].ins	= nil;	/* loaded on demand */
 	}
 	return t;
 }
@@ -423,9 +412,8 @@
 char*
 packtree(char *p, int sz, Tree *t)
 {
-	vlong tladdr, tlhash;
-	Bptr head;
-	Blk *tl;
+	Dlist *dl;
+	Bptr bp;
 	int i;
 
 	assert(sz >= Treesz);
@@ -436,20 +424,15 @@
 	PBIT64(p, t->bp.hash);	p += 8;
 	PBIT64(p, t->bp.gen);	p += 8;
 	for(i = 0; i < Ndead; i++){
-		tladdr = -1;
-		tlhash = -1;
-		if(t->dead[i].tail != nil){
-			tl = t->dead[i].tail;
-			assert(tl->flag & Bfinal);
-			tladdr = tl->bp.addr;
-			tlhash = tl->bp.hash;
+		dl = &t->dead[i];
+		bp = dl->head;
+		if(dl->ins != nil){
+			assert(dl->ins->flag & Bfinal);
+			bp = dl->ins->bp;
 		}
-		head = t->dead[i].head;
-		PBIT64(p, t->prev[i]);	p += 8;
-		PBIT64(p, head.addr);	p += 8;
-		PBIT64(p, head.hash);	p += 8;
-		PBIT64(p, tladdr);	p += 8;
-		PBIT64(p, tlhash);	p += 8;
+		PBIT64(p, dl->prev);	p += 8;
+		PBIT64(p, bp.addr);	p += 8;
+		PBIT64(p, bp.hash);	p += 8;
 	}
 	return p;
 }
--- a/ream.c
+++ b/ream.c
@@ -69,11 +69,11 @@
 	t.gen = 0;
 	t.bp = r->bp;
 	for(i = 0; i < Ndead; i++){
-		t.prev[i] = -1;
+		t.dead[i].prev = -1;
 		t.dead[i].head.addr = -1;
 		t.dead[i].head.hash = -1;
 		t.dead[i].head.gen = -1;
-		t.dead[i].tail = nil;
+		t.dead[i].ins = nil;
 	}
 	p = packtree(vbuf, sizeof(vbuf), &t);
 	kv.v = vbuf;
@@ -93,9 +93,9 @@
 		sysfatal("ream: %r");
 	addr += Blksz;	/* arena header */
 
-	a->log.head.addr = -1;
-	a->log.head.hash = -1;
-	a->log.head.gen = -1;
+	a->head.addr = -1;
+	a->head.hash = -1;
+	a->head.gen = -1;
 
 	memset(b, 0, sizeof(Blk));
 	b->type = Tlog;
@@ -156,9 +156,9 @@
 
 	sz = d->length;
 	sz = sz - (sz % Blksz) - Blksz;
-	fs->narena = (d->length + 64*GiB - 1) / (64*GiB);
-	if(fs->narena < 1)
-		fs->narena = 1;
+	fs->narena = (d->length + 64ULL*GiB - 1) / (64ULL*GiB);
+	if(fs->narena < 8)
+		fs->narena = 8;
 	if(fs->narena >= 128)
 		fs->narena = 128;
 	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
@@ -167,6 +167,8 @@
 
 	asz = sz/fs->narena;
 	asz = asz - (asz % Blksz) - Blksz;
+	if(asz < 512*MiB)
+		sysfatal("disk too small");
 	fs->arenasz = asz;
 	off = 0;
 	fprint(2, "reaming %d arenas:\n", fs->narena);
--- a/snap.c
+++ b/snap.c
@@ -6,19 +6,145 @@
 #include "dat.h"
 #include "fns.h"
 
-uvlong
-inc64(uvlong *v, uvlong dv)
+int
+scandead(Dlist *l, int lblk, void (*fn)(Bptr, void*), void *dat)
 {
-	vlong ov, nv;
+	char *p;
+	int i, op;
+	Dlist t;
+	Bptr bp;
+	Blk *b;
 
-	while(1){
-		ov = *v;
-		nv = ov + dv;
-		if(cas64((u64int*)v, ov, nv))
-			return ov;
+	if(l->ins != nil)
+		b = l->ins;
+	else if(l->head.addr != -1)
+		b = getblk(l->head, 0);
+	else
+		return 0;
+	if(b == nil)
+		return -1;
+	p = b->data + Loghdsz;
+	for(i = Loghdsz; i < Logspc; i += 16){
+		op = GBIT64(p) & 0xff;
+		switch(op){
+		case DlEnd:
+			return 0;
+		case DlChain:
+			bp.addr = GBIT64(p);	p += 8;
+			bp.addr &= ~0xffULL;
+			bp.hash = GBIT64(p);
+			bp.gen = -1;
+			if(lblk)
+				fn(b->bp, dat);
+			if((b = getblk(bp, 0)) == nil)
+				return -1;
+			p = b->data + Loghdsz;
+			break;
+		case DlGraft:
+			t.head.addr = GBIT64(p);	p += 8;
+			t.head.addr &= ~0xffULL;
+			t.head.hash = GBIT64(p);	p += 8;
+			t.head.gen = -1;
+			t.ins = nil;
+			scandead(&t, lblk, fn, dat);
+			break;
+		case DlKill:
+			bp.addr = GBIT64(p);	p += 8;
+			bp.hash = -1;
+			bp.gen = GBIT64(p);	p += 8;
+			bp.addr &= ~0xffULL;
+			fn(bp, dat);
+			break;
+		default:
+			fprint(2, "bad op=%d\n", op);
+			abort();
+		}
 	}
+	return 0;
 }
 
+/*
+ * Insert into a deadlist. Because the only
+ * operations are chaining deadlist pages
+ * and killing blocks, we don't preserve the
+ * order of kills.
+ */
+static int
+dlinsert(Dlist *dl, vlong v1, vlong v2)
+{
+	Blk *lb, *pb;
+	vlong end, hash;
+	char *p;
+
+	lb = dl->ins;
+	if(lb == nil && dl->head.addr != -1)
+		lb = getblk(dl->head, 0);
+	/*
+	 * move to the next block when we have
+	 * 32 bytes in the log:
+	 * We're appending up to 16 bytes as
+	 * part of the operation, followed by
+	 * 16 bytes of chaining.
+	 */
+	if(lb == nil || lb->logsz >= Logspc - 40){
+		pb = lb;
+		if((lb = newblk(Tdead)) == nil)
+			return -1;
+		if(pb != nil){
+			finalize(pb);
+			if(syncblk(pb) == -1)
+				return -1;
+			dl->head = pb->bp;
+		}
+		lb->logsz = Loghdsz;
+		dl->ins = lb;
+		putblk(pb);
+	}
+	p = lb->data + lb->logsz;
+	PBIT64(p, v1);	p += 8;
+	PBIT64(p, v2);	p += 8;
+	if(dl->head.addr == -1){
+		end = DlEnd;
+		hash = -1;
+	}else{
+		end = dl->head.addr|DlChain;
+		hash = dl->head.hash;
+	}
+	PBIT64(p+0, end);
+	PBIT64(p+8, hash);
+	lb->logsz = (p - lb->data);
+	return 0;
+}
+
+static int
+graft(Dlist *dst, Dlist *src)
+{
+	if(src->ins != nil){
+		finalize(src->ins);
+		if(syncblk(src->ins) == -1)
+			return -1;
+		src->head = src->ins->bp;
+		src->ins = nil;
+	}
+	return dlinsert(dst, src->head.addr|DlGraft, src->head.hash);
+}
+
+int
+killblk(Tree *t, Bptr bp)
+{
+	Dlist *dl;
+	int i;
+
+	dl = &t->dead[0];
+	for(i = 0; i < Ndead; i++){
+		if(t->dead[i].prev <= bp.gen)
+			break;
+		dl = &t->dead[i];
+	}
+	dlinsert(dl, bp.addr|DlKill, bp.gen);
+	return 0;
+}
+
 Tree*
 openlabel(char *name)
 {
@@ -80,6 +206,7 @@
 closesnap(Tree *t)
 {
 	Tree *te, **p;
+	Blk *ins;
 	int i;
 
 	if(adec(&t->memref) != 0)
@@ -86,11 +213,13 @@
 		return;
 
 	for(i = Ndead-1; i >= 0; i--){
-		if(t->dead[i].tail == nil)
+		ins = t->dead[i].ins;
+		if(ins == nil)
 			continue;
-		finalize(t->dead[i].tail);
-		syncblk(t->dead[i].tail);
-//FIXME: 	putblk(t->dead[i].tail);
+		finalize(ins);
+		syncblk(ins);
+		t->dead[i].head = ins->bp;
+//FIXME: 	putblk(ins);
 	}
 
 	p = &fs->osnap;
@@ -104,7 +233,7 @@
 	free(te);
 }
 
-char*
+static char*
 modifysnap(int op, Tree *t)
 {
 	char kbuf[Snapsz], vbuf[Treesz];
@@ -113,9 +242,9 @@
 	int i;
 
 	for(i = 0; i < Ndead; i++){
-		if(t->dead[i].tail != nil){
-			finalize(t->dead[i].tail);
-			syncblk(t->dead[i].tail);
+		if(t->dead[i].ins != nil){
+			finalize(t->dead[i].ins);
+			syncblk(t->dead[i].ins);
 		}
 	}
 	m.op = op;
@@ -218,14 +347,13 @@
 	return nil;
 }
 
-void
+static void
 freedead(Bptr bp, void *)
 {
-	dprint("reclaimed deadlist: %B\n", bp);
-	reclaimblk(bp);
+	freebp(nil, bp);
 }
 
-void
+static void
 redeadlist(Bptr bp, void *pt)
 {
 	killblk(pt, bp);
@@ -234,25 +362,22 @@
 char*
 freesnap(Tree *snap, Tree *next)
 {
-	Oplog dl;
+	Dlist dl;
 	int i;
 
 	assert(snap->gen != next->gen);
-	assert(next->prev[0] == snap->gen);
+	assert(next->dead[0].prev == snap->gen);
 
 	dl = next->dead[Ndead-1];
-	scandead(&next->dead[0], freedead, nil);
+	scandead(&next->dead[0], 1, freedead, nil);
 	for(i = 0; i < Ndead-2; i++){
 		if(graft(&snap->dead[i], &next->dead[i+1]) == -1)
 			return Efs;
-		next->prev[i] = snap->prev[i];
 		next->dead[i] = snap->dead[i];
 	}
-	for(; i < Ndead; i++){
-		next->prev[i] = snap->prev[i];
+	for(; i < Ndead; i++)
 		next->dead[i] = snap->dead[i];
-	}
-	scandead(&dl, redeadlist, next);
+	scandead(&dl, 0, redeadlist, next);
 
 	return nil;
 }
@@ -280,11 +405,11 @@
 	r->dirty = 0;
 	/* shift deadlist down */
 	for(i = Ndead-1; i >= 0; i--){
-		r->prev[i] = i == 0 ? t->gen : t->prev[i-1];
+		r->dead[i].prev = (i == 0) ? t->gen : t->dead[i-1].prev;
 		r->dead[i].head.addr = -1;
 		r->dead[i].head.hash = -1;
 		r->dead[i].head.gen = -1;
-		r->dead[i].tail = nil;
+		r->dead[i].ins = nil;
 	}
 	fs->osnap = r;
 
--- a/tree.c
+++ b/tree.c
@@ -6,7 +6,7 @@
 #include "dat.h"
 #include "fns.h"
 
-void
+static void
 stablesort(Msg *m, int nm)
 {
 	int i, j;
@@ -44,13 +44,6 @@
 	dst->nv = src->nv;
 }
 
-void
-cpmsg(Msg *dst, Msg *src, char *buf, int nbuf)
-{
-	dst->op = src->op;
-	cpkvp(dst, src, buf, nbuf);
-}
-
 int
 keycmp(Key *a, Key *b)
 {
@@ -67,7 +60,7 @@
 		return 0;
 }
 
-int
+static int
 msgsz(Msg *m)
 {
 	/* disp + op + klen + key + vlen + v */
@@ -74,7 +67,7 @@
 	return 2+1+2+m->nk +2+ m->nv;
 }
 
-int
+static int
 valsz(Kvp *kv)
 {
 	return 2 + 2+kv->nk + 2+kv->nv;
@@ -102,6 +95,7 @@
 	return unpackbp(kv->v, kv->nv);
 }
 
+/* Exported for reaming */
 void
 setval(Blk *b, int i, Kvp *kv)
 {
@@ -134,7 +128,7 @@
 	memcpy(p + kv->nk + 4, kv->v, kv->nv);
 }
 
-void
+static void
 setptr(Blk *b, int i, Key *k, Bptr bp, int fill)
 {
 	char *p, buf[Ptrsz+2];
@@ -149,7 +143,7 @@
 	setval(b, i, &kv);
 }
 
-void
+static void
 setmsg(Blk *b, int i, Msg *m)
 {
 	char *p;
@@ -235,7 +229,7 @@
 	return ri;
 }
 
-int
+static int
 blksearch(Blk *b, Key *k, Kvp *rp, int *same)
 {
 	int lo, hi, ri, mid, r;
@@ -273,7 +267,7 @@
 	return ri;
 }
 
-int
+static int
 buffill(Blk *b)
 {
 	assert(b->type == Tpivot);
@@ -280,7 +274,7 @@
 	return 2*b->nbuf + b->bufsz;
 }
 
-int
+static int
 filledbuf(Blk *b, int nmsg, int needed)
 {
 	assert(b->type == Tpivot);
@@ -287,7 +281,7 @@
 	return 2*(b->nbuf+nmsg) + b->bufsz + needed > Bufspc;
 }
 
-int
+static int
 filledleaf(Blk *b, int needed)
 {
 	assert(b->type == Tleaf);
@@ -294,7 +288,7 @@
 	return 2*(b->nval+1) + b->valsz + needed > Leafspc;
 }
 
-int
+static int
 filledpiv(Blk *b, int reserve)
 {
 	/* 
@@ -306,7 +300,7 @@
 	return 2*(b->nval+1) + b->valsz + reserve*Kpmax > Pivspc;
 }
 
-int
+static int
 copyup(Blk *n, int i, Path *pp, int *nbytes)
 {
 	Kvp kv;
@@ -343,7 +337,7 @@
 	return i;
 }
 
-void
+static void
 statupdate(Kvp *kv, Msg *m)
 {
 	int op;
@@ -394,36 +388,7 @@
 	}
 }
 
-static char*
-dropsnap(Kvp *t, Msg *m)
-{
-	char *e, buf[Msgmax];
-	Tree snap, tmp, *from;
-	vlong id;
-	Kvp kv;
-	Key k;
-
-	if(unpacktree(&snap, t->v, t->nv) == nil)
-		return Efs;
-	k.k = m->v;
-	k.nk = m->nv;
-	id = GBIT64(m->v+1);
-	for(from = fs->osnap; from != nil; from = from->snext)
-		if(from->gen == id)
-			break;
-	if(from == nil){
-		if((e = btlookup(&fs->snap, &k, &kv, buf, sizeof(buf))) != nil)
-			return e;
-		if(unpacktree(&tmp, kv.v, kv.nv) == nil)
-			return Efs;
-		from = &tmp;
-	}
-	if((e = freesnap(&snap, from)) != nil)
-		return e;
-	return nil;
-}
-
-int
+static int
 apply(Kvp *kv, Msg *m, char *buf, int nbuf)
 {
 	switch(m->op){
@@ -444,7 +409,7 @@
 	return 0;
 }
 
-int
+static int
 pullmsg(Path *p, int i, Kvp *v, Msg *m, int *full, int spc)
 {
 	if(i < 0 || i >= p->hi || *full)
@@ -468,7 +433,7 @@
  *
  * When pidx != -1, 
  */
-int
+static int
 updateleaf(Tree *t, Path *up, Path *p)
 {
 	char buf[Msgmax];
@@ -566,7 +531,7 @@
  *
  * When pidx != -1, 
  */
-int
+static int
 updatepiv(Tree *, Path *up, Path *p, Path *pp)
 {
 	char buf[Msgmax];
@@ -639,7 +604,7 @@
  * would be inserted into. Split must never
  * grow the total height of the 
  */
-int
+static int
 splitleaf(Tree *t, Path *up, Path *p, Kvp *mid)
 {
 	char buf[Msgmax];
@@ -739,7 +704,7 @@
  * grow the total height of the tree by more
  * than one.
  */
-int
+static int
 splitpiv(Tree *t, Path *, Path *p, Path *pp, Kvp *mid)
 {
 	int i, o, copied, halfsz;
@@ -807,7 +772,7 @@
 	return 0;
 }
 
-int
+static int
 merge(Path *p, Path *pp, int idx, Blk *a, Blk *b)
 {
 	int i, o;
@@ -849,7 +814,7 @@
  * returns 1 if we'd spill out of the buffer,
  * updates *idx and returns 0 otherwise.
  */
-int
+static int
 spillscan(Blk *d, Blk *b, Msg *m, int *idx, int o)
 {
 	int i, used;
@@ -875,7 +840,7 @@
  * idx and m would spill out of the buffer
  * of d.
  */
-int
+static int
 spillsbuf(Blk *d, Blk *l, Blk *r, Msg *m, int *idx)
 {
 	if(l->type == Tleaf)
@@ -888,7 +853,7 @@
 	return 0;
 }
 
-int
+static int
 rotate(Tree *t, Path *p, Path *pp, int midx, Blk *a, Blk *b, int halfpiv)
 {
 	int i, o, cp, sp, idx;
@@ -956,7 +921,7 @@
 	return 0;
 }
 
-int
+static int
 rotmerge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
 {
 	int na, nb, ma, mb, imbalance;
@@ -984,7 +949,7 @@
 		return 0;
 }
 
-int
+static int
 trybalance(Tree *t, Path *p, Path *pp, int idx)
 {
 	Blk *l, *m, *r;
@@ -1109,7 +1074,7 @@
 	return nil;
 }
 
-void
+static void
 freepath(Tree *t, Path *path, int npath)
 {
 	Path *p;
@@ -1129,7 +1094,7 @@
  * Select child node that with the largest message
  * segment in the current node's buffer.
  */
-void
+static void
 victim(Blk *b, Path *p)
 {
 	int i, j, lo, maxsz, cursz;
@@ -1170,12 +1135,6 @@
 	}
 }
 
-int
-msgcmp(void *a, void *b)
-{
-	return keycmp((Msg*)a, (Msg*)b);
-}
-
 char*
 btupsert(Tree *t, Msg *msg, int nmsg)
 {
@@ -1278,7 +1237,7 @@
 	return getblk(bp, 0);
 }
 
-char*
+static char*
 btlookupat(Blk *b, int h, Key *k, Kvp *r, char *buf, int nbuf)
 {
 	int i, j, ok, same;
@@ -1389,6 +1348,8 @@
 	for(i = 0; i < s->root.ht; i++){
 		p[i].vi = blksearch(b, &s->kv, &v, &same);
 		if(b->type == Tpivot){
+			if(p[i].vi == -1)
+				getval(b, ++p[i].vi, &v);
 			p[i].bi = bufsearch(b, &s->kv, &m, &same);
 			if(p[i].bi == -1 || !same)
 				p[i].bi++;
--- a/user.c
+++ b/user.c
@@ -79,7 +79,7 @@
 	return ret;
 }
 
-char*
+static char*
 readline(char **p, char *buf, int nbuf)
 {
 	char *e;
@@ -95,7 +95,7 @@
 	return buf;
 }
 
-char*
+static char*
 getfield(char **p, char delim)
 {
 	char *r;
@@ -133,7 +133,7 @@
 	return nil;
 }
 
-char*
+static char*
 parseusers(int fd, char *udata)
 {
 	char *pu, *p, *f, *m, *err, buf[8192];