shithub: gefs

Download patch

ref: 7e0d72a78a9c2a01b2ba83674478a56cffddd0f7
parent: 86363b483b2a8e5c1529522d1e2962fa9988b2fc
author: Ori Bernstein <ori@eigenstate.org>
date: Thu Dec 30 21:11:12 EST 2021

todo: update

--- a/TODO
+++ b/TODO
@@ -1,21 +1,21 @@
-- Implement: reclaim blocks that escape snapshots
-- Snapshot forking: refcount snapshots, add ids
-- Transient exec snapshots
-- Auth
-- Users and groups:
-	- Permission checks
-	- User and group loading
-	- User and group manipulation from console
-		- (do we want this? maybe just edit /adm/users?)
-- Complete wstat: implement missing operations
-- Implement special file modes: ORCLOSE, OEXCL, etc
-- Add missing commands
-- Performance optimization:
-	- Async page writeback: currently, every write to device is sync.
-	- Bulk block frees
-	- Background block frees
-	- Root block reuse
-- Small file optimizations
-	- Inline data
-	- Block fragmentation
-- Testing, Debugging
+*** must have before usable for testing ***
+- correctly implement deadlists
+- auth against authservers
+- users and groups:
+	- permission checks
+	- user and group loading
+- complete wstat: implement missing operations
+- implement special file modes: ORCLOSE, OEXCL, etc
+- add missing management commands in console
+
+*** nice to have, can get testing without**
+- transient exec snapshots
+- performance optimization:
+	- async page writeback: currently, every write to device is sync.
+	- bulk block frees
+	- background block frees
+	- root block reuse
+- small file optimizations
+	- inline data
+	- block fragmentation
+- testing, debugging
--- a/blk.c
+++ b/blk.c
@@ -15,7 +15,7 @@
 static vlong	blkalloc_lk(Arena*);
 static vlong	blkalloc(void);
 static int	blkdealloc_lk(vlong);
-static void	cachedel(vlong);
+static Blk*	initblk(vlong, int);
 
 QLock blklock;
 
@@ -93,7 +93,7 @@
 	if(i < 0 || i >= fs->narena){
 		werrstr("out of range block %lld", b);
 		abort();
-//		return nil;
+		return nil;
 	}
 	return &fs->arenas[i];
 }
@@ -168,7 +168,7 @@
 	return 0;
 }
 
-Blk*
+int
 logappend(Oplog *ol, Arena *a, vlong off, vlong len, vlong val, int op)
 {
 	Blk *pb, *lb;
@@ -185,20 +185,16 @@
 		else
 			o = blkalloc_lk(a);
 		if(o == -1)
-			return nil;
-		if((lb = mallocz(sizeof(Blk), 1)) == nil)
-			return nil;
-		lb->data = lb->buf + Hdrsz;
-		lb->flag |= Bdirty;
-		lb->type = Tlog;
-		lb->bp.addr = o;
+			return -1;
+		if((lb = initblk(o, Tlog)) == nil)
+			return -1;
 		lb->logsz = Loghdsz;
 		p = lb->data + lb->logsz;
-		PBIT64(p + 0, (uvlong)LogEnd);
+		PBIT64(p, (uvlong)LogEnd);
 		finalize(lb);
 		if(syncblk(lb) == -1){
 			free(lb);
-			return nil;
+			return -1;
 		}
 
 		ol->tail = lb;
@@ -207,14 +203,16 @@
 			PBIT64(p + 0, lb->bp.addr|LogChain);
 			finalize(pb);
 			if(syncblk(pb) == -1)
-				return nil;
+				return -1;
 		}
 	}
 
-	if(len == Blksz && op == LogAlloc)
-		op = LogAlloc1;
-	if(len == Blksz && op == LogFree)
-		op = LogFree1;
+	if(len == Blksz){
+		if(op == LogAlloc)
+			op = LogAlloc1;
+		else if(op == LogFree)
+			op = LogFree1;
+	}
 	off |= op;
 	p = lb->data + lb->logsz;
 	PBIT64(p, off);
@@ -226,9 +224,26 @@
 	/* this gets overwritten by the next append */
 	p = lb->data + lb->logsz;
 	PBIT64(p, (uvlong)LogEnd);
-	return lb;
+	if(ol->head.addr == -1)
+		ol->head = lb->bp;
+	if(ol->tail != lb)
+		ol->tail = lb;
+	return 0;
 }
 
+static void
+showdeadbp(Bptr bp, void *)
+{
+	fprint(2, "\tdead: %B\n", bp);
+}
+
+void
+showlst(char *m, Oplog *l)
+{
+	fprint(2, "showlst: %s\n", m);
+	scandead(l, showdeadbp, nil);
+}
+
 int
 graft(Oplog *a, Oplog *b)
 {
@@ -235,6 +250,7 @@
 	char *p;
 	vlong o;
 
+
 	if(b->head.addr == -1)
 		return 0;
 	if(a->head.addr == -1){
@@ -242,7 +258,9 @@
 		a->tail = b->tail;
 		return 0;
 	}
-	
+
+showlst("a", a);
+showlst("b", b);
 	o = b->head.addr|LogChain;
 	p = a->tail->data + a->tail->logsz;
 	PBIT64(p, o);
@@ -250,6 +268,7 @@
 		return -1;
 	putblk(a->tail);
 	a->tail = b->tail;
+showlst("grafted", b);
 	return 0;
 }
 
@@ -257,22 +276,23 @@
 deadlistappend(Tree *t, Bptr bp)
 {
 	Oplog *l;
-	Blk *b;
 	int i;
 
-	dprint("deadlisted %B\n", bp);
+fprint(2, "deadlisted %B [%p::%B]\n", bp, t, t->bp);
+//fprint(2, "predeadlist\n");
+//showtreeroot(2, t);
+//fprint(2, "----\n");
 	l = nil;
 	for(i = 0; i < Ndead; i++){
+		l = &t->dead[i];
 		if(bp.gen > t->prev[i])
 			break;
-		l = &t->dead[i];
 	}
-	if((b = logappend(l, nil, bp.addr, Blksz, bp.gen, LogDead)) == nil)
+	if(logappend(l, nil, bp.addr, Blksz, bp.gen, LogDead) == -1)
 		return -1;
-	if(l->head.addr == -1)
-		l->head = b->bp;
-	if(l->tail != b)
-		l->tail = b;
+//fprint(2, "postdeadlist\n");
+//showtreeroot(2, t);
+//fprint(2, "@@@@@@@@@@@@@@@@@@\n");
 	return 0;
 }
 
@@ -285,14 +305,9 @@
 int
 freelistappend(Arena *a, vlong off, int op)
 {
-	Blk *b;
-
-	if((b = logappend(&a->log, a, off, Blksz, Blksz, op)) == nil)
+	cachedel(off);
+	if(logappend(&a->log, a, off, Blksz, Blksz, op) == -1)
 		return -1;
-	if(a->log.head.addr == -1)
-		a->log.head = b->bp;
-	if(a->log.tail != b)
-		a->log.tail = b;
 	return 0;
 }
 
@@ -301,7 +316,6 @@
 {
 	vlong ent, off;
 	int op, i, n;
-	uvlong bh;
 	Bptr dead;
 	char *d;
 	Bptr bp;
@@ -313,12 +327,7 @@
 Nextblk:
 	if((b = getblk(bp, GBnochk)) == nil)
 		return -1;
-	bh = GBIT64(b->data);
-	/* the hash covers the log and offset */
-	if(bh != siphash(b->data+8, Blkspc-8)){
-		werrstr("corrupt log");
-		return -1;
-	}
+// TODO: hash checks for these chains
 	for(i = Loghdsz; i < Logspc; i += n){
 		d = b->data + i;
 		ent = GBIT64(d);
@@ -326,31 +335,26 @@
 		off = ent & ~0xff;
 		n = (op >= Log2wide) ? 16 : 8;
 		switch(op){
+		case LogEnd:
+			dprint("log@%d: end\n", i);
+			return 0;
 		case LogDead:
-			dead.addr = ent;
+			dead.addr = off;
 			dead.hash = -1;
 			dead.gen = GBIT64(d+8);
+			dprint("log@%d: dead %B\n", i, dead);
 			fn(dead, dat);
 			break;
-		case LogEnd:
-			dprint("log@%d: end\n", i);
-			/*
-			 * since we want the next insertion to overwrite
-			 * this, don't include the size in this entry.
-			 */
-			b->logsz = i;
-			return 0;
 		case LogChain:
 			bp.addr = off & ~0xff;
 			bp.hash = -1;
 			bp.gen = -1;
 			dprint("log@%d: chain %B\n", i, bp);
-			b->logsz = i+n;
 			goto Nextblk;
 			break;
 		default:
 			n = 0;
-			dprint("log@%d: log op %d\n", i, op);
+			fprint(2, "log@%d: log op %d\n", i, op);
 			abort();
 			break;
 		}
@@ -388,11 +392,6 @@
 		switch(op){
 		case LogEnd:
 			dprint("log@%d: end\n", i);
-			/*
-			 * since we want the next insertion to overwrite
-			 * this, don't include the size in this entry.
-			 */
-			b->logsz = i;
 			return 0;
 		case LogChain:
 			bp.addr = off & ~0xff;
@@ -403,10 +402,6 @@
 			goto Nextblk;
 			break;
 
-		case LogFlush:
-			dprint("log@%d: flush: %llx\n", i, off>>8);
-			fs->nextgen = (off >> 8)+1;
-			break;
 		case LogAlloc:
 		case LogAlloc1:
 			len = (op >= Log2wide) ? GBIT64(d+8) : Blksz;
@@ -514,7 +509,7 @@
 	ol.tail = b;
 	b->logsz = Loghdsz;
 	for(i = 0; i < n; i++)
-		if((b = logappend(&ol, a, log[i].off, log[i].len, log[i].len, LogFree)) == nil)
+		if(logappend(&ol, a, log[i].off, log[i].len, log[i].len, LogFree) == -1)
 			return -1;
 	p = b->data + b->logsz;
 	PBIT64(p, LogChain|graft);
@@ -658,14 +653,11 @@
 	return b;
 }
 
-Blk*
-newblk(int t)
+static Blk*
+initblk(vlong bp, int t)
 {
 	Blk *b;
-	vlong bp;
 
-	if((bp = blkalloc()) == -1)
-		return nil;
 	if((b = lookupblk(bp)) == nil){
 		if((b = mallocz(sizeof(Blk), 1)) == nil)
 			return nil;
@@ -681,6 +673,7 @@
 		b->cprev = nil;
 		b->hnext = nil;
 	}
+
 	b->type = t;
 	b->bp.addr = bp;
 	b->bp.hash = -1;
@@ -696,10 +689,23 @@
 	b->logsz = 0;
 	b->lognxt = 0;
 
-	dprint("new block %B from %p, flag=%x\n", b->bp, getcallerpc(&t), b->flag);
 	return cacheblk(b);
 }
 
+Blk*
+newblk(int t)
+{
+	vlong bp;
+	Blk *b;
+
+	if((bp = blkalloc()) == -1)
+		return nil;
+	if((b = initblk(bp, t)) == nil)
+		return nil;
+	setmalloctag(b, getcallerpc(&t));
+	return b;
+}
+
 char*
 fillsuper(Blk *b)
 {
@@ -834,6 +840,7 @@
 {
 	if(b == nil || adec(&b->ref) != 0)
 		return;
+	assert(!(b->flag & Bcached));
 	assert((b->flag & Bqueued) || !(b->flag & Bdirty));
 	free(b);
 }
@@ -844,7 +851,7 @@
 	Bfree *f;
 
 	dprint("[%s] free blk %B\n", (t == &fs->snap) ? "snap" : "data", bp);
-	if(bp.gen <= t->prev[0]){
+	if(bp.gen <= t->gen){
 		deadlistappend(t, bp);
 		return;
 	}
@@ -875,6 +882,7 @@
 	a = getarena(bp.addr);
 	lock(a);
 	blkdealloc_lk(bp.addr);
+	cachedel(bp.addr);
 	unlock(a);
 }
 
@@ -918,6 +926,7 @@
 	while(p != nil){
 		n = p->next;
 		reclaimblk(p->bp);
+		free(p);
 		p = n;
 	}
 	fs->freep = fs->freehd;
@@ -927,15 +936,12 @@
 sync(void)
 {
 	int i, r;
-	Blk *b, *s;
 	Arena *a;
+//	Blk *b;
 
-	qlock(&fs->snaplk);
 	r = 0;
-	s = fs->super;
-	fillsuper(s);
-	enqueue(s);
 
+	qlock(&fs->snaplk);
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
 		finalize(a->log.tail);
@@ -942,14 +948,18 @@
 		if(syncblk(a->log.tail) == -1)
 			r = -1;
 	}
-	for(b = fs->chead; b != nil; b = b->cnext){
-		if(!(b->flag & Bdirty))
-			continue;
-		if(syncblk(b) == -1)
-			r = -1;
-	}
+//
+//	for(b = fs->chead; b != nil; b = b->cnext){
+//		if((b->flag & Bdirty) == 0)
+//			continue;
+//		if(syncblk(b) == -1)
+//			r = -1;
+//	}
+	fillsuper(fs->super);
+	finalize(fs->super);
+	enqueue(fs->super);
 	if(r != -1)
-		r = syncblk(s);
+		r = syncblk(fs->super);
 
 	qunlock(&fs->snaplk);
 	return r;
--- a/cache.c
+++ b/cache.c
@@ -6,7 +6,7 @@
 #include "dat.h"
 #include "fns.h"
 
-static void
+void
 cachedel(vlong del)
 {
 	Bucket *bkt;
@@ -13,21 +13,23 @@
 	Blk *b, **p;
 	u32int h;
 
-	/* FIXME: better hash. */
 	h = ihash(del);
 	bkt = &fs->cache[h % fs->cmax];
 	lock(bkt);
 	p = &bkt->b;
 	for(b = bkt->b; b != nil; b = b->hnext){
-		if(b->bp.addr == del){
-			*p = b->hnext;
+		if(b->bp.addr == del)
 			break;
-		}
 		p = &b->hnext;
 	}
+	if(b == nil){
+		unlock(bkt);
+		return;
+	}
+	*p = b->hnext;
 	unlock(bkt);
-	assert(b != nil);
 
+	assert(b != fs->chead || b != fs->ctail);
 	if(b->cnext != nil)
 		b->cnext->cprev = b->cprev;
 	if(b->cprev != nil)
@@ -38,6 +40,8 @@
 		fs->chead = b->cnext;
 	b->cnext = nil;
 	b->cprev = nil;
+	b->flag &= ~Bcached;
+	fs->ccount--;
 }
 
 Blk*
@@ -64,9 +68,9 @@
 	lock(&fs->lrulk);
 	if(b == fs->chead)
 		goto Cached;
+
 	if(b == fs->ctail)
 		fs->ctail = b->cprev;
-
 	if(b->cnext != nil)
 		b->cnext->cprev = b->cprev;
 	if(b->cprev != nil)
@@ -75,8 +79,6 @@
 		fs->ctail = b;
 	if(fs->chead != nil)
 		fs->chead->cprev = b;
-	if(fs->ctail == nil)
-		fs->ctail = b;
 	b->cnext = fs->chead;
 	b->cprev = nil;
 	fs->chead = b;
@@ -87,14 +89,11 @@
 		fs->ccount++;
 		refblk(b);
 	}
-	c=0;
-	USED(c);
-#ifdef NOTYET
 	for(c = fs->ctail; c != nil && fs->ccount >= fs->cmax; c = fs->ctail){
 		cachedel(c->bp.addr);
 		putblk(c);
 	}
-#endif
+
 Cached:
 	unlock(&fs->lrulk);
 	return b;
--- a/cons.c
+++ b/cons.c
@@ -34,29 +34,31 @@
 static void
 snapfs(int fd, char **ap, int na)
 {
-	vlong gen, old;
+	Tree *t, *n;
 	char *e;
-	Tree *t;
 
-	if((t = opensnap(ap[0])) == nil){
-		fprint(fd, "snap: open %s: %r\n", ap[0]);
+	if((t = openlabel(ap[0])) == nil){
+		fprint(fd, "snap: open %s: does not exist\n", ap[0]);
 		return;
 	}
-	if((e = newsnap(t, &gen, &old)) != nil){
-		fprint(fd, "snap: save %s: %s\n", ap[na-1], e);
+	if((n = newsnap(t)) == nil){
+		fprint(fd, "snap: save %s: failed\n", ap[na-1]);
 		return;
 	}
-	if((e = labelsnap(ap[na-1], gen)) != nil){
+	if((e = labelsnap(ap[na-1], n->gen)) != nil){
 		fprint(fd, "snap: save %s: %s\n", ap[na-1], e);
 		return;
 	}
 	if(na <= 1 || strcmp(ap[0], ap[1]) == 0){
 		/* the label moved */
-		if((e = unrefsnap(old, gen)) != nil){
+		if((e = unrefsnap(t->gen, n->gen)) != nil){
 			fprint(fd, "snap: unref old: %s\n", e);
 			return;
 		}
 	}
+	closesnap(n);
+	closesnap(t);
+	sync();
 	fprint(fd, "snap %s: ok\n", ap[na-1]);
 }
 
--- a/dat.h
+++ b/dat.h
@@ -33,7 +33,6 @@
 	Nrefbuf	= 1024,			/* number of ref incs before syncing */
 	Nfidtab	= 1024,			/* number of fit hash entries */
 	Ndtab	= 1024,			/* number of dir tab entries */
-	Nttab	= 32,			/* number of tree tab entries */
 	Max9p	= 16*KiB,		/* biggest message size we're willing to negotiate */
 	Nsec	= 1000*1000*1000,	/* nanoseconds to the second */
 	Maxname	= 256,			/* maximum size of a name element */
@@ -277,10 +276,10 @@
  * Operations for the allocation log.
  */
 enum {
+	LogNop,		/* unused */
 	/* 1-wide entries */
 	LogAlloc1,	/* alloc a block */
 	LogFree1,	/* free a block */
-	LogFlush,	/* flush log, bump gen */
 	LogChain,	/* point to next log block */
 	LogEnd,		/* last entry in log */	
 
@@ -323,7 +322,7 @@
 
 struct Tree {
 	Lock	lk;
-	Tree	*hnext;
+	Tree	*snext;
 
 	long	memref;	/* number of in-memory references to this */
 	int	ref;	/* number of on-disk references to this */
@@ -350,8 +349,8 @@
 	int	hdrsz;	/* immutable */
 
 	QLock	snaplk;	/* snapshot lock */
-	Tree	*ttab[Nttab];
-	Blk*	super;
+	Tree	*osnap;
+	Blk	*super;
 
 	Chan	*wrchan;
 	Chan	*rdchan;
--- a/dump.c
+++ b/dump.c
@@ -102,8 +102,10 @@
 				n += fmtprint(fmt, "mtime:%llx ", GBIT64(p));
 				p += 8;
 			}
-			if(p != v->v + v->nv)
+			if(p != v->v + v->nv){
+				fprint(2, "v->nv: %d\n", v->nv);
 				abort();
+			}
 			break;
 		}
 		break;
@@ -284,7 +286,7 @@
 		name = ap[0];
 	if(strcmp(name, "dump") == 0)
 		t = &fs->snap;
-	else if((t = opensnap(name)) == nil){
+	else if((t = openlabel(name)) == nil){
 		fprint(fd, "open %s: %r\n", name);
 		return;
 	}
@@ -291,6 +293,16 @@
 	b = getroot(t, &h);
 	fprint(fd, "=== [%s] %B @%d\n", name, t->bp, t->ht);
 	rshowblk(fd, b, 0, 1);
+	putblk(b);
+}
+
+void
+showbp(int fd, Bptr bp, int recurse)
+{
+	Blk *b;
+
+	b = getblk(bp, 0);
+	rshowblk(fd, b, 0, recurse);
 	putblk(b);
 }
 
--- a/fns.h
+++ b/fns.h
@@ -15,6 +15,7 @@
 Blk*	getblk(Bptr, int);
 Blk*	refblk(Blk*);
 Blk*	cacheblk(Blk*);
+void	cachedel(vlong);
 Blk*	lookupblk(vlong);
 Blk*	readblk(vlong, int);
 Arena*	getarena(vlong);
@@ -31,13 +32,14 @@
 u32int	ihash(vlong);
 void	finalize(Blk*);
 char*	fillsuper(Blk*);
-char*	newsnap(Tree*, vlong*, vlong*);
+Tree*	newsnap(Tree*);
 char*	freesnap(Tree*, Tree*);
 char*	labelsnap(char*, vlong);
 char*	unlabelsnap(vlong, char*);
 char*	refsnap(vlong);
 char*	unrefsnap(vlong, vlong);
-Tree*	opensnap(char*);
+Tree*	openlabel(char*);
+void	closesnap(Tree*);
 uvlong	siphash(void*, usize);
 void	reamfs(char*);
 int	loadarena(Arena*, vlong);
@@ -55,7 +57,6 @@
 char*	btnext(Scan*, Kvp*, int*);
 void	btdone(Scan*);
 
-
 void	setflag(Blk *b, int);
 int	chkflag(Blk *b, int);
 
@@ -70,6 +71,7 @@
 
 void	initshow(void);
 void	showblk(int, Blk*, char*, int);
+void	showbp(int, Bptr, int);
 void	showpath(int, Path*, int);
 void	showtreeroot(int, Tree*);
 void	showtree(int, char**, int);
@@ -103,7 +105,7 @@
 
 char*	packbp(char*, int, Bptr*);
 Bptr	unpackbp(char*, int);
-char*	packtree(char*, int, Tree*, int);
+char*	packtree(char*, int, Tree*);
 Tree*	unpacktree(Tree*, char*, int);
 char*	packdkey(char*, int, vlong, char*);
 
--- a/fs.c
+++ b/fs.c
@@ -13,23 +13,28 @@
 static char*
 updatesnap(Fid *f)
 {
-	vlong gen, old;
+	Tree *t, *n;
 	char *e;
 
-	if((e = newsnap(f->mnt->root, &gen, &old)) != nil){
-		fprint(2, "snap: save %s: %s\n", f->mnt->name, e);
+	t = f->mnt->root;
+	qlock(&fs->snaplk);
+	if((n = newsnap(t)) == nil){
+		fprint(2, "snap: save %s: %s\n", f->mnt->name, "create snap");
 		abort();
 	}
-	if((e = labelsnap(f->mnt->name, gen)) != nil){
+	if((e = labelsnap(f->mnt->name, t->gen)) != nil){
 		fprint(2, "snap: save %s: %s\n", f->mnt->name, e);
 		abort();
 	}
-	if(old >= 0){
-		if((e = unrefsnap(old, gen)) != nil){
+	if(t->prev[0] != -1){
+		if((e = unrefsnap(t->prev[0], t->gen)) != nil){
 			fprint(2, "snap: unref old: %s\n", e);
 			abort();
 		}
 	}
+	f->mnt->root = n;
+	closesnap(t);
+	qunlock(&fs->snaplk);
 	sync();
 	return nil;
 }
@@ -509,7 +514,7 @@
 		return;
 	}
 	mnt->name = strdup(m->aname);
-	if((mnt->root = opensnap(m->aname)) == nil){
+	if((mnt->root = openlabel(m->aname)) == nil){
 		rerror(m, Enosnap);
 		return;
 	}
@@ -749,12 +754,6 @@
 	op = 0;
 	mb[nm].Key = k;
 	mb[nm].op = Owstat;
-	if(d.mode != ~0){
-		op |= Owmode;
-		PBIT32(p, d.mode);
-		p += 4;
-		sync = 0;
-	}
 	if(d.length != ~0){
 		op |= Owsize;
 		PBIT64(p, d.length);
@@ -761,6 +760,12 @@
 		p += 8;
 		sync = 0;
 	}
+	if(d.mode != ~0){
+		op |= Owmode;
+		PBIT32(p, d.mode);
+		p += 4;
+		sync = 0;
+	}
 	if(d.mtime != ~0){
 		op |= Owmtime;
 		PBIT64(p, (vlong)d.mtime*Nsec);
@@ -1244,9 +1249,12 @@
 	kv[i].op = Owstat;
 	kv[i].k = f->dent->k;
 	kv[i].nk = f->dent->nk;
-	if(m->offset+m->count > f->dent->length){
-		*p++ = Owsize;
-		PBIT64(p, m->offset+m->count);	p += 8;
+	n = m->offset+m->count;
+	*p++ = 0;
+	if(n > f->dent->length){
+		sbuf[0] |= Owsize;
+		PBIT64(p, n);
+		p += 8;
 		f->dent->length = m->offset+m->count;
 	}
 	kv[i].v = sbuf;
--- a/load.c
+++ b/load.c
@@ -42,6 +42,7 @@
 	int i, dirty;
 	int blksz, bufspc, hdrsz;
 
+	fs->osnap = nil;
 	if((fs->fd = open(dev, ORDWR)) == -1)
 		sysfatal("open %s: %r", dev);
 	if((d = dirfstat(fs->fd)) == nil)
@@ -71,6 +72,12 @@
 	fs->super = b;
 	fs->nextgen = fs->snap.bp.gen + 1;
 
+	for(i = 0; i < Ndead; i++){
+		fs->snap.prev[i] = -1;
+		fs->snap.dead[i].head.addr = -1;
+		fs->snap.dead[i].head.hash = -1;
+		fs->snap.dead[i].head.gen = -1;
+	}
 	fprint(2, "load: %8s\n", p);
 	fprint(2, "\tsnaptree:\t%B\n", fs->snap.bp);
 	fprint(2, "\tarenas:\t%d\n", fs->narena);
--- a/pack.c
+++ b/pack.c
@@ -328,8 +328,10 @@
 		bp.addr = GBIT64(p);	p += 8;
 		bp.hash = GBIT64(p);	p += 8;
 		bp.gen = -1;
-		if(bp.addr == -1)
+		if(bp.addr == -1){
+			t->dead[i].tail	= nil;
 			continue;
+		}
 		if((b = getblk(bp, 0)) == nil){
 			for(j = 0; j < i; j++)
 				putblk(t->dead[j].tail);
@@ -336,13 +338,12 @@
 			return nil;
 		}
 		t->dead[i].tail	= b;
-		cacheblk(b);		
 	}
 	return t;
 }
 
 char*
-packtree(char *p, int sz, Tree *t, int refs)
+packtree(char *p, int sz, Tree *t)
 {
 	vlong tladdr, tlhash;
 	Bptr head;
@@ -350,7 +351,7 @@
 	int i;
 
 	assert(sz >= Treesz);
-	PBIT32(p, refs);	p += 4;
+	PBIT32(p, t->ref);	p += 4;
 	PBIT32(p, t->ht);	p += 4;
 	PBIT64(p, t->gen);	p += 8;
 	PBIT64(p, t->bp.addr);	p += 8;
--- a/ream.c
+++ b/ream.c
@@ -63,7 +63,7 @@
 	kv.nk = Snapsz;
 
 	memset(&t, 0, sizeof(Tree));
-	t.ref = 1;
+	t.ref = 2;
 	t.ht = 1;
 	t.gen = 0;
 	t.bp = r->bp;
@@ -74,7 +74,7 @@
 		t.dead[i].head.gen = -1;
 		t.dead[i].tail = nil;
 	}
-	p = packtree(vbuf, sizeof(vbuf), &t, 2);
+	p = packtree(vbuf, sizeof(vbuf), &t);
 	kv.v = vbuf;
 	kv.nv = p - vbuf;
 	setval(s, 1, &kv);
--- a/snap.c
+++ b/snap.c
@@ -42,13 +42,12 @@
 }
 
 Tree*
-opensnap(char *name)
+openlabel(char *name)
 {
 	char dbuf[Keymax], buf[Kvmax];
 	char *p, *e;
 	vlong id;
 	Tree *t;
-	u32int h;
 	int n;
 	Key k;
 	Kvp kv;
@@ -62,9 +61,11 @@
 	k.nk = p - dbuf;
 	if((e = btlookup(&fs->snap, &k, &kv, buf, sizeof(buf))) != nil)
 		goto Error;
+	if(kv.nv != Snapsz)
+		goto Error;
+
 	id = GBIT64(kv.v + 1);
-	h = ihash(id) % Nttab;
-	for(t = fs->ttab[h]; t != nil; t = t->hnext){
+	for(t = fs->osnap; t != nil; t = t->snext){
 		if(t->gen == id){
 			ainc(&t->memref);
 			qunlock(&fs->snaplk);
@@ -72,20 +73,26 @@
 		}
 	}
 	if((t = mallocz(sizeof(Tree), 1)) == nil){
-		fprint(2, "opensnap: %s\n", e);
+		fprint(2, "open %s: %s\n", name, e);
 		goto Error;
 	}
+	memset(&t->lk, 0, sizeof(t->lk));
+
 	memmove(dbuf, kv.v, kv.nv);
 	k.k = dbuf;
 	k.nk = kv.nv;
 	if((e = btlookup(&fs->snap, &k, &kv, buf, sizeof(buf))) != nil){
-		fprint(2, "opensnap: %s\n", e);
+		fprint(2, "open %s: %s\n", name, e);
 		goto Error;
 	}
 	if(unpacktree(t, kv.v, kv.nv) == nil)
 		goto Error;
+	t->memref = 1;
+	t->snext = fs->osnap;
+	fs->osnap = t;
 	qunlock(&fs->snaplk);
 	return t;
+
 Error:
 	qunlock(&fs->snaplk);
 	return nil;
@@ -95,21 +102,26 @@
 closesnap(Tree *t)
 {
 	Tree *te, **p;
-	u32int h;
+	int i;
 
 	if(adec(&t->memref) != 0)
 		return;
 
-	qlock(&fs->snaplk);
-	h = ihash(t->gen) % Nttab;
-	p = &fs->ttab[h];
-	for(te = fs->ttab[h]; te != nil; te = te->hnext){
+	for(i = Ndead-1; i >= 0; i--){
+		if(t->dead[i].tail == nil)
+			continue;
+		syncblk(t->dead[i].tail);
+//FIXME		putblk(t->dead[i].tail);
+	}
+
+	p = &fs->osnap;
+	for(te = fs->osnap; te != nil; te = te->snext){
 		if(te == t)
 			break;
-		p = &te->hnext;
+		p = &te->snext;
 	}
 	assert(te != nil);
-	*p = te->hnext;
+	*p = te->snext;
 	free(te);
 }
 
@@ -129,14 +141,13 @@
 	m[nm].op = del ? Ounrefsnap : Orefsnap;
 	m[nm].k = sbuf;
 	m[nm].nk = p - sbuf;
-
 	p = nbuf;
 	p[0] = Ksnap;		p += 1;
 	PBIT64(p, next);	p += 8;
 	m[nm].v = nbuf;
 	m[nm].nv = p - nbuf;
-
 	nm++;
+
 	if(name != nil){
 		p = dbuf;
 		n = strlen(name);
@@ -188,6 +199,7 @@
 void
 freedead(Bptr bp, void *)
 {
+	fprint(2, "reclaimed deadlist: %B\n", bp);
 	reclaimblk(bp);
 }
 
@@ -196,9 +208,14 @@
 {
 	Oplog dl;
 	int i;
+
 	assert(snap->gen != next->gen);
 	assert(next->prev[0] == snap->gen);
 
+//fprint(2, "next tree\n");
+//showtreeroot(2, next);
+//fprint(2, "snap tree\n");
+//showtreeroot(2, snap);
 	scandead(&next->dead[0], freedead, nil);
 	for(i = 0; i < Ndead-1; i++){
 		next->prev[i] = snap->prev[i];
@@ -208,58 +225,62 @@
 				return Efs;
 		next->dead[i] = dl;
 	}
+//fprint(2, "transferred\n");
+//showtreeroot(2, next);
 	return nil;
 }
 
-char*
-newsnap(Tree *t, vlong *genp, vlong *oldp)
+Tree*
+newsnap(Tree *t)
 {
 	char kbuf[Snapsz], vbuf[Treesz];
+	vlong gen;
 	char *p, *e;
-	uvlong gen, old;
+	Tree *r;
 	Msg m;
 	int i;
 
-	qlock(&fs->snaplk);
 	gen = inc64(&fs->nextgen, 1);
 	for(i = 0; i < Ndead; i++){
 		if(t->dead[i].tail != nil){
 			finalize(t->dead[i].tail);
 			syncblk(t->dead[i].tail);
-			putblk(t->dead[i].tail);
-			t->dead[i].tail = nil;
 		}
 	}
-
-	/* shift deadlist down */
-	if(t->dead[Ndead-1].tail != nil)
-		putblk(t->dead[Ndead-1].tail);
-	old = t->gen;
-	for(i = Ndead-1; i >= 0; i--){
-		t->prev[i] = i == 0 ? t->gen : t->prev[i-1];
-		t->dead[i].head.addr = -1;
-		t->dead[i].head.hash = -1;
-		t->dead[i].head.gen = -1;
-		t->dead[i].tail = nil;
-	}
-	t->gen = gen;
-
 	p = kbuf;
-	p[0] = Ksnap;	p += 1;
-	PBIT64(p, gen);	p += 8;
+	p[0] = Ksnap;		p += 1;
+	PBIT64(p, t->gen);	p += 8;
 	m.op = Oinsert;
 	m.k = kbuf;
 	m.nk = p - kbuf;
-	p = packtree(vbuf, sizeof(vbuf), t, 0);
+	p = packtree(vbuf, sizeof(vbuf), t);
 	m.v = vbuf;
 	m.nv = p - vbuf;
 	if((e = btupsert(&fs->snap, &m, 1)) != nil){
+		fprint(2, "error snapshotting: %s\n", e);
 		qunlock(&fs->snaplk);
-		return e;
+		return nil;
 	}
 
-	*genp = gen;
-	*oldp = old;
-	qunlock(&fs->snaplk);
-	return nil;
+	if((r = malloc(sizeof(Tree))) == nil)
+		return nil;
+
+	memset(&r->lk, 0, sizeof(r->lk));
+	r->snext = fs->osnap;
+	r->memref = 1;
+	r->ref = 0;
+	r->ht = t->ht;
+	r->bp = t->bp;
+	r->gen = gen;
+	/* shift deadlist down */
+	for(i = Ndead-1; i >= 0; i--){
+		r->prev[i] = i == 0 ? t->gen : t->prev[i-1];
+		r->dead[i].head.addr = -1;
+		r->dead[i].head.hash = -1;
+		r->dead[i].head.gen = -1;
+		r->dead[i].tail = nil;
+	}
+	fs->osnap = r;
+
+	return r;
 }
--- a/tree.c
+++ b/tree.c
@@ -314,32 +314,53 @@
 void
 statupdate(Kvp *kv, Msg *m)
 {
-	vlong v;
-	char *p;
-	int op;
+	int op, err;
+	char *p, *e;
+	Dir d;
 
 	p = m->v;
 	op = *p++;
+	kv2dir(kv, &d);
 	/* bump version */
-	v = GBIT32(kv->v+8);
-	PBIT32(kv->v+8, v+1);
-	if(op & Owmtime){
-		v = GBIT64(p);
-		p += 8;
-		PBIT32(kv->v+25, v);
-	}
+	d.qid.vers++;
 	if(op & Owsize){
-		v = GBIT64(p);
+		d.length = GBIT64(p);
 		p += 8;
-		PBIT64(kv->v+33, v);
 	}
 	if(op & Owmode){
-		v = GBIT32(p);
+		d.mode = GBIT32(p);
 		p += 4;
-		PBIT32(kv->v+33, v);
 	}
-	if(p != m->v + m->nv)
-		fprint(2, "malformed wstat message");
+	if(op & Owmtime){
+		d.mtime = GBIT64(p);
+		p += 8;
+	}
+	if(op & Owatime){
+		d.atime = GBIT64(p);
+		p += 8;
+	}
+	if(p != m->v + m->nv){
+		fprint(2, "kv=%P, m=%M\n", kv, m);
+		fprint(2, "malformed stat message (op=%x, len=%lld, sz=%d)\n", op, p - m->v, m->nv);
+		abort();
+	}
+	err = 0;
+	p = kv->v;
+	e = kv->v + kv->nv;
+	p = pack64(&err, p, e, d.qid.path);
+	p = pack32(&err, p, e, d.qid.vers);
+	p = pack8(&err, p, e, d.qid.type);
+	p = pack32(&err, p, e, d.mode);
+	p = pack64(&err, p, e, (vlong)d.atime*Nsec);
+	p = pack64(&err, p, e, (vlong)d.mtime*Nsec);
+	p = pack64(&err, p, e, d.length);
+	p = packstr(&err, p, e, d.uid);
+	p = packstr(&err, p, e, d.gid);
+	p = packstr(&err, p, e, d.muid);
+	if(err){
+		fprint(2, "wstat not fixed size(%llx != %d)\n", p - kv->v, kv->nv);
+		abort();
+	}
 }
 
 static char*
@@ -346,22 +367,28 @@
 dropsnap(Kvp *t, Msg *m)
 {
 	char *e, buf[Msgmax];
-	Tree snap, from;
+	Tree snap, tmp, *from;
+	vlong id;
 	Kvp kv;
 	Key k;
 
-	qlock(&fs->snaplk);
 	if(unpacktree(&snap, t->v, t->nv) == nil)
 		return Efs;
 	k.k = m->v;
 	k.nk = m->nv;
-	if((e = btlookup(&fs->snap, &k, &kv, buf, sizeof(buf))) != nil)
+	id = GBIT64(m->v+1);
+	for(from = fs->osnap; from != nil; from = from->snext)
+		if(from->gen == id)
+			break;
+	if(from == nil){
+		if((e = btlookup(&fs->snap, &k, &kv, buf, sizeof(buf))) != nil)
+			return e;
+		if(unpacktree(&tmp, kv.v, kv.nv) == nil)
+			return Efs;
+		from = &tmp;
+	}
+	if((e = freesnap(&snap, from)) != nil)
 		return e;
-	if(unpacktree(&from, kv.v, kv.nv) == nil)
-		return Efs;
-	if((e = freesnap(&snap, &from)) != nil)
-		return e;
-	qunlock(&fs->snaplk);
 	return nil;
 }
 
@@ -396,8 +423,9 @@
 		}
 		PBIT32(kv->v, refs);
 		return 1;
+	default:
+		abort();
 	}
-	abort();
 	return 0;
 }
 
@@ -1214,8 +1242,6 @@
 	unlock(&t->lk);
 	freepath(t, path, npath);
 	free(path);
-	if(!checkfs(2))
-		abort();
 	if(redo)
 		goto Again;
 	return 0;
@@ -1258,8 +1284,10 @@
 		if(blksearch(p[i-1], k, r, nil) == -1)
 			break;
 		bp = getptr(r, nil);
-		if((p[i] = getblk(bp, 0)) == nil)
-			return Efs;
+		if((p[i] = getblk(bp, 0)) == nil){
+			err = Efs;
+			goto Out;
+		}
 	}
 	if(p[h-1] != nil)
 		blksearch(p[h-1], k, r, &ok);
@@ -1282,9 +1310,11 @@
 	}
 	if(ok)
 		err = nil;
+Out:
 	for(i = 0; i < h; i++)
 		if(p[i] != nil)
 			putblk(p[i]);
+	free(p);
 	return err;
 }