shithub: gefs

Download patch

ref: c7bc6cca77498f87fa9230d9628b82c46c12dba5
parent: e808b613b7b70a5b23b25bef22ad0e7d5ab846dc
author: Ori Bernstein <ori@eigenstate.org>
date: Fri May 26 11:15:31 EDT 2023

fs: pull the superblock out of the arena header; duplicate the superblock and arena headers for better crash resilience

--- a/blk.c
+++ b/blk.c
@@ -102,13 +102,16 @@
 
 	switch(b->type){
 	default:
-		fprint(2, "invalid block @%llx\n", bp);
+		fprint(2, "invalid block type %d @%llx\n", b->type, bp);
 		abort();
 		break;
 	case Tdat:
-	case Tarena:
+	case Tsuper:	/* superblock: packsb/unpacksb work on buf directly */
 		b->data = b->buf;
 		break;
+	case Tarena:
+		b->data = b->buf + 2;	/* arena fields start 2 bytes in; presumably past the block type tag */
+		break;
 	case Tdlist:
 		b->deadsz = UNPACK16(b->buf+2);
 		b->deadp = unpackbp(b->buf+4, Blksz-4);
@@ -285,6 +288,8 @@
 	assert(off % Blksz == 0);
 	assert(op == LogAlloc || op == LogFree);
 	assert(lb == nil || lb->type == Tlog);
+	assert(off >= a->hd->bp.addr + Blksz);	/* never log the arena header block itself */
+	assert(off < a->tl->bp.addr);		/* and stay below the arena footer block */
 	dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, (op == LogAlloc) ? "Alloc" : "Free");
 	/*
 	 * move to the next block when we have
@@ -354,7 +359,7 @@
 	bh = UNPACK64(b->data);
 	/* the hash covers the log and offset */
 	if(bh != bufhash(b->data+Loghashsz, Logspc-Loghashsz)){
-		werrstr("corrupt log");
+		werrstr("corrupt log block %B [%llx]", bp, blkhash(b));
 		return -1;
 	}
 	for(i = Loghashsz; i < Logspc; i += n){
@@ -785,6 +790,8 @@
 
 	switch(b->type){
 	default:
+		abort();
+		break;
 	case Tpivot:
 		PACK16(b->buf+2, b->nval);
 		PACK16(b->buf+4, b->valsz);
@@ -806,6 +813,7 @@
 	case Tdat:
 	case Tmagic:
 	case Tarena:
+	case Tsuper:
 		break;
 	}
 
@@ -829,6 +837,7 @@
 	}
 	if((b = readblk(bp.addr, flg)) == nil){
 		qunlock(&fs->blklk[i]);
+		abort();
 		return nil;
 	}
 	b->alloced = getcallerpc(&bp);
@@ -836,6 +845,7 @@
 	if((flg&GBnochk) == 0 && h != bp.hash){
 		fprint(2, "corrupt block %p %B: %.16llux != %.16llux\n", b, bp, h, bp.hash);
 		qunlock(&fs->blklk[i]);
+		werrstr("corrupt block %B", bp);	/* report, don't abort: loadarena falls back to the other arena copy on nil */
 		return nil;
 	}
 	b->bp.hash = h;
@@ -851,8 +861,6 @@
 holdblk(Blk *b)
 {
 	ainc(&b->ref);
-	b->lasthold1 = b->lasthold0;
-	b->lasthold0 = b->lasthold;
 	b->lasthold = getcallerpc(&b);
 	return b;
 }
@@ -863,8 +871,6 @@
 	assert(b == nil || b->ref > 0);
 	if(b == nil || adec(&b->ref) != 0)
 		return;
-	b->lastdrop1 = b->lastdrop0;
-	b->lastdrop0 = b->lastdrop;
 	b->lastdrop = getcallerpc(&b);
 	/*
 	 * While a freed block can get resurrected
@@ -990,6 +996,7 @@
 	Arena *a;
 
 	b->qgen = agetv(&fs->qgen);
+	b->enqueued = getcallerpc(&b);	/* debug aid: pc of the caller that queued this block */
 	a = getarena(b->bp.addr);
 	assert(checkflag(b, Bdirty));
 	assert(b->bp.addr >= 0);
@@ -1115,6 +1122,7 @@
 	}
 	while(fs->syncing != 0)
 		rsleep(&fs->syncrz);
+	/* pass 0: sync arena contents */
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
 		lock(a);
@@ -1125,14 +1133,49 @@
 			a->deferhd = (Bptr){-1, -1, -1};
 			a->defertl = nil;
 		}
-		packarena(a->b->data, Blksz, a, fs);
-		finalize(a->b);
 		finalize(a->logtl);
 		if(syncblk(a->logtl) == -1)
 			sysfatal("sync arena: %r");
-		if(syncblk(a->b) == -1)
+		unlock(a);
+	}
+	/*
+	 * pass 1: sync arena headers; if we crash here,
+	 *  the arena footers are still consistent, and
+	 *  we can use them.
+	 */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		lock(a);
+		packarena(a->hd->data, Blksz, a);
+		finalize(a->hd);
+		if(syncblk(a->hd) == -1)
 			sysfatal("sync arena: %r");
 		unlock(a);
 	}
+	/*
+	 * pass 2: sync arena footers; if we crash here,
+	 *  the arena headers are already consistent, and
+	 *  we can use them.
+	 */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		lock(a);
+		packarena(a->tl->data, Blksz, a);
+		finalize(a->tl);
+		if(syncblk(a->tl) == -1)
+			sysfatal("sync arena: %r");
+		unlock(a);
+	}
+	for(i = 0; i < fs->narena; i++)
+		fs->arenabp[i] = fs->arenas[i].hd->bp;	/* the superblock points at each arena header */
+	/* finally, write the primary and the backup superblock */
+	packsb(fs->sb0->buf, Blksz, fs);
+	packsb(fs->sb1->buf, Blksz, fs);
+	finalize(fs->sb0);
+	finalize(fs->sb1);
+	if(syncblk(fs->sb0) == -1)
+		sysfatal("sync sb: %r");
+	if(syncblk(fs->sb1) == -1)
+		sysfatal("sync sb: %r");
 	qunlock(&fs->synclk);
 }
--- a/dat.h
+++ b/dat.h
@@ -67,19 +67,20 @@
 	Kvmax	= Keymax + Inlmax,	/* Key and value */
 	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
 	Wstatmax = 4+8+8+8,		/* mode, size, atime, mtime */
+	Arenasz	= 8+8+8+8,		/* loghd, loghash, size, used */
 	
 	Pivhdsz		= 10,
 	Leafhdsz	= 6,
 	Loghdsz		= 2,
 	Loghashsz	= 8,
-	Dlhdsz		= 2+2+Ptrsz,		/* type, size, chain */
+	Dlhdsz		= 2+2+Ptrsz,			/* type, size, chain */
 	Dlspc		= Blksz - Dlhdsz,
-	Rootsz		= 4+Ptrsz,		/* root pointer */
+	Rootsz		= 4+Ptrsz,			/* root pointer */
 	Pivsz		= Blksz - Pivhdsz,
-	Bufspc		= (Blksz - Pivhdsz)/2,	/* pivot room */
+	Bufspc		= (Blksz - Pivhdsz)/2,		/* pivot room */
 	Pivspc		= Blksz - Pivhdsz - Bufspc,
 	Logspc		= Blksz - Loghdsz,
-	Logslop		= 16+16+8,	/* val, nextb, chain */
+	Logslop		= 16+16+8,			/* val, nextb, chain */
 	Leafspc 	= Blksz - Leafhdsz,
 	Msgmax  	= 1 + (Kvmax > Kpmax ? Kvmax : Kpmax)
 };
@@ -244,7 +245,8 @@
 	Tlog,
 	Tdlist,
 	Tmagic,
-	Tarena = 0x6765,	/* 'ge' bigendian */
+	Tarena,			/* arena header/footer block */
+	Tsuper = 0x6765,	/* 'ge' bigendian: the leading bytes of the "gefs0005" superblock magic */
 };
 
 enum {
@@ -438,6 +440,7 @@
 	vlong	arenasz;
 	vlong	nextqid;
 	vlong	nextgen;
+	Bptr	*arenabp;
 };
 
 /*
@@ -446,6 +449,9 @@
  */
 struct Gefs {
 	Fshdr;
+	/* superblocks */
+	Blk	*sb0;	/* primary */
+	Blk	*sb1;	/* backup */
 	/* arena allocation */
 	Arena	*arenas;
 	long	roundrobin;
@@ -518,7 +524,8 @@
 	Avltree *free;
 	Blk	**queue;
 	int	nqueue;
-	Blk	*b;			/* arena block */
+	Blk	*hd;			/* arena header */
+	Blk	*tl;			/* arena footer */
 	Blk	**q;			/* write queue */
 	vlong	nq;
 	vlong	size;
@@ -666,13 +673,8 @@
 
 	/* debug */
 	uintptr lasthold;
-	uintptr lasthold0;
-	uintptr lasthold1;
-
 	uintptr lastdrop;
-	uintptr lastdrop0;
-	uintptr lastdrop1;
-
+	uintptr	enqueued;
 	uintptr cached;
 	uintptr uncached;
 	uintptr	alloced;
--- a/fns.h
+++ b/fns.h
@@ -72,7 +72,7 @@
 void	closesnap(Tree*);
 void	reamfs(char*);
 void	growfs(char*);
-int	loadarena(Arena*, Fshdr *fi, vlong);
+int	loadarena(Arena*, Bptr, vlong);
 void	loadfs(char*);
 void	sync(void);
 int	loadlog(Arena*, Bptr);
@@ -134,36 +134,41 @@
 char*	pack64(int*, char*, char*, uvlong);
 char*	packstr(int*, char*, char*, char*);
 
-/* void* is a bit hacky, but we want both signed and unsigned to work */
-char*	unpack8(int*, char*, char*, void*);
-char*	unpack16(int*, char*, char*, void*);
-char*	unpack32(int*, char*, char*, void*);
-char*	unpack64(int*, char*, char*, void*);
-char*	unpackstr(int*, char*, char*, char**);
 int	dir2kv(vlong, Xdir*, Kvp*, char*, int);
-int	kv2statbuf(Kvp*, char*, int);
 int	dir2statbuf(Xdir*, char*, int);
-int	kv2dir(Kvp*, Xdir*);
-void	kv2qid(Kvp*, Qid*);
-void	kv2dlist(Kvp*, Dlist*);
 void	dlist2kv(Dlist*, Kvp*, char*, int);
-void	tree2kv(Tree*, Kvp*, char*, int);
 void	lbl2kv(char*, vlong, Kvp*, char*, int);
 void	link2kv(vlong, vlong, Kvp*, char*, int);
+void	tree2kv(Tree*, Kvp*, char*, int);
+
+int	kv2dir(Kvp*, Xdir*);
+void	kv2dlist(Kvp*, Dlist*);
 void	kv2link(Kvp*, vlong*, vlong*);
+void	kv2qid(Kvp*, Qid*);
+int	kv2statbuf(Kvp*, char*, int);
 
+char*	packarena(char*, int, Arena*);
 char*	packbp(char*, int, Bptr*);
-Bptr	unpackbp(char*, int);
-char*	packtree(char*, int, Tree*);
-Tree*	unpacktree(Tree*, char*, int);
 char*	packdkey(char*, int, vlong, char*);
-char*	unpackdkey(char*, int, vlong*);
 char*	packdval(char*, int, Xdir*);
-char*	packsnap(char*, int, vlong);
 char*	packlabel(char*, int, char*);
+char*	packsnap(char*, int, vlong);
 char*	packsuper(char*, int, vlong);
-char*	packarena(char*, int, Arena*, Fshdr*);
-char*	unpackarena(Arena*, Fshdr*, char*, int);
+char*	packtree(char*, int, Tree*);
+char*	packsb(char*, int, Fshdr*);
+
+char*	unpackarena(Arena*, char*, int);
+Bptr	unpackbp(char*, int);
+char*	unpackdkey(char*, int, vlong*);
+Tree*	unpacktree(Tree*, char*, int);
+char*	unpacksb(Fshdr*, char*, int);
+
+/* void* is a bit hacky, but we want both signed and unsigned to work */
+char*	unpack8(int*, char*, char*, void*);
+char*	unpack16(int*, char*, char*, void*);
+char*	unpack32(int*, char*, char*, void*);
+char*	unpack64(int*, char*, char*, void*);
+char*	unpackstr(int*, char*, char*, char**);
 
 /* fmt */
 int	Bconv(Fmt*);
--- a/load.c
+++ b/load.c
@@ -16,34 +16,37 @@
 	return 0;
 }
 
-static void
-mergeinfo(Gefs *fs, Fshdr *fi)
-{
-	if(fi->blksz != Blksz || fi->bufspc != Bufspc)
-		sysfatal("parameter mismatch");
-	if(fs->gotinfo && fs->narena != fi->narena)
-		sysfatal("arena count mismatch");
-	if(fs->gotinfo && fi->nextgen != fs->nextgen)
-		fprint(2, "not all arenas synced: rolling back\n");
-	fs->Fshdr = *fi;
-}
-
 int
-loadarena(Arena *a, Fshdr *fi, vlong o)
+loadarena(Arena *a, Bptr hdbp, vlong asz)
 {
-	Blk *b;
+	Blk *hd, *tl, *b;
 	Bptr bp;
 
-	bp.addr = o;
-	bp.hash = -1;
-	bp.gen = -1;
-	if((b = getblk(bp, GBnochk)) == nil)
+	/* load the header, and the footer asz bytes after it, with hash checking on */
+	bp = hdbp;
+	hd = getblk(bp, 0);
+	bp.addr += asz;
+	tl = getblk(bp, 0);	/* the footer is checked against the header's hash */
+
+	/* if neither head nor tail is consistent, we're hosed */
+	b = (hd != nil) ? hd : tl;	/* prefer the header when both verify */
+	if(b == nil)
 		return -1;
-	if(unpackarena(a, fi, b->data, Blksz) == nil)
+
+	/* one copy verified, so we may have crashed mid-sync: re-read the other unchecked */
+	bp = hdbp;
+	if(hd == nil && (hd = getblk(bp, GBnochk)) == nil)
 		return -1;
+	bp.addr += asz;
+	if(tl == nil && (tl = getblk(bp, GBnochk)) == nil)
+		return -1;
+
+	if(unpackarena(a, b->data, Arenasz) == nil)	/* decode from the verified copy */
+		return -1;
 	if((a->free = avlcreate(rangecmp)) == nil)
 		return -1;
-	a->b = b;
+	a->hd = hd;
+	a->tl = tl;
 	return 0;
 }
 
@@ -51,8 +54,8 @@
 loadfs(char *dev)
 {
 	Mount *mnt;
-	Fshdr fi;
 	Arena *a;
+	Bptr bp;
 	char *e;
 	Tree *t;
 	int i, k;
@@ -71,16 +74,25 @@
 	fs->narena = 1;
 	if((fs->fd = open(dev, ORDWR)) == -1)
 		sysfatal("open %s: %r", dev);
-	if((fs->arenas = calloc(1, sizeof(Arena))) == nil)
+	bp = (Bptr){0, -1, -1};
+	if((fs->sb0 = getblk(bp, GBnochk)) == nil)
+		sysfatal("superblock: %r");
+	bp = (Bptr){512*MiB, -1, -1};
+	if((fs->sb1 = getblk(bp, GBnochk)) == nil)
+		sysfatal("superblock: %r");
+	/* decode the primary superblock; fall back to the backup if it is unusable */
+	if(unpacksb(fs, fs->sb0->buf, Blksz) == nil
+	&& unpacksb(fs, fs->sb1->buf, Blksz) == nil)
+		sysfatal("superblock: %r");
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
 		sysfatal("malloc: %r");
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
-		if((loadarena(a, &fi, i*fs->arenasz)) == -1)
+		if((loadarena(a, fs->arenabp[i], fs->arenasz)) == -1)
 			sysfatal("loadfs: %r");
 		a->reserve = a->size / 1024;
 		if(a->reserve < 32*MiB)
 			a->reserve = 32*MiB;
-		mergeinfo(fs, &fi);
 		if(!fs->gotinfo){
 			if((fs->arenas = realloc(fs->arenas, fs->narena*sizeof(Arena))) == nil)
 				sysfatal("malloc: %r");
@@ -92,9 +104,7 @@
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
 		if(loadlog(a, a->loghd) == -1)
-			sysfatal("load log: %r");
-		if(compresslog(a) == -1)
-			sysfatal("compress log: %r");
+			sysfatal("load log %B: %r", a->loghd);
 	}
 
 	fprint(2, "load %s:\n", dev);
--- a/main.c
+++ b/main.c
@@ -221,7 +221,7 @@
 	loadfs(dev);
 	fs->wrchan = mkchan(32);
 	fs->nsyncers = nproc/2;
-	fs->nreaders = 2;
+	fs->nreaders = 1;
 	if(fs->nsyncers > fs->narena)
 		fs->nsyncers = fs->narena;
 	for(i = 0; i < fs->nsyncers; i++)
--- a/pack.c
+++ b/pack.c
@@ -521,10 +521,51 @@
 }
 
 char*
-packarena(char *p, int sz, Arena *a, Fshdr *fi)
+packarena(char *p, int sz, Arena *a)
 {
+	char *e;
+
+	assert(sz >= Arenasz);
+	e = p + Arenasz;
+	PACK64(p, a->loghd.addr);	p += 8;	/* freelist addr */
+	PACK64(p, a->loghd.hash);	p += 8;	/* freelist hash */
+	PACK64(p, a->size);		p += 8;	/* arena size */
+	PACK64(p, a->used);		p += 8;	/* arena used */
+	assert(p <= e);
+	return p;
+}
+
+char*
+unpackarena(Arena *a, char *p, int sz)
+{
+	char *e;
+
+	assert(sz >= Arenasz);
+	memset(a, 0, sizeof(*a));
+
+	e = p + Arenasz;
+	a->loghd.addr = UNPACK64(p);	p += 8;
+	a->loghd.hash = UNPACK64(p);	p += 8;
+	a->loghd.gen = -1;		p += 0;
+	a->size = UNPACK64(p);		p += 8;
+	a->used = UNPACK64(p);		p += 8;
+	a->logtl = nil;
+
+	a->deferhd.addr = -1;
+	a->deferhd.hash = -1;
+	a->deferhd.gen = -1;
+	a->defertl = nil;
+	assert(p <= e);
+	return p;
+}
+
+char*
+packsb(char *p, int sz, Fshdr *fi)
+{
+	int i;
+
 	assert(sz == Blksz);
-	memcpy(p, "gefs0004", 8);	p += 8;
+	memcpy(p, "gefs0005", 8);	p += 8;
 	PACK32(p, Blksz);		p += 4;
 	PACK32(p, Bufspc);		p += 4;
 	PACK32(p, fi->snap.ht);		p += 4;
@@ -534,44 +575,58 @@
 	PACK64(p, fi->arenasz);		p += 8;
 	PACK64(p, fi->nextqid);		p += 8;
 	PACK64(p, fi->nextgen);		p += 8;
-	PACK64(p, a->loghd.addr);	p += 8;	/* freelist addr */
-	PACK64(p, a->loghd.hash);	p += 8;	/* freelist hash */
-	PACK64(p, a->size);		p += 8;	/* arena size */
-	PACK64(p, a->used);		p += 8;	/* arena used */
+	for(i = 0; i < fi->narena; i++){
+		PACK64(p, fi->arenabp[i].addr);	p += 8;
+		PACK64(p, fi->arenabp[i].hash);	p += 8;
+	}
 	return p;
 }
 
+/*
+ * Decode the superblock image at p (sz must be Blksz) into fi and
+ * allocate fi->arenabp with one header pointer per arena.  Returns
+ * the position after the last field, or nil when the magic, the
+ * compiled-in parameters or the arena count are bad, or malloc fails.
+ */
 char*
-unpackarena(Arena *a, Fshdr *fi, char *p, int sz)
+unpacksb(Fshdr *fi, char *p, int sz)
 {
+	char *e;
+	int i;
+
 	assert(sz == Blksz);
-	memset(a, 0, sizeof(*a));
-	memset(fi, 0, sizeof(*fi));
-	if(memcmp(p, "gefs0004", 8) != 0){
+	e = p + sz;
+	if(memcmp(p, "gefs0005", 8) != 0){
 		werrstr("wrong block header %.8s\n", p);
 		return nil;
 	}
 	p += 8;
-	fi->blksz = UNPACK32(p);		p += 4;
-	fi->bufspc = UNPACK32(p);		p += 4;
+	fi->blksz = UNPACK32(p);	p += 4;
+	fi->bufspc = UNPACK32(p);	p += 4;
 	fi->snap.ht = UNPACK32(p);	p += 4;
 	fi->snap.bp.addr = UNPACK64(p);	p += 8;
 	fi->snap.bp.hash = UNPACK64(p);	p += 8;
 	fi->snap.bp.gen = -1;		p += 0;
-	fi->narena = UNPACK32(p);		p += 4;
+	fi->narena = UNPACK32(p);	p += 4;
 	fi->arenasz = UNPACK64(p);	p += 8;
 	fi->nextqid = UNPACK64(p);	p += 8;
 	fi->nextgen = UNPACK64(p);	p += 8;
-	a->loghd.addr = UNPACK64(p);	p += 8;
-	a->loghd.hash = UNPACK64(p);	p += 8;
-	a->loghd.gen = -1;		p += 0;
-	a->size = UNPACK64(p);		p += 8;
-	a->used = UNPACK64(p);		p += 8;
-	a->logtl = nil;
-
-	a->deferhd.addr = -1;
-	a->deferhd.hash = -1;
-	a->deferhd.gen = -1;
-	a->defertl = nil;
+	/* Blksz and Bufspc are compile-time constants; reject images built with others */
+	if(fi->blksz != Blksz || fi->bufspc != Bufspc){
+		werrstr("parameter mismatch");
+		return nil;
+	}
+	/* narena is read from disk: the pointer table must fit in this block */
+	if(fi->narena <= 0 || fi->narena > (e - p)/(8+8)){
+		werrstr("bad arena count");
+		return nil;
+	}
+	if((fi->arenabp = malloc(fi->narena * sizeof(Bptr))) == nil)
+		return nil;
+	for(i = 0; i < fi->narena; i++){
+		fi->arenabp[i].addr = UNPACK64(p);	p += 8;
+		fi->arenabp[i].hash = UNPACK64(p);	p += 8;
+		fi->arenabp[i].gen = -1;
+	}
 	return p;
 }
--- a/ream.c
+++ b/ream.c
@@ -158,14 +158,16 @@
 }
 
 static void
-initarena(Arena *a, Fshdr *fi, vlong start, vlong asz)
+initarena(Arena *a, vlong start, vlong asz)
 {
 	vlong addr, bo, bh;
 	char *p;
-	Blk *b;
+	Blk *b, *hd, *tl;
 
 	b = cachepluck();
-	addr = start+Blksz;	/* arena loghder */
+	if(start == 512*MiB)
+		start += Blksz;
+	addr = start+Blksz;	/* leave room for arena hdr */
 
 	a->loghd.addr = -1;
 	a->loghd.hash = -1;
@@ -174,7 +176,6 @@
 	memset(b->buf, 0, sizeof(b->buf));
 	b->type = Tlog;
 	b->bp.addr = addr;
-	b->logsz = 32;
 	b->data = b->buf + Loghdsz;
 	setflag(b, Bdirty);
 
@@ -183,7 +184,13 @@
 	PACK64(p, asz-Blksz);		p += 8;	/* len */
 	PACK64(p, b->bp.addr|LogAlloc);	p += 8;	/* addr */
 	PACK64(p, Blksz);		p += 8;	/* len */
+	/* backup sb */
+	if(start <= 512*MiB && start+asz > 512*MiB){
+		PACK64(p, (512*MiB)|LogAlloc1);
+		p += 8;
+	}
 	PACK64(p, (uvlong)LogEnd);	/* done */
+	b->logsz = p - b->data;
 	finalize(b);
 	if(syncblk(b) == -1)
 		sysfatal("ream: init log");
@@ -192,11 +199,6 @@
 	bh = b->bp.hash;
 	bo = b->bp.addr;
 
-	b = cachepluck();
-	memset(b->buf, 0, sizeof(b->buf));
-	b->type = Tarena;
-	b->bp.addr = start;
-	b->data = b->buf;
 	a->loghd.addr = bo;
 	a->loghd.hash = bh;
 	a->loghd.gen = -1;
@@ -203,17 +205,38 @@
 	a->size = asz;
 	a->used = Blksz;
 	a->logtl = nil;
-	packarena(b->data, Blksz, a, fi);
-	finalize(b);
-	if(syncblk(b) == -1)
+
+	hd = cachepluck();
+	tl = cachepluck();
+
+	memset(hd->buf, 0, sizeof(hd->buf));
+	hd->type = Tarena;
+	hd->bp.addr = start;
+	hd->data = hd->buf+2;
+	finalize(hd);
+
+	memset(tl->buf, 0, sizeof(tl->buf));
+	tl->type = Tarena;
+	tl->bp.addr = start+asz;
+	tl->data = tl->buf+2;
+	finalize(tl);
+
+	packarena(hd->data, Arenasz, a);
+	packarena(tl->data, Arenasz, a);
+	finalize(hd);
+	finalize(tl);
+	if(syncblk(hd) == -1)
 		sysfatal("ream: write arena: %r");
-	dropblk(b);
+	if(syncblk(tl) == -1)
+		sysfatal("ream: write arena: %r");
+	a->hd = hd;
+	a->tl = tl;
 }
 
 void
 reamfs(char *dev)
 {
-	Blk *sb, *mb, *ab, *ub;
+	Blk *sb0, *sb1, *tb, *mb, *ab, *ub;
 	vlong sz, asz, off;
 	Mount *mnt, *adm;
 	Arena *a;
@@ -226,18 +249,20 @@
 		sysfatal("ream: %r");
 	sz = d->length;
 	free(d);
-	if(sz < 512*MiB)
+
+	if(sz < 512*MiB+Blksz)
 		sysfatal("ream: disk too small");
 	if((mnt = mallocz(sizeof(Mount), 1)) == nil)
 		sysfatal("ream: alloc mount: %r");
 	if((mnt->root = mallocz(sizeof(Tree), 1)) == nil)
 		sysfatal("ream: alloc tree: %r");
-
 	if((adm = mallocz(sizeof(Mount), 1)) == nil)
 		sysfatal("ream: alloc mount: %r");
 	if((adm->root = mallocz(sizeof(Tree), 1)) == nil)
 		sysfatal("ream: alloc tree: %r");
 
+	sz = sz - sz%Blksz - 2*Blksz;
+
 	fs->narena = (sz + 64ULL*GiB - 1) / (64ULL*GiB);
 	if(fs->narena < 8)
 		fs->narena = 8;
@@ -246,19 +271,29 @@
 	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
 		sysfatal("malloc: %r");
 
+	off = Blksz;	/* block 0 holds the primary superblock */
 	asz = sz/fs->narena;
 	asz = asz - (asz % Blksz) - Blksz;
 	fs->arenasz = asz;
-	off = 0;
+
+	sb0 = cachepluck();
+	sb1 = cachepluck();
+	sb0->bp = (Bptr){0, -1, -1};
+	sb1->bp = (Bptr){512*MiB, -1, -1};
+
+	if((fs->arenabp = malloc(fs->narena * sizeof(Bptr))) == nil)
+		sysfatal("malloc: %r");
 	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
 		print("\tarena %d: %lld blocks at %llx\n", i, asz/Blksz, off);
-		initarena(&fs->arenas[i], fs, off, asz);
-		off += asz;
+		initarena(a, off, asz);
+		fs->arenabp[i] = a->hd->bp;
+		off += asz+Blksz;
 	}
 	
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
-		if((loadarena(a, fs, i*asz)) == -1)
+		if((loadarena(a, a->hd->bp, asz)) == -1)
 			sysfatal("ream: loadarena: %r");
 		if(loadlog(a, a->loghd) == -1)
 			sysfatal("load log: %r");
@@ -265,6 +300,7 @@
 		if(compresslog(a) == -1)
 			sysfatal("compress log: %r");
 	}
+
 	if((mb = newblk(mnt->root, Tleaf)) == nil)
 		sysfatal("ream: allocate root: %r");
 	holdblk(mb);
@@ -303,32 +339,65 @@
 	 * a single snap block that the tree will insert
 	 * into, and take a snapshot as the initial state.
 	 */
-	if((sb = newblk(mnt->root, Tleaf)) == nil)
+	if((tb = newblk(mnt->root, Tleaf)) == nil)
 		sysfatal("ream: allocate snaps: %r");
-	holdblk(sb);
-	initsnap(sb, mb, ab);
-	finalize(sb);
-	syncblk(sb);
+	holdblk(tb);
+	initsnap(tb, mb, ab);
+	finalize(tb);
+	syncblk(tb);
 
-	fs->snap.bp = sb->bp;
+	fs->snap.bp = tb->bp;
 	fs->snap.ht = 1;
+	fs->nextqid = Nreamqid;
 
 	dropblk(mb);
 	dropblk(ab);
 	dropblk(ub);
-	dropblk(sb);
+	dropblk(tb);
 	fs->nextqid = Nreamqid;
 
+	/*
+	 * We need to write back all of the arenas
+	 * with the updated free lists
+	 */
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
 		finalize(a->logtl);
 		if(syncblk(a->logtl) == -1)
 			sysfatal("sync arena: %r");
-		packarena(a->b->data, Blksz, a, fs);
-		finalize(a->b);
-		if(syncblk(a->b) == -1)
+		packarena(a->hd->data, Blksz, a);
+		finalize(a->hd);
+		if(syncblk(a->hd) == -1)
 			sysfatal("sync arena: %r");
+		packarena(a->tl->data, Blksz, a);
+		finalize(a->tl);
+		if(syncblk(a->tl) == -1)
+			sysfatal("sync arena: %r");
+		fs->arenabp[i] = a->hd->bp;
+		dropblk(a->hd);
+		dropblk(a->tl);
 	}
+
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+
+	/*
+	 * Finally, write back the superblock and backup
+	 * superblock.
+	 */
+	packsb(sb0->buf, Blksz, fs);
+	packsb(sb1->buf, Blksz, fs);
+	finalize(sb0);
+	finalize(sb1);
+	if(syncblk(sb0) == -1)
+		sysfatal("sync superblock: %r");
+	if(syncblk(sb1) == -1)
+		sysfatal("sync superblock: %r");
+	dropblk(sb0);
+	dropblk(sb1);
+
 	free(mnt);
 }
 
@@ -337,8 +406,8 @@
 {
 	vlong sz, off;
 	int i, narena;
+	Bptr bp;
 	Arena *a;
-	Fshdr fi;
 	Dir *d;
 
 	if((fs->fd = open(dev, ORDWR)) == -1)
@@ -348,19 +417,22 @@
 	sz = d->length;
 	free(d);
 
-	if((fs->arenas = calloc(1, sizeof(Arena))) == nil)
+	bp = (Bptr){0, -1, -1};
+	if((fs->sb0 = getblk(bp, GBnochk)) == nil)
+		sysfatal("superblock: %r");
+	/* the backup is rewritten at the end, so it has to be loaded too */
+	bp = (Bptr){512*MiB, -1, -1};
+	if((fs->sb1 = getblk(bp, GBnochk)) == nil)
+		sysfatal("superblock: %r");
+	if(unpacksb(fs, fs->sb0->buf, Blksz) == nil)
+		sysfatal("superblock: %r");
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
 		sysfatal("malloc: %r");
-	fs->narena = 1;
 	for(i = 0; i < fs->narena; i++){
-	Arena *a;
 		a = &fs->arenas[i];
-		if((loadarena(a, &fi, i*fs->arenasz)) == -1)
+		if((loadarena(a, fs->arenabp[i], fs->arenasz)) == -1)
 			sysfatal("growfs: %r");
-		if(fs->narena == 1){
-			fs->Fshdr = fi;
-			if((fs->arenas = realloc(fs->arenas, fs->narena*sizeof(Arena))) == nil)
-				sysfatal("malloc: %r");
-		}
 	}
-	narena = sz/fs->arenasz;
-	off = fs->arenasz * fs->narena;
+	/* reamfs layout: one superblock block, then arenasz+Blksz (header..footer) per arena */
+	narena = (sz - sz%Blksz - 2*Blksz)/(fs->arenasz + Blksz);
+	off = Blksz + (fs->arenasz + Blksz)*fs->narena;
@@ -368,21 +440,35 @@
 		sysfatal("disk too small for more arenas");
 	if((fs->arenas = realloc(fs->arenas, narena*sizeof(Arena))) == nil)
 		sysfatal("malloc: %r");
+	if((fs->arenabp = realloc(fs->arenabp, narena*sizeof(Bptr))) == nil)
+		sysfatal("malloc: %r");
 	for(i = fs->narena; i < narena; i++){
 		a = &fs->arenas[i];
 		print("\tadding %d: %lld blocks at %llx\n", i, fs->arenasz/Blksz, off);
-		initarena(&fs->arenas[i], fs, off, fs->arenasz);
-		if((loadarena(a, &fi, i*fs->arenasz)) == -1)
+		initarena(&fs->arenas[i], off, fs->arenasz);
+		fs->arenabp[i] = a->hd->bp;	/* set before loadarena reads it; initarena left the header in a->hd */
+		if((loadarena(a, fs->arenabp[i], fs->arenasz)) == -1)
 			sysfatal("growfs: %r");
-		off += fs->arenasz;
+		off += fs->arenasz + Blksz;
 	}
-
 	fs->narena = narena;
 	for(i = 0; i < narena; i++){
 		a = &fs->arenas[i];
-		packarena(a->b->data, Blksz, a, fs);
-		finalize(a->b);
-		if(syncblk(a->b) == -1)
+		packarena(a->hd->data, Blksz, a);
+		packarena(a->tl->data, Blksz, a);
+		finalize(a->hd);
+		finalize(a->tl);
+		if(syncblk(a->hd) == -1)
 			sysfatal("sync arena: %r");
+		if(syncblk(a->tl) == -1)
+			sysfatal("sync arena: %r");
 	}
+	packsb(fs->sb0->buf, Blksz, fs);
+	packsb(fs->sb1->buf, Blksz, fs);
+	finalize(fs->sb0);
+	finalize(fs->sb1);
+	if(syncblk(fs->sb0) == -1)
+		sysfatal("sync superblock: %r");
+	if(syncblk(fs->sb1) == -1)
+		sysfatal("sync superblock: %r");
 }