shithub: gefs

Download patch

ref: c36ed06ba0fd376ca0e346d0639d8cbf18c1cb45
parent: 3b8f163352bfcfec4e1445fd4a2fb55d6d8fec8e
author: Ori Bernstein <ori@eigenstate.org>
date: Sun May 28 17:12:45 EDT 2023

blk: implement sync barriers

stop log replay at last fs sync

--- a/blk.c
+++ b/blk.c
@@ -65,8 +65,9 @@
 static Blk*
 readblk(vlong bp, int flg)
 {
-	Blk *b;
 	vlong off, rem, n;
+	char *p;
+	Blk *b;
 
 	assert(bp != -1);
 	if((b = cachepluck()) == nil)
@@ -88,7 +89,6 @@
 	b->hnext = nil;
 	b->flag = 0;
 
-	b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);
 	b->bp.addr = bp;
 	b->bp.hash = -1;
 	b->bp.gen = -1;
@@ -100,6 +100,8 @@
 	b->bufsz = 0;
 	b->logsz = 0;
 
+	p = b->buf + 2;
+	b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);
 	switch(b->type){
 	default:
 		fprint(2, "invalid block type %d @%llx\n", b->type, bp);
@@ -110,27 +112,33 @@
 		b->data = b->buf;
 		break;
 	case Tarena:
-		b->data = b->buf + 2;
+		b->data = p;
 		break;
 	case Tdlist:
-		b->deadsz = UNPACK16(b->buf+2);
-		b->deadp = unpackbp(b->buf+4, Blksz-4);
-		b->data = b->buf + Dlhdsz;
+		b->deadsz = UNPACK16(p);	p += 2;
+		b->deadp = unpackbp(p, Ptrsz);	p += Ptrsz;
+		assert(p - b->buf == Dlhdsz);
+		b->data = p;
 		break;
 	case Tlog:
-		b->data = b->buf + Loghdsz;
+		b->logsz = UNPACK16(p);		p += 2;
+		b->loghash = UNPACK64(p);	p += 8;
+		assert(p - b->buf == Loghdsz);
+		b->data = p;
 		break;
 	case Tpivot:
-		b->data = b->buf + Pivhdsz;
-		b->nval = UNPACK16(b->buf+2);
-		b->valsz = UNPACK16(b->buf+4);
-		b->nbuf = UNPACK16(b->buf+6);
-		b->bufsz = UNPACK16(b->buf+8);
+		b->nval = UNPACK16(p);		p += 2;
+		b->valsz = UNPACK16(p);		p += 2;
+		b->nbuf = UNPACK16(p);		p += 2;
+		b->bufsz = UNPACK16(p);		p += 2;
+		assert(p - b->buf == Pivhdsz);
+		b->data = p;
 		break;
 	case Tleaf:
-		b->data = b->buf + Leafhdsz;
-		b->nval = UNPACK16(b->buf+2);
-		b->valsz = UNPACK16(b->buf+4);
+		b->nval = UNPACK16(p);		p += 2;
+		b->valsz = UNPACK16(p);		p += 2;
+		assert(p - b->buf == Leafhdsz);
+		b->data = p;
 		break;
 	}
 	assert(b->magic == Magic);
@@ -248,7 +256,6 @@
 	lb->logsz = Loghashsz;
 	p = lb->data + lb->logsz;
 	PACK64(p+0, o|LogAlloc1);
-	PACK64(p+8, (uvlong)LogEnd);
 	finalize(lb);
 
 	if(syncblk(lb) == -1){
@@ -281,16 +288,25 @@
 {
 	vlong o, ao;
 	Blk *lb;
-	char *p;
+	char *p, *name;
 
 	o = -1;
 	lb = *tl;
-	assert(off % Blksz == 0);
-	assert(op == LogAlloc || op == LogFree);
-	assert(lb == nil || lb->type == Tlog);
-	assert(off >= a->hd->bp.addr + Blksz);
-	assert(off < a->tl->bp.addr);
-	dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, (op == LogAlloc) ? "Alloc" : "Free");
+	assert((off & 0xff) == 0);
+	assert(op == LogAlloc || op == LogFree || op == LogSync);
+	if(op != LogSync){
+		assert(lb == nil || lb->type == Tlog);
+		assert(off >= a->hd->bp.addr + Blksz);
+		assert(off < a->tl->bp.addr);
+	}
+	switch(op){
+	case LogAlloc:	name = "alloc";	break;
+	case LogFree:	name = "free";	break;
+	case LogSync:	name = "sync";	break;
+	default:	name = "???";	break;
+	}
+assert(lb == nil || lb->logsz > 0);
+	dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, name);
 	/*
 	 * move to the next block when we have
 	 * 40 bytes in the log:
@@ -346,9 +362,8 @@
 int
 loadlog(Arena *a, Bptr bp)
 {
-	vlong ent, off, len;
+	vlong ent, off, len, gen;
 	int op, i, n;
-	uvlong bh;
 	char *d;
 	Blk *b;
 
@@ -356,11 +371,11 @@
 	dprint("loadlog %B\n", bp);
 Nextblk:
 	if((b = getblk(bp, GBnochk)) == nil)
-		return -1;
-	bh = UNPACK64(b->data);
+		return -1;;
 	/* the hash covers the log and offset */
-	if(bh != bufhash(b->data+Loghashsz, Logspc-Loghashsz)){
-		werrstr("corrupt log block %B [%llx]", bp, blkhash(b));
+	if(b->loghash != bufhash(b->data, Logspc)){
+		werrstr("corrupt log block %B <%llx> [%llx] <%llx>",
+			bp, b->loghash, blkhash(b), bufhash(b->data, Logspc));
 		return -1;
 	}
 	for(i = Loghashsz; i < Logspc; i += n){
@@ -376,6 +391,19 @@
 				a->logtl = holdblk(b);
 			dropblk(b);
 			return 0;
+		case LogSync:
+			gen = ent >> 8;
+			dprint("\tlog@%d: sync %llx\n", i, gen);
+			if(gen >= fs->syncgen){
+				if(a->logtl == nil){
+					b->logsz = i;
+					a->logtl = holdblk(b);
+					return 0;
+				}
+				dropblk(b);
+				return 0;
+			}
+			break;
 		case LogChain:
 			bp.addr = off & ~0xff;
 			bp.hash = -1;
@@ -412,7 +440,8 @@
 int
 compresslog(Arena *a)
 {
-	vlong v, ba, na, nl, sz, graft, oldhd, *log;
+	vlong v, ba, na, nl, sz;
+	vlong graft, oldhd, *log;
 	int i, n, nr;
 	Blk *b, *hd, *tl;
 	Range *rng;
@@ -574,6 +603,16 @@
 	return 0;
 }
 
+int
+syncbarrier(Arena *a, vlong gen)
+{
+	if(logappend(a, gen<<8, 0, LogSync, &a->logtl) == -1)
+		return -1;
+	if(a->loghd.addr == -1)
+		a->loghd = a->logtl->bp;
+	return 0;
+}
+
 /*
  * Allocate from an arena, with lock
  * held. May be called multiple times
@@ -786,8 +825,6 @@
 void
 finalize(Blk *b)
 {
-	uvlong h;
-
 	if(b->type != Tdat)
 		PACK16(b->buf, b->type);
 
@@ -810,8 +847,9 @@
 		packbp(b->buf+4, Ptrsz, &b->deadp);
 		break;
 	case Tlog:
-		h = bufhash(b->data + Loghashsz, Logspc-Loghashsz);
-		PACK64(b->data, h);
+		b->loghash = bufhash(b->data, Logspc);
+		PACK16(b->buf+2, b->logsz);
+		PACK64(b->buf+4, b->loghash);
 		break;
 	case Tdat:
 	case Tmagic:
@@ -1123,6 +1161,7 @@
 	}
 	while(fs->syncing != 0)
 		rsleep(&fs->syncrz);
+	fs->syncgen++;
 	/* pass 0: sync arena contents */
 	for(i = 0; i < fs->narena; i++){
 		a = &fs->arenas[i];
@@ -1129,11 +1168,12 @@
 		lock(a);
 		if(a->deferhd.addr != -1){
 			chainlog(a->logtl, a->deferhd.addr);
-			loadlog(a, a->deferhd);
+//			loadlog(a, a->deferhd);
 			a->logtl = a->defertl;
 			a->deferhd = (Bptr){-1, -1, -1};
 			a->defertl = nil;
 		}
+		syncbarrier(a, fs->syncgen);
 		finalize(a->logtl);
 		if(syncblk(a->logtl) == -1)
 			sysfatal("sync arena: %r");
--- a/dat.h
+++ b/dat.h
@@ -71,7 +71,7 @@
 	
 	Pivhdsz		= 10,
 	Leafhdsz	= 6,
-	Loghdsz		= 2,
+	Loghdsz		= 12,				/* type, len, hash */
 	Loghashsz	= 8,
 	Dlhdsz		= 2+2+Ptrsz,			/* type, size, chain */
 	Dlspc		= Blksz - Dlhdsz,
@@ -299,7 +299,8 @@
 	LogAlloc1,	/* alloc a block */
 	LogFree1,	/* free a block */
 	LogChain,	/* point to next log block */
-	LogEnd,		/* last entry in log */	
+	LogSync,	/* sync barrier for replay */
+	LogEnd,		/* end of log */
 
 	/* 2-wide entries */
 #define	Log2wide	LogAlloc
@@ -440,6 +441,7 @@
 	vlong	arenasz;
 	vlong	nextqid;
 	vlong	nextgen;
+	vlong	syncgen;
 	Bptr	*arenabp;
 };
 
@@ -664,6 +666,7 @@
 		};
 		struct {
 			int	logsz;	/* @2 for allocation log */
+			vlong	loghash; /* @4 for log */
 		};
 		struct {
 			int	deadsz;	/* @2 size of deadlist */
--- a/fs.c
+++ b/fs.c
@@ -2246,6 +2246,7 @@
 		 * 1/4 of our reserved emergency space seems like a good
 		 * heuristic for big, but it was picked arbitrarily.
 		 */
+		qlock(&fs->synclk);
 		for(i = 0; i < fs->narena; i++){
 			lock(&fs->arenas[i]);
 			c = fs->arenas[i].nlog > fs->arenas[i].reserve/(4*Blksz);
@@ -2255,5 +2256,6 @@
 					fprint(2, "compress log: %r");
 			}
 		}
+		qunlock(&fs->synclk);
 	}
 }
--- a/pack.c
+++ b/pack.c
@@ -565,7 +565,7 @@
 	int i;
 
 	assert(sz == Blksz);
-	memcpy(p, "gefs0005", 8);	p += 8;
+	memcpy(p, "gefs0006", 8);	p += 8;
 	PACK32(p, Blksz);		p += 4;
 	PACK32(p, Bufspc);		p += 4;
 	PACK32(p, fi->snap.ht);		p += 4;
@@ -575,6 +575,7 @@
 	PACK64(p, fi->arenasz);		p += 8;
 	PACK64(p, fi->nextqid);		p += 8;
 	PACK64(p, fi->nextgen);		p += 8;
+	PACK64(p, fi->syncgen);		p += 8;
 	for(i = 0; i < fi->narena; i++){
 		PACK64(p, fi->arenabp[i].addr);	p += 8;
 		PACK64(p, fi->arenabp[i].hash);	p += 8;
@@ -588,7 +589,7 @@
 	int i;
 
 	assert(sz == Blksz);
-	if(memcmp(p, "gefs0005", 8) != 0){
+	if(memcmp(p, "gefs0006", 8) != 0){
 		werrstr("wrong block header %.8s\n", p);
 		return nil;
 	}
@@ -603,6 +604,7 @@
 	fi->arenasz = UNPACK64(p);	p += 8;
 	fi->nextqid = UNPACK64(p);	p += 8;
 	fi->nextgen = UNPACK64(p);	p += 8;
+	fi->syncgen = UNPACK64(p);	p += 8;
 	fi->arenabp = malloc(fi->narena * sizeof(Bptr));
 	for(i = 0; i < fi->narena; i++){
 		fi->arenabp[i].addr = UNPACK64(p);	p += 8;
--- a/ream.c
+++ b/ream.c
@@ -189,6 +189,7 @@
 		PACK64(p, (512*MiB)|LogAlloc1);
 		p += 8;
 	}
+	PACK64(p, (uvlong)LogSync);	p += 8;	/* barrier */
 	PACK64(p, (uvlong)LogEnd);	/* done */
 	b->logsz = p - b->data;
 	finalize(b);