ref: c36ed06ba0fd376ca0e346d0639d8cbf18c1cb45
parent: 3b8f163352bfcfec4e1445fd4a2fb55d6d8fec8e
author: Ori Bernstein <ori@eigenstate.org>
date: Sun May 28 17:12:45 EDT 2023
blk: implement sync barriers; stop log replay at last fs sync
--- a/blk.c
+++ b/blk.c
@@ -65,8 +65,9 @@
static Blk*
readblk(vlong bp, int flg)
{
- Blk *b;
vlong off, rem, n;
+ char *p;
+ Blk *b;
assert(bp != -1);
if((b = cachepluck()) == nil)
@@ -88,7 +89,6 @@
b->hnext = nil;
b->flag = 0;
- b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);
b->bp.addr = bp;
b->bp.hash = -1;
b->bp.gen = -1;
@@ -100,6 +100,8 @@
b->bufsz = 0;
b->logsz = 0;
+ p = b->buf + 2;
+ b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);
switch(b->type){
default:
fprint(2, "invalid block type %d @%llx\n", b->type, bp);
@@ -110,27 +112,33 @@
b->data = b->buf;
break;
case Tarena:
- b->data = b->buf + 2;
+ b->data = p;
break;
case Tdlist:
- b->deadsz = UNPACK16(b->buf+2);
- b->deadp = unpackbp(b->buf+4, Blksz-4);
- b->data = b->buf + Dlhdsz;
+ b->deadsz = UNPACK16(p); p += 2;
+ b->deadp = unpackbp(p, Ptrsz); p += Ptrsz;
+ assert(p - b->buf == Dlhdsz);
+ b->data = p;
break;
case Tlog:
- b->data = b->buf + Loghdsz;
+ b->logsz = UNPACK16(p); p += 2;
+ b->loghash = UNPACK64(p); p += 8;
+ assert(p - b->buf == Loghdsz);
+ b->data = p;
break;
case Tpivot:
- b->data = b->buf + Pivhdsz;
- b->nval = UNPACK16(b->buf+2);
- b->valsz = UNPACK16(b->buf+4);
- b->nbuf = UNPACK16(b->buf+6);
- b->bufsz = UNPACK16(b->buf+8);
+ b->nval = UNPACK16(p); p += 2;
+ b->valsz = UNPACK16(p); p += 2;
+ b->nbuf = UNPACK16(p); p += 2;
+ b->bufsz = UNPACK16(p); p += 2;
+ assert(p - b->buf == Pivhdsz);
+ b->data = p;
break;
case Tleaf:
- b->data = b->buf + Leafhdsz;
- b->nval = UNPACK16(b->buf+2);
- b->valsz = UNPACK16(b->buf+4);
+ b->nval = UNPACK16(p); p += 2;
+ b->valsz = UNPACK16(p); p += 2;
+ assert(p - b->buf == Leafhdsz);
+ b->data = p;
break;
}
assert(b->magic == Magic);
@@ -248,7 +256,6 @@
lb->logsz = Loghashsz;
p = lb->data + lb->logsz;
PACK64(p+0, o|LogAlloc1);
- PACK64(p+8, (uvlong)LogEnd);
finalize(lb);
if(syncblk(lb) == -1){
@@ -281,16 +288,25 @@
{
vlong o, ao;
Blk *lb;
- char *p;
+ char *p, *name;
o = -1;
lb = *tl;
- assert(off % Blksz == 0);
- assert(op == LogAlloc || op == LogFree);
- assert(lb == nil || lb->type == Tlog);
- assert(off >= a->hd->bp.addr + Blksz);
- assert(off < a->tl->bp.addr);
- dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, (op == LogAlloc) ? "Alloc" : "Free");
+ assert((off & 0xff) == 0);
+ assert(op == LogAlloc || op == LogFree || op == LogSync);
+ if(op != LogSync){
+ assert(lb == nil || lb->type == Tlog);
+ assert(off >= a->hd->bp.addr + Blksz);
+ assert(off < a->tl->bp.addr);
+ }
+ switch(op){
+ case LogAlloc: name = "alloc"; break;
+ case LogFree: name = "free"; break;
+ case LogSync: name = "sync"; break;
+ default: name = "???"; break;
+ }
+assert(lb == nil || lb->logsz > 0);
+ dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, name);
/*
* move to the next block when we have
* 40 bytes in the log:
@@ -346,9 +362,8 @@
int
loadlog(Arena *a, Bptr bp)
{
- vlong ent, off, len;
+ vlong ent, off, len, gen;
int op, i, n;
- uvlong bh;
char *d;
Blk *b;
@@ -356,11 +371,11 @@
dprint("loadlog %B\n", bp);
Nextblk:
if((b = getblk(bp, GBnochk)) == nil)
- return -1;
- bh = UNPACK64(b->data);
+ return -1;;
/* the hash covers the log and offset */
- if(bh != bufhash(b->data+Loghashsz, Logspc-Loghashsz)){
- werrstr("corrupt log block %B [%llx]", bp, blkhash(b));
+ if(b->loghash != bufhash(b->data, Logspc)){
+ werrstr("corrupt log block %B <%llx> [%llx] <%llx>",
+ bp, b->loghash, blkhash(b), bufhash(b->data, Logspc));
return -1;
}
for(i = Loghashsz; i < Logspc; i += n){
@@ -376,6 +391,19 @@
a->logtl = holdblk(b);
dropblk(b);
return 0;
+ case LogSync:
+ gen = ent >> 8;
+ dprint("\tlog@%d: sync %llx\n", i, gen);
+ if(gen >= fs->syncgen){
+ if(a->logtl == nil){
+ b->logsz = i;
+ a->logtl = holdblk(b);
+ return 0;
+ }
+ dropblk(b);
+ return 0;
+ }
+ break;
case LogChain:
bp.addr = off & ~0xff;
bp.hash = -1;
@@ -412,7 +440,8 @@
int
compresslog(Arena *a)
{
- vlong v, ba, na, nl, sz, graft, oldhd, *log;
+ vlong v, ba, na, nl, sz;
+ vlong graft, oldhd, *log;
int i, n, nr;
Blk *b, *hd, *tl;
Range *rng;
@@ -574,6 +603,16 @@
return 0;
}
+int	/* 0 on success, -1 if logappend fails */
+syncbarrier(Arena *a, vlong gen)	/* append a LogSync entry for gen to a's log */
+{
+ if(logappend(a, gen<<8, 0, LogSync, &a->logtl) == -1)	/* gen is shifted up 8 bits; loadlog recovers it with ent >> 8 */
+ return -1;
+ if(a->loghd.addr == -1)	/* addr of -1: presumably no log head recorded yet (confirm) */
+ a->loghd = a->logtl->bp;	/* use the tail block's pointer as the head */
+ return 0;
+}
+
/*
* Allocate from an arena, with lock
* held. May be called multiple times
@@ -786,8 +825,6 @@
void
finalize(Blk *b)
{
- uvlong h;
-
if(b->type != Tdat)
PACK16(b->buf, b->type);
@@ -810,8 +847,9 @@
packbp(b->buf+4, Ptrsz, &b->deadp);
break;
case Tlog:
- h = bufhash(b->data + Loghashsz, Logspc-Loghashsz);
- PACK64(b->data, h);
+ b->loghash = bufhash(b->data, Logspc);
+ PACK16(b->buf+2, b->logsz);
+ PACK64(b->buf+4, b->loghash);
break;
case Tdat:
case Tmagic:
@@ -1123,6 +1161,7 @@
}
while(fs->syncing != 0)
rsleep(&fs->syncrz);
+ fs->syncgen++;
/* pass 0: sync arena contents */
for(i = 0; i < fs->narena; i++){
a = &fs->arenas[i];
@@ -1129,11 +1168,12 @@
lock(a);
if(a->deferhd.addr != -1){
chainlog(a->logtl, a->deferhd.addr);
- loadlog(a, a->deferhd);
+// loadlog(a, a->deferhd);
a->logtl = a->defertl;
a->deferhd = (Bptr){-1, -1, -1};
a->defertl = nil;
}
+ syncbarrier(a, fs->syncgen);
finalize(a->logtl);
if(syncblk(a->logtl) == -1)
sysfatal("sync arena: %r");
--- a/dat.h
+++ b/dat.h
@@ -71,7 +71,7 @@
Pivhdsz = 10,
Leafhdsz = 6,
- Loghdsz = 2,
+ Loghdsz = 12, /* type, len, hash */
Loghashsz = 8,
Dlhdsz = 2+2+Ptrsz, /* type, size, chain */
Dlspc = Blksz - Dlhdsz,
@@ -299,7 +299,8 @@
LogAlloc1, /* alloc a block */
LogFree1, /* free a block */
LogChain, /* point to next log block */
- LogEnd, /* last entry in log */
+ LogSync, /* sync barrier for replay */
+ LogEnd, /* end of log */
/* 2-wide entries */
#define Log2wide LogAlloc
@@ -440,6 +441,7 @@
vlong arenasz;
vlong nextqid;
vlong nextgen;
+ vlong syncgen;
Bptr *arenabp;
};
@@ -664,6 +666,7 @@
};
struct {
int logsz; /* @2 for allocation log */
+ vlong loghash; /* @4 for log: bufhash of the log data, packed by finalize */
};
struct {
int deadsz; /* @2 size of deadlist */
--- a/fs.c
+++ b/fs.c
@@ -2246,6 +2246,7 @@
* 1/4 of our reserved emergency space seems like a good
* heuristic for big, but it was picked arbitrarily.
*/
+ qlock(&fs->synclk);
for(i = 0; i < fs->narena; i++){
lock(&fs->arenas[i]);
c = fs->arenas[i].nlog > fs->arenas[i].reserve/(4*Blksz);
@@ -2255,5 +2256,6 @@
fprint(2, "compress log: %r");
}
}
+ qunlock(&fs->synclk);
}
}
--- a/pack.c
+++ b/pack.c
@@ -565,7 +565,7 @@
int i;
assert(sz == Blksz);
- memcpy(p, "gefs0005", 8); p += 8;
+ memcpy(p, "gefs0006", 8); p += 8;
PACK32(p, Blksz); p += 4;
PACK32(p, Bufspc); p += 4;
PACK32(p, fi->snap.ht); p += 4;
@@ -575,6 +575,7 @@
PACK64(p, fi->arenasz); p += 8;
PACK64(p, fi->nextqid); p += 8;
PACK64(p, fi->nextgen); p += 8;
+ PACK64(p, fi->syncgen); p += 8;
for(i = 0; i < fi->narena; i++){
PACK64(p, fi->arenabp[i].addr); p += 8;
PACK64(p, fi->arenabp[i].hash); p += 8;
@@ -588,7 +589,7 @@
int i;
assert(sz == Blksz);
- if(memcmp(p, "gefs0005", 8) != 0){
+ if(memcmp(p, "gefs0006", 8) != 0){
werrstr("wrong block header %.8s\n", p);
return nil;
}
@@ -603,6 +604,7 @@
fi->arenasz = UNPACK64(p); p += 8;
fi->nextqid = UNPACK64(p); p += 8;
fi->nextgen = UNPACK64(p); p += 8;
+ fi->syncgen = UNPACK64(p); p += 8;
fi->arenabp = malloc(fi->narena * sizeof(Bptr));
for(i = 0; i < fi->narena; i++){
fi->arenabp[i].addr = UNPACK64(p); p += 8;
--- a/ream.c
+++ b/ream.c
@@ -189,6 +189,7 @@
PACK64(p, (512*MiB)|LogAlloc1);
p += 8;
}
+ PACK64(p, (uvlong)LogSync); p += 8; /* barrier */
PACK64(p, (uvlong)LogEnd); /* done */
b->logsz = p - b->data;
finalize(b);