ref: dfd53a7c1835c5caa8a56bbe74f8f9977bb08add
parent: f3ef860750728e330d9199a9fc5716d8525e972d
author: Ori Bernstein <ori@eigenstate.org>
date: Sun Jan 9 12:55:58 EST 2022
snap: remove ref/unref messages they were causing recursive calls to btupsert, which meant that the calls lower in the stack had their new root clobbered by the calls higher in the stack.
--- a/TODO
+++ b/TODO
@@ -1,6 +1,4 @@
*** must have before usable for testing ***
-- correctly implement deadlists
-- fix locking mess on snapshots
- auth against authservers
- implement special file modes: ORCLOSE, OEXCL, etc
@@ -8,12 +6,14 @@
- live alloc log recompression
- Reserve blocks for deletion
- transient exec snapshots
-- testing, debugging
+- testing, debugging, bugfixes
+- basic performance optimization
+ - reduce io ops per update
+ - async page writeback: currently, every write to device is sync.
*** nice to have, can get testing without ***
- add missing management commands in console
- performance optimization:
- - async page writeback: currently, every write to device is sync.
- bulk block frees
- background block frees
- root block reuse
--- a/blk.c
+++ b/blk.c
@@ -17,8 +17,6 @@
static int blkdealloc_lk(vlong);
static Blk* initblk(vlong, int);
-QLock blklock;
-
void
setflag(Blk *b, int flg)
{
@@ -323,6 +321,7 @@
o = b->head.addr|LogChain;
p = a->tail->data + a->tail->logsz;
PBIT64(p, o);
+ finalize(a->tail);
if(syncblk(a->tail) == -1)
return -1;
putblk(a->tail);
@@ -482,11 +481,12 @@
{
Arange *r;
Range *log, *nlog;
- vlong v, bp, nb, graft, oldhd;
+ vlong v, ba, na, graft, oldhd;
int i, n, sz;
Blk *hd, *b;
Oplog ol;
char *p;
+ Bptr bp;
/*
* Sync the current log to disk, and
@@ -500,9 +500,9 @@
* because otherwise we have a deadlock
* allocating the block.
*/
- if((bp = blkalloc_lk(a)) == -1)
+ if((ba = blkalloc_lk(a)) == -1)
return -1;
- if((b = initblk(bp, Tlog)) == nil)
+ if((b = initblk(ba, Tlog)) == nil)
return -1;
setflag(b, Bdirty);
b->logsz = Loghdsz;
@@ -572,9 +572,12 @@
if(syncarena(a) == -1)
return -1;
if(oldhd != -1){
- for(bp = oldhd; bp != -1; bp = nb){
- nb = -1;
- if((b = readblk(bp, 0)) == nil)
+ for(ba = oldhd; ba != -1; ba = na){
+ na = -1;
+ bp.addr = ba;
+ bp.hash = -1;
+ bp.gen = -1;
+ if((b = getblk(bp, GBnochk)) == nil)
return -1;
for(i = Loghdsz; i < Logspc; i += n){
p = b->data + i;
@@ -581,15 +584,15 @@
v = GBIT64(p);
n = ((v&0xff) >= Log2wide) ? 16 : 8;
if((v&0xff) == LogChain){
- nb = v & ~0xff;
+ na = v & ~0xff;
break;
}else if((v&0xff) == LogEnd){
- nb = -1;
+ na = -1;
break;
}
}
lock(a);
- if(blkdealloc_lk(bp) == -1){
+ if(blkdealloc_lk(ba) == -1){
unlock(a);
return -1;
}
@@ -821,20 +824,20 @@
if((b = lookupblk(bp.addr)) != nil)
return cacheblk(b);
- qlock(&blklock);
+ qlock(&fs->blklk);
if((b = lookupblk(bp.addr)) != nil){
cacheblk(b);
- qunlock(&blklock);
+ qunlock(&fs->blklk);
return b;
}
if((b = readblk(bp.addr, flg)) == nil){
- qunlock(&blklock);
+ qunlock(&fs->blklk);
return nil;
}
h = blkhash(b);
if((flg&GBnochk) == 0 && h != bp.hash){
fprint(2, "corrupt block %B: %llx != %llx\n", bp, blkhash(b), bp.hash);
- qunlock(&blklock);
+ qunlock(&fs->blklk);
abort();
return nil;
}
@@ -841,7 +844,7 @@
b->bp.hash = h;
b->bp.gen = bp.gen;
cacheblk(b);
- qunlock(&blklock);
+ qunlock(&fs->blklk);
return b;
}
@@ -1000,7 +1003,6 @@
enqueue(fs->super);
if(r != -1)
r = syncblk(fs->super);
-
qunlock(&fs->snaplk);
return r;
}
--- a/cache.c
+++ b/cache.c
@@ -52,7 +52,6 @@
Blk *e, *c;
u32int h;
- assert(b->bp.addr != 0);
h = ihash(b->bp.addr);
bkt = &fs->cache[h % fs->cmax];
lock(bkt);
--- a/check.c
+++ b/check.c
@@ -122,8 +122,6 @@
case Oinsert: /* new kvp */
case Odelete: /* delete kvp */
case Oclearb: /* delete kvp if exists */
- case Orefsnap:
- case Ounrefsnap:
break;
case Owstat: /* kvp dirent */
if((my.v[0] & ~(Owsize|Owmode|Owmtime)) != 0){
--- a/dat.h
+++ b/dat.h
@@ -95,15 +95,15 @@
Bcached = 1 << 3,
};
-//#define Efs "i will not buy this fs, it is scratched"
+/* internal errors */
#define Eimpl "not implemented"
#define Efs (abort(), "fs broke")
+#define Ebotch "protocol botch"
#define Eio "i/o error"
#define Efid "unknown fid"
#define Etype "invalid fid type"
#define Edscan "invalid dir scan offset"
-#define Eexist "does not exist"
-#define Ebotch "protocol botch"
+#define Eexist "directory entry not found"
#define Emode "unknown mode"
#define Efull "file system full"
#define Eauth "authentication failed"
@@ -259,8 +259,6 @@
Odelete, /* delete kvp */
Oclearb, /* free block ptr if exists */
Owstat, /* kvp dirent */
- Orefsnap, /* increment snap refcount */
- Ounrefsnap, /* decrement snap refcount */
Nmsgtype, /* maximum message type */
};
@@ -429,6 +427,9 @@
/* dent hash table */
Lock dtablk;
Dent *dtab[Ndtab];
+
+ /* slow block io */
+ QLock blklk;
/* protected by lrulk */
Lock lrulk;
--- a/dump.c
+++ b/dump.c
@@ -69,6 +69,7 @@
n = fmtprint(fmt, "ptr:%B", unpackbp(v->v, v->nv));
break;
}
+ break;
case Kent: /* pqid[8] name[n] => dir[n]: serialized Dir */
switch(op){
case Onop:
@@ -122,19 +123,9 @@
}
break;
case Ksnap: /* name[n] => dent[16] ptr[16]: snapshot root */
- switch(op){
- case Orefsnap:
- n = fmtprint(fmt, "ref");
- break;
- case Ounrefsnap:
- n = fmtprint(fmt, "unref");
- break;
- default:
- if(unpacktree(&t, v->v, v->nv) == nil)
- return fmtprint(fmt, "corrupt tree");
- n = fmtprint(fmt, "ref: %d, ht: %d, bp: %B, prev=%lld", t.ref, t.ht, t.bp, t.prev[0]);
- break;
- }
+ if(unpacktree(&t, v->v, v->nv) == nil)
+ return fmtprint(fmt, "corrupt tree");
+ n = fmtprint(fmt, "ref: %d, ht: %d, bp: %B, prev=%lld", t.ref, t.ht, t.bp, t.prev[0]);
break;
case Klabel:
n = fmtprint(fmt, "snap id:\"%llx\"", GBIT64(v->v+1));
@@ -167,8 +158,6 @@
[Odelete] "Odelete",
[Oclearb] "Oclearb",
[Owstat] "Owstat",
- [Orefsnap] "Orefsnap",
- [Ounrefsnap] "Ounrefsnap",
};
Msg *m;
int f, n;
@@ -330,22 +319,22 @@
int i;
fprint(fd, "\tgen:\t%lld\n", t->gen);
-// fprint(fd, "\tref:\t%d\n", t->ref);
-// fprint(fd, "\tht:\t%d\n", t->ht);
-// fprint(fd, "\tbp:\t%B\n", t->bp);
+ fprint(fd, "\tref:\t%d\n", t->ref);
+ fprint(fd, "\tht:\t%d\n", t->ht);
+ fprint(fd, "\tbp:\t%B\n", t->bp);
for(i = 0; i < Ndead; i++){
fprint(fd, "\tdeadlist[%d]: prev=%llx\n", i, t->prev[i]);
-// fprint(fd, "\t\tprev:\t%llx\n", t->prev[i]);
-// fprint(fd, "\t\tfhead:\t%B\n", t->dead[i].head);
-// if(t->dead[i].tail != nil){
-// fprint(fd, "\t\tftailp:%llx\n", t->dead[i].tail->bp.addr);
-// fprint(fd, "\t\tftailh:%llx\n", t->dead[i].tail->bp.hash);
-// }else{
-// fprint(fd, "\t\tftailp:\t-1\n");
-// fprint(fd, "\t\tftailh:\t-1\n");
-// }
-// fprint(fd, "\t\tdead[%d]: (%B)\n", i, t->dead[i].head);
-// scandead(&t->dead[i], showdeadbp, &fd);
+ fprint(fd, "\t\tprev:\t%llx\n", t->prev[i]);
+ fprint(fd, "\t\tfhead:\t%B\n", t->dead[i].head);
+ if(t->dead[i].tail != nil){
+ fprint(fd, "\t\tftailp:%llx\n", t->dead[i].tail->bp.addr);
+ fprint(fd, "\t\tftailh:%llx\n", t->dead[i].tail->bp.hash);
+ }else{
+ fprint(fd, "\t\tftailp:\t-1\n");
+ fprint(fd, "\t\tftailh:\t-1\n");
+ }
+ fprint(fd, "\t\tdead[%d]: (%B)\n", i, t->dead[i].head);
+ scandead(&t->dead[i], showdeadbp, &fd);
}
}
--- a/fns.h
+++ b/fns.h
@@ -17,7 +17,6 @@
Blk* cacheblk(Blk*);
void cachedel(vlong);
Blk* lookupblk(vlong);
-Blk* readblk(vlong, int);
Arena* getarena(vlong);
void putblk(Blk*);
int syncblk(Blk*);
@@ -35,12 +34,12 @@
char* fillsuper(Blk*);
Tree* newsnap(Tree*);
char* freesnap(Tree*, Tree*);
-int savesnap(Tree*);
char* labelsnap(char*, vlong);
char* unlabelsnap(vlong, char*);
char* refsnap(vlong);
char* unrefsnap(vlong, vlong);
Tree* openlabel(char*);
+Tree* opensnap(vlong);
void closesnap(Tree*);
uvlong siphash(void*, usize);
void reamfs(char*);
@@ -116,6 +115,8 @@
Tree* unpacktree(Tree*, char*, int);
char* packdkey(char*, int, vlong, char*);
char* packdval(char*, int, Xdir*);
+char* packsnap(char*, int, vlong);
+char* packlabel(char*, int, char*);
/* fmt */
int Bconv(Fmt*);
--- a/fs.c
+++ b/fs.c
@@ -19,7 +19,6 @@
t = mnt->root;
if(!t->dirty)
return nil;
- qlock(&fs->snaplk);
if((n = newsnap(t)) == nil){
fprint(2, "snap: save %s: %s\n", mnt->name, "create snap");
abort();
@@ -36,7 +35,6 @@
}
mnt->root = n;
closesnap(t);
- qunlock(&fs->snaplk);
return nil;
}
@@ -64,6 +62,7 @@
if(u != nil)
closesnap(u);
closesnap(t);
+ /* we probably want explicit snapshots to be resilient */
sync();
fprint(fd, "snap taken: %s\n", new);
}
@@ -75,7 +74,6 @@
char *e;
t = f->mnt->root;
- qlock(&fs->snaplk);
if((n = newsnap(t)) == nil){
fprint(2, "snap: save %s: %s\n", f->mnt->name, "create snap");
abort();
@@ -92,7 +90,6 @@
}
f->mnt->root = n;
closesnap(t);
- qunlock(&fs->snaplk);
sync();
return nil;
}
@@ -480,7 +477,7 @@
unlock(&fs->fidtablk);
if(o != nil){
- fprint(2, "fid in use: %d == %d", o->fid, new);
+ fprint(2, "fid in use: %d == %d\n", o->fid, new);
abort();
free(n);
return nil;
--- a/load.c
+++ b/load.c
@@ -17,10 +17,14 @@
{
Blk *b;
char *p;
+ Bptr bp;
if((a->free = avlcreate(rangecmp)) == nil)
return -1;
- if((b = readblk(o, 0)) == nil)
+ bp.addr = o;
+ bp.hash = -1;
+ bp.gen = -1;
+ if((b = getblk(bp, GBnochk)) == nil)
return -1;
p = b->data;
a->b = b;
@@ -42,6 +46,7 @@
int i, blksz, bufspc, hdrsz;
vlong sb;
char *p, *e;
+ Bptr bp;
Tree *t;
Blk *b;
Dir *d;
@@ -54,7 +59,10 @@
sb = d->length - (d->length % Blksz) - Blksz;
free(d);
- if((b = readblk(sb, 0)) == nil)
+ bp.addr = sb;
+ bp.hash = -1;
+ bp.gen = -1;
+ if((b = getblk(bp, GBnochk)) == nil)
sysfatal("read superblock: %r");
if(b->type != Tsuper)
sysfatal("corrupt superblock: bad type");
--- a/pack.c
+++ b/pack.c
@@ -309,6 +309,27 @@
}
char*
+packlabel(char *p, int sz, char *name)
+{
+ int n;
+
+ n = strlen(name);
+ assert(sz >= n+1);
+ p[0] = Klabel; p += 1;
+ memcpy(p, name, n); p += n;
+ return p;
+}
+
+char*
+packsnap(char *p, int sz, vlong id)
+{
+ assert(sz >= Snapsz);
+ p[0] = Ksnap; p += 1;
+ PBIT64(p, id); p += 8;
+ return p;
+}
+
+char*
packbp(char *p, int sz, Bptr *bp)
{
assert(sz >= Ptrsz);
--- a/snap.c
+++ b/snap.c
@@ -22,27 +22,30 @@
Tree*
openlabel(char *name)
{
- char dbuf[Keymax], buf[Kvmax];
- char *p, *e;
- vlong id;
- Tree *t;
- int n;
- Key k;
+ char *p, buf[Keymax];
Kvp kv;
+ Key k;
- qlock(&fs->snaplk);
- n = strlen(name);
- p = dbuf;
- p[0] = Klabel; p += 1;
- memcpy(p, name, n); p += n;
- k.k = dbuf;
- k.nk = p - dbuf;
- if((e = btlookup(&fs->snap, &k, &kv, buf, sizeof(buf))) != nil)
- goto Error;
+ if((p = packlabel(buf, sizeof(buf), name)) == nil)
+ return nil;
+ k.k = buf;
+ k.nk = p - buf;
+ if(btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)) != nil)
+ return nil;
if(kv.nv != Snapsz)
- goto Error;
+ return nil;
+ return opensnap(GBIT64(kv.v + 1));
+}
- id = GBIT64(kv.v + 1);
+Tree*
+opensnap(vlong id)
+{
+ char *p, buf[Kvmax];
+ Tree *t;
+ Kvp kv;
+ Key k;
+
+ qlock(&fs->snaplk);
for(t = fs->osnap; t != nil; t = t->snext){
if(t->gen == id){
ainc(&t->memref);
@@ -50,19 +53,16 @@
return t;
}
}
- if((t = mallocz(sizeof(Tree), 1)) == nil){
- fprint(2, "open %s: %s\n", name, e);
+ if((t = mallocz(sizeof(Tree), 1)) == nil)
goto Error;
- }
memset(&t->lk, 0, sizeof(t->lk));
- memmove(dbuf, kv.v, kv.nv);
- k.k = dbuf;
- k.nk = kv.nv;
- if((e = btlookup(&fs->snap, &k, &kv, buf, sizeof(buf))) != nil){
- fprint(2, "open %s: %s\n", name, e);
+ if((p = packsnap(buf, sizeof(buf), id)) == nil)
goto Error;
- }
+ k.k = buf;
+ k.nk = p - buf;
+ if(btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)) != nil)
+ goto Error;
if(unpacktree(t, kv.v, kv.nv) == nil)
goto Error;
t->memref = 1;
@@ -90,7 +90,7 @@
continue;
finalize(t->dead[i].tail);
syncblk(t->dead[i].tail);
-//FIXME putblk(t->dead[i].tail);
+//FIXME: putblk(t->dead[i].tail);
}
p = &fs->osnap;
@@ -104,50 +104,64 @@
free(te);
}
-static char*
-modifysnap(char *name, vlong gen, vlong next, int del)
+char*
+modifysnap(int op, Tree *t)
{
- char dbuf[Keymax], sbuf[Snapsz], nbuf[Snapsz];
+ char kbuf[Snapsz], vbuf[Treesz];
char *p, *e;
- int n, nm;
- Msg m[2];
+ Msg m;
+ int i;
- nm = 0;
-
- p = sbuf;
- p[0] = Ksnap; p += 1;
- PBIT64(p, gen); p += 8;
- m[nm].op = del ? Ounrefsnap : Orefsnap;
- m[nm].k = sbuf;
- m[nm].nk = p - sbuf;
- p = nbuf;
- p[0] = Ksnap; p += 1;
- PBIT64(p, next); p += 8;
- m[nm].v = nbuf;
- m[nm].nv = p - nbuf;
- nm++;
-
- if(name != nil){
- p = dbuf;
- n = strlen(name);
- m[nm].op = del ? Odelete : Oinsert;
- p[0] = Klabel; p += 1;
- memcpy(p, name, n); p += n;
- m[nm].k = dbuf;
- m[nm].nk = p - dbuf;
- m[nm].v = m[nm-1].k;
- m[nm].nv = m[nm-1].nk;
- nm++;
+ for(i = 0; i < Ndead; i++){
+ if(t->dead[i].tail != nil){
+ finalize(t->dead[i].tail);
+ syncblk(t->dead[i].tail);
+ }
}
- if((e = btupsert(&fs->snap, m, nm)) != nil)
+ m.op = op;
+ if((p = packsnap(kbuf, sizeof(kbuf), t->gen)) == nil)
+ return Elength;
+ m.k = kbuf;
+ m.nk = p - kbuf;
+ if(op == Oinsert){
+ if((p = packtree(vbuf, sizeof(vbuf), t)) == nil)
+ return Elength;
+ m.v = vbuf;
+ m.nv = p - vbuf;
+ }else{
+ m.v = nil;
+ m.nv = 0;
+ }
+ if((e = btupsert(&fs->snap, &m, 1)) != nil)
return e;
return nil;
}
-char*
-deletesnap(Tree *snap, Tree *from)
+static char*
+modifylabel(int op, char *name, vlong id)
{
-fprint(2, "deleting snap at %B from %B\n", snap->bp, from->bp);
+ char *p, *e, kbuf[Keymax], vbuf[Snapsz];
+ Msg m;
+
+ if(op == Oinsert)
+ e = refsnap(id);
+ else
+ e = unrefsnap(id, -1);
+ if(e != nil)
+ return e;
+
+ m.op = op;
+ if((p = packlabel(kbuf, sizeof(kbuf), name)) == nil)
+ return Elength;
+ m.k = kbuf;
+ m.nk = p - kbuf;
+ if((p = packsnap(vbuf, sizeof(vbuf), id)) == nil)
+ return Elength;
+ m.v = vbuf;
+ m.nv = p - vbuf;
+
+ if((e = btupsert(&fs->snap, &m, 1)) != nil)
+ return e;
return nil;
}
@@ -154,25 +168,54 @@
char*
labelsnap(char *name, vlong gen)
{
- return modifysnap(name, gen, -1, 0);
+ return modifylabel(Oinsert, name, gen);
}
char*
unlabelsnap(vlong gen, char *name)
{
- return modifysnap(name, gen, -1, 1);
+ return modifylabel(Odelete, name, gen);
}
char*
-refsnap(vlong gen)
+refsnap(vlong id)
{
- return modifysnap(nil, gen, -1, 0);
+ Tree *t;
+ char *e;
+
+ t = opensnap(id);
+ t->ref++;
+ if((e = modifysnap(Oinsert, t)) != nil)
+ return e;
+ closesnap(t);
+ return nil;
}
char*
-unrefsnap(vlong gen, vlong next)
+unrefsnap(vlong id, vlong succ)
{
- return modifysnap(nil, gen, next, 1);
+ Tree *t, *u;
+ char *e;
+
+ if((t = opensnap(id)) == nil)
+ return Eexist;
+ if(--t->ref == 0){
+ if((u = opensnap(succ)) == nil)
+ return Eexist;
+ if((e = freesnap(t, u)) != nil)
+ return e;
+ if((e = modifysnap(Odelete, t)) != nil)
+ return e;
+ if((e = modifysnap(Oinsert, u)) != nil)
+ return e;
+ closesnap(u);
+ closesnap(t);
+ }else{
+ if((e = modifysnap(Oinsert, t)) != nil)
+ return e;
+ closesnap(t);
+ }
+ return nil;
}
void
@@ -197,11 +240,6 @@
assert(snap->gen != next->gen);
assert(next->prev[0] == snap->gen);
-//fprint(2, "next tree\n");
-//showtreeroot(2, next);
-//fprint(2, "snap tree\n");
-//showtreeroot(2, snap);
-
dl = next->dead[Ndead-1];
scandead(&next->dead[0], freedead, nil);
for(i = 0; i < Ndead-2; i++){
@@ -216,42 +254,9 @@
}
scandead(&dl, redeadlist, next);
-//fprint(2, "transferred\n");
-//showtreeroot(2, next);
-//fprint(2, "==================================\n");
return nil;
}
-int
-savesnap(Tree *t)
-{
- char kbuf[Snapsz], vbuf[Treesz];
- char *p, *e;
- Msg m;
- int i;
- for(i = 0; i < Ndead; i++){
- if(t->dead[i].tail != nil){
- finalize(t->dead[i].tail);
- syncblk(t->dead[i].tail);
- }
- }
- p = kbuf;
- p[0] = Ksnap; p += 1;
- PBIT64(p, t->gen); p += 8;
- m.op = Oinsert;
- m.k = kbuf;
- m.nk = p - kbuf;
- p = packtree(vbuf, sizeof(vbuf), t);
- m.v = vbuf;
- m.nv = p - vbuf;
- if((e = btupsert(&fs->snap, &m, 1)) != nil){
- fprint(2, "error snapshotting: %s\n", e);
- return -1;
- }
- return 0;
-}
-
-
Tree*
newsnap(Tree *t)
{
@@ -259,7 +264,7 @@
Tree *r;
int i;
- if(savesnap(t) == -1)
+ if(modifysnap(Oinsert, t) != nil)
return nil;
if((r = malloc(sizeof(Tree))) == nil)
--- a/tree.c
+++ b/tree.c
@@ -394,8 +394,6 @@
int
apply(Kvp *kv, Msg *m, char *buf, int nbuf)
{
- int refs;
-
switch(m->op){
case Oclearb:
case Odelete:
@@ -407,20 +405,6 @@
case Owstat:
assert(keycmp(kv, m) == 0);
statupdate(kv, m);
- return 1;
- case Orefsnap:
- assert(keycmp(kv, m) == 0);
- refs = GBIT32(kv->v) + 1;
- PBIT32(kv->v, refs);
- return 1;
- case Ounrefsnap:
- assert(keycmp(kv, m) == 0);
- refs = GBIT32(kv->v) - 1;
- if(refs == 0){
- dropsnap(kv, m);
- return 0;
- }
- PBIT32(kv->v, refs);
return 1;
default:
abort();