ref: fa5ed8b61d58e141790c263eba6caea2c67945b8
parent: bfd636c4beff968fb3d246cf36ebb6853ef5d509
author: Ori Bernstein <ori@eigenstate.org>
date: Sun Nov 6 10:10:35 EST 2022
tree: avoid scanning freed blocks we used to take a reference to the tree root when we started walking, and keep it around while the scan was still running; there's a problem when we do that, since the tree can be updated under us. we have 3 options to deal with this: 1. we can pin the epoch as long as a scan is attached to a fid in a reader, but this can lead to running out of blocks in the cache when dealing with slow readers, which could lead to a denial of service, and maybe a deadlock where the reader can't make progress until the freelist is processed. 2. we can take a snapshot of the file system when we start listing a directory; this may be an option. 3. the one implemented here: just let the tree change under us, and rewalk on every Tread on a directory.
--- a/cons.c
+++ b/cons.c
@@ -184,7 +184,6 @@
Key k;
vlong pqid;
Scan s;
- int done;
if((t = openlabel("main")) == nil){
fprint(fd, "could not open main snap\n");
@@ -205,21 +204,22 @@
}
fprint(fd, "%P\n", &kv);
}else{
- if((e = btscan(t, &s, k.k, k.nk)) != nil){
+ btnewscan(&s, k.k, k.nk);
+ if((e = btenter(t, &s)) != nil){
fprint(fd, "scan failed: %s\n", e);
goto Out;
}
while(1){
- if((e = btnext(&s, &kv, &done)) != nil){
+ if((e = btnext(t, &s, &kv)) != nil){
fprint(fd, "scan failed: %s\n", e);
- btdone(&s);
+ btexit(t, &s);
goto Out;
}
- if(done)
+ if(s.done)
break;
fprint(fd, "%P\n", &kv);
}
- btdone(&s);
+ btexit(t, &s);
}
Out:
closesnap(t);
--- a/dat.h
+++ b/dat.h
@@ -593,11 +593,11 @@
struct Scan {
vlong offset; /* last read offset */
- Tree root;
- int done;
- int overflow;
- int present;
+ char first;
+ char done;
+ char overflow;
+ char present;
Kvp kv;
Key pfx;
char kvbuf[Kvmax];
--- a/dump.c
+++ b/dump.c
@@ -372,16 +372,12 @@
showsnap(int fd, char **ap, int na)
{
char *e, pfx[Snapsz];
- int sz, done;
Mount *mnt;
vlong id;
- Scan *s;
+ Scan s;
+ int sz;
Tree t;
- if((s = mallocz(sizeof(Scan), 1)) == nil){
- fprint(fd, "no memory\n");
- return;
- }
pfx[0] = Ksnap;
sz = 1;
if(na != 0){
@@ -389,25 +385,27 @@
id = atoll(ap[0]);
PACK64(pfx+1, id);
}
- if((e = btscan(&fs->snap, s, pfx, sz)) != nil){
+ btnewscan(&s, pfx, sz);
+ if((e = btenter(&fs->snap, &s)) != nil){
fprint(fd, "scan: %s\n", e);
- btdone(s);
+ btexit(&fs->snap, &s);
return;
}
while(1){
- if((e = btnext(s, &s->kv, &done)) != nil){
+ if((e = btnext(&fs->snap, &s, &s.kv)) != nil){
fprint(fd, "scan: %s\n", e);
break;
}
- if(done)
+ if(s.done)
break;
- fprint(fd, "snap: %P\n", &s->kv);
- if(unpacktree(&t, s->kv.v, s->kv.nv) == nil){
+ fprint(fd, "snap: %P\n", &s.kv);
+ if(unpacktree(&t, s.kv.v, s.kv.nv) == nil){
fprint(fd, "unpack: garbled tree\n");
break;
}
showtreeroot(fd, &t);
}
+ btexit(&fs->snap, &s);
qlock(&fs->snaplk);
for(mnt = fs->mounts; mnt != nil; mnt = mnt->next){
fprint(fd, "open: %s\n", mnt->name);
--- a/fns.h
+++ b/fns.h
@@ -84,9 +84,10 @@
char* btupsert(Tree*, Msg*, int);
char* btlookup(Tree*, Key*, Kvp*, char*, int);
-char* btscan(Tree*, Scan*, char*, int);
-char* btnext(Scan*, Kvp*, int*);
-void btdone(Scan*);
+void btnewscan(Scan*, char*, int);
+char* btenter(Tree*, Scan*);
+char* btnext(Tree*, Scan*, Kvp*);
+void btexit(Tree*, Scan*);
int checkflag(Blk *b, int);
void setflag(Blk *b, int);
--- a/fs.c
+++ b/fs.c
@@ -1316,7 +1316,6 @@
}
lock(f);
if(f->scan != nil){
- btdone(f->scan);
free(f->scan);
f->scan = nil;
}
@@ -1460,25 +1459,20 @@
candelete(Fid *f)
{
char *e, pfx[Dpfxsz];
- int done;
- Scan *s;
+ Scan s;
if(f->dent->qid.type == QTFILE)
return nil;
- if((s = mallocz(sizeof(Scan), 1)) == nil)
- return Enomem;
-
packdkey(pfx, sizeof(pfx), f->qpath, nil);
- if((e = btscan(f->mnt->root, s, pfx, sizeof(pfx))) != nil)
+ btnewscan(&s, pfx, sizeof(pfx));
+ if((e = btenter(f->mnt->root, &s)) != nil)
goto Out;
- done = 0;
- if((e = btnext(s, &s->kv, &done)) != nil)
+ if((e = btnext(f->mnt->root, &s, &s.kv)) != nil)
goto Out;
- if(!done)
+ if(s.done)
e = Enempty;
Out:
- btdone(s);
- free(s);
+ btexit(f->mnt->root, &s);
return e;
}
@@ -1655,7 +1649,7 @@
readsnap(Fmsg *m, Fid *f, Fcall *r)
{
char pfx[1], *p, *e;
- int n, ns, done;
+ int n, ns;
Scan *s;
Xdir d;
@@ -1666,15 +1660,9 @@
if((s = mallocz(sizeof(Scan), 1)) == nil)
return Enomem;
pfx[0] = Klabel;
- if((e = btscan(&fs->snap, s, pfx, 1)) != nil){
- btdone(s);
- free(s);
- return e;
- }
-
+ btnewscan(s, pfx, 1);
lock(f);
if(f->scan != nil){
- btdone(f->scan);
free(f->scan);
}
f->scan = s;
@@ -1699,10 +1687,12 @@
p += ns;
n -= ns;
}
+ if((e = btenter(&fs->snap, s)) != nil)
+ return e;
while(1){
- if((e = btnext(s, &s->kv, &done)) != nil)
+ if((e = btnext(&fs->snap, s, &s->kv)) != nil)
return e;
- if(done)
+ if(s->done)
break;
memcpy(d.name, s->kv.k+1, s->kv.nk-1);
d.name[s->kv.nk-1] = 0;
@@ -1714,6 +1704,7 @@
p += ns;
n -= ns;
}
+ btexit(&fs->snap, s);
r->count = p - r->data;
return nil;
}
@@ -1722,10 +1713,12 @@
readdir(Fmsg *m, Fid *f, Fcall *r)
{
char pfx[Dpfxsz], *p, *e;
- int n, ns, done;
+ int n, ns;
+ Tree *t;
Scan *s;
s = f->scan;
+ t = f->mnt->root;
if(s != nil && s->offset != 0 && s->offset != m->offset)
return Edscan;
if(s == nil || m->offset == 0){
@@ -1733,17 +1726,10 @@
return Enomem;
packdkey(pfx, sizeof(pfx), f->qpath, nil);
- if((e = btscan(f->mnt->root, s, pfx, sizeof(pfx))) != nil){
- btdone(s);
- free(s);
- return e;
- }
-
+ btnewscan(s, pfx, sizeof(pfx));
lock(f);
- if(f->scan != nil){
- btdone(f->scan);
+ if(f->scan != nil)
free(f->scan);
- }
f->scan = s;
unlock(f);
}
@@ -1762,10 +1748,12 @@
p += ns;
n -= ns;
}
+ if((e = btenter(t, s)) != nil)
+ return e;
while(1){
- if((e = btnext(s, &s->kv, &done)) != nil)
+ if((e = btnext(t, s, &s->kv)) != nil)
return e;
- if(done)
+ if(s->done)
break;
if((ns = kv2statbuf(&s->kv, p, n)) == -1){
s->overflow = 1;
@@ -1774,6 +1762,7 @@
p += ns;
n -= ns;
}
+ btexit(t, s);
r->count = p - r->data;
return nil;
}
--- a/tree.c
+++ b/tree.c
@@ -1362,16 +1362,10 @@
return err;
}
-char*
-btscan(Tree *t, Scan *s, char *pfx, int npfx)
+void
+btnewscan(Scan *s, char *pfx, int npfx)
{
- int i, same;
- Scanp *p;
- Bptr bp;
- Blk *b;
- Msg m;
- Kvp v;
-
+ s->first = 1;
s->done = 0;
s->offset = 0;
s->pfx.k = s->pfxbuf;
@@ -1381,20 +1375,29 @@
s->kv.v = s->kvbuf+npfx;
s->kv.nv = 0;
cpkey(&s->kv, &s->pfx, s->kvbuf, sizeof(s->kvbuf));
+}
- lock(&t->lk);
- s->root = *t;
- unlock(&t->lk);
- if((s->path = calloc(s->root.ht, sizeof(Scanp))) == nil){
- free(s);
+char*
+btenter(Tree *t, Scan *s)
+{
+ int i, same;
+ Scanp *p;
+ Bptr bp;
+ Blk *b;
+ Msg m;
+ Kvp v;
+
+ if(s->done)
return nil;
+ if((s->path = calloc(t->ht, sizeof(Scanp))) == nil){
+ free(s);
+ return Enomem;
}
-
p = s->path;
- if((b = getblk(s->root.bp, 0)) == nil)
+ if((b = getblk(t->bp, 0)) == nil)
return Eio;
p[0].b = b;
- for(i = 0; i < s->root.ht; i++){
+ for(i = 0; i < t->ht; i++){
p[i].vi = blksearch(b, &s->kv, &v, &same);
if(b->type == Tpivot){
if(p[i].vi == -1)
@@ -1406,14 +1409,15 @@
if((b = getblk(bp, 0)) == nil)
return Eio;
p[i+1].b = b;
- }else if(p[i].vi == -1 || !same)
+ }else if(p[i].vi == -1 || !same || !s->first)
p[i].vi++;
}
+ s->first = 0;
return nil;
}
char *
-btnext(Scan *s, Kvp *r, int *done)
+btnext(Tree *t, Scan *s, Kvp *r)
{
int i, j, h, ok, start, srcbuf;
Scanp *p;
@@ -1424,16 +1428,17 @@
/* load up the correct blocks for the scan */
Again:
p = s->path;
- h = s->root.ht;
- *done = 0;
+ h = t->ht;
start = h;
srcbuf = -1;
+ if(s->done)
+ return nil;
for(i = h-1; i >= 0; i--){
if(p[i].b != nil
&&(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf))
break;
if(i == 0){
- *done = 1;
+ s->done = 1;
return nil;
}
if(p[i].b != nil)
@@ -1472,7 +1477,7 @@
}
}
if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
- *done = 1;
+ s->done = 1;
return nil;
}
@@ -1498,11 +1503,11 @@
}
void
-btdone(Scan *s)
+btexit(Tree *t, Scan *s)
{
int i;
- for(i = 0; i < s->root.ht; i++)
+ for(i = 0; i < t->ht; i++)
dropblk(s->path[i].b);
free(s->path);
}