shithub: gefs

Download patch

ref: fa5ed8b61d58e141790c263eba6caea2c67945b8
parent: bfd636c4beff968fb3d246cf36ebb6853ef5d509
author: Ori Bernstein <ori@eigenstate.org>
date: Sun Nov 6 10:10:35 EST 2022

tree: avoid scanning freed blocks

we used to take a reference to the tree root when we started walking, and
keep it around while the scan was still running; there's a problem when
we do that, since the tree can be updated under us.

we have 3 options to deal with this:

1. we can pin the epoch as long as a
   scan is attached to a fid in a reader, but this can
   lead to running out of blocks in the cache when
   dealing with slow readers, which could lead to a
   denial of service, and maybe a deadlock where the
   reader can't make progress until the freelist is
   processed.

2. we can take a snapshot of the file system when we
   start listing a directory; this may be an option.

3. the one implemented here: just let the tree change
   under us, and rewalk on every Tread on a directory.

--- a/cons.c
+++ b/cons.c
@@ -184,7 +184,6 @@
 	Key k;
 	vlong pqid;
 	Scan s;
-	int done;
 
 	if((t = openlabel("main")) == nil){
 		fprint(fd, "could not open main snap\n");
@@ -205,21 +204,22 @@
 		}
 		fprint(fd, "%P\n", &kv);
 	}else{
-		if((e = btscan(t, &s, k.k, k.nk)) != nil){
+		btnewscan(&s, k.k, k.nk);
+		if((e = btenter(t, &s)) != nil){
 			fprint(fd, "scan failed: %s\n", e);
 			goto Out;
 		}
 		while(1){
-			if((e = btnext(&s, &kv, &done)) != nil){
+			if((e = btnext(t, &s, &kv)) != nil){
 				fprint(fd, "scan failed: %s\n", e);
-				btdone(&s);
+				btexit(t, &s);
 				goto Out;
 			}
-			if(done)
+			if(s.done)
 				break;
 			fprint(fd, "%P\n", &kv);
 		}
-		btdone(&s);
+		btexit(t, &s);
 	}
 Out:
 	closesnap(t);
--- a/dat.h
+++ b/dat.h
@@ -593,11 +593,11 @@
 
 struct Scan {
 	vlong	offset;	/* last read offset */
-	Tree	root;
 
-	int	done;
-	int	overflow;
-	int	present;
+	char	first;
+	char	done;
+	char	overflow;
+	char	present;
 	Kvp	kv;
 	Key	pfx;
 	char	kvbuf[Kvmax];
--- a/dump.c
+++ b/dump.c
@@ -372,16 +372,12 @@
 showsnap(int fd, char **ap, int na)
 {
 	char *e, pfx[Snapsz];
-	int sz, done;
 	Mount *mnt;
 	vlong id;
-	Scan *s;
+	Scan s;
+	int sz;
 	Tree t;
 
-	if((s = mallocz(sizeof(Scan), 1)) == nil){
-		fprint(fd, "no memory\n");
-		return;
-	}
 	pfx[0] = Ksnap;
 	sz = 1;
 	if(na != 0){
@@ -389,25 +385,27 @@
 		id = atoll(ap[0]);
 		PACK64(pfx+1, id);
 	}
-	if((e = btscan(&fs->snap, s, pfx, sz)) != nil){
+	btnewscan(&s, pfx, sz);
+	if((e = btenter(&fs->snap, &s)) != nil){
 		fprint(fd, "scan: %s\n", e);
-		btdone(s);
+		btexit(&fs->snap, &s);
 		return;
 	}
 	while(1){
-		if((e = btnext(s, &s->kv, &done)) != nil){
+		if((e = btnext(&fs->snap, &s, &s.kv)) != nil){
 			fprint(fd, "scan: %s\n", e);
 			break;
 		}
-		if(done)
+		if(s.done)
 			break;
-		fprint(fd, "snap: %P\n", &s->kv);
-		if(unpacktree(&t, s->kv.v, s->kv.nv) == nil){
+		fprint(fd, "snap: %P\n", &s.kv);
+		if(unpacktree(&t, s.kv.v, s.kv.nv) == nil){
 			fprint(fd, "unpack: garbled tree\n");
 			break;
 		}
 		showtreeroot(fd, &t);
 	}
+	btexit(&fs->snap, &s);
 	qlock(&fs->snaplk);
 	for(mnt = fs->mounts; mnt != nil; mnt = mnt->next){
 		fprint(fd, "open: %s\n", mnt->name);
--- a/fns.h
+++ b/fns.h
@@ -84,9 +84,10 @@
 
 char*	btupsert(Tree*, Msg*, int);
 char*	btlookup(Tree*, Key*, Kvp*, char*, int);
-char*	btscan(Tree*, Scan*, char*, int);
-char*	btnext(Scan*, Kvp*, int*);
-void	btdone(Scan*);
+void	btnewscan(Scan*, char*, int);
+char*	btenter(Tree*, Scan*);
+char*	btnext(Tree*, Scan*, Kvp*);
+void	btexit(Tree*, Scan*);
 
 int	checkflag(Blk *b, int);
 void	setflag(Blk *b, int);
--- a/fs.c
+++ b/fs.c
@@ -1316,7 +1316,6 @@
 	}
 	lock(f);
 	if(f->scan != nil){
-		btdone(f->scan);
 		free(f->scan);
 		f->scan = nil;
 	}
@@ -1460,25 +1459,20 @@
 candelete(Fid *f)
 {
 	char *e, pfx[Dpfxsz];
-	int done;
-	Scan *s;
+	Scan s;
 
 	if(f->dent->qid.type == QTFILE)
 		return nil;
-	if((s = mallocz(sizeof(Scan), 1)) == nil)
-		return Enomem;
-
 	packdkey(pfx, sizeof(pfx), f->qpath, nil);
-	if((e = btscan(f->mnt->root, s, pfx, sizeof(pfx))) != nil)
+	btnewscan(&s, pfx, sizeof(pfx));
+	if((e = btenter(f->mnt->root, &s)) != nil)
 		goto Out;
-	done = 0;
-	if((e = btnext(s, &s->kv, &done)) != nil)
+	if((e = btnext(f->mnt->root, &s, &s.kv)) != nil)
 		goto Out;
-	if(!done)
+	if(s.done)
 		e = Enempty;
 Out:
-	btdone(s);
-	free(s);
+	btexit(f->mnt->root, &s);
 	return e;
 }
 
@@ -1655,7 +1649,7 @@
 readsnap(Fmsg *m, Fid *f, Fcall *r)
 {
 	char pfx[1], *p, *e;
-	int n, ns, done;
+	int n, ns;
 	Scan *s;
 	Xdir d;
 
@@ -1666,15 +1660,9 @@
 		if((s = mallocz(sizeof(Scan), 1)) == nil)
 			return Enomem;
 		pfx[0] = Klabel;
-		if((e = btscan(&fs->snap, s, pfx, 1)) != nil){
-			btdone(s);
-			free(s);
-			return e;
-		}
-
+		btnewscan(s, pfx, 1);
 		lock(f);
 		if(f->scan != nil){
-			btdone(f->scan);
 			free(f->scan);
 		}
 		f->scan = s;
@@ -1699,10 +1687,12 @@
 		p += ns;
 		n -= ns;
 	}
+	if((e = btenter(&fs->snap, s)) != nil)
+		return e;
 	while(1){
-		if((e = btnext(s, &s->kv, &done)) != nil)
+		if((e = btnext(&fs->snap, s, &s->kv)) != nil)
 			return e;
-		if(done)
+		if(s->done)
 			break;
 		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
 		d.name[s->kv.nk-1] = 0;
@@ -1714,6 +1704,7 @@
 		p += ns;
 		n -= ns;
 	}
+	btexit(&fs->snap, s);
 	r->count = p - r->data;
 	return nil;
 }
@@ -1722,10 +1713,12 @@
 readdir(Fmsg *m, Fid *f, Fcall *r)
 {
 	char pfx[Dpfxsz], *p, *e;
-	int n, ns, done;
+	int n, ns;
+	Tree *t;
 	Scan *s;
 
 	s = f->scan;
+	t = f->mnt->root;
 	if(s != nil && s->offset != 0 && s->offset != m->offset)
 		return Edscan;
 	if(s == nil || m->offset == 0){
@@ -1733,17 +1726,10 @@
 			return Enomem;
 
 		packdkey(pfx, sizeof(pfx), f->qpath, nil);
-		if((e = btscan(f->mnt->root, s, pfx, sizeof(pfx))) != nil){
-			btdone(s);
-			free(s);
-			return e;
-		}
-
+		btnewscan(s, pfx, sizeof(pfx));
 		lock(f);
-		if(f->scan != nil){
-			btdone(f->scan);
+		if(f->scan != nil)
 			free(f->scan);
-		}
 		f->scan = s;
 		unlock(f);
 	}
@@ -1762,10 +1748,12 @@
 		p += ns;
 		n -= ns;
 	}
+	if((e = btenter(t, s)) != nil)
+		return e;
 	while(1){
-		if((e = btnext(s, &s->kv, &done)) != nil)
+		if((e = btnext(t, s, &s->kv)) != nil)
 			return e;
-		if(done)
+		if(s->done)
 			break;
 		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
 			s->overflow = 1;
@@ -1774,6 +1762,7 @@
 		p += ns;
 		n -= ns;
 	}
+	btexit(t, s);
 	r->count = p - r->data;
 	return nil;
 }
--- a/tree.c
+++ b/tree.c
@@ -1362,16 +1362,10 @@
 	return err;
 }
 
-char*
-btscan(Tree *t, Scan *s, char *pfx, int npfx)
+void
+btnewscan(Scan *s, char *pfx, int npfx)
 {
-	int i, same;
-	Scanp *p;
-	Bptr bp;
-	Blk *b;
-	Msg m;
-	Kvp v;
-
+	s->first = 1;
 	s->done = 0;
 	s->offset = 0;
 	s->pfx.k = s->pfxbuf;
@@ -1381,20 +1375,29 @@
 	s->kv.v = s->kvbuf+npfx;
 	s->kv.nv = 0;
 	cpkey(&s->kv, &s->pfx, s->kvbuf, sizeof(s->kvbuf));
+}
 
-	lock(&t->lk);
-	s->root = *t;
-	unlock(&t->lk);
-	if((s->path = calloc(s->root.ht, sizeof(Scanp))) == nil){
-		free(s);
+char*
+btenter(Tree *t, Scan *s)
+{
+	int i, same;
+	Scanp *p;
+	Bptr bp;
+	Blk *b;
+	Msg m;
+	Kvp v;
+
+	if(s->done)
 		return nil;
+	if((s->path = calloc(t->ht, sizeof(Scanp))) == nil){
+		free(s);
+		return Enomem;
 	}
-
 	p = s->path;
-	if((b = getblk(s->root.bp, 0)) == nil)
+	if((b = getblk(t->bp, 0)) == nil)
 		return Eio;
 	p[0].b = b;
-	for(i = 0; i < s->root.ht; i++){
+	for(i = 0; i < t->ht; i++){
 		p[i].vi = blksearch(b, &s->kv, &v, &same);
 		if(b->type == Tpivot){
 			if(p[i].vi == -1)
@@ -1406,14 +1409,15 @@
 			if((b = getblk(bp, 0)) == nil)
 				return Eio;
 			p[i+1].b = b;
-		}else if(p[i].vi == -1 || !same)
+		}else if(p[i].vi == -1 || !same || !s->first)
 			p[i].vi++;
 	}
+	s->first = 0;
 	return nil;
 }
 
 char *
-btnext(Scan *s, Kvp *r, int *done)
+btnext(Tree *t, Scan *s, Kvp *r)
 {
 	int i, j, h, ok, start, srcbuf;
 	Scanp *p;
@@ -1424,16 +1428,17 @@
 	/* load up the correct blocks for the scan */
 Again:
 	p = s->path;
-	h = s->root.ht;
-	*done = 0;
+	h = t->ht;
 	start = h;
 	srcbuf = -1;
+	if(s->done)
+		return nil;
 	for(i = h-1; i >= 0; i--){
 		if(p[i].b != nil
 		&&(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf))
 			break;
 		if(i == 0){
-			*done = 1;
+			s->done = 1;
 			return nil;
 		}
 		if(p[i].b != nil)
@@ -1472,7 +1477,7 @@
 		}
 	}
 	if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
-		*done = 1;
+		s->done = 1;
 		return nil;
 	}
 
@@ -1498,11 +1503,11 @@
 }
 
 void
-btdone(Scan *s)
+btexit(Tree *t, Scan *s)
 {
 	int i;
 
-	for(i = 0; i < s->root.ht; i++)
+	for(i = 0; i < t->ht; i++)
 		dropblk(s->path[i].b);
 	free(s->path);
 }