shithub: gefs

ref: 1dc33dd936c0a90dc06dd3286698df8b4d770ad4
dir: /snap.c/

#include <u.h>
#include <libc.h>
#include <fcall.h>
#include <avl.h>

#include "atomic.h"
#include "dat.h"
#include "fns.h"

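/*
 * Flushes a deadlist's pending insertion block, queueing it
 * for writing and recording the updated list in the snapshot
 * tree.
 */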
static void
dlflush(Dlist *dl)
{
	char kvbuf[512];
	Msg m;

	if(dl->ins == nil)
		return;
	traceb("dlflush", dl->ins->bp);
	enqueue(dl->ins);
	dropblk(dl->ins);
	dl->hd = dl->ins->bp;
	if(dl->tl.addr == dl->hd.addr)
		dl->tl = dl->hd;
	dl->ins = nil;
	/* special case: the snap dlist has gen -1, skip it */
	if(dl->gen != -1){
		m.op = Oinsert;
		dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
		btupsert(&fs->snap, &m, 1);
	}
}

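/*
 * Unlinks a deadlist from the cache's LRU list; if hdel is
 * set, it is also removed from its hash chain.
 */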
static void
dlcachedel(Dlist *dl, int hdel)
{
	uint h;
	Dlist *d, **p;

	h = ihash(dl->gen) ^ ihash(dl->bgen);
	if(hdel){
		p = &fs->dlcache[h % fs->dlcmax];
		for(d = *p; d != nil; d = d->chain){
			if(d->gen == dl->gen && d->bgen == dl->bgen)
				break;
			p = &d->chain;
		}
		if(d != nil)
			*p  = d->chain;
	}
	if(dl == fs->dlhead)
		fs->dlhead = dl->cnext;
	if(dl == fs->dltail)
		fs->dltail = dl->cprev;
	if(dl->cnext != nil)
		dl->cnext->cprev = dl->cprev;
	if(dl->cprev != nil)
		dl->cprev->cnext = dl->cnext;
	dl->cnext = nil;
	dl->cprev = nil;
}

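/*
 * Looks up a deadlist in the cache by (gen, bgen); if found,
 * it is unlinked from the LRU list (but left in the hash
 * chain) and returned.
 */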
static Dlist*
dlcacheget(vlong gen, vlong bgen)
{
	Dlist *dl;
	uint h;

	h = ihash(gen) ^ ihash(bgen);
	for(dl = fs->dlcache[h % fs->dlcmax]; dl != nil; dl = dl->chain)
		if(dl->gen == gen && dl->bgen == bgen)
			break;
	if(dl != nil)
		dlcachedel(dl, 0);
	return dl;
}

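/*
 * Returns the deadlist for (gen, bgen), taking it from the
 * cache if present, loading it from the snapshot tree if not,
 * and creating a fresh empty list if it does not exist at all.
 */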
static Dlist*
getdl(vlong gen, vlong bgen)
{
	char kbuf[Dlksz], kvbuf[Dlkvpsz];
	Dlist *dl, **p;
	uint h;
	Msg m;
	Kvp kv;
	Key k;

	if((dl = dlcacheget(gen, bgen)) != nil)
		return dl;
	dl = emalloc(sizeof(Dlist), 1);
	if(waserror()){
		free(dl);
		nexterror();
	}
	kbuf[0] = Kdlist;
	PACK64(kbuf+1, gen);
	PACK64(kbuf+9, bgen);
	k.k = kbuf;
	k.nk = sizeof(kbuf);

	/* load up existing dlist */
	if(btlookup(&fs->snap, &k, &kv, kvbuf, sizeof(kvbuf))){
		kv2dlist(&kv, dl);
		goto Found;
	}

	/* create a new one if it didn't exist */
	dl->gen = gen;
	dl->bgen = bgen;
	dl->hd.addr = -1;
	dl->tl.addr = -1;
	dl->ins = nil;

	m.op = Oinsert;
	dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
	btupsert(&fs->snap, &m, 1);
Found:
	poperror();
	h = ihash(gen) ^ ihash(bgen);
	p = &fs->dlcache[h % fs->dlcmax];
	dl->chain = *p;
	*p = dl;
	return dl;
}

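/*
 * Releases a deadlist back to the head of the cache LRU,
 * flushing and evicting the oldest entries while the cache
 * is over its limit.
 */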
void
putdl(Dlist *dl)
{
	Dlist *dt;

	if(dl->gen == -1)
		return;
	dlcachedel(dl, 0);
	while(fs->dltail != nil && fs->dlcount >= fs->dlcmax){
		dt = fs->dltail;
		dlflush(dt);
		dlcachedel(dt, 1);
		dropblk(dt->ins);
		free(dt);
	}

	dl->cprev = nil;
	dl->cnext = fs->dlhead;
	if(fs->dltail == nil)
		fs->dltail = dl;
	if(fs->dlhead != nil)
		fs->dlhead->cprev = dl;
	fs->dlhead = dl;
}

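/*
 * Frees a deadlist: its entry is deleted from the snapshot
 * tree, and its chain of log blocks is walked, queueing each
 * log block (and, if docontents is set, every block address
 * it records) to the owning arena for freeing.
 */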
void
freedl(Dlist *dl, int docontents)
{
	char buf[Kvmax];
	Arena *a;
	Qent qe;
	Bptr bp;
	Msg m;
	Blk *b;
	char *p;

	bp = dl->hd;
	if(dl->gen != -1){
		m.op = Odelete;
		dlist2kv(dl, &m, buf, sizeof(buf));
		btupsert(&fs->snap, &m, 1);
	}
	while(bp.addr != -1){
		b = getblk(bp, 0);
		/*
		 * Because these deadlists are dead-dead at this point,
		 * they'll never be read from again; we can avoid worrying
		 * about deferred reclamation, and queue them up to be freed
		 * directly, which means we don't need to worry about waiting
		 * for a quiescent state, and the associated out-of-block
		 * deadlocks that come with it.
		 */
		if(docontents){
			for(p = b->data; p != b->data+b->logsz; p += 8){
				qe.op = Qfree;
				qe.bp.addr = UNPACK64(p);
				qe.bp.hash = -1;
				qe.bp.gen = -1;
				qe.b = nil;
				a = getarena(qe.bp.addr);
				qput(a->sync, qe);
				traceb("dlclear", qe.bp);
			}
		}
		bp = b->logp;
		qe.op = Qfree;
		qe.bp = b->bp;
		qe.b = b;
		a = getarena(qe.bp.addr);
		qput(a->sync, qe);
		traceb("dlfreeb", qe.bp);
	}
}

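/*
 * Merges the deadlist (gen, bgen) into (merge, bgen), either
 * by moving it over wholesale or by chaining its log onto the
 * destination's tail block, and updates both entries in the
 * snapshot tree.
 */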
static void
mergedl(vlong merge, vlong gen, vlong bgen)
{
	char buf[2][Kvmax];
	Dlist *d, *m;
	Msg msg[2];
	Blk *b;

	d = nil;
	m = nil;
	if(waserror()){
		putdl(m);
		putdl(d);
		nexterror();
	}
	d = getdl(merge, bgen);
	m = getdl(gen, bgen);
	assert(d != m);
	/*
	 * If the dest dlist is empty, just
	 * move the source dlist over and be
	 * done with it; otherwise chain it
	 * onto the existing dlist's tail.
	 */
	if(d->hd.addr == -1){
		assert(d->ins == nil);
		d->hd = m->hd;
		d->tl = m->tl;
		d->ins = m->ins;
		if(d->ins != nil)
			holdblk(d->ins);
	}else{
		if(m->ins != nil){
			enqueue(m->ins);
			dropblk(m->ins);
			m->ins = nil;
		}
		b = getblk(d->tl, 0);
		b->logp = m->hd;
		assert(d->hd.addr != m->hd.addr);
		finalize(b);
		syncblk(b);
		dropblk(b);
	}
	msg[0].op = Odelete;
	dlist2kv(m, &msg[0], buf[0], sizeof(buf[0]));
	msg[1].op = Oinsert;
	dlist2kv(d, &msg[1], buf[1], sizeof(buf[1]));
	btupsert(&fs->snap, msg, 2);
	putdl(m);
	putdl(d);
	poperror();
}

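/*
 * Reclaims blocks belonging to a deleted snapshot: deadlists
 * whose blocks were born no later than the predecessor are
 * merged into the successor (or, with no successor, into the
 * predecessor), and the rest are freed outright; the
 * successor's deadlists for blocks born after the predecessor
 * are then freed as well.
 */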
static void
reclaimblocks(vlong gen, vlong succ, vlong prev)
{
	char pfx[9];
	Dlist dl;
	Scan s;

	pfx[0] = Kdlist;
	PACK64(pfx+1, gen);
	btnewscan(&s, pfx, sizeof(pfx));
	btenter(&fs->snap, &s);
	while(1){
		if(!btnext(&s, &s.kv))
			break;
		kv2dlist(&s.kv, &dl);

		if(succ != -1 && dl.bgen <= prev)
			mergedl(succ, dl.gen, dl.bgen);
		else if(dl.bgen <= prev)
			mergedl(prev, dl.gen, dl.bgen);
		else
			freedl(&dl, 1);
	}
	btexit(&s);
	if(succ != -1){
		pfx[0] = Kdlist;
		PACK64(pfx+1, succ);
		btnewscan(&s, pfx, sizeof(pfx));
		btenter(&fs->snap, &s);
		while(1){
			if(!btnext(&s, &s.kv))
				break;
			kv2dlist(&s.kv, &dl);
			if(dl.bgen > prev)
				freedl(&dl, 1);
		}
		btexit(&s);
	}
}

/*
 * Removes a label from a snapshot, allowing
 * it to be reclaimed if it is not the direct
 * predecessor of more than one other snapshot.
 *
 * If it has a single successor and no remaining
 * labels, it is merged into that successor.
 */
void
delsnap(Tree *t, vlong succ, char *name)
{
	char *p, buf[4][Kvmax];
	int nm, deltree;
	Mount *mnt;
	Msg m[4];

	nm = 0;
	deltree = 0;
	if(name != nil){
		if(strcmp(name, "dump") == 0
		|| strcmp(name, "empty") == 0
		|| strcmp(name, "adm") == 0)
			error(Ename);

		m[nm].op = Odelete;
		m[nm].k = buf[nm];
		p = packlbl(buf[nm], sizeof(buf[nm]), name);
		m[nm].nk = p - m[nm].k;
		m[nm].v = nil;
		m[nm].nv = 0;
		t->nlbl--;
		nm++;
	}
 
	if(t->nlbl == 0 && t->nref <= 1){
		deltree = 1;
		m[nm].op = Orelink;
		retag2kv(t->pred, succ, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
		nm++;
		if(t->succ != -1){
			m[nm].op = Oreprev;
			retag2kv(t->succ, t->pred, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
			nm++;
		}
		m[nm].op = Odelete;
		m[nm].k = buf[nm];
		p = packsnap(buf[nm], sizeof(buf[nm]), t->gen);
		m[nm].nk = p - m[nm].k;
		m[nm].v = nil;
		m[nm].nv = 0;
		nm++;
	}
	assert(nm <= nelem(m));
	dlsync();
	btupsert(&fs->snap, m, nm);
	if(deltree){
		reclaimblocks(t->gen, succ, t->pred);
		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
			if(mnt->root->gen == t->succ)
				mnt->root->pred = t->pred;
			if(mnt->root->gen == t->pred)
				mnt->root->succ = t->succ;
		}
	}
}

/*
 * Attaches a label to a tree, incrementing its
 * reference count; with the Lmut flag, a new mutable
 * snapshot derived from the tree is created and
 * labelled instead. Labelled snapshots show up in
 * the dump.
 */
void
tagsnap(Tree *t, char *name, int flg)
{
	char buf[3][Kvmax];
	Msg m[3];
	Tree *n;
	int i;

	if(strcmp(name, "dump") == 0
	|| strcmp(name, "empty") == 0
	|| strcmp(name, "adm") == 0)
		error(Ename);

	i = 0;
	n = nil;
	if(waserror()){
		free(n);
		nexterror();
	}
	if(flg & Lmut){
		n = emalloc(sizeof(Tree), 1);
		n->memref = 1;
		n->dirty = 0;
		n->nlbl = 1;
		n->nref = 0;
		n->ht = t->ht;
		n->bp = t->bp;
		n->succ = -1;
		n->pred = t->gen;
		n->base = t->gen;
		n->gen = aincv(&fs->nextgen, 1);
		n->memgen = aincv(&fs->nextgen, 1);

		t->nref++;
		m[i].op = Orelink;
		retag2kv(t->gen, t->succ, 0, 1, &m[i], buf[i], sizeof(buf[i]));
		i++;
		m[i].op = Oinsert;
		lbl2kv(name, n->gen, flg, &m[i], buf[i], sizeof(buf[i]));
		i++;
		m[i].op = Oinsert;
		tree2kv(n, &m[i], buf[i], sizeof(buf[i]));
		i++;
	}else{
		t->nlbl++;
		m[i].op = Orelink;
		retag2kv(t->gen, t->succ, 1, 0, &m[i], buf[i], sizeof(buf[i]));
		i++;

		m[i].op = Oinsert;
		t->pred = t->gen;
		t->nlbl++;
		lbl2kv(name, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
		i++;
	}
	btupsert(&fs->snap, m, i);
	poperror();
	free(n);
}

/*
 * Updates a snapshot; keeps the generation the same if possible,
 * otherwise moves to a new generation. A snapshot may only stay
 * at the same generation as long as it is at the tip of a snapshot
 * list; once it's observable by a derived snapshot it must be
 * immutable.
 */
void
updatesnap(Tree **r, Tree *o, char *lbl)
{
	char buf[4][Kvmax];
	Msg m[4];
	Tree *t;
	int i;

	if(!o->dirty)
		return;

	traceb("updatesnap", o->bp);
	/* update the old kvp */
	o->nlbl--;
	o->nref++;

	/* create the new one */

	t = emalloc(sizeof(Tree), 1);
	if(waserror()){
		free(t);
		nexterror();
	}
	t->memref = 1;
	t->dirty = 0;

	t->nlbl = 1;
	t->nref = 0;
	t->ht = o->ht;
	t->bp = o->bp;
	t->succ = -1;
	t->base = o->base;
	t->gen = o->memgen;
	t->memgen = aincv(&fs->nextgen, 1);

	i = 0;
	m[i].op = Orelink;
	if(o->nlbl == 0 && o->nref == 1){
		t->pred = o->pred;
		retag2kv(t->pred, t->gen, 0, 0, &m[i], buf[i], sizeof(buf[i]));
	}else{
		t->pred = o->gen;
		retag2kv(t->pred, t->gen, -1, 1, &m[i], buf[i], sizeof(buf[i]));
	}
	i++;

	m[i].op = Oinsert;
	tree2kv(t, &m[i], buf[i], sizeof(buf[i]));
	i++;
	m[i].op = Oinsert;
	lbl2kv(lbl, t->gen, Lmut, &m[i], buf[i], sizeof(buf[i]));
	i++;
	btupsert(&fs->snap, m, i);

	/* only update the dirty status after we sync */
	o->dirty = 0;

	/* this was the last ref to the snap */
	if(o->nlbl == 0 && o->nref == 1)
		delsnap(o, t->gen, nil);
	closesnap(o);
	asetp(r, t);
	poperror();
}

/*
 * Opens a snapshot by label, returning its in-memory tree, or
 * nil if no such label exists; the label's flags are returned
 * through flg when it is non-nil.
 */
Tree*
opensnap(char *label, int *flg)
{
	char *p, buf[Kvmax];
	Tree *t;
	vlong gen;
	Kvp kv;
	Key k;

	/* Klabel{"name"} => Ksnap{id} */
	if((p = packlbl(buf, sizeof(buf), label)) == nil)
		return nil;
	k.k = buf;
	k.nk = p - buf;
	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
		return nil;
	assert(kv.nv == 1+8+4);
	gen = UNPACK64(kv.v + 1);
	if(flg != nil)
		*flg = UNPACK32(kv.v + 1+8);

	t = mallocz(sizeof(Tree), 1);
	if(waserror()){
		free(t);
		nexterror();
	}
	p = packsnap(buf, sizeof(buf), gen);
	k.k = buf;
	k.nk = p - buf;
	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
		broke(Efs);
	unpacktree(t, kv.v, kv.nv);
	t->memref = 1;
	t->memgen = aincv(&fs->nextgen, 1);
	poperror();
	return t;
}

/*
 * Closes a snapshot, dropping a reference to it; when the
 * last reference is dropped, the in-memory tree is queued
 * for deferred freeing.
 */
void
closesnap(Tree *t)
{
	Bfree *f;

	if(t == nil || adec(&t->memref) != 0)
		return;
	f = malloc(sizeof(Bfree));
	f->op = DFtree;
	f->t = t;
	limbo(f);
}
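
/*
 * Illustrative only: a caller typically pairs opensnap and
 * closesnap around use of the tree, along these lines (label
 * name assumed, declarations and error handling elided):
 *
 *	t = opensnap("main", &flg);
 *	if(t == nil)
 *		return;
 *	...use t...
 *	closesnap(t);
 */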

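/*
 * Flushes the global snap deadlist and every deadlist on the
 * cache LRU, queueing their pending insertion blocks for
 * writing.
 */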
void
dlsync(void)
{
	Dlist *dl, *n;

	tracem("dlsync");
	dlflush(&fs->snapdl);
	for(dl = fs->dlhead; dl != nil; dl = n){
		n = dl->cnext;
		dlflush(dl);
	}
}

/*
 * Marks a block as killed by the tree
 * t, which means that it will be free
 * for use after t is reclaimed.
 *
 * t must be an active snapshot with
 * no successors.
 */
void
killblk(Tree *t, Bptr bp)
{
	Dlist *dl;
	Blk *b;
	char *p;

	/*
	 * When we have a forked snap, blocks allocated before the fork
	 * are the responsibility of the other chain; in this chain we
	 * leak them and let the last reference in the other chain clean
	 * them up.
	 */
	if(t == &fs->snap)
		dl = &fs->snapdl;
	else if(bp.gen > t->base)
		dl = getdl(t->memgen, bp.gen);
	else
		return;
	if(waserror()){
		putdl(dl);
		nexterror();
	}
	if(dl->ins == nil || Logspc - dl->ins->logsz < Logslop){
		b = newblk(&fs->snap, Tdlist, 0);
		if(dl->ins != nil){
			enqueue(dl->ins);
			dropblk(dl->ins);
		}
		if(dl->tl.addr == -1)
			dl->tl = b->bp;
		b->logp = dl->hd;
		dl->hd = b->bp;
		dl->ins = b;
		cacheins(b);
	}
	p = dl->ins->data + dl->ins->logsz;
	dl->ins->logsz += 8;
	setflag(dl->ins, Bdirty);
	PACK64(p, bp.addr);
	poperror();
	putdl(dl);
}