ref: 6c35d0b041df30eacf7601d50d187adb90542b4d
dir: /blk.c/
#include <u.h> #include <libc.h> #include <fcall.h> #include <avl.h> #include "dat.h" #include "fns.h" typedef struct Range Range; struct Range { vlong off; vlong len; }; static vlong blkalloc_lk(Arena*); static vlong blkalloc(void); static int blkdealloc_lk(vlong); static Blk* initblk(vlong, int); QLock blklock; void setflag(Blk *b, int flg) { long ov, nv; while(1){ ov = b->flag; nv = ov | flg; if(cas(&b->flag, ov, nv)) break; } } void clrflag(Blk *b, int flg) { long ov, nv; while(1){ ov = b->flag; nv = ov & ~flg; if(cas(&b->flag, ov, nv)) break; } } Blk* readblk(vlong bp, int flg) { Blk *b; vlong off, rem, n; assert(bp != -1); if((b = mallocz(sizeof(Blk), 1)) == nil) return nil; off = bp; rem = Blksz; while(rem != 0){ n = pread(fs->fd, b->buf, rem, off); if(n <= 0){ free(b); return nil; } off += n; rem -= n; } b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0); b->bp.addr = bp; b->bp.hash = -1; b->bp.gen = -1; b->ref = 1; b->cnext = nil; b->cprev = nil; b->hnext = nil; b->data = b->buf + 10; switch(b->type){ default: if(flg&GBraw) break; fprint(2, "invalid block @%llx\n", bp); abort(); break; case Tarena: case Traw: case Tsuper: case Tlog: break; case Tpivot: b->nval = GBIT16(b->buf+2); b->valsz = GBIT16(b->buf+4); b->nbuf = GBIT16(b->buf+6); b->bufsz = GBIT16(b->buf+8); break; case Tleaf: b->nval = GBIT16(b->buf+2); b->valsz = GBIT16(b->buf+4); break; } return b; } static Arena* pickarena(void) { long n; n = (ainc(&fs->roundrobin)/1024) % fs->narena; return &fs->arenas[n]; } Arena* getarena(vlong b) { int i; i = b / fs->arenasz; if(i < 0 || i >= fs->narena){ werrstr("out of range block %lld", b); abort(); return nil; } return &fs->arenas[i]; } static int freerange(Avltree *t, vlong off, vlong len) { Arange *r, *s; assert(len % Blksz == 0); if((r = calloc(1, sizeof(Arange))) == nil) return -1; r->off = off; r->len = len; avlinsert(t, r); Again: s = (Arange*)avlprev(r); if(s != nil && s->off+s->len == r->off){ avldelete(t, r); s->len = s->len + r->len; free(r); r = s; goto Again; } s = (Arange*)avlnext(r); if(s != nil && r->off+r->len == s->off){ avldelete(t, r); s->off = r->off; s->len = s->len + r->len; free(r); r = s; goto Again; } return 0; } int grabrange(Avltree *t, vlong off, vlong len) { Arange *r, *s, q; vlong l; assert(len % Blksz == 0); q.off = off; q.len = len; r = (Arange*)avllookup(t, &q.Avl, -1); if(r == nil || off + len > r->off + r->len) abort(); if(off == r->off){ r->off += len; r->len -= len; }else if(off + len == r->off + r->len){ r->len -= len; }else if(off > r->off && off+len < r->off + r->len){ if((s = malloc(sizeof(Arange))) == nil) return -1; l = r->len; s->off = off + len; r->len = off - r->off; s->len = l - r->len - len; avlinsert(t, s); }else abort(); if(r->len == 0){ avldelete(t, r); free(r); } return 0; } int logappend(Oplog *ol, Arena *a, vlong off, vlong len, vlong val, int op) { Blk *pb, *lb; vlong o; char *p; assert(off % Blksz == 0); assert(op == LogAlloc || op == LogFree || op == LogDead); lb = ol->tail; if(lb == nil || lb->logsz > Logspc - 8){ pb = lb; if(a == nil) o = blkalloc(); else o = blkalloc_lk(a); if(o == -1) return -1; if((lb = initblk(o, Tlog)) == nil) return -1; lb->logsz = Loghdsz; p = lb->data + lb->logsz; PBIT64(p, (uvlong)LogEnd); finalize(lb); if(syncblk(lb) == -1){ free(lb); return -1; } ol->tail = lb; if(pb != nil){ p = pb->data + pb->logsz; PBIT64(p + 0, lb->bp.addr|LogChain); finalize(pb); if(syncblk(pb) == -1) return -1; } } if(len == Blksz){ if(op == LogAlloc) op = LogAlloc1; else if(op == LogFree) op = LogFree1; } off |= op; p = lb->data + lb->logsz; PBIT64(p, off); lb->logsz += 8; if(op >= Log2wide){ PBIT64(p+8, val); lb->logsz += 8; } /* this gets overwritten by the next append */ p = lb->data + lb->logsz; PBIT64(p, (uvlong)LogEnd); if(ol->head.addr == -1) ol->head = lb->bp; if(ol->tail != lb){ putblk(ol->tail); ol->tail = lb; } return 0; } static void showdeadbp(Bptr bp, void *) { fprint(2, "\tdead: %B\n", bp); } void showlst(char *m, Oplog *l) { fprint(2, "showlst: %s\n", m); scandead(l, showdeadbp, nil); } int graft(Oplog *a, Oplog *b) { char *p; vlong o; if(b->head.addr == -1) return 0; if(a->head.addr == -1){ a->head = b->head; a->tail = b->tail; return 0; } showlst("a", a); showlst("b", b); o = b->head.addr|LogChain; p = a->tail->data + a->tail->logsz; PBIT64(p, o); if(syncblk(a->tail) == -1) return -1; putblk(a->tail); a->tail = b->tail; showlst("grafted", b); return 0; } int deadlistappend(Tree *t, Bptr bp) { Oplog *l; int i; dprint("deadlisted %B [%p::%B]\n", bp, t, t->bp); //fprint(2, "predeadlist\n"); //showtreeroot(2, t); //fprint(2, "----\n"); l = nil; for(i = 0; i < Ndead; i++){ l = &t->dead[i]; if(bp.gen > t->prev[i]) break; } if(logappend(l, nil, bp.addr, Blksz, bp.gen, LogDead) == -1) return -1; //fprint(2, "postdeadlist\n"); //showtreeroot(2, t); //fprint(2, "@@@@@@@@@@@@@@@@@@\n"); return 0; } /* * Logs an allocation. Must be called * with arena lock held. Duplicates some/c * of the work in allocblk to prevent * recursion. */ int freelistappend(Arena *a, vlong off, int op) { cachedel(off); if(logappend(&a->log, a, off, Blksz, Blksz, op) == -1) return -1; return 0; } int scandead(Oplog *l, void (*fn)(Bptr, void*), void *dat) { vlong ent, off; int op, i, n; Bptr dead; char *d; Bptr bp; Blk *b; bp = l->head; if(bp.addr == -1) return 0; Nextblk: if((b = getblk(bp, GBnochk)) == nil) return -1; // TODO: hash checks for these chains for(i = Loghdsz; i < Logspc; i += n){ d = b->data + i; ent = GBIT64(d); op = ent & 0xff; off = ent & ~0xff; n = (op >= Log2wide) ? 16 : 8; switch(op){ case LogEnd: dprint("log@%d: end\n", i); return 0; case LogDead: dead.addr = off; dead.hash = -1; dead.gen = GBIT64(d+8); dprint("log@%d: dead %B\n", i, dead); fn(dead, dat); break; case LogChain: bp.addr = off & ~0xff; bp.hash = -1; bp.gen = -1; dprint("log@%d: chain %B\n", i, bp); goto Nextblk; break; default: n = 0; fprint(2, "log@%d: log op %d\n", i, op); abort(); break; } } return 0; } int loadlog(Arena *a) { vlong ent, off, len; int op, i, n; uvlong bh; Bptr bp; char *d; Blk *b; bp = a->log.head; Nextblk: if((b = getblk(bp, GBnochk)) == nil) return -1; bh = GBIT64(b->data); /* the hash covers the log and offset */ if(bh != siphash(b->data+8, Blkspc-8)){ werrstr("corrupt log"); return -1; } for(i = Loghdsz; i < Logspc; i += n){ d = b->data + i; ent = GBIT64(d); op = ent & 0xff; off = ent & ~0xff; n = (op >= Log2wide) ? 16 : 8; switch(op){ case LogEnd: dprint("log@%d: end\n", i); return 0; case LogChain: bp.addr = off & ~0xff; bp.hash = -1; bp.gen = -1; dprint("log@%d: chain %B\n", i, bp); b->logsz = i+n; goto Nextblk; break; case LogAlloc: case LogAlloc1: len = (op >= Log2wide) ? GBIT64(d+8) : Blksz; dprint("log@%d alloc: %llx+%llx\n", i, off, len); if(grabrange(a->free, off & ~0xff, len) == -1) return -1; break; case LogFree: case LogFree1: len = (op >= Log2wide) ? GBIT64(d+8) : Blksz; dprint("log@%d free: %llx+%llx\n", i, off, len); if(freerange(a->free, off & ~0xff, len) == -1) return -1; break; default: n = 0; dprint("log@%d: log op %d\n", i, op); abort(); break; } } return -1; } int compresslog(Arena *a) { Arange *r; Range *log, *nlog; vlong v, bp, nb, graft, oldhd; int i, n, sz; Blk *hd, *ab, *b; Oplog ol; char *p; /* * Sync the current log to disk, and * set up a new block log tail. While * compressing the log, nothing else is * using this arena, so any allocs come * from the log compression, and go into * this new log segment. * * A bit of copy paste from newblk, * because otherwise we have a deadlock * allocating the block. */ if((bp = blkalloc_lk(a)) == -1) return -1; if((b = initblk(bp, Tlog)) == nil) return -1; setflag(b, Bdirty); b->logsz = Loghdsz; PBIT64(b->data+b->logsz, (uvlong)LogEnd); finalize(b); if(syncblk(b) == -1){ free(b); return -1; } graft = b->bp.addr; if(a->log.tail != nil){ finalize(a->log.tail); if(syncblk(a->log.tail) == -1){ free(b); return -1; } } a->log.tail = b; /* * Prepare what we're writing back. * Arenas must be sized so that we can * keep the merged log in memory for * a rewrite. */ n = 0; sz = 512; if((log = malloc(sz*sizeof(Range))) == nil) return -1; for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){ if(n == sz){ sz *= 2; if((nlog = realloc(log, sz*sizeof(Range))) == nil){ free(log); return -1; } log = nlog; } log[n].off = r->off; log[n].len = r->len; n++; } if((b = newblk(Tlog)) == nil){ free(log); return -1; } hd = b; ol.head = b->bp; ol.tail = b; b->logsz = Loghdsz; for(i = 0; i < n; i++) if(logappend(&ol, a, log[i].off, log[i].len, log[i].len, LogFree) == -1) return -1; p = b->data + b->logsz; PBIT64(p, LogChain|graft); free(log); finalize(b); if(syncblk(b) == -1) return -1; oldhd = a->log.head.addr; a->log.head.addr = hd->bp.addr; a->log.head.hash = blkhash(hd); a->log.head.gen = -1; ab = a->b; PBIT64(ab->data + 0, a->log.head.addr); PBIT64(ab->data + 8, a->log.head.hash); finalize(ab); if(syncblk(ab) == -1) return -1; if(oldhd != -1){ for(bp = oldhd; bp != -1; bp = nb){ nb = -1; if((b = readblk(bp, 0)) == nil) return -1; for(i = Loghdsz; i < Logspc; i += n){ p = b->data + i; v = GBIT64(p); n = ((v&0xff) >= Log2wide) ? 16 : 8; if((v&0xff) == LogChain){ nb = v & ~0xff; break; }else if((v&0xff) == LogEnd){ nb = -1; break; } } lock(a); if(blkdealloc_lk(bp) == -1){ unlock(a); return -1; } unlock(a); } } finalize(a->log.tail); if(syncblk(a->log.tail) == -1) return -1; return 0; } /* * Allocate from an arena, with lock * held. May be called multiple times * per operation, to alloc space for * the alloc log. */ static vlong blkalloc_lk(Arena *a) { Avltree *t; Arange *r; vlong b; t = a->free; r = (Arange*)t->root; if(r == nil){ unlock(a); return -1; } /* * A bit of sleight of hand here: * while we're changing the sorting * key, but we know it won't change * the sort order because the tree * covers disjoint ranges */ b = r->off; r->len -= Blksz; r->off += Blksz; if(r->len == 0){ avldelete(t, r); free(r); } return b; } static int blkdealloc_lk(vlong b) { Arena *a; int r; r = -1; a = getarena(b); if(freerange(a->free, b, Blksz) == -1) goto out; if(freelistappend(a, b, LogFree) == -1) goto out; r = 0; out: return r; } static vlong blkalloc(void) { Arena *a; vlong b; int tries; tries = 0; Again: a = pickarena(); if(a == nil || tries == fs->narena){ werrstr("no empty arenas"); return -1; } /* * TODO: there's an extreme edge case * here. * * If the file system has room to alloc * a data block but no log block, then * we end up with it in a stuck state. * The fix is to reserve alloc blocks, * so that we're guaranteed to be able * to log an alloc if the disk is working * correctly. */ tries++; lock(a); if((b = blkalloc_lk(a)) == -1){ unlock(a); goto Again; } if(freelistappend(a, b, LogAlloc) == -1){ unlock(a); return -1; } unlock(a); return b; } static Blk* initblk(vlong bp, int t) { Blk *b; if((b = lookupblk(bp)) == nil){ if((b = mallocz(sizeof(Blk), 1)) == nil) return nil; /* * If the block is cached, * then the cache holds a ref * to the block, so we only * want to reset the refs * on an allocation. */ b->ref = 1; b->cnext = nil; b->cprev = nil; b->hnext = nil; } b->type = t; b->bp.addr = bp; b->bp.hash = -1; b->bp.gen = fs->nextgen; b->data = b->buf + Hdrsz; b->fnext = nil; setflag(b, Bdirty); b->nval = 0; b->valsz = 0; b->nbuf = 0; b->bufsz = 0; b->logsz = 0; b->lognxt = 0; return cacheblk(b); } Blk* newblk(int t) { vlong bp; Blk *b; if((bp = blkalloc()) == -1) return nil; if((b = initblk(bp, t)) == nil) return nil; setmalloctag(b, getcallerpc(&t)); return b; } char* fillsuper(Blk *b) { char *p; assert(b->type == Tsuper); p = b->data; setflag(b, Bdirty); memcpy(p, "gefs0001", 8); p += 8; PBIT32(p, 0); p += 4; /* dirty */ PBIT32(p, Blksz); p += 4; PBIT32(p, Bufspc); p += 4; PBIT32(p, Hdrsz); p += 4; PBIT32(p, fs->snap.ht); p += 4; PBIT64(p, fs->snap.bp.addr); p += 8; PBIT64(p, fs->snap.bp.hash); p += 8; PBIT64(p, fs->snap.bp.gen); p += 8; PBIT32(p, fs->narena); p += 4; PBIT64(p, fs->arenasz); p += 8; PBIT64(p, fs->nextqid); p += 8; return p; } void finalize(Blk *b) { vlong h; setflag(b, Bfinal); if(b->type != Traw) PBIT16(b->buf, b->type); switch(b->type){ default: case Tnone: abort(); break; case Tpivot: PBIT16(b->buf+2, b->nval); PBIT16(b->buf+4, b->valsz); PBIT16(b->buf+6, b->nbuf); PBIT16(b->buf+8, b->bufsz); b->bp.hash = blkhash(b); break; case Tleaf: PBIT16(b->buf+2, b->nval); PBIT16(b->buf+4, b->valsz); b->bp.hash = blkhash(b); break; case Tlog: h = siphash(b->data + 8, Blkspc-8); PBIT64(b->data, h); b->bp.hash = blkhash(b); break; case Traw: b->bp.hash = blkhash(b); break; case Tsuper: case Tarena: break; } } Blk* getblk(Bptr bp, int flg) { Blk *b; uvlong h; if((b = lookupblk(bp.addr)) != nil) return cacheblk(b); qlock(&blklock); if((b = lookupblk(bp.addr)) != nil){ cacheblk(b); qunlock(&blklock); return b; } if((b = readblk(bp.addr, flg)) == nil){ qunlock(&blklock); return nil; } h = blkhash(b); if((flg&GBnochk) == 0 && h != bp.hash){ fprint(2, "corrupt block %B: %llx != %llx\n", bp, blkhash(b), bp.hash); qunlock(&blklock); abort(); return nil; } b->bp.hash = h; b->bp.gen = bp.gen; cacheblk(b); qunlock(&blklock); return b; } Blk* dupblk(vlong bp, uvlong bh) { USED(bp, bh); return nil; } Blk* refblk(Blk *b) { ainc(&b->ref); return b; } ushort blkfill(Blk *b) { switch(b->type){ case Tpivot: return 2*b->nbuf + b->bufsz + 2*b->nval + b->valsz; case Tleaf: return 2*b->nval + b->valsz; default: fprint(2, "invalid block @%lld\n", b->bp.addr); abort(); } return 0; // shut up kencc } void putblk(Blk *b) { if(b == nil || adec(&b->ref) != 0) return; assert(!(b->flag & Bcached)); assert((b->flag & Bqueued) || !(b->flag & Bdirty)); free(b); } void freebp(Tree *t, Bptr bp) { Bfree *f; dprint("[%s] free blk %B\n", (t == &fs->snap) ? "snap" : "data", bp); if(bp.gen <= t->gen){ deadlistappend(t, bp); return; } if((f = malloc(sizeof(Bfree))) == nil) return; f->bp = bp; lock(&fs->freelk); f->next = fs->freehd; fs->freehd = f; unlock(&fs->freelk); } void freeblk(Tree *t, Blk *b) { assert(!(b->flag & Bqueued)); b->freed = getcallerpc(&b); freebp(t, b->bp); } void reclaimblk(Bptr bp) { Arena *a; a = getarena(bp.addr); lock(a); blkdealloc_lk(bp.addr); cachedel(bp.addr); unlock(a); } void quiesce(int tid) { int i, allquiesced; Bfree *p, *n; lock(&fs->activelk); allquiesced = 1; fs->active[tid]++; for(i = 0; i < fs->nproc; i++){ /* * Odd parity on quiescence implies * that we're between the exit from * and waiting for the next message * that enters us into the critical * section. */ if((fs->active[i] & 1) == 0) continue; if(fs->active[i] == fs->lastactive[i]) allquiesced = 0; } if(allquiesced) for(i = 0; i < fs->nproc; i++) fs->lastactive[i] = fs->active[i]; unlock(&fs->activelk); if(!allquiesced) return; lock(&fs->freelk); p = nil; if(fs->freep != nil){ p = fs->freep->next; fs->freep->next = nil; } unlock(&fs->freelk); while(p != nil){ n = p->next; reclaimblk(p->bp); free(p); p = n; } fs->freep = fs->freehd; } int sync(void) { int i, r; Arena *a; // Blk *b; r = 0; qlock(&fs->snaplk); for(i = 0; i < fs->narena; i++){ a = &fs->arenas[i]; finalize(a->log.tail); if(syncblk(a->log.tail) == -1) r = -1; } // // for(b = fs->chead; b != nil; b = b->cnext){ // if((b->flag & Bdirty) == 0) // continue; // if(syncblk(b) == -1) // r = -1; // } fillsuper(fs->super); finalize(fs->super); enqueue(fs->super); if(r != -1) r = syncblk(fs->super); qunlock(&fs->snaplk); return r; }