ref: 6b02773727cc3b9c434ef1ce5bbd94dd6b706aef
parent: e0fffa8ae9627363f518cc7077e5e8e204135db5
author: Ori Bernstein <ori@eigenstate.org>
date: Fri Mar 19 12:08:17 EDT 2021
fs: allocate blocks correctly, group insertions, snapshot
--- a/blk.c
+++ b/blk.c
@@ -1,66 +1,862 @@
#include <u.h>
#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
#include "dat.h"
#include "fns.h"
+static vlong blkalloc_lk(Arena*);
+static Blk *cacheblk(Blk*);
+static Blk *lookupblk(vlong);
+
Blk*
+readblk(vlong bp, int flg)
+{
+ Blk *b;
+ vlong off, rem, n;
+
+ assert(bp != -1);
+ if((b = malloc(sizeof(Blk))) == nil)
+ return nil;
+ off = bp;
+ rem = Blksz;
+ while(rem != 0){
+ n = pread(fs->fd, b->buf, rem, off);
+ if(n <= 0){
+ free(b);
+ return nil;
+ }
+ off += n;
+ rem -= n;
+ }
+ memset(&b->RWLock, 0, sizeof(RWLock));
+ b->type = (flg&GBraw) ? Traw : GBIT16(b->buf+0);
+ b->off = bp;
+ b->cnext = nil;
+ b->cprev = nil;
+ b->hnext = nil;
+ b->data = b->buf + 10;
+ switch(b->type){
+ default:
+ if(flg&GBraw)
+ break;
+ fprint(2, "invalid block @%llx\n", bp);
+ abort();
+ break;
+ case Tarena:
+ case Traw:
+ case Tsuper:
+ case Tlog:
+ break;
+ case Tpivot:
+ b->nval = GBIT16(b->buf+2);
+ b->valsz = GBIT16(b->buf+4);
+ b->nbuf = GBIT16(b->buf+6);
+ b->bufsz = GBIT16(b->buf+8);
+ break;
+ case Tleaf:
+ b->nval = GBIT16(b->buf+2);
+ b->valsz = GBIT16(b->buf+4);
+ break;
+ }
+ return b;
+}
+
+static Arena*
+pickarena(vlong hint)
+{
+ long n;
+
+ n = -1; /* shut up, ken */
+ if(hint > 0 || hint < fs->narena)
+ n = hint / fs->arenasz;
+ else if(hint == -1)
+ n = ainc(&fs->nextarena) % fs->narena;
+ else
+ abort();
+ return &fs->arenas[n];
+}
+
+static Arena*
+getarena(vlong b)
+{
+ int i;
+
+ i = b / fs->arenasz;
+ if(i < 0 || i >= fs->narena){
+ werrstr("out of range block %lld", b);
+ abort();
+// return nil;
+ }
+ return &fs->arenas[i];
+}
+
+static int
+freerange(Avltree *t, vlong off, vlong len)
+{
+ Arange *r, *s, q;
+
+ assert(len % Blksz == 0);
+
+ q.off = off;
+ q.len = len;
+ r = (Arange*)avllookup(t, &q.Avl, -1);
+ if(r == nil){
+ if((r = calloc(1, sizeof(Arange))) == nil)
+ return -1;
+ r->off = off;
+ r->len = len;
+ avlinsert(t, r);
+ }else if(off+len == r->off){
+ r->len += len;
+ r->off -= len;
+ }else if(off == r->off + r->len){
+ r->len += len;
+ }else{
+ r = (Arange*)avlnext(r);
+ if(r != nil && off+len == r->off){
+ r->off -= len;
+ r->len += len;
+ } else {
+ if((r = calloc(1, sizeof(Arange))) == nil)
+ return -1;
+ r->off = off;
+ r->len = len;
+ avlinsert(t, r);
+ }
+ }
+
+ /* merge with previous */
+ s = (Arange*)avlprev(r);
+ if(s != nil && s->off + s->len == r->off){
+ s->off += r->len;
+ avldelete(t, r);
+ }
+ /* merge with next */
+ s = (Arange*)avlnext(r);
+ if(s != nil && r->off + r->len == s->off){
+ r->off += r->len;
+ avldelete(t, s);
+ }
+ return 0;
+}
+
+int
+grabrange(Avltree *t, vlong off, vlong len)
+{
+ Arange *r, *s, q;
+
+ assert(len % Blksz == 0);
+ q.off = off;
+ q.len = len;
+ r = (Arange*)avllookup(t, &q.Avl, -1);
+ if(r == nil || off + len > r->off + r->len){
+ fprint(2, "no blocks: %llx+%llx in:\n", off, len);
+ for(r = (Arange*)avlmin(t); r != nil; r = (Arange*)avlnext(r))
+ fprint(2, "\t%llx+%llx\n", r->off, r->len);
+ abort();
+ return -1;
+ }
+ if(off == r->off)
+ r->off += len;
+ else if(off == r->off + r->len)
+ r->len -= len;
+ else if(off + len > r->off && off > r->off + r->len){
+ if((s = malloc(sizeof(Arange))) == nil)
+ return -1;
+ s->off = off;
+ s->len = r->len - (off - r->off) - len;;
+ r->len = off - r->off;
+ avlinsert(t, s);
+ }
+ if(r->len == 0){
+ avldelete(t, r);
+ free(r);
+ }
+ return 0;
+}
+
+int
+synclog(Blk *b, vlong link, vlong sz)
+{
+
+ if(sz == -1)
+ sz = b->logsz;
+ PBIT64(b->data+8, link);
+ PBIT64(b->data+16, sz);
+ finalize(b);
+fprint(2, "synclog: @%llx\n", b->off);
+showfree("synclog");
+ return pwrite(fs->fd, b->buf, Blksz, b->off);
+}
+
+Blk*
+logappend(Arena *a, Blk *lb, vlong off, vlong len, int op, vlong graft)
+{
+ Blk *pb;
+ vlong o, n;
+ char *p;
+
+ assert(off % Blksz == 0);
+ if(lb == nil || lb->logsz+16+24 > Blkspc){
+ pb = lb;
+ if((o = blkalloc_lk(a)) == -1)
+ return nil;
+ if((lb = mallocz(sizeof(Blk), 1)) == nil)
+ return nil;
+ lb->data = lb->buf + Hdrsz;
+ lb->flag |= Bdirty;
+ lb->type = Tlog;
+ lb->off = o;
+ lb->logsz = 0;
+ if(synclog(lb, graft, 0) == -1)
+ return nil;
+
+ a->logtl = lb;
+ if(pb != nil)
+ if(synclog(pb, lb->off, -1) == -1)
+ return nil;
+ }
+ n = 8;
+ p = lb->data + 24 + lb->logsz;
+ if(len == Blksz && op == LgAlloc)
+ op = LgAlloc1;
+ off |= op;
+ PBIT64(p, off);
+ if(op == LgAlloc || op == LgFree){
+ PBIT64(p+8, len);
+ n += 8;
+ }
+ lb->logsz += n;
+ return lb;
+}
+
+/*
+ * Logs an allocation. Must be called
+ * with arena lock held. Duplicates some/c
+ * of the work in allocblk to prevent
+ * recursion.
+ */
+int
+logalloc(Arena *a, vlong off, int op)
+{
+ Blk *b;
+
+ if((b = logappend(a, a->logtl, off, Blksz, op, -1)) == nil)
+ return -1;
+ if(a->logtl == nil){
+ a->log = b->off;
+ a->logtl = b;
+ }
+ return 0;
+}
+
+int
+loadlog(Arena *a)
+{
+ Blk *b;
+ vlong bp, ent, off, len;
+ uvlong bh;
+ char *p, *d;
+ int op, i, n;
+
+ bp = a->log;
+ while(bp != -1){
+ dprint("log block: %llx\n", bp);
+ if((b = readblk(bp, 0)) == nil)
+ return -1;
+ p = b->data;
+ bh = GBIT64(p + 0);
+ bp = GBIT64(p + 8);
+ b->logsz = GBIT64(p + 16);
+ /* the hash covers the log and offset */
+ if(bh != siphash(p+8, Blkspc-8)){
+ werrstr("corrupt log");
+ return -1;
+ }
+ for(i = 0; i < b->logsz; i += n){
+ d = p + i + 24;
+ ent = GBIT64(d);
+ op = ent & 0xff;
+ off = ent & ~0xff;
+ switch(op){
+ case LgFlush:
+ dprint("log@%d: flush: %llx\n", i, off>>8);
+ n = 8;
+ lock(&fs->genlk);
+ fs->gen = off >> 8;
+ unlock(&fs->genlk);
+ break;
+ case LgAlloc1:
+ n = 8;
+ dprint("log@%d alloc1: %llx\n", i, off);
+ if(grabrange(a->free, off & ~0xff, Blksz) == -1)
+ return -1;
+ break;
+ case LgAlloc:
+ n = 16;
+ len = GBIT64(d+8);
+ dprint("log@%d alloc: %llx+%llx\n", i, off, len);
+ if(grabrange(a->free, off & ~0xff, len) == -1)
+ return -1;
+ break;
+ case LgFree:
+ n = 16;
+ len = GBIT64(d+8);
+ dprint("log@%d free: %llx+%llx\n", i, off, len);
+ if(freerange(a->free, off & ~0xff, len) == -1)
+ return -1;
+ break;
+ case LgRef1:
+ case LgUnref1:
+ n = 8;
+ fprint(2, "unimplemented ref op at log@%d: log op %d\n", i, op);
+ break;
+ default:
+ n = 0;
+ dprint("log@%d: log op %d\n", i, op);
+ abort();
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+int
+compresslog(Arena *a)
+{
+ Arange *r;
+ vlong *log, *p, bp, hd, hh, graft;
+ int i, n, sz;
+ Blk *pb, *ab, *b;
+
+showfree("precompress");
+fprint(2, "compress start\n");
+ /*
+ * Sync the current log to disk. While
+ * compressing the log, nothing else is
+ * using this arena, so any allocs come
+ * from the log compression.
+ *
+ * A bit of copy paste from newblk,
+ * because otherwise we have bootstrap
+ * issues.
+ *
+ * Set up the head of the new log as
+ * an empty block.
+ */
+ if((bp = blkalloc_lk(a)) == -1)
+ return -1;
+ if((b = mallocz(sizeof(Blk), 1)) == nil)
+ return -1;
+ b->type = Tlog;
+ b->flag = Bdirty;
+ b->off = bp;
+ b->ref = 1;
+ b->data = b->buf + Hdrsz;
+checkfs();
+ if(synclog(b, -1, 0) == -1)
+ return -1;
+checkfs();
+
+ /*
+ * When reaming or loading, we may not
+ * have set up the log.
+ */
+checkfs();
+ if(a->logtl != nil)
+ synclog(a->logtl, b->off, -1);
+checkfs();
+ graft = b->off;
+ a->logtl = b;
+
+ /*
+ * Prepare what we're writing back.
+ * Arenas must be sized so that we can
+ * keep the merged log in memory for
+ * a rewrite.
+ */
+ n = 0;
+ sz = 512;
+checkfs();
+ if((log = malloc(2*sz*sizeof(vlong))) == nil)
+ return -1;
+checkfs();
+ for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+ if(n == sz){
+ sz *= 2;
+ if((p = realloc(log, 2*sz*sizeof(vlong))) == nil){
+ free(log);
+ return -1;
+ }
+ log = p;
+ }
+ log[2*n+0] = r->off;
+ log[2*n+1] = r->len;
+ n++;
+ }
+ if((b = newblk(Tlog)) == nil){
+ free(log);
+ return -1;
+ }
+checkfs();
+ pb = b;
+ hd = b->off;
+ hh = -1;
+ PBIT64(b->data + 8, graft); /* link */
+ PBIT64(b->data + 16, 0ULL); /* count */
+checkfs();
+
+ for(i = 0; i < n; i++){
+ if((b = logappend(a, b, log[2*i], log[2*i+1], LgFree, graft)) == nil)
+ return -1;
+checkfs();
+ if(b != pb){
+checkfs();
+ synclog(pb, b->off, -1);
+checkfs();
+ if(pb->off == hd)
+ hh = blkhash(pb);
+ b = pb;
+ }
+ }
+ free(log);
+checkfs();
+ if(synclog(b, graft, -1) == -1)
+ return -1;
+checkfs();
+ if(pb->off == hd)
+ hh = blkhash(pb);
+checkfs();
+
+ a->log = hd;
+ a->logh = hh;
+ ab = a->b;
+ PBIT64(ab->data + 0, hd);
+ PBIT64(ab->data + 8, hh);
+checkfs();
+ pwrite(fs->fd, ab->buf, Blksz, ab->off);
+checkfs();
+fprint(2, "compress done\n");
+ return 0;
+}
+/*
+ * Allocate from an arena, with lock
+ * held. May be called recursively, to
+ * alloc space for the alloc log.
+ */
+static vlong
+blkalloc_lk(Arena *a)
+{
+ Avltree *t;
+ Arange *r;
+ vlong b;
+
+ t = a->free;
+ r = (Arange*)t->root;
+ if(r == nil){
+ unlock(a);
+ return -1;
+ }
+
+ /*
+ * A bit of sleight of hand here:
+ * while we're changing the sorting
+ * key, but we know it won't change
+ * the sort order because the tree
+ * covers disjoint ranges
+ */
+ b = r->off;
+ r->len -= Blksz;
+ r->off += Blksz;
+ if(r->len == 0){
+ avldelete(t, r);
+ free(r);
+ }
+ return b;
+}
+
+int
+blkrelease(vlong b)
+{
+ Arena *a;
+ int r;
+
+ r = -1;
+ a = getarena(b);
+ lock(a);
+ if(freerange(a->free, b, Blksz) == -1)
+ goto out;
+ if(logalloc(a, b, LgFree) == -1)
+ goto out;
+ r = 0;
+out:
+ unlock(a);
+ return r;
+}
+
+vlong
+blkalloc(vlong hint)
+{
+ Arena *a;
+ vlong b;
+ int tries;
+
+ tries = 0;
+again:
+ a = pickarena(hint);
+ if(a == nil || tries == fs->narena){
+ werrstr("no empty arenas");
+ return -1;
+ }
+ lock(a);
+ /*
+ * TODO: there's an extreme edge case
+ * here.
+ *
+ * If the file system has room to alloc
+ * a data block but no log block, then
+ * we end up with it in a stuck state.
+ * The fix is to reserve alloc blocks,
+ * so that we're guaranteed to be able
+ * to log an alloc if the disk is working
+ * correctly.
+ */
+ tries++;
+ if((b = blkalloc_lk(a)) == -1){
+ unlock(a);
+ goto again;
+ }
+ if(logalloc(a, b, LgAlloc) == -1){
+ unlock(a);
+ return -1;
+ }
+ unlock(a);
+ return b;
+}
+
+Blk*
newblk(int t)
{
Blk *b;
+ vlong bp;
- b = emalloc(sizeof(Blk));
- b->flag |= Bdirty;
+ if((bp = blkalloc(-1)) == -1)
+ return nil;
+ if((b = mallocz(sizeof(Blk), 1)) == nil)
+ return nil;
b->type = t;
- return b;
+ b->flag = Bdirty;
+ b->off = bp;
+ b->ref = 1;
+ b->data = b->buf + Hdrsz;
+ return cacheblk(b);
}
-Blk*
-getblk(uvlong bp, uvlong bh)
+static Blk*
+lookupblk(vlong off)
{
+ Bucket *bkt;
+ u32int h;
Blk *b;
- b = (Blk*)bp;
- if(blkhash(b) != bh){
- werrstr("corrupt block %llx\n", bp);
- fprint(2, "corrupt block %llx\n", bp);
-// abort();
+ h = ihash(off);
+
+ bkt = &fs->cache[h % fs->cmax];
+ lock(bkt);
+ for(b = bkt->b; b != nil; b = b->hnext)
+ if(b->off == off)
+ break;
+ if(b != nil)
+ pinblk(b);
+ unlock(bkt);
+ return b;
+}
+
+static Blk*
+cacheblk(Blk *b)
+{
+ Bucket *bkt;
+ Blk *e, *c;
+ u32int h;
+
+ /* FIXME: better hash. */
+ assert(b->off != 0);
+ h = ihash(b->off);
+// dprint("cache %lld (h=%xm, bkt=%d) => %p\n", b->off, h%fs->cmax, h, b);
+ ainc(&b->ref);
+ bkt = &fs->cache[h % fs->cmax];
+ lock(bkt);
+ for(e = bkt->b; e != nil; e = e->hnext){
+ if(b == e)
+ goto found;
+ assert(b->off != e->off);
}
+ bkt->b = b;
+found:
+ ainc(&b->ref);
+ unlock(bkt);
+
+ lock(&fs->lrulk);
+ if(b == fs->chead)
+ goto Cached;
+ if(b == fs->ctail)
+ fs->ctail = b->cprev;
+
+ if(b->cnext != nil)
+ b->cnext->cprev = b->cprev;
+ if(b->cprev != nil)
+ b->cprev->cnext = b->cnext;
+ if(fs->ctail == nil)
+ fs->ctail = b;
+ if(fs->chead != nil)
+ fs->chead->cprev = b;
+ if(fs->ctail == nil)
+ fs->ctail = b;
+ b->cnext = fs->chead;
+ b->cprev = nil;
+ fs->chead = b;
+ if((b->flag&Bcache) == 0){
+ b->flag |= Bcache;
+ fs->ccount++;
+ ainc(&b->ref);
+ }
+ c=0;
+ USED(c);
+/*
+ for(c = fs->ctail; c != nil && fs->ccount >= fs->cmax; c = fs->ctail){
+ fs->ctail = c->cprev;
+ fs->ccount--;
+ putblk(c);
+ }
+*/
+Cached:
+ unlock(&fs->lrulk);
return b;
}
+static void
+cachedel(Blk *del)
+{
+ Bucket *bkt;
+ Blk *b, **p;
+ u32int h;
+
+ /* FIXME: better hash. */
+ h = ihash(del->off);
+
+ bkt = &fs->cache[h % fs->cmax];
+ lock(bkt);
+ p = &bkt->b;
+ for(b = bkt->b; b != nil; b = b->hnext){
+ if(b == del){
+ *p = del->hnext;
+ break;
+ }
+ p = &b->hnext;
+ }
+ unlock(bkt);
+
+ lock(&fs->lrulk);
+ if(b->cnext != nil)
+ b->cnext->cprev = b->cprev;
+ if(b->cprev != nil)
+ b->cprev->cnext = b->cnext;
+ if(fs->ctail == b)
+ fs->ctail = b->cprev;
+ if(fs->chead == nil)
+ fs->chead = b;
+ unlock(&fs->lrulk);
+}
+
void
-freeblk(Blk *b)
+enqueue(Blk *b)
{
- b->refs--;
- if(b->refs == 0)
- free(b);
+ if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1){
+ ainc(&fs->broken);
+ fprint(2, "write: %r");
+ return;
+ }
+ wlock(b);
+ b->flag &= ~(Bqueued|Bdirty|Bfinal);
+ wunlock(b);
+
}
-uvlong
-blkhash(Blk *b)
+void
+fillsuper(Blk *b)
{
- return siphash(b->data, Blksz);
+ char *p;
+
+ assert(b->type == Tsuper);
+ p = b->data;
+ memcpy(p + 0, "gefs0001", 8);
+ PBIT32(p + 8, 0); /* dirty */
+ PBIT32(p + 12, Blksz);
+ PBIT32(p + 16, Bufspc);
+ PBIT32(p + 20, Hdrsz);
+ PBIT32(p + 24, fs->height);
+ PBIT64(p + 32, fs->rootb);
+ PBIT64(p + 40, fs->rooth);
+ PBIT32(p + 48, fs->narena);
+ PBIT64(p + 56, fs->arenasz);
+ PBIT64(p + 64, fs->gen);
+ PBIT64(p + 72, fs->nextqid);
}
+void
+finalize(Blk *b)
+{
+ vlong h;
+
+// assert((b->flag & Bfinal) == 0);
+ b->flag |= Bfinal;
+ PBIT16(b->buf, b->type);
+ switch(b->type){
+ default:
+ case Tnone:
+ abort();
+ break;
+ case Tpivot:
+ PBIT16(b->buf+2, b->nval);
+ PBIT16(b->buf+4, b->valsz);
+ PBIT16(b->buf+6, b->nbuf);
+ PBIT16(b->buf+8, b->bufsz);
+ break;
+ case Tleaf:
+ PBIT16(b->buf+2, b->nval);
+ PBIT16(b->buf+4, b->valsz);
+ break;
+ case Tlog:
+ h = siphash(b->data + 8, Blkspc-8);
+ PBIT64(b->data, h);
+ case Tsuper:
+ case Tarena:
+ case Traw:
+ break;
+ }
+}
+
+Blk*
+getblk(vlong bp, uvlong bh)
+{
+ Blk *b;
+
+ if((b = lookupblk(bp)) == nil){
+ if((b = readblk(bp, 0)) == nil)
+ return nil;
+ if(siphash(b->buf, Blksz) != bh){
+ werrstr("corrupt block %llx", bp);
+ return nil;
+ }
+ }
+ return cacheblk(b);
+}
+
+Blk*
+dupblk(vlong bp, uvlong bh)
+{
+ USED(bp, bh);
+ return nil;
+}
+
+Blk*
+pinblk(Blk *b)
+{
+ ainc(&b->ref);
+ return b;
+}
+
+int
+refblk(Blk *b)
+{
+ Arena *a;
+ int r;
+
+ a = getarena(b->off);
+ lock(a);
+ r = logalloc(a, b->off, LgRef1);
+ unlock(a);
+ return r;
+}
+
+int
+unrefblk(Blk *b)
+{
+ Arena *a;
+ int r;
+
+ a = getarena(b->off);
+ lock(a);
+ r = logalloc(a, b->off, LgUnref1);
+ unlock(a);
+ return r;
+}
+
ushort
blkfill(Blk *b)
{
switch(b->type){
- case Pivot:
- return 2*b->nmsg + b->bufsz + 2*b->nent + b->valsz;
- case Leaf:
- return 2*b->nent + b->valsz;
+ case Tpivot:
+ return 2*b->nbuf + b->bufsz + 2*b->nval + b->valsz;
+ case Tleaf:
+ return 2*b->nval + b->valsz;
default:
+ fprint(2, "invalid block @%lld\n", b->off);
abort();
- return 0; // shut up kencc
}
+ return 0; // shut up kencc
}
-int
+void
putblk(Blk *b)
{
- USED(b);
- /* TODO: unref. */
- return 0;
+ if(b == nil)
+ return;
+ if((b->flag & (Bdirty|Bqueued)) == Bdirty)
+ enqueue(b);
+ if(adec(&b->ref) == 0){
+ cachedel(b);
+ free(b);
+ }
+}
+
+void
+freeblk(Blk *b)
+{
+ Arena *a;
+
+ assert(b->ref == 1 && b->flag & (Bdirty|Bqueued) == Bdirty);
+ a = getarena(b->off);
+ lock(a);
+ /*
+ * TODO: what to do if we fail to log a free here??
+ * This is already an error path!
+ */
+ logalloc(a, b->off, LgRef1);
+ unlock(a);
+ free(b);
+}
+
+int
+sync(void)
+{
+ int i, r;
+ Blk *b;
+
+ dprint("syncing\n");
+ r = 0;
+ for(i = 0; i < fs->narena; i++)
+ if(synclog(fs->arenas[i].logtl, -1, -1) == -1)
+ r = -1;
+ for(b = fs->chead; b != nil; b = b->cnext){
+// dprint("sync %p\n", b);
+ if(!(b->flag & Bdirty))
+ continue;
+ if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1)
+ r = -1;
+ }
+ return r;
}
--- a/check.c
+++ b/check.c
@@ -1,14 +1,15 @@
#include <u.h>
#include <libc.h>
-#include <bio.h>
+#include <fcall.h>
+#include <avl.h>
+
#include "dat.h"
#include "fns.h"
char spc[128];
-int debug;
static int
-invalidblk(Blk *b, int h, Kvp *lo, Kvp *hi)
+badblk(Blk *b, int h, Kvp *lo, Kvp *hi)
{
Kvp x, y;
int i, r;
@@ -16,17 +17,17 @@
int fail;
fail = 0;
- if(b->type == Leaf){
- if(h != fs->height){
+ if(b->type == Tleaf){
+ if(h != 0){
fprint(2, "unbalanced leaf\n");
fail++;
}
- if(h != 1 && b->nent < 2){
+ if(h != 0 && b->nval < 2){
fprint(2, "underfilled leaf\n");
fail++;
}
}
- if(b->type == Pivot && b->nent < 2){
+ if(b->type == Tpivot && b->nval < 2){
fprint(2, "underfilled pivot\n");
fail++;
}
@@ -33,19 +34,23 @@
getval(b, 0, &x);
if(lo && keycmp(lo, &x) != 0)
fprint(2, "out of range keys %P != %P\n", lo, &x);
- for(i = 1; i < b->nent; i++){
+ for(i = 1; i < b->nval; i++){
getval(b, i, &y);
if(hi && keycmp(&y, hi) >= 0){
fprint(2, "out of range keys %P >= %P\n", &y, hi);
fail++;
}
- if(b->type == Pivot){
- c = getblk(x.bp, x.bh);
+ if(b->type == Tpivot){
+ if((c = getblk(x.bp, x.bh)) == nil){
+ fprint(2, "corrupt block: %r\n");
+ fail++;
+ continue;
+ }
if(blkfill(c) != x.fill){
- fprint(2, "mismatched block fill");
+ fprint(2, "mismatched block fill\n");
fail++;
}
- if(invalidblk(c, h + 1, &x, &y))
+ if(badblk(c, h - 1, &x, &y))
fail++;
}
r = keycmp(&x, &y);
@@ -65,17 +70,25 @@
}
return fail;
}
-
/* TODO: this will grow into fsck. */
int
checkfs(void)
{
- return invalidblk(fs->root, 1, nil, nil) == 0;
+ int ok, height;
+ Blk *b;
+
+ ok = 1;
+ if((b = getroot(&height)) != nil){
+ if(badblk(b, height-1, nil, 0))
+ ok = 0;
+ putblk(b);
+ }
+ return ok;
}
-static void
-rshowblk(Blk *b, int indent)
+void
+rshowblk(int fd, Blk *b, int indent, int recurse)
{
Blk *c;
int i;
@@ -85,22 +98,24 @@
if(indent > sizeof(spc)/4)
indent = sizeof(spc)/4;
if(b == nil){
- print("NIL\n");
+ fprint(fd, "NIL\n");
return;
}
- if(b->type == Pivot){
- for(i = 0; i < b->nmsg; i++){
+ if(b->type == Tpivot){
+ for(i = 0; i < b->nbuf; i++){
getmsg(b, i, &m);
- print("%.*s|%M\n", 4*indent, spc, &m);
+ fprint(fd, "%.*s|%M\n", 4*indent, spc, &m);
}
}
- for(i = 0; i < b->nent; i++){
+ for(i = 0; i < b->nval; i++){
getval(b, i, &kv);
- print("%.*s|%P\n", 4*indent, spc, &kv);
- if(b->type == Pivot){
+ fprint(fd, "%.*s|%P\n", 4*indent, spc, &kv);
+ if(b->type == Tpivot){
if((c = getblk(kv.bp, kv.bh)) == nil)
- sysfatal("falied load: %r");
- rshowblk(c, indent + 1);
+ sysfatal("failed load: %r");
+ if(recurse)
+ rshowblk(fd, c, indent + 1, 1);
+ putblk(c);
}
}
}
@@ -117,22 +132,29 @@
}
void
-showblk(Blk *b, char *m)
+showblk(Blk *b, char *m, int recurse)
{
- if(m != nil)
- print("=== %s\n", m);
- rshowblk(b, 0);
+ fprint(2, "=== %s\n", m);
+ rshowblk(2, b, 0, recurse);
}
void
+fshowfs(int fd, char *m)
+{
+ int h;
+
+ fprint(fd, "=== %s\n", m);
+ fprint(fd, "fs->height: %d\n", fs->height);
+ rshowblk(fd, getroot(&h), 0, 1);
+}
+
+void
showfs(char *m)
{
- if(m != nil)
- print("=== %s\n", m);
- rshowblk(fs->root, 0);
+ if(debug)
+ fshowfs(2, m);
}
-
void
showpath(Path *p, int np)
{
@@ -142,5 +164,21 @@
print("==> b=%p, n=%p, idx=%d\n", p[i].b, p[i].n, p[i].idx);
print("\tclear=(%d, %d):%d\n", p[i].lo, p[i].hi, p[i].sz);
print("\tl=%p, r=%p, n=%p, split=%d\n", p[i].l, p[i].r, p[i].n, p[i].split);
+ }
+}
+
+void
+showfree(char *m)
+{
+ Arange *r;
+ int i;
+
+ if(!debug)
+ return;
+ print("=== %s\n", m);
+ for(i = 0; i < fs->narena; i++){
+ print("allocs[%d]:\n", i);
+ for(r = (Arange*)avlmin(fs->arenas[i].free); r != nil; r = (Arange*)avlnext(r))
+ print("\t%llx+%llx\n", r->off, r->len);
}
}
--- a/dat.h
+++ b/dat.h
@@ -1,35 +1,80 @@
typedef struct Blk Blk;
typedef struct Gefs Gefs;
+typedef struct Fmsg Fmsg;
+typedef struct Fid Fid;
typedef struct Msg Msg;
typedef struct Key Key;
typedef struct Val Val;
typedef struct Kvp Kvp;
+typedef struct Bptr Bptr;
typedef struct Path Path;
+typedef struct Scan Scan;
+typedef struct Dent Dent;
+typedef struct Scanp Scanp;
+typedef struct Arena Arena;
+typedef struct Arange Arange;
+typedef struct Bucket Bucket;
+typedef struct Chan Chan;
enum {
- // buffer for the whole block
- Blksz = 256,
- // will store what type of block
- Hdrsz = 16,
+ KiB = 1024ULL,
+ MiB = 1024ULL*KiB,
+ GiB = 1024ULL*MiB,
+ TiB = 1024ULL*GiB,
+
+ Lgblk = 9,
+ Blksz = (1ULL<<Lgblk),
+
+ Nrefbuf = 1024, /* number of ref incs before syncing */
+ Nfidtab = 1024, /* number of fit hash entries */
+ Ndtab = 1024, /* number of dir tab entries */
+ Max9p = 16*KiB, /* biggest message size we're willing to negotiate */
+ Nsec = 1000*1000*1000, /* nanoseconds to the second */
+ Maxname = 256, /* maximum size of a name element */
+ Maxent = 9+Maxname+1, /* maximum size of ent key, with terminator */
+
+ /*
+ * Kpmax must be no more than 1/4 of pivspc, or
+ * there is no way to get a valid split of a
+ * maximally filled tree.
+ */
+ Keymax = 32, /* key data limit */
+ Inlmax = 128, /* inline data limit */
+ Ptrsz = 18, /* off, hash, fill */
+ Kvmax = Keymax + Inlmax, /* Key and value */
+ Kpmax = Keymax + Ptrsz, /* Key and pointer */
+
+ Hdrsz = 10,
Blkspc = Blksz - Hdrsz,
- // space for message buffer
Bufspc = Blkspc / 2,
- // space for pivot keys and offsets
Pivspc = Blkspc - Bufspc,
Leafspc = Blkspc,
- Keymax = 16,
- Inlmax = 64,
- Ptrsz = 18,
- Kvmax = Keymax + Inlmax, /* Key and value */
- Kpmax = Keymax + Ptrsz, /* Key and pointer */
Msgmax = 1 + (Kvmax > Kpmax ? Kvmax : Kpmax)
};
enum {
- Bdirty = 1 << 0,
+ /*
+ * dent: pqid[8] qid[8] -- a directory entry key.
+ * ptr: off[8] hash[8] -- a key for an Dir block.
+ * dir: fixed statbuf header, user ids
+ */
+ Kref, /* off[8] => ptr[16]: pointer to refcount page */
+ Kdat, /* qid[8] off[8] => ptr[16]: pointer to data page */
+ Kent, /* pqid[8] name[n] => dir[n]: serialized Dir */
+ Ksnap, /* name[n] => dent[16] ptr[16]: snapshot root */
+ Ksuper, /* qid[8] => pqid[8]: parent dir */
};
+enum {
+ Bdirty = 1 << 0,
+ Bqueued = 1 << 1,
+ Bfinal = 1 << 2,
+ Bcache = 1 << 3,
+};
+
#define Efs "i will not buy this fs, it is scratched"
+#define Efid "bad fid"
+#define Edscan "invalid dir scan offset"
#define Eexist "does not exist"
#define Ebotch "protocol botch"
#define Emode "unknown mode"
@@ -37,20 +82,93 @@
#define Eauth "authentication failed"
#define Elength "name too long"
#define Eperm "permission denied"
+#define Einuse "resource in use"
+#define Ebadf "invalid file"
+#define Emem "out of memory"
+#define Ename "invalid file name"
+#define Enomem "out of memory"
/*
- * The type of block. Pivot nodes are internal to the tree,
- * while leaves inhabit the edges. Pivots are split in half,
- * containing a buffer for the data, and keys to direct the
- * searches. Their buffers contain messages en-route to the
- * leaves.
+ * All metadata blocks share a common header:
+ *
+ * type[2]
*
- * Leaves contain the key, and some chunk of data as the
- * value.
+ * The None type is reserved for file data blocks
+ * and refcount blocks.
+ *
+ * The superblock has this layout:
+ * version[8] always "gefs0001"
+ * flags[4] status flags:
+ * dirty=1<<0,
+ * blksz[4] block size in bytes
+ * bufsz[4] portion of leaf nodes
+ * allocated to buffers,
+ * in bytes
+ * hdrsz[4] size of tree node header,
+ * in bytes.
+ * height[4] tree height of root node
+ * rootb[8] address of root in last
+ * snapshot.
+ * rooth[8] hash of root node
+ * narena[4] number of arenas in tree
+ * arenasz[8] maximum size of arenas;
+ * they may be smaller.
+ * gen[8] The flush generation
+ *
+ * The arena zone blocks have this layout, and
+ * are overwritten in place:
+ *
+ * log[8] The head of the alloc log
+ * logh[8] The hash of the alloc log
+ *
+ * The log blocks have this layout, and are one of
+ * two types of blocks that get overwritten in place:
+ *
+ * hash[8] The hash of the rest of the block
+ * link[8] The next block, or -1 if nil.
+ * entsz[8] number of bytes of log entries.
+ *
+ * The remainder of the block is filled with log
+ * entries. Each log entry has at least 8 bytes
+ * of entry. Some are longer. The opcode is or'ed
+ * into the low order bits of the first vlong.
+ * These ops take the following form:
+ *
+ * Alloc, Free:
+ * off[8] len[8]
+ * Alloc1, Free1:
+ * off[8]
+ * Ref:
+ * off[8]
+ * Flush:
+ * gen[8]
+ *
+ * Pivots have the following layout:
+ *
+ * nval[2]
+ * valsz[2]
+ * nbuf[2]
+ * bufsz[2]
+ *
+ * Leaves have the following layout:
+ *
+ * nval[2]
+ * valsz[2]
+ * _pad[4]
+ *
+ * Within these nodes, pointers have the following
+ * layout:
+ *
+ * off[8] hash[8] fill[2]
*/
enum {
- Pivot,
- Leaf,
+ Tnone,
+ Traw,
+ Tsuper,
+ Tarena,
+ Tlog,
+ Tpivot,
+ Tleaf,
};
enum {
@@ -59,13 +177,53 @@
};
enum {
- Ocreate,
+ GBraw = 1<<0,
+ GBwrite = 1<<1,
+};
+
+enum {
+ Oinsert,
Odelete,
- Owrite,
Owstat,
+ /* wstat flags */
+ Owsize = 1<<4,
+ Owname = 1<<5,
+ Owmode = 1<<6,
+ Owmtime = 1<<7,
};
/*
+ * Operations for the allocation log.
+ */
+enum {
+ LgFree, /* free a range: 16 byte */
+ LgAlloc, /* alloc a range: 16 byte */
+ LgAlloc1, /* alloc a block */
+ LgRef1, /* ref a block */
+ LgUnref1, /* free a block */
+ LgFlush, /* flush log, bump gen */
+};
+
+struct Arange {
+ Avl;
+ vlong off;
+ vlong len;
+};
+
+struct Bucket {
+ Lock;
+ Blk *b;
+};
+
+struct Fmsg {
+ Fcall;
+ int fd; /* the fd to repsond on */
+ QLock *wrlk; /* write lock on fd */
+ int sz; /* the size of the message buf */
+ uchar buf[];
+};
+
+/*
* Overall state of the file sytem.
* Shadows the superblock contents.
*/
@@ -75,11 +233,59 @@
int pivsz;
int hdrsz;
+ QLock snaplk;
+ Blk* super;
+
+ Chan *wrchan;
+ Chan *rdchan;
+
+ int fd;
+ long broken;
+
+ Lock rootlk;
+ /* protected by rootlk */
+ vlong rootb;
+ vlong rooth;
int height;
- Blk *root;
-
+
+ Lock genlk;
+ vlong gen;
+ Lock qidlk;
+ vlong nextqid;
+
+ Arena *arenas;
+ int narena;
+ long nextarena;
+ vlong arenasz;
+
+ Lock fidtablk;
+ Fid *fidtab[Nfidtab];
+ Lock dtablk;
+ Dent *dtab[Ndtab];
+
+ Lock lrulk;
+ /* protected by lrulk */
+ Bucket *cache;
+ Blk *chead;
+ Blk *ctail;
+ int ccount;
+ int cmax;
};
+struct Arena {
+ Lock;
+ Avltree *free;
+ Avltree *partial;
+
+ vlong log; /* log head */
+ vlong logh; /* log head hash */
+ Blk *logtl; /* tail block open for writing */
+ Blk *b; /* arena block */
+
+ Blk **q; /* write queue */
+ vlong nq;
+};
+
struct Key{
char *k;
int nk;
@@ -112,6 +318,39 @@
Kvp;
};
+struct Dent {
+ RWLock;
+ Key;
+ Dent *next;
+ long ref;
+
+ Qid qid;
+ vlong length;
+ vlong rootb;
+ char buf[Maxent];
+};
+
+struct Fid {
+ Lock;
+ Fid *next;
+ /*
+ * if opened with OEXEC, we want to use a snapshot,
+ * instead of the most recent root, to prevent
+ * paging in the wrong executable.
+ */
+ vlong rootb;
+ vlong rooth;
+ int height;
+
+ u32int fid;
+ vlong qpath;
+ int mode;
+ int iounit;
+
+ Scan *scan; /* in progres scan */
+ Dent *dent; /* (pqid, name) ref, modified on rename */
+};
+
struct Path {
/* Flowing down for flush */
Blk *b; /* insertion */
@@ -130,21 +369,65 @@
* and midx points at the left of
* the two nodes.
*/
- Blk *ml; /* merged/rotated left */
- Blk *mr; /* merged/rotated right */
+ Blk *nl; /* merged/rotated left */
+ Blk *nr; /* merged/rotated right */
int midx; /* merged index */
char split; /* did we split? */
char merge; /* did we merge? */
};
+struct Scanp {
+ int bi;
+ int vi;
+ Blk *b;
+};
+
+struct Scan {
+ vlong offset; /* last read offset */
+ vlong rootb;
+ vlong rooth;
+ int height;
+
+ int done;
+ Kvp kv;
+ Key pfx;
+ char kvbuf[Kvmax];
+ char pfxbuf[Keymax];
+ Scanp *path;
+};
+
struct Blk {
- char type;
- char flag;
- short nent;
- short valsz;
- short nmsg;
- short bufsz;
+ RWLock;
+
+ /* cache entry */
+ Blk *cnext;
+ Blk *cprev;
+ Blk *hnext;
+
+ short flag;
+
+ /* serialized to disk in header */
+ short type; /* @0, for all */
+ short nval; /* @2, for Leaf, Pivot: data[0:2] */
+ short valsz; /* @4, for Leaf, Pivot: data[2:4] */
+ short nbuf; /* @6, for Pivot */
+ short bufsz; /* @8, for Pivot */
+
+ vlong logsz; /* for allocation log */
+ vlong lognxt; /* for allocation log */
+
vlong off; /* -1 for unallocated */
- int refs; /* TODO: move out */
- char data[Blksz];
+ long ref; /* TODO: move out */
+ char *data;
+ char buf[Blksz];
+};
+
+struct Chan {
+ int size; /* size of queue */
+ long count; /* how many in queue (semaphore) */
+ long avail; /* how many available to send (semaphore) */
+ Lock rl, wl; /* circular pointers */
+ void **rp;
+ void **wp;
+ void* args[]; /* list of saved pointers, [->size] */
};
--- a/fns.h
+++ b/fns.h
@@ -3,26 +3,44 @@
#pragma varargck type "K" Key*
#pragma varargck type "V" Val*
#pragma varargck type "B" Blk*
+#pragma varargck type "R" Arange*
+#pragma varargck type "X" char*
extern Gefs *fs;
extern int debug;
-void initfs(void);
-
Blk* newblk(int type);
Blk* shadow(Blk*, Path*, Path*);
-int putblk(Blk*);
-Blk* getblk(uvlong bp, uvlong bh);
-void freeblk(Blk *b);
+Blk* getroot(int*);
+Blk* getblk(vlong, uvlong);
+Blk* pinblk(Blk*);
+Blk* readblk(vlong, int);
+void putblk(Blk*);
+void enqueue(Blk*);
+void freeblk(Blk*);
ushort blkfill(Blk*);
uvlong blkhash(Blk*);
+u32int ihash(vlong);
+void finalize(Blk*);
+void fillsuper(Blk*);
+int snapshot(void);
uvlong siphash(void*, usize);
+void reamfs(char*);
+int loadarena(Arena*, vlong);
+void loadfs(char*);
+int sync(void);
+int loadlog(Arena *a);
+int synclog(Blk*, vlong, vlong);
+int endfs(void);
+int compresslog(Arena *a);
-int fsupsert(Msg*);
-char *fswalk1(Key*, Kvp*);
+int btupsert(Msg*, int);
+char *btlookup(Key*, Kvp*, Blk**);
+char *btlookupat(Blk*, Key*, Kvp*, Blk**);
+char *btscan(Scan*, Key*, vlong, vlong, int);
+char *btnext(Scan*, Kvp*, int*);
+void btdone(Scan*);
-void* emalloc(usize);
-void* erealloc(void*, usize);
char* estrdup(char*);
int keycmp(Key *, Key *);
@@ -32,14 +50,48 @@
void getmsg(Blk *, int, Msg *);
void initshow(void);
-void showblk(Blk*, char*);
+void showblk(Blk*, char*, int);
void showpath(Path*, int);
void showfs(char*);
+void fshowfs(int, char*);
+void showfree(char*);
int checkfs(void);
+#define dprint(...) \
+ do{ \
+ if(1) fprint(2, __VA_ARGS__); \
+ }while(0)
+
+char *pack8(int*, char*, char*, uchar);
+char *pack16(int*, char*, char*, ushort);
+char *pack32(int*, char*, char*, uint);
+char *pack64(int*, char*, char*, uvlong);
+char *packstr(int*, char*, char*, char*);
+
+/* void* is a bit hacky, but we want both signed and unsigned to work */
+char *unpack8(int*, char*, char*, void*);
+char *unpack16(int*, char*, char*, void*);
+char *unpack32(int*, char*, char*, void*);
+char *unpack64(int*, char*, char*, void*);
+char *unpackstr(int*, char*, char*, char**);
+int dir2kv(vlong, Dir*, Kvp*, char*, int);
+int kv2statbuf(Kvp*, char*, int);
+int kv2dir(Kvp*, Dir*);
+int kv2qid(Kvp*, Qid*);
+
/* scratch */
void setmsg(Blk *, int, Msg *);
void bufinsert(Blk *, Msg *);
-void victim(Blk *b, Path *p);
-void blkinsert(Blk *b, Kvp *kv);
+void victim(Blk *b, Path *p);
+void blkinsert(Blk *b, Kvp *kv);
+Chan *mkchan(int);
+Fmsg *chrecv(Chan*);
+void chsend(Chan*, Fmsg*);
+void runfs(void*);
+void runwrite(void*);
+void runread(void*);
+void runctl(void*);
+
+/* it's in libc... */
+extern int cas(long *, long, long);
--- /dev/null
+++ b/fs.c
@@ -1,0 +1,1103 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+int
+okname(char *name)
+{
+ int i;
+
+ if(name[0] == 0)
+ return -1;
+ if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+ return -1;
+ for(i = 0; i < Maxname; i++){
+ if(name[i] == 0)
+ return 0;
+ if((name[i]&0xff) < 0x20 || name[i] == '/')
+ return -1;
+ }
+ return -1;
+}
+
+static vlong
+nextqid(void)
+{
+ vlong q;
+
+ lock(&fs->qidlk);
+ q = fs->nextqid++;
+ unlock(&fs->qidlk);
+ return q;
+}
+
+static char*
+fslookup(Fid *f, Key *k, Kvp *kv, Blk **bp, int lk)
+{
+ char *e;
+ Blk *b;
+
+ if(f->rootb == -1)
+ b = getroot(nil);
+ else
+ b = getblk(f->rootb, f->rooth);
+ if(b == nil)
+ return Efs;
+ if(lk)
+ rlock(f->dent);
+ e = btlookupat(b, k, kv, bp);
+ if(lk)
+ runlock(f->dent);
+ return e;
+}
+
+static Dent*
+getdent(vlong root, vlong pqid, Dir *d)
+{
+ Dent *e;
+ char *ek, *eb;
+ u32int h;
+ int err;
+
+ h = (ihash(d->qid.path) ^ ihash(root)) % Ndtab;
+ dprint("hash: %d\n", h);
+ lock(&fs->dtablk);
+ for(e = fs->dtab[h]; e != nil; e = e->next){
+ if(e->qid.path == d->qid.path && e->rootb == root){
+ dprint("found %p\n", e);
+ ainc(&e->ref);
+ unlock(&fs->dtablk);
+ return e;
+ }
+ }
+
+ err = 0;
+ if((e = mallocz(sizeof(Dent), 1)) == nil)
+ return nil;
+ e->ref = 1;
+ e->qid = d->qid;
+ e->rootb = root;
+ e->k = e->buf;
+ e->nk = 9 + strlen(d->name) + 1;
+
+ ek = e->buf;
+ eb = ek + sizeof(e->buf);
+ ek = pack8(&err, ek, eb, Kent);
+ ek = pack64(&err, ek, eb, pqid);
+ ek = packstr(&err, ek, eb, d->name);
+ e->nk = ek - e->buf;
+ e->next = fs->dtab[h];
+ fs->dtab[h] = e;
+
+ unlock(&fs->dtablk);
+ return e;
+}
+
+static void
+clunkdent(Dent *de)
+{
+ Dent *e, **pe;
+ u32int h;
+
+ if(adec(&de->ref) == 0){
+ h = (ihash(de->qid.path) ^ ihash(de->rootb)) % Ndtab;
+dprint("freeing dent, hash=%d, p=0x%p\n", h, de);
+ lock(&fs->dtablk);
+ pe = &fs->dtab[h];
+ for(e = fs->dtab[h]; e != nil; e = e->next){
+ if(e == de){
+ *pe = e->next;
+ unlock(&fs->dtablk);
+ free(de);
+ return;
+ }
+ pe = &e->next;
+ }
+ abort();
+ }
+}
+
+void
+showfids(void)
+{
+ int i;
+ Fid *f;
+
+ if(!debug)
+ return;
+ fprint(2, "fids:---\n");
+ for(i = 0; i < Nfidtab; i++)
+ for(f = fs->fidtab[i]; f != nil; f = f->next){
+ rlock(f->dent);
+ fprint(2, "\tfid[%d]: %d [refs=%ld, k=%K]\n", i, f->fid, f->dent->ref, &f->dent->Key);
+ runlock(f->dent);
+ }
+}
+
+Fid*
+getfid(u32int fid)
+{
+ u32int h;
+ Fid *f;
+
+ h = ihash(fid) % Nfidtab;
+ lock(&fs->fidtablk);
+ for(f = fs->fidtab[h]; f != nil; f = f->next)
+ if(f->fid == fid)
+ break;
+ unlock(&fs->fidtablk);
+ return f;
+}
+
+Fid*
+dupfid(int new, Fid *f)
+{
+ Fid *n, *o;
+ u32int h;
+
+ h = ihash(new) % Nfidtab;
+ if((n = malloc(sizeof(Fid))) == nil)
+ return nil;
+
+ *n = *f;
+ n->fid = new;
+ n->mode = -1;
+ n->next = nil;
+
+ lock(&fs->fidtablk);
+ ainc(&n->dent->ref);
+ for(o = fs->fidtab[h]; o != nil; o = o->next)
+ if(o->fid == new)
+ break;
+ if(o == nil){
+ n->next = fs->fidtab[h];
+ fs->fidtab[h] = n;
+ }
+ unlock(&fs->fidtablk);
+
+ if(o != nil){
+ werrstr("fid in use: %d == %d", o->fid, new);
+ abort();
+ free(n);
+ return nil;
+ }
+ return n;
+}
+
+void
+clunkfid(Fid *fid)
+{
+ Fid *f, **pf;
+ u32int h;
+
+ lock(&fs->fidtablk);
+ h = ihash(fid->fid) % Nfidtab;
+ pf = &fs->fidtab[h];
+ for(f = fs->fidtab[h]; f != nil; f = f->next){
+ if(f == fid){
+ *pf = f->next;
+ goto Found;
+ }
+ pf = &f->next;
+ }
+ abort();
+Found:
+ clunkdent(fid->dent);
+ unlock(&fs->fidtablk);
+ free(fid);
+}
+
+void
+fshangup(int fd, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vfprint(2, fmt, ap);
+ va_end(ap);
+ close(fd);
+ abort();
+}
+
+Fmsg*
+readmsg(int fd, int max)
+{
+ char szbuf[4];
+ int sz;
+ Fmsg *m;
+
+ if(readn(fd, szbuf, 4) != 4)
+ return nil;
+ sz = GBIT32(szbuf);
+ if(sz > max)
+ return nil;
+ if((m = malloc(sizeof(Fmsg)+sz)) == nil)
+ return nil;
+ if(readn(fd, m->buf+4, sz-4) != sz-4){
+ werrstr("short read: %r");
+ free(m);
+ return nil;
+ }
+ m->fd = fd;
+ m->sz = sz;
+ PBIT32(m->buf, sz);
+ return m;
+}
+
+void
+respond(Fmsg *m, Fcall *r)
+{
+ uchar buf[Max9p];
+ int w, n;
+
+ r->tag = m->tag;
+ dprint("→ %F\n", r);
+ if((n = convS2M(r, buf, sizeof(buf))) == 0)
+ abort();
+ qlock(m->wrlk);
+ w = write(m->fd, buf, n);
+ qunlock(m->wrlk);
+ if(w != n)
+ fshangup(m->fd, "failed write");
+ free(m);
+}
+
+void
+rerror(Fmsg *m, char *fmt, ...)
+{
+ char buf[128];
+ va_list ap;
+ Fcall r;
+
+ va_start(ap, fmt);
+ vsnprint(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+ r.type = Rerror;
+ r.ename = buf;
+ respond(m, &r);
+}
+
+Chan*
+mkchan(int size)
+{
+ Chan *c;
+
+ if((c = mallocz(sizeof(Chan) + size*sizeof(void*), 1)) == nil)
+ sysfatal("create channel");
+ c->size = size;
+ c->avail = size;
+ c->count = 0;
+ c->rp = c->args;
+ c->wp = c->args;
+ return c;
+
+}
+
+Fmsg*
+chrecv(Chan *c)
+{
+ void *a;
+ long v;
+
+ v = c->count;
+ if(v == 0 || cas(&c->count, v, v-1) == 0)
+ semacquire(&c->count, 1);
+ lock(&c->rl);
+ a = *c->rp;
+ if(++c->rp >= &c->args[c->size])
+ c->rp = c->args;
+ unlock(&c->rl);
+ semrelease(&c->avail, 1);
+ return a;
+}
+
+void
+chsend(Chan *c, Fmsg *m)
+{
+ long v;
+
+ v = c->avail;
+ if(v == 0 || cas(&c->avail, v, v-1) == 0)
+ semacquire(&c->avail, 1);
+ lock(&c->wl);
+ *c->wp = m;
+ if(++c->wp >= &c->args[c->size])
+ c->wp = c->args;
+ unlock(&c->wl);
+ semrelease(&c->count, 1);
+
+}
+
+void
+fsversion(Fmsg *m, int *msz)
+{
+ Fcall r;
+
+ memset(&r, 0, sizeof(Fcall));
+ if(strcmp(m->version, "9P2000") == 0){
+ if(m->msize < *msz)
+ *msz = m->msize;
+ r.type = Rversion;
+ r.msize = *msz;
+ }else{
+ r.type = Rversion;
+ r.version = "unknown";
+ }
+ respond(m, &r);
+}
+
+void
+fsauth(Fmsg *m)
+{
+ Fcall r;
+
+ showfids();
+ r.type = Rerror;
+ r.ename = "unimplemented auth";
+ respond(m, &r);
+}
+
+void
+fsattach(Fmsg *m, int iounit)
+{
+ char *p, *ep, buf[128];
+ int err;
+ Dent *e;
+ Fcall r;
+ Kvp kv;
+ Key dk;
+ Blk *b;
+ Fid f;
+ Dir d;
+
+ err = 0;
+ p = buf;
+ ep = buf + sizeof(buf);
+ p = pack8(&err, p, ep, Kent);
+ p = pack64(&err, p, ep, -1ULL);
+ p = packstr(&err, p, ep, "");
+ dk.k = buf;
+ dk.nk = p - buf;
+showfs("attach");
+ if(btlookup(&dk, &kv, &b) != nil){
+ rerror(m, Efs);
+ return;
+ }
+ r.type = Rattach;
+ if(kv2dir(&kv, &d) == -1){
+ rerror(m, Efs);
+ putblk(b);
+ return;
+ }
+ if((e = getdent(-1, -1, &d)) == nil){
+ rerror(m, Efs);
+ putblk(b);
+ return;
+ }
+ putblk(b);
+
+ /*
+ * A bit of a hack; we're duping a fid
+ * that doesn't exist, so we'd be off
+ * by one on the refcount. Adjust to
+ * compensate for the dup.
+ */
+ adec(&e->ref);
+
+ memset(&f, 0, sizeof(Fid));
+ f.fid = NOFID;
+ f.qpath = d.qid.path;
+ f.mode = -1;
+ f.rooth = -1;
+ f.rootb = -1;
+ f.iounit = iounit;
+ f.dent = e;
+showfids();
+ if(dupfid(m->fid, &f) == nil){
+ rerror(m, Enomem);
+ return;
+ }
+checkfs();
+showfids();
+ r.qid = d.qid;
+ respond(m, &r);
+ return;
+}
+
+void
+fswalk(Fmsg *m)
+{
+ char *p, *e, *estr, kbuf[Maxent];
+ int i, err;
+ vlong up, prev;
+ Fid *o, *f;
+ Dent *dent;
+ Fcall r;
+ Blk *b;
+ Kvp kv;
+ Key k;
+ Dir d;
+
+ if((o = getfid(m->fid)) == nil){
+ rerror(m, Efid);
+ return;
+ }
+ if(o->mode != -1){
+ rerror(m, Einuse);
+ return;
+ }
+ err = 0;
+ estr = nil;
+ up = o->qpath;
+ prev = o->qpath;
+ r.type = Rwalk;
+ r.nwqid = 0;
+ for(i = 0; i < m->nwname; i++){
+ up = prev;
+ p = kbuf;
+ e = p + sizeof(kbuf);
+ p = pack8(&err, p, e, Kent);
+ p = pack64(&err, p, e, up);
+ p = packstr(&err, p, e, m->wname[i]);
+ if(err){
+ rerror(m, "bad walk: %r");
+ return;
+ }
+ k.k = kbuf;
+ k.nk = p - kbuf;
+//showfs("walking");
+dprint("looking up %K\n", &k);
+ if((estr = fslookup(o, &k, &kv, &b, 0)) != nil)
+ break;
+ if(kv2dir(&kv, &d) == -1){
+ rerror(m, Efs);
+ putblk(b);
+ return;
+ }
+ prev = d.qid.path;
+ putblk(b);
+ r.wqid[r.nwqid] = d.qid;
+ r.nwqid++;
+ }
+ if(i == 0 && m->nwname != 0){
+ rerror(m, estr);
+ return;
+ }
+ f = o;
+ if(m->fid != m->newfid){
+ if((f = dupfid(m->newfid, o)) == nil){
+ rerror(m, "%r");
+ return;
+ }
+ }
+ if(i > 0){
+ d.name = m->wname[i-1];
+ dent = getdent(f->rootb, up, &d);
+ if(dent == nil){
+ if(m->fid != m->newfid)
+ clunkfid(f);
+ rerror(m, Enomem);
+ return;
+ }
+ f->qpath = r.wqid[r.nwqid-1].path;
+ f->dent = dent;
+ }
+ respond(m, &r);
+}
+
+void
+fsstat(Fmsg *m)
+{
+ char buf[STATMAX];
+ Fcall r;
+ Fid *f;
+ Kvp kv;
+ Blk *b;
+ int n;
+
+ showfids();
+ if((f = getfid(m->fid)) == nil){
+ rerror(m, "no such fid");
+ return;
+ }
+ if(btlookup(f->dent, &kv, &b) != nil){
+ rerror(m, Eexist);
+ return;
+ }
+ if((n = kv2statbuf(&kv, buf, sizeof(buf))) == -1){
+ rerror(m, "stat: %r");
+ return;
+ }
+ r.type = Rstat;
+ r.stat = (uchar*)buf;
+ r.nstat = n;
+ respond(m, &r);
+ putblk(b);
+}
+
+void
+fswstat(Fmsg *m)
+{
+ USED(m);
+ rerror(m, "wstat unimplemented");
+}
+
+void
+fsclunk(Fmsg *m)
+{
+ Fcall r;
+ Fid *f;
+
+ if((f = getfid(m->fid)) == nil){
+ rerror(m, "no such fid");
+ return;
+ }
+ clunkfid(f);
+ r.type = Rclunk;
+ respond(m, &r);
+}
+
+void
+fscreate(Fmsg *m)
+{
+ char buf[Kvmax];
+ Dent *dent;
+ Fcall r;
+ Msg mb;
+ Fid *f;
+ Dir d;
+
+ if(okname(m->name) == -1){
+ rerror(m, Ename);
+ return;
+ }
+ if((f = getfid(m->fid)) == nil){
+ rerror(m, "no such fid");
+ return;
+ }
+ if(m->perm & (DMMOUNT|DMAUTH)){
+ rerror(m, "unknown permission");
+ return;
+ }
+ d.qid.type = 0;
+ if(m->perm & DMDIR)
+ d.qid.type |= QTDIR;
+ if(m->perm & DMAPPEND)
+ d.qid.type |= QTAPPEND;
+ if(m->perm & DMEXCL)
+ d.qid.type |= QTEXCL;
+ if(m->perm & DMTMP)
+ d.qid.type |= QTTMP;
+ d.qid.path = nextqid();
+ d.qid.vers = 0;
+ d.mode = m->perm;
+ d.name = m->name;
+ d.atime = nsec();
+ d.mtime = d.atime;
+ d.length = 0;
+ d.uid = "glenda";
+ d.gid = "glenda";
+ d.muid = "glenda";
+ mb.op = Oinsert;
+ if(dir2kv(f->qpath, &d, &mb, buf, sizeof(buf)) == -1){
+ rerror(m, "%r");
+ return;
+ }
+//showfs("precreate");
+ if(btupsert(&mb, 1) == -1){
+ rerror(m, "%r");
+ return;
+ }
+//showfs("postcreate");
+ dent = getdent(f->rootb, f->qpath, &d);
+ if(dent == nil){
+ if(m->fid != m->newfid)
+ clunkfid(f);
+ rerror(m, Enomem);
+ return;
+ }
+
+ lock(f);
+ if(f->mode != -1){
+ unlock(f);
+ clunkdent(dent);
+ rerror(m, Einuse);
+ return;
+ }
+ f->mode = m->mode;
+ f->qpath = d.qid.path;
+ f->dent = dent;
+ unlock(f);
+
+ r.type = Rcreate;
+ r.qid = d.qid;
+ r.iounit = f->iounit;
+ respond(m, &r);
+
+}
+
+int
+fsaccess(Dir*, int)
+{
+ /* all is permitted */
+ return 0;
+}
+
+void
+fsopen(Fmsg *m)
+{
+ Fcall r;
+ char *e;
+ Dir d;
+ Fid *f;
+ Blk *b;
+ Kvp kv;
+
+ if((f = getfid(m->fid)) == nil){
+ rerror(m, Efid);
+ return;
+ }
+ if((e = fslookup(f, f->dent, &kv, &b, 0)) != nil){
+ rerror(m, e);
+ return;
+ }
+ if(kv2dir(&kv, &d) == -1){
+ rerror(m, Efs);
+ putblk(b);
+ }
+ if(fsaccess(&d, m->mode) == -1){
+ rerror(m, Eperm);
+ putblk(b);
+ return;
+ }
+ wlock(f->dent);
+ f->dent->length = d.length;
+ wunlock(f->dent);
+ r.type = Ropen;
+ r.qid = d.qid;
+ r.iounit = f->iounit;
+ putblk(b);
+
+ lock(f);
+ if(f->mode != -1){
+ rerror(m, Einuse);
+ unlock(f);
+ return;
+ }
+ f->mode = m->mode;
+ if((f->mode & 0x7) == OEXEC){
+ lock(&fs->rootlk);
+ f->rootb = fs->rootb;
+ f->rooth = fs->rooth;
+ f->height = fs->height;
+// refblk(fs->rootb);
+ unlock(&fs->rootlk);
+ }
+ unlock(f);
+ respond(m, &r);
+}
+
+char*
+fsreaddir(Fmsg *m, Fid *f, Fcall *r)
+{
+ char pfx[9], *p, *e;
+ int n, ns, done;
+ Scan *s;
+ Kvp kv;
+
+ s = f->scan;
+ if(s != nil && s->offset != 0 && s->offset != m->offset)
+ return Edscan;
+ if(s == nil || m->offset == 0){
+ pfx[0] = Kent;
+ PBIT64(pfx+1, f->qpath);
+ kv.k = pfx;
+ kv.nk = sizeof(pfx);
+
+ print("scan starting\n");
+ if((s = mallocz(sizeof(Scan), 1)) == nil)
+ return "out of memory";
+ if((e = btscan(s, &kv, f->rootb, f->rooth, f->height)) != nil){
+ free(r->data);
+ btdone(s);
+ return e;
+ }
+
+ lock(f);
+ if(f->scan != nil)
+ btdone(f->scan);
+ f->scan = s;
+ unlock(f);
+ }
+ if(s->done){
+ fprint(2, "early done...\n");
+ r->count = 0;
+ return nil;
+ }
+ p = r->data;
+ n = m->count;
+ while(1){
+ if((e = btnext(s, &kv, &done)) != nil)
+ return e;
+ if(done)
+ break;
+ if((ns = kv2statbuf(&kv, p, n)) == -1){
+ fprint(2, "** could not fill buf: %r\n");
+ break;
+ }
+ fprint(2, "*** nscan: %d\n", ns);
+ p += ns;
+ n -= ns;
+ }
+ r->count = p - r->data;
+ return nil;
+}
+
+int
+readb(Fid *f, char *d, vlong o, vlong n, int sz)
+{
+ char *e, buf[17];
+ vlong bp, bh, bo;
+ Blk *b;
+ Key k;
+ Kvp kv;
+
+ if(o >= sz)
+ return 0;
+
+ bp = o & ~(Blksz-1);
+ bo = o & (Blksz-1);
+
+ k.k = buf;
+ k.nk = sizeof(buf);
+ k.k[0] = Kdat;
+ PBIT64(k.k+1, f->qpath);
+ PBIT64(k.k+9, bp);
+
+ e = fslookup(f, &k, &kv, &b, 0);
+ if(e != nil && e != Eexist){
+ fprint(2, "!!! error: %s", e);
+ werrstr(e);
+ return -1;
+ }
+ fprint(2, "\treadb: key=%K, val=%P\n", &k, &kv);
+ bp = GBIT64(kv.v+0);
+ bh = GBIT64(kv.v+8);
+ putblk(b);
+
+ if((b = getblk(bp, bh)) == nil)
+ return -1;
+ fprint(2, "\treading from %llx (%llx) %s %s\n", bp, b->off, b->buf, b->data);
+ if(bo+n > Blksz)
+ n = Blksz-bo;
+ if(b != nil){
+ fprint(2, "\tcopying %lld to resp %p\n", n, d);
+ memcpy(d, b->buf, n);
+ putblk(b);
+ }else
+ memset(d, 0, n);
+ return n;
+}
+
+char*
+fsreadfile(Fmsg *m, Fid *f, Fcall *r)
+{
+ vlong n, c, o;
+ char *p;
+ Dent *e;
+
+ r->type = Rread;
+ r->count = 0;
+ e = f->dent;
+ rlock(e);
+ if(m->offset > e->length){
+ runlock(e);
+ return nil;
+ }
+ if((r->data = malloc(m->count)) == nil){
+ runlock(e);
+ return Enomem;
+ }
+ p = r->data;
+ c = m->count;
+ o = m->offset;
+ if(m->offset + m->count > e->length)
+ c = e->length - m->offset;
+//showfs("pre-readb");
+ while(c != 0){
+ n = readb(f, p, o, c, e->length);
+ if(n == -1){
+ runlock(e);
+ return Efs;
+ }
+ r->count += n;
+ if(n == 0)
+ break;
+ p += n;
+ o += n;
+ c -= n;
+ }
+ runlock(e);
+ return nil;
+}
+
+void
+fsread(Fmsg *m)
+{
+ char *e;
+ Fcall r;
+ Fid *f;
+
+ if((f = getfid(m->fid)) == nil){
+ rerror(m, Efid);
+ return;
+ }
+//showfs("fs contents before read");
+ fprint(2, "\n");
+ r.type = Rread;
+ r.count = 0;
+ if((r.data = malloc(m->count)) == nil){
+ rerror(m, Emem);
+ return;
+ }
+ fprint(2, "\nread{{{{\n");
+ if(f->dent->qid.type & QTDIR)
+ e = fsreaddir(m, f, &r);
+ else
+ e = fsreadfile(m, f, &r);
+ if(e != nil){
+ rerror(m, e);
+ return;
+ }else{
+ respond(m, &r);
+ free(r.data);
+ }
+ fprint(2, "\n}}}read\n");
+}
+
+int
+writeb(Fid *f, Msg *m, char *s, vlong o, vlong n, int sz)
+{
+ vlong fb, fo, bp, bh;
+ Blk *b, *t;
+ Kvp kv;
+
+ fb = o & ~(Blksz-1);
+ fo = o & (Blksz-1);
+
+ m->k[0] = Kdat;
+// dprint("offset: %llx\n", fb);
+ PBIT64(m->k+1, f->qpath);
+ PBIT64(m->k+9, fb);
+
+ b = newblk(Traw);
+ if(b == nil)
+ return -1;
+ if(o < sz){
+ fprint(2, "\tappending to block %llx\n", b->off);
+ if(fslookup(f, m, &kv, &t, 0) != nil)
+ goto new;
+ bp = GBIT64(kv.v+0);
+ bh = GBIT64(kv.v+8);
+ putblk(t);
+
+ if((t = getblk(bp, bh)) == nil)
+ return -1;
+ memcpy(b->buf, t->buf, Blksz);
+ putblk(t);
+ }
+new:
+ if(fo+n > Blksz)
+ n = Blksz-fo;
+ memcpy(b->buf, s, n);
+ putblk(b);
+ fprint(2, "\twrote to new blk %llx at offset %lld\n", b->off, o);
+ bh = blkhash(b);
+ PBIT64(m->v+0, b->off);
+ PBIT64(m->v+8, bh);
+ fprint(2, "\tkv: %M", m);
+ return n;
+}
+
+void
+fswrite(Fmsg *m)
+{
+ char sbuf[8], offbuf[4][13+16], *p;
+ vlong n, o, c;
+ Msg kv[4];
+ Fcall r;
+ Fid *f;
+ int i;
+
+
+ if((f = getfid(m->fid)) == nil){
+ rerror(m, Efid);
+ return;
+ }
+ if((f->mode&0x7) != OWRITE){
+ dprint("f->mode: %x\n", f->mode);
+ rerror(m, Einuse);
+ return;
+ }
+
+ p = m->data;
+ o = m->offset;
+ c = m->count;
+ for(i = 0; i < nelem(kv)-1 && c != 0; i++){
+ kv[i].op = Oinsert;
+ kv[i].k = offbuf[i];
+ kv[i].nk = 17;
+ kv[i].v = offbuf[i]+17;
+ kv[i].nv = 16;
+ n = writeb(f, &kv[i], p, o, c, f->dent->length);
+ if(n == -1){
+ // badwrite(f, i);
+ // FIXME: free pages
+ rerror(m, "%r");
+ }
+ p += n;
+ o += n;
+ c -= n;
+ }
+
+ wlock(f->dent);
+ kv[i].op = Owstat;
+ kv[i].k = f->dent->k;
+ kv[i].nk = f->dent->nk;
+ kv[i].v = sbuf;
+ kv[i].nv = 0;
+ if(m->offset+m->count > f->dent->length){
+ fprint(2, "bumping size...\n");
+ kv[i].op |= Owsize;
+ kv[i].nv += 8;
+ PBIT64(kv[i].v, m->offset+m->count);
+ f->dent->length = m->offset+m->count;
+ }
+ btupsert(kv, i+1);
+ wunlock(f->dent);
+
+ r.type = Rwrite;
+ r.count = m->count;
+ respond(m, &r);
+}
+
+void
+runfs(void *pfd)
+{
+ int fd, msgmax;
+ char err[128];
+ QLock *wrlk;
+ Fcall r;
+ Fmsg *m;
+
+ fd = (uintptr)pfd;
+ msgmax = Max9p;
+ if((wrlk = mallocz(sizeof(QLock), 1)) == nil)
+ fshangup(fd, "alloc wrlk: %r");
+ while(1){
+ if((m = readmsg(fd, msgmax)) == nil){
+ fshangup(fd, "truncated message: %r");
+ return;
+ }
+ if(convM2S(m->buf, m->sz, m) == 0){
+ fshangup(fd, "invalid message: %r");
+ return;
+ }
+/*
+ if(m->type != Tversion && !versioned){
+ fshangup(fd, "version required");
+ return;
+ }
+*/
+ m->wrlk = wrlk;
+ dprint("← %F\n", &m->Fcall);
+ switch(m->type){
+ /* sync setup */
+ case Tversion: fsversion(m, &msgmax); break;
+ case Tauth: fsauth(m); break;
+ case Tclunk: fsclunk(m); break;
+ case Topen: fsopen(m); break;
+ case Tattach: fsattach(m, msgmax); break;
+
+ /* mutators */
+ case Tflush: chsend(fs->wrchan, m); break;
+ case Tcreate: chsend(fs->wrchan, m); break;
+ case Twrite: chsend(fs->wrchan, m); break;
+ case Twstat: chsend(fs->wrchan, m); break;
+ case Tremove: chsend(fs->wrchan, m); break;
+
+ /* reads */
+ case Twalk: chsend(fs->rdchan, m); break;
+ case Tread: chsend(fs->rdchan, m); break;
+ case Tstat: chsend(fs->rdchan, m); break;
+ default:
+ fprint(2, "unknown message %F\n", &m->Fcall);
+ snprint(err, sizeof(err), "unknown message: %F", &m->Fcall);
+ r.type = Rerror;
+ r.ename = err;
+ respond(m, &r);
+ break;
+ }
+ }
+}
+
+void
+runwrite(void *)
+{
+ Fmsg *m;
+
+ while(1){
+ m = chrecv(fs->wrchan);
+ switch(m->type){
+ case Tflush: rerror(m, "unimplemented flush"); break;
+ case Tcreate: fscreate(m); break;
+ case Twrite: fswrite(m); break;
+ case Twstat: fswstat(m); break;
+ case Tremove: rerror(m, "unimplemented remove"); break;
+ }
+ }
+}
+
+void
+runread(void *)
+{
+ Fmsg *m;
+
+ while(1){
+ m = chrecv(fs->rdchan);
+ switch(m->type){
+ case Twalk: fswalk(m); break;
+ case Tread: fsread(m); break;
+ case Tstat: fsstat(m); break;
+ }
+ }
+}
+
+void
+runctl(void *pfd)
+{
+ char buf[128];
+ int fd, n;
+
+ fd = (uintptr)pfd;
+ while(1){
+ if((n = read(fd, buf, sizeof(buf)-1)) == -1)
+ break;
+ buf[n--] = 0;
+ while(buf[n] == ' ' || buf[n] == '\t' || buf[n] == '\n')
+ buf[n--] = 0;
+ fprint(2, "ctl message: %s\n", buf);
+ fprint(fd, "ctl message: %s\n", buf);
+ if(strcmp(buf, "show") == 0)
+ fshowfs(fd, "show");
+ else if(strcmp(buf, "check") == 0)
+ checkfs();
+ else
+ fprint(fd, "unknown command %s", buf);
+ }
+}
--- a/hash.c
+++ b/hash.c
@@ -30,7 +30,11 @@
#include <u.h>
#include <libc.h>
#include <fcall.h>
+#include <avl.h>
+#include "dat.h"
+#include "fns.h"
+
#define _le64toh(x) \
GBIT64((char*)&x)
@@ -96,4 +100,19 @@
{
char key[16] = "gefsgefsgefsgefs";
return siphash24(src, len, key);
+}
+
+uvlong
+blkhash(Blk *b)
+{
+ return siphash(b->buf, Blksz);
+}
+
+u32int
+ihash(vlong x)
+{
+ x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
+ x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
+ x = x ^ (x >> 31);
+ return x;
}
--- /dev/null
+++ b/load.c
@@ -1,0 +1,91 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+static int
+rangecmp(Avl *a, Avl *b)
+{
+ return ((Arange*)a)->off - ((Arange*)b)->off;
+}
+
+int
+loadarena(Arena *a, vlong o)
+{
+ Blk *b;
+
+ if((a->free = avlcreate(rangecmp)) == nil)
+ return -1;
+ if((b = readblk(o, 0)) == nil)
+ return -1;
+ a->b = b;
+ a->log = GBIT64(b->data+0);
+ a->logh = GBIT64(b->data+8);
+ if(loadlog(a) == -1)
+ return -1;
+ if(compresslog(a) == -1)
+ return -1;
+ return 0;
+}
+
+void
+loadfs(char *dev)
+{
+ vlong sb;
+ char *p;
+ Blk *b;
+ Dir *d;
+ int i, dirty;
+
+ if((fs->fd = open(dev, ORDWR)) == -1)
+ sysfatal("open %s: %r", dev);
+ if((d = dirfstat(fs->fd)) == nil)
+ sysfatal("ream: %r");
+ sb = d->length - (d->length % Blksz) - Blksz;
+print("superblock @%llx\n", sb);
+ free(d);
+
+ if((b = readblk(sb, 0)) == nil)
+ sysfatal("read superblock: %r");
+ if(b->type != Tsuper)
+ sysfatal("corrupt superblock: bad type");
+ p = b->data;
+ if(memcmp(p, "gefs0001", 8) != 0)
+ sysfatal("corrupt superblock: bad magic");
+ dirty = GBIT32(p + 8);
+ if(GBIT32(p + 12) != Blksz)
+ sysfatal("fs uses different block size");
+ if(GBIT32(p + 16) != Bufspc)
+ sysfatal("fs uses different buffer size");
+ if(GBIT32(p + 20) != Hdrsz)
+ sysfatal("fs uses different buffer size");
+ fs->height = GBIT32(p + 24);
+ fs->rootb = GBIT64(p + 32);
+ fs->rooth = GBIT64(p + 40);
+ fs->narena = GBIT32(p + 48);
+ fs->arenasz = GBIT64(p + 56);
+ fs->arenasz = GBIT64(p + 56);
+ fs->gen = GBIT64(p + 64);
+ fs->nextqid = GBIT64(p + 72);
+ fs->super = b;
+ fprint(2, "load: %8s\n", p);
+ fprint(2, "\theight:\t%d\n", fs->height);
+ fprint(2, "\trootb:\t%llx\n", fs->rootb);
+ fprint(2, "\trooth:\t%llx\n", fs->rooth);
+ fprint(2, "\tarenas:\t%d\n", fs->narena);
+ fprint(2, "\tarenasz:\t%lld\n", fs->arenasz);
+ fprint(2, "\trootgen:\t%lld\n", fs->gen);
+ fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
+ if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+ sysfatal("malloc: %r");
+ for(i = 0; i < fs->narena; i++)
+ if((loadarena(&fs->arenas[i], i*fs->arenasz)) == -1)
+ sysfatal("loadfs: %r");
+ if(dirty){
+ fprint(2, "file system was not unmounted cleanly");
+ /* TODO: start gc pass */
+ }
+}
--- a/main.c
+++ b/main.c
@@ -1,11 +1,19 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
+#include <avl.h>
+#include <fcall.h>
+#include <ctype.h>
+
#include "dat.h"
#include "fns.h"
Gefs *fs;
+int ream;
+int debug;
+char *srvname = "gefs";
+
static int
Bconv(Fmt *fmt)
{
@@ -14,22 +22,40 @@
b = va_arg(fmt->args, Blk*);
if(b == nil)
return fmtprint(fmt, "Blk(nil)");
- return fmtprint(fmt, "Blk(%c)", (b->type == Pivot) ? 'P' : 'L');
+ return fmtprint(fmt, "Blk(%c)", (b->type == Tpivot) ? 'P' : 'L');
}
+void
+launch(void (*f)(void *), void *arg, char *text)
+{
+ int pid;
+
+
+ pid = rfork(RFPROC|RFMEM|RFNOWAIT);
+ if (pid < 0)
+ sysfatal("can't fork: %r");
+ if (pid == 0) {
+ procsetname("%s", text);
+ (*f)(arg);
+ exits("child returned");
+ }
+}
+
static int
Mconv(Fmt *fmt)
{
char *opname[] = {
- [Ocreate] "Ocreate",
+ [Oinsert] "Oinsert",
[Odelete] "Odelete",
- [Owrite] "Owrite",
[Owstat] "Owstat",
};
Msg *m;
m = va_arg(fmt->args, Msg*);
- return fmtprint(fmt, "Msg(%s, %.*s,%.*s)", opname[m->op], m->nk, m->k, m->nv, m->v);
+ return fmtprint(fmt, "Msg(%s, [%d]%.*X,[%d]%.*X)",
+ opname[m->op&0xf],
+ m->nk, m->nk, m->k,
+ m->nv, m->nv, m->v);
}
static int
@@ -39,126 +65,159 @@
kv = va_arg(fmt->args, Kvp*);
if(kv->type == Vinl)
- return fmtprint(fmt, "Kvp(%.*s,%.*s)", kv->nk, kv->k, kv->nv, kv->v);
+ return fmtprint(fmt, "Kvp([%d]%.*X,[%d]%.*X)",
+ kv->nk, kv->nk, kv->k,
+ kv->nv, kv->nv, kv->v);
else
- return fmtprint(fmt, "Kvp(%.*s,(%llux,%llux,%ud))",
- kv->nk, kv->k, kv->bp, kv->bh, kv->fill);
+ return fmtprint(fmt, "Kvp([%d]%.*X,(%llux,%llux,%ud))",
+ kv->nk, kv->nk, kv->k,
+ kv->bp, kv->bh, kv->fill);
}
static int
+Rconv(Fmt *fmt)
+{
+ Arange *r;
+
+ r = va_arg(fmt->args, Arange*);
+ if(r == nil)
+ return fmtprint(fmt, "<Arange:nil>");
+ else
+ return fmtprint(fmt, "Arange(%lld+%lld)", r->off, r->len);
+}
+
+static int
Kconv(Fmt *fmt)
{
Key *k;
k = va_arg(fmt->args, Key*);
- return fmtprint(fmt, "Key(%.*s)", k->nk, k->k);
+ return fmtprint(fmt, "Key([%d]%.*X)", k->nk, k->nk, k->k);
}
-static void
-init(void)
+static int
+Xconv(Fmt *fmt)
{
- initshow();
- quotefmtinstall();
- fmtinstall('B', Bconv);
- fmtinstall('M', Mconv);
- fmtinstall('P', Pconv);
- fmtinstall('K', Kconv);
- fs = emalloc(sizeof(Gefs));
- fs->root = newblk(Leaf);
- fs->height = 1;
+ char *s, *e;
+ int n, i;
+
+ n = 0;
+ i = fmt->prec;
+ s = va_arg(fmt->args, char*);
+ e = s + fmt->prec;
+ for(; s != e; s++){
+ if(i % 4 == 0 && i != 0)
+ n += fmtprint(fmt, ":");
+ i--;
+ if(isalnum(*s))
+ n += fmtrune(fmt, *s);
+ else
+ n += fmtprint(fmt, "%02x", *s&0xff);
+ }
+ return n;
}
-int
-test(char *path)
+void
+initfs(vlong cachesz)
{
- Biobuf *bfd;
- char *e, *ln, *f[3];
- int nf;
- Msg m;
- Kvp r;
- Key k;
+ if((fs = mallocz(sizeof(Gefs), 1)) == nil)
+ sysfatal("malloc: %r");
- if((bfd = Bopen(path, OREAD)) == nil)
- sysfatal("open %s: %r", path);
- while((ln = Brdstr(bfd, '\n', 1)) != nil){
- memset(f, 0, sizeof(f));
- nf = tokenize(ln, f, nelem(f));
- if(nf < 1 || strlen(f[0]) != 1)
- sysfatal("malformed test file");
- switch(*f[0]){
- case '#':
- break;
- case 'I':
- if(nf != 3)
- sysfatal("malformed insert");
- m.type = Vinl;
- m.k = f[1];
- m.v = f[2];
- m.op = Ocreate;
- m.nk = strlen(f[1]);
- m.nv = strlen(f[2]);
- print("insert (%s, %s)\n", m.k, m.v);
- if(fsupsert(&m) == -1){
- print("failed insert (%s, %s): %r\n", f[1], f[2]);
- return -1;
- }
- break;
- case 'D':
- if(nf != 2)
- sysfatal("malformed delete");
- m.type = Vinl;
- m.op = Odelete;
- m.k = f[1];
- m.v = nil;
- m.nk = strlen(f[1]);
- m.nv = 0;
- print("delete %s\n", f[1]);
- if(fsupsert(&m) == -1){
- print("failed delete (%s): %r\n", f[1]);
- return -1;
- }
- break;
- case 'G':
- k.k = f[1];
- k.nk = strlen(f[1]);
- e = fswalk1(&k, &r);
- if(e != nil){
- print("failed lookup on (%s): %s\n", f[1], e);
- return -1;
- }
- break;
- case 'S':
- showfs("fs");
- print("\n\n");
- break;
- case 'C':
- checkfs();
- break;
- case 'V':
- debug++;
- break;
- case 'X':
- exits(f[1]);
- break;
- }
-// if(!checkfs())
-// abort();
- free(ln);
- }
- return 0;
+ fs->cmax = cachesz/Blksz;
+ if(fs->cmax >= (2*GiB)/sizeof(Bucket))
+ sysfatal("cache too big");
+ if((fs->cache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
+ sysfatal("malloc: %r");
+}
+
+int
+postfd(char *name, char *suff)
+{
+ char buf[80];
+ int fd[2];
+ int cfd;
+
+ if(pipe(fd) < 0)
+ sysfatal("can't make a pipe");
+ snprint(buf, sizeof buf, "/srv/%s%s", name, suff);
+ if((cfd = create(buf, OWRITE|ORCLOSE|OCEXEC, 0600)) == -1)
+ sysfatal("create %s: %r", buf);
+ if(fprint(cfd, "%d", fd[0]) == -1)
+ sysfatal("write %s: %r", buf);
+ close(fd[0]);
+ return fd[1];
}
void
+usage(void)
+{
+ fprint(2, "usage: %s [-r] dev\n", argv0);
+ exits("usage");
+}
+
+void
main(int argc, char **argv)
{
- int i;
+ int srvfd, ctlfd;
+ vlong cachesz;
+ cachesz = 16*MiB;
ARGBEGIN{
+ case 'r':
+ ream = 1;
+ break;
+ case 'c':
+ cachesz = strtoll(EARGF(usage()), nil, 0)*MiB;
+ break;
+ case 'd':
+ debug++;
+ break;
+ case 's':
+ srvname = EARGF(usage());
+ break;
+ default:
+ usage();
+ break;
}ARGEND;
+ if(argc == 0)
+ usage();
- init();
- for(i = 0; i < argc; i++)
- if(test(argv[0]) == -1)
- sysfatal("test %s: %r\n", argv[i]);
- exits(nil);
+ /*
+ * sanity checks -- I've tuned these to stupid
+ * values in the past.
+ */
+// assert(4*Kpmax < Pivspc);
+// assert(2*Msgmax < Bufspc);
+
+ initfs(cachesz);
+ initshow();
+ quotefmtinstall();
+ fmtinstall('B', Bconv);
+ fmtinstall('M', Mconv);
+ fmtinstall('P', Pconv);
+ fmtinstall('K', Kconv);
+ fmtinstall('R', Rconv);
+ fmtinstall('X', Xconv);
+ fmtinstall('F', fcallfmt);
+ if(ream){
+ reamfs(argv[0]);
+ exits(nil);
+ }else{
+ fs->rdchan = mkchan(128);
+ fs->wrchan = mkchan(128);
+ srvfd = postfd(srvname, "");
+ ctlfd = postfd(srvname, ".ctl");
+ loadfs(argv[0]);
+ launch(runctl, (void*)ctlfd, "ctl");
+ launch(runwrite, nil, "writeio");
+ launch(runread, nil, "readio");
+// launch(runfs, (void*)srvfd, "fs");
+ runfs((void*)srvfd);
+// launch(syncproc, nil, "sync");
+// launch(flushproc, &fs->flushev, "flush");
+// for(i = 1; i < argc; i++)
+// if(test(argv[i]) == -1)
+// sysfatal("test %s: %r\n", argv[i]);
+ exits(nil);
+ }
}
--- a/mkfile
+++ b/mkfile
@@ -4,8 +4,12 @@
OFILES=\
blk.$O\
check.$O\
+ fs.$O\
hash.$O\
+ load.$O\
main.$O\
+ pack.$O\
+ ream.$O\
tree.$O\
util.$O\
--- /dev/null
+++ b/pack.c
@@ -1,0 +1,266 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+char*
+unpack8(int *err, char *p, char *e, void *v)
+{
+ if (e - p < 1 || *err){
+ *err = 1;
+ return p;
+ }
+ *(uchar*)v = p[0];
+ return p+1;
+}
+
+char*
+unpack16(int *err, char *p, char *e, void *v)
+{
+ if (e - p < 2 || *err){
+ *err = 1;
+ return p;
+ }
+ *(ushort*)v = GBIT16(p);
+ return p+2;
+}
+
+char*
+unpack32(int *err, char *p, char *e, void *v)
+{
+ if (e - p < 4 || *err){
+ *err = 1;
+ return p;
+ }
+ *(uint*)v = GBIT32(p);
+ return p+4;
+}
+
+char*
+unpack64(int *err, char *p, char *e, void *v)
+{
+ if (e - p < 8 || *err){
+ *err = 1;
+ return p;
+ }
+ *(uvlong*)v = GBIT64(p);
+ return p+8;
+}
+
+/* Terminated so we can use them directly in C */
+char*
+unpackstr(int *err, char *p, char *e, char **s)
+{
+ int n;
+
+ if (e - p < 3 || *err){
+ *err = 1;
+ return p;
+ }
+ n = GBIT16(p);
+ if(e - p < n + 3 || p[n+2] != 0){
+ *err = 1;
+ return p;
+ }
+ *s = p+2;
+ return p+3+n;
+}
+
+char*
+pack8(int *err, char *p, char *e, uchar v)
+{
+ if (e - p < 1 || *err){
+ *err = 1;
+ return p;
+ }
+ p[0] = v;
+ return p+1;
+}
+
+char*
+pack16(int *err, char *p, char *e, ushort v)
+{
+ if (e - p < 2 || *err){
+ *err = 1;
+ return p;
+ }
+ PBIT16(p, v);
+ return p+2;
+}
+
+char*
+pack32(int *err, char *p, char *e, uint v)
+{
+ if (e - p < 4 || *err){
+ *err = 1;
+ return p;
+ }
+ PBIT32(p, v);
+ return p+4;
+}
+
+char*
+pack64(int *err, char *p, char *e, uvlong v)
+{
+ if (e - p < 8 || *err){
+ *err = 1;
+ return p;
+ }
+ PBIT64(p, v);
+ return p+8;
+}
+
+/* Terminated so we can use them directly in C */
+char*
+packstr(int *err, char *p, char *e, char *s)
+{
+ int n;
+
+ n = strlen(s);
+ if (e - p < n+3 || *err){
+ *err = 1;
+ return p;
+ }
+ PBIT16(p+0, n);
+ memcpy(p+2, s, n);
+ p[2+n] = 0;
+ return p+3+n;
+}
+
+int
+dir2kv(vlong up, Dir *d, Kvp *kv, char *buf, int nbuf)
+{
+ char *k, *ek, *v, *ev, *eb;
+ int err;
+
+ err = 0;
+ k = buf;
+ ek = buf;
+ eb = buf + nbuf;
+ ek = pack8(&err, ek, eb, Kent);
+ ek = pack64(&err, ek, eb, up);
+ ek = packstr(&err, ek, eb, d->name);
+
+ v = ek;
+ ev = ek;
+ ev = pack64(&err, ev, eb, d->qid.path);
+ ev = pack32(&err, ev, eb, d->qid.vers);
+ ev = pack8(&err, ev, eb, d->qid.type);
+ ev = pack32(&err, ev, eb, d->mode);
+ ev = pack64(&err, ev, eb, d->atime*Nsec);
+ ev = pack64(&err, ev, eb, d->mtime*Nsec);
+ ev = pack64(&err, ev, eb, d->length);
+ ev = packstr(&err, ev, eb, d->uid);
+ ev = packstr(&err, ev, eb, d->gid);
+ ev = packstr(&err, ev, eb, d->muid);
+ if(err){
+ werrstr("stat too big: %.*s...", 32, d->name);
+ return -1;
+ }
+ kv->type = Vinl;
+ kv->k = k;
+ kv->nk = ek - k;
+ kv->v = v;
+ kv->nv = ev - v;
+ return 0;
+}
+
+int
+name2dkey(vlong up, char *name, Key *k, char *buf, int nbuf)
+{
+ char *ek, *eb;
+ int err;
+
+ err = 0;
+ ek = buf;
+ eb = buf + nbuf;
+ ek = pack8(&err, ek, eb, Kent);
+ ek = pack64(&err, ek, eb, up);
+ ek = packstr(&err, ek, eb, name);
+ if(err)
+ return -1;
+ k->k = buf;
+ k->nk = ek - buf;
+ return k->nk;
+}
+
+int
+kv2dir(Kvp *kv, Dir *d)
+{
+ char *k, *ek, *v, *ev;
+ vlong atime, mtime;
+ int err;
+
+ err = 0;
+ k = kv->k + 9;
+ ek = kv->k + kv->nk;
+dprint("unpacking... [%d %d]\n", k[0], k[1]);
+ k = unpackstr(&err, k, ek, &d->name);
+
+ v = kv->v;
+ ev = v + kv->nv;
+ v = unpack64(&err, v, ev, &d->qid.path);
+ v = unpack32(&err, v, ev, &d->qid.vers);
+ v = unpack8(&err, v, ev, &d->qid.type);
+ v = unpack32(&err, v, ev, &d->mode);
+ v = unpack64(&err, v, ev, &atime);
+ v = unpack64(&err, v, ev, &mtime);
+ v = unpack64(&err, v, ev, &d->length);
+ v = unpackstr(&err, v, ev, &d->uid);
+ v = unpackstr(&err, v, ev, &d->gid);
+ v = unpackstr(&err, v, ev, &d->muid);
+ if(err){
+ werrstr("kv too small");
+ return -1;
+ }
+ if(k != ek){
+ werrstr("invalid path");
+ return -1;
+ }
+ if(v != ev){
+ werrstr("stat full of fuck");
+ return -1;
+ }
+ d->atime = (atime+Nsec/2)/Nsec;
+ d->mtime = (mtime+Nsec/2)/Nsec;
+ return 0;
+}
+
+int
+kv2statbuf(Kvp *kv, char *buf, int nbuf)
+{
+ Dir d;
+ int n;
+
+ if(kv2dir(kv, &d) == -1)
+ return -1;
+ dprint("have %d bytes to pack into\n", nbuf);
+ if((n = convD2M(&d, (uchar*)buf, nbuf)) <= BIT16SZ){
+ fprint(2, "here...failed convert??, needed %d\n", GBIT16(buf));
+ return -1;
+ }
+ return n;
+}
+
+int
+kv2qid(Kvp *kv, Qid *q)
+{
+ char *v, *ev;
+ int err;
+
+ err = 0;
+ v = kv->v;
+ ev = v + kv->nv;
+ v = unpack64(&err, v, ev, &q->path);
+ v = unpack32(&err, v, ev, &q->vers);
+ unpack8(&err, v, ev, &q->type);
+ if(err){
+ werrstr("kv too small");
+ return -1;
+ }
+ return 0;
+}
--- /dev/null
+++ b/ream.c
@@ -1,0 +1,157 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+
+static void
+initroot(Blk *r)
+{
+ char buf[512];
+ Kvp kv;
+ Dir d;
+
+ memset(&d, 0, sizeof(Dir));
+ d.qid = (Qid){fs->nextqid++, 0, QTDIR};
+ d.mode = 0755;
+ d.atime = 0;
+ d.mtime = 0;
+ d.length = 0;
+ d.name = "";
+ d.uid = "glenda";
+ d.gid = "glenda";
+ d.muid = "glenda";
+ if(dir2kv(-1, &d, &kv, buf, sizeof(buf)) == -1)
+ sysfatal("ream: pack root: %r");
+ blkinsert(r, &kv);
+
+ kv.k = buf;
+ kv.nk = 9;
+ kv.v = buf+9;
+ kv.nv = 8;
+ buf[0] = Ksuper;
+ PBIT64(buf+1, 0);
+ PBIT64(buf+9, 0);
+ blkinsert(r, &kv);
+}
+
+static void
+reamarena(Arena *a, vlong start, vlong asz)
+{
+ vlong off, bo, bh;
+ char *p;
+ Blk *b;
+
+ off = start;
+ if((b = mallocz(sizeof(Blk), 1)) == nil)
+ sysfatal("ream: %r");
+ off += Blksz; /* arena header */
+
+ a->log = -1;
+ memset(b, 0, sizeof(Blk));
+ b->type = Tlog;
+ b->off = off;
+ b->logsz = 32;
+ b->data = b->buf + Hdrsz;
+ b->flag |= Bdirty;
+
+ p = b->data;
+ PBIT64(p+24, off|LgFree); /* off */
+ PBIT64(p+32, asz); /* len */
+ PBIT64(p+40, b->off|LgAlloc); /* off */
+ PBIT64(p+48, Blksz); /* len */
+ finalize(b);
+ synclog(b, -1, 32);
+
+ bh = blkhash(b);
+ bo = b->off;
+
+ memset(b, 0, sizeof(Blk));
+ b->type = Tarena;
+ b->off = start;
+ p = b->buf + Hdrsz;
+ print("b->off: %llx\n", b->off);
+ PBIT64(p+0, bo);
+ PBIT64(p+8, bh);
+ finalize(b);
+ if(pwrite(fs->fd, b->buf, Blksz, b->off) == -1)
+ sysfatal("ream: write arena: %r");
+}
+
+void
+reamfs(char *dev)
+{
+ vlong sz, asz, off;
+ Blk *s, *r;
+ Dir *d;
+ int i;
+
+ if((fs->fd = open(dev, ORDWR)) == -1)
+ sysfatal("open %s: %r", dev);
+ if((d = dirfstat(fs->fd)) == nil)
+ sysfatal("ream: %r");
+ if(d->length < 64*MiB)
+ sysfatal("ream: disk too small");
+ if((s = mallocz(sizeof(Blk), 1)) == nil)
+ sysfatal("ream: %r");
+
+
+ sz = d->length;
+ sz = sz - (sz % Blksz) - Blksz;
+ fs->narena = sz / (128*GiB);
+ if(fs->narena < 1)
+ fs->narena = 1;
+ if(fs->narena >= 128)
+ fs->narena = 128;
+ if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+ sysfatal("malloc: %r");
+ free(d);
+
+ asz = sz/fs->narena;
+ asz = asz - (asz % Blksz) - Blksz;
+ fs->arenasz = asz;
+ off = 0;
+ fprint(2, "reaming %d arenas:\n", fs->narena);
+
+ for(i = 0; i < fs->narena; i++){
+ print("\tarena %d: %lld blocks at %lld\n", i, asz/Blksz, off);
+ reamarena(&fs->arenas[i], off, asz);
+ asz += off;
+ }
+
+ s->type = Tsuper;
+ s->off = sz;
+ s->data = s->buf + Hdrsz;
+ fillsuper(s);
+ finalize(s);
+
+print("superblock @%llx\n", s->off);
+ for(i = 0; i < fs->narena; i++)
+ if((loadarena(&fs->arenas[i], i*asz)) == -1)
+ sysfatal("ream: loadarena: %r");
+
+ /*
+ * Now that we have a completely empty fs, give it
+ * a single root block that the tree will insert
+ * into, and take a snapshot as the initial state.
+ */
+ if((r = newblk(Tleaf)) == nil)
+ sysfatal("ream: allocate root: %r");
+ initroot(r);
+ finalize(r);
+
+ fs->super = s;
+ fs->rootb = r->off;
+ fs->rooth = blkhash(r);
+ fs->height = 1;
+ snapshot();
+
+ putblk(s);
+ putblk(r);
+ if(sync() == -1)
+ sysfatal("ream: sync: %r");
+}
--- a/tree.c
+++ b/tree.c
@@ -1,23 +1,44 @@
#include <u.h>
#include <libc.h>
#include <fcall.h>
+#include <avl.h>
#include "dat.h"
#include "fns.h"
-int lookup;
void
-dprint(char *fmt, ...)
+cpkey(Key *dst, Key *src, char *buf, int nbuf)
{
- va_list ap;
+ assert(src->nk <= nbuf);
+ dst->k = buf;
+ dst->nk = src->nk;
+ memcpy(dst->k, src->k, src->nk);
+}
- if(!debug)
- return;
- va_start(ap, fmt);
- vfprint(2, fmt, ap);
- va_end(ap);
+void
+cpkvp(Kvp *dst, Kvp *src, char *buf, int nbuf)
+{
+ assert(src->nk+src->nv <= nbuf);
+ cpkey(dst, src, buf, nbuf);
+ dst->type = src->type;
+ if(src->type == Vinl){
+ dst->v = buf+src->nk;
+ dst->nv = src->nv;
+ }else{
+ dst->bp = src->bp;
+ dst->bh = src->bh;
+ dst->fill = src->fill;
+ }
+ memcpy(dst->v, src->v, src->nv);
}
+void
+cpmsg(Msg *dst, Msg *src, char *buf, int nbuf)
+{
+ dst->op = src->op;
+ cpkvp(dst, src, buf, nbuf);
+}
+
int
keycmp(Key *a, Key *b)
{
@@ -54,9 +75,9 @@
{
int o;
- assert(i >= 0 && i < b->nent);
+ assert(i >= 0 && i < b->nval);
o = GBIT16(b->data + 2*i);
- if(b->type == Pivot){
+ if(b->type == Tpivot){
kv->type = Vref;
kv->nk = GBIT16(b->data + o);
kv->k = b->data + o + 2;
@@ -78,16 +99,16 @@
int o, nk, nv, ksz, vsz, spc;
char *p;
- assert(i >= 0 && i <= b->nent);
- spc = (b->type == Leaf) ? Leafspc : Pivspc;
+ assert(i >= 0 && i <= b->nval);
+ spc = (b->type == Tleaf) ? Leafspc : Pivspc;
p = b->data + 2*i;
nk = 2 + kv->nk;
nv = (kv->type == Vref) ? Ptrsz : 2 + kv->nv;
if (i < 0)
i = 0;
- if(!replace || b->nent == i){
- memmove(p + 2, p, 2*(b->nent - i));
- b->nent++;
+ if(!replace || b->nval == i){
+ memmove(p + 2, p, 2*(b->nval - i));
+ b->nval++;
b->valsz += nk + nv;
o = spc - b->valsz;
}else{
@@ -99,7 +120,7 @@
*/
o = GBIT16(b->data + 2*i);
ksz = 2 + GBIT16(b->data + o);
- if(b->type == Leaf)
+ if(b->type == Tleaf)
vsz = 2 + GBIT16(b->data + o + ksz);
else
vsz = 16;
@@ -109,13 +130,16 @@
}
}
- if(2*b->nent + b->valsz > spc)
- showblk(b, "setval overflow");
- assert(2*b->nent + b->valsz <= spc);
- assert(2*b->nent < o);
+ if(2*b->nval + b->valsz > spc){
+ dprint("2*%d + %d > %d [ksz: %d, vsz: %d]\n",
+ 2*b->nval, b->valsz, spc, kv->nk, kv->nv);
+ showblk(b, "setval overflow", 1);
+ abort();
+ }
+ assert(2*b->nval + b->valsz <= spc);
+ assert(2*b->nval <= o);
p = b->data + o;
- if(b->type == Pivot){
- assert(kv->type == Vref);
+ if(b->type == Tpivot){
PBIT16(b->data + 2*i, o);
PBIT16(p + 0, kv->nk);
memcpy(p + 2, kv->k, kv->nk);
@@ -123,7 +147,6 @@
PBIT64(p + kv->nk + 10, kv->bh);
PBIT16(p + kv->nk + 18, kv->fill);
} else {
- assert(kv->type == Vinl);
PBIT16(b->data + 2*i, o);
PBIT16(p + 0, kv->nk);
memcpy(p + 2, kv->k, kv->nk);
@@ -137,10 +160,10 @@
{
char *p;
- assert(i >= 0 && i <= b->nent);
- b->nent--;
+ assert(i >= 0 && i <= b->nval);
+ b->nval--;
p = b->data + 2*i;
- memmove(p, p + 2, 2*(b->nent - i));
+ memmove(p, p + 2, 2*(b->nval - i));
return 0;
}
@@ -150,15 +173,15 @@
char *p;
int o;
- assert(b->type == Pivot);
- assert(i >= 0 && i <= b->nmsg);
- b->nmsg++;
+ assert(b->type == Tpivot);
+ assert(i >= 0 && i <= b->nbuf);
+ b->nbuf++;
b->bufsz += msgsz(m);
- assert(2*b->nent + b->bufsz <= Bufspc);
+ assert(2*b->nbuf + b->bufsz <= Bufspc);
p = b->data + Pivspc;
o = Pivspc - b->bufsz;
- memmove(p + 2*(i+1), p+2*i, 2*(b->nmsg - i));
+ memmove(p + 2*(i+1), p+2*i, 2*(b->nbuf - i));
PBIT16(p + 2*i, o);
p = b->data + Bufspc + o;
@@ -175,8 +198,8 @@
char *p;
int o;
- assert(b->type == Pivot);
- assert(i >= 0 && i < b->nmsg);
+ assert(b->type == Tpivot);
+ assert(i >= 0 && i < b->nbuf);
o = GBIT16(b->data + Pivspc + 2*i);
p = b->data + Pivspc + o;
@@ -196,7 +219,7 @@
r = -1;
lo = 0;
- hi = b->nent;
+ hi = b->nval;
while(lo < hi){
mid = (hi + lo) / 2;
getval(b, mid, &cmp);
@@ -206,7 +229,7 @@
else
lo = mid + 1;
}
- if(lo < b->nent){
+ if(lo < b->nval){
getval(b, lo, &cmp);
r = keycmp(kv, &cmp);
}
@@ -220,12 +243,12 @@
Msg cmp;
lo = 0;
- hi = b->nmsg;
+ hi = b->nbuf;
while(lo < hi){
mid = (hi + lo) / 2;
getmsg(b, mid, &cmp);
r = keycmp(m, &cmp);
- if(r <= 0)
+ if(r < 0)
hi = mid;
else
lo = mid + 1;
@@ -234,26 +257,30 @@
}
static int
-bufsearch(Blk *b, Key *k, Msg *m, int *idx)
+bufsearch(Blk *b, Key *k, Msg *m, int *same)
{
int lo, hi, mid, r;
+ r = -1;
lo = 0;
- hi = b->nmsg - 1;
+ hi = b->nbuf - 1;
while(lo <= hi){
mid = (hi + lo) / 2;
getmsg(b, mid, m);
r = keycmp(k, m);
- if(r == 0){
- *idx = mid;
- return 0;
- }
if(r < 0)
hi = mid - 1;
else
lo = mid + 1;
}
- return -1;
+ lo--;
+ if(lo >= 0){
+ getmsg(b, lo, m);
+ r = keycmp(k, m);
+ }
+ if(same != nil)
+ *same = (r == 0);
+ return lo;
}
int
@@ -263,7 +290,7 @@
Kvp cmp;
lo = 0;
- hi = b->nent - 1;
+ hi = b->nval - 1;
while(lo <= hi){
mid = (hi + lo) / 2;
getval(b, mid, &cmp);
@@ -279,7 +306,7 @@
}
int
-blksearch(Blk *b, Key *k, Kvp *rp, int *idx, int *same)
+blksearch(Blk *b, Key *k, Kvp *rp, int *same)
{
int lo, hi, mid, r;
Kvp cmp;
@@ -286,7 +313,7 @@
r = -1;
lo = 0;
- hi = b->nent;
+ hi = b->nval;
while(lo < hi){
mid = (hi + lo) / 2;
getval(b, mid, &cmp);
@@ -303,16 +330,14 @@
}
if(same != nil)
*same = (r == 0);
- if(idx != nil)
- *idx = lo;
- return 0;
+ return lo;
}
int
filledbuf(Blk *b, int needed)
{
- assert(b->type == Pivot);
- return 2*b->nmsg + b->bufsz > Bufspc - needed;
+ assert(b->type == Tpivot);
+ return 2*(b->nbuf+1) + b->bufsz + needed > Bufspc;
}
@@ -319,8 +344,8 @@
int
filledleaf(Blk *b, int needed)
{
- assert(b->type == Leaf);
- return 2*b->nent + b->valsz > Leafspc - needed;
+ assert(b->type == Tleaf);
+ return 2*(b->nval+1) + b->valsz + needed > Leafspc;
}
int
@@ -329,10 +354,10 @@
/*
* We need to guarantee there's room for one message
* at all times, so that splits along the whole path
- * have somewhere to go.
+ * have somewhere to go as they propagate up.
*/
- assert(b->type == Pivot);
- return 2*b->nent + b->valsz > Pivspc - reserve*Kpmax;
+ assert(b->type == Tpivot);
+ return 2*(b->nval+1) + b->valsz + reserve*Kpmax > Pivspc;
}
int
@@ -344,7 +369,7 @@
if(pp->split){
getval(pp->l, 0, &kv);
kv.type = Vref;
- kv.bp = (uintptr)pp->l;
+ kv.bp = pp->l->off;
kv.bh = blkhash(pp->l);
kv.fill = blkfill(pp->l);
setval(n, i++, &kv, 0);
@@ -353,7 +378,7 @@
getval(pp->r, 0, &kv);
kv.type = Vref;
- kv.bp = (uintptr)pp->r;
+ kv.bp = pp->r->off;
kv.bh = blkhash(pp->r);
kv.fill = blkfill(pp->r);
setval(n, i++, &kv, 0);
@@ -362,7 +387,7 @@
}else{
getval(pp->n, 0, &kv);
kv.type = Vref;
- kv.bp = (uintptr)pp->n;
+ kv.bp = pp->n->off;
kv.bh = blkhash(pp->n);
kv.fill = blkfill(pp->n);
setval(n, i++, &kv, 1);
@@ -395,15 +420,15 @@
midx = (p != nil && p->merge) ? p->midx : -1;
if((n = newblk(b->type)) == nil)
return -1;
- for(i = 0; i < b->nent; i++){
+ for(i = 0; i < b->nval; i++){
if(i == pidx){
j = copyup(n, j, pp, nil);
continue;
}else if(i == midx){
- getval(p->ml, 0, &m);
+ getval(p->nl, 0, &m);
setval(n, j++, &m, 0);
- if(p->mr){
- getval(p->mr, 0, &m);
+ if(p->nr){
+ getval(p->nr, 0, &m);
setval(n, j++, &m, 0);
}
continue;
@@ -411,18 +436,18 @@
getval(b, i, &m);
setval(n, j++, &m, 0);
}
- if(b->type == Pivot){
+ if(b->type == Tpivot){
i = 0;
j = 0;
- while(i < b->nmsg){
+ while(i < b->nbuf){
if(i == lo)
i = hi;
- if(i == b->nmsg)
+ if(i == b->nbuf)
break;
getmsg(b, i++, &m);
setmsg(n, j++, &m);
}
- b->nmsg = j;
+ b->nbuf = j;
}
p->n = n;
return 0;
@@ -462,8 +487,20 @@
j = 0;
copied = 0;
- halfsz = (2*b->nent + b->valsz)/2;
- for(i = 0; copied < halfsz; i++){
+ halfsz = (2*b->nval + b->valsz)/2;
+ assert(b->nval >= 4);
+ for(i = 0; i < b->nval; i++){
+ /*
+ * We're trying to balance size,
+ * but we need at least 2 nodes
+ * in each half of the split if
+ * we want a valid tree.
+ */
+ if(i == b->nval-2)
+ break;
+ if(i >= 2 && copied >= halfsz)
+ break;
+
if(i == pidx){
j = copyup(l, j, pp, &copied);
continue;
@@ -474,7 +511,7 @@
}
j = 0;
getval(b, i, mid);
- for(; i < b->nent; i++){
+ for(; i < b->nval; i++){
if(i == pidx){
j = copyup(r, j, pp, nil);
continue;
@@ -482,12 +519,12 @@
getval(b, i, &t);
setval(r, j++, &t, 0);
}
- if(b->type == Pivot){
+ if(b->type == Tpivot){
i = 0;
- for(j = 0; i < b->nmsg; i++, j++){
+ for(j = 0; i < b->nbuf; i++, j++){
if(i == lo)
i = hi;
- if(i == b->nmsg)
+ if(i == b->nbuf)
break;
getmsg(b, i, &m);
if(keycmp(&m, mid) >= 0)
@@ -494,10 +531,10 @@
break;
setmsg(l, j, &m);
}
- for(j = 0; i < b->nmsg; i++, j++){
+ for(j = 0; i < b->nbuf; i++, j++){
if(i == lo)
i = hi;
- if(i == b->nmsg)
+ if(i == b->nbuf)
break;
getmsg(b, i, &m);
setmsg(r, j, &m);
@@ -512,23 +549,55 @@
int
apply(Blk *b, Msg *m)
{
- assert(b->type == Leaf);
+ int idx, same;
+ vlong v;
+ char *p;
+ Kvp kv;
+
+ assert(b->type == Tleaf);
assert(b->flag & Bdirty);
- switch(m->op){
- case Ocreate:
+ switch(m->op&0xf){
+ case Oinsert:
blkinsert(b, m);
break;
case Odelete:
blkdelete(b, m);
break;
- case Owrite:
- werrstr("unimplemented");
- goto error;
+ case Owstat:
+ p = m->v;
+ idx = blksearch(b, m, &kv, &same);
+ if(idx == -1 || !same)
+ abort();
+ /* bump version */
+ v = GBIT32(kv.v+8);
+ PBIT32(kv.v+8, v+1);
+ if(m->op & Owmtime){
+ v = GBIT64(p);
+ p += 8;
+ PBIT32(kv.v+25, v);
+ }
+ if(m->op & Owsize){
+ fprint(2, "wstat: incrementing size");
+ v = GBIT64(p);
+ p += 8;
+ PBIT64(kv.v+33, v);
+ }
+ if(m->op & Owmode){
+ v = GBIT32(p);
+ p += 4;
+ PBIT32(kv.v+33, v);
+ }
+ if(m->op & Owname){
+ fprint(2, "renames not yet supported");
+ abort();
+ }
+ if(p != m->v + m->nv)
+ fprint(2, "malformed wstat message");
+ break;
+ default:
+ abort();
}
return 0;
-error:
- werrstr("invalid upsert");
- return -1;
}
int
@@ -542,20 +611,21 @@
if((d = newblk(a->type)) == nil)
return -1;
o = 0;
- for(i = 0; i < a->nent; i++){
+ for(i = 0; i < a->nval; i++){
getval(a, i, &m);
setval(d, o++, &m, 0);
}
- for(i = 0; i < b->nent; i++){
+ for(i = 0; i < b->nval; i++){
getval(b, i, &m);
setval(d, o++, &m, 0);
}
- if(a->type == Pivot){
- for(i = 0; i < a->nmsg; i++){
+ if(a->type == Tpivot){
+ o = 0;
+ for(i = 0; i < a->nbuf; i++){
getmsg(a, i, &m);
setmsg(d, o++, &m);
}
- for(i = 0; i < b->nmsg; i++){
+ for(i = 0; i < b->nbuf; i++){
getmsg(b, i, &m);
setmsg(d, o++, &m);
}
@@ -562,42 +632,61 @@
}
p->merge = 1;
p->midx = idx;
- p->ml = d;
- p->mr = nil;
+ p->nl = d;
+ p->nr = nil;
return 0;
}
/*
- * Returns whether the keys in b between
- * idx and m would spill out of the buffer
- * of d.
+ * Scan a single block for the split offset;
+ * returns 1 if we'd spill out of the buffer,
+ * updates *idx and returns 0 otherwise.
*/
int
-spillsbuf(Blk *d, Blk *b, Msg *m, int *idx)
+spillscan(Blk *d, Blk *b, Msg *m, int *idx, int o)
{
int i, used;
Msg n;
- if(b->type == Leaf)
- return 0;
- used = 2*d->nmsg + d->bufsz;
- for(i = *idx; i < b->nmsg; i++){
+ used = 2*d->nbuf + d->bufsz;
+ for(i = 0; i < b->nbuf; i++){
getmsg(b, i, &n);
- if(keycmp(&n, m) > 0)
- break;
+ print("check %P before %P: %d\n", &n.Kvp, &m->Kvp, keycmp(&n, m));
+ if(keycmp(m, &n) <= 0){
+ *idx = i + o;
+ return 0;
+ }
+ print("would copy %P before %P: %d\n", &n.Kvp, &m->Kvp, keycmp(&n, m));
used += 2 + msgsz(&n);
- if(used >= Bufspc)
+ if(used > Bufspc)
return 1;
}
- print("does not spill");
- *idx = i;
+ *idx = b->nbuf;
return 0;
}
+/*
+ * Returns whether the keys in b between
+ * idx and m would spill out of the buffer
+ * of d.
+ */
int
-rotate(Path *p, int idx, Blk *a, Blk *b, int halfbuf)
+spillsbuf(Blk *d, Blk *l, Blk *r, Msg *m, int *idx)
{
- int i, o, copied, split, ovfidx;
+ if(l->type == Tleaf)
+ return 0;
+
+ if(*idx < l->nbuf && spillscan(d, l, m, idx, 0))
+ return 1;
+ if(*idx >= l->nbuf && spillscan(d, r, m, idx, l->nbuf))
+ return 1;
+ return 0;
+}
+
+int
+rotate(Path *p, int midx, Blk *a, Blk *b, int halfpiv)
+{
+ int i, o, cp, sp, idx;
Blk *d, *l, *r;
Msg m;
@@ -610,34 +699,35 @@
}
o = 0;
d = l;
- split = -1;
- ovfidx = 0;
- copied = 0;
- for(i = 0; i < a->nent; i++){
+ cp = 0;
+ sp = -1;
+ idx = 0;
+ for(i = 0; i < a->nval; i++){
getval(a, i, &m);
- if(d == l && (copied >= halfbuf || spillsbuf(d, a, &m, &ovfidx))){
- split = i;
- o = 0;
+
+ if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+ sp = idx;
d = r;
+ o = 0;
}
setval(d, o++, &m, 0);
- copied += valsz(&m);
+ cp += valsz(&m);
}
- for(i = 0; i < b->nent; i++){
- if(d == l && (copied >= halfbuf || spillsbuf(d, b, &m, &ovfidx))){
- split = i;
- o = 0;
+ for(i = 0; i < b->nval; i++){
+ getval(b, i, &m);
+ if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+ sp = idx;
d = r;
+ o = 0;
}
- getval(b, i, &m);
setval(d, o++, &m, 0);
- copied += valsz(&m);
+ cp += valsz(&m);
}
- if(a->type == Pivot){
+ if(a->type == Tpivot){
d = l;
o = 0;
- for(i = 0; i < a->nmsg; i++){
- if(i == split){
+ for(i = 0; i < a->nbuf; i++){
+ if(o == sp){
d = r;
o = 0;
}
@@ -644,8 +734,8 @@
getmsg(a, i, &m);
setmsg(d, o++, &m);
}
- for(i = 0; i < b->nmsg; i++){
- if(i == split){
+ for(i = 0; i < b->nbuf; i++){
+ if(o == sp){
d = r;
o = 0;
}
@@ -654,9 +744,9 @@
}
}
p->merge = 1;
- p->midx = idx;
- p->ml = l;
- p->mr = r;
+ p->midx = midx;
+ p->nl = l;
+ p->nr = r;
return 0;
}
@@ -667,14 +757,14 @@
assert(a->type == b->type);
- na = 2*a->nent + a->valsz;
- nb = 2*b->nent + b->valsz;
- if(a->type == Leaf){
+ na = 2*a->nval + a->valsz;
+ nb = 2*b->nval + b->valsz;
+ if(a->type == Tleaf){
ma = 0;
mb = 0;
}else{
- ma = 2*a->nmsg + a->bufsz;
- mb = 2*b->nmsg + b->bufsz;
+ ma = 2*a->nbuf + a->bufsz;
+ mb = 2*b->nbuf + b->bufsz;
}
imbalance = na - nb;
if(imbalance < 0)
@@ -682,7 +772,7 @@
/* works for leaf, because 0 always < Bufspc */
if(na + nb < Pivspc && ma + mb < Bufspc)
return merge(p, idx, a, b);
- else if(imbalance < 4*Msgmax)
+ else if(imbalance > 4*Msgmax)
return rotate(p, idx, a, b, (na + nb)/2);
else
return 0;
@@ -709,7 +799,7 @@
}
/* Try merging left */
getval(p->b, idx, &km);
- if(idx > 0){
+ if(idx-1 >= 0){
getval(p->b, idx-1, &kl);
if(kl.fill + km.fill >= Blkspc)
goto next;
@@ -720,7 +810,7 @@
goto done;
}
next:
- if(idx < p->b->nent){
+ if(idx+1 < p->b->nval){
getval(p->b, idx+1, &kr);
if(kr.fill + km.fill >= Blkspc)
goto done;
@@ -741,17 +831,15 @@
}
static int
-flush(Path *path, int npath, Msg *ins, int *redo)
+flush(Path *path, int npath)
{
- static int nins;
Path *p, *pp;
Blk *b;
Kvp mid;
Msg m;
- int i, ret;
+ int i;
- ret = -1;
/*
* The path must contain at minimum two elements:
* we must have 1 node we're inserting into, and
@@ -759,10 +847,9 @@
* we put the new root into if the root gets split.
*/
assert(npath >= 2);
- *redo = 0;
pp = nil;
p = &path[npath - 1];
- if(p->b->type == Leaf){
+ if(p->b->type == Tleaf){
if(!filledleaf(p->b, p[-1].sz)){
if(update(p, pp) == -1)
return -1;
@@ -772,6 +859,7 @@
break;
apply(p->n, &m);
}
+ finalize(p->n);
}else{
if(split(p->b, p, pp, &mid) == -1)
goto error;
@@ -785,6 +873,8 @@
if(apply(b, &m) == -1)
goto error;
}
+ finalize(p->l);
+ finalize(p->r);
}
assert(p->n || (p->l && p->r));
p->midx = -1;
@@ -793,11 +883,17 @@
}
for(; p > path; p--){
if(!filledpiv(p->b, 1)){
+#ifdef NOPE
if(trymerge(p, pp, p->idx) == -1)
goto error;
+#endif
/* If we merged the root node, break out. */
- if(p[-1].b == nil && p[0].merge && p[0].mr == nil && p[0].b->nent == 2)
+ if(p[-1].b == nil && p[0].merge && p[0].nr == nil && p[0].b->nval == 2){
+ /* FIXME: shouldn't p[1].n already be right? */
+ p[1].n = p[0].nl;
+ p[0].n = nil;
break;
+ }
if(update(p, pp) == -1)
goto error;
for(i = p[-1].lo; i < p[-1].hi; i++){
@@ -806,7 +902,9 @@
break;
bufinsert(p->n, &m);
}
+ finalize(p->n);
}else{
+ dprint("-- split\n");
if(split(p->b, p, pp, &mid) == -1)
goto error;
b = p->l;
@@ -818,24 +916,28 @@
continue;
bufinsert(b, &m);
}
+ finalize(p->l);
+ Blk *x = getblk(p->l->off, -1);
+ assert(p->l == x);
+ finalize(p->r);
}
pp = p;
}
if(path[1].split){
- if((path[0].n = newblk(Pivot)) == nil)
+ if((path[0].n = newblk(Tpivot)) == nil)
goto error;
copyup(path[0].n, 0, &path[1], nil);
- bufinsert(path[0].n, ins);
- }else{
- if(path[1].b->type == Leaf && !filledleaf(path[1].b, msgsz(ins)))
- apply(path[1].n, ins);
- else if(!filledbuf(path[1].n, msgsz(ins)))
- bufinsert(path[1].n, ins);
- else
- *redo = 1;
}
- ret = 0;
+ return 0;
error:
+ return -1;
+}
+
+void
+freepath(Path *path, int npath)
+{
+ Path *p;
+
for(p = path; p != path + npath; p++){
if(p->b != nil)
putblk(p->b);
@@ -846,7 +948,6 @@
if(p->r != nil)
putblk(p->r);
}
- return ret;
}
/*
@@ -869,13 +970,13 @@
* go to the first node. Stop *after* the last entry,
* because entries >= the last entry all go into it.
*/
- for(i = 1; i <= b->nent; i++){
- if(i < b->nent)
+ for(i = 1; i <= b->nval; i++){
+ if(i < b->nval)
getval(b, i, &kv);
cursz = 0;
- for(; j < b->nmsg; j++){
+ for(; j < b->nbuf; j++){
getmsg(b, j, &m);
- if(i < b->nent && keycmp(&m, &kv) >= 0)
+ if(i < b->nval && keycmp(&m, &kv) >= 0)
break;
/* 2 bytes for offset, plus message size in buffer */
cursz += 2 + msgsz(&m);
@@ -892,36 +993,59 @@
}
int
-fsupsert(Msg *m)
+btupsert(Msg *msg, int nmsg)
{
- Blk *b, *n, *s;
- int npath, redo;
+ int i, npath, redo, dh, nm, height;
+ vlong rh;
Path *path;
+ Blk *b, *rb;
Kvp sep;
- if(m->nk + 2 > Keymax){
- werrstr("overlong key");
+ nm = 0;
+ for(i = 0; i < nmsg; i++){
+ if(msg[i].nk + 2 > Keymax){
+ werrstr("overlong key");
+ return -1;
+ }
+ nm += msgsz(&msg[i]);
+ }
+
+again:
+ if((b = getroot(&height)) == nil){
+ werrstr("get root: %r");
return -1;
}
+
/*
+ * Fast path: the root of the tree has room,
+ * and nobody else has observed the node, so
+ * we can modify it with gleeful abandon.
+ */
+ if(b->type == Tpivot && canwlock(b)) {
+ if((b->flag & Bdirty) && !filledbuf(b, nm)){
+ for(i = 0; i < nmsg; i++)
+ bufinsert(b, &msg[i]);
+ wunlock(b);
+ return 0;
+ }
+ }
+
+ /*
* The tree can grow in height by 1 when we
* split, so we allocate room for one extra
* node in the path.
*/
-again:
- n = nil;
- s = nil;
redo = 0;
npath = 0;
- path = emalloc((fs->height + 2)*sizeof(Path));
+ if((path = calloc((height + 2), sizeof(Path))) == nil)
+ return -1;
path[npath].b = nil;
path[npath].idx = -1;
path[npath].midx = -1;
npath++;
- b = fs->root;
- path[0].sz = msgsz(m);
- while(b->type == Pivot){
+ path[0].sz = nm;
+ while(b->type == Tpivot){
if(!filledbuf(b, path[npath - 1].sz))
break;
victim(b, &path[npath]);
@@ -936,75 +1060,352 @@
path[npath].lo = 0;
path[npath].hi = 0;
npath++;
- if(flush(path, npath, m, &redo) == -1)
+
+ dh = -1;
+ rb = nil;
+// showfs("flushing");
+ if(flush(path, npath) == -1)
goto error;
+
if(path[0].n != nil){
- fs->height++;
- fs->root = path[0].n;
+ dh = 1;
+ rb = path[0].n;
}else if(path[1].n != nil){
- fs->root = path[1].n;
+ dh = 0;
+ rb = path[1].n;
}else if(npath >2 && path[2].n != nil){
- fs->height--;
- fs->root = path[2].n;
+ dh = -1;
+ rb = path[2].n;
}else
abort();
+
+ if(rb->type == Tleaf && !filledleaf(rb, nm))
+ for(i = 0; i < nmsg; i++)
+ apply(rb, &msg[i]);
+ else if(rb->type == Tpivot && !filledbuf(rb, nm))
+ for(i = 0; i < nmsg; i++)
+ bufinsert(rb, &msg[i]);
+ else
+ redo = 1;
+ finalize(rb);
+
+ assert(rb->off != 0);
+ rh = blkhash(rb);
+ lock(&fs->rootlk);
+ fs->height += dh;
+ fs->rootb = rb->off;
+ fs->rooth = rh;
+ unlock(&fs->rootlk);
+
+ freepath(path, npath);
free(path);
+ if(!checkfs()){
+ showfs("broken");
+ abort();
+ }
if(redo)
goto again;
+// showfs("fs");
+ snapshot();
return 0;
error:
- if(n != nil) freeblk(n);
- if(s != nil) freeblk(s);
+ freepath(path, npath);
free(path);
return -1;
}
+Blk*
+getroot(int *h)
+{
+ vlong bp, bh;
+
+ lock(&fs->rootlk);
+ bp = fs->rootb;
+ bh = fs->rooth;
+ if(h != nil)
+ *h = fs->height;
+ unlock(&fs->rootlk);
+ return getblk(bp, bh);
+}
+
static char*
collect(Blk *b, Key *k, Kvp *r, int *done)
{
- int i, idx;
+ int i, idx, same;
+ char *err;
Msg m;
*done = 0;
- if(bufsearch(b, k, &m, &idx) != 0)
+dprint("collecting %K\n", k);
+//showblk(b, "collecting from", 0);
+ idx = bufsearch(b, k, &m, &same);
+dprint("idx=%d, same=%d\n", idx, same);
+ if(!same)
return nil;
- for(i = idx; i < b->nmsg; i++){
+ err = Eexist;
+ for(i = idx; i < b->nbuf; i++){
getmsg(b, i, &m);
+dprint("msg=%M\n", &m);
+dprint("cmp=%d\n", keycmp(&m, k));
if(keycmp(&m, k) != 0)
break;
switch(m.op){
- case Ocreate:
+ case Oinsert:
*r = m.Kvp;
*done = 1;
- return nil;
+ err = nil;
+ break;
case Odelete:
*done = 1;
- return Eexist;
- /* The others don't affect walks */
+ err = Eexist;
+ break;
+ default:
+ return Efs;
}
}
- return nil;
+ return err;
}
char*
-fswalk1(Key *k, Kvp *r)
+btlookupat(Blk *b0, Key *k, Kvp *r, Blk **bp)
{
int idx, same, done;
char *ret;
Blk *b;
- b = fs->root;
- while(b->type == Pivot){
+ *bp = nil;
+ b = pinblk(b0);
+ assert(k != r);
+ while(b->type == Tpivot){
ret = collect(b, k, r, &done);
if(done)
return ret;
- if(blksearch(b, k, r, &idx, nil) == -1 || idx == -1)
+ idx = blksearch(b, k, r, nil);
+ if(idx == -1)
return Eexist;
+ putblk(b);
if((b = getblk(r->bp, r->bh)) == nil)
return Efs;
}
- assert(b->type == Leaf);
- if(blksearch(b, k, r, nil, &same) == -1 || !same)
+ assert(b->type == Tleaf);
+ blksearch(b, k, r, &same);
+ if(!same){
+ putblk(b);
return Eexist;
+ }
+ *bp = b;
return nil;
+}
+
+char*
+btlookup(Key *k, Kvp *r, Blk **bp)
+{
+ char *ret;
+ Blk *b;
+
+ *bp = nil;
+ if((b = getroot(nil)) == nil)
+ return Efs;
+ ret = btlookupat(b, k, r, bp);
+ putblk(b);
+
+ return ret;
+}
+
+char*
+btscan(Scan *s, Key *pfx, vlong rootb, vlong rooth, int height)
+{
+ int i, same;
+ Scanp *p;
+ Msg m;
+ Kvp v;
+ Blk *b;
+
+ s->done = 0;
+ s->offset = 0;
+ s->height = height;
+ cpkey(&s->pfx, pfx, s->pfxbuf, sizeof(s->pfxbuf));
+
+ s->kv.v = s->kvbuf+pfx->nk;
+ s->kv.nv = 0;
+ cpkey(&s->kv, pfx, s->kvbuf, sizeof(s->kvbuf));
+
+ if(rootb != -1){
+ s->rootb = rootb;
+ s->rooth = rooth;
+ s->height = height;
+ }else{
+ lock(&fs->rootlk);
+ s->rootb = fs->rootb;
+ s->rooth = fs->rooth;
+ s->height = fs->height;
+dprint("height %d\n", s->height);
+ unlock(&fs->rootlk);
+ }
+ if((s->path = calloc(s->height, sizeof(Scanp))) == nil){
+ free(s);
+ return nil;
+ }
+
+ p = s->path;
+ if((b = getblk(s->rootb, s->rooth)) == nil)
+ return "error reading block";
+ p[0].b = b;
+ for(i = 0; i < s->height; i++){
+ p[i].vi = blksearch(b, &s->kv, &v, &same);
+ if(p[i].vi == -1 || (!same && b->type == Tleaf))
+ getval(b, ++p[i].vi, &v);
+ if(b->type == Tpivot){
+ p[i].bi = bufsearch(b, &s->kv, &m, &same);
+ if(p[i].bi == -1 || !same)
+ p[i].bi++;
+ if((b = getblk(v.bp, v.bh)) == nil)
+ return "error readivg block";
+ p[i+1].b = b;
+ }else{
+ assert(i == s->height-1);
+ }
+ }
+dprint("inited\n");
+for(i = 0; i < s->height; i++){
+dprint("\t%p", p[i].b);
+dprint(" (%d %d)\n", p[i].vi, p[i].bi);
+}
+ return nil;
+}
+
+char *
+btnext(Scan *s, Kvp *r, int *done)
+{
+ Scanp *p;
+ int i, j, h, start;
+ Msg m, n, t;
+ Kvp kv;
+
+ p = s->path;
+ h = s->height;
+ *done = 0;
+ start = h;
+ for(i = h-1; i > 0; i--){
+dprint("advancing (i=%d)\n", i);
+for(j = 0; j < h; j++){
+dprint("\t%p", p[j].b);
+dprint(" (%d %d)\n", p[j].vi, p[j].bi);
+}
+ if(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf)
+ break;
+ if(i == 0){
+ *done = 1;
+ return nil;
+ }
+ start = i;
+ p[i-1].vi++;
+ p[i].vi = 0;
+ p[i].bi = 0;
+ }
+ for(i = start; i < h; i++){
+ getval(p[i-1].b, p[i-1].vi, &kv);
+ if((p[i].b = getblk(kv.bp, kv.bh)) == nil)
+ return "error reading block";
+ }
+ getval(p[h-1].b, p[h-1].vi, &m);
+ for(i = h-2; i >= 0; i--){
+ if(p[i].bi == p[i].b->nbuf)
+ continue;
+ getmsg(p[i].b, p[i].bi, &n);
+ if(keycmp(&m, &n) >= 0)
+ m = n;
+ }
+ if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
+ *done = 1;
+ return nil;
+ }
+ cpkvp(r, &m, s->kvbuf, sizeof(s->kvbuf));
+ getval(p[h-1].b, p[h-1].vi, &t);
+ if(keycmp(&m, &t) == 0)
+ p[h-1].vi++;
+ for(i = h-2; i >= 0; i--){
+ for(j = p[i].bi; j < p[i].b->nbuf; j++){
+ getmsg(p[i].b, j, &t);
+ if(keycmp(&m, &t) != 0)
+ break;
+ p[i].bi++;
+ }
+ }
+ return nil;
+}
+
+void
+btdone(Scan *s)
+{
+ int i;
+
+ for(i = 0; i < s->height; i++)
+ if(s->path[i].b != nil)
+ putblk(s->path[i].b);
+ free(s->path);
+}
+
+int
+snapshot(void)
+{
+ Blk *s;
+ int r;
+
+ s = fs->super;
+ qlock(&fs->snaplk);
+ lock(&fs->rootlk);
+ fillsuper(s);
+ finalize(s);
+ unlock(&fs->rootlk);
+print("snap superblock @%llx\n", s->off);
+showfree("snap");
+ r = pwrite(fs->fd, s->buf, Blksz, s->off);
+ qunlock(&fs->snaplk);
+ return r;
+}
+
+int
+getrefpg(vlong off, Blk **p)
+{
+ char kbuf[9], *e;
+ vlong bp, bh;
+ Blk *b;
+ Kvp kv;
+
+ *p = nil;
+ kbuf[0] = Kref;
+ PBIT64(kbuf+1, off & ~(Blksz-1));
+ kv.k = kbuf;
+ kv.nk = sizeof(kbuf);
+ e = btlookup(&kv, &kv, &b);
+ if(e == Eexist)
+ return 0;
+ if(e != nil || kv.nv != 16)
+ return -1;
+ bp = GBIT64(kv.v+0);
+ bh = GBIT64(kv.v+8);
+ putblk(b);
+ if((*p = getblk(bp, bh)) == nil)
+ return -1;
+ return 0;
+}
+
+int
+setrefpg(vlong pg, vlong bp, vlong bh)
+{
+ char kbuf[9], vbuf[16];
+ Msg m;
+
+ kbuf[0] = Kref;
+ PBIT64(kbuf+1, pg & ~(Blksz-1));
+ PBIT64(vbuf+0, bp);
+ PBIT64(vbuf+8, bh);
+
+ m.op = Oinsert;
+ m.k = kbuf;
+ m.nk = sizeof(kbuf);
+ m.v = vbuf;
+ m.nv = sizeof(vbuf);
+ return btupsert(&m, 1);
}
--- a/util.c
+++ b/util.c
@@ -1,5 +1,8 @@
#include <u.h>
#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
#include "dat.h"
#include "fns.h"