ref: 9cf49c3aa370070c2c386ce5c91299579ea5a962
author: Noam Preil <noam@pixelhero.dev>
date: Wed Nov 29 04:23:49 EST 2023
neoventi: initial commit. Working read-only!
--- /dev/null
+++ b/.gitignore
@@ -1,0 +1,4 @@
+*.o
+venti.conf
+neoventi
+bin/
--- /dev/null
+++ b/Makefile
@@ -1,0 +1,41 @@
+.PHONY:all
+
+CC=cproc
+CFLAGS=-std=c17 -pedantic -Wall -O1 -g -D _POSIX_C_SOURCE=200809 -Werror -Wextra -I. -Icore
+LDFLAGS=-lplatform
+
+all:bin/neoventi bin/vpart
+
+bin/vpart: bin/partitioner/main.o bin/partitioner/linux.o
+ @mkdir -p bin
+ @echo LD $@
+ @$(CC) -o $@ $^ -lrt $(CFLAGS) $(LDFLAGS)
+
+bin/neoventi: bin/venti/main.o bin/venti/config.o bin/venti/utils.o
+ @mkdir -p bin
+ @echo LD $@
+ @$(CC) -o $@ $^ -lrt $(CFLAGS) $(LDFLAGS)
+
+bin/ical: bin/rfc5545.o
+ @mkdir -p bin
+ @echo LD $@
+ @$(CC) -o $@ $^ -lrt $(CFLAGS) $(LDFLAGS)
+
+bin/partitioner/%.o: partitioner/%.c
+ @mkdir -p bin/partitioner/
+ @echo CC $<
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) -c $< -o $@
+
+bin/venti/%.o: venti/%.c venti/dat.h venti/fns.h
+ @mkdir -p bin/venti/
+ @echo CC $<
+ @$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -c $< -o $@
+
+bin/%.o: aux/%.c
+ @mkdir -p bin
+ @echo CC $<
+ @$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -c $< -o $@
+
+clean:
+ $(RM) -r bin
+
--- /dev/null
+++ b/main.c
@@ -1,0 +1,670 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+
+#include "whack.h"
+
+#define U8GET(p) ((p)[0])
+#define U16GET(p) (((p)[0]<<8)|(p)[1])
+#define U32GET(p) ((u32int)(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3]))
+#define U64GET(p) (((u64int)U32GET(p)<<32)|(u64int)U32GET((p)+4))
+#define MACHINE "kaladin"
+
+typedef enum {
+ VtRerror = 1,
+ VtTping = 2,
+ VtRping,
+ VtThello = 4,
+ VtRhello,
+ VtTgoodbye = 6,
+ VtRgoodbye, /* not used */
+ VtTauth0 = 8,
+ VtRauth0,
+ VtTauth1 = 10,
+ VtRauth1,
+ VtTread = 12,
+ VtRread,
+ VtTwrite = 14,
+ VtRwrite,
+ VtTsync = 16,
+ VtRsync,
+ VtTmax
+} VtTag;
+
+enum {
+ /* blank space at beginning of partition - useful for config
+ * or for when you accidentally {mount /dev/.../arenas /mnt/venti} */
+ PartBlank = 256 * 1024,
+ NameSize = 64,
+ /* FIXME make arena part head a full block? breaking change for new arena partitions, removes
+ * need for special casing and magic around it. Just read the damn block. */
+ HeadSize = 512,
+ ArenaPartMagic = 0xa9e4a5e7U,
+ ISectMagic = 0xd15c5ec7U,
+ IBucketSize = 6,
+ IEntrySize = 38,
+ MaxAMap = 31*1024,
+ ClumpInfoSize = 25,
+ ABlockLog = 9, /* All reads are of 512 byte sectors??? Yikes. We should probably use a larger size, FIXME. */
+};
+
+typedef struct {
+ int fd;
+} VtConn;
+
+typedef struct {
+ char name[NameSize];
+ u32int clumpmagic;
+ u32int clumpmax;
+ u32int blocksize;
+ u32int version;
+ u32int ctime, wtime;
+
+ /* disk info */
+ u64int size;
+ u64int base;
+ int fd;
+ struct {
+ u32int clumps;
+ } memstats;
+} VtArena;
+
+typedef struct {
+ int blocklog; /* log2(blocksize) */
+ int buckmax; /* max. entries in a index bucket */
+ u32int tabbase; /* base address of index config table on disk */
+ u32int tabsize; /* max. bytes in index config */
+
+ u32int version;
+ u32int bucketmagic;
+ char name[NameSize]; /* text label */
+ char index[NameSize]; /* index owning the section */
+ u32int blocksize; /* size of hash buckets in index */
+ u32int blockbase; /* address of start of on disk index table */
+ u32int blocks; /* total blocks on disk; some may be unused */
+ u32int start; /* first bucket in this section */
+ u32int stop; /* limit of buckets in this section */
+ int fd;
+} VtISect;
+
+typedef struct {
+ char name[NameSize];
+ u64int start, stop;
+} MapEntry;
+
+struct {
+ u32int blocksize;
+ u32int buckets;
+ VtISect *sects;
+ int nsects;
+ u32int div;
+ u32int namap;
+ MapEntry *amap;
+} index;
+
+VtArena *arenas = nil;
+u32int numarenas = 0;
+
+int
+stru64int(char *s, u64int *r)
+{
+ char *t;
+ u64int n, nn, m;
+ int c;
+
+ m = ((u64int)~(u64int)0) / 10;
+ n = 0;
+ for(t = s; ; t++){
+ c = *t;
+ if(c < '0' || c > '9')
+ break;
+ if(n > m)
+ return -1;
+ nn = n * 10 + c - '0';
+ if(nn < n)
+ return -1;
+ n = nn;
+ }
+ *r = n;
+ return s != t && *t == '\0';
+}
+
+int
+u64log2(u64int v)
+{
+ for(int i = 0; i < 64; i++)
+ if((v >> i) <= 1)
+ return i;
+ return -1;
+}
+
+int
+stru32int(char *s, u32int *r)
+{
+ u64int tmp;
+ if(stru64int(s, &tmp) < 0)
+ return -1;
+ if(tmp & 0xFFFFFFFF00000000)
+ return -1;
+ *r = (u32int)(tmp & 0xFFFFFFFF);
+ return 0;
+}
+
+static int
+Brdu32(Biobufhdr *bio, u32int *u32)
+{
+ char *line = Brdline(bio, '\n');
+ if(line == nil)
+ return 0;
+ return stru32int(line, u32) == 0;
+}
+
+void
+parseargs(int argc, char **argv)
+{
+ if(argc != 1)
+ sysfatal("TODO arg parsing");
+}
+
+static int
+parsemap(Biobufhdr *b, MapEntry **map, u32int *nmap)
+{
+ u32int i;
+ char *s;
+ char *fields[4];
+ if(!Brdu32(b, nmap))
+ return 0;
+ if(*nmap > MaxAMap)
+ return 0;
+ print("reading %d map entries\n", *nmap);
+ *map = realloc(*map, *nmap * sizeof(MapEntry));
+ for(i = 0; i < *nmap; i += 1){
+ s = Brdline(b, '\n');
+ if(getfields(s, fields, 3, 0, "\t") != 3)
+ sysfatal("corrupt index map: %s", s);
+ memcpy((*map)[i].name, fields[0], NameSize);
+ (*map)[i].name[NameSize-1] = 0;
+ if(stru64int(fields[1], &(*map)[i].start) < 0)
+ sysfatal("corrupt index map: %s", fields[1]);
+ if(stru64int(fields[2], &(*map)[i].stop) < 0)
+ sysfatal("corrupt index map: %s", fields[2]);
+ // print("amap entry %d: [%llud, %llud)\n", i, (*map)[i].start, (*map)[i].stop);
+ }
+ return 1;
+}
+
+static u64int
+partlen(int fd, char *path)
+{
+ Dir *dir = dirfstat(fd);
+ u64int len;
+ if(dir == nil)
+ sysfatal("Cannot stat partition %s", path);
+ if(dir->length == 0)
+ sysfatal("can't determine size of partition %s", path);
+ len = dir->length;
+ free(dir);
+ return len;
+}
+
+static void
+loadarena(VtArena *arena)
+{
+ u32int magic, version;
+ char *buf = malloc(arena->blocksize);
+ u8int *p = (void*)buf;
+ if(pread(arena->fd, buf, arena->blocksize, arena->base + arena->size) != arena->blocksize)
+ sysfatal("failed to pread");
+ magic = U32GET(p);
+ version = U32GET(p + 4);
+ if(strncmp(arena->name, buf + 8, strlen(arena->name)) != 0)
+ sysfatal("arena name mismatch: %s vs %s, ver %d", arena->name, buf + 8, version);
+
+}
+
+// FIXME see initarenapart and initarena/loadarena in venti/venti
+
+static void
+initarena(VtArena *arena, int fd, MapEntry entry, u32int blocksize)
+{
+ arena->fd = fd;
+ arena->blocksize = blocksize;
+ arena->clumpmax = blocksize / ClumpInfoSize;
+ arena->base = entry.start + blocksize;
+ arena->size = entry.stop - entry.start - 2*blocksize;
+ memcpy(arena->name, entry.name, NameSize);
+ loadarena(arena);
+}
+
+static void
+readarenatable(int fd, u32int tabbase, u32int tabsize, char *path, u32int blocksize)
+{
+ Biobufhdr bio;
+ char *buf;
+ MapEntry *map = nil;
+ u32int nmap;
+ buf = malloc(tabsize);
+ if(buf == nil)
+ sysfatal("oom; you're a loser: %r");
+ if(Binits(&bio, fd, OREAD, (uchar*)buf, tabsize))
+ sysfatal("failed to init biobuf: %r");
+ if(Bseek(&bio, tabbase, 0) != tabbase)
+ sysfatal("seek failed: %r");
+ parsemap(&bio, &map, &nmap);
+ print("found %d arenas\n", nmap);
+ arenas = realloc(arenas, sizeof(VtArena) * (nmap + numarenas));
+ if(!arenas)
+ sysfatal("oom");
+ for(; nmap > 0; nmap -= 1)
+ initarena(&arenas[numarenas++], fd, map[nmap-1], blocksize);
+ free(map);
+}
+
+static void
+initarenapart(char *path)
+{
+ u32int version, magic, blocksize, arenabase, tabbase, tabsize;
+ u64int size;
+ char buf[HeadSize];
+ u8int *p = (void*)buf;
+ MapEntry *map;
+ u32int nmap;
+ int fd;
+
+ if((fd = open(path, OREAD)) < 0)
+ sysfatal("failed to open arena %s: %r", path);
+ if(pread(fd, buf, HeadSize, PartBlank) != HeadSize)
+ sysfatal("failed to read arena header table: %r");
+ magic = U32GET(p);
+ version = U32GET(p + 4);
+ blocksize = U32GET(p + 8);
+ arenabase = U32GET(p + 12);
+ if(magic != ArenaPartMagic)
+ sysfatal("bad arena partition magic number: %#ux expected ArenaPartMagic (%#ux)", magic, ArenaPartMagic);
+ if(version != 3)
+ sysfatal("bad arena partition version: only 3 is supported, found %d", version);
+ if(blocksize & (blocksize - 1))
+ sysfatal("invalid block size: %d is not a power of two", blocksize);
+ /* Head is not perfectly aligned; table must be aligned as first block */
+ tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
+ if(tabbase >= arenabase)
+ sysfatal("arena partition table overlaps with storage");
+ tabsize = arenabase - tabbase;
+ size = partlen(fd, path) & ~(u64int)(blocksize - 1);
+
+ readarenatable(fd, tabbase, tabsize, path, blocksize);
+}
+
+static void
+initarenas(void)
+{
+ initarenapart("/dev/" MACHINE "/arenas");
+}
+
+static void
+initisectpart(char *path)
+{
+ u32int magic;
+ char buf[HeadSize];
+ u8int *p = (void*)buf;
+
+ index.sects = realloc(index.sects, sizeof(VtISect) * (index.nsects + 1));
+ VtISect *sect = &index.sects[index.nsects++];
+
+ if((sect->fd = open(path, OREAD)) < 0)
+ sysfatal("failed to open index section");
+ if(pread(sect->fd, buf, HeadSize, PartBlank) != HeadSize)
+ sysfatal("failed to read index section header");
+ magic = U32GET(p);
+ sect->version = U32GET(p + 4);
+ memcpy(sect->name, buf + 8, NameSize);
+ memcpy(sect->index, buf + 8 + NameSize, NameSize);
+ sect->blocksize = U32GET(p + 8 + 2*NameSize);
+ sect->blockbase = U32GET(p + 12 + 2*NameSize);
+ sect->blocks = U32GET(p + 16 + 2 * NameSize);
+ sect->start = U32GET(p + 20 + 2 * NameSize);
+ sect->stop = U32GET(p + 24 + 2 * NameSize);
+ sect->index[NameSize-1] = 0;
+ sect->name[NameSize-1] = 0;
+ sect->bucketmagic = 0;
+ if(magic != ISectMagic)
+ sysfatal("invalid / corrupt index section");
+ if(sect->version == 2)
+ sect->bucketmagic = U32GET(p + 28 + 2*NameSize);
+ else if(sect->version != 1)
+ sysfatal("unrecognized index section version %d; only 1 and 2 are supported", sect->version);
+ sect->buckmax = (sect->blocksize - IBucketSize) / IEntrySize;
+ sect->blocklog = u64log2(sect->blocksize);
+ if(sect->blocksize != (1 << sect->blocklog))
+ sysfatal("Illegal or corrupt index section");
+ sect->tabbase = (PartBlank + HeadSize + sect->blocksize - 1) & ~(sect->blocksize - 1);
+ if(sect->tabbase >= sect->blockbase)
+ sysfatal("illegal or corrupt index section: config table overlaps bucket store");
+ sect->tabsize = sect->blockbase - sect->tabbase;
+ if(sect->blockbase + (u64int)sect->blocks * sect->blocksize != partlen(sect->fd, path) & ~(u64int)(sect->blocksize - 1))
+ sysfatal("invalid or corrupt index section header: invalid blocks");
+ if(sect->stop - sect->start > sect->blocks)
+ sysfatal("invalid or corrupt index section: section overflows available space");
+ if(sect->stop < sect->start)
+ sysfatal("invalid or corrupt index section: impossible range");
+}
+
+static void
+parseindex(void)
+{
+ /* parse the index header from the first section */
+ u32int version;
+ int i;
+ Biobufhdr bio;
+ char *buf = malloc(index.sects[0].tabsize);
+ char *line;
+ if(Binits(&bio, index.sects[0].fd, OREAD, (uchar*)buf, index.sects[0].tabsize))
+ sysfatal("failed to init biobuf: %r");
+ if(Bseek(&bio, index.sects[0].tabbase, 0) != index.sects[0].tabbase)
+ sysfatal("seek failed: %r");
+ line = Brdline(&bio, '\n');
+ if(memcmp(line, "venti index configuration", 25) != 0)
+ sysfatal("invalid magic found in index header");
+ if(!Brdu32(&bio, &version) || version != 1)
+ sysfatal("failed to read version or version unsupported");
+ print("Parsing index v1...\n");
+ line = Brdline(&bio, '\n');
+ if(Blinelen(&bio) >= NameSize)
+ sysfatal("invalid or corrupt index: name too big");
+ if(memcmp(line, index.sects[0].index, strlen(index.sects[0].index)) != 0)
+ sysfatal("invalid or corrupt index: index/section mismatch");
+ if(!Brdu32(&bio, &index.blocksize))
+ sysfatal("invalid or corrupt index: failed to read blocksize");
+ /* Section map, then arena map; see parseamap */
+ /* Parse both maps, overwrite the section map; we don't need it */
+ parsemap(&bio, &index.amap, &index.namap);
+ parsemap(&bio, &index.amap, &index.namap);
+ /* Validation code here */
+ for(i = 0; i < index.nsects; i += 1){
+ /* TODO validate section */
+ index.buckets = index.sects[i].stop;
+ }
+ index.div = (((u64int)1<<32)+index.buckets-1) / index.buckets;
+ if((((u64int)1 << 32) - 1) / index.div + 1 != index.buckets)
+ sysfatal("corrupt index: divisor and buckets inconsistent");
+ /* Lastly, maparenas */
+}
+
+static void
+initindex(void)
+{
+ initisectpart("/dev/" MACHINE "/isect");
+ parseindex();
+}
+
+static void
+init(void)
+{
+ initarenas();
+ initindex();
+}
+
+static void
+validate(void)
+{
+// sysfatal("valid arenas are impossible.");
+}
+
+static void
+vtversion(VtConn conn)
+{
+ char c;
+ if(fprint(conn.fd, "venti-02-neoventi\n") == 18)
+ while(read(conn.fd, &c, 1) == 1)
+ if(c == '\n')
+ return;
+ fprint(conn.fd, "FUCK OFF\n");
+ close(conn.fd);
+ sysfatal("venti handshake failed: %r");
+}
+
+static int
+vtrecv(VtConn conn, char *buf)
+{
+ u16int len;
+ if(read(conn.fd, buf, 2) != 2){
+ werrstr("Failed to read message size: %r");
+ return 0;
+ }
+ len = (buf[0] << 8 | buf[1]);
+ if(read(conn.fd, buf + 2, len) != len){
+ werrstr("Failed to read message: %r");
+ return 0;
+ }
+ return 1;
+}
+
+VtISect
+isectforbucket(u32int buck)
+{
+ int r, l, m;
+
+ l = 1;
+ r = index.nsects - 1;
+ while(l <= r){
+ m = (r + l) >> 1;
+ if(index.sects[m].start <= buck)
+ l = m + 1;
+ else
+ r = m - 1;
+ }
+ return index.sects[l - 1];
+}
+
+static int
+bucketlookup(u8int *bucket, u16int nb, u8int *score, u16int *entry)
+{
+ for(*entry = 0; *entry <= nb; *entry += 1){
+ if(memcmp(&bucket[*entry * IEntrySize], score, 20) == 0)
+ return 1;
+ }
+ return 0;
+}
+
+static VtArena
+arenafromindex(u64int aindex)
+{
+ u64int i;
+ for(i = 0; i < numarenas; i += 1){
+ if(strcmp(arenas[i].name, index.amap[aindex].name) == 0)
+ return arenas[i];
+ }
+ sysfatal("arena not found");
+ return arenas[0];
+}
+
+static u64int
+aindexfromaddr(u64int addr)
+{
+ u64int a;
+ for(a = 0; a < index.namap; a += 1)
+ if(addr >= index.amap[a].start && addr < index.amap[a].stop)
+ return a;
+ sysfatal("internal corruption: arena not found for arenaindex");
+ return 0;
+}
+
+static int
+vtreadlookup(u8int *score, VtArena *arena, u64int *addr, u16int *size, u8int *blocks)
+{
+ u8int *buf;
+ u16int bentries;
+ u64int bucket = U32GET(score) / index.div;
+ u16int entry;
+ u64int aindex;
+ VtISect sect = isectforbucket(bucket);
+ bucket -= sect.start;
+ buf = malloc(sect.blocksize);
+ if(buf == nil)
+ sysfatal("OOM");
+ if(pread(sect.fd, (char*)buf, sect.blocksize, sect.blockbase + (bucket << sect.blocklog)) != sect.blocksize)
+ sysfatal("Failed to read bucket");
+ bentries = U16GET(buf);
+ if(sect.bucketmagic && U32GET(buf + 2) != sect.bucketmagic)
+ sysfatal("index is corrupt: invalid bucket magic: sect %lux, buck %lux", sect.bucketmagic, U32GET(buf + 2));
+ if(!bucketlookup(buf + 6, bentries, score, &entry))
+ sysfatal("entry not found in bucket");
+ *addr = U64GET((buf + 6 + (entry * IEntrySize) + 26));
+ *size = U16GET((buf + 6 + (entry * IEntrySize) + 34));
+ *blocks = buf[6 + (entry*IEntrySize) + 37];
+ aindex = aindexfromaddr(*addr);
+ *arena = arenafromindex(aindex);
+ *addr -= index.amap[aindex].start;
+ free(buf);
+ return 1;
+}
+
+static u64int
+arenadirsize(VtArena arena)
+{
+ return ((arena.memstats.clumps / (arena.blocksize / 25)) + 1) * arena.blocksize;
+}
+
+static void
+vtreadarena(VtArena arena, u64int addr, uchar *dbuf, u16int *size)
+{
+ u64int end = arena.size - arenadirsize(arena);
+ char *buf = malloc(arena.blocksize);
+ u16int off, n, m;
+ if(addr + *size > end)
+ *size = end - addr;
+ addr += arena.base;
+ off = addr & (arena.blocksize-1);
+ addr -= off;
+ n = 0;
+ while(n < *size){
+ // Read the next block
+ if(pread(arena.fd, buf, arena.blocksize, addr) != arena.blocksize)
+ sysfatal("pread failed!");
+ m = arena.blocksize - off;
+ if(m > *size - n)
+ m = *size - n;
+ memmove(&dbuf[n], &buf[off], m);
+ n += m;
+ off = 0;
+ addr += arena.blocksize;
+ }
+}
+
+static int
+readclump(uchar *dst, VtArena arena, u64int addr, u8int blocks)
+{
+ uchar *buf = malloc(blocks << ABlockLog);
+ u32int magic;
+ u16int size;
+ size = blocks<<ABlockLog;
+ vtreadarena(arena, addr, buf, &size);
+ size = U16GET(buf+7);
+ if(buf[29] == 2){
+ // Needs decompression
+ Unwhack uw;
+ unwhackinit(&uw);
+ if(unwhack(&uw, dst, size, buf+38, U16GET(buf+5)) != size)
+ sysfatal("decompression failed: %s", uw.err);
+ } else if(buf[29] == 1)
+ memcpy(dst, buf+38, size);
+ free(buf);
+ return 1;
+}
+
+static void
+vtread(VtConn conn, char *buf)
+{
+ u8int *score;
+ VtArena arena;
+ u64int addr;
+ u16int size;
+ u32int off;
+ u8int blocks;
+ uchar *dbuf;
+ score = (u8int*)buf + 4;
+ if(!vtreadlookup(score, &arena, &addr, &size, &blocks))
+ sysfatal("todo graceful read errors");
+ // Response: VtRread, msg tag, data
+ dbuf = malloc(4 + size);
+ dbuf[0] = (size+2)>>8;
+ dbuf[1] = (size+2) & 0xFF;
+ dbuf[2] = VtRread;
+ dbuf[3] = buf[3];
+ readclump(dbuf+4, arena, addr, blocks);
+ if(write(conn.fd, dbuf, size + 4) != size+4)
+ sysfatal("failed to write data");
+}
+
+static int
+vtconnhandle(VtConn conn, char *buf)
+{
+ switch(buf[2]){
+ case VtTread:
+ vtread(conn, buf);
+ return 1;
+ case VtTgoodbye:
+ return 0;
+ case VtTsync:
+ print("we don't support vtsync yet. Hanging up!\n");
+ return 0;
+ default:
+ sysfatal("TODO safely hang up vtconns");
+ }
+ return 0;
+}
+
+static void
+handle(int ctl, char *dir)
+{
+ char buf[0x10002];
+ VtConn conn;
+ conn.fd = accept(ctl, dir);
+ if(conn.fd < 0)
+ sysfatal("failed to accept connection: %r");
+ print("received a connection at %s, fd %d\n", dir, conn.fd);
+ vtversion(conn);
+ if(!vtrecv(conn, buf))
+ sysfatal("msg recv failed: %r");
+ if(buf[2] != VtThello)
+ sysfatal("received message before hello: %d", buf[2]);
+ if(buf[4] != 0 || buf[5] != 2 || buf[6] != '0' || buf[7] != '2')
+ sysfatal("unsupported protocol version requested in Thello: %d %d %d %d", buf[4], buf[5], buf[6], buf[7]);
+ buf[2] = VtRhello;
+ buf[6] = 'n';
+ buf[7] = 'o';
+ buf[1] = 8;
+ if(write(conn.fd, buf, 10) != 10)
+ sysfatal("failed to rhello: %r");
+ while(vtrecv(conn, buf)){
+ if(!vtconnhandle(conn, buf))
+ break;
+ }
+ close(conn.fd);
+}
+
+static void
+serve(void)
+{
+ char adir[NETPATHLEN], dir[NETPATHLEN];
+ int fd, ctl;
+ fd = announce("tcp!127.1!14011", adir);
+ if(fd < 0)
+ sysfatal("%r");
+ procsetname("neoventi/server");
+ for(ctl = listen(adir, dir); ctl >= 0; ctl = listen(adir, dir)){
+ handle(ctl, dir);
+ close(ctl);
+ }
+ fprint(2, "server has died\n");
+}
+
+void
+main(int argc, char **argv)
+{
+ parseargs(argc, argv);
+ print("Initializing neoventi build 1...\n");
+ init();
+ validate();
+ serve();
+}
--- /dev/null
+++ b/mkfile
@@ -1,0 +1,18 @@
+</$objtype/mkfile
+
+TARG=neoventi
+BIN=/$objtype/bin
+OFILES=main.$O unwhack.$O
+
+HFILES=\
+
+</sys/src/cmd/mkone
+
+run:VE: $O.out
+ ./$O.out & pid=$apid
+ sleep 2
+ venti/read -h tcp!127.1!14011 vac:4091f8b8aafc7b8a815f38534b70a171e4ae3e44
+ echo kill >/proc/$pid/ctl
+
+hook:
+ hook BufWritePost '.*\.c' 'mk run'
--- /dev/null
+++ b/notebook
@@ -1,0 +1,358 @@
+Let's get venti lookups working. At present, we're not finding the entry in the bucket, even though the score was taken directly from fossil, which I'm typing this into, so it definitely is there.
+
+Walk backwards through the logic in oldventi, compare the results, find the problem, document this here.
+
+lookupscore in icache.c checks the index cache, then falls back to loadientry if it's not found there. That takes in the index, score, and type, and generates an index entry; let's check that out in /sys/src/cmd/venti/srv/index.c:652...
+
+First, it checks the bloom filter; that's used to detect absence early, so we can skip it.
+
+Then, it loads the index bucket - double check that we're grabbing the right bucket, then check that we're looking up the entry in the bucket correctly...
+
+
+loadibucket goes through loadibucket1, but nothing happens in loadibucket. loadibucket1 processes the args a bit for loadibucket0, which does the real work.
+
+Given the index, the bucket number, an out pointer to which the index section is written if found, an out pointer to which the section-relative bucket number is written if found, an out pointer to which the index bucket is unpacked from the raw bytes, and the mode with which to retrieve the disk block.
+
+The absolute bucket index is the hashbits(score, 32) divided by the index divisor. Aside from that, all parameters are passed through from loadibucket exactly. The score is a u8*, and I think I used char* for neoventi. This is quite probably a bug even if it's not the one I'm looking for. Nothing else is of interest from loadientry, parameter-wise.
+
+Let's go in order, though. Double check the hash...
+
+Given u8* score and the result bits, the hash is calculated as the first four bytes of the score in little-endian order. The maximum length of the hash is 32 bits; if fewer bits are requested, the four bytes are simply shifted over.
+
+...we're using U32GET(score) to retrieve the hash, but score is char*. U32GET only casts the _result_ to u32, not the parameter. Fix that, compare the resulting bucket index...
+
+That was definitely _one_ of the issues; this moves us from bucket 4880611 to bucket 4226740. There's likely other stupid type errors in here still >_>
+
+Going to preemptively switch to using u8int* for the score everywhere. That might be enough to solve this, and even if not, it will prevent other stupid bugs in the future.
+
+Didn't fix this, but I wasn't really expecting it, so it's fine ☺
+
+Idea: maybe write a frontend to venti/srv code that dumps intermediate info for comparison? That should be relatively easy, there's other venti/srv frontends (wrarena, for instance) I can use as a template, and it would it easy to verify.
+
+Yeah, that was a good idea % ./6.tester
+found in bucket 1913977
+
+It COMPLETELY disagrees? Need to double-check unit; venti/tester is reporting section-relative, I don't think I am in neoventi.
+
+Nope, not the problem.
+
+venti/tester modified reports: found in bucket 1913977, [0, 4880645], 1684300339/880 = 1913977
+
+The index divisor is 880, the 'hash' is 1684300339.
+
+neoventi reports 'Looking in bucket 3719531835 / 880 = 4226740'.
+
+The hash is wrong??
+
+THE SCORE IS WRONG! >_>
+
+We have it in string form, we want it in raw bytes!
+
+...no, actually, we have it in raw bytes. I was embedding it in string form in the tester >_> Oops.
+
+With that fixed, % ./6.tester
+found in bucket 4226740, [0, 4880645], 3719531835/880 = 4226740
+
+We're looking in the right bucket! Yay!
+
+That means we're misreading the bucket? Not scanning it correctly?
+
+Bucket data is after the u16 entries and u32 magic; IBucketSize in venti/srv is U32Size+U16Size. unpackibucket sets the n to the u16 at the front, and the data to buf+IBucketSize. magic is checked as the u32 after the first u16.
+
+We're doing all that right.
+
+We're reading the bucket with one entry in it. Probably our bucketlookup that's wrong, then?
+
+...uh. Or... maybe not?
+
+Extended the tester to unpack and report the bucket contents:
+
+ IBucket b;
+ unpackibucket(&b, no->data, sect->bucketmagic);
+ print("bucket entries: %d\n", b.n);
+ int h = bucklook(score, -1, b.data, b.n);
+ print("bucklook: %d\n", h);
+ if(h&1){
+ h ^= 1;
+ print("Found at offset %d\n", h);
+ } else {
+ print("not found in bucket\n");
+ }
+
+Result:
+
+bucket entries: 1
+bucklook: 38
+not found in bucket
+
+venti/srv doesn't see that score, either??? What the hell?
+vacfs: vacfsopen: no block with score ddb38d3b21f95644b181052dc096ebdf2862dd83/16 exists
+
+Yeah, confirmed. Huh??? I copied that score from fossil/l... fossil/last. Are its output scores invalid? ...yep. Oops.
+
+Let's try with a new score! 4091f8b8aafc7b8a815f38534b70a171e4ae3e44 taken from fscons.
+
+Establish a baseline first, which I should ALWAYS do: vacfs is able to mount that one. okay. venti/tester does not see it, though. Are vac scores not direct venti scores?
+
+I'm fairly sure they should be, and a quick skim through vacfs code supports that. Okay, let's try a different approach: venti/tester expansion to loadientry directly, none of the manual bucket futzing:
+
+ IEntry e;
+ if(loadientry(index, score, -1, &e) < 0){
+ print("failed to load ientry\n");
+ } else {
+ print("ientry loaded\n");
+ }
+
+failed to load ientry
+
+...hmmm. Is it possible for data in the latest arena to be unindexed? Perhaps the issue is the assumption that the data should be in the index to begin with.
+
+Walking through the source; readlump=>lookupscore, which, if the entry is not in the cache, will loadientry; if loadientry fails, it passes an error up, and readlump fails.
+
+...data in the latest arena has to be loaded into the index on startup, doesn't it? is it injecting it directly into the index cache and leaving it dirty??
+
+Yep, syncindex(). Okay, let's add a syncindex call to venti/tester...
+
+That's not working, either. let's go a step higher: try readlump()?
+
+failed to read lump: no block with score %V%/4542084 exists
+
+what.
+seterr(EOk, "no block with score %V/%d exists", score, type);
+
+Ahhhhhh, %V format handler not installed, so score isn't printed, and it's printing the score as the type.
+ventifmtinstall();
+
+failed to read lump: no block with score 4091f8b8aafc7b8a815f38534b70a171e4ae3e44/-1 exists
+
+much better! Now, copy-paste that score and vacfs it again...
+
+ % vacfs -h tcp!127.1!13031 vac:4091f8b8aafc7b8a815f38534b70a171e4ae3e44
+% ls /n/vac
+/n/vac/active
+/n/vac/archive
+/n/vac/snapshot
+
+and lo! This is bullshit! Ah wait, reading the lump before syncindex, let's fix that...
+ if(syncindex(index) < 0)
+ sysfatal("sync failed: %r");
+ Packet *data = readlump(score, -1, -1, nil);
+ if(data == nil)
+ print("failed to read lump: %r\n");
+ else
+ print("successfully read lump!\n");
+ IEntry e;
+ if(loadientry(index, score, -1, &e) < 0){
+ ...
+
+
+and huh!
+
+successfully read lump!
+failed to load ientry
+
+Progress!
+
+...is it being inserted into the lump cache on startup, but not indexed?
+
+ int cached;
+ Packet *data = readlump(score, -1, -1, &cached);
+ if(data == nil)
+ print("failed to read lump: %r\n");
+ else
+ print("successfully read lump, cached: %d\n", cached);
+successfully read lump, cached: 0
+
+The plot thickens! It's NOT in the lump cache! readlump then uses lookupscore to find the index address... which checks the index cache, the loads the ientry. Let's check the icache?
+ IAddr addr;
+ if(lookupscore(score, -1, &addr) == 0)
+ print("found score in index cache at %llud, size %lud, type %d, blocks %d\n", addr.addr, addr.size, addr.type, addr.blocks);
+ else
+ print("not found in index cache\n");
+
+found score in index cache at 75373226395, size 300, type 16, blocks 1
+failed to load ientry
+
+Yeeeep, there it is. it's inserted into the index cache on startup. That's separate from syncindex, though?
+
+Nope, removing syncindex results in it not being in the index cache. Whelp, pretty sure that's a bug in venti/venti: I don't think it will allow you to read the latest data when in read-only mode, where it skips syncindex.
+
+Okay, I don't intend to have this behavior, necessarily. Let's just do a full flush.
+% hget http://127.1:13032/flushicache
+
+Note: the index flusher is disgustingly slow. Disk I/O and CPU were both at effectively zero for minutes while it ran. I didn't realize it was _working_ it's so slow O_o.
+
+venti/read -h tcp!127.1!14011 vac:4091f8b8aafc7b8a815f38534b70a171e4ae3e44
+kill 108118
+Initializing neoventi build 1...
+reading 1416 map entries
+found 1416 arenas
+Parsing index v1...
+reading 1 map entries
+reading 1416 map entries
+received a connection at /net/tcp/18, fd 7
+handling read of score 4091f8b8aafc7b8a815f38534b70a171e4ae3e44
+Looking in bucket 1083308216 / 880 = 1231032
+relative bucket: 1231032, section [0, 4880645]
+entries: 3
+arena arenas0.140, start 212543899, size 17
+WE HAVE THE DATA!!!!!
+
+Okay! We're able to read the data now. All RPC calls start with the RPC size, then message type, then tag. For Rread, the rest of the message is the data.
+
+ dbuf = malloc(4 + size);
+ dbuf[0] = (size+2)>>8;
+ dbuf[1] = (size+2) & 0xFF;
+ dbuf[2] = VtRread;
+ dbuf[3] = buf[3];
+ vtreadarena(arena, addr, dbuf+4, &size);
+ print("WE HAVE THE DATA!!!!! size: %ud\n", size);
+ // just slam the dbuf packet over the wire!
+ if(write(conn.fd, dbuf, size + 4) != size+4){
+ print("failed to write data\n");
+
+...
+
+
+venti/read: could not read block: read asked for 4091f8b8aafc7b8a815f38534b70a171e4ae3e44 got db1a96f91d93fca56f68c352a428a5fbdb71f0d5
+
+Oh, lovely. Bucket's good, but actual data read's busted. Back to venti/tester...
+ Packet *data = readlump(score, -1, -1, &cached);
+ if(data == nil)
+ print("failed to read lump: %r\n");
+ else
+ print("successfully read lump, cached: %d, size: %lud\n", cached, packetsize(data));
+successfully read lump, cached: 0, size: 300
+
+okay. back to readlump...
+
+Looks up the score as an IAddr, which is... an address into the index? It then uses the index to map that to an address inside of an arena, before actually loading the clump from that arena.
+
+Our vtreadlookup combines these steps: given a score, it looks it up in the index, finds the arena+address, and yields that info all to the caller. So, let's compare the intermediate results.
+
+First off, lookupscore() already gives us the right size. venti/tester already exposes it:
+
+ if(lookupscore(score, -1, &addr) == 0)
+ print("found score in index cache at %llud, size %lud, type %d, blocks %d\n", addr.addr, addr.size, addr.type, addr.blocks);
+
+ found score in index cache at 75374255700, size 300, type 0, blocks 1
+
+lookupscore() grabs the score from loadientry(), which is just loading the bucket and finding the entry in the bucket.
+
+Found the issue while reading the code, no test needed :P
+
+ *addr = U64GET((buf + 6 + (entry * IEntrySize) + 26));
+ *size = U16GET((buf + 6 + (entry * IEntrySize) + 28));
+
+The address is eight bytes, not two. The offset should be 34, not 28. With that fixed,
+
+ arena arenas0.140, start 212543899, size 300
+WE HAVE THE DATA!!!!! size: 300
+
+could not read block: read asked for 4091f8b8aafc7b8a815f38534b70a171e4ae3e44 got 746311957a8fafc724bc122884778f97d4a5c769
+
+That's a different wrong hash, at least? and now the size is correct! Either the address is wrong, the conversion is wrong, or the way it's being read/sent is wrong, I think.
+
+tester update:
+ u64int aa;
+ Arena *arena = amapitoa(index, addr.addr, &aa);
+ if(arena)
+ print("mapped to arena addr %llud\n", aa);
+
+mapped to arena addr 213573204
+
+On our side, we've got:
+ arena arenas0.140, start 212543899, size 300
+
+That address looks wrong?
+
+Let's expand the tester a bit:
+ Clump cl;
+ u8int readscore[20];
+ ZBlock *zb = loadclump(arena, aa, addr.blocks, &cl, readscore, 1);
+ if(zb)
+ print("read clump from disk: size %lud\n", cl.info.uncsize);
+
+read clump from disk: size 300
+
+okay. let's double check that it is the offset that's the problem by hardcoding it and seeing what happens.
+
+venti/read: could not read block: read asked for 4091f8b8aafc7b8a815f38534b70a171e4ae3e44 got f8e0d5ed942a9fe3c234569b14b18de0536cdd61
+
+We just get a different wrong hash >_>
+
+Okay, that was maybe a silly approach. let's figure out _why_ the addresses differ...
+
+First, let's double check our pre-mapping addresses.
+
+tester gives > found score in index cache at 75374255700, size 300, type 0, blocks 1
+
+75374255700.
+
+we've got > found base addr 75373226395
+
+whoops. Our index entry is wrong, then? Let's double check the bucket, and where in it we are...
+
+tester gives offset 38 for the bucket.
+
+we've got... found in bucket at offset 0
+
+...huh. Oops? One entry is 38 bytes; we're finding it as the first entry in the bucket, while venti/srv finds it as the second?
+
+Ahhhh, the type is probably wrong. The hash for both entries is the same? Looks like the same data was written with multiple types??
+
+...ohhhh, the clump on disk isn't just the raw bytes! Oops!
+
+It'll probably work without checking the type if I fix the disk read?
+
+Yeaaaaaaah, loadclump:
+• Reads blocks, not bytes
+• calls unpackclump to actually get the data out
+• uncompresses if necessary
+
+We can't ignore any of that. The clump encoding may need more than the listed size.
+
+...we can try something stupid, though. The clump on disk consists of:
+4 bytes of magic
+1 byte of type
+2 bytes for size
+2 bytes for uncompressed size
+20 bytes for the score
+1 byte for the encoding (compression info)
+4 bytes for the creator
+4 bytes for the time
+and then the data
+
+For an uncompressed block, we shooooould be able to just read 38 bytes plus the data, and skip the first 38 bytes?
+
+Didn't work; still had the wrong hash. Hopefully that means it's compressed?
+ char clump[38];
+ vtreadarena(arena, addr, clump, &size);
+ print("clump encoding: %d\n", clump[29]);
+
+Didn't even need the tester for this; the encoding is 2, which is - according to venti/srv/dat.h - ClumpECompress. It's compressed and needs unwhacking.
+
+Let's just copy unwhack in wholesale. That's ~200 lines of code, and there's no point reimplementing _that_ or, bluntly, understanding it.
+
+it fucking wooooorks!
+
+static int
+readclump(uchar *dst, VtArena arena, u64int addr, u8int blocks)
+{
+ uchar *buf = malloc(blocks << ABlockLog);
+ u32int magic;
+ u16int size;
+ size = blocks<<ABlockLog;
+ vtreadarena(arena, addr, buf, &size);
+ size = U16GET(buf+7);
+ if(buf[29] == 2){
+ // Needs decompression
+ Unwhack uw;
+ unwhackinit(&uw);
+ if(unwhack(&uw, dst, size, buf+38, U16GET(buf+5)) != size)
+ sysfatal("decompression failed: %s", uw.err);
+ } else if(buf[29] == 1)
+ memcpy(dst, buf+38, size);
+ free(buf);
+ return 1;
+}
--- /dev/null
+++ b/partitioner/design
@@ -1,0 +1,53 @@
+Needs to:
+
+* Be given a list of disks
+ * Raw disks, partitions, files
+ * Disks can be partitioned
+ * Files and partitions must be used as a full partition each
+ * Disks may be empty
+ * Can partition, or use whole disk as one region
+ * Disks may contain partitions and empty space
+ * In which case, may want to use part of the disk
+ * Replace existing partitions
+ * Use unused space
+ * If neither, unusable
+ * Or realize it's the wrong disk
+
+* Be given desired constraints
+ * How many fossils, in what size range?
+
+* Determine usable regions
+ * Size
+ * Disk
+ * Offset
+ * Redundancy
+ * Sequential performance
+ * Random-access performance
+
+* Determine what to do with each region within constraints
+v1:
+ * Have bloom filter if there's enough space
+ * Allocate five fossils, each 1% of total space
+ * Statically size index and arenas as percentages of the total remaining space
+ * Create one index section, one arena partition, bloom filter, and fossils
+ * Try to give index minimal random read latency, and arena maximum sequential read speed
+ * No interactivity
+
+v2:
+ * Give the user the ability to edit the plan before executing it
+
+v3:
+ * Talk with /dev/fs, take into account redundancy
+ * Create multiple index sections and arena partitions, as needed
+ * In order of highest to lowest priority:
+ * Try not to use arena disks for anything else
+ * Try not to use fossil disks for anything else
+ * Try not to use index disks for anything else
+
+
+* Create the partitions
+* Format the partitions
+* Generate the venti configuration file
+
+
+
--- /dev/null
+++ b/partitioner/linux.c
@@ -1,0 +1,74 @@
+#include "platform.h"
+#include "partitioner.h"
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+
+static const char *types[] = { "unknown", "bad", "disk", "partition", "file", };
+
+/* Resolves partition to disk offset */
+static void
+probepart(Region *r)
+{
+ char *startpath = aprintf("/sys/class/block/%s/start", r->uuid);
+ char *str = readfile(startpath, NULL);
+ free(startpath);
+ r->offset = atoll(str);
+ free(str);
+ r->type = disk;
+}
+
+static char*
+extract(char *file, char *key)
+{
+ char *val = strstr(file, key);
+ val += strlen(key) + 1;
+ return strndup(val, strchr(val, '\n') - val);
+}
+
+static void
+probeblock(Region *r, struct stat buf)
+{
+ char *devpath = aprintf("/sys/dev/block/%d:%d/uevent", major(buf.st_rdev), minor(buf.st_rdev));
+ char *contents = readfile(devpath, NULL);
+ char *devtype;
+ free(devpath);
+ devtype = extract(contents, "DEVTYPE");
+ r->uuid = extract(contents, "DEVNAME");
+ free(contents);
+ r->type = streql(devtype, "disk") ? disk : part;
+ if(r->type == part)
+ probepart(r);
+}
+
+void
+proberegiontype(Region *r)
+{
+ struct stat buf;
+ if(stat(r->path, &buf) != 0 || !(buf.st_mode & (S_IFMT | S_IFBLK)))
+ r->type = bad;
+ else if(buf.st_mode & S_IFREG)
+ r->type = file;
+ else
+ probeblock(r, buf);
+}
+
+void
+proberegionredundancy(Region *r)
+{
+ /* TODO: integrate with mdadm? LUKS? cryptsetup? All of the above? */
+ (void)r;
+}
+
+int
+ispartofdisk(Region *maybedisk, Region *maybepart)
+{
+ if(maybedisk->type != disk || maybepart->type != disk)
+ return 0;
+ char *partpath = aprintf("/sys/class/block/%s/%s/partition", maybedisk->uuid, maybepart->uuid);
+ char *contents = readfile(partpath, NULL);
+ free(partpath);
+ free(contents);
+ return contents != NULL;
+}
+
--- /dev/null
+++ b/partitioner/main.c
@@ -1,0 +1,360 @@
+#include "platform.h"
+#include "partitioner.h"
+#include <stdio.h>
+
+static const char *types[] = { "unknown", "bad", "disk", "partition", "file", };
+static const char *uses[] = { "undecided","none","arena","index","bloom","index and bloom", "fossil" };
+
+const char *usage = "%s [disk1] [disk2] [...diskn]";
+
+int ispartofdisk(Region *disk, Region *maybepart);
+
+static struct {
+ int usebloom;
+ size_t indexsize, arenasize;
+ size_t regioncount;
+ Region *regions;
+ size_t fossilcount;
+ size_t fossilsize;
+ int *fds;
+} state = {
+ .usebloom = 0,
+};
+
+static void
+parseargs(int argc, char **argv)
+{
+ size_t offset = 1;
+ state.fossilcount = 0;
+ state.fossilsize = 10*(size_t)1024*1024*1024;
+ state.regioncount = argc - 1;
+ state.regions = calloc(sizeof(Region), state.regioncount);
+ for(int i = 1; i < argc; i += 1){
+ if(streql(argv[i], "-f")){
+ if(state.fossilcount == 5)
+ sysfatal("Cannot give -f multiple times");
+ state.fossilcount = 5;
+ offset = 2;
+ continue;
+ }
+ state.regions[i - offset].path = argv[i];
+ }
+ state.regioncount = argc - offset;
+}
+
+static void
+roundup(size_t *val, size_t base)
+{
+ size_t abovebase = *val % base;
+ if(abovebase == 0)
+ return;
+ /* The number is *not* a multiple of the base. Removing the modulus reduces to the
+ * prior multiple; adding the base then moves it to the next one. */
+ *val = *val - abovebase + base;
+}
+
+/* If directory containing 'ctl', disk. Otherwise, raw file [or partition, they're treated identically] */
+static void
+opendisks(void)
+{
+ printf("Probing disk presence...\n");
+ for(size_t i = 0; i < state.regioncount; i += 1){
+ state.regions[i].fd = open(state.regions[i].path, OREAD | __O_DIRECT);
+ state.regions[i].size = fdsize(state.regions[i].fd);
+ }
+}
+
+static void
+removeregion(int(*check)(size_t index), char *hint)
+{
+ size_t newcount = 0;
+ size_t i;
+ Region *newregions = calloc(sizeof(Region), state.regioncount);
+ if(newregions == NULL)
+ sysfatal("Insufficient memory");
+ for(i = 0; i < state.regioncount; i += 1)
+ if(!check(i)){
+ newregions[newcount] = state.regions[i];
+ newcount += 1;
+ } else{
+ printf("Eliding %s, which is %s\n", state.regions[i].path, hint);
+ }
+ state.regioncount = newcount;
+ free(state.regions);
+ state.regions = newregions;
+}
+
+static int
+istoosmall(size_t index)
+{
+ return state.regions[index].size < MinSize || state.regions[index].size == (size_t)-1;
+}
+
+static int
+isclosed(size_t index)
+{
+ return state.regions[index].fd < 0;
+}
+
+static int
+isunused(size_t index)
+{
+ return state.regions[index].type == bad;
+}
+
+static int
+isduplicate(size_t index)
+{
+ for(size_t i = 0; i < state.regioncount; i += 1){
+ if(i == index)
+ continue;
+ if(streql(state.regions[index].path, state.regions[i].path))
+ return 1;
+ }
+ return 0;
+}
+
+static int
+issubregion(size_t index)
+{
+ for(size_t i = 0; i < state.regioncount; i += 1)
+ if(ispartofdisk(&state.regions[i], &state.regions[index]))
+ return 1;
+ return 0;
+}
+
+static size_t
+totalsize(void)
+{
+ size_t size = 0;
+ for(size_t i = 0; i < state.regioncount; i += 1)
+ size += state.regions[i].size;
+ return size;
+}
+
+static void
+dump(size_t *indices)
+{
+ size_t j = 0;
+ printf("\tTotal size: %.2fMiB\n", (double)totalsize() / 1024 / 1024);
+ for(size_t i = 0; i < state.regioncount; i += 1){
+ if(indices == NULL || indices[j] == i){
+ printf("\t\tRegion %s", state.regions[i].path);
+ if(state.regions[i].offset > 0)
+ printf(":%luMiB", state.regions[i].offset/1024/1024);
+ printf(", uuid %s, size %.2f MiB, %uMiB/s seqread, %uus latency, %s, %d redundancy, %s, %s\n", state.regions[i].uuid, (double)state.regions[i].size/1024/1024, state.regions[i].seqspeed, state.regions[i].randlatency, types[state.regions[i].type], state.regions[i].redundancy, uses[state.regions[i].use], state.regions[i].why != NULL ? state.regions[i].why : "CAUSE UNKNOWN");
+ j += 1;
+ }
+ }
+}
+
+static size_t
+roffset(Region *r, int random, size_t alignment, size_t size)
+{
+ size_t offset;
+ if(random)
+ offset = rand() % r->size;
+ else
+ offset = lseek(r->fd, size, SEEK_CUR) - size;
+ roundup(&offset, alignment);
+ return offset;
+}
+
+static void
+proberegionspeed(Region *r, int random)
+{
+ long count = 0, ocount;
+ char *buf = aligned_alloc(512, 0x2000);
+ size_t offset;
+ double t;
+ if(buf == NULL)
+ /* Continue without determining speed */
+ return;
+ delta();
+ for(t = 0; t < 0.1; t += delta()){
+ offset = roffset(r, random, random ? 512 : 4096, random ? 512 : 4096);
+ ocount = pread(r->fd, buf, random ? 512 : 4096, offset);
+ if(ocount < 0 && count == 0){
+ fprintf(stderr, "Unable to probe %s read speed of '%s'\n", random ? "random" : "sequential", r->path);
+ break;
+ }
+ count += ocount;
+ }
+ if(random)
+ r->randlatency = (t * 1000 * 1000) / (count / 512);
+ else
+ r->seqspeed = count / (t * 1000 * 1000);
+ free(buf);
+}
+
+static void
+proberegion(Region *r)
+{
+ printf("\tProbing '%s'...\n", r->path);
+ proberegiontype(r);
+ proberegionspeed(r, 0);
+ proberegionspeed(r, 1);
+ proberegionredundancy(r);
+}
+
+static void
+proberegions(void)
+{
+ if(state.regioncount == 0)
+ sysfatal("No usable disks provided!");
+ printf("Probing disks...\n");
+ for(size_t i = 0; i < state.regioncount; i += 1)
+ proberegion(&state.regions[i]);
+}
+
+static void
+reduceregion(size_t r, size_t size)
+{
+ /* If the region happens to be exactly the size intended, don't split it */
+ if(state.regions[r].size == size)
+ return;
+ Region *newregions = calloc(sizeof(Region), state.regioncount + 1);
+ if(newregions == NULL)
+ sysfatal("Insufficient memory");
+ memcpy(newregions, state.regions, sizeof(Region) * state.regioncount);
+ free(state.regions);
+ state.regions = newregions;
+ state.regions[state.regioncount] = state.regions[r];
+ state.regions[r].size = size;
+ state.regions[state.regioncount].offset += size;
+ state.regions[state.regioncount].size -= size;
+ state.regions[state.regioncount].use = undecided;
+ state.regioncount += 1;
+}
+
+static void
+decidebloom(void)
+{
+ state.usebloom = totalsize() > ((size_t)32 * 1024 * 1024 * 1024);
+ printf("Bloom filter: %s...\n", state.usebloom ? "enabled" : "disabled");
+}
+
+static void
+decidesizes(void)
+{
+ size_t total = totalsize() - (state.usebloom ? 512 * 1024 * 1024 : 0);
+ state.fossilsize = totalsize() / 100;
+ if(state.fossilcount > 0 && state.fossilsize < 512 * 1024 * 1024)
+ sysfatal("Not enough space for state.fossilcount!");
+ total -= state.fossilcount * state.fossilsize;
+ state.indexsize = total * 15 / 105;
+ roundup(&state.indexsize, 512*1024*1024);
+ state.arenasize = total - state.indexsize;
+}
+
+/* if minsize == -1, finds the largest unused region
+ * Otherwise, picks a random unused region >= minsize */
+static size_t
+findunusedregion(size_t minsize)
+{
+ size_t largest = -1, largestsize = 0;
+ for(size_t i = 0; i < state.regioncount; i += 1){
+ if(state.regions[i].use != undecided)
+ continue;
+ if(minsize != (size_t)-1 && state.regions[i].size >= minsize)
+ return i;
+ if(minsize == (size_t)-1 && state.regions[i].size > largestsize){
+ largest = i;
+ largestsize = state.regions[i].size;
+ }
+ }
+ return largest;
+}
+
+static void
+picklargest(size_t minsize, int use)
+{
+ size_t i = findunusedregion(minsize);
+ if(i == (size_t)-1){
+ dump(NULL);
+ sysfatal("no region big enough for %s partition, needed size %ldMiB", uses[use], minsize/1024/1024);
+ }
+ reduceregion(i, minsize);
+ state.regions[i].why = "it was the first region found which was sufficiently large";
+ state.regions[i].use = use;
+}
+
+/* As picklargest, but will split among regions as needed */
+static void
+scatterpicks(size_t minsize, int use)
+{
+ size_t i = findunusedregion(minsize);
+ if(i == (size_t)-1){
+ dump(NULL);
+ sysfatal("no region big enough for %s partition, needed size %ldMiB", uses[use], minsize/1024/1024);
+ }
+ reduceregion(i, minsize);
+ state.regions[i].why = "it was the first region found which was sufficiently large";
+ state.regions[i].use = use;
+}
+
+static void
+plan(void)
+{
+ printf("Planning...\n");
+ decidebloom();
+ decidesizes();
+ scatterpicks(state.arenasize, arena);
+ scatterpicks(state.indexsize, index);
+ if(state.usebloom)
+ picklargest(512*1024*1024, bloom);
+ for(size_t f = 0; f < state.fossilcount; f += 1)
+ picklargest(state.fossilsize, fossil);
+}
+
+static void
+partition(void)
+{
+ printf("Partitioning...\n");
+ sysfatal("TODO");
+}
+
+static void
+format(void)
+{
+ printf("Formatting...\n");
+ sysfatal("TODO");
+}
+
+static void
+execute(void)
+{
+ partition();
+ format();
+}
+
+static void
+verify(void)
+{
+ char buf[3];
+ printf("Plan:\n");
+ dump(NULL);
+ printf("Proceed? ");
+ fflush(stdout);
+ if(fgets(buf, 3, stdin) == NULL || (buf[0] != 'y' && buf[0] != 'Y'))
+ sysfatal("Aborting!");
+}
+
+int
+main(int argc, char **argv)
+{
+ parseargs(argc, argv);
+ opendisks();
+ /* don't probe bad regions; however, without probing, some duplicates cannot be detected */
+ /* so, first remove bad regions; then probe; then do another check for duplicates */
+ removeregion(isclosed, "not a valid disk");
+ removeregion(isunused, "useless");
+ removeregion(isduplicate, "duplicates");
+ removeregion(istoosmall, "too small");
+ proberegions();
+ removeregion(issubregion, "a subregion of another, known region");
+ plan();
+ verify();
+ execute();
+ return 0;
+}
--- /dev/null
+++ b/partitioner/partitioner.h
@@ -1,0 +1,26 @@
+typedef struct {
+ char *path;
+ int fd;
+ size_t offset, size;
+ enum { unknown, bad, disk, part, file } type;
+ enum { undecided, none, arena, index, bloom, indexbloom, fossil } use;
+ /* Random read latency; unit: average microseconds to read one 512-byte block */
+ uint32_t randlatency;
+ /* Sequential read speed; unit: MiB/s when reading 4K blocks */
+ uint32_t seqspeed;
+ /* The number of drives that must fail to bring down this region */
+ uint16_t redundancy;
+ /* The reason the region is being used the way it is */
+ char *why;
+ /* device-specific unique identifier */
+ char *uuid;
+} Region;
+
+enum{
+ /* Minimum size for a partition to be considered for usage */
+ MinSize = 16 * 1024 * 1024,
+};
+
+void proberegiontype(Region *r);
+void proberegionredundancy(Region *r);
+
--- /dev/null
+++ b/unwhack.c
@@ -1,0 +1,181 @@
+#include <u.h>
+#include <libc.h>
+#include "whack.h"
+
+enum
+{
+ DMaxFastLen = 7,
+ DBigLenCode = 0x3c, /* minimum code for large lenth encoding */
+ DBigLenBits = 6,
+ DBigLenBase = 1 /* starting items to encode for big lens */
+};
+
+static uchar lenval[1 << (DBigLenBits - 1)] =
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4,
+ 5,
+ 6,
+ 255,
+ 255
+};
+
+static uchar lenbits[] =
+{
+ 0, 0, 0,
+ 2, 3, 5, 5,
+};
+
+static uchar offbits[16] =
+{
+ 5, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 12, 13
+};
+
+static ushort offbase[16] =
+{
+ 0, 0x20,
+ 0x40, 0x60,
+ 0x80, 0xc0,
+ 0x100, 0x180,
+ 0x200, 0x300,
+ 0x400, 0x600,
+ 0x800, 0xc00,
+ 0x1000,
+ 0x2000
+};
+
+void
+unwhackinit(Unwhack *uw)
+{
+ uw->err[0] = '\0';
+}
+
+int
+unwhack(Unwhack *uw, uchar *dst, int ndst, uchar *src, int nsrc)
+{
+ uchar *s, *d, *dmax, *smax, lit;
+ ulong uwbits, lithist;
+ int i, off, len, bits, use, code, uwnbits, overbits;
+
+ d = dst;
+ dmax = d + ndst;
+
+ smax = src + nsrc;
+ uwnbits = 0;
+ uwbits = 0;
+ overbits = 0;
+ lithist = ~0;
+ while(src < smax || uwnbits - overbits >= MinDecode){
+ while(uwnbits <= 24){
+ uwbits <<= 8;
+ if(src < smax)
+ uwbits |= *src++;
+ else
+ overbits += 8;
+ uwnbits += 8;
+ }
+
+ /*
+ * literal
+ */
+ len = lenval[(uwbits >> (uwnbits - 5)) & 0x1f];
+ if(len == 0){
+ if(lithist & 0xf){
+ uwnbits -= 9;
+ lit = (uwbits >> uwnbits) & 0xff;
+ lit &= 255;
+ }else{
+ uwnbits -= 8;
+ lit = (uwbits >> uwnbits) & 0x7f;
+ if(lit < 32){
+ if(lit < 24){
+ uwnbits -= 2;
+ lit = (lit << 2) | ((uwbits >> uwnbits) & 3);
+ }else{
+ uwnbits -= 3;
+ lit = (lit << 3) | ((uwbits >> uwnbits) & 7);
+ }
+ lit = (lit - 64) & 0xff;
+ }
+ }
+ if(d >= dmax){
+ snprint(uw->err, WhackErrLen, "too much output");
+ return -1;
+ }
+ *d++ = lit;
+ lithist = (lithist << 1) | (lit < 32) | (lit > 127);
+ continue;
+ }
+
+ /*
+ * length
+ */
+ if(len < 255)
+ uwnbits -= lenbits[len];
+ else{
+ uwnbits -= DBigLenBits;
+ code = ((uwbits >> uwnbits) & ((1 << DBigLenBits) - 1)) - DBigLenCode;
+ len = DMaxFastLen;
+ use = DBigLenBase;
+ bits = (DBigLenBits & 1) ^ 1;
+ while(code >= use){
+ len += use;
+ code -= use;
+ code <<= 1;
+ uwnbits--;
+ if(uwnbits < 0){
+ snprint(uw->err, WhackErrLen, "len out of range");
+ return -1;
+ }
+ code |= (uwbits >> uwnbits) & 1;
+ use <<= bits;
+ bits ^= 1;
+ }
+ len += code;
+
+ while(uwnbits <= 24){
+ uwbits <<= 8;
+ if(src < smax)
+ uwbits |= *src++;
+ else
+ overbits += 8;
+ uwnbits += 8;
+ }
+ }
+
+ /*
+ * offset
+ */
+ uwnbits -= 4;
+ bits = (uwbits >> uwnbits) & 0xf;
+ off = offbase[bits];
+ bits = offbits[bits];
+
+ uwnbits -= bits;
+ off |= (uwbits >> uwnbits) & ((1 << bits) - 1);
+ off++;
+
+ if(off > d - dst){
+ snprint(uw->err, WhackErrLen, "offset out of range: off=%d d=%zd len=%d nbits=%d",
+ off, d - dst, len, uwnbits);
+ return -1;
+ }
+ if(d + len > dmax){
+ snprint(uw->err, WhackErrLen, "len out of range");
+ return -1;
+ }
+ s = d - off;
+ for(i = 0; i < len; i++)
+ d[i] = s[i];
+ d += len;
+ }
+ if(uwnbits < overbits){
+ snprint(uw->err, WhackErrLen, "compressed data overrun");
+ return -1;
+ }
+
+ len = d - dst;
+
+ return len;
+}
--- /dev/null
+++ b/venti/config.c
@@ -1,0 +1,114 @@
+#include "platform.h"
+#include "dat.h"
+#include "fns.h"
+
+#include <stdio.h>
+
+static char magic[] = "venti config\n";
+static uint8_t magiclen = sizeof(magic);
+
+// Precondition: buf points to an array of >= MaxConfig+1 bytes
+void
+readconfig(char *path, char *buf)
+{
+ size_t fsize, n = 0, offset = 0;
+ int fd = open(path, OREAD);
+ if(fd < 0)
+ sysfatal("Unable to open '%s' for config file", path);
+ fsize = fdsize(fd);
+ if(fsize > PartBlank)
+ offset = PartBlank - MaxConfig;
+ else if(fsize > MaxConfig)
+ sysfatal("Config file '%s' too large (size %lu, max %lu)!", path, fsize, MaxConfig);
+ n = pread(fd, buf, MaxConfig, offset);
+ if(n == 0)
+ sysfatal("Config file '%s' empty!", path);
+ if(offset != 0){
+ if(memcmp(buf, magic, magiclen) != 0)
+ sysfatal("bad venti config magic in '%s'", path);
+ memmove(buf, buf + magiclen, MaxConfig - magiclen);
+ n -= magiclen;
+ }
+ buf[n] = 0;
+}
+
+/* Skips to the next part of the string with meaning - i.e. skips comments and whitespace */
+/* *line indicates whether a line was found */
+static char*
+skip(char *c, int *line)
+{
+ if(c == NULL)
+ return NULL;
+ if(line != NULL)
+ *line = 0;
+ while(*c == '#' || isspace(*c)){
+ if(*c == '#'){
+ while(*c != '\n' && *c != 0)
+ c += 1;
+ /* either at a newline or end-of-string */
+ }
+ while(isspace(*c)){
+ if(*c == '\n' && line != NULL)
+ *line = 1;
+ c += 1;
+ }
+ }
+ if(*c == 0)
+ return NULL;
+ return c;
+}
+
+static int
+confis(char **c, char *test, char **out)
+{
+ int lined;
+ if(!strbegins(*c, test))
+ return 0;
+ *c += strlen(test) + 1;
+ *out = untilspace(*c);
+ *c += strlen(*out);
+ if(*out == NULL)
+ sysfatal("Out of memory");
+ *c = skip(*c, &lined);
+ if(lined == 0)
+ sysfatal("config error: garbage found after '%s' line", test);
+ return 1;
+}
+
+static void
+indexname(config *conf, char *name)
+{
+ if(conf->index != NULL)
+ sysfatal("configuration error: multiple names specified for index");
+ if(!nameok(name))
+ sysfatal("Invalid name specified for index: '%s'", name);
+ conf->index = name;
+}
+
+static void
+parseconfig(config *conf, char *buf)
+{
+ char *c = buf, *tmp;
+ while((c = skip(c, NULL)) != NULL){
+ if(confis(&c, "index", &tmp))
+ indexname(conf, tmp);
+ else if(confis(&c, "isect", &tmp))
+ sysfatal("TODO isect '%s'", tmp);
+ else if(strbegins(c, "arenas "))
+ sysfatal("TODO arenas");
+ else
+ sysfatal("TODO load config from '%s' (offset %d)", c, c - buf);
+ }
+}
+
+config
+loadconfig(char *path)
+{
+ char buf[MaxConfig + 1];
+ config conf;
+ memset(&conf, 0, sizeof(conf));
+ readconfig(path, buf);
+ parseconfig(&conf, buf);
+ return conf;
+}
+
--- /dev/null
+++ b/venti/dat.h
@@ -1,0 +1,27 @@
+enum{
+ ABlockLog = 9, /* log2(512), the quantum for reading arenas */
+ ANameSize = 64,
+ MaxDiskBlock = 64*1024, /* max. allowed size for a disk block */
+ MaxIoSize = 64*1024, /* max. allowed size for a disk io operation */
+ PartBlank = 256*1024, /* untouched section at beginning of partition */
+ HeadSize = 512, /* size of a header after PartBlank */
+ MinArenaSize = 1*1024*1024, /* smallest reasonable arena size */
+ MaxConfig = 8 * 1024, /* Maximum size of the configuration file */
+ IndexBase = 1024*1024, /* initial address to use in an index */
+ MaxIo = 64*1024, /* max size of a single read or write operation */
+ ICacheBits = 16, /* default bits for indexing icache */
+ MaxAMap = 31*1024, /* max. allowed arenas in an address mapping; must be < 32*1024 */
+};
+
+typedef struct {
+ uint32_t version;
+} arenapart;
+
+typedef struct{
+ char *index;
+ /* Amount of memory to use for the cache */
+ uint64_t mem;
+ uint32_t naparts;
+ arenapart **parts;
+} config;
+
--- /dev/null
+++ b/venti/fns.h
@@ -1,0 +1,3 @@
+config loadconfig(char *path);
+int nameok(char *name);
+
--- /dev/null
+++ b/venti/main.c
@@ -1,0 +1,43 @@
+#include "platform.h"
+#include "dat.h"
+#include "fns.h"
+
+struct {
+ char *file;
+} params;
+
+static inline char*
+next(int argc, char **argv, int *i)
+{
+ if (*i + 1 == argc)
+ sysfatal("Expected argument to '%s'!", argv[*i]);
+ *i += 1;
+ return argv[*i];
+}
+
+static void
+initargs(void)
+{
+ params.file = "venti.conf";
+}
+
+static void
+parseargs(int argc, char **argv)
+{
+ for(int i = 1; i < argc; i += 1){
+ if(streql("-c", argv[i]))
+ params.file = next(argc, argv, &i);
+ else
+ sysfatal("Unrecognized argument: '%s'", argv[i]);
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ initargs();
+ parseargs(argc, argv);
+ loadconfig(params.file);
+ sysfatal("TODO: launch server");
+ return 0;
+}
--- /dev/null
+++ b/venti/utils.c
@@ -1,0 +1,15 @@
+#include "platform.h"
+#include "dat.h"
+#include "fns.h"
+
+int
+nameok(char *name)
+{
+ if(name == NULL || strlen(name) >= ANameSize)
+ return 0;
+ for(size_t i = 0; i < strlen(name); i += 1)
+ if(name[i] < 0x20 || name[i] >= 0x80)
+ return 0;
+ return 1;
+}
+
--- /dev/null
+++ b/whack.h
@@ -1,0 +1,40 @@
+typedef struct Whack Whack;
+typedef struct Unwhack Unwhack;
+
+enum
+{
+ WhackStats = 8,
+ WhackErrLen = 64, /* max length of error message from thwack or unthwack */
+ WhackMaxOff = 16*1024, /* max allowed offset */
+
+ HashLog = 14,
+ HashSize = 1<<HashLog,
+ HashMask = HashSize - 1,
+
+ MinMatch = 3, /* shortest match possible */
+
+ MinDecode = 8, /* minimum bits to decode a match or lit; >= 8 */
+
+ MaxSeqMask = 8, /* number of bits in coding block mask */
+ MaxSeqStart = 256 /* max offset of initial coding block */
+};
+
+struct Whack
+{
+ ushort begin; /* time of first byte in hash */
+ ushort hash[HashSize];
+ ushort next[WhackMaxOff];
+ uchar *data;
+};
+
+struct Unwhack
+{
+ char err[WhackErrLen];
+};
+
+void whackinit(Whack*, int level);
+void unwhackinit(Unwhack*);
+int whack(Whack*, uchar *dst, uchar *src, int nsrc, ulong stats[WhackStats]);
+int unwhack(Unwhack*, uchar *dst, int ndst, uchar *src, int nsrc);
+
+int whackblock(uchar *dst, uchar *src, int ssize);