ref: cda9ba6f76bfe31f24670280f0feea69b3804507
dir: /disk.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "neoventi.h"
/* Table of every arena loaded so far; grown by readarenatable. */
VtArena *arenas = nil;
/* Number of valid entries in arenas. Also used by loadisect as the cache
 * index assigned to an index section. */
u32int numarenas = 0;
/* In-memory description of the index, populated by initindex. */
struct {
/* block size read from the index configuration header (parseindex) */
u32int blocksize;
/* total bucket count: the stop value of the last section (indexcalc) */
u32int buckets;
/* index sections, appended one at a time by initisectpart */
VtISect *sects;
/* number of valid entries in sects */
int nsects;
/* divisor mapping the leading 32 bits of a score to a bucket (indexcalc) */
u32int div;
/* number of entries in amap */
u32int namap;
/* arena address map parsed from the index header (parseindex) */
MapEntry *amap;
} index;
/*
 * Binary-search index.sects for the section that holds bucket buck.
 * Returns the position of the last section (searching from position 1
 * upward) whose start is <= buck, or 0 if no such section exists.
 */
int
isectforbucket(u32int buck)
{
	int lo, hi, mid;

	lo = 1;
	hi = index.nsects - 1;
	while(lo <= hi){
		mid = (lo + hi) >> 1;
		if(buck < index.sects[mid].start)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return lo - 1;
}
/*
 * Search one index bucket for score.
 *
 * ibuf points at the start of the bucket block: a 16-bit entry count,
 * followed (6 bytes in) by that many entries of IEntrySize bytes, each
 * beginning with a 20-byte score.
 *
 * On success, returns 1 and leaves the matching entry's position in *entry.
 * Returns 0 if none of the entries match; *entry then equals the entry count.
 */
static int
bucketlookup(u8int *ibuf, u8int *score, u16int *entry)
{
	u16int nb = U16GET(ibuf);
	ibuf += 6;
	/* Only entries [0, nb) are valid; looking at slot nb would compare
	 * against stale bytes or run off the end of the block. */
	for(*entry = 0; *entry < nb; *entry += 1){
		if(memcmp(ibuf, score, 20) == 0)
			return 1;
		ibuf += IEntrySize;
	}
	return 0;
}
/*
 * Find the arena map entry whose [start, stop) range contains addr and
 * return its position in index.amap. Aborts via sysfatal if no entry
 * covers the address.
 */
static u64int
aindexfromaddr(u64int addr)
{
	u64int i;
	MapEntry *e;

	for(i = 0; i < index.namap; i++){
		e = &index.amap[i];
		if(addr >= e->start && addr < e->stop)
			return i;
	}
	sysfatal("internal corruption: arena not found for arenaindex");
	return 0;
}
/*
 * Look score up in the index and fill in addr with the location of its
 * clump: owning arena, offset relative to that arena's map start, size
 * and block count.
 *
 * The bucket block is fetched through the cache and read from the index
 * section only on a cache miss. Any failure (short read, bad bucket magic,
 * score not present) is fatal. Always returns 1.
 */
int
vtreadlookup(u8int *score, VtAddress *addr)
{
	VtISect *s_sect;
	u8int *blk, *ent;
	u64int buck, ai;
	u16int slot, ckey;

	/* the leading 32 bits of the score select the bucket */
	buck = U32GET(score) / index.div;
	s_sect = &index.sects[isectforbucket(buck)];
	buck -= s_sect->start;

	/* cache key: the high bits of the bucket number pick the cache index,
	 * the low 16 bits the block within it */
	ckey = s_sect->cacheindex + (buck >> 16);
	if(!cachelookup((char**)&blk, ckey, buck & 0xffff)
	&& pread(s_sect->fd, (char*)blk, s_sect->blocksize, s_sect->blockbase + (buck << s_sect->blocklog)) != s_sect->blocksize)
		sysfatal("Failed to read bucket");

	if(s_sect->bucketmagic && U32GET(blk + 2) != s_sect->bucketmagic)
		sysfatal("index is corrupt: invalid bucket magic: sect %ux, buck %ux", s_sect->bucketmagic, U32GET(blk + 2));
	if(!bucketlookup(blk, score, &slot))
		sysfatal("entry not found in bucket");

	/* entries start 6 bytes into the bucket */
	ent = blk + 6 + (slot * IEntrySize);
	addr->offset = U64GET((ent + 26));
	addr->size = U16GET((ent + 34));
	addr->blocks = ent[37];

	ai = aindexfromaddr(addr->offset);
	addr->s_arena = index.amap[ai].arena;
	addr->offset -= index.amap[ai].start;
	return 1;
}
/*
 * Size in bytes of the arena's clump directory: one block for every
 * (blocksize / 25) clumps recorded in memstats, plus one extra block.
 * NOTE(review): 25 is presumably ClumpInfoSize (initarena computes
 * clumpmax as blocksize / ClumpInfoSize) -- confirm and use the constant.
 */
static u64int
arenadirsize(VtArena *arena)
{
return ((arena->memstats.clumps / (arena->blocksize / 25)) + 1) * arena->blocksize;
}
/*
 * Return the length in bytes of the partition open on fd.
 * path is used only for error messages. A failed stat or a zero
 * length is fatal.
 */
static u64int
partlen(int fd, char *path)
{
	Dir *st;
	u64int nbytes;

	st = dirfstat(fd);
	if(st == nil)
		sysfatal("Cannot stat partition %s", path);
	nbytes = st->length;
	free(st);
	if(nbytes == 0)
		sysfatal("can't determine size of partition %s", path);
	return nbytes;
}
// Reads one block from disk into the cache, returning a pointer into the
// cache, or nil if the read from disk fails.
// If the data is already in cache, it will not be read again.
// The cache entry is keyed by (arena->index, blockindex); the caller is
// responsible for calling cachedone on it when finished with the block.
// Only a block size of 8192 is accepted; anything else is fatal.
char*
vtreadarenablock(VtArena *arena, u32int blockindex)
{
	char *buf;
	u64int off;

	if(arena->blocksize != 8192)
		sysfatal("invalid blocksize %d\n", arena->blocksize);
	if(!cachelookup(&buf, arena->index, blockindex)){
		/* widen before multiplying: blockindex*blocksize wraps at 4GiB
		 * if computed in 32 bits */
		off = arena->base + (u64int)blockindex * arena->blocksize;
		if(pread(arena->fd, buf, arena->blocksize, off) != arena->blocksize)
			return nil;
	}
	return buf;
}
/*
 * Copy up to reqsize bytes starting at arena-relative address addr into
 * dbuf, going through the block cache one block at a time.
 *
 * The request is clipped so it does not run past the data area, which
 * ends where the clump directory (arenadirsize) begins. Returns the
 * number of bytes copied. A failed block read is fatal.
 */
u16int
vtreadarena(VtArena *arena, u64int addr, uchar *dbuf, u16int reqsize)
{
	u64int limit;
	u16int skip, done, chunk, total;
	char *blk;

	limit = arena->size - arenadirsize(arena);
	total = reqsize;
	if(addr + reqsize > limit)
		total = limit - addr;

	/* split addr into a block-aligned base and an offset within the block */
	skip = addr & (arena->blocksize-1);
	addr -= skip;

	for(done = 0; done < total; done += chunk){
		blk = vtreadarenablock(arena, addr/arena->blocksize);
		if(blk == nil)
			// TODO: I/O error should not crash the disk layer.
			// Might be good to be able to recover cached data in this case?
			sysfatal("I/O error ☹");
		/* take the rest of this block, or less if that finishes the request */
		chunk = arena->blocksize - skip;
		if(chunk > total - done)
			chunk = total - done;
		memcpy(&dbuf[done], &blk[skip], chunk);
		/* only the first block is entered part-way through */
		skip = 0;
		addr += arena->blocksize;
	}
	return total;
}
/*
 * Read the clump at addr from its arena and store its payload in dst.
 *
 * The raw clump is read into a stack buffer. As used below, the 16-bit
 * value at offset 7 is the payload length written to dst, the 16-bit value
 * at offset 5 is the stored (possibly compressed) length, byte 29 is the
 * encoding, and the payload begins at offset 38.
 * Encoding 2 is decompressed with unwhack; encoding 1 is copied verbatim.
 * NOTE(review): any other encoding leaves dst untouched yet still
 * returns 1 -- confirm whether that should be reported as an error.
 *
 * Returns 1. A decompression failure is fatal.
 */
int
readclump(uchar *dst, VtAddress addr)
{
	u16int size = addr.blocks<<ABlockLog;
	/* stack storage: must never be passed to free() */
	uchar buf[0x10000];

	vtreadarena(addr.s_arena, addr.offset, buf, size);
	size = U16GET(buf+7);
	if(buf[29] == 2){
		if(unwhack(dst, size, buf+38, U16GET(buf+5)) != size)
			sysfatal("decompression failed: %r. block index %llx", addr.offset/addr.s_arena->blocksize);
	} else if(buf[29] == 1)
		memcpy(dst, buf+38, size);
	return 1;
}
/*
 * Parse a map from b: a 32-bit entry count followed by one line per entry
 * of the form "name<TAB>start<TAB>stop".
 *
 * *map is (re)allocated to hold the entries and *nmap receives the count.
 * Returns 1 on success, 0 if the count cannot be read or exceeds MaxAMap.
 * A missing or malformed entry line, or running out of memory, is fatal.
 */
static int
parsemap(Biobufhdr *b, MapEntry **map, u32int *nmap)
{
	u32int i;
	char *s;
	char *fields[4];
	MapEntry *e;

	if(!Brdu32(b, nmap))
		return 0;
	if(*nmap > MaxAMap)
		return 0;
	*map = realloc(*map, *nmap * sizeof(MapEntry));
	/* a zero-sized request may legitimately yield nil */
	if(*map == nil && *nmap != 0)
		sysfatal("out of memory reading index map");
	for(i = 0; i < *nmap; i += 1){
		e = &(*map)[i];
		s = Brdline(b, '\n');
		/* Brdline returns nil at end of input or on an over-long line */
		if(s == nil)
			sysfatal("corrupt index map: missing entry %ud", i);
		if(getfields(s, fields, 3, 0, "\t") != 3)
			sysfatal("corrupt index map: %s", s);
		memcpy(e->name, fields[0], NameSize);
		e->name[NameSize-1] = 0;
		if(stru64int(fields[1], &e->start) < 0)
			sysfatal("corrupt index map: %s", fields[1]);
		if(stru64int(fields[2], &e->stop) < 0)
			sysfatal("corrupt index map: %s", fields[2]);
	}
	return 1;
}
/*
 * Read the block located at arena->base + arena->size and check that the
 * name stored 8 bytes into it starts with arena->name. A short read, a
 * name mismatch, or running out of memory is fatal.
 */
static void
loadarena(VtArena *arena)
{
	u32int version;
	char *buf;
	u8int *p;

	buf = malloc(arena->blocksize);
	if(buf == nil)
		sysfatal("out of memory loading arena %s", arena->name);
	p = (void*)buf;
	if(pread(arena->fd, buf, arena->blocksize, arena->base + arena->size) != arena->blocksize)
		sysfatal("failed to pread");
	version = U32GET(p + 4);
	if(strncmp(arena->name, buf + 8, strlen(arena->name)) != 0)
		sysfatal("arena name mismatch: %s vs %s, ver %d", arena->name, buf + 8, version);
	/* the block is only needed for validation; don't leak it per arena */
	free(buf);
}
/*
 * Fill in arena from its map entry and the partition's block size, then
 * validate it against the on-disk data with loadarena.
 *
 * The data area starts one block after entry.start and is two blocks
 * shorter than the mapped range.
 */
static void
initarena(VtArena *arena, int fd, MapEntry entry, u32int blocksize)
{
	memcpy(arena->name, entry.name, NameSize);
	arena->fd = fd;

	arena->blocksize = blocksize;
	arena->clumpmax = blocksize / ClumpInfoSize;

	arena->base = entry.start + blocksize;
	arena->size = entry.stop - entry.start - 2*blocksize;

	loadarena(arena);
}
/*
 * Read the arena table stored at tabbase (tabsize bytes) on fd and append
 * one initialised VtArena per table entry to the global arenas array.
 * Entries are appended in reverse table order. Each new arena's index is
 * its position in arenas. Any I/O, parse or allocation failure is fatal.
 */
static void
readarenatable(int fd, u32int tabbase, u32int tabsize, u32int blocksize)
{
	Biobufhdr bio;
	char *buf;
	MapEntry *map = nil;
	u32int nmap;

	buf = malloc(tabsize);
	if(buf == nil)
		sysfatal("oom; you're a loser: %r");
	if(Binits(&bio, fd, OREAD, (uchar*)buf, tabsize))
		sysfatal("failed to init biobuf: %r");
	if(Bseek(&bio, tabbase, 0) != tabbase)
		sysfatal("seek failed: %r");
	/* on failure nmap and map are not usable, so stop here */
	if(!parsemap(&bio, &map, &nmap))
		sysfatal("corrupt arena partition: unable to read arena table");
	/* parsemap copies everything it needs out of the bio buffer */
	free(buf);
	arenas = realloc(arenas, sizeof(VtArena) * (nmap + numarenas));
	if(!arenas)
		sysfatal("oom");
	for(; nmap > 0; nmap -= 1){
		arenas[numarenas].index = numarenas;
		initarena(&arenas[numarenas++], fd, map[nmap-1], blocksize);
	}
	free(map);
}
/*
 * Sanity-check the fields of an arena partition header. The checks run in
 * order -- magic, version (only 3 is accepted), power-of-two block size,
 * table placed before the arena storage -- and the first failure is fatal.
 */
static void
arenapartcheck(u32int magic, u32int version, u32int blocksize, u32int arenabase, u32int tabbase)
{
	if(magic != ArenaPartMagic){
		sysfatal("bad arena partition magic number: %#ux expected ArenaPartMagic (%#ux)", magic, ArenaPartMagic);
	}
	if(version != 3){
		sysfatal("bad arena partition version: only 3 is supported, found %d", version);
	}
	/* a power of two has exactly one bit set, so it shares none with itself minus one */
	if((blocksize & (blocksize - 1)) != 0){
		sysfatal("invalid block size: %d is not a power of two", blocksize);
	}
	if(arenabase <= tabbase){
		sysfatal("corrupt arena partition: partition table overlaps with storage");
	}
}
static void
initarenapart(char *path)
{
u32int version, magic, blocksize, arenabase, tabbase, tabsize;
char buf[HeadSize];
u8int *p = (void*)buf;
int fd;
/* This file descriptor is deliberately never closed; it is used to read
* blocks from the arenas throughout the server's lifetime, and thus we
* can rely on the OS to clean it up when we close. */
if((fd = open(path, OREAD)) < 0)
sysfatal("failed to open arena %s: %r", path);
if(pread(fd, buf, HeadSize, PartBlank) != HeadSize)
sysfatal("failed to read arena header table: %r");
magic = U32GET(p);
version = U32GET(p + 4);
blocksize = U32GET(p + 8);
arenabase = U32GET(p + 12);
/* Head is not perfectly aligned; table must be aligned as first block */
tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
tabsize = arenabase - tabbase;
arenapartcheck(magic, version, blocksize, arenabase, tabbase);
readarenatable(fd, tabbase, tabsize, blocksize);
}
/* Load the arena partition named by arenapath into the global arenas table. */
void
initarenas(void)
{
initarenapart(arenapath);
}
/*
 * Decode an index section header from buf into sect.
 *
 * Layout as read here: version at offset 4, the section name and the
 * index name (NameSize bytes each) starting at offset 8, then consecutive
 * 32-bit fields: blocksize, blockbase, blocks, start, stop and, for
 * version 2 only, the bucket magic. Derived values (buckmax, blocklog,
 * tabbase, tabsize) are computed from those, and the section's cache
 * index is set to the current numarenas.
 */
static void
loadisect(VtISect *sect, char *buf)
{
	u8int *hdr, *f;

	hdr = (u8int*)buf;
	/* first numeric field after the two names */
	f = hdr + 8 + 2*NameSize;

	memcpy(sect->name, buf + 8, NameSize);
	sect->name[NameSize-1] = 0;
	memcpy(sect->index, buf + 8 + NameSize, NameSize);
	sect->index[NameSize-1] = 0;

	sect->version = U32GET(hdr + 4);
	sect->blocksize = U32GET(f);
	sect->blockbase = U32GET(f + 4);
	sect->blocks = U32GET(f + 8);
	sect->start = U32GET(f + 12);
	sect->stop = U32GET(f + 16);
	if(sect->version == 2)
		sect->bucketmagic = U32GET(f + 20);
	else
		sect->bucketmagic = 0;

	sect->buckmax = (sect->blocksize - IBucketSize) / IEntrySize;
	sect->blocklog = u64log2(sect->blocksize);
	/* the config table starts at the first block boundary after the header */
	sect->tabbase = (PartBlank + HeadSize + sect->blocksize - 1) & ~(sect->blocksize - 1);
	sect->tabsize = sect->blockbase - sect->tabbase;
	sect->cacheindex = numarenas;
}
/*
 * Validate a decoded index section header. magic is the first 32-bit word
 * of the on-disk header; path is used to report partition errors.
 * Every failed check is fatal.
 */
static void
validateisect(VtISect *sect, u32int magic, char *path)
{
	u64int usable;

	if(magic != ISectMagic)
		sysfatal("invalid / corrupt index section");
	if(sect->version != 1 && sect->version != 2)
		sysfatal("unrecognized index section version %d; only 1 and 2 are supported", sect->version);
	/* blocksize must be exactly the power of two that blocklog says it is */
	if(sect->blocksize != ((u32int)1 << sect->blocklog))
		sysfatal("Illegal or corrupt index section");
	if(sect->tabbase >= sect->blockbase)
		sysfatal("illegal or corrupt index section: config table overlaps bucket store");
	/* The bucket store must end exactly at the partition length rounded
	 * down to a whole block. The mask has to be applied to the length
	 * before comparing: '!=' binds tighter than '&', so without the
	 * explicit grouping the test is always false. */
	usable = partlen(sect->fd, path) & ~(u64int)(sect->blocksize - 1);
	if(sect->blockbase + (u64int)sect->blocks * sect->blocksize != usable)
		sysfatal("invalid or corrupt index section header: invalid blocks");
	/* check the ordering first so the subtraction below cannot wrap */
	if(sect->stop < sect->start)
		sysfatal("invalid or corrupt index section: impossible range");
	if(sect->stop - sect->start > sect->blocks)
		sysfatal("invalid or corrupt index section: section overflows available space");
}
static void
initisectpart(char *path)
{
char buf[HeadSize];
index.sects = realloc(index.sects, sizeof(VtISect) * (index.nsects + 1));
VtISect *sect = &index.sects[index.nsects++];
if((sect->fd = open(path, OREAD)) < 0)
sysfatal("failed to open index section");
if(pread(sect->fd, buf, HeadSize, PartBlank) != HeadSize)
sysfatal("failed to read index section header");
loadisect(sect, buf);
validateisect(sect, U32GET((u8int*)buf), path);
}
/*
 * Derive the bucket count (stop of the last section) and the divisor that
 * maps the leading 32 bits of a score onto a bucket number, then verify
 * that the two agree. An empty index or an inconsistent divisor is fatal.
 */
static void
indexcalc(void)
{
	index.buckets = index.sects[index.nsects-1].stop;
	/* guard the division below */
	if(index.buckets == 0)
		sysfatal("corrupt index: no buckets");
	/* ceil(2^32 / buckets) */
	index.div = (((u64int)1<<32)+index.buckets-1) / index.buckets;
	/* div is 32 bits wide: with a single bucket the quotient 2^32
	 * truncates to 0, which must not be used as a divisor */
	if(index.div == 0 || (((u64int)1 << 32) - 1) / index.div + 1 != index.buckets)
		sysfatal("corrupt index: divisor and buckets inconsistent");
}
// The index header is found in the first section; parse it.
// Reads, in order: the magic line, the index version (must be 1), the
// index name (must match the section's), the block size, the section map
// and the arena map; then derives the bucket divisor. Any failure is fatal.
static void
parseindex(void)
{
	u32int version;
	Biobufhdr bio;
	char *line;
	uchar *buf;

	buf = malloc(index.sects[0].tabsize + Bungetsize);
	if(buf == nil)
		sysfatal("insufficient memory to start up");
	// Binits cannot fail when given a valid mode; see /sys/src/libbio/binit.c:/^Binits
	Binits(&bio, index.sects[0].fd, OREAD, buf, index.sects[0].tabsize+Bungetsize);
	if(Bseek(&bio, index.sects[0].tabbase, 0) != index.sects[0].tabbase)
		sysfatal("invalid or corrupt index: unable to read header");
	// Brdline returns nil at end of input or on an over-long line
	line = Brdline(&bio, '\n');
	if(line == nil || memcmp(line, "venti index configuration", 25) != 0)
		sysfatal("invalid or corrupt index: invalid magic");
	if(!Brdu32(&bio, &version) || version != 1)
		sysfatal("invalid or corrupt index: index version unsupported");
	line = Brdline(&bio, '\n');
	if(line == nil || memcmp(line, index.sects[0].index, strlen(index.sects[0].index)) != 0)
		sysfatal("invalid or corrupt index: index/section mismatch");
	if(!Brdu32(&bio, &index.blocksize))
		sysfatal("invalid or corrupt index: failed to read blocksize");
	/* TODO(mandatory feature): support multiple index sections instead of dropping them */
	/* The first call reads the section map, which the second call then
	 * overwrites with the arena map */
	if(!parsemap(&bio, &index.amap, &index.namap))
		sysfatal("invalid or corrupt index: unable to read section map");
	if(!parsemap(&bio, &index.amap, &index.namap))
		sysfatal("invalid or corrupt index: unable to read arena map");
	// parsemap copies what it keeps, so the read buffer is no longer needed
	free(buf);
	indexcalc();
}
void
initindex(void)
{
initisectpart(isectpath);
parseindex();
for(int i = 0; i < index.namap; i += 1){
int found = 0;
for(int j = 0; j < numarenas; j += 1)
if(strcmp(arenas[j].name, index.amap[i].name) == 0){
found = 1;
index.amap[i].arena = &arenas[j];
break;
}
if(!found)
sysfatal("unable to build arena map");
}
}