ref: 4b60d3dd32efd00a1d07cb08662926ba9836719f
dir: /disk.c/
#include <u.h>
#include <libc.h>
#include <bio.h>
#include "neoventi.h"

/* Table of every arena loaded so far; grown by readarenatable. */
VtArena *arenas = nil;
u32int numarenas = 0;

/* In-memory description of the index: its sections and the arena map. */
struct {
	u32int blocksize;
	u32int buckets;
	VtISect *sects;
	int nsects;
	u32int div;
	u32int namap;
	MapEntry *amap;
} index;

/*
 * Binary search for the index section holding bucket buck.
 * Returns the position in index.sects of the last section whose start
 * is <= buck; section 0 is the answer when no later section qualifies.
 */
int
isectforbucket(u32int buck)
{
	int r, l, m;

	l = 1;
	r = index.nsects - 1;
	while(l <= r){
		m = (r + l) >> 1;
		if(index.sects[m].start <= buck)
			l = m + 1;
		else
			r = m - 1;
	}
	return l-1;
}

/*
 * Scan one index bucket for score.
 * The bucket starts with a 16-bit entry count followed by the rest of
 * the 6-byte header; entries of IEntrySize bytes follow, each beginning
 * with the 20-byte score.
 * On success *entry is the position of the matching entry and 1 is
 * returned; otherwise 0 is returned.
 */
static int
bucketlookup(u8int *ibuf, u8int *score, u16int *entry)
{
	u16int nb = U16GET(ibuf);

	ibuf += 6;
	/* only entries [0, nb) are live; slot nb is past the end */
	for(*entry = 0; *entry < nb; *entry += 1){
		if(memcmp(ibuf, score, 20) == 0)
			return 1;
		ibuf += IEntrySize;
	}
	return 0;
}

/*
 * Find the arena-map slot whose [start, stop) range contains addr.
 * Dies if no slot matches, since the address came from the index.
 */
static u64int
aindexfromaddr(u64int addr)
{
	u64int a;

	for(a = 0; a < index.namap; a += 1)
		if(addr >= index.amap[a].start && addr < index.amap[a].stop)
			return a;
	sysfatal("internal corruption: arena not found for arenaindex");
	return 0;
}

/*
 * Look score up in the index and fill in addr with the owning arena,
 * the offset relative to that arena's map start, and the size/blocks
 * fields read from the index entry.
 * Returns 1 on success; 0 with the error string set on failure.
 */
int
vtreadlookup(u8int *score, VtAddress *addr)
{
	u8int *buf, *ie;
	u16int entry;
	u64int aindex;
	u64int bucket = U32GET(score) / index.div;
	VtISect *s_sect = &index.sects[isectforbucket(bucket)];
	bucket -= s_sect->start;
	/* the cache is addressed by a 16-bit key plus a 16-bit block number */
	u16int key = s_sect->cacheindex + (bucket >> 16);

	/* NOTE(review): cachelookup appears to return nonzero on a hit and
	 * to hand back a locked buffer either way — confirm against cache.c */
	if(!cachelookup((char**)&buf, key, bucket & 0xffff)){
		if(pread(s_sect->fd, (char*)buf, s_sect->blocksize, s_sect->blockbase + (bucket << s_sect->blocklog)) != s_sect->blocksize){
			cacheunlock(key, bucket & 0xffff);
			werrstr("Failed to read bucket");
			return 0;
		}
	}
	if(s_sect->bucketmagic && U32GET(buf + 2) != s_sect->bucketmagic)
		sysfatal("index is corrupt: invalid bucket magic: sect %ux, buck %ux", s_sect->bucketmagic, U32GET(buf + 2));
	if(!bucketlookup(buf, score, &entry)){
		cacheunlock(key, bucket & 0xffff);
		werrstr("entry not found in bucket");
		return 0;
	}
	ie = buf + 6 + (entry * IEntrySize);
	addr->offset = U64GET(ie + 26);
	addr->size = U16GET(ie + 34);
	addr->blocks = ie[37];
	cacheunlock(key, bucket & 0xffff);
	aindex = aindexfromaddr(addr->offset);
	addr->s_arena = index.amap[aindex].arena;
	addr->offset -= index.amap[aindex].start;
	return 1;
}

/*
 * Size in bytes of the arena's trailing clump directory, computed from
 * the clump count in arena->memstats.
 * NOTE(review): 25 is presumably ClumpInfoSize — confirm.
 */
static u64int
arenadirsize(VtArena *arena)
{
	return ((arena->memstats.clumps / (arena->blocksize / 25)) + 1) * arena->blocksize;
}

/* Length in bytes of the partition open on fd; dies if it cannot be determined. */
static u64int
partlen(int fd, char *path)
{
	Dir *dir = dirfstat(fd);
	u64int len;

	if(dir == nil)
		sysfatal("Cannot stat partition %s", path);
	if(dir->length == 0)
		sysfatal("can't determine size of partition %s", path);
	len = dir->length;
	free(dir);
	return len;
}

// Reads one block from disk into the cache, returning a pointer into the
// cache.
// If the data is already in cache, it will not be read again.
// On success the caller is responsible for calling
// cacheunlock(arena->index, blockindex) when done with the buffer.
// On a read error the cache entry is unlocked here and nil is returned.
char*
vtreadarenablock(VtArena *arena, u32int blockindex)
{
	char *buf;

	if(arena->blocksize != 8192)
		sysfatal("invalid blocksize %d\n", arena->blocksize);
	if(!cachelookup(&buf, arena->index, blockindex)){
		/* widen before multiplying so offsets past 4GiB do not wrap */
		if(pread(arena->fd, buf, arena->blocksize, arena->base+((u64int)blockindex*arena->blocksize)) != arena->blocksize){
			/* mirror vtreadlookup: do not leave the entry locked on failure */
			cacheunlock(arena->index, blockindex);
			return nil;
		}
	}
	return buf;
}

/*
 * Copy up to reqsize bytes starting at arena-relative address addr into
 * dbuf, going through the block cache.  The read is clipped at the
 * start of the clump directory.
 * Returns the number of bytes copied; 0 (with the error string set)
 * when addr lies at or beyond the end of the data area.
 */
u16int
vtreadarena(VtArena *arena, u64int addr, uchar *dbuf, u16int reqsize)
{
	u64int end = arena->size - arenadirsize(arena);
	u16int off, n, m, size;
	u32int blockindex;
	char *buf;

	/* without this, end - addr would wrap and yield a garbage size */
	if(addr >= end){
		werrstr("read beyond end of arena data");
		return 0;
	}
	size = reqsize;
	if(addr + reqsize > end)
		size = end - addr;
	off = addr & (arena->blocksize-1);
	addr -= off;
	n = 0;
	while(n < size){
		blockindex = addr/arena->blocksize;
		buf = vtreadarenablock(arena, blockindex);
		if(buf == nil)
			// TODO: I/O error should not crash the disk layer.
			// Might be good to be able to recover cached data in this case?
			sysfatal("I/O error ☹");
		m = arena->blocksize - off;
		if(m > size - n)
			m = size - n;
		memcpy(&dbuf[n], &buf[off], m);
		cacheunlock(arena->index, blockindex);
		n += m;
		off = 0;
		addr += arena->blocksize;
	}
	return size;
}

/*
 * Read the clump at addr and store its payload in dst.
 * The encoding byte at offset 29 of the clump header selects between a
 * plain copy (1) and unwhack decompression (2); the payload starts at
 * offset 38.
 * Returns 1 on success, 0 with the error string set on failure.
 */
int
readclump(uchar *dst, VtAddress addr)
{
	u16int size = addr.blocks<<ABlockLog;
	uchar buf[0x10000];

	if(!vtreadarena(addr.s_arena, addr.offset, buf, size)){
		werrstr("arena read failed: %r");
		return 0;
	}
	size = U16GET(buf+7);
	if(buf[29] == 2){
		if(unwhack(dst, size, buf+38, U16GET(buf+5)) != size){
			sysfatal("decompression failed: %r. block index %llx", addr.offset/addr.s_arena->blocksize);
			return 0;
		}
	} else if(buf[29] == 1)
		memcpy(dst, buf+38, size);
	else {
		/* previously fell through and reported success with dst untouched */
		werrstr("unknown clump encoding %d", buf[29]);
		return 0;
	}
	return 1;
}

/*
 * Parse a map from b: a count line followed by that many lines of
 * "name<TAB>start<TAB>stop".  *map is reallocated to hold the entries
 * and *nmap receives the count.
 * Returns 0 if the count cannot be read or exceeds MaxAMap; dies on a
 * malformed entry.
 */
static int
parsemap(Biobufhdr *b, MapEntry **map, u32int *nmap)
{
	u32int i;
	int len;
	char *s;
	char *fields[4];
	MapEntry *m;

	if(!Brdu32(b, nmap))
		return 0;
	if(*nmap > MaxAMap)
		return 0;
	m = realloc(*map, *nmap * sizeof(MapEntry));
	if(m == nil && *nmap != 0)
		sysfatal("insufficient memory for index map");
	*map = m;
	for(i = 0; i < *nmap; i += 1){
		s = Brdline(b, '\n');
		if(s == nil)
			sysfatal("corrupt index map: truncated");
		/* Brdline leaves the delimiter in place; terminate the line
		 * so the last field does not run on into the next line */
		len = Blinelen(b);
		s[len-1] = 0;
		if(getfields(s, fields, 3, 0, "\t") != 3)
			sysfatal("corrupt index map: %s", s);
		/* bounded by the field's own terminator, unlike a raw memcpy */
		strncpy((*map)[i].name, fields[0], NameSize);
		(*map)[i].name[NameSize-1] = 0;
		if(stru64int(fields[1], &(*map)[i].start) < 0)
			sysfatal("corrupt index map: %s", fields[1]);
		if(stru64int(fields[2], &(*map)[i].stop) < 0)
			sysfatal("corrupt index map: %s", fields[2]);
	}
	return 1;
}

/*
 * Read the block located at arena->base + arena->size and check that
 * the name stored in it matches the name taken from the arena map.
 */
static void
loadarena(VtArena *arena)
{
	u32int version;
	char *buf = malloc(arena->blocksize);
	u8int *p = (void*)buf;

	if(buf == nil)
		sysfatal("insufficient memory to load arena %s", arena->name);
	if(pread(arena->fd, buf, arena->blocksize, arena->base + arena->size) != arena->blocksize)
		sysfatal("failed to pread");
	version = U32GET(p + 4);
	if(strncmp(arena->name, buf + 8, strlen(arena->name)) != 0)
		sysfatal("arena name mismatch: %s vs %s, ver %d", arena->name, buf + 8, version);
	free(buf);
}

/*
 * Fill in arena from its map entry.  One block at each end of the
 * entry's range is excluded from the arena's data area.
 */
static void
initarena(VtArena *arena, int fd, MapEntry entry, u32int blocksize)
{
	arena->fd = fd;
	arena->blocksize = blocksize;
	arena->clumpmax = blocksize / ClumpInfoSize;
	arena->base = entry.start + blocksize;
	arena->size = entry.stop - entry.start - 2*blocksize;
	memcpy(arena->name, entry.name, NameSize);
	loadarena(arena);
}

/*
 * Parse the arena table stored at tabbase in the partition open on fd
 * and append every arena it lists to the global arenas table.
 */
static void
readarenatable(int fd, u32int tabbase, u32int tabsize, u32int blocksize)
{
	Biobufhdr bio;
	char *buf;
	MapEntry *map = nil;
	u32int nmap;

	buf = malloc(tabsize);
	if(buf == nil)
		sysfatal("oom; you're a loser: %r");
	if(Binits(&bio, fd, OREAD, (uchar*)buf, tabsize))
		sysfatal("failed to init biobuf: %r");
	if(Bseek(&bio, tabbase, 0) != tabbase)
		sysfatal("seek failed: %r");
	if(!parsemap(&bio, &map, &nmap))
		sysfatal("failed to parse arena map of tabbase %d: %r", tabbase);
	/* the map entries are copies, so the read buffer can go now;
	 * Bterm leaves fd open because the Biobuf came from Binits */
	Bterm(&bio);
	free(buf);
	arenas = realloc(arenas, sizeof(VtArena) * (nmap + numarenas));
	if(!arenas)
		sysfatal("oom");
	/* entries are consumed from last to first */
	for(; nmap > 0; nmap -= 1){
		arenas[numarenas].index = numarenas;
		initarena(&arenas[numarenas++], fd, map[nmap-1], blocksize);
	}
	free(map);
}

/* Sanity-check the fields of an arena partition header; dies on any violation. */
static void
arenapartcheck(u32int magic, u32int version, u32int blocksize, u32int arenabase, u32int tabbase)
{
	if(magic != ArenaPartMagic)
		sysfatal("bad arena partition magic number: %#ux expected ArenaPartMagic (%#ux)", magic, ArenaPartMagic);
	if(version != 3)
		sysfatal("bad arena partition version: only 3 is supported, found %d", version);
	/* zero slips through the x & (x-1) test, so reject it explicitly */
	if(blocksize == 0 || (blocksize & (blocksize - 1)))
		sysfatal("invalid block size: %d is not a power of two", blocksize);
	if(tabbase >= arenabase)
		sysfatal("corrupt arena partition: partition table overlaps with storage");
}

/* Open the arena partition at path, validate its header and load its arenas. */
static void
initarenapart(char *path)
{
	u32int version, magic, blocksize, arenabase, tabbase, tabsize;
	char buf[HeadSize];
	u8int *p = (void*)buf;
	int fd;

	/* This file descriptor is deliberately never closed; it is used to read
	 * blocks from the arenas throughout the server's lifetime, and thus we
	 * can rely on the OS to clean it up when we close.
	 */
	if((fd = open(path, OREAD)) < 0)
		sysfatal("failed to open arena %s: %r", path);
	if(pread(fd, buf, HeadSize, PartBlank) != HeadSize)
		sysfatal("failed to read arena header table: %r");
	magic = U32GET(p);
	version = U32GET(p + 4);
	blocksize = U32GET(p + 8);
	arenabase = U32GET(p + 12);
	/* Head is not perfectly aligned; table must be aligned as first block */
	tabbase = (PartBlank + HeadSize + blocksize - 1) & ~(blocksize - 1);
	tabsize = arenabase - tabbase;
	arenapartcheck(magic, version, blocksize, arenabase, tabbase);
	readarenatable(fd, tabbase, tabsize, blocksize);
}

/* Load every arena from the configured arena partition. */
void
initarenas(void)
{
	initarenapart(arenapath);
}

/*
 * Decode an index section header from buf into sect and derive the
 * values computed from it (buckmax, blocklog, tabbase, tabsize).
 * The section's cache keys start at numarenas, i.e. after the arenas'.
 */
static void
loadisect(VtISect *sect, char *buf)
{
	u8int *p = (u8int*)buf;

	sect->version = U32GET(p + 4);
	memcpy(sect->name, buf + 8, NameSize);
	memcpy(sect->index, buf + 8 + NameSize, NameSize);
	sect->blocksize = U32GET(p + 8 + 2*NameSize);
	sect->blockbase = U32GET(p + 12 + 2*NameSize);
	sect->blocks = U32GET(p + 16 + 2 * NameSize);
	sect->start = U32GET(p + 20 + 2 * NameSize);
	sect->stop = U32GET(p + 24 + 2 * NameSize);
	sect->index[NameSize-1] = 0;
	sect->name[NameSize-1] = 0;
	sect->bucketmagic = 0;
	/* only version 2 headers carry a bucket magic */
	if(sect->version == 2)
		sect->bucketmagic = U32GET(p + 28 + 2*NameSize);
	sect->buckmax = (sect->blocksize - IBucketSize) / IEntrySize;
	sect->blocklog = u64log2(sect->blocksize);
	sect->tabbase = (PartBlank + HeadSize + sect->blocksize - 1) & ~(sect->blocksize - 1);
	sect->tabsize = sect->blockbase - sect->tabbase;
	sect->cacheindex = numarenas;
}

/* Check a decoded index section header for consistency; dies on any violation. */
static void
validateisect(VtISect *sect, u32int magic, char *path)
{
	if(magic != ISectMagic)
		sysfatal("invalid / corrupt index section");
	if(sect->version != 1 && sect->version != 2)
		sysfatal("unrecognized index section version %d; only 1 and 2 are supported", sect->version);
	if(sect->blocksize != (1 << sect->blocklog))
		sysfatal("Illegal or corrupt index section");
	if(sect->tabbase >= sect->blockbase)
		sysfatal("illegal or corrupt index section: config table overlaps bucket store");
	/* != binds tighter than &, so the mask must be parenthesized; without
	 * the parentheses this test could never fire */
	if(sect->blockbase + (u64int)sect->blocks * sect->blocksize != (partlen(sect->fd, path) & ~(u64int)(sect->blocksize - 1)))
		sysfatal("invalid or corrupt index section header: invalid blocks");
	/* check the ordering first so the subtraction below cannot wrap */
	if(sect->stop < sect->start)
		sysfatal("invalid or corrupt index section: impossible range");
	if(sect->stop - sect->start > sect->blocks)
		sysfatal("invalid or corrupt index section: section overflows available space");
}

/* Open the index section at path, append it to index.sects and validate it. */
static void
initisectpart(char *path)
{
	char buf[HeadSize];
	VtISect *sects, *sect;

	sects = realloc(index.sects, sizeof(VtISect) * (index.nsects + 1));
	if(sects == nil)
		sysfatal("insufficient memory for index section table");
	index.sects = sects;
	sect = &index.sects[index.nsects++];
	if((sect->fd = open(path, OREAD)) < 0)
		sysfatal("failed to open index section");
	if(pread(sect->fd, buf, HeadSize, PartBlank) != HeadSize)
		sysfatal("failed to read index section header");
	loadisect(sect, buf);
	validateisect(sect, U32GET((u8int*)buf), path);
}

/*
 * Derive the bucket count from the last section's stop and compute the
 * divisor that maps the leading 32 bits of a score to a bucket number.
 */
static void
indexcalc(void)
{
	index.buckets = index.sects[index.nsects-1].stop;
	/* an empty index would otherwise divide by zero below */
	if(index.buckets == 0)
		sysfatal("corrupt index: no buckets");
	index.div = (((u64int)1<<32)+index.buckets-1) / index.buckets;
	if((((u64int)1 << 32) - 1) / index.div + 1 != index.buckets)
		sysfatal("corrupt index: divisor and buckets inconsistent");
}

// The index header is found in the first section; parse it.
static void parseindex(void) { u32int version; Biobufhdr bio; uchar *buf = malloc(index.sects[0].tabsize + Bungetsize); if(buf == nil) sysfatal("insufficient memory to start up"); // Binits cannot fail when given a valid mode; see /sys/src/libbio/binit.c:/^Binits Binits(&bio, index.sects[0].fd, OREAD, buf, index.sects[0].tabsize+Bungetsize); if(Bseek(&bio, index.sects[0].tabbase, 0) != index.sects[0].tabbase) sysfatal("invalid or corrupt index: unable to read header"); if(memcmp(Brdline(&bio, '\n'), "venti index configuration", 25) != 0) sysfatal("invalid or corrupt index: invalid magic"); if(!Brdu32(&bio, &version) || version != 1) sysfatal("invalid or corrupt index: index version unsupported"); if(memcmp(Brdline(&bio, '\n'), index.sects[0].index, strlen(index.sects[0].index)) != 0) sysfatal("invalid or corrupt index: index/section mismatch"); if(!Brdu32(&bio, &index.blocksize)) sysfatal("invalid or corrupt index: failed to read blocksize"); /* TODO(mandatory feature): support multiple index sections instead of dropping them */ /* The first line here skips the section map */ parsemap(&bio, &index.amap, &index.namap); parsemap(&bio, &index.amap, &index.namap); indexcalc(); } void initindex(void) { initisectpart(isectpath); parseindex(); for(int i = 0; i < index.namap; i += 1){ int found = 0; for(int j = 0; j < numarenas; j += 1) if(strcmp(arenas[j].name, index.amap[i].name) == 0){ found = 1; index.amap[i].arena = &arenas[j]; break; } if(!found) sysfatal("unable to build arena map"); } }