ref: 475c090fef34ae2e9949c8783216aa3e77fe6098
parent: 6413e86cc41bc6d0a0b5e8949e69765fd28dcb21
author: Noam Preil <noam@pixelhero.dev>
date: Mon Feb 26 02:19:37 EST 2024
cache design
--- a/disk.c
+++ b/disk.c
@@ -117,6 +117,15 @@
return len;
}
+// Reads one block from disk into buf, which must be big enough
+int
+vtreadarenablock(VtArena *arena, u32int blockindex, char *buf)
+{
+ if(pread(arena->fd, buf, arena->blocksize, arena->base+(blockindex*arena->blocksize)) != arena->blocksize)
+ return 0;
+ cacheinsertmap(arena->fd, blockindex, buf);
+ return 1;
+}
u16int
vtreadarena(VtArena *arena, u64int addr, uchar *dbuf, u16int reqsize)
@@ -127,14 +136,14 @@
size = reqsize;
if(addr + reqsize > end)
size = end - addr;
- addr += arena->base;
off = addr & (arena->blocksize-1);
addr -= off;
n = 0;
while(n < size){
- long r = pread(arena->fd, buf, arena->blocksize, addr);
- if(r != arena->blocksize)
- sysfatal("pread failed: fd %d, r %ld, bsize %ld, partlen %ld!", arena->fd, r, arena->blocksize, partlen(arena->fd, arena->name));
+ if(!vtreadarenablock(arena, addr/arena->blocksize, buf))
+ // TODO: I/O error should not crash the disk layer.
+ // Might be good to be able to recover cached data in this case?
+ sysfatal("I/O error ☹");
m = arena->blocksize - off;
if(m > size - n)
m = size - n;
--- a/mkfile
+++ b/mkfile
@@ -2,7 +2,7 @@
TARG=neoventi
BIN=/$objtype/bin
-OFILES=unwhack.$O server.$O util.$O disk.$O
+OFILES=unwhack.$O server.$O util.$O disk.$O cache.$O
HFILES=\
@@ -12,7 +12,7 @@
vacfs -h tcp!127.1!14011 vac:4091f8b8aafc7b8a815f38534b70a171e4ae3e44
run:VE: $O.neoventi
- ./$O.out & pid=$apid
+ ./$O.neoventi & pid=$apid
sleep 2
venti/read -h tcp!127.1!14011 vac:4091f8b8aafc7b8a815f38534b70a171e4ae3e44
echo kill >/proc/$pid/ctl
--- a/neoventi.c
+++ b/neoventi.c
@@ -16,6 +16,7 @@
{
initarenas();
initindex();
+ cacheinit();
}
static void
--- a/neoventi.h
+++ b/neoventi.h
@@ -129,3 +129,5 @@
void initarenas(void);
void initindex(void);
+void cacheinit(void);
+int cacheinsertmap(int fd, u32int blockindex, char *data);
--- a/notebook
+++ b/notebook
@@ -708,3 +708,102 @@
As far as reading goes, though, I see why venti requires a block cache, and operates at the block level. Going to add one to neoventi now.
+I want a unified cache for neoventi - for the index and the arenas. For disk blocks, I want a map from (arena, block index) to (data block). For the index, I want a map from (score) to (index entry). What does venti use for its caches?
+
+Venti just uses the disk address as the cache key, and has the partition pointer in the cache entry for confirmation - so, if two partitions happen to have a block at the same offset, both will go in the same bucket, and one will simply be skipped during lookups.
+
+Since cache keys are block-aligned disk addresses, and blocks are at least a few hundred bytes, the low bits of the key are always zero - at least a few bits we can use for the fd in the cache key, instead of needing to put the fd in the cache entry.
+
+Need two things for a good cache: a good hash table to back it up, and a good eviction algorithm. The fifty tabs of research I've got open suggest that S3-FIFO is probably the best eviction algorithm, but I need to figure out the hash table first.
+
+I'll probably want to take some inspiration from CLHT (the cache-line hash table) for optimal throughput, but I need to do more research and design.
+
+One option for modification is to store keys and values in separate cache lines; this worsens best-case / common time from one cache line to two, but in exchange it should significantly increase what proportion of accesses hit the best-case time, and reduce space overhead.
+
+It's probably reasonable to just implement a CLHT-like, write some benchmarking code, and then worry about improving it later. A CLHT-alike should be more than good enough, and far better than the shit that original venti is doing anyways.
+
+So, short-term: we want 64 bytes per bucket. The key must contain the block index as well as the partition that the block is from. If we require a minimum block size of 256 bytes, which seems reasonable, we can use 8 bytes for the key: 7 for the block index, one for the partition index. The value for the disk cache should be a block; it's reasonable to use a pointer for now. That means 16 bytes per key-value.
+
+We need room for a lock, though. Our libc uses an int for that, so we only strictly need 4 bytes. That still leaves inadequate room for a full fourth entry, short of changing the key or value format. I could use some of that room for chaining, as CLHT does, but I think linear probing should be better.
+
+The other option is to try to reduce the key or value size. If we require one byte per key for the fd, can we reasonably reduce the block index to less than seven bytes? The "obvious" answer of three bytes would give 16M block indices - only 128GiB of addressable blocks at 8KiB each, nowhere near sufficient. Can the value be reduced? Well, the cache size is static. If we had, say, 32GiB of block cache at 8KiB per entry, that'd be 32*1024^3 / 8*1024 = 4*1024*1024 = 4M entries, which requires 22 bits for indices, or three bytes! Using three full bytes would be fine up to ~128GiB of disk cache, and I'm more than happy setting that as a limit and optimizing for realistic hardware. ...even though my PC could realistically have more than 128GiB if I paid for more RAM >_>
+
+Using four bytes gives nicer math, so what if we did that? 8 bytes per key + 4 for value index = 12 bytes per entry; 5 entries + an int for locking gives us our nice bucket size of 64 bytes :D
+
+But! That would make this cache specific to the disk blocks! Could it be reused more generally - that is, would a plain u64→u32 map be enough for the other caches too?
+
+Well, actually, would a u64->u32 map be sufficient for the other caches? For the index cache, in particular? It should be fine for clumps; if we use the on-disk address for clumps, we can just do the same trick for that map as for the disk blocks. For the index, though, keys are 20 bytes. A dedicated cache with key-buckets and value-buckets might work better there.
+
+Three keys plus a lock for the key buckets, 15 values plus a lock for value buckets? Could work decently well. Having the disk cache should be enough to make a huge difference regardless, since it'd mean not needing any I/O at all for redundant index accesses, even though it'd mean redundant memory processing - but with a really good disk cache, that might be enough to catch up to venti on some of our tests anyways >_>
+
+So! Short order of business is a u64→u32 map. Buckets contain a Lock, and five keys and five values. Keys are (block_index<<8)|(fd&0xff), and we assert that no fd is greater than 0xff; values are indices into the raw disk cache buffer, and refer to [blocksize]-sized entries, not bytes.
+
+typedef struct {
+ Lock;
+ // index into raw cache
+ u32int value[5];
+ // block_index<<8 | fd; assert(fd < 0xff);
+ u64int key[5];
+} bucket_t;
+
+The cache is managed as one giant array of buckets; if an entry is missing in a given bucket, we scan forwards to the next bucket. If it's missing from that bucket, we repeat until we find either an empty slot or the correct entry.
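+
+Roughly, lookup might go something like this - assuming a global bucket array, a key of zero reserved to mean "empty", and whichbucket() standing in for whatever bucket-picking hash we end up with (all names hypothetical, and ignoring the pathological case of a completely full table):
+
+// Returns the raw-cache slot holding this key, or -1 if not cached
+int
+cachelookup(u64int key)
+{
+	bucket_t *b;
+	int i, value;
+	b = &buckets[whichbucket(key)];
+	for(;;){
+		lock(&b->Lock);
+		for(i = 0; i < 5; i += 1){
+			if(b->key[i] == key){
+				value = b->value[i];
+				unlock(&b->Lock);
+				return value;
+			}
+			if(b->key[i] == 0){
+				// reached an empty slot: the key isn't cached
+				unlock(&b->Lock);
+				return -1;
+			}
+		}
+		// bucket full with no match: scan forward to the next bucket
+		unlock(&b->Lock);
+		b++;
+		if(b == buckets + nbuckets)
+			b = buckets;
+	}
+}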
+
+We'll start with a hardcoded table size of 1GiB, and hardcoded block size of 8KiB. This gives us 1024^3 / 8*1024 = 128K entries. With five per bucket, that means ~26K buckets, or ~1.6MiB of overhead.
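+
+As compile-time constants, that might look like the following (names are placeholders, nothing final):
+
+enum {
+	CacheSize	= 1024*1024*1024,	// hardcoded 1GiB of cached block data for now
+	CacheBlockSize	= 8*1024,		// hardcoded 8KiB blocks for now
+	CacheEntries	= CacheSize / CacheBlockSize,	// 128K cached blocks
+	BucketEntries	= 5,
+	CacheBuckets	= (CacheEntries + BucketEntries - 1) / BucketEntries,	// ~26K buckets, ~1.6MiB of overhead
+};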
+
+...sizeof(Lock) in libc is actually eight. We can just copy lock.c and #pragma pack it, I guess. _tas won't care, probably? Unless it wants alignment? Which would make sense, and the lock should probably be aligned for maximum performance anyways. Could reduce the entry count per bucket to four, I guess? Or use three bytes per value? Or try to reduce the key size...
+
+Given 32TiB of storage, and 8K block size, the maximum block index is 32*1024^4 / 8*1024 = 4*1024^3 = 4G, which fits in four bytes. We... can probably reduce key size to a u32, too? That would make us a u32→u32 map, and need 8 bytes per entry. It'd mean seven entries per bucket, too. That should work?
+
+Ah! Except we need a byte for the fd, still. We could do nine bytes per entry, and still get six entries total?
+
+typedef struct {
+ Lock;
+ // index into raw cache
+ u32int value[6];
+ u32int index[6];
+ u8int fd[6];
+} bucket_t;
+
+It'd mean two wasted bytes per bucket, though.
+
+Three bytes for the value would give a limit of 128G of cache, but that's probably fine. Three bytes value + one byte fd + four bytes index = 8 bytes per entry again. Lots of finagling, and lower limits, but more than adequate.
+
+typedef struct {
+ Lock;
+	// block index; this is the key
+	u32int index[7];
+	// 24 bits of index into the raw cache, plus 8 bits of fd
+	u32int value[7];
+} bucket_t;
+
+When caching disk blocks, we don't want to rely on the score, do we? Probably saner to use the block index as the key there for bucket selection, too; a block will contain multiple scores (clumps / index entries) anyways.
+
+Fibonacci hashing on the (index<<8|fd) ought to be sufficient for spreading entries among buckets.
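+
+Concretely, if the bucket count gets rounded up to a power of two (2^BUCKETBITS - an assumption; 15 would give ~32K buckets), the hash is just a multiply by 2^64/φ and a shift:
+
+u32int
+whichbucket(u64int key)
+{
+	// Fibonacci hashing: multiply by 2^64/φ, keep the top BUCKETBITS bits
+	return (key * 11400714819323198485ULL) >> (64 - BUCKETBITS);
+}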
+
+Can probably clean up that bucket format a bit, but the lack of a u24int makes it harder :/
+
+We also need a free list to track available slots in the main data buffer. Instead of a linked list, we can make this a large statically-sized stack; the map size can never change. Given 128*1024 entries, we need 128*1024 slots on the stack, maximum, plus a current length, and each slot needs merely to hold a slot index, so a u32 is fine. This will use 512KiB; it could be reduced if we reduced the entry count to 64*1024, but that's not a limitation I'm eager to enforce.
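+
+A sketch of that stack, reusing CacheEntries from the sizing enum above (eviction isn't handled here, so running out of free slots is simply fatal for now):
+
+static Lock freelock;
+static u32int freelist[CacheEntries];
+static u32int nfree;
+
+// Pop a free raw-cache slot off the stack
+static u32int
+grabfree(void)
+{
+	u32int slot;
+	lock(&freelock);
+	if(nfree == 0)
+		sysfatal("block cache full: eviction not implemented yet");
+	nfree -= 1;
+	slot = freelist[nfree];
+	unlock(&freelock);
+	return slot;
+}
+
+// Push a slot back once its entry is evicted
+static void
+putfree(u32int slot)
+{
+	lock(&freelock);
+	freelist[nfree] = slot;
+	nfree += 1;
+	unlock(&freelock);
+}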
+
+To insert, we need to do a few things:
+
+- Pick a bucket
+- Lock the bucket
+- Check if it already contains the key
+ - If so, update the value, and evict the old cache entry
+- Otherwise, grab an empty slot, and set the key
+ - Lock the free list
+ - Grab an empty cache entry from the free list
+ - Remove the entry from the free list
+ - Unlock the free list
+ - Copy the data into that entry
+ - Set the value of the slot to that cache entry
+
+Buckets are picked via Fibonacci hash of the (index<<8|fd).
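+
+Putting those steps together, a rough sketch of what cacheinsertmap could look like under this design - reusing whichbucket() and grabfree() from the sketches above; rawcache (the big block buffer), the "value 0 means unused" convention, and skipping the forward scan on a full bucket are all simplifications relative to whatever cache.c actually ends up doing:
+
+int
+cacheinsertmap(int fd, u32int blockindex, char *data)
+{
+	bucket_t *b;
+	u32int slot;
+	int i;
+	b = &buckets[whichbucket(((u64int)blockindex<<8) | (fd&0xff))];
+	lock(&b->Lock);
+	for(i = 0; i < 7; i += 1){
+		if(b->index[i] == blockindex && (b->value[i]&0xff) == fd)
+			break;	// already present; just refresh its cache entry
+		if(b->value[i] == 0){
+			// empty slot: claim it and grab a raw-cache slot for the data
+			b->index[i] = blockindex;
+			b->value[i] = (grabfree()<<8) | (fd&0xff);
+			break;
+		}
+	}
+	if(i == 7){
+		// bucket full: the real thing should scan forward, as lookup does
+		unlock(&b->Lock);
+		return 0;
+	}
+	slot = b->value[i] >> 8;
+	memcpy(rawcache + (usize)slot*CacheBlockSize, data, CacheBlockSize);
+	unlock(&b->Lock);
+	return 1;
+}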
+
+In order to be able to test this, I need to adapt the disk reading functions to operate in blocks - which they should already be doing, but were not since I misunderstood venti's operation due to rushing through things.
+
+Currently, to read blocks, we allocate space for blocks, read the data into them, then grab the data we want and store it in the destination buffer. Ideally, we'd have the cache lookup in there, such that we either grab the data from cache or reserve a cache entry, read directly into the cache entries, and then extract from the cache entry into the dbuf.
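+
+In other words, the per-block path would end up shaped roughly like this, with cacheget()/cachereserve() standing in for whatever block-level API the cache ends up exposing (neither exists yet; this is just the shape I'm aiming for):
+
+static char*
+cachedblock(VtArena *arena, u32int blockindex)
+{
+	char *block;
+	block = cacheget(arena->fd, blockindex);
+	if(block != nil)
+		return block;
+	// miss: reserve a cache entry and read straight into it, no bounce buffer
+	block = cachereserve(arena->fd, blockindex);
+	if(pread(arena->fd, block, arena->blocksize, arena->base + (u64int)blockindex*arena->blocksize) != arena->blocksize)
+		sysfatal("I/O error ☹");
+	return block;
+}
+
+vtreadarena() would then just memcpy the relevant range out of the returned block into dbuf.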
+
+For now, going to just extract the block reading, and have it insert into cache...
+
+That was easy enough.
\ No newline at end of file
--- a/server.c
+++ b/server.c
@@ -118,11 +118,6 @@
vtsend(conn, buf, 8, VtRhello, 0);
}
-typedef struct {
- int ctl;
- char *dir;
-} ClientHandle;
-
static void
vtloop(VtConn conn, char *buf)
{
@@ -133,7 +128,7 @@
}
static int
-inittrampoline(ClientHandle *h, VtConn *conn)
+inittrampoline(VtConn *conn)
{
switch(setjmp(conn->bounce)){
case 0:
@@ -141,8 +136,6 @@
case 1:
fprint(2, "abandoning client: %r\n");
case 2:
- close(conn->fd);
- free(h);
return 0;
default:
fprint(2, "internal error: unexpected bounce code\n");
@@ -151,15 +144,15 @@
}
static void
-handleproc(void *hnd)
+handleproc(void *fd)
{
char buf[MaxPacketSize];
- ClientHandle *h = hnd;
VtConn conn;
- if(!inittrampoline(h, &conn))
+ conn.fd = (int)(usize)fd;
+ if(!inittrampoline(&conn)){
+ close(conn.fd);
return;
- if((conn.fd = accept(h->ctl, h->dir)) < 0)
- vterr(conn, nil, "failed to accept connection: %r");
+ }
vthello(conn, buf);
vtloop(conn, buf);
}
@@ -167,10 +160,12 @@
static void
handle(int ctl, char *dir)
{
- ClientHandle *handle = malloc(sizeof(ClientHandle));
- handle->ctl = ctl;
- handle->dir = dir;
- proccreate(handleproc, handle, mainstacksize);
+ int fd = accept(ctl, dir);
+ if(fd < 0){
+ fprint(2, "failed to accept connection\n");
+ return;
+ }
+ proccreate(handleproc, (void*)fd, mainstacksize);
}
void