ref: 5afe23b5498517df907f724e30f49e0a371d3bf4
parent: bec6ed73e136c4b8cc960230d9f7439ade29184a
author: Noam Preil <noam@pixelhero.dev>
date: Sat Aug 2 09:37:46 EDT 2025
implement writing lumps to arena; TODO: indexing
--- a/disk.c
+++ b/disk.c
@@ -192,39 +192,120 @@
return 1;
}
+// Seal the current write arena and move the write head to the next one.
+// Called when the live block index runs past the arena's data size.
+static void
+advancearena(void)
+{
+	// A sealed arena is read-only from here on (readers check this flag).
+	index.arena->arenastats.sealed = 1;
+	if(index.arena->index+1 == numarenas)
+		sysfatal("TODO last arena full!");
+	index.arena = &arenas[index.arena->index+1];
+	// Writes are strictly linear: the next arena's live block must be
+	// completely empty, or some other software wrote to it out of order.
+	if(index.arena->block != 0 || index.arena->offset != 0 || index.arena->blockremain != index.arena->blocksize)
+		sysfatal("TODO handle writing to venti which previously experienced nonlinear writes from other software?");
+}
+
+// Write the live block's cache buffer back to its home position on disk.
+// Fatal on short or failed writes; the write path has no recovery yet.
+static void
+blockflush(void)
+{
+	// Widen before multiplying: block and blocksize are both u32int, so
+	// block*blocksize is otherwise computed in 32 bits and would wrap for
+	// arenas of 4GiB or larger before being added to the u64int base.
+	if(pwrite(index.arena->fd, index.arena->buf, index.arena->blocksize, index.arena->base+((u64int)index.arena->block*index.arena->blocksize)) != index.arena->blocksize){
+		sysfatal("flush failed: %r");
+	}
+}
+
+// Advance the block and, if needed, the arena.
+// It is guaranteed that, after invoking this function, the live block will
+// be completely empty (offset == 0, blockremain == blocksize).
+static void
+blockadvance(void)
+{
+	// Flush and release the current live block, if we hold one.
+	if(index.arena->buf != nil){
+		blockflush();
+		index.arena->buf = nil;
+		cacheunlock(index.arena->index, index.arena->block);
+	}
+	index.arena->block += 1;
+	index.arena->offset = 0;
+	index.arena->blockremain = index.arena->blocksize;
+	// Widen before multiplying: both operands are u32int, so the product
+	// would wrap in 32 bits for arenas of 4GiB or larger and the
+	// end-of-arena test would never fire.
+	if((u64int)index.arena->block * index.arena->blocksize >= index.arena->size)
+		advancearena();
+}
+
+// Guarantee at least `space` contiguous bytes in the live block; if the
+// remainder of the current block is too small, skip it (that tail stays
+// unused) and start on a fresh, empty block.
+static void
+requireminimum(u16int space)
+{
+	if(space > index.arena->blockremain)
+		blockadvance();
+}
+
+// Grabs the current block for writing into the cache, reading any existing contents
+// if applicable.
+static void
+getblock(void)
+{
+	if(!cachelookup(&index.arena->buf, index.arena->index, index.arena->block)){
+		// Don't read when there's no data _to_ read; saves on unnecessary cache fills.
+		if(index.arena->offset == 0)
+			return;
+		// Widen before multiplying: block*blocksize is a u32int product and
+		// would wrap for arenas of 4GiB or larger, reading the wrong block.
+		if(pread(index.arena->fd, index.arena->buf, index.arena->blocksize, index.arena->base+((u64int)index.arena->block*index.arena->blocksize)) != index.arena->blocksize){
+			werrstr("read of live block failed");
+			sysfatal("TODO error handling in write path: %r");
+		}
+	}
+}
+
+// Write a 38-byte clump header at wptr: magic, compressed and uncompressed
+// sizes (equal for now — no compression), and the encoding byte. The score
+// is left zeroed for the indexing pass (TODO).
+static void
+clumpwrite(char *wptr, u16int len)
+{
+	// Zero the whole header first: previously only ~9 of the 38 bytes were
+	// written, so whatever stale cache-buffer contents occupied the rest
+	// (type byte, score area, trailer) went to disk as-is.
+	memset(wptr, 0, 38);
+	U32PUT(wptr, index.arena->clumpmagic);
+	U16PUT(wptr+5, len);
+	U16PUT(wptr+7, len);
+	// Presumably the encoding field (1 == uncompressed) — confirm against
+	// the venti clump layout before relying on it.
+	wptr[29] = 1;
+}
+
+// Move the write head forward by n bytes within the live block.
+static void
+blockupdate(u16int n)
+{
+	VtArena *a;
+
+	a = index.arena;
+	a->offset += n;
+	a->blockremain -= n;
+}
+
int
vtwritearenaclump(char *buf, u16int len)
{- u64int addr, n;
- uchar clump[38];
-retry:
- addr = index.arena->base + index.arena->arenastats.used;
- n = len + ClumpSize + 4;
- // Sealed is the uncommon case; grab wlock immediately because we almost certainly
- // need it, and swapping locks is silly overhead in the common path.
- wlock(index.arena);
- // unlikely
- if(index.arena->arenastats.sealed){- // Can access without a lock, this arena is read-only now.
- wunlock(index.arena);
- if(index.arena->index+1 == numarenas){- werrstr("disk full");- return 0;
- }
- index.arena = &arenas[index.arena->index+1];
- goto retry;
+ /* We've got a 38-byte clump header to write, and the contents
+ * of the clump. We'll be splitting it across blocks. First, find
+ * the block for the clump header; if the current block has less
+ * than 38 bytes of room, move to the next block.
+ * Instead of worrying about address on disk, we'll mostly operate
+ * off of blocks. */
+ char *wptr;
+ u16int n = 0, nn;
+ if(index.arena->blocksize != 8192)
+ sysfatal("invalid blocksize %d\n", index.arena->blocksize);+ requireminimum(38);
+ getblock();
+ wptr = index.arena->buf + index.arena->offset;
+ clumpwrite(wptr, len);
+ blockupdate(38);
+ wptr += 38;
+ while(len > n){+ nn = len - n;
+ if(nn > index.arena->blockremain)
+ nn = index.arena->blockremain;
+ // Use however much space is available in the block, then move to the next
+ memcpy(wptr, buf+n, nn);
+ n += nn;
+ if(len != n)
+ blockadvance();
}
- U16PUT(clump+5, len);
- U16PUT(clump+7, len);
- fprint(2, "wanting to write to %08x the clump header\n", addr);
- addr += 38;
- fprint(2, "wanting to write to %08x the clump itself\n", addr);
- // Arena is locked, we have the clump. Uhhhhh okay, what now?
- // Need to be crash safe.
- // So, first write the clump to where it will go, then mark it as written.
- wunlock(index.arena);
- werrstr("TODO: write clump to arena");- return 0;
+ index.arena->arenastats.used += len+38;
+ index.arena->arenastats.uncsize += len;
+ index.arena->arenastats.clumps += 1;
+ blockflush();
+ // TODO ctime/wtime? Shouldn't go in here, though, should be at a higher level.
+ werrstr("TODO");+ return 1;
+
}
int
@@ -300,6 +381,17 @@
} else {arena->arenastats = arena->indexstats;
}
+ arena->block = arena->arenastats.used / arena->blocksize;
+ arena->offset = arena->arenastats.used & (arena->blocksize - 1);
+ // Probably not necessary, but factors out arena->offset to arenastats.used and arena->blocksize
+ // so that this operation depends only on already-computed values, avoiding a false dependency on
+ // the arena->offset calculation; the true value is just "blocksize - offset".
+ // Might speed up initialization by a few microseconds. Realistically, I'm just
+ // trying to stay in the habit of thinking about how code flows through hardware, this
+ // transformation doesn't matter at all.
+ arena->blockremain = arena->blocksize - (arena->arenastats.used & (arena->blocksize - 1));
+ if(arena->arenastats.uncsize != 0 || arena->index == 0)
+ fprint(2, "Arena %d: live block %d, offset %d, remain %d\n", arena->index, arena->block, arena->offset, arena->blockremain);
// We _can_ read the values in even if the arena is invalid, and the code looks
// cleaner with all the parsing and validation grouped, so meh, going to keep it
// like this.
@@ -309,7 +401,7 @@
sysfatal("unsupported arena version %d\n", version);if(strncmp(arena->name, buf + 8, strlen(arena->name)) != 0)
sysfatal("arena name mismatch: %s vs %s", arena->name, buf + 8);- if(arena->indexstats.sealed && index.arena == nil)
+ if(index.arena == nil)
index.arena = arena;
}
@@ -316,6 +408,7 @@
static void
initarena(VtArena *arena, int fd, MapEntry entry, u32int blocksize)
{+ memset(arena, 0, sizeof(*arena));
arena->fd = fd;
arena->blocksize = blocksize;
arena->base = entry.start + blocksize;
@@ -373,7 +466,7 @@
/* This file descriptor is deliberately never closed; it is used to read
* blocks from the arenas throughout the server's lifetime, and thus we
* can rely on the OS to clean it up when we close. */
- if((fd = open(path, OREAD)) < 0)
+ if((fd = open(path, ORDWR)) < 0)
sysfatal("failed to open arena %s: %r", path);if(pread(fd, buf, HeadSize, PartBlank) != HeadSize)
sysfatal("failed to read arena header table: %r");--- a/neoventi.h
+++ b/neoventi.h
@@ -3,6 +3,7 @@
#define U32GET(p) ((u32int)(((p)[0]<<24)|((p)[1]<<16)|((p)[2]<<8)|(p)[3]))
#define U64GET(p) (((u64int)U32GET(p)<<32)|(u64int)U32GET((p)+4))
#define U16PUT(p,v) (p)[0]=(v)>>8;(p)[1]=(v)
+// Big-endian 32-bit store. NOTE(review): like U16PUT above, this expands to
+// multiple statements with no do{...}while(0) wrapper, so it must not be the
+// sole body of an unbraced if/else. Kept in the file's existing macro style.
+#define U32PUT(p,v) (p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v)
enum {VtRerror = 1,
@@ -62,6 +63,11 @@
/* disk info */
u64int size;
u64int base;
+ /* Write head: current block, offset within block, available space in block. */
+ u32int block;
+ u16int offset;
+ u16int blockremain;
+ char *buf;
int fd;
struct {// Total clump count, and compressed clump count, respectively
--- a/notebook
+++ b/notebook
@@ -2801,3 +2801,81 @@
Okay, so - we see a read (or a write) that accesses an index bucket. That bucket hasn't been initialized. Thus, we can immediately treat the read as a failure, and the write as dirty.
That works fine for reads, at least, though errors aren't propagated to the client correctly.
+
+Not super important right now. Back to writes.
+
+venti/write: vtwrite: neoventi: data insertion failed
+
+interestingly, it gets passed through properly for writes? Okay, good to know
+
+wanting to write to 000c4000 the clump header
+wanting to write to 000c4026 the clump itself
+
+One thing we can test: if I use venti/venti to write this data, will it show up at that address?
+
+Or, since this is a disposable "drive", just do it.
+
+...okay, so, we actually need to write by _blocks_, not clumps.
+
+So, need to back up a bit: while the clump has data left, split off a block (or the entire rest of the clump, if applicable). Load that block from disk into cache (unless we're writing to byte zero of it, as an optimization; can discard the rest there since we're linear). Copy the portion of the clump that fits (either to the end of the block, the entire block, or the beginning of a block). Write back the block to disk.
+
+So, writing a clump consists of a loop:
+
+- Find current block
+ - This may require advancing the arena
+- Find offset within block
+ - If heuristic says so, can choose to skip remainder of block and leave that space unused forever (e.g. if there's 10 bytes of space left in one block, and we're writing exactly the size of 2 blocks, it makes more sense to ignore this one, and fit perfectly to the following two)
+- Copy part of chunk to block in cache
+ - Flush through cache
+- Advance to next block
+
+Can optimize a bit by hoisting "find current block" out of the beginning of the loop, and having the end of the loop do the advancement from a known position whenever there's still data to write - though, really, position should _always be known_ since we're strictly linear!
+
+TODO, btw: write to multiple arenas in parallel for extra speed. That, or just issue a lot of pwrite calls in parallel, and read back the results only when flushing or some such, for deeper write depths, need to look into this and do more research.
+
+So, store address in the arena? It's just "base + used", so... maybe not? Yeah, no real point. The addition will predate instructions with high enough latency to hide it anyways, probably.
+
+TODO: optimize block accesses to use bit masking instead of division.
+
+...actually, even easier for the linear-write side, just store block index and offset-within-block.
+
+...except, wait, offsets are _already being computed bitwise?_ I'm... not supporting unaligned arenas, am I. That's probably fine, I can always just skip the first block if it's not aligned, if I'm not already doing that.
+
+Okay, so:
+
+Don't have to find current block or offset; we already have it. Just have to grab the block from cache and, while data remains to be copied, copy it. Treat clump header as distinct?
+
+int
+vtwritearenaclump(char *buf, u16int len)
+{+ /* We've got a 38-byte clump header to write, and the contents
+ * of the clump. We'll be splitting it across blocks. First, find
+ * the block for the clump header; if the current block has less
+ * than 38 bytes of room, move to the next block.
+ * Instead of worrying about address on disk, we'll mostly operate
+ * off of blocks. */
+ char *block, *wptr;
+// writeclumpheader();
+// writeclumpdata();
+ if(index.arena->blocksize != 8192)
+ sysfatal("invalid blocksize %d\n", index.arena->blocksize);+ requireminimum(38);
+ getblock(&buf);
+ wptr = block + arena->offset;
+
+
+Some basic utilities and this approach should work.
+
+Why is n len+ClumpSize+4, and not just len+ClumpSize? What's the extra u32 for?
+
+oldventi adds "compressed size + ClumpSize" to used, and "size" to uncsize.
+
+.......it's writing four zero bytes at the end of every clump, I think. It's memset()ting them before calling writeiclump, and nothing seems to set them after that. It memsets _immediately_ before writeiclump, which immediately calls writeaclump, which does not modify before writing. Cool, we're not doing that.
+
+
+> inappropriate use of fd
+
+...it's open for reads only isn't it.
+
+
+...okay! data is being written. The question, of course, is if it's being written _correctly_. We're also not updating insignificant things like "the arena's used space metrics," so nothing is really being persisted across shutdowns...
--- a/server.c
+++ b/server.c
@@ -110,7 +110,7 @@
vterr(conn, buf, "hash collision detected");
} else {if(!vtwriteclump(buf+8, len))
- vterr(conn, buf, "data insertion failed");
+ vterr(conn, buf, "data insertion failed: %r");
}
memcpy(buf+4, score, 20);
vtsend(conn, buf, 22, VtRwrite, 0);
--
⑨