ref: a2ffdf6dfea9b0128732ca068a84654328b8a038
parent: 933d6632b52ca5fbdab2a2d1f666dbc1410a905f
author: Noam Preil <noam@pixelhero.dev>
date: Thu Jul 4 19:26:12 EDT 2024
cache: restore bucket locking; fixes multiclient reading
--- a/cache.c
+++ b/cache.c
@@ -104,6 +104,17 @@
return -1;
}
+void
+cacheunlock(u16int arenaindex, u32int blockindex)
+{
+ u32int key = (arenaindex << 16) | blockindex;
+ bucket_t *bucket = getbucket(key);
+ int index;
+ while(trybucket(bucket, &index, key) != 1)
+ bucket++;
+ unlock(bucket);
+}
+
// Looks up a cache entry. If the entry is not present in cache,
// this grabs a free buffer, inserts it into cache, and yields
// its index so that the caller may fill it up.
@@ -117,9 +128,11 @@
bucket_t *bucket = getbucket(key);
usize tried = 0;
while(tried < NBUCKETS){
+ lock(bucket);
switch(trybucket(bucket, &i, key)){
case -1:
// miss. linear probe.
+ unlock(bucket);
tried+=1;
bucket++;
if(bucket == bucketend)
@@ -134,6 +147,7 @@
case 1:
// already exists
get(bucket, i, buf);
+ unlock(bucket);
return 1;
}
}
--- a/disk.c
+++ b/disk.c
@@ -69,9 +69,11 @@
u16int key = s_sect->cacheindex + (bucket >> 16);
if(!cachelookup((char**)&buf, key, bucket & 0xffff)){
if(pread(s_sect->fd, (char*)buf, s_sect->blocksize, s_sect->blockbase + (bucket << s_sect->blocklog)) != s_sect->blocksize){
+ cacheunlock(key, bucket & 0xffff);
werrstr("Failed to read bucket");
return 0;
}
+ cacheunlock(key, bucket & 0xffff);
}
if(s_sect->bucketmagic && U32GET(buf + 2) != s_sect->bucketmagic)
sysfatal("index is corrupt: invalid bucket magic: sect %ux, buck %ux", s_sect->bucketmagic, U32GET(buf + 2));
@@ -120,8 +122,10 @@
sysfatal("invalid blocksize %d\n", arena->blocksize);
if(!cachelookup(&buf, arena->index, blockindex)){
if(pread(arena->fd, buf, arena->blocksize, arena->base+(blockindex*arena->blocksize)) != arena->blocksize){
+ cacheunlock(arena->index, blockindex);
return nil;
}
+ cacheunlock(arena->index, blockindex);
}
return buf;
}
--- a/mkfile
+++ b/mkfile
@@ -14,7 +14,7 @@
vac:VE:
vacfs -h tcp!127.1!14011 vac:3a51e294f70a87dc6c12805d82787e0a8a3e117f
-fg:VE: ./$O.neoventi
+fg:VE: $O.neoventi
./$O.neoventi -c /dev/jasnah/arenas
run:VE: $O.neoventi
--- a/neoventi.h
+++ b/neoventi.h
@@ -135,7 +135,7 @@
void initindex(void);
void cacheinit(void);
int cachelookup(char **buf, u16int arenaindex, u32int blockindex);
-void cachedone(u16int arenaindex, u16int blockindex);
+void cacheunlock(u16int arenaindex, u32int blockindex);
extern char *arenapath;
extern char *isectpath;
--- a/notebook
+++ b/notebook
@@ -1399,3 +1399,312 @@
It needs to be bumped, though. Which makes sense; we've got two packet-sized buffers, and those are 64K each, so 128K isn't big enough. Let's bump mainstacksize to, I dunno, 256K should be plenty.
Yep, that's good. Cool!
+
+Though, if I run two vacfses of the same score, start one of them walking /n/vac, give it a minute, then start the otehr doing so, it works fine until the second one catches up (which of course it does because our cache ACTUALLY WORKS *cough* venti, take notes *cough*), at which point it crashes as expected due to the total lock of protection on the buckets
+
+but now i can test that again!!
+
+walk: /n/vac/active/.git/objects/86: read asked for 284e1b75fc49962be625ee7b0d85237a2201d682 got 6355d846c98f941707c
+
+...at least it didn't crash this time? :/
+
+panic: D2B called on non-block 81680df4 (double-free?)
+
+neoventi did, though, eventually >_<
+
+but... only sorta???
+
+one of the vacfses is still working. One of the procs panicced though???? What the HELL?
+
+ahhh, one of the procs handling connections crashed, but its panic _does not affect_ the other procs for some reason?
+% acid 8503
+/proc/8503/text:arm64 plan 9 executable
+/sys/lib/acid/port
+/sys/lib/acid/arm64
+acid: lstk()
+abort()+0xc /sys/src/libc/9sys/abort.c:6
+ppanic(fmt=0x362b1)+0x124 /sys/src/libc/port/malloc.c:162
+ pv=0x300f0
+ msg=0x368d0
+ v=0x81680d28
+ n=0x81680d280000002f
+D2B(v=0x81680df4)+0x70 /sys/src/libc/port/pool.c:926
+ a=0x81680dec
+poolfreel(v=0x81680df4,p=0x34f40)+0x20 /sys/src/libc/port/pool.c:1152
+ ab=0x300f0
+poolfree(p=0x34f40,v=0x81680df4)+0x38 /sys/src/libc/port/pool.c:1287
+free()+0x1c /sys/src/libc/port/malloc.c:250
+readclump(dst=0x816a0f2c,addr=0x145880)+0xe4 /usr/glenda/src/disk/neoventi/disk.c:170
+ buf=0x1292060d67435ec9
+ size=0x10e74000012a6
+vtread(buf=0x816a0f28,conn=0xfefefefe00000009)+0x98 /usr/glenda/src/disk/neoventi/server.c:90
+ addr=0x145880
+vtconnhandle(buf=0x816a0f28,conn=0xfefefefe00000009)+0xf8 /usr/glenda/src/disk/neoventi/server.c:100
+vtloop(conn=0xfefefefe00000009,packetbuf=0x816a0f28)+0x1dc /usr/glenda/src/disk/neoventi/server.c:161
+ sz=0xfefe0000fefe001c
+ buf=0xec310706000c1a00
+ offset=0xc1a00fefe0000
+ len=0xfefe001cfefe001a
+handleproc()+0x2d0 /usr/glenda/src/disk/neoventi/server.c:183
+ conn=0xfefefefe00000009
+ buf=0x7afc5656000c1a00
+launcherarm64(arg=0x9,f=0x11298)+0xc /sys/src/libthread/arm64.c:8
+launcherarm64(arg=0x9,f=0x11298)+0xfffffffffffffff8 /sys/src/libthread/channel.c:583
+abort /sys/src/libc/9sys/abort.c:4
+
+
+Oh my _gosh_. That fre should not exist, that's a static buffer!!
+
+Decompression failed due to some race in the cache, I think :/
+
+sbrk is shared between the procs, it has to be or none of the reads would succeed.
+
+Ohhh, right, inserting the buffer into cache doesn't mean we've read into it yet, need to implement the cacheunlock API properly.
+
+Okay, two clients both walking at once annnnnd seems good! :D
+
+added two more vacfses and it seems stable :)
+
+though... one of the first two is hanging. um.
+
+cool. Did a fresh test with three vacfses and two are hanging. drugs to the rescue!
+
+% ps | grep vacfs
+glenda 6575 0:00 0:00 2944K Pread vacfs
+glenda 7024 0:05 0:01 3008K Pread vacfs
+glenda 7045 0:05 0:01 3008K Pread vacfs
+glenda 7597 0:06 0:02 3008K Pread vacfs
+glenda 8492 0:15 0:05 3008K Pread vacfs
+glenda 8504 0:00 0:00 3008K Pread vacfs
+glenda 10666 0:06 0:02 3008K Pread vacfs
+glenda 10697 0:10 0:03 3008K Pread vacfs
+glenda 11284 0:03 0:01 3008K Pread vacfs
+glenda 11331 0:03 0:01 3008K Pread vacfs
+glenda 12268 0:05 0:02 3008K Pread vacfs
+glenda 12286 0:05 0:02 3008K Pread vacfs
+glenda 12319 0:06 0:02 3008K Pread vacfs
+
+Three most recent are probably the way to go.
+
+/proc/12286/text:arm64 plan 9 executable
+/sys/lib/acid/port
+/sys/lib/acid/arm64
+acid: lstk()
+pread()+0xc /sys/src/libc/9syscall/pread.s:6
+read(buf=0x5be7e,n=0x4000000002)+0x18 /sys/src/libc/9sys/read.c:7
+_vtrecv(z=0x5aa78)+0x74 /sys/src/libventi/send.c:82
+ p=0x5b860
+ size=0x5b86000000000
+ b=0x5be7e
+ n=0x5be7e00000000
+ buf=0x5aa7800000000
+ len=0x2a16c
+vtrecv(z=0x5aa78)+0x8c /sys/src/libventi/send.c:209
+
+Cool, blocked on neoventi. That makes sense. Ish.
+
+Neoventi?
+
+glenda 12250 0:00 0:01 397696K Open 7.neoventi
+glenda 12267 0:01 0:14 397696K Sleep 7.neoventi
+glenda 12285 0:01 0:10 397696K Sleep 7.neoventi
+glenda 12318 0:01 0:14 397696K Pread 7.neoventi
+
+Probably the two that are asleep?
+
+sleep()+0xc /sys/src/libc/9syscall/sleep.s:6
+lock(lk=0x810a0440)+0xb0 /sys/src/libc/arm64/lock.c:28
+ i=0x11a58000003e8
+cachelookup(arenaindex=0x6be,blockindex=0x1938,buf=0x81690df8)+0x3c /usr/glenda/src/disk/neoventi/cache.c:128
+ key=0x806be1938
+ tried=0x1
+ bucket=0x810a0440
+
+Sure seems that way!!
+
+I'm not grabbing the arena block while holding an index lock, am I? ...even if I were, that wouldn't double-lock.
+
+That's from vtreadlookup. Let's check the other proc?
+
+vtreadlookup(score=0x81660ec4,addr=0x81650de0)+0x94 /usr/glenda/src/disk/neoventi/disk.c:70
+ bucket=0x321938
+
+Both are lookups of that bucket.
+
+...oh, shoot, did it linearly probe to a different bucket? I need to linear probe to make sure I'm unlocking the correct bucket.
+
+and also unlock() should error if already unlocked, no??
+
+void
+unlock(Lock *lk)
+{
+ lk->val = 0;
+}
+
+NO?!
+
+Fuck that, I'm making it do so on my branch.
+
+--- a/sys/src/libc/port/lock.c
++++ b/sys/src/libc/port/lock.c
+@@ -35,5 +35,7 @@
+ void
+ unlock(Lock *lk)
+ {
++ if(canlock(lk))
++ sysfatal("double-unlock!");
+ lk->val = 0;
+ }
+
+Should be easy to test, if my hypothesis about the neoventi bug is correct anyways.
+
+...okay, so that didn't get triggered, though two are hanging. hmmm. Acid time :/
+
+glenda 15888 0:01 0:09 397696K Sleep 7.neoventi
+glenda 15908 0:01 0:13 397696K Sleep 7.neoventi
+
+/proc/15888/text:arm64 plan 9 executable
+/sys/lib/acid/port
+/sys/lib/acid/arm64
+acid: sleep()+0xc /sys/src/libc/9syscall/sleep.s:6
+lock(lk=0x810a0440)+0x8c /sys/src/libc/arm64/lock.c:24
+ i=0x11a58000002f5
+cachelookup(arenaindex=0x6be,blockindex=0x1938,buf=0x81650d90)+0x3c /usr/glenda/src/disk/neoventi/cache.c:128
+ key=0x806be1938
+ tried=0x1
+ bucket=0x810a0440
+ i=0x8
+vtreadlookup(score=0x81660ec4,addr=0x81650de0)+0x94 /usr/glenda/src/disk/neoventi/disk.c:70
+ bucket=0x321938
+ s_sect=0xf1df8
+ buf=0x44738
+ key=0xf1df8000006be
+ entry=0x4473800000000
+vtread(buf=0x81660ec0,conn=0xfefefefe00000008)+0x14 /usr/glenda/src/disk/neoventi/server.c:88
+ addr=0x8
+vtconnhandle(buf=0x81660ec0,conn=0xfefefefe00000008)+0xf8 /usr/glenda/src/disk/neoventi/server.c:100
+vtloop(conn=0xfefefefe00000008,packetbuf=0x81660ec0)+0x1dc /usr/glenda/src/disk/neoventi/server.c:161
+ sz=0xfefe0000fefe001c
+ buf=0x78a7bfe5000c1a00
+ offset=0xc1a00fefe0000
+ len=0xfefe001cfefe001a
+handleproc()+0x2d0 /usr/glenda/src/disk/neoventi/server.c:183
+ conn=0xfefefefe00000008
+ buf=0x78a7bfe5000c1a00
+launcherarm64(arg=0x8,f=0x11298)+0xc /sys/src/libthread/arm64.c:8
+launcherarm64(arg=0x8,f=0x11298)+0xfffffffffffffff8 /sys/src/libthread/channel.c:583
+lock+0x8c /sys/src/libc/arm64/lock.c:24
+acid:
+
+/proc/15908/text:arm64 plan 9 executable
+/sys/lib/acid/port
+/sys/lib/acid/arm64
+acid: sleep()+0xc /sys/src/libc/9syscall/sleep.s:6
+lock(lk=0x810a0440)+0x8c /sys/src/libc/arm64/lock.c:24
+ i=0x11a58000003a7
+cachelookup(arenaindex=0x6be,blockindex=0x1938,buf=0x81690df8)+0x3c /usr/glenda/src/disk/neoventi/cache.c:128
+ key=0x806be1938
+ tried=0x1
+ bucket=0x810a0440
+ i=0x8
+vtreadlookup(score=0x816a0f2c,addr=0x81690e48)+0x94 /usr/glenda/src/disk/neoventi/disk.c:70
+ bucket=0x321938
+ s_sect=0xf1df8
+ buf=0x45c78
+ key=0xf1df8000006be
+ entry=0x45c7800000000
+vtread(buf=0x816a0f28,conn=0xfefefefe00000009)+0x14 /usr/glenda/src/disk/neoventi/server.c:88
+ addr=0x9
+vtconnhandle(buf=0x816a0f28,conn=0xfefefefe00000009)+0xf8 /usr/glenda/src/disk/neoventi/server.c:100
+vtloop(conn=0xfefefefe00000009,packetbuf=0x816a0f28)+0x1dc /usr/glenda/src/disk/neoventi/server.c:161
+ sz=0xfefe0000fefe001c
+ buf=0x78a7bfe5000c1a00
+ offset=0xc1a00fefe0000
+ len=0xfefe001cfefe001a
+handleproc()+0x2d0 /usr/glenda/src/disk/neoventi/server.c:183
+ conn=0xfefefefe00000009
+ buf=0x78a7bfe5000c1a00
+launcherarm64(arg=0x9,f=0x11298)+0xc /sys/src/libthread/arm64.c:8
+launcherarm64(arg=0x9,f=0x11298)+0xfffffffffffffff8 /sys/src/libthread/channel.c:583
+lock+0x8c /sys/src/libc/arm64/lock.c:24
+acid:
+echo kill > /proc/15908/ctl
+
+Okay, so both are on bucket 0x810a0440, after a linear probe. I _know_ I'm not unlocking the correct bucket. I assumed maybe this was racing the other proc and had unlocked a bucket locked by the other, though that's laughable in hindsight.
+
+Probably either didn't apply the libc patch properly or that patch doesn't work...
+
+Tested the patch on its own and it's not working?? da heck?
+
+...waaaait a sec is there an asm impl?
+
+arm64/lock.c. ah.
+
+yep, now that patch works :P
+
+So now this should just easily confirm that suspicion.
+
+annnnd I crashed acme by using up all of RAM lol. had 800MiB of acme buffers >_< didn't lose anything other than probably a few lines in here. I should remember to turn autosave on.
+
+Annnd now to retest :P
+
+./7.neoventi: double unlock
+
+yep, as expected :)
+
+probably shouldn't sysfatal. abort would be better so acid works.
+
+double-unlock!
+7.neoventi 24985: suicide: sys: trap: fault read addr=0x0 pc=0x13fe4
+
+<lstk 24985
+/proc/24985/text:arm64 plan 9 executable
+/sys/lib/acid/port
+/sys/lib/acid/arm64
+acid: abort()+0xc /sys/src/libc/9sys/abort.c:6
+unlock(lk=0x810a0440)+0x20 /sys/src/libc/arm64/lock.c:41
+cacheunlock(blockindex=0x1938)+0x18 /usr/glenda/src/disk/neoventi/cache.c:111
+vtreadlookup(score=0x81660ed4,addr=0x81650df0)+0x128 /usr/glenda/src/disk/neoventi/disk.c:76
+ bucket=0x321938
+ s_sect=0xf1e08
+ buf=0x6d2f2e90
+ key=0xf1e08000006be
+ entry=0x6d2f2e9000000000
+vtread(buf=0x81660ed0,conn=0xfefefefe00000008)+0x14 /usr/glenda/src/disk/neoventi/server.c:88
+ addr=0x8
+vtconnhandle(buf=0x81660ed0,conn=0xfefefefe00000008)+0xf8 /usr/glenda/src/disk/neoventi/server.c:100
+vtloop(conn=0xfefefefe00000008,packetbuf=0x81660ed0)+0x1dc /usr/glenda/src/disk/neoventi/server.c:161
+ sz=0xfefe0000fefe001c
+ buf=0x78a7bfe5000c1a00
+ offset=0xc1a00fefe0000
+ len=0xfefe001cfefe001a
+handleproc()+0x2d0 /usr/glenda/src/disk/neoventi/server.c:183
+ conn=0xfefefefe00000008
+ buf=0x78a7bfe5000c1a00
+launcherarm64(arg=0x8,f=0x11298)+0xc /sys/src/libthread/arm64.c:8
+launcherarm64(arg=0x8,f=0x11298)+0xfffffffffffffff8 /sys/src/libthread/channel.c:583
+abort /sys/src/libc/9sys/abort.c:4
+acid:
+echo kill > /proc/24985/ctl
+
+Voila, now this is trivial to find :D
+void
+cacheunlock(u16int arenaindex, u32int blockindex)
+{
+ u32int key = (arenaindex << 16) | blockindex;
+ bucket_t *bucket = getbucket(key);
+ int index;
+ while(trybucket(bucket, &index, key) != 1)
+ bucket++;
+ unlock(bucket);
+}
+
+That should work...
+
+deliberately treating buckets with openings as needing probing in case a key got removed by eviction.
+
+...though that meanse trybucket needs to check the whole bucket even if it finds a gap, doesn't it, if I don't want to move the end of the bucket forwards, which I don't think I do.
+
+on the other hand, evictions should be relatively rare, since I plan on S3-FIFOing.
+
+Also yeah that totally fixed it :D