ref: 28b95adea25a97bb212fae8e7692ff6f7e7b569f
parent: 277d532ae8c8f7d9c40874c91866ba595dc14671
author: Ori Bernstein <ori@eigenstate.org>
date: Sat Mar 23 13:00:09 EDT 2024
gefs: document disk format, and remove some cruft
--- a/TODO
+++ b/TODO
@@ -7,6 +7,7 @@
*** nice to have, can go without ***
- non-adm controlled snapshots
+- custom snapshot timing
- reclaiming data from deleted files is very delayed
- performance optimization:
- bulk block frees
--- a/dat.h
+++ b/dat.h
@@ -102,7 +102,6 @@
/* snapshot keys */
Klabel, /* name[] => snapid[]: snapshot label */
- Ktref, /* tag[8] = snapid[] scratch snapshot label */
Ksnap, /* sid[8] => ref[8], tree[52]: snapshot root */
Kdlist, /* snap[8] gen[8] => hd[ptr],tl[ptr] deadlist */
};
@@ -253,7 +252,6 @@
Tleaf,
Tlog,
Tdlist,
- Tmagic,
Tarena,
Tsuper = 0x6765, /* 'ge' bigendian */
};
--- a/fs.c
+++ b/fs.c
@@ -168,9 +168,9 @@
tracem("snapdl");
wrbarrier();
qunlock(&fs->mutlk);
+ freedl(&dl, 1);
qunlock(&fs->synclk);
tracem("synced");
- freedl(&dl, 1);
poperror();
}
@@ -2407,6 +2407,8 @@
}
if(am->halt){
+ assert(fs->snapdl.hd.addr == -1);
+ assert(fs->snapdl.tl.addr == -1);
postnote(PNGROUP, getpid(), "halted");
exits(nil);
}
@@ -2523,7 +2525,7 @@
if(fs->rdonly)
continue;
if(waserror()){
- fprint(2, "task error: %s", errmsg());
+ fprint(2, "task error: %s\n", errmsg());
continue;
}
a = emalloc(sizeof(Amsg), 1);
--- a/gefs.ms
+++ b/gefs.ms
@@ -21,7 +21,7 @@
On power loss, the file systems may get corrupted.
Partial disk failure is not caught by the file system, and reads may silently return incorrect data.
They tend to require a large, unshrinkable disk for archival dumps, and behave poorly when the disk fills.
-Additionally, all of them do O(n) scans to look up files in directories when walking to a file.
+Additionally, all of them perform O(n) scans to look up files in directories when walking to a file.
This causes poor performance in large directories.
.PP
CWFS, the default file system on 9front, has proven to be performant and reliable, but is not crash safe.
@@ -46,7 +46,7 @@
If the file server crashes before the superblocks are updated,
then the next mount will see the last commit that was synced to disk.
Some data may be lost, but no corruption will occur.
-Furthermore, because of the use of an indexed data structures, directories do not suffer from O(n) lookups,
+Furthermore, because of the use of an indexed data structure, directories do not suffer from O(n) lookups,
solving a long standing performance issue with large directories.
.PP
The file system is based around a relatively novel data structure: the Bε tree [1].
@@ -91,8 +91,8 @@
inserted into its write buffer.
When the write buffer is full, it is inspected, and the number of messages directed
to each child is counted up.
-The child with the largest number of pending writes is picked as the victim, and
-the root's write buffer is flushed towards it.
+The child with the largest number of pending writes is picked as the victim.
+The root's write buffer is flushed into the selected victim.
This proceeds recursively down the tree, until either an intermediate node has
sufficient space in its write buffer, or the messages reach a leaf node, at which
point the value in the leaf is updated.
@@ -189,7 +189,7 @@
For the sake of simplicity, GEFS makes all blocks the same size.
This implies that the Bε tree blocks are smaller than optimal,
and the disk blocks are larger than optimal.
-The simplifications this allows in the block layer appear to be worth it.
+The simplifications this allows in the block layer appear to be worthwhile.
.PP
Within a single block, the pivot keys are stored as offsets to variable width data.
The data itself is unsorted, but the offsets pointing to it are sorted.
@@ -383,7 +383,7 @@
must approach the bandwidth spent on metadata updates, as each block must be scanned and then reclaimed.
Reference counting implies a large number of scattered writes to maintain the reference counts of blocks.
.PP
-As a result, the algorithm for space reclamation is lifted from ZFS [6].
+As a result, the algorithm for space reclamation is borrowed from ZFS [6].
It is based on the idea of using deadlists to track blocks that became free within a snapshot.
If snapshots are immutable, then a block may not be freed as long as a snapshot exists.
This implies that block lifetimes are contiguous.
@@ -405,7 +405,7 @@
So, in order to do better, we can keep track of blocks that we want to delete from this snapshot as we delete them,
instead of trying to reconstruct the list when we delete the snapshot.
When we attempt to delete a block, there are two cases:
-First, block's birth time may be newer than the previous snapshot, in which case it may be freed immediately.
+First, the block's birth time may be newer than the previous snapshot, in which case it may be freed immediately.
And second, the block may have been born in the previous snapshot or earlier, in which case we need to put it on the current
snapshot's deadlist.
When the current snapshot is deleted, the current snapshot's deadlist is merged with the next snapshot's deadlist.
@@ -454,7 +454,7 @@
may be reclaimed.
With this approach, the only lists that need to be scanned are the ones consisting wholly of blocks that must be freed.
.PP
-All of this asumes that there is a single, linear history of snapshots.
+All of this assumes that there is a single, linear history of snapshots.
However, GEFS allows users to take mutable snapshots off of any label, which breaks the assumption.
If the assumption is broken, two different mutable labels may kill the same block,
which would lead to double frees.
@@ -520,7 +520,7 @@
.PE
.PP
The disadvantage of this approach is that appending to the deadlists may need more random writes.
-This is because in the worst case, blocks deleted may be scattered across a large number of generations.
+This is because, in the worst case, blocks deleted may be scattered across a large number of generations.
It seems likely that in practice, most bulk deletions will touch files that were written in a small number of generations,
and not scattered across the whole history of the disk.
.PP
@@ -582,13 +582,13 @@
GEFS is implemented in a multiprocess manner.
There are six types of proc that GEFS uses for its operation:
The
-.IR console ,
-.IR dispatch ,
-.IR mutation ,
-.IR sweeper ,
-.IR reader ,
+.I console ,
+.I dispatch ,
+.I mutation ,
+.I sweeper ,
+.I reader ,
and
-.IR sycer .
+.I syncer .
Most of these processes can be replicated,
however, there may only be one
.IR mutator ,
@@ -664,7 +664,7 @@
arrow from S.S0.e to F.S0.w
arrow from S.S0.e to F.S1.w
arrow from S.S0.e to F.S2.w
-arrow from S.M0.s to S.S0.n
+arrow from S.M0.n to S.S0.s
.PE
.PP
Because the file system is copy on write,
@@ -710,7 +710,424 @@
This epoch based approach allows GEFS to avoid contention between writes and reads.
A writer may freely mutate the tree as multiple readers traverse it, with no locking between the processes,
beyond what is required for the 9p implementation.
-There is still contention on the FID table, the block cache, and a number of other in-memory data structures.
+There is still contention on the FID table, the block cache,
+and a number of other in-memory data structures.
+.NH 1
+Appendix A: Data Formats
+.PP
+The formats used for GEFS on-disk storage are described below.
+The data structures described are
+superblocks, arena headers, tree nodes, and tree values.
+.PP
+All blocks except raw data blocks begin with a 2-byte header.
+The superblock header is chosen such that it coincides with
+the ASCII representation of 'ge'.
+.PP
+All numbers in GEFS are big-endian integers, byte-packed.
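+.PP
+As a concrete illustration,
+a sketch of a PACK64-style helper follows;
+the macros in the gefs source may be defined differently:
+.P1
+/* store a 64-bit value big-endian, most significant byte first */
+p[0] = v>>56;  p[1] = v>>48;  p[2] = v>>40;  p[3] = v>>32;
+p[4] = v>>24;  p[5] = v>>16;  p[6] = v>>8;   p[7] = v;
+.P2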
+.PP
+The headers are listed below:
+.TS
+allbox center;
+c c
+c l.
+Value Description
+0 Unused
+1 Pivot node
+2 Leaf node
+3 Allocation log
+4 Deadlist log
+5 Arena Header
+0x6765 Superblock header
+.TE
+.NH 2
+Superblocks
+.PP
+Superblocks are the root of the file system,
+containing all information needed to load it.
+There is one superblock at offset 0,
+and one superblock at the last block of the file system.
+These two superblocks are duplicates,
+and only one intact superblock is needed to successfully load GEFS.
+Because the superblock must fit into a single block,
+the pointers to all of the arenas must also fit into it.
+This imposes an upper bound on the arena count.
+With 16k blocks, this natural limit is approximately 1000 arenas.
+GEFS imposes a smaller limit internally, allowing 256 arenas by default.
+.IP
+.I header[8]
+= "gefs9.00"
+.br
+.I blksz[4] ": the block size for this file system"
+.br
+.I bufspc[4] ": the buffer space for this file system"
+.br
+.I narena[4] ": the number of arenas"
+.br
+.I snap.ht[4] ": the height of the snapshot tree"
+.br
+.I snap.addr[8] ": the root block of the snapshot tree"
+.br
+.I snap.hash[8] ": the hash of the snapshot tree root"
+.br
+.I snapdl.hd.addr[8] ": the address of the snap deadlist head"
+.br
+.I snapdl.hd.hash[8] ": the hash of the snap deadlist head"
+.br
+.I snapdl.tl.addr[8] ": the address of the snap deadlist tail"
+.br
+.I snapdl.tl.hash[8] ": the hash of the snap deadlist tail"
+.br
+.I flags[8] ": flags for future expansion"
+.br
+.I nextqid[8] ": the next qid that will be allocated"
+.br
+.I nextgen[8] ": the next generation number that will be written"
+.br
+.I qgen[8] ": the last queue generation synced to disk"
+.br
+.I "arena0.addr[8], arena0.hash[8]" ": the location of the 0th arena"
+.br
+.I "arena1.addr[8], arena1.hash[8]" ": the location of the 1st arena
+.br
+.I ...
+.br
+.I "arenaN.addr[8], arenaN.hash[8]" ": the location of the N'th arena"
+.br
+.I sbhash[8] ": hash of superblock contents up to the last arena"
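+.PP
+As a sketch, decoding the fixed-width fields follows the order
+given above, mirroring the unpacking code in pack.c
+(the variable names here are illustrative):
+.P1
+if(memcmp(p, "gefs9.00", 8) != 0)
+	error("bad superblock header");
+p += 8;
+blksz = UNPACK32(p);	p += 4;
+bufspc = UNPACK32(p);	p += 4;
+narena = UNPACK32(p);	p += 4;
+snapht = UNPACK32(p);	p += 4;
+snapaddr = UNPACK64(p);	p += 8;
+snaphash = UNPACK64(p);	p += 8;
+/* ...and so on, through the arena pointers and sbhash */
+.P2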
+.NH 2
+Arenas
+.PP
+An arena header contains the freelist, the arena size,
+and (for debugging) the amount of space used within the arena.
+.IP
+.I type[2]
+= Tarena
+.br
+.I free.addr[8] ": the address of the start of the freelist"
+.br
+.I free.hash[8] ": the hash of the start of the freelist"
+.br
+.I size[8] ": the size of the arena"
+.br
+.I used[8] ": the amount of used space in the arena"
+.NH 2
+Logs
+.PP
+Logs are used to track allocations. They are the only structures
+mutated in place, and are therefore not fully merkleized.
+There are two types of log in GEFS: allocation logs and deadlists.
+They share a common structure, but contain slightly different data.
+.PP
+All logs share a common header:
+.IP
+.I type[2]
+= Tlog or Tdlist
+.br
+.I logsz[2] ": the amount of log space used"
+.br
+.I loghash[8] ": the hash of all data after the log header"
+.br
+.I chainp[24] ": the block pointer this log block chains to"
+.NH 3
+Allocation Logs
+.PP
+When the type of a log block is Tlog,
+the contents of the block are formatted as an allocation log.
+In an allocation log, each entry is either a single u64int,
+recording an allocation or free of a single block,
+or a pair of u64ints, representing an operation on a range of blocks.
+.PP
+The operations are listed below:
+.LP
+.TS
+allbox center;
+c c
+c l.
+Value Description
+1 Allocate 1 block
+2 Free 1 block
+3 Sync barrier
+4 Alloc block range
+5 Free block range
+.TE
+Operations are packed with the operation in the low-order byte.
+The rest of the value is packed in the upper bits.
+For multi-block operations, the range length is packed into a
+second 64-bit word following the first.
+.IP
+.P1
+/* block addresses are block-aligned, leaving the low byte free */
+PACK64(logent, addr|op);
+if(op == 4 || op == 5)		/* alloc/free of a block range */
+	PACK64(logent+8, len);
+.P2
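+.PP
+Conversely, a reader splits an entry back into its parts.
+A hedged sketch, using the unpacking helpers shown in pack.c:
+.P1
+v = UNPACK64(logent);
+op = v & 0xff;			/* operation in the low byte */
+addr = v & ~(uvlong)0xff;	/* block address in the upper bits */
+if(op == 4 || op == 5)		/* ranges carry a second word */
+	len = UNPACK64(logent+8);
+.P2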
+.NH 3
+Deadlist Logs
+.PP
+Deadlist logs are simpler than allocation logs.
+They only contain a flat list of blocks that have been killed.
+.NH 2
+The Tree
+.PP
+The tree is composed of two types of block:
+pivot blocks and leaf blocks.
+Both are described below.
+.NH 3
+Pivot Blocks
+.PP
+Pivot blocks contain the inner nodes of the tree.
+They have the following header. The layout is as
+described earlier in the paper.
+.IP
+.I type[2] " = Tpivot"
+.br
+.I nval[2] ": the count of values"
+.br
+.I valsz[2] ": the number of bytes of value data"
+.br
+.I nbuf[2] ": the count of buffered messages"
+.br
+.I bufsz[2] ": the number of bytes of buffered messages"
+.PP
+.NH 3
+Pivot Block Layout
+.PP
+Within the block, the first half of the space after
+the header contains a key/pointer set. The head of
+the space contains an array of 2-byte offsets to keys,
+and the tail of the space contains a packed set of keys
+and block pointers.
+.PP
+The offset table is simple:
+.IP
+.I off[2*nval] ": the offset table"
+.PP
+The keys and pointers are slightly more complicated.
+They contain a length-prefixed key, and a pointer
+to the child block for that key.
+.IP
+.I nkey[2] ": the length of the key"
+.br
+.I key[nkey] ": the key data"
+.br
+.I addr ": the address of the pointed to block"
+.br
+.I hash ": the hash of the pointed to block"
+.br
+.I gen ": the generation number of the pointed to block"
+.PP
+The second half of the space consists of messages
+directed to a value in the leaf. This is formatted
+similarly to the key/pointer set, but instead of
+offsets to key/pointer pairs, the offsets point
+to messages.
+.PP
+The array of offsets grows towards the end of the block,
+and the array of values or messages grows towards the start of the block.
+.PP
+The offset table is the same, except that instead of
+having
+.I nval
+entries, it has
+.I nbuf
+entries.
+.IP
+.I off[2*nbuf]
+.PP
+Each message consists of a single-byte opcode,
+a target key, and a value that encodes an incremental
+update to the target.
+.IP
+.I op[1] ": the message operation"
+.br
+.I nkey[2] ": the length of the target key"
+.br
+.I key[nkey] ": the contents of the target key"
+.br
+.I nval[2] ": the length of the message"
+.br
+.I val[nval] ": the contents of the message"
+.NH 3
+Leaf Blocks
+.PP
+Leaf blocks contain the leaf nodes of the tree.
+They have the following header. The layout is as
+described earlier in the paper.
+.IP
+.I type[2] ": the block type"
+Tleaf
+.I nval[2] ": the number of key value pairs"
+.br
+.I valsz[2] ": the size of the key value pairs"
+.PP
+Within a leaf, the layout is very similar to a pivot.
+There is a table of key-value offsets,
+and an array of packed messages.
+As before,
+the array of offsets grows towards the end of the block,
+and the array of values grows towards the start of the block.
+.IP
+.I off[2*nval] ": the offset table"
+.PP
+Each key-value pair is encoded as below:
+.IP
+.I nkey[2] ": the length of the key"
+.br
+.I key[nkey] ": the contents of the key"
+.br
+.I nval[2] ": the length of the value"
+.br
+.I val[nval] ": the contents of the value"
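+.PP
+Because the offsets are sorted by the keys they point to,
+a lookup may binary search the offset table while the packed
+data itself stays unsorted.
+A hedged sketch, with cmpkey, tab, blk, and the unpacking
+helpers assumed:
+.P1
+lo = 0;
+hi = nval;
+while(lo < hi){
+	m = (lo + hi)/2;
+	o = UNPACK16(tab + 2*m);
+	nk = UNPACK16(blk + o);
+	c = cmpkey(key, nkey, blk + o + 2, nk);
+	if(c == 0)
+		return blk + o;	/* key found; value packed after it */
+	if(c < 0)
+		hi = m;
+	else
+		lo = m + 1;
+}
+return nil;	/* not present in this leaf */
+.P2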
+.NH 2
+Keys and Values
+.PP
+In GEFS, keys begin with a single type byte,
+followed by data in a format fixed by that type.
+The known key types are listed below:
+.PP
+.I "Kdat qid[8] off[8]"
+describes a pointer to a data block.
+The value for this data key must be a block pointer.
+Block pointers are encoded as
+.I "addr[8] hash[8] gen[8]" .
+This entry is only valid in file system trees.
+.PP
+.I "Kent pqid[8] name[n]"
+describes a pointer to a file entry (stat structure).
+The value must be the body of a dir structure.
+This entry is only valid in file system trees.
+The dir structure is laid out as below:
+.IP
+.I flag[8] ": flags for future expansion"
+.br
+.I qid.path[8] ": the qid path"
+.br
+.I qid.vers[4] ": the qid version"
+.br
+.I qid.type[1] ": the qid type"
+.br
+.I mode[4] ": the permission bits"
+.br
+.I atime[8] ": the access time"
+.br
+.I mtime[8] ": the modification time"
+.br
+.I length[8] ": the file size"
+.br
+.I uid[4] ": the owning user id"
+.br
+.I gid[4] ": the owning group id"
+.br
+.I muid[4] ": the last user that modified the file"
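+.PP
+A sketch of decoding the packed dir body with the same helpers
+(again, the variable names are illustrative):
+.P1
+flag = UNPACK64(p);	p += 8;
+qpath = UNPACK64(p);	p += 8;
+qvers = UNPACK32(p);	p += 4;
+qtype = *p;		p += 1;
+mode = UNPACK32(p);	p += 4;
+atime = UNPACK64(p);	p += 8;
+mtime = UNPACK64(p);	p += 8;
+length = UNPACK64(p);	p += 8;
+uid = UNPACK32(p);	p += 4;
+gid = UNPACK32(p);	p += 4;
+muid = UNPACK32(p);	p += 4;
+.P2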
+.PP
+.I "Kup qid[8]"
+describes a pointer to a parent directory.
+The value is the
+.I Kent
+key of the containing directory.
+It is only present for directories.
+This entry is only valid in file system trees.
+.PP
+.I "Klabel name[]"
+describes a label for a snapshot.
+The value is a
+.I snapid[8] ,
+referring to a snapshot indexed by
+.I Ksnap .
+This key is only valid in snapshot trees.
+.PP
+.I "Ksnap snapid[8]"
+describes a key referring to a snapshot tree.
+The value is a tree entry.
+The tree entry is formatted as follows:
+.IP
+.I nref[4] ": the number of references from other trees"
+.br
+.I nlbl[4] ": the number of references from labels"
+.br
+.I ht[4] ": the height of the tree"
+.br
+.I flag[4] ": flags for future expansion"
+.br
+.I gen[8] ": the tree generation number"
+.br
+.I pred[8] ": the predecessor snapshot"
+.br
+.I succ[8] ": the successor snapshot"
+.br
+.I base[8] ": the base snapshot"
+.br
+.I bp.addr[8] ": the address of the root block"
+.br
+.I bp.hash[8] ": the hash of the root block"
+.br
+.I bp.gen[8] ": the generation of the root block"
+.PP
+.I "Kdlist snap[8] gen[8]"
+describes a key referring to a deadlist.
+The
+.I snap
+field refers to the snapshot that the deadlist belongs to,
+and the
+.I gen
+field refers to the birth generation of the blocks on the deadlist.
+The value of the deadlist entry is a pair of block pointers,
+pointing to the head and tail of the block list.
+.NH 2
+Messages
+.PP
+.I Oinsert
+and
+.I Odelete
+can have any key/value pair as an operand.
+They replace or remove the key/value pair, respectively.
+.PP
+.I Oclearb
+inserts a deferred free of a block,
+without reading it first.
+It has no value, but the key must be a
+.I Kdat
+key.
+.PP
+.I Oclobber
+is similar to
+.I Oclearb ,
+but its operand must be a
+.I Kent
+key.
+.PP
+.I Owstat
+updates an existing file entry.
+The key of an
+.I Owstat
+message must be a
+.I Kent ,
+and the value is a bit field of fields to update,
+along with the new values.
+The first byte is a set of wstat flags, and the
+remaining data consists of the packed values associated
+with each flag that is set:
+.IP
+.I "Owsize fsize[8]" ": update file size"
+.br
+.I "Owmode mode[4]" ": update file mode"
+.br
+.I "Owmtime mtime[8]" ": update mtime, in nsec"
+.br
+.I "Owatime atime[8]" ": update atime, in nsec"
+.br
+.I "Owuid uid[4]" ": set uid"
+.br
+.I "Owgid uid[4]" ": set gid"
+.br
+.I "Omuid uid[4]" ": set muid"
+.PP
+.I Orelink
+and
+.I Oreprev
+rechain snapshots.
+The key of either of these messages is a
+.I Ksnap ,
+and the operand is the ID of a new
+predecessor or successor snap.
.NH 1
References
.LP
--- a/pack.c
+++ b/pack.c
@@ -442,9 +442,10 @@
assert(sz == Blksz);
assert(fi->narena < 512);
p = p0;
- memcpy(p, "gefs0009", 8); p += 8;
+ memcpy(p, "gefs9.00", 8); p += 8;
PACK32(p, Blksz); p += 4;
PACK32(p, Bufspc); p += 4;
+ PACK32(p, fi->narena); p += 4;
PACK32(p, fi->snap.ht); p += 4;
PACK64(p, fi->snap.bp.addr); p += 8;
PACK64(p, fi->snap.bp.hash); p += 8;
@@ -452,7 +453,6 @@
PACK64(p, fi->snapdl.hd.hash); p += 8;
PACK64(p, fi->snapdl.tl.addr); p += 8;
PACK64(p, fi->snapdl.tl.hash); p += 8;
- PACK32(p, fi->narena); p += 4;
PACK64(p, fi->flags); p += 8;
PACK64(p, fi->nextqid); p += 8;
PACK64(p, fi->nextgen); p += 8;
@@ -475,11 +475,12 @@
assert(sz == Blksz);
p = p0;
- if(memcmp(p, "gefs0009", 8) != 0)
+ if(memcmp(p, "gefs9.00", 8) != 0)
error("%s %.8s", Efsvers, p);
p += 8;
fi->blksz = UNPACK32(p); p += 4;
fi->bufspc = UNPACK32(p); p += 4;
+ fi->narena = UNPACK32(p); p += 4;
fi->snap.ht = UNPACK32(p); p += 4;
fi->snap.bp.addr = UNPACK64(p); p += 8;
fi->snap.bp.hash = UNPACK64(p); p += 8;
@@ -490,7 +491,6 @@
fi->snapdl.tl.addr = UNPACK64(p); p += 8;
fi->snapdl.tl.hash = UNPACK64(p); p += 8;
fi->snapdl.gen = -1; p += 0;
- fi->narena = UNPACK32(p); p += 4;
fi->flags = UNPACK64(p); p += 8;
fi->nextqid = UNPACK64(p); p += 8;
fi->nextgen = UNPACK64(p); p += 8;