ref: da1daf301206b27d0385c0d4276c18eaa22e8e25
parent: 9fdbc87b8edfb64f28c6a985f123cf514b8827df
author: Ori Bernstein <ori@eigenstate.org>
date: Thu May 16 16:47:43 EDT 2024
gefs: initial import
--- a/rc/bin/fshalt
+++ b/rc/bin/fshalt
@@ -31,6 +31,7 @@
c=`{ls /srv/cwfs*cmd >[2]/dev/null}
h=`{ls /srv/hjfs*cmd >[2]/dev/null}
e=`{ls /srv/ext4*cmd >[2]/dev/null}
+g=`{ls /srv/gefs*cmd >[2]/dev/null}
s=`{awk '/^sd./ {print substr($1,3,1)}' '#S/sdctl' >[2]/dev/null}
# for scram, don't scram other systems
@@ -66,9 +67,9 @@
fn x {
echo
echo -n halting...
- for(i in $c $h $e)
+ for(i in $c $h $e $g)
echo halt >>$i
- for(i in $c $h $e){
+ for(i in $c $h $e $g){
echo -n $i...
while(test -e $i)
sleep 1
--- /dev/null
+++ b/sys/doc/gefs.ms
@@ -1,0 +1,1179 @@
+.am DS
+.ft I
+..
+.ta 1i 2.3i 4.5i (optional to set tabs)
+.TL
+GEFS, A Good Enough File System
+.AU
+Ori Bernstein
+ori@eigenstate.org
+.AB
+GEFS is a new file system built for Plan 9.
+It aims to be a crash-safe, corruption-detecting, simple, and fast snapshotting file system, in that order.
+GEFS achieves these goals by building a traditional 9p file system interface on top of a forest of copy-on-write Bε trees.
+It doesn't try to be optimal on all axes, but good enough for daily use.
+.AE
+.NH 1
+The Current Situation
+.PP
+Plan 9 has several general purpose disk file systems available.
+While they have served us well, all of them leave much to be desired.
+On power loss, the file systems may get corrupted.
+Partial disk failure is not caught by the file system, and reads may silently return incorrect data.
+They tend to require a large, unshrinkable disk for archival dumps, and behave poorly when the disk fills.
+Additionally, all of them perform O(n) scans to look up files in directories when walking to a file.
+This causes poor performance in large directories.
+.PP
+CWFS, the default file system on 9front, has proven to be performant and reliable, but is not crash safe.
+While the root file system can be recovered from the dump, this is inconvenient and can lead to a large amount of lost data.
+It has no way to reclaim space from the dump.
+In addition, due to its age, it has a lot of historical baggage and complexity.
+.PP
+HJFS, a new experimental system in 9front, is extremely simple, with fewer lines of code than any of the other on-disk storage options.
+It has dumps, but does not separate dump storage from cache storage, allowing full use of small disks.
+However, it is extremely slow, not crash safe, and lacks consistency check and recovery mechanisms.
+.PP
+Finally, fossil, the default file system on 9legacy, is large and complicated.
+It uses soft-updates for crash safety[7], an approach that has worked poorly in practice for the BSD filesystems[8].
+While the bugs can be fixed as they're found, simplicity requires a rethink of the on disk data structures.
+And even after adding all this complexity, the fossil+venti system provides no way to recover space when the disk fills.
+.NH 1
+Why GEFS Is Good Enough
+.PP
+GEFS aims to solve these problems with the above file systems.
+The data and metadata is copied on write, with atomic commits.
+This happens by construction, with fewer subtle ordering requirements than soft updates.
+If the file server crashes before the superblocks are updated,
+then the next mount will see the last commit that was synced to disk.
+Some data may be lost, but no corruption will occur.
+Furthermore, because of the use of an indexed data structure, directories do not suffer from O(n) lookups,
+solving a long standing performance issue with large directories.
+.PP
+The file system is based around a relatively novel data structure: the Bε tree [1].
+The Bε tree is a write optimized variant of a B+ tree.
+In addition to good overall performance, it plays particularly nicely with copy on write semantics.
+This allows GEFS to greatly reduce write amplification seen with traditional copy on write B-trees.
+The reduced write amplification allows GEFS to get away with a nearly trivial implementation of snapshotting.
+.PP
+As a result of the choice of data structure, archival dumps are replaced with snapshots.
+Snapshots may be deleted at any time, allowing data within a snapshot to be reclaimed for reuse.
+To enable this, each block pointer contains a birth generation.
+Blocks are reclaimed using a deadlist algorithm inspired by ZFS.
+This algorithm is described later in the paper.
+.PP
+While snapshot consistency is useful to keep data consistent, disks often fail over time.
+In order to detect corruption, block pointers contain a hash of the data that they point at.
+If corrupted data is returned by the underlying storage medium, this is detected via the block hashes.
+And if a programmer error causes the file system to write garbage to disk, this can often be caught early.
+The corruption is reported, and the damaged data may then be recovered from backups, RAID restoration, or some other means.
+.PP
+By selecting a suitable data structure, a large amount of complexity elsewhere in the file system falls away.
+The complexity of the core data structure pays dividends.
+Being able to atomically update multiple attributes in the Bε tree,
+making the core data structure safely traversable without locks,
+and having a simple, unified set of operations makes everything else simpler.
+.NH 1
+Bε Trees: A Short Summary
+.PP
+The core data structure used in GEFS is a Bε tree.
+A Bε tree is a modification of a B+ tree, which optimizes writes
+by adding a write buffer to the pivot nodes.
+Like B-trees, Bε trees consist of leaf nodes, which contain keys and values, and pivot nodes.
+Like B-trees, the pivot nodes contain pointers to their children, which are either pivot nodes or leaf nodes.
+Unlike B-trees, the pivot nodes also contain a write buffer.
+.PP
+The Bε tree implements a simple key-value API, with point queries and range scans.
+It diverges from a traditional B-tree key value store by the addition of an upsert operation.
+Upsert operations are operations that insert a modification message into the tree.
+These modifications are addressed to a key.
+.PP
+To insert to the tree, the root node is copied, and the new message is
+inserted into its write buffer.
+When the write buffer is full, it is inspected, and the number of messages directed
+to each child is counted up.
+The child with the largest number of pending writes is picked as the victim.
+The root's write buffer is flushed into the selected victim.
+This proceeds recursively down the tree, until either an intermediate node has
+sufficient space in its write buffer, or the messages reach a leaf node, at which
+point the value in the leaf is updated.
+.PP
+In order to query a value, the tree is walked as normal, however the path to the
+leaf node is recorded.
+When a value is found, the write buffers along the path to the root are inspected,
+and any messages that have not yet reached the leaves are applied to the final
+value read back.
+.PP
+Because mutations to the leaf nodes are messages that describe a mutation, updates to
+data may be performed without inspecting the data at all.
+For example, when writing to a file, the modification time and QID version of the file
+may be incremented without inspecting the current QID; a 'new version' message may
+be upserted instead.
+This allows skipping read-modify-write cycles that access distant regions of the tree,
+in favor of a simple insertion into the root node's write buffer.
+Additionally, because all upserts go into the root node, a number of operations may
+be upserted in a single update. As long as we ensure that there is sufficient space
+in the root node's write buffer, the batch insert is atomic.
+Inserts and deletions are upserts, but so are mutations to existing data.
+.PS
+.ps 6
+.vs 4
+boxht=0.2
+down
+
+R: [
+ right
+R0: box "k0" wid 0.2
+ box "k16" wid 0.2
+ box "k32" wid 0.2
+R1: box "k48" wid 0.2
+ box "m0" wid 0.2 fill
+ box "m1" wid 0.2 fill
+ box wid 0.6 fill
+]
+move down 0.5
+P: [
+ right
+P0: box "k0" wid 0.2
+P1: box "k4" wid 0.2
+ box "k8" wid 0.2
+ box "k12" wid 0.2
+ box "m0" wid 0.2 fill
+ box "m1" wid 0.2 fill
+ box wid 0.6 fill
+
+ box invis wid 1 "..."
+
+P2: box "k48" wid 0.2
+ box "k56" wid 0.2
+ box "k60" wid 0.2
+ box "k64" wid 0.2
+ box "m0" wid 0.2 fill
+ box "m1" wid 0.2 fill
+ box wid 0.6 fill
+]
+move down 0.5
+
+L: [
+ right
+L0: box "k0" wid 0.2
+ box "v0" wid 0.2
+ box "..." wid 0.2
+ box "k3" wid 0.2
+ box "v3" wid 0.2
+
+ box invis wid 1
+
+L1: box "k4" wid 0.2
+ box "v4" wid 0.2
+ box "..." wid 0.2
+ box "k7" wid 0.2
+ box "v7" wid 0.2
+
+B0: box invis wid 1 "..."
+
+L2: box "k48" wid 0.2
+ box "v49" wid 0.2
+ box "..." wid 0.2
+ box "k54" wid 0.2
+ box "v55" wid 0.2
+]
+
+arrow from R.R0.s to P.P0.n
+arrow from R.R1.s to P.P2.n
+
+arrow from P.P0.s to L.L0.n
+arrow from P.P1.s to L.L1.n
+arrow from P.P2.s to L.L2.n
+.PE
+.PP
+For the sake of simplicity, GEFS makes all blocks the same size.
+This implies that the Bε tree blocks are smaller than optimal,
+and the disk blocks are larger than optimal.
+The simplifications this allows in the block layer appear to be worthwhile.
+.PP
+Within a single block, the pivot keys are stored as offsets to variable width data.
+The data itself is unsorted, but the offsets pointing to it are sorted.
+This allows O(1) access to the keys and values given an index, or O(log(n))
+access while searching, while allowing variable size keys and values.
+.PS
+.ps 6
+.vs 4
+boxht=0.3
+box "o0" wid 0.2
+box "o1" wid 0.2
+box "o2" wid 0.2
+box "unused" wid 0.8 fill
+box "k2" wid 0.2
+box "v2" wid 0.7
+box "k0" wid 0.2
+box "v0" wid 0.3
+box "k1" wid 0.4
+box "v1" wid 0.2
+.PE
+.PP
+In order to allow for efficient copy on write operation, the Bε tree in GEFS relaxes several
+of the balance properties of B-trees [5].
+It allows for a smaller amount of fill than would normally be required, and merges nodes with
+their siblings opportunistically.
+In order to prevent sideways pointers between sibling nodes that would need copy on write updates,
+the fill levels are stored in the parent blocks, and updated when updating the child pointers.
+.NH 1
+Mapping Files to Bε Operations
+.PP
+With a description of the core data structure completed, we now need
+to describe how a file system is mapped on to Bε trees.
+.PP
+A GEFS file system consists of a snapshot tree, which points to a number of file system trees.
+The snapshot tree exists to track snapshots, and will be covered later.
+Each snapshot points to a single GEFS metadata tree, which contains all file system state for
+a single version of the file system.
+GEFS is somewhat unique in that all file system data is recorded within a single flat key value
+store.
+There are no directory structures, no indirect blocks, and no other traditional structures.
+Instead, GEFS has the following key-value pairs:
+.LP
+.CW "Kdat(qid, offset) → (ptr)"
+.IP
+Data keys store pointers to data blocks.
+The key is the file qid, concatenated to the block-aligned file offset.
+The value is the pointer to the data block that is being looked up.
+.LP
+.CW "Kent(pqid, name) → (stat)"
+.IP
+Entry keys contain file metadata.
+The key is the qid of the containing directory, concatenated to the name of the file within the directory.
+The value is a stat struct, containing the file metadata, including the qid of the directory entry.
+.LP
+.CW "Kup(qid) → Kent(pqid, name)"
+.IP
+Up keys are maintained so that '..' walks can find their parent directory.
+The key is the qid of the directory.
+The value is the key for the parent directory.
+.PP
+Walking a path is done by starting at the root, which has a parent qid of ~0, and a name of "/".
+The QID of the root is looked up, and the key for the next step on the walk is constructed
+by concatenating the walk element with the root qid.
+This produces the key for the next walk element, which is then looked up, and the next key
+for the walk path is constructed. This continues until the full walk has completed.
+If one of the path elements is '..' instead of a name, then the up key is inspected
+instead to find the parent link of the directory.
+.PP
+If we had a file hierarchy containing the paths 'foo/bar', 'foo/baz/meh', 'quux', 'blorp',
+with 'blorp' containing the text 'hello world', this file system may be represented
+with the following set of keys and values:
+.P1
+Kdat(qid=3, off=0) → Bptr(off=0x712000, hash=04a73, gen=712)
+Kent(pqid=1, name='blorp') → Dir(qid=3, mode=0644, ...)
+Kent(pqid=1, name='foo') → Dir(qid=2, mode=DMDIR|0755, ...)
+Kent(pqid=1, name='quux') → Dir(qid=4, mode=0644, ...)
+Kent(pqid=2, name='bar') → Dir(qid=6, mode=DMDIR|0755, ...)
+Kent(pqid=2, name='baz') → Dir(qid=5, mode=DMDIR|0755, ...)
+Kent(pqid=5, name='meh') → Dir(qid=7, mode=0600, ...)
+Kent(pqid=-1, name='') → Dir(qid=1, mode=DMDIR|0755, ...)
+Kup(qid=2) → Kent(pqid=-1, name='')
+Kup(qid=5) → Kent(pqid=2, name='foo')
+.P2
+Note that all of the keys for a single directory are grouped because they sort together,
+and that if we were to read a file sequentially, all of the data keys for the file would
+be similarly grouped.
+.PP
+If we were to walk
+.CW "foo/bar"
+then we would begin by constructing the key
+.CW "Kent(-1, '')"
+to get the root directory entry.
+The directory entry contains the qid.
+For this example, let's assume that the root qid is 123.
+The key for
+.CW foo
+is then constructed by concatenating the root qid to the first walk name, giving the key
+.CW "Kent(123, foo)"
+This is then looked up, giving the directory entry for
+.CW foo .
+If the directory entry contains the qid 234, then the key
+.CW "Kent(234, bar)"
+is then constructed and looked up.
+The walk is then done.
+.PP
+Because a Bε tree is a sorted data structure, range scans are efficient.
+As a result, listing a directory is done by doing a range scan of all keys
+that start with the qid of the directory entry.
+.PP
+Reading from a file proceeds in a similar way, though with less iteration: When
+reading from a file, the qid is known, so the block key is created by
+concatenating the file qid with the read offset.
+This is then looked up, and the address of the block containing the data is found.
+The block is then read, and the data is returned.
+.PP
+Writing proceeds in a similar manner to reading, and in the general case begins by
+looking up the existing block containing the data so that it can be modified and
+updated.
+If a write happens to fully cover a data block, then a blind upsert of the data
+is done instead.
+Atomically along with the upsert of the new data, a blind write of the version number increment,
+mtime, and muid is performed.
+.PP
+Stats and wstat operations both construct and look up the keys for the directory entries,
+either upserting modifications or reading the data back directly.
+.NH 1
+Snapshots
+.PP
+Snapshots are an important feature of GEFS.
+Each GEFS snapshot is referred to by a unique integer id, and is fully immutable once it is taken.
+Snapshots are labelled with a human readable string.
+When marked mutable, the labels move to new snapshots as the file system is written to and synced.
+A snapshot may only be referred to by 0 or 1 mutable labels, along with as many immutable labels as desired.
+.PP
+If there was no space reclamation in gefs, then snapshots would be trivial.
+The tree is copy on write.
+Therefore, as long as blocks are never reclaimed, it would be sufficient to save the current root of the tree
+once all blocks in it were synced to disk.
+However, because snapshots are taken every 5 seconds, disk space would get used uncomfortably quickly.
+.PS
+.ps 6
+.vs 4
+boxht=0.2
+down
+
+R: [
+ right
+R0: box "piv" wid 0.4
+ box "buf" wid 0.2 fill
+ box wid 0.2 fill 0.75
+ move right 0.5
+R1: box "piv" wid 0.4
+ box "buf" wid 0.3 fill
+ box wid 0.1 fill 0.75
+]
+move down 0.5
+P: [
+ right
+P0: box "piv" wid 0.4
+ box "buf" wid 0.4 fill
+
+ box invis wid 1 "..."
+
+P1: box "piv" wid 0.4
+ box "buf" wid 0.4 fill
+]
+move down 0.5
+L: [
+ right
+L0: box "vals" wid 1
+ box invis wid 1
+L1: box "vals" wid 1
+ box invis wid 1 "..."
+L2: box "vals" wid 1
+]
+
+arrow from R.R0.sw to P.P0.n
+arrow from R.R0.se to P.P1.n
+arrow from R.R1.sw to P.P0.n
+arrow from R.R1.se to P.P1.n
+arrow from P.P0.sw to L.L0.n
+arrow from P.P0.se to L.L1.n
+arrow from P.P1.s to L.L2.n
+.PE
+.PP
+There are a number of options for space reclamation.
+Some that were considered when implementing GEFS included garbage collection, in the style of HAMMER [3],
+or optimized reference counting in the style of BTRFS [4], but both of these options have significant downsides.
+Garbage collection requires that the entire disk get scanned to find unreferenced blocks.
+This means that there are scheduled performance degradations, and in the limit of throughput, the bandwidth spent scanning
+must approach the bandwidth spent on metadata updates, as each block must be scanned and then reclaimed.
+Reference counting implies a large number of scattered writes to maintain the reference counts of blocks.
+.PP
+As a result, the algorithm for space reclamation is borrowed from ZFS [6].
+It is based on the idea of using deadlists to track blocks that became free within a snapshot.
+If snapshots are immutable, then a block may not be freed as long as a snapshot exists.
+This implies that block lifetimes are contiguous.
+A block may not both exist in a snapshot and be available for reallocation.
+Thus, when freeing a block, there are 2 cases: Either a block was born within the pending snapshot, and died within it,
+or it was born in a previous snapshot and was killed by the pending snapshot.
+.PP
+To build intuition, let's start by imagining the crudest possible implementation of snapshot space reclamation.
+Assuming that block pointers contain their birth generation, we can walk the entire tree.
+When a block's birth time is <= the previous snapshot, it is referred to by an older snapshot.
+We may not reclaim it.
+If the subsequent snapshot refers to this block, then it was born in this snapshot but is still in use.
+We may not reclaim it.
+Otherwise, the block is free, and we can reclaim it.
+.PP
+Obviously, this is slow: It involves full tree walks of multiple snapshots.
+It may walk large numbers of blocks that are not freed.
+.PP
+So, in order to do better, we can keep track of blocks that we want to delete from this snapshot as we delete them,
+instead of trying to reconstruct the list when we delete the snapshot.
+When we attempt to delete a block, there are two cases:
+First, the block's birth time may be newer than the previous snapshot, in which case it may be freed immediately.
+And second, the block may have been born in the previous snapshot or earlier, in which case we need to put it on the current
+snapshot's deadlist.
+When the current snapshot is deleted, the current snapshot's deadlist is merged with the next snapshot's deadlist.
+All blocks on the deadlist that were born after the previous snapshot are freed.
+.PS
+.ps 6
+.vs 4
+down
+H: [
+ P:[
+ move right 0
+ line <-
+ box invis "prev" wid 0.35
+ ]
+ D: [
+ move right 0.5
+ line <-
+ D: box invis "del" wid 0.35
+ ] with .w at P.w - (0, P.ht)
+ N: [
+ move right 1
+ line <-
+ N: box invis "next" wid 0.35
+ ] with .w at D.w - (0, D.ht)
+S: spline -> from D.D.e right 0.2 then to N.N.n
+ "merge" at S.nw + (0.1, 0.1)
+]
+S:[
+ right
+ line with .nw at H.sw + (0, 0.2)
+P: [circle fill wid 0.1]
+ line
+D: [circle below wid 0.1]
+ line
+N: [circle fill wid 0.1]
+ "prev" at P.s + (0, - 0.1)
+ "del" at D.s + (0, -0.1)
+ "next" at N.s + (0, -0.1)
+]
+.PE
+.PP
+There's one further optimization we can do on top of this to make deletions extremely fast.
+The deadlists may be sharded by birth generation.
+When a snapshot is deleted, all deadlists within the snapshot are appended to the descendant
+snapshot, and any deadlists with a birth time after the deleted snapshot in the descendant
+may be reclaimed.
+With this approach, the only lists that need to be scanned are the ones consisting wholly of blocks that must be freed.
+.PP
+All of this assumes that there is a single, linear history of snapshots.
+However, GEFS allows users to take mutable snapshots off of any label, which breaks the assumption.
+If the assumption is broken, two different mutable labels may kill the same block,
+which would lead to double frees.
+GEFS handles this by adding the concept of a
+.I base
+to each snapshot.
+This base id is the first snapshot in a snapshot timeline.
+Any blocks born before the base are not considered owned by the snapshot,
+and no record of their demise will be made in that snapshot.
+The cleanup is left to the snapshot that was used as the base.
+.PS
+.ps 6
+.vs 4
+down
+H: [
+ P:[
+ move right 0
+ L0: line <-
+ T: box invis "b0" wid 0.35
+ L1: line <- with .w at L0.w - (0, 0.15)
+ box invis "b1" wid 0.35
+ L2: line <- with .w at L1.w - (0, 0.15)
+ box invis "b2" wid 0.35
+ ]
+ box invis "prev (gen = 2)" with .w at P.e
+ D: [
+ move right 0.5
+ L0: line <-
+ box invis "b0" wid 0.35
+		L1: line <- with .w at L0.w - (0, 0.15)
+		T: box invis "b1" wid 0.35
+		L2: line <- with .w at L1.w - (0, 0.15)
+ box invis "b2" wid 0.35
+ ] with .w at P.w - (0, P.ht) fill
+ box invis "del (gen = 7)" with .w at D.e + (0.5, 0)
+ N: [
+ move right 1
+ L0: line <-
+ T: box invis "b0" wid 0.35
+ L1: line <- with .w at L0.w - (0, 0.15)
+ box invis "b1" wid 0.35
+ L2: line <- with .w at L1.w - (0, 0.15)
+ box invis "b7" wid 0.35
+ "(free)"
+ ] with .w at D.w - (0, D.ht)
+ box invis "next (gen = 9)" with .w at N.e
+S: spline -> from D.T.e right 0.2 then to N.T.n
+ "merge" at S.sw + (0.15, 0.15)
+
+]
+S:[
+ right
+ line with .nw at H.sw + (0, 0.2)
+P: [circle fill wid 0.1]
+ line
+D: [circle below wid 0.1]
+ line
+N: [circle fill wid 0.1]
+ "prev" at P.s + (0, - 0.1)
+ "del" at D.s + (0, -0.1)
+ "next" at N.s + (0, -0.1)
+]
+.PE
+.PP
+The disadvantage of this approach is that appending to the deadlists may need more random writes.
+This is because, in the worst case, blocks deleted may be scattered across a large number of generations.
+It seems likely that in practice, most bulk deletions will touch files that were written in a small number of generations,
+and not scattered across the whole history of the disk.
+.PP
+The information about the snapshots, deadlists, and labels are stored in a separate
+snapshot tree. The snapshot tree, of course, can never be snapshotted itself.
+However, it's also a copy on write Bε tree where blocks are reclaimed immediately.
+It's kept consistent by syncing both the root of the snapshot tree and the freelists at the same time.
+If any blocks in the snapshot tree are freed, this freeing is only reflected after the snapshot tree is synced to disk fully.
+.PP
+The key-value pairs in the snapshot tree are stored as follows
+.LP
+.CW "Ksnap(id) → (tree)"
+.IP
+Snapshot keys take a unique numeric snapshot id.
+The value contains the tree root.
+This includes the block pointer for the tree, the snapshot generation of the tree, the previous snapshot of the tree,
+its reference count, and its height.
+.LP
+.CW "Klabel(name) → (snapid)"
+.IP
+Label keys contain a human-readable string identifying a snapshot.
+The value is a snapshot id.
+Labels regularly move between snapshots.
+When mounting a mutable snapshot, the label is updated to point at the latest snapshot every time the tree is synced to disk.
+.LP
+.CW "Kslink(snap, next) → ()"
+.IP
+A snap link key contains a snapshot id, and the id of one of its successors.
+Ideally, the successor would be a value, but our Bε tree requires unique keys, so we hack around it by putting both values
+into the key.
+When we have exactly one next link, and no labels that point at this snapshot, we merge with our successor.
+.LP
+.CW "Kdead(snap, gen) → (headptr, tailptr)"
+.IP
+A dead key contains a pair of snapshot id and deadlist generation.
+The value contains a head and tail pointer for a deadlist.
+These are used to quickly look up and merge deadlists, as described earlier in this paper.
+.NH 1
+Block Allocation
+.PP
+In GEFS, blocks are allocated from arenas.
+Within an arena, allocations are stored in a linked list of blocks, which is read at file system initialization.
+The blocks contain a journal of free or allocate operations, which free or allocate regions of disk.
+When the file system starts, it replays this log of allocations and frees, storing the available regions of blocks in an in-memory AVL tree.
+As the file system runs, it appends to the free space log, and occasionally compresses this log,
+collapsing adjacent free or used blocks into larger regions.
+.PP
+Because of the copy on write structure, it's fairly common for metadata blocks to get allocated and deallocated rapidly.
+Drives (even solid state drives) care a lot about sequential access, so it's beneficial to make a best effort attempt at keeping
+data sequential.
+As a result, GEFS selects the arena to allocate from via round robin, offsetting by the type of block.
+If the round robin counter is 10, and we have 7 arenas, then data blocks (type 0) are allocated from arena 3 ((10+0)%7),
+pivot blocks (type 1) are allocated from arena 4 ((10+1)%7), and leaf blocks (type 2) are allocated from arena 5 ((10+2)%7).
+The round robin counter is incremented after every few thousand block writes, in order to balance writes across arenas.
+Since all arenas are the same, if an arena is full, we simply advance to the next arena.
+.NH 1
+Process Structure
+.PP
+GEFS is implemented in a multiprocess manner.
+There are six types of proc that GEFS uses for its operation:
+The
+.I console ,
+.I dispatch ,
+.I mutator ,
+.I sweeper ,
+.I reader ,
+and
+.I syncer .
+Most of these processes can be replicated,
+however, there may only be one
+.IR mutator ,
+.IR sweeper ,
+or
+.I console
+at a time.
+Protocol parsing is handled by one of several dispatch procs.
+There is one of these per posted service or listener.
+Each dispatches 9p messages to the appropriate worker, depending on the 9p message type.
+Read-only messages get dispatched to one of multiple reader procs.
+Write messages get dispatched to the mutator proc, which modifies the in-memory representation of the file system.
+The mutator proc generates dirty blocks purely in memory, and sends them to the syncer procs.
+The job of the syncer proc is simply to write blocks back to disk asynchronously.
+There are also some tasks that may take a long time, and can be done in the background.
+These are sent to the sweeper proc.
+Because the tree is a shared data structure, the sweeper and mutator do not work in parallel.
+Instead, they must hold the mutator lock to accomplish anything.
+Finally, the task proc schedules periodic maintenance operations.
+These include syncing the file system and taking automatic snapshots.
+.PP
+The work of the sweeper could be done by the mutator,
+and in early versions of the file system, it was.
+However, some operations such as removing very large files
+can involve a lot of messages being inserted into the tree,
+which may block other writers.
+As a result, the long running operations are better deferred to a
+background process, which splits them into small chunks, allowing
+the mutator to make progress between them.
+.PP
+Data flow through these processes is unidirectional,
+and any block that has made it out of the mutating processes is immutable.
+This makes it reasonably easy to reason about consistency.
+.PS
+.ps 6
+.vs 4
+R: [
+ down
+C: box "cons" wid 0.7
+ move 0.5
+T: box "task" wid 0.7
+ move 0.5
+P0: box "srv" wid 0.7
+]
+move 0.5
+S: [
+ down
+S0: box "sweeper" wid 0.7
+ move 0.5
+M0: box "mutator" wid 0.7
+ move 0.5
+R0: box "reader0" wid 0.7
+ move 0.5
+R1: box "reader1" wid 0.7
+]
+move 0.5
+F: [
+ down
+S0: box "syncer0" wid 0.7
+ move 0.5
+S1: box "syncer1" wid 0.7
+ move 0.5
+S2: box "syncer2" wid 0.7
+]
+arrow from R.C.e to S.M0.w
+arrow from R.T.e to S.M0.w
+arrow from R.P0.e to S.M0.w
+arrow from R.P0.e to S.R0.w
+arrow from R.P0.e to S.R1.w
+arrow from S.M0.e to F.S0.w
+arrow from S.M0.e to F.S1.w
+arrow from S.M0.e to F.S2.w
+arrow from S.S0.e to F.S0.w
+arrow from S.S0.e to F.S1.w
+arrow from S.S0.e to F.S2.w
+arrow from S.M0.n to S.S0.s
+.PE
+.PP
+Because the file system is copy on write,
+as long as the blocks aren't reclaimed while a reader is accessing the tree, writes need not block reads.
+However, if a block is freed within the same snapshot,
+a naive implementation would allow the reader to observe a corrupt block.
+As a result, some additional cleverness is needed:
+block reclamation needs to be deferred until all readers are done reading a block.
+The algorithm selected for this is epoch based reclamation.
+.PP
+When a proc starts to operate on the tree, it enters an epoch.
+This is done by atomically taking the current global epoch,
+and setting the proc's local epoch to that,
+with an additional bit set to indicate that the proc is active:
+.P1
+ epoch[pid] = atomic_load(globalepoch) | Active
+.P2
+As the mutator frees blocks, instead of immediately making them reusable,
+it puts the blocks on the limbo list for its current generation:
+.P1
+ limbo[gen] = append(limbo[gen], b)
+.P2
+When the proc finishes operating on the tree, it leaves the epoch by clearing the active bit.
+When the mutator leaves the current epoch, it also attempts to advance the global epoch.
+This is done by looping over all worker epochs, and checking if any of them are active in an old epoch.
+If the old epoch is empty, then it's safe to advance the current epoch and clear the old epoch's limbo list.
+.P1
+ge = atomic_load(globalepoch);
+for(w in workers){
+ e = atomic_load(epoch[w]);
+ if((e & Active) && e != (ge | Active))
+ return;
+}
+globalepoch = globalepoch+1
+freeblks(limbo[globalepoch - 2])
+.P2
+.PP
+If the old epoch is not empty, then the blocks are not freed, and the cleanup is deferred.
+If a reader stalls out for a very long time, this can lead to a large accumulation of garbage,
+and as a result, GEFS starts to apply backpressure to the writers if the limbo list begins
+to get too large.
+.PP
+This epoch based approach allows GEFS to avoid contention between writes and reads.
+A writer may freely mutate the tree as multiple readers traverse it, with no locking between the processes,
+beyond what is required for the 9p implementation.
+There is still contention on the FID table, the block cache,
+and a number of other in-memory data structures.
+.NH 1
+Appendix A: Data Formats
+.PP
+The formats used for GEFS on-disk storage are described below.
+There are several data structures that are described:
+Superblocks, arena headers, tree nodes, and tree values.
+.PP
+All blocks except raw data blocks begin with a 2 byte header.
+The superblock header is chosen such that it coincides with
+the ascii representation of 'ge'.
+.PP
+All numbers in GEFS are big-endian integers, byte packed.
+.PP
+The headers are listed below:
+.TS
+allbox center;
+c c
+c l.
+Value Description
+0 Unused
+1 Pivot node
+2 Leaf node
+3 Allocation log
+4 Deadlist log
+5 Arena Header
+0x6765 Superblock header
+.TE
+.NH 2
+Superblocks
+.PP
+Superblocks are the root of the file system,
+containing all information needed to load it.
+There is one superblock at offset 0,
+and one superblock at the last block of the file system.
+These two superblocks are duplicates,
+and only one intact superblock is needed to successfully load GEFS.
+Because the superblock fits into a single block,
+all the arenas must also fit into it.
+This imposes an upper bound on the arena count.
+With 16k blocks, this natural limit is approximately 1000 arenas.
+Gefs imposes a smaller limit internally, limiting to 256 arenas by default.
+.IP
+.I header[8]
+= "gefs9.00"
+.br
+.I blksz[4] ": the block size for this file system"
+.br
+.I bufspc[4] ": the buffer space for this file system"
+.br
+.I snap.ht[4] ": the height of the snapshot tree"
+.br
+.I snap.addr[8] ": the root block of the snapshot tree"
+.br
+.I snap.hash[8] ": the hash of the snapshot tree root"
+.br
+.I snapdl.hd.addr ": the address of the snap deadlist head"
+.br
+.I snapdl.hd.hash ": the hash of the snap deadlist head"
+.br
+.I snapdl.tl.addr ": the address of the snap deadlist tail"
+.br
+.I snapdl.tl.hash ": the hash of the snap deadlist tail"
+.br
+.I narena[4] ": the number of arenas"
+.br
+.I flags[8] ": flags for future expansion"
+.br
+.I nextqid[8] ": the next qid that will be allocated"
+.br
+.I nextgen[8] ": the next generation number that will be written"
+.br
+.I qgen[8] ": the last queue generation synced to disk"
+.br
+.I "arena0.addr[8], arena0.hash[8]" ": the location of the 0th arena"
+.br
+.I "arena1.addr[8], arena1.hash[8]" ": the location of the 1st arena"
+.br
+.I ...
+.br
+.I "arenaN.addr[8], arenaN.hash[8]" ": the location of the N'th arena"
+.br
+.I sbhash[8] ": hash of superblock contents up to the last arena"
+.NH 2
+Arenas
+.PP
+An arena header contains the freelist, the arena size,
+and (for debugging) the amount of space used within the arena.
+.IP
+.I type[2]
+= Tarena
+.br
+.I free.addr[8] ": the address of the start of the freelist"
+.br
+.I free.hash[8] ": the hash of the start of the freelist"
+.br
+.I size[8] ": the size of the arena"
+.br
+.I used[8] ": the amount of used space in the arena"
+.NH 2
+Logs
+.PP
+Logs are used to track allocations. They are the only structure that is
+mutated in place, and therefore is not fully merkleized.
+There are two types of log in gefs: Allocation logs and deadlists.
+They share a common structure, but contain slightly different data.
+.PP
+All logs share a common header:
+.IP
+.I type[2]
+= Tlog or Tdlist
+.br
+.I logsz[2] ": the amount of log space used"
+.br
+.I loghash[8] ": the hash of all data after the log header"
+.br
+.I chainp[24] ": the block pointer this log block chains to"
+.NH 3
+Allocation Logs
+.PP
+When the type of a log block is Tlog,
+the contents of the block are formatted as an allocation log.
+In an allocation log, each entry is either a single u64int,
+recording an allocation or free of a single block,
+or a pair of u64ints, representing an operation on a range of blocks.
+.PP
+The operations are listed below:
+.LP
+.TS
+allbox center;
+c c
+c l.
+Value Description
+1 Allocate 1 block
+2 Free 1 block
+3 Sync barrier
+4 Alloc block range
+5 Free block range
+.TE
+Operations are packed with the operation in the low order byte.
+The rest of the value is packed in the upper bits.
+For multi-block operations, the range length is packed in the second byte.
+.IP
+.P1
+PACK64(logent, addr|op);
+if(op == 4 || op == 5)
+ PACK64(logent+8, len);
+.P2
+.NH 3
+Deadlist Logs
+.PP
+Deadlist logs are simpler than allocation logs.
+They only contain a flat list of blocks that have been killed.
+.NH 2
+The Tree
+.PP
+The tree is composed of two types of block:
+pivot blocks and leaf blocks.
+Both block types are described below.
+.NH 3
+Pivot Blocks
+.PP
+Pivot blocks contain the inner nodes of the tree.
+They have the following header. The layout is as
+described earlier in the paper.
+.IP
+.I type[2] " = Tpivot"
+.br
+.I nval[2] ": the count of values"
+.br
+.I valsz[2] ": the number of bytes of value data"
+.br
+.I nbuf[2] ": the count of buffered messages"
+.br
+.I bufsz[2] ": the number of bytes of buffered messages"
+.PP
+.NH 3
+Pivot leaves
+.PP
+Within the block, the first half of the space after
+the header contains a key/pointer set. The head of
+the space contains an array of 2-byte offsets to keys,
+and the tail of the space contains a packed set of keys
+and block pointers.
+.PP
+The offset table is simple:
+.IP
+.I off[2*nval] ": the offset table"
+.PP
+The keys/pointers are slightly more complicated.
+They contain a length prefixed key, and a pointer
+to the child block for that key.
+.IP
+.I nkey[2] ": the length of the key"
+.br
+.I key[nkey] ": the key data"
+.br
+.I addr ": the address of the pointed to block"
+.br
+.I hash ": the hash of the pointed to block"
+.br
+.I gen ": the generation number of the pointed to block"
+.PP
+The second half of the space consists of messages
+directed to a value in the leaf. This is formatted
+similarly to the key/pointer set, but instead of
+offsets to key/pointer pairs, the offsets point
+to messages.
+.PP
+The array of offsets grows towards the end of the block,
+and the array of values or messages grows towards the start of the block.
+.PP
+The offset table is the same, however, instead of
+having
+.I nval
+entries, it has
+.I nbuf
+entries.
+.IP
+.I off[2*nbuf]
+.PP
+The messages contain a single byte opcode,
+a key, and a message that contains an incremental
+update to the value.
+.IP
+.I op[1] ": the message operation"
+.br
+.I nkey[2] ": the length of the target key"
+.br
+.I key[nkey] ": the contents of the target key"
+.br
+.I nval[2] ": the length of the message"
+.br
+.I val[nval] ": the contents of the message"
+.NH 3
+Leaf Blocks
+.PP
+Leaf blocks contain the leaf nodes of the tree.
+They have the following header. The layout is as
+described earlier in the paper.
+.IP
+.I type[2] " = Tleaf"
+.br
+.I nval[2] ": the number of key value pairs"
+.br
+.I valsz[2] ": the size of the key value pairs"
+.PP
+Within a leaf, the layout is very similar to a pivot.
+There is a table of key-value offsets,
+and an array of packed messages.
+As before,
+the array of offsets grows towards the end of the block,
+and the array of values grows towards the start of the block.
+.IP
+.I off[2*nval] ": the offset table"
+.PP
+Each key value pair is encoded as below:
+.IP
+.I nkey[2] ": the length of the key"
+.br
+.I key[nkey] ": the contents of the key"
+.br
+.I nval[2] ": the length of the value"
+.br
+.I val[nval] ": the contents of the value"
+.NH 2
+Keys and Values.
+.PP
+In GEFS, keys begin with a single type byte,
+and are followed by a set of data in a known format.
+Here are the types of known keys:
+.PP
+.I "Kdat qid[8] off[8]"
+describes pointer to a data block.
+The value for this data key must be a block pointer.
+Block pointers are encoded as
+.I "addr[8] hash[8] gen[8]" .
+This entry is only valid in file system trees.
+.PP
+.I "Kent pqid[8] name[n]"
+describes a pointer to a file entry (stat structure).
+The value must be the body of a dir structure.
+This entry is only valid in file system trees.
+The dir structure is structured as below:
+.IP
+.I flag[8] ": flags for future expansion"
+.br
+.I qid.path[8] ": the qid path"
+.br
+.I qid.vers[4] ": the qid version"
+.br
+.I qid.type[1] ": the qid type"
+.br
+.I mode[4] ": the permission bits"
+.br
+.I atime[8] ": the access time"
+.br
+.I mtime[8] ": the modification time"
+.br
+.I length[8] ": the file size"
+.br
+.I uid[4] ": the owning user id"
+.br
+.I gid[4] ": the owning group id"
+.br
+.I muid[4] ": the last user that modified the file"
+.PP
+.I "Kup qid[8]"
+describes a pointer to a parent directory.
+The value is the
+.I Kent
+formatted key.
+This key is the entry of the containing directory.
+It's only present for directories.
+This entry is only valid in file system trees.
+.PP
+.I "Klabel name[]"
+describes a label for a snapshot.
+The value is a
+.I snapid[8] ,
+referring to a snapid indexed by Ksnap.
+This key is only valid in snapshot trees.
+.PP
+.I "Ksnap snapid[8]"
+describes a key referring to a snapshot tree.
+The value is a tree entry.
+The tree is formatted as:
+.IP
+.br
+.I nref[4] ": the number of references from other trees"
+.br
+.I nlbl[4] ": the number of references from labels"
+.br
+.I ht[4] ": the height of the tree"
+.br
+.I flag[4] ": flags for future expansion"
+.br
+.I gen[8] ": the tree generation number"
+.br
+.I pred[8] ": the predecessor snapshot"
+.br
+.I succ[8] ": the successor snapshot"
+.br
+.I base[8] ": the base snapshot"
+.br
+.I bp.addr[8] ": the address of the root block"
+.br
+.I bp.hash[8] ": the hash of the root block"
+.br
+.I bp.gen[8] ": the generation of the root block"
+.PP
+.I "Kdlist snap[8] gen[8]"
+describes a key referring to a deadlist.
+The
+.I snap
+field refers to the snapshot that the deadlist belongs to,
+and the
+.I gen
+field refers to the birth generation of the blocks on the deadlist.
+The value of the deadlist entry is a pair of block pointers,
+pointing to the head and tail of the block list.
+.NH 2
+Messages
+.PP
+.I Oinsert
+and
+.I Odelete
+can have any key/value pair as an operand.
+They replace or remove a key/value pair respectively.
+.PP
+.I Oclearb
+inserts a deferred free of a block,
+without reading it first.
+It has no value, but the key must be a
+.I Kdat
+key.
+.PP
+.I Oclobber
+is similar to
+.I Oclearb ,
+but its operand must be a
+.I Kent
+key.
+.I Owstat
+updates an existing file entry.
+The key of an
+.I Owstat
+message must be a
+.I Kent ,
+and the value is a bit field of fields to update,
+along with the new values.
+The first byte is a set of wstat flags, and the
+remaining data is the packed value associated with each flag.
+It can contain the following updates:
+.IP
+.I "Owsize fsize[8]" ": update file size"
+.br
+.I "Owmode mode[4]" ": update file mode"
+.br
+.I "Owmtime mtime[8]" ": update mtime, in nsec"
+.br
+.I "Owatime atime[8]" ": update atime, in nsec"
+.br
+.I "Owuid uid[4]" ": set uid"
+.br
+.I "Owgid uid[4]" ": set gid"
+.br
+.I "Omuid uid[4]" ": set muid"
+.PP
+.I Orelink
+and
+.I Oreprev
+rechain snapshots.
+The key of either of these messages is a
+.I Ksnap ,
+and the operand is the ID of a new
+predecessor or successor snap.
+.NH 1
+References
+.LP
+[1] Michael A. Bender, Martin Farach-Colton, William Jannen, Rob Johnson,
+Bradley C. Kuszmaul, Donald E. Porter, Jun Yuan, and Yang Zhan,
+.LP
+``An Introduction to Bε Trees and Write-Optimization,''
+.I ";login:" ,
+October 2015, Vol. 40, No. 5,
+.LP
+[2] William Jannen, Jun Yuan, Yang Zhan, Amogh Akshintala, John Esmet, Yizheng Jiao,
+Ankur Mittal, Prashant Pandey, Phaneendra Reddy, Leif Walsh, Michael Bender,
+Martin Farach-Colton, Rob Johnson, Bradley C. Kuszmaul, and Donald E. Porter,
+``BetrFS: A Right-Optimized Write-Optimized File System,''
+.I "Proceedings of the 13th USENIX Conference on File and Storage Technologies,"
+2015
+.LP
+[3] Matthew Dillon, "The HAMMER Filesystem,"
+June 2008.
+.LP
+[4] Ohad Rodeh, Josef Bacik, Chris Mason, "BTRFS: The Linux B-Tree Filesystem"
+.I "ACM Transactions on Storage, Volume 9, Issue 3, Article No 9, pp 1-32,"
+August 2013
+.LP
+[5] Ohad Rodeh, "B-trees, Shadowing, and Clones",
+.LP
+.I H-0245 (H0611-006)
+November 12, 2006
+.LP
+[6] Matt Ahrens, ``How ZFS Snapshots Really Work,''
+.I BSDCan,
+2019
+.LP
+[7] Gregory R. Ganger, Marshall Kirk McKusick, Craig A. N. Soules,
+and Yale N. Patt.
+``Soft Updates: A Solution to the Metadata Update Problem
+in File Systems,''
+.I "ACM Transactions on Computer Systems" ,
+Vol 18., No. 2, May 2000, pp. 127\-153.
+.LP
+[8] Valerie Aurora,
+``Soft updates, hard problems''
+.I "Linux Weekly News",
+July 1, 2009,
+https://lwn.net/Articles/339337/
+.LP
+[9] kvik,
+.I "Clone",
+https://shithub.us/kvik/clone/HEAD/info.html
--- /dev/null
+++ b/sys/man/4/gefs
@@ -1,0 +1,161 @@
+.TH GEFS 4
+.SH NAME
+gefs \- file server
+.SH SYNOPSIS
+.B gefs
+[
+.B -A
+]
+[
+.B -r
+.I user
+]
+[
+.B -f
+.I file
+]
+[
+.B -m
+.I mem
+]
+[
+.B -n
+.I name
+]
+[
+.B -a
+.I ann
+] ...
+[
+.B -S
+]
+[
+.B -s
+]
+.SH DESCRIPTION
+.PP
+.I Gefs
+is an experimental file server.
+It attempts to be crash safe, snapshotting, and corruption-detecting,
+without giving up too much performance.
+.PP
+Gefs allows multiple snapshots to be mounted and maintained concurrently.
+These snapshots all share the same storage pool, but can be written to,
+snapshotted, and rolled back independently.
+.PP
+The snapshot to mount is selected by using the attach specifier when
+mounting. If the attach specifier begins with a
+.I %
+sigil, then the snapshot is mounted in permissive mode.
+In permissive mode, permissions are not checked, and
+.IR wstat (5)
+may change any attributes of any file including the owner.
+Unless the file system is started with the permissive flag,
+only users in the
+.I adm
+group may mount snapshots permissively.
+.PP
+Gefs accepts the following options:
+.TP
+.B -A
+Disable auth. Permissions are still checked, but anyone will be able
+to attach as any user.
+.TP
+.BI "-a " ann
+Announce and listen on the specified network address.
+.TP
+.BI "-f " file
+Use
+.I file
+as the disk.
+.TP
+.B -g
+Grow the file system to fill the current partition.
+.TP
+.BI "-m " mem
+Specify the amount of memory to use as cache.
+The
+.I mem
+parameter recognizes
+.IR M ,
+.IR G ,
+and
+.I %
+as suffixes.
+If left unspecified, it defaults to 25% of installed RAM.
+.TP
+.BI "-n " name
+Use
+.I name
+as the name of the service.
+If unspecified, the default service name is
+.IR gefs .
+.TP
+.BI "-r " user
+Ream the file system, erasing all of the old data.
+Create a user named
+.I user
+in the
+.I adm
+group.
+After reaming,
+.I gefs
+will exit.
+.TP
+.B -S
+Allow permissive mounts for all users.
+Additionally, if the user file is unreadable, fall back to the default user table.
+Without god, all things are permitted.
+.TP
+.B -s
+Read and write protocol messages on standard file descriptors zero and one.
+.TP
+.B -t
+Set the size of the trace buffer in megabytes.
+If set to 0, no debug traces are recorded.
+By default, 16 megabytes of trace buffer are kept.
+.SH EXAMPLES
+.PP
+Mount snapshots
+.I gefs
+from the partition
+.I /dev/sdE0/fs
+onto a few different mountpoints.
+The
+.I main
+snapshot is mounted to
+.IR /n/gefs .
+The
+.I sys
+snapshot is mounted to
+.IR /n/gefs/sys .
+And finally, the
+.I adm
+snapshot is mounted in permissive mode to
+.IR /n/adm .
+.IP
+.EX
+gefs -f /dev/sdE0/fs
+mount /srv/gefs /n/gefs
+mount /srv/gefs /n/gefs/sys sys
+mount /srv/gefs /n/adm %adm
+.EE
+.PP
+Initialize a new file system on a device.
+Note, this assumes the disk has already been prepared with
+.IR prep (8),
+and a
+.I fs
+partition has been created.
+.IP
+.EX
+gefs -r $user -f /dev/sdE0/fs
+.EE
+.SH SEE ALSO
+.IR cwfs (4),
+.IR hjfs (4),
+.IR gefs (8),
+.IR prep (8),
+.IR sd (3)
+.SH BUGS
+Yes
--- /dev/null
+++ b/sys/man/8/gefs
@@ -1,0 +1,199 @@
+.TH GEFS 8
+.SH NAME
+gefs \- file server maintenance
+.SH SYNOPSIS
+.PD 0
+.PP
+.B check
+.PP
+.B df
+.PP
+.B halt
+.PP
+.B help
+.PP
+.B permit
+[
+.B on
+|
+.BR off
+]
+.PP
+.B save trace
+.I filename
+.PP
+.B snap
+[
+-Smdl
+]
+[
+.I old
+[
+.I new
+]
+]
+.PP
+.B sync
+.PP
+.B users
+.SH DESCRIPTION
+.IR Gefs (4)
+provides an administration console on
+.IR /srv/gefs.cmd .
+By default, this console is only readable
+and writable by the owner of the file system.
+.SH CONSOLE
+.PP
+The console handles the following commands:
+.PP
+.I Check
+applies basic consistency checks to the file system,
+reporting invalid blocks, broken metadata, and other
+similar structural issues.
+.PP
+.I Df
+prints the amount of used space and total space in megabytes,
+as well as the percentage of space occupied.
+.PP
+.I Halt
+syncs all IO to disk and exits the file system.
+While the syncing occurs, the file system does not
+allow new writes.
+.PP
+.I Help
+prints a summary of the available commands.
+This table includes additional debug commands that are
+subject to change, and are intentionally undocumented.
+.PP
+.I Permit
+[
+.B on
+|
+.B off
+]
+has two effects.
+First, if the user table is broken, it allows a fallback to a default user list.
+This allows the system administrator to recover if they reboot with a broken user file.
+Second, it allows mounts to occur in permissive mode by any user.
+Permissive mounts are designated by prefixing the attach spec with a
+.I %
+sigil.
+Permissive disables permissions checks when accessing files, and allows
+.IR wstat (5)
+to modify the owner of the file.
+This may be useful during file system initialization.
+.PP
+.B Snap
+manages snapshots.
+It can be invoked as
+.I snap
+.BR -l ,
+.I snap
+.B -d
+.IR snap ,
+or
+.I snap
+[
+.B -flags
+]
+.IR "old new" ,
+which will list, delete, or create new snapshots respectively.
+It accepts the following options:
+.TP
+.B -l
+Lists snapshots and their attributes.
+.TP
+.BI "-d " snap
+Deletes a snapshot, reclaiming whatever space is not shared
+with other snapshots.
+.TP
+.B -m
+Flags that the newly created snapshot should be mutable.
+.TP
+.B -S
+Disables automatic snapshots.
+When creating a snapshot, the contents of
+.I old
+are given the name
+.IR new .
+.PP
+.I Sync
+writes dirty blocks in memory to the disk.
+.PP
+.B Users
+attempts to reload the user table from
+.IR /adm/users .
+.PP
+.I save trace
+saves a trace of recent operations to a file.
+If a file is not specified, it prints to the console.
+.SH ADM FILES
+.PP
+Gefs supports independent snapshots in the same file system.
+As a result, global configuration needs to be separated from snapshots.
+The global configuration resides in a well known snapshot called
+.IR adm .
+.PP
+The adm snapshot would conventionally be mounted in
+.IR /adm .
+It contains the
+.IR users (6)
+file.
+The
+.I users
+file is read at file system startup, or when the
+.I users
+command is run on the console.
+If the users file is malformed at file system start, then the file system will refuse to initialize.
+.I Permissive
+mode will allow the file system to fall back to a default users table.
+It will also allow any user to mount the
+.I adm
+snapshot: this can help recover from disasters.
+.PP
+The
+.B default
+table looks like this:
+.IP
+.EX
+-1:adm:adm:
+0:none::
+1:$user:$user:
+.EE
+.PP
+Where
+.I $user
+is specified at the time that the file system is reamed.
+.SH EXAMPLES
+.PP
+To show current disk usage, the following may be written on the console:
+.IP
+.EX
+df
+.EE
+To create a new snapshot:
+.IP
+.EX
+snap main myimmutable
+.EE
+.PP
+To create a new mutable snapshot that does not take automatic
+checkpoints:
+.IP
+.EX
+snap -Sm main mymutable
+.EE
+.PP
+To delete a snapshot:
+.IP
+.EX
+snap -d mysnap
+.EE
+.SH BUGS
+.PP
+Currently, it's not possible to change the mutability of a snapshot.
+Instead, a new label needs to be created.
+.PP
+.SH SEE ALSO
+.IR gefs (4)
+
--- a/sys/src/9/boot/bootfs.proto
+++ b/sys/src/9/boot/bootfs.proto
@@ -20,6 +20,7 @@
dossrv
echo
cwfs64x
+ gefs
grep
ip
ipconfig
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-386.s
@@ -1,0 +1,109 @@
+#define CMPXCHG /* (CX) */\
+ BYTE $0x0F; BYTE $0xB1; BYTE $0x11
+#define CMPXCHG64 /* (DI) */\
+ BYTE $0x0F; BYTE $0xC7; BYTE $0x0F
+#define XADDL /* BX, (AX) */ \
+ BYTE $0x0F; BYTE $0xC1; BYTE $0x03
+#define XADDLSP /* AX, (SP) */ \
+ BYTE $0x0F; BYTE $0xC1; BYTE $0x04; BYTE $0x24
+
+/* get variants */
+TEXT ageti+0(SB),1,$0
+TEXT agetl+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ MOVL p+0(FP), AX
+ MOVL 0(AX), AX
+ RET
+
+TEXT agetv+0(SB),1,$0
+ MOVL r+0(FP), AX
+ MOVL p+4(FP), BX
+ FMOVD (BX), F0
+ FMOVDP F0, (AX)
+ RET
+
+/* set variants */
+TEXT aseti+0(SB),1,$0
+TEXT asetl+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+ MOVL p+0(FP), BX
+ MOVL v+4(FP), AX
+ LOCK; XCHGL (BX), AX
+ RET
+
+TEXT asetv+0(SB),1,$0
+ MOVL p+4(FP), DI
+ MOVL nv+8(FP), BX
+ MOVL nv+12(FP), CX
+ MOVL 0(DI), AX
+ MOVL 4(DI), DX
+loop:
+ LOCK; CMPXCHG64
+ JNE loop
+ MOVL p+0(FP),DI
+ MOVL AX, 0(DI)
+ MOVL DX, 4(DI)
+ RET
+
+/* inc variants */
+TEXT ainci+0(SB),1,$0
+TEXT aincl+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+ MOVL p+0(FP), BX
+ MOVL v+4(FP), CX
+ MOVL CX, AX
+ LOCK; XADDL
+ ADDL CX, AX
+ RET
+
+TEXT aincv+0(SB),1,$0
+ MOVL p+4(FP), DI
+retry:
+ MOVL 0(DI), AX
+ MOVL 4(DI), DX
+ MOVL AX, BX
+ MOVL DX, CX
+ ADDL v+8(FP), BX
+ ADCL v+12(FP), CX
+ LOCK; CMPXCHG64
+ JNE retry
+ MOVL r+0(FP), DI
+ MOVL BX, 0x0(DI)
+ MOVL CX, 0x4(DI)
+ RET
+
+/* cas variants */
+TEXT acasi+0(SB),1,$0
+TEXT acasl+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOVL p+0(FP), CX
+ MOVL ov+4(FP), AX
+ MOVL nv+8(FP), DX
+ LOCK; CMPXCHG
+ JNE fail32
+ MOVL $1,AX
+ RET
+fail32:
+ MOVL $0,AX
+ RET
+
+TEXT acasv+0(SB),1,$0
+ MOVL p+0(FP), DI
+ MOVL ov+4(FP), AX
+ MOVL ov+8(FP), DX
+ MOVL nv+12(FP), BX
+ MOVL nv+16(FP), CX
+ LOCK; CMPXCHG64
+ JNE fail64
+ MOVL $1,AX
+ RET
+fail64:
+ MOVL $0,AX
+ RET
+
+/* barriers (do we want to distinguish types?) */
+TEXT coherence+0(SB),1,$0
+ /* this is essentially mfence but that requires sse2 */
+ XORL AX, AX
+ LOCK; XADDLSP
+ RET
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-amd64.s
@@ -1,0 +1,59 @@
+/* get variants */
+TEXT agetl+0(SB),1,$0
+ MOVL (RARG), AX
+ RET
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ MOVQ (RARG), AX
+ RET
+
+/* set variants */
+TEXT asetl+0(SB),1,$0
+ MOVL v+8(FP), AX
+ LOCK; XCHGL (RARG), AX
+ RET
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+ MOVQ v+8(FP), AX
+ LOCK; XCHGQ (RARG), AX
+ RET
+
+/* inc variants */
+TEXT aincl+0(SB),1,$0
+ MOVQ v+8(FP), BX
+ MOVQ BX, AX
+ LOCK; XADDL AX, (RARG)
+ ADDQ BX, AX
+ RET
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+ MOVQ v+8(FP), BX
+ MOVQ BX, AX
+ LOCK; XADDQ AX, (RARG)
+ ADDQ BX, AX
+ RET
+
+/* cas variants */
+TEXT acasl+0(SB),1,$0
+ MOVL c+8(FP), AX
+ MOVL v+16(FP), BX
+ LOCK; CMPXCHGL BX, (RARG)
+ SETEQ AX
+ MOVBLZX AX, AX
+ RET
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOVQ c+8(FP), AX
+ MOVQ v+16(FP), BX
+ LOCK; CMPXCHGQ BX, (RARG)
+ SETEQ AX
+ MOVBLZX AX, AX
+ RET
+
+/* barriers (do we want to distinguish types?) */
+TEXT coherence+0(SB),1,$0
+ MFENCE
+ RET
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-arm.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+
+static u32int
+ihash(void *p)
+{
+ uintptr x = (uintptr)p;
+
+ /* constants from splitmix32 rng */
+ x = (x ^ (x >> 16)) * 0x85ebca6b;
+ x = (x ^ (x >> 13)) * 0xc2b2ae35;
+ x = (x ^ (x >> 16));
+ return x & (nelem(locktab)-1);
+}
+
+#define GET(T, n) \
+ T n(T *p) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ r = *p; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define SET(T, n) \
+ T n(T *p, T v) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ r = *p; \
+ *p = v; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define INC(T, n) \
+ T n(T *p, T dv) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ *p += dv; \
+ r = *p; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define CAS(T, n) \
+ int n(T *p, T ov, T nv) \
+ { \
+ uintptr h; \
+ int r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ if(*p == ov){ \
+ *p = nv; \
+ r = 1; \
+ }else \
+ r = 0; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-arm64.s
@@ -1,0 +1,79 @@
+/* get variants */
+TEXT agetl+0(SB),1,$0
+ MOVW (R0), R0
+ RETURN
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ MOV (R0), R0
+ RETURN
+
+/* set variants */
+TEXT asetl+0(SB),1,$0
+ MOV 0x08(FP), R1
+ MOV R0, R2
+_setl:
+ LDAXRW (R2), R0
+ STLXRW R1, (R2), R3
+ CBNZW R3, _setl
+ RETURN
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+ MOV 0x08(FP), R1
+ MOV R0, R2
+_setp:
+ LDAXR (R2), R0
+ STLXR R1, (R2), R3
+ CBNZW R3, _setp
+ RETURN
+
+/* inc variants */
+TEXT aincl+0(SB),1,$0
+ MOV 0x08(FP), R1
+ MOV R0, R2
+_incl:
+ LDAXRW (R2), R0
+ ADDW R1, R0, R3
+ STLXRW R3, (R2), R4
+ CBNZW R4, _incl
+ RETURN
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+ MOV 0x08(FP), R1
+ MOV R0, R2
+_incp:
+ LDAXR (R2), R0
+ ADD R1, R0, R3
+ STLXR R3, (R2), R4
+ CBNZW R4, _incp
+ RETURN
+
+/* cas variants */
+TEXT acasl+0(SB),1,$0
+ MOV 0x08(FP), R1
+ MOV 0x10(FP), R2
+ LDAXRW (R0), R3
+ CMPW R1, R3
+ BNE _casl
+ STLXRW R2, (R0), R4
+ CMPW $0, R4
+_casl:
+ CSETW EQ, R0
+ RETURN
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOV 0x08(FP), R1
+ MOV 0x10(FP), R2
+ LDAXR (R0), R3
+ CMP R1, R3
+ BNE _casp
+ STLXR R2, (R0), R4
+ CMPW $0, R4
+_casp:
+ CSETW EQ, R0
+ RETURN
+
+/* barriers */
+#define ISH (2<<2 | 3)
+TEXT coherence+0(SB),1,$0
+ DMB $ISH
+ RETURN
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-mips.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+
+static u32int
+ihash(void *p)
+{
+ uintptr x = (uintptr)p;
+
+ /* constants from splitmix32 rng */
+ x = (x ^ (x >> 16)) * 0x85ebca6b;
+ x = (x ^ (x >> 13)) * 0xc2b2ae35;
+ x = (x ^ (x >> 16));
+ return x & (nelem(locktab)-1);
+}
+
+#define GET(T, n) \
+ T n(T *p) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ r = *p; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define SET(T, n) \
+ T n(T *p, T v) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ r = *p; \
+ *p = v; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define INC(T, n) \
+ T n(T *p, T dv) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ *p += dv; \
+ r = *p; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define CAS(T, n) \
+ int n(T *p, T ov, T nv) \
+ { \
+ uintptr h; \
+ int r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ if(*p == ov){ \
+ *p = nv; \
+ r = 1; \
+ }else \
+ r = 0; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-power64.s
@@ -1,0 +1,106 @@
+/* get variants */
+TEXT agetl+0(SB),1,$0
+ SYNC
+ // See ISA 3.0B section B.2.3, "Safe Fetch"
+ MOVWZ 0(R3), R3
+ CMPW R3, R3, CR7
+ BC 4, 30, 1(PC) // bne- cr7,0x4
+ ISYNC
+ MOVW R3, ret+8(FP)
+ RETURN
+
+TEXT agetv+0(SB),1,$0
+TEXT agetp+0(SB),1,$0
+ SYNC
+ // See ISA 3.0B section B.2.3, "Safe Fetch"
+ MOVD 0(R3), R3
+ CMP R3, R3, CR7
+ BC 4, 30, 1(PC) // bne- cr7,0x4
+ ISYNC
+ MOVD R3, ret+8(FP)
+ RETURN
+
+/* set variants */
+TEXT asetl+0(SB),1,$0
+ MOVW val+8(FP), R4
+ SYNC
+ MOVW R4, 0(R3)
+ RETURN
+
+TEXT asetv+0(SB),1,$0
+TEXT asetp+0(SB),1,$0
+ MOVD val+8(FP), R4
+ SYNC
+ MOVD R4, 0(R3)
+ RETURN
+
+/* inc variants */
+TEXT aincl+0(SB),1,$0
+ MOVD R3, R4
+ MOVW delta+8(FP), R5
+ LWSYNC
+ LWAR (R4), R3
+ ADD R5, R3
+ STWCCC R3, (R4)
+ BNE -3(PC)
+ MOVW R3, ret+16(FP)
+ RETURN
+
+TEXT aincv+0(SB),1,$0
+TEXT aincp+0(SB),1,$0
+ MOVD delta+8(FP), R5
+ LWSYNC
+ LDAR (R3), R4
+ ADD R5, R4
+ STDCCC R4, (R3)
+ BNE -3(PC)
+ MOVD R4, ret+16(FP)
+ RETURN
+
+/* cas variants */
+TEXT acasl+0(SB),1,$0
+ MOVWZ old+8(FP), R4
+ MOVWZ new+12(FP), R5
+ LWSYNC
+casagain:
+ LWAR (R3), R6
+ CMPW R6, R4
+ BNE casfail
+ STWCCC R5, (R3)
+ BNE casagain
+ MOVD $1, R3
+ LWSYNC
+ MOVB R3, ret+16(FP)
+ RETURN
+casfail:
+ LWSYNC
+ MOVB R0, ret+16(FP)
+ RETURN
+
+TEXT acasv+0(SB),1,$0
+TEXT acasp+0(SB),1,$0
+ MOVD old+8(FP), R4
+ MOVD new+16(FP), R5
+ LWSYNC
+cas64again:
+ LDAR (R3), R6
+ CMP R6, R4
+ BNE cas64fail
+ STDCCC R5, (R3)
+ BNE cas64again
+ MOVD $1, R3
+ LWSYNC
+ MOVB R3, ret+24(FP)
+ RETURN
+cas64fail:
+ LWSYNC
+ MOVB R0, ret+24(FP)
+ RETURN
+
+/* barriers */
+TEXT coherence+0(SB),1,$0
+ // LWSYNC is the "export" barrier recommended by Power ISA
+ // v2.07 book II, appendix B.2.2.2.
+ // LWSYNC is a load/load, load/store, and store/store barrier.
+ LWSYNC
+ RETURN
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic-spim.c
@@ -1,0 +1,95 @@
+#include <u.h>
+#include <libc.h>
+
+#include "atomic.h"
+
+static Lock locktab[128];
+
+static u32int
+ihash(void *p)
+{
+ uintptr x = (uintptr)p;
+
+ /* constants from splitmix32 rng */
+ x = (x ^ (x >> 16)) * 0x85ebca6b;
+ x = (x ^ (x >> 13)) * 0xc2b2ae35;
+ x = (x ^ (x >> 16));
+ return x & (nelem(locktab)-1);
+}
+
+#define GET(T, n) \
+ T n(T *p) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ r = *p; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define SET(T, n) \
+ T n(T *p, T v) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ r = *p; \
+ *p = v; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define INC(T, n) \
+ T n(T *p, T dv) \
+ { \
+ uintptr h; \
+ T r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ *p += dv; \
+ r = *p; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+#define CAS(T, n) \
+ int n(T *p, T ov, T nv) \
+ { \
+ uintptr h; \
+ int r; \
+ \
+ h = ihash(p); \
+ lock(&locktab[h]); \
+ if(*p == ov){ \
+ *p = nv; \
+ r = 1; \
+ }else \
+ r = 0; \
+ unlock(&locktab[h]); \
+ return r; \
+ }
+
+GET(int, ageti)
+GET(long, agetl)
+GET(vlong, agetv)
+GET(void*, agetp)
+
+SET(int, aseti)
+SET(long, asetl)
+SET(vlong, asetv)
+SET(void*, asetp)
+
+INC(int, ainci)
+INC(long, aincl)
+INC(vlong, aincv)
+
+CAS(int, acasi)
+CAS(long, acasl)
+CAS(vlong, acasv)
+CAS(void*, acasp)
--- /dev/null
+++ b/sys/src/cmd/gefs/atomic.h
@@ -1,0 +1,16 @@
+long agetl(long*);
+vlong agetv(vlong*);
+void* agetp(void**);
+
+long asetl(long*, long);
+vlong asetv(vlong*, vlong);
+void* asetp(void**, void*);
+
+long aincl(long*, long);
+vlong aincv(vlong*, vlong);
+
+int acasl(long*, long, long);
+int acasv(vlong*, vlong, vlong);
+int acasp(void**, void*, void*);
+
+void coherence(void);
--- /dev/null
+++ b/sys/src/cmd/gefs/blk.c
@@ -1,0 +1,1095 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static vlong blkalloc_lk(Arena*);
+static vlong blkalloc(int, uint);
+static void blkdealloc_lk(Arena*, vlong);
+static Blk* initblk(Blk*, vlong, vlong, int);
+
+/*
+ * Returns whether every bit of f is set in the
+ * block's flag word, read atomically.
+ */
+int
+checkflag(Blk *b, int f)
+{
+	return (agetl(&b->flag) & f) == f;
+}
+
+/*
+ * Atomically ors f into the block's flag word,
+ * retrying until the compare-and-swap succeeds.
+ */
+void
+setflag(Blk *b, int f)
+{
+	long o;
+
+	do
+		o = agetl(&b->flag);
+	while(!acasl(&b->flag, o, o | f));
+}
+
+/*
+ * Atomically clears the bits of f in the block's
+ * flag word, retrying until the CAS succeeds.
+ */
+void
+clrflag(Blk *b, int f)
+{
+	long o;
+
+	do
+		o = agetl(&b->flag);
+	while(!acasl(&b->flag, o, o & ~f));
+}
+
+/*
+ * Writes a finalized block to disk at its assigned
+ * address, clearing the dirty flag first.  The block
+ * must already have been finalized.
+ */
+void
+syncblk(Blk *b)
+{
+	assert(checkflag(b, Bfinal));
+	assert(b->bp.addr >= 0);
+	clrflag(b, Bdirty);
+	if(pwrite(fs->fd, b->buf, Blksz, b->bp.addr) == -1)
+		broke("%B %s: %r", b->bp, Eio);
+}
+
+/*
+ * Reads the block at disk address bp into a buffer
+ * plucked from the cache LRU, and unpacks the header
+ * fields according to the block type.  With GBraw,
+ * the contents are treated as unstructured data.
+ */
+static Blk*
+readblk(vlong bp, int flg)
+{
+	vlong off, rem, n;
+	char *p;
+	Blk *b;
+
+	assert(bp != -1);
+	b = cachepluck();
+	b->alloced = getcallerpc(&bp);
+	off = bp;
+	rem = Blksz;
+	while(rem != 0){
+		/*
+		 * advance into the buffer on short reads; reading
+		 * into b->buf unconditionally would clobber the
+		 * data from earlier iterations.
+		 */
+		n = pread(fs->fd, b->buf + (Blksz - rem), rem, off);
+		if(n <= 0)
+			error("%s: %r", Eio);
+		off += n;
+		rem -= n;
+	}
+	b->cnext = nil;
+	b->cprev = nil;
+	b->hnext = nil;
+	b->flag = 0;
+
+	b->bp.addr = bp;
+	b->bp.hash = -1;
+	b->bp.gen = -1;
+	b->fnext = nil;
+
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;
+
+	/* the type tag occupies the first 2 bytes of every structured block */
+	p = b->buf + 2;
+	b->type = (flg&GBraw) ? Tdat : UNPACK16(b->buf+0);
+	switch(b->type){
+	default:
+		broke("invalid block type %d @%llx", b->type, bp);
+		break;
+	case Tdat:
+	case Tsuper:
+		b->data = b->buf;
+		break;
+	case Tarena:
+		b->data = p;
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logsz = UNPACK16(p);	p += 2;
+		b->logh = UNPACK64(p);	p += 8;
+		b->logp = unpackbp(p, Ptrsz);	p += Ptrsz;
+		assert(p - b->buf == Loghdsz);
+		b->data = p;
+		break;
+	case Tpivot:
+		b->nval = UNPACK16(p);	p += 2;
+		b->valsz = UNPACK16(p);	p += 2;
+		b->nbuf = UNPACK16(p);	p += 2;
+		b->bufsz = UNPACK16(p);	p += 2;
+		assert(p - b->buf == Pivhdsz);
+		b->data = p;
+		break;
+	case Tleaf:
+		b->nval = UNPACK16(p);	p += 2;
+		b->valsz = UNPACK16(p);	p += 2;
+		assert(p - b->buf == Leafhdsz);
+		b->data = p;
+		break;
+	}
+	assert(b->magic == Magic);
+	return b;
+}
+
+/*
+ * Picks an arena to allocate from, spreading load by
+ * mixing the caller's hint, the retry count, and a
+ * global round-robin counter.  Data blocks and hints
+ * landing on arena 0 are nudged forward, presumably to
+ * separate data from metadata — TODO confirm intent.
+ */
+static Arena*
+pickarena(uint ty, uint hint, int tries)
+{
+	uint n;
+
+	n = hint + tries + ainc(&fs->roundrobin)/1024;
+	if(ty == Tdat)
+		n++;
+	if(hint % fs->narena == 0)
+		n++;
+	return &fs->arenas[n%fs->narena];
+}
+
+/*
+ * Finds the arena containing disk address b by binary
+ * search over the arenas' address ranges; the range
+ * includes the arena's two header blocks.  Assumes b
+ * falls within some arena — the loop does not terminate
+ * otherwise.
+ */
+Arena*
+getarena(vlong b)
+{
+	int hi, lo, mid;
+	vlong alo, ahi;
+	Arena *a;
+
+	lo = 0;
+	hi = fs->narena;
+	if(b == 0)
+		return &fs->arenas[0];
+	while(1){
+		mid = (hi + lo)/2;
+		a = &fs->arenas[mid];
+		alo = a->h0->bp.addr;
+		ahi = alo + a->size + 2*Blksz;
+		if(b < alo)
+			hi = mid-1;
+		else if(b > ahi)
+			lo = mid+1;
+		else
+			return a;
+	}
+}
+
+
+/*
+ * Returns the range [off, off+len) to the free tree,
+ * coalescing with adjacent free ranges on either side.
+ * The range must not already be present in the tree.
+ */
+static void
+freerange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s;
+
+	assert(len % Blksz == 0);
+	if((r = calloc(1, sizeof(Arange))) == nil)
+		error(Enomem);
+	r->off = off;
+	r->len = len;
+	assert(avllookup(t, r, 0) == nil);
+	avlinsert(t, r);
+
+Again:
+	/* merge with the preceding range if contiguous */
+	s = (Arange*)avlprev(r);
+	if(s != nil && s->off+s->len == r->off){
+		avldelete(t, r);
+		s->len = s->len + r->len;
+		free(r);
+		r = s;
+		goto Again;
+	}
+	/* merge with the following range if contiguous */
+	s = (Arange*)avlnext(r);
+	if(s != nil && r->off+r->len == s->off){
+		avldelete(t, r);
+		s->off = r->off;
+		s->len = s->len + r->len;
+		free(r);
+		r = s;
+		goto Again;
+	}
+}
+
+/*
+ * Removes the range [off, off+len) from the free tree.
+ * The range must lie entirely within a single free
+ * range; it may trim either end or split the range
+ * in two.  Aborts on any inconsistency.
+ */
+static void
+grabrange(Avltree *t, vlong off, vlong len)
+{
+	Arange *r, *s, q;
+	vlong l;
+
+	assert(len % Blksz == 0);
+	q.off = off;
+	q.len = len;
+	/* -1: find the closest range at or before off */
+	r = (Arange*)avllookup(t, &q.Avl, -1);
+	if(r == nil || off + len > r->off + r->len)
+		abort();
+
+	if(off == r->off){
+		r->off += len;
+		r->len -= len;
+	}else if(off + len == r->off + r->len){
+		r->len -= len;
+	}else if(off > r->off && off+len < r->off + r->len){
+		/* interior grab: split into two ranges */
+		s = emalloc(sizeof(Arange), 0);
+		l = r->len;
+		s->off = off + len;
+		r->len = off - r->off;
+		s->len = l - r->len - len;
+		avlinsert(t, s);
+	}else
+		abort();
+
+	if(r->len == 0){
+		avldelete(t, r);
+		free(r);
+	}
+}
+
+/*
+ * Initializes a fresh log block at disk address o,
+ * recycling one of the arena's preallocated log
+ * buffers and evicting any stale cache entry for the
+ * buffer's previous address.  Writes it out immediately.
+ */
+static Blk*
+mklogblk(Arena *a, vlong o)
+{
+	Blk *lb;
+
+	lb = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
+	if(lb->bp.addr != -1)
+		cachedel(lb->bp.addr);
+	initblk(lb, o, -1, Tlog);
+	finalize(lb);
+	syncblk(lb);
+	traceb("logblk" , lb->bp);
+	return lb;
+}
+
+/*
+ * Logs an allocation. Must be called
+ * with arena lock held. Duplicates some
+ * of the work in allocblk to prevent
+ * recursion.
+ *
+ * off must be block-aligned; op is one of
+ * LogAlloc, LogFree, or LogSync.  Single-block
+ * entries are narrowed to the 1-wide ops so they
+ * fit in 8 bytes instead of 16.
+ */
+static void
+logappend(Arena *a, vlong off, vlong len, int op)
+{
+	vlong o, start, end;
+	Blk *nl, *lb;
+	char *p, *name;
+
+	lb = a->logtl;
+	assert((off & 0xff) == 0);
+	assert(op == LogAlloc || op == LogFree || op == LogSync);
+	if(op != LogSync){
+		start = a->h0->bp.addr;
+		end = start + a->size + 2*Blksz;
+		assert(lb == nil || lb->type == Tlog);
+		assert(off >= start);
+		assert(off <= end);
+	}
+	switch(op){
+	case LogAlloc:	name = "alloc";	break;
+	case LogFree:	name = "free";	break;
+	case LogSync:	name = "sync";	break;
+	default:	name = "???";	break;
+	}
+	assert(lb == nil || lb->logsz >= 0);
+	dprint("logop %llx+%llx@%x: %s\n", off, len, lb?lb->logsz:-1, name);
+	/*
+	 * move to the next block when we have
+	 * too little room in the log:
+	 * We're appending up to 16 bytes as
+	 * part of the operation, followed by
+	 * 16 bytes of new log entry allocation
+	 * and chaining.
+	 */
+	if(lb == nil || lb->logsz >= Logspc - Logslop){
+		o = blkalloc_lk(a);
+		if(o == -1)
+			error(Efull);
+		nl = mklogblk(a, o);
+		/*
+		 * chain the new block from the old tail; when there
+		 * is no tail yet, there is nothing to chain from —
+		 * dereferencing lb here would crash.
+		 */
+		if(lb != nil){
+			p = lb->data + lb->logsz;
+			PACK64(p, o|LogAlloc1);
+			lb->logsz += 8;
+			lb->logp = nl->bp;
+			finalize(lb);
+			syncblk(lb);
+		}
+		a->logtl = nl;
+		a->nlog++;
+		lb = nl;
+	}
+
+	setflag(lb, Bdirty);
+	/* narrow single-block ops to the compact encoding */
+	if(len == Blksz){
+		if(op == LogAlloc)
+			op = LogAlloc1;
+		else if(op == LogFree)
+			op = LogFree1;
+	}
+	off |= op;
+	p = lb->data + lb->logsz;
+	PACK64(p, off);
+	lb->logsz += 8;
+	if(op >= Log2wide){
+		PACK64(p+8, len);
+		lb->logsz += 8;
+	}
+}
+
+/*
+ * Replays an arena's allocation log starting at bp,
+ * applying each alloc/free entry to the in-memory free
+ * tree.  Replay stops at a sync barrier at or beyond
+ * the current generation, or at the end of the chain;
+ * the block reached becomes the new log tail.
+ */
+void
+loadlog(Arena *a, Bptr bp)
+{
+	vlong ent, off, len, gen;
+	int op, i, n;
+	char *d;
+	Blk *b;
+
+
+	dprint("loadlog %B\n", bp);
+	traceb("loadlog", bp);
+	while(1){
+		b = getblk(bp, 0);
+		dprint("\tload %B chain %B\n", bp, b->logp);
+		/* the hash covers the log and offset */
+		for(i = 0; i < b->logsz; i += n){
+			d = b->data + i;
+			ent = UNPACK64(d);
+			/* low byte is the opcode; the rest is the operand */
+			op = ent & 0xff;
+			off = ent & ~0xff;
+			n = (op >= Log2wide) ? 16 : 8;
+			switch(op){
+			case LogSync:
+				gen = ent >> 8;
+				dprint("\tlog@%x: sync %lld\n", i, gen);
+				if(gen >= fs->qgen){
+					/* truncate the log here and stop replaying */
+					if(a->logtl == nil){
+						b->logsz = i;
+						a->logtl = holdblk(b);
+						return;
+					}
+					dropblk(b);
+					return;
+				}
+				break;
+
+			case LogAlloc:
+			case LogAlloc1:
+				len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+				dprint("\tlog@%x alloc: %llx+%llx\n", i, off, len);
+				grabrange(a->free, off & ~0xff, len);
+				a->used += len;
+				break;
+			case LogFree:
+			case LogFree1:
+				len = (op >= Log2wide) ? UNPACK64(d+8) : Blksz;
+				dprint("\tlog@%x free: %llx+%llx\n", i, off, len);
+				freerange(a->free, off & ~0xff, len);
+				a->used -= len;
+				break;
+			default:
+				dprint("\tlog@%x: log op %d\n", i, op);
+				abort();
+				break;
+			}
+		}
+		if(b->logp.addr == -1){
+			a->logtl = b;
+			return;
+		}
+		bp = b->logp;
+		dropblk(b);
+	}
+}
+
+/*
+ * Rewrites an arena's allocation log as a compact list
+ * of free ranges, replacing a log that may have grown
+ * long with one proportional to the number of free
+ * ranges.  Called with the arena lock held — TODO confirm.
+ */
+void
+compresslog(Arena *a)
+{
+
+	int i, nr, nblks;
+	vlong sz, *blks;
+	Blk *b, *nb;
+	Arange *r;
+	Bptr hd;
+	char *p;
+
+	tracem("compresslog");
+	if(a->logtl != nil){
+		finalize(a->logtl);
+		syncblk(a->logtl);
+	}
+	/*
+	 * Prepare what we're writing back.
+	 * Arenas must be sized so that we can
+	 * keep the merged log in memory for
+	 * a rewrite.
+	 */
+	sz = 0;
+	nr = 0;
+	a->nlog = 0;
+	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+		sz += 16;
+		nr++;
+	}
+
+	/*
+	 * Make a pessimistic estimate of the number of blocks
+	 * needed to store the ranges, as well as the blocks
+	 * used to store the range allocations.
+	 *
+	 * This does modify the tree, but it's safe because
+	 * we can only be removing entries from the tree, not
+	 * splitting or inserting new ones.
+	 */
+	nblks = (sz+Logspc)/(Logspc - Logslop) + 16*nr/(Logspc-Logslop) + 1;
+	if((blks = calloc(nblks, sizeof(vlong))) == nil)
+		error(Enomem);
+	if(waserror()){
+		free(blks);
+		nexterror();
+	}
+	/* reserve every block up front, before any log rewriting */
+	for(i = 0; i < nblks; i++){
+		blks[i] = blkalloc_lk(a);
+		if(blks[i] == -1)
+			error(Efull);
+	}
+	/* fill up the log with the ranges from the tree */
+	i = 0;
+	hd = (Bptr){blks[0], -1, -1};
+	b = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
+	a->logbuf[a->lbidx % nelem(a->logbuf)]->bp = Zb;
+	if(b->bp.addr != -1)
+		cachedel(b->bp.addr);
+	initblk(b, blks[i++], -1, Tlog);
+	finalize(b);
+	for(r = (Arange*)avlmin(a->free); r != nil; r = (Arange*)avlnext(r)){
+		/* chain a fresh log block when the current one is near full */
+		if(b->logsz >= Logspc - Logslop){
+			a->nlog++;
+			nb = a->logbuf[a->lbidx++ % nelem(a->logbuf)];
+			if(nb->bp.addr != -1)
+				cachedel(nb->bp.addr);
+			initblk(nb, blks[i++], -1, Tlog);
+			b->logp = nb->bp;
+			setflag(b, Bdirty);
+			finalize(b);
+			syncblk(b);
+			b = nb;
+		}
+		p = b->data + b->logsz;
+		PACK64(p+0, r->off|LogFree);
+		PACK64(p+8, r->len);
+		b->logsz += 16;
+	}
+	finalize(b);
+	syncblk(b);
+
+	/*
+	 * now we have a valid freelist, and we can start
+	 * appending stuff to it. Clean up the eagerly
+	 * allocated extra blocks.
+	 */
+	a->loghd = hd;
+	a->logtl = b;
+	for(; i < nblks; i++){
+		/*
+		 * NOTE(review): this evicts the tail block's address on
+		 * every iteration; presumably cachedel(blks[i]) was
+		 * intended for the unused reserved blocks — confirm.
+		 */
+		cachedel(b->bp.addr);
+		blkdealloc_lk(a, blks[i]);
+	}
+	poperror();
+	free(blks);
+}
+
+/*
+ * Appends a sync barrier tagged with gen to the arena's
+ * log, establishing a replay cutoff point.  Must be
+ * called with the arena lock held, like logappend.
+ */
+int
+logbarrier(Arena *a, vlong gen)
+{
+	logappend(a, gen<<8, 0, LogSync);
+	if(a->loghd.addr == -1)
+		a->loghd = a->logtl->bp;
+	return 0;
+}
+
+/*
+ * Allocate from an arena, with lock
+ * held. May be called multiple times
+ * per operation, to alloc space for
+ * the alloc log.
+ *
+ * Returns the block's disk address, or -1 when only
+ * reserve space remains and reserve use is disabled.
+ */
+static vlong
+blkalloc_lk(Arena *a)
+{
+	Avltree *t;
+	Arange *r;
+	vlong b;
+
+	t = a->free;
+	r = (Arange*)t->root;
+	if(!usereserve && a->size - a->used <= a->reserve)
+		return -1;
+	if(r == nil)
+		broke(Estuffed);
+
+	/*
+	 * A bit of sleight of hand here:
+	 * while we're changing the sorting
+	 * key, but we know it won't change
+	 * the sort order because the tree
+	 * covers disjoint ranges
+	 */
+	b = r->off;
+	r->len -= Blksz;
+	r->off += Blksz;
+	if(r->len == 0){
+		avldelete(t, r);
+		free(r);
+	}
+	a->used += Blksz;
+	return b;
+}
+
+/*
+ * Frees block b back to its arena: logs the free and
+ * returns the range to the free tree.  Must be called
+ * with the arena lock held; may raise an error if the
+ * log itself cannot grow.
+ */
+static void
+blkdealloc_lk(Arena *a, vlong b)
+{
+	logappend(a, b, Blksz, LogFree);
+	if(a->loghd.addr == -1)
+		a->loghd = a->logtl->bp;
+	freerange(a->free, b, Blksz);
+	a->used -= Blksz;
+}
+
+/*
+ * Frees block b back to its owning arena, taking the
+ * arena lock.  blkdealloc_lk can raise an error (the
+ * log may need a new block and the arena may be full),
+ * so the lock is released on the error path too.
+ */
+void
+blkdealloc(vlong b)
+{
+	Arena *a;
+
+	a = getarena(b);
+	qlock(a);
+	if(waserror()){
+		qunlock(a);
+		nexterror();
+	}
+	blkdealloc_lk(a, b);
+	qunlock(a);
+	poperror();
+}
+
+/*
+ * Allocates one block of the given type, trying arenas
+ * chosen by pickarena and logging the allocation.
+ * Raises Efull when no arena can satisfy the request.
+ */
+static vlong
+blkalloc(int ty, uint hint)
+{
+	Arena *a;
+	vlong b;
+	int tries;
+
+	tries = 0;
+Again:
+	a = pickarena(ty, hint, tries);
+	/*
+	 * Loop through the arena up to 2 times.
+	 * The first pass tries to find an arena
+	 * that has space and is not in use, the
+	 * second waits until an arena is free.
+	 */
+	if(tries == 2*fs->narena)
+		error(Efull);
+	tries++;
+	if(tries < fs->narena){
+		if(canqlock(a) == 0)
+			goto Again;
+	}else
+		qlock(a);
+	if(waserror()){
+		qunlock(a);
+		nexterror();
+	}
+	b = blkalloc_lk(a);
+	if(b == -1){
+		/* this arena is full; move on to the next one */
+		qunlock(a);
+		poperror();
+		goto Again;
+	}
+	logappend(a, b, Blksz, LogAlloc);
+	if(a->loghd.addr == -1)
+		a->loghd = a->logtl->bp;
+	qunlock(a);
+	poperror();
+	return b;
+}
+
+/*
+ * Initializes an in-memory block for the freshly
+ * allocated disk address bp, setting up the data
+ * pointer past the header for its type and marking
+ * it dirty.  A cache hit for bp means the address
+ * was allocated twice — fatal.
+ */
+static Blk*
+initblk(Blk *b, vlong bp, vlong gen, int ty)
+{
+	Blk *ob;
+
+	ob = cacheget(bp);
+	if(ob != nil)
+		fatal("double alloc: %#p %B %#p %B", b, b->bp, ob, ob->bp);
+	b->type = ty;
+	b->bp.addr = bp;
+	b->bp.hash = -1;
+	b->bp.gen = gen;
+	switch(ty){
+	case Tdat:
+		b->data = b->buf;
+		break;
+	case Tarena:
+		b->data = b->buf+2;
+		break;
+	case Tdlist:
+	case Tlog:
+		b->logsz = 0;
+		b->logp = (Bptr){-1, -1, -1};
+		b->data = b->buf + Loghdsz;
+		break;
+	case Tpivot:
+		b->data = b->buf + Pivhdsz;
+		break;
+	case Tleaf:
+		b->data = b->buf + Leafhdsz;
+		break;
+	}
+	b->fnext = nil;
+
+	setflag(b, Bdirty);
+	b->nval = 0;
+	b->valsz = 0;
+	b->nbuf = 0;
+	b->bufsz = 0;
+	b->logsz = 0;
+	b->alloced = getcallerpc(&b);
+
+	return b;
+}
+
+/*
+ * Allocates a fresh block of the given type on disk
+ * and returns an initialized in-memory block for it,
+ * tagged with the tree's current memory generation.
+ */
+Blk*
+newblk(Tree *t, int ty, vlong hint)
+{
+	Blk *nb;
+	vlong addr;
+
+	addr = blkalloc(ty, hint);
+	nb = cachepluck();
+	initblk(nb, addr, t->memgen, ty);
+	nb->alloced = getcallerpc(&t);
+	tracex("newblk" , nb->bp, ty, -1);
+	return nb;
+}
+
+/*
+ * Copy-on-write helper: allocates a new block in the
+ * tree's current generation and copies b's contents
+ * and header counts into it.  NOTE(review): newblk
+ * raises an error rather than returning nil, so the
+ * nil check below looks vestigial — confirm.
+ */
+Blk*
+dupblk(Tree *t, Blk *b)
+{
+	Blk *r;
+
+	if((r = newblk(t, b->type, 0)) == nil)
+		return nil;
+
+	tracex("dup" , b->bp, b->type, t->gen);
+	setflag(r, Bdirty);
+	r->bp.hash = -1;
+	r->nval = b->nval;
+	r->valsz = b->valsz;
+	r->nbuf = b->nbuf;
+	r->bufsz = b->bufsz;
+	r->logsz = b->logsz;
+	r->alloced = getcallerpc(&t);
+	memcpy(r->buf, b->buf, sizeof(r->buf));
+	return r;
+}
+
+/*
+ * Packs a block's header fields into its buffer, computes
+ * its content hash, marks it final, and inserts it into
+ * the cache.  After this the buffer is ready to be
+ * written to disk by syncblk.
+ */
+void
+finalize(Blk *b)
+{
+	if(b->type != Tdat)
+		PACK16(b->buf, b->type);
+
+	switch(b->type){
+	default:
+		abort();
+		break;
+	case Tpivot:
+		PACK16(b->buf+2, b->nval);
+		PACK16(b->buf+4, b->valsz);
+		PACK16(b->buf+6, b->nbuf);
+		PACK16(b->buf+8, b->bufsz);
+		break;
+	case Tleaf:
+		PACK16(b->buf+2, b->nval);
+		PACK16(b->buf+4, b->valsz);
+		break;
+	case Tdlist:
+	case Tlog:
+		/* log blocks hash only their payload, not the whole block */
+		b->logh = bufhash(b->data, b->logsz);
+		PACK16(b->buf+2, b->logsz);
+		PACK64(b->buf+4, b->logh);
+		packbp(b->buf+12, Ptrsz, &b->logp);
+		break;
+	case Tdat:
+	case Tarena:
+	case Tsuper:
+		break;
+	}
+
+	b->bp.hash = blkhash(b);
+	setflag(b, Bfinal);
+	cacheins(b);
+	b->cached = getcallerpc(&b);
+}
+
+/*
+ * Looks up the block at bp, reading it from disk on a
+ * cache miss.  A striped lock per address hash prevents
+ * two concurrent reads of the same block.  Unless
+ * GBnochk is set, the contents are verified against the
+ * expected hash (log blocks hash their payload; other
+ * blocks hash the whole buffer); GBsoftchk reports the
+ * mismatch as a recoverable error instead of a panic.
+ */
+Blk*
+getblk(Bptr bp, int flg)
+{
+	uvlong xh, ck;
+	Blk *b;
+	int i;
+
+	i = ihash(bp.addr) % nelem(fs->blklk);
+	tracex("get" , bp, getcallerpc(&bp), -1);
+	qlock(&fs->blklk[i]);
+	if(waserror()){
+		qunlock(&fs->blklk[i]);
+		nexterror();
+	}
+	if((b = cacheget(bp.addr)) != nil){
+		b->lasthold = getcallerpc(&bp);
+		qunlock(&fs->blklk[i]);
+		poperror();
+		return b;
+	}
+	b = readblk(bp.addr, flg);
+	b->alloced = getcallerpc(&bp);
+	b->bp.hash = blkhash(b);
+	if((flg&GBnochk) == 0){
+		if(b->type == Tlog || b->type == Tdlist){
+			xh = b->logh;
+			ck = bufhash(b->data, b->logsz);
+		}else{
+			xh = bp.hash;
+			ck = b->bp.hash;
+		}
+		if(ck != xh){
+			if(flg & GBsoftchk){
+				fprint(2, "%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+				error(Ecorrupt);
+			}else{
+				broke("%s: %ullx %llux != %llux", Ecorrupt, bp.addr, xh, ck);
+			}
+		}
+	}
+	b->bp.gen = bp.gen;
+	b->lasthold = getcallerpc(&bp);
+	cacheins(b);
+	qunlock(&fs->blklk[i]);
+	poperror();
+
+	return b;
+}
+
+
+/*
+ * Takes a reference on a block, recording the caller
+ * for debugging, and returns the block for chaining.
+ */
+Blk*
+holdblk(Blk *b)
+{
+	ainc(&b->ref);
+	b->lasthold = getcallerpc(&b);
+	return b;
+}
+
+/*
+ * Releases a reference on a block; accepts nil.  When
+ * the last reference is dropped, the block is returned
+ * to the LRU — to the bottom for early reuse if it was
+ * freed, to the top otherwise.
+ */
+void
+dropblk(Blk *b)
+{
+	assert(b == nil || b->ref > 0);
+	if(b == nil || adec(&b->ref) != 0)
+		return;
+	b->lastdrop = getcallerpc(&b);
+	/*
+	 * freed blocks go to the LRU bottom
+	 * for early reuse.
+	 */
+	if(checkflag(b, Bfreed))
+		lrubot(b);
+	else
+		lrutop(b);
+}
+
+/*
+ * Returns the number of payload bytes used in a tree
+ * block: 2 bytes of offset per entry plus the entry
+ * bytes themselves.  Aborts on non-tree blocks.
+ */
+ushort
+blkfill(Blk *b)
+{
+	if(b->type == Tleaf)
+		return 2*b->nval + b->valsz;
+	if(b->type == Tpivot)
+		return 2*b->nbuf + b->bufsz + 2*b->nval + b->valsz;
+	fprint(2, "invalid block @%lld\n", b->bp.addr);
+	abort();
+}
+
+/*
+ * Pushes a deferred-free entry onto the lock-free
+ * limbo list for the current epoch; it is reclaimed
+ * by epochclean once no worker can still observe it.
+ */
+void
+limbo(Bfree *f)
+{
+	Bfree *p;
+	ulong ge;
+
+	while(1){
+		ge = agetl(&fs->epoch);
+		p = agetp(&fs->limbo[ge]);
+		f->next = p;
+		/* CAS-push; retry if another thread won the race */
+		if(acasp(&fs->limbo[ge], p, f)){
+			aincl(&fs->nlimbo, 1);
+			break;
+		}
+	}
+}
+
+/*
+ * Frees a tree block.  Blocks belonging to the snap
+ * tree, or born before the tree's current memory
+ * generation (and thus possibly visible to snapshots),
+ * go to the deadlist via killblk; others are deferred
+ * through the epoch limbo list for safe reclamation.
+ */
+void
+freeblk(Tree *t, Blk *b, Bptr bp)
+{
+	Bfree *f;
+
+	if(t == &fs->snap || (t != nil && bp.gen < t->memgen)){
+		tracex("killb", bp, getcallerpc(&t), -1);
+		killblk(t, bp);
+		return;
+	}
+
+	tracex("freeb", bp, getcallerpc(&t), -1);
+	f = emalloc(sizeof(Bfree), 0);
+	f->op = DFblk;
+	f->bp = bp;
+	f->b = nil;
+	if(b != nil){
+		setflag(b, Blimbo);
+		b->freed = getcallerpc(&t);
+		f->b = holdblk(b);
+	}
+	limbo(f);
+}
+
+/*
+ * Marks worker tid as active in the current global
+ * epoch; reclamation will not touch entries it may
+ * still observe.
+ */
+void
+epochstart(int tid)
+{
+	ulong ge;
+
+	ge = agetl(&fs->epoch);
+	asetl(&fs->lepoch[tid], ge | Eactive);
+}
+
+/*
+ * Marks worker tid as no longer active in any epoch,
+ * clearing only the active bit.
+ */
+void
+epochend(int tid)
+{
+	ulong le;
+
+	le = agetl(&fs->lepoch[tid]);
+	asetl(&fs->lepoch[tid], le &~ Eactive);
+}
+
+/*
+ * Waits until every active worker has caught up to the
+ * current global epoch, backing off with an increasing
+ * delay and complaining after it tops out.
+ */
+void
+epochwait(void)
+{
+	int i, delay;
+	ulong e, ge;
+
+	delay = 0;
+Again:
+	ge = agetl(&fs->epoch);
+	for(i = 0; i < fs->nworker; i++){
+		e = agetl(&fs->lepoch[i]);
+		if((e & Eactive) && e != (ge | Eactive)){
+			if(delay < 100)
+				delay++;
+			else
+				fprint(2, "stalled epoch %lx [worker %d]\n", e, i);
+			sleep(delay);
+			goto Again;
+		}
+	}
+}
+
+/*
+ * Advances the global epoch and reclaims the limbo
+ * entries queued in the slot being reused (epochs
+ * rotate through 3 slots).  When workers lag behind
+ * and the limbo list is still small, reclamation is
+ * deferred rather than blocking.
+ */
+void
+epochclean(void)
+{
+	ulong c, e, ge;
+	Bfree *p, *n;
+	Arena *a;
+	Qent qe;
+	int i;
+
+	c = agetl(&fs->nlimbo);
+	ge = agetl(&fs->epoch);
+	for(i = 0; i < fs->nworker; i++){
+		e = agetl(&fs->lepoch[i]);
+		if((e & Eactive) && e != (ge | Eactive)){
+			/* workers lag: defer unless limbo is growing large */
+			if(c < fs->cmax/4)
+				return;
+			epochwait();
+		}
+	}
+	epochwait();
+	p = asetp(&fs->limbo[(ge+1)%3], nil);
+	asetl(&fs->epoch, (ge+1)%3);
+
+	for(; p != nil; p = n){
+		n = p->next;
+		switch(p->op){
+		case DFtree:
+			free(p->t);
+			break;
+		case DFmnt:
+			free(p->m);
+			break;
+		case DFblk:
+			/* hand the block to the arena's sync queue for dealloc */
+			a = getarena(p->bp.addr);
+			qe.op = Qfree;
+			qe.bp = p->bp;
+			qe.b = nil;
+			qput(a->sync, qe);
+			if(p->b != nil){
+				clrflag(p->b, Blimbo);
+				setflag(p->b, Bfreed);
+				dropblk(p->b);
+			}
+			break;
+		default:
+			abort();
+		}
+		aincl(&fs->nlimbo, -1);
+		free(p);
+	}
+}
+
+/*
+ * Finalizes a dirty block and queues it on its arena's
+ * sync queue for writing; the queue holds a reference
+ * until the write completes.
+ */
+void
+enqueue(Blk *b)
+{
+	Arena *a;
+	Qent qe;
+
+	assert(checkflag(b, Bdirty));
+	assert(b->bp.addr >= 0);
+
+	b->enqueued = getcallerpc(&b);
+	a = getarena(b->bp.addr);
+	holdblk(b);
+	finalize(b);
+	traceb("queueb", b->bp);
+	setflag(b, Bqueued);
+	b->queued = getcallerpc(&b);
+	qe.op = Qwrite;
+	qe.bp = b->bp;
+	qe.b = b;
+	qput(a->sync, qe);
+}
+
+/*
+ * Initializes a sync queue: an empty binary heap sized
+ * to the cache limit, with its rendezvous points bound
+ * to the queue lock.
+ */
+void
+qinit(Syncq *q)
+{
+	q->nheap = 0;
+	q->heapsz = fs->cmax;
+	q->heap = emalloc(q->heapsz*sizeof(Qent), 1);
+	q->fullrz.l = &q->lk;
+	q->emptyrz.l = &q->lk;
+}
+
+/*
+ * Orders queue entries by generation first, then by
+ * operation, then by block address; returns -1, 0, or 1.
+ */
+int
+qcmp(Qent *a, Qent *b)
+{
+	if(a->qgen < b->qgen)
+		return -1;
+	if(a->qgen > b->qgen)
+		return 1;
+	if(a->op < b->op)
+		return -1;
+	if(a->op > b->op)
+		return 1;
+	if(a->bp.addr < b->bp.addr)
+		return -1;
+	if(a->bp.addr > b->bp.addr)
+		return 1;
+	return 0;
+}
+
+/*
+ * Inserts an entry into the sync queue's min-heap,
+ * stamping it with the current queue generation.
+ * Blocks while the heap is full.
+ */
+void
+qput(Syncq *q, Qent qe)
+{
+	int i;
+
+	if(qe.op == Qfree || qe.op == Qwrite)
+		assert(qe.bp.addr != 0 && (qe.bp.addr & (Blksz-1)) == 0);
+	else if(qe.op == Qfence)
+		assert(fs->syncing > 0);
+	else
+		abort();
+	qlock(&q->lk);
+	qe.qgen = agetv(&fs->qgen);
+	while(q->nheap == q->heapsz)
+		rsleep(&q->fullrz);
+	/* sift up: shift larger parents down until qe fits */
+	for(i = q->nheap; i > 0; i = (i-1)/2){
+		if(qcmp(&qe, &q->heap[(i-1)/2]) == 1)
+			break;
+		q->heap[i] = q->heap[(i-1)/2];
+	}
+	q->heap[i] = qe;
+	q->nheap++;
+	rwakeup(&q->emptyrz);
+	qunlock(&q->lk);
+}
+
+/*
+ * Removes and returns the smallest entry from the sync
+ * queue's min-heap, blocking while the queue is empty.
+ * Clears the queued flag on the popped block, if any.
+ */
+static Qent
+qpop(Syncq *q)
+{
+	int i, l, r, m;
+	Qent e, t;
+
+	qlock(&q->lk);
+	while(q->nheap == 0)
+		rsleep(&q->emptyrz);
+	e = q->heap[0];
+	if(--q->nheap == 0)
+		goto Out;
+
+	/* sift down: move the last entry to the root and restore heap order */
+	i = 0;
+	q->heap[0] = q->heap[q->nheap];
+	while(1){
+		m = i;
+		l = 2*i+1;
+		r = 2*i+2;
+		if(l < q->nheap && qcmp(&q->heap[m], &q->heap[l]) == 1)
+			m = l;
+		if(r < q->nheap && qcmp(&q->heap[m], &q->heap[r]) == 1)
+			m = r;
+		if(m == i)
+			break;
+		t = q->heap[m];
+		q->heap[m] = q->heap[i];
+		q->heap[i] = t;
+		i = m;
+	}
+Out:
+	rwakeup(&q->fullrz);
+	qunlock(&q->lk);
+	if(e.b != nil){
+		clrflag(e.b, Bqueued);
+		e.b->queued = 0;
+	}
+	return e;
+}
+
+/*
+ * Worker loop draining one arena's sync queue: Qfree
+ * deallocates blocks, Qfence signals sync completion to
+ * waiters, and Qwrite writes dirty blocks to disk.  On
+ * error the file system is flipped read-only.
+ */
+void
+runsync(int, void *p)
+{
+	Arena *a;
+	Syncq *q;
+	Qent qe;
+
+	q = p;
+	if(waserror()){
+		aincl(&fs->rdonly, 1);
+		fprint(2, "error syncing: %s\n", errmsg());
+		return;
+	}
+	while(1){
+		qe = qpop(q);
+		switch(qe.op){
+		case Qfree:
+			tracex("qfreeb", qe.bp, qe.qgen, -1);
+			a = getarena(qe.bp.addr);
+			qlock(a);
+			cachedel(qe.bp.addr);
+			blkdealloc_lk(a, qe.bp.addr);
+			if(qe.b != nil)
+				dropblk(qe.b);
+			qunlock(a);
+			break;
+		case Qfence:
+			tracev("qfence", qe.qgen);
+			qlock(&fs->synclk);
+			/* last fence across the arenas wakes the sync waiters */
+			if(--fs->syncing == 0)
+				rwakeupall(&fs->syncrz);
+			qunlock(&fs->synclk);
+			break;
+		case Qwrite:
+			tracex("qsyncb", qe.bp, qe.qgen, -1);
+			/* a block freed after queueing no longer needs writing */
+			if(checkflag(qe.b, Bfreed) == 0)
+				syncblk(qe.b);
+			dropblk(qe.b);
+			break;
+		default:
+			abort();
+		}
+		assert(estacksz() == 1);
+	}
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/cache.c
@@ -1,0 +1,194 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Unlinks a block from the doubly linked LRU list,
+ * fixing up the head and tail pointers.  Caller must
+ * hold the LRU lock.
+ */
+static void
+lrudel(Blk *b)
+{
+	if(b == fs->chead)
+		fs->chead = b->cnext;
+	if(b == fs->ctail)
+		fs->ctail = b->cprev;
+	if(b->cnext != nil)
+		b->cnext->cprev = b->cprev;
+	if(b->cprev != nil)
+		b->cprev->cnext = b->cnext;
+	b->cnext = nil;
+	b->cprev = nil;
+}
+
+/*
+ * Moves an unreferenced block to the most-recently-used
+ * end of the LRU, making it the last candidate for
+ * buffer reuse, and wakes any pluck waiters.
+ */
+void
+lrutop(Blk *b)
+{
+	qlock(&fs->lrulk);
+	/*
+	 * Someone got in first and did a
+	 * cache lookup; we no longer want
+	 * to put this into the LRU, because
+	 * its now in use.
+	 */
+	assert(b->magic == Magic);
+	if(b->ref != 0){
+		qunlock(&fs->lrulk);
+		return;
+	}
+	lrudel(b);
+	if(fs->chead != nil)
+		fs->chead->cprev = b;
+	if(fs->ctail == nil)
+		fs->ctail = b;
+	b->cnext = fs->chead;
+	fs->chead = b;
+	rwakeup(&fs->lrurz);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Moves an unreferenced block to the least-recently-used
+ * end of the LRU, making it the first candidate for
+ * buffer reuse, and wakes any pluck waiters.
+ */
+void
+lrubot(Blk *b)
+{
+	qlock(&fs->lrulk);
+	/*
+	 * Someone got in first and did a
+	 * cache lookup; we no longer want
+	 * to put this into the LRU, because
+	 * its now in use.
+	 */
+	assert(b->magic == Magic);
+	if(b->ref != 0){
+		qunlock(&fs->lrulk);
+		return;
+	}
+	lrudel(b);
+	if(fs->ctail != nil)
+		fs->ctail->cnext = b;
+	if(fs->chead == nil)
+		fs->chead = b;
+	b->cprev = fs->ctail;
+	fs->ctail = b;
+	rwakeup(&fs->lrurz);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Inserts a block into its hash bucket, keyed by disk
+ * address.  A no-op when the block is already cached;
+ * the scan over the bucket is a sanity check against
+ * duplicate insertion.
+ */
+void
+cacheins(Blk *b)
+{
+	Bucket *bkt;
+	u32int h;
+
+	assert(b->magic == Magic);
+	h = ihash(b->bp.addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	traceb("cache", b->bp);
+	lock(bkt);
+	if(checkflag(b, Bcached)){
+		unlock(bkt);
+		qunlock(&fs->lrulk);
+		return;
+	}
+	assert(b->hnext == nil);
+	for(Blk *bb = bkt->b; bb != nil; bb = bb->hnext)
+		assert(b != bb);
+	setflag(b, Bcached);
+	b->cached = getcallerpc(&b);
+	b->hnext = bkt->b;
+	bkt->b = b;
+	unlock(bkt);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Removes the block at addr from its hash bucket and
+ * clears its cached flag; silently a no-op when the
+ * address is not cached.  Caller holds the LRU lock.
+ */
+void
+cachedel_lk(vlong addr)
+{
+	Bucket *bkt;
+	Blk *b, **p;
+	u32int h;
+
+	if(addr == -1)
+		return;
+
+	tracex("uncache", Zb, addr, getcallerpc(&addr));
+	h = ihash(addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	lock(bkt);
+	p = &bkt->b;
+	for(b = bkt->b; b != nil; b = b->hnext){
+		if(b->bp.addr == addr){
+			*p = b->hnext;
+			clrflag(b, Bcached);
+			b->uncached = getcallerpc(&addr);
+			b->hnext = nil;
+			break;
+		}
+		p = &b->hnext;
+	}
+	unlock(bkt);
+}
+/* locked wrapper: evicts addr from the cache under the LRU lock */
+void
+cachedel(vlong addr)
+{
+	qlock(&fs->lrulk);
+	tracex("uncachelk", Zb, addr, getcallerpc(&addr));
+	cachedel_lk(addr);
+	qunlock(&fs->lrulk);
+}
+
+/*
+ * Looks up the block at addr in the cache.  On a hit,
+ * takes a reference and removes it from the LRU so it
+ * cannot be reused; returns nil on a miss.
+ */
+Blk*
+cacheget(vlong addr)
+{
+	Bucket *bkt;
+	u32int h;
+	Blk *b;
+
+	h = ihash(addr);
+	bkt = &fs->bcache[h % fs->cmax];
+	qlock(&fs->lrulk);
+	lock(bkt);
+	for(b = bkt->b; b != nil; b = b->hnext){
+		if(b->bp.addr == addr){
+			holdblk(b);
+			lrudel(b);
+			b->lasthold = getcallerpc(&addr);
+			break;
+		}
+	}
+	unlock(bkt);
+	qunlock(&fs->lrulk);
+
+	return b;
+}
+
+/*
+ * Pulls the block from the bottom of the LRU for reuse.
+ * Blocks until a reusable buffer is available, evicts
+ * it from the cache, resets its bookkeeping, and returns
+ * it with one reference held.
+ */
+Blk*
+cachepluck(void)
+{
+	Blk *b;
+
+	qlock(&fs->lrulk);
+	while(fs->ctail == nil)
+		rsleep(&fs->lrurz);
+
+	b = fs->ctail;
+	assert(b->magic == Magic);
+	assert(b->ref == 0);
+	if(checkflag(b, Bcached))
+		cachedel_lk(b->bp.addr);
+	/*
+	 * NOTE(review): cachedel_lk should have cleared Bcached;
+	 * this re-check only fires if the block was not found in
+	 * its bucket — diagnostic for a cache inconsistency.
+	 */
+	if(checkflag(b, Bcached))
+		fprint(2, "%B cached %#p freed %#p\n", b->bp, b->cached, b->freed);
+	lrudel(b);
+	assert(!checkflag(b, Bcached));
+	b->flag = 0;
+	b->lasthold = 0;
+	b->lastdrop = 0;
+	b->freed = 0;
+	b->hnext = nil;
+	qunlock(&fs->lrulk);
+
+	return holdblk(b);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/check.c
@@ -1,0 +1,305 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <atomic.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Returns whether the block at address bp currently lies
+ * within a free range of its arena.  The -1 lookup finds
+ * the closest range at or before bp, so a hit means
+ * bp falls inside it.
+ */
+static int
+isfree(vlong bp)
+{
+	Arange *r, q;
+	Arena *a;
+
+	q.off = bp;
+	q.len = Blksz;
+
+	a = getarena(bp);
+	r = (Arange*)avllookup(a->free, &q, -1);
+	if(r == nil)
+		return 0;
+	return bp < (r->off + r->len);
+}
+
+/*
+ * Recursively validates a Bε-tree rooted at b: key
+ * ordering within and across nodes, balance (all leaves
+ * at height 0), fill counts matching parent pointers,
+ * message ordering and validity in pivot buffers, and
+ * that no referenced block sits in a free range.
+ * Returns the number of failures found; reports to fd.
+ */
+static int
+checktree(int fd, Blk *b, int h, Kvp *lo, Kvp *hi)
+{
+	Kvp x, y;
+	Msg mx, my;
+	int i, r, fill;
+	Blk *c;
+	int fail;
+	Bptr bp;
+
+	fail = 0;
+	if(h < 0){
+		fprint(fd, "node too deep (loop?\n");
+		fail++;
+		return fail;
+	}
+	if(b->type == Tleaf){
+		if(h != 0){
+			fprint(fd, "unbalanced leaf\n");
+			fail++;
+		}
+		/* NOTE(review): underfill check gated on h != 0 — confirm intent */
+		if(h != 0 && b->nval < 2){
+			fprint(fd, "warning: underfilled leaf %B\n", b->bp);
+			fail++;
+		}
+	}
+	if(b->type == Tpivot && b->nval < 2)
+		fprint(fd, "warning: underfilled pivot %B\n", b->bp);
+	getval(b, 0, &x);
+	if(lo && keycmp(lo, &x) > 0){
+		fprint(fd, "out of range keys %P != %P\n", lo, &x);
+		showblk(fd, b, "out of range", 1);
+		fail++;
+	}
+	for(i = 1; i < b->nval; i++){
+		getval(b, i, &y);
+		if(hi && keycmp(&y, hi) >= 0){
+			fprint(fd, "out of range keys %P >= %P\n", &y, hi);
+			fail++;
+		}
+		if(b->type == Tpivot){
+			/* recurse into the child bounded by [x, y) */
+			bp = getptr(&x, &fill);
+			if(isfree(bp.addr)){
+				fprint(fd, "freed block in use: %llx\n", bp.addr);
+				fail++;
+			}
+			if((c = getblk(bp, 0)) == nil){
+				fprint(fd, "corrupt block: %B\n", bp);
+				fail++;
+				continue;
+			}
+			if(blkfill(c) != fill){
+				fprint(fd, "mismatched block fill\n");
+				fail++;
+			}
+			if(checktree(fd, c, h - 1, &x, &y))
+				fail++;
+			dropblk(c);
+		}
+		r = keycmp(&x, &y);
+		switch(r){
+		case -1:
+			break;
+		case 0:
+			fprint(fd, "duplicate keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		case 1:
+			fprint(fd, "misordered keys %P, %P\n", &x, &y);
+			fail++;
+			break;
+		}
+		x = y;
+	}
+	if(b->type == Tpivot){
+		/* the last child is bounded below by the final key */
+		getval(b, b->nval-1, &y);
+		bp = getptr(&x, &fill);
+		if((c = getblk(bp, 0)) == nil){
+			fprint(fd, "corrupt block: %B\n", bp);
+			fail++;
+		}
+		if(c != nil && checktree(fd, c, h - 1, &y, nil))
+			fail++;
+		dropblk(c);
+		/* validate the buffered message queue */
+		if(b->nbuf > 0){
+			getmsg(b, 0, &mx);
+			if(hi && keycmp(&mx, hi) >= 0){
+				fprint(fd, "out of range messages %P != %M\n", hi, &mx);
+				fail++;
+			}
+		}
+		for(i = 1; i < b->nbuf; i++){
+			getmsg(b, i, &my);
+			switch(my.op){
+			case Owstat:	/* kvp dirent */
+				if((my.v[0] & ~(Owsize|Owmode|Owmtime|Owatime|Owuid|Owgid|Owmuid)) != 0){
+					fprint(fd, "invalid stat op %x\n", my.v[0]);
+					fail++;
+				}
+				break;
+			default:
+				if(my.op <= 0 || my.op >= Nmsgtype){
+					fprint(fd, "invalid message op %d\n", my.op);
+					fail++;
+				}
+				break;
+			}
+			if(hi && keycmp(&y, hi) > 0){
+				fprint(fd, "out of range keys %P >= %P\n", &y, hi);
+				fail++;
+			}
+			if(keycmp(&mx, &my) == 1){
+				fprint(fd, "misordered keys %P, %P\n", &x, &y);
+				fail++;
+				break;
+			}
+			mx = my;
+		}
+
+	}
+	return fail;
+}
+
+/*
+ * Walks a log chain starting at hd, verifying that each
+ * block loads cleanly (getblk checks the hashes).
+ * Returns 1 on success, 0 if any block fails to load.
+ */
+static int
+checklog(int fd, Bptr hd)
+{
+	Bptr bp, nb;
+	Blk *b;
+
+	/* NOTE(review): this initial assignment is dead; the loop overwrites bp */
+	bp = (Bptr){-1, -1, -1};
+	for(bp = hd; bp.addr != -1; bp = nb){
+		if(waserror()){
+			fprint(fd, "error loading %B\n", bp);
+			return 0;
+		}
+		b = getblk(bp, 0);
+		nb = b->logp;
+		dropblk(b);
+		poperror();
+	}
+	return 1;
+}
+
+/*
+ * Validates every arena's in-memory freelist: ranges
+ * must be strictly increasing and non-overlapping
+ * (adjacent ranges should have been coalesced), and the
+ * on-disk log chain must be walkable.  Reports to fd
+ * and returns the number of failures found.
+ */
+static int
+checkfree(int fd)
+{
+	Arena *a;
+	Arange *r, *n;
+	int i, fail;
+
+	fail = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		r = (Arange*)avlmin(a->free);
+		for(n = (Arange*)avlnext(r); n != nil; n = (Arange*)avlnext(n)){
+			if(r->off >= n->off){
+				fprint(fd, "misordered range %llx >= %llx\n", r->off, n->off);
+				fail++;
+			}
+			if(r->off+r->len >= n->off){
+				fprint(fd, "overlapping range %llx+%llx >= %llx\n", r->off, r->len, n->off);
+				fail++;
+			}
+			r = n;
+		}
+		if(!checklog(fd, a->loghd))
+			fprint(fd, "arena %d: broken freelist\n", i);
+		qunlock(a);
+	}
+	return fail;
+}
+
+/*
+ * Validates the snapshot deadlists: the global deadlist
+ * log and the per-snapshot lists found by scanning the
+ * Kdlist keys in the snap tree.  Always returns 0;
+ * problems are reported as they are found.
+ */
+static int
+checkdlist(int fd)
+{
+	char pfx[1];
+	Dlist dl;
+	Scan s;
+
+	checklog(fd, fs->snapdl.hd);
+	pfx[0] = Kdlist;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+		if(!checklog(fd, dl.hd))
+			print("bad dlist %P: %s\n", &s.kv, errmsg());
+	}
+	btexit(&s);
+	return 0;
+}
+
+/*
+ * Walks a snapshot tree's data pointers, checking that
+ * each referenced data block is not in a free range and
+ * loads from disk.  NOTE(review): the scan prefix is
+ * Klabel — confirm this is the intended key space for
+ * data extents.  Returns 0; errors are raised.
+ */
+static int
+checkdata(int, Tree *t)
+{
+	char pfx[1];
+	Bptr bp;
+	Scan s;
+	Blk *b;
+
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		bp = unpackbp(s.kv.v, s.kv.nv);
+		if(isfree(bp.addr)){
+			fprint(2, "free block in use: %B\n", bp);
+			error("free block in use");
+		}
+		b = getblk(bp, GBraw);
+		dropblk(b);
+	}
+	btexit(&s);
+	return 0;
+}
+
+/*
+ * Runs a full consistency check: freelists, deadlists,
+ * the snap tree, and every labelled snapshot's tree and
+ * data blocks.  The fs is flipped read-only for the
+ * duration.  Returns 1 if everything checked out.
+ */
+int
+checkfs(int fd)
+{
+	int ok, height;
+	char pfx[1], name[Keymax+1];
+	Tree *t;
+	Scan s;
+	Blk *b;
+
+	ok = 1;
+	aincl(&fs->rdonly, 1);
+	epochwait();
+	if(waserror()){
+		/*
+		 * NOTE(review): this path returns without restoring
+		 * fs->rdonly, leaving the fs read-only — confirm.
+		 */
+		fprint(fd, "error checking %s\n", errmsg());
+		return 0;
+	}
+	fprint(fd, "checking freelist\n");
+	if(checkfree(fd))
+		ok = 0;
+	fprint(fd, "checking deadlist\n");
+	if(checkdlist(fd))
+		ok = 0;
+	fprint(fd, "checking snap tree: %B\n", fs->snap.bp);
+	if((b = getroot(&fs->snap, &height)) != nil){
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		dropblk(b);
+	}
+	pfx[0] = Klabel;
+	btnewscan(&s, pfx, 1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		if(waserror()){
+			fprint(fd, "moving on: %s\n", errmsg());
+			continue;
+		}
+		memcpy(name, s.kv.k+1, s.kv.nk-1);
+		name[s.kv.nk-1] = 0;
+		if((t = opensnap(name, nil)) == nil){
+			/*
+			 * NOTE(review): breaking here leaves the inner
+			 * error label pushed; the final poperror then pops
+			 * it instead of the outer one — confirm balance.
+			 */
+			fprint(2, "invalid snap label %s\n", name);
+			ok = 0;
+			break;
+		}
+		fprint(fd, "checking snap %s: %B\n", name, t->bp);
+		b = getroot(t, &height);
+		if(checktree(fd, b, height-1, nil, 0))
+			ok = 0;
+		if(checkdata(fd, t))
+			ok = 0;
+		dropblk(b);
+		poperror();
+	}
+	btexit(&s);
+	aincl(&fs->rdonly, -1);
+	poperror();
+	return ok;
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/cons.c
@@ -1,0 +1,439 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Cmd Cmd;
+
+/* console command table entry; dispatched by runcons() */
+struct Cmd {
+	char *name;	/* command word */
+	char *sub;	/* optional subcommand word; nil if none */
+	int minarg;	/* fewest args after name/sub */
+	int maxarg;	/* most args after name/sub */
+	void (*fn)(int, char**, int);	/* handler: (fd, args, nargs) */
+};
+
+static void
+setdbg(int fd, char **ap, int na)
+{
+	/* with an argument, set the debug level; with none, toggle it */
+	if(na == 1)
+		debug = atoi(ap[0]);
+	else
+		debug = !debug;
+	fprint(fd, "debug → %d\n", debug);
+}
+
+static void
+sendsync(int fd, int halt)
+{
+	Amsg *a;
+
+	/*
+	 * Queue an AOsync message on the admin channel; the admin worker
+	 * performs the sync and replies on fd.  halt != 0 also shuts the
+	 * file system down after the sync completes.
+	 */
+	if((a = mallocz(sizeof(Amsg), 1)) == nil){
+		/* dropped the redundant free(a): freeing nil was a no-op */
+		fprint(fd, "alloc sync msg: %r\n");
+		return;
+	}
+	a->op = AOsync;
+	a->halt = halt;
+	a->fd = fd;
+	chsend(fs->admchan, a);
+}
+
+static void
+syncfs(int fd, char **, int)
+{
+	/* queue a sync (no halt) and acknowledge; the flush is asynchronous */
+	sendsync(fd, 0);
+	fprint(fd, "synced\n");
+}
+
+static void
+haltfs(int fd, char **, int)
+{
+	/* queue a sync with halt set: flush everything, then stop the fs */
+	sendsync(fd, 1);
+	fprint(fd, "gefs: ending...\n");
+}
+
+static void
+listsnap(int fd)
+{
+	char pfx[Snapsz];
+	Scan s;
+	uint flg;
+	int sz;
+
+	/* scan all Klabel entries in the snap tree; print one line per label */
+	pfx[0] = Klabel;
+	sz = 1;
+	btnewscan(&s, pfx, sz);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		/* flags live at v+9; presumably tag[1] snapid[8] flag[4] -- TODO confirm layout */
+		flg = UNPACK32(s.kv.v+1+8);
+		fprint(fd, "snap %.*s", s.kv.nk-1, s.kv.k+1);
+		if(flg != 0)
+			fprint(fd, " [");
+		if(flg & Lmut)
+			fprint(fd, " mutable");
+		if(flg & Lauto)
+			fprint(fd, " auto");
+		if(flg & Ltsnap)
+			fprint(fd, " tsnap");
+		if(flg != 0)
+			fprint(fd, " ]");
+		fprint(fd, "\n");
+	}
+	btexit(&s);
+}
+
+static void
+snapfs(int fd, char **ap, int na)
+{
+	Amsg *a;
+	int i;
+
+	/*
+	 * snap -[Smdl] [old [new]]: queue a snapshot create/rename/delete
+	 * on the admin channel.  -l lists labels and queues nothing.
+	 */
+	if((a = mallocz(sizeof(Amsg), 1)) == nil){
+		/* was "alloc sync msg": copy-paste from sendsync */
+		fprint(fd, "alloc snap msg: %r\n");
+		return;
+	}
+	a->op = AOsnap;
+	a->fd = fd;
+	a->flag = Ltsnap;
+	/* guard na > 0: without it, all-flag input walked past the last token */
+	while(na > 0 && ap[0][0] == '-'){
+		for(i = 1; ap[0][i]; i++){
+			switch(ap[0][i]){
+			case 'S': a->flag &= ~Ltsnap; break;
+			case 'm': a->flag |= Lmut; break;
+			case 'd': a->delete++; break;
+			case 'l':
+				listsnap(fd);
+				free(a);
+				return;
+			default:
+				fprint(fd, "usage: snap -[Smdl] [old [new]]\n");
+				free(a);
+				return;
+			}
+		}
+		na--;
+		ap++;
+	}
+	/* delete takes one name; create/rename takes exactly two */
+	if(a->delete && na != 1 || !a->delete && na != 2){
+		fprint(fd, "usage: snap -[md] old [new]\n");
+		free(a);
+		return;
+	}
+	if(na >= 1)
+		strecpy(a->old, a->old+sizeof(a->old), ap[0]);
+	if(na >= 2)
+		strecpy(a->new, a->new+sizeof(a->new), ap[1]);
+	/* sync first so the snapshot captures all completed writes */
+	sendsync(fd, 0);
+	chsend(fs->admchan, a);
+}
+
+static void
+fsckfs(int fd, char**, int)
+{
+	/* run the full consistency check; checkfs() returns 1 when clean */
+	if(checkfs(fd))
+		fprint(fd, "ok\n");
+	else
+		fprint(fd, "broken\n");
+}
+
+static void
+refreshusers(int fd, char **, int)
+{
+	Mount *mnt;
+
+	/* reload the in-memory user table from the adm snapshot's root */
+	if((mnt = getmount("adm")) == nil){
+		fprint(fd, "load users: missing 'adm'\n");
+		return;
+	}
+	if(waserror()){
+		fprint(fd, "load users: %s\n", errmsg());
+		clunkmount(mnt);
+		return;
+	}
+	loadusers(fd, mnt->root);
+	fprint(fd, "refreshed users\n");
+	/* NOTE(review): success path does not poperror() -- confirm loadusers pops it */
+	clunkmount(mnt);
+}
+
+static void
+showbstate(int fd, char**, int)
+{
+	char *p, fbuf[8];
+	Blk *b;
+
+	/*
+	 * Debug dump of every block in the cache: flags plus the tracking
+	 * pointers recorded on alloc/queue/hold/drop/cache.
+	 * fbuf holds at most 6 flag chars + NUL, so 8 bytes suffice.
+	 */
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		p = fbuf;
+		if(b->flag & Bdirty) *p++ = 'd';
+		if(b->flag & Bfinal) *p++ = 'f';
+		if(b->flag & Bfreed) *p++ = 'F';
+		if(b->flag & Bcached) *p++ = 'c';
+		if(b->flag & Bqueued) *p++ = 'q';
+		if(b->flag & Blimbo) *p++ = 'L';
+		*p = 0;
+		fprint(fd, "blk %#p type %d flag %s bp %B ref %ld alloc %#p queued %#p, hold %#p drop %#p cached %#p\n",
+			b, b->type, fbuf, b->bp, b->ref, b->alloced, b->queued, b->lasthold, b->lastdrop, b->cached);
+	}
+}
+
+static void
+showusers(int fd, char**, int)
+{
+	User *u, *v;
+	int i, j;
+	char *sep;
+
+	/* print the user table as id:name:leader:member,member,... lines */
+	rlock(&fs->userlk);
+	for(i = 0; i < fs->nusers; i++){
+		u = &fs->users[i];
+		fprint(fd, "%d:%s:", u->id, u->name);
+		if((v = uid2user(u->lead)) == nil)
+			fprint(fd, "???:");
+		else
+			fprint(fd, "%s:", v->name);
+		sep = "";
+		for(j = 0; j < u->nmemb; j++){
+			if((v = uid2user(u->memb[j])) == nil)
+				fprint(fd, "%s???", sep);
+			else
+				fprint(fd, "%s%s", sep, v->name);
+			sep = ",";
+		}
+		fprint(fd, "\n");
+	}
+	runlock(&fs->userlk);
+}
+
+static void
+showdf(int fd, char**, int)
+{
+	char *units[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", nil};
+	vlong size, used, free;
+	double hsize, hused, hfree;
+	double pct;
+	Arena *a;
+	int i, us, uu, uf;
+
+	/* sum per-arena usage, then print totals with human-readable units */
+	size = 0;
+	used = 0;
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		size += a->size;
+		used += a->used;
+		qunlock(a);
+		/* NOTE(review): a->used/a->size re-read after qunlock -- may be slightly stale */
+		fprint(fd, "arena %d: %llx/%llx (%.2f%%)\n", i, a->used, a->size, 100*(double)a->used/(double)a->size);
+	}
+	free = size - used;
+	hsize = size;
+	hused = used;
+	hfree = free;
+	/* scale each value independently; switch units at 500 for readable output */
+	for(us = 0; us < nelem(units)-1 && hsize >= 500 ; us++)
+		hsize /= 1024;
+	for(uu = 0; uu < nelem(units)-1 && hused >= 500 ; uu++)
+		hused /= 1024;
+	for(uf = 0; uf < nelem(units)-1 && hfree >= 500 ; uf++)
+		hfree /= 1024;
+	pct = 100.0*(double)used/(double)size;
+	fprint(fd, "fill:\t%.2f%%\n", pct);
+	fprint(fd, "used:\t%lld (%.2f %s)\n", used, hused, units[uu]);
+	fprint(fd, "size:\t%lld (%.2f %s)\n", size, hsize, units[us]);
+	fprint(fd, "free:\t%lld (%.2f %s)\n", free, hfree, units[uf]);
+}
+
+void
+showfid(int fd, char**, int)
+{
+	int i;
+	Fid *f;
+	Conn *c;
+
+	/* dump every fid on every connection, bucket by bucket */
+	for(c = fs->conns; c != nil; c = c->next){
+		fprint(fd, "fids:\n");
+		for(i = 0; i < Nfidtab; i++){
+			lock(&c->fidtablk[i]);
+			for(f = c->fidtab[i]; f != nil; f = f->next){
+				rlock(f->dent);
+				fprint(fd, "\tfid[%d] from %#zx: %d [refs=%ld, k=%K, qid=%Q]\n",
+					i, getmalloctag(f), f->fid, f->dent->ref, &f->dent->Key, f->dent->qid);
+				runlock(f->dent);
+			}
+			unlock(&c->fidtablk[i]);
+		}
+	}
+}
+
+void
+showtree(int fd, char **ap, int na)
+{
+	char *name;
+	Tree *t;
+	Blk *b;
+	int h;
+
+	/*
+	 * Dump a tree by label ("main" by default, "snap" for the snapshot
+	 * tree) recursively from the root.
+	 */
+	name = "main";
+	t = nil;	/* was memset(&t, 0, sizeof(t)), which only zeroed the pointer variable */
+	if(na == 1)
+		name = ap[0];
+	if(strcmp(name, "snap") == 0)
+		t = &fs->snap;
+	else if((t = opensnap(name, nil)) == nil){
+		fprint(fd, "open %s: %r\n", name);
+		return;
+	}
+	b = getroot(t, &h);
+	fprint(fd, "=== [%s] %B @%d\n", name, t->bp, t->ht);
+	showblk(fd, b, "contents", 1);
+	dropblk(b);
+	if(t != &fs->snap)
+		closesnap(t);
+}
+
+static void
+permflip(int fd, char **ap, int)
+{
+	int old;
+
+	/*
+	 * Toggle permissive mode.  Capture the old value first: the original
+	 * printed !permissive as "before", which is wrong when the value was
+	 * already at the requested setting.  Bad input now reports on the
+	 * console fd (was fd 2) and doesn't claim a change happened.
+	 */
+	old = permissive;
+	if(strcmp(ap[0], "on") == 0)
+		permissive = 1;
+	else if(strcmp(ap[0], "off") == 0)
+		permissive = 0;
+	else{
+		fprint(fd, "unknown permissive %s\n", ap[0]);
+		return;
+	}
+	fprint(fd, "permissive: %d → %d\n", old, permissive);
+}
+
+static void
+savetrace(int fd, char **ap, int na)
+{
+	Biobuf *bfd;
+	Trace *t;
+	int i;
+
+	/*
+	 * Dump the trace ring buffer, oldest entry first, either back to the
+	 * console (no args) or to the named file.
+	 */
+	if(na == 0)
+		bfd = Bfdopen(dup(fd, -1), OWRITE);
+	else
+		bfd = Bopen(ap[0], OWRITE);
+	if(bfd == nil){
+		/* was missing the newline and the error string */
+		fprint(fd, "error opening output: %r\n");
+		return;
+	}
+	/* start at traceidx: the ring's oldest entry */
+	for(i = 0; i < fs->ntrace; i++){
+		t = &fs->trace[(fs->traceidx + i) % fs->ntrace];
+		if(t->msg[0] == 0)
+			continue;
+		Bprint(bfd, "[%d@%d] %s", t->tid, t->qgen, t->msg);
+		/* -1 marks an unused optional field */
+		if(t->bp.addr != -1)
+			Bprint(bfd, " %B", t->bp);
+		if(t->v0 != -1)
+			Bprint(bfd, " %llx", t->v0);
+		if(t->v1 != -1)
+			Bprint(bfd, " %llx", t->v1);
+		Bprint(bfd, "\n");
+	}
+	Bterm(bfd);
+	fprint(fd, "saved\n");
+}
+
+static void
+unreserve(int fd, char **ap, int)
+{
+	int old;
+
+	/*
+	 * Toggle use of the emergency block reserve.
+	 * NOTE(review): "on" clears usereserve and "off" sets it, which reads
+	 * inverted against the help text ("enable block reserves") -- confirm
+	 * the intended sense of usereserve before changing it.
+	 */
+	old = usereserve;
+	if(strcmp(ap[0], "on") == 0)
+		usereserve = 0;
+	else if(strcmp(ap[0], "off") == 0)
+		usereserve = 1;
+	else{
+		fprint(fd, "unknown reserve %s\n", ap[0]);
+		return;
+	}
+	/* was printing `permissive` twice -- copy-paste from permflip */
+	fprint(fd, "reserve: %d → %d\n", old, usereserve);
+}
+
+static void
+help(int fd, char**, int)
+{
+	/* NOTE(review): 'debug' and 'show bstate' exist in cmdtab but are not listed here */
+	char *msg =
+	"help -- show this help\n"
+	"check -- check for consistency\n"
+	"df -- show disk usage\n"
+	"halt -- stop all writers, sync, and go read-only\n"
+	"permit [on|off] -- switch to/from permissive mode\n"
+	"reserve [on|off] -- enable block reserves\n"
+	"snap -[Smdl] [old [new]] -- manage snapshots\n"
+	"sync -- flush all pending writes to disk\n"
+	"users -- reload user table from adm snapshot\n"
+	"save trace [name] -- save a trace of recent activity\n"
+	"show -- debug dumps\n"
+	"	tree [name]\n"
+	"	fid\n"
+	"	users\n";
+	fprint(fd, "%s", msg);
+}
+
+/* console dispatch table; scanned linearly by runcons() */
+Cmd cmdtab[] = {
+	/* admin */
+	{.name="check", .sub=nil, .minarg=0, .maxarg=0, .fn=fsckfs},
+	{.name="df", .sub=nil, .minarg=0, .maxarg=0, .fn=showdf},
+	{.name="halt", .sub=nil, .minarg=0, .maxarg=0, .fn=haltfs},
+	{.name="help", .sub=nil, .minarg=0, .maxarg=0, .fn=help},
+	{.name="permit", .sub=nil, .minarg=1, .maxarg=1, .fn=permflip},
+	{.name="snap", .sub=nil, .minarg=1, .maxarg=3, .fn=snapfs},
+	{.name="sync", .sub=nil, .minarg=0, .maxarg=0, .fn=syncfs},
+	{.name="reserve", .sub=nil, .minarg=0, .maxarg=1, .fn=unreserve},
+	/* NOTE(review): maxarg=1 but refreshusers ignores its args -- confirm */
+	{.name="users", .sub=nil, .minarg=0, .maxarg=1, .fn=refreshusers},
+
+	/* debugging */
+	{.name="show", .sub="fid", .minarg=0, .maxarg=0, .fn=showfid},
+	{.name="show", .sub="tree", .minarg=0, .maxarg=1, .fn=showtree},
+	{.name="show", .sub="users", .minarg=0, .maxarg=0, .fn=showusers},
+	{.name="show", .sub="bstate", .minarg=0, .maxarg=0, .fn=showbstate},
+	{.name="debug", .sub=nil, .minarg=0, .maxarg=1, .fn=setdbg},
+	{.name="save", .sub="trace", .minarg=0, .maxarg=1, .fn=savetrace},
+	{.name=nil, .sub=nil},	/* sentinel: runcons stops at name == nil */
+};
+
+void
+runcons(int tid, void *pfd)
+{
+	char buf[256], *f[4], **ap;
+	int i, n, nf, na, fd;
+	Cmd *c;
+
+	/*
+	 * Console loop: prompt on fd, read one line, split it into at most
+	 * 4 tokens, and dispatch through cmdtab.  Each command runs between
+	 * epochstart/epochend so it may touch epoch-protected structures.
+	 */
+	fd = (uintptr)pfd;
+	while(1){
+		fprint(fd, "gefs# ");
+		if((n = read(fd, buf, sizeof(buf)-1)) == -1)
+			break;
+		epochstart(tid);
+		buf[n] = 0;
+		nf = tokenize(buf, f, nelem(f));
+		if(nf == 0 || strlen(f[0]) == 0)
+			goto Next;
+		for(c = cmdtab; c->name != nil; c++){
+			/* reset arg view per candidate; name/sub are consumed */
+			ap = f;
+			na = nf;
+			if(strcmp(c->name, *ap) != 0)
+				continue;
+			ap++;
+			na--;
+			if(c->sub != nil){
+				if(na == 0 || strcmp(c->sub, *ap) != 0)
+					continue;
+				ap++;
+				na--;
+			}
+			if(na < c->minarg || na > c->maxarg)
+				continue;
+			c->fn(fd, ap, na);
+			break;
+		}
+		/* fell off the table: echo the unrecognized line back */
+		if(c->name == nil){
+			fprint(fd, "unknown command '%s", f[0]);
+			for(i = 1; i < nf; i++)
+				fprint(fd, " %s", f[i]);
+			fprint(fd, "'\n");
+		}
+Next:
+		epochend(tid);
+	}
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/dat.h
@@ -1,0 +1,763 @@
+typedef struct Blk Blk;
+typedef struct Amsg Amsg;
+typedef struct Gefs Gefs;
+typedef struct Errctx Errctx;
+typedef struct Fmsg Fmsg;
+typedef struct Fid Fid;
+typedef struct Msg Msg;
+typedef struct Key Key;
+typedef struct Val Val;
+typedef struct Kvp Kvp;
+typedef struct Xdir Xdir;
+typedef struct Bptr Bptr;
+typedef struct Bfree Bfree;
+typedef struct Scan Scan;
+typedef struct Dent Dent;
+typedef struct Scanp Scanp;
+typedef struct Arena Arena;
+typedef struct Arange Arange;
+typedef struct Bucket Bucket;
+typedef struct Chan Chan;
+typedef struct Syncq Syncq;
+typedef struct Qent Qent;
+typedef struct Trace Trace;
+typedef struct Tree Tree;
+typedef struct Dlist Dlist;
+typedef struct Mount Mount;
+typedef struct User User;
+typedef struct Conn Conn;
+
+enum {
+ KiB = 1024ULL,
+ MiB = 1024ULL*KiB,
+ GiB = 1024ULL*MiB,
+ TiB = 1024ULL*GiB,
+
+ Lgblk = 14,
+ Blksz = (1ULL<<Lgblk),
+
+ Nrefbuf = 1024, /* number of ref incs before syncing */
+	Nfidtab	= 1024,	/* number of fid hash entries */
+ Nflushtab = 1024, /* flush table size */
+ Ndtab = 1024, /* number of dir tab entries */
+ Max9p = 32*KiB, /* biggest message size we're willing to negotiate */
+ Nsec = 1000LL*1000*1000, /* nanoseconds to the second */
+ Maxname = 256, /* maximum size of a name element */
+ Maxent = 9+Maxname+1, /* maximum size of ent key, with terminator */
+ Maxtag = 1<<16, /* maximum tag in 9p */
+
+ /*
+ * Kpmax must be no more than 1/4 of pivspc, or
+ * there is no way to get a valid split of a
+ * maximally filled tree.
+ */
+ Keymax = 128, /* key data limit */
+ Inlmax = 512, /* inline data limit */
+ Ptrsz = 24, /* off, hash, gen */
+ Pptrsz = 26, /* off, hash, gen, fill */
+ Fillsz = 2, /* block fill count */
+ Offksz = 17, /* type, qid, off */
+ Snapsz = 9, /* tag, snapid */
+ Dpfxsz = 9, /* directory prefix */
+	Upksz	= 9,	/* up-pointer key size: tag, qid */
+ Dlksz = 1+8+8, /* tag, death, birth */
+ Dlvsz = Ptrsz+Ptrsz, /* hd,tl of deadlist */
+ Dlkvpsz = Dlksz+Dlvsz, /* full size of dlist kvp */
+ Treesz = 4+4+4+4 /* ref, ht, flg, gen, pred, succ, base, root */
+ +8+8+8+8+Ptrsz,
+ Kvmax = Keymax + Inlmax, /* Key and value */
+ Kpmax = Keymax + Ptrsz, /* Key and pointer */
+ Wstatmax = 4+8+8+8, /* mode, size, atime, mtime */
+ Arenasz = 8+8+8+8, /* loghd, loghash, size, used */
+
+ Pivhdsz = 10,
+ Leafhdsz = 6,
+ Loghdsz = 2+2+8+Ptrsz, /* type, len, hash, chain */
+ Rootsz = 4+Ptrsz, /* root pointer */
+ Pivsz = Blksz - Pivhdsz,
+ Bufspc = (Blksz - Pivhdsz)/2, /* pivot room */
+ Pivspc = Blksz - Pivhdsz - Bufspc,
+ Logspc = Blksz - Loghdsz,
+ Logslop = 16+16+8, /* val, nextb, chain */
+ Leafspc = Blksz - Leafhdsz,
+ Msgmax = 1 + (Kvmax > Kpmax ? Kvmax : Kpmax),
+ Estacksz = 64,
+};
+
+enum {
+ Eactive = 1UL<<30, /* epoch active flag */
+};
+
+enum {
+ /*
+ * dent: pqid[8] qid[8] -- a directory entry key.
+ * ptr: off[8] hash[8] gen[8] -- a key for an Dir block.
+ * dir: serialized Xdir
+ */
+
+ /* fs keys */
+ Kdat, /* qid[8] off[8] => ptr: pointer to data page */
+ Kent, /* pqid[8] name[n] => dir[n]: serialized Dir */
+ Kup, /* qid[8] => Kent: parent dir */
+
+ /* snapshot keys */
+ Klabel, /* name[] => snapid[]: snapshot label */
+ Ksnap, /* sid[8] => ref[8], tree[52]: snapshot root */
+ Kdlist, /* snap[8] gen[8] => hd[ptr],tl[ptr] deadlist */
+};
+
+enum {
+ Bdirty = 1 << 0,
+ Bfinal = 1 << 1,
+ Bfreed = 1 << 2,
+ Bcached = 1 << 3,
+ Bqueued = 1 << 4,
+ Blimbo = 1 << 5,
+};
+
+enum {
+ Lmut = 1 << 0, /* can we modify snaps via this label */
+ Lauto = 1 << 1, /* was this label generated automatically */
+ Ltsnap = 1 << 2, /* should we skip the timed snapshots */
+};
+
+enum {
+ Qdump = 1ULL << 63,
+};
+
+#define Zb (Bptr){-1, -1, -1}
+
+/* internal errors */
+#define Efs (abort(), "fs broke")
+extern char Ecorrupt[];
+extern char Efsvers[];
+extern char Eimpl[];
+extern char Ebotch[];
+extern char Eio[];
+extern char Enofid[];
+extern char Efid[];
+extern char Etype[];
+extern char Edscan[];
+extern char Esrch[];
+extern char Eexist[];
+extern char Emode[];
+extern char Efull[];
+extern char Estuffed[];
+extern char Eauth[];
+extern char Elength[];
+extern char Eperm[];
+extern char Einuse[];
+extern char Ebadf[];
+extern char Ename[];
+extern char Enomem[];
+extern char Eattach[];
+extern char Enosnap[];
+extern char Esnap[];
+extern char Edir[];
+extern char Esyntax[];
+extern char Enouser[];
+extern char Enogrp[];
+extern char Efsize[];
+extern char Ebadu[];
+extern char Erdonly[];
+extern char Elocked[];
+extern char Eauthp[];
+extern char Eauthd[];
+extern char Eauthph[];
+extern char Ephase[];
+extern char Enone[];
+extern char Enoauth[];
+
+extern char Ewstatb[];
+extern char Ewstatd[];
+extern char Ewstatg[];
+extern char Ewstatl[];
+extern char Ewstatm[];
+extern char Ewstato[];
+extern char Ewstatp[];
+extern char Ewstatq[];
+extern char Ewstatu[];
+extern char Ewstatv[];
+extern char Enempty[];
+
+/*
+ * All metadata blocks share a common header:
+ *
+ * type[2]
+ *
+ * The None type is reserved for file data blocks
+ * and refcount blocks.
+ *
+ * The superblock has this layout:
+ * version[8] always "gefsNNNNN"
+ * blksz[4] block size in bytes
+ * bufsz[4] portion of leaf nodes
+ * allocated to buffers,
+ * in bytes
+ * height[4] tree height of root node
+ * rootb[8] address of root in last
+ * snapshot.
+ * rooth[8] hash of root node
+ * narena[4] number of arenas in tree
+ * flag[8] feature flag
+ * gen[8] The flush generation
+ *
+ * The arena zone blocks have this layout, and
+ * are overwritten in place:
+ *
+ * log[8] The head of the alloc log
+ * logh[8] The hash of the alloc log
+ *
+ * The log blocks have this layout, and are one of
+ * two types of blocks that get overwritten in place:
+ *
+ * hash[8] The hash of the previous log block
+ *
+ * The remainder of the block is filled with log
+ * entries. Each log entry has at least 8 bytes
+ * of entry. Some are longer. The opcode is or'ed
+ * into the low order bits of the first vlong.
+ * These ops take the following form:
+ *
+ * Alloc, Free:
+ * off[8] len[8]
+ * Alloc1, Free1:
+ * off[8]
+ * Ref:
+ * off[8]
+ * Flush:
+ * gen[8]
+ *
+ * Pivots have the following layout:
+ *
+ * nval[2]
+ * valsz[2]
+ * nbuf[2]
+ * bufsz[2]
+ *
+ * Leaves have the following layout:
+ *
+ * nval[2]
+ * valsz[2]
+ *	pad[4]
+ *
+ * Within these nodes, pointers have the following
+ * layout:
+ *
+ * off[8] hash[8] fill[2]
+ */
+enum {
+ Tdat,
+ Tpivot,
+ Tleaf,
+ Tlog,
+ Tdlist,
+ Tarena,
+ Tsuper = 0x6765, /* 'ge' bigendian */
+};
+
+enum {
+ Vinl, /* Inline value */
+ Vref, /* Block pointer */
+};
+
+enum {
+ GBraw = 1<<0,
+ GBwrite = 1<<1,
+ GBnochk = 1<<2,
+ GBsoftchk = 1<<3,
+};
+
+enum {
+ Onop, /* nothing */
+ Oinsert, /* new kvp */
+ Odelete, /* delete kvp */
+ Oclearb, /* free block ptr if exists */
+ Oclobber, /* remove file if it exists */
+ Owstat, /* update kvp dirent */
+ Orelink, /* rechain forwards */
+ Oreprev, /* rechain backwards */
+ Nmsgtype, /* maximum message type */
+};
+
+enum {
+ Magic = 0x979b929e98969c8c,
+};
+
+/*
+ * Wstat ops come with associated data, in the order
+ * of the bit flag.
+ */
+enum{
+ /* wstat flag */
+ Owsize = 1<<0, /* [8]fsize: update file size */
+ Owmode = 1<<1, /* [4]mode: update file mode */
+ Owmtime = 1<<2, /* [8]mtime: update mtime, in nsec */
+ Owatime = 1<<3, /* [8]atime: update atime, in nsec */
+ Owuid = 1<<4, /* [4]uid: set uid */
+ Owgid = 1<<5, /* [4]uid: set gid */
+ Owmuid = 1<<6, /* [4]uid: set muid */
+};
+
+/*
+ * Operations for the allocation log.
+ */
+enum {
+ LogNop, /* unused */
+ /* 1-wide entries */
+ LogAlloc1, /* alloc a block */
+ LogFree1, /* free a block */
+ LogSync, /* sync barrier for replay */
+
+ /* 2-wide entries */
+#define Log2wide LogAlloc
+ LogAlloc, /* alloc a range */
+ LogFree, /* free a range */
+};
+
+enum {
+ AOnone,
+ AOsnap,
+ AOsync,
+ AOclear,
+ AOrclose,
+};
+
+struct Bptr {
+ vlong addr;
+ uvlong hash;
+ vlong gen;
+};
+
+struct Key{
+ char *k;
+ int nk;
+};
+
+struct Val {
+ short nv;
+ char *v;
+};
+
+struct Kvp {
+ Key;
+ Val;
+};
+
+struct Msg {
+ char op;
+ Kvp;
+};
+
+struct Dlist {
+ Dlist *cnext; /* cache next entry */
+ Dlist *cprev; /* cache prev entry */
+ Dlist *chain; /* hash table chain */
+ Blk *ins; /* loaded head */
+
+ vlong gen; /* deadlist gen */
+ vlong bgen; /* birth gen */
+ Bptr hd; /* deadlist head */
+ Bptr tl; /* deadlist tail */
+};
+
+/* per-thread error state for the waserror()/poperror() mechanism */
+struct Errctx {
+	long tid;	/* owning thread id */
+	char err[128];	/* current error string, read via errmsg() */
+	jmp_buf errlab[Estacksz];	/* stack of waserror() jump targets */
+	int nerrlab;	/* depth of errlab in use */
+};
+
+struct Arange {
+ Avl;
+ vlong off;
+ vlong len;
+};
+
+struct Bucket {
+ Lock;
+ Blk *b;
+};
+
+struct Amsg {
+ int op;
+ int fd;
+ union {
+ struct { /* AOsnap */
+ char old[128];
+ char new[128];
+ int flag;
+ char delete;
+
+ };
+ struct { /* AOsync */
+ int halt;
+ };
+ struct { /* AOclear, AOrclose */
+ Mount *mnt;
+ Dent *dent;
+ vlong qpath;
+ vlong off;
+ vlong end;
+ };
+ };
+};
+
+struct Fmsg {
+ Fcall;
+ Conn *conn;
+ int sz; /* the size of the message buf */
+ uchar buf[];
+};
+
+struct Tree {
+ /* in-memory */
+ Lock lk;
+ long memref; /* number of in-memory references to this */
+ vlong memgen; /* wip next generation */
+ int dirty;
+
+ /* on-disk */
+ int nref; /* number snapshots forked/after us */
+ int nlbl; /* number of labels referring to us */
+ int ht; /* height of the tree */
+ uint flag; /* flag set */
+ Bptr bp; /* block pointer of root */
+ vlong gen; /* generation */
+ vlong pred; /* previous snapshot */
+ vlong succ; /* next snapshot */
+ vlong base; /* base snapshot */
+};
+
+enum {
+ DFblk,
+ DFmnt,
+ DFtree,
+};
+
+struct Bfree {
+ Bfree *next;
+ int op;
+ Mount *m;
+ Tree *t;
+ Blk *b;
+ Bptr bp;
+};
+
+struct User {
+ int id;
+ int lead;
+ int *memb;
+ int nmemb;
+ char name[128];
+};
+
+enum {
+ /* in priority order */
+ Qnone,
+ Qfence,
+ Qwrite,
+ Qfree,
+};
+
+struct Qent {
+ vlong qgen;
+ Bptr bp;
+ Blk *b;
+ int op;
+};
+
+struct Syncq {
+ QLock lk;
+ Rendez fullrz;
+ Rendez emptyrz;
+ Qent *heap;
+ int nheap;
+ int heapsz;
+};
+
+struct Trace {
+ int tid;
+ int qgen;
+ char msg[16];
+ Bptr bp;
+ vlong v0;
+ vlong v1;
+};
+
+/*
+ * Overall state of the file system.
+ * Shadows the superblock contents.
+ */
+struct Gefs {
+ int blksz;
+ int bufspc;
+ Tree snap;
+ Dlist snapdl;
+ int narena;
+ vlong flag;
+ vlong nextqid;
+ vlong nextgen;
+ vlong qgen;
+ Bptr *arenabp;
+
+ /* superblocks */
+ Blk *sb0; /* primary */
+ Blk *sb1; /* backup */
+
+ /* arena allocation */
+ Arena *arenas;
+ long roundrobin;
+ long syncing;
+ long nsyncers;
+ long nreaders;
+
+ QLock synclk;
+ Rendez syncrz;
+
+ Mount *mounts;
+ Mount *snapmnt;
+ Lock connlk;
+ Conn *conns;
+
+ Chan *wrchan;
+ Chan *admchan;
+ Chan **rdchan;
+
+ QLock mutlk;
+ long nworker;
+ long epoch;
+ long lepoch[32];
+ Bfree *limbo[3];
+ long nlimbo;
+
+ Syncq syncq[32];
+
+
+ int fd;
+ long rdonly;
+ int noauth;
+
+ /* user list */
+ RWLock userlk;
+ User *users;
+ int nusers;
+
+ /* open directory entries */
+ Lock dtablk;
+ Dent *dtab[Ndtab];
+
+ /* slow block io */
+ QLock blklk[32];
+
+ /* deadlist cache */
+ Dlist **dlcache;
+ Dlist *dlhead;
+ Dlist *dltail;
+ int dlcount;
+ int dlcmax;
+
+ /* block lru */
+ QLock lrulk;
+ Rendez lrurz;
+ Bucket *bcache;
+ Blk *chead;
+ Blk *ctail;
+ usize ccount;
+ usize cmax;
+
+ RWLock flushq[Nflushtab];
+ int flushop[Nflushtab];
+
+ Trace *trace;
+ long traceidx;
+ long ntrace;
+};
+
+struct Arena {
+ QLock;
+ Avltree *free;
+ Blk **queue;
+ int nqueue;
+ int lbidx;
+ Blk *logbuf[2]; /* preallocated log pages */
+ Blk *h0; /* arena header */
+ Blk *h1; /* arena footer */
+ Blk **q; /* write queue */
+ vlong nq;
+ vlong size;
+ vlong used;
+ vlong reserve;
+ /* allocation log */
+	vlong	nlog;	/* logged since last compression */
+ Bptr loghd; /* allocation log */
+ Blk *logtl; /* end of the log, open for writing */
+ Syncq *sync;
+};
+
+struct Xdir {
+ /* file data */
+ uvlong flag; /* storage flag */
+ Qid qid; /* unique id from server */
+ ulong mode; /* permissions */
+ vlong atime; /* last read time: nsec */
+ vlong mtime; /* last write time: nsec */
+ uvlong length; /* file length */
+ int uid; /* owner name */
+ int gid; /* group name */
+ int muid; /* last modifier name */
+ char *name; /* last element of path */
+};
+
+struct Dent {
+ RWLock;
+ Key;
+ Xdir;
+ Dent *next;
+ QLock trunclk;
+ Rendez truncrz;
+ vlong up;
+ long ref;
+ char gone;
+ char trunc;
+
+ char buf[Maxent];
+};
+
+struct Mount {
+ Lock;
+ Mount *next;
+ long ref;
+ vlong gen;
+ char name[64];
+ Tree *root; /* EBR protected */
+
+ int flag;
+
+ /* snapshot history */
+ char minutely[60][128];
+ char hourly[24][128];
+};
+
+struct Conn {
+ Conn *next;
+ QLock wrlk;
+ int rfd;
+ int wfd;
+ int iounit;
+ int versioned;
+
+ /* fid hash table */
+ Lock fidtablk[Nfidtab];
+ Fid *fidtab[Nfidtab];
+};
+
+struct Fid {
+ Lock;
+ Fid *next;
+ /*
+ * if opened with OEXEC, we want to use a snapshot,
+ * instead of the most recent root, to prevent
+ * paging in the wrong executable.
+ */
+ Mount *mnt;
+ Scan *scan; /* in progres scan */
+ Dent *dent; /* (pqid, name) ref, modified on rename */
+ void *auth;
+
+ u32int fid;
+ vlong qpath;
+ vlong pqpath;
+ long ref;
+ int mode;
+ int iounit;
+
+ int uid;
+ int duid;
+ int dgid;
+ int dmode;
+
+ char permit;
+ char rclose;
+};
+
+enum {
+ POmod,
+ POrot,
+ POsplit,
+ POmerge,
+};
+
+struct Scanp {
+ int bi;
+ int vi;
+ Blk *b;
+};
+
+struct Scan {
+ vlong offset; /* last read offset */
+ char first;
+ char donescan;
+ char overflow;
+ char present;
+ int ht;
+ Kvp kv;
+ Key pfx;
+ char kvbuf[Kvmax];
+ char pfxbuf[Keymax];
+ Scanp *path;
+};
+
+struct Blk {
+ /* cache entry */
+ Blk *cnext;
+ Blk *cprev;
+ Blk *hnext;
+
+ /* Freelist entry */
+ Blk *fnext;
+
+ long flag;
+
+ /* serialized to disk in header */
+ short type; /* @0, for all */
+ union {
+ struct {
+ short nval; /* @2, for Leaf, Pivot: data[0:2] */
+ short valsz; /* @4, for Leaf, Pivot: data[2:4] */
+ short nbuf; /* @6, for Pivot */
+ short bufsz; /* @8, for Pivot */
+ };
+ struct {
+ int logsz; /* @2 for allocation log */
+ uvlong logh; /* @4 for log body hash */
+ Bptr logp; /* @12 next deadlist chain */
+ };
+ };
+
+ /* debug */
+ uintptr queued;
+ uintptr lasthold;
+ uintptr lastdrop;
+ uintptr enqueued;
+ uintptr cached;
+ uintptr uncached;
+ uintptr alloced;
+ uintptr freed;
+
+ Bptr bp;
+ long ref;
+ char *data;
+ char buf[Blksz];
+ vlong magic;
+};
+
+struct Chan {
+ int size; /* size of queue */
+ long count; /* how many in queue (semaphore) */
+ long avail; /* how many available to send (semaphore) */
+ Lock rl, wl; /* circular pointers */
+ void **rp;
+ void **wp;
+ void* args[]; /* list of saved pointers, [->size] */
+};
--- /dev/null
+++ b/sys/src/cmd/gefs/dump.c
@@ -1,0 +1,365 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <ctype.h>
+
+#include "dat.h"
+#include "fns.h"
+
+char spc[128];
+
+/* format a btree key for debugging, switching on the tag byte k[0] */
+static int
+showkey(Fmt *fmt, Key *k)
+{
+	int n;
+
+	/*
+	 * dent: pqid[8] qid[8] -- a directory entry key.
+	 * ptr: off[8] hash[8] -- a key for an Dir block.
+	 * dir: fixed statbuf header, user ids
+	 */
+	if(k->nk == 0)
+		return fmtprint(fmt, "\"\"");
+	switch(k->k[0]){
+	case Kdat: /* qid[8] off[8] => ptr[16]: pointer to data page */
+		n = fmtprint(fmt, "dat qid:%llx off:%llx",
+			UNPACK64(k->k+1), UNPACK64(k->k+9));
+		break;
+	case Kent: /* pqid[8] name[n] => dir[n]: serialized Dir */
+		n = fmtprint(fmt, "ent dir:%llx, name:\"%.*s\"",
+			UNPACK64(k->k+1), k->nk-11, k->k+11);
+		break;
+	case Klabel: /* name[n] => tree[24]: snapshot ref */
+		n = fmtprint(fmt, "label name:\"%.*s\"", k->nk-1, k->k+1);
+		break;
+	case Ksnap: /* name[n] => tree[24]: snapshot root */
+		n = fmtprint(fmt, "snap id:%lld", UNPACK64(k->k+1));
+		break;
+	case Kup: /* qid[8] => pqid[8]: parent dir */
+		n = fmtprint(fmt, "up dir:%llx", UNPACK64(k->k+1));
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "dlist gen:%lld, bgen:%lld",
+			UNPACK64(k->k+1), UNPACK64(k->k+9));
+		break;
+	default:
+		/* unknown tag: hex dump the whole key */
+		n = fmtprint(fmt, "??? %.*H", k->nk, k->k);
+		break;
+	}
+	return n;
+}
+
+/*
+ * Format a value (or message body) for debugging.  The interpretation
+ * depends on the key tag in v->k[0] and the message op.  When flg is
+ * set (%#P / %#M), the value is printed as a pivot pointer+fill pair
+ * instead of being decoded.
+ */
+static int
+showval(Fmt *fmt, Kvp *v, int op, int flg)
+{
+	int n, ws;
+	char *p;
+	Tree t;
+	Xdir d;
+
+	n = 0;
+	if(flg){
+		assert(v->nv == Ptrsz+2);
+		n = fmtprint(fmt, "(%B,%d)", unpackbp(v->v, v->nv), UNPACK16(v->v+Ptrsz));
+		return n;
+	}
+	if(op == Odelete || op == Oclearb){
+		n = fmtprint(fmt, "delete");
+		return n;
+	}
+	switch(v->k[0]){
+	case Kdat: /* qid[8] off[8] => ptr[16]: pointer to data page */
+		switch(op){
+		case Odelete:
+		case Oclearb:
+			n = 0;
+			break;
+		case Onop:
+		case Oinsert:
+			if(v->nv == Ptrsz)
+				n = fmtprint(fmt, "ptr:%B", unpackbp(v->v, v->nv));
+			else
+				n = fmtprint(fmt, "BROKEN ptr %.*H", v->nk, v->k);
+			break;
+		}
+		break;
+	case Kent: /* pqid[8] name[n] => dir[n]: serialized Dir */
+		switch(op){
+		case Onop:
+		case Oinsert:
+			kv2dir(v, &d);
+			n = fmtprint(fmt, "[qid=(%llux,%lud,%d), %luo, t=%lld,%lld, l=%lld]",
+				d.qid.path, d.qid.vers, d.qid.type,
+				d.mode, d.atime, d.mtime, d.length);
+			break;
+		case Odelete:
+			n = fmtprint(fmt, "delete");
+			break;
+		case Owstat:
+			/* decode the wstat blob: flag byte, then fields in flag-bit order */
+			p = v->v;
+			ws = *p++;
+			if(ws & Owsize){
+				n += fmtprint(fmt, "size:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owmode){
+				n += fmtprint(fmt, "mode:%uo ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmtime){
+				n += fmtprint(fmt, "mtime:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owatime){
+				/* was labelled "mtime:" -- copy-paste from the branch above */
+				n += fmtprint(fmt, "atime:%llx ", UNPACK64(p));
+				p += 8;
+			}
+			if(ws & Owuid){
+				n += fmtprint(fmt, "uid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owgid){
+				n += fmtprint(fmt, "gid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(ws & Owmuid){
+				n += fmtprint(fmt, "muid:%d ", UNPACK32(p));
+				p += 4;
+			}
+			if(p != v->v + v->nv){
+				fprint(2, "v->nv: %d, sz=%d\n", v->nv, (int)(p - v->v));
+				abort();
+			}
+			break;
+		}
+		break;
+	case Ksnap: /* name[n] => dent[16] ptr[16]: snapshot root */
+		switch(op){
+		case Orelink:
+		case Oreprev:
+			n = fmtprint(fmt, "gen: %lld, dlbl: %d, dref: %d",
+				UNPACK64(v->v), v->v[8], v->v[9]);
+			break;
+		case Onop:
+		case Oinsert:
+			if(unpacktree(&t, v->v, v->nv) == nil)
+				n = fmtprint(fmt, "corrupt tree");
+			else
+				n = fmtprint(fmt, "<tree %B [pred=%lld, succ=%lld, nref=%d, nlbl=%d]>",
+					t.bp, t.pred, t.succ, t.nref, t.nlbl);
+			break;
+		default:
+			n = fmtprint(fmt, "?? unknown op %d", op);
+		}
+		break;
+	case Klabel:
+		n = fmtprint(fmt, "snap id:%lld", UNPACK64(v->v+1));
+		break;
+	case Kup: /* qid[8] => pqid[8]: parent dir */
+		/* dropped the stray unbalanced ')' from the format string */
+		n = fmtprint(fmt, "super dir:%llx, name:\"%.*s\"",
+			UNPACK64(v->v+1), v->nv-11, v->v+11);
+		break;
+	case Kdlist:
+		n = fmtprint(fmt, "hd:%B, tl:%B",
+			unpackbp(v->v, v->nv),
+			unpackbp(v->v+Ptrsz, v->nv-Ptrsz));
+		break;
+	default:
+		n = fmtprint(fmt, "??? %.*H", v->nk, v->k);
+		break;
+	}
+	return n;
+}
+
+/* %B: format a block pointer as (addr,hash,gen) */
+int
+Bconv(Fmt *fmt)
+{
+	Bptr bp;
+
+	bp = va_arg(fmt->args, Bptr);
+	return fmtprint(fmt, "(%llx,%.16llux,%llx)", bp.addr, bp.hash, bp.gen);
+}
+
+/* %M: format a btree message; %#M shows the raw pivot pointer form */
+int
+Mconv(Fmt *fmt)
+{
+	char *opname[Nmsgtype] = {
+	/* NOTE(review): no entry for Onop -- opname[Onop] is nil; confirm Onop never reaches here */
+	[Oinsert]	"Oinsert",
+	[Odelete]	"Odelete",
+	[Oclearb]	"Oclearb",
+	[Oclobber]	"Oclobber",
+	[Owstat]	"Owstat",
+	[Orelink]	"Orelink",
+	[Oreprev]	"Oreprev",
+	};
+	Msg *m;
+	int f, n;
+
+	f = (fmt->flags & FmtSharp) != 0;
+	m = va_arg(fmt->args, Msg*);
+	if(m == nil)
+		return fmtprint(fmt, "Msg{nil}");
+	/* Msg embeds Kvp which embeds Key, so m converts to both below */
+	n = fmtprint(fmt, "Msg(%s, ", opname[m->op]);
+	n += showkey(fmt, m);
+	n += fmtprint(fmt, ") => (");
+	n += showval(fmt, m, m->op, f);
+	n += fmtprint(fmt, ")");
+	return n;
+}
+
+/* %P: format a key-value pair; %#P shows the raw pivot pointer form */
+int
+Pconv(Fmt *fmt)
+{
+	Kvp *kv;
+	int f, n;
+
+	f = (fmt->flags & FmtSharp) != 0;
+	kv = va_arg(fmt->args, Kvp*);
+	if(kv == nil)
+		return fmtprint(fmt, "Kvp{nil}");
+	n = fmtprint(fmt, "Kvp(");
+	n += showkey(fmt, kv);
+	n += fmtprint(fmt, ") => (");
+	n += showval(fmt, kv, Onop, f);
+	n += fmtprint(fmt, ")");
+	return n;
+}
+
+/* %K: format a bare key */
+int
+Kconv(Fmt *fmt)
+{
+	Key *k;
+	int n;
+
+	k = va_arg(fmt->args, Key*);
+	if(k == nil)
+		return fmtprint(fmt, "Key{nil}");
+	n = fmtprint(fmt, "Key(");
+	n += showkey(fmt, k);
+	n += fmtprint(fmt, ")");
+	return n;
+}
+
+/* %R: format an allocator range as Arange(off+len) */
+int
+Rconv(Fmt *fmt)
+{
+	Arange *r;
+
+	r = va_arg(fmt->args, Arange*);
+	if(r == nil)
+		return fmtprint(fmt, "<Arange:nil>");
+	return fmtprint(fmt, "Arange(%lld+%lld)", r->off, r->len);
+}
+
+/* %Q: format a 9p qid as (path vers type) */
+int
+Qconv(Fmt *fmt)
+{
+	Qid q;
+
+	q = va_arg(fmt->args, Qid);
+	return fmtprint(fmt, "(%llx %ld %d)", q.path, q.vers, q.type);
+}
+
+/*
+ * Recursively dump a block: pivot buffers and values (descending into
+ * children when recurse is set), leaf values, or a hex prefix for
+ * non-tree blocks.  Indentation is clamped to the size of spc.
+ */
+static void
+rshowblk(int fd, Blk *b, int indent, int recurse)
+{
+	Blk *c;
+	int i;
+	Bptr bp;
+	Kvp kv;
+	Msg m;
+
+	if(indent > sizeof(spc)/4)
+		indent = sizeof(spc)/4;
+	if(b == nil){
+		fprint(fd, "NIL\n");
+		return;
+	}
+	fprint(fd, "%.*s[BLK]|{%B}\n", 4*indent, spc, b->bp);
+	switch(b->type){
+	case Tpivot:
+		for(i = 0; i < b->nbuf; i++){
+			getmsg(b, i, &m);
+			fprint(fd, "%.*s[%03d]|%M\n", 4*indent, spc, i, &m);
+		}
+		/* wet floor: deliberate fallthrough, pivots also print their values */
+	case Tleaf:
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &kv);
+			if(b->type == Tpivot){
+				fprint(fd, "%.*s[%03d]|%#P\n", 4*indent, spc, i, &kv);
+				bp = unpackbp(kv.v, kv.nv);
+				c = getblk(bp, 0);
+				if(recurse)
+					rshowblk(fd, c, indent + 1, 1);
+				dropblk(c);
+			}else{
+				fprint(fd, "%.*s[%03d]|%P\n", 4*indent, spc, i, &kv);
+			}
+		}
+		break;
+	case Tarena:
+		fprint(fd, "arena -- ");
+		goto Show;
+	case Tlog:
+		fprint(fd, "log -- ");
+		goto Show;
+	case Tdlist:
+		fprint(fd, "dlist -- ");
+		goto Show;
+	case Tdat:
+		fprint(fd, "dat -- ");
+	Show:
+		/* hex dump of the first 32 bytes, grouped in fours */
+		for(i = 0; i < 32; i++){
+			fprint(fd, "%x", b->buf[i] & 0xff);
+			if(i % 4 == 3)
+				fprint(fd, " ");
+		}
+		fprint(fd, "\n");
+		break;
+	}
+}
+
+/* dump block b under a banner line; recurse into children if asked */
+void
+showblk(int fd, Blk *b, char *m, int recurse)
+{
+	fprint(fd, "=== %s\n", m);
+	rshowblk(fd, b, 0, recurse);
+}
+
+/* fetch and dump the block at bp; GBnochk so even corrupt blocks display */
+void
+showbp(int fd, Bptr bp, int recurse)
+{
+	Blk *b;
+
+	b = getblk(bp, GBnochk);
+	rshowblk(fd, b, 0, recurse);
+	dropblk(b);
+}
+
+/* print the on-disk fields of a tree root, one per line */
+void
+showtreeroot(int fd, Tree *t)
+{
+	fprint(fd, "\tflag\t0x%x\n", t->flag);
+	fprint(fd, "\tgen:\t%lld\n", t->gen);
+	fprint(fd, "\tbase\t%lld\n", t->base);
+	fprint(fd, "\tpred:\t%lld\n", t->pred);
+	fprint(fd, "\tsucc:\t%lld\n", t->succ);
+	fprint(fd, "\tnref:\t%d\n", t->nref);
+	fprint(fd, "\tnlbl:\t%d\n", t->nlbl);
+	fprint(fd, "\tht:\t%d\n", t->ht);
+	fprint(fd, "\tbp:\t%B\n", t->bp);
+}
+
+/* fill the indent template: a '|' every 4 columns, spaces between */
+void
+initshow(void)
+{
+	int i;
+
+	for(i = 0; i < sizeof(spc); i++)
+		spc[i] = (i % 4 == 0) ? '|' : ' ';
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/error.c
@@ -1,0 +1,77 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include "dat.h"
+
+/*
+ * Error message strings raised via error() throughout gefs.
+ * Each message is defined exactly once here.
+ */
+char Ecorrupt[] = "block contents corrupted";
+char Efsvers[] = "unknown fs version";
+char Eimpl[] = "not implemented";
+char Ebotch[] = "protocol botch";
+char Eio[] = "i/o error";
+char Enofid[] = "unknown fid";
+char Efid[] = "fid in use";
+char Etype[] = "invalid fid type";
+char Edscan[] = "invalid dir scan offset";
+char Esrch[] = "directory entry not found";
+char Eexist[] = "create/wstat -- file exists";
+char Emode[] = "open/create -- unknown mode";
+char Efull[] = "file system full";
+char Estuffed[] = "emergency blocks exhausted";
+char Eauth[] = "authentication failed";
+char Elength[] = "name too long";
+char Eperm[] = "permission denied";
+char Einuse[] = "resource in use";
+char Ebadf[] = "invalid file";
+char Ename[] = "create/wstat -- bad character in file name";
+char Enomem[] = "out of memory";
+char Eattach[] = "attach required";
+char Enosnap[] = "attach -- bad specifier";
+char Edir[] = "invalid directory";
+char Esyntax[] = "syntax error";
+char Enouser[] = "user does not exist";
+char Enogrp[] = "group does not exist";
+char Efsize[] = "file too big";
+char Ebadu[] = "attach -- unknown user or failed authentication";
+char Erdonly[] = "file system read only";
+char Elocked[] = "open/create -- file is locked";
+char Eauthp[] = "authread -- auth protocol not finished";
+char Eauthd[] = "authread -- not enough data";
+char Eauthph[] = "auth phase error";
+char Enone[] = "auth -- user 'none' requires no authentication";
+char Enoauth[] = "auth -- authentication disabled";
+char Ephase[] = "phase error -- use after remove";
+
+/* wstat-specific errors, named by the field they reject */
+char Ewstatb[] = "wstat -- unknown bits in qid.type/mode";
+char Ewstatd[] = "wstat -- attempt to change directory";
+char Ewstatg[] = "wstat -- not in group";
+char Ewstatl[] = "wstat -- attempt to make length negative";
+char Ewstatm[] = "wstat -- attempt to change muid";
+char Ewstato[] = "wstat -- not owner or group leader";
+char Ewstatp[] = "wstat -- attempt to change qid.path";
+char Ewstatq[] = "wstat -- qid.type/dir.mode mismatch";
+char Ewstatu[] = "wstat -- not owner";
+char Ewstatv[] = "wstat -- attempt to change qid.vers";
+char Enempty[] = "directory is not empty";
+
+//char Echar[] = "bad character in directory name";
+//char Eopen[] = "read/write -- on non open fid";
+//char Ecount[] = "read/write -- count too big";
+//char Ealloc[] = "phase error -- directory entry not allocated";
+//char Eqid[] = "phase error -- qid does not match";
+//char Eaccess[] = "access permission denied";
+//char Eentry[] = "directory entry not found";
+//char Edir1[] = "walk -- in a non-directory";
+//char Edir2[] = "create -- in a non-directory";
+//char Edot[] = "create/wstat -- . and .. illegal names";
+//char Ewalk[] = "walk -- too many (system wide)";
+//char Eoffset[] = "read/write -- offset negative";
+//char Ebroken[] = "read/write -- lock is broken";
+//char Eauth[] = "attach -- authentication failed";
+//char Eauth2[] = "read/write -- authentication unimplemented";
+//char Etoolong[] = "name too long";
+//char Efidinuse[] = "fid in use";
+//char Eversion[] = "version conversion";
+//char Eauthnone[] = "auth -- user 'none' requires no authentication";
+//char Eauthdisabled[] = "auth -- authentication disabled"; /* development */
+//char Eauthfile[] = "auth -- out of auth files";
--- /dev/null
+++ b/sys/src/cmd/gefs/fns.h
@@ -1,0 +1,211 @@
+#pragma varargck type "M" Msg*
+#pragma varargck type "P" Kvp*
+#pragma varargck type "K" Key*
+#pragma varargck type "V" Val*
+#pragma varargck type "B" Bptr
+#pragma varargck type "R" Arange*
+#pragma varargck type "X" char*
+#pragma varargck type "Q" Qid
+
+extern Gefs* fs;
+extern int debug;
+extern int permissive;
+extern int usereserve;
+extern char* reamuser;
+extern Errctx** errctx;
+extern Blk* blkbuf;
+extern int noneid;
+extern int nogroupid;
+extern int admid;
+
+/*
+ * Big-endian unpacking.  Each half is built in an unsigned type:
+ * the old UNPACK64 assembled each 32-bit half as a signed int, and
+ * casting a negative int to u64int sign-extends, so a set bit 31
+ * in the low half OR'd 0xFFFFFFFF into the high word of the result.
+ */
+#define UNPACK8(p)	(((uchar*)(p))[0])
+#define UNPACK16(p)	((((uchar*)(p))[0]<<8)|(((uchar*)(p))[1]))
+#define UNPACK32(p)	(((u32int)((uchar*)(p))[0]<<24)|((u32int)((uchar*)(p))[1]<<16)|\
+			((u32int)((uchar*)(p))[2]<<8)|((u32int)((uchar*)(p))[3]))
+#define UNPACK64(p)	(((u64int)UNPACK32(p)<<32)|(u64int)UNPACK32((uchar*)(p)+4))
+
+#define PACK8(p,v) do{(p)[0]=(v);}while(0)
+#define PACK16(p,v) do{(p)[0]=(v)>>8;(p)[1]=(v);}while(0)
+#define PACK32(p,v) do{(p)[0]=(v)>>24;(p)[1]=(v)>>16;(p)[2]=(v)>>8;(p)[3]=(v);}while(0)
+#define PACK64(p,v) do{(p)[0]=(v)>>56;(p)[1]=(v)>>48;(p)[2]=(v)>>40;(p)[3]=(v)>>32;\
+ (p)[4]=(v)>>24;(p)[5]=(v)>>16;(p)[6]=(v)>>8;(p)[7]=(v);}while(0)
+
+void* emalloc(usize, int);
+
+Blk* newblk(Tree *, int, vlong);
+Blk* dupblk(Tree *, Blk*);
+Blk* getroot(Tree*, int*);
+Blk* getblk(Bptr, int);
+Blk* holdblk(Blk*);
+void dropblk(Blk*);
+
+void lrutop(Blk*);
+void lrubot(Blk*);
+void cacheins(Blk*);
+void cachedel(vlong);
+Blk* cacheget(vlong);
+Blk* cachepluck(void);
+
+void qinit(Syncq*);
+void qput(Syncq*, Qent);
+
+Arena* getarena(vlong);
+void syncblk(Blk*);
+void enqueue(Blk*);
+void epochstart(int);
+void epochend(int);
+void epochwait(void);
+void epochclean(void);
+void limbo(Bfree*);
+void freeblk(Tree*, Blk*, Bptr);
+int logbarrier(Arena *, vlong);
+void dlappend(Dlist *dl, Bptr);
+void killblk(Tree*, Bptr);
+void blkdealloc(vlong);
+ushort blkfill(Blk*);
+uvlong blkhash(Blk*);
+uvlong bufhash(void*, usize);
+u32int ihash(uvlong);
+void finalize(Blk*);
+
+Mount* getmount(char*);
+void clunkmount(Mount*);
+
+void updatesnap(Tree**, Tree*, char*, int);
+void tagsnap(Tree*, char*, int);
+void delsnap(Tree*, vlong, char*);
+void freedl(Dlist*, int);
+Tree* opensnap(char*, int*);
+
+void closesnap(Tree*);
+void reamfs(char*);
+void growfs(char*);
+void loadarena(Arena*, Bptr);
+void loadfs(char*);
+void loadlog(Arena*, Bptr);
+int scandead(Dlist*, int, void(*)(Bptr, void*), void*);
+int endfs(void);
+void compresslog(Arena*);
+void dlsync(void);
+void setval(Blk*, Kvp*);
+
+Conn* newconn(int, int);
+
+int walk1(Tree*, vlong, char*, Qid*, vlong*);
+void loadusers(int, Tree*);
+User* uid2user(int);
+User* name2user(char*);
+
+void btupsert(Tree*, Msg*, int);
+int btlookup(Tree*, Key*, Kvp*, char*, int);
+void btnewscan(Scan*, char*, int);
+void btenter(Tree*, Scan*);
+int btnext(Scan*, Kvp*);
+void btexit(Scan*);
+
+int checkflag(Blk *b, int);
+void setflag(Blk *b, int);
+void clrflag(Blk *b, int);
+
+char* estrdup(char*);
+
+int keycmp(Key *, Key *);
+void cpkey(Key*, Key*, char*, int);
+void cpkvp(Kvp*, Kvp*, char*, int);
+
+/* for dumping */
+void getval(Blk*, int, Kvp*);
+void getmsg(Blk*, int, Msg*);
+Bptr getptr(Kvp*, int*);
+
+void initshow(void);
+void showblk(int, Blk*, char*, int);
+void showbp(int, Bptr, int);
+void showtreeroot(int, Tree*);
+int checkfs(int);
+
+#define dprint(...) \
+ do{ \
+ if(debug) fprint(2, __VA_ARGS__); \
+ }while(0)
+
+#define fatal(...) \
+ do{ \
+ fprint(2, __VA_ARGS__); \
+ abort(); \
+ }while(0)
+
+#define tracex(msg, bp, v0, v1) \
+ do{ \
+ if(fs->trace != nil) \
+ _trace(msg, bp, v0, v1); \
+ } while(0)
+
+#define traceb(msg, bp) tracex(msg, bp, -1, -1)
+#define tracev(msg, v0) tracex(msg, Zb, v0, -1)
+#define tracem(msg) tracex(msg, Zb, -1, -1)
+
+jmp_buf* _waserror(void);
+_Noreturn void error(char*, ...);
+_Noreturn void broke(char*, ...);
+_Noreturn void nexterror(void);
+#define waserror() (setjmp(*_waserror()))
+#define errmsg() ((*errctx)->err)
+#define poperror() assert((*errctx)->nerrlab-- > 0)
+#define estacksz() ((*errctx)->nerrlab)
+void _trace(char*, Bptr, vlong, vlong);
+char* packstr(char*, char*, char*);
+
+void dir2kv(vlong, Xdir*, Kvp*, char*, int);
+int dir2statbuf(Xdir*, char*, int);
+void dlist2kv(Dlist*, Kvp*, char*, int);
+void lbl2kv(char*, vlong, uint, Kvp*, char*, int);
+void link2kv(vlong, vlong, Kvp*, char*, int);
+void retag2kv(vlong, vlong, int, int, Kvp*, char*, int);
+void tree2kv(Tree*, Kvp*, char*, int);
+
+void kv2dir(Kvp*, Xdir*);
+void kv2dlist(Kvp*, Dlist*);
+void kv2link(Kvp*, vlong*, vlong*);
+void kv2qid(Kvp*, Qid*);
+int kv2statbuf(Kvp*, char*, int);
+
+char* packarena(char*, int, Arena*);
+char* packbp(char*, int, Bptr*);
+char* packdkey(char*, int, vlong, char*);
+char* packdval(char*, int, Xdir*);
+char* packlbl(char*, int, char*);
+char* packsnap(char*, int, vlong);
+char* packsuper(char*, int, vlong);
+char* packtree(char*, int, Tree*);
+char* packsb(char*, int, Gefs*);
+
+char* unpackarena(Arena*, char*, int);
+Bptr unpackbp(char*, int);
+char* unpackdkey(char*, int, vlong*);
+Tree* unpacktree(Tree*, char*, int);
+char* unpacksb(Gefs*, char*, int);
+char* unpackstr(char*, char*, char**);
+
+/* fmt */
+int Bconv(Fmt*);
+int Mconv(Fmt*);
+int Pconv(Fmt*);
+int Rconv(Fmt*);
+int Kconv(Fmt*);
+int Qconv(Fmt*);
+
+Chan*	mkchan(int);
+void*	chrecv(Chan*);
+void	chsend(Chan*, void*);
+void	runfs(int, void*);
+void	runmutate(int, void*);
+void	runread(int, void*);
+void	runcons(int, void*);
+void	runtasks(int, void*);
+void	runsync(int, void*);
+void	runsweep(int, void*);
--- /dev/null
+++ b/sys/src/cmd/gefs/fs.c
@@ -1,0 +1,2657 @@
+#include <u.h>
+#include <libc.h>
+#include <auth.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+static void respond(Fmsg*, Fcall*);
+static void rerror(Fmsg*, char*, ...);
+static void clunkfid(Conn*, Fid*, Amsg**);
+
+/*
+ * Look up the child named name of directory up in tree t and
+ * return its qid and length.  Returns 0 on success, -1 when the
+ * entry does not exist.
+ */
+int
+walk1(Tree *t, vlong up, char *name, Qid *qid, vlong *len)
+{
+	char *p, kbuf[Keymax], rbuf[Kvmax];
+	Xdir d;
+	Kvp kv;
+	Key k;
+
+	/* the old `err` variable was dead: set to 0 and never changed */
+	p = packdkey(kbuf, sizeof(kbuf), up, name);
+	k.k = kbuf;
+	k.nk = p - kbuf;
+	if(!btlookup(t, &k, &kv, rbuf, sizeof(rbuf)))
+		return -1;
+	kv2dir(&kv, &d);
+	*qid = d.qid;
+	*len = d.length;
+	return 0;
+}
+
+/*
+ * Force all queued writes out to disk: bump the queue generation,
+ * post a fence entry to every syncer's queue, and sleep until all
+ * syncers have drained past their fence.
+ * NOTE(review): rsleep on syncrz normally requires holding its
+ * associated lock -- confirm the caller's locks cover this.
+ */
+static void
+wrbarrier(void)
+{
+	Qent qe;
+	int i;
+
+	aincv(&fs->qgen, 1);
+	tracev("barrier", fs->qgen);
+	fs->syncing = fs->nsyncers;
+	for(i = 0; i < fs->nsyncers; i++){
+		/* a fence entry carries no block, just a marker bp */
+		qe.op = Qfence;
+		qe.bp.addr = 0;
+		qe.bp.hash = -1;
+		qe.bp.gen = -1;
+		qe.b = nil;
+		qput(&fs->syncq[i], qe);
+	}
+	aincv(&fs->qgen, 1);
+	/* each syncer decrements syncing when it reaches its fence */
+	while(fs->syncing != 0)
+		rsleep(&fs->syncrz);
+	tracev("flushed", fs->qgen);
+}
+
+/*
+ * Write a consistent snapshot of the whole file system to disk.
+ * The multi-pass ordering -- data, arena headers, superblocks,
+ * arena footers, then deadlist cleanup -- is what keeps the
+ * on-disk state recoverable if we crash between any two passes.
+ */
+static void
+sync(void)
+{
+	Mount *mnt;
+	Arena *a;
+	Dlist dl;
+	int i;
+
+
+	qlock(&fs->synclk);
+	if(waserror()){
+		fprint(2, "failed to sync: %s\n", errmsg());
+		qunlock(&fs->synclk);
+		nexterror();
+	}
+
+	/*
+	 * Wait for data that we're syncing to hit disk
+	 */
+	tracem("flush1");
+	wrbarrier();
+	/*
+	 * pass 0: Update all open snapshots, and
+	 * pack the blocks we want to sync. Snap
+	 * while holding the write lock, and then
+	 * wait until all the blocks they point at
+	 * have hit disk; once they're on disk, we
+	 * can take a consistent snapshot.
+	 */
+	qlock(&fs->mutlk);
+	tracem("packb");
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next)
+		updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+	/*
+	 * Now that we've updated the snaps, we can sync the
+	 * dlist; the snap tree will not change from here.
+	 */
+	dlsync();
+	/* steal the snap deadlist; it is freed in pass 4 below */
+	dl = fs->snapdl;
+	fs->snapdl.hd = Zb;
+	fs->snapdl.tl = Zb;
+	fs->snapdl.ins = nil;
+	traceb("syncdl.dl", dl.hd);
+	traceb("syncdl.rb", fs->snap.bp);
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		qlock(a);
+		/*
+		 * because the log uses preallocated
+		 * blocks, we need to write the log
+		 * block out synchronously, or it may
+		 * get reused.
+		 */
+		logbarrier(a, fs->qgen);
+		finalize(a->logtl);
+		syncblk(a->logtl);
+
+		packarena(a->h0->data, Blksz, a);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h0);
+		finalize(a->h1);
+		setflag(a->h0, Bdirty);
+		setflag(a->h1, Bdirty);
+		fs->arenabp[i] = a->h0->bp;
+		qunlock(a);
+	}
+	assert(fs->snapdl.hd.addr == -1);
+	traceb("packsb.rb", fs->snap.bp);
+	packsb(fs->sb0->buf, Blksz, fs);
+	packsb(fs->sb1->buf, Blksz, fs);
+	finalize(fs->sb0);
+	finalize(fs->sb1);
+	fs->snap.dirty = 0;
+	qunlock(&fs->mutlk);
+
+	/*
+	 * pass 1: sync block headers; if we crash here,
+	 * the block footers are consistent, and we can
+	 * use them.
+	 */
+	tracem("arenas0");
+	for(i = 0; i < fs->narena; i++)
+		enqueue(fs->arenas[i].h0);
+	wrbarrier();
+
+	/*
+	 * pass 2: sync superblock; we have a consistent
+	 * set of block headers, so if we crash, we can
+	 * use the loaded block headers; the footers will
+	 * get synced after so that we can use them next
+	 * time around.
+	 */
+	qlock(&fs->mutlk);
+	tracem("supers");
+	syncblk(fs->sb0);
+	syncblk(fs->sb1);
+
+	/*
+	 * pass 3: sync block footers; if we crash here,
+	 * the block headers are consistent, and we can
+	 * use them.
+	 */
+	tracem("arenas1");
+	for(i = 0; i < fs->narena; i++)
+		enqueue(fs->arenas[i].h1);
+
+	/*
+	 * Pass 4: clean up the old snap tree's deadlist
+	 */
+	tracem("snapdl");
+	wrbarrier();
+	qunlock(&fs->mutlk);
+	freedl(&dl, 1);
+	qunlock(&fs->synclk);
+	tracem("synced");
+	poperror();
+}
+
+/*
+ * Handle a snapshot admin request: delete the snapshot a->old, or
+ * label/fork it as a->new.  When deleting the last reference, the
+ * tree is handed back through *tp so the caller can reclaim its
+ * blocks.  Progress and errors are reported on a->fd when valid.
+ */
+static void
+snapfs(Amsg *a, Tree **tp)
+{
+	Tree *t, *s;
+	Mount *mnt;
+
+	if(waserror()){
+		*tp = nil;
+		nexterror();
+	}
+	t = nil;
+	*tp = nil;
+	/* mnt is non-nil after the loop only if a->old is mounted */
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+		if(strcmp(a->old, mnt->name) == 0){
+			updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+			t = agetp(&mnt->root);
+			ainc(&t->memref);
+			break;
+		}
+	}
+	if(t == nil && (t = opensnap(a->old, nil)) == nil){
+		if(a->fd != -1)
+			fprint(a->fd, "snap: open '%s': does not exist\n", a->old);
+		poperror();
+		return;
+	}
+	if(a->delete){
+		if(mnt != nil) {
+			/*
+			 * NOTE(review): this return skips closesnap(t) -- confirm
+			 * the reference taken above is not leaked here.
+			 */
+			if(a->fd != -1)
+				fprint(a->fd, "snap: snap is mounted: '%s'\n", a->old);
+			poperror();
+			return;
+		}
+		/* last label, no refs, no successor: hand the tree back for reclaim */
+		if(t->nlbl == 1 && t->nref <= 1 && t->succ == -1){
+			aincl(&t->memref, 1);
+			*tp = t;
+		}
+		delsnap(t, t->succ, a->old);
+	}else{
+		if((s = opensnap(a->new, nil)) != nil){
+			if(a->fd != -1)
+				fprint(a->fd, "snap: already exists '%s'\n", a->new);
+			closesnap(s);
+			poperror();
+			return;
+		}
+		tagsnap(t, a->new, a->flag);
+	}
+	closesnap(t);
+	poperror();
+	if(a->fd != -1){
+		if(a->delete)
+			fprint(a->fd, "deleted: %s\n", a->old);
+		else if(a->flag & Lmut)
+			fprint(a->fd, "forked: %s from %s\n", a->new, a->old);
+		else
+			fprint(a->fd, "labeled: %s from %s\n", a->new, a->old);
+	}
+}
+
+/* synthesize the root directory entry for the synthetic dump hierarchy */
+static void
+filldumpdir(Xdir *d)
+{
+	memset(d, 0, sizeof(Xdir));
+	d->name = "/";
+	d->mode = 0555;
+	d->qid.path = Qdump;
+	d->qid.vers = fs->nextgen;
+	d->qid.type = QTDIR;
+	d->uid = -1;
+	d->gid = -1;
+	d->muid = -1;
+	d->atime = 0;
+	d->mtime = 0;
+	d->length = 0;
+}
+
+/*
+ * Validate a file name: nonempty, not "." or "..", no control
+ * characters or slashes, NUL-terminated within Maxname bytes.
+ * Returns 0 when acceptable, -1 otherwise.
+ */
+static int
+okname(char *name)
+{
+	int i, c;
+
+	if(name[0] == '\0')
+		return -1;
+	if(strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
+		return -1;
+	for(i = 0; i < Maxname; i++){
+		c = name[i] & 0xff;
+		if(c == 0)
+			return 0;
+		if(c < 0x20 || c == '/')
+			return -1;
+	}
+	/* no terminator found within Maxname bytes: too long */
+	return -1;
+}
+
+/* allocate a channel with room for size queued pointers; fatal on failure */
+Chan*
+mkchan(int size)
+{
+	Chan *c;
+
+	c = mallocz(sizeof(Chan) + size*sizeof(void*), 1);
+	if(c == nil)
+		sysfatal("create channel");
+	c->size = size;
+	c->avail = size;
+	c->count = 0;
+	c->rp = c->args;
+	c->wp = c->args;
+	return c;
+}
+
+/*
+ * Receive a pointer from channel c, blocking while it is empty.
+ * The count semaphore tracks queued items: try a lock-free
+ * decrement first and fall back to semacquire, then pull the item
+ * out under the read lock and signal a newly free slot.
+ */
+void*
+chrecv(Chan *c)
+{
+	void *a;
+	long v;
+
+	v = agetl(&c->count);
+	if(v == 0 || !acasl(&c->count, v, v-1))
+		semacquire(&c->count, 1);
+	lock(&c->rl);
+	a = *c->rp;
+	/* wrap the read pointer around the circular buffer */
+	if(++c->rp >= &c->args[c->size])
+		c->rp = c->args;
+	unlock(&c->rl);
+	semrelease(&c->avail, 1);
+	return a;
+}
+
+/*
+ * Send pointer m on channel c, blocking while it is full.  The
+ * avail semaphore tracks free slots: try a lock-free decrement
+ * first and fall back to semacquire, then store the item under
+ * the write lock and signal a newly queued item.
+ */
+void
+chsend(Chan *c, void *m)
+{
+	long v;
+
+	v = agetl(&c->avail);
+	if(v == 0 || !acasl(&c->avail, v, v-1))
+		semacquire(&c->avail, 1);
+	lock(&c->wl);
+	*c->wp = m;
+	/* wrap the write pointer around the circular buffer */
+	if(++c->wp >= &c->args[c->size])
+		c->wp = c->args;
+	unlock(&c->wl);
+	semrelease(&c->count, 1);
+}
+
+/*
+ * Tear down a connection after a fatal protocol error: log the
+ * reason, close its descriptors, and clunk every fid in its
+ * table, forwarding any resulting rclose work to the admin
+ * channel.
+ */
+static void
+fshangup(Conn *c, char *fmt, ...)
+{
+	char buf[ERRMAX];
+	va_list ap;
+	Amsg *a;
+	Fid *f;
+	int i;
+
+	va_start(ap, fmt);
+	vsnprint(buf, sizeof(buf), fmt, ap);
+	va_end(ap);
+	fprint(2, "hangup: %s\n", buf);
+	close(c->rfd);
+	close(c->wfd);
+	for(i = 0; i < Nfidtab; i++){
+		lock(&c->fidtablk[i]);
+		for(f = c->fidtab[i]; f != nil; f = f->next){
+			lock(f);
+			if(waserror()){
+				unlock(f);
+				continue;
+			}
+			a = nil;
+			clunkfid(c, f, &a);
+			unlock(f);
+			if(a != nil)
+				chsend(fs->admchan, a);
+			/*
+			 * the unconditional nexterror() jumps back into the
+			 * waserror() branch above, which unlocks f and moves
+			 * on; errors raised inside clunkfid take the same
+			 * path.  This both pops the error label and reuses
+			 * its cleanup.
+			 */
+			nexterror();
+		}
+		unlock(&c->fidtablk[i]);
+	}
+}
+
+/*
+ * Marshal and write reply r for request m, then release the
+ * per-tag flush lock held while the request ran, and free m.
+ * A short write hangs up the whole connection.
+ */
+static void
+respond(Fmsg *m, Fcall *r)
+{
+	RWLock *lk;
+	uchar buf[Max9p+IOHDRSZ];
+	int w, n;
+
+	r->tag = m->tag;
+	dprint("→ %F\n", r);
+	/* replies must match their request type (or be Rerror) */
+	assert(m->type+1 == r->type || r->type == Rerror);
+	if((n = convS2M(r, buf, sizeof(buf))) == 0)
+		abort();
+	qlock(&m->conn->wrlk);
+	w = write(m->conn->wfd, buf, n);
+	qunlock(&m->conn->wrlk);
+	if(w != n)
+		fshangup(m->conn, Eio);
+	/*
+	 * NOTE(review): assumes a Tflush holds the flushed tag's
+	 * lock exclusively while other requests hold their own
+	 * tag's lock shared, taken at dispatch -- confirm against
+	 * the dispatcher.
+	 */
+	if(m->type == Tflush){
+		lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
+		wunlock(lk);
+	}else{
+		lk = &fs->flushq[ihash(m->tag) % Nflushtab];
+		runlock(lk);
+	}
+	free(m);
+}
+
+static void
+rerror(Fmsg *m, char *fmt, ...)
+{
+ char buf[128];
+ va_list ap;
+ Fcall r;
+
+ va_start(ap, fmt);
+ vsnprint(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+ r.type = Rerror;
+ r.ename = buf;
+ respond(m, &r);
+}
+
+
+/*
+ * Apply a batch of nm messages to the mounted tree.  Refuses
+ * writes on read-only mounts, and refreshes the mount's snapshot
+ * first when the root is shared (more than one label, or
+ * outstanding references) so the mutation lands on a private tree.
+ */
+static void
+upsert(Mount *mnt, Msg *m, int nm)
+{
+	if(!(mnt->flag & Lmut))
+		error(Erdonly);
+	if(mnt->root->nlbl != 1 || mnt->root->nref != 0)
+		updatesnap(&mnt->root, mnt->root, mnt->name, mnt->flag);
+	btupsert(mnt->root, m, nm);
+}
+
+/*
+ * When truncating a file, mutations need
+ * to wait for the sweeper to finish; this
+ * means the mutator needs to release the
+ * mutation lock, exit the epoch, and
+ * allow the sweeper to finish its job
+ * before resuming.
+ *
+ * Caller must hold fs->mutlk and be inside epoch id; both are
+ * dropped while waiting and reacquired before returning.
+ */
+static void
+truncwait(Dent *de, int id)
+{
+	epochend(id);
+	qunlock(&fs->mutlk);
+	qlock(&de->trunclk);
+	while(de->trunc)
+		rsleep(&de->truncrz);
+	qunlock(&de->trunclk);
+	qlock(&fs->mutlk);
+	epochstart(id);
+}
+
+/*
+ * Read up to n bytes at offset o of file f (size sz) from tree t
+ * into d, never crossing the boundary of the containing disk
+ * block.  Returns the number of bytes produced; unmapped blocks
+ * read back as zeroes (sparse files).
+ */
+static int
+readb(Tree *t, Fid *f, char *d, vlong o, vlong n, vlong sz)
+{
+	char buf[17], kvbuf[17+32];
+	vlong fb, fo;
+	Bptr bp;
+	Blk *b;
+	Key k;
+	Kvp kv;
+
+	if(o >= sz)
+		return 0;
+
+	/* split the offset into block base and in-block offset */
+	fb = o & ~(Blksz-1);
+	fo = o & (Blksz-1);
+	if(fo+n > Blksz)
+		n = Blksz-fo;
+
+	/* key layout: Kdat byte, 8-byte qid path, 8-byte block offset */
+	k.k = buf;
+	k.nk = sizeof(buf);
+	k.k[0] = Kdat;
+	PACK64(k.k+1, f->qpath);
+	PACK64(k.k+9, fb);
+
+	if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf))){
+		/* hole: no block mapped at this offset */
+		memset(d, 0, n);
+		return n;
+	}
+
+	bp = unpackbp(kv.v, kv.nv);
+	b = getblk(bp, GBraw);
+	memcpy(d, b->buf+fo, n);
+	dropblk(b);
+	return n;
+}
+
+/*
+ * Write n bytes from s at offset o of file f (current size sz),
+ * copy-on-write: a fresh data block is allocated, seeded from the
+ * old block when the write is partial and within the current
+ * size, then queued for write-out.  The new block pointer is
+ * packed into m's value and also returned via ret.  Returns the
+ * number of bytes consumed (clipped at the block boundary).
+ */
+static int
+writeb(Fid *f, Msg *m, Bptr *ret, char *s, vlong o, vlong n, vlong sz)
+{
+	char buf[Kvmax];
+	vlong fb, fo;
+	Blk *b, *t;
+	Tree *r;
+	Bptr bp;
+	Kvp kv;
+
+	fb = o & ~(Blksz-1);
+	fo = o & (Blksz-1);
+
+	/* key: Kdat byte, 8-byte qid path, 8-byte block offset */
+	m->k[0] = Kdat;
+	PACK64(m->k+1, f->qpath);
+	PACK64(m->k+9, fb);
+
+	b = newblk(f->mnt->root, Tdat, f->qpath);
+	t = nil;
+	r = f->mnt->root;
+	if(btlookup(r, m, &kv, buf, sizeof(buf))){
+		bp = unpackbp(kv.v, kv.nv);
+		/* partial overwrite of an existing block: copy old contents in */
+		if(fb < sz && (fo != 0 || n != Blksz)){
+			t = getblk(bp, GBraw);
+			memcpy(b->buf, t->buf, Blksz);
+			dropblk(t);
+		}
+	}
+	if(fo+n > Blksz)
+		n = Blksz-fo;
+	memcpy(b->buf+fo, s, n);
+	if(t == nil){
+		/* fresh block: zero everything the write didn't cover */
+		if(fo > 0)
+			memset(b->buf, 0, fo);
+		if(fo+n < Blksz)
+			memset(b->buf+fo+n, 0, Blksz-fo-n);
+	}
+	enqueue(b);
+
+	packbp(m->v, m->nv, &b->bp);
+	*ret = b->bp;
+	dropblk(b);
+	return n;
+}
+
+/*
+ * Find or create the in-memory dent for directory entry d under
+ * parent pqid, returning it with a reference taken; returns nil
+ * when its key cannot be packed.
+ */
+static Dent*
+getdent(vlong pqid, Xdir *d)
+{
+	Dent *de;
+	char *e;
+	u32int h;
+
+	h = ihash(d->qid.path) % Ndtab;
+	lock(&fs->dtablk);
+	for(de = fs->dtab[h]; de != nil; de = de->next){
+		if(de->qid.path == d->qid.path){
+			ainc(&de->ref);
+			goto Out;
+		}
+	}
+
+	de = emalloc(sizeof(Dent), 1);
+	de->Xdir = *d;
+	de->ref = 1;
+	de->up = pqid;
+	de->qid = d->qid;
+	de->length = d->length;
+	de->truncrz.l = &de->trunclk;
+
+	if((e = packdkey(de->buf, sizeof(de->buf), pqid, d->name)) == nil){
+		free(de);
+		de = nil;
+		goto Out;
+	}
+	de->k = de->buf;
+	de->nk = e - de->buf;
+	/*
+	 * name begins 11 bytes into the packed key -- presumably the
+	 * type byte, 8-byte parent qid, and 2-byte length prefix;
+	 * confirm against packdkey.
+	 */
+	de->name = de->buf + 11;
+	de->next = fs->dtab[h];
+	fs->dtab[h] = de;
+
+Out:
+	unlock(&fs->dtablk);
+	return de;
+}
+
+/*
+ * Scan the snap tree for this mount's automatic "@minute." and
+ * "@hour." labels and load their names into the mount's rotation
+ * rings (60 minutely slots, 24 hourly slots).
+ */
+static void
+loadautos(Mount *mnt)
+{
+	char pfx[128];
+	int m, h, ns;
+	uint flg;
+	Scan s;
+
+	m = 0;
+	h = 0;
+	pfx[0] = Klabel;
+	ns = snprint(pfx+1, sizeof(pfx)-1, "%s@minute.", mnt->name);
+	btnewscan(&s, pfx, ns+1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		/* label flags live after the 8-byte value prefix */
+		flg = UNPACK32(s.kv.v+1+8);
+		if(flg & Lauto){
+			/* key is Klabel byte + label name */
+			memcpy(mnt->minutely[m], s.kv.k+1, s.kv.nk-1);
+			mnt->minutely[m][s.kv.nk-1] = 0;
+			m = (m+1)%60;
+			continue;
+		}
+	}
+	btexit(&s);
+
+	pfx[0] = Klabel;
+	ns = snprint(pfx+1, sizeof(pfx)-1, "%s@hour.", mnt->name);
+	btnewscan(&s, pfx, ns+1);
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		flg = UNPACK32(s.kv.v+1+8);
+		if(flg & Lauto){
+			memcpy(mnt->hourly[h], s.kv.k+1, s.kv.nk-1);
+			mnt->hourly[h][s.kv.nk-1] = 0;
+			h = (h+1)%24;
+			continue;
+		}
+	}
+	btexit(&s);
+}
+
+/*
+ * Find or create the mount for snapshot name, returning it with a
+ * reference taken.  "dump" resolves to the preallocated snapshot
+ * mount; otherwise the snapshot is opened and a new mount linked
+ * into fs->mounts.  Raises Enosnap/Enomem on failure.
+ */
+Mount *
+getmount(char *name)
+{
+	Mount *mnt;
+	Tree *t;
+	int flg;
+
+	if(strcmp(name, "dump") == 0){
+		ainc(&fs->snapmnt->ref);
+		return fs->snapmnt;
+	}
+
+	for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+		if(strcmp(name, mnt->name) == 0){
+			ainc(&mnt->ref);
+			goto Out;
+		}
+	}
+
+	if((mnt = mallocz(sizeof(*mnt), 1)) == nil)
+		error(Enomem);
+	/* free the half-built mount if anything below raises */
+	if(waserror()){
+		free(mnt);
+		nexterror();
+	}
+	mnt->ref = 1;
+	snprint(mnt->name, sizeof(mnt->name), "%s", name);
+	if((t = opensnap(name, &flg)) == nil)
+		error(Enosnap);
+	loadautos(mnt);
+	mnt->flag = flg;
+	mnt->root = t;
+	mnt->next = fs->mounts;
+	asetp(&fs->mounts, mnt);
+	poperror();
+
+Out:
+	return mnt;
+}
+
+/*
+ * Drop a reference to mnt; the last reference unlinks it from
+ * fs->mounts and defers the actual free through limbo() so that
+ * concurrent readers can finish with it.
+ */
+void
+clunkmount(Mount *mnt)
+{
+	Mount *me, **p;
+	Bfree *f;
+
+	if(mnt == nil)
+		return;
+	if(adec(&mnt->ref) == 0){
+		for(p = &fs->mounts; (me = *p) != nil; p = &me->next){
+			if(me == mnt)
+				break;
+		}
+		assert(me != nil);
+		f = emalloc(sizeof(Bfree), 0);
+		f->op = DFmnt;
+		f->m = mnt;
+		*p = me->next;
+		limbo(f);
+	}
+}
+
+/*
+ * Drop a reference to de.  Auth dents are never in the hash table
+ * and are freed directly; others are unlinked from fs->dtab when
+ * the last reference goes away.
+ */
+static void
+clunkdent(Dent *de)
+{
+	Dent *e, **pe;
+	u32int h;
+
+	if(de == nil)
+		return;
+	if(de->qid.type & QTAUTH && adec(&de->ref) == 0){
+		free(de);
+		return;
+	}
+	lock(&fs->dtablk);
+	if(adec(&de->ref) != 0)
+		goto Out;
+	h = ihash(de->qid.path) % Ndtab;
+	pe = &fs->dtab[h];
+	for(e = fs->dtab[h]; e != nil; e = e->next){
+		if(e == de)
+			break;
+		pe = &e->next;
+	}
+	assert(e != nil);
+	*pe = e->next;
+	free(de);
+Out:
+	unlock(&fs->dtablk);
+}
+
+/*
+ * Look up fid in the connection's hash table, taking a reference
+ * on it; returns nil if the fid is unknown.
+ */
+static Fid*
+getfid(Conn *c, u32int fid)
+{
+	Fid *f;
+	u32int bkt;
+
+	bkt = ihash(fid) % Nfidtab;
+	lock(&c->fidtablk[bkt]);
+	f = c->fidtab[bkt];
+	while(f != nil && f->fid != fid)
+		f = f->next;
+	if(f != nil)
+		ainc(&f->ref);
+	unlock(&c->fidtablk[bkt]);
+	return f;
+}
+
+/* drop a reference to f, releasing its mount and dent on the last one */
+static void
+putfid(Fid *f)
+{
+	if(adec(&f->ref) == 0){
+		clunkmount(f->mnt);
+		clunkdent(f->dent);
+		free(f);
+	}
+}
+
+/*
+ * Register a copy of f under fid number new on connection c,
+ * taking references for the new fid's mount and dent.  Returns
+ * nil (and frees the copy) when new is already in use or memory
+ * runs out.
+ */
+static Fid*
+dupfid(Conn *c, u32int new, Fid *f)
+{
+	Fid *n, *o;
+	u32int h;
+
+	h = ihash(new) % Nfidtab;
+	if((n = malloc(sizeof(Fid))) == nil)
+		return nil;
+
+	*n = *f;
+	n->fid = new;
+	n->ref = 2; /* one for dup, one for clunk */
+	n->mode = -1;
+	n->next = nil;
+
+	/* only insert if the fid number is free */
+	lock(&c->fidtablk[h]);
+	for(o = c->fidtab[h]; o != nil; o = o->next)
+		if(o->fid == new)
+			break;
+	if(o == nil){
+		n->next = c->fidtab[h];
+		c->fidtab[h] = n;
+	}
+	unlock(&c->fidtablk[h]);
+
+	if(o != nil){
+		fprint(2, "fid in use: %d == %d\n", o->fid, new);
+		free(n);
+		return nil;
+	}
+	if(n->mnt != nil)
+		ainc(&n->mnt->ref);
+	/* NOTE(review): assumes n->dent is always non-nil here -- confirm callers */
+	ainc(&n->dent->ref);
+	setmalloctag(n, getcallerpc(&c));
+	return n;
+}
+
+/*
+ * Unlink fid from the connection's table, dropping the table's
+ * reference (the caller must still hold its own).  When the file
+ * was opened ORCLOSE, the dent is marked gone and an AOrclose
+ * work item is returned through *ao for the admin loop to run.
+ */
+static void
+clunkfid(Conn *c, Fid *fid, Amsg **ao)
+{
+	Fid *f, **pf;
+	u32int h;
+
+	h = ihash(fid->fid) % Nfidtab;
+	lock(&c->fidtablk[h]);
+	pf = &c->fidtab[h];
+	for(f = c->fidtab[h]; f != nil; f = f->next){
+		if(f == fid){
+			/* the table's reference must never be the last one */
+			assert(adec(&f->ref) != 0);
+			*pf = f->next;
+			break;
+		}
+		pf = &f->next;
+	}
+	assert(f != nil);
+	if(f->scan != nil){
+		free(f->scan);
+		f->scan = nil;
+	}
+	if(f->rclose){
+		qlock(&f->dent->trunclk);
+		f->dent->trunc = 1;
+		qunlock(&f->dent->trunclk);
+		wlock(f->dent);
+		f->dent->gone = 1;
+		wunlock(f->dent);
+		/* hand the removal off: references travel with the Amsg */
+		*ao = emalloc(sizeof(Amsg), 1);
+		aincl(&f->dent->ref, 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOrclose;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = f->dent;
+	}
+	unlock(&c->fidtablk[h]);
+}
+
+/*
+ * Read one 9p message from connection c into *pm.  Returns 0 on
+ * success, the readn result (0 or negative) on eof/error reading
+ * the size, and -1 on malformed or oversized messages.
+ */
+static int
+readmsg(Conn *c, Fmsg **pm)
+{
+	char szbuf[4];
+	int sz, n;
+	Fmsg *m;
+
+	n = readn(c->rfd, szbuf, 4);
+	if(n <= 0){
+		*pm = nil;
+		return n;
+	}
+	if(n != 4){
+		werrstr("short read: %r");
+		return -1;
+	}
+	sz = GBIT32(szbuf);
+	/*
+	 * a size smaller than the 4-byte size field itself would make
+	 * the read length below negative; reject such runt messages.
+	 */
+	if(sz < 4){
+		werrstr("runt message");
+		return -1;
+	}
+	if(sz > c->iounit){
+		werrstr("message size too large");
+		return -1;
+	}
+	if((m = malloc(sizeof(Fmsg)+sz)) == nil)
+		return -1;
+	if(readn(c->rfd, m->buf+4, sz-4) != sz-4){
+		werrstr("short read: %r");
+		free(m);
+		return -1;
+	}
+	m->conn = c;
+	m->sz = sz;
+	PBIT32(m->buf, sz);
+	*pm = m;
+	return 0;
+}
+
+/*
+ * Negotiate the protocol version and message size for this
+ * connection; anything other than 9P2000 is answered "unknown".
+ */
+static void
+fsversion(Fmsg *m)
+{
+	char *dot;
+	Fcall r;
+
+	memset(&r, 0, sizeof(Fcall));
+	/* strip any ".variant" suffix before comparing */
+	if((dot = strchr(m->version, '.')) != nil)
+		*dot = '\0';
+	r.type = Rversion;
+	r.msize = Max9p + IOHDRSZ;
+	if(strcmp(m->version, "9P2000") != 0){
+		r.version = "unknown";
+		m->conn->versioned = 0;
+	}else{
+		if(m->msize < r.msize)
+			r.msize = m->msize;
+		r.version = "9P2000";
+		m->conn->versioned = 1;
+		m->conn->iounit = r.msize;
+	}
+	respond(m, &r);
+}
+
+/* close and free an auth rpc; a nil rpc is ignored */
+void
+authfree(AuthRpc *auth)
+{
+	if(auth == nil)
+		return;
+	close(auth->afd);
+	auth_freerpc(auth);
+}
+
+/*
+ * Start a p9any server-side auth conversation with factotum,
+ * mounting it from /srv when it isn't already present.  Returns
+ * nil when factotum is unreachable or refuses the keyspec.
+ */
+AuthRpc*
+authnew(void)
+{
+	static char *keyspec = "proto=p9any role=server";
+	AuthRpc *rpc;
+	int fd;
+
+	if(access("/mnt/factotum", 0) < 0)
+		if((fd = open("/srv/factotum", ORDWR)) >= 0)
+			mount(fd, -1, "/mnt", MBEFORE, "");
+	if((fd = open("/mnt/factotum/rpc", ORDWR)) < 0)
+		return nil;
+	if((rpc = auth_allocrpc(fd)) == nil){
+		close(fd);
+		return nil;
+	}
+	if(auth_rpc(rpc, "start", keyspec, strlen(keyspec)) != ARok){
+		authfree(rpc);
+		return nil;
+	}
+	return rpc;
+}
+
+/*
+ * Service a read of up to count bytes on auth fid f: pull pending
+ * protocol output from factotum into data, or, when the
+ * conversation has finished, resolve the authenticated user and
+ * record its id on the fid.  Raises on protocol or user errors.
+ */
+static void
+authread(Fid *f, Fcall *r, void *data, vlong count)
+{
+	AuthInfo *ai;
+	AuthRpc *rpc;
+	User *u;
+
+	if((rpc = f->auth) == nil)
+		error(Etype);
+
+	switch(auth_rpc(rpc, "read", nil, 0)){
+	default:
+		error(Eauthp);
+	case ARdone:
+		/* conversation complete: map the authenticated id to a user */
+		if((ai = auth_getinfo(rpc)) == nil)
+			goto Phase;
+		rlock(&fs->userlk);
+		u = name2user(ai->cuid);
+		auth_freeAI(ai);
+		if(u == nil){
+			runlock(&fs->userlk);
+			error(Enouser);
+		}
+		f->uid = u->id;
+		runlock(&fs->userlk);
+		return;
+	case ARok:
+		/* factotum has narg bytes of protocol data for the client */
+		if(count < rpc->narg)
+			error(Eauthd);
+		memmove(data, rpc->arg, rpc->narg);
+		r->count = rpc->narg;
+		return;
+	case ARphase:
+	Phase:
+		error(Eauthph);
+	}
+}
+
+/* feed count bytes of client auth protocol data to factotum */
+static void
+authwrite(Fid *f, Fcall *r, void *data, vlong count)
+{
+	AuthRpc *rpc;
+
+	rpc = f->auth;
+	if(rpc == nil)
+		error(Etype);
+	if(auth_rpc(rpc, "write", data, count) != ARok)
+		error(Ebotch);
+	r->type = Rwrite;
+	r->count = count;
+}
+
+/*
+ * Handle Tauth: set up an auth fid backed by a factotum rpc so
+ * the client can run the auth protocol before attaching.
+ */
+static void
+fsauth(Fmsg *m)
+{
+	Dent *de;
+	Fcall r;
+	Fid f;
+
+	if(fs->noauth){
+		rerror(m, Eauth);
+		return;
+	}
+	if(strcmp(m->uname, "none") == 0){
+		rerror(m, Enone);
+		return;
+	}
+	/* mallocz zeroes the Dent; the old explicit memset was redundant */
+	if((de = mallocz(sizeof(Dent), 1)) == nil){
+		rerror(m, Enomem);
+		return;
+	}
+	de->ref = 0;
+	de->qid.type = QTAUTH;
+	de->qid.path = aincv(&fs->nextqid, 1);
+	de->qid.vers = 0;
+	de->length = 0;
+	de->k = nil;
+	de->nk = 0;
+
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = nil;
+	f.qpath = de->qid.path;
+	f.pqpath = de->qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.uid = -1;
+	f.duid = -1;
+	f.dgid = -1;
+	f.dmode = 0600;
+	f.auth = authnew();
+	if(dupfid(m->conn, m->afid, &f) == nil){
+		rerror(m, Efid);
+		/* previously leaked: release the factotum rpc with the dent */
+		authfree(f.auth);
+		free(de);
+		return;
+	}
+	r.type = Rauth;
+	r.aqid = de->qid;
+	respond(m, &r);
+}
+
+/* is uid the owner of, or a member of, group gid? */
+static int
+ingroup(int uid, int gid)
+{
+	User *u, *g;
+	int i, in;
+
+	in = 0;
+	rlock(&fs->userlk);
+	u = uid2user(uid);
+	g = uid2user(gid);
+	if(u != nil && g != nil){
+		if(u->id == g->id){
+			in = 1;
+		}else{
+			for(i = 0; i < g->nmemb; i++){
+				if(u->id == g->memb[i])
+					in = 1;
+			}
+		}
+	}
+	runlock(&fs->userlk);
+	return in;
+}
+
+/*
+ * Is uid the leader of group gid?  A group with no designated
+ * leader is led by every one of its members.
+ */
+static int
+groupleader(int uid, int gid)
+{
+	User *g;
+	int i, lead;
+
+	lead = 0;
+	rlock(&fs->userlk);
+	g = uid2user(gid);
+	if(g != nil){
+		if(g->lead != 0){
+			if(uid == g->lead)
+				lead = 1;
+		}else{
+			for(i = 0; i < g->nmemb; i++){
+				if(g->memb[i] == uid){
+					lead = 1;
+					break;
+				}
+			}
+		}
+	}
+	runlock(&fs->userlk);
+	return lead;
+}
+
+/* translate a 9p open mode into the DM* permission bits it requires */
+static int
+mode2bits(int req)
+{
+	int bits;
+
+	switch(req & 0xf){
+	case OREAD:	bits = DMREAD;		break;
+	case OWRITE:	bits = DMWRITE;		break;
+	case ORDWR:	bits = DMREAD|DMWRITE;	break;
+	case OEXEC:	bits = DMREAD|DMEXEC;	break;
+	default:	bits = 0;		break;
+	}
+	/* truncation implies writing */
+	if(req & OTRUNC)
+		bits |= DMWRITE;
+	return bits;
+}
+
+/*
+ * Check whether f's user may perform access m (DM* bits) on a
+ * file with mode fmode owned by fuid/fgid.  Returns 0 to grant,
+ * -1 to deny.  Permitted (%-attached admin) fids bypass checks.
+ */
+static int
+fsaccess(Fid *f, ulong fmode, int fuid, int fgid, int m)
+{
+	/* uid none gets only other permissions */
+	if(f->permit)
+		return 0;
+	if(f->uid != noneid) {
+		/* owner bits */
+		if(f->uid == fuid)
+			if((m & (fmode>>6)) == m)
+				return 0;
+		/* group bits */
+		if(ingroup(f->uid, fgid))
+			if((m & (fmode>>3)) == m)
+				return 0;
+	}
+	/* other bits */
+	if(m & fmode) {
+		/* walking a directory only needs the execute bit */
+		if((fmode & DMDIR) && (m == DMEXEC))
+			return 0;
+		/* members of the "nogroup" group are denied other-access */
+		if(!ingroup(f->uid, nogroupid))
+			return 0;
+	}
+	return -1;
+}
+
+/*
+ * Handle Tattach: bind m->fid to the root of the requested
+ * snapshot (aname; "" means "main", "dump" is the snapshot
+ * directory, a leading '%' requests admin permit mode), after
+ * validating the user and any prior authentication on m->afid.
+ */
+static void
+fsattach(Fmsg *m)
+{
+	char dbuf[Kvmax], kvbuf[Kvmax];
+	char *p, *n, *aname;
+	int uid, auid;
+	Mount *mnt;
+	Dent *de;
+	Tree *t;
+	User *u;
+	Fcall r;
+	Xdir d;
+	Kvp kv;
+	Key dk;
+	Fid f, *af;
+
+	de = nil;
+	mnt = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		goto Err;
+	}
+	aname = m->aname;
+	if(aname[0] == '%')
+		aname++;
+	if(aname[0] == '\0')
+		aname = "main";
+	if((mnt = getmount(aname)) == nil)
+		error(Enosnap);
+
+	rlock(&fs->userlk);
+	n = m->uname;
+	/*
+	 * to allow people to add themselves to the user file,
+	 * we need to force the user id to one that exists.
+	 */
+	if(permissive && strcmp(aname, "adm") == 0)
+		n = "adm";
+	if((u = name2user(n)) == nil){
+		runlock(&fs->userlk);
+		error(Enouser);
+	}
+	uid = u->id;
+	runlock(&fs->userlk);
+
+	if(m->afid != NOFID){
+		r.data = nil;
+		r.count = 0;
+		if((af = getfid(m->conn, m->afid)) == nil)
+			error(Enofid);
+		authread(af, &r, nil, 0);
+		/*
+		 * save the authenticated uid before putfid: dropping our
+		 * reference may free af, so the old read of af->uid after
+		 * putfid was a use-after-free.
+		 */
+		auid = af->uid;
+		putfid(af);
+		if(auid != uid)
+			error(Ebadu);
+	}else if(!fs->noauth && strcmp(m->uname, "none") != 0)
+		error(Ebadu);
+
+	if(strcmp(m->aname, "dump") == 0){
+		memset(&d, 0, sizeof(d));
+		filldumpdir(&d);
+	}else{
+		/* look up the tree's root directory entry */
+		if((p = packdkey(dbuf, sizeof(dbuf), -1ULL, "")) == nil)
+			error(Elength);
+		dk.k = dbuf;
+		dk.nk = p - dbuf;
+		t = agetp(&mnt->root);
+		if(!btlookup(t, &dk, &kv, kvbuf, sizeof(kvbuf)))
+			error(Enosnap);
+		kv2dir(&kv, &d);
+	}
+	de = getdent(-1, &d);
+	memset(&f, 0, sizeof(Fid));
+	f.fid = NOFID;
+	f.mnt = mnt;
+	f.qpath = d.qid.path;
+	f.pqpath = d.qid.path;
+	f.mode = -1;
+	f.iounit = m->conn->iounit;
+	f.dent = de;
+	f.uid = uid;
+	f.duid = d.uid;
+	f.dgid = d.gid;
+	f.dmode = d.mode;
+	if(m->aname[0] == '%'){
+		if(!permissive && !ingroup(uid, admid))
+			error(Eperm);
+		f.permit = 1;
+	}
+	if(dupfid(m->conn, m->fid, &f) == nil)
+		error(Efid);
+
+	r.type = Rattach;
+	r.qid = d.qid;
+	respond(m, &r);
+	poperror();
+
+	/* drop the local references; the dup'd fid holds its own */
+Err:
+	clunkdent(de);
+	clunkmount(mnt);
+}
+
+/*
+ * Resolve the parent of f's directory through its super key,
+ * returning the parent qid path and this entry's name within it;
+ * returns 1 on success, 0 when no super entry exists.
+ */
+static int
+findparent(Tree *t, Fid *f, vlong *qpath, char **name, char *buf, int nbuf)
+{
+	char kbuf[Keymax], *e;
+	Kvp kv;
+	Key k;
+
+	e = packsuper(kbuf, sizeof(kbuf), f->pqpath);
+	k.k = kbuf;
+	k.nk = e - kbuf;
+	if(!btlookup(t, &k, &kv, buf, nbuf))
+		return 0;
+	*name = unpackdkey(kv.v, kv.nv, qpath);
+	return 1;
+}
+
+/*
+ * Handle Twalk: walk m->nwname path elements from m->fid,
+ * producing m->newfid.  Walking out of the dump directory
+ * switches mounts; ".." at a mount root returns to the dump
+ * directory.  Partial walks return the qids matched so far and
+ * leave the fid untouched; the fid is only cloned/updated on a
+ * full match.
+ */
+static void
+fswalk(Fmsg *m)
+{
+	char *p, *name, kbuf[Maxent], kvbuf[Kvmax];
+	int duid, dgid, dmode;
+	vlong up, prev;
+	Fid *o, *f;
+	Dent *dent;
+	Mount *mnt;
+	Tree *t;
+	Fcall r;
+	Xdir d;
+	Kvp kv;
+	Key k;
+	int i;
+
+	if((o = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(o);
+		return;
+	}
+	/* can't walk an open fid */
+	if(o->mode != -1)
+		error(Einuse);
+	t = o->mnt->root;
+	mnt = o->mnt;
+	up = o->qpath;
+	prev = o->qpath;
+	rlock(o->dent);
+	d = *o->dent;
+	runlock(o->dent);
+	duid = d.uid;
+	dgid = d.gid;
+	dmode = d.mode;
+	r.type = Rwalk;
+	for(i = 0; i < m->nwname; i++){
+		/* need execute permission on each directory traversed */
+		if(fsaccess(o, d.mode, d.uid, d.gid, DMEXEC) != 0)
+			error(Eperm);
+		name = m->wname[i];
+		if(d.qid.path == Qdump){
+			/* leaving the dump dir: enter the named snapshot's mount */
+			if((mnt = getmount(m->wname[i])) == nil)
+				error(Esrch);
+			if(waserror()){
+				clunkmount(mnt);
+				nexterror();
+			}
+			t = mnt->root;
+			p = packdkey(kbuf, sizeof(kbuf), -1ULL, "");
+			poperror();
+		}else{
+			if(strcmp(m->wname[i], "..") == 0){
+				if(o->pqpath == Qdump){
+					/* ".." from a mount root goes back to the dump dir */
+					mnt = fs->snapmnt;
+					filldumpdir(&d);
+					duid = d.uid;
+					dgid = d.gid;
+					dmode = d.mode;
+					goto Found;
+				}
+				if(!findparent(t, o, &prev, &name, kbuf, sizeof(kbuf)))
+					error(Esrch);
+			}
+			p = packdkey(kbuf, sizeof(kbuf), prev, name);
+		}
+		/* remember the parent dir's ownership for the final fid */
+		duid = d.uid;
+		dgid = d.gid;
+		dmode = d.mode;
+		k.k = kbuf;
+		k.nk = p - kbuf;
+		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf)))
+			break;
+		kv2dir(&kv, &d);
+Found:
+		up = prev;
+		prev = d.qid.path;
+		r.wqid[i] = d.qid;
+	}
+	r.nwqid = i;
+	if(i == 0 && m->nwname != 0)
+		error(Esrch);
+	f = o;
+	if(m->fid != m->newfid && i == m->nwname){
+		if((f = dupfid(m->conn, m->newfid, o)) == nil)
+			error(Efid);
+		putfid(o);
+	}
+	if(i > 0 && i == m->nwname){
+		/* full walk: point the fid at the final entry */
+		lock(f);
+		if(waserror()){
+			if(f != o)
+				clunkfid(m->conn, f, nil);
+			unlock(f);
+			nexterror();
+		}
+		if(up == Qdump)
+			dent = getdent(-1ULL, &d);
+		else
+			dent = getdent(up, &d);
+		if(mnt != f->mnt){
+			clunkmount(f->mnt);
+			ainc(&mnt->ref);
+			f->mnt = mnt;
+		}
+		clunkdent(f->dent);
+		f->qpath = r.wqid[i-1].path;
+		f->pqpath = up;
+		f->dent = dent;
+		f->duid = duid;
+		f->dgid = dgid;
+		f->dmode = dmode;
+		poperror();
+		unlock(f);
+	}
+	respond(m, &r);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Tstat: marshal the fid's cached directory entry into a stat
+ * buffer and respond with it.
+ */
+static void
+fsstat(Fmsg *m)
+{
+	char buf[STATMAX];
+	Fcall r;
+	Fid *f;
+	int n;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(f);
+		return;
+	}
+	rlock(f->dent);
+	n = dir2statbuf(f->dent, buf, sizeof(buf));
+	runlock(f->dent);
+	/*
+	 * release the lock before raising: error() longjmps to the
+	 * waserror() handler above, which does not unlock the dent,
+	 * so raising while holding the rlock would leak it.
+	 */
+	if(n == -1)
+		error(Efs);
+	r.type = Rstat;
+	r.stat = (uchar*)buf;
+	r.nstat = n;
+	respond(m, &r);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Twstat: validate each requested field change, build an Owstat
+ * message (or, for renames, a clobber+insert pair, plus a super-key
+ * update for directories), check permissions, and apply the change
+ * to the tree.  A shrinking length change queues an AOclear admin
+ * message (*ao) so the freed blocks are cleared in the background.
+ */
+static void
+fswstat(Fmsg *m, int id, Amsg **ao)
+{
+	char rnbuf[Kvmax], opbuf[Kvmax], upbuf[Upksz];
+	char *p, strs[65535];
+	int op, nm, rename;
+	vlong oldlen;
+	Qid old;
+	Fcall r;
+	Dent *de;
+	Msg mb[3];
+	Xdir n;
+	Dir d;
+	Tree *t;
+	Fid *f;
+	Key k;
+	User *u;
+
+	*ao = nil;
+	rename = 0;
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	de = f->dent;
+	truncwait(de, id);
+	wlock(de);
+	if(waserror()){
+		rerror(m, errmsg());
+		free(*ao);
+		*ao = nil;
+		goto Err;
+	}
+	if(de->gone)
+		error(Ephase);
+	/* auth files and synthetic dump entries are immutable */
+	if((de->qid.type & QTAUTH) || (de->qid.path & Qdump))
+		error(Emode);
+	if(convM2D(m->stat, m->nstat, &d, strs) <= BIT16SZ)
+		error(Edir);
+
+	t = agetp(&f->mnt->root);
+	n = de->Xdir;
+	n.qid.vers++;
+	p = opbuf+1;
+	op = 0;
+
+	/* check validity of updated fields and construct Owstat message */
+	if(d.qid.path != ~0 || d.qid.vers != ~0){
+		if(d.qid.path != de->qid.path)
+			error(Ewstatp);
+		if(d.qid.vers != de->qid.vers)
+			error(Ewstatv);
+	}
+	if(*d.name != '\0'){
+		if(strcmp(d.name, de->name) != 0){
+			rename = 1;
+			if(okname(d.name) == -1)
+				error(Ename);
+			if(walk1(t, f->dent->up, d.name, &old, &oldlen) == 0)
+				error(Eexist);
+			n.name = d.name;
+		}
+	}
+	if(d.length != ~0){
+		if(d.length < 0)
+			error(Ewstatl);
+		if(d.length != de->length){
+			if(d.length < de->length){
+				/* shrinking: clear the dropped range in the background */
+				if((*ao = malloc(sizeof(Amsg))) == nil)
+					error(Enomem);
+				qlock(&de->trunclk);
+				de->trunc = 1;
+				qunlock(&de->trunclk);
+				aincl(&de->ref, 1);
+				aincl(&f->mnt->ref, 1);
+				(*ao)->op = AOclear;
+				(*ao)->mnt = f->mnt;
+				(*ao)->qpath = f->qpath;
+				(*ao)->off = d.length;
+				(*ao)->end = f->dent->length;
+				(*ao)->dent = de;
+			}
+			de->length = d.length;
+			n.length = d.length;
+			op |= Owsize;
+			PACK64(p, n.length);
+			p += 8;
+		}
+	}
+	if(d.mode != ~0){
+		if((d.mode^de->mode) & DMDIR)
+			error(Ewstatd);
+		if(d.mode & ~(DMDIR|DMAPPEND|DMEXCL|DMTMP|0777))
+			error(Ewstatb);
+		if(d.mode != de->mode){
+			n.mode = d.mode;
+			n.qid.type = d.mode>>24;
+			op |= Owmode;
+			PACK32(p, n.mode);
+			p += 4;
+		}
+	}
+	if(d.mtime != ~0){
+		n.mtime = d.mtime*Nsec;
+		if(n.mtime != de->mtime){
+			op |= Owmtime;
+			PACK64(p, n.mtime);
+			p += 8;
+		}
+	}
+	if(*d.uid != '\0'){
+		rlock(&fs->userlk);
+		u = name2user(d.uid);
+		if(u == nil){
+			runlock(&fs->userlk);
+			error(Enouser);
+		}
+		n.uid = u->id;
+		runlock(&fs->userlk);
+		if(n.uid != de->uid){
+			op |= Owuid;
+			PACK32(p, n.uid);
+			p += 4;
+		}
+	}
+	if(*d.gid != '\0'){
+		rlock(&fs->userlk);
+		u = name2user(d.gid);
+		if(u == nil){
+			runlock(&fs->userlk);
+			error(Enogrp);
+		}
+		n.gid = u->id;
+		runlock(&fs->userlk);
+		if(n.gid != de->gid){
+			op |= Owgid;
+			PACK32(p, n.gid);
+			p += 4;
+		}
+	}
+	/* the modifying user is always recorded */
+	op |= Owmuid;
+	n.muid = f->uid;
+	PACK32(p, n.muid);
+	p += 4;
+
+	/* check permissions */
+	if(rename)
+		if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+			error(Eperm);
+	if(op & Owsize)
+		if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1)
+			error(Eperm);
+	if(op & (Owmode|Owmtime))
+		if(!f->permit && f->uid != de->uid && !groupleader(f->uid, de->gid))
+			error(Ewstato);
+	if(op & Owuid)
+		if(!f->permit)
+			error(Ewstatu);
+	if(op & Owgid)
+		if(!f->permit
+		&& !(f->uid == de->uid && ingroup(f->uid, n.gid))
+		&& !(groupleader(f->uid, de->gid) && groupleader(f->uid, n.gid)))
+			error(Ewstatg);
+
+	/* update directory entry */
+	nm = 0;
+	if(rename && !de->gone){
+		mb[nm].op = Oclobber;
+		mb[nm].Key = de->Key;
+		mb[nm].v = nil;
+		mb[nm].nv = 0;
+		nm++;
+
+		mb[nm].op = Oinsert;
+		dir2kv(f->pqpath, &n, &mb[nm], rnbuf, sizeof(rnbuf));
+		k = mb[nm].Key;
+		nm++;
+
+		if(de->qid.type & QTDIR){
+			/* keep the child's parent pointer in sync with the new name */
+			packsuper(upbuf, sizeof(upbuf), f->qpath);
+			mb[nm].op = Oinsert;
+			mb[nm].k = upbuf;
+			mb[nm].nk = Upksz;
+			mb[nm].v = mb[nm-1].k;
+			mb[nm].nv = mb[nm-1].nk;
+			nm++;
+		}
+	}else{
+		opbuf[0] = op;
+		mb[nm].op = Owstat;
+		mb[nm].Key = de->Key;
+		mb[nm].v = opbuf;
+		mb[nm].nv = p - opbuf;
+		nm++;
+	}
+	assert(nm <= nelem(mb));
+	upsert(f->mnt, mb, nm);
+
+	de->Xdir = n;
+	if(rename)
+		cpkey(de, &k, de->buf, sizeof(de->buf));
+
+	r.type = Rwstat;
+	respond(m, &r);
+	poperror();
+
+Err: wunlock(de);
+	putfid(f);
+}
+
+
+/*
+ * Tclunk: drop the fid's reference.  clunkfid may hand back an
+ * AOrclose admin message (*ao) when the fid was opened ORCLOSE,
+ * so the file is removed in the background.
+ */
+static void
+fsclunk(Fmsg *m, Amsg **ao)
+{
+	Fcall r;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	lock(f);
+	clunkfid(m->conn, f, ao);
+	unlock(f);
+	r.type = Rclunk;
+	respond(m, &r);
+	putfid(f);
+}
+
+/*
+ * Tcreate: create a new file or directory under the fid's current
+ * directory, inserting its entry (and, for directories, its parent
+ * pointer under the super key), then open the fid onto it.
+ * Permission bits are masked by the parent directory's mode, per
+ * the 9p create rules.
+ */
+static void
+fscreate(Fmsg *m)
+{
+	char *p, buf[Kvmax], upkbuf[Keymax], upvbuf[Inlmax];
+	Dent *de;
+	vlong oldlen;
+	Qid old;
+	Fcall r;
+	Msg mb[2];
+	Fid *f;
+	Xdir d;
+	int nm;
+
+	if(okname(m->name) == -1){
+		rerror(m, Ename);
+		return;
+	}
+	if(m->perm & (DMMOUNT|DMAUTH)){
+		rerror(m, Ebotch);
+		return;
+	}
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	lock(f);
+
+	if(waserror()){
+		rerror(m, errmsg());
+		goto Err;
+
+	}
+	if(f->mode != -1){
+		rerror(m, Einuse);
+		goto Out;
+	}
+	de = f->dent;
+	if(walk1(f->mnt->root, f->qpath, m->name, &old, &oldlen) == 0){
+		rerror(m, Eexist);
+		goto Out;
+	}
+
+	rlock(de);
+	if(fsaccess(f, de->mode, de->uid, de->gid, DMWRITE) == -1){
+		rerror(m, Eperm);
+		runlock(de);
+		goto Out;
+	}
+
+	d.gid = de->gid;
+	runlock(de);
+
+	nm = 0;
+	d.qid.type = 0;
+	if(m->perm & DMDIR)
+		d.qid.type |= QTDIR;
+	if(m->perm & DMAPPEND)
+		d.qid.type |= QTAPPEND;
+	if(m->perm & DMEXCL)
+		d.qid.type |= QTEXCL;
+	if(m->perm & DMTMP)
+		d.qid.type |= QTTMP;
+	d.qid.path = aincv(&fs->nextqid, 1);
+	d.qid.vers = 0;
+	d.mode = m->perm;
+	/* mask requested permissions by the parent directory's mode */
+	if(m->perm & DMDIR)
+		d.mode &= ~0777 | de->mode & 0777;
+	else
+		d.mode &= ~0666 | de->mode & 0666;
+	d.name = m->name;
+	d.atime = nsec();
+	d.mtime = d.atime;
+	d.length = 0;
+	d.uid = f->uid;
+	d.muid = f->uid;
+
+	mb[nm].op = Oinsert;
+	dir2kv(f->qpath, &d, &mb[nm], buf, sizeof(buf));
+	nm++;
+
+	if(m->perm & DMDIR){
+		/* directories also get a parent pointer under the super key */
+		mb[nm].op = Oinsert;
+		if((p = packsuper(upkbuf, sizeof(upkbuf), d.qid.path)) == nil)
+			sysfatal("ream: pack super");
+		mb[nm].k = upkbuf;
+		mb[nm].nk = p - upkbuf;
+		if((p = packdkey(upvbuf, sizeof(upvbuf), f->qpath, d.name)) == nil)
+			sysfatal("ream: pack super");
+		mb[nm].v = upvbuf;
+		mb[nm].nv = p - upvbuf;
+		nm++;
+	}
+	upsert(f->mnt, mb, nm);
+
+	de = getdent(f->qpath, &d);
+	clunkdent(f->dent);
+	f->mode = mode2bits(m->mode);
+	f->pqpath = f->qpath;
+	f->qpath = d.qid.path;
+	f->dent = de;
+	if(m->mode & ORCLOSE)
+		f->rclose = 1;
+
+	r.type = Rcreate;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+	respond(m, &r);
+Out: poperror();
+Err: unlock(f);
+	putfid(f);
+	return;
+}
+
+/*
+ * Check whether the fid's file may be deleted: plain files always
+ * may; a directory may only be deleted when it is empty.  Returns
+ * nil if deletion is allowed, otherwise an error string.
+ */
+static char*
+candelete(Fid *f)
+{
+	char *err, pfx[Dpfxsz];
+	Tree *t;
+	Scan s;
+
+	if(!(f->dent->qid.type & QTDIR))
+		return nil;
+
+	/* scan for any entry under this directory's key prefix */
+	t = agetp(&f->mnt->root);
+	packdkey(pfx, sizeof(pfx), f->qpath, nil);
+	btnewscan(&s, pfx, sizeof(pfx));
+	btenter(t, &s);
+	err = btnext(&s, &s.kv) ? Enempty : nil;
+	btexit(&s);
+	return err;
+}
+
+/*
+ * Tremove: clunk the fid and delete its file.  Directories must be
+ * empty; plain files queue an AOclear admin message (*ao) so their
+ * data blocks are cleared in the background.
+ */
+static void
+fsremove(Fmsg *m, int id, Amsg **ao)
+{
+	char *e, buf[Kvmax];
+	Fcall r;
+	Msg mb[2];
+	Tree *t;
+	Kvp kv;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	/* remove always clunks, even if the deletion below fails */
+	t = f->mnt->root;
+	clunkfid(m->conn, f, nil);
+
+	truncwait(f->dent, id);
+	wlock(f->dent);
+	*ao = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		free(*ao);
+		*ao = nil;
+		goto Err;
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	/*
+	 * we need a double check that the file is in the tree
+	 * here, because the walk to the fid is done in a reader
+	 * proc that can look it up in a stale version of the
+	 * tree, while we clunk the dent in the mutator proc.
+	 *
+	 * this means we can theoretically get some deletions
+	 * of files that are already gone.
+	 */
+	if(!btlookup(t, &f->dent->Key, &kv, buf, sizeof(buf)))
+		error(Ephase);
+	if((e = candelete(f)) != nil)
+		error(e);
+	if(fsaccess(f, f->dmode, f->duid, f->dgid, DMWRITE) == -1)
+		error(Eperm);
+	mb[0].op = Odelete;
+	mb[0].k = f->dent->k;
+	mb[0].nk = f->dent->nk;
+	mb[0].nv = 0;
+
+	if(f->dent->qid.type & QTDIR){
+		/* also remove the directory's parent pointer */
+		packsuper(buf, sizeof(buf), f->qpath);
+		mb[1].op = Oclobber;
+		mb[1].k = buf;
+		mb[1].nk = Upksz;
+		mb[1].nv = 0;
+		upsert(f->mnt, mb, 2);
+	}else{
+		*ao = emalloc(sizeof(Amsg), 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOclear;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = nil;
+		upsert(f->mnt, mb, 1);
+	}
+	f->dent->gone = 1;
+	r.type = Rremove;
+	respond(m, &r);
+	poperror();
+Err:
+	wunlock(f->dent);
+	putfid(f);
+	return;
+}
+
+/*
+ * Topen: check permissions and open the fid.  Enforces QTEXCL
+ * single-opener semantics, ORCLOSE deletability, and handles
+ * OTRUNC by zeroing the length in the tree and queueing an
+ * AOclear admin message (*ao) to clear the old data blocks.
+ * ao may be nil only on the read path, which never sees
+ * OTRUNC/ORCLOSE opens (see runfs routing).
+ */
+static void
+fsopen(Fmsg *m, int id, Amsg **ao)
+{
+	char *p, *e, buf[Kvmax];
+	int mbits;
+	Tree *t;
+	Fcall r;
+	Xdir d;
+	Fid *f;
+	Kvp kv;
+	Msg mb;
+
+	mbits = mode2bits(m->mode);
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(waserror()){
+		rerror(m, errmsg());
+		putfid(f);
+		return;
+	}
+	if(m->mode & OTRUNC)
+		truncwait(f->dent, id);
+	t = agetp(&f->mnt->root);
+	if((f->qpath & Qdump) != 0){
+		filldumpdir(&d);
+	}else{
+		if(!btlookup(t, f->dent, &kv, buf, sizeof(buf)))
+			error(Esrch);
+		kv2dir(&kv, &d);
+	}
+	wlock(f->dent);
+	if(waserror()){
+		wunlock(f->dent);
+		nexterror();
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	if(f->dent->qid.type & QTEXCL)
+		if(f->dent->ref != 1)
+			error(Elocked);
+	if(m->mode & ORCLOSE)
+		if((e = candelete(f)) != nil)
+			error(e);
+	if(fsaccess(f, d.mode, d.uid, d.gid, mbits) == -1)
+		error(Eperm);
+	f->dent->length = d.length;
+	poperror();
+	wunlock(f->dent);
+	r.type = Ropen;
+	r.qid = d.qid;
+	r.iounit = f->iounit;
+
+	lock(f);
+	if(f->mode != -1){
+		unlock(f);
+		error(Einuse);
+	}
+	/* truncate-on-open: append-only files ignore OTRUNC */
+	if((m->mode & OTRUNC) && !(f->dent->mode & DMAPPEND)){
+		wlock(f->dent);
+
+		if(waserror()){
+			wunlock(f->dent);
+			free(*ao);
+			*ao = nil;
+			nexterror();
+		}
+		*ao = emalloc(sizeof(Amsg), 1);
+		qlock(&f->dent->trunclk);
+		f->dent->trunc = 1;
+		qunlock(&f->dent->trunclk);
+		aincl(&f->dent->ref, 1);
+		aincl(&f->mnt->ref, 1);
+		(*ao)->op = AOclear;
+		(*ao)->mnt = f->mnt;
+		(*ao)->qpath = f->qpath;
+		(*ao)->off = 0;
+		(*ao)->end = f->dent->length;
+		(*ao)->dent = f->dent;
+
+		f->dent->muid = f->uid;
+		f->dent->qid.vers++;
+		f->dent->length = 0;
+
+		mb.op = Owstat;
+		p = buf;
+		p[0] = Owsize|Owmuid; p += 1;
+		PACK64(p, 0); p += 8;
+		PACK32(p, f->uid); p += 4;
+		mb.k = f->dent->k;
+		mb.nk = f->dent->nk;
+		mb.v = buf;
+		mb.nv = p - buf;
+
+		upsert(f->mnt, &mb, 1);
+		wunlock(f->dent);
+		poperror();
+	}
+	f->mode = mode2bits(m->mode);
+	if(m->mode & ORCLOSE)
+		f->rclose = 1;
+	unlock(f);
+	poperror();
+	respond(m, &r);
+	putfid(f);
+}
+
+/*
+ * Read of the synthetic dump directory: scan the snapshot label
+ * keys and return one stat entry per snapshot.  The scan state is
+ * cached on the fid across reads; a read at offset 0 restarts it.
+ * An entry that did not fit in the previous reply is carried over
+ * via s->overflow.
+ */
+static void
+readsnap(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[1], *p;
+	int n, ns;
+	Scan *s;
+	Xdir d;
+
+	s = f->scan;
+	/* directory reads must be sequential */
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		s = emalloc(sizeof(Scan), 1);
+		pfx[0] = Klabel;
+		btnewscan(s, pfx, 1);
+		lock(f);
+		if(f->scan != nil){
+			free(f->scan);
+		}
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	p = r->data;
+	n = m->count;
+	d = f->dent->Xdir;
+	if(s->overflow){
+		/* emit the entry held over from the previous read */
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		if((ns = dir2statbuf(&d, p, n)) == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		p += ns;
+		n -= ns;
+	}
+	btenter(&fs->snap, s);
+	while(1){
+		if(!btnext(s, &s->kv))
+			break;
+		memcpy(d.name, s->kv.k+1, s->kv.nk-1);
+		d.name[s->kv.nk-1] = 0;
+		d.qid.path = UNPACK64(s->kv.v + 1);
+		if((ns = dir2statbuf(&d, p, n)) == -1){
+			s->overflow = 1;
+			break;
+		}
+		p += ns;
+		n -= ns;
+	}
+	btexit(s);
+	r->count = p - r->data;
+	return;
+}
+
+/*
+ * Read of a regular directory: scan the tree under this
+ * directory's key prefix and marshal one stat entry per child.
+ * Scan state is cached on the fid; a read at offset 0 restarts it,
+ * and an entry that did not fit is carried over via s->overflow.
+ */
+static void
+readdir(Fmsg *m, Fid *f, Fcall *r)
+{
+	char pfx[Dpfxsz], *p;
+	int n, ns;
+	Tree *t;
+	Scan *s;
+
+	s = f->scan;
+	t = agetp(&f->mnt->root);
+	/* directory reads must be sequential */
+	if(s != nil && s->offset != 0 && s->offset != m->offset)
+		error(Edscan);
+	if(s == nil || m->offset == 0){
+		s = emalloc(sizeof(Scan), 1);
+		packdkey(pfx, sizeof(pfx), f->qpath, nil);
+		btnewscan(s, pfx, sizeof(pfx));
+		lock(f);
+		if(f->scan != nil)
+			free(f->scan);
+		f->scan = s;
+		unlock(f);
+	}
+	if(s->donescan){
+		r->count = 0;
+		return;
+	}
+	p = r->data;
+	n = m->count;
+	if(s->overflow){
+		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
+			r->count = 0;
+			return;
+		}
+		s->overflow = 0;
+		p += ns;
+		n -= ns;
+	}
+	btenter(t, s);
+	while(1){
+		if(!btnext(s, &s->kv))
+			break;
+		if((ns = kv2statbuf(&s->kv, p, n)) == -1){
+			s->overflow = 1;
+			break;
+		}
+		p += ns;
+		n -= ns;
+	}
+	btexit(s);
+	r->count = p - r->data;
+}
+
+/*
+ * Read of a regular file: clamp the request to the file's length
+ * and read block by block, accumulating into r->count (which the
+ * caller has zeroed).  The dent is read-locked for the duration so
+ * the length stays consistent.
+ */
+static void
+readfile(Fmsg *m, Fid *f, Fcall *r)
+{
+	vlong n, c, o;
+	char *p;
+	Dent *e;
+	Tree *t;
+
+	e = f->dent;
+	rlock(e);
+	if(m->offset > e->length){
+		runlock(e);
+		return;
+	}
+	p = r->data;
+	c = m->count;
+	o = m->offset;
+	t = agetp(&f->mnt->root);
+	if(m->offset + m->count > e->length)
+		c = e->length - m->offset;
+	while(c != 0){
+		n = readb(t, f, p, o, c, e->length);
+		r->count += n;
+		if(n == 0)
+			break;
+		p += n;
+		o += n;
+		c -= n;
+	}
+	runlock(e);
+}
+
+/*
+ * Tread: dispatch to the appropriate reader depending on whether
+ * the fid names an auth file, the dump directory, a regular
+ * directory, or a regular file.
+ */
+static void
+fsread(Fmsg *m)
+{
+	Fcall r;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	r.type = Rread;
+	r.count = 0;
+	r.data = nil;
+	if(waserror()){
+		rerror(m, errmsg());
+		free(r.data);
+		putfid(f);
+		return;
+	}
+	r.data = emalloc(m->count, 0);
+	if(f->dent->qid.type & QTAUTH)
+		authread(f, &r, r.data, m->count);
+	else if(f->dent->qid.path == Qdump)
+		readsnap(m, f, &r);
+	else if(f->dent->qid.type & QTDIR)
+		readdir(m, f, &r);
+	else
+		readfile(m, f, &r);
+	respond(m, &r);
+	free(r.data);
+	poperror();
+	putfid(f);
+}
+
+/*
+ * Twrite: write the data block by block (each block becomes one
+ * Oinsert message), then append a single Owstat message updating
+ * size, mtime, and muid, and apply the whole batch atomically via
+ * upsert.  Append-only files always write at the current length.
+ */
+static void
+fswrite(Fmsg *m, int id)
+{
+	char sbuf[Wstatmax], kbuf[Max9p/Blksz+2][Offksz], vbuf[Max9p/Blksz+2][Ptrsz];
+	Bptr bp[Max9p/Blksz + 2];
+	Msg kv[Max9p/Blksz + 2];
+	vlong n, o, c, w;
+	int i, j;
+	char *p;
+	Fcall r;
+	Tree *t;
+	Fid *f;
+
+	if((f = getfid(m->conn, m->fid)) == nil){
+		rerror(m, Enofid);
+		return;
+	}
+	if(!(f->mode & DMWRITE)){
+		rerror(m, Einuse);
+		putfid(f);
+		return;
+	}
+	truncwait(f->dent, id);
+	wlock(f->dent);
+	if(waserror()){
+		rerror(m, errmsg());
+		wunlock(f->dent);
+		putfid(f);
+		return;
+	}
+	if(f->dent->gone)
+		error(Ephase);
+	if(f->dent->qid.type & QTAUTH){
+		authwrite(f, &r, m->data, m->count);
+		goto Out;
+	}
+
+	w = 0;
+	p = m->data;
+	o = m->offset;
+	c = m->count;
+	if(f->dent->mode & DMAPPEND)
+		o = f->dent->length;
+	t = agetp(&f->mnt->root);
+	for(i = 0; i < nelem(kv)-1 && c != 0; i++){
+		assert(i == 0 || o%Blksz == 0);
+		kv[i].op = Oinsert;
+		kv[i].k = kbuf[i];
+		kv[i].nk = sizeof(kbuf[i]);
+		kv[i].v = vbuf[i];
+		kv[i].nv = sizeof(vbuf[i]);
+		if(waserror()){
+			/* on failure, release the blocks already allocated for this write */
+			if(!fs->rdonly)
+				for(j = 0; j < i; j++)
+					freeblk(t, nil, bp[j]);
+			nexterror();
+		}
+		n = writeb(f, &kv[i], &bp[i], p, o, c, f->dent->length);
+		poperror();
+		w += n;
+		p += n;
+		o += n;
+		c -= n;
+	}
+
+	p = sbuf;
+	kv[i].op = Owstat;
+	kv[i].k = f->dent->k;
+	kv[i].nk = f->dent->nk;
+	*p++ = 0;
+	if(o > f->dent->length){
+		sbuf[0] |= Owsize;
+		PACK64(p, o);
+		p += 8;
+		/*
+		 * NOTE(review): for DMAPPEND files o was rebased to the old
+		 * length above, so m->offset+m->count can disagree with the
+		 * size `o` packed into the Owstat message — confirm whether
+		 * the cached length should be `o` here.
+		 */
+		f->dent->length = m->offset+m->count;
+	}
+	sbuf[0] |= Owmtime;
+	f->dent->mtime = nsec();
+	PACK64(p, f->dent->mtime);
+	p += 8;
+	sbuf[0] |= Owmuid;
+	PACK32(p, f->uid);
+	p += 4;
+
+	kv[i].v = sbuf;
+	kv[i].nv = p - sbuf;
+	upsert(f->mnt, kv, i+1);
+
+	r.type = Rwrite;
+	r.count = w;
+Out:
+	poperror();
+	respond(m, &r);
+	wunlock(f->dent);
+	putfid(f);
+}
+
+/*
+ * Tflush: respond immediately; the flush-queue locking in runfs
+ * guarantees the flushed request has already been answered.
+ */
+void
+fsflush(Fmsg *m)
+{
+	Fcall r;
+
+	r.type = Rflush;
+	respond(m, &r);
+}
+
+/*
+ * Allocate a connection for the given read/write fds and link it
+ * onto the global connection list.  Returns nil on allocation
+ * failure.
+ */
+Conn *
+newconn(int rfd, int wfd)
+{
+	Conn *c;
+
+	if((c = mallocz(sizeof(*c), 1)) == nil)
+		return nil;
+	c->rfd = rfd;
+	c->wfd = wfd;
+	c->iounit = Max9p;
+	/*
+	 * read the list head under connlk: snapshotting fs->conns
+	 * before taking the lock could lose a concurrent insertion.
+	 */
+	lock(&fs->connlk);
+	c->next = fs->conns;
+	fs->conns = c;
+	unlock(&fs->connlk);
+	return c;
+}
+
+/*
+ * Per-connection message pump: read 9p messages, serialize flushes
+ * against in-flight requests via the flush-queue rwlocks, and
+ * dispatch each request either inline (setup messages), to the
+ * mutator proc (wrchan), or to a reader proc picked by fid hash.
+ */
+void
+runfs(int, void *pc)
+{
+	char err[128];
+	RWLock *lk;
+	Amsg *a;
+	Conn *c;
+	Fcall r;
+	Fmsg *m;
+	u32int h;
+
+	c = pc;
+	while(1){
+		if(readmsg(c, &m) < 0){
+			fshangup(c, "read message: %r");
+			return;
+		}
+		if(m == nil)
+			break;
+		if(convM2S(m->buf, m->sz, m) == 0){
+			fshangup(c, "invalid message: %r");
+			return;
+		}
+		if(m->type != Tversion && !c->versioned){
+			fshangup(c, "version required");
+			return;
+		}
+		dprint("← %F\n", &m->Fcall);
+
+		/*
+		 * a flush takes the write lock on its target's queue slot,
+		 * so it blocks until the flushed request has responded.
+		 */
+		if(m->type == Tflush){
+			lk = &fs->flushq[ihash(m->oldtag) % Nflushtab];
+			wlock(lk);
+		}else{
+			lk = &fs->flushq[ihash(m->tag) % Nflushtab];
+			rlock(lk);
+		}
+
+		a = nil;
+		h = ihash(m->fid) % fs->nreaders;
+		switch(m->type){
+		/* sync setup, must not access tree */
+		case Tversion:	fsversion(m);	break;
+		case Tauth:	fsauth(m);	break;
+		case Tflush:	fsflush(m);	break;
+		case Tclunk:	fsclunk(m, &a);	break;
+
+		/* mutators */
+		case Tcreate:	chsend(fs->wrchan, m);	break;
+		case Twrite:	chsend(fs->wrchan, m);	break;
+		case Twstat:	chsend(fs->wrchan, m);	break;
+		case Tremove:	chsend(fs->wrchan, m);	break;
+
+		/* reads */
+		case Tattach:	chsend(fs->rdchan[h], m);	break;
+		case Twalk:	chsend(fs->rdchan[h], m);	break;
+		case Tread:	chsend(fs->rdchan[h], m);	break;
+		case Tstat:	chsend(fs->rdchan[h], m);	break;
+
+		/* both */
+		case Topen:
+			if((m->mode & OTRUNC) || (m->mode & ORCLOSE) != 0)
+				chsend(fs->wrchan, m);
+			else
+				chsend(fs->rdchan[h], m);
+			break;
+
+		default:
+			fprint(2, "unknown message %F\n", &m->Fcall);
+			snprint(err, sizeof(err), "unknown message: %F", &m->Fcall);
+			r.type = Rerror;
+			r.ename = err;
+			respond(m, &r);
+			break;
+		}
+		assert(estacksz() == 0);
+		if(a != nil)
+			chsend(fs->admchan, a);
+	}
+}
+
+/*
+ * Single mutator proc: serializes all tree-modifying requests
+ * under mutlk, running each inside an epoch.  Admin messages
+ * produced by a request are forwarded to the sweeper afterwards.
+ */
+void
+runmutate(int id, void *)
+{
+	Fmsg *m;
+	Amsg *a;
+	Fid *f;
+
+	while(1){
+		a = nil;
+		m = chrecv(fs->wrchan);
+		if(fs->rdonly){
+			/*
+			 * special case: even if Tremove fails, we need
+			 * to clunk the fid.
+			 */
+			if(m->type == Tremove){
+				if((f = getfid(m->conn, m->fid)) == nil){
+					rerror(m, Enofid);
+					continue;
+				}
+				clunkfid(m->conn, f, nil);
+				putfid(f);
+			}
+			rerror(m, Erdonly);
+			continue;
+		}
+
+		qlock(&fs->mutlk);
+		epochstart(id);
+		fs->snap.dirty = 1;
+		switch(m->type){
+		case Tcreate:	fscreate(m);		break;
+		case Twrite:	fswrite(m, id);		break;
+		case Twstat:	fswstat(m, id, &a);	break;
+		case Tremove:	fsremove(m, id, &a);	break;
+		case Topen:	fsopen(m, id, &a);	break;
+		default:	abort();		break;
+		}
+		assert(estacksz() == 0);
+		epochend(id);
+		epochclean();
+		qunlock(&fs->mutlk);
+
+		if(a != nil)
+			chsend(fs->admchan, a);
+	}
+}
+
+/*
+ * Reader proc: handles read-only requests from its channel, each
+ * inside an epoch so blocks it touches are not reclaimed under it.
+ */
+void
+runread(int id, void *ch)
+{
+	Fmsg *m;
+
+	while(1){
+		m = chrecv(ch);
+		epochstart(id);
+		switch(m->type){
+		case Tattach:	fsattach(m);		break;
+		case Twalk:	fswalk(m);		break;
+		case Tread:	fsread(m);		break;
+		case Tstat:	fsstat(m);		break;
+		case Topen:	fsopen(m, id, nil);	break;
+		}
+		assert(estacksz() == 0);
+		epochend(id);
+	}
+}
+
+/*
+ * Recursively free the btree rooted at rb, releasing only blocks
+ * newer than the predecessor generation (older blocks are still
+ * shared with the predecessor snapshot).  Epochs are cleaned
+ * between children to keep the limbo list short.
+ */
+void
+freetree(Bptr rb, vlong pred)
+{
+	Bptr bp;
+	Blk *b;
+	Kvp kv;
+	int i;
+
+	b = getblk(rb, 0);
+	if(b->type == Tpivot){
+		for(i = 0; i < b->nval; i++){
+			getval(b, i, &kv);
+			bp = unpackbp(kv.v, kv.nv);
+			freetree(bp, pred);
+			qlock(&fs->mutlk);
+			epochclean();
+			qunlock(&fs->mutlk);
+		}
+	}
+	if(rb.gen > pred)
+		freeblk(nil, nil, rb);
+	dropblk(b);
+}
+
+/*
+ * Here, we clean epochs frequently, but we run outside of
+ * an epoch; this is because the caller of this function
+ * has already waited for an epoch to tick over, there's
+ * nobody that can be accessing the tree other than us,
+ * and we just need to keep the limbo list short.
+ *
+ * Because this is the last reference to the tree, we don't
+ * need to hold the mutlk, other than when we free or kill
+ * blocks via epochclean.
+ */
+/*
+ * Free a dead snapshot tree: first release data blocks referenced
+ * by Kdat keys (again, only those newer than the predecessor
+ * generation), then free the tree structure itself.  See the
+ * comment above for the epoch discipline.
+ */
+void
+sweeptree(Tree *t)
+{
+	char pfx[1];
+	Scan s;
+	Bptr bp;
+	pfx[0] = Kdat;
+	btnewscan(&s, pfx, 1);
+	btenter(t, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		bp = unpackbp(s.kv.v, s.kv.nv);
+		if(bp.gen > t->pred)
+			freeblk(nil, nil, bp);
+		qlock(&fs->mutlk);
+		epochclean();
+		qunlock(&fs->mutlk);
+	}
+	btexit(&s);
+	freetree(t->bp, t->pred);
+}
+
+/*
+ * Background admin proc: handles sync requests (including halt),
+ * snapshot creation/deletion, deferred ORCLOSE removals, and
+ * background clearing of freed file data.  Each message is freed
+ * after its case completes, so cases must break (not continue)
+ * out of the switch.
+ */
+void
+runsweep(int id, void*)
+{
+	char buf[Kvmax];
+	Bptr bp, nb, *oldhd;
+	vlong off;
+	Tree *t;
+	Arena *a;
+	Amsg *am;
+	Blk *b;
+	Msg m, mb[2];
+	int i, nm;
+
+	if((oldhd = calloc(fs->narena, sizeof(Bptr))) == nil)
+		sysfatal("malloc log heads");
+	while(1){
+		am = chrecv(fs->admchan);
+		if(agetl(&fs->rdonly)){
+			fprint(2, "spurious adm message\n");
+			break;
+		}
+		switch(am->op){
+		case AOsync:
+			tracem("syncreq");
+			if(!fs->snap.dirty && !am->halt)
+				break;	/* nothing to sync; was continue, which leaked am */
+			if(agetl(&fs->rdonly))
+				goto Justhalt;
+			if(waserror()){
+				fprint(2, "sync error: %s\n", errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+
+			if(am->halt)
+				ainc(&fs->rdonly);
+			/* compress the allocation logs of arenas that have grown large */
+			qlock(&fs->mutlk);
+			for(i = 0; i < fs->narena; i++){
+				a = &fs->arenas[i];
+				qlock(a);
+				if(a->nlog < a->reserve/(10*Blksz)){
+					oldhd[i].addr = -1;
+					oldhd[i].hash = -1;
+					oldhd[i].gen = -1;
+					qunlock(a);
+					continue;
+				}
+				if(waserror()){
+					qunlock(&fs->mutlk);
+					qunlock(a);
+					nexterror();
+				}
+				oldhd[i] = a->loghd;
+				epochstart(id);
+				compresslog(a);
+				qunlock(a);
+				epochend(id);
+				epochclean();
+				poperror();
+			}
+			qunlock(&fs->mutlk);
+			sync();
+
+			/* after the sync, free the superseded log chains */
+			for(i = 0; i < fs->narena; i++){
+				for(bp = oldhd[i]; bp.addr != -1; bp = nb){
+					qlock(&fs->mutlk);
+					epochstart(id);
+					b = getblk(bp, 0);
+					nb = b->logp;
+					freeblk(nil, b, b->bp);
+					dropblk(b);
+					epochend(id);
+					epochclean();
+					qunlock(&fs->mutlk);
+				}
+			}
+
+Justhalt:
+			if(am->halt){
+				assert(fs->snapdl.hd.addr == -1);
+				assert(fs->snapdl.tl.addr == -1);
+				postnote(PNGROUP, getpid(), "halted");
+				exits(nil);
+			}
+			poperror();
+			break;
+
+		case AOsnap:
+			tracem("snapreq");
+			if(agetl(&fs->rdonly)){
+				fprint(2, "read only fs\n");
+				break;	/* was continue, which leaked am */
+			}
+			if(waserror()){
+				fprint(2, "taking snap: %s\n", errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+
+			qlock(&fs->mutlk);
+			if(waserror()){
+				qunlock(&fs->mutlk);
+				nexterror();
+			}
+			epochstart(id);
+			snapfs(am, &t);
+			epochend(id);
+			poperror();
+			qunlock(&fs->mutlk);
+
+			sync();
+
+			/* a deleted snapshot hands back its tree to sweep */
+			if(t != nil){
+				epochwait();
+				sweeptree(t);
+				closesnap(t);
+			}
+			poperror();
+			break;
+
+		case AOrclose:
+			/* remove the entry first, then clear its data (fallthrough) */
+			nm = 0;
+			mb[nm].op = Odelete;
+			mb[nm].k = am->dent->k;
+			mb[nm].nk = am->dent->nk;
+			mb[nm].nv = 0;
+			nm++;
+			if(am->dent->qid.type & QTDIR){
+				packsuper(buf, sizeof(buf), am->qpath);
+				mb[nm].op = Oclobber;
+				mb[nm].k = buf;
+				mb[nm].nk = Upksz;
+				mb[nm].nv = 0;
+				nm++;
+			}
+			upsert(am->mnt, mb, nm);
+			/* fallthrough */
+		case AOclear:
+			tracem("bgclear");
+			if(waserror()){
+				fprint(2, "clear file %llx: %s\n", am->qpath, errmsg());
+				ainc(&fs->rdonly);
+				break;
+			}
+			if(am->dent != nil)
+				qlock(&am->dent->trunclk);
+			fs->snap.dirty = 1;
+			for(off = am->off; off < am->end; off += Blksz){
+				qlock(&fs->mutlk);
+				if(waserror()){
+					qunlock(&fs->mutlk);
+					nexterror();
+				}
+				epochstart(id);
+				m.k = buf;
+				m.nk = sizeof(buf);
+				m.op = Oclearb;
+				m.k[0] = Kdat;
+				PACK64(m.k+1, am->qpath);
+				PACK64(m.k+9, off);
+				m.v = nil;
+				m.nv = 0;
+				upsert(am->mnt, &m, 1);
+				epochend(id);
+				epochclean();
+				qunlock(&fs->mutlk);
+				poperror();
+			}
+			if(am->dent != nil){
+				am->dent->trunc = 0;
+				rwakeup(&am->dent->truncrz);
+				qunlock(&am->dent->trunclk);
+				clunkdent(am->dent);
+			}
+			clunkmount(am->mnt);
+			poperror();
+			break;
+		}
+		assert(estacksz() == 0);
+		free(am);
+	}
+}
+
+/*
+ * Queue a snapshot request on the admin channel: snapshot `old`
+ * under the name `new`, or delete `old` when new is nil.
+ */
+void
+snapmsg(char *old, char *new, int flg)
+{
+	Amsg *a;
+
+	a = emalloc(sizeof(Amsg), 1);
+	a->op = AOsnap;
+	a->fd = -1;
+	a->flag = flg;
+	strecpy(a->old, a->old+sizeof(a->old), old);
+	if(new == nil)
+		a->delete = 1;
+	else
+		strecpy(a->new, a->new+sizeof(a->new), new);
+	chsend(fs->admchan, a);
+}
+
+/*
+ * Periodic housekeeping proc: every 5 seconds request a sync, and
+ * on day/hour/minute boundaries take automatic snapshots for
+ * mounts with the Ltsnap flag, rotating the per-mount rings of
+ * hourly (24) and minutely (60) snapshot names.
+ */
+void
+runtasks(int, void *)
+{
+	char buf[128];
+	Tm now, then;
+	Mount *mnt;
+	int m, h;
+	Amsg *a;
+
+	m = 0;
+	h = 0;
+	tmnow(&then, nil);
+	tmnow(&now, nil);
+	while(1){
+		sleep(5000);
+		if(fs->rdonly)
+			continue;
+		if(waserror()){
+			fprint(2, "task error: %s\n", errmsg());
+			continue;
+		}
+		a = emalloc(sizeof(Amsg), 1);
+		a->op = AOsync;
+		a->halt = 0;
+		a->fd = -1;
+		chsend(fs->admchan, a);
+
+		tmnow(&now, nil);
+		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+			if(!(mnt->flag & Ltsnap))
+				continue;
+			if(now.yday != then.yday){
+				snprint(buf, sizeof(buf),
+					"%s@day.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", buf, Lauto);
+			}
+			if(now.hour != then.hour){
+				/* delete the snapshot this ring slot held an hour-cycle ago */
+				if(mnt->hourly[h][0] != 0)
+					snapmsg(mnt->hourly[h], nil, 0);
+				snprint(mnt->hourly[h], sizeof(mnt->hourly[h]),
+					"%s@hour.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", mnt->hourly[h], Lauto);
+			}
+			if(now.min != then.min){
+				if(mnt->minutely[m][0] != 0)
+					snapmsg(mnt->minutely[m], nil, 0);
+				snprint(mnt->minutely[m], sizeof(mnt->minutely[m]),
+					"%s@minute.%τ", mnt->name, tmfmt(&now, "YYYY.MM.DD[_]hh:mm:ss"));
+				snapmsg("main", mnt->minutely[m], Lauto);
+			}
+		}
+		if(now.hour != then.hour)
+			h = (h+1)%24;
+		if(now.min != then.min)
+			m = (m+1)%60;
+		then = now;
+		poperror();
+	}
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/hash.c
@@ -1,0 +1,153 @@
+// metrohash64.cpp
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2015 J. Andrew Rogers
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * NOTE(review): _le64toh, ROTATE, HALF_ROUND and DOUBLE_ROUND are
+ * not referenced in this file (siphash leftovers?) — confirm before
+ * removing.
+ */
+#define _le64toh(x) \
+	GBIT64((char*)&x)
+
+
+#define ROTATE(x, b) (u64int)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+
+#define HALF_ROUND(a,b,c,d,s,t)	\
+	a += b; c += d;		\
+	b = ROTATE(b, s) ^ a;	\
+	d = ROTATE(d, t) ^ c;	\
+	a = ROTATE(a, 32);
+
+#define DOUBLE_ROUND(v0,v1,v2,v3)	\
+	HALF_ROUND(v0,v1,v2,v3,13,16);	\
+	HALF_ROUND(v2,v1,v0,v3,17,21);	\
+	HALF_ROUND(v0,v1,v2,v3,13,16);	\
+	HALF_ROUND(v2,v1,v0,v3,17,21);
+
+/*
+ * NOTE(review): direct-dereference loads assume a little-endian
+ * target that tolerates unaligned access — TODO confirm for all
+ * supported architectures.
+ */
+#define rotate_right(v, k)\
+	((v >> k) | (v << (64 - k)))
+#define read_u64(ptr) \
+	(*(u64int*)ptr)
+#define read_u32(ptr) \
+	(*(u32int*)ptr)
+#define read_u16(ptr) \
+	(*(u16int*)ptr)
+#define read_u8(ptr) \
+	(*(u8int*)ptr)
+
+/*
+ * MetroHash64, variant 1 (see the license header above): hash len
+ * bytes of key with the given seed.  Consumes the input in 32-byte
+ * stripes, then 16/8/4/2/1-byte tails, then finalizes.
+ */
+uvlong
+metrohash64_1(void * key, u64int len, u32int seed)
+{
+	static const u64int k0 = 0xC83A91E1;
+	static const u64int k1 = 0x8648DBDB;
+	static const u64int k2 = 0x7BDEC03B;
+	static const u64int k3 = 0x2F5870A5;
+
+	const uchar * ptr = key;
+	const uchar * const end = ptr + len;
+
+	u64int hash = ((((u64int) seed) + k2) * k0) + len;
+
+	if(len >= 32){
+		u64int v[4];
+		v[0] = hash;
+		v[1] = hash;
+		v[2] = hash;
+		v[3] = hash;
+
+		do{
+			v[0] += read_u64(ptr) * k0; ptr += 8; v[0] = rotate_right(v[0],29) + v[2];
+			v[1] += read_u64(ptr) * k1; ptr += 8; v[1] = rotate_right(v[1],29) + v[3];
+			v[2] += read_u64(ptr) * k2; ptr += 8; v[2] = rotate_right(v[2],29) + v[0];
+			v[3] += read_u64(ptr) * k3; ptr += 8; v[3] = rotate_right(v[3],29) + v[1];
+		}
+		while(ptr <= (end - 32));
+
+		v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1;
+		v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0;
+		v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1;
+		v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;
+		hash += v[0] ^ v[1];
+	}
+
+	if((end - ptr) >= 16){
+		u64int v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0,33) * k1;
+		u64int v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1,33) * k2;
+		v0 ^= rotate_right(v0 * k0, 35) + v1;
+		v1 ^= rotate_right(v1 * k3, 35) + v0;
+		hash += v1;
+	}
+
+	if((end - ptr) >= 8){
+		hash += read_u64(ptr) * k3; ptr += 8;
+		hash ^= rotate_right(hash, 33) * k1;
+
+	}
+
+	if((end - ptr) >= 4){
+		hash += read_u32(ptr) * k3; ptr += 4;
+		hash ^= rotate_right(hash, 15) * k1;
+	}
+
+	if((end - ptr) >= 2){
+		hash += read_u16(ptr) * k3; ptr += 2;
+		hash ^= rotate_right(hash, 13) * k1;
+	}
+
+	if((end - ptr) >= 1){
+		hash += read_u8 (ptr) * k3;
+		hash ^= rotate_right(hash, 25) * k1;
+	}
+
+	hash ^= rotate_right(hash, 33);
+	hash *= k0;
+	hash ^= rotate_right(hash, 33);
+
+	return hash;
+}
+
+/* hash an arbitrary buffer with the file system's fixed seed */
+uvlong
+bufhash(void *src, usize len)
+{
+	return metrohash64_1(src, len, 0x6765);
+}
+
+/* hash a block's full contents (always Blksz bytes) */
+uvlong
+blkhash(Blk *b)
+{
+	return metrohash64_1(b->buf, Blksz, 0x6765);
+}
+
+/*
+ * Mix a 64-bit value into a well-distributed 32-bit hash.  The
+ * multiply constants are the splitmix64 finalizer's.
+ */
+u32int
+ihash(uvlong x)
+{
+	x ^= x >> 30;
+	x *= 0xbf58476d1ce4e5b9ULL;
+	x ^= x >> 27;
+	x *= 0x94d049bb133111ebULL;
+	x ^= x >> 31;
+	return x;
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/load.c
@@ -1,0 +1,142 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/* AVL ordering for free ranges: compare by starting offset */
+static int
+rangecmp(Avl *a, Avl *b)
+{
+	Arange *ra, *rb;
+
+	ra = (Arange*)a;
+	rb = (Arange*)b;
+	if(ra->off < rb->off)
+		return -1;
+	if(ra->off > rb->off)
+		return 1;
+	return 0;
+}
+
+/*
+ * Load an arena from its pair of header blocks at hd.  Prefers a
+ * header that passes the soft consistency check; if only one is
+ * consistent the other is reloaded unchecked (a crash mid-update
+ * can leave one header stale), and if neither is, the fs is hosed.
+ */
+void
+loadarena(Arena *a, Bptr hd)
+{
+	Blk *h0, *h1, *b;
+	Bptr bp;
+
+	/* try to load block pointers with consistency check */
+	bp = hd;
+	h0 = nil;
+	h1 = nil;
+	if(!waserror()){
+		h0 = getblk(bp, GBsoftchk);
+		poperror();
+	}else
+		print("loading arena primary header: %s\n", errmsg());
+	bp.addr += Blksz;
+	if(!waserror()){
+		h1 = getblk(bp, GBsoftchk);
+		poperror();
+	}else
+		print("loading arena backup header: %s\n", errmsg());
+
+	/* if neither head nor tail is consistent, we're hosed */
+	b = (h0 != nil) ? h0 : h1;
+	if(b == nil)
+		error(Efs);
+
+	/* otherwise, we could have crashed mid-pass, just load the blocks */
+	bp = hd;
+	if(h0 == nil)
+		h0 = getblk(bp, GBnochk);
+	bp.addr += Blksz;
+	if(h1 == nil)
+		h1 = getblk(bp, GBnochk);
+
+	unpackarena(a, b->data, Arenasz);
+	if((a->free = avlcreate(rangecmp)) == nil)
+		error(Enomem);
+	a->h0 = h0;
+	a->h1 = h1;
+	a->used = a->size;
+}
+
+/*
+ * Open the device and bring up the file system: load the
+ * superblocks (falling back to the backup copy), the arenas and
+ * their allocation logs, and the user table from the adm snapshot;
+ * finally print a summary of what was loaded.
+ */
+void
+loadfs(char *dev)
+{
+	Bptr bhd, btl;
+	Mount *dump;
+	Arena *a;
+	Tree *t;
+	Dir *d;
+	int i;
+	vlong eb;
+
+	if((dump = mallocz(sizeof(*dump), 1)) == nil)
+		sysfatal("malloc: %r");
+	if(waserror())
+		sysfatal("load fs: %s", errmsg());
+	snprint(dump->name, sizeof(dump->name), "dump");
+	dump->ref = 1;
+	dump->gen = -1;
+	dump->root = &fs->snap;
+
+	fs->snapmnt = dump;
+	fs->narena = 1;
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("stat %s: %r", dev);
+	/* the superblocks live in the first and last full blocks of the device */
+	eb = d->length;
+	eb = eb - (eb%Blksz) - Blksz;
+	bhd = (Bptr){0, -1, -1};
+	btl = (Bptr){eb, -1, -1};
+	fs->sb0 = getblk(bhd, GBnochk);
+	fs->sb1 = getblk(btl, GBnochk);
+	if(!waserror()){
+		unpacksb(fs, fs->sb0->buf, Blksz);
+		poperror();
+	}else{
+		fprint(2, "unable to load primary superblock: %s\n", errmsg());
+		if(waserror()){
+			/* fixed: this failure is for the backup copy, not the primary */
+			fprint(2, "unable to load backup superblock: %s\n", errmsg());
+			exits("corrupt");
+		}
+		unpacksb(fs, fs->sb1->buf, Blksz);
+		poperror();
+	}
+
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		/* reserve ~0.1% of the arena, clamped to [512KiB, 8MiB] */
+		a->reserve = a->size / 1024;
+		if(a->reserve < 512*KiB)
+			a->reserve = 512*KiB;
+		if(a->reserve > 8*MiB)
+			a->reserve = 8*MiB;
+	}
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		a->logbuf[0] = cachepluck();
+		a->logbuf[1] = cachepluck();
+		a->logbuf[0]->bp = (Bptr){-1, -1, -1};
+		a->logbuf[1]->bp = (Bptr){-1, -1, -1};
+		loadlog(a, a->loghd);
+	}
+
+	if((t = opensnap("adm", nil)) == nil)
+		sysfatal("load users: no adm label");
+	loadusers(2, t);
+	poperror();
+
+	fprint(2, "load %s:\n", dev);
+	fprint(2, "\tsnaptree:\t%B\n", fs->snap.bp);
+	fprint(2, "\tnarenas:\t%d\n", fs->narena);
+	fprint(2, "\tfeatures:\t%lld\n", fs->flag);
+	fprint(2, "\tnextqid:\t%lld\n", fs->nextqid);
+	fprint(2, "\tlastqgen:\t%lld\n", fs->qgen);
+	fprint(2, "\tnextgen:\t%lld\n", fs->nextgen);
+	fprint(2, "\tblocksize:\t%lld\n", Blksz);
+	fprint(2, "\tcachesz:\t%lld MiB\n", fs->cmax*Blksz/MiB);
+	closesnap(t);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/main.c
@@ -1,0 +1,435 @@
+#include <u.h>
+#include <libc.h>
+#include <avl.h>
+#include <fcall.h>
+#include <bio.h>
+
+#include "dat.h"
+#include "fns.h"
+#include "atomic.h"
+
+Gefs *fs;
+
+int ream;
+int grow;
+int debug;
+int stdio;
+int noauth;
+int nproc;
+int permissive;
+int usereserve;
+int checkonly;
+char *reamuser;
+char *dev;
+vlong tracesz = 16*MiB;
+vlong cachesz = 512*MiB;
+char *srvname = "gefs";
+int noneid = 0;
+int nogroupid = 9999;
+int admid = -1;
+Blk *blkbuf;
+Errctx **errctx;
+
+/*
+ * Record one entry in the global in-memory trace ring.
+ * Lock-free: each caller claims a slot with an atomic increment,
+ * so concurrent writers never share a slot (they may be reordered).
+ */
+void
+_trace(char *msg, Bptr bp, vlong v0, vlong v1)
+{
+	Trace *t;
+	ulong idx;
+
+	/* aincl returns the post-increment value, hence idx-1 */
+	idx = aincl(&fs->traceidx, 1);
+	t = &fs->trace[(idx-1) % fs->ntrace];
+	strecpy(t->msg, t->msg+sizeof(t->msg), msg);
+	t->tid = (*errctx)->tid;
+	t->qgen = agetv(&fs->qgen);
+	t->bp = bp;
+	t->v0 = v0;
+	t->v1 = v1;
+}
+
+/*
+ * Write to this proc's ctl file so it is not paged out.
+ * NOTE(review): despite the name, the ctl message written is
+ * "noswap" — confirm that is the intended message.
+ * Failure is non-fatal: we just log and carry on.
+ */
+static void
+nokill(void)
+{
+	char buf[128];
+	int fd;
+
+	snprint(buf, sizeof(buf), "/proc/%d/ctl", getpid());
+	if((fd = open(buf, OWRITE)) == -1){
+		fprint(2, "nokill: open %s: %r", buf);
+		return;
+	}
+	if(fprint(fd, "noswap\n") == -1)
+		fprint(2, "nokill: write %s: %r", buf);
+	/* was leaked: the ctl fd must be closed on every path */
+	close(fd);
+}
+
+/*
+ * Return the machine's physical memory size in bytes, as reported
+ * by the "memory" line of /dev/swap; falls back to 512MiB if the
+ * file is unreadable.
+ */
+static uvlong
+memsize(void)
+{
+	char *ln, *f[2];
+	vlong mem;
+	Biobuf *bp;
+
+	mem = 512*MiB;
+	if((bp = Bopen("/dev/swap", OREAD)) == nil)
+		return mem;
+	while((ln = Brdstr(bp, '\n', 1)) != nil){
+		if(tokenize(ln, f, nelem(f)) != 2){
+			/* was leaked: Brdstr allocates every line */
+			free(ln);
+			continue;
+		}
+		if(strcmp(f[1], "memory") == 0){
+			mem = strtoll(f[0], 0, 0);
+			free(ln);
+			break;
+		}
+		free(ln);
+	}
+	Bterm(bp);
+	return mem;
+}
+
+/*
+ * Push a fresh error label onto this proc's handler stack and hand
+ * it back for setjmp; paired with poperror()/nexterror().
+ */
+jmp_buf*
+_waserror(void)
+{
+	Errctx *ctx;
+	int lab;
+
+	ctx = *errctx;
+	lab = ctx->nerrlab++;
+	assert(lab >= 0 && lab+1 < Estacksz);
+	return &ctx->errlab[lab];
+}
+
+/*
+ * Format an error into the per-proc error buffer and unwind to the
+ * nearest waserror() label.  If broke is set the fs is considered
+ * wedged: print the message and abort instead of unwinding.
+ */
+_Noreturn static void
+errorv(char *fmt, va_list ap, int broke)
+{
+	Errctx *c;
+
+	c = *errctx;
+	vsnprint(c->err, sizeof(c->err), fmt, ap);
+	if(broke){
+		fprint(2, "%s\n", c->err);
+		abort();
+	}
+	assert(c->nerrlab > 0 && c->nerrlab < Estacksz);
+	longjmp(c->errlab[--c->nerrlab], -1);
+}
+
+/*
+ * Fatal corruption: wedge the fs read-only, then report and abort
+ * via errorv's broke path.
+ */
+_Noreturn void
+broke(char *fmt, ...)
+{
+	va_list args;
+
+	aincl(&fs->rdonly, 1);
+	va_start(args, fmt);
+	errorv(fmt, args, 1);
+}
+
+/*
+ * Recoverable error: format the message and unwind to the nearest
+ * waserror() label.
+ */
+_Noreturn void
+error(char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	errorv(fmt, args, 0);
+}
+
+/*
+ * Re-raise the current error to the next outer waserror() label,
+ * popping this one off the handler stack.
+ */
+_Noreturn void
+nexterror(void)
+{
+	Errctx *ctx;
+	int lab;
+
+	ctx = *errctx;
+	lab = ctx->nerrlab - 1;
+	assert(lab >= 0 && lab+1 < Estacksz);
+	ctx->nerrlab = lab;
+	longjmp(ctx->errlab[lab], -1);
+}
+
+/*
+ * Error-raising allocator: raises Enomem via error() instead of
+ * returning nil.  getcallerpc takes the address of the first
+ * argument, so sz must remain the first parameter.
+ */
+void*
+emalloc(usize sz, int zero)
+{
+	void *p;
+
+	if((p = mallocz(sz, zero)) == nil)
+		error(Enomem);
+	setmalloctag(p, getcallerpc(&sz));
+	return p;
+}
+
+/*
+ * Allocate the global fs state and all fixed-size caches, sized
+ * from the requested cache size in bytes.  Fatal on failure.
+ */
+static void
+initfs(vlong cachesz)
+{
+	Blk *b;
+
+	if((fs = mallocz(sizeof(Gefs), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	/* optional in-memory trace ring; disabled when tracesz is 0 */
+	if(tracesz != 0){
+		fs->trace = emalloc(tracesz, 1);
+		fs->ntrace = tracesz/sizeof(Trace);
+	}
+	fs->lrurz.l = &fs->lrulk;
+	fs->syncrz.l = &fs->synclk;
+	fs->noauth = noauth;
+	fs->cmax = cachesz/Blksz;
+	if(fs->cmax > (1<<30))
+		sysfatal("cache too big");
+	if((fs->bcache = mallocz(fs->cmax*sizeof(Bucket), 1)) == nil)
+		sysfatal("malloc: %r");
+	/* deadlist cache: ~10% of the block cache, clamped to [4, 512] */
+	fs->dlcmax = fs->cmax/10;
+	if(fs->dlcmax < 4)
+		fs->dlcmax = 4;
+	if(fs->dlcmax > 512)
+		fs->dlcmax = 512;
+	if((fs->dlcache = mallocz(fs->dlcmax*sizeof(Dlist*), 1)) == nil)
+		sysfatal("malloc: %r");
+
+	/* block headers live for the whole run: grab one slab via sbrk */
+	blkbuf = sbrk(fs->cmax * sizeof(Blk));
+	if(blkbuf == (void*)-1)
+		sysfatal("sbrk: %r");
+	for(b = blkbuf; b != blkbuf+fs->cmax; b++){
+		b->bp.addr = -1;
+		b->bp.hash = -1;
+		b->magic = Magic;
+		lrutop(b);
+	}
+}
+
+/*
+ * Fork a worker proc sharing memory with the fs.  The child marks
+ * itself unswappable, sets up its own error context, and runs
+ * f(id, arg), never returning.
+ */
+static void
+launch(void (*f)(int, void *), void *arg, char *text)
+{
+	long pid, id;
+
+	/* NOTE(review): nworker is incremented in the child, so this
+	 * parent-side bound check is advisory — confirm it cannot race */
+	assert(fs->nworker < nelem(fs->lepoch));
+	pid = rfork(RFPROC|RFMEM|RFNOWAIT);
+	if (pid < 0)
+		sysfatal("can't fork: %r");
+	if (pid == 0) {
+		nokill();
+		id = aincl(&fs->nworker, 1);
+		/* errctx is per-proc private storage; each worker gets its own */
+		if((*errctx = mallocz(sizeof(Errctx), 1)) == nil)
+			sysfatal("malloc: %r");
+		(*errctx)->tid = id;
+		procsetname("%s.%ld", text, id);
+		(*f)(id, arg);
+		exits("child returned");
+	}
+}
+
+/*
+ * Post one end of a fresh pipe to /srv/<name><suff> and return the
+ * other end.  The srv file is created ORCLOSE, so cfd is deliberately
+ * left open: closing it (or exiting) removes the srv entry.
+ */
+static int
+postfd(char *name, char *suff, int mode)
+{
+	char buf[80];
+	int fd[2];
+	int cfd;
+
+	if(pipe(fd) < 0)
+		sysfatal("can't make a pipe");
+	snprint(buf, sizeof buf, "/srv/%s%s", name, suff);
+	if((cfd = create(buf, OWRITE|ORCLOSE|OCEXEC, mode)) == -1)
+		sysfatal("create %s: %r", buf);
+	if(fprint(cfd, "%d", fd[0]) == -1)
+		sysfatal("write %s: %r", buf);
+	/* the srv side now holds fd[0]; keep only our end */
+	close(fd[0]);
+	return fd[1];
+}
+
+/*
+ * Accept loop for one announce address: every accepted connection
+ * gets its own "netio" worker proc running runfs.  A failed listen
+ * terminates the loop; a failed accept or newconn just skips the
+ * connection.
+ */
+static void
+runannounce(int, void *arg)
+{
+	char *ann, adir[40], ldir[40];
+	int actl, lctl, fd;
+	Conn *c;
+
+	ann = arg;
+	if((actl = announce(ann, adir)) < 0)
+		sysfatal("announce %s: %r", ann);
+	while(1){
+		if((lctl = listen(adir, ldir)) < 0){
+			fprint(2, "listen %s: %r", adir);
+			break;
+		}
+		fd = accept(lctl, ldir);
+		close(lctl);
+		if(fd < 0){
+			fprint(2, "accept %s: %r", ldir);
+			continue;
+		}
+		if(!(c = newconn(fd, fd))){
+			close(fd);
+			fprint(2, "%r");
+			continue;
+		}
+
+		launch(runfs, c, "netio");
+	}
+	close(actl);
+}
+
+/* print the usage line and exit */
+static void
+usage(void)
+{
+	fprint(2, "usage: %s [-SA] [-r user] [-m mem] [-n srv] [-a net]... -f dev\n", argv0);
+	exits("usage");
+}
+
+void
+main(int argc, char **argv)
+{
+	int i, srvfd, ctlfd, nann;
+	char *s, *e, *ann[16];
+	vlong v, memsz;
+	Conn *c;
+
+	nann = 0;
+	memsz = memsize();
+	/* default cache: 25% of physical memory; -m overrides */
+	cachesz = 25*memsz/100;
+	ARGBEGIN{
+	case 'a':
+		if(nann == nelem(ann))
+			sysfatal("too many announces");
+		ann[nann++] = EARGF(usage());
+		break;
+	case 'r':
+		ream = 1;
+		reamuser = EARGF(usage());
+		break;
+	case 'c':
+		checkonly = 1;
+		break;
+	case 'g':
+		grow = 1;
+		break;
+	case 't':
+		/* NOTE(review): any suffix left in e is ignored; -t is always MiB */
+		tracesz = strtoll(EARGF(usage()), &e, 0);
+		tracesz *= MiB;
+		break;
+	case 'm':
+		/* cache size: plain/M = MiB, G = GiB, % = percent of memory */
+		v = strtoll(EARGF(usage()), &e, 0);
+		switch(*e){
+		case 'M': case 'm': case 0:
+			cachesz = v*MiB;
+			break;
+		case 'G': case 'g':
+			cachesz = v*GiB;
+			break;
+		case '%':
+			cachesz = v*memsz/100;
+			break;
+		default:
+			sysfatal("unknown suffix %s", e);
+		}
+		break;
+	case 'd':
+		debug++;
+		break;
+	case 'n':
+		srvname = EARGF(usage());
+		break;
+	case 's':
+		stdio = 1;
+		break;
+	case 'A':
+		noauth = 1;
+		break;
+	case 'S':
+		permissive = 1;
+		break;
+	case 'f':
+		dev = EARGF(usage());
+		break;
+	default:
+		usage();
+		break;
+	}ARGEND;
+	if(dev == nil)
+		usage();
+
+	/*
+	 * sanity checks -- I've tuned these to stupid
+	 * values in the past.
+	 */
+	assert(4*Kpmax < Pivspc);
+	assert(2*Msgmax < Bufspc);
+	assert(Treesz < Inlmax);
+
+	initfs(cachesz);
+	initshow();
+	errctx = privalloc();
+	if((*errctx = mallocz(sizeof(Errctx), 1)) == nil)
+		sysfatal("malloc: %r");
+	tmfmtinstall();
+	fmtinstall('H', encodefmt);
+	fmtinstall('B', Bconv);
+	fmtinstall('M', Mconv);
+	fmtinstall('P', Pconv);
+	fmtinstall('K', Kconv);
+	fmtinstall('R', Rconv);
+	fmtinstall('F', fcallfmt);
+	fmtinstall('Q', Qconv);
+
+	if((s = getenv("NPROC")) != nil)
+		nproc = atoi(s);
+	/* free(nil) is a no-op, so this is safe when NPROC is unset */
+	free(s);
+
+	/*
+	 * too few procs, we can't parallelize io,
+	 * too many, we suffer lock contention
+	 */
+	if(nproc < 2)
+		nproc = 2;
+	if(nproc > 8)
+		nproc = 8;
+	/* ream/grow/check are one-shot maintenance modes: no server procs */
+	if(ream){
+		reamfs(dev);
+		exits(nil);
+	}
+	if(grow){
+		growfs(dev);
+		exits(nil);
+	}
+	if(checkonly){
+		loadfs(dev);
+		if(!checkfs(2))
+			sysfatal("broken fs: %r");
+		exits(nil);
+	}
+
+	rfork(RFNOTEG);
+	nokill();
+	loadfs(dev);
+	fs->wrchan = mkchan(32);
+	fs->admchan = mkchan(32);
+	fs->nsyncers = nproc/2;
+	fs->nreaders = nproc/2;
+	if(fs->nsyncers > fs->narena)
+		fs->nsyncers = fs->narena;
+	for(i = 0; i < fs->nsyncers; i++)
+		qinit(&fs->syncq[i]);
+	if((fs->rdchan = malloc(fs->nreaders*sizeof(Chan*))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->nreaders; i++)
+		fs->rdchan[i] = mkchan(32);
+	/* arenas are striped across the syncer procs */
+	for(i = 0; i < fs->narena; i++)
+		fs->arenas[i].sync = &fs->syncq[i%fs->nsyncers];
+	srvfd = postfd(srvname, "", 0666);
+	ctlfd = postfd(srvname, ".cmd", 0600);
+	launch(runcons, (void*)ctlfd, "ctl");
+	launch(runmutate, nil, "mutate");
+	launch(runsweep, nil, "sweep");
+	launch(runtasks, nil, "tasks");
+	for(i = 0; i < fs->nreaders; i++)
+		launch(runread, fs->rdchan[i], "readio");
+	for(i = 0; i < fs->nsyncers; i++)
+		launch(runsync, &fs->syncq[i], "syncio");
+	for(i = 0; i < nann; i++)
+		launch(runannounce, ann[i], "announce");
+	if(srvfd != -1){
+		if((c = newconn(srvfd, srvfd)) == nil)
+			sysfatal("%r");
+		launch(runfs, c, "srvio");
+	}
+	if(stdio){
+		if((c = newconn(0, 1)) == nil)
+			sysfatal("%r");
+		launch(runfs, c, "stdio");
+	}
+	exits(nil);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/mkfile
@@ -1,0 +1,39 @@
+</$objtype/mkfile
+
+TARG=gefs
+BIN=/$objtype/bin
+OFILES=\
+	blk.$O\
+	cache.$O\
+	check.$O\
+	cons.$O\
+	dump.$O\
+	error.$O\
+	fs.$O\
+	hash.$O\
+	load.$O\
+	main.$O\
+	pack.$O\
+	ream.$O\
+	snap.$O\
+	tree.$O\
+	user.$O\
+	\
+	atomic-$objtype.$O
+
+HFILES=\
+	dat.h\
+	fns.h\
+	atomic.h
+
+</sys/src/cmd/mkone
+</sys/doc/fonts
+
+# format the gefs paper; FONTS comes from /sys/doc/fonts
+%.ps: %.ms
+	{ echo $FONTS; cat $stem.ms } | pic | tbl | eqn | troff -ms | lp -dstdout > $target
+%.pdf: %.ps
+	ps2pdf $stem.ps $stem.pdf
+
+# manual pages are kept in the source directory; install by hand
+man.install: gefs.4.man gefs.8.man
+	cp gefs.4.man /sys/man/4/gefs
+	cp gefs.8.man /sys/man/8/gefs
--- /dev/null
+++ b/sys/src/cmd/gefs/pack.c
@@ -1,0 +1,512 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/* Terminated so we can use them directly in C */
+char*
+unpackstr(char *p, char *e, char **s)
+{
+	int n;
+
+	/* wire layout: u16 length, n bytes, NUL; n+3 bytes total */
+	if (e - p < 3)
+		error(Elength);
+	n = UNPACK16(p);
+	/* a missing NUL means on-disk corruption, not a caller error */
+	if(e - p < n + 3 || p[n+2] != 0)
+		broke(Efs);
+	*s = p+2;
+	return p+3+n;
+}
+
+/* Terminated so we can use them directly in C */
+char*
+packstr(char *p, char *e, char *s)
+{
+	int len;
+
+	len = strlen(s);
+	if(e - p < len+3)
+		error(Elength);
+	PACK16(p, len);
+	p += 2;
+	memmove(p, s, len);
+	p += len;
+	*p++ = 0;
+	return p;
+}
+
+/*
+ * Pack a directory entry into a key-value pair in buf:
+ * key is (Kent, parent, name), value is the packed Xdir.
+ */
+void
+dir2kv(vlong up, Xdir *d, Kvp *kv, char *buf, int nbuf)
+{
+	char *ek, *ev, *eb;
+
+	ek = packdkey(buf, nbuf, up, d->name);
+	kv->k = buf;
+	kv->nk = ek - buf;
+	eb = buf + nbuf;
+	ev = packdval(ek, eb - ek, d);
+	kv->v = ek;
+	kv->nv = ev - ek;
+}
+
+/*
+ * Pack a directory entry key: Kent tag, parent qid path, then the
+ * name.  A nil name packs just the (Kent, up) prefix, useful for
+ * range scans over a directory.
+ */
+char*
+packdkey(char *p, int sz, vlong up, char *name)
+{
+	char *ep;
+
+	ep = p + sz;
+	PACK8(p, Kent);	p += 1;
+	PACK64(p, up);	p += 8;
+	if(name != nil)
+		p = packstr(p, ep, name);
+	return p;
+}
+
+/*
+ * Unpack a directory entry key, returning the name and storing the
+ * parent qid path in *up.  Keys with the wrong tag or a short
+ * buffer are fs bugs, hence the asserts.
+ */
+char*
+unpackdkey(char *p, int sz, vlong *up)
+{
+	char key, *ep, *name;
+
+	ep = p + sz;
+	/* 1-byte tag + 8-byte parent must be present */
+	assert(sz > 9);
+	key = UNPACK8(p);	p += 1;
+	*up = UNPACK64(p);	p += 8;
+	assert(key == Kent);
+	p = unpackstr(p, ep, &name);
+	assert(p <= ep);
+	return name;
+}
+
+/*
+ * Pack a superdirectory key: Kup tag followed by the directory's
+ * qid path; its value maps a directory back to its parent.
+ */
+char*
+packsuper(char *p, int sz, vlong up)
+{
+	char *e;
+
+	e = p+sz;
+	PACK8(p, Kup);
+	p += 1;
+	PACK64(p, up);
+	p += 8;
+	assert(p <= e);
+	return p;
+}
+
+/*
+ * Pack the value half of a directory entry: the fixed-size Xdir
+ * fields, in the same order kv2dir unpacks them.
+ */
+char*
+packdval(char *p, int sz, Xdir *d)
+{
+	char *e;
+
+	e = p + sz;
+	PACK64(p, d->flag);	p += 8;
+	PACK64(p, d->qid.path);	p += 8;
+	PACK32(p, d->qid.vers);	p += 4;
+	PACK8(p, d->qid.type);	p += 1;
+	PACK32(p, d->mode);	p += 4;
+	PACK64(p, d->atime);	p += 8;
+	PACK64(p, d->mtime);	p += 8;
+	PACK64(p, d->length);	p += 8;
+	PACK32(p, d->uid);	p += 4;
+	PACK32(p, d->gid);	p += 4;
+	PACK32(p, d->muid);	p += 4;
+	assert(p <= e);
+	return p;
+}
+
+/*
+ * Unpack a directory entry key-value pair into an Xdir.  d->name
+ * points into kv's key buffer, so it is only valid as long as kv is.
+ * Trailing bytes in either key or value indicate fs corruption.
+ */
+void
+kv2dir(Kvp *kv, Xdir *d)
+{
+	char *k, *ek, *v, *ev;
+
+	memset(d, 0, sizeof(Xdir));
+	/* skip the 1-byte Kent tag and 8-byte parent qid path */
+	k = kv->k + 9;
+	ek = kv->k + kv->nk;
+	k = unpackstr(k, ek, &d->name);
+
+	v = kv->v;
+	ev = v + kv->nv;
+	d->flag = UNPACK64(v);	v += 8;
+	d->qid.path = UNPACK64(v);	v += 8;
+	d->qid.vers = UNPACK32(v);	v += 4;
+	d->qid.type = UNPACK8(v);	v += 1;
+	d->mode = UNPACK32(v);	v += 4;
+	d->atime = UNPACK64(v);	v += 8;
+	d->mtime = UNPACK64(v);	v += 8;
+	d->length = UNPACK64(v);	v += 8;
+	d->uid = UNPACK32(v);	v += 4;
+	d->gid = UNPACK32(v);	v += 4;
+	d->muid = UNPACK32(v);	v += 4;
+	assert(v <= ev);
+	if(k != ek)
+		broke(Efs);
+	if(v != ev)
+		broke(Efs);
+}
+
+/*
+ * Marshal an Xdir into a 9p stat buffer, mapping numeric uid/gid/muid
+ * to user names under fs->userlk.  Returns the packed size, or -1 if
+ * nbuf is too small.  Raises Eperm if no name can be resolved.
+ */
+int
+dir2statbuf(Xdir *d, char *buf, int nbuf)
+{
+	int sz, nn, nu, ng, nm;
+	vlong atime, mtime;
+	User *u, *g, *m;
+	char *p;
+
+	rlock(&fs->userlk);
+	if((u = uid2user(d->uid)) == nil)
+		u = uid2user(noneid);
+	/* was: assigned the fallback to u, leaving g nil for unknown gids */
+	if((g = uid2user(d->gid)) == nil)
+		g = uid2user(nogroupid);
+	if((m = uid2user(d->muid)) == nil)
+		m = uid2user(noneid);
+	if(u == nil || g == nil || m == nil){
+		/* was: error() unwound with userlk still held */
+		runlock(&fs->userlk);
+		error(Eperm);
+	}
+
+	p = buf;
+	nn = strlen(d->name);
+	nu = strlen(u->name);
+	ng = strlen(g->name);
+	nm = strlen(m->name);
+	/* on-disk times are nanoseconds; 9p wants seconds, rounded */
+	atime = (d->atime+Nsec/2)/Nsec;
+	mtime = (d->mtime+Nsec/2)/Nsec;
+	sz = STATFIXLEN + nn + nu + ng + nm;
+	if(sz > nbuf){
+		runlock(&fs->userlk);
+		return -1;
+	}
+
+	PBIT16(p, sz-2);	p += 2;
+	PBIT16(p, -1 /*type*/);	p += 2;
+	PBIT32(p, -1 /*dev*/);	p += 4;
+	PBIT8(p, d->qid.type);	p += 1;
+	PBIT32(p, d->qid.vers);	p += 4;
+	PBIT64(p, d->qid.path);	p += 8;
+	PBIT32(p, d->mode);	p += 4;
+	PBIT32(p, atime);	p += 4;
+	PBIT32(p, mtime);	p += 4;
+	PBIT64(p, d->length);	p += 8;
+
+	PBIT16(p, nn);	p += 2;
+	memcpy(p, d->name, nn);	p += nn;
+	PBIT16(p, nu);	p += 2;
+	memcpy(p, u->name, nu);	p += nu;
+	PBIT16(p, ng);	p += 2;
+	memcpy(p, g->name, ng);	p += ng;
+	PBIT16(p, nm);	p += 2;
+	memcpy(p, m->name, nm);	p += nm;
+	assert(p - buf == sz);
+	runlock(&fs->userlk);
+	return sz;
+}
+
+/* convenience wrapper: unpack a dir entry kvp, then marshal it as 9p stat */
+int
+kv2statbuf(Kvp *kv, char *buf, int nbuf)
+{
+	Xdir d;
+
+	kv2dir(kv, &d);
+	return dir2statbuf(&d, buf, nbuf);
+}
+
+/*
+ * Unpack a qid (path, vers) from a value.
+ * NOTE(review): vers is stored and unpacked as 64 bits even though
+ * Qid.vers is 32-bit — confirm this matches the packed layout.
+ */
+void
+kv2qid(Kvp *kv, Qid *q)
+{
+	char *v, *e;
+
+	v = kv->v;
+	e = v + kv->nv;
+	q->path = UNPACK64(v);	v += 8;
+	q->vers = UNPACK64(v);	v += 8;
+	assert(v <= e);
+}
+
+/*
+ * Unpack a deadlist record: key is (Kdlist, gen, bgen), value is the
+ * head and tail block pointers of the list.
+ */
+void
+kv2dlist(Kvp *kv, Dlist *dl)
+{
+	char *p, *e;
+
+	p = kv->k;
+	e = p + kv->nk;
+	/* skip the Kdlist tag byte */
+	p++;
+	dl->gen = UNPACK64(p);	p += 8;
+	dl->bgen = UNPACK64(p);	p += 8;
+	assert(p <= e);
+
+	p = kv->v;
+	e = p + kv->nv;
+	dl->hd = unpackbp(p, e-p);	p += Ptrsz;
+	dl->tl = unpackbp(p, e-p);	p += Ptrsz;
+	assert(p <= e);
+}
+
+/*
+ * Pack a deadlist record into buf, the inverse of kv2dlist:
+ * key (Kdlist, gen, bgen), value (hd, tl).
+ */
+void
+dlist2kv(Dlist *dl, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *e;
+
+	assert(nbuf >= Dlkvpsz);
+	p = buf;
+	e = buf+nbuf;
+
+	kv->k = p;
+	*p++ = Kdlist;
+	PACK64(p, dl->gen);	p += 8;
+	PACK64(p, dl->bgen);	p += 8;
+	kv->nk = (p - kv->k);
+
+	kv->v = p;
+	p = packbp(p, e-p, &dl->hd);
+	p = packbp(p, e-p, &dl->tl);
+	kv->nv = (p - kv->v);
+}
+
+/*
+ * Pack a snapshot tree record: key (Ksnap, gen), value the packed
+ * Tree.  packsnap/packtree assert rather than return nil, so the
+ * abort() arms are belt-and-suspenders only.
+ */
+void
+tree2kv(Tree *t, Kvp *kv, char *buf, int nbuf)
+{
+	char *p, *e;
+
+	p = buf;
+	e = buf+nbuf;
+
+	kv->k = p;
+	if((p = packsnap(p, e-p, t->gen)) == nil)
+		abort();
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	if((p = packtree(p, e-p, t)) == nil)
+		abort();
+	kv->nv = p - kv->v;
+}
+
+/*
+ * Pack a snapshot retag record: key is (Ksnap, gen), value is the
+ * link gen plus the delta-label and delta-ref counts.
+ */
+void
+retag2kv(vlong gen, vlong link, int dlbl, int dref, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+
+	/* was: only accounted for the 10 value bytes, not the Snapsz-byte key */
+	assert(nbuf >= Snapsz+8+1+1);
+	kv->k = buf;
+	if((p = packsnap(buf, nbuf, gen)) == nil)
+		abort();
+	kv->nk = p - buf;
+
+	kv->v = p;
+	PACK64(p, link);	p += 8;
+	*p = dlbl;	p += 1;
+	*p = dref;	p += 1;
+	kv->nv = p - kv->v;
+}
+
+/*
+ * Pack a snapshot label record: key (Klabel, name), value
+ * (Ksnap, gen, flags) — i.e. the label points at a snap key.
+ */
+void
+lbl2kv(char *lbl, vlong gen, uint flg, Kvp *kv, char *buf, int nbuf)
+{
+	char *p;
+	int n;
+
+	n = strlen(lbl);
+	assert(nbuf >= 1+n + 1+8+4);
+
+	p = buf;
+	kv->k = p;
+	p[0] = Klabel;	p += 1;
+	memcpy(p, lbl, n);	p += n;
+	kv->nk = p - kv->k;
+
+	kv->v = p;
+	p[0] = Ksnap;	p += 1;
+	PACK64(p, gen);	p += 8;
+	PACK32(p, flg);	p += 4;
+	kv->nv = p - kv->v;
+}
+
+/*
+ * Pack a label key: Klabel tag followed by the raw name bytes
+ * (no length, no terminator).
+ */
+char*
+packlbl(char *p, int sz, char *name)
+{
+	int len;
+
+	len = strlen(name);
+	assert(sz >= len+1);
+	*p++ = Klabel;
+	memcpy(p, name, len);
+	p += len;
+	return p;
+}
+
+/* pack a snapshot key: Ksnap tag followed by the generation */
+char*
+packsnap(char *p, int sz, vlong id)
+{
+	assert(sz >= Snapsz);
+	*p++ = Ksnap;
+	PACK64(p, id);
+	return p+8;
+}
+
+/* pack a block pointer: addr, hash, gen — Ptrsz (24) bytes */
+char*
+packbp(char *p, int sz, Bptr *bp)
+{
+	assert(sz >= Ptrsz);
+	PACK64(p, bp->addr);	p += 8;
+	PACK64(p, bp->hash);	p += 8;
+	PACK64(p, bp->gen);	p += 8;
+	return p;
+}
+
+/* unpack a block pointer packed by packbp; returned by value */
+Bptr
+unpackbp(char *p, int sz)
+{
+	Bptr bp;
+
+	assert(sz >= Ptrsz);
+	bp.addr = UNPACK64(p);	p += 8;
+	bp.hash = UNPACK64(p);	p += 8;
+	bp.gen = UNPACK64(p);
+	return bp;
+}
+
+/*
+ * Unpack a Tree from its on-disk form, zeroing the in-memory-only
+ * fields first.  Field order must match packtree exactly.
+ */
+Tree*
+unpacktree(Tree *t, char *p, int sz)
+{
+	assert(sz >= Treesz);
+	memset(t, 0, sizeof(Tree));
+	t->nref = UNPACK32(p);	p += 4;
+	t->nlbl = UNPACK32(p);	p += 4;
+	t->ht = UNPACK32(p);	p += 4;
+	t->flag = UNPACK32(p);	p += 4;
+	t->gen = UNPACK64(p);	p += 8;
+	t->pred = UNPACK64(p);	p += 8;
+	t->succ = UNPACK64(p);	p += 8;
+	t->base = UNPACK64(p);	p += 8;
+	t->bp.addr = UNPACK64(p);	p += 8;
+	t->bp.hash = UNPACK64(p);	p += 8;
+	t->bp.gen = UNPACK64(p);	//p += 8;
+
+	return t;
+}
+
+/* pack a Tree into its on-disk form; inverse of unpacktree */
+char*
+packtree(char *p, int sz, Tree *t)
+{
+	assert(sz >= Treesz);
+	PACK32(p, t->nref);	p += 4;
+	PACK32(p, t->nlbl);	p += 4;
+	PACK32(p, t->ht);	p += 4;
+	PACK32(p, t->flag);	p += 4;
+	PACK64(p, t->gen);	p += 8;
+	PACK64(p, t->pred);	p += 8;
+	PACK64(p, t->succ);	p += 8;
+	PACK64(p, t->base);	p += 8;
+	PACK64(p, t->bp.addr);	p += 8;
+	PACK64(p, t->bp.hash);	p += 8;
+	PACK64(p, t->bp.gen);	p += 8;
+	return p;
+}
+
+/*
+ * Pack an arena header: free-list log head (addr, hash) and the
+ * arena's size and used byte counts.
+ */
+char*
+packarena(char *p, int sz, Arena *a)
+{
+	char *e;
+
+	assert(sz >= Arenasz);
+	e = p + Arenasz;
+	PACK64(p, a->loghd.addr);	p += 8;	/* freelist addr */
+	PACK64(p, a->loghd.hash);	p += 8;	/* freelist hash */
+	PACK64(p, a->size);	p += 8;	/* arena size */
+	PACK64(p, a->used);	p += 8;	/* arena used */
+	assert(p <= e);
+	return p;
+}
+
+/*
+ * Unpack an arena header packed by packarena; generations are not
+ * stored on disk, so loghd.gen is pinned to -1.
+ */
+char*
+unpackarena(Arena *a, char *p, int sz)
+{
+	char *e;
+
+	assert(sz >= Arenasz);
+	memset(a, 0, sizeof(*a));
+
+	e = p + Arenasz;
+	a->loghd.addr = UNPACK64(p);	p += 8;
+	a->loghd.hash = UNPACK64(p);	p += 8;
+	a->loghd.gen = -1;	p += 0;
+	a->size = UNPACK64(p);	p += 8;
+	a->used = UNPACK64(p);	p += 8;
+	a->logtl = nil;
+
+	assert(p <= e);
+	return p;
+}
+
+/*
+ * Pack the superblock: magic, geometry, snapshot tree root, snap
+ * deadlist, counters, then one (addr, hash) pair per arena, and
+ * finally a hash over everything preceding it.
+ * NOTE(review): this asserts narena < 512 but unpacksb checks a
+ * bound of 256 — the two limits should agree.
+ */
+char*
+packsb(char *p0, int sz, Gefs *fi)
+{
+	uvlong h;
+	char *p;
+	int i;
+
+	assert(sz == Blksz);
+	assert(fi->narena < 512);
+	p = p0;
+	memcpy(p, "gefs9.00", 8);	p += 8;
+	PACK32(p, Blksz);	p += 4;
+	PACK32(p, Bufspc);	p += 4;
+	PACK32(p, fi->narena);	p += 4;
+	PACK32(p, fi->snap.ht);	p += 4;
+	PACK64(p, fi->snap.bp.addr);	p += 8;
+	PACK64(p, fi->snap.bp.hash);	p += 8;
+	PACK64(p, fi->snapdl.hd.addr);	p += 8;
+	PACK64(p, fi->snapdl.hd.hash);	p += 8;
+	PACK64(p, fi->snapdl.tl.addr);	p += 8;
+	PACK64(p, fi->snapdl.tl.hash);	p += 8;
+	PACK64(p, fi->flag);	p += 8;
+	PACK64(p, fi->nextqid);	p += 8;
+	PACK64(p, fi->nextgen);	p += 8;
+	PACK64(p, fi->qgen);	p += 8;
+	for(i = 0; i < fi->narena; i++){
+		PACK64(p, fi->arenabp[i].addr);	p += 8;
+		PACK64(p, fi->arenabp[i].hash);	p += 8;
+	}
+	/* trailing integrity hash covers everything packed above */
+	h = bufhash(p0, p - p0);
+	PACK64(p, h);	p += 8;
+	return p;
+}
+
+/*
+ * Unpack and verify a superblock.  Bad magic, a bad arena count, or
+ * a hash mismatch raises error() so the caller can fall back to the
+ * backup superblock.  Generations are not stored; they unpack as -1.
+ */
+char*
+unpacksb(Gefs *fi, char *p0, int sz)
+{
+	uvlong dh, xh;
+	char *p;
+	int i;
+
+	assert(sz == Blksz);
+	p = p0;
+	if(memcmp(p, "gefs9.00", 8) != 0)
+		error("%s %.8s", Efsvers, p);
+	p += 8;
+	fi->blksz = UNPACK32(p);	p += 4;
+	fi->bufspc = UNPACK32(p);	p += 4;
+	fi->narena = UNPACK32(p);	p += 4;
+	/* was asserted only at the end, after narena had already sized an allocation */
+	if(fi->narena == 0 || fi->narena >= 256)
+		error("corrupt superblock: %d arenas", fi->narena);
+	fi->snap.ht = UNPACK32(p);	p += 4;
+	fi->snap.bp.addr = UNPACK64(p);	p += 8;
+	fi->snap.bp.hash = UNPACK64(p);	p += 8;
+	fi->snap.bp.gen = -1;
+	fi->snapdl.hd.addr = UNPACK64(p);	p += 8;
+	fi->snapdl.hd.hash = UNPACK64(p);	p += 8;
+	fi->snapdl.hd.gen = -1;
+	fi->snapdl.tl.addr = UNPACK64(p);	p += 8;
+	fi->snapdl.tl.hash = UNPACK64(p);	p += 8;
+	/* was: reset snapdl.hd.gen a second time, leaving snapdl.tl.gen unset */
+	fi->snapdl.tl.gen = -1;
+	fi->snapdl.gen = -1;
+	fi->flag = UNPACK64(p);	p += 8;
+	fi->nextqid = UNPACK64(p);	p += 8;
+	fi->nextgen = UNPACK64(p);	p += 8;
+	fi->qgen = UNPACK64(p);	p += 8;
+	fi->arenabp = emalloc(fi->narena * sizeof(Bptr), 0);
+	for(i = 0; i < fi->narena; i++){
+		fi->arenabp[i].addr = UNPACK64(p);	p += 8;
+		fi->arenabp[i].hash = UNPACK64(p);	p += 8;
+		fi->arenabp[i].gen = -1;
+	}
+	xh = bufhash(p0, p - p0);
+	dh = UNPACK64(p);	p += 8;
+	if(dh != xh)
+		error("corrupt superblock: %llx != %llx", dh, xh);
+	return p;
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/ream.c
@@ -1,0 +1,462 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+enum {
+ Qmainroot,
+ Qadmroot,
+ Qadmuser,
+ Nreamqid,
+};
+
+/*
+ * Fill an Xdir with fixed fields for a freshly reamed file.
+ * uid -1 is adm and 0 is none, matching the initial user table
+ * written by reamfs.
+ */
+static void
+fillxdir(Xdir *d, vlong qid, char *name, int type, int mode)
+{
+	memset(d, 0, sizeof(Xdir));
+	d->qid = (Qid){qid, 0, type};
+	d->mode = mode;
+	d->atime = 0;
+	d->mtime = 0;
+	d->length = 0;
+	d->name = name;
+	d->uid = -1;
+	d->gid = -1;
+	d->muid = 0;
+}
+
+/*
+ * Populate the adm tree's root leaf r: the data pointer for the
+ * users file (block u, nu bytes), the /adm/users dir entry, the
+ * adm root dir entry, and the root's superdirectory record.
+ */
+static void
+initadm(Blk *r, Blk *u, int nu)
+{
+	char *p, kbuf[Keymax], vbuf[Inlmax];
+	Kvp kv;
+	Xdir d;
+
+	/* nb: values must be inserted in key order */
+	kv.k = kbuf;
+	kv.nk = Offksz;
+	kv.v = vbuf;
+	kv.nv = Ptrsz;
+	kbuf[0] = Kdat;
+	PACK64(kbuf+1, (uvlong)Qadmuser);
+	PACK64(kbuf+9, 0ULL);
+	packbp(kv.v, kv.nv, &u->bp);
+	setval(r, &kv);
+
+	fillxdir(&d, Qadmuser, "users", QTFILE, 0664);
+	d.length = nu;
+	dir2kv(Qadmroot, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+	fillxdir(&d, Qadmroot, "", QTDIR, DMDIR|0775);
+	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+
+	/* root's superdirectory entry: parent of "" is -1 */
+	p = packsuper(kbuf, sizeof(kbuf), 0);
+	kv.k = kbuf;
+	kv.nk = p - kbuf;
+	p = packdkey(vbuf, sizeof(vbuf), -1, "");
+	kv.v = vbuf;
+	kv.nv = p - vbuf;
+	setval(r, &kv);
+}
+
+/*
+ * Populate the main tree's root leaf: the root directory entry and
+ * its superdirectory record.
+ */
+static void
+initroot(Blk *r)
+{
+	char *p, kbuf[Keymax], vbuf[Inlmax];
+	Kvp kv;
+	Xdir d;
+
+	/* nb: values must be inserted in key order */
+	fillxdir(&d, Qmainroot, "", QTDIR, DMDIR|0775);
+	dir2kv(-1, &d, &kv, vbuf, sizeof(vbuf));
+	setval(r, &kv);
+
+	p = packsuper(kbuf, sizeof(kbuf), 0);
+	kv.k = kbuf;
+	kv.nk = p - kbuf;
+	p = packdkey(vbuf, sizeof(vbuf), -1, "");
+	kv.v = vbuf;
+	kv.nv = p - vbuf;
+	setval(r, &kv);
+}
+
+/*
+ * Seed the snapshot tree leaf s with the initial labels and trees:
+ * gen 0 "empty", gen 1 "adm" (rooted at a), gen 2 "main" (rooted
+ * at r).  Labels first, then snaps — keys go in in sorted order.
+ */
+static void
+initsnap(Blk *s, Blk *r, Blk *a)
+{
+	char *p, *e, buf[Kvmax];
+	Tree t;
+	Kvp kv;
+
+	lbl2kv("adm", 1, Lmut|Ltsnap, &kv, buf, sizeof(buf));
+	setval(s, &kv);
+	lbl2kv("empty", 0, 0, &kv, buf, sizeof(buf));
+	setval(s, &kv);
+	lbl2kv("main", 2, Lmut|Ltsnap, &kv, buf, sizeof(buf));
+	setval(s, &kv);
+
+	p = buf;
+	e = p + sizeof(buf);
+
+	/* empty */
+	kv.k = p;
+	/* was packsnap(buf, ...): p == buf here, but use p for consistency */
+	p = packsnap(p, e - p, 0);
+	kv.nk = p - kv.k;
+	kv.v = p;
+	memset(&t, 0, sizeof(Tree));
+	t.flag = 0;
+	t.nref = 2;
+	t.nlbl = 1;
+	t.ht = 1;
+	t.gen = fs->nextgen++;
+	t.pred = 0;
+	t.succ = 2;
+	t.bp = r->bp;
+	p = packtree(p, e - p, &t);
+	kv.nv = p - kv.v;
+	setval(s, &kv);
+
+	p = buf;
+	e = p + sizeof(buf);
+
+	/* adm */
+	kv.k = p;
+	p = packsnap(p, e - p, 1);
+	kv.nk = p - kv.k;
+	kv.v = p;
+	memset(&t, 0, sizeof(Tree));
+	t.nref = 0;
+	t.nlbl = 1;
+	t.ht = 1;
+	t.gen = fs->nextgen++;
+	t.pred = 0;
+	t.succ = -1;
+	t.bp = a->bp;
+	p = packtree(p, e - p, &t);
+	kv.nv = p - kv.v;
+	setval(s, &kv);
+
+	p = buf;
+	e = p + sizeof(buf);
+
+	/* main */
+	kv.k = p;
+	p = packsnap(p, e - p, 2);
+	kv.nk = p - kv.k;
+	kv.v = p;
+	memset(&t, 0, sizeof(Tree));
+	t.nref = 0;
+	t.nlbl = 1;
+	t.ht = 1;
+	t.gen = fs->nextgen++;
+	t.pred = 0;
+	t.succ = -1;
+	t.bp = r->bp;
+	p = packtree(p, e - p, &t);
+	kv.nv = p - kv.v;
+	setval(s, &kv);
+}
+
+/*
+ * Lay out a fresh arena at hdaddr: two header blocks (primary and
+ * backup) followed by the arena body, plus an initial free-list log
+ * that marks the whole body free except the log block itself.
+ */
+static void
+initarena(Arena *a, uvlong hdaddr, vlong asz)
+{
+	Blk *b, *h0, *h1;
+	uvlong addr;
+	char *p;
+
+	b = cachepluck();
+	addr = hdaddr+2*Blksz;	/* leave room for arena hdr */
+
+	a->loghd.addr = -1;
+	a->loghd.hash = -1;
+	a->loghd.gen = -1;
+
+	memset(b->buf, 0, sizeof(b->buf));
+	b->type = Tlog;
+	b->bp.addr = addr;
+	b->logsz = 0;
+	b->logp = (Bptr){-1, -1, -1};
+	b->data = b->buf + Loghdsz;
+	setflag(b, Bdirty);
+
+	/* initial log: free the whole body, then re-allocate the log block */
+	p = b->buf + Loghdsz;
+	b->logp = (Bptr){-1, -1, -1};
+	PACK64(p, addr|LogFree);	p += 8;	/* addr */
+	PACK64(p, asz-2*Blksz);	p += 8;	/* len */
+	PACK64(p, b->bp.addr|LogAlloc);	p += 8;	/* addr */
+	PACK64(p, Blksz);	p += 8;	/* len */
+	PACK64(p, (uvlong)LogSync);	p += 8;	/* barrier */
+	b->logsz = p - b->data;
+	finalize(b);
+	syncblk(b);
+	dropblk(b);
+
+	a->loghd = b->bp;
+	a->loghd.gen = -1;
+	a->size = asz;
+	a->used = Blksz;
+
+	h0 = cachepluck();
+	h1 = cachepluck();
+
+	memset(h0->buf, 0, sizeof(h0->buf));
+	h0->type = Tarena;
+	h0->bp.addr = hdaddr;
+	h0->data = h0->buf+2;
+	finalize(h0);
+
+	memset(h1->buf, 0, sizeof(h1->buf));
+	h1->type = Tarena;
+	h1->bp.addr = hdaddr+Blksz;
+	h1->data = h1->buf+2;
+	finalize(h1);
+
+	/* both header copies carry the same packed arena state */
+	packarena(h0->data, Arenasz, a);
+	packarena(h1->data, Arenasz, a);
+	finalize(h0);
+	finalize(h1);
+	syncblk(h0);
+	syncblk(h1);
+	a->h0 = h0;
+	a->h1 = h1;
+}
+
+/*
+ * Create a fresh file system on dev: carve the disk into arenas,
+ * build the initial main and adm trees, the snapshot tree, and the
+ * user table, then write both superblocks.
+ */
+void
+reamfs(char *dev)
+{
+	Blk *sb0, *sb1, *tb, *mb, *ab, *ub;
+	vlong sz, asz, off;
+	Mount *mnt, *adm;
+	Arena *a;
+	char *utab;
+	Dir *d;
+	int i;
+
+	if(waserror())
+		sysfatal("ream %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+	sz = d->length;
+	free(d);
+
+	print("reaming %s\n", dev);
+	if(sz < 128*MiB+Blksz)
+		sysfatal("ream: disk too small");
+	mnt = emalloc(sizeof(Mount), 1);
+	mnt->root = mallocz(sizeof(Tree), 1);
+	adm = mallocz(sizeof(Mount), 1);
+	adm->root = mallocz(sizeof(Tree), 1);
+
+	/* body between the two superblocks; one arena per 4TiB, clamped to [8, 32] */
+	sz = sz - sz%Blksz - 2*Blksz;
+	fs->narena = (sz + 4096ULL*GiB - 1) / (4096ULL*GiB);
+	if(fs->narena < 8)
+		fs->narena = 8;
+	if(fs->narena >= 32)
+		fs->narena = 32;
+	fs->arenas = emalloc(fs->narena*sizeof(Arena), 1);
+
+
+	off = Blksz;
+	asz = sz/fs->narena;
+	asz = asz - (asz % Blksz) - 2*Blksz;
+
+	/* superblocks live at the first and last block of the device */
+	sb0 = cachepluck();
+	sb1 = cachepluck();
+	sb0->bp = (Bptr){0, -1, -1};
+	sb1->bp = (Bptr){sz+Blksz, -1, -1};
+
+	fs->arenabp = emalloc(fs->narena * sizeof(Bptr), 1);
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		print("\tarena %d: %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(a, off, asz);
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+
+	}
+
+	/* reload what we just wrote so allocation below uses the real free lists */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+	}
+
+	if((mb = newblk(mnt->root, Tleaf, 0)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(mb);
+	initroot(mb);
+	finalize(mb);
+	syncblk(mb);
+
+	mnt->root->ht = 1;
+	mnt->root->bp = mb->bp;
+
+	/* adm tree: root leaf plus the data block for /adm/users */
+	if((ab = newblk(adm->root, Tleaf, 0)) == nil)
+		sysfatal("ream: allocate root: %r");
+	if((ub = newblk(adm->root, Tdat, 0)) == nil)
+		sysfatal("ream: allocate root: %r");
+	holdblk(ab);
+	holdblk(ub);
+	utab = smprint(
+		"-1:adm::%s\n"
+		"0:none::\n"
+		"1:%s:%s:\n",
+		reamuser, reamuser, reamuser);
+	memcpy(ub->data, utab, strlen(utab));
+	finalize(ub);
+	syncblk(ub);
+	initadm(ab, ub, strlen(utab));
+	finalize(ab);
+	syncblk(ab);
+
+	adm->root->ht = 1;
+	adm->root->bp = ab->bp;
+
+	/*
+	 * Now that we have a completely empty fs, give it
+	 * a single snap block that the tree will insert
+	 * into, and take a snapshot as the initial state.
+	 */
+	if((tb = newblk(mnt->root, Tleaf, 0)) == nil)
+		sysfatal("ream: allocate snaps: %r");
+	holdblk(tb);
+	initsnap(tb, mb, ab);
+	finalize(tb);
+	syncblk(tb);
+
+	fs->snap.bp = tb->bp;
+	fs->snap.ht = 1;
+	fs->snapdl.hd.addr = -1;
+	fs->snapdl.hd.hash = -1;
+	fs->snapdl.tl.addr = -1;
+	fs->snapdl.tl.hash = -1;
+	fs->nextqid = Nreamqid;
+
+	/* NOTE(review): each block is dropped twice below — once for the
+	 * newblk reference and once for holdblk; confirm the pairing.
+	 * The second nextqid assignment after this loop is redundant. */
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+	fs->nextqid = Nreamqid;
+
+	/*
+	 * We need to write back all of the arenas
+	 * with the updated free lists
+	 */
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		finalize(a->logtl);
+		syncblk(a->logtl);
+		packarena(a->h0->data, Blksz, a);
+		finalize(a->h0);
+		syncblk(a->h0);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h1);
+		syncblk(a->h1);
+		fs->arenabp[i] = a->h0->bp;
+		dropblk(a->h0);
+		dropblk(a->h1);
+	}
+
+	dropblk(mb);
+	dropblk(ab);
+	dropblk(ub);
+	dropblk(tb);
+
+	/*
+	 * Finally, write back the superblock and backup
+	 * superblock.
+	 */
+	packsb(sb0->buf, Blksz, fs);
+	packsb(sb1->buf, Blksz, fs);
+	finalize(sb0);
+	finalize(sb1);
+	syncblk(sb0);
+	syncblk(sb1);
+	dropblk(sb0);
+	dropblk(sb1);
+	free(mnt);
+	poperror();
+}
+
+/*
+ * Grow an existing fs onto a larger partition by appending four new
+ * arenas after the last existing one, then rewriting the superblock
+ * at both the old primary location and the new end of disk.
+ */
+void
+growfs(char *dev)
+{
+	vlong oldsz, newsz, asz, off, eb;
+	int i, narena;
+	Arena *a;
+	Bptr bp;
+	Dir *d;
+
+	if(waserror())
+		sysfatal("grow %s: %s\n", dev, errmsg());
+	if((fs->fd = open(dev, ORDWR)) == -1)
+		sysfatal("open %s: %r", dev);
+	if((d = dirfstat(fs->fd)) == nil)
+		sysfatal("ream: %r");
+
+	bp = (Bptr){0, -1, -1};
+	fs->sb0 = getblk(bp, GBnochk);
+	unpacksb(fs, fs->sb0->buf, Blksz);
+	if((fs->arenas = calloc(fs->narena, sizeof(Arena))) == nil)
+		sysfatal("malloc: %r");
+	for(i = 0; i < fs->narena; i++){
+		a = &fs->arenas[i];
+		loadarena(a, fs->arenabp[i]);
+		fs->arenabp[i] = a->h0->bp;
+	}
+	/* old end of data: last arena's header address plus its span */
+	a = &fs->arenas[fs->narena-1];
+	oldsz = a->h0->bp.addr + a->size + 2*Blksz;
+	newsz = d->length - d->length%Blksz - 2*Blksz;
+	/* NOTE(review): %lld paired with 64*MiB — confirm MiB is a 64-bit constant */
+	if(newsz - oldsz < 64*MiB)
+		sysfatal("new arenas too small (%lld < %lld), not growing", newsz - oldsz, 64*MiB);
+	asz = (newsz - oldsz)/4;
+	asz = asz - asz % Blksz - 2*Blksz;
+	narena = fs->narena + 4;
+	assert(oldsz % Blksz == 0);
+	if((fs->arenas = realloc(fs->arenas, narena*sizeof(Arena))) == nil)
+		error(Enomem);
+	if((fs->arenabp = realloc(fs->arenabp, narena*sizeof(Bptr))) == nil)
+		error(Enomem);
+
+	off = oldsz;
+	for(i = fs->narena; i < narena; i++){
+		a = &fs->arenas[i];
+		print("\tnew arena %d: adding %lld blocks at %llx\n", i, asz/Blksz, off);
+		initarena(&fs->arenas[i], off, asz);
+		loadarena(a, a->h0->bp);
+		loadlog(a, a->loghd);
+		a = &fs->arenas[i];
+		packarena(a->h0->data, Blksz, a);
+		packarena(a->h1->data, Blksz, a);
+		finalize(a->h0);
+		finalize(a->h1);
+		syncblk(a->h0);
+		syncblk(a->h1);
+
+		fs->arenabp[i] = a->h0->bp;
+		off += asz+2*Blksz;
+	}
+	fs->narena = narena;
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	/*
+	 * We're being a bit tricksy here: because we're on a bigger
+	 * partition, we don't know where the end is; just load the
+	 * first block, and patch the address in to the right place
+	 * when we write it back.
+	 */
+	eb = d->length;
+	eb = eb - (eb%Blksz) - Blksz;
+	fs->sb0->bp = (Bptr){eb, -1, -1};
+	packsb(fs->sb0->buf, Blksz, fs);
+	finalize(fs->sb0);
+	syncblk(fs->sb0);
+	free(d);
+	poperror();
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/snap.c
@@ -1,0 +1,617 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "atomic.h"
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Flushes a deadlist's in-memory append block to disk,
+ * making it the new list head, and records the updated
+ * list in the snapshot tree.  No-op if nothing is
+ * buffered.
+ */
+static void
+dlflush(Dlist *dl)
+{
+	char kvbuf[512];
+	Msg m;
+
+	if(dl->ins == nil)
+		return;
+	traceb("dlflush", dl->ins->bp);
+	enqueue(dl->ins);
+	dropblk(dl->ins);
+	dl->hd = dl->ins->bp;
+	/* single-block list: the tail is the same block as the head */
+	if(dl->tl.addr == dl->hd.addr)
+		dl->tl = dl->hd;
+	dl->ins = nil;
+	/* special case: the snap dlist has gen -1, skip it */
+	if(dl->gen != -1){
+		m.op = Oinsert;
+		dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+		btupsert(&fs->snap, &m, 1);
+	}
+}
+
+/*
+ * Unlinks a deadlist from the cache's LRU list, and,
+ * when hdel is set, from its hash chain as well.
+ */
+static void
+dlcachedel(Dlist *dl, int hdel)
+{
+	uint h;
+	Dlist *d, **p;
+
+	h = ihash(dl->gen) ^ ihash(dl->bgen);
+	if(hdel){
+		/* walk the chain to find the link pointing at dl */
+		p = &fs->dlcache[h % fs->dlcmax];
+		for(d = *p; d != nil; d = d->chain){
+			if(d->gen == dl->gen && d->bgen == dl->bgen)
+				break;
+			p = &d->chain;
+		}
+		if(d != nil)
+			*p = d->chain;
+	}
+	if(dl == fs->dlhead)
+		fs->dlhead = dl->cnext;
+	if(dl == fs->dltail)
+		fs->dltail = dl->cprev;
+	if(dl->cnext != nil)
+		dl->cnext->cprev = dl->cprev;
+	if(dl->cprev != nil)
+		dl->cprev->cnext = dl->cnext;
+	dl->cnext = nil;
+	dl->cprev = nil;
+}
+
+/*
+ * Looks up the deadlist for (gen, bgen) in the hash
+ * cache; on a hit it is unlinked from the LRU list
+ * (but left on the hash chain) and returned.
+ */
+static Dlist*
+dlcacheget(vlong gen, vlong bgen)
+{
+	Dlist *dl;
+	uint h;
+
+	h = ihash(gen) ^ ihash(bgen);
+	for(dl = fs->dlcache[h % fs->dlcmax]; dl != nil; dl = dl->chain)
+		if(dl->gen == gen && dl->bgen == bgen)
+			break;
+	if(dl != nil)
+		dlcachedel(dl, 0);
+	return dl;
+}
+
+/*
+ * Gets the deadlist for (gen, bgen): first from the
+ * in-memory cache, then from the snap tree on disk;
+ * if it exists in neither, a fresh empty one is
+ * created and recorded.  The result is linked into
+ * the cache's hash chain.
+ */
+static Dlist*
+getdl(vlong gen, vlong bgen)
+{
+	char kbuf[Dlksz], kvbuf[Dlkvpsz];
+	Dlist *dl, **p;
+	uint h;
+	Msg m;
+	Kvp kv;
+	Key k;
+
+	if((dl = dlcacheget(gen, bgen)) != nil)
+		return dl;
+	dl = emalloc(sizeof(Dlist), 1);
+	if(waserror()){
+		free(dl);
+		nexterror();
+	}
+	kbuf[0] = Kdlist;
+	PACK64(kbuf+1, gen);
+	PACK64(kbuf+9, bgen);
+	k.k = kbuf;
+	k.nk = sizeof(kbuf);
+
+	/* load up existing dlist */
+	if(btlookup(&fs->snap, &k, &kv, kvbuf, sizeof(kvbuf))){
+		kv2dlist(&kv, dl);
+		goto Found;
+	}
+
+	/* create a new one if it didn't exist */
+	dl->gen = gen;
+	dl->bgen = bgen;
+	dl->hd.addr = -1;
+	dl->tl.addr = -1;
+	dl->ins = nil;
+
+	m.op = Oinsert;
+	dlist2kv(dl, &m, kvbuf, sizeof(kvbuf));
+	btupsert(&fs->snap, &m, 1);
+Found:
+	poperror();
+	h = ihash(gen) ^ ihash(bgen);
+	p = &fs->dlcache[h % fs->dlcmax];
+	dl->chain = *p;
+	*p = dl;
+	return dl;
+}
+
+/*
+ * Returns a deadlist to the cache, placing it at the
+ * head of the LRU list; while the cache is over its
+ * limit, the least recently used entries are flushed
+ * and evicted.
+ */
+void
+putdl(Dlist *dl)
+{
+	Dlist *dt;
+
+	/* the snap deadlist (gen -1) is a static singleton, never cached */
+	if(dl->gen == -1)
+		return;
+	dlcachedel(dl, 0);
+	while(fs->dltail != nil && fs->dlcount >= fs->dlcmax){
+		dt = fs->dltail;
+		dlflush(dt);
+		dlcachedel(dt, 1);
+		/*
+		 * NOTE(review): dlflush() above already dropped dt->ins
+		 * and set it to nil, so this drops a nil block -- confirm
+		 * dropblk(nil) is a no-op.  Also fs->dlcount is not
+		 * adjusted anywhere in this function; verify the eviction
+		 * loop can terminate once over the limit.
+		 */
+		dropblk(dt->ins);
+		free(dt);
+	}
+
+	dl->cprev = nil;
+	dl->cnext = fs->dlhead;
+	if(fs->dltail == nil)
+		fs->dltail = dl;
+	if(fs->dlhead != nil)
+		fs->dlhead->cprev = dl;
+	fs->dlhead = dl;
+}
+
+/*
+ * Frees a deadlist: deletes its record from the snap
+ * tree, then queues its log blocks -- and, when
+ * docontents is set, every dead block address they
+ * record -- to be freed directly.
+ */
+void
+freedl(Dlist *dl, int docontents)
+{
+	char buf[Kvmax];
+	Arena *a;
+	Qent qe;
+	Bptr bp;
+	Msg m;
+	Blk *b;
+	char *p;
+
+	bp = dl->hd;
+	if(dl->gen != -1){
+		m.op = Odelete;
+		dlist2kv(dl, &m, buf, sizeof(buf));
+		btupsert(&fs->snap, &m, 1);
+	}
+	while(bp.addr != -1){
+		b = getblk(bp, 0);
+		/*
+		 * Because these deadlists are dead-dead at this point,
+		 * they'll never be read from again; we can avoid worrying
+		 * about deferred reclamation, and queue them up to be freed
+		 * directly, which means we don't need to worry about waiting
+		 * for a quiescent state, and the associated out-of-block
+		 * deadlocks that come with it.
+		 */
+		if(docontents){
+			for(p = b->data; p != b->data+b->logsz; p += 8){
+				qe.op = Qfree;
+				qe.bp.addr = UNPACK64(p);
+				qe.bp.hash = -1;
+				qe.bp.gen = -1;
+				qe.b = nil;
+				a = getarena(qe.bp.addr);
+				qput(a->sync, qe);
+				traceb("dlclear", qe.bp);
+			}
+		}
+		bp = b->logp;
+		qe.op = Qfree;
+		qe.bp = b->bp;
+		qe.b = b;
+		a = getarena(qe.bp.addr);
+		qput(a->sync, qe);
+		traceb("dlfreeb", qe.bp);
+	}
+}
+
+/*
+ * Chains the deadlist for (gen, bgen) onto the tail of
+ * the deadlist for (merge, bgen), creating the dest if
+ * needed, and updates both records in the snap tree.
+ */
+static void
+mergedl(vlong merge, vlong gen, vlong bgen)
+{
+	char buf[2][Kvmax];
+	Dlist *d, *m;
+	Msg msg[2];
+	Blk *b;
+
+	d = nil;
+	m = nil;
+	if(waserror()){
+		/*
+		 * NOTE(review): if the first getdl below errors, m and d
+		 * are still nil here -- confirm putdl tolerates nil.
+		 */
+		putdl(m);
+		putdl(d);
+		nexterror();
+	}
+	d = getdl(merge, bgen);
+	m = getdl(gen, bgen);
+	assert(d != m);
+	/*
+	 * If the dest dlist didn't exist,
+	 * just move the merge dlist over
+	 * and be done with it, otherwise
+	 * chain onto the existing dlist
+	 * tail.
+	 */
+	if(d->hd.addr == -1){
+		assert(d->ins == nil);
+		d->hd = m->hd;
+		d->tl = m->tl;
+		d->ins = m->ins;
+		if(d->ins != nil)
+			holdblk(d->ins);
+	}else{
+		if(m->ins != nil){
+			enqueue(m->ins);
+			dropblk(m->ins);
+			m->ins = nil;
+		}
+		/* link the source chain off the dest's tail block */
+		b = getblk(d->tl, 0);
+		b->logp = m->hd;
+		assert(d->hd.addr != m->hd.addr);
+		finalize(b);
+		syncblk(b);
+		dropblk(b);
+	}
+	msg[0].op = Odelete;
+	dlist2kv(m, &msg[0], buf[0], sizeof(buf[0]));
+	msg[1].op = Oinsert;
+	dlist2kv(d, &msg[1], buf[1], sizeof(buf[1]));
+	btupsert(&fs->snap, msg, 2);
+	putdl(m);
+	putdl(d);
+	poperror();
+}
+
+/*
+ * Walks the deadlists of a snapshot being deleted:
+ * lists whose blocks were born at or before prev are
+ * chained onto the successor (or, with no successor,
+ * onto the predecessor); the rest died with this
+ * snapshot and are freed outright.  Any of the
+ * successor's lists with blocks born after prev are
+ * then freed as well.
+ */
+static void
+reclaimblocks(vlong gen, vlong succ, vlong prev)
+{
+	char pfx[9];
+	Dlist dl;
+	Scan s;
+
+	pfx[0] = Kdlist;
+	PACK64(pfx+1, gen);
+	btnewscan(&s, pfx, sizeof(pfx));
+	btenter(&fs->snap, &s);
+	while(1){
+		if(!btnext(&s, &s.kv))
+			break;
+		kv2dlist(&s.kv, &dl);
+
+		if(succ != -1 && dl.bgen <= prev)
+			mergedl(succ, dl.gen, dl.bgen);
+		else if(dl.bgen <= prev)
+			mergedl(prev, dl.gen, dl.bgen);
+		else
+			freedl(&dl, 1);
+	}
+	btexit(&s);
+	if(succ != -1){
+		pfx[0] = Kdlist;
+		PACK64(pfx+1, succ);
+		btnewscan(&s, pfx, sizeof(pfx));
+		btenter(&fs->snap, &s);
+		while(1){
+			if(!btnext(&s, &s.kv))
+				break;
+			kv2dlist(&s.kv, &dl);
+			if(dl.bgen > prev)
+				freedl(&dl, 1);
+		}
+		btexit(&s);
+	}
+}
+
+/*
+ * Removes a label from a snapshot, allowing
+ * it to be reclaimed if it is not a direct
+ * predecessor of more than one other snapshot.
+ *
+ * If it has one successor and no label, then
+ * it will be merged with that successor.
+ */
+void
+delsnap(Tree *t, vlong succ, char *name)
+{
+	char *p, buf[4][Kvmax];
+	int nm, deltree;
+	Mount *mnt;
+	Msg m[4];
+
+	nm = 0;
+	deltree = 0;
+	if(name != nil){
+		/* the built-in snapshots can never be deleted */
+		if(strcmp(name, "dump") == 0
+		|| strcmp(name, "empty") == 0
+		|| strcmp(name, "adm") == 0)
+			error(Ename);
+
+		m[nm].op = Odelete;
+		m[nm].k = buf[nm];
+		p = packlbl(buf[nm], sizeof(buf[nm]), name);
+		m[nm].nk = p - m[nm].k;
+		m[nm].v = nil;
+		m[nm].nv = 0;
+		t->nlbl--;
+		nm++;
+	}
+
+	/* last reference: unlink the snap from its chain and delete it */
+	if(t->nlbl == 0 && t->nref <= 1){
+		deltree = 1;
+		m[nm].op = Orelink;
+		retag2kv(t->pred, succ, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+		nm++;
+		if(t->succ != -1){
+			m[nm].op = Oreprev;
+			retag2kv(t->succ, t->pred, 0, 0, &m[nm], buf[nm], sizeof(buf[nm]));
+			nm++;
+		}
+		m[nm].op = Odelete;
+		m[nm].k = buf[nm];
+		p = packsnap(buf[nm], sizeof(buf[nm]), t->gen);
+		m[nm].nk = p - m[nm].k;
+		m[nm].v = nil;
+		m[nm].nv = 0;
+		nm++;
+	}
+	assert(nm <= nelem(m));
+	/* flush pending deadlist updates before rewriting the snap tree */
+	dlsync();
+	btupsert(&fs->snap, m, nm);
+	if(deltree){
+		reclaimblocks(t->gen, succ, t->pred);
+		/* patch the in-memory chain pointers of mounted snaps */
+		for(mnt = agetp(&fs->mounts); mnt != nil; mnt = mnt->next){
+			if(mnt->root->gen == t->succ)
+				mnt->root->pred = t->pred;
+			if(mnt->root->gen == t->pred)
+				mnt->root->succ = t->succ;
+		}
+	}
+}
+
+/*
+ * Attaches a label to a tree, incrementing
+ * its reference count. This labelled snapshot
+ * will show up in the dump.
+ */
+void
+tagsnap(Tree *t, char *name, int flg)
+{
+	char buf[3][Kvmax];
+	Msg m[3];
+	Tree *n;
+	int i;
+
+	/* the built-in snapshot names are reserved */
+	if(strcmp(name, "dump") == 0
+	|| strcmp(name, "empty") == 0
+	|| strcmp(name, "adm") == 0)
+		error(Ename);
+
+	i = 0;
+	n = nil;
+	if(waserror()){
+		free(n);
+		nexterror();
+	}
+	if(flg & Lmut){
+		/* mutable label: fork a fresh tree off t's current state */
+		n = emalloc(sizeof(Tree), 1);
+		n->memref = 1;
+		n->dirty = 0;
+		n->nlbl = 1;
+		n->nref = 0;
+		n->ht = t->ht;
+		n->bp = t->bp;
+		n->succ = -1;
+		n->pred = t->gen;
+		n->base = t->gen;
+		n->gen = aincv(&fs->nextgen, 1);
+		n->memgen = aincv(&fs->nextgen, 1);
+
+		t->nref++;
+		m[i].op = Orelink;
+		retag2kv(t->gen, t->succ, 0, 1, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+		m[i].op = Oinsert;
+		lbl2kv(name, n->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+		m[i].op = Oinsert;
+		tree2kv(n, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+	}else{
+		t->nlbl++;
+		m[i].op = Orelink;
+		retag2kv(t->gen, t->succ, 1, 0, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+
+		m[i].op = Oinsert;
+		/*
+		 * NOTE(review): t->nlbl was already incremented above and
+		 * the Orelink message records only +1 on disk, so memory
+		 * and disk counts diverge here; t->pred is also pointed at
+		 * t's own gen.  Confirm the double increment and the
+		 * self-predecessor are intentional.
+		 */
+		t->pred = t->gen;
+		t->nlbl++;
+		lbl2kv(name, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+		i++;
+	}
+	btupsert(&fs->snap, m, i);
+	poperror();
+	free(n);
+}
+
+/*
+ * Updates a snapshot; keeps the generation the same if possible,
+ * otherwise moves to a new generation. A snapshot may only stay
+ * at the same generation as long as it is at the tip of a snapshot
+ * list; once it's observable by a derived snapshot it must be
+ * immutable.
+ */
+void
+updatesnap(Tree **r, Tree *o, char *lbl, int flg)
+{
+	char buf[4][Kvmax];
+	Msg m[4];
+	Tree *t;
+	int i;
+
+	/* nothing changed since the last update: nothing to do */
+	if(!o->dirty)
+		return;
+
+	traceb("updatesnap", o->bp);
+	/* update the old kvp */
+	o->nlbl--;
+	o->nref++;
+
+	/* create the new one */
+
+	t = emalloc(sizeof(Tree), 1);
+	if(waserror()){
+		free(t);
+		nexterror();
+	}
+	t->memref = 1;
+	t->dirty = 0;
+
+	t->nlbl = 1;
+	t->nref = 0;
+	t->ht = o->ht;
+	t->bp = o->bp;
+	t->succ = -1;
+	t->base = o->base;
+	t->gen = o->memgen;
+	t->memgen = aincv(&fs->nextgen, 1);
+
+	i = 0;
+	m[i].op = Orelink;
+	if(o->nlbl == 0 && o->nref == 1){
+		/* o is only reachable from us: link pred straight to the new tip */
+		t->pred = o->pred;
+		retag2kv(t->pred, t->gen, 0, 0, &m[i], buf[i], sizeof(buf[i]));
+	}else{
+		t->pred = o->gen;
+		retag2kv(t->pred, t->gen, -1, 1, &m[i], buf[i], sizeof(buf[i]));
+	}
+	i++;
+
+	m[i].op = Oinsert;
+	tree2kv(t, &m[i], buf[i], sizeof(buf[i]));
+	i++;
+	m[i].op = Oinsert;
+	lbl2kv(lbl, t->gen, flg, &m[i], buf[i], sizeof(buf[i]));
+	i++;
+	btupsert(&fs->snap, m, i);
+
+	/* only update the dirty status after we sync */
+	o->dirty = 0;
+
+	/* this was the last ref to the snap */
+	if(o->nlbl == 0 && o->nref == 1)
+		delsnap(o, t->gen, nil);
+	closesnap(o);
+	asetp(r, t);
+	poperror();
+}
+
+/*
+ * open snapshot by label, returning a tree.
+ * Returns nil if the label does not exist; the
+ * label's flags are reported through flg when
+ * non-nil.
+ */
+Tree*
+opensnap(char *label, int *flg)
+{
+	char *p, buf[Kvmax];
+	Tree *t;
+	vlong gen;
+	Kvp kv;
+	Key k;
+
+	/* Klabel{"name"} => Ksnap{id} */
+	if((p = packlbl(buf, sizeof(buf), label)) == nil)
+		return nil;
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		return nil;
+	assert(kv.nv == 1+8+4);
+	gen = UNPACK64(kv.v + 1);
+	if(flg != nil)
+		*flg = UNPACK32(kv.v + 1+8);
+
+	t = mallocz(sizeof(Tree), 1);
+	if(waserror()){
+		free(t);
+		nexterror();
+	}
+	/* Ksnap{id} => packed Tree */
+	p = packsnap(buf, sizeof(buf), gen);
+	k.k = buf;
+	k.nk = p - buf;
+	if(!btlookup(&fs->snap, &k, &kv, buf, sizeof(buf)))
+		broke(Efs);
+	unpacktree(t, kv.v, kv.nv);
+	t->memref = 1;
+	t->memgen = aincv(&fs->nextgen, 1);
+	poperror();
+	return t;
+}
+
+/*
+ * close snapshot, flushing and freeing in-memory
+ * representation.  The tree is handed to the limbo
+ * list so it is only reclaimed once all concurrent
+ * readers have moved past their epoch.
+ */
+void
+closesnap(Tree *t)
+{
+	Bfree *f;
+
+	if(t == nil || adec(&t->memref) != 0)
+		return;
+	/*
+	 * emalloc errors out on allocation failure, unlike the
+	 * bare malloc used previously, which would have been
+	 * dereferenced unchecked below.
+	 */
+	f = emalloc(sizeof(Bfree), 1);
+	f->op = DFtree;
+	f->t = t;
+	limbo(f);
+}
+
+/*
+ * Flushes the global snap deadlist and every deadlist
+ * on the cache's LRU list out to disk.
+ */
+void
+dlsync(void)
+{
+	Dlist *dl, *n;
+
+	tracem("dlsync");
+	dlflush(&fs->snapdl);
+	for(dl = fs->dlhead; dl != nil; dl = n){
+		n = dl->cnext;
+		dlflush(dl);
+	}
+}
+
+/*
+ * Marks a block as killed by the tree
+ * t, which means that it will be free
+ * for use after t is reclaimed.
+ *
+ * t must be an active snapshot with
+ * no successors.
+ */
+void
+killblk(Tree *t, Bptr bp)
+{
+	Dlist *dl;
+	Blk *b;
+	char *p;
+
+	/*
+	 * When we have a forked snap, blocks allocated before the fork
+	 * are the responsibility of the other chain; in this chain, we
+	 * leak it and let the last reference in the other chain clean up
+	 */
+	if(t == &fs->snap)
+		dl = &fs->snapdl;
+	else if(bp.gen > t->base)
+		dl = getdl(t->memgen, bp.gen);
+	else
+		return;
+	if(waserror()){
+		putdl(dl);
+		nexterror();
+	}
+	/* start a fresh log block when the current one is absent or full */
+	if(dl->ins == nil || Logspc - dl->ins->logsz < Logslop){
+		b = newblk(&fs->snap, Tdlist, 0);
+		if(dl->ins != nil){
+			enqueue(dl->ins);
+			dropblk(dl->ins);
+		}
+		if(dl->tl.addr == -1)
+			dl->tl = b->bp;
+		b->logp = dl->hd;
+		dl->hd = b->bp;
+		dl->ins = b;
+		cacheins(b);
+	}
+	/* append the dead block's address to the insertion log */
+	p = dl->ins->data + dl->ins->logsz;
+	dl->ins->logsz += 8;
+	setflag(dl->ins, Bdirty);
+	PACK64(p, bp.addr);
+	poperror();
+	putdl(dl);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/tree.c
@@ -1,0 +1,1540 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+typedef struct Path Path;
+
+/*
+ * One level of the root-to-leaf path built while flushing
+ * messages down the tree: the first group of fields flows
+ * down as we descend, the second records what the flush
+ * did so the parent level can patch itself up.
+ */
+struct Path {
+	/* Flowing down for flush */
+	Msg	*ins;	/* inserted values, bounded by lo..hi */
+	Blk	*b;	/* to shadow */
+	int	idx;	/* insert at */
+	int	lo;	/* key range */
+	int	hi;	/* key range */
+	int	sz;	/* size of range */
+
+	/* Flowing up from flush */
+	int	op;	/* change done along path */
+	Blk	*m;	/* node merged against, for post-update free */
+	Blk	*nl;	/* new left */
+	Blk	*nr;	/* new right, if we split or rotated */
+	int	midx;	/* modification index */
+	int	npull;	/* number of messages successfully pulled */
+	int	pullsz;	/* size of pulled messages */
+};
+
+/* free b, if it was actually allocated */
+#define efreeblk(t, b) do { \
+	if(b != nil) \
+		freeblk(t, b, b->bp); \
+	} while(0)
+
+/*
+ * Insertion sort: stable, and cheap for the small,
+ * mostly-ordered message arrays we feed it.
+ */
+static void
+stablesort(Msg *m, int nm)
+{
+	Msg tmp;
+	int i, j;
+
+	for(i = 1; i < nm; i++)
+		for(j = i; j > 0 && keycmp(&m[j-1], &m[j]) > 0; j--){
+			tmp = m[j];
+			m[j] = m[j-1];
+			m[j-1] = tmp;
+		}
+}
+
+/*
+ * Copies src's key bytes into buf and points dst at
+ * the copy; buf must hold at least src->nk bytes.
+ */
+void
+cpkey(Key *dst, Key *src, char *buf, int nbuf)
+{
+	assert(src->nk <= nbuf);
+	memmove(buf, src->k, src->nk);
+	dst->nk = src->nk;
+	dst->k = buf;
+}
+
+/*
+ * Copies src's key and value into buf, packed
+ * adjacently, and points dst at the copies; src
+ * and dst may be the same Kvp.
+ */
+void
+cpkvp(Kvp *dst, Kvp *src, char *buf, int nbuf)
+{
+	assert(src->nk+src->nv <= nbuf);
+	/* copy the data out before repointing dst, in case dst == src */
+	memmove(buf, src->k, src->nk);
+	memmove(buf+src->nk, src->v, src->nv);
+	dst->nk = src->nk;
+	dst->nv = src->nv;
+	dst->k = buf;
+	dst->v = buf + dst->nk;
+}
+
+/*
+ * Byte-wise lexicographic key comparison; on a shared
+ * prefix the shorter key orders first.  Returns -1,
+ * 0, or 1.
+ */
+int
+keycmp(Key *a, Key *b)
+{
+	int r, min;
+
+	min = (a->nk < b->nk) ? a->nk : b->nk;
+	r = memcmp(a->k, b->k, min);
+	if(r != 0)
+		return (r < 0) ? -1 : 1;
+	if(a->nk == b->nk)
+		return 0;
+	return (a->nk < b->nk) ? -1 : 1;
+}
+
+/*
+ * Space a message takes in a pivot's buffer.
+ */
+static int
+msgsz(Msg *m)
+{
+	int sz;
+
+	sz = 2;			/* offset slot */
+	sz += 1;		/* op */
+	sz += 2 + m->nk;	/* key */
+	sz += 2 + m->nv;	/* value */
+	return sz;
+}
+
+/*
+ * Space a key/value pair takes in a block's value area.
+ */
+static int
+valsz(Kvp *kv)
+{
+	int sz;
+
+	sz = 2;			/* offset slot */
+	sz += 2 + kv->nk;	/* key */
+	sz += 2 + kv->nv;	/* value */
+	return sz;
+}
+
+/*
+ * Reads the i'th key/value pair from b's value table;
+ * kv points directly into the block's data.
+ */
+void
+getval(Blk *b, int i, Kvp *kv)
+{
+	char *p;
+	int o;
+
+	assert(i >= 0 && i < b->nval);
+	p = b->data + 2*i;
+	o = UNPACK16(p);	p = b->data + o;
+	kv->nk = UNPACK16(p);	p += 2;
+	kv->k = p;		p += kv->nk;
+	kv->nv = UNPACK16(p);	p += 2;
+	kv->v = p;
+}
+
+/*
+ * Unpacks a pivot entry's value into a block pointer,
+ * reporting the child's fill count through fill.
+ */
+Bptr
+getptr(Kvp *kv, int *fill)
+{
+	assert(kv->nv == Ptrsz || kv->nv == Ptrsz+2);
+	*fill = UNPACK16(kv->v + Ptrsz);
+	return unpackbp(kv->v, kv->nv);
+}
+
+/*
+ * Appends a key/value pair to b: the offset table
+ * grows up from the front of the data area, the
+ * packed pairs grow down from the back.
+ * Exported for reaming.
+ */
+void
+setval(Blk *b, Kvp *kv)
+{
+	int off, spc;
+	char *p;
+
+	spc = (b->type == Tleaf) ? Leafspc : Pivspc;
+	b->valsz += 2 + kv->nk + 2 + kv->nv;
+	off = spc - b->valsz;
+
+	/* table and values must not collide */
+	assert(2*(b->nval+1) + b->valsz <= spc);
+	assert(2*(b->nval+1) <= off);
+
+	p = b->data + 2*b->nval;
+	PACK16(p, off);
+
+	p = b->data + off;
+	PACK16(p, kv->nk);	p += 2;
+	memmove(p, kv->k, kv->nk);	p += kv->nk;
+	PACK16(p, kv->nv);	p += 2;
+	memmove(p, kv->v, kv->nv);
+
+	b->nval++;
+}
+
+/*
+ * Packs bp and the child's fill count as the value
+ * for key k and appends the pair to pivot b.
+ */
+static void
+setptr(Blk *b, Key *k, Bptr bp, int fill)
+{
+	char *p, buf[Ptrsz+2];
+	Kvp kv;
+
+	kv.k = k->k;
+	kv.nk = k->nk;
+	kv.v = buf;
+	kv.nv = sizeof(buf);
+	p = packbp(buf, sizeof(buf), &bp);
+	PACK16(p, fill);
+	setval(b, &kv);
+}
+
+/*
+ * Appends message m to the buffer of pivot block b;
+ * layout mirrors setval, but in the buffer region
+ * past Pivspc, with a leading op byte.
+ */
+static void
+setmsg(Blk *b, Msg *m)
+{
+	char *p;
+	int o;
+
+	assert(b->type == Tpivot);
+	b->bufsz += msgsz(m)-2;
+
+	p = b->data + Pivspc + 2*b->nbuf;
+	o = Bufspc - b->bufsz;
+	PACK16(p, o);
+
+	p = b->data + Pivspc + o;
+	*p = m->op;	p += 1;
+	PACK16(p, m->nk);	p += 2;
+	memmove(p, m->k, m->nk);	p += m->nk;
+	PACK16(p, m->nv);	p += 2;
+	memmove(p, m->v, m->nv);
+
+	b->nbuf++;
+}
+
+/*
+ * Reads the i'th buffered message of pivot b;
+ * m points directly into the block's data.
+ */
+void
+getmsg(Blk *b, int i, Msg *m)
+{
+	char *p;
+	int o;
+
+	assert(b->type == Tpivot);
+	assert(i >= 0 && i < b->nbuf);
+	p = b->data + Pivspc + 2*i;
+	o = UNPACK16(p);
+	p = b->data + Pivspc + o;
+	m->op = *p;	p += 1;
+	m->nk = UNPACK16(p);	p += 2;
+	m->k = p;	p += m->nk;
+	m->nv = UNPACK16(p);	p += 2;
+	m->v = p;
+}
+
+/*
+ * Binary search for the first message with key k in
+ * a pivot's buffer; returns its index (setting *same)
+ * or, with no match, the index of the last message
+ * ordered before k.
+ */
+static int
+bufsearch(Blk *b, Key *k, Msg *m, int *same)
+{
+	int lo, hi, ri, mid, r;
+	Msg cmp;
+
+	ri = -1;
+	lo = 0;
+	hi = b->nbuf-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getmsg(b, mid, &cmp);
+		r = keycmp(k, &cmp);
+		switch(r){
+		case -1:
+			hi = mid-1;
+			break;
+		case 0:
+			ri = mid;
+			hi = mid-1;
+			break;
+		case 1:
+			lo = mid+1;
+			break;
+		}
+	}
+	/*
+	 * we can have duplicate messages, and we
+	 * want to point to the first of them: on an
+	 * exact match the search keeps moving left
+	 * until it finds it.
+	 */
+	*same = 0;
+	if(ri == -1)
+		ri = lo-1;
+	else
+		*same = 1;
+	if(m != nil && ri >= 0)
+		getmsg(b, ri, m);
+	return ri;
+}
+
+/*
+ * Binary search for k in b's value table; returns
+ * the matching index (setting *same) or, with no
+ * match, the index of the last entry ordered before
+ * k; rp is filled whenever the index is valid.
+ */
+static int
+blksearch(Blk *b, Key *k, Kvp *rp, int *same)
+{
+	int lo, hi, ri, mid, r;
+	Kvp cmp;
+
+	ri = -1;
+	lo = 0;
+	hi = b->nval-1;
+	while(lo <= hi){
+		mid = (hi + lo) / 2;
+		getval(b, mid, &cmp);
+		r = keycmp(k, &cmp);
+		switch(r){
+		case -1:
+			hi = mid-1;
+			break;
+		case 0:
+			ri = mid;
+			hi = mid-1;
+			break;
+		case 1:
+			lo = mid+1;
+			break;
+		}
+	}
+	*same = 0;
+	if(ri == -1)
+		ri = lo-1;
+	else
+		*same = 1;
+	if(ri >= 0)
+		getval(b, ri, rp);
+	return ri;
+}
+
+/*
+ * Bytes used in a pivot's message buffer: the offset
+ * table plus the packed messages.
+ */
+static int
+buffill(Blk *b)
+{
+	assert(b->type == Tpivot);
+	return b->bufsz + 2*b->nbuf;
+}
+
+/*
+ * Would inserting nmsg messages totalling needed
+ * bytes overflow b's message buffer?
+ */
+static int
+filledbuf(Blk *b, int nmsg, int needed)
+{
+	int used;
+
+	assert(b->type == Tpivot);
+	used = 2*(b->nbuf+nmsg) + b->bufsz;
+	return used + needed > Bufspc;
+}
+
+/*
+ * Would inserting one more entry of needed bytes
+ * overflow b's leaf value area?
+ */
+static int
+filledleaf(Blk *b, int needed)
+{
+	int used;
+
+	assert(b->type == Tleaf);
+	used = 2*(b->nval+1) + b->valsz;
+	return used + needed > Leafspc;
+}
+
+/*
+ * We need to guarantee there's room for one message
+ * at all times, so that splits along the whole path
+ * have somewhere to go as they propagate up; reserve
+ * is the number of worst-case pivot entries to keep
+ * free.
+ */
+static int
+filledpiv(Blk *b, int reserve)
+{
+	int used;
+
+	assert(b->type == Tpivot);
+	used = 2*(b->nval+1) + b->valsz;
+	return used + reserve*Kpmax > Pivspc;
+}
+
+/*
+ * Copies the pointers to the newly written child
+ * blocks (pp->nl, and pp->nr if the child split)
+ * into parent n, accumulating the bytes added into
+ * nbytes when non-nil.
+ */
+static void
+copyup(Blk *n, Path *pp, int *nbytes)
+{
+	Kvp kv;
+	Msg m;
+
+	/*
+	 * It's possible for the previous node to have
+	 * been fully cleared out by a large number of
+	 * delete messages, so we need to check if
+	 * there's anything in it to copy up.
+	 */
+	if(pp->nl->nval > 0){
+		getval(pp->nl, 0, &kv);
+		/* use the earlier of first value and first message as the separator */
+		if(pp->nl->nbuf > 0){
+			getmsg(pp->nl, 0, &m);
+			if(keycmp(&kv, &m) > 0)
+				kv.Key = m.Key;
+		}
+		setptr(n, &kv, pp->nl->bp, blkfill(pp->nl));
+		if(nbytes != nil)
+			*nbytes += valsz(&kv);
+	}
+	if(pp->nr != nil && pp->nr->nval > 0){
+		getval(pp->nr, 0, &kv);
+		if(pp->nr->nbuf > 0){
+			getmsg(pp->nr, 0, &m);
+			if(keycmp(&kv, &m) > 0)
+				kv.Key = m.Key;
+		}
+		setptr(n, &kv, pp->nr->bp, blkfill(pp->nr));
+		if(nbytes != nil)
+			*nbytes += valsz(&kv);
+	}
+}
+
+/*
+ * Applies an Owstat message to the packed dir entry
+ * in kv, in place, bumping the qid version; the
+ * message value is an op bitmask followed by the
+ * fields it rewrites, in a fixed order.
+ */
+static void
+statupdate(Kvp *kv, Msg *m)
+{
+	int op;
+	char *p;
+	Xdir d;
+
+	p = m->v;
+	op = *p++;
+	kv2dir(kv, &d);
+	/* bump version */
+	d.qid.vers++;
+	if(op & Owsize){
+		d.length = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owmode){
+		d.mode = UNPACK32(p);
+		d.qid.type = d.mode>>24;
+		p += 4;
+	}
+	if(op & Owmtime){
+		d.mtime = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owatime){
+		d.atime = UNPACK64(p);
+		p += 8;
+	}
+	if(op & Owuid){
+		d.uid = UNPACK32(p);
+		p += 4;
+	}
+	if(op & Owgid){
+		d.gid = UNPACK32(p);
+		p += 4;
+	}
+	if(op & Owmuid){
+		d.muid = UNPACK32(p);
+		p += 4;
+	}
+	/* the message must be fully consumed, or it was mispacked */
+	if(p != m->v + m->nv)
+		fatal("malformed stat: kv=%P, m=%M\n", kv, m);
+	if(packdval(kv->v, kv->nv, &d) == nil)
+		fatal("repacking dir failed\n");
+}
+
+/*
+ * Applies message m to value kv, using buf as backing
+ * store for the result; returns 1 if a live value
+ * remains, 0 if the entry is deleted.
+ */
+static int
+apply(Kvp *kv, Msg *m, char *buf, int nbuf)
+{
+	vlong *pv;
+	char *p;
+	Tree t;
+
+	switch(m->op){
+	case Oclearb:
+	case Odelete:
+	case Oclobber:
+		assert(keycmp(kv, m) == 0);
+		return 0;
+	case Oinsert:
+		cpkvp(kv, m, buf, nbuf);
+		return 1;
+	case Owstat:
+		assert(keycmp(kv, m) == 0);
+		statupdate(kv, m);
+		return 1;
+	case Orelink:
+	case Oreprev:
+		/* adjust a packed Tree's chain pointer and ref counts in place */
+		unpacktree(&t, kv->v, kv->nv);
+		p = m->v;
+		pv = (m->op == Orelink) ? &t.succ : &t.pred;
+		*pv = UNPACK64(p);	p += 8;
+		t.nlbl += *p;	p++;
+		t.nref += *p;	p++;
+		assert(t.nlbl >= 0 && t.nref >= 0);
+		assert(p == m->v + m->nv);
+		packtree(kv->v, kv->nv, &t);
+		return 1;
+	default:
+		fatal("invalid op %d\n", m->op);
+	}
+	return 0;
+}
+
+/*
+ * Peeks at the i'th message flowing down from the
+ * parent.  Returns -1 when i is out of range or the
+ * message wouldn't fit in spc (setting *full);
+ * otherwise loads *m and returns the comparison of
+ * v's key against it (0 when v is nil).
+ */
+static int
+pullmsg(Path *p, int i, Kvp *v, Msg *m, int *full, int spc)
+{
+	if(i < 0 || i >= p->hi || *full)
+		return -1;
+
+	if(p->ins != nil)
+		*m = p->ins[i];
+	else
+		getmsg(p->b, i, m);
+	if(msgsz(m) <= spc)
+		return (v == nil) ? 0 : keycmp(v, m);
+	*full = 1;
+	return -1;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space used, and applies the changes
+ * pending from the downpath blocks.
+ */
+static void
+updateleaf(Tree *t, Path *up, Path *p)
+{
+	char buf[Msgmax];
+	int i, j, c, ok, full, spc;
+	Blk *b, *n;
+	Bptr bp;
+	Msg m;
+	Kvp v;
+
+	i = 0;
+	j = up->lo;
+	b = p->b;
+	/*
+	 * spc is the amount of room we have
+	 * to copy data down from the parent; it's
+	 * necessarily a bit conservative, because
+	 * deletion messages don't take space -- but
+	 * we don't know what the types of all the
+	 * messages are.
+	 */
+	full = 0;
+	spc = Leafspc - blkfill(b);
+	n = newblk(t, b->type, 0);
+	assert(i >= 0 && j >= 0);
+	while(i < b->nval || j < up->hi){
+		if(i >= b->nval)
+			c = 1;
+		else{
+			c = -1;
+			getval(p->b, i, &v);
+			if(j < up->hi){
+				if(up->ins != nil)
+					m = up->ins[j];
+				else
+					getmsg(up->b, j, &m);
+				if(msgsz(&m) <= spc)
+					c = keycmp(&v, &m);
+				else
+					full = 1;
+			}
+		}
+		switch(c){
+		/* Value before message: just copy value */
+		case -1:
+			i++;
+			setval(n, &v);
+			break;
+		/* Value merges with message sequence */
+		case 0:
+			i++;
+			j++;
+			cpkvp(&v, &v, buf, sizeof(buf));
+			/* replacing or deleting a data extent frees the old block */
+			if(v.nk > 0 && v.k[0] == Kdat)
+			if(m.op == Oclearb
+			|| m.op == Oinsert
+			|| m.op == Odelete){
+				bp = unpackbp(v.v, v.nv);
+				freeblk(t, nil, bp);
+			}
+			ok = apply(&v, &m, buf, sizeof(buf));
+			goto Copyloop;
+		/* Message before value: Insert message sequence */
+		case 1:
+			j++;
+			cpkvp(&v, &m, buf, sizeof(buf));
+			ok = 0;
+			if(m.op != Oclearb && m.op != Oclobber){
+				spc -= valsz(&m);
+				p->pullsz += msgsz(&m);
+				ok = 1;
+			}
+			goto Copyloop;
+		Copyloop:
+			while(j < up->hi){
+				if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+					break;
+				if(ok && v.nk > 0 && v.k[0] == Kdat)
+				if(m.op == Oclearb
+				|| m.op == Oinsert
+				|| m.op == Odelete){
+					bp = unpackbp(v.v, v.nv);
+					freeblk(t, nil, bp);
+				}
+				p->pullsz += msgsz(&m);
+				ok = apply(&v, &m, buf, sizeof(buf));
+				j++;
+			}
+			if(ok)
+				setval(n, &v);
+			break;
+		}
+	}
+	p->npull = (j - up->lo);
+	p->nl = n;
+}
+
+/*
+ * Creates a new block with the contents of the old
+ * block. When copying the contents, it repacks them
+ * to minimize the space used, and applies the changes
+ * pending from the downpath blocks.
+ */
+static void
+updatepiv(Tree *t, Path *up, Path *p, Path *pp)
+{
+	char buf[Msgmax];
+	int i, j, sz, full, spc;
+	Blk *b, *n;
+	Msg m, u;
+
+	b = p->b;
+	n = newblk(t, b->type, 0);
+	/* copy the values, substituting the rewritten child pointers */
+	for(i = 0; i < b->nval; i++){
+		if(pp != nil && i == p->midx){
+			copyup(n, pp, nil);
+			if(pp->op == POrot || pp->op == POmerge)
+				i++;
+		}else{
+			getval(b, i, &m);
+			setval(n, &m);
+		}
+	}
+	i = 0;
+	j = up->lo;
+	sz = 0;
+	full = 0;
+	spc = Bufspc - buffill(b);
+	if(pp != nil)
+		spc += pp->pullsz;
+	/* copy the buffer, skipping messages the child already pulled */
+	while(i < b->nbuf){
+		if(i == p->lo)
+			i += pp->npull;
+		if(i == b->nbuf)
+			break;
+		getmsg(b, i, &m);
+		switch(pullmsg(up, j, &m, &u, &full, spc - sz)){
+		case -1:
+		case 0:
+			setmsg(n, &m);
+			i++;
+			break;
+		case 1:
+			cpkvp(&m, &u, buf, sizeof(buf));
+			while(pullmsg(up, j, &m, &u, &full, spc) == 0){
+				setmsg(n, &u);
+				sz = msgsz(&u);
+				p->pullsz += sz;
+				spc -= sz;
+				j++;
+			}
+		}
+	}
+	while(j < up->hi){
+		pullmsg(up, j, nil, &u, &full, spc);
+		if(full)
+			break;
+		setmsg(n, &u);
+		sz = msgsz(&u);
+		p->pullsz += sz;
+		spc -= sz;
+		j++;
+	}
+	p->npull = (j - up->lo);
+	p->nl = n;
+}
+
+/*
+ * Splits a node, returning the block that msg
+ * would be inserted into. Split must never
+ * grow the total height of the tree by more than 1.
+ */
+static void
+splitleaf(Tree *t, Path *up, Path *p, Kvp *mid)
+{
+	char buf[Msgmax];
+	Blk *b, *d, *l, *r;
+	int full, copied, spc, ok, halfsz;
+	int i, j, c;
+	Bptr bp;
+	Msg m;
+	Kvp v;
+
+	/* l and r start nil so the error path only frees what was allocated */
+	b = p->b;
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, b->type, 0);
+	r = newblk(t, b->type, 0);
+
+	d = l;
+	i = 0;
+	j = up->lo;
+	full = 0;
+	copied = 0;
+	halfsz = (2*b->nval + b->valsz + up->sz) / 2;
+	if(halfsz > Leafspc/2)
+		halfsz = Leafspc/2;
+	spc = Leafspc - (halfsz + Msgmax);
+	assert(b->nval >= 4);
+	while(i < b->nval){
+		/*
+		 * We're trying to balance size,
+		 * but we need at least 2 nodes
+		 * in each half of the split if
+		 * we want a valid tree.
+		 */
+		if(d == l)
+		if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+			d = r;
+			spc = Leafspc - (halfsz + Msgmax);
+			getval(b, i, mid);
+		}
+		getval(b, i, &v);
+		c = pullmsg(up, j, &v, &m, &full, spc);
+		switch(c){
+		case -1:
+			i++;
+			setval(d, &v);
+			copied += valsz(&v);
+			break;
+		case 0:
+			i++;
+			j++;
+			cpkvp(&v, &v, buf, sizeof(buf));
+			copied += valsz(&v);
+			/* replacing or deleting a data extent frees the old block */
+			if(v.nk > 0 && v.k[0] == Kdat)
+			if(m.op == Oclearb
+			|| m.op == Oinsert
+			|| m.op == Odelete){
+				bp = unpackbp(v.v, v.nv);
+				freeblk(t, nil, bp);
+			}
+			ok = apply(&v, &m, buf, sizeof(buf));
+			goto Copyloop;
+		case 1:
+			j++;
+			cpkvp(&v, &m, buf, sizeof(buf));
+			copied += valsz(&v);
+			ok = 0;
+			if(m.op != Oclearb && m.op != Oclobber){
+				spc -= valsz(&m);
+				p->pullsz += msgsz(&m);
+				ok = 1;
+			}
+			goto Copyloop;
+		Copyloop:
+			while(j < up->hi){
+				if(pullmsg(up, j, &v, &m, &full, spc) != 0)
+					break;
+				if(ok && v.nk > 0 && v.k[0] == Kdat)
+				if(m.op == Oclearb
+				|| m.op == Oinsert
+				|| m.op == Odelete){
+					bp = unpackbp(v.v, v.nv);
+					freeblk(t, nil, bp);
+				}
+				p->pullsz += msgsz(&m);
+				ok = apply(&v, &m, buf, sizeof(buf));
+				j++;
+			}
+			if(ok)
+				setval(d, &v);
+			break;
+		}
+	}
+	p->npull = (j - up->lo);
+	p->op = POsplit;
+	p->nl = l;
+	p->nr = r;
+	poperror();
+}
+
+/*
+ * Splits a node, returning the block that msg
+ * would be inserted into. Split must never
+ * grow the total height of the tree by more
+ * than one.
+ */
+static void
+splitpiv(Tree *t, Path *, Path *p, Path *pp, Kvp *mid)
+{
+	int i, copied, halfsz;
+	Blk *b, *d, *l, *r;
+	Kvp tk;
+	Msg m;
+
+	/* l and r start nil so the error path only frees what was allocated */
+	b = p->b;
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, b->type, 0);
+	r = newblk(t, b->type, 0);
+	d = l;
+	copied = 0;
+	halfsz = (2*b->nval + b->valsz)/2;
+	assert(b->nval >= 4);
+	for(i = 0; i < b->nval; i++){
+		/*
+		 * We're trying to balance size,
+		 * but we need at least 2 nodes
+		 * in each half of the split if
+		 * we want a valid tree.
+		 */
+		if(d == l)
+		if((i == b->nval-2) || (i >= 2 && copied >= halfsz)){
+			d = r;
+			getval(b, i, mid);
+		}
+		if(i == p->idx){
+			copyup(d, pp, &copied);
+			continue;
+		}
+		getval(b, i, &tk);
+		setval(d, &tk);
+		copied += valsz(&tk);
+	}
+	d = l;
+	/* route the buffered messages to whichever half owns their key */
+	for(i = 0; i < b->nbuf; i++){
+		if(i == p->lo)
+			i += pp->npull;
+		if(i == b->nbuf)
+			break;
+		getmsg(b, i, &m);
+		if(d == l && keycmp(&m, mid) >= 0)
+			d = r;
+		setmsg(d, &m);
+	}
+	p->op = POsplit;
+	p->nl = l;
+	p->nr = r;
+	poperror();
+}
+
+/*
+ * Merges sibling blocks a and b into a single fresh
+ * block: values first, then (for pivots) buffered
+ * messages; records the merge in p and pp so the
+ * parent can patch itself up.
+ */
+static void
+merge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	Blk *d;
+	Msg m;
+	int i;
+
+	d = newblk(t, a->type, 0);
+	for(i = 0; i < a->nval; i++){
+		getval(a, i, &m);
+		setval(d, &m);
+	}
+	for(i = 0; i < b->nval; i++){
+		getval(b, i, &m);
+		setval(d, &m);
+	}
+	if(a->type == Tpivot){
+		for(i = 0; i < a->nbuf; i++){
+			getmsg(a, i, &m);
+			setmsg(d, &m);
+		}
+		for(i = 0; i < b->nbuf; i++){
+			getmsg(b, i, &m);
+			setmsg(d, &m);
+		}
+	}
+	enqueue(d);
+	p->midx = idx;
+	pp->nl = d;
+	pp->op = POmerge;
+	pp->nr = nil;
+}
+
+/*
+ * Scan a single block for the split offset;
+ * returns 1 if we'd spill out of the buffer,
+ * updates *idx and returns 0 otherwise.
+ * o is added to the found index, for scans of
+ * the right-hand sibling.
+ */
+static int
+spillscan(Blk *d, Blk *b, Msg *m, int *idx, int o)
+{
+	int i, used;
+	Msg n;
+
+	used = 2*d->nbuf + d->bufsz;
+	for(i = *idx; i < b->nbuf; i++){
+		getmsg(b, i, &n);
+		if(keycmp(m, &n) <= 0){
+			*idx = i + o;
+			return 0;
+		}
+		used += msgsz(&n);
+		if(used > Bufspc)
+			return 1;
+	}
+	*idx = b->nbuf;
+	return 0;
+}
+
+/*
+ * Returns whether the buffered messages in l and r
+ * between *idx and the key of m would spill out of
+ * the buffer of d; *idx advances through l's buffer
+ * first, then continues into r's.
+ */
+static int
+spillsbuf(Blk *d, Blk *l, Blk *r, Msg *m, int *idx)
+{
+	if(l->type == Tleaf)
+		return 0;
+
+	if(*idx < l->nbuf && spillscan(d, l, m, idx, 0))
+		return 1;
+	if(*idx >= l->nbuf && spillscan(d, r, m, idx, l->nbuf))
+		return 1;
+	return 0;
+}
+
+/*
+ * Redistributes the contents of siblings a and b
+ * across two fresh blocks, splitting at roughly
+ * halfpiv bytes of values, while keeping each
+ * buffered message in the same half as the child
+ * it applies to.
+ */
+static void
+rotate(Tree *t, Path *p, Path *pp, int midx, Blk *a, Blk *b, int halfpiv)
+{
+	int i, o, cp, sp, idx;
+	Blk *d, *l, *r;
+	Msg m;
+
+	l = nil;
+	r = nil;
+	if(waserror()){
+		efreeblk(t, l);
+		efreeblk(t, r);
+		nexterror();
+	}
+	l = newblk(t, a->type, 0);
+	r = newblk(t, a->type, 0);
+	d = l;
+	cp = 0;
+	sp = -1;
+	idx = 0;
+	for(i = 0; i < a->nval; i++){
+		getval(a, i, &m);
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
+			d = r;
+		}
+		setval(d, &m);
+		cp += valsz(&m);
+	}
+	for(i = 0; i < b->nval; i++){
+		getval(b, i, &m);
+		if(d == l && (cp >= halfpiv || spillsbuf(d, a, b, &m, &idx))){
+			sp = idx;
+			d = r;
+		}
+		setval(d, &m);
+		cp += valsz(&m);
+	}
+	if(a->type == Tpivot){
+		/* split the buffered messages at the recorded offset */
+		d = l;
+		o = 0;
+		for(i = 0; i < a->nbuf; i++){
+			if(o == sp){
+				d = r;
+				o = 0;
+			}
+			getmsg(a, i, &m);
+			setmsg(d, &m);
+			o++;
+		}
+		for(i = 0; i < b->nbuf; i++){
+			if(o == sp){
+				d = r;
+				o = 0;
+			}
+			getmsg(b, i, &m);
+			setmsg(d, &m);
+			o++;
+		}
+	}
+	enqueue(l);
+	enqueue(r);
+	p->midx = midx;
+	pp->op = POrot;
+	pp->nl = l;
+	pp->nr = r;
+	poperror();
+}
+
+/*
+ * Balances two siblings: merges them into one block
+ * when both values and buffers fit, or rotates
+ * entries between them when they're badly imbalanced.
+ */
+static void
+rotmerge(Tree *t, Path *p, Path *pp, int idx, Blk *a, Blk *b)
+{
+	int na, nb, ma, mb, imbalance;
+
+	assert(a->type == b->type);
+
+	na = 2*a->nval + a->valsz;
+	nb = 2*b->nval + b->valsz;
+	if(a->type == Tleaf){
+		ma = 0;
+		mb = 0;
+	}else{
+		ma = 2*a->nbuf + a->bufsz;
+		mb = 2*b->nbuf + b->bufsz;
+	}
+	imbalance = na - nb;
+	if(imbalance < 0)
+		imbalance *= -1;
+	/* works for leaf, because 0 always < Bufspc */
+	if(na + nb < (Pivspc - 4*Msgmax) && ma + mb < Bufspc)
+		merge(t, p, pp, idx, a, b);
+	else if(imbalance > 4*Msgmax)
+		rotate(t, p, pp, idx, a, b, (na + nb)/2);
+}
+
+/*
+ * Tries to rebalance the freshly written child at idx
+ * against one of its siblings, merging or rotating
+ * when the combined contents would fit.
+ */
+static void
+trybalance(Tree *t, Path *p, Path *pp, int idx)
+{
+	Blk *l, *m, *r;
+	Kvp kl, kr;
+	int spc, fill;
+	Bptr bp;
+
+	if(p->idx == -1 || pp == nil || pp->nl == nil)
+		return;
+	/*
+	 * Only plainly modified or merged nodes are candidates.
+	 * The original test used '||', which is a tautology for a
+	 * single-valued op and made this function an unconditional
+	 * no-op, disabling rebalancing entirely.
+	 */
+	if(pp->op != POmod && pp->op != POmerge)
+		return;
+
+	l = nil;
+	r = nil;
+	m = holdblk(pp->nl);
+	if(waserror()){
+		dropblk(m);
+		dropblk(l);
+		dropblk(r);
+		nexterror();
+	}
+	spc = (m->type == Tleaf) ? Leafspc : Pivspc;
+	/* prefer the left sibling, falling back to the right */
+	if(idx-1 >= 0){
+		getval(p->b, idx-1, &kl);
+		bp = getptr(&kl, &fill);
+		if(fill + blkfill(m) < spc){
+			l = getblk(bp, 0);
+			rotmerge(t, p, pp, idx-1, l, m);
+			goto Done;
+		}
+	}
+	if(idx+1 < p->b->nval){
+		getval(p->b, idx+1, &kr);
+		bp = getptr(&kr, &fill);
+		if(fill + blkfill(m) < spc){
+			r = getblk(bp, 0);
+			rotmerge(t, p, pp, idx, m, r);
+			goto Done;
+		}
+	}
+Done:
+	dropblk(m);
+	dropblk(l);
+	dropblk(r);
+	poperror();
+}
+
+/*
+ * Pushes the pending messages one level down the path,
+ * updating, splitting, merging, or rotating nodes as
+ * needed; returns the path element whose nl holds the
+ * new root block.
+ */
+static Path*
+flush(Tree *t, Path *path, int npath)
+{
+
+	Path *up, *p, *pp, *rp;
+	Kvp mid;
+
+	/*
+	 * The path must contain at minimum two elements:
+	 * we must have 1 node we're inserting into, and
+	 * an empty element at the top of the path that
+	 * we put the new root into if the root gets split.
+	 */
+	assert(npath >= 2);
+	rp = nil;
+	pp = nil;
+	p = &path[npath - 1];
+	up = &path[npath - 2];
+	if(p->b->type == Tleaf){
+		if(!filledleaf(p->b, up->sz)){
+			updateleaf(t, p-1, p);
+			enqueue(p->nl);
+			rp = p;
+		}else{
+			splitleaf(t, up, p, &mid);
+			enqueue(p->nl);
+			enqueue(p->nr);
+		}
+		p->midx = -1;
+		pp = p;
+		up--;
+		p--;
+	}
+	while(p != path){
+		if(!filledpiv(p->b, 1)){
+			trybalance(t, p, pp, p->idx);
+			/* If we merged the root node, break out. */
+			if(up == path && pp != nil && pp->op == POmerge && p->b->nval == 2){
+				rp = pp;
+				goto Out;
+			}
+			updatepiv(t, up, p, pp);
+			enqueue(p->nl);
+			rp = p;
+		}else{
+			splitpiv(t, up, p, pp, &mid);
+			enqueue(p->nl);
+			enqueue(p->nr);
+		}
+		pp = p;
+		up--;
+		p--;
+	}
+	/* the root split: graft a new root above the two halves */
+	if(pp->nl != nil && pp->nr != nil){
+		rp = &path[0];
+		rp->nl = newblk(t, Tpivot, 0);
+		rp->npull = pp->npull;
+		rp->pullsz = pp->pullsz;
+		copyup(rp->nl, pp, nil);
+		enqueue(rp->nl);
+	}
+Out:
+	return rp;
+}
+
+/*
+ * Release a flush path: free the on-disk blocks that were
+ * superseded (the original node and, when present, the node
+ * merged into it), then drop the in-memory references.
+ */
+static void
+freepath(Tree *t, Path *path, int npath)
+{
+	Path *p;
+
+	for(p = path; p != path + npath; p++){
+		if(p->b != nil)
+			freeblk(t, p->b, p->b->bp);
+		if(p->m != nil)
+			freeblk(t, p->m, p->m->bp);	/* was freeblk(t, p->b, p->m->bp): block/bp mismatch */
+		dropblk(p->b);
+		dropblk(p->nl);
+		dropblk(p->nr);
+	}
+	free(path);
+}
+
+/*
+ * Select the child node with the largest message segment
+ * in the current node's buffer, recording the chosen child
+ * and its message range in p.
+ */
+static void
+victim(Blk *b, Path *p)
+{
+	int i, j, lo, maxsz, cursz;
+	Kvp kv;
+	Msg m;
+
+	j = 0;
+	maxsz = 0;
+	p->b = b;
+	/*
+	 * Start at the second pivot: all values <= this
+	 * go to the first node. Stop *after* the last entry,
+	 * because entries >= the last entry all go into it.
+	 */
+	for(i = 1; i <= b->nval; i++){
+		if(i < b->nval)
+			getval(b, i, &kv);
+		cursz = 0;
+		lo = j;
+		/* sum the contiguous run of messages destined for child i-1 */
+		for(; j < b->nbuf; j++){
+			getmsg(b, j, &m);
+			if(i < b->nval && keycmp(&m, &kv) >= 0)
+				break;
+			/* 2 bytes for offset, plus message size in buffer */
+			cursz += msgsz(&m);
+		}
+		/* best so far: remember the child index and message span */
+		if(cursz > maxsz){
+			maxsz = cursz;
+			p->op = POmod;
+			p->lo = lo;
+			p->hi = j;
+			p->sz = maxsz;
+			p->idx = i - 1;
+			p->midx = i - 1;
+			p->npull = 0;
+			p->pullsz = 0;
+		}
+	}
+}
+
+/*
+ * Fast-path upsert into the root pivot's message buffer.
+ * Copies the root, appends the (already sorted) messages, then
+ * restores the sorted order of the buffer's offset table by
+ * binary-inserting each appended offset, and swaps the new
+ * root in, freeing the old one.
+ */
+static void
+fastupsert(Tree *t, Blk *b, Msg *msg, int nmsg)
+{
+	int i, c, o, ri, lo, hi, mid, nbuf;
+	Msg cmp;
+	char *p;
+	Blk *r;
+
+	if((r = dupblk(t, b)) == nil)
+		error(Enomem);
+
+	nbuf = r->nbuf;
+	/* append the messages; they land after the existing nbuf entries */
+	for(i = 0; i < nmsg; i++)
+		setmsg(r, &msg[i]);
+
+	for(i = 0; i < nmsg; i++){
+		/*
+		 * Binary search the first nbuf+i (sorted) entries for the
+		 * insertion point; on an equal key, insert after it so
+		 * message order is preserved.
+		 */
+		ri = -1;
+		lo = 0;
+		hi = nbuf+i-1;
+		while(lo <= hi){
+			mid = (hi + lo) / 2;
+			getmsg(r, mid, &cmp);
+			c = keycmp(&msg[i], &cmp);
+			switch(c){
+			case -1:
+				hi = mid-1;
+				break;
+			case 0:
+				ri = mid+1;
+				lo = mid+1;
+				break;
+			case 1:
+				lo = mid+1;
+				break;
+			}
+		}
+		if(ri == -1)
+			ri = hi+1;
+		/* rotate the appended offset from slot nbuf+i down into slot ri */
+		p = r->data + Pivspc + 2*(nbuf+i);
+		o = UNPACK16(p);
+		p = r->data + Pivspc + 2*ri;
+		memmove(p+2, p, 2*(nbuf+i-ri));
+		PACK16(p, o);
+	}
+	enqueue(r);
+
+	lock(&t->lk);
+	t->bp = r->bp;
+	t->dirty = 1;
+	unlock(&t->lk);
+
+	/* the old root is superseded: free its disk block and drop both refs */
+	freeblk(t, b, b->bp);
+	dropblk(b);
+	dropblk(r);
+}
+
+
+/*
+ * Insert a batch of messages into the tree.  Sorts them, then
+ * repeatedly walks from the root to a victim node, flushes as
+ * many messages as fit, and swaps in the new root, retrying
+ * until every message has been pulled in.
+ */
+void
+btupsert(Tree *t, Msg *msg, int nmsg)
+{
+	int i, npath, npull, dh, sz, height;
+	Path *path, *rp;
+	Blk *b, *rb;
+	Kvp sep;
+	Bptr bp;
+
+	sz = 0;
+	stablesort(msg, nmsg);
+	for(i = 0; i < nmsg; i++)
+		sz += msgsz(&msg[i]);
+	npull = 0;
+	path = nil;
+	npath = 0;
+
+Again:
+	if(waserror()){
+		freepath(t, path, npath);
+		nexterror();
+	}
+
+	b = getroot(t, &height);
+	/* everything fits in the root buffer: take the in-place fast path */
+	if(npull == 0 && b->type == Tpivot && !filledbuf(b, nmsg, sz)){
+		fastupsert(t, b, msg, nmsg);
+		poperror();
+		return;
+	}
+	/*
+	 * The tree can grow in height by 1 when we
+	 * split, so we allocate room for one extra
+	 * node in the path.
+	 */
+	npath = 0;
+	if((path = calloc((height + 2), sizeof(Path))) == nil)
+		error(Enomem);
+	path[npath].b = nil;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	npath++;
+
+	path[0].sz = sz;
+	path[0].ins = msg;
+	path[0].lo = npull;
+	path[0].hi = nmsg;
+	/* descend through full pivots, picking the victim child at each level */
+	while(b->type == Tpivot){
+		if(!filledbuf(b, nmsg, path[npath - 1].sz))
+			break;
+		victim(b, &path[npath]);
+		getval(b, path[npath].idx, &sep);
+		bp = unpackbp(sep.v, sep.nv);
+		b = getblk(bp, 0);
+		npath++;
+	}
+	path[npath].b = b;
+	path[npath].idx = -1;
+	path[npath].midx = -1;
+	path[npath].lo = -1;
+	path[npath].hi = -1;
+	path[npath].npull = 0;
+	path[npath].pullsz = 0;
+	npath++;
+
+	rp = flush(t, path, npath);
+	rb = rp->nl;
+
+	/* the root moved by at most one level: grew, stayed, or merged down */
+	if(path[0].nl != nil)
+		dh = 1;
+	else if(path[1].nl != nil)
+		dh = 0;
+	else if(npath > 2 && path[2].nl != nil)
+		dh = -1;
+	else
+		fatal("broken path change");
+
+	assert(rb->bp.addr != 0);
+
+	lock(&t->lk);
+	traceb("setroot", rb->bp);
+	t->ht += dh;
+	t->bp = rb->bp;
+	t->dirty = 1;
+	unlock(&t->lk);
+
+	npull += rp->npull;
+	freepath(t, path, npath);
+	poperror();
+
+	/* not everything fit in one flush: go around again */
+	if(npull != nmsg){
+		tracem("short pull");
+		goto Again;
+	}
+}
+
+/*
+ * Snapshot the tree's root pointer (and, if h is non-nil, its
+ * height) under the tree lock, and return the pinned root block.
+ */
+Blk*
+getroot(Tree *t, int *h)
+{
+	Bptr rootbp;
+
+	lock(&t->lk);
+	rootbp = t->bp;
+	if(h != nil)
+		*h = t->ht;
+	unlock(&t->lk);
+	return getblk(rootbp, 0);
+}
+
+/*
+ * Look up key k in tree t.  Walks the pivots down to the leaf,
+ * then applies any buffered messages for k found on the way back
+ * toward the root.  On success the result is copied into buf/nbuf
+ * through r.  Returns nonzero iff the key exists after all
+ * messages are applied.
+ */
+int
+btlookup(Tree *t, Key *k, Kvp *r, char *buf, int nbuf)
+{
+	int i, j, h, ok, same;
+	Blk *b, **p;
+	Bptr bp;
+	Msg m;
+
+	b = getroot(t, &h);
+	if((p = calloc(h, sizeof(Blk*))) == nil){
+		dropblk(b);
+		error(Enomem);
+	}
+	ok = 0;
+	p[0] = holdblk(b);
+	/* walk down: p[i] is the block at depth i; stop early on a miss */
+	for(i = 1; i < h; i++){
+		if(blksearch(p[i-1], k, r, &same) == -1)
+			break;
+		bp = unpackbp(r->v, r->nv);
+		p[i] = getblk(bp, 0);
+	}
+	/* search the bottom level for the base value */
+	if(p[h-1] != nil)
+		blksearch(p[h-1], k, r, &ok);
+	if(ok)
+		cpkvp(r, r, buf, nbuf);
+	/* apply buffered messages for k, deepest pivot first */
+	for(i = h-2; i >= 0; i--){
+		if(p[i] == nil)
+			continue;
+		j = bufsearch(p[i], k, &m, &same);
+		if(j < 0 || !same)
+			continue;
+		if(!(ok || m.op == Oinsert || m.op == Oclearb))
+			fatal("lookup %K << %M missing insert\n", k, &m);
+		ok = apply(r, &m, buf, nbuf);
+		/* consume any further messages queued for the same key */
+		for(j++; j < p[i]->nbuf; j++){
+			getmsg(p[i], j, &m);
+			if(keycmp(k, &m) != 0)
+				break;
+			ok = apply(r, &m, buf, nbuf);
+		}
+	}
+	for(i = 0; i < h; i++)
+		if(p[i] != nil)
+			dropblk(p[i]);
+	dropblk(b);
+	free(p);
+	return ok;
+}
+
+/*
+ * Reset all scan state and record the key prefix the scan will
+ * iterate over; the scan cursor is seeded with the prefix itself.
+ */
+void
+btnewscan(Scan *s, char *pfx, int npfx)
+{
+	memset(s, 0, sizeof(*s));
+	s->first = 1;
+	s->donescan = 0;
+	s->offset = 0;
+	memmove(s->pfxbuf, pfx, npfx);
+	s->pfx.k = s->pfxbuf;
+	s->pfx.nk = npfx;
+
+	s->kv.nv = 0;
+	s->kv.v = s->kvbuf+npfx;
+	cpkey(&s->kv, &s->pfx, s->kvbuf, sizeof(s->kvbuf));
+}
+
+/*
+ * Pin the path of blocks for scan s from the root down, and
+ * position the per-level value and buffer cursors at the first
+ * candidate entry for the scan's current key, skipping entries
+ * already returned unless this is the first entry into the scan.
+ */
+void
+btenter(Tree *t, Scan *s)
+{
+	int i, same;
+	Scanp *p;
+	Msg m, c;
+	Bptr bp;
+	Blk *b;
+	Kvp v;
+
+	if(s->donescan)
+		return;
+	b = getroot(t, &s->ht);
+	if((s->path = calloc(s->ht, sizeof(Scanp))) == nil){
+		dropblk(b);
+		error(Enomem);
+	}
+	p = s->path;
+	p[0].b = b;
+	for(i = 0; i < s->ht; i++){
+		p[i].vi = blksearch(b, &s->kv, &v, &same);
+		if(b->type == Tpivot){
+			/* key sorts before the first pivot: descend leftmost */
+			if(p[i].vi == -1)
+				getval(b, ++p[i].vi, &v);
+			p[i].bi = bufsearch(b, &s->kv, &m, &same);
+			if(p[i].bi == -1){
+				p[i].bi++;
+			}else if(!same || !s->first){
+				/* scan past repeated messages */
+				while(p[i].bi < p[i].b->nbuf){
+					getmsg(p[i].b, p[i].bi, &c);
+					if(keycmp(&m, &c) != 0)
+						break;
+					p[i].bi++;
+				}
+			}
+			bp = unpackbp(v.v, v.nv);
+			b = getblk(bp, 0);
+			p[i+1].b = b;
+		}else if(p[i].vi == -1 || !same || !s->first)
+			p[i].vi++;
+	}
+	s->first = 0;
+}
+
+/*
+ * Return the next key/value of the scan in r.  Merges the leaf
+ * values with buffered messages along the pinned path, choosing
+ * the smallest candidate key and applying every message queued
+ * for it.  Returns 1 with a result in r, 0 when the prefix is
+ * exhausted.
+ */
+int
+btnext(Scan *s, Kvp *r)
+{
+	int i, j, h, ok, start, bufsrc;
+	Scanp *p;
+	Msg m, n;
+	Bptr bp;
+	Kvp kv;
+
+Again:
+	p = s->path;
+	h = s->ht;
+	start = h;
+	bufsrc = -1;
+	if(s->donescan)
+		return 0;
+	if(waserror()){
+		btexit(s);
+		nexterror();
+	}
+	/* load up the correct blocks for the scan */
+	for(i = h-1; i >= 0; i--){
+		if(p[i].b != nil
+		&&(p[i].vi < p[i].b->nval || p[i].bi < p[i].b->nbuf))
+			break;
+		/* the root is exhausted too: the scan is over */
+		if(i == 0){
+			s->donescan = 1;
+			poperror();
+			return 0;
+		}
+		/* this level is spent; drop it and advance the parent */
+		if(p[i].b != nil)
+			dropblk(p[i].b);
+		p[i].b = nil;
+		p[i].vi = 0;
+		p[i].bi = 0;
+		p[i-1].vi++;
+		start = i;
+	}
+
+	if(p[start-1].vi < p[start-1].b->nval){
+		/* repin blocks from the first live level down to the leaf */
+		for(i = start; i < h; i++){
+			getval(p[i-1].b, p[i-1].vi, &kv);
+			bp = unpackbp(kv.v, kv.nv);
+			p[i].b = getblk(bp, 0);
+		}
+
+		/* find the minimum key along the path up */
+		m.op = Oinsert;
+		getval(p[h-1].b, p[h-1].vi, &m);
+	}else{
+		/* only buffered messages remain at this level */
+		getmsg(p[start-1].b, p[start-1].bi, &m);
+		assert(m.op == Oinsert);
+		bufsrc = start-1;
+	}
+
+	/* a buffered message higher up may sort before the candidate */
+	for(i = h-2; i >= 0; i--){
+		if(p[i].b == nil || p[i].bi == p[i].b->nbuf)
+			continue;
+		getmsg(p[i].b, p[i].bi, &n);
+		if(keycmp(&n, &m) < 0){
+			bufsrc = i;
+			m = n;
+		}
+	}
+	/* stepped past the prefix: nothing more to return */
+	if(m.nk < s->pfx.nk || memcmp(m.k, s->pfx.k, s->pfx.nk) != 0){
+		s->donescan = 1;
+		poperror();
+		return 0;
+	}
+
+	/* scan all messages applying to the message */
+	ok = 1;
+	cpkvp(r, &m, s->kvbuf, sizeof(s->kvbuf));
+	if(bufsrc == -1)
+		p[h-1].vi++;
+	else
+		p[bufsrc].bi++;
+	for(i = h-2; i >= 0; i--){
+		for(j = p[i].bi; p[i].b != nil && j < p[i].b->nbuf; j++){
+			getmsg(p[i].b, j, &m);
+			if(keycmp(r, &m) != 0)
+				break;
+			ok = apply(r, &m, s->kvbuf, sizeof(s->kvbuf));
+			p[i].bi++;
+		}
+	}
+	poperror();
+	/* the candidate was deleted by a message: try the next one */
+	if(!ok)
+		goto Again;
+	return 1;
+}
+
+/*
+ * Tear down a scan: release every block pinned along the scan
+ * path, then free the path itself.
+ */
+void
+btexit(Scan *s)
+{
+	int i;
+
+	i = 0;
+	while(i < s->ht){
+		dropblk(s->path[i].b);
+		i++;
+	}
+	free(s->path);
+}
--- /dev/null
+++ b/sys/src/cmd/gefs/user.c
@@ -1,0 +1,260 @@
+#include <u.h>
+#include <libc.h>
+#include <fcall.h>
+#include <avl.h>
+
+#include "dat.h"
+#include "fns.h"
+
+/*
+ * Read the whole contents of file `path' (len bytes) from tree t
+ * into a freshly malloced, NUL-terminated buffer, one data block
+ * at a time.  The caller frees the result.
+ * NOTE(review): if getblk errors mid-loop, ret still leaks; a
+ * waserror around the loop would be needed to plug that — TODO.
+ */
+static char*
+slurp(Tree *t, vlong path, vlong len)
+{
+	char *ret, buf[Offksz], kvbuf[Offksz + Ptrsz];
+	vlong o;
+	Blk *b;
+	Bptr bp;
+	Key k;
+	Kvp kv;
+
+	if((ret = malloc(len + 1)) == nil)
+		error(Enomem);
+	k.k = buf;
+	k.nk = Offksz;
+	for(o = 0; o < len; o += Blksz){
+		k.k[0] = Kdat;
+		PACK64(k.k+1, path);
+		PACK64(k.k+9, o);
+		if(!btlookup(t, &k, &kv, kvbuf, sizeof(kvbuf))){
+			free(ret);	/* was leaked on this error path */
+			error(Esrch);
+		}
+		bp = unpackbp(kv.v, kv.nv);
+		b = getblk(bp, GBraw);
+		if(len - o >= Blksz)
+			memcpy(ret + o, b->buf, Blksz);
+		else
+			memcpy(ret + o, b->buf, len - o);
+		dropblk(b);	/* was missing: leaked one block ref per iteration */
+	}
+	ret[len] = 0;
+	return ret;
+}
+
+/*
+ * Copy the next newline-terminated line out of *p into buf,
+ * dropping the newline and truncating to at most nbuf-1 bytes,
+ * then advance *p past the line.  Returns buf, or nil when no
+ * complete line remains.
+ */
+static char*
+readline(char **p, char *buf, int nbuf)
+{
+	char *nl;
+	int len;
+
+	nl = strchr(*p, '\n');
+	if(nl == nil)
+		return nil;
+	len = (nl - *p) + 1;
+	if(len >= nbuf)
+		len = nbuf - 1;
+	/* no NUL can occur before nl, so a bounded copy is equivalent */
+	memmove(buf, *p, len - 1);
+	buf[len - 1] = '\0';
+	*p = nl + 1;
+	return buf;
+}
+
+/*
+ * Split *p at the next occurrence of delim: NUL the delimiter
+ * out, advance *p past it, and return the token.  On the last
+ * field *p becomes nil; once *p is nil, returns nil.
+ */
+static char*
+getfield(char **p, char delim)
+{
+	char *tok, *d;
+
+	tok = *p;
+	if(tok == nil)
+		return nil;
+	d = strchr(tok, delim);
+	if(d == nil)
+		*p = nil;
+	else{
+		*d = '\0';
+		*p = d + 1;
+	}
+	return tok;
+}
+
+/*
+ * Find a user in the global user table by name.
+ * Returns nil if no user matches.
+ */
+User*
+name2user(char *name)
+{
+	User *u, *end;
+
+	end = fs->users + fs->nusers;
+	for(u = fs->users; u != end; u++)
+		if(strcmp(u->name, name) == 0)
+			return u;
+	return nil;
+}
+
+/*
+ * Find a user in the global user table by numeric id.
+ * Returns nil if no user matches.
+ */
+User*
+uid2user(int id)
+{
+	User *u, *end;
+
+	end = fs->users + fs->nusers;
+	for(u = fs->users; u != end; u++)
+		if(u->id == id)
+			return u;
+	return nil;
+}
+
+/*
+ * Parse a users file in the format:
+ *	id:name:leader:member,member,...
+ * Lines starting with '#' and blank lines are skipped.
+ * Two passes: the first collects ids and names, the second
+ * resolves leader and member references against them.  On
+ * success the global table is swapped in under fs->userlk and
+ * the old one freed; on failure the table is left untouched.
+ * Diagnostics go to fd; returns nil or an error string.
+ */
+static char*
+parseusers(int fd, char *udata)
+{
+	char *pu, *p, *f, *m, *err, buf[8192];
+	int i, j, lnum, ngrp, nusers, usersz;
+	User *u, *n, *users;
+	int *g, *grp;
+
+	i = 0;
+	err = nil;
+	nusers = 0;
+	usersz = 8;
+	if((users = calloc(usersz, sizeof(User))) == nil)
+		return Enomem;
+	/* First pass: collect ids and names. */
+	pu = udata;
+	lnum = 0;
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(p[0] == '#' || p[0] == 0)
+			continue;
+		if(i == usersz){
+			usersz *= 2;
+			n = realloc(users, usersz*sizeof(User));
+			if(n == nil){
+				free(users);
+				return Enomem;
+			}
+			users = n;
+		}
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after id\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		u = &users[i];
+		u->id = atol(f);
+		if((f = getfield(&p, ':')) == nil){
+			fprint(fd, "/adm/users:%d: missing ':' after name\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		snprint(u->name, sizeof(u->name), "%s", f);
+		u->memb = nil;
+		u->nmemb = 0;
+		i++;
+	}
+	nusers = i;
+
+	/* Second pass: resolve leader and member names to ids. */
+	i = 0;
+	pu = udata;
+	lnum = 0;
+	while((p = readline(&pu, buf, sizeof(buf))) != nil){
+		lnum++;
+		if(buf[0] == '#' || buf[0] == 0)
+			continue;
+		getfield(&p, ':');	/* skip id */
+		getfield(&p, ':');	/* skip name */
+		if((f = getfield(&p, ':')) == nil){
+			/* was "missing ':' after name": wrong field in message */
+			fprint(fd, "/adm/users:%d: missing ':' after leader\n", lnum);
+			err = Esyntax;
+			goto Error;
+		}
+		if(f[0] != '\0'){
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, f) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: leader %s does not exist\n", lnum, f);
+				err = Enouser;
+				goto Error;
+			}
+			users[i].lead = u->id;
+		}
+		if((f = getfield(&p, ':')) == nil){
+			err = Esyntax;
+			goto Error;
+		}
+		/* members: comma-separated list of existing user names */
+		grp = nil;
+		ngrp = 0;
+		while((m = getfield(&f, ',')) != nil){
+			if(m[0] == '\0')
+				continue;
+			u = nil;
+			for(j = 0; j < nusers; j++)
+				if(strcmp(users[j].name, m) == 0)
+					u = &users[j];
+			if(u == nil){
+				fprint(fd, "/adm/users:%d: user %s does not exist\n", lnum, m);
+				free(grp);
+				err = Enouser;
+				goto Error;
+			}
+			if((g = realloc(grp, (ngrp+1)*sizeof(int))) == nil){
+				free(grp);
+				err = Enomem;
+				goto Error;
+			}
+			grp = g;
+			grp[ngrp++] = u->id;
+		}
+		users[i].memb = grp;
+		users[i].nmemb = ngrp;
+		i++;
+	}
+
+	/* Swap the new table in; the old one falls through to the cleanup. */
+	wlock(&fs->userlk);
+	n = fs->users;
+	i = fs->nusers;
+	fs->users = users;
+	fs->nusers = nusers;
+	wunlock(&fs->userlk);
+	users = n;
+	nusers = i;
+
+Error:
+	/* On error this frees the partial table; on success, the old one. */
+	if(users != nil)
+		for(i = 0; i < nusers; i++)
+			free(users[i].memb);
+	free(users);
+
+	return err;
+
+}
+
+/*
+ * Load the user table from the /adm/users file in tree t.
+ * On a parse failure, keeps the previous table if one exists,
+ * bails unless running permissive, or falls back to a minimal
+ * default table.  Parse diagnostics are printed to fd.
+ */
+void
+loadusers(int fd, Tree *t)
+{
+	char *s, *e;
+	vlong len;
+	Qid q;
+	User *u;
+
+	if(walk1(t, -1, "", &q, &len) == -1)
+		error(Efs);
+	if(walk1(t, q.path, "users", &q, &len) == -1)
+		error(Esrch);
+	if(q.type & QTDIR)
+		error(Etype);
+	if(len >= 1*MiB)
+		error(Efsize);
+	s = slurp(t, q.path, len);
+	e = parseusers(fd, s);
+	if(e != nil){
+		if(fs->users != nil){
+			fprint(2, "load users: %s\n", e);
+			fprint(2, "keeping old table\n");
+			free(s);	/* was leaked: error() unwinds past the tail free */
+			error(e);
+		}
+		if(!permissive){
+			fprint(2, "user table broken: %s\n", e);
+			fprint(2, "\tnot permissive: bailing\n");
+			free(s);	/* was leaked: error() unwinds past the tail free */
+			error(e);
+		}
+		fprint(2, "user table broken: %s\n", e);
+		fprint(2, "\tfalling back to default\n");
+		parseusers(fd, "-1:adm::\n0:none::\n");
+	}
+	if((u = name2user("none")) != nil)
+		noneid = u->id;
+	if((u = name2user("adm")) != nil)
+		admid = u->id;
+	if((u = name2user("nogroup")) != nil)
+		nogroupid = u->id;
+	free(s);
+}