shithub: gefs

ref: 56cc98ba722d066c454b107a786b51cb740bd2b2
dir: /dat.h/

View raw version
typedef struct Blk	Blk;
typedef struct Gefs	Gefs;
typedef struct Fmsg	Fmsg;
typedef struct Fid	Fid;
typedef struct Msg	Msg;
typedef struct Key	Key;
typedef struct Val	Val;
typedef struct Kvp	Kvp;
typedef struct Bptr	Bptr;
typedef struct Bfree	Bfree;
typedef struct Path	Path;
typedef struct Scan	Scan;
typedef struct Dent	Dent;
typedef struct Scanp	Scanp;
typedef struct Arena	Arena;
typedef struct Arange	Arange;
typedef struct Bucket	Bucket;
typedef struct Chan	Chan;
typedef struct Tree	Tree;
typedef struct Mount	Mount;

enum {
	KiB	= 1024ULL,
	MiB	= 1024ULL*KiB,
	GiB	= 1024ULL*MiB,
	TiB	= 1024ULL*GiB,

	Lgblk	= 13,
	Blksz	= (1ULL<<Lgblk),

	Nrefbuf	= 1024,			/* number of ref incs before syncing */
	Nfidtab	= 1024,			/* number of fit hash entries */
	Ndtab	= 1024,			/* number of dir tab entries */
	Max9p	= 16*KiB,		/* biggest message size we're willing to negotiate */
	Nsec	= 1000*1000*1000,	/* nanoseconds to the second */
	Maxname	= 256,			/* maximum size of a name element */
	Maxent	= 9+Maxname+1,		/* maximum size of ent key, with terminator */
	Maxproc	= 8,			/* maximum number of worker procs */

	/*
	 * Kpmax must be no more than 1/4 of pivspc, or
	 * there is no way to get a valid split of a
	 * maximally filled tree.
	 */
	Loghdsz	= 8,			/* log hash */
	Keymax	= 128,			/* key data limit */
	Inlmax	= 256,			/* inline data limit */
	Ptrsz	= 24,			/* off, hash, gen */
	Fillsz	= 2,			/* block fill count */
	Offksz	= 17,			/* type, qid, off */
	Snapsz	= 9,			/* tag, snapid */
	Treesz	= 4+Ptrsz+Ptrsz,	/* height, root, deadlist */
	Kvmax	= Keymax + Inlmax,	/* Key and value */
	Kpmax	= Keymax + Ptrsz,	/* Key and pointer */
	

	Hdrsz	= 10,
	Rootsz	= 4+Ptrsz,		/* root pointer */
	Blkspc	= Blksz - Hdrsz,
	Bufspc  = Blkspc / 2,
	Pivspc	= Blkspc - Bufspc,
	Logspc	= Blkspc - Loghdsz,
	Leafspc = Blkspc,
	Msgmax  = 1 + (Kvmax > Kpmax ? Kvmax : Kpmax)
};

enum {
	/*
	 * dent: pqid[8] qid[8] -- a directory entry key.
	 * ptr:  off[8] hash[8] -- a key for an Dir block.
	 * dir:  fixed statbuf header, user ids
	 */
	Kdat,	/* qid[8] off[8] => ptr[16]:	pointer to data page */
	Kent,	/* pqid[8] name[n] => dir[n]:	serialized Dir */
	Kdset,	/* name[] => snapid[]:		dataset (snapshot ref) */
	Ksnap,	/* sid[8] => ref[8], tree[52]:	snapshot root */
	Ksuper,	/* qid[8] => pqid[8]:		parent dir */
};

enum {
	Bdirty	= 1 << 0,
	Bqueued	= 1 << 1,
	Bfinal	= 1 << 2,
	Bcached	= 1 << 3,
	Bzombie	= 1 << 4,
};

//#define Efs	"i will not buy this fs, it is scratched"
#define Efs (abort(), "nope")
#define Eio	"i/o error"
#define Efid	"bad fid"
#define Edscan	"invalid dir scan offset"
#define Eexist	"does not exist"
#define Ebotch	"protocol botch"
#define Emode	"unknown mode"
#define Efull	"file system full"
#define Eauth	"authentication failed"
#define Elength	"name too long"
#define Eperm	"permission denied"
#define Einuse	"resource in use"
#define Ebadf	"invalid file"
#define Emem	"out of memory"
#define Ename	"invalid file name"
#define Enomem	"out of memory"
#define Eattach	"attach required"
#define Enosnap	"no snapshot by that name exists"

/*
 * All metadata blocks share a common header:
 * 
 *	type[2]
 *
 * The None type is reserved for file data blocks
 * and refcount blocks.
 *
 * The superblock has this layout:
 *	version[8]	always "gefs0001"
 *	flags[4]	status flags:
 *				dirty=1<<0,
 *	blksz[4]	block size in bytes
 *	bufsz[4]	portion of leaf nodes
 *			allocated to buffers,
 *			in bytes
 *	hdrsz[4]	size of tree node header,
 *			in bytes.
 *	height[4]	tree height of root node
 *	rootb[8]	address of root in last
 *			snapshot.
 *	rooth[8]	hash of root node
 *	narena[4]	number of arenas in tree
 *	arenasz[8]	maximum size of arenas;
 *			they may be smaller.
 *	gen[8]		The flush generation
 *
 * The arena zone blocks have this layout, and
 * are overwritten in place:
 *
 *	log[8]		The head of the alloc log
 *	logh[8]		The hash of the alloc log
 *
 * The log blocks have this layout, and are one of
 * two types of blocks that get overwritten in place:
 *
 *	hash[8]		The hash of the previous log block
 *
 *	The remainder of the block is filled with log
 *	entries. Each log entry has at least 8 bytes
 *	of entry. Some are longer. The opcode is or'ed
 *	into the low order bits of the first vlong.
 *	These ops take the following form:
 *
 *	Alloc, Free:
 *		off[8] len[8]
 *	Alloc1, Free1:
 *		off[8]
 *	Ref:
 *		off[8]
 *	Flush:	
 *		gen[8]
 *
 * Pivots have the following layout:
 *
 *	nval[2]
 *	valsz[2]
 *	nbuf[2]
 *	bufsz[2]
 *
 * Leaves have the following layout:
 *
 *	nval[2]
 *	valsz[2]
 *	pad[4]sure, 
 *
 * Within these nodes, pointers have the following
 * layout:
 *
 *	off[8] hash[8] fill[2]
 */
enum {
	Tnone,
	Traw,
	Tsuper,
	Tarena,
	Tlog,
	Tpivot,
	Tleaf,
};

enum {
	Vinl,	/* Inline value */
	Vref,	/* Block pointer */
};

enum {
	GBraw	= 1<<0,
	GBwrite	= 1<<1,
};

enum {
	Onop,
	Oinsert,	/* new kvp */
	Odelete,	/* delete kvp */
	Oclearb,	/* free block ptr if exists */
	Owstat,		/* kvp dirent */
	/* wstat flags */
	Owsize	= 1<<4,
	Owmode	= 1<<5,
	Owmtime	= 1<<6,
};

/*
 * Operations for the allocation log.
 */
enum {
	/* 1-wide entries */
	LogAlloc1,	/* alloc a block */
	LogFree1,	/* alloc a block */
	LogDead1,	/* free a block */
	LogFlush,	/* flush log, bump gen */
	LogChain,	/* point to next log block */
	LogEnd,		/* last entry in log */	

	/* 2-wide entries */
	Log2w	= 1<<5,
	LogAlloc = LogAlloc1|Log2w,	/* alloc a range */
	LogFree	= LogFree1|Log2w,	/* free a range */
};

struct Arange {
	Avl;
	vlong	off;
	vlong	len;
};

struct Bucket {
	Lock;
	Blk	*b;
};

struct Fmsg {
	Fcall;
	int	fd;	/* the fd to repsond on */
	QLock	*wrlk;	/* write lock on fd */
	int	sz;	/* the size of the message buf */
	uchar	buf[];
};

struct Bptr {
	vlong	addr;
	vlong	hash;
	vlong	gen;
};

struct Tree {
	Lock	lk;
	Bptr	bp;
	Bptr	dp;
	int	ht;
};

struct Bfree {
	Bptr	bp;
	Bfree	*next;
};

/*
 * Overall state of the file sytem.
 * Shadows the superblock contents.
 */
struct Gefs {
	int	blksz;
	int	bufsz;
	int	pivsz;
	int	hdrsz;

	QLock	snaplk;
	Blk*	super;

	Chan	*wrchan;
	Chan	*rdchan;
	int	nproc;

	Lock	activelk;
	int	active[Maxproc];
	int	lastactive[Maxproc];
	Lock	freelk;
	Bfree	*freep;
	Bfree	*freehd;

	int	fd;
	long	broken;

	Tree	snap;

	Lock	qidlk;
	uvlong	nextqid;
	uvlong	nextgen; /* unlocked: only touched by mutator thread */

	Arena	*arenas;
	int	narena;
	long	nextarena;
	vlong	arenasz;

	Lock	fidtablk;
	Fid	*fidtab[Nfidtab];
	Lock	dtablk;
	Dent	*dtab[Ndtab];

	Lock	lrulk;
	/* protected by lrulk */
	Bucket	*cache;
	Blk	*chead;
	Blk	*ctail;
	int	ccount;
	int	cmax;
};

struct Arena {
	Lock;
	Avltree *free;
	Avltree *partial;

	vlong	log;	/* log head */
	vlong	logh;	/* log head hash */
	Blk	*logtl;	/* tail block open for writing */
	Blk	*b;	/* arena block */

	Blk	**q;	/* write queue */
	vlong	nq;
};

struct Key{
	char	*k;
	int	nk;
};

struct Val {
	int type;
	union {
		/* block pointer */
		struct {
			Bptr	bp;
			ushort	fill;
		};
		/* inline values */
		struct {
			short	nv;
			char	*v;
		};
	};
};

struct Kvp {
	Key;
	Val;
};

struct Msg {
	char	op;
	char	statop;
	Kvp;
};

struct Dent {
	RWLock;
	Key;
	Dent	*next;
	long	ref;

	Qid	qid;
	vlong	length;
	char	buf[Maxent];
};

struct Mount {
	long	ref;
	char	*name;
	Tree	root;
};

struct Fid {
	Lock;
	Fid	*next;
	/*
	 * if opened with OEXEC, we want to use a snapshot,
	 * instead of the most recent root, to prevent
	 * paging in the wrong executable.
	 */
	char	snap[64];
	Mount	*mnt;

	u32int	fid;
	vlong	qpath;
	long	ref;
	int	mode;
	int	iounit;

	Scan	*scan;	/* in progres scan */
	Dent	*dent;	/* (pqid, name) ref, modified on rename */
};

enum {
	POmod,
	POrot,
	POsplit,
	POmerge,
};

struct Path {
	/* Flowing down for flush */
	Msg	*ins;	/* inserted values, bounded by lo..hi */
	Blk	*b;	/* to shadow */
	int	idx;	/* insert at */
	int	lo;	/* key range */
	int	hi;	/* key range */
	int	sz;	/* size of range */

	/* Flowing up from flush */
	int	op;	/* change done along path */
	Blk	*m;	/* node merged against, for post-update free */
	Blk	*nl;	/* new left */
	Blk	*nr;	/* new right, if we split or rotated */
	int	midx;	/* modification index */
	int	npull;	/* number of messages successfully pulled */
	int	pullsz;	/* size of pulled messages */
};

struct Scanp {
	int	bi;
	int	vi;
	Blk	*b;
};

struct Scan {
	vlong	offset;	/* last read offset */
	Tree	root;

	int	done;
	int	overflow;
	int	present;
	Kvp	kv;
	Key	pfx;
	char	kvbuf[Kvmax];
	char	pfxbuf[Keymax];
	Scanp	*path;
};

struct Blk {
	Lock;

	/* cache entry */
	Blk	*cnext;
	Blk	*cprev;
	Blk	*hnext;

	/* Freelist entry */
	Blk	*fnext;

	short	flag;

	/* serialized to disk in header */
	short	type;	/* @0, for all */
	short	nval;	/* @2, for Leaf, Pivot: data[0:2] */
	short	valsz;	/* @4, for Leaf, Pivot: data[2:4] */
	short   nbuf;	/* @6, for Pivot */
	short   bufsz;	/* @8, for Pivot */

	vlong	logsz;	/* for allocation log */
	vlong	lognxt;	/* for allocation log */

	uintptr	freed;	/* debug */

	Bptr	bp;
	long	ref;
	char	*data;
	char	buf[Blksz];
};

struct Chan {
	int	size;	/* size of queue */
	long	count;	/* how many in queue (semaphore) */
	long	avail;	/* how many available to send (semaphore) */
	Lock	rl, wl;	/* circular pointers */
	void	**rp;
	void	**wp;
	void*	args[];	/* list of saved pointers, [->size] */
};