shithub: git9

Download patch

ref: 513bda7539f41b108d4441f815df3370332ea128
parent: 6f011d94cf9ceaf1d8f931cfe1ab605dcb91606e
author: Ori Bernstein <ori@eigenstate.org>
date: Sat Aug 22 16:22:51 EDT 2020

implement git/repack

This takes a git repository and puts all the loose objects into
a single packfile. The work is the same as what's needed for a
more efficient git/push, and for a less wasteful git/serve.

--- /dev/null
+++ b/delta.c
@@ -1,0 +1,196 @@
+#include <u.h>
+#include <libc.h>
+
+#include "git.h"
+
+typedef struct Dblock	Dblock;
+typedef struct Delta	Delta;
+typedef struct Dset	Dset;
+typedef struct Objmeta	Objmeta;
+typedef struct Deltaset	Deltaset;
+
+enum {
+	Blksz	= 127,
+	Hshift	= 113,
+	Hlast	= 1692137473L,
+};
+
+struct Dblock {
+	uchar	*p;
+	uchar	*s;
+	uchar	*e;
+	u64int	rhash;
+	Hash	chash;
+};
+
+struct Objmeta {
+	Object	*obj;
+	char	*name;
+	vlong	time;
+	Delta	*delta;
+	int	ndelta;
+};
+
+struct Deltaset {
+	Objset	skip;
+	Objset	send;
+	Objmeta	*meta;
+	int	nmeta;
+	int	metasz;
+};
+
+static u64int
+addh(u64int h, uchar v)
+{
+	return h + v;
+}
+
+static int
+blkcmp(void *pa, void *pb)
+{
+	Dblock *a, *b;
+
+	a = (Dblock*)pa;
+	b = (Dblock*)pb;
+	if(b->rhash == a->rhash)
+		return 0;
+	return (a->rhash > b->rhash) ? -1 : 1;
+}
+
+static void
+initblk(Dblock *b, uchar *p, uchar *s, uchar *e, u64int rh)
+{
+	b->p = p;
+	b->s = s;
+	b->e = e;
+	b->rhash = rh;
+	sha1(s, e - s, b->chash.h, nil);
+}
+
+static Dblock*
+findrough(Dblock *b, int nb, u64int rh)
+{
+	int mid, lo, hi;
+
+	lo = 0;
+	hi = nb;
+	while(lo <= hi){
+		mid = (lo + hi)/2;
+		if(b[mid].rhash == rh)
+			return &b[mid];
+		else if(b[mid].rhash > rh)
+			lo = mid + 1;
+		else
+			hi = mid - 1;
+	}
+	return nil;
+}
+
+static int
+nextblk(uchar *s, uchar *e, u64int *rh)
+{
+	uchar *p;
+	int i;
+
+	p = s;
+	*rh = 0;
+	for(i = 0; i < Blksz; i++){
+		if(p == e)
+			break;
+		/* FIXME: better hash */
+		*rh *= Hshift;
+		*rh += *p++;
+	}
+	return p - s;
+}
+
+static int
+sameblk(Dblock *b, Hash h, uchar *s, uchar *e)
+{
+	int n;
+
+	n = b->e - b->s;
+	if(n != e - s)
+		return 0;
+	return hasheq(&b->chash, &h) && memcmp(b->s, s, n) == 0;
+}
+
+static int
+emitdelta(Delta **pd, int *nd, int cpy, int off, int len)
+{
+	Delta *d;
+
+	if(len == 0)
+		return 0;
+	*nd += 1;
+	*pd = erealloc(*pd, *nd * sizeof(Delta));
+	d = &(*pd)[*nd - 1];
+	d->cpy = cpy;
+	d->off = off;
+	d->len = len;
+	return len;
+}
+
+
+Delta*
+deltify(void *targ, int ntarg, void *base, int nbase, int *pnd)
+{
+	Hash h;
+	Dblock *b, *k;
+	Delta *d;
+	uchar *l, *s, *e, *eb, *bp, *tp;
+	int i, nb, nd;
+	u64int rh;
+
+	bp = base;
+	tp = targ;
+	s = bp;
+	e = bp;
+	nb = (nbase + Blksz - 1) / Blksz;
+	b = emalloc(nb*sizeof(Dblock));
+	for(i = 0; i < nb; i++){
+		e += nextblk(s, bp + nbase, &rh);
+		initblk(&b[i], bp, s, e, rh);
+		s = e;
+	}
+	qsort(b, nb, sizeof(*b), blkcmp);
+
+	l = targ;
+	s = targ;
+	e = targ;
+	d = nil;
+	nd = 0;
+	e += nextblk(s, tp + ntarg, &rh);
+	while(1){
+		if((k = findrough(b, nb, rh)) != nil){
+			sha1(s, e - s, h.h, nil);
+			if(sameblk(k, h, s, e)){
+				eb = k->e;
+				/* stretch the block as far as it'll go */
+				for(i = 0; i < (1<<24) - Blksz; i++){
+					if(e == tp + ntarg || eb == bp + nbase)
+						break;
+					if(*e != *eb)
+						break;
+					eb++;
+					e++;
+				}
+				emitdelta(&d, &nd, 0, l - tp, s - l);
+				emitdelta(&d, &nd, 1, k->s - k->p, eb - k->s);
+				s = e;
+				l = e;
+				e += nextblk(s, tp + ntarg, &rh);
+				continue;
+			}
+		}
+		if(e == tp + ntarg)
+			break;
+		/* got a better hash? apply within! */
+		rh -= *s++ * Hlast;
+		rh *= Hshift;
+		rh += *e++;
+	}
+	emitdelta(&d, &nd, 0, l - tp, tp + ntarg - l);
+	*pnd = nd;
+	return d;
+}
--- a/git.h
+++ b/git.h
@@ -131,8 +131,8 @@
 
 	/* Significant win on memory use */
 	union {
-		Cinfo *commit;
-		Tinfo *tree;
+		Cinfo	*commit;
+		Tinfo	*tree;
 	};
 };
 
@@ -210,10 +210,11 @@
 #define isblank(c) \
 	(((c) != '\n') && isspace(c))
 
-extern Reprog *authorpat;
-extern Objset objcache;
-extern Hash Zhash;
-extern int chattygit;
+extern Reprog	*authorpat;
+extern Objset	objcache;
+extern Hash	Zhash;
+extern int	chattygit;
+extern int	debug;
 
 #pragma varargck type "H" Hash
 #pragma varargck type "T" int
@@ -253,6 +254,7 @@
 void	olsfree(Objlist *);
 
 /* util functions */
+int	dprint(char *, ...);
 void	*emalloc(ulong);
 void	*erealloc(void *, ulong);
 char	*estrdup(char *);
@@ -263,7 +265,7 @@
 char	*strip(char *);
 
 /* packing */
-Delta*	deltify(void*, int, void *, int, int *, int *);
+Delta*	deltify(void*, int, void *, int, int *);
 
 /* proto handling */
 int	readpkt(Conn*, char*, int);
--- a/mkfile
+++ b/mkfile
@@ -6,6 +6,7 @@
 	fetch\
 	fs\
 	query\
+	repack\
 	save\
 	send\
 	walk
@@ -27,6 +28,7 @@
 	rm
 
 OFILES=\
+	delta.$O\
 	objset.$O\
 	ols.$O\
 	pack.$O\
--- a/pack.c
+++ b/pack.c
@@ -312,10 +312,6 @@
 
 	while(d != ed){
 		c = *d++;
-		if(!c){
-			werrstr("bad delta encoding");
-			return -1;
-		}
 		/* copy from base */
 		if(c & 0x80){
 			o = 0;
@@ -446,6 +442,10 @@
 			return -1;
 		l |= (c & 0x7f) << s;
 		s += 7;
+	}
+	if(l >= (1ULL << 32)){
+		werrstr("object too big");
+		return -1;
 	}
 
 	switch(t){
--- /dev/null
+++ b/repack.c
@@ -1,0 +1,518 @@
+#include <u.h>
+#include <libc.h>
+#include <pool.h>
+
+#include "git.h"
+
+#define TMPPATH(suff) (".git/objects/pack/repack."suff)
+
+typedef struct Objmeta	Objmeta;
+typedef struct Objq	Objq;
+typedef struct Buf	Buf;
+typedef struct Compout	Compout;
+
+struct Objmeta {
+	int	type;
+	char	*path;
+	vlong	mtime;
+	Hash	hash;
+
+	Object	*obj;
+	Object	*base;
+	Delta	*delta;
+	int	ndelta;
+};
+
+struct Objq {
+	Objq	*next;
+	Object	*obj;
+};
+
+struct Buf {
+	int off;
+	int sz;
+	uchar *data;
+};
+
+struct Compout {
+	Biobuf *bfd;
+	DigestState *st;
+};
+
+void
+usage(void)
+{
+	fprint(2, "usage: %s [-d]\n", argv0);
+	exits("usage");
+}
+
+static int
+dsortcmp(void *pa, void *pb)
+{
+	Objmeta *a, *b;
+	int cmp;
+
+	a = pa;
+	b = pb;
+	if(a->type != b->type)
+		return a->type - b->type;
+	cmp = strcmp(a->path, b->path);
+	if(cmp != 0)
+		return cmp;
+	if(a->mtime != b->mtime)
+		return a->mtime - b->mtime;
+	return memcmp(a->hash.h, b->hash.h, sizeof(a->hash.h));
+}
+
+static int
+timecmp(void *pa, void *pb)
+{
+	Objmeta *a, *b;
+
+	a = pa;
+	b = pb;
+	return b->mtime - a->mtime;
+}
+
+static void
+addmeta(Objmeta **m, int *nm, int type, Hash h, char *path, vlong mtime)
+{
+	*m = erealloc(*m, (*nm + 1)*sizeof(Objmeta));
+	memset(&(*m)[*nm], 0, sizeof(Objmeta));
+	(*m)[*nm].type = type;
+	(*m)[*nm].path = path;
+	(*m)[*nm].mtime = mtime;
+	(*m)[*nm].hash = h;
+	*nm += 1;
+}
+
+static int
+loadtree(Objmeta **m, int *nm, Hash tree, char *dpath, vlong mtime, Objset *has)
+{
+	Object *t, *o;
+	Dirent *e;
+	char *p;
+	int i, k;
+
+	if(oshas(has, tree))
+		return 0;
+	if((t = readobject(tree)) == nil)
+		return -1;
+//	osadd(has, t);
+	addmeta(m, nm, t->type, t->hash, dpath, mtime);
+	for(i = 0; i < t->tree->nent; i++){
+		e = &t->tree->ent[i];
+		if(oshas(has, e->h))
+			continue;
+		if(e->ismod)
+			continue;
+		k = (e->mode & DMDIR) ? GTree : GBlob;
+		o = clearedobject(e->h, k);
+		p = smprint("%s/%s", dpath, e->name);
+		addmeta(m, nm, k, o->hash, p, mtime);
+//		if(k == GBlob)
+//			osadd(has, o);
+		if(k == GTree && loadtree(m, nm, e->h, p, mtime, has) == -1)
+			return -1;
+		osadd(has, o);
+	}
+	unref(t);
+	return 0;
+}
+
+static int
+loadcommit(Objmeta **m, int *nm, Hash h, Objset *has)
+{
+	Object *c, *p;
+	Objq *q, *e, *t, *n;
+	int i;
+
+	if((c = readobject(h)) == nil)
+		return -1;
+	if(c->type != GCommit)
+		sysfatal("object %H not commit", c->hash);
+	q = emalloc(sizeof(Objq));
+	e = q;
+	q->next = nil;
+	q->obj = c;
+	for(; q != nil; q = n){
+		c = q->obj;
+		if(oshas(has, c->hash))
+			goto nextiter;
+		osadd(has, c);
+		for(i = 0; i < c->commit->nparent; i++){
+			if((p = readobject(c->commit->parent[i])) == nil)
+				return -1;
+			t = emalloc(sizeof(Objq));
+			t->next = nil;
+			t->obj = p;
+			e->next = t;
+			e = t;
+		}
+		addmeta(m, nm, c->type, c->hash, estrdup(""), c->commit->ctime);
+		if(loadtree(m, nm, c->commit->tree, "", c->commit->ctime, has) == -1)
+			goto error;
+nextiter:
+		n = q->next;
+		unref(q->obj);
+		free(q);
+	}
+	return 0;
+error:
+	for(; q != nil; q = n) {
+		n = q->next;
+		free(q);
+	}
+	return -1;
+}
+
+static int
+readmeta(Objmeta **m, Hash *roots, int nroots, Hash * /*leaves */, int /*nleaves*/)
+{
+	Objset has;
+	int i, nm;
+
+	*m = nil;
+	nm = 0;
+	osinit(&has);
+	for(i = 0; i < nroots; i++)
+		if(loadcommit(m, &nm, roots[i], &has) == -1){
+			free(*m);
+			return -1;
+		}
+	return nm;
+}
+
+static int
+deltasz(Delta *d, int nd)
+{
+	int i, sz;
+	sz = 32;
+	for(i = 0; i < nd; i++)
+		sz += d[i].cpy ? 7 : d[i].len + 1;
+	return sz;
+}
+
+static void
+pickdeltas(Objmeta *meta, int nmeta)
+{
+	Objmeta *m, *p;
+	Object *a, *b;
+	Delta *d;
+	int i, nd, sz, best;
+
+	qsort(meta, nmeta, sizeof(Objmeta), dsortcmp);
+	for(i = 0; i < nmeta; i++){
+		m = &meta[i];
+		p = meta;
+		if(i > 10)
+			p = m - 10;
+		if((a = readobject(m->hash)) == nil)
+			sysfatal("missing object %H", m->hash);
+		best = a->size;
+		m->base = nil;
+		m->delta = nil;
+		m->ndelta = 0;
+		for(; p != m; p++){
+			if((b = readobject(p->hash)) == nil)
+				sysfatal("missing object %H", p->hash);
+			d = deltify(a->data, a->size, b->data, b->size, &nd);
+			sz = deltasz(d, nd);
+			if(sz + 32 < best){
+				free(m->delta);
+				best = sz;
+				m->base = b;
+				m->delta = d;
+				m->ndelta = nd;
+			}else
+				free(d);
+			unref(b);
+		}
+		unref(a);
+	}
+}
+
+static int
+hwrite(Biobuf *b, void *buf, int len, DigestState **st)
+{
+	*st = sha1(buf, len, nil, *st);
+	return Bwrite(b, buf, len);
+}
+
+int
+compread(void *p, void *dst, int n)
+{
+	Buf *b;
+
+	b = p;
+	if(n > b->sz - b->off)
+		n = b->sz - b->off;
+	memcpy(dst, b->data + b->off, n);
+	b->off += n;
+	return n;
+}
+
+int
+compwrite(void *p, void *buf, int n)
+{
+	return hwrite(((Compout *)p)->bfd, buf, n, &((Compout*)p)->st);
+}
+
+int
+hcompress(Biobuf *bfd, void *buf, int sz, DigestState **st)
+{
+	int r;
+	Buf b ={
+		.off=0,
+		.data=buf,
+		.sz=sz,
+	};
+	Compout o = {
+		.bfd = bfd,
+		.st = *st,
+	};
+
+	r = deflatezlib(&o, compwrite, &b, compread, 6, 0);
+	*st = o.st;
+	return r;
+}
+
+void
+append(char **p, int *len, int *sz, void *seg, int nseg)
+{
+	if(*len + nseg >= *sz){
+		while(*len + nseg >= *sz)
+			*sz += *sz/2;
+		*p = erealloc(*p, *sz);
+	}
+	memcpy(*p + *len, seg, nseg);
+	*len += nseg;
+}
+
+int
+encodedelta(Objmeta *m, Object *o, Object *b, void **pp)
+{
+	char *p, *bp, buf[16];
+	int len, sz, n, i, j;
+	Delta *d;
+
+	sz = 128;
+	len = 0;
+	p = emalloc(sz);
+
+	/* base object size */
+	buf[0] = b->size & 0x7f;
+	n = b->size >> 7;
+	for(i = 1; n > 0; i++){
+		buf[i - 1] |= 0x80;
+		buf[i] = n & 0x7f;
+		n >>= 7;
+	}
+	append(&p, &len, &sz, buf, i);
+
+	/* target object size */
+	buf[0] = o->size & 0x7f;
+	n = o->size >> 7;
+	for(i = 1; n > 0; i++){
+		buf[i - 1] |= 0x80;
+		buf[i] = n & 0x7f;
+		n >>= 7;
+	}
+	append(&p, &len, &sz, buf, i);
+	for(j = 0; j < m->ndelta; j++){
+		d = &m->delta[j];
+		if(d->cpy){
+			n = d->off;
+			bp = buf + 1;
+			buf[0] = 0x81;
+			buf[1] = 0x00;
+			for(i = 0; i < sizeof(buf); i++) {
+				buf[0] |= 1<<i;
+				*bp++ = n & 0xff;
+				n >>= 8;
+				if(n == 0)
+					break;
+			}
+
+			n = d->len;
+			if(n != 0x10000) {
+				buf[0] |= 0x1<<4;
+				for(i = 0; i < sizeof(buf)-4 && n > 0; i++){
+					buf[0] |= 1<<(i + 4);
+					*bp++ = n & 0xff;
+					n >>= 8;
+				}
+			}
+			append(&p, &len, &sz, buf, bp - buf);
+		}else{
+			n = 0;
+			while(n != d->len){
+				buf[0] = (d->len - n < 127) ? d->len - n : 127;
+				append(&p, &len, &sz, buf, 1);
+				append(&p, &len, &sz, o->data + d->off + n, buf[0]);
+				n += buf[0];
+			}
+		}
+	}
+	*pp = p;
+	return len;
+}
+
+static int
+writepack(int fd, Objmeta *meta, int nmeta, Hash *h)
+{
+	int i, j, n, x, res, pcnt, len, ret;
+	DigestState *st;
+	Biobuf *bfd;
+	Objmeta *m;
+	Object *o;
+	char *p, buf[16];
+
+	st = nil;
+	ret = -1;
+	pcnt = 0;
+	if((fd = dup(fd, -1)) == -1)
+		return -1;
+	if((bfd = Bfdopen(fd, OWRITE)) == nil)
+		return -1;
+	if(hwrite(bfd, "PACK", 4, &st) == -1)
+		return -1;
+	PUTBE32(buf, 2);
+	if(hwrite(bfd, buf, 4, &st) == -1)
+		return -1;
+	PUTBE32(buf, nmeta);
+	if(hwrite(bfd, buf, 4, &st) == -1)
+		return -1;
+	qsort(meta, nmeta, sizeof(Objmeta), timecmp);
+	fprint(2, "deltifying %d objects:   0%%", nmeta);
+	for(i = 0; i < nmeta; i++){
+		m = &meta[i];
+		x = (i*100) / nmeta;
+		if(x > pcnt){
+			pcnt = x;
+			if(pcnt%10 == 0)
+				fprint(2, "\b\b\b\b%3d%%", pcnt);
+		}
+		if((o = readobject(m->hash)) == nil)
+			return -1;
+		if(m->delta == nil){
+			len = o->size;
+			buf[0] = o->type << 4;
+			buf[0] |= len & 0xf;
+			len >>= 4;
+			for(j = 1; len != 0; j++){
+				assert(j < sizeof(buf));
+				buf[j-1] |= 0x80;
+				buf[j] = len & 0x7f;
+				len >>= 7;
+			}
+			hwrite(bfd, buf, j, &st);
+			if(hcompress(bfd, o->data, o->size, &st) == -1)
+				goto error;
+		}else{
+			n = encodedelta(m, o, m->base, &p);
+			len = n;
+			buf[0] = GRdelta << 4;
+			buf[0] |= len & 0xf;
+			len >>= 4;
+			for(j = 1; len != 0; j++){
+				assert(j < sizeof(buf));
+				buf[j-1] |= 0x80;
+				buf[j] = len & 0x7f;
+				len >>= 7;
+			}
+			hwrite(bfd, buf, j, &st);
+			hwrite(bfd, m->base->hash.h, sizeof(m->base->hash.h), &st);
+			res = hcompress(bfd, p, n, &st);
+			free(p);
+			if(res == -1)
+				goto error;
+		}
+		unref(o);
+	}
+	fprint(2, "\b\b\b\b100%%\n");
+	sha1(nil, 0, h->h, st);
+	if(Bwrite(bfd, h->h, sizeof(h->h)) == -1)
+		goto error;
+	ret = 0;
+error:
+	if(Bterm(bfd) == -1)
+		return -1;
+	return ret;
+}
+
+int
+cleanup(Hash h)
+{
+	char newpfx[42], dpath[256], fpath[256];
+	int i, j, nd;
+	Dir *d;
+
+	snprint(newpfx, sizeof(newpfx), "%H.", h);
+	for(i = 0; i < 256; i++){
+		snprint(dpath, sizeof(dpath), ".git/objects/%02x", i);
+		if((nd = slurpdir(dpath, &d)) == -1)
+			continue;
+		for(j = 0; j < nd; j++){
+			snprint(fpath, sizeof(fpath), ".git/objects/%02x/%s", i, d[j].name);
+			remove(fpath);
+		}
+		remove(dpath);
+		free(d);
+	}
+	snprint(dpath, sizeof(dpath), ".git/objects/pack");
+	if((nd = slurpdir(dpath, &d)) == -1)
+		return -1;
+	for(i = 0; i < nd; i++){
+		if(strncmp(d[i].name, newpfx, strlen(newpfx)) == 0)
+			continue;
+		snprint(fpath, sizeof(fpath), ".git/objects/pack/%s", d[i].name);
+		remove(fpath);
+	}
+	return 0;
+}
+
+void
+main(int argc, char **argv)
+{
+	int fd, nmeta, nrefs;
+	Objmeta *meta;
+	Hash *refs, h;
+	char path[128];
+	Dir rn;
+
+	ARGBEGIN{
+	case 'd':
+		debug++;
+		break;
+	default:
+		usage();
+	}ARGEND;
+
+	gitinit();
+	refs = nil;
+	if((nrefs = listrefs(&refs)) == -1)
+		sysfatal("load refs: %r");
+	if((nmeta = readmeta(&meta, refs, nrefs, nil, 0)) == -1)
+		sysfatal("read object metadata: %r");
+	pickdeltas(meta, nmeta);
+	if((fd = create(TMPPATH("pack.tmp"), OWRITE, 0644)) == -1)
+		sysfatal("open %s: %r", TMPPATH("pack.tmp"));
+	if(writepack(fd, meta, nmeta, &h) == -1)
+		sysfatal("writepack: %r");
+	if(indexpack(TMPPATH("pack.tmp"), TMPPATH("idx.tmp"), h) == -1)
+		sysfatal("indexpack: %r");
+	close(fd);
+
+	nulldir(&rn);
+	rn.name = path;
+	snprint(path, sizeof(path), "%H.pack", h);
+	if(dirwstat(TMPPATH("pack.tmp"), &rn) == -1)
+		sysfatal("rename pack: %r");
+	snprint(path, sizeof(path), "%H.idx", h);
+	if(dirwstat(TMPPATH("idx.tmp"), &rn) == -1)
+		sysfatal("rename pack: %r");
+	if(cleanup(h) == -1)
+		sysfatal("cleanup: %r");
+	exits(nil);
+}
--- a/util.c
+++ b/util.c
@@ -6,6 +6,7 @@
 
 Reprog *authorpat;
 Hash Zhash;
+int debug;
 
 Object*
 emptydir(void)
@@ -241,4 +242,18 @@
 	while(e > s && isspace(*--e))
 		*e = 0;
 	return s;
+}
+
+int
+dprint(char *fmt, ...)
+{
+	va_list ap;
+	int n;
+
+	if(!debug)
+		return 0;
+	va_start(ap, fmt);
+	n = vfprint(2, fmt, ap);
+	va_end(ap);
+	return n;
 }