ref: 513bda7539f41b108d4441f815df3370332ea128
parent: 6f011d94cf9ceaf1d8f931cfe1ab605dcb91606e
author: Ori Bernstein <ori@eigenstate.org>
date: Sat Aug 22 16:22:51 EDT 2020
implement git/repack This takes a git repository and puts all the loose objects into a single packfile. The work is the same as what's needed for a more efficient git/push, and for a less wasteful git/serve.
--- /dev/null
+++ b/delta.c
@@ -1,0 +1,196 @@
+#include <u.h>
+#include <libc.h>
+
+#include "git.h"
+
+typedef struct Dblock Dblock;
+typedef struct Delta Delta;
+typedef struct Dset Dset;
+typedef struct Objmeta Objmeta;
+typedef struct Deltaset Deltaset;
+
+enum {
+ Blksz = 127,
+ Hshift = 113,
+ Hlast = 1692137473L,
+};
+
+struct Dblock {
+ uchar *p;
+ uchar *s;
+ uchar *e;
+ u64int rhash;
+ Hash chash;
+};
+
+struct Objmeta {
+ Object *obj;
+ char *name;
+ vlong time;
+ Delta *delta;
+ int ndelta;
+};
+
+struct Deltaset {
+ Objset skip;
+ Objset send;
+ Objmeta *meta;
+ int nmeta;
+ int metasz;
+};
+
+static u64int
+addh(u64int h, uchar v)
+{
+ return h + v;
+}
+
+static int
+blkcmp(void *pa, void *pb)
+{
+ Dblock *a, *b;
+
+ a = (Dblock*)pa;
+ b = (Dblock*)pb;
+ if(b->rhash == a->rhash)
+ return 0;
+ return (a->rhash > b->rhash) ? -1 : 1;
+}
+
+static void
+initblk(Dblock *b, uchar *p, uchar *s, uchar *e, u64int rh)
+{
+ b->p = p;
+ b->s = s;
+ b->e = e;
+ b->rhash = rh;
+ sha1(s, e - s, b->chash.h, nil);
+}
+
+static Dblock*
+findrough(Dblock *b, int nb, u64int rh)
+{
+ int mid, lo, hi;
+
+ lo = 0;
+ hi = nb;
+ while(lo <= hi){
+ mid = (lo + hi)/2;
+ if(b[mid].rhash == rh)
+ return &b[mid];
+ else if(b[mid].rhash > rh)
+ lo = mid + 1;
+ else
+ hi = mid - 1;
+ }
+ return nil;
+}
+
+static int
+nextblk(uchar *s, uchar *e, u64int *rh)
+{
+ uchar *p;
+ int i;
+
+ p = s;
+ *rh = 0;
+ for(i = 0; i < Blksz; i++){
+ if(p == e)
+ break;
+ /* FIXME: better hash */
+ *rh *= Hshift;
+ *rh += *p++;
+ }
+ return p - s;
+}
+
+static int
+sameblk(Dblock *b, Hash h, uchar *s, uchar *e)
+{
+ int n;
+
+ n = b->e - b->s;
+ if(n != e - s)
+ return 0;
+ return hasheq(&b->chash, &h) && memcmp(b->s, s, n) == 0;
+}
+
+static int
+emitdelta(Delta **pd, int *nd, int cpy, int off, int len)
+{
+ Delta *d;
+
+ if(len == 0)
+ return 0;
+ *nd += 1;
+ *pd = erealloc(*pd, *nd * sizeof(Delta));
+ d = &(*pd)[*nd - 1];
+ d->cpy = cpy;
+ d->off = off;
+ d->len = len;
+ return len;
+}
+
+
+Delta*
+deltify(void *targ, int ntarg, void *base, int nbase, int *pnd)
+{
+ Hash h;
+ Dblock *b, *k;
+ Delta *d;
+ uchar *l, *s, *e, *eb, *bp, *tp;
+ int i, nb, nd;
+ u64int rh;
+
+ bp = base;
+ tp = targ;
+ s = bp;
+ e = bp;
+ nb = (nbase + Blksz - 1) / Blksz;
+ b = emalloc(nb*sizeof(Dblock));
+ for(i = 0; i < nb; i++){
+ e += nextblk(s, bp + nbase, &rh);
+ initblk(&b[i], bp, s, e, rh);
+ s = e;
+ }
+ qsort(b, nb, sizeof(*b), blkcmp);
+
+ l = targ;
+ s = targ;
+ e = targ;
+ d = nil;
+ nd = 0;
+ e += nextblk(s, tp + ntarg, &rh);
+ while(1){
+ if((k = findrough(b, nb, rh)) != nil){
+ sha1(s, e - s, h.h, nil);
+ if(sameblk(k, h, s, e)){
+ eb = k->e;
+ /* stretch the block as far as it'll go */
+ for(i = 0; i < (1<<24) - Blksz; i++){
+ if(e == tp + ntarg || eb == bp + nbase)
+ break;
+ if(*e != *eb)
+ break;
+ eb++;
+ e++;
+ }
+ emitdelta(&d, &nd, 0, l - tp, s - l);
+ emitdelta(&d, &nd, 1, k->s - k->p, eb - k->s);
+ s = e;
+ l = e;
+ e += nextblk(s, tp + ntarg, &rh);
+ continue;
+ }
+ }
+ if(e == tp + ntarg)
+ break;
+ /* got a better hash? apply within! */
+ rh -= *s++ * Hlast;
+ rh *= Hshift;
+ rh += *e++;
+ }
+ emitdelta(&d, &nd, 0, l - tp, tp + ntarg - l);
+ *pnd = nd;
+ return d;
+}
--- a/git.h
+++ b/git.h
@@ -131,8 +131,8 @@
/* Significant win on memory use */
union {
- Cinfo *commit;
- Tinfo *tree;
+ Cinfo *commit;
+ Tinfo *tree;
};
};
@@ -210,10 +210,11 @@
#define isblank(c) \
(((c) != '\n') && isspace(c))
-extern Reprog *authorpat;
-extern Objset objcache;
-extern Hash Zhash;
-extern int chattygit;
+extern Reprog *authorpat;
+extern Objset objcache;
+extern Hash Zhash;
+extern int chattygit;
+extern int debug;
#pragma varargck type "H" Hash
#pragma varargck type "T" int
@@ -253,6 +254,7 @@
void olsfree(Objlist *);
/* util functions */
+int dprint(char *, ...);
void *emalloc(ulong);
void *erealloc(void *, ulong);
char *estrdup(char *);
@@ -263,7 +265,7 @@
char *strip(char *);
/* packing */
-Delta* deltify(void*, int, void *, int, int *, int *);
+Delta* deltify(void*, int, void *, int, int *);
/* proto handling */
int readpkt(Conn*, char*, int);
--- a/mkfile
+++ b/mkfile
@@ -6,6 +6,7 @@
fetch\
fs\
query\
+ repack\
save\
send\
walk
@@ -27,6 +28,7 @@
rm
OFILES=\
+ delta.$O\
objset.$O\
ols.$O\
pack.$O\
--- a/pack.c
+++ b/pack.c
@@ -312,10 +312,6 @@
while(d != ed){
c = *d++;
- if(!c){
- werrstr("bad delta encoding");
- return -1;
- }
/* copy from base */
if(c & 0x80){
o = 0;
@@ -446,6 +442,10 @@
return -1;
l |= (c & 0x7f) << s;
s += 7;
+ }
+ if(l >= (1ULL << 32)){
+ werrstr("object too big");
+ return -1;
}
switch(t){
--- /dev/null
+++ b/repack.c
@@ -1,0 +1,518 @@
+#include <u.h>
+#include <libc.h>
+#include <pool.h>
+
+#include "git.h"
+
+#define TMPPATH(suff) (".git/objects/pack/repack."suff)
+
+typedef struct Objmeta Objmeta;
+typedef struct Objq Objq;
+typedef struct Buf Buf;
+typedef struct Compout Compout;
+
+struct Objmeta {
+ int type;
+ char *path;
+ vlong mtime;
+ Hash hash;
+
+ Object *obj;
+ Object *base;
+ Delta *delta;
+ int ndelta;
+};
+
+struct Objq {
+ Objq *next;
+ Object *obj;
+};
+
+struct Buf {
+ int off;
+ int sz;
+ uchar *data;
+};
+
+struct Compout {
+ Biobuf *bfd;
+ DigestState *st;
+};
+
+void
+usage(void)
+{
+ fprint(2, "usage: %s [-d]\n", argv0);
+ exits("usage");
+}
+
+static int
+dsortcmp(void *pa, void *pb)
+{
+ Objmeta *a, *b;
+ int cmp;
+
+ a = pa;
+ b = pb;
+ if(a->type != b->type)
+ return a->type - b->type;
+ cmp = strcmp(a->path, b->path);
+ if(cmp != 0)
+ return cmp;
+ if(a->mtime != b->mtime)
+ return a->mtime - b->mtime;
+ return memcmp(a->hash.h, b->hash.h, sizeof(a->hash.h));
+}
+
+static int
+timecmp(void *pa, void *pb)
+{
+ Objmeta *a, *b;
+
+ a = pa;
+ b = pb;
+ return b->mtime - a->mtime;
+}
+
+static void
+addmeta(Objmeta **m, int *nm, int type, Hash h, char *path, vlong mtime)
+{
+ *m = erealloc(*m, (*nm + 1)*sizeof(Objmeta));
+ memset(&(*m)[*nm], 0, sizeof(Objmeta));
+ (*m)[*nm].type = type;
+ (*m)[*nm].path = path;
+ (*m)[*nm].mtime = mtime;
+ (*m)[*nm].hash = h;
+ *nm += 1;
+}
+
+static int
+loadtree(Objmeta **m, int *nm, Hash tree, char *dpath, vlong mtime, Objset *has)
+{
+ Object *t, *o;
+ Dirent *e;
+ char *p;
+ int i, k;
+
+ if(oshas(has, tree))
+ return 0;
+ if((t = readobject(tree)) == nil)
+ return -1;
+// osadd(has, t);
+ addmeta(m, nm, t->type, t->hash, dpath, mtime);
+ for(i = 0; i < t->tree->nent; i++){
+ e = &t->tree->ent[i];
+ if(oshas(has, e->h))
+ continue;
+ if(e->ismod)
+ continue;
+ k = (e->mode & DMDIR) ? GTree : GBlob;
+ o = clearedobject(e->h, k);
+ p = smprint("%s/%s", dpath, e->name);
+ addmeta(m, nm, k, o->hash, p, mtime);
+// if(k == GBlob)
+// osadd(has, o);
+ if(k == GTree && loadtree(m, nm, e->h, p, mtime, has) == -1)
+ return -1;
+ osadd(has, o);
+ }
+ unref(t);
+ return 0;
+}
+
+static int
+loadcommit(Objmeta **m, int *nm, Hash h, Objset *has)
+{
+ Object *c, *p;
+ Objq *q, *e, *t, *n;
+ int i;
+
+ if((c = readobject(h)) == nil)
+ return -1;
+ if(c->type != GCommit)
+ sysfatal("object %H not commit", c->hash);
+ q = emalloc(sizeof(Objq));
+ e = q;
+ q->next = nil;
+ q->obj = c;
+ for(; q != nil; q = n){
+ c = q->obj;
+ if(oshas(has, c->hash))
+ goto nextiter;
+ osadd(has, c);
+ for(i = 0; i < c->commit->nparent; i++){
+ if((p = readobject(c->commit->parent[i])) == nil)
+ return -1;
+ t = emalloc(sizeof(Objq));
+ t->next = nil;
+ t->obj = p;
+ e->next = t;
+ e = t;
+ }
+ addmeta(m, nm, c->type, c->hash, estrdup(""), c->commit->ctime);
+ if(loadtree(m, nm, c->commit->tree, "", c->commit->ctime, has) == -1)
+ goto error;
+nextiter:
+ n = q->next;
+ unref(q->obj);
+ free(q);
+ }
+ return 0;
+error:
+ for(; q != nil; q = n) {
+ n = q->next;
+ free(q);
+ }
+ return -1;
+}
+
+static int
+readmeta(Objmeta **m, Hash *roots, int nroots, Hash * /*leaves */, int /*nleaves*/)
+{
+ Objset has;
+ int i, nm;
+
+ *m = nil;
+ nm = 0;
+ osinit(&has);
+ for(i = 0; i < nroots; i++)
+ if(loadcommit(m, &nm, roots[i], &has) == -1){
+ free(*m);
+ return -1;
+ }
+ return nm;
+}
+
+static int
+deltasz(Delta *d, int nd)
+{
+ int i, sz;
+ sz = 32;
+ for(i = 0; i < nd; i++)
+ sz += d[i].cpy ? 7 : d[i].len + 1;
+ return sz;
+}
+
+static void
+pickdeltas(Objmeta *meta, int nmeta)
+{
+ Objmeta *m, *p;
+ Object *a, *b;
+ Delta *d;
+ int i, nd, sz, best;
+
+ qsort(meta, nmeta, sizeof(Objmeta), dsortcmp);
+ for(i = 0; i < nmeta; i++){
+ m = &meta[i];
+ p = meta;
+ if(i > 10)
+ p = m - 10;
+ if((a = readobject(m->hash)) == nil)
+ sysfatal("missing object %H", m->hash);
+ best = a->size;
+ m->base = nil;
+ m->delta = nil;
+ m->ndelta = 0;
+ for(; p != m; p++){
+ if((b = readobject(p->hash)) == nil)
+ sysfatal("missing object %H", p->hash);
+ d = deltify(a->data, a->size, b->data, b->size, &nd);
+ sz = deltasz(d, nd);
+ if(sz + 32 < best){
+ free(m->delta);
+ best = sz;
+ m->base = b;
+ m->delta = d;
+ m->ndelta = nd;
+ }else
+ free(d);
+ unref(b);
+ }
+ unref(a);
+ }
+}
+
+static int
+hwrite(Biobuf *b, void *buf, int len, DigestState **st)
+{
+ *st = sha1(buf, len, nil, *st);
+ return Bwrite(b, buf, len);
+}
+
+int
+compread(void *p, void *dst, int n)
+{
+ Buf *b;
+
+ b = p;
+ if(n > b->sz - b->off)
+ n = b->sz - b->off;
+ memcpy(dst, b->data + b->off, n);
+ b->off += n;
+ return n;
+}
+
+int
+compwrite(void *p, void *buf, int n)
+{
+ return hwrite(((Compout *)p)->bfd, buf, n, &((Compout*)p)->st);
+}
+
+int
+hcompress(Biobuf *bfd, void *buf, int sz, DigestState **st)
+{
+ int r;
+ Buf b ={
+ .off=0,
+ .data=buf,
+ .sz=sz,
+ };
+ Compout o = {
+ .bfd = bfd,
+ .st = *st,
+ };
+
+ r = deflatezlib(&o, compwrite, &b, compread, 6, 0);
+ *st = o.st;
+ return r;
+}
+
+void
+append(char **p, int *len, int *sz, void *seg, int nseg)
+{
+ if(*len + nseg >= *sz){
+ while(*len + nseg >= *sz)
+ *sz += *sz/2;
+ *p = erealloc(*p, *sz);
+ }
+ memcpy(*p + *len, seg, nseg);
+ *len += nseg;
+}
+
+int
+encodedelta(Objmeta *m, Object *o, Object *b, void **pp)
+{
+ char *p, *bp, buf[16];
+ int len, sz, n, i, j;
+ Delta *d;
+
+ sz = 128;
+ len = 0;
+ p = emalloc(sz);
+
+ /* base object size */
+ buf[0] = b->size & 0x7f;
+ n = b->size >> 7;
+ for(i = 1; n > 0; i++){
+ buf[i - 1] |= 0x80;
+ buf[i] = n & 0x7f;
+ n >>= 7;
+ }
+ append(&p, &len, &sz, buf, i);
+
+ /* target object size */
+ buf[0] = o->size & 0x7f;
+ n = o->size >> 7;
+ for(i = 1; n > 0; i++){
+ buf[i - 1] |= 0x80;
+ buf[i] = n & 0x7f;
+ n >>= 7;
+ }
+ append(&p, &len, &sz, buf, i);
+ for(j = 0; j < m->ndelta; j++){
+ d = &m->delta[j];
+ if(d->cpy){
+ n = d->off;
+ bp = buf + 1;
+ buf[0] = 0x81;
+ buf[1] = 0x00;
+ for(i = 0; i < sizeof(buf); i++) {
+ buf[0] |= 1<<i;
+ *bp++ = n & 0xff;
+ n >>= 8;
+ if(n == 0)
+ break;
+ }
+
+ n = d->len;
+ if(n != 0x10000) {
+ buf[0] |= 0x1<<4;
+ for(i = 0; i < sizeof(buf)-4 && n > 0; i++){
+ buf[0] |= 1<<(i + 4);
+ *bp++ = n & 0xff;
+ n >>= 8;
+ }
+ }
+ append(&p, &len, &sz, buf, bp - buf);
+ }else{
+ n = 0;
+ while(n != d->len){
+ buf[0] = (d->len - n < 127) ? d->len - n : 127;
+ append(&p, &len, &sz, buf, 1);
+ append(&p, &len, &sz, o->data + d->off + n, buf[0]);
+ n += buf[0];
+ }
+ }
+ }
+ *pp = p;
+ return len;
+}
+
+static int
+writepack(int fd, Objmeta *meta, int nmeta, Hash *h)
+{
+ int i, j, n, x, res, pcnt, len, ret;
+ DigestState *st;
+ Biobuf *bfd;
+ Objmeta *m;
+ Object *o;
+ char *p, buf[16];
+
+ st = nil;
+ ret = -1;
+ pcnt = 0;
+ if((fd = dup(fd, -1)) == -1)
+ return -1;
+ if((bfd = Bfdopen(fd, OWRITE)) == nil)
+ return -1;
+ if(hwrite(bfd, "PACK", 4, &st) == -1)
+ return -1;
+ PUTBE32(buf, 2);
+ if(hwrite(bfd, buf, 4, &st) == -1)
+ return -1;
+ PUTBE32(buf, nmeta);
+ if(hwrite(bfd, buf, 4, &st) == -1)
+ return -1;
+ qsort(meta, nmeta, sizeof(Objmeta), timecmp);
+ fprint(2, "deltifying %d objects: 0%%", nmeta);
+ for(i = 0; i < nmeta; i++){
+ m = &meta[i];
+ x = (i*100) / nmeta;
+ if(x > pcnt){
+ pcnt = x;
+ if(pcnt%10 == 0)
+ fprint(2, "\b\b\b\b%3d%%", pcnt);
+ }
+ if((o = readobject(m->hash)) == nil)
+ return -1;
+ if(m->delta == nil){
+ len = o->size;
+ buf[0] = o->type << 4;
+ buf[0] |= len & 0xf;
+ len >>= 4;
+ for(j = 1; len != 0; j++){
+ assert(j < sizeof(buf));
+ buf[j-1] |= 0x80;
+ buf[j] = len & 0x7f;
+ len >>= 7;
+ }
+ hwrite(bfd, buf, j, &st);
+ if(hcompress(bfd, o->data, o->size, &st) == -1)
+ goto error;
+ }else{
+ n = encodedelta(m, o, m->base, &p);
+ len = n;
+ buf[0] = GRdelta << 4;
+ buf[0] |= len & 0xf;
+ len >>= 4;
+ for(j = 1; len != 0; j++){
+ assert(j < sizeof(buf));
+ buf[j-1] |= 0x80;
+ buf[j] = len & 0x7f;
+ len >>= 7;
+ }
+ hwrite(bfd, buf, j, &st);
+ hwrite(bfd, m->base->hash.h, sizeof(m->base->hash.h), &st);
+ res = hcompress(bfd, p, n, &st);
+ free(p);
+ if(res == -1)
+ goto error;
+ }
+ unref(o);
+ }
+ fprint(2, "\b\b\b\b100%%\n");
+ sha1(nil, 0, h->h, st);
+ if(Bwrite(bfd, h->h, sizeof(h->h)) == -1)
+ goto error;
+ ret = 0;
+error:
+ if(Bterm(bfd) == -1)
+ return -1;
+ return ret;
+}
+
+int
+cleanup(Hash h)
+{
+ char newpfx[42], dpath[256], fpath[256];
+ int i, j, nd;
+ Dir *d;
+
+ snprint(newpfx, sizeof(newpfx), "%H.", h);
+ for(i = 0; i < 256; i++){
+ snprint(dpath, sizeof(dpath), ".git/objects/%02x", i);
+ if((nd = slurpdir(dpath, &d)) == -1)
+ continue;
+ for(j = 0; j < nd; j++){
+ snprint(fpath, sizeof(fpath), ".git/objects/%02x/%s", i, d[j].name);
+ remove(fpath);
+ }
+ remove(dpath);
+ free(d);
+ }
+ snprint(dpath, sizeof(dpath), ".git/objects/pack");
+ if((nd = slurpdir(dpath, &d)) == -1)
+ return -1;
+ for(i = 0; i < nd; i++){
+ if(strncmp(d[i].name, newpfx, strlen(newpfx)) == 0)
+ continue;
+ snprint(fpath, sizeof(fpath), ".git/objects/pack/%s", d[i].name);
+ remove(fpath);
+ }
+ return 0;
+}
+
+void
+main(int argc, char **argv)
+{
+ int fd, nmeta, nrefs;
+ Objmeta *meta;
+ Hash *refs, h;
+ char path[128];
+ Dir rn;
+
+ ARGBEGIN{
+ case 'd':
+ debug++;
+ break;
+ default:
+ usage();
+ }ARGEND;
+
+ gitinit();
+ refs = nil;
+ if((nrefs = listrefs(&refs)) == -1)
+ sysfatal("load refs: %r");
+ if((nmeta = readmeta(&meta, refs, nrefs, nil, 0)) == -1)
+ sysfatal("read object metadata: %r");
+ pickdeltas(meta, nmeta);
+ if((fd = create(TMPPATH("pack.tmp"), OWRITE, 0644)) == -1)
+ sysfatal("open %s: %r", TMPPATH("pack.tmp"));
+ if(writepack(fd, meta, nmeta, &h) == -1)
+ sysfatal("writepack: %r");
+ if(indexpack(TMPPATH("pack.tmp"), TMPPATH("idx.tmp"), h) == -1)
+ sysfatal("indexpack: %r");
+ close(fd);
+
+ nulldir(&rn);
+ rn.name = path;
+ snprint(path, sizeof(path), "%H.pack", h);
+ if(dirwstat(TMPPATH("pack.tmp"), &rn) == -1)
+ sysfatal("rename pack: %r");
+ snprint(path, sizeof(path), "%H.idx", h);
+ if(dirwstat(TMPPATH("idx.tmp"), &rn) == -1)
+ sysfatal("rename pack: %r");
+ if(cleanup(h) == -1)
+ sysfatal("cleanup: %r");
+ exits(nil);
+}
--- a/util.c
+++ b/util.c
@@ -6,6 +6,7 @@
Reprog *authorpat;
Hash Zhash;
+int debug;
Object*
emptydir(void)
@@ -241,4 +242,18 @@
while(e > s && isspace(*--e))
*e = 0;
return s;
+}
+
+int
+dprint(char *fmt, ...)
+{
+ va_list ap;
+ int n;
+
+ if(!debug)
+ return 0;
+ va_start(ap, fmt);
+ n = vfprint(2, fmt, ap);
+ va_end(ap);
+ return n;
}