ref: 3c7125f599580538d08a1380b80ebd1f2323a171
parent: 6794de8f796f03ac666c80cec116c60e33206ff4
author: Ori Bernstein <ori@eigenstate.org>
date: Sat Dec 12 23:23:59 EST 2020
packfiles: redo deltification Our deltification sucked a lot. It was very slow, and in addition it didn't deltify much. The new implementation is based around some ideas from FastCDC, and it actually works reasonably well. In addition, deltas between different object types are disabled, and commits aren't deltified because it makes many operations slow.
--- a/delta.c
+++ b/delta.c
@@ -4,22 +4,62 @@
#include "git.h"
enum {
- K = 3,
- Bconst = 42,
- Bmask = 0x7f,
- Bshift = 7,
- Hshift = 113,
- Hlast = 1692137473L,
+ Minchunk = 128,
+ Maxchunk = 8192,
+ Splitmask = (1<<8)-1,
+
};
+static u32int geartab[] = {
+ 0x67ed26b7, 0x32da500c, 0x53d0fee0, 0xce387dc7, 0xcd406d90, 0x2e83a4d4, 0x9fc9a38d, 0xb67259dc,
+ 0xca6b1722, 0x6d2ea08c, 0x235cea2e, 0x3149bb5f, 0x1beda787, 0x2a6b77d5, 0x2f22d9ac, 0x91fc0544,
+ 0xe413acfa, 0x5a30ff7a, 0xad6fdde0, 0x444fd0f5, 0x7ad87864, 0x58c5ff05, 0x8d2ec336, 0x2371f853,
+ 0x550f8572, 0x6aa448dd, 0x7c9ddbcf, 0x95221e14, 0x2a82ec33, 0xcbec5a78, 0xc6795a0d, 0x243995b7,
+ 0x1c909a2f, 0x4fded51c, 0x635d334b, 0x0e2b9999, 0x2702968d, 0x856de1d5, 0x3325d60e, 0xeb6a7502,
+ 0xec2a9844, 0x0905835a, 0xa1820375, 0xa4be5cab, 0x96a6c058, 0x2c2ccd70, 0xba40fce3, 0xd794c46b,
+ 0x8fbae83e, 0xc3aa7899, 0x3d3ff8ed, 0xa0d42b5b, 0x571c0c97, 0xd2811516, 0xf7e7b96c, 0x4fd2fcbd,
+ 0xe2fdec94, 0x282cc436, 0x78e8e95c, 0x80a3b613, 0xcfbee20c, 0xd4a32d1c, 0x2a12ff13, 0x6af82936,
+ 0xe5630258, 0x8efa6a98, 0x294fb2d1, 0xdeb57086, 0x5f0fddb3, 0xeceda7ce, 0x4c87305f, 0x3a6d3307,
+ 0xe22d2942, 0x9d060217, 0x1e42ed02, 0xb6f63b52, 0x4367f39f, 0x055cf262, 0x03a461b2, 0x5ef9e382,
+ 0x386bc03a, 0x2a1e79c7, 0xf1a0058b, 0xd4d2dea9, 0x56baf37d, 0x5daff6cc, 0xf03a951d, 0xaef7de45,
+ 0xa8f4581e, 0x3960b555, 0xffbfff6d, 0xbe702a23, 0x8f5b6d6f, 0x061739fb, 0x98696f47, 0x3fd596d4,
+ 0x151eac6b, 0xa9fcc4f5, 0x69181a12, 0x3ac5a107, 0xb5198fe7, 0x96bcb1da, 0x1b5ddf8e, 0xc757d650,
+ 0x65865c3a, 0x8fc0a41a, 0x87435536, 0x99eda6f2, 0x41874794, 0x29cff4e8, 0xb70efd9a, 0x3103f6e7,
+ 0x84d2453b, 0x15a450bd, 0x74f49af1, 0x60f664b1, 0xa1c86935, 0xfdafbce1, 0xe36353e3, 0x5d9ba739,
+ 0xbc0559ba, 0x708b0054, 0xd41d808c, 0xb2f31723, 0x9027c41f, 0xf136d165, 0xb5374b12, 0x9420a6ac,
+ 0x273958b6, 0xe6c2fad0, 0xebdc1f21, 0xfb33af8b, 0xc71c25cd, 0xe9a2d8e5, 0xbeb38a50, 0xbceb7cc2,
+ 0x4e4e73f0, 0xcd6c251d, 0xde4c032c, 0x4b04ac30, 0x725b8b21, 0x4eb8c33b, 0x20d07b75, 0x0567aa63,
+ 0xb56b2bb7, 0xc1f5fd3a, 0xcafd35ca, 0x470dd4da, 0xfe4f94cd, 0xfb8de424, 0xe8dbcf40, 0xfe50a37a,
+ 0x62db5b5d, 0xf32f4ab6, 0x2c4a8a51, 0x18473dc0, 0xfe0cbb6e, 0xfe399efd, 0xdf34ecc9, 0x6ccd5055,
+ 0x46097073, 0x139135c2, 0x721c76f6, 0x1c6a94b4, 0x6eee014d, 0x8a508e02, 0x3da538f5, 0x280d394f,
+ 0x5248a0c4, 0x3ce94c6c, 0x9a71ad3a, 0x8493dd05, 0xe43f0ab6, 0x18e4ed42, 0x6c5c0e09, 0x42b06ec9,
+ 0x8d330343, 0xa45b6f59, 0x2a573c0c, 0xd7fd3de6, 0xeedeab68, 0x5c84dafc, 0xbbd1b1a8, 0xa3ce1ad1,
+ 0x85b70bed, 0xb6add07f, 0xa531309c, 0x8f8ab852, 0x564de332, 0xeac9ed0c, 0x73da402c, 0x3ec52761,
+ 0x43af2f4d, 0xd6ff45c8, 0x4c367462, 0xd553bd6a, 0x44724855, 0x3b2aa728, 0x56e5eb65, 0xeaf16173,
+ 0x33fa42ff, 0xd714bb5d, 0xfbd0a3b9, 0xaf517134, 0x9416c8cd, 0x534cf94f, 0x548947c2, 0x34193569,
+ 0x32f4389a, 0xfe7028bc, 0xed73b1ed, 0x9db95770, 0x468e3922, 0x0440c3cd, 0x60059a62, 0x33504562,
+ 0x2b229fbd, 0x5174dca5, 0xf7028752, 0xd63c6aa8, 0x31276f38, 0x0646721c, 0xb0191da8, 0xe00e6de0,
+ 0x9eac1a6e, 0x9f7628a5, 0xed6c06ea, 0x0bb8af15, 0xf119fb12, 0x38693c1c, 0x732bc0fe, 0x84953275,
+ 0xb82ec888, 0x33a4f1b3, 0x3099835e, 0x028a8782, 0x5fdd51d7, 0xc6c717b3, 0xb06caf71, 0x17c8c111,
+ 0x61bad754, 0x9fd03061, 0xe09df1af, 0x3bc9eb73, 0x85878413, 0x9889aaf2, 0x3f5a9e46, 0x42c9f01f,
+ 0x9984a4f4, 0xd5de43cc, 0xd294daed, 0xbecba2d2, 0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba,
+};
+
+static u64int
+hash(void *p, int n)
+{
+ uchar buf[SHA1dlen];
+ sha1((uchar*)p, n, buf, nil);
+ return GETBE64(buf);
+}
+
static void
-addblk(Dtab *dt, void *buf, int len, int off, u64int rh)
+addblk(Dtab *dt, void *buf, int len, int off, u64int h)
{
int i, sz, probe;
Dblock *db;
- rh >>= Bshift;
- probe = rh % dt->sz;
+ probe = h % dt->sz;
while(dt->b[probe].buf != nil){
if(len == dt->b[probe].len && memcmp(buf, dt->b[probe].buf, len) == 0)
return;
@@ -29,7 +69,7 @@
dt->b[probe].buf = buf;
dt->b[probe].len = len;
dt->b[probe].off = off;
- dt->b[probe].rhash = rh;
+ dt->b[probe].hash = h;
dt->nb++;
if(dt->sz < 2*dt->nb){
sz = dt->sz;
@@ -39,58 +79,85 @@
dt->b = eamalloc(dt->sz, sizeof(Dblock));
for(i = 0; i < sz; i++)
if(db[i].buf != nil)
- addblk(dt, db[i].buf, db[i].len, db[i].off, db[i].rhash);
+ addblk(dt, db[i].buf, db[i].len, db[i].off, db[i].hash);
free(db);
}
}
static Dblock*
-findrough(Dtab *dt, u64int rh)
+lookup(Dtab *dt, uchar *p, int n)
{
int probe;
+ u64int h;
- rh >>= Bshift;
- for(probe = rh % dt->sz; dt->b[probe].buf != nil; probe = (probe + 1) % dt->sz)
- if(dt->b[probe].rhash == rh)
- return &dt->b[probe];
+ h = hash(p, n);
+ for(probe = h % dt->sz; dt->b[probe].buf != nil; probe = (probe + 1) % dt->sz){
+ if(dt->b[probe].hash != h)
+ continue;
+ if(n != dt->b[probe].len)
+ continue;
+ if(memcmp(p, dt->b[probe].buf, n) != 0)
+ continue;
+ return &dt->b[probe];
+ }
return nil;
}
static int
-nextblk(uchar *s, uchar *e, u64int *ph)
+nextblk(uchar *s, uchar *e)
{
- u64int rh;
+ u32int gh;
uchar *p;
- int i;
- p = s;
- rh = 0;
- for(i = 0; (rh & Bmask) != Bconst; i++){
- if(p == e)
+ if((e - s) < Minchunk)
+ return e - s;
+ p = s + Minchunk;
+ if((e - s) > Maxchunk)
+ e = s + Maxchunk;
+ gh = 0;
+ while(p != e){
+ gh = (gh<<1) + geartab[*p++];
+ if((gh & Splitmask) == 0)
break;
- /* FIXME: better hash */
- rh *= Hshift;
- rh += *p++ + K;
}
- *ph = rh;
return p - s;
}
-static int
-sameblk(Dblock *b, uchar *s, uchar *e)
+void
+dtinit(Dtab *dt, void *base, int nbase)
{
- if(b->len != e - s)
- return 0;
- return memcmp(b->buf, s, b->len) == 0;
+ uchar *s, *e;
+ u64int h;
+ vlong n, o;
+
+ o = 0;
+ s = base;
+ e = s + nbase;
+ dt->nb = 0;
+ dt->sz = 128;
+ dt->b = eamalloc(dt->sz, sizeof(Dblock));
+ dt->base = base;
+ dt->nbase = nbase;
+ while(s != e){
+ n = nextblk(s, e);
+ h = hash(s, n);
+ addblk(dt, s, n, o, h);
+ s += n;
+ o += n;
+ }
}
+void
+dtclear(Dtab *dt)
+{
+ free(dt->b);
+}
+
static int
emitdelta(Delta **pd, int *nd, int cpy, int off, int len)
{
Delta *d;
- if(len == 0)
- return 0;
*nd += 1;
*pd = erealloc(*pd, *nd * sizeof(Delta));
d = &(*pd)[*nd - 1];
@@ -100,81 +167,52 @@
return len;
}
-void
-dtinit(Dtab *dt, void *base, int nbase)
+static int
+stretch(Dtab *dt, Dblock *b, uchar *s, uchar *e, int n)
{
- uchar *bp, *s, *e;
- u64int rh;
-
- bp = base;
- s = bp;
- e = bp;
- rh = 0;
- dt->nb = 0;
- dt->sz = 128;
- dt->b = eamalloc(dt->sz, sizeof(Dblock));
- while(e != bp + nbase){
- e += nextblk(s, bp + nbase, &rh);
- addblk(dt, s, e - s, s - bp, rh);
- s = e;
+ uchar *p, *q, *eb;
+
+ if(b == nil)
+ return n;
+ p = s + n;
+ q = dt->base + b->off + n;
+ eb = dt->base + dt->nbase;
+ while(n < (1<<24)){
+ if(p == e || q == eb)
+ break;
+ if(*p != *q)
+ break;
+ p++;
+ q++;
+ n++;
}
+ return n;
}
-void
-dtclear(Dtab *dt)
-{
- free(dt->b);
-}
-
Delta*
deltify(void *targ, int ntarg, Dtab *dt, int *pnd)
{
- Dblock *k;
Delta *d;
- uchar *l, *s, *e, *eb, *tp;
- int i, nd, nb;
- u64int rh;
-
-
- tp = targ;
- l = targ;
- s = targ;
- e = targ;
+ Dblock *b;
+ uchar *s, *e;
+ vlong n, o;
+
+ o = 0;
d = nil;
- nd = 0;
- rh = 0;
- e += nextblk(s, tp + ntarg, &rh);
- while(1){
- if((rh & Bmask) == Bconst && (k = findrough(dt, rh)) != nil){
- if(sameblk(k, s, e)){
- nb = k->len;
- eb = k->buf + k->len;
- /* stretch the block: 1<<24 is the max packfiles support. */
- for(i = 0; i < (1<<24) - nb; i++){
- if(e == tp + ntarg || eb == dt->base + dt->nbase)
- break;
- if(*e != *eb)
- break;
- nb++;
- eb++;
- e++;
- }
- emitdelta(&d, &nd, 0, l - tp, s - l);
- emitdelta(&d, &nd, 1, k->off, nb);
- s = e;
- l = e;
- e += nextblk(s, tp + ntarg, &rh);
- continue;
- }
- }
- if(e == tp + ntarg)
- break;
- /* got a better hash? apply within! */
- rh -= *s++ * Hlast;
- rh *= Hshift;
- rh += *e++ + K;
+ s = targ;
+ e = s + ntarg;
+ *pnd = 0;
+ while(s != e){
+ n = nextblk(s, e);
+ b = lookup(dt, s, n);
+ n = stretch(dt, b, s, e, n);
+ if(b != nil)
+ emitdelta(&d, pnd, 1, b->off, n);
+ else
+ emitdelta(&d, pnd, 0, o, n);
+ s += n;
+ o += n;
}
- emitdelta(&d, &nd, 0, l - tp, tp + ntarg - l);
- *pnd = nd;
+ assert(o == ntarg);
return d;
}
--- a/git.h
+++ b/git.h
@@ -162,7 +162,7 @@
uchar *buf;
int len;
int off;
- u64int rhash;
+ u64int hash;
};
struct Delta {
--- a/pack.c
+++ b/pack.c
@@ -7,6 +7,7 @@
typedef struct Buf Buf;
typedef struct Objmeta Objmeta;
typedef struct Compout Compout;
+typedef struct Packf Packf;
#define max(x, y) ((x) > (y) ? (x) : (y))
@@ -42,6 +43,13 @@
char *data;
};
+struct Packf {
+ char *path;
+ int refs;
+ int idxfd;
+ int packfd;
+};
+
static int readpacked(Biobuf *, Object *, int);
static Object *readidxobject(Biobuf *, Hash, int);
@@ -50,6 +58,8 @@
Object *lrutail;
int ncache;
int cachemax = 1024;
+Packf *packf;
+int npackf;
static void
clear(Object *o)
@@ -150,7 +160,81 @@
}
}
+static Packf*
+findpackf(char *path)
+{
+ int np, pfd, ifd;
+ char buf[192];
+ Packf *pf;
+ np = strlen(path);
+ if(np > 4 && strcmp(path + np - 4, ".idx") == 0)
+ np -= 4;
+ else if(np > 5 && strcmp(path + np - 5, ".pack") == 0)
+ np -= 5;
+ else{
+ sysfatal("invalid pack path %s", path);
+ return nil;
+ }
+
+ for(pf = &packf[0]; pf != &packf[npackf]; pf++)
+ if(strncmp(pf->path, path, np) == 0 && strlen(pf->path) == np)
+ return pf;
+ snprint(buf, sizeof(buf), ".git/objects/pack/%.*s.idx", np, path);
+ ifd = open(buf, OREAD);
+ snprint(buf, sizeof(buf), ".git/objects/pack/%.*s.pack", np, path);
+ pfd = open(buf, OREAD);
+ if(ifd == -1 || pfd == -1)
+ goto error;
+ packf = earealloc(packf, ++npackf, sizeof(Packf));
+ pf = &packf[npackf-1];
+ pf->path = smprint("%.*s", np, path);
+ pf->idxfd = ifd;
+ pf->packfd = pfd;
+ return pf;
+
+error:
+ print("pack open error for %s: %r\n", path);
+ if(ifd != -1) close(ifd);
+ if(pfd != -1) close(pfd);
+ return nil;
+}
+
+static Biobuf*
+packopen(char *path)
+{
+ Biobuf *bfd;
+ Packf *pf;
+
+ if((pf = findpackf(path)) == nil)
+ return nil;
+ bfd = emalloc(sizeof(Biobuf));
+ Binit(bfd, pf->packfd, OREAD);
+ return bfd;
+
+
+}
+
+static Biobuf*
+idxopen(char *path)
+{
+ Biobuf *bfd;
+ Packf *pf;
+
+ if((pf = findpackf(path)) == nil)
+ return nil;
+ bfd = emalloc(sizeof(Biobuf));
+ Binit(bfd, pf->idxfd, OREAD);
+ return bfd;
+}
+
+static void
+pfclose(Biobuf *f)
+{
+ Bterm(f);
+ free(f);
+}
+
static u32int
crc32(u32int crc, char *b, int nb)
{
@@ -888,29 +972,23 @@
l = strlen(d[i].name);
if(l > 4 && strcmp(d[i].name + l - 4, ".idx") != 0)
continue;
- snprint(path, sizeof(path), ".git/objects/pack/%s", d[i].name);
- if((f = Bopen(path, OREAD)) == nil)
+ if((f = idxopen(d[i].name)) == nil)
continue;
o = searchindex(f, h);
- Bterm(f);
- if(o == -1)
- continue;
- break;
+ pfclose(f);
+ if(o != -1)
+ break;
}
-
if (o == -1)
goto error;
- if((n = snprint(path, sizeof(path), "%s", path)) >= sizeof(path) - 4)
+ if((f = packopen(d[i].name)) == nil)
goto error;
- memcpy(path + n - 4, ".pack", 6);
- if((f = Bopen(path, OREAD)) == nil)
- goto error;
if(Bseek(f, o, 0) == -1)
goto errorf;
if(readpacked(f, obj, flag) == -1)
goto errorf;
- Bterm(f);
+ pfclose(f);
parseobject(obj);
free(d);
cache(obj);
@@ -1069,7 +1147,8 @@
}
nvalid = n;
}
- fprint(2, "\b\b\b\b100%%\n");
+ if(interactive)
+ fprint(2, "\b\b\b\b100%%\n");
Bterm(f);
st = nil;
@@ -1294,19 +1373,21 @@
for(i = 0; i < nmeta; i++){
m = meta[i];
pct = showprogress((i*100) / nmeta, pct);
- if((a = readobject(m->obj->hash)) == nil)
- sysfatal("readobject %H: %r", m->obj->hash);
- best = a->size;
m->base = nil;
m->delta = nil;
m->ndelta = 0;
+ if(m->obj->type == GCommit || m->obj->type == GTag)
+ continue;
+ if((a = readobject(m->obj->hash)) == nil)
+ sysfatal("readobject %H: %r", m->obj->hash);
+ best = a->size;
dtinit(&m->dtab, a->data, a->size);
if(i >= 11)
dtclear(&meta[i-11]->dtab);
for(j = max(0, i - 10); j < i; j++){
p = meta[j];
- /* lets not make the chains too long */
- if(p->nchain >= 4095)
+ /* long chains make unpacking slow */
+ if(p->nchain >= 128 || p->obj->type != a->type)
continue;
if((b = readobject(p->obj->hash)) == nil)
sysfatal("readobject %H: %r", p->obj->hash);
--- a/util.c
+++ b/util.c
@@ -317,7 +317,7 @@
return 0;
if(x > pct){
pct = x;
- fprint(1, "\b\b\b\b%3d%%", pct);
+ fprint(2, "\b\b\b\b%3d%%", pct);
}
return pct;
}