shithub: MicroHs

ref: 6c47eba740757adbe3cbdcd61645cadeb125c47d
dir: /src/runtime/lz77.c/

View raw version
#include <inttypes.h>
#include <stdlib.h>
#if 0
#include <stdio.h>
#endif

#define MAX(x,y) ((x) > (y) ? (x) : (y))
#define MIN(x,y) ((x) < (y) ? (x) : (y))

#define MAXWIN 8192
#define MAXLEN (9 + 255)
#define MINMATCH 3
#define MINOFFS 1

/*
 * Encoding inspired by FastLZ
 *  000nnnnn                      - literal run follows, with n+1 bytes
 *  111ooooo pppppppp nnnnnnnn    - at offset o*256+p, copy n+9 bytes (maximum offset is 8191)
 *  nnnooooo pppppppp             - at offset o*256+p, copy n+2 bytes (n != 0, n != 7)
 * Match length is >= 3.
 */

static void
put(uint8_t **bufp, size_t *sizep, size_t *offsp, uint8_t byte)
{
  if (*offsp >= *sizep) {
    *sizep *= 2;
    *bufp = realloc(*bufp, *sizep);
  }
  (*bufp)[(*offsp)++] = byte;
}
#define PUT(x) put(&outbuf, &outsize, &outoffs, (x))

/* dst must be big enough to hold the decompressed result */
size_t
lz77d(uint8_t *src, size_t srclen, uint8_t **bufp)
{
  uint8_t *end = src + srclen;
  size_t outsize = 100000;
  uint8_t *outbuf = malloc(outsize);
  size_t outoffs = 0;

  while (src < end) {
    int op = *src++;
    int opx = op & 0x1f;        /* part of offset, or literal length */
    op >>= 5;
    if (op == 0) {
      /* 000nnnnn */
      do {
        PUT(*src++);
      } while (--opx >= 0);
    } else {
      size_t len;
      size_t offs = opx * 256 + *src++ + MINOFFS;
      if (op == 7) {
        /* 111ooooo pppppppp nnnnnnnn */
        len = 9 + *src++;
      } else {
        /* nnnooooo pppppppp */
        len = 2 + op;
      }
      //printf("match cur=%d offs=%d len=%d\n", (int)(dst - adst), (int)offs, (int)len);
      offs = outoffs - offs;
      for (size_t i = 0; i < len; i++) {
        uint8_t b = outbuf[offs + i];
        PUT(b);
      }
    }
  }
  *bufp = outbuf;
  return outoffs;
}

static INLINE size_t
match(uint8_t *src, uint8_t *win, uint8_t *end)
{
  size_t n;

  for(n = 0; *src == *win && src < end && n < MAXLEN; src++, win++, n++)
    ;
  return n;
}

/* XXX finding the match really needs some clever speedup */
size_t
lz77c(uint8_t *src, size_t srclen, uint8_t **bufp)
{
  uint8_t *cur;
  uint8_t *end = src + srclen;
  size_t outsize = 25000;
  uint8_t *outbuf = malloc(outsize);
  size_t outoffs = 0;

  for (cur = src; cur < end; ) {
    size_t win_end = cur - src;
    size_t win_len = MIN(win_end, MAXWIN);
    //size_t max_len = MIN(MAXLEN, end - cur);
    /* Inefficient match loop */
    size_t match_len = MINMATCH-1;
    size_t match_offs = 0;
    /* Compression is slow, since we use brute force to find a match. */
    for (size_t offs = MINOFFS; offs < win_len; offs++) {
      size_t n = match(cur, cur - offs, end);
      if (n > match_len) {
        match_len = n;
        match_offs = offs;
      }
    }
    if (match_len >= MINMATCH) {
      //printf("match cur=%d offs=%d len=%d str=%.*s\n", (int)(cur-src), (int)match_offs, (int)match_len, (int)match_len, cur);
      //printf("match cur=%d offs=%d len=%d\n", (int)(cur-src), (int)match_offs, (int)match_len);
      /* found a match */
      cur += match_len;
      match_offs -= MINOFFS;
      match_len -= 2;
      int hi = match_offs >> 8;
      int lo = match_offs & 0xff;
      if (match_len < 7) {
        PUT((match_len << 5) + hi);
        PUT(lo);
      } else {
        PUT((7 << 5) + hi);
        PUT(lo);
        PUT(match_len - 7);
      }
    } else {
      /* generate a literal */
      /* how long should it be?  3 seems to be a sweet spot */
      size_t len = MIN(3, end - cur);
      PUT(len - 1);
      for (size_t i = 0; i < len; i++) {
        PUT(*cur++);
      }
    }
  }
  *bufp = outbuf;
  return outoffs;
}

#if 0
int
main(int argc, char **argv)
{
  FILE *fi = fopen(argv[1], "r");
  FILE *fo = fopen(argv[2], "w");
  int dec = argc > 3;
  fseek(fi, 0, SEEK_END);
  size_t ilen = ftell(fi);
  size_t olen;
  fseek(fi, 0, SEEK_SET);
  uint8_t *ibuf = malloc(ilen);
  uint8_t *obuf;
  size_t n = fread(ibuf, 1, ilen, fi);
  if (n != ilen) exit(1);
  if (dec)
    olen = lz77d(ibuf, ilen, &obuf);
  else
    olen = lz77c(ibuf, ilen, &obuf);
  fwrite(obuf, olen, 1, fo);
  exit(0);
}
#endif