ref: 5453bdf906a8b8827d0b4f49a50cea9fddca43af
dir: /src/runtime/bfile.c/
/***************** BFILE *******************/ /* * BFILE is a family of stream objects. * There are two kinds: endpoints (source/sink) and transducers. * The streams are typically bytes. * * FILE source/sink for stdio FILE handles, handles * buf source/sink where the read/write uses a memory buffer. * * lz77 transducer for LZ77 compression * put - compresses a byte * get - returns an decompressed byte * rle transducer for Run Length Encoding * put - compresses an ASCII character * get - returns a decompressed ASCII * bwt transducer for Burrows-Wheeler Transform * put - transforms a byte * get - untransforms a byte * utf8 transducer for UTF8 encoding * put - encodes a Unicode code point (int) to bytes * get - decodes a Unicode code point */ /**** Buffers for collecting data. */ struct bfbuffer { size_t size; /* total size of buffer */ size_t pos; /* current index into buffer */ uint8_t *buf; /* actual data */ }; void bfbuffer_init(struct bfbuffer *bf, size_t size) { bf->size = size; bf->pos = 0; bf->buf = MALLOC(size); if (!bf->buf) ERR("bfbuffer_init"); } void bfbuffer_snoc(struct bfbuffer *bf, int byte) { if (bf->pos >= bf->size) { bf->size *= 2; bf->buf = REALLOC(bf->buf, bf->size); if (!bf->buf) ERR("bfbuffer_snoc"); } bf->buf[bf->pos++] = byte; } void bfbuffer_free(struct bfbuffer *bf) { FREE(bf->buf); } /*********************************/ /* Sanity checking */ #define CHECKBFILE(p, f) do { if (p->getb != f) { ERR("CHECKBFILE"); } } while(0) /* BFILE will have different implementations, they all have these methods */ typedef struct BFILE { int (*getb)(struct BFILE*); void (*ungetb)(int, struct BFILE*); void (*putb)(int, struct BFILE*); void (*flushb)(struct BFILE*); void (*closeb)(struct BFILE*); } BFILE; static INLINE int getb(BFILE *p) { return p->getb(p); } static INLINE void ungetb(int c, BFILE *p) { p->ungetb(c, p); } static INLINE void putb(int c, BFILE *p) { p->putb(c, p); } static INLINE void closeb(BFILE *p) { p->closeb(p); } static INLINE void flushb(BFILE *p) { p->flushb(p); } void putsb(const char *str, struct BFILE *p) { char c; while ((c = *str++)) putb(c, p); } size_t readb(void *abuf, size_t size, BFILE *p) { uint8_t *buf = abuf; size_t s; for (s = 0; s < size; s++) { int c = getb(p); if (c < 0) break; buf[s] = c; } return s; } /* convert -n to a string, handles MINBOUND correctly */ void putnegb(value_t n, BFILE *p) { int c = '0' - n % 10; if (n <= -10) { putnegb(n / 10, p); } putb(c, p); } void putdecb(value_t n, BFILE *p) { if (n < 0) { putb('-', p); putnegb(n, p); } else { putnegb(-n, p); } } void putint32(value_t n, BFILE *p) { putb(n % 256, p); n /= 256; putb(n % 256, p); n /= 256; putb(n % 256, p); n /= 256; putb(n % 256, p); } value_t getint32(BFILE *p) { value_t b0 = getb(p); value_t b1 = getb(p); value_t b2 = getb(p); value_t b3 = getb(p); return ((((b3 * 256) + b2) * 256) + b1) * 256 + b0; } /***************** BFILE from/to memory buffer *******************/ struct BFILE_buffer { BFILE mets; size_t b_size; size_t b_pos; uint8_t *b_buffer; }; int getb_buf(BFILE *bp) { struct BFILE_buffer *p = (struct BFILE_buffer *)bp; CHECKBFILE(bp, getb_buf); if (p->b_pos >= p->b_size) return -1; return p->b_buffer[p->b_pos++]; } void putb_buf(int c, BFILE *bp) { struct BFILE_buffer *p = (struct BFILE_buffer *)bp; CHECKBFILE(bp, getb_buf); if (p->b_pos >= p->b_size) { p->b_size *= 2; p->b_buffer = realloc(p->b_buffer, p->b_size); if (!p->b_buffer) ERR("putb_buf"); } p->b_buffer[p->b_pos++] = c; } void ungetb_buf(int c, BFILE *bp) { struct BFILE_buffer *p = (struct BFILE_buffer *)bp; CHECKBFILE(bp, getb_buf); if (p->b_pos == 0) ERR("ungetb"); p->b_buffer[--p->b_pos] = (uint8_t)c; } void closeb_rd_buf(BFILE *bp) { CHECKBFILE(bp, getb_buf); FREE(bp); } void closeb_wr_buf(BFILE *bp) { CHECKBFILE(bp, getb_buf); FREE(bp); } void flushb_buf(BFILE *bp) { CHECKBFILE(bp, getb_buf); } struct BFILE* openb_rd_buf(uint8_t *buf, size_t len) { struct BFILE_buffer *p = MALLOC(sizeof(struct BFILE_buffer)); if (!p) memerr(); p->mets.getb = getb_buf; p->mets.ungetb = ungetb_buf; p->mets.putb = 0; p->mets.flushb = 0; p->mets.closeb = closeb_rd_buf; p->b_size = len; p->b_pos = 0; p->b_buffer = buf; return (struct BFILE*)p; } struct BFILE* openb_wr_buf(void) { struct BFILE_buffer *p = MALLOC(sizeof(struct BFILE_buffer)); if (!p) memerr(); p->mets.getb = getb_buf; /* Just to make CHECKFILE happy */ p->mets.ungetb = 0; p->mets.putb = putb_buf; p->mets.flushb = flushb_buf; p->mets.closeb = closeb_wr_buf; p->b_size = 1000; p->b_pos = 0; p->b_buffer = malloc(p->b_size); if (!p->b_buffer) ERR("openb_wr_buf"); return (struct BFILE*)p; } /* * Get the buffer used by writing. * This should be the last operation before closing, * since the buffer can move when writing. * The caller of openb_wr_buf() and get_buf() owns * the memory and must free it. */ void get_buf(struct BFILE *bp, uint8_t **bufp, size_t *lenp) { struct BFILE_buffer *p = (struct BFILE_buffer *)bp; CHECKBFILE(bp, getb_buf); *bufp = p->b_buffer; *lenp = p->b_pos; } #if WANT_STDIO /***************** BFILE via FILE *******************/ struct BFILE_file { BFILE mets; FILE *file; }; int getb_file(BFILE *bp) { struct BFILE_file *p = (struct BFILE_file *)bp; CHECKBFILE(bp, getb_file); return fgetc(p->file); } void ungetb_file(int c, BFILE *bp) { struct BFILE_file *p = (struct BFILE_file *)bp; CHECKBFILE(bp, getb_file); ungetc(c, p->file); } void putb_file(int c, BFILE *bp) { struct BFILE_file *p = (struct BFILE_file *)bp; CHECKBFILE(bp, getb_file); (void)fputc(c, p->file); } void flushb_file(BFILE *bp) { struct BFILE_file *p = (struct BFILE_file *)bp; CHECKBFILE(bp, getb_file); fflush(p->file); } void closeb_file(BFILE *bp) { struct BFILE_file *p = (struct BFILE_file *)bp; CHECKBFILE(bp, getb_file); fclose(p->file); FREE(p); } void freeb_file(BFILE *bp) { struct BFILE_file *p = (struct BFILE_file *)bp; CHECKBFILE(bp, getb_file); FREE(p); } BFILE * add_FILE(FILE *f) { struct BFILE_file *p = MALLOC(sizeof (struct BFILE_file)); if (!p) memerr(); p->mets.getb = getb_file; p->mets.ungetb = ungetb_file; p->mets.putb = putb_file; p->mets.flushb = flushb_file; p->mets.closeb = closeb_file; p->file = f; return (BFILE*)p; } #endif #if WANT_LZ77 /***************** BFILE via simple LZ77 decompression *******************/ struct BFILE_lz77 { BFILE mets; BFILE *bfile; /* underlying BFILE */ struct bfbuffer bf; int read; int numflush; }; int getb_lz77(BFILE *bp) { struct BFILE_lz77 *p = (struct BFILE_lz77*)bp; CHECKBFILE(bp, getb_lz77); if (p->bf.pos >= p->bf.size) return -1; return p->bf.buf[p->bf.pos++]; } void ungetb_lz77(int c, BFILE *bp) { struct BFILE_lz77 *p = (struct BFILE_lz77*)bp; CHECKBFILE(bp, getb_lz77); p->bf.pos--; } void putb_lz77(int b, BFILE *bp) { struct BFILE_lz77 *p = (struct BFILE_lz77*)bp; CHECKBFILE(bp, getb_lz77); bfbuffer_snoc(&p->bf, b); } /* Compress and write to output BFILE */ void flushb_lz77(BFILE *bp) { struct BFILE_lz77 *p = (struct BFILE_lz77*)bp; CHECKBFILE(bp, getb_lz77); /* If we have had a flush, and there is no new data then do nothing */ if (p->numflush++ && !p->bf.pos) return; uint8_t *obuf; size_t olen = lz77c(p->bf.buf, p->bf.pos, &obuf); putsb("LZ1", p->bfile); /* Version no */ putint32(olen, p->bfile); /* 32 bit length */ for (size_t i = 0; i < olen; i++) { putb(obuf[i], p->bfile); /* and the data */ } free(obuf); p->bf.pos = 0; flushb(p->bfile); } void closeb_lz77(BFILE *bp) { struct BFILE_lz77 *p = (struct BFILE_lz77*)bp; CHECKBFILE(bp, getb_lz77); if (!p->read) { /* We are in write mode, so compress and push it down */ flushb_lz77(bp); bfbuffer_free(&p->bf); } closeb(p->bfile); FREE(p); } BFILE * add_lz77_decompressor(BFILE *file) { struct BFILE_lz77 *p = MALLOC(sizeof(struct BFILE_lz77)); if (!p) memerr(); memset(p, 0, sizeof(struct BFILE_lz77)); p->mets.getb = getb_lz77; p->mets.ungetb = ungetb_lz77; p->mets.putb = 0; p->mets.flushb = 0; p->mets.closeb = closeb_lz77; p->read = 1; p->bfile = file; p->numflush = 0; /* First check version */ if (getb(file) != 'L' || getb(file) != 'Z' || getb(file) != '1') ERR("Bad LZ77 signature"); size_t size = getint32(file); /* then read size */ uint8_t *buf = MALLOC(size); /* temporary buffer for input */ if (!buf) memerr(); for(size_t i = 0; i < size; i++) { buf[i] = getb(file); /* and read data */ } p->bf.size = lz77d(buf, size, &p->bf.buf); /* decompress */ FREE(buf); p->bf.pos = 0; return (BFILE*)p; } BFILE * add_lz77_compressor(BFILE *file) { struct BFILE_lz77 *p = MALLOC(sizeof(struct BFILE_lz77)); if (!p) memerr(); memset(p, 0, sizeof(struct BFILE_lz77)); p->mets.getb = getb_lz77; p->mets.ungetb = 0; p->mets.putb = putb_lz77; p->mets.flushb = flushb_lz77; p->mets.closeb = closeb_lz77; p->read = 0; p->bfile = file; p->numflush = 0; bfbuffer_init(&p->bf, 25000); return (BFILE*)p; } #endif /* WANT_LZ77 */ #if WANT_RLE /***************** BFILE via RLE decompression *******************/ /* * Run Length Encoding for ASCII * Format * c - c one ASCII character * 0x80+n c - n repetitions of ASCII character c * 0x80+n 0x80+m c - n*128+m repetitions of ASCII character c * ... for longer run lengths * Non-ASCII (i.e., >= 128) have very poor encoding: * 0x81 c-0x80 */ struct BFILE_rle { BFILE mets; BFILE *bfile; /* underlying BFILE */ size_t count; int byte; int unget; int read; }; int get_rep(BFILE *in) { size_t n = 0; for(;;) { int c = getb(in); //fprintf(stderr,"get_rep %02x\n", c); if (c < 0) return -1; if (c < 128) { ungetb(c, in); return n; } n = n * 128 + c - 128; } } int getb_rle(BFILE *bp) { struct BFILE_rle *p = (struct BFILE_rle*)bp; CHECKBFILE(bp, getb_rle); if (p->unget >= 0) { int c = p->unget; p->unget = -1; return c; } if (p->count) { p->count--; return p->byte; } else { int n = get_rep(p->bfile); if (n < 0) { return -1; } else if (n == 1) { /* repetition count == 1 means a non-ASCII byte */ return getb(p->bfile) | 0x80; } else { p->count = n; p->byte = getb(p->bfile); return p->byte; } } } void ungetb_rle(int c, BFILE *bp) { struct BFILE_rle *p = (struct BFILE_rle*)bp; CHECKBFILE(bp, getb_rle); p->unget = c; } void put_rep(BFILE *out, size_t n) { if (n > 127) put_rep(out, n / 128); putb(n % 128 + 128, out); } void out_rle(struct BFILE_rle *p) { if (p->count > 2) { /* More than 2 repeating chars, it's worth compressing */ put_rep(p->bfile, p->count - 1); putb(p->byte, p->bfile); } else { while(p->count-- > 0) putb(p->byte, p->bfile); } } void putb_rle(int b, BFILE *bp) { struct BFILE_rle *p = (struct BFILE_rle*)bp; CHECKBFILE(bp, getb_rle); if (b & 0x80) { /* Encode non-ASCII as repetition count 1, followed by the byte with MSB removed */ out_rle(p); put_rep(p->bfile, 1); putb(b & 0x7f, p->bfile); p->count = 0; p->byte = -1; } else if (b == p->byte) { p->count++; } else { out_rle(p); p->count = 1; p->byte = b; } } void closeb_rle(BFILE *bp) { struct BFILE_rle *p = (struct BFILE_rle*)bp; CHECKBFILE(bp, getb_rle); if (!p->read) flushb(bp); closeb(p->bfile); } void flushb_rle(BFILE *bp) { struct BFILE_rle *p = (struct BFILE_rle*)bp; CHECKBFILE(bp, getb_rle); /* output last byte(s) */ out_rle(p); p->count = 0; flushb(p->bfile); } BFILE * add_rle_decompressor(BFILE *file) { struct BFILE_rle *p = MALLOC(sizeof(struct BFILE_rle)); if (!p) memerr(); p->mets.getb = getb_rle; p->mets.ungetb = ungetb_rle; p->mets.putb = 0; p->mets.flushb = 0; p->mets.closeb = closeb_rle; p->count = 0; p->unget = -1; p->bfile = file; p->read = 1; return (BFILE*)p; } BFILE * add_rle_compressor(BFILE *file) { struct BFILE_rle *p = MALLOC(sizeof(struct BFILE_rle)); if (!p) memerr(); p->mets.getb = getb_rle; p->mets.ungetb = 0; p->mets.putb = putb_rle; p->mets.flushb = flushb_rle; p->mets.closeb = closeb_rle; p->count = 0; p->byte = -1; p->bfile = file; p->read = 0; return (BFILE*)p; } #endif /* WANT_RLE */ #if WANT_BWT /***************** BFILE via Burrows-Wheeler Transform *******************/ /* */ struct BFILE_bwt { BFILE mets; BFILE *bfile; /* underlying BFILE */ size_t count; struct bfbuffer bf; int read; int numflush; }; int getb_bwt(BFILE *bp) { struct BFILE_bwt *p = (struct BFILE_bwt*)bp; CHECKBFILE(bp, getb_bwt); if (p->bf.pos >= p->bf.size) return -1; return p->bf.buf[p->bf.pos++]; } void ungetb_bwt(int c, BFILE *bp) { struct BFILE_bwt *p = (struct BFILE_bwt*)bp; CHECKBFILE(bp, getb_bwt); p->bf.pos--; } void putb_bwt(int b, BFILE *bp) { struct BFILE_bwt *p = (struct BFILE_bwt*)bp; CHECKBFILE(bp, getb_bwt); bfbuffer_snoc(&p->bf, b); } void closeb_bwt(BFILE *bp) { struct BFILE_bwt *p = (struct BFILE_bwt*)bp; CHECKBFILE(bp, getb_bwt); if (!p->read) flushb(bp); closeb(p->bfile); } /* Sort all rotations of buf, and the indices of the sorted strings in res. */ /* * |.......................................| * ^ ^ * a b * <- n -> * <- m -> * <- o -> */ static uint8_t *compar_arg; static size_t compar_len; int compar(const void *pa, const void *pb) { uint32_t a = *(uint32_t*)pa; uint32_t b = *(uint32_t*)pb; int r; if (a == b) return 0; if (a < b) { size_t n = compar_len - b; /* bytes until end of buffer */ r = memcmp(compar_arg + a, compar_arg + b, n); if (r) return r; size_t m = b - a; r = memcmp(compar_arg + a + n, compar_arg, m); if (r) return r; size_t o = a; return memcmp(compar_arg, compar_arg + m, o); } else { size_t n = compar_len - a; /* bytes until end of buffer */ r = memcmp(compar_arg + a, compar_arg + b, n); if (r) return r; size_t m = a - b; r = memcmp(compar_arg, compar_arg + b + n, m); if (r) return r; size_t o = a; return memcmp(compar_arg + m, compar_arg, o); } return 0; } void sort_buffer(uint8_t *buf, size_t buflen, uint32_t *res) { for(size_t i = 0; i < buflen; i++) res[i] = i; compar_arg = buf; compar_len = buflen; qsort(res, buflen, sizeof(uint32_t), compar); } uint32_t encode_bwt(uint8_t *data, size_t len, uint8_t *last) { uint32_t *res = malloc(len * sizeof(uint32_t)); if (!res) ERR("encode_bwt"); sort_buffer(data, len, res); uint32_t zero = 0; for(size_t i = 0; i < len; i++) { uint32_t offs = res[i]; last[i] = data[(offs + len - 1) % len]; if (offs == 0) zero = i; } return zero; } void flushb_bwt(BFILE *bp) { struct BFILE_bwt *p = (struct BFILE_bwt*)bp; CHECKBFILE(bp, getb_bwt); /* If we have had a flush, and there is no new data then do nothing */ if (p->numflush++ && !p->bf.pos) return; putsb("BW1", p->bfile); /* version no */ putint32(p->bf.pos, p->bfile); /* 32 bit length */ uint8_t *last = malloc(p->bf.pos); if (!last) ERR("flushb_bwt"); size_t zero = encode_bwt(p->bf.buf, p->bf.pos, last); putint32(zero, p->bfile); for(size_t i = 0; i < p->bf.pos; i++) putb(last[i], p->bfile); FREE(last); p->bf.pos = 0; flushb(p->bfile); } #define MAXBYTE 256 void decode_bwt(uint8_t *data, size_t len, uint8_t *odata, size_t zero) { size_t count[MAXBYTE]; uint32_t *pred = malloc(len * sizeof(uint32_t)); for(size_t i = 0; i < MAXBYTE; i++) { count[i] = 0; } for(size_t i = 0; i < len; i++) { pred[i] = count[data[i]]++; } size_t sum = 0; for(size_t i = 0; i < MAXBYTE; i++) { size_t s = count[i]; count[i] = sum; sum += s; } size_t i = zero; for(size_t j = len; j > 0; j--) { odata[j - 1] = data[i]; i = pred[i] + count[data[i]]; } } BFILE * add_bwt_decompressor(BFILE *file) { struct BFILE_bwt *p = MALLOC(sizeof(struct BFILE_bwt)); if (!p) memerr(); memset(p, 0, sizeof(struct BFILE_bwt)); p->mets.getb = getb_bwt; p->mets.ungetb = ungetb_bwt; p->mets.putb = 0; p->mets.flushb = 0; p->mets.closeb = closeb_bwt; p->read = 1; p->bfile = file; p->numflush = 0; /* First check version */ if (getb(file) != 'B' || getb(file) != 'W' || getb(file) != '1') ERR("Bad BWT signature"); size_t size = getint32(file); /* then read size */ uint32_t zero = getint32(file); uint8_t *buf = MALLOC(size); /* temporary buffer for input */ if (!buf) memerr(); for(size_t i = 0; i < size; i++) { buf[i] = getb(file); /* and read data */ } bfbuffer_init(&p->bf, size); decode_bwt(buf, size, p->bf.buf, zero); /* decode */ FREE(buf); return (BFILE*)p; } BFILE * add_bwt_compressor(BFILE *file) { struct BFILE_bwt *p = MALLOC(sizeof(struct BFILE_bwt)); if (!p) memerr(); p->mets.getb = getb_bwt; p->mets.ungetb = 0; p->mets.putb = putb_bwt; p->mets.flushb = flushb_bwt; p->mets.closeb = closeb_bwt; p->read = 0; p->bfile = file; p->numflush = 0; bfbuffer_init(&p->bf, 25000); return (BFILE*)p; } #endif /* WANT_BWT */ /***************** BFILE with UTF8 encode/decode *******************/ struct BFILE_utf8 { BFILE mets; BFILE *bfile; int unget; }; /* This is not right with WORD_SIZE==16 */ int getb_utf8(BFILE *bp) { struct BFILE_utf8 *p = (struct BFILE_utf8*)bp; CHECKBFILE(bp, getb_utf8); int c1, c2, c3, c4; /* Do we have an ungetb character? */ if (p->unget >= 0) { c1 = p->unget; p->unget = -1; return c1; } c1 = getb(p->bfile); if (c1 < 0) return -1; if ((c1 & 0x80) == 0) return c1; c2 = getb(p->bfile); if (c2 < 0) return -1; if ((c1 & 0xe0) == 0xc0) return ((c1 & 0x1f) << 6) | (c2 & 0x3f); c3 = getb(p->bfile); if (c3 < 0) return -1; if ((c1 & 0xf0) == 0xe0) return ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f); c4 = getb(p->bfile); if (c4 < 0) return -1; if ((c1 & 0xf8) == 0xf0) return ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f); ERR("getb_utf8"); } void ungetb_utf8(int c, BFILE *bp) { struct BFILE_utf8 *p = (struct BFILE_utf8*)bp; CHECKBFILE(bp, getb_utf8); if (p->unget >= 0) ERR("ungetb_utf8"); p->unget = c; } void putb_utf8(int c, BFILE *bp) { struct BFILE_utf8 *p = (struct BFILE_utf8 *)bp; CHECKBFILE(bp, getb_utf8); if (c < 0) ERR("putb_utf8: < 0"); if (c < 0x80) { putb(c, p->bfile); return; } if (c < 0x800) { putb(((c >> 6 ) ) | 0xc0, p->bfile); putb(((c ) & 0x3f) | 0x80, p->bfile); return; } if (c < 0x10000) { putb(((c >> 12) ) | 0xe0, p->bfile); putb(((c >> 6 ) & 0x3f) | 0x80, p->bfile); putb(((c ) & 0x3f) | 0x80, p->bfile); return; } if (c < 0x110000) { putb(((c >> 18) ) | 0xf0, p->bfile); putb(((c >> 12) & 0x3f) | 0x80, p->bfile); putb(((c >> 6 ) & 0x3f) | 0x80, p->bfile); putb(((c ) & 0x3f) | 0x80, p->bfile); return; } ERR("putb_utf8"); } void flushb_utf8(BFILE *bp) { struct BFILE_utf8 *p = (struct BFILE_utf8*)bp; CHECKBFILE(bp, getb_utf8); flushb(p->bfile); } void closeb_utf8(BFILE *bp) { struct BFILE_utf8 *p = (struct BFILE_utf8*)bp; CHECKBFILE(bp, getb_utf8); closeb(p->bfile); FREE(p); } BFILE * add_utf8(BFILE *file) { struct BFILE_utf8 *p = MALLOC(sizeof(struct BFILE_utf8)); if (!p) memerr(); p->mets.getb = getb_utf8; p->mets.ungetb = ungetb_utf8; p->mets.putb = putb_utf8; p->mets.flushb = flushb_utf8; p->mets.closeb = closeb_utf8; p->bfile = file; p->unget = -1; return (BFILE*)p; }