shithub: pdffs

ref: ecd40a88198f1a03b0e87a4f3066f944f09cf0d8
dir: pdffs/pdf.c

View raw version
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "pdf.h"

/* print formatter that pdfopen installs for the 'T' verb; its definition is not in this part of the file */
int Tfmt(Fmt *f);

/*
 * Bytes per entry of a pre-1.5 xref subsection.  Within an entry the
 * code below expects whitespace at offsets 10, 18 and 19 and the
 * 'n' (in use) / 'f' (free) marker at offset 17.
 */
enum {
	Xrefentsz = 20,
};

/*
 * pre-1.5 xref section reader
 * PDF>=1.5 may have BOTH (or either) old xref format and xref streams
 *
 * Reads one subsection of nxref entries from pdf->bio, starting at the
 * current read position, and merges it into pdf->xref.  Entry i
 * describes object number xref0+i:
 *  - 'n' updates the offset of an already known object, or appends
 *    the object if it was not known yet;
 *  - 'f' drops an already known object and is ignored otherwise.
 *
 * Returns 0 on success and -1 on failure.
 */
static int
xrefread(Pdf *pdf, int xref0, int nxref)
{
	int i, j, sz, n, newnxref;
	Xref xref;
	char *s, *e;
	Xref *x;

	if(nxref == 0)
		return 0;
	/* nxref comes from the file: refuse counts that would overflow the sizes computed below */
	if(nxref < 0 || nxref > 0x7fffffff/Xrefentsz ||
	   pdf->nxref > 0x7fffffff/(int)sizeof(Xref) - nxref){
		werrstr("xref subsection too big (%d)", nxref);
		return -1;
	}

	/* make room for the worst case: every entry is a new object */
	s = nil;
	if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil)
		goto err;
	pdf->xref = x;

	/* read the entire thing at once */
	sz = nxref*Xrefentsz;
	if((s = malloc(sz)) == nil)
		goto err;
	for(i = 0; i < sz; i += n){
		if((n = Bread(pdf->bio, s+i, sz-i)) < 1){
			/* presumably 0 means end of file with no error string set, so provide one */
			if(n == 0)
				werrstr("xref subsection truncated");
			goto err;
		}
	}

	/* store non-free objects only */
	memset(&xref, 0, sizeof(xref)); /* don't copy stack garbage into the table for fields not set below */
	newnxref = pdf->nxref;
	for(e = s, i = 0; i < nxref; i++, e += Xrefentsz){
		if(!isspace((unsigned char)e[10]) || !isspace((unsigned char)e[18]) || !isspace((unsigned char)e[19])){
			werrstr("invalid xref line (%d/%d)", i, nxref);
			goto err;
		}
		xref.id = xref0 + i;
		xref.off = strtoul(e, nil, 10);

		/* search in already existing xrefs, update if found */
		for(j = 0; j < pdf->nxref; j++){
			if(pdf->xref[j].id != xref.id)
				continue;
			if(e[17] == 'f') /* it was freed; id 0 marks the slot for removal below */
				pdf->xref[j].id = 0;
			else if(e[17] == 'n')
				pdf->xref[j].off = xref.off;
			break;
		}
		if(j >= pdf->nxref && e[17] == 'n') /* that's a new one, insert unless it's free */
			pdf->xref[newnxref++] = xref;
	}
	free(s);
	s = nil;

	/* scale down: squeeze out the slots marked with id 0 */
	for(i = j = 0; i < newnxref; i++){
		if(pdf->xref[i].id != 0)
			pdf->xref[j++] = pdf->xref[i];
	}
	if(j == 0){
		/*
		 * realloc with a size of 0 may free the block and return nil,
		 * which would leave pdf->xref dangling; release it explicitly.
		 */
		free(pdf->xref);
		pdf->xref = nil;
	}else if((x = realloc(pdf->xref, j*sizeof(Xref))) != nil)
		pdf->xref = x;
	/* a failed shrink is harmless: the old, larger block is still valid */
	pdf->nxref = j;

	return 0;
err:
	free(s);
	return -1;
}

/*
 * Parses the object at the current position of pdf->bio as the file
 * trailer and takes the "Root" and "Info" entries out of it.
 * The parsed object is released before returning.
 * Returns 0 on success, -1 if nothing could be parsed or the object
 * is not a dictionary.
 */
static int
trailerread(Pdf *pdf)
{
	Object *dict;
	int rc;

	rc = -1;
	dict = pdfobj(pdf, pdf->bio);
	if(dict != nil){
		if(dict->type == Odict){
			pdf->root = pdfref(dictget(dict, "Root"));
			pdf->info = pdfref(dictget(dict, "Info"));
			rc = 0;
		}else
			werrstr("isn't a dictionary");
	}
	/* released on every path, including when pdfobj gave back nil */
	pdfobjfree(dict);

	return rc;
}

/*
 * Opens a PDF that is read through b.
 *
 * Checks the "%PDF-x.y" header, looks for "startxref" in the last
 * sizeof(tmp)-1 bytes of the file and then reads what that offset
 * points to: a classic xref section (7.5.4) made of one or more
 * subsections and followed by the trailer (7.5.5), or an xref stream
 * object (7.5.8).
 *
 * Returns the new Pdf, or nil with the error string set.  On failure
 * the partially built Pdf goes through pdfclose, which also
 * terminates b once it has been attached to the Pdf.
 */
Pdf *
pdfopen(Biobuf *b)
{
	Pdf *pdf;
	Object *o;
	char tmp[64], *s, *x;
	int xref0; /* 7.5.4 xref subsection first object number */
	int nxref; /* 7.5.4 xref subsection number of objects */
	int xreftb; /* 7.5.4 xref table offset from the beginning of the file */
	int inxref; /* non-zero once the "xref" keyword has been seen */
	int i, n, off, w[3];
	Stream *stream;

	fmtinstall('T', Tfmt);

	o = nil;
	if((pdf = calloc(1, sizeof(*pdf))) == nil)
		goto err;
	pdf->bio = b;

	/* check header */
	if(Bread(b, tmp, 8) != 8 ||
	   strncmp(tmp, "%PDF-", 5) != 0 || !isdigit((unsigned char)tmp[5]) || tmp[6] != '.' || !isdigit((unsigned char)tmp[7])){
		werrstr("not a pdf");
		goto err;
	}

	/* 7.5.4, 7.5.8 xref table */

	/* read a block of data */
	n = sizeof(tmp)-1;
	Bseek(b, -n, 2);
	if(Bread(b, tmp, n) != n){
badtrailer:
		werrstr("invalid trailer");
		goto err;
	}
	tmp[n] = 0;

	/* search for a valid string that the block ends with */
	for(i = n-1, s = &tmp[i]; i > 0 && *s != 0; i--, s--);
	s++;

	/*
	 * find "startxref"
	 * x - s < 9 is the same test as x-8 < s+1, written so that no
	 * pointer before the start of tmp is ever formed
	 */
	if((x = strrchr(s, 'f')) == nil || !isws(x[1]) || x - s < 9 || memcmp(x-8, "startxref", 9) != 0)
		goto badtrailer;
	x++;
	if((xreftb = strtol(x, nil, 10)) < 1)
		goto badtrailer;

	/* read xref */
	if(Bseek(b, xreftb, 0) != xreftb){
		werrstr("xref position out of range");
		goto err;
	}
	inxref = 0;
morexref:
	off = Boffset(b);
	n = sizeof(tmp)-1;
	if((n = Bread(b, tmp, n)) < 16){
badxref:
		werrstr("invalid xref: %r");
		goto err;
	}
	tmp[n] = 0;
	if(memcmp(tmp, "xref", 4) == 0 || (inxref && isdigit((unsigned char)tmp[0]))){
		/*
		 * 7.5.4 xref
		 * only the first subsection follows the "xref" keyword; any
		 * further one starts directly with its "first count" header,
		 * so a digit seen after the keyword is another subsection and
		 * not an xref stream object
		 */
		x = tmp;
		if(!isdigit((unsigned char)tmp[0]))
			x += 4; /* skip the keyword */
		inxref = 1;
		xref0 = strtol(x, &x, 10);
		nxref = strtol(x, &x, 10);
		/* skip whitespace and move to the first entry of the subsection */
		for(; isws(*x) && x < tmp+n; x++);
		n = x-tmp+off;
		if(Bseek(b, n, 0) != n)
			goto badxref;
		if(xref0 >= 0 && nxref > 0 && xrefread(pdf, xref0, nxref) != 0)
			goto badxref;
		goto morexref; /* there could be more subsections, try it */
	}else if(memcmp(tmp, "trailer", 7) == 0){ /* 7.5.5 file trailer */
		/* move to the trailer dictionary: the keyword plus one separator byte */
		n = off + 8;
		if(Bseek(b, n, 0) != n || trailerread(pdf) != 0){
			werrstr("invalid trailer: %r");
			goto err;
		}
	}else if(isdigit((unsigned char)tmp[0])){ /* could be 7.5.8 xref stream (since PDF 1.5) */
		Bseek(b, xreftb, 0);
		if((o = pdfobj(pdf, b)) == nil || (stream = streamopen(o)) == nil){
			werrstr("failed to stream xref: %r");
			goto badxref;
		}
		if(dictints(o, "W", w, nelem(w)) != 3){
			streamclose(stream); /* don't leak the stream on the way out */
			werrstr("W isn't 3 elements");
			goto badxref;
		}
		streamclose(stream);
		/*
		 * NOTE(review): unlike trailerread, these are stored without
		 * pdfref and o is kept rather than freed on success --
		 * presumably root/info point into o; confirm.
		 * w is parsed but the stream entries themselves are not read here.
		 */
		pdf->root = dictget(o, "Root");
		pdf->info = dictget(o, "Info");
	}

	/* root is required, info is optional */
	if(pdf->root == nil){
		werrstr("no root");
		goto err;
	}

	return pdf;
err:
	werrstr("pdfopen: %r [at %p]", (void*)Boffset(b));
	pdfclose(pdf);
	pdfobjfree(o);
	return nil;
}

/*
 * Releases a Pdf: terminates its Biobuf if one is attached, then
 * frees the xref table and the Pdf itself.  A nil pdf is ignored.
 */
void
pdfclose(Pdf *pdf)
{
	if(pdf != nil){
		if(pdf->bio != nil)
			Bterm(pdf->bio);
		free(pdf->xref);
		free(pdf);
	}
}