shithub: pdffs

Download patch

ref: 474117ed563f8f84f11d3dcf90635c584be29ec1
parent: 17128cefa8384e9433de8a725686b4e544a83308
author: Sigrid Haflínudóttir <ftrvxmtrx@gmail.com>
date: Mon Aug 31 12:27:10 EDT 2020

move xref logic into a separate file

--- a/misc.c
+++ b/misc.c
@@ -72,3 +72,15 @@
 		c == '[' || c == ']' || c == '{' || c == '}' ||
 		c == '/' || c == '%';
 }
+
+/* read a number from b and store it, truncated, in *i; returns 1, or -1 if Bgetd does not return 1, the value is NaN, or it does not fit a 32-bit int (that conversion would be undefined) */
+int
+Bgetint(Biobuf *b, int *i)
+{
+	double d;
+
+	if(Bgetd(b, &d) != 1 || isNaN(d) || d <= -2147483649.0 || d >= 2147483648.0)
+		return -1;
+	*i = d;
+	return 1;
+}
--- a/mkfile
+++ b/mkfile
@@ -18,6 +18,7 @@
 	pdffs.$O\
 	stream.$O\
 	string.$O\
+	xref.$O\
 
 HFILES=\
 	pdf.h\
--- a/pdf.c
+++ b/pdf.c
@@ -4,80 +4,7 @@
 #include <ctype.h>
 #include "pdf.h"
 
-int Tfmt(Fmt *f);
-int ⊗fmt(Fmt *f);
-
-/*
- * pre-1.5 xref section reader
- * PDF>=1.5 may have BOTH (or either) old xref format and xref streams
- */
 static int
-xrefread(Pdf *pdf, int xref0, int nxref)
-{
-	int i, j, sz, n, newnxref;
-	Xref xref;
-	char *s, *e;
-	Xref *x;
-
-	s = nil;
-	if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil)
-		goto err;
-	pdf->xref = x;
-
-	/* read the entire thing at once */
-	sz = nxref*20;
-	if((s = malloc(sz)) == nil)
-		goto err;
-	for(i = 0; i < sz; i += n){
-		if((n = Bread(pdf->bio, s+i, sz-i)) < 1)
-			goto err;
-	}
-
-	/* store non-free objects only */
-	newnxref = pdf->nxref;
-	for(e = s, i = 0; i < nxref; i++, e += 20){
-		if(!isspace(e[10]) || !isspace(e[18]) || !isspace(e[19])){
-			werrstr("invalid xref line (%d/%d)", i, nxref);
-			goto err;
-		}
-		xref.id = xref0 + i;
-		xref.off = strtoul(e, nil, 10);
-		/* FIXME xref.gen */
-		xref.type = Xusual;
-
-		/* search in already existing xrefs, update if found */
-		for(j = 0; j < pdf->nxref; j++){
-			if(pdf->xref[j].id != xref.id)
-				continue;
-			if(e[17] == 'f') /* it was freed */
-				pdf->xref[j].id = 0;
-			else if(e[17] == 'n')
-				pdf->xref[j].off = xref.off;
-			break;
-		}
-		if(j >= pdf->nxref && e[17] == 'n') /* that's a new one, insert unless it's free */
-			pdf->xref[newnxref++] = xref;
-	}
-	free(s);
-	s = nil;
-
-	/* scale down */
-	for(i = j = 0; i < newnxref; i++){
-		if(pdf->xref[i].id != 0)
-			pdf->xref[j++] = pdf->xref[i];
-	}
-	if((x = realloc(pdf->xref, j*sizeof(Xref))) == nil)
-		goto err;
-	pdf->xref = x;
-	pdf->nxref = j;
-
-	return 0;
-err:
-	free(s);
-	return -1;
-}
-
-static int
 trailerread(Pdf *pdf)
 {
 	Object *o;
@@ -100,104 +27,6 @@
 	return -1;
 }
 
-static int
-getint(uchar *b, int sz, int dflt)
-{
-	int x, i;
-
-	if(sz == 0)
-		return dflt;
-	x = 0;
-	for(i = 0; i < sz; i++)
-		x = x<<8 | b[i];
-
-	return x;
-}
-
-/* 7.5.8.3 */
-static int
-xrefstreamread(Pdf *pdf)
-{
-	Object *o;
-	Stream *s;
-	Xref *x;
-	uchar buf[32];
-	int w[8], nw, i, c, n, nxref, newnxref, prev, extra;
-
-	s = nil;
-	if((o = pdfobj(pdf, pdf->bio)) == nil){
-		werrstr("xref stream obj: %r");
-		goto err;
-	}
-	if((prev = dictint(o, "Prev")) > 0){
-		if(Bseek(pdf->bio, prev, 0) != prev){
-			werrstr("xref stream prev seek failed");
-			goto err;
-		}
-		if(xrefstreamread(pdf) != 0){
-			pdfobjfree(o);
-			return -1;
-		}
-	}
-	if((s = streamopen(o)) == nil){
-		werrstr("failed to stream xref: %r");
-		goto err;
-	}
-	if((nw = dictints(o, "W", w, nelem(w))) < 3 || nw >= nelem(w)){
-		werrstr("nW=%d", nw);
-		goto err;
-	}
-
-	for(n = i = 0; i < nw; i++)
-		n += w[i]; /* size of each element. w[i] MAY be 0 */
-	if(n > sizeof(buf)){
-		werrstr("W is beyond imaginable: %d bytes", n);
-		goto err;
-	}
-	if((nxref = streamsize(s)/n) < 1){
-		werrstr("no xref elements in the stream");
-		goto err;
-	}
-	extra = streamsize(s) % (nxref*n);
-	if(extra != 0)
-		fprint(2, "extra %d bytes in xref stream", extra);
-
-	newnxref = pdf->nxref + nxref;
-	if((x = realloc(pdf->xref, newnxref*sizeof(Xref))) == nil)
-		goto err;
-	pdf->xref = x;
-	x += pdf->nxref;
-	while(Bread(s->bio, buf, n) == n){ /* stop on short read or error */
-		c = getint(buf, w[0], 1); /* default type is 1 */
-		if(c == 1){ /* not compressed */
-			x->off = getint(buf+w[0], w[1], 0);
-			x->gen = getint(buf+w[0]+w[1], w[2], 0);
-			x->type = Xuncompressed;
-			pdf->nxref++;
-			fprint(2, "xref %⊗\n", *x);
-			x++;
-		}else if(c == 2){ /* compressed */
-			x->objnum = getint(buf+w[0], w[1], 0);
-			x->id = getint(buf+w[0]+w[1], w[2], 0);
-			x->type = Xcompressed;
-			pdf->nxref++;
-			fprint(2, "xref %⊗\n", *x);
-			x++;
-		}
-	}
-
-	streamclose(s);
-	pdf->root = pdfref(dictget(o, "Root"));
-	pdf->info = pdfref(dictget(o, "Info"));
-	pdfobjfree(o);
-
-	return 0;
-err:
-	streamclose(s);
-	pdfobjfree(o);
-	return -1;
-}
-
 Pdf *
 pdfopen(Biobuf *b)
 {
@@ -204,8 +33,6 @@
 	Pdf *pdf;
 	Object *o;
 	char tmp[64], *s, *x;
-	int xref0; /* 7.5.4 xref subsection first object number */
-	int nxref; /* 7.5.4 xref subsection number of objects */
 	int xreftb; /* 7.5.4 xref table offset from the beginning of the file */
 	int i, n, off;
 
@@ -252,40 +79,33 @@
 		werrstr("xref position out of range");
 		goto err;
 	}
-morexref:
-	off = Boffset(b);
-	n = sizeof(tmp)-1;
-	if((n = Bread(b, tmp, n)) < 16){
+	for(;;){
+		off = Boffset(b);
+		if(Bread(b, tmp, sizeof(tmp)) < 8){
 badxref:
-		werrstr("invalid xref: %r");
-		goto err;
-	}
-	tmp[n] = 0;
-	if(memcmp(tmp, "xref", 4) == 0){
-		/* 7.5.4 xref */
-		x = tmp+4;
-		xref0 = strtol(x, &x, 10);
-		nxref = strtol(x, &x, 10);
-		/* skip whitespace and move to the first subsection */
-		for(; isws(*x) && x < tmp+n; x++);
-		n = x-tmp+off;
-		if(Bseek(b, n, 0) != n)
-			goto badxref;
-		if(xref0 >= 0 && nxref > 0 && xrefread(pdf, xref0, nxref) != 0)
-			goto badxref;
-		goto morexref; /* there could be more updates, try it */
-	}else if(memcmp(tmp, "trailer", 7) == 0){ /* 7.5.5 file trailer */
-		/* move to the trailer dictionary */
-		n = off + 8;
-		if(Bseek(b, n, 0) != n || trailerread(pdf) != 0){
-			werrstr("invalid trailer: %r");
+			werrstr("invalid xref: %r");
 			goto err;
 		}
-	}else if(isdigit(tmp[0])){ /* could be 7.5.8 xref stream (since PDF 1.5) */
-		if(Bseek(b, xreftb, 0) != xreftb)
-			goto badxref;
-		if(xrefstreamread(pdf) != 0)
-			goto err;
+		if(memcmp(tmp, "xref", 4) == 0){
+			if(Bseek(b, -sizeof(tmp)+5, 1) < 0 || xrefreadold(pdf) != 0)
+				goto err;
+			/* there could be more updates, try it */
+		}else if(memcmp(tmp, "trailer", 7) == 0){ /* 7.5.5 file trailer */
+			/* move to the trailer dictionary */
+			n = off + 8;
+			if(Bseek(b, n, 0) != n || trailerread(pdf) != 0){
+				werrstr("invalid trailer: %r");
+				goto err;
+			}
+			/* trailer is supposed to be the last thing */
+			break;
+		}else if(isdigit(tmp[0])){ /* could be 7.5.8 xref stream (since PDF 1.5) */
+			if(Bseek(b, xreftb, 0) != xreftb)
+				goto badxref;
+			if(xrefreadstream(pdf) != 0)
+				goto err;
+			break;
+		}
 	}
 
 	/* root is required, info is optional */
--- a/pdf.h
+++ b/pdf.h
@@ -21,11 +21,7 @@
 typedef struct Pdf Pdf;
 typedef struct Stream Stream;
 typedef struct Xref Xref;
-#pragma incomplete Filter
 
-#pragma varargck type "T" Object*
-#pragma varargck type "⊗" Xref
-
 struct Buffer {
 	uchar *b;
 	int ro;
@@ -177,3 +173,12 @@
 int bufput(Buffer *b, uchar *d, int sz);
 int bufget(Buffer *b, uchar *d, int sz);
 void bufdump(Buffer *b);
+
+#pragma varargck type "T" Object*
+#pragma varargck type "⊗" Xref
+int Tfmt(Fmt *f);
+int ⊗fmt(Fmt *f);
+int Bgetint(Biobuf *b, int *i);
+
+int xrefreadold(Pdf *pdf);
+int xrefreadstream(Pdf *pdf);
--- /dev/null
+++ b/xref.c
@@ -1,0 +1,189 @@
+#include <u.h>
+#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
+#include "pdf.h"
+
+/*
+ * 7.5.4 pre-1.5 xref section reader
+ * PDF>=1.5 may have BOTH (or either) old xref format and xref streams
+ * reads one "first count" subsection from pdf->bio and merges it into pdf->xref (entries marked 'f' are dropped); 0 on success, -1 on error
+ */
+int
+xrefreadold(Pdf *pdf)
+{
+	int xref0; /* 7.5.4 xref subsection first object number */
+	int nxref; /* 7.5.4 xref subsection number of objects */
+	int i, j, sz, n, newnxref;
+	Xref xref, *x;
+	char *s, *e;
+
+	if(Bgetint(pdf->bio, &xref0) != 1 || xref0 < 0){
+		werrstr("invalid xref0");
+		return -1;
+	}
+	/* bounded so that xref0+i, nxref*20 and the realloc size below cannot overflow an int */
+	if(Bgetint(pdf->bio, &nxref) != 1 || nxref < 0 || nxref > 0x7fffffff - xref0 ||
+	   nxref > 0x7fffffff/20 || nxref > 0x7fffffff/(int)sizeof(Xref) - pdf->nxref){
+		werrstr("invalid nxref");
+		return -1;
+	}
+
+	/* skip whitespace and move to the first subsection */
+	while(isspace(Bgetc(pdf->bio)));
+	Bungetc(pdf->bio);
+	if(nxref == 0) /* empty subsection; also avoids zero-sized malloc/realloc, which may return nil */
+		return 0;
+
+	s = nil;
+	if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil)
+		goto err;
+	pdf->xref = x;
+
+	/* read the entire thing at once */
+	sz = nxref*20;
+	if((s = malloc(sz)) == nil)
+		goto err;
+	for(i = 0; i < sz; i += n)
+		if((n = Bread(pdf->bio, s+i, sz-i)) < 1)
+			goto err;
+
+	/* store non-free objects only */
+	newnxref = pdf->nxref;
+	for(e = s, i = 0; i < nxref; i++, e += 20){
+		if(!isspace(e[10]) || !isspace(e[18]) || !isspace(e[19])){
+			werrstr("invalid xref line (%d/%d)", i, nxref);
+			goto err;
+		}
+		xref.id = xref0 + i;
+		xref.off = strtoul(e, nil, 10);
+		/* FIXME xref.gen */
+		xref.type = Xusual;
+
+		/* search in already existing xrefs, update if found */
+		for(j = 0; j < pdf->nxref && pdf->xref[j].id != xref.id; j++)
+			;
+		if(j < pdf->nxref){
+			if(e[17] == 'f') /* it was freed */
+				pdf->xref[j].id = 0;
+			else if(e[17] == 'n')
+				pdf->xref[j].off = xref.off;
+		}else if(e[17] == 'n') /* that's a new one, insert unless it's free */
+			pdf->xref[newnxref++] = xref;
+	}
+	free(s);
+
+	/* scale down */
+	for(i = j = 0; i < newnxref; i++){
+		if(pdf->xref[i].id != 0)
+			pdf->xref[j++] = pdf->xref[i];
+	}
+	/* shrinking to 0 bytes may free the block and return nil; a failed shrink leaves the old block usable */
+	if(j > 0 && (x = realloc(pdf->xref, j*sizeof(Xref))) != nil)
+		pdf->xref = x;
+	pdf->nxref = j;
+
+	return 0;
+err:
+	free(s);
+	return -1;
+}
+
+/* big-endian integer from sz bytes at b, dflt if sz == 0; accumulated unsigned because shifting into an int's sign bit is undefined */
+static int
+getint(uchar *b, int sz, int dflt)
+{
+	ulong x;
+	int i;
+
+	if(sz == 0)
+		return dflt;
+	for(x = 0, i = 0; i < sz; i++)
+		x = x<<8 | b[i];
+	return x;
+}
+
+/*
+ * 7.5.8.3 xref stream reader: pdf->bio must be at the stream object. /Prev sections are read
+ * first (recursively), then this one is appended to pdf->xref. returns 0 on success, -1 on error
+ */
+int
+xrefreadstream(Pdf *pdf)
+{
+	Object *o;
+	Stream *s;
+	Xref *x;
+	uchar buf[32];
+	int w[8], nw, i, c, n, nxref, newnxref, prev, extra;
+
+	s = nil;
+	if((o = pdfobj(pdf, pdf->bio)) == nil){
+		werrstr("xref stream obj: %r");
+		goto err;
+	}
+	if((prev = dictint(o, "Prev")) > 0){
+		if(Bseek(pdf->bio, prev, 0) != prev){
+			werrstr("xref stream prev seek failed");
+			goto err;
+		}
+		if(xrefreadstream(pdf) != 0) /* NOTE(review): a cyclic Prev chain would recurse without bound */
+			goto err;
+	}
+	if((s = streamopen(o)) == nil){
+		werrstr("failed to stream xref: %r");
+		goto err;
+	}
+	if((nw = dictints(o, "W", w, nelem(w))) < 3 || nw >= nelem(w)){
+		werrstr("nW=%d", nw);
+		goto err;
+	}
+
+	for(n = i = 0; i < nw && w[i] >= 0 && w[i] <= (int)sizeof(buf); i++)
+		n += w[i]; /* size of each element. w[i] MAY be 0 */
+	/* a negative width would index before buf, n == 0 would divide by zero below */
+	if(i < nw || n < 1 || n > (int)sizeof(buf)){
+		werrstr("W is beyond imaginable: %d bytes", n);
+		goto err;
+	}
+	if((nxref = streamsize(s)/n) < 1){
+		werrstr("no xref elements in the stream");
+		goto err;
+	}
+	extra = streamsize(s) % (nxref*n);
+	if(extra != 0)
+		fprint(2, "extra %d bytes in xref stream\n", extra);
+
+	newnxref = pdf->nxref + nxref;
+	if((x = realloc(pdf->xref, newnxref*sizeof(Xref))) == nil)
+		goto err;
+	pdf->xref = x;
+	x += pdf->nxref;
+	while(pdf->nxref < newnxref && Bread(s->bio, buf, n) == n){ /* stop at capacity, on short read or error */
+		c = getint(buf, w[0], 1); /* default type is 1 */
+		if(c == 1){ /* not compressed */
+			x->off = getint(buf+w[0], w[1], 0);
+			x->gen = getint(buf+w[0]+w[1], w[2], 0);
+			x->type = Xuncompressed; /* NOTE(review): x->id is not assigned in this branch */
+		}else if(c == 2){ /* compressed */
+			x->objnum = getint(buf+w[0], w[1], 0);
+			x->id = getint(buf+w[0]+w[1], w[2], 0);
+			x->type = Xcompressed;
+		}else
+			continue; /* any other type is not stored */
+		pdf->nxref++;
+		fprint(2, "xref %⊗\n", *x);
+		x++;
+	}
+
+	streamclose(s);
+	pdf->root = pdfref(dictget(o, "Root"));
+	pdf->info = pdfref(dictget(o, "Info"));
+	pdfobjfree(o);
+
+	return 0;
+err:
+	streamclose(s);
+	pdfobjfree(o);
+	return -1;
+}
+