shithub: pdffs

Download patch

ref: f8f7ffe655e3a6439adf0614d0232e7520757566
parent: 73b21f1bdadc28eddee84ae0a4c70f7ec3bd29ed
author: Sigrid Haflínudóttir <ftrvxmtrx@gmail.com>
date: Thu Aug 27 13:32:25 EDT 2020

add more stuff

--- a/main.c
+++ b/main.c
@@ -1,21 +1,47 @@
 #include <u.h>
 #include <libc.h>
+#include <thread.h>
+#include "pdf.h"
 
+static void
+usage(void)
+{
+	fprint(2, "usage: %s FILE\n", argv0); /* exactly one argument is expected: the file to open */
+	threadexitsall("usage"); /* exit status "usage" */
+}
+
 void
-main(int argc, char **argv)
+threadmain(int argc, char **argv) /* opens the PDF named by the single argument, then closes it again */
 {
-	USED(argc); USED(argv);
+	int fd;
+	Pdf *pdf;
 
 	quotefmtinstall();
 
+	ARGBEGIN{ /* no options are defined: any flag ends in usage() */
+	default:
+		usage();
+	}ARGEND
+
 #ifdef TEST
 #define T(x) \
 	void x(void); \
 	x();
 
-	T(test_pdfstring);
-	T(test_pdfname);
+	if(argc != 1){ /* TEST builds: without exactly one FILE argument, run the self-tests and exit */
+		T(test_pdfstring);
+		T(test_pdfname);
+		threadexitsall(nil);
+	}
 #endif
 
-	exits(nil);
+	if(argc != 1)
+		usage();
+	if((fd = open(argv[0], OREAD)) < 0)
+		sysfatal("%r");
+	if((pdf = pdfopen(fd)) == nil)
+		sysfatal("pdfopen: %r");
+	pdfclose(pdf); /* nothing is done with the document yet */
+
+	threadexitsall(nil);
 }
--- a/mkfile
+++ b/mkfile
@@ -1,7 +1,6 @@
 </$objtype/mkfile
 
 CFLAGS=$CFLAGS -DTEST
-
 TARG=pdfs
 
 OFILES=\
@@ -8,8 +7,18 @@
 	filter.$O\
 	main.$O\
 	name.$O\
+	object.$O\
+	pdf.$O\
 	pdfs.$O\
 	string.$O\
+
+HFILES=\
+	pdf.h\
+
+UPDATE=\
+	$HFILES\
+	${OFILES:%.$O=%.c}\
+	mkfile\
 
 default:V:	all
 
--- /dev/null
+++ b/object.c
@@ -1,0 +1,80 @@
+#include <u.h>
+#include <libc.h>
+#include <ctype.h>
+#include "pdf.h"
+
+Object *
+pdfobject(char *p, char **e, int len)
+{
+	Object *o;
+	int sz;
+
+	o = nil;
+	for(; len > 0 && isws(*p); p++, len--);
+	if(len < 2){
+		werrstr("too short");
+		goto err;
+	}
+
+	if(*p < 1){
+		werrstr("unexpected non-ascii char");
+		goto err;
+	}
+
+	switch(*p){
+	case '<': /* dictionary or a string */
+		if(len < 2){
+			werrstr("too short");
+			goto err;
+		}
+		if(p[1] == '<'){ /* dictionary */
+			o = pdfdict(p, e, len);
+			break;
+		}
+		/* fall through */
+
+	case '(': /* string */
+		if((sz = pdfstring(p, e, len)) < 0)
+			goto err;
+		if((o = malloc(sizeof(*o)+sz+1)) != nil){
+			o->type = Ostr;
+			o->str = (char*)(o+1);
+			strcpy(o->str, p);
+		}
+		break;
+
+	case '/':
+		if((sz = pdfname(p, e, len)) < 0)
+			goto err;
+		if((o = malloc(sizeof(*o)+sz+1)) != nil){
+			o->type = Oname;
+			o->str = (char*)(o+1);
+			strcpy(o->str, p);
+		}
+		break;
+
+	default:
+		if(isdigit(*p)){
+			o->type = Onum;
+			o->num = strtod(p, e);
+			break;
+		}
+		werrstr("unexpected char %c", *p);
+		goto err;
+	}
+
+	if(o != nil)
+		return o;
+err:
+	werrstr("object: %r");
+	freeobject(o);
+	return nil;
+}
+
+/* release an object obtained from pdfobject; nil is accepted */
+void
+freeobject(Object *o)
+{
+	if(o != nil)
+		free(o);
+}
--- a/pdf.c
+++ b/pdf.c
@@ -1,5 +1,238 @@
 #include <u.h>
 #include <libc.h>
+#include <bio.h>
+#include <ctype.h>
 #include "pdf.h"
 
-static char whitespace[] = {0x00, 0x09, 0x10, 0x0c, 0x0d, 0x20};
+/* 7.2.2 whitespace */
+int
+isws(char c)
+{
+	static char ws[] = "\t\n\f\r ";
+	/* \0 is missing on purpose, yet strchr would match the terminator */
+	return c != 0 && strchr(ws, c) != nil;
+}
+
+/* 7.2.2 delimiters */
+int
+isdelim(char c)
+{
+	static char delims[] = "()<>[]{}/%";
+
+	/* strchr would also match the terminating NUL, so rule it out first */
+	return c != 0 && strchr(delims, c) != nil;
+}
+
+/*
+ * pre-1.5 xref section reader
+ * PDF>=1.5 may have BOTH (or either) old xref format and xref streams
+ * merges nxref entries numbered from xref0 into pdf->xref; returns 0, or -1 on error
+ */
+static int
+xrefread(Pdf *pdf, int xref0, int nxref)
+{
+	int i, j, sz, n, newnxref;
+	Xref xref, *x;
+	char *s, *e;
+
+	s = nil;
+	if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil)
+		goto err;
+	pdf->xref = x;
+
+	/* read the entire thing at once */
+	sz = nxref*20;
+	if((s = malloc(sz)) == nil)
+		goto err;
+	for(i = 0; i < sz; i += n){
+		if((n = Bread(pdf->bio, s+i, sz-i)) < 1)
+			goto err;
+	}
+
+	/* store non-free objects only */
+	newnxref = pdf->nxref;
+	for(e = s, i = 0; i < nxref; i++, e += 20){
+		if(e[10] != ' ' || !((e[18] == '\r' && e[19] == '\n') ||
+		   (e[18] == ' ' && (e[19] == '\r' || e[19] == '\n')))){ /* 7.5.4: EOL is SP CR, SP LF or CR LF */
+			werrstr("invalid xref line");
+			goto err;
+		}
+		xref.id = xref0 + i; /* entries are numbered consecutively from xref0 */
+		xref.off = strtoul(e, nil, 10);
+
+		/* search in already existing xrefs, update if found */
+		for(j = 0; j < pdf->nxref; j++){
+			if(pdf->xref[j].id != xref.id)
+				continue;
+			if(e[17] == 'f') /* it was freed */
+				pdf->xref[j].id = 0;
+			else if(e[17] == 'n')
+				pdf->xref[j].off = xref.off;
+			break;
+		}
+		if(j >= pdf->nxref && e[17] == 'n') /* that's a new one, insert unless it's free */
+			pdf->xref[newnxref++] = xref;
+	}
+	free(s);
+	s = nil;
+
+	/* scale down: squeeze out the entries marked free (id 0) */
+	for(i = j = 0; i < newnxref; i++){
+		if(pdf->xref[i].id != 0)
+			pdf->xref[j++] = pdf->xref[i];
+	}
+	if(j > 0 && (x = realloc(pdf->xref, j*sizeof(Xref))) != nil) /* shrinking is optional; a 0-byte realloc may free the table */
+		pdf->xref = x;
+	pdf->nxref = j;
+
+	return 0;
+err:
+	free(s);
+	return -1;
+}
+
+static int
+trailerread(Pdf *pdf)
+{
+	int i;
+	char *s, *e;
+	Object *o;
+	KeyValue *kv;
+
+	o = nil;
+	if((s = Brdstr(pdf->bio, 0, 1)) == nil || (o = pdfobject(s, &e, Blinelen(pdf->bio))) == nil)
+		goto err;
+	free(s);
+	s = nil;
+
+	if(o->type != Odict){
+		werrstr("isn't a dictionary");
+		goto err;
+	}
+
+	for(i = 0, kv = o->dict.kv; i < o->dict.nkv; i++, kv++){
+		if(strcmp(kv->key, "Root") == 0 && kv->value.type == Onum)
+			pdf->root = kv->value.num;
+		else if(strcmp(kv->key, "Info") == 0 && kv->value.type == Onum)
+			pdf->info = kv->value.num;
+	}
+	freeobject(o);
+	o = nil;
+
+	/* root is required */
+	if(pdf->root == 0){
+		werrstr("no root");
+		goto err;
+	}
+
+	/* info is optional */
+
+	return 0;
+err:
+	freeobject(o);
+	free(s);
+	return -1;
+}
+
+/* open the PDF on fd: check the header, then load the xref table and the trailer; returns nil on failure */
+Pdf *
+pdfopen(int fd)
+{
+	Pdf *pdf;
+	Biobuf *b;
+	char tmp[64], *s, *x;
+	int xref0; /* 7.5.4 xref subsection first object number */
+	int nxref; /* 7.5.4 xref subsection number of objects */
+	int xreftb; /* 7.5.4 xref table offset from the beginning of the file */
+	int i, n, off;
+
+	if((pdf = calloc(1, sizeof(*pdf))) == nil || (b = Bfdopen(fd, OREAD)) == nil)
+		goto err;
+	pdf->bio = b;
+
+	/* check header */
+	if(Bread(b, tmp, 8) != 8 ||
+	   strncmp(tmp, "%PDF-", 5) != 0 || !isdigit(tmp[5]) || tmp[6] != '.' || !isdigit(tmp[7])){
+		werrstr("not a pdf");
+		goto err;
+	}
+
+	/* 7.5.4, 7.5.8 xref table */
+	/* read a block of data */
+	n = sizeof(tmp)-1;
+	Bseek(b, -n, 2); /* whence 2: n bytes back from the end of the file */
+	if(Bread(b, tmp, n) != n){
+badtrailer: /* also reached by the gotos further down */
+		werrstr("invalid trailer");
+		goto err;
+	}
+	tmp[n] = 0;
+
+	/* search for a valid string that the block ends with */
+	for(i = n-1, s = &tmp[i]; i > 0 && *s != 0; i--, s--); /* walk back to the last NUL byte, or to tmp[0] */
+	s++;
+
+	/* find "startxref" */
+	if((x = strrchr(s, 'f')) == nil || !isws(x[1]) || x-8 < s+1 || memcmp(x-8, "startxref", 9) != 0)
+		goto badtrailer;
+	x++; /* step past the 'f'; the table offset follows */
+	if((xreftb = strtol(x, nil, 10)) < 1)
+		goto badtrailer;
+
+	/* read xref */
+	if(Bseek(b, xreftb, 0) != xreftb){
+		werrstr("xref position out of range");
+		goto err;
+	}
+morexref:
+	off = Bseek(b, 0, 1); /* remember where this section starts */
+	n = sizeof(tmp)-1;
+	if((n = Bread(b, tmp, n)) < 16){
+badxref: /* also reached by the gotos further down */
+		werrstr("invalid xref: %r");
+		goto err;
+	}
+	tmp[n] = 0;
+	if(memcmp(tmp, "xref", 4) == 0){
+		/* 7.5.4 xref */
+		x = tmp+4;
+		nxref = -1; /* keeps the message below defined when xref0 is rejected before nxref is parsed */
+		if((xref0 = strtol(x, &x, 10)) < 0 || (nxref = strtol(x, &x, 10)) < 1){
+			werrstr("xref0=%d nxref=%d", xref0, nxref);
+			goto badxref;
+		}
+
+		/* skip whitespace and move to the first subsection */
+		for(; isws(*x) && x < tmp+n; x++);
+		n = x-tmp+off; /* absolute file offset of the first entry */
+		if(Bseek(b, n, 0) != n || xrefread(pdf, xref0, nxref) != 0)
+			goto badxref;
+		goto morexref; /* there could be more updates, try it */
+	}else if(memcmp(tmp, "trailer", 7) == 0){ /* 7.5.5 file trailer */
+		/* move to the trailer dictionary */
+		n = off + 8; /* "trailer" plus one more byte, presumably the line terminator */
+		if(Bseek(b, n, 0) != n || trailerread(pdf) != 0){
+			werrstr("invalid trailer: %r");
+			goto err;
+		}
+	}else{ /* could be 7.5.8 xref stream (since PDF 1.5) */
+		werrstr("FIXME xref streams not implemented");
+		goto err;
+	}
+
+	return pdf;
+err:
+	pdfclose(pdf);
+	return nil;
+}
+
+void
+pdfclose(Pdf *pdf)
+{
+	if(pdf != nil){ /* nil is accepted, e.g. from pdfopen's error path */
+		if(pdf->bio != nil) /* still unset when Bfdopen failed */
+			Bterm(pdf->bio);
+		free(pdf->xref);
+		free(pdf);
+	}
+}
--- a/pdf.h
+++ b/pdf.h
@@ -10,7 +10,10 @@
 	Oindir,  /* 7.3.10 */
 };
 
+typedef struct KeyValue KeyValue;
 typedef struct Object Object;
+typedef struct Pdf Pdf;
+typedef struct Xref Xref;
 
 struct Object {
 	int type;
@@ -18,13 +21,45 @@
 		int bool;
 		double num;
 		char *str;
+		char *name;
+
 		struct {
 			int id;
 			int gen;
 		}indir;
+
+		struct {
+			KeyValue *kv;
+			int nkv;
+		}dict;
 	};
 };
 
+struct KeyValue { /* one entry of Object.dict.kv */
+	char *key; /* compared with strcmp by the trailer reader */
+	Object value;
+};
+
+struct Pdf {
+	void *bio; /* Biobuf* set by pdfopen, handed to Bterm by pdfclose */
+	Xref *xref; /* in-use entries gathered from the xref sections */
+	int nxref; /* number of entries in xref */
+
+	u32int root; /* 7.5.5 root object */
+	u32int info; /* 7.5.5 info dictionary */
+};
+
+struct Xref {
+	u32int id; /* object number; xrefread sets it to 0 to mark an entry for removal */
+	u32int off; /* first field of the xref entry, parsed as decimal */
+};
+
+Pdf *pdfopen(int fd); /* nil on failure */
+void pdfclose(Pdf *pdf); /* accepts nil */
+
+Object *pdfobject(char *p, char **e, int len); /* nil on failure */
+void freeobject(Object *o); /* accepts nil */
+
 /*
  * 7.3.4 String Objects
  *
@@ -40,5 +75,7 @@
  *
  * Works the same way as pdfstring, but for name objects.
  */
-int
-pdfname(char *p, char **e, int len);
+int pdfname(char *p, char **e, int len);
+
+int isws(char c); /* 7.2.2 whitespace, NUL excluded */
+int isdelim(char c); /* 7.2.2 delimiters */