ref: f8f7ffe655e3a6439adf0614d0232e7520757566
parent: 73b21f1bdadc28eddee84ae0a4c70f7ec3bd29ed
author: Sigrid Haflínudóttir <ftrvxmtrx@gmail.com>
date: Thu Aug 27 13:32:25 EDT 2020
add more stuff
--- a/main.c
+++ b/main.c
@@ -1,21 +1,47 @@
#include <u.h>
#include <libc.h>
+#include <thread.h>
+#include "pdf.h"
+/*
+ * Print the expected command line on file descriptor 2 and
+ * terminate every thread with the exit status "usage".
+ */
+static void
+usage(void)
+{
+	static char status[] = "usage";
+
+	fprint(2, "usage: %s FILE\n", argv0);
+	threadexitsall(status);
+}
+
void
-main(int argc, char **argv)
+threadmain(int argc, char **argv) /*
+ * Program entry point: takes exactly one operand, opens it read-only,
+ * hands the descriptor to pdfopen and closes the document again.
+ * In TEST builds any other operand count runs the self-tests instead.
+ */
{
- USED(argc); USED(argv);
+ int fd; /* descriptor of the input file, passed to pdfopen */
+ Pdf *pdf; /* document handle returned by pdfopen */
quotefmtinstall();
+ ARGBEGIN{ /* no options are defined */
+ default: /* so every option ends in usage() */
+ usage();
+ }ARGEND
+
#ifdef TEST
#define T(x) \
void x(void); \
x();
- T(test_pdfstring);
- T(test_pdfname);
+ if(argc != 1){ /* anything but a single operand: run the tests and exit */
+ T(test_pdfstring);
+ T(test_pdfname);
+ threadexitsall(nil);
+ }
#endif
- exits(nil);
+ if(argc != 1) /* without TEST this is the only operand-count check */
+ usage();
+ if((fd = open(argv[0], OREAD)) < 0)
+ sysfatal("%r");
+ if((pdf = pdfopen(fd)) == nil)
+ sysfatal("pdfopen: %r");
+ pdfclose(pdf); /* nothing else is done with the document here */
+
+ threadexitsall(nil);
}
--- a/mkfile
+++ b/mkfile
@@ -1,7 +1,6 @@
</$objtype/mkfile
CFLAGS=$CFLAGS -DTEST
-
TARG=pdfs
OFILES=\
@@ -8,8 +7,18 @@
filter.$O\
main.$O\
name.$O\
+ object.$O\
+ pdf.$O\
pdfs.$O\
string.$O\
+
+HFILES=\
+ pdf.h\
+
+UPDATE=\
+ $HFILES\
+ ${OFILES:%.$O=%.c}\
+ mkfile\
default:V: all
--- /dev/null
+++ b/object.c
@@ -1,0 +1,80 @@
+#include <u.h>
+#include <libc.h>
+#include <ctype.h>
+#include "pdf.h"
+
+/*
+ * Allocate an object of the given type together with room for sz bytes
+ * plus a terminating NUL, and copy the first sz bytes at p into o->str.
+ * Returns nil if the allocation fails.
+ */
+static Object *
+strobject(int type, char *p, int sz)
+{
+	Object *o;
+
+	if((o = malloc(sizeof(*o)+sz+1)) == nil)
+		return nil;
+	o->type = type;
+	o->str = (char*)(o+1);
+	/*
+	 * Bounded copy: never writes past the sz+1 bytes reserved above.
+	 * NOTE(review): assumes pdfstring/pdfname leave the sz bytes they
+	 * report at p -- confirm against their implementation.
+	 */
+	memmove(o->str, p, sz);
+	o->str[sz] = 0;
+	return o;
+}
+
+/*
+ * Parse one PDF object from the len bytes at p.
+ *
+ * Leading whitespace (see isws) is skipped; at least two bytes must
+ * remain and the first of them must be a positive char value.  That
+ * first character selects the parser:
+ *	"<<"	dictionary, parsed by pdfdict
+ *	"<"	string, checked by pdfstring
+ *	"("	string, checked by pdfstring
+ *	"/"	name, checked by pdfname
+ *	digit	number, converted with strtod
+ * e is handed through to whichever of those runs.
+ *
+ * Returns a newly allocated object, to be released with freeobject,
+ * or nil with the error string prefixed by "object: ".
+ */
+Object *
+pdfobject(char *p, char **e, int len)
+{
+	Object *o;
+	int sz;
+
+	o = nil;
+	for(; len > 0 && isws(*p); p++, len--);
+	if(len < 2){
+		werrstr("too short");
+		goto err;
+	}
+
+	if(*p < 1){
+		werrstr("unexpected non-ascii char");
+		goto err;
+	}
+
+	switch(*p){
+	case '<': /* dictionary or a string; len >= 2 was checked above, so p[1] is readable */
+		if(p[1] == '<'){ /* dictionary */
+			o = pdfdict(p, e, len);
+			break;
+		}
+		/* fall through */
+
+	case '(': /* string */
+		if((sz = pdfstring(p, e, len)) < 0)
+			goto err;
+		o = strobject(Ostr, p, sz);
+		break;
+
+	case '/':
+		if((sz = pdfname(p, e, len)) < 0)
+			goto err;
+		o = strobject(Oname, p, sz);
+		break;
+
+	default:
+		if(isdigit(*p)){
+			/* the object has to exist before its fields are filled in */
+			if((o = malloc(sizeof(*o))) != nil){
+				o->type = Onum;
+				o->num = strtod(p, e);
+			}
+			break;
+		}
+		werrstr("unexpected char %c", *p);
+		goto err;
+	}
+
+	if(o != nil)
+		return o;
+err:
+	werrstr("object: %r");
+	freeobject(o);
+	return nil;
+}
+
+/* Release an object; a nil pointer is accepted and ignored. */
+void
+freeobject(Object *o)
+{
+	if(o != nil)
+		free(o);
+}
--- a/pdf.c
+++ b/pdf.c
@@ -1,5 +1,238 @@
#include <u.h>
#include <libc.h>
+#include <bio.h>
+#include <ctype.h>
#include "pdf.h"
-static char whitespace[] = {0x00, 0x09, 0x10, 0x0c, 0x0d, 0x20};
+/*
+ * 7.2.2 whitespace: tab, line feed, form feed, carriage return, space.
+ * \0 is left out on purpose.  Returns 1 for whitespace, 0 otherwise.
+ */
+int
+isws(char c)
+{
+	switch(c){
+	case '\t':
+	case '\n':
+	case '\f':
+	case '\r':
+	case ' ':
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * 7.2.2 delimiters: ( ) < > [ ] { } / %
+ * Returns 1 when c is one of them, 0 otherwise.
+ */
+int
+isdelim(char c)
+{
+	switch(c){
+	case '(': case ')':
+	case '<': case '>':
+	case '[': case ']':
+	case '{': case '}':
+	case '/':
+	case '%':
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * pre-1.5 xref section reader
+ * PDF>=1.5 may have BOTH (or either) old xref format and xref streams
+ *
+ * Reads one subsection of nxref fixed-size entries from the current
+ * position of pdf->bio.  The first entry describes object number xref0,
+ * the next one xref0+1 and so on.  An in-use ('n') entry is appended to
+ * pdf->xref, or updates the offset of an object that is already known;
+ * a free ('f') entry removes a known object.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+xrefread(Pdf *pdf, int xref0, int nxref)
+{
+	enum { Entsz = 20 };	/* 7.5.4: every entry is exactly 20 bytes long */
+	int i, j, sz, n, newnxref, eol;
+	Xref xref;
+	char *s, *e;
+	Xref *x;
+
+	s = nil;
+	/* keep the size computations below within the range of an int */
+	if(nxref < 1 || nxref > 0x7fffffff/Entsz - pdf->nxref){
+		werrstr("invalid xref count");
+		goto err;
+	}
+	if((x = realloc(pdf->xref, (pdf->nxref + nxref)*sizeof(Xref))) == nil)
+		goto err;
+	pdf->xref = x;
+
+	/* read the entire thing at once */
+	sz = nxref*Entsz;
+	if((s = malloc(sz)) == nil)
+		goto err;
+	for(i = 0; i < sz; i += n){
+		if((n = Bread(pdf->bio, s+i, sz-i)) < 1)
+			goto err;
+	}
+
+	/* store non-free objects only */
+	newnxref = pdf->nxref;
+	for(e = s, i = 0; i < nxref; i++, e += Entsz){
+		/* 7.5.4: an entry ends with a two-byte eol: SP CR, SP LF or CR LF */
+		eol = (e[18] == '\r' && e[19] == '\n') ||
+			(e[18] == ' ' && (e[19] == '\r' || e[19] == '\n'));
+		if(e[10] != ' ' || !eol){
+			werrstr("invalid xref line");
+			goto err;
+		}
+		xref.id = xref0 + i;	/* entries number consecutive objects starting at xref0 */
+		xref.off = strtoul(e, nil, 10);
+
+		/* search in already existing xrefs, update if found */
+		for(j = 0; j < pdf->nxref; j++){
+			if(pdf->xref[j].id != xref.id)
+				continue;
+			if(e[17] == 'f') /* it was freed; id 0 marks it for removal below */
+				pdf->xref[j].id = 0;
+			else if(e[17] == 'n')
+				pdf->xref[j].off = xref.off;
+			break;
+		}
+		if(j >= pdf->nxref && e[17] == 'n') /* that's a new one, insert unless it's free */
+			pdf->xref[newnxref++] = xref;
+	}
+	free(s);
+	s = nil;
+
+	/* scale down: drop the entries marked with id 0 */
+	for(i = j = 0; i < newnxref; i++){
+		if(pdf->xref[i].id != 0)
+			pdf->xref[j++] = pdf->xref[i];
+	}
+	pdf->nxref = j;
+
+	/*
+	 * Giving memory back is only an optimisation: if it fails the larger
+	 * array is still valid.  A zero-sized realloc would free the array
+	 * and leave pdf->xref dangling, so skip it when nothing was kept.
+	 */
+	if(j > 0 && (x = realloc(pdf->xref, j*sizeof(Xref))) != nil)
+		pdf->xref = x;
+
+	return 0;
+err:
+	free(s);
+	return -1;
+}
+
+/*
+ * Read the object at the current position of pdf->bio, which has to be
+ * a dictionary, and copy its numeric "Root" and "Info" values into
+ * pdf->root and pdf->info.  "Info" is optional; the call fails when
+ * pdf->root is still zero afterwards.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+trailerread(Pdf *pdf)
+{
+	Object *dict;
+	KeyValue *kv;
+	char *buf, *rest;
+	int left;
+
+	dict = nil;
+	buf = Brdstr(pdf->bio, 0, 1);
+	if(buf != nil)
+		dict = pdfobject(buf, &rest, Blinelen(pdf->bio));
+	/* the text is not needed any more, whether parsing worked or not */
+	free(buf);
+	if(dict == nil)
+		return -1;
+
+	if(dict->type != Odict){
+		werrstr("isn't a dictionary");
+		freeobject(dict);
+		return -1;
+	}
+
+	kv = dict->dict.kv;
+	for(left = dict->dict.nkv; left > 0; left--, kv++){
+		/* only numeric values are of interest */
+		if(kv->value.type != Onum)
+			continue;
+		if(strcmp(kv->key, "Root") == 0)
+			pdf->root = kv->value.num;
+		else if(strcmp(kv->key, "Info") == 0)
+			pdf->info = kv->value.num;
+	}
+	freeobject(dict);
+
+	/* root is required, info is optional */
+	if(pdf->root == 0){
+		werrstr("no root");
+		return -1;
+	}
+
+	return 0;
+}
+
+Pdf *
+pdfopen(int fd) /*
+ * Open the PDF readable through fd.
+ *
+ * Wraps fd in a Biobuf, checks the "%PDF-d.d" header, locates the
+ * "startxref" offset in the last sizeof(tmp)-1 bytes of the file, seeks
+ * there and then reads "xref" sections (via xrefread) until a "trailer"
+ * keyword is found, whose dictionary is read by trailerread.  Anything
+ * else at that position fails with the "FIXME xref streams" error.
+ *
+ * Returns the new Pdf, or nil with the error string set; on failure
+ * everything allocated so far is released through pdfclose.
+ */
+{
+ Pdf *pdf;
+ Biobuf *b;
+ char tmp[64], *s, *x;
+ int xref0; /* 7.5.4 xref subsection first object number */
+ int nxref; /* 7.5.4 xref subsection number of objects */
+ int xreftb; /* 7.5.4 xref table offset from the beginning of the file */
+ int i, n, off; /* off: file position the block currently in tmp was read from */
+
+ if((pdf = calloc(1, sizeof(*pdf))) == nil || (b = Bfdopen(fd, OREAD)) == nil) /* pdfclose copes with a nil pdf and a nil bio */
+ goto err;
+ pdf->bio = b;
+
+ /* check header: the first 8 bytes must read "%PDF-" digit '.' digit */
+ if(Bread(b, tmp, 8) != 8 ||
+ strncmp(tmp, "%PDF-", 5) != 0 || !isdigit(tmp[5]) || tmp[6] != '.' || !isdigit(tmp[7])){
+ werrstr("not a pdf");
+ goto err;
+ }
+
+ /* 7.5.4, 7.5.8 xref table */
+
+ /* read a block of data: the last sizeof(tmp)-1 bytes of the file (whence 2 seeks from the end) */
+ n = sizeof(tmp)-1;
+ Bseek(b, -n, 2); /* NOTE(review): the result of this seek is not checked */
+ if(Bread(b, tmp, n) != n){
+badtrailer:
+ werrstr("invalid trailer");
+ goto err;
+ }
+ tmp[n] = 0; /* terminate the block so the string functions below stop at its end */
+
+ /*
+ * search for a valid string that the block ends with: walk back from
+ * the last byte until a NUL byte or index 0 is reached, then step one
+ * byte forward.  NOTE(review): when the block holds no NUL the loop
+ * stops at index 0 without looking at tmp[0], so s ends up at tmp+1.
+ */
+ for(i = n-1, s = &tmp[i]; i > 0 && *s != 0; i--, s--);
+ s++;
+
+ /*
+ * find "startxref": take the last 'f' of that string, require
+ * whitespace right after it and the keyword in the 9 bytes ending
+ * at it, which must not begin before s+1
+ */
+ if((x = strrchr(s, 'f')) == nil || !isws(x[1]) || x-8 < s+1 || memcmp(x-8, "startxref", 9) != 0)
+ goto badtrailer;
+ x++;
+ if((xreftb = strtol(x, nil, 10)) < 1) /* the decimal number after the keyword; values below 1 are rejected */
+ goto badtrailer;
+
+ /* read xref: seek to the offset that was just parsed */
+ if(Bseek(b, xreftb, 0) != xreftb){
+ werrstr("xref position out of range");
+ goto err;
+ }
+morexref:
+ off = Bseek(b, 0, 1); /* current position (whence 1), i.e. where the next block starts */
+ n = sizeof(tmp)-1;
+ if((n = Bread(b, tmp, n)) < 16){ /* fewer than 16 bytes is treated as an invalid xref */
+badxref:
+ werrstr("invalid xref: %r");
+ goto err;
+ }
+ tmp[n] = 0;
+ if(memcmp(tmp, "xref", 4) == 0){
+ /* 7.5.4 xref: the keyword is followed by the first object number and the number of entries */
+ x = tmp+4;
+ nxref = -1; /* shown in the message below when nxref was never parsed */
+ if((xref0 = strtol(x, &x, 10)) < 0 || (nxref = strtol(x, &x, 10)) < 1){
+ werrstr("xref0=%d nxref=%d", xref0, nxref);
+ goto badxref;
+ }
+
+ /* skip whitespace and move to the first subsection; n becomes an absolute file offset */
+ for(; isws(*x) && x < tmp+n; x++);
+ n = x-tmp+off;
+ if(Bseek(b, n, 0) != n || xrefread(pdf, xref0, nxref) != 0)
+ goto badxref;
+ goto morexref; /* there could be more updates, try it */
+ }else if(memcmp(tmp, "trailer", 7) == 0){ /* 7.5.5 file trailer */
+ /* move to the trailer dictionary: 8 bytes past the start of the block, i.e. the 7-byte keyword plus one more byte */
+ n = off + 8;
+ if(Bseek(b, n, 0) != n || trailerread(pdf) != 0){
+ werrstr("invalid trailer: %r");
+ goto err;
+ }
+ }else{ /* could be 7.5.8 xref stream (since PDF 1.5) */
+ werrstr("FIXME xref streams not implemented");
+ goto err;
+ }
+
+ return pdf;
+err:
+ pdfclose(pdf);
+ return nil;
+}
+
+/*
+ * Release a Pdf: terminate its Biobuf if there is one, then free the
+ * xref array and the structure itself.  A nil pdf is ignored.
+ */
+void
+pdfclose(Pdf *pdf)
+{
+	if(pdf != nil){
+		if(pdf->bio != nil)
+			Bterm(pdf->bio);
+		free(pdf->xref);
+		free(pdf);
+	}
+}
--- a/pdf.h
+++ b/pdf.h
@@ -10,7 +10,10 @@
Oindir, /* 7.3.10 */
};
+typedef struct KeyValue KeyValue;
typedef struct Object Object;
+typedef struct Pdf Pdf;
+typedef struct Xref Xref;
struct Object {
int type;
@@ -18,13 +21,45 @@
int bool;
double num;
char *str;
+ char *name;
+
struct {
int id;
int gen;
}indir;
+
+ struct {
+ KeyValue *kv;
+ int nkv;
+ }dict;
};
};
+struct KeyValue { /* one entry of a dictionary object, see Object.dict.kv */
+ char *key; /* entry name, compared with strcmp by trailerread in pdf.c */
+ Object value; /* the entry's value, stored inline */
+};
+
+struct Pdf { /* an opened document: created by pdfopen, released by pdfclose */
+ void *bio; /* the Biobuf pdfopen obtains from Bfdopen; pdfclose calls Bterm on it */
+ Xref *xref; /* heap array of cross-reference entries, grown and shrunk by xrefread in pdf.c */
+ int nxref; /* number of entries currently held in xref */
+
+ u32int root; /* 7.5.5 root object: numeric "Root" value of the trailer dictionary, 0 while unset */
+ u32int info; /* 7.5.5 info dictionary: numeric "Info" value of the trailer dictionary (optional) */
+};
+
+struct Xref { /* one cross-reference entry kept in Pdf.xref */
+ u32int id; /* object number; xrefread in pdf.c sets it to 0 to mark an entry for removal */
+ u32int off; /* leading decimal field of the xref entry, parsed with strtoul (per 7.5.4 the object's byte offset) */
+};
+
+Pdf *pdfopen(int fd);
+void pdfclose(Pdf *pdf);
+
+Object *pdfobject(char *p, char **e, int len);
+void freeobject(Object *o);
+
/*
* 7.3.4 String Objects
*
@@ -40,5 +75,7 @@
*
* Works the same way as pdfstring, but for name objects.
*/
-int
-pdfname(char *p, char **e, int len);
+int pdfname(char *p, char **e, int len);
+
+int isws(char c);
+int isdelim(char c);