shithub: pdffs

Download patch

ref: c0f0f0d63588d1d8ad1492d053eef211e4399cbd
parent: 613828fbd75a8753e03e088c7d1804fdca097942
author: Noam Preil <noam@pixelhero.dev>
date: Fri Aug 6 13:06:39 EDT 2021

pdfeval: cleaner caching layout

pdfeval evaluates a PDF object, returning its value.  Most objects are
returned as-is; the notable exceptions are indirect objects, which are
basically pointers to other PDF objects.

Typically, indirect objects start out unresolved, making them cheap.
Parsing the object they point to is expensive, so pdfeval naturally
caches the results.  Before this commit, pdfeval took in a pointer to an
indirect object, resolved it, *overwrote the indirect object with the
result*, and returned the result.

This has a major problem: when, for instance, an array contains an
indirect object, fetching the object implicitly evaluates it - and would
then remove the indirect object from the array in favor of the evaluated
object.  This is great for caching, but presents a major problem when
you're done with the resource.  There is no way to get the indirect
object back, so it is impossible to free the resource and then reparse
it later.

This commit changes that: instead, pdfeval is responsible for handling
caching internally, and is not allowed to overwrite the indirect object.
Currently, it stores the result in the indirect object instead; future
work will switch this to use a full-fledged cache with eviction of old
data, which is necessary for pdffs to be useful as a long-running
process on large files without hogging system memory.

--- a/array.c
+++ b/array.c
@@ -87,7 +87,7 @@
 		sysfatal("array: indexing out of range");
 	o = o->type == Oarray ? o->array.e[i] : o;
 
-	return pdfeval(&o);
+	return pdfeval(o);
 }
 
 int
--- a/dict.c
+++ b/dict.c
@@ -69,12 +69,12 @@
 {
 	int i;
 
-	pdfeval(&o);
+	o = pdfeval(o);
 	if((o->type != Ostream && o->type != Odict) || name == nil)
 		return &null;
 	for(i = 0; i < o->dict.nkv; i++){
 		if(strcmp(name, o->dict.kv[i].key) == 0){
-			o = pdfeval(i < o->dict.nkv ? &o->dict.kv[i].value : nil);
+			o = pdfeval(i < o->dict.nkv ? o->dict.kv[i].value : nil);
 			return o;
 		}
 	}
--- a/eval.c
+++ b/eval.c
@@ -46,7 +46,7 @@
 			}
 			if((o = pdfobj(pdf, s)) == nil)
 				goto err;
-			o = pdfeval(&o);
+			o = pdfeval(o);
 			break;
 		}
 	}
@@ -63,21 +63,18 @@
 }
 
 Object *
-pdfeval(Object **oo)
+pdfeval(Object *o)
 {
-	Object *d, *o;
+	Object *d;
 	Xref *x;
 	int i;
 
-	if(oo == nil)
-		sysfatal("nil oo");
-	if(*oo == nil){
-		*oo = &null;
+	if(o == nil)
 		return &null;
-	}
-	o = *oo;
 	if(o->type != Oindir)
 		return o;
+	if(o->indir.o != nil)
+		return o->indir.o;
 
 	for(x = nil, i = 0; i < o->pdf->nxref; i++){
 		x = &o->pdf->xref[i];
@@ -89,10 +86,10 @@
 		return &null;
 	}
 	if(x->objstm > 0){
-		if((o = evalobjstm(o->pdf, x)) == &null)
+		if((d = evalobjstm(o->pdf, x)) == &null)
 			werrstr("ObjStm: %r");
-		*oo = o;
-		return o;
+		o->indir.o = d;
+		return d;
 	}
 
 	if(Sseek(o->pdf->s, x->off, 0) != x->off){
@@ -103,8 +100,6 @@
 		werrstr("eval: %r [at %p]", (void*)x->off);
 		return &null;
 	}
-	*oo = d;
-	pdfobjfree(o);
-
+	o->indir.o = d;
 	return d;
 }
--- a/main.c
+++ b/main.c
@@ -139,8 +139,7 @@
 			o.pdf = pdf;
 			o.type = Oindir;
 			o.indir.id = atoi(argv[i]+1);
-			v = &o;
-			pdfeval(&v);
+			pdfeval(&o);
 		}else{
 			v = dictget(v, argv[i]);
 		}
--- a/object.c
+++ b/object.c
@@ -148,6 +148,9 @@
 		if((o = calloc(1, sizeof(*o))) == nil)
 			goto err;
 		o->pdf = pdf;
+		// IMPORTANT: since we modify the union here, we MUST NOT
+		// assume any members are zero initialized! Might be worth
+		// switching to malloc to avoid giving that false impression.
 		Sgetd(s, &o->num.d); /* get the first number */
 		o->num.i = o->num.d;
 		off = Soffset(s); /* seek here if not an indirect object later */
@@ -160,6 +163,8 @@
 				o->type = Oindir;
 				o->indir.id = o->num.i;
 				o->indir.gen = o2->num.i;
+				// See note above; this is NOT zero initialized by calloc
+				o->indir.o = nil;
 				pdfobjfree(o2);
 				return o;
 			}
--- a/pdf.h
+++ b/pdf.h
@@ -65,6 +65,7 @@
 		struct {
 			u32int id;
 			u16int gen;
+			Object *o;
 		}indir;
 
 		struct {
@@ -158,6 +159,7 @@
 	Object *top;
 	Object *root; /* 7.7.2 root object */
 	Object *info; /* 14.3.3 info dictionary */
+	Object *cache; /* Indirect object cache */
 };
 
 struct Xref {
@@ -203,7 +205,7 @@
  * not recursive, ie values of a dictionary won't be resolved
  * automatically.
  */
-Object *pdfeval(Object **o);
+Object *pdfeval(Object *o);
 
 /*
  * Increment refcount of an object. Freeing an object (or its
--- a/stream.c
+++ b/stream.c
@@ -25,7 +25,8 @@
 	int i, nflts, r;
 
 	s = nil;
-	if(pdfeval(&o)->type != Ostream){ /* FIXME open a string object as a stream as well? */
+	o = pdfeval(o);
+	if(o->type != Ostream){ /* FIXME open a string object as a stream as well? */
 		werrstr("not a stream");
 		return nil;
 	}
@@ -38,7 +39,8 @@
 
 	/* see if there are any filters */
 	if((of = dictget(o, "Filter")) != &null){
-		if(pdfeval(&of)->type == Oname){ /* one filter */
+		of = pdfeval(of);
+		if(of->type == Oname){ /* one filter */
 			flts = &of;
 			nflts = 1;
 		}else if(of->type == Oarray){ /* array of filters */