ref: c0f0f0d63588d1d8ad1492d053eef211e4399cbd
parent: 613828fbd75a8753e03e088c7d1804fdca097942
author: Noam Preil <noam@pixelhero.dev>
date: Fri Aug 6 13:06:39 EDT 2021
pdfeval: cleaner caching layout pdfeval evaluates a PDF object, returning its value. Most objects are returned as-is; the notable exceptions are indirect objects, which are basically pointers to other PDF objects. Typically, indirect objects start out unresolved, making them cheap. Parsing the object they point to is expensive, so pdfeval naturally caches the results. Before this commit, pdfeval took in a pointer to an indirect object, resolved it, *overwrote the indirect object with the result*, and returned the result. This has a major problem: when, for instance, an array contains an indirect object, fetching the object implicitly evaluates it - and would then remove the indirect object from the array in favor of the evaluated object. This is great for caching, but presents a major problem when you're done with the resource. There is no way to get the indirect object back, so it is impossible to free the resource and then reparse it later. This commit changes that: instead, pdfeval is responsible for handling caching internally, and is not allowed to overwrite the indirect object. Currently, it stores the result in the indirect object instead; future work will switch this to use a full-fledged cache with eviction of old data, which is necessary for pdffs to be useful as a long-running process on large files without hogging system memory.
--- a/array.c
+++ b/array.c
@@ -87,7 +87,7 @@
sysfatal("array: indexing out of range");
o = o->type == Oarray ? o->array.e[i] : o;
- return pdfeval(&o);
+ return pdfeval(o);
}
int
--- a/dict.c
+++ b/dict.c
@@ -69,12 +69,12 @@
{
int i;
- pdfeval(&o);
+ o = pdfeval(o);
if((o->type != Ostream && o->type != Odict) || name == nil)
return &null;
for(i = 0; i < o->dict.nkv; i++){
if(strcmp(name, o->dict.kv[i].key) == 0){
- o = pdfeval(i < o->dict.nkv ? &o->dict.kv[i].value : nil);
+ o = pdfeval(i < o->dict.nkv ? o->dict.kv[i].value : nil);
return o;
}
}
--- a/eval.c
+++ b/eval.c
@@ -46,7 +46,7 @@
}
if((o = pdfobj(pdf, s)) == nil)
goto err;
- o = pdfeval(&o);
+ o = pdfeval(o);
break;
}
}
@@ -63,21 +63,18 @@
}
Object *
-pdfeval(Object **oo)
+pdfeval(Object *o)
{
- Object *d, *o;
+ Object *d;
Xref *x;
int i;
- if(oo == nil)
- sysfatal("nil oo");
- if(*oo == nil){
- *oo = &null;
+ if(o == nil)
return &null;
- }
- o = *oo;
if(o->type != Oindir)
return o;
+ if(o->indir.o != nil)
+ return o->indir.o;
for(x = nil, i = 0; i < o->pdf->nxref; i++){
x = &o->pdf->xref[i];
@@ -89,10 +86,10 @@
return &null;
}
if(x->objstm > 0){
- if((o = evalobjstm(o->pdf, x)) == &null)
+ if((d = evalobjstm(o->pdf, x)) == &null)
werrstr("ObjStm: %r");
- *oo = o;
- return o;
+ o->indir.o = d;
+ return d;
}
if(Sseek(o->pdf->s, x->off, 0) != x->off){
@@ -103,8 +100,6 @@
werrstr("eval: %r [at %p]", (void*)x->off);
return &null;
}
- *oo = d;
- pdfobjfree(o);
-
+ o->indir.o = d;
return d;
}
--- a/main.c
+++ b/main.c
@@ -139,8 +139,7 @@
o.pdf = pdf;
o.type = Oindir;
o.indir.id = atoi(argv[i]+1);
- v = &o;
- pdfeval(&v);
+ pdfeval(&o);
}else{
v = dictget(v, argv[i]);
}
--- a/object.c
+++ b/object.c
@@ -148,6 +148,9 @@
if((o = calloc(1, sizeof(*o))) == nil)
goto err;
o->pdf = pdf;
+ // IMPORTANT: since we modify the union here, we MUST NOT
+ // assume any members are zero initialized! Might be worth
+ // switching to malloc to avoid giving that false impression.
Sgetd(s, &o->num.d); /* get the first number */
o->num.i = o->num.d;
off = Soffset(s); /* seek here if not an indirect object later */
@@ -160,6 +163,8 @@
o->type = Oindir;
o->indir.id = o->num.i;
o->indir.gen = o2->num.i;
+ // See note above; this is NOT zero initialized by calloc
+ o->indir.o = nil;
pdfobjfree(o2);
return o;
}
--- a/pdf.h
+++ b/pdf.h
@@ -65,6 +65,7 @@
struct {
u32int id;
u16int gen;
+ Object *o;
}indir;
struct {
@@ -158,6 +159,7 @@
Object *top;
Object *root; /* 7.7.2 root object */
Object *info; /* 14.3.3 info dictionary */
+ Object *cache; /* Indirect object cache */
};
struct Xref {
@@ -203,7 +205,7 @@
* not recursive, ie values of a dictionary won't be resolved
* automatically.
*/
-Object *pdfeval(Object **o);
+Object *pdfeval(Object *o);
/*
* Increment refcount of an object. Freeing an object (or its
--- a/stream.c
+++ b/stream.c
@@ -25,7 +25,8 @@
int i, nflts, r;
s = nil;
- if(pdfeval(&o)->type != Ostream){ /* FIXME open a string object as a stream as well? */
+ o = pdfeval(o);
+ if(o->type != Ostream){ /* FIXME open a string object as a stream as well? */
werrstr("not a stream");
return nil;
}
@@ -38,7 +39,8 @@
/* see if there are any filters */
if((of = dictget(o, "Filter")) != &null){
- if(pdfeval(&of)->type == Oname){ /* one filter */
+ of = pdfeval(of);
+ if(of->type == Oname){ /* one filter */
flts = &of;
nflts = 1;
}else if(of->type == Oarray){ /* array of filters */