ref: c3134491a3010fcce440a45968407c6511766671
parent: 060a2afad426347a9d63544f8ff7a5c2c96ae8e1
author: Robin Watts <Robin.Watts@artifex.com>
date: Tue Jan 21 13:16:37 EST 2020
Optimise jbig2_image_compose. Work at byte level rather than bit level, and use static inline templating methods to generate optimised versions for each different operator.
--- a/jbig2_image.c
+++ b/jbig2_image.c
@@ -164,134 +164,249 @@
return image;
}
-/* composite one jbig2_image onto another
- slow but general version */
-static int
-jbig2_image_compose_unopt(Jbig2Ctx *ctx, Jbig2Image *dst, Jbig2Image *src, int x, int y, Jbig2ComposeOp op)
+static void
+template_image_compose_opt(const uint8_t * JBIG2_RESTRICT ss, uint8_t * JBIG2_RESTRICT dd, int early, int late, uint8_t leftmask, uint8_t rightmask, uint32_t bytewidth_, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride, Jbig2ComposeOp op)
{
- uint32_t i, j;
- uint32_t sw = src->width;
- uint32_t sh = src->height;
- uint32_t sx = 0;
- uint32_t sy = 0;
+ int i;
+ uint32_t j;
+ int bytewidth = (int)bytewidth_;
- /* clip to the dst image boundaries */
- if (x < 0) {
- sx += -x;
- if (sw < (uint32_t) -x)
- sw = 0;
- else
- sw -= -x;
- x = 0;
+ if (bytewidth == 1) {
+ for (j = 0; j < h; j++) {
+ /* Only 1 byte! */
+ uint8_t v = (((early ? 0 : ss[0]<<8) | (late ? 0 : ss[1]))>>shift);
+ if (op == JBIG2_COMPOSE_OR)
+ *dd |= v & leftmask;
+ else if (op == JBIG2_COMPOSE_AND)
+ *dd &= (v & leftmask) | ~leftmask;
+ else if (op == JBIG2_COMPOSE_XOR)
+ *dd ^= v & leftmask;
+ else if (op == JBIG2_COMPOSE_XNOR)
+ *dd ^= (~v) & leftmask;
+ else /* Replace */
+ *dd = (v & leftmask) | (*dd & ~leftmask);
+ dd += dstride;
+ ss += sstride;
+ }
+ return;
}
- if (y < 0) {
- sy += -y;
- if (sh < (uint32_t) -y)
- sh = 0;
- else
- sh -= -y;
- y = 0;
+ bytewidth -= 2;
+ if (shift == 0) {
+ ss++;
+ for (j = 0; j < h; j++) {
+ /* Left byte */
+ const uint8_t * JBIG2_RESTRICT s = ss;
+ uint8_t * JBIG2_RESTRICT d = dd;
+ if (op == JBIG2_COMPOSE_OR)
+ *d++ |= *s++ & leftmask;
+ else if (op == JBIG2_COMPOSE_AND)
+ *d++ &= (*s++ & leftmask) | ~leftmask;
+ else if (op == JBIG2_COMPOSE_XOR)
+ *d++ ^= *s++ & leftmask;
+ else if (op == JBIG2_COMPOSE_XNOR)
+ *d++ ^= (~*s++) & leftmask;
+ else /* Replace */
+ *d = (*s++ & leftmask) | (*d & ~leftmask), d++;
+ /* Central run */
+ for (i = bytewidth; i != 0; i--)
+ {
+ if (op == JBIG2_COMPOSE_OR)
+ *d++ |= *s++;
+ else if (op == JBIG2_COMPOSE_AND)
+ *d++ &= *s++;
+ else if (op == JBIG2_COMPOSE_XOR)
+ *d++ ^= *s++;
+ else if (op == JBIG2_COMPOSE_XNOR)
+ *d++ ^= ~*s++;
+ else /* Replace */
+ *d++ = *s++;
+ }
+ /* Right byte */
+ if (op == JBIG2_COMPOSE_OR)
+ *d |= *s & rightmask;
+ else if (op == JBIG2_COMPOSE_AND)
+ *d &= (*s & rightmask) | ~rightmask;
+ else if (op == JBIG2_COMPOSE_XOR)
+ *d ^= *s & rightmask;
+ else if (op == JBIG2_COMPOSE_XNOR)
+ *d ^= (~*s) & rightmask;
+ else /* Replace */
+ *d = (*s & rightmask) | (*d & ~rightmask);
+ dd += dstride;
+ ss += sstride;
+ }
+ } else {
+ for (j = 0; j < h; j++) {
+ /* Left byte */
+ const uint8_t * JBIG2_RESTRICT s = ss;
+ uint8_t * JBIG2_RESTRICT d = dd;
+ uint8_t s0, s1, v;
+ s0 = early ? 0 : *s;
+ s++;
+ s1 = *s++;
+ v = ((s0<<8) | s1)>>shift;
+ if (op == JBIG2_COMPOSE_OR)
+ *d++ |= v & leftmask;
+ else if (op == JBIG2_COMPOSE_AND)
+ *d++ &= (v & leftmask) | ~leftmask;
+ else if (op == JBIG2_COMPOSE_XOR)
+ *d++ ^= v & leftmask;
+ else if (op == JBIG2_COMPOSE_XNOR)
+ *d++ ^= (~v) & leftmask;
+ else /* Replace */
+ *d = (v & leftmask) | (*d & ~leftmask), d++;
+ /* Central run */
+ for (i = bytewidth; i > 0; i--)
+ {
+ s0 = s1; s1 = *s++;
+ v = ((s0<<8) | s1)>>shift;
+ if (op == JBIG2_COMPOSE_OR)
+ *d++ |= v;
+ else if (op == JBIG2_COMPOSE_AND)
+ *d++ &= v;
+ else if (op == JBIG2_COMPOSE_XOR)
+ *d++ ^= v;
+ else if (op == JBIG2_COMPOSE_XNOR)
+ *d++ ^= ~v;
+ else /* Replace */
+ *d++ = v;
+ }
+ /* Right byte */
+ s0 = s1; s1 = (late ? 0 : *s);
+ v = (((s0<<8) | s1)>>shift);
+ if (op == JBIG2_COMPOSE_OR)
+ *d |= v & rightmask;
+ else if (op == JBIG2_COMPOSE_AND)
+ *d &= (v & rightmask) | ~rightmask;
+ else if (op == JBIG2_COMPOSE_XOR)
+ *d ^= v & rightmask;
+ else if (op == JBIG2_COMPOSE_XNOR)
+ *d ^= ~v & rightmask;
+ else /* Replace */
+ *d = (v & rightmask) | (*d & ~rightmask);
+ dd += dstride;
+ ss += sstride;
+ }
}
- if ((uint32_t) x + sw >= dst->width) {
- if (dst->width >= (uint32_t) x)
- sw = dst->width - x;
- else
- sw = 0;
- }
- if ((uint32_t) y + sh >= dst->height) {
- if (dst->height >= (uint32_t) y)
- sh = dst->height - y;
- else
- sh = 0;
- }
+}
- switch (op) {
- case JBIG2_COMPOSE_OR:
- for (j = 0; j < sh; j++) {
- for (i = 0; i < sw; i++) {
- jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy) | jbig2_image_get_pixel(dst, i + x, j + y));
- }
- }
- break;
- case JBIG2_COMPOSE_AND:
- for (j = 0; j < sh; j++) {
- for (i = 0; i < sw; i++) {
- jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy) & jbig2_image_get_pixel(dst, i + x, j + y));
- }
- }
- break;
- case JBIG2_COMPOSE_XOR:
- for (j = 0; j < sh; j++) {
- for (i = 0; i < sw; i++) {
- jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy) ^ jbig2_image_get_pixel(dst, i + x, j + y));
- }
- }
- break;
- case JBIG2_COMPOSE_XNOR:
- for (j = 0; j < sh; j++) {
- for (i = 0; i < sw; i++) {
- jbig2_image_set_pixel(dst, i + x, j + y, (jbig2_image_get_pixel(src, i + sx, j + sy) == jbig2_image_get_pixel(dst, i + x, j + y)));
- }
- }
- break;
- case JBIG2_COMPOSE_REPLACE:
- for (j = 0; j < sh; j++) {
- for (i = 0; i < sw; i++) {
- jbig2_image_set_pixel(dst, i + x, j + y, jbig2_image_get_pixel(src, i + sx, j + sy));
- }
- }
- break;
- }
+/* OR specialisation of the compose template. When neither edge flag is
+ * set (the common, fully-interior case) the template is called with
+ * compile-time-constant zeros for early/late, which lets the compiler
+ * fold away the edge checks and emit a tighter inner loop. */
+static void
+jbig2_image_compose_opt_OR(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_OR);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_OR);
+}
- return 0;
+/* AND specialisation of the compose template; see the OR variant for
+ * the early/late constant-folding rationale. */
+static void
+jbig2_image_compose_opt_AND(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_AND);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_AND);
}
+/* XOR specialisation of the compose template; see the OR variant for
+ * the early/late constant-folding rationale. */
+static void
+jbig2_image_compose_opt_XOR(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XOR);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XOR);
+}
+
+/* XNOR specialisation of the compose template; see the OR variant for
+ * the early/late constant-folding rationale. */
+static void
+jbig2_image_compose_opt_XNOR(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XNOR);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_XNOR);
+}
+
+/* REPLACE specialisation of the compose template; see the OR variant
+ * for the early/late constant-folding rationale. */
+static void
+jbig2_image_compose_opt_REPLACE(const uint8_t *s, uint8_t *d, int early, int late, uint8_t mask, uint8_t rightmask, uint32_t bytewidth, uint32_t h, uint32_t shift, uint32_t dstride, uint32_t sstride)
+{
+    if (early || late)
+        template_image_compose_opt(s, d, early, late, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_REPLACE);
+    else
+        template_image_compose_opt(s, d, 0, 0, mask, rightmask, bytewidth, h, shift, dstride, sstride, JBIG2_COMPOSE_REPLACE);
+}
+
/* composite one jbig2_image onto another */
int
jbig2_image_compose(Jbig2Ctx *ctx, Jbig2Image *dst, Jbig2Image *src, int x, int y, Jbig2ComposeOp op)
{
- uint32_t i, j;
uint32_t w, h;
- uint32_t leftbyte, rightbyte;
uint32_t shift;
- uint8_t *s, *ss;
- uint8_t *d, *dd;
- uint8_t mask, rightmask;
+ uint32_t leftbyte;
+ uint8_t *ss;
+ uint8_t *dd;
+ uint8_t leftmask, rightmask;
+ int early = x >= 0;
+ int late;
+ uint32_t bytewidth;
+ uint32_t syoffset = 0;
if (src == NULL)
return 0;
- /* The optimized code for the OR operator below doesn't
- handle the source image partially placed outside the
- destination (above and/or to the left). The affected
- intersection of the destination is computed correctly,
- however the correct subset of the source image is not
- chosen. Instead the upper left corner of the source image
- is always used.
+ /* This code takes a src image and combines it onto dst at offset (x,y), with operation op. */
- In the unoptimized version that handles all operators
- (including OR) the correct subset of the source image is
- chosen.
+ /* Data is packed msb first within a byte, so with bits numbered: 01234567.
+ * Second byte is: 89abcdef. So to combine into a run, we use:
+ * (s[0]<<8) | s[1] == 0123456789abcdef.
+ * To read from src into dst at offset 3, we need to read:
+     * read:     0123456789abcdef...
+     * write: xxx0123456789abcdef...
+ * In general, to read from src and write into dst at offset x, we need to shift
+ * down by (x&7) bits to allow for bit alignment. So shift = x&7.
+ * So the 'central' part of our runs will see us doing:
+ * *d++ op= ((s[0]<<8)|s[1])>>shift;
+ * with special cases on the left and right edges of the run to mask.
+ * With the left hand edge, we have to be careful not to 'underread' the start of
+ * the src image; this is what the early flag is about. Similarly we have to be
+ * careful not to read off the right hand edge; this is what the late flag is for.
+ */
- The workaround is to check whether the x/y coordinates to
- the composition operator are negative and in this case use
- the unoptimized implementation.
-
- TODO: Fix the optimized OR implementation if possible. */
- if (op != JBIG2_COMPOSE_OR || x < 0 || y < 0) {
- /* hand off the the general routine */
- return jbig2_image_compose_unopt(ctx, dst, src, x, y, op);
- }
-
- /* optimized code for the prevalent OR operator */
-
/* clip */
w = src->width;
h = src->height;
- ss = src->data;
+ shift = (x & 7);
+ ss = src->data - early;
- w = ((uint32_t) x + w < dst->width) ? w : ((dst->width >= (uint32_t) x) ? dst->width - (uint32_t) x : 0);
- h = ((uint32_t) y + h < dst->height) ? h : ((dst->height >= (uint32_t) y) ? dst->height - (uint32_t) y : 0);
+ if (x < 0) {
+ if (w < (uint32_t) -x)
+ w = 0;
+ else
+ w += x;
+ ss += (-x-1)>>3;
+ x = 0;
+ }
+ if (y < 0) {
+ if (h < (uint32_t) -y)
+ h = 0;
+ else
+ h += y;
+ syoffset = -y * src->stride;
+ y = 0;
+ }
+ if ((uint32_t)x + w > dst->width)
+ {
+ if (dst->width < (uint32_t)x)
+ w = 0;
+ else
+ w = dst->width - x;
+ }
+ if ((uint32_t)y + h > dst->height)
+ {
+ if (dst->height < (uint32_t)y)
+ h = 0;
+ else
+ h = dst->height - y;
+ }
#ifdef JBIG2_DEBUG
jbig2_error(ctx, JBIG2_SEVERITY_DEBUG, -1, "compositing %dx%d at (%d, %d) after clipping", w, h, x, y);
#endif
@@ -305,55 +420,32 @@
}
leftbyte = (uint32_t) x >> 3;
- rightbyte = ((uint32_t) x + w - 1) >> 3;
- shift = x & 7;
+ dd = dst->data + y * dst->stride + leftbyte;
+ bytewidth = (((uint32_t) x + w - 1) >> 3) - leftbyte + 1;
+ leftmask = 255>>(x&7);
+ rightmask = (((x+w)&7) == 0) ? 255 : ~(255>>((x+w)&7));
+ if (bytewidth == 1)
+ leftmask &= rightmask;
+ late = (ss + bytewidth >= src->data + ((src->width+7)>>3));
+ ss += syoffset;
- /* general OR case */
- s = ss;
- d = dd = dst->data + y * dst->stride + leftbyte;
- if (d < dst->data ||
- leftbyte > dst->stride ||
- d - leftbyte + (size_t) h * dst->stride > dst->data + (size_t) dst->height * dst->stride ||
- s - leftbyte + (size_t) (h - 1) * src->stride + rightbyte > src->data + (size_t) src->height * src->stride) {
- return jbig2_error(ctx, JBIG2_SEVERITY_FATAL, -1, "preventing heap overflow in jbig2_image_compose");
- }
- if (leftbyte == rightbyte) {
- mask = 0x100 - (0x100 >> w);
- for (j = 0; j < h; j++) {
- *d |= (*s & mask) >> shift;
- d += dst->stride;
- s += src->stride;
- }
- } else if (shift == 0) {
- rightmask = (w & 7) ? 0x100 - (1 << (8 - (w & 7))) : 0xFF;
- for (j = 0; j < h; j++) {
- for (i = leftbyte; i < rightbyte; i++)
- *d++ |= *s++;
- *d |= *s & rightmask;
- d = (dd += dst->stride);
- s = (ss += src->stride);
- }
- } else {
- bool overlap = (((w + 7) >> 3) < ((x + w + 7) >> 3) - (x >> 3));
-
- mask = 0x100 - (1 << shift);
- if (overlap)
- rightmask = (0x100 - (0x100 >> ((x + w) & 7))) >> (8 - shift);
- else
- rightmask = 0x100 - (0x100 >> (w & 7));
- for (j = 0; j < h; j++) {
- *d++ |= (*s & mask) >> shift;
- for (i = leftbyte; i < rightbyte - 1; i++) {
- *d |= ((*s++ & ~mask) << (8 - shift));
- *d++ |= ((*s & mask) >> shift);
- }
- if (overlap)
- *d |= (*s & rightmask) << (8 - shift);
- else
- *d |= ((s[0] & ~mask) << (8 - shift)) | ((s[1] & rightmask) >> shift);
- d = (dd += dst->stride);
- s = (ss += src->stride);
- }
+ switch(op)
+ {
+ case JBIG2_COMPOSE_OR:
+ jbig2_image_compose_opt_OR(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+ break;
+ case JBIG2_COMPOSE_AND:
+ jbig2_image_compose_opt_AND(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+ break;
+ case JBIG2_COMPOSE_XOR:
+ jbig2_image_compose_opt_XOR(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+ break;
+ case JBIG2_COMPOSE_XNOR:
+ jbig2_image_compose_opt_XNOR(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+ break;
+ case JBIG2_COMPOSE_REPLACE:
+ jbig2_image_compose_opt_REPLACE(ss, dd, early, late, leftmask, rightmask, bytewidth, h, shift, dst->stride, src->stride);
+ break;
}
return 0;
--- a/jbig2_priv.h
+++ b/jbig2_priv.h
@@ -116,11 +116,7 @@
#define jbig2_renew(ctx, p, t, size) ((t *)jbig2_realloc(ctx->allocator, (p), size, sizeof(t)))
-int jbig2_error(Jbig2Ctx *ctx, Jbig2Severity severity, int32_t seg_idx, const char *fmt, ...)
-#ifdef __GNUC__
- __attribute__ ((format (__printf__, 4, 5)))
-#endif
- ;
+int jbig2_error(Jbig2Ctx *ctx, Jbig2Severity severity, int32_t seg_idx, const char *fmt, ...);
/* The word stream design is a compromise between simplicity and
trying to amortize the number of method calls. Each ::get_next_word
@@ -136,5 +132,16 @@
Jbig2WordStream *jbig2_word_stream_buf_new(Jbig2Ctx *ctx, const byte *data, size_t size);
void jbig2_word_stream_buf_free(Jbig2Ctx *ctx, Jbig2WordStream *ws);
+
+/* restrict is standard in C99, but not in all C++ compilers. */
+/* NOTE(review): the original patch tested __STDC_VERSION_ (missing the
+ * trailing underscore), so the C99 branch could never be taken and
+ * strict-C99 compilers other than GCC/MSVC silently lost restrict.
+ * Fixed to the correct predefined macro __STDC_VERSION__. */
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */
+#define JBIG2_RESTRICT restrict
+#elif defined(_MSC_VER) && (_MSC_VER >= 1600) /* MSVC 10 or newer */
+#define JBIG2_RESTRICT __restrict
+#elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC 3 or newer */
+#define JBIG2_RESTRICT __restrict
+#else /* Unknown or ancient */
+#define JBIG2_RESTRICT
+#endif
#endif /* _JBIG2_PRIV_H */