ref: 0261579d78c5ca789508aef01cdaf8d374432c13
parent: 331c5ecbaca705eb8c899e814afe94971b14207b
author: Tor Andersson <tor.andersson@artifex.com>
date: Thu May 14 09:59:34 EDT 2020
Support embedded 0 in strings by using modified UTF-8.
--- a/docs/reference.html
+++ b/docs/reference.html
@@ -65,8 +65,11 @@
CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as
surrogate pairs. This maintains compatibility with the UTF-16 nature of
JavaScript, but requires attention when passing strings using supplementary
-unicode characters to and from the MuJS library. It also means that you cannot
-have any JavaScript strings with a zero character value in MuJS.
+unicode characters to and from the MuJS library.
+
+<p>
+The U+0000 character is encoded as the two-byte sequence &lt;C0 80&gt;, same as in
+modified UTF-8.
<h3>Environments</h3>
--- a/jslex.c
+++ b/jslex.c
@@ -158,6 +158,10 @@
static void jsY_next(js_State *J)
{
Rune c;
+ if (*J->source == 0) {
+ J->lexchar = EOF;
+ return;
+ }
J->source += chartorune(&c, J->source);
/* consume CR LF as one unit */
if (c == '\r' && *J->source == '\n')
@@ -201,17 +205,24 @@
static void textpush(js_State *J, Rune c)
{
- int n = runelen(c);
+ int n;
+ if (c == EOF)
+ n = 1;
+ else
+ n = runelen(c);
if (J->lexbuf.len + n > J->lexbuf.cap) {
J->lexbuf.cap = J->lexbuf.cap * 2;
J->lexbuf.text = js_realloc(J, J->lexbuf.text, J->lexbuf.cap);
}
- J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
+ if (c == EOF)
+ J->lexbuf.text[J->lexbuf.len++] = 0;
+ else
+ J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);
}
static char *textend(js_State *J)
{
- textpush(J, 0);
+ textpush(J, EOF);
return J->lexbuf.text;
}
@@ -224,7 +235,7 @@
static int lexcomment(js_State *J)
{
/* already consumed initial '/' '*' sequence */
- while (J->lexchar != 0) {
+ while (J->lexchar != EOF) {
if (jsY_accept(J, '*')) {
while (J->lexchar == '*')
jsY_next(J);
@@ -385,7 +396,7 @@
return 0;
switch (J->lexchar) {
- case 0: jsY_error(J, "unterminated escape sequence");
+ case EOF: jsY_error(J, "unterminated escape sequence");
case 'u':
jsY_next(J);
if (!jsY_ishex(J->lexchar)) return 1; else { x |= jsY_tohex(J->lexchar) << 12; jsY_next(J); }
@@ -425,7 +436,7 @@
textinit(J);
while (J->lexchar != q) {
- if (J->lexchar == 0 || J->lexchar == '\n')
+ if (J->lexchar == EOF || J->lexchar == '\n')
jsY_error(J, "string not terminated");
if (jsY_accept(J, '\\')) {
if (lexescape(J))
@@ -475,7 +486,7 @@
/* regexp body */
while (J->lexchar != '/' || inclass) {
- if (J->lexchar == 0 || J->lexchar == '\n') {
+ if (J->lexchar == EOF || J->lexchar == '\n') {
jsY_error(J, "regular expression not terminated");
} else if (jsY_accept(J, '\\')) {
if (jsY_accept(J, '/')) {
@@ -482,7 +493,7 @@
textpush(J, '/');
} else {
textpush(J, '\\');
- if (J->lexchar == 0 || J->lexchar == '\n')
+ if (J->lexchar == EOF || J->lexchar == '\n')
jsY_error(J, "regular expression not terminated");
textpush(J, J->lexchar);
jsY_next(J);
@@ -688,7 +699,7 @@
return TK_XOR_ASS;
return '^';
- case 0:
+ case EOF:
return 0; /* EOF */
}
@@ -803,7 +814,7 @@
textinit(J);
while (J->lexchar != '"') {
- if (J->lexchar == 0)
+ if (J->lexchar == EOF)
jsY_error(J, "unterminated string");
else if (J->lexchar < 32)
jsY_error(J, "invalid control character in string");
@@ -857,7 +868,7 @@
jsY_next(J); jsY_expect(J, 'r'); jsY_expect(J, 'u'); jsY_expect(J, 'e');
return TK_TRUE;
- case 0:
+ case EOF:
return 0; /* EOF */
}
--- a/jsrun.c
+++ b/jsrun.c
@@ -469,7 +469,7 @@
static void js_pushrune(js_State *J, Rune rune)
{
char buf[UTFmax + 1];
- if (rune > 0) {
+ if (rune >= 0) {
buf[runetochar(buf, &rune)] = 0;
js_pushstring(J, buf);
} else {
--- a/jsstring.c
+++ b/jsstring.c
@@ -21,12 +21,12 @@
int js_runeat(js_State *J, const char *s, int i)
{
- Rune rune = 0;
+ Rune rune = EOF;
while (i-- >= 0) {
rune = *(unsigned char*)s;
if (rune < Runeself) {
if (rune == 0)
- return 0;
+ return EOF;
++s;
} else
s += chartorune(&rune, s);
@@ -93,7 +93,7 @@
const char *s = checkstring(J, 0);
int pos = js_tointeger(J, 1);
Rune rune = js_runeat(J, s, pos);
- if (rune > 0) {
+ if (rune >= 0) {
buf[runetochar(buf, &rune)] = 0;
js_pushstring(J, buf);
} else {
@@ -106,7 +106,7 @@
const char *s = checkstring(J, 0);
int pos = js_tointeger(J, 1);
Rune rune = js_runeat(J, s, pos);
- if (rune > 0)
+ if (rune >= 0)
js_pushnumber(J, rune);
else
js_pushnumber(J, NAN);
--- a/regexp.c
+++ b/regexp.c
@@ -116,11 +116,16 @@
static int nextrune(struct cstate *g)
{
+ if (!*g->source) {
+ g->yychar = EOF;
+ return 0;
+ }
g->source += chartorune(&g->yychar, g->source);
if (g->yychar == '\\') {
+ if (!*g->source)
+ die(g, "unterminated escape sequence");
g->source += chartorune(&g->yychar, g->source);
switch (g->yychar) {
- case 0: die(g, "unterminated escape sequence"); break;
case 'f': g->yychar = '\f'; return 0;
case 'n': g->yychar = '\n'; return 0;
case 'r': g->yychar = '\r'; return 0;
@@ -147,6 +152,9 @@
return 1;
}
return 0;
+ case 0:
+ g->yychar = '0';
+ return 1;
}
if (strchr(ESCAPES, g->yychar))
return 1;
@@ -272,7 +280,7 @@
havesave = havedash = 0;
for (;;) {
- if (g->yychar == 0)
+ if (g->yychar == EOF)
die(g, "unterminated character class");
if (!quoted && g->yychar == ']')
break;
@@ -363,7 +371,7 @@
}
switch (g->yychar) {
- case 0:
+ case EOF:
case '$': case ')': case '*': case '+':
case '.': case '?': case '^': case '|':
return g->yychar;
@@ -561,11 +569,11 @@
static Renode *parsecat(struct cstate *g)
{
Renode *cat, *head, **tail;
- if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+ if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
/* Build a right-leaning tree by splicing in new 'cat' at the tail. */
head = parserep(g);
tail = &head;
- while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {
+ while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
cat = newnode(g, P_CAT);
cat->x = *tail;
cat->y = parserep(g);
@@ -866,7 +874,7 @@
node = parsealt(&g);
if (g.lookahead == ')')
die(&g, "unmatched ')'");
- if (g.lookahead != 0)
+ if (g.lookahead != EOF)
die(&g, "syntax error");
#ifdef TEST
@@ -1026,23 +1034,20 @@
break;
case I_ANYNL:
+ if (!*sp) return 1;
sp += chartorune(&c, sp);
- if (c == 0)
- return 1;
pc = pc + 1;
break;
case I_ANY:
+ if (!*sp) return 1;
sp += chartorune(&c, sp);
- if (c == 0)
- return 1;
if (isnewline(c))
return 1;
pc = pc + 1;
break;
case I_CHAR:
+ if (!*sp) return 1;
sp += chartorune(&c, sp);
- if (c == 0)
- return 1;
if (flags & REG_ICASE)
c = canon(c);
if (c != pc->c)
@@ -1050,9 +1055,8 @@
pc = pc + 1;
break;
case I_CCLASS:
+ if (!*sp) return 1;
sp += chartorune(&c, sp);
- if (c == 0)
- return 1;
if (flags & REG_ICASE) {
if (!incclasscanon(pc->cc, canon(c)))
return 1;
@@ -1063,9 +1067,8 @@
pc = pc + 1;
break;
case I_NCCLASS:
+ if (!*sp) return 1;
sp += chartorune(&c, sp);
- if (c == 0)
- return 1;
if (flags & REG_ICASE) {
if (incclasscanon(pc->cc, canon(c)))
return 1;
--- a/utf.c
+++ b/utf.c
@@ -48,6 +48,12 @@
int c, c1, c2;
int l;
+ /* overlong null character */
+ if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) {
+ *rune = 0;
+ return 2;
+ }
+
/*
* one character sequence
* 00000-0007F => T1
@@ -101,13 +107,19 @@
int
runetochar(char *str, const Rune *rune)
{
- int c;
+ int c = *rune;
+ /* overlong null character */
+ if (c == 0) {
+ str[0] = 0xc0;
+ str[1] = 0x80;
+ return 2;
+ }
+
/*
* one character sequence
* 00000-0007F => 00-7F
*/
- c = *rune;
if(c <= Rune1) {
str[0] = c;
return 1;