shithub: libmujs

--- a/docs/reference.html

+++ b/docs/reference.html

@@ -65,8 +65,11 @@

 CESU-8 is a variant of UTF-8 which encodes supplementary unicode characters as

 surrogate pairs. This maintains compatibility with the UTF-16 nature of

 JavaScript, but requires attention when passing strings using supplementary

-unicode characters to and from the MuJS library. It also means that you cannot

-have any JavaScript strings with a zero character value in MuJS.

+unicode characters to and from the MuJS library.

+<p>

+The U+0000 character is encoded as the two-byte sequence &lt;C0 80&gt;, same as in

+modified UTF-8.

 <h3>Environments</h3>

--- a/jslex.c

+++ b/jslex.c

@@ -158,6 +158,10 @@

 static void jsY_next(js_State *J)

 	Rune c;

+	if (*J->source == 0) {

+		J->lexchar = EOF;

+		return;

+	}

 	J->source += chartorune(&c, J->source);

 	/* consume CR LF as one unit */

 	if (c == '\r' && *J->source == '\n')

@@ -201,17 +205,24 @@

 static void textpush(js_State *J, Rune c)

-	int n = runelen(c);

+	int n;

+	if (c == EOF)

+		n = 1;

+	else

+		n = runelen(c);

 	if (J->lexbuf.len + n > J->lexbuf.cap) {

 		J->lexbuf.cap = J->lexbuf.cap * 2;

 		J->lexbuf.text = js_realloc(J, J->lexbuf.text, J->lexbuf.cap);

-	J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);

+	if (c == EOF)

+		J->lexbuf.text[J->lexbuf.len++] = 0;

+	else

+		J->lexbuf.len += runetochar(J->lexbuf.text + J->lexbuf.len, &c);

 static char *textend(js_State *J)

-	textpush(J, 0);

+	textpush(J, EOF);

 	return J->lexbuf.text;

@@ -224,7 +235,7 @@

 static int lexcomment(js_State *J)

 	/* already consumed initial '/' '*' sequence */

-	while (J->lexchar != 0) {

+	while (J->lexchar != EOF) {

 		if (jsY_accept(J, '*')) {

 			while (J->lexchar == '*')

 				jsY_next(J);

@@ -385,7 +396,7 @@

 		return 0;

 	switch (J->lexchar) {

-	case 0: jsY_error(J, "unterminated escape sequence");

+	case EOF: jsY_error(J, "unterminated escape sequence");

 	case 'u':

 		jsY_next(J);

 		if (!jsY_ishex(J->lexchar)) return 1; else { x |= jsY_tohex(J->lexchar) << 12; jsY_next(J); }

@@ -425,7 +436,7 @@

 	textinit(J);

 	while (J->lexchar != q) {

-		if (J->lexchar == 0 || J->lexchar == '\n')

+		if (J->lexchar == EOF || J->lexchar == '\n')

 			jsY_error(J, "string not terminated");

 		if (jsY_accept(J, '\\')) {

 			if (lexescape(J))

@@ -475,7 +486,7 @@

 	/* regexp body */

 	while (J->lexchar != '/' || inclass) {

-		if (J->lexchar == 0 || J->lexchar == '\n') {

+		if (J->lexchar == EOF || J->lexchar == '\n') {

 			jsY_error(J, "regular expression not terminated");

 		} else if (jsY_accept(J, '\\')) {

 			if (jsY_accept(J, '/')) {

@@ -482,7 +493,7 @@

 				textpush(J, '/');

 			} else {

 				textpush(J, '\\');

-				if (J->lexchar == 0 || J->lexchar == '\n')

+				if (J->lexchar == EOF || J->lexchar == '\n')

 					jsY_error(J, "regular expression not terminated");

 				textpush(J, J->lexchar);

 				jsY_next(J);

@@ -688,7 +699,7 @@

 				return TK_XOR_ASS;

 			return '^';

-		case 0:

+		case EOF:

 			return 0; /* EOF */

@@ -803,7 +814,7 @@

 	textinit(J);

 	while (J->lexchar != '"') {

-		if (J->lexchar == 0)

+		if (J->lexchar == EOF)

 			jsY_error(J, "unterminated string");

 		else if (J->lexchar < 32)

 			jsY_error(J, "invalid control character in string");

@@ -857,7 +868,7 @@

 			jsY_next(J); jsY_expect(J, 'r'); jsY_expect(J, 'u'); jsY_expect(J, 'e');

 			return TK_TRUE;

-		case 0:

+		case EOF:

 			return 0; /* EOF */

--- a/jsrun.c

+++ b/jsrun.c

@@ -469,7 +469,7 @@

 static void js_pushrune(js_State *J, Rune rune)

 	char buf[UTFmax + 1];

-	if (rune > 0) {

+	if (rune >= 0) {

 		buf[runetochar(buf, &rune)] = 0;

 		js_pushstring(J, buf);

 	} else {

--- a/jsstring.c

+++ b/jsstring.c

@@ -21,12 +21,12 @@

 int js_runeat(js_State *J, const char *s, int i)

-	Rune rune = 0;

+	Rune rune = EOF;

 	while (i-- >= 0) {

 		rune = *(unsigned char*)s;

 		if (rune < Runeself) {

 			if (rune == 0)

-				return 0;

+				return EOF;

 			++s;

 		} else

 			s += chartorune(&rune, s);

@@ -93,7 +93,7 @@

 	const char *s = checkstring(J, 0);

 	int pos = js_tointeger(J, 1);

 	Rune rune = js_runeat(J, s, pos);

-	if (rune > 0) {

+	if (rune >= 0) {

 		buf[runetochar(buf, &rune)] = 0;

 		js_pushstring(J, buf);

 	} else {

@@ -106,7 +106,7 @@

 	const char *s = checkstring(J, 0);

 	int pos = js_tointeger(J, 1);

 	Rune rune = js_runeat(J, s, pos);

-	if (rune > 0)

+	if (rune >= 0)

 		js_pushnumber(J, rune);

 	else

 		js_pushnumber(J, NAN);

--- a/regexp.c

+++ b/regexp.c

@@ -116,11 +116,16 @@

 static int nextrune(struct cstate *g)

+	if (!*g->source) {

+		g->yychar = EOF;

+		return 0;

+	}

 	g->source += chartorune(&g->yychar, g->source);

 	if (g->yychar == '\\') {

+		if (!*g->source)

+			die(g, "unterminated escape sequence");

 		g->source += chartorune(&g->yychar, g->source);

 		switch (g->yychar) {

-		case 0: die(g, "unterminated escape sequence"); break;

 		case 'f': g->yychar = '\f'; return 0;

 		case 'n': g->yychar = '\n'; return 0;

 		case 'r': g->yychar = '\r'; return 0;

@@ -147,6 +152,9 @@

 				return 1;

 			return 0;

+		case 0:

+			g->yychar = '0';

+			return 1;

 		if (strchr(ESCAPES, g->yychar))

 			return 1;

@@ -272,7 +280,7 @@

 	havesave = havedash = 0;

 	for (;;) {

-		if (g->yychar == 0)

+		if (g->yychar == EOF)

 			die(g, "unterminated character class");

 		if (!quoted && g->yychar == ']')

 			break;

@@ -363,7 +371,7 @@

 	switch (g->yychar) {

-	case 0:

+	case EOF:

 	case '$': case ')': case '*': case '+':

 	case '.': case '?': case '^': case '|':

 		return g->yychar;

@@ -561,11 +569,11 @@

 static Renode *parsecat(struct cstate *g)

 	Renode *cat, *head, **tail;

-	if (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {

+	if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {

 		/* Build a right-leaning tree by splicing in new 'cat' at the tail. */

 		head = parserep(g);

 		tail = &head;

-		while (g->lookahead && g->lookahead != '|' && g->lookahead != ')') {

+		while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {

 			cat = newnode(g, P_CAT);

 			cat->x = *tail;

 			cat->y = parserep(g);

@@ -866,7 +874,7 @@

 	node = parsealt(&g);

 	if (g.lookahead == ')')

 		die(&g, "unmatched ')'");

-	if (g.lookahead != 0)

+	if (g.lookahead != EOF)

 		die(&g, "syntax error");

 #ifdef TEST

@@ -1026,23 +1034,20 @@

 			break;

 		case I_ANYNL:

+			if (!*sp) return 1;

 			sp += chartorune(&c, sp);

-			if (c == 0)

-				return 1;

 			pc = pc + 1;

 			break;

 		case I_ANY:

+			if (!*sp) return 1;

 			sp += chartorune(&c, sp);

-			if (c == 0)

-				return 1;

 			if (isnewline(c))

 				return 1;

 			pc = pc + 1;

 			break;

 		case I_CHAR:

+			if (!*sp) return 1;

 			sp += chartorune(&c, sp);

-			if (c == 0)

-				return 1;

 			if (flags & REG_ICASE)

 				c = canon(c);

 			if (c != pc->c)

@@ -1050,9 +1055,8 @@

 			pc = pc + 1;

 			break;

 		case I_CCLASS:

+			if (!*sp) return 1;

 			sp += chartorune(&c, sp);

-			if (c == 0)

-				return 1;

 			if (flags & REG_ICASE) {

 				if (!incclasscanon(pc->cc, canon(c)))

 					return 1;

@@ -1063,9 +1067,8 @@

 			pc = pc + 1;

 			break;

 		case I_NCCLASS:

+			if (!*sp) return 1;

 			sp += chartorune(&c, sp);

-			if (c == 0)

-				return 1;

 			if (flags & REG_ICASE) {

 				if (incclasscanon(pc->cc, canon(c)))

 					return 1;

--- a/utf.c

+++ b/utf.c

@@ -48,6 +48,12 @@

 	int c, c1, c2;

 	int l;

+	/* overlong null character */

+	if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) {

+		*rune = 0;

+		return 2;

+	}

/*

 	 * one character sequence

 	 *	00000-0007F => T1

@@ -101,13 +107,19 @@

int

 runetochar(char *str, const Rune *rune)

-	int c;

+	int c = *rune;

+	/* overlong null character */

+	if (c == 0) {

+		str[0] = 0xc0;

+		str[1] = 0x80;

+		return 2;

+	}

/*

 	 * one character sequence

 	 *	00000-0007F => 00-7F

*/

-	c = *rune;

 	if(c <= Rune1) {

 		str[0] = c;

 		return 1;

--

⑨