ref: d2697fd6435d0ffaae5c424e00ccf3a4b5a21f56
parent: a442a0418f06b673559c93a69e3167b4460d6d07
author: Tor Andersson <tor.andersson@artifex.com>
date: Tue Jan 23 09:33:04 EST 2024
Expose extended unicode characters as surrogate pairs in String methods. Split extended characters into surrogate pairs for charCodeAt, string indexing, and the string slice/subset functions. Escape surrogate code points in JSON stringify.
--- a/jsi.h
+++ b/jsi.h
@@ -144,8 +144,8 @@
js_Regexp *js_toregexp(js_State *J, int idx);
int js_isarrayindex(js_State *J, const char *str, int *idx);
int js_runeat(js_State *J, const char *s, int i);
+int js_utflen(const char *s);
int js_utfptrtoidx(const char *s, const char *p);
-const char *js_utfidxtoptr(const char *s, int i);
void js_dup(js_State *J);
void js_dup2(js_State *J);
--- a/json.c
+++ b/json.c
@@ -185,7 +185,7 @@
static void fmtstr(js_State *J, js_Buffer **sb, const char *s)
{
- static const char *HEX = "0123456789ABCDEF";
+ static const char *HEX = "0123456789abcdef";
int i, n;
Rune c;
js_putc(J, sb, '"');
@@ -200,7 +200,7 @@
case '\r': js_puts(J, sb, "\\r"); break;
case '\t': js_puts(J, sb, "\\t"); break;
default:
- if (c < ' ') {
+ if (c < ' ' || (c >= 0xd800 && c <= 0xdfff)) {
js_putc(J, sb, '\\');
js_putc(J, sb, 'u');
js_putc(J, sb, HEX[(c>>12)&15]);
--- a/jsstring.c
+++ b/jsstring.c
@@ -20,31 +20,54 @@
int js_runeat(js_State *J, const char *s, int i)
{
Rune rune = EOF;
- while (i-- >= 0) {
+ while (i >= 0) {
rune = *(unsigned char*)s;
if (rune < Runeself) {
if (rune == 0)
return EOF;
++s;
- } else
+ --i;
+ } else {
s += chartorune(&rune, s);
+ if (rune >= 0x10000)
+ i -= 2;
+ else
+ --i;
+ }
}
+ if (rune >= 0x10000) {
+ /* high surrogate */
+ if (i == -2)
+ return 0xd800 + ((rune - 0x10000) >> 10);
+ /* low surrogate */
+ else
+ return 0xdc00 + ((rune - 0x10000) & 0x3ff);
+ }
return rune;
}
-const char *js_utfidxtoptr(const char *s, int i)
+int js_utflen(const char *s)
{
+ int c;
+ int n;
Rune rune;
- while (i-- > 0) {
- rune = *(unsigned char*)s;
- if (rune < Runeself) {
- if (rune == 0)
- return NULL;
- ++s;
- } else
+
+ n = 0;
+ for(;;) {
+ c = *(unsigned char *)s;
+ if (c < Runeself) {
+ if (c == 0)
+ return n;
+ s++;
+ n++;
+ } else {
s += chartorune(&rune, s);
+ if (rune >= 0x10000)
+ n += 2;
+ else
+ n++;
+ }
}
- return s;
}
int js_utfptrtoidx(const char *s, const char *p)
@@ -56,7 +79,10 @@
- ++s;
+ rune = *(unsigned char *)s++; /* single byte: refresh rune so the width test below never sees a stale or unset value */
else
s += chartorune(&rune, s);
- ++i;
+ if (rune >= 0x10000)
+ i += 2;
+ else
+ i += 1;
}
return i;
}
@@ -190,11 +216,83 @@
js_pushnumber(J, strcmp(a, b));
}
+/* Push the n UTF-16 code units of s that start at code unit index a.
+ * A boundary that falls inside a character >= 0x10000 (two code units)
+ * contributes a lone surrogate on that side of the result.
+ * NOTE(review): relies on the callers clamping a and a + n to the length. */
+static void Sp_substring_imp(js_State *J, const char *s, int a, int n)
+{
+	Rune head_rune = 0, tail_rune = 0;
+	const char *head, *tail;
+	char *p;
+	int i, k, len = 0;
+
+	/* empty result; also guarantees tail_rune was decoded whenever k > n below */
+	if (n <= 0) {
+		js_pushlstring(J, s, 0);
+		return;
+	}
+
+	/* find start of substring */
+	head = s;
+	for (i = 0; i < a; ++i) {
+		head += chartorune(&head_rune, head);
+		if (head_rune >= 0x10000)
+			++i;
+	}
+
+	/* find end of substring */
+	tail = head;
+	for (k = i - a; k < n; ++k) {
+		tail += chartorune(&tail_rune, tail);
+		if (tail_rune >= 0x10000)
+			++k;
+	}
+
+	/* no surrogate pair splits! */
+	if (i == a && k == n) {
+		js_pushlstring(J, head, tail - head);
+		return;
+	}
+
+	/* allocate before js_try so the handler never frees an unset pointer;
+	 * sized for one lone surrogate at each end */
+	p = js_malloc(J, 2 * UTFmax + (tail - head));
+
+	if (js_try(J)) {
+		js_free(J, p);
+		js_throw(J);
+	}
+
+	/* substring starts with low surrogate (head is just after character) */
+	if (i > a) {
+		head_rune = 0xdc00 + ((head_rune - 0x10000) & 0x3ff);
+		len = runetochar(p, &head_rune);
+	}
+
+	/* substring ends with high surrogate (tail is just after character) */
+	if (k > n)
+		tail -= runelen(tail_rune);
+
+	/* whole characters between the two split points */
+	memcpy(p + len, head, tail - head);
+	len += tail - head;
+
+	if (k > n) {
+		tail_rune = 0xd800 + ((tail_rune - 0x10000) >> 10);
+		len += runetochar(p + len, &tail_rune);
+	}
+
+	js_pushlstring(J, p, len);
+
+	js_endtry(J);
+	js_free(J, p);
+}
+
static void Sp_slice(js_State *J)
{
const char *str = checkstring(J, 0);
- const char *ss, *ee;
- int len = utflen(str);
+ int len = js_utflen(str);
int s = js_tointeger(J, 1);
int e = js_isdefined(J, 2) ? js_tointeger(J, 2) : len;
@@ -204,22 +286,16 @@
s = s < 0 ? 0 : s > len ? len : s;
e = e < 0 ? 0 : e > len ? len : e;
- if (s < e) {
- ss = js_utfidxtoptr(str, s);
- ee = js_utfidxtoptr(ss, e - s);
- } else {
- ss = js_utfidxtoptr(str, e);
- ee = js_utfidxtoptr(ss, s - e);
- }
-
- js_pushlstring(J, ss, ee - ss);
+ if (s < e)
+ Sp_substring_imp(J, str, s, e - s);
+ else
+ Sp_substring_imp(J, str, e, s - e);
}
static void Sp_substring(js_State *J)
{
const char *str = checkstring(J, 0);
- const char *ss, *ee;
- int len = utflen(str);
+ int len = js_utflen(str);
int s = js_tointeger(J, 1);
int e = js_isdefined(J, 2) ? js_tointeger(J, 2) : len;
@@ -226,15 +302,10 @@
s = s < 0 ? 0 : s > len ? len : s;
e = e < 0 ? 0 : e > len ? len : e;
- if (s < e) {
- ss = js_utfidxtoptr(str, s);
- ee = js_utfidxtoptr(ss, e - s);
- } else {
- ss = js_utfidxtoptr(str, e);
- ee = js_utfidxtoptr(ss, s - e);
- }
-
- js_pushlstring(J, ss, ee - ss);
+ if (s < e)
+ Sp_substring_imp(J, str, s, e - s);
+ else
+ Sp_substring_imp(J, str, e, s - e);
}
static void Sp_toLowerCase(js_State *J)
--- a/jsvalue.c
+++ b/jsvalue.c
@@ -388,7 +388,7 @@
} else {
obj->u.s.string = js_strdup(J, v);
}
- obj->u.s.length = utflen(v);
+ obj->u.s.length = js_utflen(v);
return obj;
}
--- a/utf.c
+++ b/utf.c
@@ -194,26 +194,6 @@
return runetochar(str, &rune);
}
-int
-utflen(const char *s)
-{
- int c;
- int n;
- Rune rune;
-
- n = 0;
- for(;;) {
- c = *(uchar*)s;
- if(c < Runeself) {
- if(c == 0)
- return n;
- s++;
- } else
- s += chartorune(&rune, s);
- n++;
- }
-}
-
static const Rune *
ucd_bsearch(Rune c, const Rune *t, int n, int ne)
{
--- a/utf.h
+++ b/utf.h
@@ -19,7 +19,6 @@
#define chartorune jsU_chartorune
#define runetochar jsU_runetochar
#define runelen jsU_runelen
-#define utflen jsU_utflen
#define isalpharune jsU_isalpharune
#define islowerrune jsU_islowerrune
@@ -39,7 +38,6 @@
int chartorune(Rune *rune, const char *str);
int runetochar(char *str, const Rune *rune);
int runelen(int c);
-int utflen(const char *s);
int isalpharune(Rune c);
int islowerrune(Rune c);