ref: 4d21588eb27c19a2c69145ab343fc1217cd2f21d
parent: e596dbfc80112c5da0fa8906fc262c63bb6c63c9
author: Rangi <remy.oukaour+rangi42@gmail.com>
date: Tue Apr 20 08:24:01 EDT 2021
Make invalid UTF-8 characters in strings non-fatal. STRLEN and STRSUB now report the erroneous bytes. Fixes #848.
--- a/src/asm/parser.y
+++ b/src/asm/parser.y
@@ -82,6 +82,11 @@
return NULL;
}
+static void errorInvalidUTF8Byte(uint8_t byte, char const *functionName)
+{
+ error("%s: Invalid UTF-8 byte 0x%02hhX\n", functionName, byte);
+}
+
static size_t strlenUTF8(char const *s)
{
size_t len = 0;
@@ -88,10 +93,13 @@
uint32_t state = 0;
for (uint32_t codep = 0; *s; s++) {
- switch (decode(&state, &codep, *s)) {
+ uint8_t byte = *s;
+
+ switch (decode(&state, &codep, byte)) {
case 1:
- fatalerror("STRLEN: Invalid UTF-8 character\n");
- break;
+ errorInvalidUTF8Byte(byte, "STRLEN");
+ state = 0;
+ /* fallthrough */
case 0:
len++;
break;
@@ -100,7 +108,7 @@
/* Check for partial code point. */
if (state != 0)
- fatalerror("STRLEN: Invalid UTF-8 character\n");
+ error("STRLEN: Incomplete UTF-8 character\n");
return len;
}
@@ -116,14 +124,16 @@
/* Advance to starting position in source string. */
while (src[srcIndex] && curPos < pos) {
- switch (decode(&state, &codep, src[srcIndex++])) {
+ switch (decode(&state, &codep, src[srcIndex])) {
case 1:
- fatalerror("STRSUB: Invalid UTF-8 character\n");
- break;
+ errorInvalidUTF8Byte(src[srcIndex], "STRSUB");
+ state = 0;
+ /* fallthrough */
case 0:
curPos++;
break;
}
+ srcIndex++;
}
/*
@@ -138,8 +148,9 @@
while (src[srcIndex] && destIndex < destLen - 1 && curLen < len) {
switch (decode(&state, &codep, src[srcIndex])) {
case 1:
- fatalerror("STRSUB: Invalid UTF-8 character\n");
- break;
+ errorInvalidUTF8Byte(src[srcIndex], "STRSUB");
+ state = 0;
+ /* fallthrough */
case 0:
curLen++;
break;
@@ -152,7 +163,7 @@
/* Check for partial code point. */
if (state != 0)
- fatalerror("STRSUB: Invalid UTF-8 character\n");
+ error("STRSUB: Incomplete UTF-8 character\n");
dest[destIndex] = '\0';
}
--- /dev/null
+++ b/test/asm/invalid-utf-8-strings.asm
@@ -1,0 +1,23 @@
+; characters:
+; 1: U+0061 a
+; 2: U+00E4 a with diaeresis (0xC3 0xA4)
+; 3: U+0062 b
+; 4: U+6F22 kanji (0xE6 0xBC 0xA2)
+; 5: U+002C ,
+; 6: U+0061 a
+; 7: invalid byte 0xA3
+; 8: invalid byte 0xA4
+; 9: U+0062 b
+; 10: invalid bytes 0xE6 0xF0
+; 11: invalid byte 0xA2
+; 12: U+0021 !
+invalid EQUS "aäb漢,a��b���!"
+
+n = STRLEN("{invalid}")
+copy EQUS STRSUB("{invalid}", 1)
+
+println "\"{invalid}\" == \"{copy}\" ({d:n})"
+
+mid1 EQUS STRSUB("{invalid}", 5, 2)
+mid2 EQUS STRSUB("{invalid}", 9, 1)
+println "\"{mid2}{mid1}\""
--- /dev/null
+++ b/test/asm/invalid-utf-8-strings.err
@@ -1,0 +1,45 @@
+ERROR: invalid-utf-8-strings.asm(16):
+ STRLEN: Invalid UTF-8 byte 0xA3
+ERROR: invalid-utf-8-strings.asm(16):
+ STRLEN: Invalid UTF-8 byte 0xA4
+ERROR: invalid-utf-8-strings.asm(16):
+ STRLEN: Invalid UTF-8 byte 0xF0
+ERROR: invalid-utf-8-strings.asm(16):
+ STRLEN: Invalid UTF-8 byte 0xA2
+ERROR: invalid-utf-8-strings.asm(17):
+ STRLEN: Invalid UTF-8 byte 0xA3
+ERROR: invalid-utf-8-strings.asm(17):
+ STRLEN: Invalid UTF-8 byte 0xA4
+ERROR: invalid-utf-8-strings.asm(17):
+ STRLEN: Invalid UTF-8 byte 0xF0
+ERROR: invalid-utf-8-strings.asm(17):
+ STRLEN: Invalid UTF-8 byte 0xA2
+ERROR: invalid-utf-8-strings.asm(17):
+ STRSUB: Invalid UTF-8 byte 0xA3
+ERROR: invalid-utf-8-strings.asm(17):
+ STRSUB: Invalid UTF-8 byte 0xA4
+ERROR: invalid-utf-8-strings.asm(17):
+ STRSUB: Invalid UTF-8 byte 0xF0
+ERROR: invalid-utf-8-strings.asm(17):
+ STRSUB: Invalid UTF-8 byte 0xA2
+ERROR: invalid-utf-8-strings.asm(21):
+ STRLEN: Invalid UTF-8 byte 0xA3
+ERROR: invalid-utf-8-strings.asm(21):
+ STRLEN: Invalid UTF-8 byte 0xA4
+ERROR: invalid-utf-8-strings.asm(21):
+ STRLEN: Invalid UTF-8 byte 0xF0
+ERROR: invalid-utf-8-strings.asm(21):
+ STRLEN: Invalid UTF-8 byte 0xA2
+ERROR: invalid-utf-8-strings.asm(22):
+ STRLEN: Invalid UTF-8 byte 0xA3
+ERROR: invalid-utf-8-strings.asm(22):
+ STRLEN: Invalid UTF-8 byte 0xA4
+ERROR: invalid-utf-8-strings.asm(22):
+ STRLEN: Invalid UTF-8 byte 0xF0
+ERROR: invalid-utf-8-strings.asm(22):
+ STRLEN: Invalid UTF-8 byte 0xA2
+ERROR: invalid-utf-8-strings.asm(22):
+ STRSUB: Invalid UTF-8 byte 0xA3
+ERROR: invalid-utf-8-strings.asm(22):
+ STRSUB: Invalid UTF-8 byte 0xA4
+error: Assembly aborted (22 errors)!
--- /dev/null
+++ b/test/asm/invalid-utf-8-strings.out
@@ -1,0 +1,2 @@
+"aäb漢,a��b���!" == "aäb漢,a��b���!" (12)
+"b,a"
--- a/test/asm/invalid-utf-8.asm
+++ b/test/asm/invalid-utf-8.asm
@@ -1,5 +1,5 @@
; This test tries to pass invalid UTF-8 through a macro argument
-; to exercise the lexer's reportGarbageChar
+; to exercise the lexer's unknown character reporting
m:MACRO
\1
ENDM