ref: c0e57a711517201e3b3bcd621b47c367a3fcd39f
parent: 86852694bf8e935145b40a536b859d44910b4ed4
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Sat Apr 19 02:46:06 EDT 2025
str-length: add ability to detect invalid utf-8 data. Passing NIL as the fourth argument will make the function return NIL on strings that aren't valid UTF-8.
--- a/src/str.c
+++ b/src/str.c
@@ -19,12 +19,13 @@
BUILTIN("str-length", str_length)
{
usize start = 0;
- if(nargs < 1 || nargs > 3)
+ if(nargs < 1 || nargs > 4)
argcount(nargs, 1);
if(!isstr(args[0]))
bthrow(type_error(nil, "str", args[0]));
usize len = cv_len(ptr(args[0]));
usize stop = len;
+ bool err = false, *errp = nil;
if(nargs > 1){
start = tosize(args[1]);
if(start > len)
@@ -35,10 +36,16 @@
bthrow(bounds_error(args[0], args[2]));
if(stop <= start)
return fixnum(0);
+ if(nargs > 3){
+ if(args[3] != sl_nil)
+ bthrow(type_error("err", "null", args[3]));
+ errp = &err;
+ }
}
}
char *str = cvalue_data(args[0]);
- return size_wrap(u8_charnum(str+start, stop-start));
+ len = size_wrap(u8_runelen(str+start, stop-start, errp));
+ return err ? sl_nil : len;
}
BUILTIN("str-width", str_width)
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -38,27 +38,33 @@
return trailingBytesForUTF8[(u8int)s[0]] + 1;
}
+/* modified utfnlen from 9front */
usize
-u8_runelen(const char *s, usize nb)
+u8_runelen(const char *s, usize m, bool *err)
{
- usize nr, i;
- for(i = nr = 0; i < nb; nr++)
- i += trailingBytesForUTF8[(u8int)s[i]] + 1;
- return nr;
-}
+ int c;
+ usize n;
+ Rune rune;
+ const char *es;
-/* byte offset => charnum */
-usize
-u8_charnum(const char *s, usize offset)
-{
- usize charnum = 0, i = 0;
-
- while(i < offset){
- if((s[i++] & 0x80) != 0 && !isutf(s[++i]) && !isutf(s[++i]))
- i++;
- charnum++;
+ if(err != nil)
+ *err = false;
+ es = s + m;
+ for(n = 0; s < es; n++) {
+ c = *(const u8int*)s;
+ if(c < Runeself){
+ s++;
+ continue;
+ }
+ if(!fullrune(s, es-s))
+ break;
+ s += chartorune(&rune, s);
+ if(err != nil && rune == Runeerror){
+ *err = true;
+ break;
+ }
}
- return charnum;
+ return n;
}
ssize
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -3,9 +3,6 @@
/* is c the start of a utf8 sequence? */
#define isutf(c) (((c)&0xC0) != 0x80)
-/* byte offset to character number */
-usize u8_charnum(const char *s, usize offset) sl_purefn;
-
/* next character without NUL character terminator */
Rune u8_nextmemchar(const char *s, usize *i);
@@ -13,7 +10,7 @@
usize u8_seqlen(const char *s) sl_purefn;
/* length of a utf-8 string in runes */
-usize u8_runelen(const char *s, usize nb) sl_purefn;
+usize u8_runelen(const char *s, usize m, bool *err) sl_purefn;
char read_escape_control_char(char c) sl_constfn;
--- a/test/unittest.sl
+++ b/test/unittest.sl
@@ -521,8 +521,10 @@
(assert (eq? 21 (length s)))
(assert (eq? 11 (str-length s)))
(assert (eq? 11 (str-length s 0)))
+(assert (eq? 10 (str-length s 2 (sizeof s) nil)))
(assert (eq? 10 (str-length s 2)))
-(assert (eq? 9 (str-length s 3)))
+(assert (eq? 10 (str-length s 3)))
+(assert (not (str-length s 3 (sizeof s) nil)))
(assert (eq? 0 (str-length s 21)))
(assert-fail (str-length s -1))
(assert-fail (str-length s 22))