shithub: sl

Download patch

ref: c0e57a711517201e3b3bcd621b47c367a3fcd39f
parent: 86852694bf8e935145b40a536b859d44910b4ed4
author: Sigrid Solveig Haflínudóttir <sigrid@ftrv.se>
date: Sat Apr 19 02:46:06 EDT 2025

str-length: add ability to detect invalid utf-8 data

Passing NIL as the fourth argument will make the function return
NIL on strings that aren't valid UTF-8.

--- a/src/str.c
+++ b/src/str.c
@@ -19,12 +19,13 @@
 BUILTIN("str-length", str_length)
 {
 	usize start = 0;
-	if(nargs < 1 || nargs > 3)
+	if(nargs < 1 || nargs > 4)
 		argcount(nargs, 1);
 	if(!isstr(args[0]))
 		bthrow(type_error(nil, "str", args[0]));
 	usize len = cv_len(ptr(args[0]));
 	usize stop = len;
+	bool err = false, *errp = nil;
 	if(nargs > 1){
 		start = tosize(args[1]);
 		if(start > len)
@@ -35,10 +36,16 @@
 				bthrow(bounds_error(args[0], args[2]));
 			if(stop <= start)
 				return fixnum(0);
+			if(nargs > 3){
+				if(args[3] != sl_nil)
+					bthrow(type_error("err", "null", args[3]));
+				errp = &err;
+			}
 		}
 	}
 	char *str = cvalue_data(args[0]);
-	return size_wrap(u8_charnum(str+start, stop-start));
+	len = size_wrap(u8_runelen(str+start, stop-start, errp));
+	return err ? sl_nil : len;
 }
 
 BUILTIN("str-width", str_width)
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -38,27 +38,33 @@
 	return trailingBytesForUTF8[(u8int)s[0]] + 1;
 }
 
+/* modified utfnlen from 9front */
 usize
-u8_runelen(const char *s, usize nb)
+u8_runelen(const char *s, usize m, bool *err)
 {
-	usize nr, i;
-	for(i = nr = 0; i < nb; nr++)
-		i += trailingBytesForUTF8[(u8int)s[i]] + 1;
-	return nr;
-}
+	int c;
+	usize n;
+	Rune rune;
+	const char *es;
 
-/* byte offset => charnum */
-usize
-u8_charnum(const char *s, usize offset)
-{
-	usize charnum = 0, i = 0;
-
-	while(i < offset){
-		if((s[i++] & 0x80) != 0 && !isutf(s[++i]) && !isutf(s[++i]))
-			i++;
-		charnum++;
+	if(err != nil)
+		*err = false;
+	es = s + m;
+	for(n = 0; s < es; n++) {
+		c = *(const u8int*)s;
+		if(c < Runeself){
+			s++;
+			continue;
+		}
+		if(!fullrune(s, es-s))
+			break;
+		s += chartorune(&rune, s);
+		if(err != nil && rune == Runeerror){
+			*err = true;
+			break;
+		}
 	}
-	return charnum;
+	return n;
 }
 
 ssize
--- a/src/utf8.h
+++ b/src/utf8.h
@@ -3,9 +3,6 @@
 /* is c the start of a utf8 sequence? */
 #define isutf(c) (((c)&0xC0) != 0x80)
 
-/* byte offset to character number */
-usize u8_charnum(const char *s, usize offset) sl_purefn;
-
 /* next character without NUL character terminator */
 Rune u8_nextmemchar(const char *s, usize *i);
 
@@ -13,7 +10,7 @@
 usize u8_seqlen(const char *s) sl_purefn;
 
 /* length of a utf-8 string in runes */
-usize u8_runelen(const char *s, usize nb) sl_purefn;
+usize u8_runelen(const char *s, usize m, bool *err) sl_purefn;
 
 char read_escape_control_char(char c) sl_constfn;
 
--- a/test/unittest.sl
+++ b/test/unittest.sl
@@ -521,8 +521,10 @@
 (assert (eq? 21 (length s)))
 (assert (eq? 11 (str-length s)))
 (assert (eq? 11 (str-length s 0)))
+(assert (eq? 10 (str-length s 2 (sizeof s) nil)))
 (assert (eq? 10 (str-length s 2)))
-(assert (eq? 9 (str-length s 3)))
+(assert (eq? 10 (str-length s 3)))
+(assert (not (str-length s 3 (sizeof s) nil)))
 (assert (eq? 0 (str-length s 21)))
 (assert-fail (str-length s -1))
 (assert-fail (str-length s 22))