shithub: mc

Download patch

ref: 346140113cb979d60f27fb3795f08e5976c0c712
parent: 3f5de2514eb4a90a8f7925bbeaf6dfdfb07debf8
author: Ori Bernstein <ori@eigenstate.org>
date: Wed Dec 26 16:43:45 EST 2018

Implement `std.bygrapheme`.

--- a/lib/std/striter.myr
+++ b/lib/std/striter.myr
@@ -4,6 +4,8 @@
 use "strfind"
 use "option"
 use "chartype"
+use "slpush"
+use "alloc"
 
 pkg std =
 	type chariter = struct
@@ -10,6 +12,10 @@
 		rest	: byte[:]
 	;;
 
+	type graphemeiter = struct
+		rest	: byte[:]
+	;;
+
 	type charoffiter = struct
 		str	: byte[:]
 		idx	: size
@@ -25,17 +31,24 @@
 		idx	: size
 	;;
 
-	impl iterable chariter	-> char
+	impl iterable chariter		-> char
+	impl iterable graphemeiter	-> char[:]
 	impl iterable charoffiter	-> (char, size)
-	impl iterable splititer -> byte[:]
-	impl iterable tokiter -> byte[:]
+	impl iterable splititer		-> byte[:]
+	impl iterable tokiter		-> byte[:]
 
-	const bychar	: (str : byte[:] -> chariter)
-	const bycharoff	: (str : byte[:] -> charoffiter)
-	const bysplit	: (str : byte[:], split : byte[:] -> splititer)
-	const bytok	: (str : byte[:] -> tokiter)
+	const bychar		: (str : byte[:] -> chariter)
+	const bygrapheme	: (str : byte[:] -> graphemeiter)
+	const bycharoff		: (str : byte[:] -> charoffiter)
+	const bysplit		: (str : byte[:], split : byte[:] -> splititer)
+	const bytok		: (str : byte[:] -> tokiter)
 ;;
 
+/*
+ * Iterate through a string char by char,
+ * decoding the utf8 bytes into a single
+ * codepoint.
+ */
 impl iterable chariter -> char =
 	__iternext__ = {ci, c
 		if ci.rest.len == 0
@@ -53,7 +66,50 @@
 	-> [.rest = str]
 }
 
+/* 
+ * Iterate through a string grapheme by grapheme,
+ * returning a slice of characters composing the
+ * grapheme.
+ */
+impl iterable graphemeiter -> char[:] =
+	__iternext__ = {ci, g : char[:]#
+		var gb, gc : char[:]
+		if ci.rest.len == 0
+			-> false
+		;;
+		(gb, ci.rest) = graphemestep(ci.rest)	/* second result becomes the new remainder */

+		/*
+		 * graphemestep returns bytes, but we want a slice of
+		 * chars, so decode into a new slice; __iterfin__ frees it.
+		 */
+		gc = [][:]
+		for c : std.bychar(gb)
+			std.slpush(&gc, c)
+		;;
+		g# = gc
+		-> true
+	}
+
+	__iterfin__ = {ci, g
+		std.slfree(g#)	/* frees the slice stored by __iternext__ */
+	}
+;;
+
+const bygrapheme = {str
+	-> [.rest = str]	/* iteration begins with all of str unconsumed */
+}
+
+
+/*
+ * Iterates through a string character by
+ * character, similar to chariter, but returns
+ * the offset into the string of the codepoint.
+ * For example,
+ *     "ὐbὐc"
+ * would return the sequence:
+ * 	(ὐ, 0), (b, 3), (ὐ, 4), (c, 7)
+ */
 impl iterable charoffiter -> (char, size) = 
 	__iternext__ = {ci, cv
 		var c
@@ -75,6 +131,10 @@
 	-> [.str=s, .idx=0]
 }
 
+/*
+ * Iterates through the splits of a string by a
+ * delimiter, skipping the delimiter.
+ */
 impl iterable splititer -> byte[:] =
 	__iternext__ = {si, sp
 		match std.strfind(si.rest, si.split)
@@ -100,6 +160,10 @@
 	-> [.rest = str, .split = split]
 }
 
+/*
+ * Tokenizes a string by spaces, iterating over
+ * the results.
+ */
 impl iterable tokiter -> byte[:] =
 	__iternext__ = {it, sp
 		var s, lo, hi, c
--- a/lib/std/test/striter.myr
+++ b/lib/std/test/striter.myr
@@ -3,6 +3,18 @@
 const main = {
 	var chars = ['a', 'b', 'c']
 	var splits = ["foo", "+bar"]
+	var graphemes = [
+		[0x300][:],
+		[0x61][:],
+		[0x53f2][:],
+		[0x63][:],
+		[0x9][:],
+		[0x42f][:],
+		[0x78, 0x300, 0x300, 0x300, 0x300, 0x300][:],
+		[0xa][:],
+		[0x7a, 0x309][:]
+	]
+
 	var i
 
 	i = 0
@@ -16,4 +28,10 @@
 		std.assert(std.eq(splits[i++], sp), "wrong split {}", sp)
 	;;
 	std.assert(i == splits.len, "wrong split count")
+
+	i = 0
+	for g : std.bygrapheme("̀a史c\tЯx̀̀̀̀̀\nz̉")
+		std.assert(std.eq(g, graphemes[i++]), "mismatched grapheme cluster\n")
+	;;
+	std.assert(i == graphemes.len, "wrong grapheme set length")
 }