shithub: mc

Info • Files • Log • Branches
ref: cec2ed10f355e59322f3d148837dca47aab083a6
dir: /lib/std/striter.myr/
use "die"
use "types"
use "utf"
use "strfind"
use "option"
use "chartype"
use "slpush"
use "alloc"

pkg std =
	/* Iteration state for walking a string one decoded char at a time. */
	type chariter = struct
		rest	: byte[:]	/* bytes that have not been consumed yet */
	;;

	/* Iteration state for walking a string one grapheme at a time. */
	type graphemeiter = struct
		rest	: byte[:]	/* bytes that have not been consumed yet */
	;;

	/* Iteration state for walking a string by char while tracking a byte index. */
	type charoffiter = struct
		str	: byte[:]	/* the string being walked */
		idx	: size		/* byte index of the next char to decode */
	;;

	/* Iteration state for walking the pieces of a string between delimiters. */
	type splititer = struct
		rest	: byte[:]	/* part of the string still to be searched */
		split	: byte[:]	/* the delimiter searched for */
	;;

	/* Iteration state for walking the whitespace-separated tokens of a string. */
	type tokiter = struct
		str	: byte[:]	/* the string being tokenized */
		idx	: size		/* byte index at which the next scan starts */
	;;

	/* The element type produced by each iterator. */
	impl iterable chariter		-> char
	impl iterable graphemeiter	-> char[:]
	impl iterable charoffiter	-> (char, size)
	impl iterable splititer		-> byte[:]
	impl iterable tokiter		-> byte[:]

	/* Constructors: each returns an iterator positioned at the start of `str`. */
	const bychar		: (str : byte[:] -> chariter)
	const bygrapheme	: (str : byte[:] -> graphemeiter)
	const bycharoff		: (str : byte[:] -> charoffiter)
	const bysplit		: (str : byte[:], split : byte[:] -> splititer)
	const bytok		: (str : byte[:] -> tokiter)
;;

/*
 * Iterate through a string char by char,
 * decoding the utf8 bytes into a single
 * codepoint.
 */
impl iterable chariter -> char =
	/*
	 * Takes the next char off the front of the unread bytes and
	 * stores it through `out`. Yields false once no bytes remain.
	 */
	__iternext__ = {it, out
		var ch, tail

		if it.rest.len > 0
			(ch, tail) = charstep(it.rest)
			out# = ch
			it.rest = tail
			-> true
		;;
		-> false
	}

	/* Nothing to clean up after an iteration. */
	__iterfin__ = {it, out
	}
;;

/* Builds a chariter whose unread bytes are the whole of `str`. */
const bychar = {str
	var it : chariter

	it.rest = str
	-> it
}

/* 
 * Iterate through a string grapheme by grapheme,
 * returning a slice of characters composing the
 * grapheme.
 */
impl iterable graphemeiter -> char[:] =
	/*
	 * Takes the next grapheme off the front of the unread bytes and
	 * stores its chars, as a freshly built slice, through `out`.
	 * Yields false once no bytes remain.
	 */
	__iternext__ = {it, out : char[:]#
		var raw

		if it.rest.len == 0
			-> false
		;;
		(raw, it.rest) = graphemestep(it.rest)

		/*
		 * graphemestep hands back bytes, but we want
		 * a slice of chars, so decode them one by one
		 * straight into the output slice.
		 */
		out# = [][:]
		for ch : std.bychar(raw)
			std.slpush(out, ch)
		;;
		-> true
	}

	/* Frees the char slice that __iternext__ built for this iteration. */
	__iterfin__ = {it, out
		std.slfree(out#)
	}
;;

/* Builds a graphemeiter whose unread bytes are the whole of `str`. */
const bygrapheme = {str
	var it : graphemeiter

	it.rest = str
	-> it
}


/*
 * Iterates through a string character by
 * character, similar to chariter, but returns
 * the offset into the string of the codepoint.
 * For example,
 *     "ὐbὐc"
 * would return the sequence:
 * 	(ὐ, 0), (b, 3), (ὐ, 4), (c, 7)
 */
impl iterable charoffiter -> (char, size) =
	/*
	 * Decodes the char at the current byte index and stores
	 * (char, offset) through `cv`, where offset is the byte index
	 * at which that char starts. Yields false at the end of the
	 * string.
	 */
	__iternext__ = {ci, cv
		var c

		/*
		 * `>=` rather than `==`, so an index that has stepped
		 * past the end also ends the iteration.
		 */
		if ci.idx >= ci.str.len
			-> false
		;;
		c = std.decode(ci.str[ci.idx:])
		/*
		 * Record the offset before advancing: the pair must carry
		 * the start of this char, not the start of the next one.
		 */
		cv# = (c, ci.idx)
		ci.idx += std.charlen(c)
		-> true
	}

	/* Nothing to clean up after an iteration. */
	__iterfin__ = {ci, c
	}
;;

/* Builds a charoffiter over `str`, starting at byte index 0. */
const bycharoff = {str
	var it : charoffiter

	it.str = str
	it.idx = 0
	-> it
}

/*
 * Iterates through the splits of a string by a
 * delimiter, skipping the delimiter.
 */
impl iterable splititer -> byte[:] =
	/*
	 * Stores the text before the next delimiter through `out` and
	 * drops that text and the delimiter from the remainder. When no
	 * delimiter is left, a non-empty remainder is yielded as the
	 * last piece; an empty remainder ends the iteration.
	 *
	 * NOTE(review): an empty `split` presumably matches at offset 0
	 * and would never shorten the remainder -- confirm against
	 * std.strfind.
	 */
	__iternext__ = {it, out
		var more

		match std.strfind(it.rest, it.split)
		| `Some pos:
			out# = it.rest[:pos]
			it.rest = it.rest[pos + it.split.len:]
			more = true
		| `None:
			more = it.rest.len > 0
			if more
				out# = it.rest
				it.rest = ""
			;;
		;;
		-> more
	}

	/* Nothing to clean up after an iteration. */
	__iterfin__ = {it, out
	}
;;

/* Builds a splititer that searches all of `str` for the delimiter `split`. */
const bysplit = {str, split
	var it : splititer

	it.rest = str
	it.split = split
	-> it
}

/*
 * Tokenizes a string by spaces, iterating over
 * the results.
 */
impl iterable tokiter -> byte[:] =
	/*
	 * Skips the whitespace (as judged by isspace) at the current
	 * position, then extends the token up to the next whitespace
	 * char or the end of the string. The token is stored through
	 * `tok`; the result is false when the token is empty.
	 */
	__iternext__ = {ti, tok
		var str, start, stop, ch

		str = ti.str

		/* advance over the whitespace in front of the token */
		start = ti.idx
		while start < str.len
			ch = std.decode(str[start:])
			if isspace(ch)
				start += charlen(ch)
			else
				break
			;;
		;;

		/* advance over the token itself */
		stop = start
		while stop < str.len
			ch = std.decode(str[stop:])
			if !isspace(ch)
				stop += charlen(ch)
			else
				break
			;;
		;;

		/* the next call resumes where this token ended */
		tok# = str[start:stop]
		ti.idx = stop
		-> start < stop
	}

	/* Nothing to clean up after an iteration. */
	__iterfin__ = {ti, tok
	}
;;

/* Builds a tokiter over `str`, starting at byte index 0. */
const bytok = {str
	var it : tokiter

	it.str = str
	it.idx = 0
	-> it
}