std::text::unicode — Unicode character classification and UTF-8 string helpers.
Provides codepoint-level predicates mirroring Go's unicode package, plus
UTF-8-aware string helpers for counting and iterating Unicode codepoints.
All classification functions operate on codepoint values (i64), not raw
bytes. Codepoints are produced by unicode.codepoint_at(s, i) (the
codepoint at codepoint index i, not byte offset i) or by iterating
unicode.runes(s).
The underlying char_count_utf8 and codepoint_at_utf8 methods are
exposed on string by std::string (bound to hew_string_char_count and
hew_string_char_at_utf8 in hew-runtime).
import std::text::unicode;
fn main() {
// Char classification
let cp = unicode.codepoint_at("café", 3); // 'é' = 233
println(unicode.is_lower(cp)); // true
println(unicode.is_upper(cp)); // false
println(unicode.to_upper(cp)); // 201 ('É')
// UTF-8 helpers
println(unicode.rune_count("café")); // 4
let cps = unicode.runes("日本");
println(cps.len()); // 2
}
is_valid_rune — Return true when cp is a valid Unicode scalar value.
unicode.is_valid_rune(65) // true ('A')
unicode.is_valid_rune(0x10FFFF) // true (highest scalar)
unicode.is_valid_rune(0xD800) // false (surrogate)
unicode.is_valid_rune(-1) // false
is_upper — Test whether the Unicode codepoint cp is an uppercase letter.
Follows Unicode general category Lu (Uppercase Letter).
unicode.is_upper(65) // true ('A')
unicode.is_upper(97) // false ('a')
unicode.is_upper(201) // true ('É', U+00C9)
is_lower — Test whether the Unicode codepoint cp is a lowercase letter.
Follows Unicode general category Ll (Lowercase Letter).
unicode.is_lower(97) // true ('a')
unicode.is_lower(65) // false ('A')
unicode.is_lower(233) // true ('é', U+00E9)
is_digit — Test whether the Unicode codepoint cp is a decimal digit (0-9).
Matches ASCII digits only (U+0030-U+0039). Arabic-Indic and other Unicode digit forms are not included.
unicode.is_digit(48) // true ('0')
unicode.is_digit(57) // true ('9')
unicode.is_digit(65) // false ('A')
is_space — Test whether the Unicode codepoint cp is a whitespace character.
Matches Unicode White_Space property: ASCII spaces/tabs/newlines plus Unicode-specific whitespace codepoints (U+00A0 NO-BREAK SPACE, etc.).
unicode.is_space(32) // true (space)
unicode.is_space(9) // true (\t)
unicode.is_space(65) // false ('A')
is_letter — Test whether the Unicode codepoint cp is an alphabetic letter.
Matches Unicode Alphabetic property, which includes Latin, CJK, Arabic, Devanagari, and all other script letters.
unicode.is_letter(65) // true ('A')
unicode.is_letter(0x65E5) // true ('日')
unicode.is_letter(48) // false ('0')
is_alnum — Test whether the Unicode codepoint cp is a letter or decimal digit.
Equivalent to is_letter(cp) || is_digit(cp). Matches Unicode
Alphabetic or Decimal_Number properties.
unicode.is_alnum(65) // true ('A')
unicode.is_alnum(48) // true ('0')
unicode.is_alnum(32) // false (space)
is_punct — Test whether the Unicode codepoint cp is a punctuation character.
unicode.is_punct(33) // true ('!')
unicode.is_punct(0x3002) // true (IDEOGRAPHIC FULL STOP)
unicode.is_punct(65) // false ('A')
to_upper — Convert the Unicode codepoint cp to its uppercase equivalent.
Returns the first codepoint of the Unicode to-uppercase mapping. For
codepoints with no uppercase form (digits, punctuation, already-uppercase
letters), returns cp unchanged. Invalid codepoints are returned
unchanged.
unicode.to_upper(97) // 65 ('a' -> 'A')
unicode.to_upper(65) // 65 ('A' unchanged)
unicode.to_upper(233) // 201 ('é' -> 'É')
to_lower — Convert the Unicode codepoint cp to its lowercase equivalent.
Returns the first codepoint of the Unicode to-lowercase mapping. For
codepoints with no lowercase form, returns cp unchanged. Invalid
codepoints are returned unchanged.
unicode.to_lower(65) // 97 ('A' -> 'a')
unicode.to_lower(97) // 97 ('a' unchanged)
unicode.to_lower(201) // 233 ('É' -> 'é')
to_title — Convert the Unicode codepoint cp to its titlecase equivalent.
For scalar mappings currently exposed here, titlecase is the same single-codepoint mapping as uppercase.
unicode.to_title(97) // 65 ('a' -> 'A')
unicode.to_title(233) // 201 ('é' -> 'É')
unicode.to_title(49) // 49 ('1' unchanged)
codepoint_at — Return the Unicode codepoint at codepoint-index i in string s.
The index is a rune (codepoint) offset, not a byte offset. Returns -1 if
i is out of bounds. O(n) in the number of codepoints up to i.
unicode.codepoint_at("café", 3) // 233 ('é', U+00E9)
unicode.codepoint_at("ABC", 1) // 66 ('B')
unicode.codepoint_at("hi", 5) // -1 (out of bounds)
try_codepoint_at — Return the Unicode codepoint at codepoint-index i in string s, or an error if i is out of bounds.
unicode.try_codepoint_at("café", 3) // Ok(233)
unicode.try_codepoint_at("hi", 2) // Err(...)
rune_count — Return the number of Unicode codepoints (runes) in string s.
For ASCII strings this equals s.len(). For strings containing multi-byte
UTF-8 sequences (accented letters, CJK, emoji), the rune count is strictly
less than the byte length.
unicode.rune_count("hello") // 5
unicode.rune_count("café") // 4
unicode.rune_count("日本語") // 3
unicode.rune_count("") // 0
rune_len — Return the number of UTF-8 bytes needed to encode cp, or -1 if cp is invalid.
unicode.rune_len(65) // 1
unicode.rune_len(233) // 2
unicode.rune_len(0x65E5) // 3
unicode.rune_len(-1) // -1
try_rune_len — Return the number of UTF-8 bytes needed to encode cp, or an error if cp is invalid.
unicode.try_rune_len(65) // Ok(1)
unicode.try_rune_len(0x1F642) // Ok(4)
unicode.try_rune_len(-1) // Err(...)
runes — Return all Unicode codepoints in s as a Vec<i64>.
Each element is a Unicode scalar value (codepoint). The length of the
returned vec equals rune_count(s). Order matches the original string.
let cps = unicode.runes("AB");
// cps == [65, 66]
let cps2 = unicode.runes("日");
// cps2.get(0) == 0x65E5
let empty = unicode.runes("");
// empty.len() == 0