Module std::text::unicode

Unicode character classification and UTF-8 string helpers.

Provides codepoint-level predicates mirroring Go's unicode package, plus UTF-8-aware string helpers for counting and iterating Unicode codepoints.

All classification functions operate on codepoint values (i64), not raw bytes. Codepoints are produced by unicode.codepoint_at(s, i) (the codepoint at position i) or by iterating unicode.runes(s).

The underlying char_count_utf8 and codepoint_at_utf8 methods are exposed on string by std::string (bound to hew_string_char_count and hew_string_char_at_utf8 in hew-runtime).

Examples

import std::text::unicode;

fn main() {
    // Char classification
    let cp = unicode.codepoint_at("café", 3);   // 'é' = 233
    println(unicode.is_lower(cp));              // true
    println(unicode.is_upper(cp));              // false
    println(unicode.to_upper(cp));              // 201 ('É')

    // UTF-8 helpers
    println(unicode.rune_count("café"));        // 4
    let cps = unicode.runes("日本");
    println(cps.len());                         // 2
}

Contents

Functions

Function is_valid_rune

pub fn is_valid_rune(cp: i64) -> bool

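Return true when cp is a valid Unicode scalar value, i.e. it lies in the range 0 to 0x10FFFF inclusive and is not a surrogate codepoint (0xD800 to 0xDFFF).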

Examples

unicode.is_valid_rune(65)        // true  ('A')
unicode.is_valid_rune(0x10FFFF)  // true  (highest scalar)
unicode.is_valid_rune(0xD800)    // false (surrogate)
unicode.is_valid_rune(-1)        // false

Function is_upper

pub fn is_upper(cp: i64) -> bool

Test whether the Unicode codepoint cp is an uppercase letter.

Follows Unicode general category Lu (Uppercase Letter).

Examples

unicode.is_upper(65)   // true  ('A')
unicode.is_upper(97)   // false ('a')
unicode.is_upper(201)  // true  ('É', U+00C9)

Function is_lower

pub fn is_lower(cp: i64) -> bool

Test whether the Unicode codepoint cp is a lowercase letter.

Follows Unicode general category Ll (Lowercase Letter).

Examples

unicode.is_lower(97)   // true  ('a')
unicode.is_lower(65)   // false ('A')
unicode.is_lower(233)  // true  ('é', U+00E9)

Function is_digit

pub fn is_digit(cp: i64) -> bool

Test whether the Unicode codepoint cp is a decimal digit (0-9).

Matches ASCII digits only (U+0030-U+0039). Arabic-Indic and other Unicode digit forms are not included.

Examples

unicode.is_digit(48)  // true  ('0')
unicode.is_digit(57)  // true  ('9')
unicode.is_digit(65)  // false ('A')

Function is_space

pub fn is_space(cp: i64) -> bool

Test whether the Unicode codepoint cp is a whitespace character.

Matches the Unicode White_Space property: ASCII spaces, tabs, and newlines, plus Unicode-specific whitespace codepoints (U+00A0 NO-BREAK SPACE, etc.).

Examples

unicode.is_space(32)  // true  (space)
unicode.is_space(9)   // true  (\t)
unicode.is_space(65)  // false ('A')

Function is_letter

pub fn is_letter(cp: i64) -> bool

Test whether the Unicode codepoint cp is an alphabetic letter.

Matches the Unicode Alphabetic property, which covers letters from Latin, CJK, Arabic, Devanagari, and all other scripts.

Examples

unicode.is_letter(65)      // true  ('A')
unicode.is_letter(0x65E5)  // true  ('日')
unicode.is_letter(48)      // false ('0')

Function is_alnum

pub fn is_alnum(cp: i64) -> bool

Test whether the Unicode codepoint cp is a letter or decimal digit.

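Equivalent to is_letter(cp) || is_digit(cp): matches codepoints with the Unicode Alphabetic property, plus the ASCII decimal digits (U+0030-U+0039) accepted by is_digit. Non-ASCII digit forms are not included unless they are also Alphabetic.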

Examples

unicode.is_alnum(65)   // true  ('A')
unicode.is_alnum(48)   // true  ('0')
unicode.is_alnum(32)   // false (space)

Function is_punct

pub fn is_punct(cp: i64) -> bool

Test whether the Unicode codepoint cp is a punctuation character.

Examples

unicode.is_punct(33)      // true  ('!')
unicode.is_punct(0x3002)  // true  (IDEOGRAPHIC FULL STOP)
unicode.is_punct(65)      // false ('A')

Function to_upper

pub fn to_upper(cp: i64) -> i64

Convert the Unicode codepoint cp to its uppercase equivalent.

Returns the first codepoint of the Unicode to-uppercase mapping. For codepoints with no uppercase form (digits, punctuation, already-uppercase letters), returns cp unchanged. Invalid codepoints are returned unchanged.

Examples

unicode.to_upper(97)   // 65   ('a' -> 'A')
unicode.to_upper(65)   // 65   ('A' unchanged)
unicode.to_upper(233)  // 201  ('é' -> 'É')

Function to_lower

pub fn to_lower(cp: i64) -> i64

Convert the Unicode codepoint cp to its lowercase equivalent.

Returns the first codepoint of the Unicode to-lowercase mapping. For codepoints with no lowercase form, returns cp unchanged. Invalid codepoints are returned unchanged.

Examples

unicode.to_lower(65)   // 97   ('A' -> 'a')
unicode.to_lower(97)   // 97   ('a' unchanged)
unicode.to_lower(201)  // 233  ('É' -> 'é')

Function to_title

pub fn to_title(cp: i64) -> i64

Convert the Unicode codepoint cp to its titlecase equivalent.

For scalar mappings currently exposed here, titlecase is the same single-codepoint mapping as uppercase.

Examples

unicode.to_title(97)   // 65   ('a' -> 'A')
unicode.to_title(233)  // 201  ('é' -> 'É')
unicode.to_title(49)   // 49   ('1' unchanged)

Function codepoint_at

pub fn codepoint_at(s: string, i: i64) -> i64

Return the Unicode codepoint at codepoint-index i in string s.

The index is a rune (codepoint) offset, not a byte offset. Returns -1 if i is out of bounds. O(n) in the number of codepoints up to i.

Examples

unicode.codepoint_at("café", 3)  // 233 ('é', U+00E9)
unicode.codepoint_at("ABC", 1)   // 66  ('B')
unicode.codepoint_at("hi", 5)    // -1  (out of bounds)

Function try_codepoint_at

pub fn try_codepoint_at(s: string, i: i64) -> Result<i64, string>

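Return the Unicode codepoint at codepoint-index i in string s, or an Err if i is out of bounds. This is the Result-returning counterpart of codepoint_at, which signals the same condition with -1.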

Examples

unicode.try_codepoint_at("café", 3)  // Ok(233)
unicode.try_codepoint_at("hi", 2)    // Err(...)

Function rune_count

pub fn rune_count(s: string) -> i64

Return the number of Unicode codepoints (runes) in string s.

For ASCII strings this equals s.len(). For strings containing multi-byte UTF-8 sequences (accented letters, CJK, emoji), the rune count is strictly less than the byte length.

Examples

unicode.rune_count("hello")  // 5
unicode.rune_count("café")   // 4
unicode.rune_count("日本語")  // 3
unicode.rune_count("")       // 0

Function rune_len

pub fn rune_len(cp: i64) -> i64

Return the number of UTF-8 bytes (1 to 4) needed to encode cp, or -1 if cp is not a valid Unicode scalar value.

Examples

unicode.rune_len(65)      // 1
unicode.rune_len(233)     // 2
unicode.rune_len(0x65E5)  // 3
unicode.rune_len(-1)      // -1

Function try_rune_len

pub fn try_rune_len(cp: i64) -> Result<i64, string>

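Return the number of UTF-8 bytes (1 to 4) needed to encode cp, or an Err if cp is not a valid Unicode scalar value. This is the Result-returning counterpart of rune_len, which signals the same condition with -1.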

Examples

unicode.try_rune_len(65)      // Ok(1)
unicode.try_rune_len(0x1F642) // Ok(4)
unicode.try_rune_len(-1)      // Err(...)

Function runes

pub fn runes(s: string) -> Vec<i64>

Return all Unicode codepoints in s as a Vec<i64>.

Each element is a Unicode scalar value (codepoint). The length of the returned vec equals rune_count(s). Order matches the original string.

Examples

let cps = unicode.runes("AB");
// cps == [65, 66]

let cps2 = unicode.runes("日");
// cps2.get(0) == 0x65E5

let empty = unicode.runes("");
// empty.len() == 0