/src/llama.cpp/src/unicode.h

Source
#pragma once

#include <cstdint>
#include <string>
#include <vector>

// Classification flags of a single Unicode codepoint.
//
// The flags are stored as 1-bit fields and can be converted from/to a packed
// uint16_t whose bit positions are given by the enum below. The conversion is
// done field by field, so the result does not depend on the byte order of the
// target or on how the compiler lays out bit-fields, and it does not access
// the object through a pointer of an unrelated type.
struct unicode_cpt_flags {
    // bit masks of the packed uint16_t representation
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // all category bits above
        WHITESPACE      = 0x0100,  // regex: \s
        LOWERCASE       = 0x0200,
        UPPERCASE       = 0x0400,
        NFD             = 0x0800,
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // decode from uint16
    // Each field is set from its mask in the enum above. Bits of `flags` that
    // have no corresponding mask (0xF000) are ignored.
    inline unicode_cpt_flags(const uint16_t flags = 0) {
        is_undefined   = (flags & UNDEFINED)   ? 1 : 0;
        is_number      = (flags & NUMBER)      ? 1 : 0;
        is_letter      = (flags & LETTER)      ? 1 : 0;
        is_separator   = (flags & SEPARATOR)   ? 1 : 0;
        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
        is_symbol      = (flags & SYMBOL)      ? 1 : 0;
        is_control     = (flags & CONTROL)     ? 1 : 0;
        is_whitespace  = (flags & WHITESPACE)  ? 1 : 0;
        is_lowercase   = (flags & LOWERCASE)   ? 1 : 0;
        is_uppercase   = (flags & UPPERCASE)   ? 1 : 0;
        is_nfd         = (flags & NFD)         ? 1 : 0;
    }

    // encode to uint16
    // Inverse of the constructor: every set field contributes its mask from
    // the enum above; all other bits of the result are zero.
    inline uint16_t as_uint() const {
        // each field is 0 or 1, so the products are either 0 or the mask itself
        const unsigned result =
              is_undefined   * UNDEFINED
            + is_number      * NUMBER
            + is_letter      * LETTER
            + is_separator   * SEPARATOR
            + is_accent_mark * ACCENT_MARK
            + is_punctuation * PUNCTUATION
            + is_symbol      * SYMBOL
            + is_control     * CONTROL
            + is_whitespace  * WHITESPACE
            + is_lowercase   * LOWERCASE
            + is_uppercase   * UPPERCASE
            + is_nfd         * NFD
            ;

        return static_cast<uint16_t>(result);
    }

    // packed representation restricted to the category bits (MASK_CATEGORIES)
    inline uint16_t category_flag() const {
        return this->as_uint() & MASK_CATEGORIES;
    }
};

// UTF-8 / codepoint helper functions.
// Only the declarations are visible in this header; the notes below are
// inferred from names and signatures and should be confirmed against the
// definitions.

// NOTE(review): presumably the length in bytes of the UTF-8 sequence that starts with byte `src` - confirm
size_t unicode_len_utf8(char src);

// NOTE(review): presumably encodes the codepoint `cpt` as a UTF-8 string - confirm
std::string unicode_cpt_to_utf8  (uint32_t cpt);
// `offset` is taken by non-const reference, so the callee can modify it
// NOTE(review): presumably decodes one codepoint of `utf8` at `offset` and advances `offset` past it - confirm
uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);

// NOTE(review): presumably decodes the whole string `utf8` into a sequence of codepoints - confirm
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

// NOTE(review): presumably returns the NFD-normalized form of `cpts` - confirm
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);

// flags lookup, by codepoint value or by UTF-8 encoded text
// NOTE(review): how a multi-codepoint `utf8` argument is handled is not visible here - confirm
unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);

// NOTE(review): looks like a mapping between single byte values and UTF-8 strings, and its inverse - confirm
std::string unicode_byte_to_utf8(uint8_t byte);
uint8_t     unicode_utf8_to_byte(const std::string & utf8);

// NOTE(review): presumably the lowercase counterpart of codepoint `cpt` - confirm
uint32_t unicode_tolower(uint32_t cpt);

// NOTE(review): presumably true when `cpt` is a Han (CJK ideograph) codepoint - confirm
bool unicode_cpt_is_han(uint32_t cpt);

// NOTE(review): presumably splits `text` into pieces according to the patterns in `regex_exprs` - confirm
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);

Coverage Report

Created: 2025-11-28 06:56

Line	Count	Source
1		#pragma once
2
3		#include <cstdint>
4		#include <string>
5		#include <vector>
6
7		// TODO: reimplement this structure in endian-independent way
8		struct unicode_cpt_flags {
9		enum {
10		UNDEFINED = 0x0001,
11		NUMBER = 0x0002, // regex: \p{N}
12		LETTER = 0x0004, // regex: \p{L}
13		SEPARATOR = 0x0008, // regex: \p{Z}
14		ACCENT_MARK = 0x0010, // regex: \p{M}
15		PUNCTUATION = 0x0020, // regex: \p{P}
16		SYMBOL = 0x0040, // regex: \p{S}
17		CONTROL = 0x0080, // regex: \p{C}
18		MASK_CATEGORIES = 0x00FF,
19		WHITESPACE = 0x0100,
20		LOWERCASE = 0x0200,
21		UPPERCASE = 0x0400,
22		NFD = 0x0800,
23		};
24
25		// codepoint type
26		uint16_t is_undefined : 1;
27		uint16_t is_number : 1; // regex: \p{N}
28		uint16_t is_letter : 1; // regex: \p{L}
29		uint16_t is_separator : 1; // regex: \p{Z}
30		uint16_t is_accent_mark : 1; // regex: \p{M}
31		uint16_t is_punctuation : 1; // regex: \p{P}
32		uint16_t is_symbol : 1; // regex: \p{S}
33		uint16_t is_control : 1; // regex: \p{C}
34		// helper flags
35		uint16_t is_whitespace : 1; // regex: \s
36		uint16_t is_lowercase : 1;
37		uint16_t is_uppercase : 1;
38		uint16_t is_nfd : 1;
39
40		// decode from uint16
41	0	inline unicode_cpt_flags(const uint16_t flags = 0) {
42	0	#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
43	0	*reinterpret_cast<uint16_t*>(this) = flags;
44		#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
45		is_undefined = (flags & UNDEFINED) ? 1 : 0;
46		is_number = (flags & NUMBER) ? 1 : 0;
47		is_letter = (flags & LETTER) ? 1 : 0;
48		is_separator = (flags & SEPARATOR) ? 1 : 0;
49		is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
50		is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
51		is_symbol = (flags & SYMBOL) ? 1 : 0;
52		is_control = (flags & CONTROL) ? 1 : 0;
53		is_whitespace = (flags & WHITESPACE) ? 1 : 0;
54		is_lowercase = (flags & LOWERCASE) ? 1 : 0;
55		is_uppercase = (flags & UPPERCASE) ? 1 : 0;
56		is_nfd = (flags & NFD) ? 1 : 0;
57		#else
58		#error Unexpected or undefined __BYTE_ORDER__
59		#endif
60	0	}
61
62	0	inline uint16_t as_uint() const {
63	0	#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
64	0	return *reinterpret_cast<const uint16_t*>(this);
65		#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
66		uint16_t result =
67		is_undefined * UNDEFINED
68		+ is_number * NUMBER
69		+ is_letter * LETTER
70		+ is_separator * SEPARATOR
71		+ is_accent_mark * ACCENT_MARK
72		+ is_punctuation * PUNCTUATION
73		+ is_symbol * SYMBOL
74		+ is_control * CONTROL
75		+ is_whitespace * WHITESPACE
76		+ is_lowercase * LOWERCASE
77		+ is_uppercase * UPPERCASE
78		+ is_nfd * NFD
79		;
80
81		return result;
82		#else
83		#error Unexpected or undefined __BYTE_ORDER__
84		#endif
85	0	}
86
87	0	inline uint16_t category_flag() const {
88	0	return this->as_uint() & MASK_CATEGORIES;
89	0	}
90		};
91
92		size_t unicode_len_utf8(char src);
93
94		std::string unicode_cpt_to_utf8 (uint32_t cpt);
95		uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
96
97		std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
98
99		std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
100
101		unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
102		unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
103
104		std::string unicode_byte_to_utf8(uint8_t byte);
105		uint8_t unicode_utf8_to_byte(const std::string & utf8);
106
107		uint32_t unicode_tolower(uint32_t cpt);
108
109		bool unicode_cpt_is_han(uint32_t cpt);
110
111		std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);