Coverage Report

Created: 2025-11-28 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/llama.cpp/src/unicode.h
Line
Count
Source
1
#pragma once
2
3
#include <cstdint>
4
#include <string>
5
#include <vector>
6
7
// Classification flags of a single Unicode codepoint, held as twelve 1-bit fields.
//
// Conversions to and from the packed uint16_t form go through the named bit-fields
// only. The numeric encoding is therefore defined by the enum masks below and does
// not depend on host byte order, on how the compiler lays out bit-fields, or on
// compiler-specific byte-order macros.
struct unicode_cpt_flags {
    // bit masks of the packed uint16_t representation
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // union of the eight category masks above
        WHITESPACE      = 0x0100,
        LOWERCASE       = 0x0200,
        UPPERCASE       = 0x0400,
        NFD             = 0x0800,
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // decode from uint16
    //
    // Each field is set to 1 when its mask bit is present in `flags`, 0 otherwise.
    // Bits of `flags` outside the twelve masks have no matching field and are dropped.
    // Initializers are listed in declaration order.
    unicode_cpt_flags(const uint16_t flags = 0)
        : is_undefined  ((flags & UNDEFINED)   != 0)
        , is_number     ((flags & NUMBER)      != 0)
        , is_letter     ((flags & LETTER)      != 0)
        , is_separator  ((flags & SEPARATOR)   != 0)
        , is_accent_mark((flags & ACCENT_MARK) != 0)
        , is_punctuation((flags & PUNCTUATION) != 0)
        , is_symbol     ((flags & SYMBOL)      != 0)
        , is_control    ((flags & CONTROL)     != 0)
        , is_whitespace ((flags & WHITESPACE)  != 0)
        , is_lowercase  ((flags & LOWERCASE)   != 0)
        , is_uppercase  ((flags & UPPERCASE)   != 0)
        , is_nfd        ((flags & NFD)         != 0)
    {}

    // encode to uint16
    //
    // Returns the OR of the masks whose fields are set; the inverse of the constructor
    // for any value made only of the twelve masks.
    uint16_t as_uint() const {
        unsigned packed = 0;

        if (is_undefined)   { packed |= UNDEFINED;   }
        if (is_number)      { packed |= NUMBER;      }
        if (is_letter)      { packed |= LETTER;      }
        if (is_separator)   { packed |= SEPARATOR;   }
        if (is_accent_mark) { packed |= ACCENT_MARK; }
        if (is_punctuation) { packed |= PUNCTUATION; }
        if (is_symbol)      { packed |= SYMBOL;      }
        if (is_control)     { packed |= CONTROL;     }
        if (is_whitespace)  { packed |= WHITESPACE;  }
        if (is_lowercase)   { packed |= LOWERCASE;   }
        if (is_uppercase)   { packed |= UPPERCASE;   }
        if (is_nfd)         { packed |= NFD;         }

        return static_cast<uint16_t>(packed);
    }

    // Returns only the category bits (those covered by MASK_CATEGORIES) of as_uint().
    uint16_t category_flag() const {
        return static_cast<uint16_t>(this->as_uint() & MASK_CATEGORIES);
    }
};
91
92
// Free-function declarations of the unicode helpers. Their definitions are not part of
// this listing, so the behavioural notes below are inferred from names and signatures
// and are marked for review rather than stated as fact.
//
// NOTE(review): presumably the byte length of the UTF-8 sequence whose first byte is
// `src` — confirm against the definition.
size_t unicode_len_utf8(char src);
93
// Conversions between one codepoint and UTF-8 text. `offset` is a non-const reference,
// so the callee is able to modify it; presumably it is advanced past the decoded
// sequence — TODO confirm.
94
std::string unicode_cpt_to_utf8  (uint32_t cpt);
95
uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
96
// NOTE(review): presumably decodes the whole string into its codepoints — confirm.
97
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
98
// Takes a codepoint sequence and returns a new one; by its name this applies NFD
// normalization — confirm against the definition.
99
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
100
// Return a unicode_cpt_flags value for a codepoint given numerically or as UTF-8 text.
// How a string holding more than one codepoint is treated is not visible here.
101
unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
102
unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
103
// Map between a single byte value and a UTF-8 string. NOTE(review): presumably the two
// are inverses of each other; the mapping itself is defined elsewhere — confirm.
104
std::string unicode_byte_to_utf8(uint8_t byte);
105
uint8_t     unicode_utf8_to_byte(const std::string & utf8);
106
// NOTE(review): presumably the lowercase counterpart of `cpt` — confirm.
107
uint32_t unicode_tolower(uint32_t cpt);
108
// NOTE(review): presumably true when `cpt` is a Han ideograph; the exact ranges are
// defined elsewhere — confirm.
109
bool unicode_cpt_is_han(uint32_t cpt);
110
// Takes the input text and a list of regex expression strings and returns a list of
// strings; presumably the pieces of `text` produced by applying `regex_exprs` — the
// splitting semantics are defined elsewhere, confirm.
111
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);