/src/llama.cpp/src/unicode.h
Line | Count | Source |
1 | | #pragma once |
2 | | |
3 | | #include <cstdint> |
4 | | #include <string> |
5 | | #include <vector> |
6 | | |
// Classification flags for a single Unicode codepoint.
//
// The object is a set of 1-bit fields. The enum below names the bit that each
// field occupies in the packed uint16_t form accepted by the constructor and
// returned by as_uint(). Packing and unpacking go through the named fields
// only, so the packed value does not depend on host byte order or on how the
// compiler allocates bit-fields inside the storage unit.
struct unicode_cpt_flags {
    enum {
        UNDEFINED       = 0x0001,
        NUMBER          = 0x0002,  // regex: \p{N}
        LETTER          = 0x0004,  // regex: \p{L}
        SEPARATOR       = 0x0008,  // regex: \p{Z}
        ACCENT_MARK     = 0x0010,  // regex: \p{M}
        PUNCTUATION     = 0x0020,  // regex: \p{P}
        SYMBOL          = 0x0040,  // regex: \p{S}
        CONTROL         = 0x0080,  // regex: \p{C}
        MASK_CATEGORIES = 0x00FF,  // all of the category bits above
        WHITESPACE      = 0x0100,
        LOWERCASE       = 0x0200,
        UPPERCASE       = 0x0400,
        NFD             = 0x0800,
    };

    // codepoint type
    uint16_t is_undefined   : 1;
    uint16_t is_number      : 1;  // regex: \p{N}
    uint16_t is_letter      : 1;  // regex: \p{L}
    uint16_t is_separator   : 1;  // regex: \p{Z}
    uint16_t is_accent_mark : 1;  // regex: \p{M}
    uint16_t is_punctuation : 1;  // regex: \p{P}
    uint16_t is_symbol      : 1;  // regex: \p{S}
    uint16_t is_control     : 1;  // regex: \p{C}
    // helper flags
    uint16_t is_whitespace  : 1;  // regex: \s
    uint16_t is_lowercase   : 1;
    uint16_t is_uppercase   : 1;
    uint16_t is_nfd         : 1;

    // Decode from a packed uint16 mask built from the enum constants above.
    // Each field is set from its own mask bit; bits of `flags` that are not
    // named in the enum are ignored. The constructor is left non-explicit so
    // its interface (implicit conversion from a raw mask) is unchanged.
    unicode_cpt_flags(const uint16_t flags = 0)
        : is_undefined  ((flags & UNDEFINED)   != 0)
        , is_number     ((flags & NUMBER)      != 0)
        , is_letter     ((flags & LETTER)      != 0)
        , is_separator  ((flags & SEPARATOR)   != 0)
        , is_accent_mark((flags & ACCENT_MARK) != 0)
        , is_punctuation((flags & PUNCTUATION) != 0)
        , is_symbol     ((flags & SYMBOL)      != 0)
        , is_control    ((flags & CONTROL)     != 0)
        , is_whitespace ((flags & WHITESPACE)  != 0)
        , is_lowercase  ((flags & LOWERCASE)   != 0)
        , is_uppercase  ((flags & UPPERCASE)   != 0)
        , is_nfd        ((flags & NFD)         != 0) {
    }

    // Encode back into the packed uint16 mask (inverse of the constructor).
    // Every field is 0 or 1, so multiplying by its mask selects that bit.
    uint16_t as_uint() const {
        return static_cast<uint16_t>(
              (is_undefined   * UNDEFINED)
            | (is_number      * NUMBER)
            | (is_letter      * LETTER)
            | (is_separator   * SEPARATOR)
            | (is_accent_mark * ACCENT_MARK)
            | (is_punctuation * PUNCTUATION)
            | (is_symbol      * SYMBOL)
            | (is_control     * CONTROL)
            | (is_whitespace  * WHITESPACE)
            | (is_lowercase   * LOWERCASE)
            | (is_uppercase   * UPPERCASE)
            | (is_nfd         * NFD));
    }

    // Packed mask restricted to the category bits (MASK_CATEGORIES); the
    // helper flags (whitespace/lowercase/uppercase/nfd) are cleared.
    uint16_t category_flag() const {
        return static_cast<uint16_t>(as_uint() & MASK_CATEGORIES);
    }
};
91 | | |
92 | | size_t unicode_len_utf8(char src); |
93 | | |
94 | | std::string unicode_cpt_to_utf8 (uint32_t cpt); |
95 | | uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset); |
96 | | |
97 | | std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8); |
98 | | |
99 | | std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts); |
100 | | |
101 | | unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt); |
102 | | unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8); |
103 | | |
104 | | std::string unicode_byte_to_utf8(uint8_t byte); |
105 | | uint8_t unicode_utf8_to_byte(const std::string & utf8); |
106 | | |
107 | | uint32_t unicode_tolower(uint32_t cpt); |
108 | | |
109 | | bool unicode_cpt_is_han(uint32_t cpt); |
110 | | |
111 | | std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs); |