/src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp
Line | Count | Source |
1 | | #include "utf8proc_wrapper.hpp" |
2 | | #include "utf8proc.hpp" |
3 | | #include "duckdb/common/assert.hpp" |
4 | | #include "duckdb/common/exception.hpp" |
5 | | #include "duckdb/common/helper.hpp" |
6 | | |
7 | | |
8 | | namespace duckdb { |
9 | | |
10 | | // This function efficiently checks if a string is valid UTF8. |
11 | | // It was originally written by Sjoerd Mullender. |
12 | | |
13 | | // Here is the table that makes it work: |
14 | | |
15 | | // B = Number of Bytes in UTF8 encoding |
16 | | // C_MIN = First Unicode code point |
17 | | // C_MAX = Last Unicode code point |
18 | | // B1 = First Byte Prefix |
19 | | |
20 | | // B C_MIN C_MAX B1 |
21 | | // 1 U+000000 U+00007F 0xxxxxxx |
22 | | // 2 U+000080 U+0007FF 110xxxxx |
23 | | // 3 U+000800 U+00FFFF 1110xxxx |
24 | | // 4 U+010000 U+10FFFF 11110xxx |
25 | | |
26 | | static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t *invalid_pos, size_t pos, |
27 | 18.0k | UnicodeInvalidReason reason) { |
28 | 18.0k | if (invalid_reason) { |
29 | 428 | *invalid_reason = reason; |
30 | 428 | } |
31 | 18.0k | if (invalid_pos) { |
32 | 428 | *invalid_pos = pos; |
33 | 428 | } |
34 | 18.0k | } |
35 | | |
36 | | template <const int nextra_bytes, const int mask> |
37 | | static inline UnicodeType UTF8ExtraByteLoop(const int first_pos_seq, int utf8char, size_t &i, const char *s, |
38 | | const size_t len, UnicodeInvalidReason *invalid_reason, |
39 | 5.11M | size_t *invalid_pos) { |
40 | 5.11M | if ((len - i) < (nextra_bytes + 1)) { |
41 | | /* incomplete byte sequence */ |
42 | 496 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH); |
43 | 496 | return UnicodeType::INVALID; |
44 | 496 | } |
45 | 15.3M | for (size_t j = 0; j < nextra_bytes; j++) { |
46 | 10.2M | int c = (int)s[++i]; |
47 | | /* now validate the extra bytes */ |
48 | 10.2M | if ((c & 0xC0) != 0x80) { |
49 | | /* extra byte is not in the format 10xxxxxx */ |
50 | 2.62k | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); |
51 | 2.62k | return UnicodeType::INVALID; |
52 | 2.62k | } |
53 | 10.2M | utf8char = (utf8char << 6) | (c & 0x3F); |
54 | 10.2M | } |
55 | 5.11M | if ((utf8char & mask) == 0) { |
56 | | /* invalid UTF-8 codepoint, not shortest possible */ |
57 | 227 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); |
58 | 227 | return UnicodeType::INVALID; |
59 | 227 | } |
60 | 5.11M | if (utf8char > 0x10FFFF) { |
61 | | /* value not representable by Unicode */ |
62 | 12 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); |
63 | 12 | return UnicodeType::INVALID; |
64 | 12 | } |
65 | 5.11M | if ((utf8char & 0x1FFF800) == 0xD800) { |
66 | | /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 |
67 | | */ |
68 | 17 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); |
69 | 17 | return UnicodeType::INVALID; |
70 | 17 | } |
71 | 5.11M | return UnicodeType::UTF8; |
72 | 5.11M | } utf8proc_wrapper.cpp:duckdb::UnicodeType duckdb::UTF8ExtraByteLoop<1, 1920>(int, int, unsigned long&, char const*, unsigned long, duckdb::UnicodeInvalidReason*, unsigned long*) Line | Count | Source | 39 | 9.57k | size_t *invalid_pos) { | 40 | 9.57k | if ((len - i) < (nextra_bytes + 1)) { | 41 | | /* incomplete byte sequence */ | 42 | 130 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH); | 43 | 130 | return UnicodeType::INVALID; | 44 | 130 | } | 45 | 17.0k | for (size_t j = 0; j < nextra_bytes; j++) { | 46 | 9.44k | int c = (int)s[++i]; | 47 | | /* now validate the extra bytes */ | 48 | 9.44k | if ((c & 0xC0) != 0x80) { | 49 | | /* extra byte is not in the format 10xxxxxx */ | 50 | 1.84k | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); | 51 | 1.84k | return UnicodeType::INVALID; | 52 | 1.84k | } | 53 | 7.60k | utf8char = (utf8char << 6) | (c & 0x3F); | 54 | 7.60k | } | 55 | 7.60k | if ((utf8char & mask) == 0) { | 56 | | /* invalid UTF-8 codepoint, not shortest possible */ | 57 | 125 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 58 | 125 | return UnicodeType::INVALID; | 59 | 125 | } | 60 | 7.47k | if (utf8char > 0x10FFFF) { | 61 | | /* value not representable by Unicode */ | 62 | 0 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 63 | 0 | return UnicodeType::INVALID; | 64 | 0 | } | 65 | 7.47k | if ((utf8char & 0x1FFF800) == 0xD800) { | 66 | | /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 | 67 | | */ | 68 | 0 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 69 | 0 | return UnicodeType::INVALID; | 70 | 0 | } | 71 | 7.47k | return UnicodeType::UTF8; | 72 | 7.47k | } |
utf8proc_wrapper.cpp:duckdb::UnicodeType duckdb::UTF8ExtraByteLoop<2, 63488>(int, int, unsigned long&, char const*, unsigned long, duckdb::UnicodeInvalidReason*, unsigned long*) Line | Count | Source | 39 | 5.09M | size_t *invalid_pos) { | 40 | 5.09M | if ((len - i) < (nextra_bytes + 1)) { | 41 | | /* incomplete byte sequence */ | 42 | 127 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH); | 43 | 127 | return UnicodeType::INVALID; | 44 | 127 | } | 45 | 15.2M | for (size_t j = 0; j < nextra_bytes; j++) { | 46 | 10.1M | int c = (int)s[++i]; | 47 | | /* now validate the extra bytes */ | 48 | 10.1M | if ((c & 0xC0) != 0x80) { | 49 | | /* extra byte is not in the format 10xxxxxx */ | 50 | 481 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); | 51 | 481 | return UnicodeType::INVALID; | 52 | 481 | } | 53 | 10.1M | utf8char = (utf8char << 6) | (c & 0x3F); | 54 | 10.1M | } | 55 | 5.08M | if ((utf8char & mask) == 0) { | 56 | | /* invalid UTF-8 codepoint, not shortest possible */ | 57 | 101 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 58 | 101 | return UnicodeType::INVALID; | 59 | 101 | } | 60 | 5.08M | if (utf8char > 0x10FFFF) { | 61 | | /* value not representable by Unicode */ | 62 | 0 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 63 | 0 | return UnicodeType::INVALID; | 64 | 0 | } | 65 | 5.08M | if ((utf8char & 0x1FFF800) == 0xD800) { | 66 | | /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 | 67 | | */ | 68 | 17 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 69 | 17 | return UnicodeType::INVALID; | 70 | 17 | } | 71 | 5.08M | return UnicodeType::UTF8; | 72 | 5.08M | } |
utf8proc_wrapper.cpp:duckdb::UnicodeType duckdb::UTF8ExtraByteLoop<3, 2031616>(int, int, unsigned long&, char const*, unsigned long, duckdb::UnicodeInvalidReason*, unsigned long*) Line | Count | Source | 39 | 15.4k | size_t *invalid_pos) { | 40 | 15.4k | if ((len - i) < (nextra_bytes + 1)) { | 41 | | /* incomplete byte sequence */ | 42 | 239 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH); | 43 | 239 | return UnicodeType::INVALID; | 44 | 239 | } | 45 | 60.0k | for (size_t j = 0; j < nextra_bytes; j++) { | 46 | 45.1k | int c = (int)s[++i]; | 47 | | /* now validate the extra bytes */ | 48 | 45.1k | if ((c & 0xC0) != 0x80) { | 49 | | /* extra byte is not in the format 10xxxxxx */ | 50 | 308 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); | 51 | 308 | return UnicodeType::INVALID; | 52 | 308 | } | 53 | 44.8k | utf8char = (utf8char << 6) | (c & 0x3F); | 54 | 44.8k | } | 55 | 14.9k | if ((utf8char & mask) == 0) { | 56 | | /* invalid UTF-8 codepoint, not shortest possible */ | 57 | 1 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 58 | 1 | return UnicodeType::INVALID; | 59 | 1 | } | 60 | 14.9k | if (utf8char > 0x10FFFF) { | 61 | | /* value not representable by Unicode */ | 62 | 12 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 63 | 12 | return UnicodeType::INVALID; | 64 | 12 | } | 65 | 14.9k | if ((utf8char & 0x1FFF800) == 0xD800) { | 66 | | /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8 | 67 | | */ | 68 | 0 | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE); | 69 | 0 | return UnicodeType::INVALID; | 70 | 0 | } | 71 | 14.9k | return UnicodeType::UTF8; | 72 | 14.9k | } |
|
73 | | |
74 | 62.4M | UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) { |
75 | 62.4M | UnicodeType type = UnicodeType::ASCII; |
76 | | |
77 | 62.4M | static constexpr uint64_t MASK = 0x8080808080808080U; |
78 | 126M | for (size_t i = 0; i < len;) { |
79 | | // Check 8 bytes at a time until we hit non-ASCII |
80 | 118M | for (; i + sizeof(uint64_t) <= len; i += sizeof(uint64_t)) { |
81 | 56.3M | if (Load<uint64_t>(const_data_ptr_cast(s + i)) & MASK) { |
82 | 2.14M | break; // Non-ASCII in the next 8 bytes |
83 | 2.14M | } |
84 | 56.3M | } |
85 | | // Check 1 byte at a time for the next 8 bytes |
86 | 64.4M | const auto end = MinValue(i + sizeof(uint64_t), len); |
87 | 337M | for (; i < end; i++) { |
88 | 273M | int c = (int)s[i]; |
89 | 273M | if ((c & 0x80) == 0) { |
90 | 268M | continue; |
91 | 268M | } |
92 | 5.12M | int first_pos_seq = i; |
93 | | |
94 | 5.12M | if ((c & 0xE0) == 0xC0) { |
95 | | /* 2 byte sequence */ |
96 | 9.57k | int utf8char = c & 0x1F; |
97 | 9.57k | type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos); |
98 | 5.11M | } else if ((c & 0xF0) == 0xE0) { |
99 | | /* 3 byte sequence */ |
100 | 5.09M | int utf8char = c & 0x0F; |
101 | 5.09M | type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos); |
102 | 5.09M | } else if ((c & 0xF8) == 0xF0) { |
103 | | /* 4 byte sequence */ |
104 | 15.4k | int utf8char = c & 0x07; |
105 | 15.4k | type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, invalid_reason, invalid_pos); |
106 | 15.4k | } else { |
107 | | /* invalid UTF-8 start byte */ |
108 | 7.10k | AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH); |
109 | 7.10k | return UnicodeType::INVALID; |
110 | 7.10k | } |
111 | 5.11M | if (type == UnicodeType::INVALID) { |
112 | 3.38k | return type; |
113 | 3.38k | } |
114 | 5.11M | } |
115 | 64.4M | } |
116 | 62.4M | return type; |
117 | 62.4M | } |
118 | | |
119 | 0 | void Utf8Proc::MakeValid(char *s, size_t len, char special_flag) { |
120 | 0 | D_ASSERT(special_flag <= 127); |
121 | 0 | UnicodeType type = UnicodeType::ASCII; |
122 | 0 | for (size_t i = 0; i < len; i++) { |
123 | 0 | int c = (int)s[i]; |
124 | 0 | if ((c & 0x80) == 0) { |
125 | 0 | continue; |
126 | 0 | } |
127 | 0 | int first_pos_seq = i; |
128 | 0 | if ((c & 0xE0) == 0xC0) { |
129 | | /* 2 byte sequence */ |
130 | 0 | int utf8char = c & 0x1F; |
131 | 0 | type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); |
132 | 0 | } else if ((c & 0xF0) == 0xE0) { |
133 | | /* 3 byte sequence */ |
134 | 0 | int utf8char = c & 0x0F; |
135 | 0 | type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); |
136 | 0 | } else if ((c & 0xF8) == 0xF0) { |
137 | | /* 4 byte sequence */ |
138 | 0 | int utf8char = c & 0x07; |
139 | 0 | type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); |
140 | 0 | } else { |
141 | | /* invalid UTF-8 start byte */ |
142 | 0 | s[i] = special_flag; // Rewrite invalid byte |
143 | 0 | } |
144 | 0 | if (type == UnicodeType::INVALID) { |
145 | 0 | for (size_t j = first_pos_seq; j <= i; j++) { |
146 | 0 | s[j] = special_flag; // Rewrite each byte of the invalid sequence |
147 | 0 | } |
148 | 0 | type = UnicodeType::ASCII; |
149 | 0 | } |
150 | 0 | } |
151 | 0 | D_ASSERT(Utf8Proc::IsValid(s, len)); |
152 | 0 | } |
153 | | |
154 | 0 | char *Utf8Proc::Normalize(const char *s, size_t len) { |
155 | 0 | assert(s); |
156 | 0 | assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID); |
157 | 0 | return (char *)utf8proc_NFC((const utf8proc_uint8_t *)s, len); |
158 | 0 | } |
159 | | |
160 | 877k | bool Utf8Proc::IsValid(const char *s, size_t len) { |
161 | 877k | return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID; |
162 | 877k | } |
163 | | |
164 | 0 | std::string Utf8Proc::RemoveInvalid(const char *s, size_t len) { |
165 | 0 | std::string result; |
166 | 0 | result.reserve(len); // Reserve the maximum possible size |
167 | |
|
168 | 0 | for (size_t i = 0; i < len; i++) { |
169 | 0 | int c = (int)s[i]; |
170 | 0 | if ((c & 0x80) == 0) { |
171 | | // ASCII character - always valid |
172 | 0 | result.push_back(s[i]); |
173 | 0 | continue; |
174 | 0 | } |
175 | | |
176 | 0 | int first_pos_seq = i; |
177 | 0 | if ((c & 0xE0) == 0xC0) { |
178 | | /* 2 byte sequence */ |
179 | 0 | int utf8char = c & 0x1F; |
180 | 0 | UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); |
181 | 0 | } else if ((c & 0xF0) == 0xE0) { |
182 | | /* 3 byte sequence */ |
183 | 0 | int utf8char = c & 0x0F; |
184 | 0 | UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); |
185 | 0 | } else if ((c & 0xF8) == 0xF0) { |
186 | | /* 4 byte sequence */ |
187 | 0 | int utf8char = c & 0x07; |
188 | 0 | UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); |
189 | 0 | } else { |
190 | | // invalid, do not write to output |
191 | 0 | continue; |
192 | 0 | } |
193 | | |
194 | | // If we get here, the sequence is valid, so add all bytes of the sequence to result |
195 | 0 | for (size_t j = first_pos_seq; j <= i; j++) { |
196 | 0 | result.push_back(s[j]); |
197 | 0 | } |
198 | 0 | } |
199 | |
|
200 | 0 | D_ASSERT(Utf8Proc::IsValid(result.c_str(), result.size())); |
201 | 0 | return result; |
202 | 0 | } |
203 | | |
204 | 24.9M | size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) { |
205 | 24.9M | int sz; |
206 | 24.9M | auto prev_codepoint = Utf8Proc::UTF8ToCodepoint(s + cpos, sz); |
207 | 24.9M | utf8proc_int32_t state = 0; |
208 | 24.9M | while (true) { |
209 | 24.9M | cpos += sz; |
210 | 24.9M | if (cpos >= len) { |
211 | 1.72M | return cpos; |
212 | 1.72M | } |
213 | 23.2M | auto next_codepoint = Utf8Proc::UTF8ToCodepoint(s + cpos, sz); |
214 | 23.2M | if (utf8proc_grapheme_break_stateful(prev_codepoint, next_codepoint, &state)) { |
215 | | // found a grapheme break here |
216 | 23.2M | return cpos; |
217 | 23.2M | } |
218 | | // not a grapheme break, move on to next codepoint |
219 | 1.84k | prev_codepoint = next_codepoint; |
220 | 1.84k | } |
221 | 24.9M | } |
222 | | |
223 | 0 | size_t Utf8Proc::GraphemeCount(const char *input_data, size_t input_size) { |
224 | 0 | size_t num_characters = 0; |
225 | 0 | for (auto cluster : Utf8Proc::GraphemeClusters(input_data, input_size)) { |
226 | 0 | (void)cluster; |
227 | 0 | num_characters++; |
228 | 0 | } |
229 | 0 | return num_characters; |
230 | 0 | } |
231 | | |
232 | 0 | int32_t Utf8Proc::CodepointToUpper(int32_t codepoint) { |
233 | 0 | return utf8proc_toupper(codepoint); |
234 | 0 | } |
235 | | |
236 | 0 | int32_t Utf8Proc::CodepointToLower(int32_t codepoint) { |
237 | 0 | return utf8proc_tolower(codepoint); |
238 | 0 | } |
239 | | |
240 | 0 | GraphemeIterator::GraphemeIterator(const char *s, size_t len) : s(s), len(len) { |
241 | 0 | } |
242 | | |
243 | 0 | GraphemeIterator Utf8Proc::GraphemeClusters(const char *s, size_t len) { |
244 | 0 | return GraphemeIterator(s, len); |
245 | 0 | } |
246 | | |
247 | 0 | GraphemeIterator::GraphemeClusterIterator::GraphemeClusterIterator(const char *s_p, size_t len_p) : s(s_p), len(len_p) { |
248 | 0 | if (s) { |
249 | 0 | cluster.start = 0; |
250 | 0 | cluster.end = 0; |
251 | 0 | Next(); |
252 | 0 | } else { |
253 | 0 | SetInvalid(); |
254 | 0 | } |
255 | 0 | } |
256 | | |
257 | 0 | void GraphemeIterator::GraphemeClusterIterator::SetInvalid() { |
258 | 0 | s = nullptr; |
259 | 0 | len = 0; |
260 | 0 | cluster.start = 0; |
261 | 0 | cluster.end = 0; |
262 | 0 | } |
263 | | |
264 | 0 | bool GraphemeIterator::GraphemeClusterIterator::IsInvalid() const { |
265 | 0 | return !s; |
266 | 0 | } |
267 | | |
268 | 0 | void GraphemeIterator::GraphemeClusterIterator::Next() { |
269 | 0 | if (IsInvalid()) { |
270 | 0 | throw std::runtime_error("Grapheme cluster out of bounds!"); |
271 | 0 | } |
272 | 0 | if (cluster.end >= len) { |
273 | | // out of bounds |
274 | 0 | SetInvalid(); |
275 | 0 | return; |
276 | 0 | } |
277 | 0 | size_t next_pos = Utf8Proc::NextGraphemeCluster(s, len, cluster.end); |
278 | 0 | cluster.start = cluster.end; |
279 | 0 | cluster.end = next_pos; |
280 | 0 | } |
281 | | |
282 | 0 | GraphemeIterator::GraphemeClusterIterator &GraphemeIterator::GraphemeClusterIterator::operator++() { |
283 | 0 | Next(); |
284 | 0 | return *this; |
285 | 0 | } |
286 | 0 | bool GraphemeIterator::GraphemeClusterIterator::operator!=(const GraphemeClusterIterator &other) const { |
287 | 0 | return !(len == other.len && s == other.s && cluster.start == other.cluster.start && |
288 | 0 | cluster.end == other.cluster.end); |
289 | 0 | } |
290 | | |
291 | 0 | GraphemeCluster GraphemeIterator::GraphemeClusterIterator::operator*() const { |
292 | 0 | if (IsInvalid()) { |
293 | 0 | throw std::runtime_error("Grapheme cluster out of bounds!"); |
294 | 0 | } |
295 | 0 | return cluster; |
296 | 0 | } |
297 | | |
298 | 0 | size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) { |
299 | 0 | if (!Utf8Proc::IsValid(s, len)) { |
300 | 0 | return cpos - 1; |
301 | 0 | } |
302 | 0 | size_t current_pos = 0; |
303 | 0 | while (true) { |
304 | 0 | size_t new_pos = NextGraphemeCluster(s, len, current_pos); |
305 | 0 | if (new_pos <= current_pos || new_pos >= cpos) { |
306 | 0 | return current_pos; |
307 | 0 | } |
308 | 0 | current_pos = new_pos; |
309 | 0 | } |
310 | 0 | } |
311 | | |
312 | 0 | bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) { |
313 | 0 | if (cp <= 0x7F) { |
314 | 0 | sz = 1; |
315 | 0 | c[0] = cp; |
316 | 0 | } else if (cp <= 0x7FF) { |
317 | 0 | sz = 2; |
318 | 0 | c[0] = (cp >> 6) + 192; |
319 | 0 | c[1] = (cp & 63) + 128; |
320 | 0 | } else if (0xd800 <= cp && cp <= 0xdfff) { |
321 | 0 | sz = -1; |
322 | | // invalid block of utf |
323 | 0 | return false; |
324 | 0 | } else if (cp <= 0xFFFF) { |
325 | 0 | sz = 3; |
326 | 0 | c[0] = (cp >> 12) + 224; |
327 | 0 | c[1] = ((cp >> 6) & 63) + 128; |
328 | 0 | c[2] = (cp & 63) + 128; |
329 | 0 | } else if (cp <= 0x10FFFF) { |
330 | 0 | sz = 4; |
331 | 0 | c[0] = (cp >> 18) + 240; |
332 | 0 | c[1] = ((cp >> 12) & 63) + 128; |
333 | 0 | c[2] = ((cp >> 6) & 63) + 128; |
334 | 0 | c[3] = (cp & 63) + 128; |
335 | 0 | } else { |
336 | 0 | sz = -1; |
337 | 0 | return false; |
338 | 0 | } |
339 | 0 | return true; |
340 | 0 | } |
341 | | |
342 | 0 | int Utf8Proc::CodepointLength(int cp) { |
343 | 0 | if (cp <= 0x7F) { |
344 | 0 | return 1; |
345 | 0 | } |
346 | 0 | if (cp <= 0x7FF) { |
347 | 0 | return 2; |
348 | 0 | } |
349 | 0 | if (0xd800 <= cp && cp <= 0xdfff) { |
350 | 0 | throw InternalException("invalid code point detected in Utf8Proc::CodepointLength (0xd800 to 0xdfff), likely due to invalid UTF-8"); |
351 | 0 | } |
352 | 0 | if (cp <= 0xFFFF) { |
353 | 0 | return 3; |
354 | 0 | } |
355 | 0 | if (cp <= 0x10FFFF) { |
356 | 0 | return 4; |
357 | 0 | } |
358 | 0 | throw InternalException("invalid code point detected in Utf8Proc::CodepointLength, likely due to invalid UTF-8"); |
359 | 0 | } |
360 | | |
361 | 73.2M | int32_t Utf8Proc::UTF8ToCodepoint(const char *u_input, int &sz) { |
362 | | // from http://www.zedwood.com/article/cpp-utf8-char-to-codepoint |
363 | 73.2M | auto u = reinterpret_cast<const unsigned char *>(u_input); |
364 | 73.2M | unsigned char u0 = u[0]; |
365 | 73.2M | if (u0 <= 127) { |
366 | 66.5M | sz = 1; |
367 | 66.5M | return u0; |
368 | 66.5M | } |
369 | 6.62M | unsigned char u1 = u[1]; |
370 | 6.62M | if (u0 >= 192 && u0 <= 223) { |
371 | 11.0k | sz = 2; |
372 | 11.0k | return (u0 - 192) * 64 + (u1 - 128); |
373 | 11.0k | } |
374 | 6.61M | if (u[0] == 0xed && (u[1] & 0xa0) == 0xa0) { |
375 | 0 | throw InternalException("invalid code point detected in Utf8Proc::UTF8ToCodepoint (0xd800 to 0xdfff), likely due to invalid UTF-8"); |
376 | 0 | } |
377 | 6.61M | unsigned char u2 = u[2]; |
378 | 6.61M | if (u0 >= 224 && u0 <= 239) { |
379 | 6.61M | sz = 3; |
380 | 6.61M | return (u0 - 224) * 4096 + (u1 - 128) * 64 + (u2 - 128); |
381 | 6.61M | } |
382 | 553 | unsigned char u3 = u[3]; |
383 | 553 | if (u0 >= 240 && u0 <= 247) { |
384 | 553 | sz = 4; |
385 | 553 | return (u0 - 240) * 262144 + (u1 - 128) * 4096 + (u2 - 128) * 64 + (u3 - 128); |
386 | 553 | } |
387 | 0 | throw InternalException("invalid code point detected in Utf8Proc::UTF8ToCodepoint, likely due to invalid UTF-8"); |
388 | 553 | } |
389 | | |
390 | 24.9M | size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) { |
391 | 24.9M | int sz; |
392 | 24.9M | auto codepoint = Utf8Proc::UTF8ToCodepoint(s + pos, sz); |
393 | 24.9M | auto properties = duckdb::utf8proc_get_property(codepoint); |
394 | 24.9M | return properties->charwidth; |
395 | 24.9M | } |
396 | | |
397 | 0 | size_t Utf8Proc::RenderWidth(const std::string &str) { |
398 | 0 | size_t render_width = 0; |
399 | 0 | for (auto cluster : Utf8Proc::GraphemeClusters(str.c_str(), str.size())) { |
400 | | // use the width of the first codepoint in the grapheme cluster |
401 | | // combining marks, ZWJ, variation selectors, etc. have charwidth 0 |
402 | | // and multi-codepoint clusters (e.g. ZWJ emoji sequences) should only |
403 | | // count the base character's width, not the sum of all codepoints |
404 | 0 | render_width += Utf8Proc::RenderWidth(str.c_str(), str.size(), cluster.start); |
405 | 0 | } |
406 | 0 | return render_width; |
407 | 0 | } |
408 | | |
409 | | } // namespace duckdb |