Coverage Report

Created: 2026-03-31 07:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/duckdb/third_party/utf8proc/utf8proc_wrapper.cpp
Line
Count
Source
1
#include "utf8proc_wrapper.hpp"
2
#include "utf8proc.hpp"
3
#include "duckdb/common/assert.hpp"
4
#include "duckdb/common/exception.hpp"
5
#include "duckdb/common/helper.hpp"
6
7
8
namespace duckdb {
9
10
// This function efficiently checks if a string is valid UTF8.
11
// It was originally written by Sjoerd Mullender.
12
13
// Here is the table that makes it work:
14
15
// B    = Number of Bytes in UTF8 encoding
16
// C_MIN  = First Unicode code point
17
// C_MAX  = Last Unicode code point
18
// B1     = First Byte Prefix
19
20
//  B C_MIN   C_MAX   B1
21
//  1 U+000000  U+00007F    0xxxxxxx
22
//  2 U+000080  U+0007FF    110xxxxx
23
//  3 U+000800  U+00FFFF    1110xxxx
24
//  4 U+010000  U+10FFFF    11110xxx
25
26
// Reports a UTF-8 validation failure to the caller.
// Both output pointers are optional: a null pointer simply means the caller is not
// interested in that piece of information.
//   invalid_reason - receives `reason` when non-null
//   invalid_pos    - receives `pos` (byte offset of the failure) when non-null
static void AssignInvalidUTF8Reason(UnicodeInvalidReason *invalid_reason, size_t *invalid_pos, size_t pos,
                                    UnicodeInvalidReason reason) {
  if (invalid_pos != nullptr) {
    *invalid_pos = pos;
  }
  if (invalid_reason != nullptr) {
    *invalid_reason = reason;
  }
}
35
36
template <const int nextra_bytes, const int mask>
37
static inline UnicodeType UTF8ExtraByteLoop(const int first_pos_seq, int utf8char, size_t &i, const char *s,
38
                                            const size_t len, UnicodeInvalidReason *invalid_reason,
39
5.11M
                                            size_t *invalid_pos) {
40
5.11M
  if ((len - i) < (nextra_bytes + 1)) {
41
    /* incomplete byte sequence */
42
496
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH);
43
496
    return UnicodeType::INVALID;
44
496
  }
45
15.3M
  for (size_t j = 0; j < nextra_bytes; j++) {
46
10.2M
    int c = (int)s[++i];
47
    /* now validate the extra bytes */
48
10.2M
    if ((c & 0xC0) != 0x80) {
49
      /* extra byte is not in the format 10xxxxxx */
50
2.62k
      AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
51
2.62k
      return UnicodeType::INVALID;
52
2.62k
    }
53
10.2M
    utf8char = (utf8char << 6) | (c & 0x3F);
54
10.2M
  }
55
5.11M
  if ((utf8char & mask) == 0) {
56
    /* invalid UTF-8 codepoint, not shortest possible */
57
227
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
58
227
    return UnicodeType::INVALID;
59
227
  }
60
5.11M
  if (utf8char > 0x10FFFF) {
61
    /* value not representable by Unicode */
62
12
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
63
12
    return UnicodeType::INVALID;
64
12
  }
65
5.11M
  if ((utf8char & 0x1FFF800) == 0xD800) {
66
    /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8
67
     */
68
17
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
69
17
    return UnicodeType::INVALID;
70
17
  }
71
5.11M
  return UnicodeType::UTF8;
72
5.11M
}
utf8proc_wrapper.cpp:duckdb::UnicodeType duckdb::UTF8ExtraByteLoop<1, 1920>(int, int, unsigned long&, char const*, unsigned long, duckdb::UnicodeInvalidReason*, unsigned long*)
Line
Count
Source
39
9.57k
                                            size_t *invalid_pos) {
40
9.57k
  if ((len - i) < (nextra_bytes + 1)) {
41
    /* incomplete byte sequence */
42
130
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH);
43
130
    return UnicodeType::INVALID;
44
130
  }
45
17.0k
  for (size_t j = 0; j < nextra_bytes; j++) {
46
9.44k
    int c = (int)s[++i];
47
    /* now validate the extra bytes */
48
9.44k
    if ((c & 0xC0) != 0x80) {
49
      /* extra byte is not in the format 10xxxxxx */
50
1.84k
      AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
51
1.84k
      return UnicodeType::INVALID;
52
1.84k
    }
53
7.60k
    utf8char = (utf8char << 6) | (c & 0x3F);
54
7.60k
  }
55
7.60k
  if ((utf8char & mask) == 0) {
56
    /* invalid UTF-8 codepoint, not shortest possible */
57
125
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
58
125
    return UnicodeType::INVALID;
59
125
  }
60
7.47k
  if (utf8char > 0x10FFFF) {
61
    /* value not representable by Unicode */
62
0
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
63
0
    return UnicodeType::INVALID;
64
0
  }
65
7.47k
  if ((utf8char & 0x1FFF800) == 0xD800) {
66
    /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8
67
     */
68
0
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
69
0
    return UnicodeType::INVALID;
70
0
  }
71
7.47k
  return UnicodeType::UTF8;
72
7.47k
}
utf8proc_wrapper.cpp:duckdb::UnicodeType duckdb::UTF8ExtraByteLoop<2, 63488>(int, int, unsigned long&, char const*, unsigned long, duckdb::UnicodeInvalidReason*, unsigned long*)
Line
Count
Source
39
5.09M
                                            size_t *invalid_pos) {
40
5.09M
  if ((len - i) < (nextra_bytes + 1)) {
41
    /* incomplete byte sequence */
42
127
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH);
43
127
    return UnicodeType::INVALID;
44
127
  }
45
15.2M
  for (size_t j = 0; j < nextra_bytes; j++) {
46
10.1M
    int c = (int)s[++i];
47
    /* now validate the extra bytes */
48
10.1M
    if ((c & 0xC0) != 0x80) {
49
      /* extra byte is not in the format 10xxxxxx */
50
481
      AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
51
481
      return UnicodeType::INVALID;
52
481
    }
53
10.1M
    utf8char = (utf8char << 6) | (c & 0x3F);
54
10.1M
  }
55
5.08M
  if ((utf8char & mask) == 0) {
56
    /* invalid UTF-8 codepoint, not shortest possible */
57
101
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
58
101
    return UnicodeType::INVALID;
59
101
  }
60
5.08M
  if (utf8char > 0x10FFFF) {
61
    /* value not representable by Unicode */
62
0
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
63
0
    return UnicodeType::INVALID;
64
0
  }
65
5.08M
  if ((utf8char & 0x1FFF800) == 0xD800) {
66
    /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8
67
     */
68
17
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
69
17
    return UnicodeType::INVALID;
70
17
  }
71
5.08M
  return UnicodeType::UTF8;
72
5.08M
}
utf8proc_wrapper.cpp:duckdb::UnicodeType duckdb::UTF8ExtraByteLoop<3, 2031616>(int, int, unsigned long&, char const*, unsigned long, duckdb::UnicodeInvalidReason*, unsigned long*)
Line
Count
Source
39
15.4k
                                            size_t *invalid_pos) {
40
15.4k
  if ((len - i) < (nextra_bytes + 1)) {
41
    /* incomplete byte sequence */
42
239
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::BYTE_MISMATCH);
43
239
    return UnicodeType::INVALID;
44
239
  }
45
60.0k
  for (size_t j = 0; j < nextra_bytes; j++) {
46
45.1k
    int c = (int)s[++i];
47
    /* now validate the extra bytes */
48
45.1k
    if ((c & 0xC0) != 0x80) {
49
      /* extra byte is not in the format 10xxxxxx */
50
308
      AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
51
308
      return UnicodeType::INVALID;
52
308
    }
53
44.8k
    utf8char = (utf8char << 6) | (c & 0x3F);
54
44.8k
  }
55
14.9k
  if ((utf8char & mask) == 0) {
56
    /* invalid UTF-8 codepoint, not shortest possible */
57
1
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
58
1
    return UnicodeType::INVALID;
59
1
  }
60
14.9k
  if (utf8char > 0x10FFFF) {
61
    /* value not representable by Unicode */
62
12
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
63
12
    return UnicodeType::INVALID;
64
12
  }
65
14.9k
  if ((utf8char & 0x1FFF800) == 0xD800) {
66
    /* Unicode characters from U+D800 to U+DFFF are surrogate characters used by UTF-16 which are invalid in UTF-8
67
     */
68
0
    AssignInvalidUTF8Reason(invalid_reason, invalid_pos, first_pos_seq, UnicodeInvalidReason::INVALID_UNICODE);
69
0
    return UnicodeType::INVALID;
70
0
  }
71
14.9k
  return UnicodeType::UTF8;
72
14.9k
}
73
74
62.4M
// Classifies a byte buffer as ASCII, valid UTF-8 or invalid.
//
//   s, len         - buffer to inspect
//   invalid_reason - optional; receives the failure category when the buffer is invalid
//   invalid_pos    - optional; receives the byte offset associated with the failure
// Returns UnicodeType::ASCII when no byte has its high bit set, UnicodeType::UTF8 when at least one
// multi-byte sequence was seen and all of them are well formed, and UnicodeType::INVALID as soon as
// the first malformed sequence is found.
UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *invalid_reason, size_t *invalid_pos) {
  // One bit per byte of a 64-bit word: set when that byte is non-ASCII
  static constexpr uint64_t MASK = 0x8080808080808080U;
  constexpr size_t WORD_SIZE = sizeof(uint64_t);

  UnicodeType type = UnicodeType::ASCII;
  size_t i = 0;
  while (i < len) {
    // Fast path: skip whole 8-byte words for as long as they contain only ASCII
    while (i + WORD_SIZE <= len && !(Load<uint64_t>(const_data_ptr_cast(s + i)) & MASK)) {
      i += WORD_SIZE;
    }
    // Slow path: inspect (up to) the next 8 bytes one at a time
    const auto end = MinValue(i + WORD_SIZE, len);
    for (; i < end; i++) {
      const int c = static_cast<int>(s[i]);
      if ((c & 0x80) == 0) {
        continue;
      }
      // NOTE(review): the offset is narrowed to int here because that is what the helper takes
      const int first_pos_seq = static_cast<int>(i);

      if ((c & 0xE0) == 0xC0) {
        /* lead byte 110xxxxx: 2 byte sequence */
        type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, c & 0x1F, i, s, len, invalid_reason, invalid_pos);
      } else if ((c & 0xF0) == 0xE0) {
        /* lead byte 1110xxxx: 3 byte sequence */
        type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, c & 0x0F, i, s, len, invalid_reason, invalid_pos);
      } else if ((c & 0xF8) == 0xF0) {
        /* lead byte 11110xxx: 4 byte sequence */
        type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, c & 0x07, i, s, len, invalid_reason, invalid_pos);
      } else {
        /* this byte cannot start a UTF-8 sequence */
        AssignInvalidUTF8Reason(invalid_reason, invalid_pos, i, UnicodeInvalidReason::BYTE_MISMATCH);
        return UnicodeType::INVALID;
      }
      if (type == UnicodeType::INVALID) {
        return type;
      }
    }
  }
  return type;
}
118
119
0
void Utf8Proc::MakeValid(char *s, size_t len, char special_flag) {
120
0
  D_ASSERT(special_flag <= 127);
121
0
  UnicodeType type = UnicodeType::ASCII;
122
0
  for (size_t i = 0; i < len; i++) {
123
0
    int c = (int)s[i];
124
0
    if ((c & 0x80) == 0) {
125
0
      continue;
126
0
    }
127
0
    int first_pos_seq = i;
128
0
    if ((c & 0xE0) == 0xC0) {
129
      /* 2 byte sequence */
130
0
      int utf8char = c & 0x1F;
131
0
      type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr);
132
0
    } else if ((c & 0xF0) == 0xE0) {
133
      /* 3 byte sequence */
134
0
      int utf8char = c & 0x0F;
135
0
      type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr);
136
0
    } else if ((c & 0xF8) == 0xF0) {
137
      /* 4 byte sequence */
138
0
      int utf8char = c & 0x07;
139
0
      type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr);
140
0
    } else {
141
      /* invalid UTF-8 start byte */
142
0
      s[i] = special_flag; // Rewrite invalid byte
143
0
    }
144
0
    if (type == UnicodeType::INVALID) {
145
0
      for (size_t j = first_pos_seq; j <= i; j++) {
146
0
        s[j] = special_flag; // Rewrite each byte of the invalid sequence
147
0
      }
148
0
      type = UnicodeType::ASCII;
149
0
    }
150
0
  }
151
0
  D_ASSERT(Utf8Proc::IsValid(s, len));
152
0
}
153
154
0
char *Utf8Proc::Normalize(const char *s, size_t len) {
155
0
  assert(s);
156
0
  assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID);
157
0
  return (char *)utf8proc_NFC((const utf8proc_uint8_t *)s, len);
158
0
}
159
160
877k
bool Utf8Proc::IsValid(const char *s, size_t len) {
161
877k
  return Utf8Proc::Analyze(s, len) != UnicodeType::INVALID;
162
877k
}
163
164
0
std::string Utf8Proc::RemoveInvalid(const char *s, size_t len) {
165
0
  std::string result;
166
0
  result.reserve(len); // Reserve the maximum possible size
167
168
0
  for (size_t i = 0; i < len; i++) {
169
0
    int c = (int)s[i];
170
0
    if ((c & 0x80) == 0) {
171
      // ASCII character - always valid
172
0
      result.push_back(s[i]);
173
0
      continue;
174
0
    }
175
176
0
    int first_pos_seq = i;
177
0
    if ((c & 0xE0) == 0xC0) {
178
      /* 2 byte sequence */
179
0
      int utf8char = c & 0x1F;
180
0
      UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr);
181
0
    } else if ((c & 0xF0) == 0xE0) {
182
      /* 3 byte sequence */
183
0
      int utf8char = c & 0x0F;
184
0
      UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr);
185
0
    } else if ((c & 0xF8) == 0xF0) {
186
      /* 4 byte sequence */
187
0
      int utf8char = c & 0x07;
188
0
      UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr);
189
0
    } else {
190
      // invalid, do not write to output
191
0
      continue;
192
0
    }
193
194
    // If we get here, the sequence is valid, so add all bytes of the sequence to result
195
0
    for (size_t j = first_pos_seq; j <= i; j++) {
196
0
      result.push_back(s[j]);
197
0
    }
198
0
  }
199
200
0
  D_ASSERT(Utf8Proc::IsValid(result.c_str(), result.size()));
201
0
  return result;
202
0
}
203
204
24.9M
size_t Utf8Proc::NextGraphemeCluster(const char *s, size_t len, size_t cpos) {
205
24.9M
  int sz;
206
24.9M
  auto prev_codepoint = Utf8Proc::UTF8ToCodepoint(s + cpos, sz);
207
24.9M
  utf8proc_int32_t state = 0;
208
24.9M
  while (true) {
209
24.9M
    cpos += sz;
210
24.9M
    if (cpos >= len) {
211
1.72M
      return cpos;
212
1.72M
    }
213
23.2M
    auto next_codepoint = Utf8Proc::UTF8ToCodepoint(s + cpos, sz);
214
23.2M
    if (utf8proc_grapheme_break_stateful(prev_codepoint, next_codepoint, &state)) {
215
      // found a grapheme break here
216
23.2M
      return cpos;
217
23.2M
    }
218
    // not a grapheme break, move on to next codepoint
219
1.84k
    prev_codepoint = next_codepoint;
220
1.84k
  }
221
24.9M
}
222
223
0
// Counts the grapheme clusters produced by iterating Utf8Proc::GraphemeClusters over the buffer.
size_t Utf8Proc::GraphemeCount(const char *input_data, size_t input_size) {
  size_t cluster_total = 0;
  auto clusters = Utf8Proc::GraphemeClusters(input_data, input_size);
  for (auto cluster : clusters) {
    (void)cluster; // only the number of clusters matters
    ++cluster_total;
  }
  return cluster_total;
}
231
232
0
int32_t Utf8Proc::CodepointToUpper(int32_t codepoint) {
233
0
  return utf8proc_toupper(codepoint);
234
0
}
235
236
0
int32_t Utf8Proc::CodepointToLower(int32_t codepoint) {
237
0
  return utf8proc_tolower(codepoint);
238
0
}
239
240
0
// Stores the buffer pointer and length; the buffer itself is not copied.
GraphemeIterator::GraphemeIterator(const char *s, size_t len)
    : s(s), len(len) {}
242
243
0
// Builds a GraphemeIterator over the given buffer, for use in a range-based for loop.
GraphemeIterator Utf8Proc::GraphemeClusters(const char *s, size_t len) {
  GraphemeIterator clusters(s, len);
  return clusters;
}
246
247
0
// Creates a cluster iterator over the given buffer.
// A null buffer yields an iterator in the invalid state; otherwise the iterator is advanced
// once so that it refers to the first cluster.
GraphemeIterator::GraphemeClusterIterator::GraphemeClusterIterator(const char *s_p, size_t len_p) : s(s_p), len(len_p) {
  if (!s) {
    SetInvalid();
    return;
  }
  cluster.start = 0;
  cluster.end = 0;
  Next();
}
256
257
0
void GraphemeIterator::GraphemeClusterIterator::SetInvalid() {
258
0
  s = nullptr;
259
0
  len = 0;
260
0
  cluster.start = 0;
261
0
  cluster.end = 0;
262
0
}
263
264
0
bool GraphemeIterator::GraphemeClusterIterator::IsInvalid() const {
265
0
  return !s;
266
0
}
267
268
0
void GraphemeIterator::GraphemeClusterIterator::Next() {
269
0
  if (IsInvalid()) {
270
0
    throw std::runtime_error("Grapheme cluster out of bounds!");
271
0
  }
272
0
  if (cluster.end >= len) {
273
    // out of bounds
274
0
    SetInvalid();
275
0
    return;
276
0
  }
277
0
  size_t next_pos = Utf8Proc::NextGraphemeCluster(s, len, cluster.end);
278
0
  cluster.start = cluster.end;
279
0
  cluster.end = next_pos;
280
0
}
281
282
0
// Pre-increment: advances to the next cluster (see Next) and returns this iterator.
GraphemeIterator::GraphemeClusterIterator &GraphemeIterator::GraphemeClusterIterator::operator++() {
  this->Next();
  return *this;
}
286
0
// Two iterators differ when any of buffer length, buffer pointer or cluster bounds differ.
bool GraphemeIterator::GraphemeClusterIterator::operator!=(const GraphemeClusterIterator &other) const {
  return len != other.len || s != other.s || cluster.start != other.cluster.start ||
         cluster.end != other.cluster.end;
}
290
291
0
// Returns the cluster the iterator currently refers to.
// Throws std::runtime_error when the iterator is invalid.
GraphemeCluster GraphemeIterator::GraphemeClusterIterator::operator*() const {
  if (!IsInvalid()) {
    return cluster;
  }
  throw std::runtime_error("Grapheme cluster out of bounds!");
}
297
298
0
size_t Utf8Proc::PreviousGraphemeCluster(const char *s, size_t len, size_t cpos) {
299
0
  if (!Utf8Proc::IsValid(s, len)) {
300
0
    return cpos - 1;
301
0
  }
302
0
  size_t current_pos = 0;
303
0
  while (true) {
304
0
    size_t new_pos = NextGraphemeCluster(s, len, current_pos);
305
0
    if (new_pos <= current_pos || new_pos >= cpos) {
306
0
      return current_pos;
307
0
    }
308
0
    current_pos = new_pos;
309
0
  }
310
0
}
311
312
0
// Encodes a Unicode codepoint as UTF-8.
//
//   cp - codepoint to encode
//   sz - receives the number of bytes written (1-4), or -1 when cp cannot be encoded
//   c  - output buffer; up to 4 bytes are written and no terminator is appended
// Returns true on success. Returns false, without writing to c, for negative values, for the
// surrogate range U+D800..U+DFFF and for values above U+10FFFF.
bool Utf8Proc::CodepointToUtf8(int cp, int &sz, char *c) {
  const bool is_surrogate = 0xd800 <= cp && cp <= 0xdfff;
  // Negative values must be rejected explicitly: they would otherwise satisfy `cp <= 0x7F` and
  // be emitted as a single truncated byte.
  if (cp < 0 || cp > 0x10FFFF || is_surrogate) {
    sz = -1;
    return false;
  }
  if (cp <= 0x7F) {
    sz = 1;
    c[0] = static_cast<char>(cp);
  } else if (cp <= 0x7FF) {
    sz = 2;
    c[0] = static_cast<char>((cp >> 6) + 192);
    c[1] = static_cast<char>((cp & 63) + 128);
  } else if (cp <= 0xFFFF) {
    sz = 3;
    c[0] = static_cast<char>((cp >> 12) + 224);
    c[1] = static_cast<char>(((cp >> 6) & 63) + 128);
    c[2] = static_cast<char>((cp & 63) + 128);
  } else {
    sz = 4;
    c[0] = static_cast<char>((cp >> 18) + 240);
    c[1] = static_cast<char>(((cp >> 12) & 63) + 128);
    c[2] = static_cast<char>(((cp >> 6) & 63) + 128);
    c[3] = static_cast<char>((cp & 63) + 128);
  }
  return true;
}
341
342
0
int Utf8Proc::CodepointLength(int cp) {
343
0
  if (cp <= 0x7F) {
344
0
    return 1;
345
0
  }
346
0
   if (cp <= 0x7FF) {
347
0
    return 2;
348
0
  }
349
0
   if (0xd800 <= cp && cp <= 0xdfff) {
350
0
    throw InternalException("invalid code point detected in Utf8Proc::CodepointLength (0xd800 to 0xdfff), likely due to invalid UTF-8");
351
0
  }
352
0
   if (cp <= 0xFFFF) {
353
0
    return 3;
354
0
  }
355
0
   if (cp <= 0x10FFFF) {
356
0
    return 4;
357
0
  }
358
0
  throw InternalException("invalid code point detected in Utf8Proc::CodepointLength, likely due to invalid UTF-8");
359
0
}
360
361
73.2M
int32_t Utf8Proc::UTF8ToCodepoint(const char *u_input, int &sz) {
362
  // from http://www.zedwood.com/article/cpp-utf8-char-to-codepoint
363
73.2M
  auto u = reinterpret_cast<const unsigned char *>(u_input);
364
73.2M
  unsigned char u0 = u[0];
365
73.2M
  if (u0 <= 127) {
366
66.5M
    sz = 1;
367
66.5M
    return u0;
368
66.5M
  }
369
6.62M
  unsigned char u1 = u[1];
370
6.62M
  if (u0 >= 192 && u0 <= 223) {
371
11.0k
    sz = 2;
372
11.0k
    return (u0 - 192) * 64 + (u1 - 128);
373
11.0k
  }
374
6.61M
  if (u[0] == 0xed && (u[1] & 0xa0) == 0xa0) {
375
0
    throw InternalException("invalid code point detected in Utf8Proc::UTF8ToCodepoint (0xd800 to 0xdfff), likely due to invalid UTF-8");
376
0
  }
377
6.61M
  unsigned char u2 = u[2];
378
6.61M
  if (u0 >= 224 && u0 <= 239) {
379
6.61M
    sz = 3;
380
6.61M
    return (u0 - 224) * 4096 + (u1 - 128) * 64 + (u2 - 128);
381
6.61M
  }
382
553
  unsigned char u3 = u[3];
383
553
  if (u0 >= 240 && u0 <= 247) {
384
553
    sz = 4;
385
553
    return (u0 - 240) * 262144 + (u1 - 128) * 4096 + (u2 - 128) * 64 + (u3 - 128);
386
553
  }
387
0
  throw InternalException("invalid code point detected in Utf8Proc::UTF8ToCodepoint, likely due to invalid UTF-8");
388
553
}
389
390
24.9M
size_t Utf8Proc::RenderWidth(const char *s, size_t len, size_t pos) {
391
24.9M
  int sz;
392
24.9M
  auto codepoint = Utf8Proc::UTF8ToCodepoint(s + pos, sz);
393
24.9M
  auto properties = duckdb::utf8proc_get_property(codepoint);
394
24.9M
  return properties->charwidth;
395
24.9M
}
396
397
0
// Returns the total render width of a string, summed over its grapheme clusters.
size_t Utf8Proc::RenderWidth(const std::string &str) {
  const char *data = str.c_str();
  const size_t size = str.size();

  size_t total_width = 0;
  auto clusters = Utf8Proc::GraphemeClusters(data, size);
  for (auto cluster : clusters) {
    // Only the first codepoint of a cluster contributes: combining marks, ZWJ, variation
    // selectors, etc. have charwidth 0, and a multi-codepoint cluster (e.g. a ZWJ emoji
    // sequence) should count as wide as its base character, not as the sum of its parts.
    total_width += Utf8Proc::RenderWidth(data, size, cluster.start);
  }
  return total_width;
}
408
409
} // namespace duckdb