Coverage Report

Created: 2026-01-10 06:41

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/include/simdutf/scalar/utf8.h
Line
Count
Source
1
#ifndef SIMDUTF_UTF8_H
2
#define SIMDUTF_UTF8_H
3
4
namespace simdutf {
5
namespace scalar {
6
namespace {
7
namespace utf8 {
8
9
// credit: based on code from Google Fuchsia (Apache Licensed)
10
template <class BytePtr>
11
simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data,
12
0
                                                      size_t len) noexcept {
13
0
  static_assert(
14
0
      std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
15
0
      "dereferencing the data pointer must result in a uint8_t");
16
0
  uint64_t pos = 0;
17
0
  uint32_t code_point = 0;
18
0
  while (pos < len) {
19
0
    uint64_t next_pos;
20
0
#if SIMDUTF_CPLUSPLUS23
21
0
    if !consteval
22
0
#endif
23
0
    { // check if the next 16 bytes are ascii.
24
0
      next_pos = pos + 16;
25
0
      if (next_pos <= len) { // if it is safe to read 16 more bytes, check
26
0
                             // that they are ascii
27
0
        uint64_t v1{};
28
0
        std::memcpy(&v1, data + pos, sizeof(uint64_t));
29
0
        uint64_t v2{};
30
0
        std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
31
0
        uint64_t v{v1 | v2};
32
0
        if ((v & 0x8080808080808080) == 0) {
33
0
          pos = next_pos;
34
0
          continue;
35
0
        }
36
0
      }
37
0
    }
38
0
39
0
    unsigned char byte = data[pos];
40
0
41
0
    while (byte < 0b10000000) {
42
0
      if (++pos == len) {
43
0
        return true;
44
0
      }
45
0
      byte = data[pos];
46
0
    }
47
0
48
0
    if ((byte & 0b11100000) == 0b11000000) {
49
0
      next_pos = pos + 2;
50
0
      if (next_pos > len) {
51
0
        return false;
52
0
      }
53
0
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
54
0
        return false;
55
0
      }
56
0
      // range check
57
0
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
58
0
      if ((code_point < 0x80) || (0x7ff < code_point)) {
59
0
        return false;
60
0
      }
61
0
    } else if ((byte & 0b11110000) == 0b11100000) {
62
0
      next_pos = pos + 3;
63
0
      if (next_pos > len) {
64
0
        return false;
65
0
      }
66
0
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
67
0
        return false;
68
0
      }
69
0
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
70
0
        return false;
71
0
      }
72
0
      // range check
73
0
      code_point = (byte & 0b00001111) << 12 |
74
0
                   (data[pos + 1] & 0b00111111) << 6 |
75
0
                   (data[pos + 2] & 0b00111111);
76
0
      if ((code_point < 0x800) || (0xffff < code_point) ||
77
0
          (0xd7ff < code_point && code_point < 0xe000)) {
78
0
        return false;
79
0
      }
80
0
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
81
0
      next_pos = pos + 4;
82
0
      if (next_pos > len) {
83
0
        return false;
84
0
      }
85
0
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
86
0
        return false;
87
0
      }
88
0
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
89
0
        return false;
90
0
      }
91
0
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
92
0
        return false;
93
0
      }
94
0
      // range check
95
0
      code_point =
96
0
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
97
0
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
98
0
      if (code_point <= 0xffff || 0x10ffff < code_point) {
99
0
        return false;
100
0
      }
101
0
    } else {
102
0
      // we may have a continuation
103
0
      return false;
104
0
    }
105
0
    pos = next_pos;
106
0
  }
107
0
  return true;
108
0
}
Unexecuted instantiation: roundtrip.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const*>(unsigned char const*, unsigned long)
Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const*>(unsigned char const*, unsigned long)
Unexecuted instantiation: misc.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const*>(unsigned char const*, unsigned long)
Unexecuted instantiation: conversion.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const*>(unsigned char const*, unsigned long)
109
110
simdutf_really_inline simdutf_warn_unused bool validate(const char *buf,
111
0
                                                        size_t len) noexcept {
112
0
  return validate(reinterpret_cast<const uint8_t *>(buf), len);
113
0
}
Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long)
Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long)
Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long)
Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const*, unsigned long)
114
115
template <class BytePtr>
116
simdutf_constexpr23 simdutf_warn_unused result
117
0
validate_with_errors(BytePtr data, size_t len) noexcept {
118
0
  static_assert(
119
0
      std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
120
0
      "dereferencing the data pointer must result in a uint8_t");
121
0
  size_t pos = 0;
122
0
  uint32_t code_point = 0;
123
0
  while (pos < len) {
124
0
    // check if the next 16 bytes are ascii.
125
0
    size_t next_pos = pos + 16;
126
0
    if (next_pos <=
127
0
        len) { // if it is safe to read 16 more bytes, check that they are ascii
128
0
      uint64_t v1;
129
0
      std::memcpy(&v1, data + pos, sizeof(uint64_t));
130
0
      uint64_t v2;
131
0
      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
132
0
      uint64_t v{v1 | v2};
133
0
      if ((v & 0x8080808080808080) == 0) {
134
0
        pos = next_pos;
135
0
        continue;
136
0
      }
137
0
    }
138
0
    unsigned char byte = data[pos];
139
0
140
0
    while (byte < 0b10000000) {
141
0
      if (++pos == len) {
142
0
        return result(error_code::SUCCESS, len);
143
0
      }
144
0
      byte = data[pos];
145
0
    }
146
0
147
0
    if ((byte & 0b11100000) == 0b11000000) {
148
0
      next_pos = pos + 2;
149
0
      if (next_pos > len) {
150
0
        return result(error_code::TOO_SHORT, pos);
151
0
      }
152
0
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
153
0
        return result(error_code::TOO_SHORT, pos);
154
0
      }
155
0
      // range check
156
0
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
157
0
      if ((code_point < 0x80) || (0x7ff < code_point)) {
158
0
        return result(error_code::OVERLONG, pos);
159
0
      }
160
0
    } else if ((byte & 0b11110000) == 0b11100000) {
161
0
      next_pos = pos + 3;
162
0
      if (next_pos > len) {
163
0
        return result(error_code::TOO_SHORT, pos);
164
0
      }
165
0
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
166
0
        return result(error_code::TOO_SHORT, pos);
167
0
      }
168
0
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
169
0
        return result(error_code::TOO_SHORT, pos);
170
0
      }
171
0
      // range check
172
0
      code_point = (byte & 0b00001111) << 12 |
173
0
                   (data[pos + 1] & 0b00111111) << 6 |
174
0
                   (data[pos + 2] & 0b00111111);
175
0
      if ((code_point < 0x800) || (0xffff < code_point)) {
176
0
        return result(error_code::OVERLONG, pos);
177
0
      }
178
0
      if (0xd7ff < code_point && code_point < 0xe000) {
179
0
        return result(error_code::SURROGATE, pos);
180
0
      }
181
0
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
182
0
      next_pos = pos + 4;
183
0
      if (next_pos > len) {
184
0
        return result(error_code::TOO_SHORT, pos);
185
0
      }
186
0
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
187
0
        return result(error_code::TOO_SHORT, pos);
188
0
      }
189
0
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
190
0
        return result(error_code::TOO_SHORT, pos);
191
0
      }
192
0
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
193
0
        return result(error_code::TOO_SHORT, pos);
194
0
      }
195
0
      // range check
196
0
      code_point =
197
0
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
198
0
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
199
0
      if (code_point <= 0xffff) {
200
0
        return result(error_code::OVERLONG, pos);
201
0
      }
202
0
      if (0x10ffff < code_point) {
203
0
        return result(error_code::TOO_LARGE, pos);
204
0
      }
205
0
    } else {
206
0
      // we either have too many continuation bytes or an invalid leading byte
207
0
      if ((byte & 0b11000000) == 0b10000000) {
208
0
        return result(error_code::TOO_LONG, pos);
209
0
      } else {
210
0
        return result(error_code::HEADER_BITS, pos);
211
0
      }
212
0
    }
213
0
    pos = next_pos;
214
0
  }
215
0
  return result(error_code::SUCCESS, len);
216
0
}
Unexecuted instantiation: roundtrip.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long)
Unexecuted instantiation: base64.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long)
Unexecuted instantiation: misc.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long)
Unexecuted instantiation: conversion.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const*>(unsigned char const*, unsigned long)
217
218
simdutf_really_inline simdutf_warn_unused result
219
0
validate_with_errors(const char *buf, size_t len) noexcept {
220
0
  return validate_with_errors(reinterpret_cast<const uint8_t *>(buf), len);
221
0
}
Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long)
Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long)
Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long)
Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const*, unsigned long)
222
223
// Finds the previous leading byte starting backward from buf and validates with
224
// errors from there. Used to pinpoint the location of an error when an invalid
225
// chunk is detected. We assume that the stream starts with a leading byte, and
226
// to check that it is the case, we ask that you pass a pointer to the start of
227
// the stream (start).
228
inline simdutf_warn_unused result rewind_and_validate_with_errors(
229
0
    const char *start, const char *buf, size_t len) noexcept {
230
0
  // First check that we start with a leading byte
231
0
  if ((*start & 0b11000000) == 0b10000000) {
232
0
    return result(error_code::TOO_LONG, 0);
233
0
  }
234
0
  size_t extra_len{0};
235
0
  // A leading byte cannot be further than 4 bytes away
236
0
  for (int i = 0; i < 5; i++) {
237
0
    unsigned char byte = *buf;
238
0
    if ((byte & 0b11000000) != 0b10000000) {
239
0
      break;
240
0
    } else {
241
0
      buf--;
242
0
      extra_len++;
243
0
    }
244
0
  }
245
0
246
0
  result res = validate_with_errors(buf, len + extra_len);
247
0
  res.count -= extra_len;
248
0
  return res;
249
0
}
Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long)
Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long)
Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long)
Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const*, char const*, unsigned long)
250
251
template <typename InputPtr>
252
#if SIMDUTF_CPLUSPLUS20
253
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
254
#endif
255
simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) {
256
  size_t counter{0};
257
  for (size_t i = 0; i < len; i++) {
258
    // -65 is 0b10111111; anything larger in two's complement should start a new
259
    // code point.
260
    if (int8_t(data[i]) > -65) {
261
      counter++;
262
    }
263
  }
264
  return counter;
265
}
266
267
template <typename InputPtr>
268
#if SIMDUTF_CPLUSPLUS20
269
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
270
#endif
271
simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) {
272
  size_t counter{0};
273
  for (size_t i = 0; i < len; i++) {
274
    if (int8_t(data[i]) > -65) {
275
      counter++;
276
    }
277
    if (uint8_t(data[i]) >= 240) {
278
      counter++;
279
    }
280
  }
281
  return counter;
282
}
283
284
template <typename InputPtr>
285
#if SIMDUTF_CPLUSPLUS20
286
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
287
#endif
288
simdutf_warn_unused simdutf_constexpr23 size_t
289
trim_partial_utf8(InputPtr input, size_t length) {
290
  if (length < 3) {
291
    switch (length) {
292
    case 2:
293
      if (uint8_t(input[length - 1]) >= 0xc0) {
294
        return length - 1;
295
      } // 2-, 3- and 4-byte characters with only 1 byte left
296
      if (uint8_t(input[length - 2]) >= 0xe0) {
297
        return length - 2;
298
      } // 3- and 4-byte characters with only 2 bytes left
299
      return length;
300
    case 1:
301
      if (uint8_t(input[length - 1]) >= 0xc0) {
302
        return length - 1;
303
      } // 2-, 3- and 4-byte characters with only 1 byte left
304
      return length;
305
    case 0:
306
      return length;
307
    }
308
  }
309
  if (uint8_t(input[length - 1]) >= 0xc0) {
310
    return length - 1;
311
  } // 2-, 3- and 4-byte characters with only 1 byte left
312
  if (uint8_t(input[length - 2]) >= 0xe0) {
313
    return length - 2;
314
  } // 3- and 4-byte characters with only 2 bytes left
315
  if (uint8_t(input[length - 3]) >= 0xf0) {
316
    return length - 3;
317
  } // 4-byte characters with only 3 bytes left
318
  return length;
319
}
320
321
} // namespace utf8
322
} // unnamed namespace
323
} // namespace scalar
324
} // namespace simdutf
325
326
#endif