Coverage Report

Created: 2025-11-11 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/simdutf/src/scalar/utf8.h
Line
Count
Source
1
#ifndef SIMDUTF_UTF8_H
2
#define SIMDUTF_UTF8_H
3
4
namespace simdutf {
5
namespace scalar {
6
namespace {
7
namespace utf8 {
8
#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV
9
// only used by the fallback kernel.
10
// credit: based on code from Google Fuchsia (Apache Licensed)
11
3.44k
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
12
3.44k
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
13
3.44k
  uint64_t pos = 0;
14
3.44k
  uint32_t code_point = 0;
15
5.92M
  while (pos < len) {
16
    // check of the next 16 bytes are ascii.
17
5.92M
    uint64_t next_pos = pos + 16;
18
5.92M
    if (next_pos <=
19
5.92M
        len) { // if it is safe to read 16 more bytes, check that they are ascii
20
5.91M
      uint64_t v1;
21
5.91M
      std::memcpy(&v1, data + pos, sizeof(uint64_t));
22
5.91M
      uint64_t v2;
23
5.91M
      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
24
5.91M
      uint64_t v{v1 | v2};
25
5.91M
      if ((v & 0x8080808080808080) == 0) {
26
4.86M
        pos = next_pos;
27
4.86M
        continue;
28
4.86M
      }
29
5.91M
    }
30
1.06M
    unsigned char byte = data[pos];
31
32
6.07M
    while (byte < 0b10000000) {
33
5.01M
      if (++pos == len) {
34
1.04k
        return true;
35
1.04k
      }
36
5.01M
      byte = data[pos];
37
5.01M
    }
38
39
1.06M
    if ((byte & 0b11100000) == 0b11000000) {
40
679k
      next_pos = pos + 2;
41
679k
      if (next_pos > len) {
42
57
        return false;
43
57
      }
44
679k
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
45
250
        return false;
46
250
      }
47
      // range check
48
678k
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
49
678k
      if ((code_point < 0x80) || (0x7ff < code_point)) {
50
38
        return false;
51
38
      }
52
678k
    } else if ((byte & 0b11110000) == 0b11100000) {
53
318k
      next_pos = pos + 3;
54
318k
      if (next_pos > len) {
55
58
        return false;
56
58
      }
57
317k
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
58
101
        return false;
59
101
      }
60
317k
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
61
54
        return false;
62
54
      }
63
      // range check
64
317k
      code_point = (byte & 0b00001111) << 12 |
65
317k
                   (data[pos + 1] & 0b00111111) << 6 |
66
317k
                   (data[pos + 2] & 0b00111111);
67
317k
      if ((code_point < 0x800) || (0xffff < code_point) ||
68
317k
          (0xd7ff < code_point && code_point < 0xe000)) {
69
63
        return false;
70
63
      }
71
317k
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
72
62.7k
      next_pos = pos + 4;
73
62.7k
      if (next_pos > len) {
74
49
        return false;
75
49
      }
76
62.7k
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
77
113
        return false;
78
113
      }
79
62.6k
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
80
43
        return false;
81
43
      }
82
62.5k
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
83
40
        return false;
84
40
      }
85
      // range check
86
62.5k
      code_point =
87
62.5k
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
88
62.5k
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
89
62.5k
      if (code_point <= 0xffff || 0x10ffff < code_point) {
90
80
        return false;
91
80
      }
92
62.5k
    } else {
93
      // we may have a continuation
94
855
      return false;
95
855
    }
96
1.05M
    pos = next_pos;
97
1.05M
  }
98
597
  return true;
99
3.44k
}
100
#endif
101
102
inline simdutf_warn_unused result validate_with_errors(const char *buf,
103
6.77k
                                                       size_t len) noexcept {
104
6.77k
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
105
6.77k
  size_t pos = 0;
106
6.77k
  uint32_t code_point = 0;
107
5.93M
  while (pos < len) {
108
    // check of the next 16 bytes are ascii.
109
5.93M
    size_t next_pos = pos + 16;
110
5.93M
    if (next_pos <=
111
5.93M
        len) { // if it is safe to read 16 more bytes, check that they are ascii
112
5.92M
      uint64_t v1;
113
5.92M
      std::memcpy(&v1, data + pos, sizeof(uint64_t));
114
5.92M
      uint64_t v2;
115
5.92M
      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
116
5.92M
      uint64_t v{v1 | v2};
117
5.92M
      if ((v & 0x8080808080808080) == 0) {
118
4.86M
        pos = next_pos;
119
4.86M
        continue;
120
4.86M
      }
121
5.92M
    }
122
1.06M
    unsigned char byte = data[pos];
123
124
6.09M
    while (byte < 0b10000000) {
125
5.02M
      if (++pos == len) {
126
1.04k
        return result(error_code::SUCCESS, len);
127
1.04k
      }
128
5.02M
      byte = data[pos];
129
5.02M
    }
130
131
1.06M
    if ((byte & 0b11100000) == 0b11000000) {
132
681k
      next_pos = pos + 2;
133
681k
      if (next_pos > len) {
134
171
        return result(error_code::TOO_SHORT, pos);
135
171
      }
136
681k
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
137
750
        return result(error_code::TOO_SHORT, pos);
138
750
      }
139
      // range check
140
680k
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
141
680k
      if ((code_point < 0x80) || (0x7ff < code_point)) {
142
114
        return result(error_code::OVERLONG, pos);
143
114
      }
144
680k
    } else if ((byte & 0b11110000) == 0b11100000) {
145
319k
      next_pos = pos + 3;
146
319k
      if (next_pos > len) {
147
174
        return result(error_code::TOO_SHORT, pos);
148
174
      }
149
319k
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
150
303
        return result(error_code::TOO_SHORT, pos);
151
303
      }
152
319k
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
153
162
        return result(error_code::TOO_SHORT, pos);
154
162
      }
155
      // range check
156
319k
      code_point = (byte & 0b00001111) << 12 |
157
319k
                   (data[pos + 1] & 0b00111111) << 6 |
158
319k
                   (data[pos + 2] & 0b00111111);
159
319k
      if ((code_point < 0x800) || (0xffff < code_point)) {
160
123
        return result(error_code::OVERLONG, pos);
161
123
      }
162
318k
      if (0xd7ff < code_point && code_point < 0xe000) {
163
66
        return result(error_code::SURROGATE, pos);
164
66
      }
165
318k
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
166
63.9k
      next_pos = pos + 4;
167
63.9k
      if (next_pos > len) {
168
147
        return result(error_code::TOO_SHORT, pos);
169
147
      }
170
63.7k
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
171
339
        return result(error_code::TOO_SHORT, pos);
172
339
      }
173
63.4k
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
174
129
        return result(error_code::TOO_SHORT, pos);
175
129
      }
176
63.3k
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
177
120
        return result(error_code::TOO_SHORT, pos);
178
120
      }
179
      // range check
180
63.2k
      code_point =
181
63.2k
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
182
63.2k
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
183
63.2k
      if (code_point <= 0xffff) {
184
135
        return result(error_code::OVERLONG, pos);
185
135
      }
186
63.0k
      if (0x10ffff < code_point) {
187
105
        return result(error_code::TOO_LARGE, pos);
188
105
      }
189
63.0k
    } else {
190
      // we either have too many continuation bytes or an invalid leading byte
191
2.29k
      if ((byte & 0b11000000) == 0b10000000) {
192
1.23k
        return result(error_code::TOO_LONG, pos);
193
1.23k
      } else {
194
1.05k
        return result(error_code::HEADER_BITS, pos);
195
1.05k
      }
196
2.29k
    }
197
1.06M
    pos = next_pos;
198
1.06M
  }
199
597
  return result(error_code::SUCCESS, len);
200
6.77k
}
201
202
// Finds the previous leading byte starting backward from buf and validates with
203
// errors from there Used to pinpoint the location of an error when an invalid
204
// chunk is detected We assume that the stream starts with a leading byte, and
205
// to check that it is the case, we ask that you pass a pointer to the start of
206
// the stream (start).
207
inline simdutf_warn_unused result rewind_and_validate_with_errors(
208
3.60k
    const char *start, const char *buf, size_t len) noexcept {
209
  // First check that we start with a leading byte
210
3.60k
  if ((*start & 0b11000000) == 0b10000000) {
211
274
    return result(error_code::TOO_LONG, 0);
212
274
  }
213
3.32k
  size_t extra_len{0};
214
  // A leading byte cannot be further than 4 bytes away
215
3.49k
  for (int i = 0; i < 5; i++) {
216
3.49k
    unsigned char byte = *buf;
217
3.49k
    if ((byte & 0b11000000) != 0b10000000) {
218
3.32k
      break;
219
3.32k
    } else {
220
170
      buf--;
221
170
      extra_len++;
222
170
    }
223
3.49k
  }
224
225
3.32k
  result res = validate_with_errors(buf, len + extra_len);
226
3.32k
  res.count -= extra_len;
227
3.32k
  return res;
228
3.60k
}
229
230
14.9k
inline size_t count_code_points(const char *buf, size_t len) {
231
14.9k
  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
232
14.9k
  size_t counter{0};
233
179M
  for (size_t i = 0; i < len; i++) {
234
    // -65 is 0b10111111, anything larger in two-complement's should start a new
235
    // code point.
236
179M
    if (p[i] > -65) {
237
176M
      counter++;
238
176M
    }
239
179M
  }
240
14.9k
  return counter;
241
14.9k
}
242
243
5.73k
inline size_t utf16_length_from_utf8(const char *buf, size_t len) {
244
5.73k
  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
245
5.73k
  size_t counter{0};
246
66.3M
  for (size_t i = 0; i < len; i++) {
247
66.3M
    if (p[i] > -65) {
248
65.0M
      counter++;
249
65.0M
    }
250
66.3M
    if (uint8_t(p[i]) >= 240) {
251
157k
      counter++;
252
157k
    }
253
66.3M
  }
254
5.73k
  return counter;
255
5.73k
}
256
257
simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
258
0
                                                    size_t length) {
259
0
  if (length < 3) {
260
0
    switch (length) {
261
0
    case 2:
262
0
      if (uint8_t(input[length - 1]) >= 0xc0) {
263
0
        return length - 1;
264
0
      } // 2-, 3- and 4-byte characters with only 1 byte left
265
0
      if (uint8_t(input[length - 2]) >= 0xe0) {
266
0
        return length - 2;
267
0
      } // 3- and 4-byte characters with only 2 bytes left
268
0
      return length;
269
0
    case 1:
270
0
      if (uint8_t(input[length - 1]) >= 0xc0) {
271
0
        return length - 1;
272
0
      } // 2-, 3- and 4-byte characters with only 1 byte left
273
0
      return length;
274
0
    case 0:
275
0
      return length;
276
0
    }
277
0
  }
278
0
  if (uint8_t(input[length - 1]) >= 0xc0) {
279
0
    return length - 1;
280
0
  } // 2-, 3- and 4-byte characters with only 1 byte left
281
0
  if (uint8_t(input[length - 2]) >= 0xe0) {
282
0
    return length - 2;
283
0
  } // 3- and 4-byte characters with only 1 byte left
284
0
  if (uint8_t(input[length - 3]) >= 0xf0) {
285
0
    return length - 3;
286
0
  } // 4-byte characters with only 3 bytes left
287
0
  return length;
288
0
}
289
290
} // namespace utf8
291
} // unnamed namespace
292
} // namespace scalar
293
} // namespace simdutf
294
295
#endif