/src/simdutf/src/scalar/utf8.h

Source
#ifndef SIMDUTF_UTF8_H
#define SIMDUTF_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8 {
#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV
// only used by the fallback kernel.
// credit: based on code from Google Fuchsia (Apache Licensed)
// Scalar UTF-8 validator.
//
// Returns true when the `len` bytes starting at `buf` form well-formed UTF-8:
// every multi-byte sequence is complete, uses the shortest possible encoding,
// does not encode a surrogate (U+D800..U+DFFF) and does not exceed U+10FFFF.
// Returns false at the first violation. An empty input is reported as valid.
inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  // True when the byte at `offset` has the continuation pattern 10xxxxxx.
  const auto is_continuation = [bytes](uint64_t offset) -> bool {
    return (bytes[offset] & 0xC0) == 0x80;
  };

  uint64_t cursor = 0;
  while (cursor < len) {
    // Fast path: when at least 16 bytes remain, load them as two 64-bit words
    // and skip the whole group if no byte has its high bit set (all ASCII).
    if (cursor + 16 <= len) {
      uint64_t low_word;
      uint64_t high_word;
      std::memcpy(&low_word, bytes + cursor, sizeof(low_word));
      std::memcpy(&high_word, bytes + cursor + sizeof(low_word),
                  sizeof(high_word));
      if (((low_word | high_word) & 0x8080808080808080) == 0) {
        cursor += 16;
        continue;
      }
    }

    // Slow path: step over ASCII bytes one at a time. Running off the end of
    // the input while doing so means everything was valid.
    uint8_t lead = bytes[cursor];
    while (lead < 0x80) {
      ++cursor;
      if (cursor == len) {
        return true;
      }
      lead = bytes[cursor];
    }

    // `lead` is now a non-ASCII byte; classify it by its header bits. In each
    // branch the length check comes first so that the continuation bytes are
    // only read when they are inside the buffer.
    uint64_t width;
    if ((lead & 0xE0) == 0xC0) { // 110xxxxx: two-byte sequence
      width = 2;
      if (cursor + width > len || !is_continuation(cursor + 1)) {
        return false;
      }
      const uint32_t cp = (lead & 0x1F) << 6 | (bytes[cursor + 1] & 0x3F);
      // Values below 0x80 would be an overlong encoding of ASCII.
      if (cp < 0x80 || cp > 0x7ff) {
        return false;
      }
    } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx: three-byte sequence
      width = 3;
      if (cursor + width > len || !is_continuation(cursor + 1) ||
          !is_continuation(cursor + 2)) {
        return false;
      }
      const uint32_t cp = (lead & 0x0F) << 12 |
                          (bytes[cursor + 1] & 0x3F) << 6 |
                          (bytes[cursor + 2] & 0x3F);
      // Reject overlong forms and the surrogate range U+D800..U+DFFF.
      if (cp < 0x800 || cp > 0xffff || (cp >= 0xd800 && cp <= 0xdfff)) {
        return false;
      }
    } else if ((lead & 0xF8) == 0xF0) { // 11110xxx: four-byte sequence
      width = 4;
      if (cursor + width > len || !is_continuation(cursor + 1) ||
          !is_continuation(cursor + 2) || !is_continuation(cursor + 3)) {
        return false;
      }
      const uint32_t cp =
          (lead & 0x07) << 18 | (bytes[cursor + 1] & 0x3F) << 12 |
          (bytes[cursor + 2] & 0x3F) << 6 | (bytes[cursor + 3] & 0x3F);
      // Must lie in U+10000..U+10FFFF: anything lower is overlong, anything
      // higher is outside the Unicode range.
      if (cp < 0x10000 || cp > 0x10ffff) {
        return false;
      }
    } else {
      // Either a stray continuation byte (10xxxxxx) or an invalid lead byte
      // (11111xxx).
      return false;
    }
    cursor += width;
  }
  return true;
}
#endif

// Validates the `len` bytes at `buf` as UTF-8 and reports the first problem.
//
// On success returns result(error_code::SUCCESS, len). On failure returns a
// result built from an error code and the index of the byte where the
// offending sequence starts:
//   TOO_SHORT   - the sequence is cut off by the end of the input, or one of
//                 its expected continuation bytes is missing
//   OVERLONG    - the value could have been encoded in fewer bytes
//   SURROGATE   - a three-byte sequence encodes U+D800..U+DFFF
//   TOO_LARGE   - a four-byte sequence encodes a value above U+10FFFF
//   TOO_LONG    - a continuation byte appears where a lead byte is expected
//   HEADER_BITS - the byte is not ASCII, not a continuation byte and not a
//                 valid two-, three- or four-byte lead (i.e. 11111xxx)
inline simdutf_warn_unused result validate_with_errors(const char *buf,
                                                       size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  // True when the byte at `offset` has the continuation pattern 10xxxxxx.
  const auto is_continuation = [bytes](size_t offset) -> bool {
    return (bytes[offset] & 0xC0) == 0x80;
  };

  size_t cursor = 0;
  while (cursor < len) {
    // Fast path: when at least 16 bytes remain, load them as two 64-bit words
    // and skip the whole group if no byte has its high bit set (all ASCII).
    if (cursor + 16 <= len) {
      uint64_t low_word;
      uint64_t high_word;
      std::memcpy(&low_word, bytes + cursor, sizeof(low_word));
      std::memcpy(&high_word, bytes + cursor + sizeof(low_word),
                  sizeof(high_word));
      if (((low_word | high_word) & 0x8080808080808080) == 0) {
        cursor += 16;
        continue;
      }
    }

    // Slow path: step over ASCII bytes one at a time. Reaching the end of the
    // input here means no error was found.
    uint8_t lead = bytes[cursor];
    while (lead < 0x80) {
      ++cursor;
      if (cursor == len) {
        return result(error_code::SUCCESS, len);
      }
      lead = bytes[cursor];
    }

    // `lead` is a non-ASCII byte. The length check is evaluated before any
    // continuation byte is read, so reads never go past `len`.
    size_t width;
    if ((lead & 0xE0) == 0xC0) { // 110xxxxx: two-byte sequence
      width = 2;
      if (cursor + width > len || !is_continuation(cursor + 1)) {
        return result(error_code::TOO_SHORT, cursor);
      }
      const uint32_t cp = (lead & 0x1F) << 6 | (bytes[cursor + 1] & 0x3F);
      if (cp < 0x80 || cp > 0x7ff) {
        return result(error_code::OVERLONG, cursor);
      }
    } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx: three-byte sequence
      width = 3;
      if (cursor + width > len || !is_continuation(cursor + 1) ||
          !is_continuation(cursor + 2)) {
        return result(error_code::TOO_SHORT, cursor);
      }
      const uint32_t cp = (lead & 0x0F) << 12 |
                          (bytes[cursor + 1] & 0x3F) << 6 |
                          (bytes[cursor + 2] & 0x3F);
      if (cp < 0x800 || cp > 0xffff) {
        return result(error_code::OVERLONG, cursor);
      }
      if (cp >= 0xd800 && cp <= 0xdfff) {
        return result(error_code::SURROGATE, cursor);
      }
    } else if ((lead & 0xF8) == 0xF0) { // 11110xxx: four-byte sequence
      width = 4;
      if (cursor + width > len || !is_continuation(cursor + 1) ||
          !is_continuation(cursor + 2) || !is_continuation(cursor + 3)) {
        return result(error_code::TOO_SHORT, cursor);
      }
      const uint32_t cp =
          (lead & 0x07) << 18 | (bytes[cursor + 1] & 0x3F) << 12 |
          (bytes[cursor + 2] & 0x3F) << 6 | (bytes[cursor + 3] & 0x3F);
      if (cp < 0x10000) {
        return result(error_code::OVERLONG, cursor);
      }
      if (cp > 0x10ffff) {
        return result(error_code::TOO_LARGE, cursor);
      }
    } else if ((lead & 0xC0) == 0x80) {
      // A continuation byte where a lead byte was expected.
      return result(error_code::TOO_LONG, cursor);
    } else {
      // Not a recognised lead byte pattern.
      return result(error_code::HEADER_BITS, cursor);
    }
    cursor += width;
  }
  return result(error_code::SUCCESS, len);
}

// Steps backward from `buf` to the closest non-continuation byte and runs
// validate_with_errors() from there. Used to pinpoint the location of an error
// once an invalid chunk has been detected.
//
// `start` must point to the first byte of the stream. The stream is expected
// to begin with a leading byte; if `*start` is a continuation byte the
// function returns result(error_code::TOO_LONG, 0) without looking at `buf`.
//
// At most five bytes are stepped over. `len` is extended by the number of
// bytes rewound, and the same amount is subtracted from the `count` field of
// the returned result (presumably so the count stays relative to the original
// `buf` - confirm against callers).
inline simdutf_warn_unused result rewind_and_validate_with_errors(
    const char *start, const char *buf, size_t len) noexcept {
  // The stream itself must not open with a continuation byte (10xxxxxx).
  if ((*start & 0xC0) == 0x80) {
    return result(error_code::TOO_LONG, 0);
  }

  // Walk back while the current byte is a continuation byte, giving up after
  // five steps. The step limit is tested first, so no byte is read once the
  // limit has been reached.
  size_t rewound{0};
  while (rewound < 5 && (static_cast<unsigned char>(*buf) & 0xC0) == 0x80) {
    --buf;
    ++rewound;
  }

  result outcome = validate_with_errors(buf, len + rewound);
  outcome.count -= rewound;
  return outcome;
}

// Counts the bytes in buf[0, len) that are not UTF-8 continuation bytes.
// For well-formed UTF-8 this equals the number of code points, since each
// code point contributes exactly one non-continuation (leading) byte.
// No validation is performed.
inline size_t count_code_points(const char *buf, size_t len) {
  const int8_t *cursor = reinterpret_cast<const int8_t *>(buf);
  const int8_t *const stop = cursor + len;
  size_t total{0};
  while (cursor != stop) {
    // Viewed as signed bytes, continuation bytes 0x80..0xBF are -128..-65;
    // every other byte value is greater than -65 and starts a code point.
    total += (*cursor > -65) ? 1 : 0;
    ++cursor;
  }
  return total;
}

// Computes how many UTF-16 code units are needed to represent the UTF-8 text
// in buf[0, len). Every non-continuation byte counts as one unit, and every
// byte >= 0xF0 (the lead byte of a four-byte sequence, which becomes a
// surrogate pair) counts as one more. No validation is performed.
inline size_t utf16_length_from_utf8(const char *buf, size_t len) {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  size_t units{0};
  for (size_t idx = 0; idx != len; ++idx) {
    const uint8_t current = bytes[idx];
    // Anything other than 10xxxxxx starts a new code point.
    const bool starts_code_point = (current & 0xC0) != 0x80;
    // Lead byte of a four-byte sequence: needs a second UTF-16 unit.
    const bool needs_surrogate_pair = current >= 0xF0;
    units += (starts_code_point ? 1 : 0) + (needs_surrogate_pair ? 1 : 0);
  }
  return units;
}

// Returns the length of `input` after dropping a multi-byte sequence that has
// been cut off at the end of the buffer, judged only from the last (up to)
// three bytes:
//   - last byte >= 0xC0: a 2-, 3- or 4-byte lead with nothing after it
//     -> length - 1
//   - second-to-last byte >= 0xE0: a 3- or 4-byte lead followed by one byte
//     -> length - 2
//   - third-to-last byte >= 0xF0: a 4-byte lead followed by two bytes
//     -> length - 3
// Otherwise `length` is returned unchanged. The checks are applied in that
// order and the first match wins. The bytes are not validated.
simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
                                                    size_t length) {
  // min_lead[k] is the smallest lead-byte value that still requires more than
  // k + 1 bytes in total, i.e. is incomplete when it sits k + 1 from the end.
  const uint8_t min_lead[3] = {0xc0, 0xe0, 0xf0};
  // Never look further back than the buffer actually extends.
  const size_t max_back = length < 3 ? length : 3;
  for (size_t back = 1; back <= max_back; ++back) {
    if (uint8_t(input[length - back]) >= min_lead[back - 1]) {
      return length - back;
    }
  }
  return length;
}

} // namespace utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif

Coverage Report

Created: 2025-11-11 06:42

Line	Count	Source
1		#ifndef SIMDUTF_UTF8_H
2		#define SIMDUTF_UTF8_H
3
4		namespace simdutf {
5		namespace scalar {
6		namespace {
7		namespace utf8 {
8		#if SIMDUTF_IMPLEMENTATION_FALLBACK \|\| SIMDUTF_IMPLEMENTATION_RVV
9		// only used by the fallback kernel.
10		// credit: based on code from Google Fuchsia (Apache Licensed)
11	3.44k	inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
12	3.44k	const uint8_t data = reinterpret_cast<const uint8_t >(buf);
13	3.44k	uint64_t pos = 0;
14	3.44k	uint32_t code_point = 0;
15	5.92M	while (pos < len) {
16		// check of the next 16 bytes are ascii.
17	5.92M	uint64_t next_pos = pos + 16;
18	5.92M	if (next_pos <=
19	5.92M	len) { // if it is safe to read 16 more bytes, check that they are ascii
20	5.91M	uint64_t v1;
21	5.91M	std::memcpy(&v1, data + pos, sizeof(uint64_t));
22	5.91M	uint64_t v2;
23	5.91M	std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
24	5.91M	uint64_t v{v1 \| v2};
25	5.91M	if ((v & 0x8080808080808080) == 0) {
26	4.86M	pos = next_pos;
27	4.86M	continue;
28	4.86M	}
29	5.91M	}
30	1.06M	unsigned char byte = data[pos];
31
32	6.07M	while (byte < 0b10000000) {
33	5.01M	if (++pos == len) {
34	1.04k	return true;
35	1.04k	}
36	5.01M	byte = data[pos];
37	5.01M	}
38
39	1.06M	if ((byte & 0b11100000) == 0b11000000) {
40	679k	next_pos = pos + 2;
41	679k	if (next_pos > len) {
42	57	return false;
43	57	}
44	679k	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
45	250	return false;
46	250	}
47		// range check
48	678k	code_point = (byte & 0b00011111) << 6 \| (data[pos + 1] & 0b00111111);
49	678k	if ((code_point < 0x80) \|\| (0x7ff < code_point)) {
50	38	return false;
51	38	}
52	678k	} else if ((byte & 0b11110000) == 0b11100000) {
53	318k	next_pos = pos + 3;
54	318k	if (next_pos > len) {
55	58	return false;
56	58	}
57	317k	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
58	101	return false;
59	101	}
60	317k	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
61	54	return false;
62	54	}
63		// range check
64	317k	code_point = (byte & 0b00001111) << 12 \|
65	317k	(data[pos + 1] & 0b00111111) << 6 \|
66	317k	(data[pos + 2] & 0b00111111);
67	317k	if ((code_point < 0x800) \|\| (0xffff < code_point) \|\|
68	317k	(0xd7ff < code_point && code_point < 0xe000)) {
69	63	return false;
70	63	}
71	317k	} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
72	62.7k	next_pos = pos + 4;
73	62.7k	if (next_pos > len) {
74	49	return false;
75	49	}
76	62.7k	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
77	113	return false;
78	113	}
79	62.6k	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
80	43	return false;
81	43	}
82	62.5k	if ((data[pos + 3] & 0b11000000) != 0b10000000) {
83	40	return false;
84	40	}
85		// range check
86	62.5k	code_point =
87	62.5k	(byte & 0b00000111) << 18 \| (data[pos + 1] & 0b00111111) << 12 \|
88	62.5k	(data[pos + 2] & 0b00111111) << 6 \| (data[pos + 3] & 0b00111111);
89	62.5k	if (code_point <= 0xffff \|\| 0x10ffff < code_point) {
90	80	return false;
91	80	}
92	62.5k	} else {
93		// we may have a continuation
94	855	return false;
95	855	}
96	1.05M	pos = next_pos;
97	1.05M	}
98	597	return true;
99	3.44k	}
100		#endif
101
102		inline simdutf_warn_unused result validate_with_errors(const char *buf,
103	6.77k	size_t len) noexcept {
104	6.77k	const uint8_t data = reinterpret_cast<const uint8_t >(buf);
105	6.77k	size_t pos = 0;
106	6.77k	uint32_t code_point = 0;
107	5.93M	while (pos < len) {
108		// check of the next 16 bytes are ascii.
109	5.93M	size_t next_pos = pos + 16;
110	5.93M	if (next_pos <=
111	5.93M	len) { // if it is safe to read 16 more bytes, check that they are ascii
112	5.92M	uint64_t v1;
113	5.92M	std::memcpy(&v1, data + pos, sizeof(uint64_t));
114	5.92M	uint64_t v2;
115	5.92M	std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
116	5.92M	uint64_t v{v1 \| v2};
117	5.92M	if ((v & 0x8080808080808080) == 0) {
118	4.86M	pos = next_pos;
119	4.86M	continue;
120	4.86M	}
121	5.92M	}
122	1.06M	unsigned char byte = data[pos];
123
124	6.09M	while (byte < 0b10000000) {
125	5.02M	if (++pos == len) {
126	1.04k	return result(error_code::SUCCESS, len);
127	1.04k	}
128	5.02M	byte = data[pos];
129	5.02M	}
130
131	1.06M	if ((byte & 0b11100000) == 0b11000000) {
132	681k	next_pos = pos + 2;
133	681k	if (next_pos > len) {
134	171	return result(error_code::TOO_SHORT, pos);
135	171	}
136	681k	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
137	750	return result(error_code::TOO_SHORT, pos);
138	750	}
139		// range check
140	680k	code_point = (byte & 0b00011111) << 6 \| (data[pos + 1] & 0b00111111);
141	680k	if ((code_point < 0x80) \|\| (0x7ff < code_point)) {
142	114	return result(error_code::OVERLONG, pos);
143	114	}
144	680k	} else if ((byte & 0b11110000) == 0b11100000) {
145	319k	next_pos = pos + 3;
146	319k	if (next_pos > len) {
147	174	return result(error_code::TOO_SHORT, pos);
148	174	}
149	319k	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
150	303	return result(error_code::TOO_SHORT, pos);
151	303	}
152	319k	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
153	162	return result(error_code::TOO_SHORT, pos);
154	162	}
155		// range check
156	319k	code_point = (byte & 0b00001111) << 12 \|
157	319k	(data[pos + 1] & 0b00111111) << 6 \|
158	319k	(data[pos + 2] & 0b00111111);
159	319k	if ((code_point < 0x800) \|\| (0xffff < code_point)) {
160	123	return result(error_code::OVERLONG, pos);
161	123	}
162	318k	if (0xd7ff < code_point && code_point < 0xe000) {
163	66	return result(error_code::SURROGATE, pos);
164	66	}
165	318k	} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
166	63.9k	next_pos = pos + 4;
167	63.9k	if (next_pos > len) {
168	147	return result(error_code::TOO_SHORT, pos);
169	147	}
170	63.7k	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
171	339	return result(error_code::TOO_SHORT, pos);
172	339	}
173	63.4k	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
174	129	return result(error_code::TOO_SHORT, pos);
175	129	}
176	63.3k	if ((data[pos + 3] & 0b11000000) != 0b10000000) {
177	120	return result(error_code::TOO_SHORT, pos);
178	120	}
179		// range check
180	63.2k	code_point =
181	63.2k	(byte & 0b00000111) << 18 \| (data[pos + 1] & 0b00111111) << 12 \|
182	63.2k	(data[pos + 2] & 0b00111111) << 6 \| (data[pos + 3] & 0b00111111);
183	63.2k	if (code_point <= 0xffff) {
184	135	return result(error_code::OVERLONG, pos);
185	135	}
186	63.0k	if (0x10ffff < code_point) {
187	105	return result(error_code::TOO_LARGE, pos);
188	105	}
189	63.0k	} else {
190		// we either have too many continuation bytes or an invalid leading byte
191	2.29k	if ((byte & 0b11000000) == 0b10000000) {
192	1.23k	return result(error_code::TOO_LONG, pos);
193	1.23k	} else {
194	1.05k	return result(error_code::HEADER_BITS, pos);
195	1.05k	}
196	2.29k	}
197	1.06M	pos = next_pos;
198	1.06M	}
199	597	return result(error_code::SUCCESS, len);
200	6.77k	}
201
202		// Finds the previous leading byte starting backward from buf and validates with
203		// errors from there Used to pinpoint the location of an error when an invalid
204		// chunk is detected We assume that the stream starts with a leading byte, and
205		// to check that it is the case, we ask that you pass a pointer to the start of
206		// the stream (start).
207		inline simdutf_warn_unused result rewind_and_validate_with_errors(
208	3.60k	const char start, const char buf, size_t len) noexcept {
209		// First check that we start with a leading byte
210	3.60k	if ((*start & 0b11000000) == 0b10000000) {
211	274	return result(error_code::TOO_LONG, 0);
212	274	}
213	3.32k	size_t extra_len{0};
214		// A leading byte cannot be further than 4 bytes away
215	3.49k	for (int i = 0; i < 5; i++) {
216	3.49k	unsigned char byte = *buf;
217	3.49k	if ((byte & 0b11000000) != 0b10000000) {
218	3.32k	break;
219	3.32k	} else {
220	170	buf--;
221	170	extra_len++;
222	170	}
223	3.49k	}
224
225	3.32k	result res = validate_with_errors(buf, len + extra_len);
226	3.32k	res.count -= extra_len;
227	3.32k	return res;
228	3.60k	}
229
230	14.9k	inline size_t count_code_points(const char *buf, size_t len) {
231	14.9k	const int8_t p = reinterpret_cast<const int8_t >(buf);
232	14.9k	size_t counter{0};
233	179M	for (size_t i = 0; i < len; i++) {
234		// -65 is 0b10111111, anything larger in two-complement's should start a new
235		// code point.
236	179M	if (p[i] > -65) {
237	176M	counter++;
238	176M	}
239	179M	}
240	14.9k	return counter;
241	14.9k	}
242
243	5.73k	inline size_t utf16_length_from_utf8(const char *buf, size_t len) {
244	5.73k	const int8_t p = reinterpret_cast<const int8_t >(buf);
245	5.73k	size_t counter{0};
246	66.3M	for (size_t i = 0; i < len; i++) {
247	66.3M	if (p[i] > -65) {
248	65.0M	counter++;
249	65.0M	}
250	66.3M	if (uint8_t(p[i]) >= 240) {
251	157k	counter++;
252	157k	}
253	66.3M	}
254	5.73k	return counter;
255	5.73k	}
256
257		simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
258	0	size_t length) {
259	0	if (length < 3) {
260	0	switch (length) {
261	0	case 2:
262	0	if (uint8_t(input[length - 1]) >= 0xc0) {
263	0	return length - 1;
264	0	} // 2-, 3- and 4-byte characters with only 1 byte left
265	0	if (uint8_t(input[length - 2]) >= 0xe0) {
266	0	return length - 2;
267	0	} // 3- and 4-byte characters with only 2 bytes left
268	0	return length;
269	0	case 1:
270	0	if (uint8_t(input[length - 1]) >= 0xc0) {
271	0	return length - 1;
272	0	} // 2-, 3- and 4-byte characters with only 1 byte left
273	0	return length;
274	0	case 0:
275	0	return length;
276	0	}
277	0	}
278	0	if (uint8_t(input[length - 1]) >= 0xc0) {
279	0	return length - 1;
280	0	} // 2-, 3- and 4-byte characters with only 1 byte left
281	0	if (uint8_t(input[length - 2]) >= 0xe0) {
282	0	return length - 2;
283	0	} // 3- and 4-byte characters with only 1 byte left
284	0	if (uint8_t(input[length - 3]) >= 0xf0) {
285	0	return length - 3;
286	0	} // 4-byte characters with only 3 bytes left
287	0	return length;
288	0	}
289
290		} // namespace utf8
291		} // unnamed namespace
292		} // namespace scalar
293		} // namespace simdutf
294
295		#endif