/work/include/simdutf/scalar/utf8.h

Source
#ifndef SIMDUTF_UTF8_H
#define SIMDUTF_UTF8_H

namespace simdutf {
namespace scalar {
namespace {
namespace utf8 {

// credit: based on code from Google Fuchsia (Apache Licensed)
// Returns true when the `len` bytes reachable through `data` form well-formed
// UTF-8, false otherwise.
//
// A sequence is rejected when it is truncated, when an expected continuation
// byte (0b10xxxxxx) is missing, when the decoded value is overlong for its
// length, lies in the surrogate range 0xD800..0xDFFF, exceeds 0x10FFFF, or when
// the leading byte is not a valid header.
template <class BytePtr>
simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data,
                                                      size_t len) noexcept {
  static_assert(
      std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
      "dereferencing the data pointer must result in a uint8_t");
  uint64_t cursor = 0;
  while (cursor < len) {
#if SIMDUTF_CPLUSPLUS23
    if !consteval
#endif
    {
      // Fast path: when 16 bytes are available, load them as two 64-bit words
      // and skip the whole chunk if no byte has its high bit set. The block is
      // compiled out of constant evaluation because it relies on std::memcpy.
      if (cursor + 16 <= len) {
        uint64_t low_word{};
        uint64_t high_word{};
        std::memcpy(&low_word, data + cursor, sizeof(uint64_t));
        std::memcpy(&high_word, data + cursor + sizeof(uint64_t),
                    sizeof(uint64_t));
        if (((low_word | high_word) & 0x8080808080808080) == 0) {
          cursor += 16;
          continue;
        }
      }
    }

    unsigned char lead = data[cursor];

    // Consume ASCII bytes one at a time; reaching the end here means success.
    while (lead < 0x80) {
      ++cursor;
      if (cursor == len) {
        return true;
      }
      lead = data[cursor];
    }

    // Classify the leading byte: sequence width, payload bits carried by the
    // leading byte, and the smallest code point that needs this width.
    uint64_t width = 0;
    uint32_t code_point = 0;
    uint32_t smallest = 0;
    if ((lead & 0xE0) == 0xC0) {
      width = 2;
      code_point = lead & 0x1Fu;
      smallest = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
      width = 3;
      code_point = lead & 0x0Fu;
      smallest = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
      width = 4;
      code_point = lead & 0x07u;
      smallest = 0x10000;
    } else {
      // stray continuation byte or invalid header
      return false;
    }

    // The full sequence must fit in the input before any trailing byte is
    // read.
    if (cursor + width > len) {
      return false;
    }
    for (uint64_t k = 1; k < width; ++k) {
      const unsigned char trail = data[cursor + k];
      if ((trail & 0xC0) != 0x80) {
        return false;
      }
      code_point = (code_point << 6) | (trail & 0x3Fu);
    }

    // range check: overlong, beyond U+10FFFF, or surrogate
    const bool overlong = code_point < smallest;
    const bool too_large = 0x10ffff < code_point;
    const bool surrogate = 0xd7ff < code_point && code_point < 0xe000;
    if (overlong || too_large || surrogate) {
      return false;
    }
    cursor += width;
  }
  return true;
}

// Overload for `char` buffers: reinterprets the buffer as uint8_t and forwards
// to the pointer-generic validate above.
simdutf_really_inline simdutf_warn_unused bool validate(const char *buf,
                                                        size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return validate(bytes, len);
}

// Checks that the `len` bytes reachable through `data` form well-formed UTF-8
// and reports the first problem found.
//
// Returns result(error_code::SUCCESS, len) when the whole input is valid.
// Otherwise returns result(code, pos), where `pos` is the index of the leading
// byte of the offending sequence (or of the offending byte itself) and `code`
// is:
//   TOO_SHORT   - the sequence runs past the end of the input, or one of its
//                 expected continuation bytes (0b10xxxxxx) is missing;
//   OVERLONG    - the decoded value is too small for the sequence length;
//   SURROGATE   - a 3-byte sequence decodes into 0xD800..0xDFFF;
//   TOO_LARGE   - a 4-byte sequence decodes above 0x10FFFF;
//   TOO_LONG    - a continuation byte appears where a leading byte is expected;
//   HEADER_BITS - the byte is neither a valid leading byte nor a continuation.
template <class BytePtr>
simdutf_constexpr23 simdutf_warn_unused result
validate_with_errors(BytePtr data, size_t len) noexcept {
  static_assert(
      std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
      "dereferencing the data pointer must result in a uint8_t");
  size_t pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    size_t next_pos = pos + 16;
    // The 16-byte ASCII fast path uses std::memcpy, which cannot be evaluated
    // in a constant expression. Skip it during constant evaluation (same guard
    // as validate) so that the byte-by-byte path below handles everything.
#if SIMDUTF_CPLUSPLUS23
    if !consteval
#endif
    { // check if the next 16 bytes are ascii.
      if (next_pos <= len) { // if it is safe to read 16 more bytes, check
                             // that they are ascii
        uint64_t v1{};
        std::memcpy(&v1, data + pos, sizeof(uint64_t));
        uint64_t v2{};
        std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
        uint64_t v{v1 | v2};
        if ((v & 0x8080808080808080) == 0) {
          pos = next_pos;
          continue;
        }
      }
    }
    unsigned char byte = data[pos];

    // Skip ASCII bytes one at a time; hitting the end here means success.
    while (byte < 0b10000000) {
      if (++pos == len) {
        return result(error_code::SUCCESS, len);
      }
      byte = data[pos];
    }

    if ((byte & 0b11100000) == 0b11000000) {
      // 2-byte sequence: 110xxxxx 10xxxxxx
      next_pos = pos + 2;
      if (next_pos > len) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if ((code_point < 0x80) || (0x7ff < code_point)) {
        return result(error_code::OVERLONG, pos);
      }
    } else if ((byte & 0b11110000) == 0b11100000) {
      // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
      next_pos = pos + 3;
      if (next_pos > len) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if ((code_point < 0x800) || (0xffff < code_point)) {
        return result(error_code::OVERLONG, pos);
      }
      if (0xd7ff < code_point && code_point < 0xe000) {
        return result(error_code::SURROGATE, pos);
      }
    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
      // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      next_pos = pos + 4;
      if (next_pos > len) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return result(error_code::TOO_SHORT, pos);
      }
      // range check
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff) {
        return result(error_code::OVERLONG, pos);
      }
      if (0x10ffff < code_point) {
        return result(error_code::TOO_LARGE, pos);
      }
    } else {
      // we either have too many continuation bytes or an invalid leading byte
      if ((byte & 0b11000000) == 0b10000000) {
        return result(error_code::TOO_LONG, pos);
      } else {
        return result(error_code::HEADER_BITS, pos);
      }
    }
    pos = next_pos;
  }
  return result(error_code::SUCCESS, len);
}

// Overload for `char` buffers: reinterprets the buffer as uint8_t and forwards
// to the pointer-generic validate_with_errors above.
simdutf_really_inline simdutf_warn_unused result
validate_with_errors(const char *buf, size_t len) noexcept {
  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(buf);
  return validate_with_errors(bytes, len);
}

// Moves `buf` backward to the closest preceding non-continuation byte and runs
// validate_with_errors from there. Used to pinpoint the location of an error
// once an invalid chunk has been detected.
//
// `start` must point to the beginning of the stream. The stream is expected to
// begin with a leading byte; if `*start` is a continuation byte the function
// returns result(error_code::TOO_LONG, 0) without validating anything.
//
// The returned count has the rewind distance subtracted, so it is expressed
// relative to the `buf` that was passed in. An error located before that `buf`
// therefore makes the subtraction go below zero (it wraps if `count` is an
// unsigned type -- the type is not visible here).
inline simdutf_warn_unused result rewind_and_validate_with_errors(
    const char *start, const char *buf, size_t len) noexcept {
  // The stream itself must not open with a continuation byte (0b10xxxxxx).
  const bool stream_opens_with_continuation = (*start & 0xC0) == 0x80;
  if (stream_opens_with_continuation) {
    return result(error_code::TOO_LONG, 0);
  }

  // Step back over continuation bytes, at most five times; a leading byte
  // should be found well within that distance.
  size_t rewound{0};
  while (rewound < 5 && (static_cast<unsigned char>(*buf) & 0xC0) == 0x80) {
    --buf;
    ++rewound;
  }

  // Validate the original range plus the bytes we stepped back over, then
  // shift the reported position back to the caller's frame of reference.
  result outcome = validate_with_errors(buf, len + rewound);
  outcome.count -= rewound;
  return outcome;
}

// Counts the code points in `data[0, len)` by counting every byte that is not
// a UTF-8 continuation byte (0b10xxxxxx). The input is not validated.
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) {
  size_t total{0};
  size_t idx{0};
  while (idx < len) {
    // A byte whose top two bits are `10` continues the previous code point;
    // anything else starts a new one.
    const bool is_continuation = (uint8_t(data[idx]) & 0xC0) == 0x80;
    if (!is_continuation) {
      ++total;
    }
    ++idx;
  }
  return total;
}

// Computes how many UTF-16 code units are needed for the UTF-8 input
// `data[0, len)`: one unit per byte that is not a continuation byte, plus one
// extra unit for every byte >= 0xF0 (the leading byte of a 4-byte sequence,
// which becomes a surrogate pair). The input is not validated.
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) {
  size_t units{0};
  for (size_t idx = 0; idx != len; ++idx) {
    const uint8_t octet = uint8_t(data[idx]);
    // top two bits other than `10` => this byte begins a code point
    const bool starts_code_point = (octet & 0xC0) != 0x80;
    // 0xF0 and above => needs a second UTF-16 unit
    const bool needs_second_unit = octet >= 0xF0;
    units += (starts_code_point ? 1 : 0) + (needs_second_unit ? 1 : 0);
  }
  return units;
}

// Returns the length of the longest prefix of `input[0, length)` that does not
// end in the middle of a multi-byte UTF-8 sequence. Only the last three bytes
// are inspected, and only as far back as `length` allows; the input is not
// otherwise validated.
template <typename InputPtr>
#if SIMDUTF_CPLUSPLUS20
  requires simdutf::detail::indexes_into_byte_like<InputPtr>
#endif
simdutf_warn_unused simdutf_constexpr23 size_t
trim_partial_utf8(InputPtr input, size_t length) {
  // Last byte is a leading byte: a 2-, 3- or 4-byte character with only its
  // first byte present.
  if (length >= 1 && uint8_t(input[length - 1]) >= 0xc0) {
    return length - 1;
  }
  // Second-to-last byte leads a 3- or 4-byte character with only 2 bytes
  // present.
  if (length >= 2 && uint8_t(input[length - 2]) >= 0xe0) {
    return length - 2;
  }
  // Third-to-last byte leads a 4-byte character with only 3 bytes present.
  if (length >= 3 && uint8_t(input[length - 3]) >= 0xf0) {
    return length - 3;
  }
  // Nothing is cut off at the end (this also covers length == 0).
  return length;
}

} // namespace utf8
} // unnamed namespace
} // namespace scalar
} // namespace simdutf

#endif

Coverage Report

Created: 2026-01-10 06:41

Line	Count	Source
1		#ifndef SIMDUTF_UTF8_H
2		#define SIMDUTF_UTF8_H
3
4		namespace simdutf {
5		namespace scalar {
6		namespace {
7		namespace utf8 {
8
9		// credit: based on code from Google Fuchsia (Apache Licensed)
10		template <class BytePtr>
11		simdutf_constexpr23 simdutf_warn_unused bool validate(BytePtr data,
12	0	size_t len) noexcept {
13	0	static_assert(
14	0	std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
15	0	"dereferencing the data pointer must result in a uint8_t");
16	0	uint64_t pos = 0;
17	0	uint32_t code_point = 0;
18	0	while (pos < len) {
19	0	uint64_t next_pos;
20	0	#if SIMDUTF_CPLUSPLUS23
21	0	if !consteval
22	0	#endif
23	0	{ // check if the next 16 bytes are ascii.
24	0	next_pos = pos + 16;
25	0	if (next_pos <= len) { // if it is safe to read 16 more bytes, check
26	0	// that they are ascii
27	0	uint64_t v1{};
28	0	std::memcpy(&v1, data + pos, sizeof(uint64_t));
29	0	uint64_t v2{};
30	0	std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
31	0	uint64_t v{v1 \| v2};
32	0	if ((v & 0x8080808080808080) == 0) {
33	0	pos = next_pos;
34	0	continue;
35	0	}
36	0	}
37	0	}
38	0
39	0	unsigned char byte = data[pos];
40	0
41	0	while (byte < 0b10000000) {
42	0	if (++pos == len) {
43	0	return true;
44	0	}
45	0	byte = data[pos];
46	0	}
47	0
48	0	if ((byte & 0b11100000) == 0b11000000) {
49	0	next_pos = pos + 2;
50	0	if (next_pos > len) {
51	0	return false;
52	0	}
53	0	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
54	0	return false;
55	0	}
56	0	// range check
57	0	code_point = (byte & 0b00011111) << 6 \| (data[pos + 1] & 0b00111111);
58	0	if ((code_point < 0x80) \|\| (0x7ff < code_point)) {
59	0	return false;
60	0	}
61	0	} else if ((byte & 0b11110000) == 0b11100000) {
62	0	next_pos = pos + 3;
63	0	if (next_pos > len) {
64	0	return false;
65	0	}
66	0	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
67	0	return false;
68	0	}
69	0	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
70	0	return false;
71	0	}
72	0	// range check
73	0	code_point = (byte & 0b00001111) << 12 \|
74	0	(data[pos + 1] & 0b00111111) << 6 \|
75	0	(data[pos + 2] & 0b00111111);
76	0	if ((code_point < 0x800) \|\| (0xffff < code_point) \|\|
77	0	(0xd7ff < code_point && code_point < 0xe000)) {
78	0	return false;
79	0	}
80	0	} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
81	0	next_pos = pos + 4;
82	0	if (next_pos > len) {
83	0	return false;
84	0	}
85	0	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
86	0	return false;
87	0	}
88	0	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
89	0	return false;
90	0	}
91	0	if ((data[pos + 3] & 0b11000000) != 0b10000000) {
92	0	return false;
93	0	}
94	0	// range check
95	0	code_point =
96	0	(byte & 0b00000111) << 18 \| (data[pos + 1] & 0b00111111) << 12 \|
97	0	(data[pos + 2] & 0b00111111) << 6 \| (data[pos + 3] & 0b00111111);
98	0	if (code_point <= 0xffff \|\| 0x10ffff < code_point) {
99	0	return false;
100	0	}
101	0	} else {
102	0	// we may have a continuation
103	0	return false;
104	0	}
105	0	pos = next_pos;
106	0	}
107	0	return true;
108	0	} Unexecuted instantiation: roundtrip.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const>(unsigned char const, unsigned long) Unexecuted instantiation: base64.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const>(unsigned char const, unsigned long) Unexecuted instantiation: misc.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const>(unsigned char const, unsigned long) Unexecuted instantiation: conversion.cpp:bool simdutf::scalar::(anonymous namespace)::utf8::validate<unsigned char const>(unsigned char const, unsigned long)
109
110		simdutf_really_inline simdutf_warn_unused bool validate(const char *buf,
111	0	size_t len) noexcept {
112	0	return validate(reinterpret_cast<const uint8_t *>(buf), len);
113	0	} Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate(char const, unsigned long)
114
115		template <class BytePtr>
116		simdutf_constexpr23 simdutf_warn_unused result
117	0	validate_with_errors(BytePtr data, size_t len) noexcept {
118	0	static_assert(
119	0	std::is_same<typename std::decay<decltype(*data)>::type, uint8_t>::value,
120	0	"dereferencing the data pointer must result in a uint8_t");
121	0	size_t pos = 0;
122	0	uint32_t code_point = 0;
123	0	while (pos < len) {
124	0	// check of the next 16 bytes are ascii.
125	0	size_t next_pos = pos + 16;
126	0	if (next_pos <=
127	0	len) { // if it is safe to read 16 more bytes, check that they are ascii
128	0	uint64_t v1;
129	0	std::memcpy(&v1, data + pos, sizeof(uint64_t));
130	0	uint64_t v2;
131	0	std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
132	0	uint64_t v{v1 \| v2};
133	0	if ((v & 0x8080808080808080) == 0) {
134	0	pos = next_pos;
135	0	continue;
136	0	}
137	0	}
138	0	unsigned char byte = data[pos];
139	0
140	0	while (byte < 0b10000000) {
141	0	if (++pos == len) {
142	0	return result(error_code::SUCCESS, len);
143	0	}
144	0	byte = data[pos];
145	0	}
146	0
147	0	if ((byte & 0b11100000) == 0b11000000) {
148	0	next_pos = pos + 2;
149	0	if (next_pos > len) {
150	0	return result(error_code::TOO_SHORT, pos);
151	0	}
152	0	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
153	0	return result(error_code::TOO_SHORT, pos);
154	0	}
155	0	// range check
156	0	code_point = (byte & 0b00011111) << 6 \| (data[pos + 1] & 0b00111111);
157	0	if ((code_point < 0x80) \|\| (0x7ff < code_point)) {
158	0	return result(error_code::OVERLONG, pos);
159	0	}
160	0	} else if ((byte & 0b11110000) == 0b11100000) {
161	0	next_pos = pos + 3;
162	0	if (next_pos > len) {
163	0	return result(error_code::TOO_SHORT, pos);
164	0	}
165	0	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
166	0	return result(error_code::TOO_SHORT, pos);
167	0	}
168	0	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
169	0	return result(error_code::TOO_SHORT, pos);
170	0	}
171	0	// range check
172	0	code_point = (byte & 0b00001111) << 12 \|
173	0	(data[pos + 1] & 0b00111111) << 6 \|
174	0	(data[pos + 2] & 0b00111111);
175	0	if ((code_point < 0x800) \|\| (0xffff < code_point)) {
176	0	return result(error_code::OVERLONG, pos);
177	0	}
178	0	if (0xd7ff < code_point && code_point < 0xe000) {
179	0	return result(error_code::SURROGATE, pos);
180	0	}
181	0	} else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
182	0	next_pos = pos + 4;
183	0	if (next_pos > len) {
184	0	return result(error_code::TOO_SHORT, pos);
185	0	}
186	0	if ((data[pos + 1] & 0b11000000) != 0b10000000) {
187	0	return result(error_code::TOO_SHORT, pos);
188	0	}
189	0	if ((data[pos + 2] & 0b11000000) != 0b10000000) {
190	0	return result(error_code::TOO_SHORT, pos);
191	0	}
192	0	if ((data[pos + 3] & 0b11000000) != 0b10000000) {
193	0	return result(error_code::TOO_SHORT, pos);
194	0	}
195	0	// range check
196	0	code_point =
197	0	(byte & 0b00000111) << 18 \| (data[pos + 1] & 0b00111111) << 12 \|
198	0	(data[pos + 2] & 0b00111111) << 6 \| (data[pos + 3] & 0b00111111);
199	0	if (code_point <= 0xffff) {
200	0	return result(error_code::OVERLONG, pos);
201	0	}
202	0	if (0x10ffff < code_point) {
203	0	return result(error_code::TOO_LARGE, pos);
204	0	}
205	0	} else {
206	0	// we either have too many continuation bytes or an invalid leading byte
207	0	if ((byte & 0b11000000) == 0b10000000) {
208	0	return result(error_code::TOO_LONG, pos);
209	0	} else {
210	0	return result(error_code::HEADER_BITS, pos);
211	0	}
212	0	}
213	0	pos = next_pos;
214	0	}
215	0	return result(error_code::SUCCESS, len);
216	0	} Unexecuted instantiation: roundtrip.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const>(unsigned char const, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const>(unsigned char const, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const>(unsigned char const, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::result simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors<unsigned char const>(unsigned char const, unsigned long)
217
218		simdutf_really_inline simdutf_warn_unused result
219	0	validate_with_errors(const char *buf, size_t len) noexcept {
220	0	return validate_with_errors(reinterpret_cast<const uint8_t *>(buf), len);
221	0	} Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::validate_with_errors(char const, unsigned long)
222
223		// Finds the previous leading byte starting backward from buf and validates with
224		// errors from there Used to pinpoint the location of an error when an invalid
225		// chunk is detected We assume that the stream starts with a leading byte, and
226		// to check that it is the case, we ask that you pass a pointer to the start of
227		// the stream (start).
228		inline simdutf_warn_unused result rewind_and_validate_with_errors(
229	0	const char start, const char buf, size_t len) noexcept {
230	0	// First check that we start with a leading byte
231	0	if ((*start & 0b11000000) == 0b10000000) {
232	0	return result(error_code::TOO_LONG, 0);
233	0	}
234	0	size_t extra_len{0};
235	0	// A leading byte cannot be further than 4 bytes away
236	0	for (int i = 0; i < 5; i++) {
237	0	unsigned char byte = *buf;
238	0	if ((byte & 0b11000000) != 0b10000000) {
239	0	break;
240	0	} else {
241	0	buf--;
242	0	extra_len++;
243	0	}
244	0	}
245	0
246	0	result res = validate_with_errors(buf, len + extra_len);
247	0	res.count -= extra_len;
248	0	return res;
249	0	} Unexecuted instantiation: roundtrip.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const, char const, unsigned long) Unexecuted instantiation: base64.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const, char const, unsigned long) Unexecuted instantiation: misc.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const, char const, unsigned long) Unexecuted instantiation: conversion.cpp:simdutf::scalar::(anonymous namespace)::utf8::rewind_and_validate_with_errors(char const, char const, unsigned long)
250
251		template <typename InputPtr>
252		#if SIMDUTF_CPLUSPLUS20
253		requires simdutf::detail::indexes_into_byte_like<InputPtr>
254		#endif
255		simdutf_constexpr23 size_t count_code_points(InputPtr data, size_t len) {
256		size_t counter{0};
257		for (size_t i = 0; i < len; i++) {
258		// -65 is 0b10111111, anything larger in two-complement's should start a new
259		// code point.
260		if (int8_t(data[i]) > -65) {
261		counter++;
262		}
263		}
264		return counter;
265		}
266
267		template <typename InputPtr>
268		#if SIMDUTF_CPLUSPLUS20
269		requires simdutf::detail::indexes_into_byte_like<InputPtr>
270		#endif
271		simdutf_constexpr23 size_t utf16_length_from_utf8(InputPtr data, size_t len) {
272		size_t counter{0};
273		for (size_t i = 0; i < len; i++) {
274		if (int8_t(data[i]) > -65) {
275		counter++;
276		}
277		if (uint8_t(data[i]) >= 240) {
278		counter++;
279		}
280		}
281		return counter;
282		}
283
284		template <typename InputPtr>
285		#if SIMDUTF_CPLUSPLUS20
286		requires simdutf::detail::indexes_into_byte_like<InputPtr>
287		#endif
288		simdutf_warn_unused simdutf_constexpr23 size_t
289		trim_partial_utf8(InputPtr input, size_t length) {
290		if (length < 3) {
291		switch (length) {
292		case 2:
293		if (uint8_t(input[length - 1]) >= 0xc0) {
294		return length - 1;
295		} // 2-, 3- and 4-byte characters with only 1 byte left
296		if (uint8_t(input[length - 2]) >= 0xe0) {
297		return length - 2;
298		} // 3- and 4-byte characters with only 2 bytes left
299		return length;
300		case 1:
301		if (uint8_t(input[length - 1]) >= 0xc0) {
302		return length - 1;
303		} // 2-, 3- and 4-byte characters with only 1 byte left
304		return length;
305		case 0:
306		return length;
307		}
308		}
309		if (uint8_t(input[length - 1]) >= 0xc0) {
310		return length - 1;
311		} // 2-, 3- and 4-byte characters with only 1 byte left
312		if (uint8_t(input[length - 2]) >= 0xe0) {
313		return length - 2;
314		} // 3- and 4-byte characters with only 1 byte left
315		if (uint8_t(input[length - 3]) >= 0xf0) {
316		return length - 3;
317		} // 4-byte characters with only 3 bytes left
318		return length;
319		}
320
321		} // namespace utf8
322		} // unnamed namespace
323		} // namespace scalar
324		} // namespace simdutf
325
326		#endif