/work/include/simdutf/scalar/utf16.h

Source
#ifndef SIMDUTF_UTF16_H
#define SIMDUTF_UTF16_H

namespace simdutf {
namespace scalar {
namespace utf16 {

// Reports whether every UTF-16 code unit in [data, data + len) is ASCII.
//
// Each unit goes through swap_if_needed<big_endian> before the comparison,
// so the check is made on the value that helper returns. An empty buffer
// (len == 0) is accepted.
//
// data: pointer to the first code unit
// len:  number of code units to inspect
// Returns false as soon as a unit of 0x80 or above is seen, true otherwise.
template <endianness big_endian>
simdutf_warn_unused simdutf_constexpr23 bool
validate_as_ascii(const char16_t *data, size_t len) noexcept {
  size_t idx = 0;
  while (idx < len) {
    const char16_t unit = scalar::utf16::swap_if_needed<big_endian>(data[idx]);
    if (unit > 0x7F) {
      return false; // first non-ASCII unit ends the scan
    }
    ++idx;
  }
  return true;
}

// Checks that [data, data + len) is well-formed UTF-16: every code unit in
// the surrogate range 0xD800..0xDFFF must be part of a high/low pair.
//
// Code units are read through swap_if_needed<big_endian> before inspection.
//
// data: pointer to the first code unit
// len:  number of code units to inspect
// Returns true when the whole buffer is accepted (including len == 0), and
// false when a surrogate is the last unit, when a low surrogate appears
// without a preceding high surrogate, or when a high surrogate is not
// followed by a low surrogate.
template <endianness big_endian>
inline simdutf_warn_unused simdutf_constexpr23 bool
validate(const char16_t *data, size_t len) noexcept {
  // Index uses the same type as len (and as validate_with_errors).
  size_t pos = 0;
  while (pos < len) {
    const char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
    if ((word & 0xF800) != 0xD800) {
      // Not in 0xD800..0xDFFF: a stand-alone code unit.
      pos++;
      continue;
    }
    if (pos + 1 >= len) {
      return false; // surrogate with nothing after it
    }
    // 0x000..0x3FF means high surrogate; anything larger is a low surrogate
    // appearing first.
    const char16_t diff = char16_t(word - 0xD800);
    if (diff > 0x3FF) {
      return false;
    }
    const char16_t next_word =
        scalar::utf16::swap_if_needed<big_endian>(data[pos + 1]);
    // Unsigned wrap-around makes this a single range check for
    // 0xDC00..0xDFFF.
    const char16_t diff2 = char16_t(next_word - 0xDC00);
    if (diff2 > 0x3FF) {
      return false;
    }
    pos += 2; // consume the whole pair
  }
  return true;
}

// Validates [data, data + len) as UTF-16 and reports where validation
// stopped.
//
// Code units are read through swap_if_needed<big_endian> before inspection.
//
// data: pointer to the first code unit
// len:  number of code units to inspect
// Returns result(error_code::SUCCESS, len) when every surrogate is part of a
// high/low pair. Otherwise returns result(error_code::SURROGATE, pos), where
// pos is the index of the surrogate code unit at which the check failed: a
// surrogate that is the last unit, a low surrogate with no preceding high
// surrogate, or a high surrogate not followed by a low surrogate.
template <endianness big_endian>
inline simdutf_warn_unused simdutf_constexpr23 result
validate_with_errors(const char16_t *data, size_t len) noexcept {
  size_t pos = 0;
  while (pos < len) {
    const char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
    if ((word & 0xF800) != 0xD800) {
      // Not in 0xD800..0xDFFF: a stand-alone code unit.
      pos++;
      continue;
    }
    if (pos + 1 >= len) {
      return result(error_code::SURROGATE, pos); // nothing follows
    }
    // 0x000..0x3FF means high surrogate; anything larger is a low surrogate
    // appearing first.
    const char16_t diff = char16_t(word - 0xD800);
    if (diff > 0x3FF) {
      return result(error_code::SURROGATE, pos);
    }
    const char16_t next_word =
        scalar::utf16::swap_if_needed<big_endian>(data[pos + 1]);
    // Unsigned wrap-around makes this a single range check for
    // 0xDC00..0xDFFF.
    const char16_t diff2 = char16_t(next_word - 0xDC00);
    if (diff2 > 0x3FF) {
      return result(error_code::SURROGATE, pos);
    }
    pos += 2; // consume the whole pair
  }
  // A pair is only consumed when both units are in range, so pos == len here.
  return result(error_code::SUCCESS, pos);
}

// Counts the code units in [p, p + len) that are not low surrogates
// (0xDC00..0xDFFF), so a high/low pair contributes one to the total.
// No validation is performed and a byte-order mark gets no special
// treatment.
//
// p:   pointer to the first code unit
// len: number of code units to inspect
template <endianness big_endian>
simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) {
  size_t total = 0;
  for (size_t k = 0; k != len; ++k) {
    const char16_t unit = scalar::utf16::swap_if_needed<big_endian>(p[k]);
    const bool is_low = (unit & 0xFC00) == 0xDC00;
    total += static_cast<size_t>(!is_low);
  }
  return total;
}

// Sums a per-code-unit UTF-8 byte count over [p, p + len):
//   0x0000..0x007F          -> 1
//   0x0080..0x07FF          -> 2
//   0xD800..0xDFFF          -> 2 (each half of a pair, so a pair totals 4)
//   everything else         -> 3
// No validation is performed and a byte-order mark gets no special
// treatment.
//
// p:   pointer to the first code unit
// len: number of code units to inspect
template <endianness big_endian>
simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p,
                                                  size_t len) {
  size_t total = 0;
  for (size_t k = 0; k < len; ++k) {
    const char16_t unit = scalar::utf16::swap_if_needed<big_endian>(p[k]);
    const bool beyond_ascii = unit > 0x7F;
    const bool is_surrogate = (unit & 0xF800) == 0xD800;
    const bool needs_third_byte = unit > 0x7FF && !is_surrogate;
    // Kept as a sum of flags rather than a chain of branches.
    total += size_t(1) + static_cast<size_t>(beyond_ascii) +
             static_cast<size_t>(needs_third_byte);
  }
  return total;
}

// Returns len minus the number of low surrogates (0xDC00..0xDFFF) found in
// [p, p + len), i.e. the count of code units that are not low surrogates.
// No validation is performed and a byte-order mark gets no special
// treatment.
//
// p:   pointer to the first code unit
// len: number of code units to inspect
template <endianness big_endian>
simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p,
                                                   size_t len) {
  size_t remaining = len;
  for (size_t k = 0; k < len; ++k) {
    const char16_t unit = scalar::utf16::swap_if_needed<big_endian>(p[k]);
    if ((unit & 0xFC00) == 0xDC00) {
      --remaining; // a low surrogate does not add an output unit
    }
  }
  return remaining;
}

// Writes to output[0 .. size) the code units of input[0 .. size) with their
// two bytes exchanged. Units are processed in increasing index order, each
// one read before its result is stored.
//
// input:  pointer to the first source code unit
// size:   number of code units to convert
// output: pointer to the first destination code unit
simdutf_really_inline simdutf_constexpr23 void
change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
  for (size_t k = 0; k < size; ++k) {
    const char16_t unit = input[k];
    // The operands are promoted to int; the cast keeps the low 16 bits.
    output[k] = char16_t((unit << 8) | (unit >> 8));
  }
}

// Returns the length to keep so that the buffer does not end on a high
// surrogate: length - 1 when the final code unit (after
// swap_if_needed<big_endian>) is in 0xD800..0xDBFF, otherwise length.
// An empty buffer yields 0. Only the last code unit is examined.
//
// input:  pointer to the first code unit
// length: number of code units in the buffer
template <endianness big_endian>
simdutf_warn_unused simdutf_constexpr23 size_t
trim_partial_utf16(const char16_t *input, size_t length) {
  if (length == 0) {
    return 0;
  }
  const uint16_t tail =
      scalar::utf16::swap_if_needed<big_endian>(uint16_t(input[length - 1]));
  const bool ends_on_high = (tail & 0xFC00) == 0xD800;
  return ends_on_high ? length - 1 : length;
}

// True when c, after swap_if_needed<big_endian>, lies in 0xD800..0xDBFF
// (the high-surrogate range).
template <endianness big_endian>
simdutf_constexpr bool is_high_surrogate(char16_t c) {
  const char16_t unit = scalar::utf16::swap_if_needed<big_endian>(c);
  // Top six bits 110110 select exactly 0xD800..0xDBFF.
  return (unit & 0xFC00) == 0xD800;
}

// True when c, after swap_if_needed<big_endian>, lies in 0xDC00..0xDFFF
// (the low-surrogate range).
template <endianness big_endian>
simdutf_constexpr bool is_low_surrogate(char16_t c) {
  const char16_t unit = scalar::utf16::swap_if_needed<big_endian>(c);
  // Top six bits 110111 select exactly 0xDC00..0xDFFF.
  return (unit & 0xFC00) == 0xDC00;
}

// True when c lies in 0xD800..0xDBFF. Unlike is_high_surrogate, the value is
// tested as given, with no byte swapping. Kept as a single return statement
// because the function is plain constexpr.
simdutf_really_inline constexpr bool high_surrogate(char16_t c) {
  return (c & 0xFC00) == 0xD800;
}

// True when c lies in 0xDC00..0xDFFF. Unlike is_low_surrogate, the value is
// tested as given, with no byte swapping. Kept as a single return statement
// because the function is plain constexpr.
simdutf_really_inline constexpr bool low_surrogate(char16_t c) {
  return (c & 0xFC00) == 0xDC00;
}

// Computes a UTF-8 byte count for [p, p + len) in which every unpaired
// surrogate is counted as 3 bytes (the size of U+FFFD in UTF-8), and reports
// whether any surrogate code unit was seen.
//
// Per-element contribution:
//   high surrogate immediately followed by a low surrogate -> 4 (both used)
//   high surrogate without a following low surrogate       -> 3
//   low surrogate not consumed by a preceding high         -> 3
//   any other unit -> 1, 2 or 3 depending on whether it exceeds 0x7F / 0x7FF
//
// p:   pointer to the first code unit
// len: number of code units to inspect
// Returns {status, byte count}. status is error_code::SURROGATE when at least
// one surrogate was encountered -- including well-formed pairs -- and
// error_code::SUCCESS otherwise. A byte-order mark gets no special treatment.
template <endianness big_endian>
simdutf_constexpr23 result
utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) {
  bool saw_surrogate = false;
  size_t total = 0;
  size_t idx = 0;
  while (idx < len) {
    if (is_high_surrogate<big_endian>(p[idx])) {
      saw_surrogate = true;
      const bool paired =
          (idx + 1 < len) && is_low_surrogate<big_endian>(p[idx + 1]);
      total += paired ? 4 : 3; // full pair, or lone high -> U+FFFD
      idx += paired ? 2 : 1;   // a pair consumes the low surrogate too
    } else if (is_low_surrogate<big_endian>(p[idx])) {
      saw_surrogate = true;
      total += 3; // lone low surrogate -> U+FFFD
      idx += 1;
    } else {
      const char16_t word =
          !match_system(big_endian) ? u16_swap_bytes(p[idx]) : p[idx];
      // Surrogates were handled above, so anything over 0x7FF needs 3 bytes.
      total += size_t(1) + static_cast<size_t>(word > 0x7F) +
               static_cast<size_t>(word > 0x7FF);
      idx += 1;
    }
  }
  return {saw_surrogate ? error_code::SURROGATE : error_code::SUCCESS, total};
}

// Yields the code unit 0xFFFD, passed through u16_swap_bytes when
// match_system(big_endian) is false.
// Written as a function because variable templates are a C++14 extension;
// the body stays a single return statement since it is plain constexpr.
template <endianness big_endian> constexpr char16_t replacement() {
  return match_system(big_endian) ? 0xfffd : scalar::u16_swap_bytes(0xfffd);
}

// Copies input[0 .. len) to output[0 .. len), overwriting every unpaired
// surrogate with the value of replacement<big_endian>(). The output has the
// same number of code units as the input.
//
// A high surrogate is only known to be unpaired once the next unit has been
// seen, so its slot is patched one step later (or after the loop when it is
// the last unit).
//
// input:  pointer to the first source code unit
// len:    number of code units to process
// output: pointer to the first destination code unit
template <endianness big_endian>
simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len,
                                              char16_t *output) {
  const char16_t substitute = utf16::replacement<big_endian>();
  bool pending_high = false; // previous unit was a high surrogate
  for (size_t pos = 0; pos < len; ++pos) {
    const char16_t unit = input[pos];
    const bool is_high = is_high_surrogate<big_endian>(unit);
    const bool is_low = is_low_surrogate<big_endian>(unit);

    // The preceding high surrogate turned out to have no partner.
    if (pending_high && !is_low) {
      output[pos - 1] = substitute;
    }

    // A low surrogate is kept only when it completes a pair.
    const bool orphan_low = is_low && !pending_high;
    output[pos] = orphan_low ? substitute : input[pos];

    pending_high = is_high;
  }

  // A trailing high surrogate can never be completed; pending_high being set
  // implies len >= 1.
  if (pending_high) {
    output[len - 1] = substitute;
  }
}

} // namespace utf16
} // namespace scalar
} // namespace simdutf

#endif

Coverage Report

Created: 2026-02-14 06:56

Line	Count	Source
1		#ifndef SIMDUTF_UTF16_H
2		#define SIMDUTF_UTF16_H
3
4		namespace simdutf {
5		namespace scalar {
6		namespace utf16 {
7
8		template <endianness big_endian>
9		simdutf_warn_unused simdutf_constexpr23 bool
10		validate_as_ascii(const char16_t *data, size_t len) noexcept {
11		for (size_t pos = 0; pos < len; pos++) {
12		char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
13		if (word >= 0x80) {
14		return false;
15		}
16		}
17		return true;
18		}
19
20		template <endianness big_endian>
21		inline simdutf_warn_unused simdutf_constexpr23 bool
22		validate(const char16_t *data, size_t len) noexcept {
23		uint64_t pos = 0;
24		while (pos < len) {
25		char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
26		if ((word & 0xF800) == 0xD800) {
27		if (pos + 1 >= len) {
28		return false;
29		}
30		char16_t diff = char16_t(word - 0xD800);
31		if (diff > 0x3FF) {
32		return false;
33		}
34		char16_t next_word = !match_system(big_endian)
35		? u16_swap_bytes(data[pos + 1])
36		: data[pos + 1];
37		char16_t diff2 = char16_t(next_word - 0xDC00);
38		if (diff2 > 0x3FF) {
39		return false;
40		}
41		pos += 2;
42		} else {
43		pos++;
44		}
45		}
46		return true;
47		}
48
49		template <endianness big_endian>
50		inline simdutf_warn_unused simdutf_constexpr23 result
51		validate_with_errors(const char16_t *data, size_t len) noexcept {
52		size_t pos = 0;
53		while (pos < len) {
54		char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
55		if ((word & 0xF800) == 0xD800) {
56		if (pos + 1 >= len) {
57		return result(error_code::SURROGATE, pos);
58		}
59		char16_t diff = char16_t(word - 0xD800);
60		if (diff > 0x3FF) {
61		return result(error_code::SURROGATE, pos);
62		}
63		char16_t next_word = !match_system(big_endian)
64		? u16_swap_bytes(data[pos + 1])
65		: data[pos + 1];
66		char16_t diff2 = uint16_t(next_word - 0xDC00);
67		if (diff2 > 0x3FF) {
68		return result(error_code::SURROGATE, pos);
69		}
70		pos += 2;
71		} else {
72		pos++;
73		}
74		}
75		return result(error_code::SUCCESS, pos);
76		}
77
78		template <endianness big_endian>
79		simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) {
80		// We are not BOM aware.
81		size_t counter{0};
82		for (size_t i = 0; i < len; i++) {
83		char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
84		counter += ((word & 0xFC00) != 0xDC00);
85		}
86		return counter;
87		}
88
89		template <endianness big_endian>
90		simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p,
91		size_t len) {
92		// We are not BOM aware.
93		size_t counter{0};
94		for (size_t i = 0; i < len; i++) {
95		char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
96		counter++; // ASCII
97		counter += static_cast<size_t>(
98		word >
99		0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
100		counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
101		(word >= 0xE000)); // three-byte
102		}
103		return counter;
104		}
105
106		template <endianness big_endian>
107		simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p,
108		size_t len) {
109		// We are not BOM aware.
110		size_t counter{0};
111		for (size_t i = 0; i < len; i++) {
112		char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
113		counter += ((word & 0xFC00) != 0xDC00);
114		}
115		return counter;
116		}
117
118		simdutf_really_inline simdutf_constexpr23 void
119		change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
120		for (size_t i = 0; i < size; i++) {
121		*output++ = char16_t(input[i] >> 8 | input[i] << 8);
122		}
123		}
124
125		template <endianness big_endian>
126		simdutf_warn_unused simdutf_constexpr23 size_t
127		trim_partial_utf16(const char16_t *input, size_t length) {
128		if (length == 0) {
129		return 0;
130		}
131		uint16_t last_word = uint16_t(input[length - 1]);
132		last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
133		length -= ((last_word & 0xFC00) == 0xD800);
134		return length;
135		}
136
137		template <endianness big_endian>
138		simdutf_constexpr bool is_high_surrogate(char16_t c) {
139		c = scalar::utf16::swap_if_needed<big_endian>(c);
140		return (0xd800 <= c && c <= 0xdbff);
141		}
142
143		template <endianness big_endian>
144		simdutf_constexpr bool is_low_surrogate(char16_t c) {
145		c = scalar::utf16::swap_if_needed<big_endian>(c);
146		return (0xdc00 <= c && c <= 0xdfff);
147		}
148
149		simdutf_really_inline constexpr bool high_surrogate(char16_t c) {
150		return (0xd800 <= c && c <= 0xdbff);
151		}
152
153	0	simdutf_really_inline constexpr bool low_surrogate(char16_t c) {
154	0	return (0xdc00 <= c && c <= 0xdfff);
155	0	}
156
157		template <endianness big_endian>
158		simdutf_constexpr23 result
159		utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) {
160		bool any_surrogates = false;
161		// We are not BOM aware.
162		size_t counter{0};
163		for (size_t i = 0; i < len; i++) {
164		if (is_high_surrogate<big_endian>(p[i])) {
165		any_surrogates = true;
166		// surrogate pair
167		if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
168		counter += 4;
169		i++; // skip low surrogate
170		} else {
171		counter += 3; // unpaired high surrogate replaced by U+FFFD
172		}
173		continue;
174		} else if (is_low_surrogate<big_endian>(p[i])) {
175		any_surrogates = true;
176		counter += 3; // unpaired low surrogate replaced by U+FFFD
177		continue;
178		}
179		char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
180		counter++; // at least 1 byte
181		counter +=
182		static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes
183		counter += static_cast<size_t>(word > 0x7FF); // three-byte
184		}
185		return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
186		counter};
187		}
188
189		// variable templates are a C++14 extension
190		template <endianness big_endian> constexpr char16_t replacement() {
191		return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
192		}
193
194		template <endianness big_endian>
195		simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len,
196		char16_t *output) {
197		const char16_t replacement = utf16::replacement<big_endian>();
198		bool high_surrogate_prev = false, high_surrogate, low_surrogate;
199		size_t i = 0;
200		for (; i < len; i++) {
201		char16_t c = input[i];
202		high_surrogate = is_high_surrogate<big_endian>(c);
203		low_surrogate = is_low_surrogate<big_endian>(c);
204		if (high_surrogate_prev && !low_surrogate) {
205		output[i - 1] = replacement;
206		}
207
208		if (!high_surrogate_prev && low_surrogate) {
209		output[i] = replacement;
210		} else {
211		output[i] = input[i];
212		}
213		high_surrogate_prev = high_surrogate;
214		}
215
216		/* string may not end with high surrogate */
217		if (high_surrogate_prev) {
218		output[i - 1] = replacement;
219		}
220		}
221
222		} // namespace utf16
223		} // namespace scalar
224		} // namespace simdutf
225
226		#endif