Coverage Report

Created: 2026-02-14 06:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/work/include/simdutf/scalar/utf16.h
Line
Count
Source
1
#ifndef SIMDUTF_UTF16_H
2
#define SIMDUTF_UTF16_H
3
4
namespace simdutf {
5
namespace scalar {
6
namespace utf16 {
7
8
template <endianness big_endian>
9
simdutf_warn_unused simdutf_constexpr23 bool
10
validate_as_ascii(const char16_t *data, size_t len) noexcept {
11
  for (size_t pos = 0; pos < len; pos++) {
12
    char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
13
    if (word >= 0x80) {
14
      return false;
15
    }
16
  }
17
  return true;
18
}
19
20
template <endianness big_endian>
21
inline simdutf_warn_unused simdutf_constexpr23 bool
22
validate(const char16_t *data, size_t len) noexcept {
23
  uint64_t pos = 0;
24
  while (pos < len) {
25
    char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
26
    if ((word & 0xF800) == 0xD800) {
27
      if (pos + 1 >= len) {
28
        return false;
29
      }
30
      char16_t diff = char16_t(word - 0xD800);
31
      if (diff > 0x3FF) {
32
        return false;
33
      }
34
      char16_t next_word = !match_system(big_endian)
35
                               ? u16_swap_bytes(data[pos + 1])
36
                               : data[pos + 1];
37
      char16_t diff2 = char16_t(next_word - 0xDC00);
38
      if (diff2 > 0x3FF) {
39
        return false;
40
      }
41
      pos += 2;
42
    } else {
43
      pos++;
44
    }
45
  }
46
  return true;
47
}
48
49
template <endianness big_endian>
50
inline simdutf_warn_unused simdutf_constexpr23 result
51
validate_with_errors(const char16_t *data, size_t len) noexcept {
52
  size_t pos = 0;
53
  while (pos < len) {
54
    char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]);
55
    if ((word & 0xF800) == 0xD800) {
56
      if (pos + 1 >= len) {
57
        return result(error_code::SURROGATE, pos);
58
      }
59
      char16_t diff = char16_t(word - 0xD800);
60
      if (diff > 0x3FF) {
61
        return result(error_code::SURROGATE, pos);
62
      }
63
      char16_t next_word = !match_system(big_endian)
64
                               ? u16_swap_bytes(data[pos + 1])
65
                               : data[pos + 1];
66
      char16_t diff2 = uint16_t(next_word - 0xDC00);
67
      if (diff2 > 0x3FF) {
68
        return result(error_code::SURROGATE, pos);
69
      }
70
      pos += 2;
71
    } else {
72
      pos++;
73
    }
74
  }
75
  return result(error_code::SUCCESS, pos);
76
}
77
78
template <endianness big_endian>
79
simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) {
80
  // We are not BOM aware.
81
  size_t counter{0};
82
  for (size_t i = 0; i < len; i++) {
83
    char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
84
    counter += ((word & 0xFC00) != 0xDC00);
85
  }
86
  return counter;
87
}
88
89
template <endianness big_endian>
90
simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p,
91
                                                  size_t len) {
92
  // We are not BOM aware.
93
  size_t counter{0};
94
  for (size_t i = 0; i < len; i++) {
95
    char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
96
    counter++; // ASCII
97
    counter += static_cast<size_t>(
98
        word >
99
        0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
100
    counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
101
                                   (word >= 0xE000)); // three-byte
102
  }
103
  return counter;
104
}
105
106
template <endianness big_endian>
107
simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p,
108
                                                   size_t len) {
109
  // We are not BOM aware.
110
  size_t counter{0};
111
  for (size_t i = 0; i < len; i++) {
112
    char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]);
113
    counter += ((word & 0xFC00) != 0xDC00);
114
  }
115
  return counter;
116
}
117
118
// Writes to output[0..size) the code units of input[0..size) with their two
// bytes exchanged.
simdutf_really_inline simdutf_constexpr23 void
change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
  for (size_t idx = 0; idx != size; ++idx) {
    const char16_t unit = input[idx];
    // The shifts happen on the promoted int value; the cast back to char16_t
    // drops everything above bit 15.
    const auto high_to_low = unit >> 8;
    const auto low_to_high = unit << 8;
    output[idx] = char16_t(high_to_low | low_to_high);
  }
}
124
125
template <endianness big_endian>
126
simdutf_warn_unused simdutf_constexpr23 size_t
127
trim_partial_utf16(const char16_t *input, size_t length) {
128
  if (length == 0) {
129
    return 0;
130
  }
131
  uint16_t last_word = uint16_t(input[length - 1]);
132
  last_word = scalar::utf16::swap_if_needed<big_endian>(last_word);
133
  length -= ((last_word & 0xFC00) == 0xD800);
134
  return length;
135
}
136
137
template <endianness big_endian>
138
simdutf_constexpr bool is_high_surrogate(char16_t c) {
139
  c = scalar::utf16::swap_if_needed<big_endian>(c);
140
  return (0xd800 <= c && c <= 0xdbff);
141
}
142
143
template <endianness big_endian>
144
simdutf_constexpr bool is_low_surrogate(char16_t c) {
145
  c = scalar::utf16::swap_if_needed<big_endian>(c);
146
  return (0xdc00 <= c && c <= 0xdfff);
147
}
148
149
// True when c is in the high-surrogate range 0xD800..0xDBFF. No byte swap is
// applied to c.
simdutf_really_inline constexpr bool high_surrogate(char16_t c) {
  // The top six bits 110110 identify exactly 0xD800..0xDBFF.
  return (c & 0xfc00) == 0xd800;
}
152
153
0
// True when c is in the low-surrogate range 0xDC00..0xDFFF. No byte swap is
// applied to c.
simdutf_really_inline constexpr bool low_surrogate(char16_t c) {
  // The top six bits 110111 identify exactly 0xDC00..0xDFFF.
  return (c & 0xfc00) == 0xdc00;
}
156
157
template <endianness big_endian>
158
simdutf_constexpr23 result
159
utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) {
160
  bool any_surrogates = false;
161
  // We are not BOM aware.
162
  size_t counter{0};
163
  for (size_t i = 0; i < len; i++) {
164
    if (is_high_surrogate<big_endian>(p[i])) {
165
      any_surrogates = true;
166
      // surrogate pair
167
      if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) {
168
        counter += 4;
169
        i++; // skip low surrogate
170
      } else {
171
        counter += 3; // unpaired high surrogate replaced by U+FFFD
172
      }
173
      continue;
174
    } else if (is_low_surrogate<big_endian>(p[i])) {
175
      any_surrogates = true;
176
      counter += 3; // unpaired low surrogate replaced by U+FFFD
177
      continue;
178
    }
179
    char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i];
180
    counter++; // at least 1 byte
181
    counter +=
182
        static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes
183
    counter += static_cast<size_t>(word > 0x7FF); // three-byte
184
  }
185
  return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS,
186
          counter};
187
}
188
189
// variable templates are a C++14 extension
190
template <endianness big_endian> constexpr char16_t replacement() {
191
  return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd;
192
}
193
194
template <endianness big_endian>
195
simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len,
196
                                              char16_t *output) {
197
  const char16_t replacement = utf16::replacement<big_endian>();
198
  bool high_surrogate_prev = false, high_surrogate, low_surrogate;
199
  size_t i = 0;
200
  for (; i < len; i++) {
201
    char16_t c = input[i];
202
    high_surrogate = is_high_surrogate<big_endian>(c);
203
    low_surrogate = is_low_surrogate<big_endian>(c);
204
    if (high_surrogate_prev && !low_surrogate) {
205
      output[i - 1] = replacement;
206
    }
207
208
    if (!high_surrogate_prev && low_surrogate) {
209
      output[i] = replacement;
210
    } else {
211
      output[i] = input[i];
212
    }
213
    high_surrogate_prev = high_surrogate;
214
  }
215
216
  /* string may not end with high surrogate */
217
  if (high_surrogate_prev) {
218
    output[i - 1] = replacement;
219
  }
220
}
221
222
} // namespace utf16
223
} // namespace scalar
224
} // namespace simdutf
225
226
#endif