/work/include/simdutf/scalar/utf16.h
Line | Count | Source |
1 | | #ifndef SIMDUTF_UTF16_H |
2 | | #define SIMDUTF_UTF16_H |
3 | | |
4 | | namespace simdutf { |
5 | | namespace scalar { |
6 | | namespace utf16 { |
7 | | |
8 | | template <endianness big_endian> |
9 | | simdutf_warn_unused simdutf_constexpr23 bool |
10 | | validate_as_ascii(const char16_t *data, size_t len) noexcept { |
11 | | for (size_t pos = 0; pos < len; pos++) { |
12 | | char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]); |
13 | | if (word >= 0x80) { |
14 | | return false; |
15 | | } |
16 | | } |
17 | | return true; |
18 | | } |
19 | | |
20 | | template <endianness big_endian> |
21 | | inline simdutf_warn_unused simdutf_constexpr23 bool |
22 | | validate(const char16_t *data, size_t len) noexcept { |
23 | | uint64_t pos = 0; |
24 | | while (pos < len) { |
25 | | char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]); |
26 | | if ((word & 0xF800) == 0xD800) { |
27 | | if (pos + 1 >= len) { |
28 | | return false; |
29 | | } |
30 | | char16_t diff = char16_t(word - 0xD800); |
31 | | if (diff > 0x3FF) { |
32 | | return false; |
33 | | } |
34 | | char16_t next_word = !match_system(big_endian) |
35 | | ? u16_swap_bytes(data[pos + 1]) |
36 | | : data[pos + 1]; |
37 | | char16_t diff2 = char16_t(next_word - 0xDC00); |
38 | | if (diff2 > 0x3FF) { |
39 | | return false; |
40 | | } |
41 | | pos += 2; |
42 | | } else { |
43 | | pos++; |
44 | | } |
45 | | } |
46 | | return true; |
47 | | } |
48 | | |
49 | | template <endianness big_endian> |
50 | | inline simdutf_warn_unused simdutf_constexpr23 result |
51 | | validate_with_errors(const char16_t *data, size_t len) noexcept { |
52 | | size_t pos = 0; |
53 | | while (pos < len) { |
54 | | char16_t word = scalar::utf16::swap_if_needed<big_endian>(data[pos]); |
55 | | if ((word & 0xF800) == 0xD800) { |
56 | | if (pos + 1 >= len) { |
57 | | return result(error_code::SURROGATE, pos); |
58 | | } |
59 | | char16_t diff = char16_t(word - 0xD800); |
60 | | if (diff > 0x3FF) { |
61 | | return result(error_code::SURROGATE, pos); |
62 | | } |
63 | | char16_t next_word = !match_system(big_endian) |
64 | | ? u16_swap_bytes(data[pos + 1]) |
65 | | : data[pos + 1]; |
66 | | char16_t diff2 = uint16_t(next_word - 0xDC00); |
67 | | if (diff2 > 0x3FF) { |
68 | | return result(error_code::SURROGATE, pos); |
69 | | } |
70 | | pos += 2; |
71 | | } else { |
72 | | pos++; |
73 | | } |
74 | | } |
75 | | return result(error_code::SUCCESS, pos); |
76 | | } |
77 | | |
78 | | template <endianness big_endian> |
79 | | simdutf_constexpr23 size_t count_code_points(const char16_t *p, size_t len) { |
80 | | // We are not BOM aware. |
81 | | size_t counter{0}; |
82 | | for (size_t i = 0; i < len; i++) { |
83 | | char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]); |
84 | | counter += ((word & 0xFC00) != 0xDC00); |
85 | | } |
86 | | return counter; |
87 | | } |
88 | | |
89 | | template <endianness big_endian> |
90 | | simdutf_constexpr23 size_t utf8_length_from_utf16(const char16_t *p, |
91 | | size_t len) { |
92 | | // We are not BOM aware. |
93 | | size_t counter{0}; |
94 | | for (size_t i = 0; i < len; i++) { |
95 | | char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]); |
96 | | counter++; // ASCII |
97 | | counter += static_cast<size_t>( |
98 | | word > |
99 | | 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes |
100 | | counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) || |
101 | | (word >= 0xE000)); // three-byte |
102 | | } |
103 | | return counter; |
104 | | } |
105 | | |
106 | | template <endianness big_endian> |
107 | | simdutf_constexpr23 size_t utf32_length_from_utf16(const char16_t *p, |
108 | | size_t len) { |
109 | | // We are not BOM aware. |
110 | | size_t counter{0}; |
111 | | for (size_t i = 0; i < len; i++) { |
112 | | char16_t word = scalar::utf16::swap_if_needed<big_endian>(p[i]); |
113 | | counter += ((word & 0xFC00) != 0xDC00); |
114 | | } |
115 | | return counter; |
116 | | } |
117 | | |
118 | | simdutf_really_inline simdutf_constexpr23 void |
119 | | change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) { |
120 | | for (size_t i = 0; i < size; i++) { |
121 | | *output++ = char16_t(input[i] >> 8 | input[i] << 8); |
122 | | } |
123 | | } |
124 | | |
125 | | template <endianness big_endian> |
126 | | simdutf_warn_unused simdutf_constexpr23 size_t |
127 | | trim_partial_utf16(const char16_t *input, size_t length) { |
128 | | if (length == 0) { |
129 | | return 0; |
130 | | } |
131 | | uint16_t last_word = uint16_t(input[length - 1]); |
132 | | last_word = scalar::utf16::swap_if_needed<big_endian>(last_word); |
133 | | length -= ((last_word & 0xFC00) == 0xD800); |
134 | | return length; |
135 | | } |
136 | | |
137 | | template <endianness big_endian> |
138 | | simdutf_constexpr bool is_high_surrogate(char16_t c) { |
139 | | c = scalar::utf16::swap_if_needed<big_endian>(c); |
140 | | return (0xd800 <= c && c <= 0xdbff); |
141 | | } |
142 | | |
143 | | template <endianness big_endian> |
144 | | simdutf_constexpr bool is_low_surrogate(char16_t c) { |
145 | | c = scalar::utf16::swap_if_needed<big_endian>(c); |
146 | | return (0xdc00 <= c && c <= 0xdfff); |
147 | | } |
148 | | |
149 | | simdutf_really_inline constexpr bool high_surrogate(char16_t c) { |
150 | | return (0xd800 <= c && c <= 0xdbff); |
151 | | } |
152 | | |
153 | 0 | simdutf_really_inline constexpr bool low_surrogate(char16_t c) { |
154 | 0 | return (0xdc00 <= c && c <= 0xdfff); |
155 | 0 | } |
156 | | |
157 | | template <endianness big_endian> |
158 | | simdutf_constexpr23 result |
159 | | utf8_length_from_utf16_with_replacement(const char16_t *p, size_t len) { |
160 | | bool any_surrogates = false; |
161 | | // We are not BOM aware. |
162 | | size_t counter{0}; |
163 | | for (size_t i = 0; i < len; i++) { |
164 | | if (is_high_surrogate<big_endian>(p[i])) { |
165 | | any_surrogates = true; |
166 | | // surrogate pair |
167 | | if (i + 1 < len && is_low_surrogate<big_endian>(p[i + 1])) { |
168 | | counter += 4; |
169 | | i++; // skip low surrogate |
170 | | } else { |
171 | | counter += 3; // unpaired high surrogate replaced by U+FFFD |
172 | | } |
173 | | continue; |
174 | | } else if (is_low_surrogate<big_endian>(p[i])) { |
175 | | any_surrogates = true; |
176 | | counter += 3; // unpaired low surrogate replaced by U+FFFD |
177 | | continue; |
178 | | } |
179 | | char16_t word = !match_system(big_endian) ? u16_swap_bytes(p[i]) : p[i]; |
180 | | counter++; // at least 1 byte |
181 | | counter += |
182 | | static_cast<size_t>(word > 0x7F); // non-ASCII is at least 2 bytes |
183 | | counter += static_cast<size_t>(word > 0x7FF); // three-byte |
184 | | } |
185 | | return {any_surrogates ? error_code::SURROGATE : error_code::SUCCESS, |
186 | | counter}; |
187 | | } |
188 | | |
189 | | // variable templates are a C++14 extension |
190 | | template <endianness big_endian> constexpr char16_t replacement() { |
191 | | return !match_system(big_endian) ? scalar::u16_swap_bytes(0xfffd) : 0xfffd; |
192 | | } |
193 | | |
194 | | template <endianness big_endian> |
195 | | simdutf_constexpr23 void to_well_formed_utf16(const char16_t *input, size_t len, |
196 | | char16_t *output) { |
197 | | const char16_t replacement = utf16::replacement<big_endian>(); |
198 | | bool high_surrogate_prev = false, high_surrogate, low_surrogate; |
199 | | size_t i = 0; |
200 | | for (; i < len; i++) { |
201 | | char16_t c = input[i]; |
202 | | high_surrogate = is_high_surrogate<big_endian>(c); |
203 | | low_surrogate = is_low_surrogate<big_endian>(c); |
204 | | if (high_surrogate_prev && !low_surrogate) { |
205 | | output[i - 1] = replacement; |
206 | | } |
207 | | |
208 | | if (!high_surrogate_prev && low_surrogate) { |
209 | | output[i] = replacement; |
210 | | } else { |
211 | | output[i] = input[i]; |
212 | | } |
213 | | high_surrogate_prev = high_surrogate; |
214 | | } |
215 | | |
216 | | /* string may not end with high surrogate */ |
217 | | if (high_surrogate_prev) { |
218 | | output[i - 1] = replacement; |
219 | | } |
220 | | } |
221 | | |
222 | | } // namespace utf16 |
223 | | } // namespace scalar |
224 | | } // namespace simdutf |
225 | | |
226 | | #endif |