/src/icu/icu4c/source/common/unicode/utfiterator.h
Line | Count | Source |
1 | | // © 2024 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: https://www.unicode.org/copyright.html |
3 | | |
4 | | // utfiterator.h |
5 | | // created: 2024aug12 Markus W. Scherer |
6 | | |
7 | | #ifndef __UTFITERATOR_H__ |
8 | | #define __UTFITERATOR_H__ |
9 | | |
10 | | #include "unicode/utypes.h" |
11 | | |
12 | | #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H) |
13 | | |
14 | | #include <iterator> |
15 | | #if __has_include(<version>) |
16 | | #include <version> |
17 | | #endif |
18 | | #if defined(__cpp_lib_ranges) |
19 | | #include <ranges> |
20 | | #endif |
21 | | #include <string> |
22 | | #include <string_view> |
23 | | #include <type_traits> |
24 | | #include "unicode/utf16.h" |
25 | | #include "unicode/utf8.h" |
26 | | #include "unicode/uversion.h" |
27 | | |
28 | | /** |
29 | | * \file |
30 | | * \brief C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed). |
31 | | * |
32 | | * See the User Guide chapter about |
33 | | * <a href="https://unicode-org.github.io/icu/userguide/strings/cpp-code-point-iterator.html">C++ Code Point Iterators</a>. |
34 | | * |
35 | | * Sample code: |
36 | | * \code |
37 | | * #include <string_view> |
38 | | * #include <iostream> |
39 | | * #include "unicode/utypes.h" |
40 | | * #include "unicode/utfiterator.h" |
41 | | * |
42 | | * using icu::header::utfIterator; |
43 | | * using icu::header::utfStringCodePoints; |
44 | | * using icu::header::unsafeUTFIterator; |
45 | | * using icu::header::unsafeUTFStringCodePoints; |
46 | | * |
47 | | * int32_t rangeLoop16(std::u16string_view s) { |
48 | | * // We are just adding up the code points for minimal-code demonstration purposes. |
49 | | * int32_t sum = 0; |
50 | | * for (auto units : utfStringCodePoints<UChar32, UTF_BEHAVIOR_NEGATIVE>(s)) { |
51 | | * sum += units.codePoint(); // < 0 if ill-formed |
52 | | * } |
53 | | * return sum; |
54 | | * } |
55 | | * |
56 | | * int32_t loopIterPlusPlus16(std::u16string_view s) { |
57 | | * auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s); |
58 | | * int32_t sum = 0; |
59 | | * for (auto iter = range.begin(), limit = range.end(); iter != limit;) { |
60 | | * sum += (*iter++).codePoint(); // U+FFFD if ill-formed |
61 | | * } |
62 | | * return sum; |
63 | | * } |
64 | | * |
65 | | * int32_t backwardLoop16(std::u16string_view s) { |
66 | | * auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s); |
67 | | * int32_t sum = 0; |
68 | | * for (auto start = range.begin(), iter = range.end(); start != iter;) { |
69 | | * sum += (*--iter).codePoint(); // surrogate code point if unpaired / ill-formed |
70 | | * } |
71 | | * return sum; |
72 | | * } |
73 | | * |
74 | | * int32_t reverseLoop8(std::string_view s) { |
75 | | * auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s); |
76 | | * int32_t sum = 0; |
77 | | * for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) { |
78 | | * sum += iter->codePoint(); // U+FFFD if ill-formed |
79 | | * } |
80 | | * return sum; |
81 | | * } |
82 | | * |
83 | | * int32_t countCodePoints16(std::u16string_view s) { |
84 | | * auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s); |
85 | | * return std::distance(range.begin(), range.end()); |
86 | | * } |
87 | | * |
88 | | * int32_t unsafeRangeLoop16(std::u16string_view s) { |
89 | | * int32_t sum = 0; |
90 | | * for (auto units : unsafeUTFStringCodePoints<UChar32>(s)) { |
91 | | * sum += units.codePoint(); |
92 | | * } |
93 | | * return sum; |
94 | | * } |
95 | | * |
96 | | * int32_t unsafeReverseLoop8(std::string_view s) { |
97 | | * auto range = unsafeUTFStringCodePoints<UChar32>(s); |
98 | | * int32_t sum = 0; |
99 | | * for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) { |
100 | | * sum += iter->codePoint(); |
101 | | * } |
102 | | * return sum; |
103 | | * } |
104 | | * |
105 | | * char32_t firstCodePointOrFFFD16(std::u16string_view s) { |
106 | | * if (s.empty()) { return 0xfffd; } |
107 | | * auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s); |
108 | | * return range.begin()->codePoint(); |
109 | | * } |
110 | | * |
111 | | * std::string_view firstSequence8(std::string_view s) { |
112 | | * if (s.empty()) { return {}; } |
113 | | * auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s); |
114 | | * auto units = *(range.begin()); |
115 | | * if (units.wellFormed()) { |
116 | | * return units.stringView(); |
117 | | * } else { |
118 | | * return {}; |
119 | | * } |
120 | | * } |
121 | | * |
122 | | * template<typename InputStream> // some istream or streambuf |
123 | | * std::u32string cpFromInput(InputStream &in) { |
124 | | * // This is a single-pass input_iterator. |
125 | | * std::istreambuf_iterator bufIter(in); |
126 | | * std::istreambuf_iterator<typename InputStream::char_type> bufLimit; |
127 | | * auto iter = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufIter); |
128 | | * auto limit = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufLimit); |
129 | | * std::u32string s32; |
130 | | * for (; iter != limit; ++iter) { |
131 | | * s32.push_back(iter->codePoint()); |
132 | | * } |
133 | | * return s32; |
134 | | * } |
135 | | * |
136 | | * std::u32string cpFromStdin() { return cpFromInput(std::cin); } |
137 | | * std::u32string cpFromWideStdin() { return cpFromInput(std::wcin); } |
138 | | * \endcode |
139 | | */ |
140 | | |
141 | | #ifndef U_HIDE_DRAFT_API |
142 | | |
143 | | /** |
144 | | * Some defined behaviors for handling ill-formed Unicode strings. |
145 | | * This is a template parameter for UTFIterator and related classes. |
146 | | * |
147 | | * When a validating UTFIterator encounters an ill-formed code unit sequence, |
148 | | * then CodeUnits.codePoint() returns a substitute value selected by this parameter. |
149 | | * |
150 | | * @draft ICU 78 |
151 | | * @see CodeUnits |
152 | | * @see UTFIterator |
153 | | * @see UTFStringCodePoints |
154 | | */ |
155 | | typedef enum UTFIllFormedBehavior { |
156 | | /** |
157 | | * Returns a negative value (-1=U_SENTINEL) instead of a code point. |
158 | | * If the CP32 template parameter for the relevant classes is an unsigned type, |
159 | | * then the negative value becomes 0xffffffff=UINT32_MAX. |
160 | | * |
161 | | * @draft ICU 78 |
162 | | */ |
163 | | UTF_BEHAVIOR_NEGATIVE, |
164 | | /** Returns U+FFFD Replacement Character. @draft ICU 78 */ |
165 | | UTF_BEHAVIOR_FFFD, |
166 | | /** |
167 | | * UTF-8: Not allowed (the UTF-8 implementation rejects it with a static_assert); |
168 | | * UTF-16: returns the unpaired surrogate; |
169 | | * UTF-32: returns the surrogate code point, or U+FFFD if out of range. |
170 | | * |
171 | | * @draft ICU 78 |
172 | | */ |
173 | | UTF_BEHAVIOR_SURROGATE |
174 | | } UTFIllFormedBehavior; |
175 | | |
176 | | namespace U_HEADER_ONLY_NAMESPACE { |
177 | | |
178 | | namespace prv { |
179 | | #if U_CPLUSPLUS_VERSION >= 20 |
180 | | |
181 | | /** Forwards to std::iter_value_t (C++20 and later). @internal */ |
182 | | template<typename Iter> |
183 | | using iter_value_t = typename std::iter_value_t<Iter>; |
184 | | |
185 | | /** Forwards to std::iter_difference_t (C++20 and later). @internal */ |
186 | | template<typename Iter> |
187 | | using iter_difference_t = std::iter_difference_t<Iter>; |
188 | | |
189 | | /** True if Iter models the std::forward_iterator concept. @internal */ |
190 | | template<typename Iter> |
191 | | constexpr bool forward_iterator = std::forward_iterator<Iter>; |
192 | | |
193 | | /** True if Iter models the std::bidirectional_iterator concept. @internal */ |
194 | | template<typename Iter> |
195 | | constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>; |
196 | | |
197 | | /** True if Range models the std::ranges::range concept. @internal */ |
198 | | template<typename Range> |
199 | | constexpr bool range = std::ranges::range<Range>; |
200 | | |
201 | | #else |
202 | | |
203 | | /** Pre-C++20 fallback: the value_type from std::iterator_traits. @internal */ |
204 | | template<typename Iter> |
205 | | using iter_value_t = typename std::iterator_traits<Iter>::value_type; |
206 | | |
207 | | /** Pre-C++20 fallback: the difference_type from std::iterator_traits. @internal */ |
208 | | template<typename Iter> |
209 | | using iter_difference_t = typename std::iterator_traits<Iter>::difference_type; |
210 | | |
211 | | /** Pre-C++20 fallback: true if the iterator_category is or derives from std::forward_iterator_tag. @internal */ |
212 | | template<typename Iter> |
213 | | constexpr bool forward_iterator = |
214 | | std::is_base_of_v< |
215 | | std::forward_iterator_tag, |
216 | | typename std::iterator_traits<Iter>::iterator_category>; |
217 | | |
218 | | /** Pre-C++20 fallback: true if the iterator_category is or derives from std::bidirectional_iterator_tag. @internal */ |
219 | | template<typename Iter> |
220 | | constexpr bool bidirectional_iterator = |
221 | | std::is_base_of_v< |
222 | | std::bidirectional_iterator_tag, |
223 | | typename std::iterator_traits<Iter>::iterator_category>; |
224 | | |
225 | | /** Primary template: by default, Range is not treated as a range. @internal */ |
226 | | template<typename Range, typename = void> |
227 | | struct range_type : std::false_type {}; |
228 | | |
229 | | /** Specialization chosen when calling begin() and end() on a Range is well-formed. @internal */ |
230 | | template<typename Range> |
231 | | struct range_type< |
232 | | Range, |
233 | | std::void_t<decltype(std::declval<Range>().begin()), |
234 | | decltype(std::declval<Range>().end())>> : std::true_type {}; |
235 | | |
236 | | /** Pre-C++20 fallback: true if Range has begin() and end() member functions. @internal */ |
237 | | template<typename Range> |
238 | | constexpr bool range = range_type<Range>::value; |
239 | | |
240 | | #endif |
241 | | |
242 | | /** Primary template: T is not a std::basic_string_view. @internal */ |
243 | | template <typename T> struct is_basic_string_view : std::false_type {}; |
244 | | |
245 | | /** Specialization matching every std::basic_string_view instantiation. @internal */ |
246 | | template <typename... Args> |
247 | | struct is_basic_string_view<std::basic_string_view<Args...>> : std::true_type {}; |
248 | | |
249 | | /** Convenience variable template for is_basic_string_view<T>::value. @internal */ |
250 | | template <typename T> constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value; |
251 | | |
252 | | /** Forward iterator over consecutive code point integers; if skipSurrogates, it jumps from U+D7FF to U+E000. @internal */ |
253 | | template<typename CP32, bool skipSurrogates> |
254 | | class CodePointsIterator { |
255 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
256 | | public: |
257 | | /** C++ iterator boilerplate @internal */ |
258 | | using value_type = CP32; |
259 | | /** C++ iterator boilerplate: operator*() returns by value. @internal */ |
260 | | using reference = value_type; |
261 | | /** C++ iterator boilerplate @internal */ |
262 | | using pointer = CP32 *; |
263 | | /** C++ iterator boilerplate @internal */ |
264 | | using difference_type = int32_t; |
265 | | /** C++ iterator boilerplate @internal */ |
266 | | using iterator_category = std::forward_iterator_tag; |
267 | | |
268 | | /** Creates an iterator positioned at code point c. @internal */ |
269 | | inline CodePointsIterator(CP32 c) : c_(c) {} |
270 | | /** Two iterators are equal when they are at the same code point. @internal */ |
271 | | inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; } |
272 | | /** Negation of operator==. @internal */ |
273 | | inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); } |
274 | | /** @return the current code point. @internal */ |
275 | | inline CP32 operator*() const { return c_; } |
276 | | /** Advances by one; with skipSurrogates, stepping onto U+D800 continues at U+E000. @internal */ |
277 | | inline CodePointsIterator &operator++() { // pre-increment |
278 | | ++c_; |
279 | | if (skipSurrogates && c_ == 0xd800) { |
280 | | c_ = 0xe000; |
281 | | } |
282 | | return *this; |
283 | | } |
284 | | /** Advances like pre-increment but returns a copy of the iterator from before the step. @internal */ |
285 | | inline CodePointsIterator operator++(int) { // post-increment |
286 | | CodePointsIterator result(*this); |
287 | | ++(*this); |
288 | | return result; |
289 | | } |
290 | | |
291 | | private: |
292 | | CP32 c_; |
293 | | }; |
294 | | |
295 | | } // namespace prv |
296 | | |
297 | | /** |
298 | | * A C++ "range" over all Unicode code points U+0000..U+10FFFF, including surrogates. |
299 | | * https://www.unicode.org/glossary/#code_point |
300 | | * |
301 | | * Intended for test and builder code. |
302 | | * |
303 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
304 | | * @draft ICU 78 |
305 | | * @see U_IS_CODE_POINT |
306 | | */ |
307 | | template<typename CP32> |
308 | | class AllCodePoints { |
309 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
310 | | public: |
311 | | /** Constructor. @draft ICU 78 */ |
312 | | AllCodePoints() {} |
313 | | /** |
314 | | * @return an iterator over all Unicode code points, starting at U+0000. |
315 | | * The iterator returns CP32 integers. |
316 | | * @draft ICU 78 |
317 | | */ |
318 | | auto begin() const { return prv::CodePointsIterator<CP32, false>(0); } |
319 | | /** |
320 | | * @return an exclusive-end iterator (positioned at 0x110000) over all Unicode code points. |
321 | | * @draft ICU 78 |
322 | | */ |
323 | | auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); } |
324 | | }; |
325 | | |
326 | | /** |
327 | | * A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF. |
328 | | * That is, all code points except surrogates. |
329 | | * Only scalar values can be represented in well-formed UTF-8/16/32. |
330 | | * https://www.unicode.org/glossary/#unicode_scalar_value |
331 | | * |
332 | | * Intended for test and builder code. |
333 | | * |
334 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
335 | | * @draft ICU 78 |
336 | | * @see U_IS_SCALAR_VALUE |
337 | | */ |
338 | | template<typename CP32> |
339 | | class AllScalarValues { |
340 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
341 | | public: |
342 | | /** Constructor. @draft ICU 78 */ |
343 | | AllScalarValues() {} |
344 | | /** |
345 | | * @return an iterator over all Unicode scalar values, starting at U+0000. |
346 | | * The iterator returns CP32 integers. |
347 | | * @draft ICU 78 |
348 | | */ |
349 | | auto begin() const { return prv::CodePointsIterator<CP32, true>(0); } |
350 | | /** |
351 | | * @return an exclusive-end iterator (positioned at 0x110000) over all Unicode scalar values. |
352 | | * @draft ICU 78 |
353 | | */ |
354 | | auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); } |
355 | | }; |
356 | | |
357 | | /** |
358 | | * Result of decoding a code unit sequence for one code point. |
359 | | * Returned from non-validating Unicode string code point iterators. |
360 | | * Base class for class CodeUnits which is returned from validating iterators. |
361 | | * |
362 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; |
363 | | * should be signed if UTF_BEHAVIOR_NEGATIVE |
364 | | * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: |
365 | | * UTF-8: char or char8_t or uint8_t; |
366 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
367 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
368 | | * @see UnsafeUTFIterator |
369 | | * @see UnsafeUTFStringCodePoints |
370 | | * @draft ICU 78 |
371 | | */ |
372 | | template<typename CP32, typename UnitIter, typename = void> |
373 | | class UnsafeCodeUnits { |
374 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
375 | | using Unit = typename prv::iter_value_t<UnitIter>; |
376 | | public: |
377 | | /** Stores the code point, its sequence length, and the unit iterators delimiting the sequence. @internal */ |
378 | | UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) : |
379 | 699 | c_(codePoint), len_(length), start_(start), limit_(limit) {} |
380 | | |
381 | | /** Copy constructor. @draft ICU 78 */ |
382 | | UnsafeCodeUnits(const UnsafeCodeUnits &other) = default; |
383 | | /** Copy assignment operator. @draft ICU 78 */ |
384 | | UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default; |
385 | | |
386 | | /** |
387 | | * @return the Unicode code point decoded from the code unit sequence. |
388 | | * If the sequence is ill-formed and the iterator validates, |
389 | | * then this is a replacement value according to the iterator's |
390 | | * UTFIllFormedBehavior template parameter. |
391 | | * @draft ICU 78 |
392 | | */ |
393 | 116 | CP32 codePoint() const { return c_; } |
394 | | |
395 | | /** |
396 | | * @return the start of the code unit sequence for one code point. |
397 | | * Only enabled if UnitIter is a (multi-pass) forward_iterator or better. |
398 | | * @draft ICU 78 |
399 | | */ |
400 | 0 | UnitIter begin() const { return start_; } |
401 | | |
402 | | /** |
403 | | * @return the limit (exclusive end) of the code unit sequence for one code point. |
404 | | * Only enabled if UnitIter is a (multi-pass) forward_iterator or better. |
405 | | * @draft ICU 78 |
406 | | */ |
407 | 0 | UnitIter end() const { return limit_; } |
408 | | |
409 | | /** |
410 | | * @return the length of the code unit sequence for one code point. |
411 | | * @draft ICU 78 |
412 | | */ |
413 | | uint8_t length() const { return len_; } |
414 | | |
415 | | #if U_CPLUSPLUS_VERSION >= 20 |
416 | | /** |
417 | | * @return a string_view of the code unit sequence for one code point. |
418 | | * Only works if UnitIter is a pointer or a contiguous_iterator. |
419 | | * @draft ICU 78 |
420 | | */ |
421 | | template<std::contiguous_iterator Iter = UnitIter> |
422 | | std::basic_string_view<Unit> stringView() const { |
423 | | return std::basic_string_view<Unit>(begin(), end()); |
424 | | } |
425 | | #else |
426 | | /** |
427 | | * @return a string_view of the code unit sequence for one code point. |
428 | | * Only works if UnitIter is a pointer or an iterator of std::basic_string or std::basic_string_view. |
429 | | * @draft ICU 78 |
430 | | */ |
431 | | template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type> |
432 | | std::enable_if_t<std::is_pointer_v<Iter> || |
433 | | std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> || |
434 | | std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> || |
435 | | std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> || |
436 | | std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>, |
437 | | std::basic_string_view<Unit>> |
438 | | stringView() const { |
439 | | return std::basic_string_view<Unit>(&*start_, len_); |
440 | | } |
441 | | #endif |
442 | | |
443 | | private: |
444 | | // Order of fields with padding and access frequency in mind. |
445 | | CP32 c_; |
446 | | uint8_t len_; |
447 | | UnitIter start_; |
448 | | UnitIter limit_; |
449 | | }; |
450 | | |
451 | | #ifndef U_IN_DOXYGEN |
452 | | // Partial template specialization for a single-pass input iterator (selected when !prv::forward_iterator<UnitIter>). |
453 | | // No UnitIter fields, no begin()/end() getters for them, no stringView(). |
454 | | template<typename CP32, typename UnitIter> |
455 | | class UnsafeCodeUnits< |
456 | | CP32, |
457 | | UnitIter, |
458 | | std::enable_if_t<!prv::forward_iterator<UnitIter>>> { |
459 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
460 | | public: |
461 | | UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {} |
462 | | |
463 | | UnsafeCodeUnits(const UnsafeCodeUnits &other) = default; |
464 | | UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default; |
465 | | |
466 | | CP32 codePoint() const { return c_; } |
467 | | |
468 | | uint8_t length() const { return len_; } |
469 | | |
470 | | private: |
471 | | // Order of fields with padding and access frequency in mind. |
472 | | CP32 c_; |
473 | | uint8_t len_; |
474 | | }; |
475 | | #endif // U_IN_DOXYGEN |
476 | | |
477 | | /** |
478 | | * Result of validating and decoding a code unit sequence for one code point. |
479 | | * Returned from validating Unicode string code point iterators. |
480 | | * Adds function wellFormed() to base class UnsafeCodeUnits. |
481 | | * |
482 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; |
483 | | * should be signed if UTF_BEHAVIOR_NEGATIVE |
484 | | * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: |
485 | | * UTF-8: char or char8_t or uint8_t; |
486 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
487 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
488 | | * @see UTFIterator |
489 | | * @see UTFStringCodePoints |
490 | | * @draft ICU 78 |
491 | | */ |
492 | | template<typename CP32, typename UnitIter, typename = void> |
493 | | class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> { |
494 | | public: |
495 | | /** Forwards the decoding result to UnsafeCodeUnits and records whether the sequence was well-formed. @internal */ |
496 | | CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) : |
497 | 699 | UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {} |
498 | | |
499 | | /** Copy constructor. @draft ICU 78 */ |
500 | | CodeUnits(const CodeUnits &other) = default; |
501 | | /** Copy assignment operator. @draft ICU 78 */ |
502 | | CodeUnits &operator=(const CodeUnits &other) = default; |
503 | | |
504 | | /** |
505 | | * @return true if the decoded code unit sequence is well-formed. |
506 | | * @draft ICU 78 |
507 | | */ |
508 | 181 | bool wellFormed() const { return ok_; } |
509 | | |
510 | | private: |
511 | | bool ok_; |
512 | | }; |
513 | | |
514 | | #ifndef U_IN_DOXYGEN |
515 | | // Partial template specialization for a single-pass input iterator (selected when !prv::forward_iterator<UnitIter>). |
516 | | // No UnitIter parameters in the constructor; adds only wellFormed() to the base class. |
517 | | template<typename CP32, typename UnitIter> |
518 | | class CodeUnits< |
519 | | CP32, |
520 | | UnitIter, |
521 | | std::enable_if_t<!prv::forward_iterator<UnitIter>>> : |
522 | | public UnsafeCodeUnits<CP32, UnitIter> { |
523 | | public: |
524 | | CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) : |
525 | | UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {} |
526 | | |
527 | | CodeUnits(const CodeUnits &other) = default; |
528 | | CodeUnits &operator=(const CodeUnits &other) = default; |
529 | | |
530 | | bool wellFormed() const { return ok_; } |
531 | | |
532 | | private: |
533 | | bool ok_; |
534 | | }; |
535 | | #endif // U_IN_DOXYGEN |
536 | | |
537 | | // Validating implementations ---------------------------------------------- *** |
538 | | |
539 | | #ifndef U_IN_DOXYGEN |
540 | | template<typename CP32, UTFIllFormedBehavior behavior, |
541 | | typename UnitIter, typename LimitIter = UnitIter, typename = void> |
542 | | class UTFImpl; |
543 | | |
544 | | // Note: readAndInc() functions take both a p0 and a p iterator. |
545 | | // They must have the same value. |
546 | | // For a multi-pass UnitIter, the caller must copy its p into a local variable p0, |
547 | | // and readAndInc() copies p0 and the incremented p into the CodeUnits. |
548 | | // For a single-pass UnitIter, which may be neither default-constructible nor copyable, |
549 | | // the caller can pass p into both references, and readAndInc() does not use p0 |
550 | | // and constructs CodeUnits without them. |
551 | | // Moving the p0 variable into the call site avoids having to declare it inside readAndInc() |
552 | | // which may not be possible for a single-pass iterator. |
553 | | |
554 | | // UTF-8: selected when the code unit type of UnitIter is one byte wide. |
555 | | template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter> |
556 | | class UTFImpl< |
557 | | CP32, behavior, |
558 | | UnitIter, LimitIter, |
559 | | std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> { |
560 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
561 | | static_assert(behavior != UTF_BEHAVIOR_SURROGATE, |
562 | | "For 8-bit strings, the SURROGATE option does not have an equivalent."); |
563 | | public: |
564 | | // Handle ill-formed UTF-8: the substitute code point is U_SENTINEL or U+FFFD, depending on behavior. |
565 | | U_FORCE_INLINE static CP32 sub() { |
566 | | if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) { |
567 | | return U_SENTINEL; |
568 | | } else { |
569 | | static_assert(behavior == UTF_BEHAVIOR_FFFD); |
570 | | return 0xfffd; |
571 | | } |
572 | | } |
573 | | |
574 | | U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) { |
575 | | // Very similar to U8_FWD_1(). Reads *p before any limit check, so the caller must ensure p != limit. |
576 | | uint8_t b = *p; |
577 | | ++p; |
578 | | if (U8_IS_LEAD(b) && p != limit) { |
579 | | uint8_t t1 = *p; |
580 | | if ((0xe0 <= b && b < 0xf0)) { |
581 | | if (U8_IS_VALID_LEAD3_AND_T1(b, t1) && |
582 | | ++p != limit && U8_IS_TRAIL(*p)) { |
583 | | ++p; |
584 | | } |
585 | | } else if (b < 0xe0) { |
586 | | if (U8_IS_TRAIL(t1)) { |
587 | | ++p; |
588 | | } |
589 | | } else /* b >= 0xf0 */ { |
590 | | if (U8_IS_VALID_LEAD4_AND_T1(b, t1) && |
591 | | ++p != limit && U8_IS_TRAIL(*p) && |
592 | | ++p != limit && U8_IS_TRAIL(*p)) { |
593 | | ++p; |
594 | | } |
595 | | } |
596 | | } |
597 | | } |
598 | | |
599 | | U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) { |
600 | | // Very similar to U8_BACK_1(). Reads *--p before any start check, so the caller must ensure p != start. |
601 | | uint8_t c = *--p; |
602 | | if (U8_IS_TRAIL(c) && p != start) { |
603 | | UnitIter p1 = p; |
604 | | uint8_t b1 = *--p1; |
605 | | if (U8_IS_LEAD(b1)) { |
606 | | if (b1 < 0xe0 || |
607 | | (b1 < 0xf0 ? |
608 | | U8_IS_VALID_LEAD3_AND_T1(b1, c) : |
609 | | U8_IS_VALID_LEAD4_AND_T1(b1, c))) { |
610 | | p = p1; |
611 | | return; |
612 | | } |
613 | | } else if (U8_IS_TRAIL(b1) && p1 != start) { |
614 | | uint8_t b2 = *--p1; |
615 | | if (0xe0 <= b2 && b2 <= 0xf4) { |
616 | | if (b2 < 0xf0 ? |
617 | | U8_IS_VALID_LEAD3_AND_T1(b2, b1) : |
618 | | U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { |
619 | | p = p1; |
620 | | return; |
621 | | } |
622 | | } else if (U8_IS_TRAIL(b2) && p1 != start) { |
623 | | uint8_t b3 = *--p1; |
624 | | if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { |
625 | | p = p1; |
626 | | return; |
627 | | } |
628 | | } |
629 | | } |
630 | | } |
631 | | } |
632 | | |
633 | | U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc( |
634 | | UnitIter &p0, UnitIter &p, const LimitIter &limit) { |
635 | | constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; |
636 | | // Very similar to U8_NEXT_OR_FFFD(). p0 is only used when UnitIter is multi-pass. |
637 | | CP32 c = uint8_t(*p); |
638 | | ++p; |
639 | | if (U8_IS_SINGLE(c)) { |
640 | | if constexpr (isMultiPass) { |
641 | | return {c, 1, true, p0, p}; |
642 | | } else { |
643 | | return {c, 1, true}; |
644 | | } |
645 | | } |
646 | | uint8_t length = 1; |
647 | | uint8_t t = 0; |
648 | | if (p != limit && |
649 | | // fetch/validate/assemble all but the last trail byte |
650 | | (c >= 0xe0 ? |
651 | | (c < 0xf0 ? // 3-byte lead: U+0800..U+FFFF except surrogates |
652 | | U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) && |
653 | | (t &= 0x3f, 1) |
654 | | : // 4-byte lead: U+10000..U+10FFFF |
655 | | (c -= 0xf0) <= 4 && |
656 | | U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) && |
657 | | (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) && |
658 | | (t = *p - 0x80) <= 0x3f) && |
659 | | // valid second-to-last trail byte: append its bits and step to the last trail byte |
660 | | (c = (c << 6) | t, ++length, ++p != limit) |
661 | | : // 2-byte lead: U+0080..U+07FF |
662 | | c >= 0xc2 && (c &= 0x1f, 1)) && |
663 | | // last trail byte must be in 0x80..0xBF |
664 | | (t = *p - 0x80) <= 0x3f) { |
665 | | c = (c << 6) | t; |
666 | | ++length; |
667 | | ++p; |
668 | | if constexpr (isMultiPass) { |
669 | | return {c, length, true, p0, p}; |
670 | | } else { |
671 | | return {c, length, true}; |
672 | | } |
673 | | } |
674 | | if constexpr (isMultiPass) { |
675 | | return {sub(), length, false, p0, p}; |
676 | | } else { |
677 | | return {sub(), length, false}; |
678 | | } |
679 | | } |
680 | | |
681 | | U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) { |
682 | | // Very similar to U8_PREV_OR_FFFD(). Reads *--p before any start check, so the caller must ensure p != start. |
683 | | UnitIter p0 = p; |
684 | | CP32 c = uint8_t(*--p); |
685 | | if (U8_IS_SINGLE(c)) { |
686 | | return {c, 1, true, p, p0}; |
687 | | } |
688 | | if (U8_IS_TRAIL(c) && p != start) { |
689 | | UnitIter p1 = p; |
690 | | uint8_t b1 = *--p1; |
691 | | if (U8_IS_LEAD(b1)) { |
692 | | if (b1 < 0xe0) { |
693 | | p = p1; |
694 | | c = ((b1 - 0xc0) << 6) | (c & 0x3f); |
695 | | return {c, 2, true, p, p0}; |
696 | | } else if (b1 < 0xf0 ? |
697 | | U8_IS_VALID_LEAD3_AND_T1(b1, c) : |
698 | | U8_IS_VALID_LEAD4_AND_T1(b1, c)) { |
699 | | // Truncated 3- or 4-byte sequence: lead byte plus one valid trail byte, reported as ill-formed with length 2. |
700 | | p = p1; |
701 | | return {sub(), 2, false, p, p0}; |
702 | | } |
703 | | } else if (U8_IS_TRAIL(b1) && p1 != start) { |
704 | | // Extract the value bits from the last trail byte. |
705 | | c &= 0x3f; |
706 | | uint8_t b2 = *--p1; |
707 | | if (0xe0 <= b2 && b2 <= 0xf4) { |
708 | | if (b2 < 0xf0) { |
709 | | b2 &= 0xf; |
710 | | if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { |
711 | | p = p1; |
712 | | c = (b2 << 12) | ((b1 & 0x3f) << 6) | c; |
713 | | return {c, 3, true, p, p0}; |
714 | | } |
715 | | } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) { |
716 | | // Truncated 4-byte sequence: lead byte plus two trail bytes, reported as ill-formed with length 3. |
717 | | p = p1; |
718 | | return {sub(), 3, false, p, p0}; |
719 | | } |
720 | | } else if (U8_IS_TRAIL(b2) && p1 != start) { |
721 | | uint8_t b3 = *--p1; |
722 | | if (0xf0 <= b3 && b3 <= 0xf4) { |
723 | | b3 &= 7; |
724 | | if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) { |
725 | | p = p1; |
726 | | c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c; |
727 | | return {c, 4, true, p, p0}; |
728 | | } |
729 | | } |
730 | | } |
731 | | } |
732 | | } |
733 | | return {sub(), 1, false, p, p0}; |
734 | | } |
735 | | }; |
736 | | |
737 | | // UTF-16: selected when the code unit type of UnitIter is two bytes wide. |
738 | | template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter> |
739 | | class UTFImpl< |
740 | | CP32, behavior, |
741 | | UnitIter, LimitIter, |
742 | | std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> { |
743 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
744 | | public: |
745 | | // Handle ill-formed UTF-16: One unpaired surrogate. Returns U_SENTINEL, U+FFFD, or the surrogate itself, depending on behavior. |
746 | 65 | U_FORCE_INLINE static CP32 sub(CP32 surrogate) { |
747 | | if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) { |
748 | | return U_SENTINEL; |
749 | 65 | } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) { |
750 | 65 | return 0xfffd; |
751 | | } else { |
752 | | static_assert(behavior == UTF_BEHAVIOR_SURROGATE); |
753 | | return surrogate; |
754 | | } |
755 | 65 | } |
756 | | |
757 | 0 | U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) { |
758 | | // Very similar to U16_FWD_1(). Reads *p before any limit check, so the caller must ensure p != limit. |
759 | 0 | auto c = *p; |
760 | 0 | ++p; |
761 | 0 | if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) { |
762 | 0 | ++p; |
763 | 0 | } |
764 | 0 | } |
765 | | |
766 | | U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) { |
767 | | // Very similar to U16_BACK_1(). Reads *--p before any start check, so the caller must ensure p != start. |
768 | | UnitIter p1; |
769 | | if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) { |
770 | | p = p1; |
771 | | } |
772 | | } |
773 | | |
774 | | U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc( |
775 | 181 | UnitIter &p0, UnitIter &p, const LimitIter &limit) { |
776 | 181 | constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; |
777 | | // Very similar to U16_NEXT_OR_FFFD(). p0 is only used when UnitIter is multi-pass. |
778 | 181 | CP32 c = static_cast<CP32>(*p); |
779 | 181 | ++p; |
780 | 181 | if (!U16_IS_SURROGATE(c)) { |
781 | 111 | if constexpr (isMultiPass) { |
782 | 111 | return {c, 1, true, p0, p}; |
783 | | } else { |
784 | | return {c, 1, true}; |
785 | | } |
786 | 111 | } else { |
787 | 70 | uint16_t c2; |
788 | 70 | if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) { |
789 | 5 | ++p; |
790 | 5 | c = U16_GET_SUPPLEMENTARY(c, c2); |
791 | 5 | if constexpr (isMultiPass) { |
792 | 5 | return {c, 2, true, p0, p}; |
793 | | } else { |
794 | | return {c, 2, true}; |
795 | | } |
796 | 65 | } else { |
797 | 65 | if constexpr (isMultiPass) { |
798 | 65 | return {sub(c), 1, false, p0, p}; |
799 | | } else { |
800 | | return {sub(c), 1, false}; |
801 | | } |
802 | 65 | } |
803 | 70 | } |
804 | 181 | } |
805 | | |
806 | | U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) { |
807 | | // Very similar to U16_PREV_OR_FFFD(). Reads *--p before any start check, so the caller must ensure p != start. |
808 | | UnitIter p0 = p; |
809 | | CP32 c = static_cast<CP32>(*--p); |
810 | | if (!U16_IS_SURROGATE(c)) { |
811 | | return {c, 1, true, p, p0}; |
812 | | } else { |
813 | | UnitIter p1; |
814 | | uint16_t c2; |
815 | | if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) { |
816 | | p = p1; |
817 | | c = U16_GET_SUPPLEMENTARY(c2, c); |
818 | | return {c, 2, true, p, p0}; |
819 | | } else { |
820 | | return {sub(c), 1, false, p, p0}; |
821 | | } |
822 | | } |
823 | | } |
824 | | }; |
825 | | |
826 | | // UTF-32: trivial, but still validating (surrogates and values above U+10FFFF are ill-formed) |
827 | | template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter> |
828 | | class UTFImpl< |
829 | | CP32, behavior, |
830 | | UnitIter, LimitIter, |
831 | | std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> { |
832 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
833 | | public: |
834 | | // Handle ill-formed UTF-32: with UTF_BEHAVIOR_SURROGATE, a surrogate is returned as is and any other ill-formed value becomes U+FFFD. |
835 | | U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) { |
836 | | if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) { |
837 | | return U_SENTINEL; |
838 | | } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) { |
839 | | return 0xfffd; |
840 | | } else { |
841 | | static_assert(behavior == UTF_BEHAVIOR_SURROGATE); |
842 | | return forSurrogate ? surrogate : 0xfffd; |
843 | | } |
844 | | } |
845 | | |
846 | | U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) { |
847 | | ++p; |
848 | | } |
849 | | |
850 | | U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) { |
851 | | --p; |
852 | | } |
853 | | |
854 | | U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc( |
855 | | UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) { |
856 | | constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; |
857 | | uint32_t uc = *p; |
858 | | CP32 c = uc; |
859 | | ++p; |
860 | | if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) { |
861 | | if constexpr (isMultiPass) { |
862 | | return {c, 1, true, p0, p}; |
863 | | } else { |
864 | | return {c, 1, true}; |
865 | | } |
866 | | } else { |
867 | | if constexpr (isMultiPass) { |
868 | | return {sub(uc < 0xe000, c), 1, false, p0, p}; |
869 | | } else { |
870 | | return {sub(uc < 0xe000, c), 1, false}; |
871 | | } |
872 | | } |
873 | | } |
874 | | |
875 | | U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) { |
876 | | UnitIter p0 = p; |
877 | | uint32_t uc = *--p; |
878 | | CP32 c = uc; |
879 | | if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) { |
880 | | return {c, 1, true, p, p0}; |
881 | | } else { |
882 | | return {sub(uc < 0xe000, c), 1, false, p, p0}; |
883 | | } |
884 | | } |
885 | | }; |
886 | | |
887 | | // Non-validating implementations ------------------------------------------ *** |
888 | | |
889 | | template<typename CP32, typename UnitIter, typename = void> |
890 | | class UnsafeUTFImpl; |
891 | | |
892 | | // UTF-8 (1-byte code units); non-validating: no limit/start checks, sequence lengths are taken from the bytes as found |
893 | | template<typename CP32, typename UnitIter> |
894 | | class UnsafeUTFImpl< |
895 | | CP32, |
896 | | UnitIter, |
897 | | std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> { |
898 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
899 | | public: |
900 | | U_FORCE_INLINE static void inc(UnitIter &p) { |
901 | | // Very similar to U8_FWD_1_UNSAFE(): skips the lead byte plus the number of trail bytes that the lead byte implies. |
902 | | uint8_t b = *p; |
903 | | std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b)); |
904 | | } |
905 | | |
906 | | U_FORCE_INLINE static void dec(UnitIter &p) { |
907 | | // Very similar to U8_BACK_1_UNSAFE(): backs up until the byte at p is not a trail byte. |
908 | | while (U8_IS_TRAIL(*--p)) {} |
909 | | } |
910 | | |
911 | | U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) { |
912 | | constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; |
913 | | // Very similar to U8_NEXT_UNSAFE(): decodes a 1..4-byte sequence whose length is chosen from the lead byte alone. |
914 | | CP32 c = uint8_t(*p); |
915 | | ++p; |
916 | | if (U8_IS_SINGLE(c)) { |
917 | | if constexpr (isMultiPass) { |
918 | | return {c, 1, p0, p}; |
919 | | } else { |
920 | | return {c, 1}; |
921 | | } |
922 | | } else if (c < 0xe0) { |
923 | | c = ((c & 0x1f) << 6) | (*p & 0x3f); |
924 | | ++p; |
925 | | if constexpr (isMultiPass) { |
926 | | return {c, 2, p0, p}; |
927 | | } else { |
928 | | return {c, 2}; |
929 | | } |
930 | | } else if (c < 0xf0) { |
931 | | // No need for (c&0xf) because the upper bits are truncated |
932 | | // after <<12 in the cast to uint16_t. |
933 | | c = uint16_t(c << 12) | ((*p & 0x3f) << 6); |
934 | | ++p; |
935 | | c |= *p & 0x3f; |
936 | | ++p; |
937 | | if constexpr (isMultiPass) { |
938 | | return {c, 3, p0, p}; |
939 | | } else { |
940 | | return {c, 3}; |
941 | | } |
942 | | } else { |
943 | | c = ((c & 7) << 18) | ((*p & 0x3f) << 12); |
944 | | ++p; |
945 | | c |= (*p & 0x3f) << 6; |
946 | | ++p; |
947 | | c |= *p & 0x3f; |
948 | | ++p; |
949 | | if constexpr (isMultiPass) { |
950 | | return {c, 4, p0, p}; |
951 | | } else { |
952 | | return {c, 4}; |
953 | | } |
954 | | } |
955 | | } |
956 | | |
957 | | U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) { |
958 | | // Very similar to U8_PREV_UNSAFE(): reads bytes backwards, adding 6 bits per trail byte, until it reaches a lead byte (>= 0xc0). |
959 | | UnitIter p0 = p; |
960 | | CP32 c = uint8_t(*--p); |
961 | | if (U8_IS_SINGLE(c)) { |
962 | | return {c, 1, p, p0}; |
963 | | } |
964 | | // U8_IS_TRAIL(c) if well-formed |
965 | | c &= 0x3f; |
966 | | uint8_t count = 1; |
967 | | for (uint8_t shift = 6;;) { |
968 | | uint8_t b = *--p; |
969 | | if (b >= 0xc0) { |
970 | | U8_MASK_LEAD_BYTE(b, count); |
971 | | c |= uint32_t{b} << shift; |
972 | | break; |
973 | | } else { |
974 | | c |= (uint32_t{b} & 0x3f) << shift; |
975 | | ++count; |
976 | | shift += 6; |
977 | | } |
978 | | } |
979 | | ++count; |
980 | | return {c, count, p, p0}; |
981 | | } |
982 | | }; |
983 | | |
984 | | // UTF-16 (2-byte code units); non-validating: a lead surrogate is always combined with the unit after it, a trail surrogate with the unit before it |
985 | | template<typename CP32, typename UnitIter> |
986 | | class UnsafeUTFImpl< |
987 | | CP32, |
988 | | UnitIter, |
989 | | std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> { |
990 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
991 | | public: |
992 | | U_FORCE_INLINE static void inc(UnitIter &p) { |
993 | | // Very similar to U16_FWD_1_UNSAFE(): skips one unit, or two if the first one is a lead surrogate. |
994 | | auto c = *p; |
995 | | ++p; |
996 | | if (U16_IS_LEAD(c)) { |
997 | | ++p; |
998 | | } |
999 | | } |
1000 | | |
1001 | | U_FORCE_INLINE static void dec(UnitIter &p) { |
1002 | | // Very similar to U16_BACK_1_UNSAFE(): backs up one unit, or two if the first one is a trail surrogate. |
1003 | | if (U16_IS_TRAIL(*--p)) { |
1004 | | --p; |
1005 | | } |
1006 | | } |
1007 | | |
1008 | | U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) { |
1009 | | constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; |
1010 | | // Very similar to U16_NEXT_UNSAFE(): the unit after a lead surrogate is used as its trail surrogate without being checked. |
1011 | | CP32 c = static_cast<CP32>(*p); |
1012 | | ++p; |
1013 | | if (!U16_IS_LEAD(c)) { |
1014 | | if constexpr (isMultiPass) { |
1015 | | return {c, 1, p0, p}; |
1016 | | } else { |
1017 | | return {c, 1}; |
1018 | | } |
1019 | | } else { |
1020 | | uint16_t c2 = *p; |
1021 | | ++p; |
1022 | | c = U16_GET_SUPPLEMENTARY(c, c2); |
1023 | | if constexpr (isMultiPass) { |
1024 | | return {c, 2, p0, p}; |
1025 | | } else { |
1026 | | return {c, 2}; |
1027 | | } |
1028 | | } |
1029 | | } |
1030 | | |
1031 | | U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) { |
1032 | | // Very similar to U16_PREV_UNSAFE(): the unit before a trail surrogate is used as its lead surrogate without being checked. |
1033 | | UnitIter p0 = p; |
1034 | | CP32 c = static_cast<CP32>(*--p); |
1035 | | if (!U16_IS_TRAIL(c)) { |
1036 | | return {c, 1, p, p0}; |
1037 | | } else { |
1038 | | uint16_t c2 = *--p; |
1039 | | c = U16_GET_SUPPLEMENTARY(c2, c); |
1040 | | return {c, 2, p, p0}; |
1041 | | } |
1042 | | } |
1043 | | }; |
1044 | | |
1045 | | // UTF-32: trivial, each 4-byte code unit is returned as the code point, without any range or surrogate check |
1046 | | template<typename CP32, typename UnitIter> |
1047 | | class UnsafeUTFImpl< |
1048 | | CP32, |
1049 | | UnitIter, |
1050 | | std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> { |
1051 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
1052 | | public: |
1053 | | U_FORCE_INLINE static void inc(UnitIter &p) { |
1054 | | ++p; |
1055 | | } |
1056 | | |
1057 | | U_FORCE_INLINE static void dec(UnitIter &p) { |
1058 | | --p; |
1059 | | } |
1060 | | |
1061 | | U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) { |
1062 | | constexpr bool isMultiPass = prv::forward_iterator<UnitIter>; |
1063 | | CP32 c = *p; |
1064 | | ++p; |
1065 | | if constexpr (isMultiPass) { |
1066 | | return {c, 1, p0, p}; |
1067 | | } else { |
1068 | | return {c, 1}; |
1069 | | } |
1070 | | } |
1071 | | |
1072 | | U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) { |
1073 | | UnitIter p0 = p; |
1074 | | CP32 c = *--p; |
1075 | | return {c, 1, p, p0}; |
1076 | | } |
1077 | | }; |
1078 | | |
1079 | | #endif |
1080 | | |
1081 | | // Validating iterators ---------------------------------------------------- *** |
1082 | | |
1083 | | /** |
1084 | | * Validating iterator over the code points in a Unicode string. |
1085 | | * |
1086 | | * The UnitIter can be |
1087 | | * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer). |
1088 | | * The UTFIterator will have the corresponding iterator_category. |
1089 | | * |
1090 | | * Call utfIterator() to have the compiler deduce the UnitIter and LimitIter types. |
1091 | | * |
1092 | | * For reverse iteration, either use this iterator directly as in <code>*--iter</code> |
1093 | | * or wrap it using std::make_reverse_iterator(iter). |
1094 | | * |
1095 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; |
1096 | | * should be signed if UTF_BEHAVIOR_NEGATIVE |
1097 | | * @tparam behavior How to handle ill-formed Unicode strings |
1098 | | * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: |
1099 | | * UTF-8: char or char8_t or uint8_t; |
1100 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
1101 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
1102 | | * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type. |
1103 | | * @draft ICU 78 |
1104 | | * @see utfIterator |
1105 | | */ |
1106 | | template<typename CP32, UTFIllFormedBehavior behavior, |
1107 | | typename UnitIter, typename LimitIter = UnitIter, typename = void> |
1108 | | class UTFIterator { |
1109 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
1110 | | using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>; |
1111 | | |
1112 | | // Proxy type for operator->() (required by LegacyInputIterator) |
1113 | | // so that we don't promise always returning CodeUnits. The proxy holds its own copy of the CodeUnits. |
1114 | | class Proxy { |
1115 | | public: |
1116 | 297 | explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {} |
1117 | | CodeUnits<CP32, UnitIter> &operator*() { return units_; } |
1118 | 297 | CodeUnits<CP32, UnitIter> *operator->() { return &units_; } |
1119 | | private: |
1120 | | CodeUnits<CP32, UnitIter> units_; |
1121 | | }; |
1122 | | |
1123 | | public: |
1124 | | /** C++ iterator boilerplate @internal */ |
1125 | | using value_type = CodeUnits<CP32, UnitIter>; |
1126 | | /** C++ iterator boilerplate @internal */ |
1127 | | using reference = value_type; |
1128 | | /** C++ iterator boilerplate @internal */ |
1129 | | using pointer = Proxy; |
1130 | | /** C++ iterator boilerplate @internal */ |
1131 | | using difference_type = prv::iter_difference_t<UnitIter>; |
1132 | | /** C++ iterator boilerplate @internal */ |
1133 | | using iterator_category = std::conditional_t< |
1134 | | prv::bidirectional_iterator<UnitIter>, |
1135 | | std::bidirectional_iterator_tag, |
1136 | | std::forward_iterator_tag>; |
1137 | | |
1138 | | /** |
1139 | | * Constructor with start <= p <= limit (an iterator with p == limit must not be dereferenced). |
1140 | | * All of these iterators/pointers should be at code point boundaries. |
1141 | | * Only enabled if UnitIter is a (multi-pass) forward_iterator or better. |
1142 | | * |
1143 | | * When using a code unit sentinel (UnitIter≠LimitIter), |
1144 | | * then that sentinel also works as a sentinel for this code point iterator. |
1145 | | * |
1146 | | * @param start Start of the range |
1147 | | * @param p Initial position inside the range |
1148 | | * @param limit Limit (exclusive end) of the range |
1149 | | * @draft ICU 78 |
1150 | | */ |
1151 | | U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) : |
1152 | 317 | p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {} |
1153 | | /** |
1154 | | * Constructor with start == p <= limit (an iterator with p == limit must not be dereferenced). |
1155 | | * All of these iterators/pointers should be at code point boundaries. |
1156 | | * |
1157 | | * When using a code unit sentinel (UnitIter≠LimitIter), |
1158 | | * then that sentinel also works as a sentinel for this code point iterator. |
1159 | | * |
1160 | | * @param p Start of the range, and the initial position |
1161 | | * @param limit Limit (exclusive end) of the range |
1162 | | * @draft ICU 78 |
1163 | | */ |
1164 | | U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : |
1165 | 201 | p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {} |
1166 | | /** |
1167 | | * Constructs an iterator start or limit sentinel. |
1168 | | * The iterator/pointer should be at a code point boundary. |
1169 | | * Requires UnitIter to be copyable. |
1170 | | * |
1171 | | * When using a code unit sentinel (UnitIter≠LimitIter), |
1172 | | * then that sentinel also works as a sentinel for this code point iterator. |
1173 | | * |
1174 | | * @param p Range start or limit |
1175 | | * @draft ICU 78 |
1176 | | */ |
1177 | | U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {} |
1178 | | /** |
1179 | | * Default constructor. Makes a non-functional iterator. |
1180 | | * |
1181 | | * @draft ICU 78 |
1182 | | */ |
1183 | | U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {} |
1184 | | |
1185 | | /** Move constructor. @draft ICU 78 */ |
1186 | | U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default; |
1187 | | /** Move assignment operator. @draft ICU 78 */ |
1188 | | U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default; |
1189 | | |
1190 | | /** Copy constructor. @draft ICU 78 */ |
1191 | | U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default; |
1192 | | /** Copy assignment operator. @draft ICU 78 */ |
1193 | | U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default; |
1194 | | |
1195 | | /** |
1196 | | * @param other Another iterator |
1197 | | * @return true if this iterator is at the same position as the other one |
1198 | | * @draft ICU 78 |
1199 | | */ |
1200 | 317 | U_FORCE_INLINE bool operator==(const UTFIterator &other) const { |
1201 | 317 | return base() == other.base(); |
1202 | 317 | } |
1203 | | /** |
1204 | | * @param other Another iterator |
1205 | | * @return true if this iterator is not at the same position as the other one |
1206 | | * @draft ICU 78 |
1207 | | */ |
1208 | 116 | U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); } |
1209 | | |
1210 | | // Asymmetric equality & nonequality with a sentinel type. |
1211 | | |
1212 | | /** |
1213 | | * @param iter A UTFIterator |
1214 | | * @param s A unit iterator sentinel |
1215 | | * @return true if the iterator’s position is equal to the sentinel |
1216 | | * @draft ICU 78 |
1217 | | */ |
1218 | | template<typename Sentinel> U_FORCE_INLINE friend |
1219 | | std::enable_if_t< |
1220 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1221 | | bool> |
1222 | | operator==(const UTFIterator &iter, const Sentinel &s) { |
1223 | | return iter.base() == s; |
1224 | | } |
1225 | | |
1226 | | #if U_CPLUSPLUS_VERSION < 20 |
1227 | | // C++17: Need to define all four combinations of == / != vs. parameter order. |
1228 | | // Once we require C++20, we could remove all but the first == because |
1229 | | // the compiler would generate the rest. |
1230 | | |
1231 | | /** |
1232 | | * @param s A unit iterator sentinel |
1233 | | * @param iter A UTFIterator |
1234 | | * @return true if the iterator’s position is equal to the sentinel |
1235 | | * @internal |
1236 | | */ |
1237 | | template<typename Sentinel> U_FORCE_INLINE friend |
1238 | | std::enable_if_t< |
1239 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1240 | | bool> |
1241 | | operator==(const Sentinel &s, const UTFIterator &iter) { |
1242 | | return iter.base() == s; |
1243 | | } |
1244 | | /** |
1245 | | * @param iter A UTFIterator |
1246 | | * @param s A unit iterator sentinel |
1247 | | * @return true if the iterator’s position is not equal to the sentinel |
1248 | | * @internal |
1249 | | */ |
1250 | | template<typename Sentinel> U_FORCE_INLINE friend |
1251 | | std::enable_if_t< |
1252 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1253 | | bool> |
1254 | | operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); } |
1255 | | /** |
1256 | | * @param s A unit iterator sentinel |
1257 | | * @param iter A UTFIterator |
1258 | | * @return true if the iterator’s position is not equal to the sentinel |
1259 | | * @internal |
1260 | | */ |
1261 | | template<typename Sentinel> U_FORCE_INLINE friend |
1262 | | std::enable_if_t< |
1263 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1264 | | bool> |
1265 | | operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); } |
1266 | | #endif // C++17 |
1267 | | |
1268 | | /** |
1269 | | * Returns the current position as a code unit iterator. |
1270 | | * Similar to iter->begin() but also works at the exclusive end(). |
1271 | | * |
1272 | | * @return current position as a code unit iterator |
1273 | | * @draft ICU 79 |
1274 | | */ |
1275 | 634 | U_FORCE_INLINE UnitIter base() const { |
1276 | | // Return the logical position: p_, unless operator*() already read ahead (state_ > 0), in which case it is the start of units_. |
1277 | 634 | return state_ <= 0 ? p_ : units_.begin(); |
1278 | 634 | } |
1279 | | |
1280 | | /** |
1281 | | * Decodes the code unit sequence at the current position. The result is cached so that a following operator++() does not decode again. |
1282 | | * |
1283 | | * @return CodeUnits with the decoded code point etc. |
1284 | | * @draft ICU 78 |
1285 | | */ |
1286 | | U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const { |
1287 | | if (state_ == 0) { |
1288 | | UnitIter p0 = p_; |
1289 | | units_ = Impl::readAndInc(p0, p_, limit_); |
1290 | | state_ = 1; |
1291 | | } |
1292 | | return units_; |
1293 | | } |
1294 | | |
1295 | | /** |
1296 | | * Decodes the code unit sequence at the current position. |
1297 | | * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc. |
1298 | | * |
1299 | | * @return CodeUnits with the decoded code point etc., wrapped into |
1300 | | * an opaque proxy object so that <code>iter->codePoint()</code> etc. works. |
1301 | | * @draft ICU 78 |
1302 | | */ |
1303 | 297 | U_FORCE_INLINE Proxy operator->() const { |
1304 | 297 | if (state_ == 0) { |
1305 | 181 | UnitIter p0 = p_; |
1306 | 181 | units_ = Impl::readAndInc(p0, p_, limit_); |
1307 | 181 | state_ = 1; |
1308 | 181 | } |
1309 | 297 | return Proxy(units_); |
1310 | 297 | } |
1311 | | |
1312 | | /** |
1313 | | * Pre-increment operator. |
1314 | | * |
1315 | | * @return this iterator |
1316 | | * @draft ICU 78 |
1317 | | */ |
1318 | 116 | U_FORCE_INLINE UTFIterator &operator++() { // pre-increment |
1319 | 116 | if (state_ > 0) { |
1320 | | // operator*() called readAndInc() so p_ is already ahead. |
1321 | 116 | state_ = 0; |
1322 | 116 | } else if (state_ == 0) { |
1323 | 0 | Impl::inc(p_, limit_); |
1324 | 0 | } else /* state_ < 0 */ { |
1325 | | // operator--() called decAndRead() so we know how far to skip. |
1326 | 0 | p_ = units_.end(); |
1327 | 0 | state_ = 0; |
1328 | 0 | } |
1329 | 116 | return *this; |
1330 | 116 | } |
1331 | | |
1332 | | /** |
1333 | | * Post-increment operator. |
1334 | | * |
1335 | | * @return a copy of this iterator from before the increment. |
1336 | | * If UnitIter is a single-pass input_iterator, then this function |
1337 | | * returns an opaque proxy object so that <code>*iter++</code> still works. |
1338 | | * @draft ICU 78 |
1339 | | */ |
1340 | | U_FORCE_INLINE UTFIterator operator++(int) { // post-increment |
1341 | | if (state_ > 0) { |
1342 | | // operator*() called readAndInc() so p_ is already ahead. |
1343 | | UTFIterator result(*this); |
1344 | | state_ = 0; |
1345 | | return result; |
1346 | | } else if (state_ == 0) { |
1347 | | UnitIter p0 = p_; |
1348 | | units_ = Impl::readAndInc(p0, p_, limit_); |
1349 | | UTFIterator result(*this); |
1350 | | result.state_ = 1; |
1351 | | // keep this->state_ == 0 |
1352 | | return result; |
1353 | | } else /* state_ < 0 */ { |
1354 | | UTFIterator result(*this); |
1355 | | // operator--() called decAndRead() so we know how far to skip. |
1356 | | p_ = units_.end(); |
1357 | | state_ = 0; |
1358 | | return result; |
1359 | | } |
1360 | | } |
1361 | | |
1362 | | /** |
1363 | | * Pre-decrement operator. |
1364 | | * Only enabled if UnitIter is a bidirectional_iterator (including a pointer). |
1365 | | * |
1366 | | * @return this iterator |
1367 | | * @draft ICU 78 |
1368 | | */ |
1369 | | template<typename Iter = UnitIter> |
1370 | | U_FORCE_INLINE |
1371 | | std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &> |
1372 | | operator--() { // pre-decrement |
1373 | | if (state_ > 0) { |
1374 | | // operator*() called readAndInc() so p_ is ahead of the logical position. |
1375 | | p_ = units_.begin(); |
1376 | | } |
1377 | | units_ = Impl::decAndRead(start_, p_); |
1378 | | state_ = -1; |
1379 | | return *this; |
1380 | | } |
1381 | | |
1382 | | /** |
1383 | | * Post-decrement operator. |
1384 | | * Only enabled if UnitIter is a bidirectional_iterator (including a pointer). |
1385 | | * |
1386 | | * @return a copy of this iterator from before the decrement. |
1387 | | * @draft ICU 78 |
1388 | | */ |
1389 | | template<typename Iter = UnitIter> |
1390 | | U_FORCE_INLINE |
1391 | | std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator> |
1392 | | operator--(int) { // post-decrement |
1393 | | UTFIterator result(*this); |
1394 | | operator--(); |
1395 | | return result; |
1396 | | } |
1397 | | |
1398 | | private: |
1399 | | friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>; |
1400 | | |
1401 | | // operator*() etc. are logically const, but they cache the decoded units_ and move p_ ahead, hence the mutable members. |
1402 | | mutable UnitIter p_; |
1403 | | // In a validating iterator, we need start_ & limit_ so that when we read a code point |
1404 | | // (forward or backward) we can test if there are enough code units. |
1405 | | UnitIter start_; |
1406 | | LimitIter limit_; |
1407 | | // Keep state so that we call readAndInc() only once for both operator*() and ++ |
1408 | | // to make it easy for the compiler to optimize. |
1409 | | mutable CodeUnits<CP32, UnitIter> units_; |
1410 | | // >0: units_ = readAndInc(), p_ = units limit |
1411 | | // which means that p_ is ahead of its logical position |
1412 | | // 0: initial state |
1413 | | // <0: units_ = decAndRead(), p_ = units start |
1414 | | mutable int8_t state_ = 0; |
1415 | | }; |
1416 | | |
1417 | | #ifndef U_IN_DOXYGEN |
1418 | | // Partial template specialization for single-pass input iterator: forward-only, keeps no start_, and its CodeUnits is built without unit iterators. |
1419 | | template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter> |
1420 | | class UTFIterator< |
1421 | | CP32, behavior, |
1422 | | UnitIter, LimitIter, |
1423 | | std::enable_if_t<!prv::forward_iterator<UnitIter>>> { |
1424 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
1425 | | using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>; |
1426 | | |
1427 | | // Proxy type for post-increment return value, to make *iter++ work. |
1428 | | // Also for operator->() (required by LegacyInputIterator) |
1429 | | // so that we don't promise always returning CodeUnits. The proxy holds its own copy of the CodeUnits. |
1430 | | class Proxy { |
1431 | | public: |
1432 | | explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {} |
1433 | | CodeUnits<CP32, UnitIter> &operator*() { return units_; } |
1434 | | CodeUnits<CP32, UnitIter> *operator->() { return &units_; } |
1435 | | private: |
1436 | | CodeUnits<CP32, UnitIter> units_; |
1437 | | }; |
1438 | | |
1439 | | public: |
1440 | | using value_type = CodeUnits<CP32, UnitIter>; |
1441 | | using reference = value_type; |
1442 | | using pointer = Proxy; |
1443 | | using difference_type = prv::iter_difference_t<UnitIter>; |
1444 | | using iterator_category = std::input_iterator_tag; |
1445 | | |
1446 | | U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {} |
1447 | | |
1448 | | // Constructs an iterator start or limit sentinel. |
1449 | | // Requires p to be copyable, because limit_ is initialized as a copy of p_. |
1450 | | U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {} |
1451 | | |
1452 | | U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default; |
1453 | | U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default; |
1454 | | |
1455 | | U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default; |
1456 | | U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default; |
1457 | | |
1458 | | U_FORCE_INLINE bool operator==(const UTFIterator &other) const { |
1459 | | return p_ == other.p_ && ahead_ == other.ahead_; |
1460 | | // Strictly speaking, we should check if the logical position is the same. |
1461 | | // However, we cannot advance, or do arithmetic with, a single-pass UnitIter. |
1462 | | } |
1463 | | U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); } |
1464 | | |
1465 | | template<typename Sentinel> U_FORCE_INLINE friend |
1466 | | std::enable_if_t< |
1467 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1468 | | bool> |
1469 | | operator==(const UTFIterator &iter, const Sentinel &s) { |
1470 | | return !iter.ahead_ && iter.p_ == s; |
1471 | | } |
1472 | | |
1473 | | #if U_CPLUSPLUS_VERSION < 20 |
1474 | | template<typename Sentinel> U_FORCE_INLINE friend |
1475 | | std::enable_if_t< |
1476 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1477 | | bool> |
1478 | | operator==(const Sentinel &s, const UTFIterator &iter) { |
1479 | | return !iter.ahead_ && iter.p_ == s; |
1480 | | } |
1481 | | |
1482 | | template<typename Sentinel> U_FORCE_INLINE friend |
1483 | | std::enable_if_t< |
1484 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1485 | | bool> |
1486 | | operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); } |
1487 | | |
1488 | | template<typename Sentinel> U_FORCE_INLINE friend |
1489 | | std::enable_if_t< |
1490 | | !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
1491 | | bool> |
1492 | | operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); } |
1493 | | #endif // C++17 |
1494 | | |
1495 | | U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const { |
1496 | | if (!ahead_) { |
1497 | | units_ = Impl::readAndInc(p_, p_, limit_); |
1498 | | ahead_ = true; |
1499 | | } |
1500 | | return units_; |
1501 | | } |
1502 | | |
1503 | | U_FORCE_INLINE Proxy operator->() const { |
1504 | | if (!ahead_) { |
1505 | | units_ = Impl::readAndInc(p_, p_, limit_); |
1506 | | ahead_ = true; |
1507 | | } |
1508 | | return Proxy(units_); |
1509 | | } |
1510 | | |
1511 | | U_FORCE_INLINE UTFIterator &operator++() { // pre-increment |
1512 | | if (ahead_) { |
1513 | | // operator*() called readAndInc() so p_ is already ahead. |
1514 | | ahead_ = false; |
1515 | | } else { |
1516 | | Impl::inc(p_, limit_); |
1517 | | } |
1518 | | return *this; |
1519 | | } |
1520 | | |
1521 | | U_FORCE_INLINE Proxy operator++(int) { // post-increment |
1522 | | if (ahead_) { |
1523 | | // operator*() called readAndInc() so p_ is already ahead. |
1524 | | ahead_ = false; |
1525 | | } else { |
1526 | | units_ = Impl::readAndInc(p_, p_, limit_); |
1527 | | // keep this->ahead_ == false |
1528 | | } |
1529 | | return Proxy(units_); |
1530 | | } |
1531 | | |
1532 | | private: |
1533 | | // operator*() etc. are logically const, but they cache the decoded units_ and move p_ ahead, hence the mutable members. |
1534 | | mutable UnitIter p_; |
1535 | | // In a validating iterator, we need limit_ so that when we read a code point |
1536 | | // we can test if there are enough code units. |
1537 | | LimitIter limit_; |
1538 | | // Keep state so that we call readAndInc() only once for both operator*() and ++ |
1539 | | // so that we can use a single-pass input iterator for UnitIter. |
1540 | | mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false}; |
1541 | | // true: units_ = readAndInc(), p_ = units limit |
1542 | | // which means that p_ is ahead of its logical position |
1543 | | // false: initial state |
1544 | | mutable bool ahead_ = false; |
1545 | | }; |
1546 | | #endif // U_IN_DOXYGEN |
1547 | | |
1548 | | } // namespace U_HEADER_ONLY_NAMESPACE |
1549 | | |
1550 | | #ifndef U_IN_DOXYGEN |
1551 | | // Bespoke specialization of reverse_iterator. |
1552 | | // The default implementation implements reverse operator*() and ++ in a way |
1553 | | // that does most of the same work twice for reading variable-length sequences. |
1554 | | template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter> |
1555 | | class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> { |
1556 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
1557 | | using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>; |
1558 | | using CodeUnits_ = U_HEADER_ONLY_NAMESPACE::CodeUnits<CP32, UnitIter>; |
1559 | | |
1560 | | // Proxy type for operator->() (required by LegacyInputIterator) |
1561 | | // so that we don't promise always returning CodeUnits. |
1562 | | class Proxy { |
1563 | | public: |
1564 | | explicit Proxy(CodeUnits_ units) : units_(units) {} |
1565 | | CodeUnits_ &operator*() { return units_; } |
1566 | | CodeUnits_ *operator->() { return &units_; } |
1567 | | private: |
1568 | | CodeUnits_ units_; |
1569 | | }; |
1570 | | |
1571 | | public: |
1572 | | using value_type = CodeUnits_; |
1573 | | using reference = value_type; |
1574 | | using pointer = Proxy; |
1575 | | using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>; |
1576 | | using iterator_category = std::bidirectional_iterator_tag; |
1577 | | |
1578 | | U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> iter) : |
1579 | | p_(iter.base()), start_(iter.start_), limit_(iter.limit_), |
1580 | | units_(0, 0, false, p_, p_) {} |
1581 | | U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {} |
1582 | | |
1583 | | U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default; |
1584 | | U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default; |
1585 | | |
1586 | | U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default; |
1587 | | U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default; |
1588 | | |
1589 | | U_FORCE_INLINE bool operator==(const reverse_iterator &other) const { |
1590 | | return getLogicalPosition() == other.getLogicalPosition(); |
1591 | | } |
1592 | | U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); } |
1593 | | |
1594 | | U_FORCE_INLINE U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> base() const { |
1595 | | return U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>( |
1596 | | start_, getLogicalPosition(), limit_); |
1597 | | } |
1598 | | |
1599 | | U_FORCE_INLINE CodeUnits_ operator*() const { |
1600 | | if (state_ == 0) { |
1601 | | units_ = Impl::decAndRead(start_, p_); |
1602 | | state_ = -1; |
1603 | | } |
1604 | | return units_; |
1605 | | } |
1606 | | |
1607 | | U_FORCE_INLINE Proxy operator->() const { |
1608 | | if (state_ == 0) { |
1609 | | units_ = Impl::decAndRead(start_, p_); |
1610 | | state_ = -1; |
1611 | | } |
1612 | | return Proxy(units_); |
1613 | | } |
1614 | | |
1615 | | U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment: moves one code point toward start_ |
1616 | | if (state_ < 0) { |
1617 | | // operator*() called decAndRead() so p_ is already behind: only the state needs resetting. |
1618 | | state_ = 0; |
1619 | | } else if (state_ == 0) { |
1620 | | Impl::dec(start_, p_); /* Nothing cached: step p_ back over one sequence; units_ is not updated. */ |
1621 | | } else /* state_ > 0 */ { |
1622 | | // operator--() called readAndInc() so p_ is at the units limit; the cached units_ tells us how far to skip. |
1623 | | p_ = units_.begin(); |
1624 | | state_ = 0; |
1625 | | } |
1626 | | return *this; |
1627 | | } |
1628 | | |
1629 | | U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment: returns a copy from before the step |
1630 | | if (state_ < 0) { |
1631 | | // operator*() called decAndRead() so p_ is already behind. |
1632 | | reverse_iterator result(*this); /* The copy keeps state_ < 0 and the cached units_. */ |
1633 | | state_ = 0; |
1634 | | return result; |
1635 | | } else if (state_ == 0) { |
1636 | | units_ = Impl::decAndRead(start_, p_); /* Decode now so that the returned copy carries the sequence at the old position. */ |
1637 | | reverse_iterator result(*this); |
1638 | | result.state_ = -1; |
1639 | | // keep this->state_ == 0: p_ is now the new logical position of *this |
1640 | | return result; |
1641 | | } else /* state_ > 0 */ { |
1642 | | reverse_iterator result(*this); |
1643 | | // operator--() called readAndInc() so we know how far to skip. |
1644 | | p_ = units_.begin(); |
1645 | | state_ = 0; |
1646 | | return result; |
1647 | | } |
1648 | | } |
1649 | | |
1650 | | U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement: moves one code point toward limit_ |
1651 | | if (state_ < 0) { |
1652 | | // operator*() called decAndRead() so p_ is behind the logical position. |
1653 | | p_ = units_.end(); /* Restore p_ to the logical position before reading forward. */ |
1654 | | } |
1655 | | UnitIter p0 = p_; /* Presumably the start of the sequence to read, while p_ itself gets advanced. */ |
1656 | | units_ = Impl::readAndInc(p0, p_, limit_); |
1657 | | state_ = 1; /* p_ is now at the units limit, which getLogicalPosition() reports as the logical position. */ |
1658 | | return *this; |
1659 | | } |
1660 | | |
1661 | | U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement: returns a copy from before the step |
1662 | | reverse_iterator result(*this); /* Snapshot including the cached units_ and state_. */ |
1663 | | operator--(); /* Delegate the actual movement to the pre-decrement operator. */ |
1664 | | return result; |
1665 | | } |
1666 | | |
1667 | | private: |
1668 | | U_FORCE_INLINE UnitIter getLogicalPosition() const { /* Returns the code unit position that this iterator denotes, regardless of where p_ currently is. */ |
1669 | | return state_ >= 0 ? p_ : units_.end(); /* state_ < 0: decAndRead() left p_ at the units start, so the logical position is the units end. */ |
1670 | | } |
1671 | | |
1672 | | // operator*() etc. are logically const. |
1673 | | mutable UnitIter p_; |
1674 | | // In a validating iterator, we need start_ & limit_ so that when we read a code point |
1675 | | // (forward or backward) we can test if there are enough code units. |
1676 | | UnitIter start_; |
1677 | | UnitIter limit_; |
1678 | | // Keep state so that we call decAndRead() only once for both operator*() and ++ |
1679 | | // to make it easy for the compiler to optimize. |
1680 | | mutable CodeUnits_ units_; |
1681 | | // >0: units_ = readAndInc(), p_ = units limit |
1682 | | // 0: initial state |
1683 | | // <0: units_ = decAndRead(), p_ = units start |
1684 | | // which means that p_ is behind its logical position |
1685 | | mutable int8_t state_ = 0; |
1686 | | }; |
1687 | | #endif // U_IN_DOXYGEN |
1688 | | |
1689 | | namespace U_HEADER_ONLY_NAMESPACE { |
1690 | | |
1691 | | /** |
1692 | | * UTFIterator factory function for start <= p <= limit (UTFStringCodePoints::end() passes p == limit). |
1693 | | * Deduces the UnitIter and LimitIter template parameters from the inputs. |
1694 | | * Only usable if UnitIter is a (multi-pass) forward_iterator or better. |
1695 | | * |
1696 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
1697 | | * @tparam behavior How to handle ill-formed Unicode strings |
1698 | | * @tparam UnitIter Can usually be omitted/deduced: |
1699 | | * An iterator (often a pointer) that returns a code unit type: |
1700 | | * UTF-8: char or char8_t or uint8_t; |
1701 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
1702 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
1703 | | * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type. |
1704 | | * @param start start code unit iterator |
1705 | | * @param p current-position code unit iterator |
1706 | | * @param limit limit (exclusive-end) code unit iterator. |
1707 | | * When using a code unit sentinel (UnitIter≠LimitIter), |
1708 | | * then that sentinel also works as a sentinel for the code point iterator. |
1709 | | * @return a UTFIterator<CP32, behavior, UnitIter, LimitIter> |
1710 | | * for the given code unit iterators or character pointers |
1711 | | * @draft ICU 78 |
1712 | | */ |
1713 | | template<typename CP32, UTFIllFormedBehavior behavior, |
1714 | | typename UnitIter, typename LimitIter = UnitIter> |
1715 | 317 | auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) { |
1716 | 317 | return UTFIterator<CP32, behavior, UnitIter, LimitIter>( |
1717 | 317 | std::move(start), std::move(p), std::move(limit)); /* The by-value parameters are moved into the constructor. */ |
1718 | 317 | } |
1719 | | |
1720 | | /** |
1721 | | * UTFIterator factory function for start = p <= limit (p == limit yields an iterator at the end). |
1722 | | * Deduces the UnitIter and LimitIter template parameters from the inputs. |
1723 | | * |
1724 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
1725 | | * @tparam behavior How to handle ill-formed Unicode strings |
1726 | | * @tparam UnitIter Can usually be omitted/deduced: |
1727 | | * An iterator (often a pointer) that returns a code unit type: |
1728 | | * UTF-8: char or char8_t or uint8_t; |
1729 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
1730 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
1731 | | * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type. |
1732 | | * @param p start and current-position code unit iterator |
1733 | | * @param limit limit (exclusive-end) code unit iterator. |
1734 | | * When using a code unit sentinel (UnitIter≠LimitIter), |
1735 | | * then that sentinel also works as a sentinel for the code point iterator. |
1736 | | * @return a UTFIterator<CP32, behavior, UnitIter, LimitIter> |
1737 | | * for the given code unit iterators or character pointers |
1738 | | * @draft ICU 78 |
1739 | | */ |
1740 | | template<typename CP32, UTFIllFormedBehavior behavior, |
1741 | | typename UnitIter, typename LimitIter = UnitIter> |
1742 | 201 | auto utfIterator(UnitIter p, LimitIter limit) { |
1743 | 201 | return UTFIterator<CP32, behavior, UnitIter, LimitIter>( |
1744 | 201 | std::move(p), std::move(limit)); /* The by-value parameters are moved into the constructor. */ |
1745 | 201 | } |
1746 | | |
1747 | | // Note: We should only enable the following factory function for a copyable UnitIter. |
1748 | | // In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator, |
1749 | | // but a function template partial specialization is not allowed. |
1750 | | // In C++20, we might be able to require the std::copyable concept. |
1751 | | |
1752 | | /** |
1753 | | * UTFIterator factory function for a start or limit sentinel. |
1754 | | * Deduces the UnitIter template parameter from the input. |
1755 | | * Requires UnitIter to be copyable (this requirement is documented only; the template itself does not enforce it). |
1756 | | * |
1757 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
1758 | | * @tparam behavior How to handle ill-formed Unicode strings |
1759 | | * @tparam UnitIter Can usually be omitted/deduced: |
1760 | | * An iterator (often a pointer) that returns a code unit type: |
1761 | | * UTF-8: char or char8_t or uint8_t; |
1762 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
1763 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
1764 | | * @param p code unit iterator. |
1765 | | * When using a code unit sentinel, |
1766 | | * then that sentinel also works as a sentinel for the code point iterator. |
1767 | | * @return a UTFIterator<CP32, behavior, UnitIter> |
1768 | | * for the given code unit iterator or character pointer |
1769 | | * @draft ICU 78 |
1770 | | */ |
1771 | | template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter> |
1772 | | auto utfIterator(UnitIter p) { |
1773 | | return UTFIterator<CP32, behavior, UnitIter>(std::move(p)); /* Uses the single-argument UTFIterator constructor. */ |
1774 | | } |
1775 | | |
1776 | | /** |
1777 | | * A C++ "range" for validating iteration over all of the code points of a code unit range. |
1778 | | * |
1779 | | * Call utfStringCodePoints() to have the compiler deduce the Range type. |
1780 | | * |
1781 | | * UTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range |
1782 | | * so is UTFStringCodePoints<CP32, behavior, Range>. |
1783 | | * Note that when given a range r that is an lvalue and is not a view, utfStringCodePoints(r) uses a |
1784 | | * ref_view of r as the Range type, which is a borrowed range. |
1785 | | * In practice, this means that given a container variable r, the iterators of utfStringCodePoints(r) can |
1786 | | * be used as long as iterators on r are valid, without having to keep utfStringCodePoints(r) around. |
1787 | | * For instance: |
1788 | | * \code |
1789 | | * std::u8string s = "𒇧𒇧"; |
1790 | | * // it outlives utfStringCodePoints<char32_t>(s). |
1791 | | * auto it = utfStringCodePoints<char32_t>(s).begin(); |
1792 | | * ++it; |
1793 | | * char32_t second_code_point = it->codePoint(); // OK. |
1794 | | * \endcode |
1795 | | * |
1796 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; |
1797 | | * should be signed if UTF_BEHAVIOR_NEGATIVE |
1798 | | * @tparam behavior How to handle ill-formed Unicode strings |
1799 | | * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units |
1800 | | * @draft ICU 78 |
1801 | | * @see utfStringCodePoints |
1802 | | */ |
1803 | | template<typename CP32, UTFIllFormedBehavior behavior, typename Range> |
1804 | | #if defined(__cpp_lib_ranges) |
1805 | | requires std::ranges::range<Range> |
1806 | | #endif |
1807 | | class UTFStringCodePoints { |
1808 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
1809 | | public: |
1810 | | /** |
1811 | | * Constructs an empty C++ "range" object. |
1812 | | * @draft ICU 78 |
1813 | | */ |
1814 | | UTFStringCodePoints() = default; |
1815 | | |
1816 | | /** |
1817 | | * Constructs a C++ "range" object over the code points in the string. |
1818 | | * @param unitRange input range |
1819 | | * @draft ICU 78 |
1820 | | */ |
1821 | | template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>> |
1822 | 201 | explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {} |
1823 | | /** |
1824 | | * Constructs a C++ "range" object over the code points in the string, |
1825 | | * keeping a reference to the code unit range. This overload is used by |
1826 | | * utfStringCodePoints in C++17; in C+20, a ref_view is used instead (via |
1827 | | * views::all). |
1828 | | * @param unitRange input range |
1829 | | * @draft ICU 78 |
1830 | | */ |
1831 | | template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void> |
1832 | | explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {} |
1833 | | |
1834 | | /** Copy constructor. @draft ICU 78 */ |
1835 | | UTFStringCodePoints(const UTFStringCodePoints &other) = default; |
1836 | | |
1837 | | /** Copy assignment operator. @draft ICU 78 */ |
1838 | | UTFStringCodePoints &operator=(const UTFStringCodePoints &other) = default; |
1839 | | |
1840 | | /** |
1841 | | * @return the range start iterator |
1842 | | * @draft ICU 78 |
1843 | | */ |
1844 | | auto begin() { |
1845 | | return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end()); |
1846 | | } |
1847 | | |
1848 | | /** |
1849 | | * @return the range start iterator |
1850 | | * @draft ICU 78 |
1851 | | */ |
1852 | | template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>> |
1853 | 201 | auto begin() const { |
1854 | 201 | return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end()); |
1855 | 201 | } |
1856 | | |
1857 | | /** |
1858 | | * @return the range limit (exclusive end) iterator |
1859 | | * @draft ICU 78 |
1860 | | */ |
1861 | | auto end() { |
1862 | | using UnitIter = decltype(unitRange.begin()); |
1863 | | using LimitIter = decltype(unitRange.end()); |
1864 | | if constexpr (!std::is_same_v<UnitIter, LimitIter>) { |
1865 | | // Return the code unit sentinel. |
1866 | | return unitRange.end(); |
1867 | | } else if constexpr (prv::bidirectional_iterator<UnitIter>) { |
1868 | | return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end()); |
1869 | | } else { |
1870 | | // The input iterator specialization has no three-argument constructor. |
1871 | | return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end()); |
1872 | | } |
1873 | | } |
1874 | | |
1875 | | /** |
1876 | | * @return the range limit (exclusive end) iterator |
1877 | | * @draft ICU 78 |
1878 | | */ |
1879 | | template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>> |
1880 | 317 | auto end() const { |
1881 | 317 | using UnitIter = decltype(unitRange.begin()); |
1882 | 317 | using LimitIter = decltype(unitRange.end()); |
1883 | | if constexpr (!std::is_same_v<UnitIter, LimitIter>) { |
1884 | | // Return the code unit sentinel. |
1885 | | return unitRange.end(); |
1886 | 317 | } else if constexpr (prv::bidirectional_iterator<UnitIter>) { |
1887 | 317 | return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end()); |
1888 | | } else { |
1889 | | // The input iterator specialization has no three-argument constructor. |
1890 | | return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end()); |
1891 | | } |
1892 | 317 | } |
1893 | | |
1894 | | /** |
1895 | | * @return std::reverse_iterator(end()) |
1896 | | * @draft ICU 78 |
1897 | | */ |
1898 | | auto rbegin() const { |
1899 | | return std::make_reverse_iterator(end()); |
1900 | | } |
1901 | | |
1902 | | /** |
1903 | | * @return std::reverse_iterator(begin()) |
1904 | | * @draft ICU 78 |
1905 | | */ |
1906 | | auto rend() const { |
1907 | | return std::make_reverse_iterator(begin()); |
1908 | | } |
1909 | | |
1910 | | /** |
1911 | | * Returns the CodeUnits for the first character/code point. |
1912 | | * Requires that the range is not empty. |
1913 | | * |
1914 | | * @return the CodeUnits for the first character/code point. |
1915 | | * @draft ICU 79 |
1916 | | */ |
1917 | | auto front() const { |
1918 | | return *begin(); |
1919 | | } |
1920 | | |
1921 | | /** |
1922 | | * Returns the CodeUnits for the last character/code point. |
1923 | | * Requires that the range is not empty. |
1924 | | * |
1925 | | * @return the CodeUnits for the last character/code point. |
1926 | | * @draft ICU 79 |
1927 | | */ |
1928 | | auto back() const { |
1929 | | return *(--end()); |
1930 | | } |
1931 | | |
1932 | | private: |
1933 | | Range unitRange; |
1934 | | }; |
1935 | | |
1936 | | /** Function object type of utfStringCodePoints; derives from std::ranges::range_adaptor_closure when the C++23 library support tested below is available. @internal */ |
1937 | | template<typename CP32, UTFIllFormedBehavior behavior> |
1938 | | struct UTFStringCodePointsAdaptor |
1939 | | #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \ |
1940 | | __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3. |
1941 | | : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>> |
1942 | | #endif |
1943 | | { |
1944 | | /** Wraps unitRange in a UTFStringCodePoints; the stored Range type depends on the value category and type of the argument. @internal */ |
1945 | | template<typename Range> |
1946 | 201 | auto operator()(Range &&unitRange) const { |
1947 | | #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2. |
1948 | | return UTFStringCodePoints<CP32, behavior, std::ranges::views::all_t<Range>>( |
1949 | | std::forward<Range>(unitRange)); |
1950 | | #else |
1951 | 201 | if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) { |
1952 | | // Take basic_string_view by copy, not by reference. In C++20 this is handled by |
1953 | | // all_t<Range>, which is Range if Range is a view. |
1954 | 201 | return UTFStringCodePoints<CP32, behavior, std::decay_t<Range>>( |
1955 | 201 | std::forward<Range>(unitRange)); |
1956 | | } else { |
1957 | | return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange)); /* Range is T& for an lvalue argument (kept by reference) and T for an rvalue (moved in). */ |
1958 | | } |
1959 | 201 | #endif |
1960 | 201 | } |
1961 | | }; |
1962 | | |
1963 | | /** |
1964 | | * Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code |
1965 | | * points in a code unit range, which validates while decoding. |
1966 | | * Deduces the Range template parameter from the input, taking into account the value category: the |
1967 | | * code units will be referenced if possible, and moved if necessary. |
1968 | | * Used like a function: utfStringCodePoints<CP32, behavior>(unitRange). |
1969 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t; |
1970 | | * should be signed if UTF_BEHAVIOR_NEGATIVE |
1971 | | * @tparam behavior How to handle ill-formed Unicode strings |
1972 | | * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units (deduced by the function call operator) |
1973 | | * @param unitRange input range (argument of the function call operator) |
1974 | | * @return a UTFStringCodePoints<CP32, behavior, Range> for the given unitRange |
1975 | | * @draft ICU 78 |
1976 | | */ |
1977 | | template<typename CP32, UTFIllFormedBehavior behavior> |
1978 | | constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints; |
1979 | | |
1980 | | // Non-validating iterators ------------------------------------------------ *** |
1981 | | |
1982 | | /** |
1983 | | * Non-validating iterator over the code points in a Unicode string. |
1984 | | * The string must be well-formed. |
1985 | | * |
1986 | | * The UnitIter can be |
1987 | | * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer). |
1988 | | * The UnsafeUTFIterator will have the corresponding iterator_category. |
1989 | | * |
1990 | | * Call unsafeUTFIterator() to have the compiler deduce the UnitIter type. |
1991 | | * |
1992 | | * For reverse iteration, either use this iterator directly as in <code>*--iter</code> |
1993 | | * or wrap it using std::make_reverse_iterator(iter). |
1994 | | * |
1995 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
1996 | | * @tparam UnitIter An iterator (often a pointer) that returns a code unit type: |
1997 | | * UTF-8: char or char8_t or uint8_t; |
1998 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
1999 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
2000 | | * @draft ICU 78 |
2001 | | * @see unsafeUTFIterator |
2002 | | */ |
2003 | | template<typename CP32, typename UnitIter, typename = void> |
2004 | | class UnsafeUTFIterator { |
2005 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
2006 | | using Impl = UnsafeUTFImpl<CP32, UnitIter>; |
2007 | | |
2008 | | // Proxy type for operator->() (required by LegacyInputIterator) |
2009 | | // so that we don't promise always returning UnsafeCodeUnits. |
2010 | | class Proxy { |
2011 | | public: |
2012 | | explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {} /* Stores its own copy of the units. */ |
2013 | | UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; } |
2014 | | UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; } |
2015 | | private: |
2016 | | UnsafeCodeUnits<CP32, UnitIter> units_; |
2017 | | }; |
2018 | | |
2019 | | public: |
2020 | | /** C++ iterator boilerplate @internal */ |
2021 | | using value_type = UnsafeCodeUnits<CP32, UnitIter>; |
2022 | | /** C++ iterator boilerplate; operator*() returns by value, so this is not a real reference type. @internal */ |
2023 | | using reference = value_type; |
2024 | | /** C++ iterator boilerplate; operator->() returns a Proxy rather than a raw pointer. @internal */ |
2025 | | using pointer = Proxy; |
2026 | | /** C++ iterator boilerplate @internal */ |
2027 | | using difference_type = prv::iter_difference_t<UnitIter>; |
2028 | | /** C++ iterator boilerplate; bidirectional if UnitIter is, otherwise forward. @internal */ |
2029 | | using iterator_category = std::conditional_t< |
2030 | | prv::bidirectional_iterator<UnitIter>, |
2031 | | std::bidirectional_iterator_tag, |
2032 | | std::forward_iterator_tag>; |
2033 | | |
2034 | | /** |
2035 | | * Constructor; the iterator/pointer should be at a code point boundary. |
2036 | | * |
2037 | | * When using a code unit sentinel, |
2038 | | * then that sentinel also works as a sentinel for this code point iterator. |
2039 | | * |
2040 | | * @param p Initial position inside the range, or a range sentinel |
2041 | | * @draft ICU 78 |
2042 | | */ |
2043 | | U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {} |
2044 | | /** |
2045 | | * Default constructor. Makes a non-functional iterator. |
2046 | | * |
2047 | | * @draft ICU 78 |
2048 | | */ |
2049 | | U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {} |
2050 | | |
2051 | | /** Move constructor. @draft ICU 78 */ |
2052 | | U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default; |
2053 | | /** Move assignment operator. @draft ICU 78 */ |
2054 | | U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default; |
2055 | | |
2056 | | /** Copy constructor. @draft ICU 78 */ |
2057 | | U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default; |
2058 | | /** Copy assignment operator. @draft ICU 78 */ |
2059 | | U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default; |
2060 | | |
2061 | | /** |
2062 | | * @param other Another iterator |
2063 | | * @return true if this iterator is at the same position as the other one (compares the logical positions from base()) |
2064 | | * @draft ICU 78 |
2065 | | */ |
2066 | | U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const { |
2067 | | return base() == other.base(); |
2068 | | } |
2069 | | /** |
2070 | | * @param other Another iterator |
2071 | | * @return true if this iterator is not at the same position as the other one |
2072 | | * @draft ICU 78 |
2073 | | */ |
2074 | | U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); } |
2075 | | |
2076 | | /** |
2077 | | * @param iter An UnsafeUTFIterator |
2078 | | * @param s A unit iterator sentinel (any type other than UnsafeUTFIterator and UnitIter) |
2079 | | * @return true if the iterator’s position is equal to the sentinel |
2080 | | * @draft ICU 78 |
2081 | | */ |
2082 | | template<typename Sentinel> U_FORCE_INLINE friend |
2083 | | std::enable_if_t< |
2084 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2085 | | bool> |
2086 | | operator==(const UnsafeUTFIterator &iter, const Sentinel &s) { |
2087 | | return iter.base() == s; |
2088 | | } |
2089 | | |
2090 | | #if U_CPLUSPLUS_VERSION < 20 |
2091 | | /** |
2092 | | * @param s A unit iterator sentinel |
2093 | | * @param iter An UnsafeUTFIterator |
2094 | | * @return true if the iterator’s position is equal to the sentinel |
2095 | | * @internal |
2096 | | */ |
2097 | | template<typename Sentinel> U_FORCE_INLINE friend |
2098 | | std::enable_if_t< |
2099 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2100 | | bool> |
2101 | | operator==(const Sentinel &s, const UnsafeUTFIterator &iter) { |
2102 | | return iter.base() == s; |
2103 | | } |
2104 | | /** |
2105 | | * @param iter An UnsafeUTFIterator |
2106 | | * @param s A unit iterator sentinel |
2107 | | * @return true if the iterator’s position is not equal to the sentinel |
2108 | | * @internal |
2109 | | */ |
2110 | | template<typename Sentinel> U_FORCE_INLINE friend |
2111 | | std::enable_if_t< |
2112 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2113 | | bool> |
2114 | | operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); } |
2115 | | /** |
2116 | | * @param s A unit iterator sentinel |
2117 | | * @param iter An UnsafeUTFIterator |
2118 | | * @return true if the iterator’s position is not equal to the sentinel |
2119 | | * @internal |
2120 | | */ |
2121 | | template<typename Sentinel> U_FORCE_INLINE friend |
2122 | | std::enable_if_t< |
2123 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2124 | | bool> |
2125 | | operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); } |
2126 | | #endif // C++17 |
2127 | | |
2128 | | /** |
2129 | | * Returns the current position as a code unit iterator. |
2130 | | * Similar to iter->begin() but also works at the exclusive end(). |
2131 | | * |
2132 | | * @return current position as a code unit iterator |
2133 | | * @draft ICU 79 |
2134 | | */ |
2135 | | U_FORCE_INLINE UnitIter base() const { |
2136 | | // Return the logical position: after readAndInc() (state_ > 0) p_ is ahead, so use the units start instead. |
2137 | | return state_ <= 0 ? p_ : units_.begin(); |
2138 | | } |
2139 | | |
2140 | | /** |
2141 | | * Decodes the code unit sequence at the current position. |
2142 | | * |
2143 | | * @return UnsafeCodeUnits with the decoded code point etc. |
2144 | | * @draft ICU 78 |
2145 | | */ |
2146 | | U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const { |
2147 | | if (state_ == 0) { /* Nothing cached yet; in the other states units_ already holds the current sequence. */ |
2148 | | UnitIter p0 = p_; |
2149 | | units_ = Impl::readAndInc(p0, p_); /* Leaves p_ at the units limit, ahead of its logical position. */ |
2150 | | state_ = 1; |
2151 | | } |
2152 | | return units_; |
2153 | | } |
2154 | | |
2155 | | /** |
2156 | | * Decodes the code unit sequence at the current position. |
2157 | | * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc. |
2158 | | * |
2159 | | * @return UnsafeCodeUnits with the decoded code point etc., wrapped into |
2160 | | * an opaque proxy object so that <code>iter->codePoint()</code> etc. works. |
2161 | | * @draft ICU 78 |
2162 | | */ |
2163 | | U_FORCE_INLINE Proxy operator->() const { |
2164 | | if (state_ == 0) { /* Nothing cached yet; in the other states units_ already holds the current sequence. */ |
2165 | | UnitIter p0 = p_; |
2166 | | units_ = Impl::readAndInc(p0, p_); /* Leaves p_ at the units limit, ahead of its logical position. */ |
2167 | | state_ = 1; |
2168 | | } |
2169 | | return Proxy(units_); |
2170 | | } |
2171 | | |
2172 | | /** |
2173 | | * Pre-increment operator. |
2174 | | * |
2175 | | * @return this iterator |
2176 | | * @draft ICU 78 |
2177 | | */ |
2178 | | U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment |
2179 | | if (state_ > 0) { |
2180 | | // operator*() called readAndInc() so p_ is already ahead: only the state needs resetting. |
2181 | | state_ = 0; |
2182 | | } else if (state_ == 0) { |
2183 | | Impl::inc(p_); /* Nothing cached: step p_ forward over one sequence; units_ is not updated. */ |
2184 | | } else /* state_ < 0 */ { |
2185 | | // operator--() called decAndRead() so we know how far to skip. |
2186 | | p_ = units_.end(); |
2187 | | state_ = 0; |
2188 | | } |
2189 | | return *this; |
2190 | | } |
2191 | | |
2192 | | /** |
2193 | | * Post-increment operator. |
2194 | | * |
2195 | | * @return a copy of this iterator from before the increment. |
2196 | | * If UnitIter is a single-pass input_iterator, then this function |
2197 | | * returns an opaque proxy object so that <code>*iter++</code> still works. |
2198 | | * @draft ICU 78 |
2199 | | */ |
2200 | | U_FORCE_INLINE UnsafeUTFIterator operator++(int) { // post-increment |
2201 | | if (state_ > 0) { |
2202 | | // operator*() called readAndInc() so p_ is already ahead. |
2203 | | UnsafeUTFIterator result(*this); /* The copy keeps state_ > 0 and the cached units_. */ |
2204 | | state_ = 0; |
2205 | | return result; |
2206 | | } else if (state_ == 0) { |
2207 | | UnitIter p0 = p_; |
2208 | | units_ = Impl::readAndInc(p0, p_); /* Decode now so that the returned copy carries the sequence at the old position. */ |
2209 | | UnsafeUTFIterator result(*this); |
2210 | | result.state_ = 1; |
2211 | | // keep this->state_ == 0: p_ is now the new logical position of *this |
2212 | | return result; |
2213 | | } else /* state_ < 0 */ { |
2214 | | UnsafeUTFIterator result(*this); |
2215 | | // operator--() called decAndRead() so we know how far to skip. |
2216 | | p_ = units_.end(); |
2217 | | state_ = 0; |
2218 | | return result; |
2219 | | } |
2220 | | } |
2221 | | |
2222 | | /** |
2223 | | * Pre-decrement operator. |
2224 | | * Only enabled if UnitIter is a bidirectional_iterator (including a pointer). |
2225 | | * |
2226 | | * @return this iterator |
2227 | | * @draft ICU 78 |
2228 | | */ |
2229 | | template<typename Iter = UnitIter> |
2230 | | U_FORCE_INLINE |
2231 | | std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &> |
2232 | | operator--() { // pre-decrement |
2233 | | if (state_ > 0) { |
2234 | | // operator*() called readAndInc() so p_ is ahead of the logical position. |
2235 | | p_ = units_.begin(); /* Restore p_ to the logical position before reading backward. */ |
2236 | | } |
2237 | | units_ = Impl::decAndRead(p_); /* Leaves p_ at the units start, which base() reports as the logical position for state_ < 0. */ |
2238 | | state_ = -1; |
2239 | | return *this; |
2240 | | } |
2241 | | |
2242 | | /** |
2243 | | * Post-decrement operator. |
2244 | | * Only enabled if UnitIter is a bidirectional_iterator (including a pointer). |
2245 | | * |
2246 | | * @return a copy of this iterator from before the decrement. |
2247 | | * @draft ICU 78 |
2248 | | */ |
2249 | | template<typename Iter = UnitIter> |
2250 | | U_FORCE_INLINE |
2251 | | std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator> |
2252 | | operator--(int) { // post-decrement |
2253 | | UnsafeUTFIterator result(*this); |
2254 | | operator--(); |
2255 | | return result; |
2256 | | } |
2257 | | |
2258 | | private: |
2259 | | friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>; |
2260 | | |
2261 | | // operator*() etc. are logically const. |
2262 | | mutable UnitIter p_; |
2263 | | // Keep state so that we call readAndInc() only once for both operator*() and ++ |
2264 | | // to make it easy for the compiler to optimize. |
2265 | | mutable UnsafeCodeUnits<CP32, UnitIter> units_; |
2266 | | // >0: units_ = readAndInc(), p_ = units limit |
2267 | | // which means that p_ is ahead of its logical position |
2268 | | // 0: initial state |
2269 | | // <0: units_ = decAndRead(), p_ = units start |
2270 | | mutable int8_t state_ = 0; |
2271 | | }; |
2272 | | |
2273 | | #ifndef U_IN_DOXYGEN |
2274 | | // Partial template specialization for single-pass input iterator. |
2275 | | template<typename CP32, typename UnitIter> |
2276 | | class UnsafeUTFIterator< |
2277 | | CP32, |
2278 | | UnitIter, |
2279 | | std::enable_if_t<!prv::forward_iterator<UnitIter>>> { |
2280 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
2281 | | using Impl = UnsafeUTFImpl<CP32, UnitIter>; |
2282 | | |
2283 | | // Proxy type for post-increment return value, to make *iter++ work. |
2284 | | // Also for operator->() (required by LegacyInputIterator) |
2285 | | // so that we don't promise always returning UnsafeCodeUnits. |
2286 | | class Proxy { |
2287 | | public: |
2288 | | explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {} /* Stores its own copy of the units. */ |
2289 | | UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; } /* Lets *proxy yield the stored units. */ |
2290 | | UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; } /* Lets proxy->member reach the stored units. */ |
2291 | | private: |
2292 | | UnsafeCodeUnits<CP32, UnitIter> units_; |
2293 | | }; |
2294 | | |
2295 | | public: |
2296 | | using value_type = UnsafeCodeUnits<CP32, UnitIter>; |
2297 | | using reference = value_type; |
2298 | | using pointer = Proxy; |
2299 | | using difference_type = prv::iter_difference_t<UnitIter>; |
2300 | | using iterator_category = std::input_iterator_tag; |
2301 | | |
2302 | | U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {} /* Takes the code unit iterator by value and moves it in; units_ and ahead_ keep their default member initializers. */ |
2303 | | |
2304 | | U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default; /* Move constructor (compiler-generated). */ |
2305 | | U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default; /* Move assignment operator (compiler-generated). */ |
2306 | | |
2307 | | U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default; /* Copy constructor (compiler-generated). */ |
2308 | | U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default; /* Copy assignment operator (compiler-generated). */ |
2309 | | |
2310 | | U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const { /* Equal only if both p_ and the read-ahead flag match. */ |
2311 | | return p_ == other.p_ && ahead_ == other.ahead_; |
2312 | | // Strictly speaking, we should check if the logical position is the same. |
2313 | | // However, we cannot advance, or do arithmetic with, a single-pass UnitIter. |
2314 | | } |
2315 | | U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); } /* Exact negation of operator==. */ |
2316 | | |
2317 | | template<typename Sentinel> U_FORCE_INLINE friend /* Iterator == sentinel; Sentinel is any type other than UnsafeUTFIterator and UnitIter. */ |
2318 | | std::enable_if_t< |
2319 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2320 | | bool> |
2321 | | operator==(const UnsafeUTFIterator &iter, const Sentinel &s) { |
2322 | | return !iter.ahead_ && iter.p_ == s; /* With ahead_ set, p_ is past a sequence that was read but not yet stepped over, so the iterator is not at the sentinel. */ |
2323 | | } |
2324 | | |
2325 | | #if U_CPLUSPLUS_VERSION < 20 |
2326 | | template<typename Sentinel> U_FORCE_INLINE friend /* Sentinel == iterator, spelled out only when compiling as pre-C++20. */ |
2327 | | std::enable_if_t< |
2328 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2329 | | bool> |
2330 | | operator==(const Sentinel &s, const UnsafeUTFIterator &iter) { |
2331 | | return !iter.ahead_ && iter.p_ == s; /* Same test as the (iterator, sentinel) overload. */ |
2332 | | } |
2333 | | |
2334 | | template<typename Sentinel> U_FORCE_INLINE friend /* Iterator != sentinel, spelled out only when compiling as pre-C++20. */ |
2335 | | std::enable_if_t< |
2336 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2337 | | bool> |
2338 | | operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); } |
2339 | | |
2340 | | template<typename Sentinel> U_FORCE_INLINE friend |
2341 | | std::enable_if_t< |
2342 | | !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>, |
2343 | | bool> |
2344 | | operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); } |
2345 | | #endif // C++17 |
2346 | | |
2347 | | U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const { |
2348 | | if (!ahead_) { |
2349 | | units_ = Impl::readAndInc(p_, p_); |
2350 | | ahead_ = true; |
2351 | | } |
2352 | | return units_; |
2353 | | } |
2354 | | |
2355 | | U_FORCE_INLINE Proxy operator->() const { |
2356 | | if (!ahead_) { |
2357 | | units_ = Impl::readAndInc(p_, p_); |
2358 | | ahead_ = true; |
2359 | | } |
2360 | | return Proxy(units_); |
2361 | | } |
2362 | | |
2363 | | U_FORCE_INLINE UnsafeUTFIterator &operator++() { // pre-increment |
2364 | | if (ahead_) { |
2365 | | // operator*() called readAndInc() so p_ is already ahead. |
2366 | | ahead_ = false; |
2367 | | } else { |
2368 | | Impl::inc(p_); |
2369 | | } |
2370 | | return *this; |
2371 | | } |
2372 | | |
2373 | | U_FORCE_INLINE Proxy operator++(int) { // post-increment |
2374 | | if (ahead_) { |
2375 | | // operator*() called readAndInc() so p_ is already ahead. |
2376 | | ahead_ = false; |
2377 | | } else { |
2378 | | units_ = Impl::readAndInc(p_, p_); |
2379 | | // keep this->ahead_ == false |
2380 | | } |
2381 | | return Proxy(units_); |
2382 | | } |
2383 | | |
private:
    // operator*() etc. are logically const.
    // They still read from and advance the unit iterator, hence all members are mutable.
    mutable UnitIter p_;
    // Keep state so that we call readAndInc() only once for both operator*() and ++
    // so that we can use a single-pass input iterator for UnitIter.
    // units_ caches the result of the most recent readAndInc().
    mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
    // true: units_ = readAndInc(), p_ = units limit
    //     which means that p_ is ahead of its logical position
    // false: initial state; operator++ also resets to this after a read-ahead
    mutable bool ahead_ = false;
2394 | | }; |
2395 | | #endif // U_IN_DOXYGEN |
2396 | | |
2397 | | } // namespace U_HEADER_ONLY_NAMESPACE |
2398 | | |
2399 | | #ifndef U_IN_DOXYGEN |
2400 | | // Bespoke specialization of reverse_iterator. |
2401 | | // The default implementation implements reverse operator*() and ++ in a way |
2402 | | // that does most of the same work twice for reading variable-length sequences. |
2403 | | template<typename CP32, typename UnitIter> |
2404 | | class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> { |
2405 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
2406 | | using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>; |
2407 | | using UnsafeCodeUnits_ = U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits<CP32, UnitIter>; |
2408 | | |
2409 | | // Proxy type for operator->() (required by LegacyInputIterator) |
2410 | | // so that we don't promise always returning UnsafeCodeUnits. |
2411 | | class Proxy { |
2412 | | public: |
2413 | | explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {} |
2414 | | UnsafeCodeUnits_ &operator*() { return units_; } |
2415 | | UnsafeCodeUnits_ *operator->() { return &units_; } |
2416 | | private: |
2417 | | UnsafeCodeUnits_ units_; |
2418 | | }; |
2419 | | |
2420 | | public: |
2421 | | using value_type = UnsafeCodeUnits_; |
2422 | | using reference = value_type; |
2423 | | using pointer = Proxy; |
2424 | | using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>; |
2425 | | using iterator_category = std::bidirectional_iterator_tag; |
2426 | | |
2427 | | U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) : |
2428 | | p_(iter.base()), units_(0, 0, p_, p_) {} |
2429 | | U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {} |
2430 | | |
2431 | | U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default; |
2432 | | U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default; |
2433 | | |
2434 | | U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default; |
2435 | | U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default; |
2436 | | |
2437 | | U_FORCE_INLINE bool operator==(const reverse_iterator &other) const { |
2438 | | return getLogicalPosition() == other.getLogicalPosition(); |
2439 | | } |
2440 | | U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); } |
2441 | | |
2442 | | U_FORCE_INLINE U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> base() const { |
2443 | | return U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>( |
2444 | | getLogicalPosition()); |
2445 | | } |
2446 | | |
2447 | | U_FORCE_INLINE UnsafeCodeUnits_ operator*() const { |
2448 | | if (state_ == 0) { |
2449 | | units_ = Impl::decAndRead(p_); |
2450 | | state_ = -1; |
2451 | | } |
2452 | | return units_; |
2453 | | } |
2454 | | |
2455 | | U_FORCE_INLINE Proxy operator->() const { |
2456 | | if (state_ == 0) { |
2457 | | units_ = Impl::decAndRead(p_); |
2458 | | state_ = -1; |
2459 | | } |
2460 | | return Proxy(units_); |
2461 | | } |
2462 | | |
2463 | | U_FORCE_INLINE reverse_iterator &operator++() { // pre-increment |
2464 | | if (state_ < 0) { |
2465 | | // operator*() called decAndRead() so p_ is already behind. |
2466 | | state_ = 0; |
2467 | | } else if (state_ == 0) { |
2468 | | Impl::dec(p_); |
2469 | | } else /* state_ > 0 */ { |
2470 | | // operator--() called readAndInc() so we know how far to skip. |
2471 | | p_ = units_.begin(); |
2472 | | state_ = 0; |
2473 | | } |
2474 | | return *this; |
2475 | | } |
2476 | | |
2477 | | U_FORCE_INLINE reverse_iterator operator++(int) { // post-increment |
2478 | | if (state_ < 0) { |
2479 | | // operator*() called decAndRead() so p_ is already behind. |
2480 | | reverse_iterator result(*this); |
2481 | | state_ = 0; |
2482 | | return result; |
2483 | | } else if (state_ == 0) { |
2484 | | units_ = Impl::decAndRead(p_); |
2485 | | reverse_iterator result(*this); |
2486 | | result.state_ = -1; |
2487 | | // keep this->state_ == 0 |
2488 | | return result; |
2489 | | } else /* state_ > 0 */ { |
2490 | | reverse_iterator result(*this); |
2491 | | // operator--() called readAndInc() so we know how far to skip. |
2492 | | p_ = units_.begin(); |
2493 | | state_ = 0; |
2494 | | return result; |
2495 | | } |
2496 | | } |
2497 | | |
2498 | | U_FORCE_INLINE reverse_iterator &operator--() { // pre-decrement |
2499 | | if (state_ < 0) { |
2500 | | // operator*() called decAndRead() so p_ is behind the logical position. |
2501 | | p_ = units_.end(); |
2502 | | } |
2503 | | UnitIter p0 = p_; |
2504 | | units_ = Impl::readAndInc(p0, p_); |
2505 | | state_ = 1; |
2506 | | return *this; |
2507 | | } |
2508 | | |
2509 | | U_FORCE_INLINE reverse_iterator operator--(int) { // post-decrement |
2510 | | reverse_iterator result(*this); |
2511 | | operator--(); |
2512 | | return result; |
2513 | | } |
2514 | | |
2515 | | private: |
2516 | | U_FORCE_INLINE UnitIter getLogicalPosition() const { |
2517 | | return state_ >= 0 ? p_ : units_.end(); |
2518 | | } |
2519 | | |
2520 | | // operator*() etc. are logically const. |
2521 | | mutable UnitIter p_; |
2522 | | // Keep state so that we call decAndRead() only once for both operator*() and ++ |
2523 | | // to make it easy for the compiler to optimize. |
2524 | | mutable UnsafeCodeUnits_ units_; |
2525 | | // >0: units_ = readAndInc(), p_ = units limit |
2526 | | // 0: initial state |
2527 | | // <0: units_ = decAndRead(), p_ = units start |
2528 | | // which means that p_ is behind its logical position |
2529 | | mutable int8_t state_ = 0; |
2530 | | }; |
2531 | | #endif // U_IN_DOXYGEN |
2532 | | |
2533 | | namespace U_HEADER_ONLY_NAMESPACE { |
2534 | | |
2535 | | /** |
2536 | | * UnsafeUTFIterator factory function. |
2537 | | * Deduces the UnitIter template parameter from the input. |
2538 | | * |
2539 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
2540 | | * @tparam UnitIter Can usually be omitted/deduced: |
2541 | | * An iterator (often a pointer) that returns a code unit type: |
2542 | | * UTF-8: char or char8_t or uint8_t; |
2543 | | * UTF-16: char16_t or uint16_t or (on Windows) wchar_t; |
2544 | | * UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t |
2545 | | * @param iter code unit iterator |
2546 | | * @return an UnsafeUTFIterator<CP32, UnitIter> |
2547 | | * for the given code unit iterator or character pointer |
2548 | | * @draft ICU 78 |
2549 | | */ |
2550 | | template<typename CP32, typename UnitIter> |
2551 | | auto unsafeUTFIterator(UnitIter iter) { |
2552 | | return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter)); |
2553 | | } |
2554 | | |
2555 | | /** |
2556 | | * A C++ "range" for non-validating iteration over all of the code points of a code unit range. |
2557 | | * The string must be well-formed. |
2558 | | * |
2559 | | * Call unsafeUTFStringCodePoints() to have the compiler deduce the Range type. |
2560 | | * |
2561 | | * UnsafeUTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range |
2562 | | * so is UnsafeUTFStringCodePoints<CP32, behavior, Range>. |
2563 | | * Note that when given a range r that is an lvalue and is not a view, unsafeUTFStringCodePoints(r) uses |
2564 | | * a ref_view of r as the Range type, which is a borrowed range. |
2565 | | * In practice, this means that given a container variable r, the iterators of |
2566 | | * unsafeUTFStringCodePoints(r) can be used as long as iterators on r are valid, without having to keep |
2567 | | * unsafeUTFStringCodePoints(r) around. |
2568 | | * For instance: |
2569 | | * \code |
2570 | | * std::u8string s = "𒇧𒇧"; |
2571 | | * // it outlives unsafeUTFStringCodePoints<char32_t>(s). |
2572 | | * auto it = unsafeUTFStringCodePoints<char32_t>(s).begin(); |
2573 | | * ++it; |
2574 | | * char32_t second_code_point = it->codePoint(); // OK. |
2575 | | * \endcode |
2576 | | * |
2577 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
2578 | | * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units |
2579 | | * @draft ICU 78 |
2580 | | * @see unsafeUTFStringCodePoints |
2581 | | */ |
2582 | | template<typename CP32, typename Range> |
2583 | | #if defined(__cpp_lib_ranges) |
2584 | | requires std::ranges::range<Range> |
2585 | | #endif |
2586 | | class UnsafeUTFStringCodePoints { |
2587 | | static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point"); |
2588 | | public: |
2589 | | /** |
2590 | | * Constructs an empty C++ "range" object. |
2591 | | * @draft ICU 78 |
2592 | | */ |
2593 | | UnsafeUTFStringCodePoints() = default; |
2594 | | |
2595 | | /** |
2596 | | * Constructs a C++ "range" object over the code points in the string. |
2597 | | * @param unitRange input range |
2598 | | * @draft ICU 78 |
2599 | | */ |
2600 | | template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>> |
2601 | | explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {} |
2602 | | /** |
2603 | | * Constructs a C++ "range" object over the code points in the string, |
2604 | | * keeping a reference to the code unit range. This overload is used by |
2605 | | * utfStringCodePoints in C++17; in C++20, a ref_view is used instead (via |
2606 | | * views::all). |
2607 | | * @param unitRange input range |
2608 | | * @draft ICU 78 |
2609 | | */ |
2610 | | template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void> |
2611 | | explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {} |
2612 | | |
2613 | | /** Copy constructor. @draft ICU 78 */ |
2614 | | UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other) = default; |
2615 | | |
2616 | | /** Copy assignment operator. @draft ICU 78 */ |
2617 | | UnsafeUTFStringCodePoints &operator=(const UnsafeUTFStringCodePoints &other) = default; |
2618 | | |
2619 | | /** |
2620 | | * @return the range start iterator |
2621 | | * @draft ICU 78 |
2622 | | */ |
2623 | | auto begin() { |
2624 | | return unsafeUTFIterator<CP32>(unitRange.begin()); |
2625 | | } |
2626 | | |
2627 | | /** |
2628 | | * @return the range start iterator |
2629 | | * @draft ICU 78 |
2630 | | */ |
2631 | | template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>> |
2632 | | auto begin() const { |
2633 | | return unsafeUTFIterator<CP32>(unitRange.begin()); |
2634 | | } |
2635 | | |
2636 | | /** |
2637 | | * @return the range limit (exclusive end) iterator |
2638 | | * @draft ICU 78 |
2639 | | */ |
2640 | | auto end() { |
2641 | | using UnitIter = decltype(unitRange.begin()); |
2642 | | using LimitIter = decltype(unitRange.end()); |
2643 | | if constexpr (!std::is_same_v<UnitIter, LimitIter>) { |
2644 | | // Return the code unit sentinel. |
2645 | | return unitRange.end(); |
2646 | | } else { |
2647 | | return unsafeUTFIterator<CP32>(unitRange.end()); |
2648 | | } |
2649 | | } |
2650 | | |
2651 | | /** |
2652 | | * @return the range limit (exclusive end) iterator |
2653 | | * @draft ICU 78 |
2654 | | */ |
2655 | | template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>> |
2656 | | auto end() const { |
2657 | | using UnitIter = decltype(unitRange.begin()); |
2658 | | using LimitIter = decltype(unitRange.end()); |
2659 | | if constexpr (!std::is_same_v<UnitIter, LimitIter>) { |
2660 | | // Return the code unit sentinel. |
2661 | | return unitRange.end(); |
2662 | | } else { |
2663 | | return unsafeUTFIterator<CP32>(unitRange.end()); |
2664 | | } |
2665 | | } |
2666 | | |
2667 | | /** |
2668 | | * @return std::reverse_iterator(end()) |
2669 | | * @draft ICU 78 |
2670 | | */ |
2671 | | auto rbegin() const { |
2672 | | return std::make_reverse_iterator(end()); |
2673 | | } |
2674 | | |
2675 | | /** |
2676 | | * @return std::reverse_iterator(begin()) |
2677 | | * @draft ICU 78 |
2678 | | */ |
2679 | | auto rend() const { |
2680 | | return std::make_reverse_iterator(begin()); |
2681 | | } |
2682 | | |
2683 | | /** |
2684 | | * Returns the CodeUnits for the first character/code point. |
2685 | | * Requires that the range is not empty. |
2686 | | * |
2687 | | * @return the CodeUnits for the first character/code point. |
2688 | | * @draft ICU 79 |
2689 | | */ |
2690 | | auto front() const { |
2691 | | return *begin(); |
2692 | | } |
2693 | | |
2694 | | /** |
2695 | | * Returns the CodeUnits for the last character/code point. |
2696 | | * Requires that the range is not empty. |
2697 | | * |
2698 | | * @return the CodeUnits for the last character/code point. |
2699 | | * @draft ICU 79 |
2700 | | */ |
2701 | | auto back() const { |
2702 | | return *(--end()); |
2703 | | } |
2704 | | |
2705 | | private: |
2706 | | Range unitRange; |
2707 | | }; |
2708 | | |
2709 | | /** @internal */ |
2710 | | template<typename CP32> |
2711 | | struct UnsafeUTFStringCodePointsAdaptor |
2712 | | #if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 && \ |
2713 | | __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3. |
2714 | | : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>> |
2715 | | #endif |
2716 | | { |
2717 | | /** @internal */ |
2718 | | template<typename Range> |
2719 | | auto operator()(Range &&unitRange) const { |
2720 | | #if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10 // We need https://wg21.link/P2415R2. |
2721 | | return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange)); |
2722 | | #else |
2723 | | if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) { |
2724 | | // Take basic_string_view by copy, not by reference. In C++20 this is handled by |
2725 | | // all_t<Range>, which is Range if Range is a view. |
2726 | | return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange)); |
2727 | | } else { |
2728 | | return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange)); |
2729 | | } |
2730 | | #endif |
2731 | | } |
2732 | | }; |
2733 | | |
2734 | | |
2735 | | /** |
2736 | | * Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a |
2737 | | * "range" of code points in a code unit range. The string must be well-formed. |
2738 | | * Deduces the Range template parameter from the input, taking into account the value category: the |
2739 | | * code units will be referenced if possible, and moved if necessary. |
2740 | | * |
2741 | | * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t |
2742 | | * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units |
2743 | | * @param unitRange input range |
2744 | | * @return an UnsafeUTFStringCodePoints<CP32, Range> for the given unitRange |
2745 | | * @draft ICU 78 |
2746 | | */ |
2747 | | template<typename CP32> |
2748 | | constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints; |
2749 | | |
2750 | | } // namespace U_HEADER_ONLY_NAMESPACE |
2751 | | |
2752 | | |
#if defined(__cpp_lib_ranges)
// A code point range is a borrowed range exactly when its underlying
// code unit Range is one: each specialization forwards the trait of Range.

// Validating code point range.
template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
constexpr bool std::ranges::enable_borrowed_range<
    U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints<CP32, behavior, Range>> =
    std::ranges::enable_borrowed_range<Range>;

// Non-validating code point range.
template <typename CP32, typename Range>
constexpr bool std::ranges::enable_borrowed_range<
    U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePoints<CP32, Range>> =
    std::ranges::enable_borrowed_range<Range>;
#endif
2764 | | |
2765 | | #endif // U_HIDE_DRAFT_API |
2766 | | #endif // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API |
2767 | | #endif // __UTFITERATOR_H__ |