Coverage Report

Created: 2026-06-23 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/unicode/utfiterator.h
Line
Count
Source
1
// © 2024 and later: Unicode, Inc. and others.
2
// License & terms of use: https://www.unicode.org/copyright.html
3
4
// utfiterator.h
5
// created: 2024aug12 Markus W. Scherer
6
7
#ifndef __UTFITERATOR_H__
8
#define __UTFITERATOR_H__
9
10
#include "unicode/utypes.h"
11
12
#if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API || !defined(UTYPES_H)
13
14
#include <iterator>
15
#if __has_include(<version>)
16
#include <version>
17
#endif
18
#if defined(__cpp_lib_ranges)
19
#include <ranges>
20
#endif
21
#include <string>
22
#include <string_view>
23
#include <type_traits>
24
#include "unicode/utf16.h"
25
#include "unicode/utf8.h"
26
#include "unicode/uversion.h"
27
28
/**
29
 * \file
30
 * \brief C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).
31
 *
32
 * See the User Guide chapter about
33
 * <a href="https://unicode-org.github.io/icu/userguide/strings/cpp-code-point-iterator.html">C++ Code Point Iterators</a>.
34
 *
35
 * Sample code:
36
 * \code
37
 * #include <string_view>
38
 * #include <iostream>
39
 * #include "unicode/utypes.h"
40
 * #include "unicode/utfiterator.h"
41
 *
42
 * using icu::header::utfIterator;
43
 * using icu::header::utfStringCodePoints;
44
 * using icu::header::unsafeUTFIterator;
45
 * using icu::header::unsafeUTFStringCodePoints;
46
 *
47
 * int32_t rangeLoop16(std::u16string_view s) {
48
 *     // We are just adding up the code points for minimal-code demonstration purposes.
49
 *     int32_t sum = 0;
50
 *     for (auto units : utfStringCodePoints<UChar32, UTF_BEHAVIOR_NEGATIVE>(s)) {
51
 *         sum += units.codePoint();  // < 0 if ill-formed
52
 *     }
53
 *     return sum;
54
 * }
55
 *
56
 * int32_t loopIterPlusPlus16(std::u16string_view s) {
57
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
58
 *     int32_t sum = 0;
59
 *     for (auto iter = range.begin(), limit = range.end(); iter != limit;) {
60
 *         sum += (*iter++).codePoint();  // U+FFFD if ill-formed
61
 *     }
62
 *     return sum;
63
 * }
64
 *
65
 * int32_t backwardLoop16(std::u16string_view s) {
66
 *     auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
67
 *     int32_t sum = 0;
68
 *     for (auto start = range.begin(), iter = range.end(); start != iter;) {
69
 *         sum += (*--iter).codePoint();  // surrogate code point if unpaired / ill-formed
70
 *     }
71
 *     return sum;
72
 * }
73
 *
74
 * int32_t reverseLoop8(std::string_view s) {
75
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
76
 *     int32_t sum = 0;
77
 *     for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
78
 *         sum += iter->codePoint();  // U+FFFD if ill-formed
79
 *     }
80
 *     return sum;
81
 * }
82
 *
83
 * int32_t countCodePoints16(std::u16string_view s) {
84
 *     auto range = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
85
 *     return std::distance(range.begin(), range.end());
86
 * }
87
 *
88
 * int32_t unsafeRangeLoop16(std::u16string_view s) {
89
 *     int32_t sum = 0;
90
 *     for (auto units : unsafeUTFStringCodePoints<UChar32>(s)) {
91
 *         sum += units.codePoint();
92
 *     }
93
 *     return sum;
94
 * }
95
 *
96
 * int32_t unsafeReverseLoop8(std::string_view s) {
97
 *     auto range = unsafeUTFStringCodePoints<UChar32>(s);
98
 *     int32_t sum = 0;
99
 *     for (auto iter = range.rbegin(), limit = range.rend(); iter != limit; ++iter) {
100
 *         sum += iter->codePoint();
101
 *     }
102
 *     return sum;
103
 * }
104
 *
105
 * char32_t firstCodePointOrFFFD16(std::u16string_view s) {
106
 *     if (s.empty()) { return 0xfffd; }
107
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
108
 *     return range.begin()->codePoint();
109
 * }
110
 *
111
 * std::string_view firstSequence8(std::string_view s) {
112
 *     if (s.empty()) { return {}; }
113
 *     auto range = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
114
 *     auto units = *(range.begin());
115
 *     if (units.wellFormed()) {
116
 *         return units.stringView();
117
 *     } else {
118
 *         return {};
119
 *     }
120
 * }
121
 *
122
 * template<typename InputStream>  // some istream or streambuf
123
 * std::u32string cpFromInput(InputStream &in) {
124
 *     // This is a single-pass input_iterator.
125
 *     std::istreambuf_iterator bufIter(in);
126
 *     std::istreambuf_iterator<typename InputStream::char_type> bufLimit;
127
 *     auto iter = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufIter);
128
 *     auto limit = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(bufLimit);
129
 *     std::u32string s32;
130
 *     for (; iter != limit; ++iter) {
131
 *         s32.push_back(iter->codePoint());
132
 *     }
133
 *     return s32;
134
 * }
135
 *
136
 * std::u32string cpFromStdin() { return cpFromInput(std::cin); }
137
 * std::u32string cpFromWideStdin() { return cpFromInput(std::wcin); }
138
 * \endcode
139
 */
140
141
#ifndef U_HIDE_DRAFT_API
142
143
/**
144
 * Some defined behaviors for handling ill-formed Unicode strings.
145
 * This is a template parameter for UTFIterator and related classes.
146
 *
147
 * When a validating UTFIterator encounters an ill-formed code unit sequence,
148
 * then CodeUnits.codePoint() is a value according to this parameter.
149
 *
150
 * @draft ICU 78
151
 * @see CodeUnits
152
 * @see UTFIterator
153
 * @see UTFStringCodePoints
154
 */
155
typedef enum UTFIllFormedBehavior {
156
    /**
157
     * Returns a negative value (-1=U_SENTINEL) instead of a code point.
158
     * If the CP32 template parameter for the relevant classes is an unsigned type,
159
     * then the negative value becomes 0xffffffff=UINT32_MAX.
160
     *
     * This is the first enumerator, so its implicit value is 0.
     *
161
     * @draft ICU 78
162
     */
163
    UTF_BEHAVIOR_NEGATIVE,
164
    /** Returns U+FFFD Replacement Character. @draft ICU 78 */
165
    UTF_BEHAVIOR_FFFD,
166
    /**
167
     * UTF-8: Not allowed (the UTFImpl specialization for 8-bit code units
     *        rejects this value with a static_assert);
168
     * UTF-16: returns the unpaired surrogate;
169
     * UTF-32: returns the surrogate code point, or U+FFFD if out of range.
170
     *
171
     * @draft ICU 78
172
     */
173
    UTF_BEHAVIOR_SURROGATE
174
} UTFIllFormedBehavior;
175
176
namespace U_HEADER_ONLY_NAMESPACE {
177
178
namespace prv {
179
#if U_CPLUSPLUS_VERSION >= 20

// C++20 and later: forward directly to the standard iterator/range vocabulary.

/** Value (code unit) type of Iter. @internal */
template<typename Iter>
using iter_value_t = std::iter_value_t<Iter>;

/** Difference type of Iter. @internal */
template<typename Iter>
using iter_difference_t = std::iter_difference_t<Iter>;

/** True if Iter models std::forward_iterator. @internal */
template<typename Iter>
constexpr bool forward_iterator = std::forward_iterator<Iter>;

/** True if Iter models std::bidirectional_iterator. @internal */
template<typename Iter>
constexpr bool bidirectional_iterator = std::bidirectional_iterator<Iter>;

/** True if Range models std::ranges::range. @internal */
template<typename Range>
constexpr bool range = std::ranges::range<Range>;

#else

// Before C++20: emulate the same names on top of std::iterator_traits.

/** Value (code unit) type of Iter. @internal */
template<typename Iter>
using iter_value_t = typename std::iterator_traits<Iter>::value_type;

/** Difference type of Iter. @internal */
template<typename Iter>
using iter_difference_t = typename std::iterator_traits<Iter>::difference_type;

/** True if the iterator category of Iter is, or derives from, forward_iterator_tag. @internal */
template<typename Iter>
constexpr bool forward_iterator =
    std::is_base_of<
        std::forward_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>::value;

/** True if the iterator category of Iter is, or derives from, bidirectional_iterator_tag. @internal */
template<typename Iter>
constexpr bool bidirectional_iterator =
    std::is_base_of<
        std::bidirectional_iterator_tag,
        typename std::iterator_traits<Iter>::iterator_category>::value;

/** Detection fallback: not a range unless the specialization below matches. @internal */
template<typename Range, typename = void>
struct range_type : std::false_type {};

/** Selected when both member begin() and member end() are callable on Range. @internal */
template<typename Range>
struct range_type<
    Range,
    decltype(void(std::declval<Range>().begin()),
             void(std::declval<Range>().end()))> : std::true_type {};

/** True if Range has member begin() and end(). @internal */
template<typename Range>
constexpr bool range = range_type<Range>::value;

#endif
241
242
/** @internal */
243
template<typename T>
struct is_basic_string_view : std::false_type {};

/** Matches exactly the (cv-unqualified) std::basic_string_view specializations. @internal */
template<typename... ViewArgs>
struct is_basic_string_view<std::basic_string_view<ViewArgs...>> : std::true_type {};

/** Convenience variable for is_basic_string_view<T>::value. @internal */
template<typename T>
constexpr bool is_basic_string_view_v = is_basic_string_view<T>::value;
251
252
/** @internal */
253
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    // Forward iterator whose "position" is the code point value itself.
    // Dereferencing yields the current CP32 by value; incrementing moves to the
    // next integer, and if skipSurrogates is true it jumps from U+D7FF straight
    // to U+E000 so that no surrogate code point is ever produced.
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    /** C++ iterator boilerplate @internal */
    using value_type = CP32;
    /** C++ iterator boilerplate @internal */
    using reference = value_type;
    /** C++ iterator boilerplate @internal */
    using pointer = CP32 *;
    /** C++ iterator boilerplate @internal */
    using difference_type = int32_t;
    /** C++ iterator boilerplate @internal */
    using iterator_category = std::forward_iterator_tag;

    /**
     * Default constructor; positions the iterator at code point 0.
     * A forward iterator must be default-constructible
     * (LegacyForwardIterator, and std::forward_iterator in C++20).
     * @internal
     */
    inline CodePointsIterator() : c_(0) {}
    /**
     * Constructs an iterator positioned at code point c.
     * @internal
     */
    inline CodePointsIterator(CP32 c) : c_(c) {}
    /** @return true if both iterators are at the same code point. @internal */
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    /** @return true if the iterators are at different code points. @internal */
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    /** @return the current code point, by value. @internal */
    inline CP32 operator*() const { return c_; }
    /** Pre-increment: moves to the next code point. @internal */
    inline CodePointsIterator &operator++() {  // pre-increment
        ++c_;
        // Landing on the first surrogate means we just left U+D7FF:
        // continue with the first code point after the surrogate block.
        if (skipSurrogates && c_ == 0xd800) {
            c_ = 0xe000;
        }
        return *this;
    }
    /** Post-increment: moves forward and returns the previous position. @internal */
    inline CodePointsIterator operator++(int) {  // post-increment
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_;
};
294
295
}  // namespace prv
296
297
/**
298
 * A C++ "range" over all Unicode code points U+0000..U+10FFFF.
299
 * https://www.unicode.org/glossary/#code_point
300
 *
301
 * Intended for test and builder code.
302
 *
303
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
304
 * @draft ICU 78
305
 * @see U_IS_CODE_POINT
306
 */
307
template<typename CP32>
308
class AllCodePoints {
309
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
310
public:
311
    /** Constructor. @draft ICU 78 */
312
    AllCodePoints() {}
313
    /**
314
     * @return an iterator over all Unicode code points.
315
     *     The iterator returns CP32 integers.
316
     * @draft ICU 78
317
     */
318
    auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
319
    /**
320
     * @return an exclusive-end iterator over all Unicode code points.
321
     * @draft ICU 78
322
     */
323
    auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
324
};
325
326
/**
327
 * A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
328
 * That is, all code points except surrogates.
329
 * Only scalar values can be represented in well-formed UTF-8/16/32.
330
 * https://www.unicode.org/glossary/#unicode_scalar_value
331
 *
332
 * Intended for test and builder code.
333
 *
334
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
335
 * @draft ICU 78
336
 * @see U_IS_SCALAR_VALUE
337
 */
338
template<typename CP32>
339
class AllScalarValues {
340
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
341
public:
342
    /** Constructor. @draft ICU 78 */
343
    AllScalarValues() {}
344
    /**
345
     * @return an iterator over all Unicode scalar values.
346
     *     The iterator returns CP32 integers.
347
     * @draft ICU 78
348
     */
349
    auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
350
    /**
351
     * @return an exclusive-end iterator over all Unicode scalar values.
352
     * @draft ICU 78
353
     */
354
    auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
355
};
356
357
/**
358
 * Result of decoding a code unit sequence for one code point.
359
 * Returned from non-validating Unicode string code point iterators.
360
 * Base class for class CodeUnits which is returned from validating iterators.
361
 *
362
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
363
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
364
 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
365
 *     UTF-8: char or char8_t or uint8_t;
366
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
367
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
368
 * @see UnsafeUTFIterator
369
 * @see UnsafeUTFStringCodePoints
370
 * @draft ICU 78
371
 */
372
template<typename CP32, typename UnitIter, typename = void>
373
class UnsafeCodeUnits {
374
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
375
    using Unit = typename prv::iter_value_t<UnitIter>;
376
public:
377
    /** @internal */
378
    UnsafeCodeUnits(CP32 codePoint, uint8_t length, UnitIter start, UnitIter limit) :
379
699
            c_(codePoint), len_(length), start_(start), limit_(limit) {}
380
381
    /** Copy constructor. @draft ICU 78 */
382
    UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
383
    /** Copy assignment operator. @draft ICU 78 */
384
    UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
385
386
    /**
387
     * @return the Unicode code point decoded from the code unit sequence.
388
     *     If the sequence is ill-formed and the iterator validates,
389
     *     then this is a replacement value according to the iterator‘s
390
     *     UTFIllFormedBehavior template parameter.
391
     * @draft ICU 78
392
     */
393
116
    CP32 codePoint() const { return c_; }
394
395
    /**
396
     * @return the start of the code unit sequence for one code point.
397
     * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
398
     * @draft ICU 78
399
     */
400
0
    UnitIter begin() const { return start_; }
401
402
    /**
403
     * @return the limit (exclusive end) of the code unit sequence for one code point.
404
     * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
405
     * @draft ICU 78
406
     */
407
0
    UnitIter end() const { return limit_; }
408
409
    /**
410
     * @return the length of the code unit sequence for one code point.
411
     * @draft ICU 78
412
     */
413
    uint8_t length() const { return len_; }
414
415
#if U_CPLUSPLUS_VERSION >= 20
416
    /**
417
     * @return a string_view of the code unit sequence for one code point.
418
     * Only works if UnitIter is a pointer or a contiguous_iterator.
419
     * @draft ICU 78
420
     */
421
    template<std::contiguous_iterator Iter = UnitIter>
422
    std::basic_string_view<Unit> stringView() const {
423
        return std::basic_string_view<Unit>(begin(), end());
424
    }
425
#else
426
    /**
427
     * @return a string_view of the code unit sequence for one code point.
428
     * Only works if UnitIter is a pointer or a contiguous_iterator.
429
     * @draft ICU 78
430
     */
431
    template<typename Iter = UnitIter, typename Unit = typename std::iterator_traits<Iter>::value_type>
432
    std::enable_if_t<std::is_pointer_v<Iter> ||
433
                         std::is_same_v<Iter, typename std::basic_string<Unit>::iterator> ||
434
                         std::is_same_v<Iter, typename std::basic_string<Unit>::const_iterator> ||
435
                         std::is_same_v<Iter, typename std::basic_string_view<Unit>::iterator> ||
436
                         std::is_same_v<Iter, typename std::basic_string_view<Unit>::const_iterator>,
437
                     std::basic_string_view<Unit>>
438
    stringView() const {
439
        return std::basic_string_view<Unit>(&*start_, len_);
440
    }
441
#endif
442
443
private:
444
    // Order of fields with padding and access frequency in mind.
445
    CP32 c_;
446
    uint8_t len_;
447
    UnitIter start_;
448
    UnitIter limit_;
449
};
450
451
#ifndef U_IN_DOXYGEN
452
// Partial template specialization for single-pass input iterator.
453
// No UnitIter field, no getter for it, no stringView().
454
template<typename CP32, typename UnitIter>
455
class UnsafeCodeUnits<
456
        CP32,
457
        UnitIter,
458
        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
459
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
460
public:
461
    UnsafeCodeUnits(CP32 codePoint, uint8_t length) : c_(codePoint), len_(length) {}
462
463
    UnsafeCodeUnits(const UnsafeCodeUnits &other) = default;
464
    UnsafeCodeUnits &operator=(const UnsafeCodeUnits &other) = default;
465
466
    CP32 codePoint() const { return c_; }
467
468
    uint8_t length() const { return len_; }
469
470
private:
471
    // Order of fields with padding and access frequency in mind.
472
    CP32 c_;
473
    uint8_t len_;
474
};
475
#endif  // U_IN_DOXYGEN
476
477
/**
478
 * Result of validating and decoding a code unit sequence for one code point.
479
 * Returned from validating Unicode string code point iterators.
480
 * Adds function wellFormed() to base class UnsafeCodeUnits.
481
 *
482
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
483
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
484
 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
485
 *     UTF-8: char or char8_t or uint8_t;
486
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
487
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
488
 * @see UTFIterator
489
 * @see UTFStringCodePoints
490
 * @draft ICU 78
491
 */
492
template<typename CP32, typename UnitIter, typename = void>
493
class CodeUnits : public UnsafeCodeUnits<CP32, UnitIter> {
494
public:
495
    /** @internal */
496
    CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed, UnitIter start, UnitIter limit) :
497
699
            UnsafeCodeUnits<CP32, UnitIter>(codePoint, length, start, limit), ok_(wellFormed) {}
498
499
    /** Copy constructor. @draft ICU 78 */
500
    CodeUnits(const CodeUnits &other) = default;
501
    /** Copy assignment operator. @draft ICU 78 */
502
    CodeUnits &operator=(const CodeUnits &other) = default;
503
504
    /**
505
     * @return true if the decoded code unit sequence is well-formed.
506
     * @draft ICU 78
507
     */
508
181
    bool wellFormed() const { return ok_; }
509
510
private:
511
    bool ok_;
512
};
513
514
#ifndef U_IN_DOXYGEN
515
// Partial template specialization for single-pass input iterator.
516
// No UnitIter field, no getter for it, no stringView().
517
template<typename CP32, typename UnitIter>
518
class CodeUnits<
519
        CP32,
520
        UnitIter,
521
        std::enable_if_t<!prv::forward_iterator<UnitIter>>> :
522
            public UnsafeCodeUnits<CP32, UnitIter> {
523
public:
524
    CodeUnits(CP32 codePoint, uint8_t length, bool wellFormed) :
525
            UnsafeCodeUnits<CP32, UnitIter>(codePoint, length), ok_(wellFormed) {}
526
527
    CodeUnits(const CodeUnits &other) = default;
528
    CodeUnits &operator=(const CodeUnits &other) = default;
529
530
    bool wellFormed() const { return ok_; }
531
532
private:
533
    bool ok_;
534
};
535
#endif  // U_IN_DOXYGEN
536
537
// Validating implementations ---------------------------------------------- ***
538
539
#ifndef U_IN_DOXYGEN
540
template<typename CP32, UTFIllFormedBehavior behavior,
541
         typename UnitIter, typename LimitIter = UnitIter, typename = void>
542
class UTFImpl;
543
544
// Note: readAndInc() functions take both a p0 and a p iterator.
545
// They must have the same value.
546
// For a multi-pass UnitIter, the caller must copy its p into a local variable p0,
547
// and readAndInc() copies p0 and the incremented p into the CodeUnits.
548
// For a single-pass UnitIter, which may be neither default-constructible nor copyable,
549
// the caller can pass p into both references, and readAndInc() does not use p0
550
// and constructs CodeUnits without them.
551
// Moving the p0 variable into the call site avoids having to declare it inside readAndInc()
552
// which may not be possible for a single-pass iterator.
553
554
// UTF-8
555
// Validating implementation for code unit sequences whose unit type is one byte wide
// (this specialization is selected when sizeof(iter_value_t<UnitIter>) == 1).
// All functions are static; the class carries no state.
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
556
class UTFImpl<
557
        CP32, behavior,
558
        UnitIter, LimitIter,
559
        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
560
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
561
    static_assert(behavior != UTF_BEHAVIOR_SURROGATE,
562
                  "For 8-bit strings, the SURROGATE option does not have an equivalent.");
563
public:
564
    // Handle ill-formed UTF-8
    // Returns the code point value reported for an ill-formed sequence:
    // U_SENTINEL for UTF_BEHAVIOR_NEGATIVE, otherwise 0xfffd.
565
    U_FORCE_INLINE static CP32 sub() {
566
        if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
567
            return U_SENTINEL;
568
        } else {
569
            static_assert(behavior == UTF_BEHAVIOR_FFFD);
570
            return 0xfffd;
571
        }
572
    }
573
574
    // Advances p past one code unit sequence without decoding it.
    // The first byte is read unconditionally, so p must be dereferenceable on entry;
    // every further read is preceded by a p != limit check.
    // p stops after the last byte that was accepted as part of the sequence.
    U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
575
        // Very similar to U8_FWD_1().
576
        uint8_t b = *p;
577
        ++p;
578
        if (U8_IS_LEAD(b) && p != limit) {
579
            uint8_t t1 = *p;
580
            if ((0xe0 <= b && b < 0xf0)) {
                // Lead byte of a 3-byte sequence: validate lead+t1 together, then one more trail byte.
581
                if (U8_IS_VALID_LEAD3_AND_T1(b, t1) &&
582
                        ++p != limit && U8_IS_TRAIL(*p)) {
583
                    ++p;
584
                }
585
            } else if (b < 0xe0) {
                // Lead byte of a 2-byte sequence: one trail byte.
586
                if (U8_IS_TRAIL(t1)) {
587
                    ++p;
588
                }
589
            } else /* b >= 0xf0 */ {
                // Lead byte of a 4-byte sequence: validate lead+t1 together, then two more trail bytes.
590
                if (U8_IS_VALID_LEAD4_AND_T1(b, t1) &&
591
                        ++p != limit && U8_IS_TRAIL(*p) &&
592
                        ++p != limit && U8_IS_TRAIL(*p)) {
593
                    ++p;
594
                }
595
            }
596
        }
597
    }
598
599
    // Moves p backward over one code unit sequence without decoding it.
    // The byte before p is read unconditionally, so p must not equal start on entry;
    // every further backward read is preceded by a check against start.
    // If no lead byte is found that validly starts a sequence reaching the original
    // position, p is left exactly one byte before where it started.
    U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
600
        // Very similar to U8_BACK_1().
601
        uint8_t c = *--p;
602
        if (U8_IS_TRAIL(c) && p != start) {
603
            UnitIter p1 = p;
604
            uint8_t b1 = *--p1;
605
            if (U8_IS_LEAD(b1)) {
                // Two bytes: a complete 2-byte sequence, or a 3-/4-byte lead with a valid first trail byte.
606
                if (b1 < 0xe0 ||
607
                        (b1 < 0xf0 ?
608
                            U8_IS_VALID_LEAD3_AND_T1(b1, c) :
609
                            U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
610
                    p = p1;
611
                    return;
612
                }
613
            } else if (U8_IS_TRAIL(b1) && p1 != start) {
614
                uint8_t b2 = *--p1;
615
                if (0xe0 <= b2 && b2 <= 0xf4) {
                    // Three bytes: a 3-byte sequence, or a 4-byte lead followed by two trail bytes.
616
                    if (b2 < 0xf0 ?
617
                            U8_IS_VALID_LEAD3_AND_T1(b2, b1) :
618
                            U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
619
                        p = p1;
620
                        return;
621
                    }
622
                } else if (U8_IS_TRAIL(b2) && p1 != start) {
623
                    uint8_t b3 = *--p1;
                    // Four bytes: only a 4-byte lead (0xf0..0xf4) with a valid first trail byte qualifies.
624
                    if (0xf0 <= b3 && b3 <= 0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
625
                        p = p1;
626
                        return;
627
                    }
628
                }
629
            }
630
        }
631
    }
632
633
    // Reads one code unit sequence starting at p, advances p past it,
    // and returns the result as CodeUnits.
    // For a multi-pass UnitIter the result also carries p0 and the advanced p;
    // for a single-pass UnitIter p0 is not used.
    // The first byte is read unconditionally; later reads are guarded by p != limit.
    // On the ill-formed path the returned code point is sub() and the returned
    // length is the number of bytes accepted before validation failed.
    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
634
            UnitIter &p0, UnitIter &p, const LimitIter &limit) {
635
        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
636
        // Very similar to U8_NEXT_OR_FFFD().
637
        CP32 c = uint8_t(*p);
638
        ++p;
639
        if (U8_IS_SINGLE(c)) {
640
            if constexpr (isMultiPass) {
641
                return {c, 1, true, p0, p};
642
            } else {
643
                return {c, 1, true};
644
            }
645
        }
646
        uint8_t length = 1;
647
        uint8_t t = 0;
        // The condition below validates and assembles the sequence in one expression.
        // It has side effects: c accumulates the code point bits, t holds the most
        // recently read trail byte (reduced to its low 6 bits), and length and p are
        // advanced for every trail byte except the last one, which is consumed in the body.
648
        if (p != limit &&
649
                // fetch/validate/assemble all but last trail byte
650
                (c >= 0xe0 ?
651
                    (c < 0xf0 ?  // U+0800..U+FFFF except surrogates
652
                        U8_LEAD3_T1_BITS[c &= 0xf] & (1 << ((t = *p) >> 5)) &&
653
                        (t &= 0x3f, 1)
654
                    :  // U+10000..U+10FFFF
655
                        (c -= 0xf0) <= 4 &&
656
                        U8_LEAD4_T1_BITS[(t = *p) >> 4] & (1 << c) &&
657
                        (c = (c << 6) | (t & 0x3f), ++length, ++p != limit) &&
658
                        (t = *p - 0x80) <= 0x3f) &&
659
                    // valid second-to-last trail byte
660
                    (c = (c << 6) | t, ++length, ++p != limit)
661
                :  // U+0080..U+07FF
662
                    c >= 0xc2 && (c &= 0x1f, 1)) &&
663
                // last trail byte
664
                (t = *p - 0x80) <= 0x3f) {
            // Well-formed: append the last trail byte's 6 bits and step over it.
665
            c = (c << 6) | t;
666
            ++length;
667
            ++p;
668
            if constexpr (isMultiPass) {
669
                return {c, length, true, p0, p};
670
            } else {
671
                return {c, length, true};
672
            }
673
        }
        // Ill-formed: p already points after the bytes that were accepted.
674
        if constexpr (isMultiPass) {
675
            return {sub(), length, false, p0, p};
676
        } else {
677
            return {sub(), length, false};
678
        }
679
    }
680
681
    // Moves p backward over one code unit sequence and returns the result as
    // CodeUnits spanning [new p, original p).
    // The byte before p is read unconditionally, so p must not equal start on entry;
    // every further backward read is preceded by a check against start.
    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
682
        // Very similar to U8_PREV_OR_FFFD().
683
        UnitIter p0 = p;
684
        CP32 c = uint8_t(*--p);
685
        if (U8_IS_SINGLE(c)) {
686
            return {c, 1, true, p, p0};
687
        }
688
        if (U8_IS_TRAIL(c) && p != start) {
689
            UnitIter p1 = p;
690
            uint8_t b1 = *--p1;
691
            if (U8_IS_LEAD(b1)) {
692
                if (b1 < 0xe0) {
                    // Complete 2-byte sequence.
693
                    p = p1;
694
                    c = ((b1 - 0xc0) << 6) | (c & 0x3f);
695
                    return {c, 2, true, p, p0};
696
                } else if (b1 < 0xf0 ?
697
                            U8_IS_VALID_LEAD3_AND_T1(b1, c) :
698
                            U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
699
                    // Truncated 3- or 4-byte sequence.
700
                    p = p1;
701
                    return {sub(), 2, false, p, p0};
702
                }
703
            } else if (U8_IS_TRAIL(b1) && p1 != start) {
704
                // Extract the value bits from the last trail byte.
705
                c &= 0x3f;
706
                uint8_t b2 = *--p1;
707
                if (0xe0 <= b2 && b2 <= 0xf4) {
708
                    if (b2 < 0xf0) {
                        // Keep only the 4 value bits of the 3-byte lead.
709
                        b2 &= 0xf;
710
                        if (U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
711
                            p = p1;
712
                            c = (b2 << 12) | ((b1 & 0x3f) << 6) | c;
713
                            return {c, 3, true, p, p0};
714
                        }
715
                    } else if (U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
716
                        // Truncated 4-byte sequence.
717
                        p = p1;
718
                        return {sub(), 3, false, p, p0};
719
                    }
720
                } else if (U8_IS_TRAIL(b2) && p1 != start) {
721
                    uint8_t b3 = *--p1;
722
                    if (0xf0 <= b3 && b3 <= 0xf4) {
                        // Keep only the 3 value bits of the 4-byte lead.
723
                        b3 &= 7;
724
                        if (U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
725
                            p = p1;
726
                            c = (b3 << 18) | ((b2 & 0x3f) << 12) | ((b1 & 0x3f) << 6) | c;
727
                            return {c, 4, true, p, p0};
728
                        }
729
                    }
730
                }
731
            }
732
        }
        // No valid (or validly truncated) sequence ends at p0:
        // report the single byte before p0 as ill-formed.
733
        return {sub(), 1, false, p, p0};
734
    }
735
};
736
737
// UTF-16
738
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
739
class UTFImpl<
740
        CP32, behavior,
741
        UnitIter, LimitIter,
742
        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
743
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
744
public:
745
    // Handle ill-formed UTF-16: One unpaired surrogate.
746
65
    U_FORCE_INLINE static CP32 sub(CP32 surrogate) {
747
        if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
748
            return U_SENTINEL;
749
65
        } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) {
750
65
            return 0xfffd;
751
        } else {
752
            static_assert(behavior == UTF_BEHAVIOR_SURROGATE);
753
            return surrogate;
754
        }
755
65
    }
756
757
0
    U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &limit) {
758
        // Very similar to U16_FWD_1().
759
0
        auto c = *p;
760
0
        ++p;
761
0
        if (U16_IS_LEAD(c) && p != limit && U16_IS_TRAIL(*p)) {
762
0
            ++p;
763
0
        }
764
0
    }
765
766
    U_FORCE_INLINE static void dec(UnitIter start, UnitIter &p) {
767
        // Very similar to U16_BACK_1().
768
        UnitIter p1;
769
        if (U16_IS_TRAIL(*--p) && p != start && (p1 = p, U16_IS_LEAD(*--p1))) {
770
            p = p1;
771
        }
772
    }
773
774
    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
775
181
            UnitIter &p0, UnitIter &p, const LimitIter &limit) {
776
181
        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
777
        // Very similar to U16_NEXT_OR_FFFD().
778
181
        CP32 c = static_cast<CP32>(*p);
779
181
        ++p;
780
181
        if (!U16_IS_SURROGATE(c)) {
781
111
            if constexpr (isMultiPass) {
782
111
                return {c, 1, true, p0, p};
783
            } else {
784
                return {c, 1, true};
785
            }
786
111
        } else {
787
70
            uint16_t c2;
788
70
            if (U16_IS_SURROGATE_LEAD(c) && p != limit && U16_IS_TRAIL(c2 = *p)) {
789
5
                ++p;
790
5
                c = U16_GET_SUPPLEMENTARY(c, c2);
791
5
                if constexpr (isMultiPass) {
792
5
                    return {c, 2, true, p0, p};
793
                } else {
794
                    return {c, 2, true};
795
                }
796
65
            } else {
797
65
                if constexpr (isMultiPass) {
798
65
                    return {sub(c), 1, false, p0, p};
799
                } else {
800
                    return {sub(c), 1, false};
801
                }
802
65
            }
803
70
        }
804
181
    }
805
806
    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter start, UnitIter &p) {
807
        // Very similar to U16_PREV_OR_FFFD().
808
        UnitIter p0 = p;
809
        CP32 c = static_cast<CP32>(*--p);
810
        if (!U16_IS_SURROGATE(c)) {
811
            return {c, 1, true, p, p0};
812
        } else {
813
            UnitIter p1;
814
            uint16_t c2;
815
            if (U16_IS_SURROGATE_TRAIL(c) && p != start && (p1 = p, U16_IS_LEAD(c2 = *--p1))) {
816
                p = p1;
817
                c = U16_GET_SUPPLEMENTARY(c2, c);
818
                return {c, 2, true, p, p0};
819
            } else {
820
                return {sub(c), 1, false, p, p0};
821
            }
822
        }
823
    }
824
};
825
826
// UTF-32: trivial, but still validating
827
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
828
class UTFImpl<
829
        CP32, behavior,
830
        UnitIter, LimitIter,
831
        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
832
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
833
public:
834
    // Handle ill-formed UTF-32
835
    U_FORCE_INLINE static CP32 sub(bool forSurrogate, CP32 surrogate) {
836
        if constexpr (behavior == UTF_BEHAVIOR_NEGATIVE) {
837
            return U_SENTINEL;
838
        } else if constexpr (behavior == UTF_BEHAVIOR_FFFD) {
839
            return 0xfffd;
840
        } else {
841
            static_assert(behavior == UTF_BEHAVIOR_SURROGATE);
842
            return forSurrogate ? surrogate : 0xfffd;
843
        }
844
    }
845
846
    U_FORCE_INLINE static void inc(UnitIter &p, const LimitIter &/*limit*/) {
847
        ++p;
848
    }
849
850
    U_FORCE_INLINE static void dec(UnitIter /*start*/, UnitIter &p) {
851
        --p;
852
    }
853
854
    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> readAndInc(
855
            UnitIter &p0, UnitIter &p, const LimitIter &/*limit*/) {
856
        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
857
        uint32_t uc = *p;
858
        CP32 c = uc;
859
        ++p;
860
        if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
861
            if constexpr (isMultiPass) {
862
                return {c, 1, true, p0, p};
863
            } else {
864
                return {c, 1, true};
865
            }
866
        } else {
867
            if constexpr (isMultiPass) {
868
                return {sub(uc < 0xe000, c), 1, false, p0, p};
869
            } else {
870
                return {sub(uc < 0xe000, c), 1, false};
871
            }
872
        }
873
    }
874
875
    U_FORCE_INLINE static CodeUnits<CP32, UnitIter> decAndRead(UnitIter /*start*/, UnitIter &p) {
876
        UnitIter p0 = p;
877
        uint32_t uc = *--p;
878
        CP32 c = uc;
879
        if (uc < 0xd800 || (0xe000 <= uc && uc <= 0x10ffff)) {
880
            return {c, 1, true, p, p0};
881
        } else {
882
            return {sub(uc < 0xe000, c), 1, false, p, p0};
883
        }
884
    }
885
};
886
887
// Non-validating implementations ------------------------------------------ ***
888
889
template<typename CP32, typename UnitIter, typename = void>
890
class UnsafeUTFImpl;
891
892
// UTF-8
893
template<typename CP32, typename UnitIter>
894
class UnsafeUTFImpl<
895
        CP32,
896
        UnitIter,
897
        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 1>> {
898
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
899
public:
900
    U_FORCE_INLINE static void inc(UnitIter &p) {
901
        // Very similar to U8_FWD_1_UNSAFE().
902
        uint8_t b = *p;
903
        std::advance(p, 1 + U8_COUNT_TRAIL_BYTES_UNSAFE(b));
904
    }
905
906
    U_FORCE_INLINE static void dec(UnitIter &p) {
907
        // Very similar to U8_BACK_1_UNSAFE().
908
        while (U8_IS_TRAIL(*--p)) {}
909
    }
910
911
    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
912
        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
913
        // Very similar to U8_NEXT_UNSAFE().
914
        CP32 c = uint8_t(*p);
915
        ++p;
916
        if (U8_IS_SINGLE(c)) {
917
            if constexpr (isMultiPass) {
918
                return {c, 1, p0, p};
919
            } else {
920
                return {c, 1};
921
            }
922
        } else if (c < 0xe0) {
923
            c = ((c & 0x1f) << 6) | (*p & 0x3f);
924
            ++p;
925
            if constexpr (isMultiPass) {
926
                return {c, 2, p0, p};
927
            } else {
928
                return {c, 2};
929
            }
930
        } else if (c < 0xf0) {
931
            // No need for (c&0xf) because the upper bits are truncated
932
            // after <<12 in the cast to uint16_t.
933
            c = uint16_t(c << 12) | ((*p & 0x3f) << 6);
934
            ++p;
935
            c |= *p & 0x3f;
936
            ++p;
937
            if constexpr (isMultiPass) {
938
                return {c, 3, p0, p};
939
            } else {
940
                return {c, 3};
941
            }
942
        } else {
943
            c = ((c & 7) << 18) | ((*p & 0x3f) << 12);
944
            ++p;
945
            c |= (*p & 0x3f) << 6;
946
            ++p;
947
            c |= *p & 0x3f;
948
            ++p;
949
            if constexpr (isMultiPass) {
950
                return {c, 4, p0, p};
951
            } else {
952
                return {c, 4};
953
            }
954
        }
955
    }
956
957
    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
958
        // Very similar to U8_PREV_UNSAFE().
959
        UnitIter p0 = p;
960
        CP32 c = uint8_t(*--p);
961
        if (U8_IS_SINGLE(c)) {
962
            return {c, 1, p, p0};
963
        }
964
        // U8_IS_TRAIL(c) if well-formed
965
        c &= 0x3f;
966
        uint8_t count = 1;
967
        for (uint8_t shift = 6;;) {
968
            uint8_t b = *--p;
969
            if (b >= 0xc0) {
970
                U8_MASK_LEAD_BYTE(b, count);
971
                c |= uint32_t{b} << shift;
972
                break;
973
            } else {
974
                c |= (uint32_t{b} & 0x3f) << shift;
975
                ++count;
976
                shift += 6;
977
            }
978
        }
979
        ++count;
980
        return {c, count, p, p0};
981
    }
982
};
983
984
// UTF-16
985
template<typename CP32, typename UnitIter>
986
class UnsafeUTFImpl<
987
        CP32,
988
        UnitIter,
989
        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 2>> {
990
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
991
public:
992
    U_FORCE_INLINE static void inc(UnitIter &p) {
993
        // Very similar to U16_FWD_1_UNSAFE().
994
        auto c = *p;
995
        ++p;
996
        if (U16_IS_LEAD(c)) {
997
            ++p;
998
        }
999
    }
1000
1001
    U_FORCE_INLINE static void dec(UnitIter &p) {
1002
        // Very similar to U16_BACK_1_UNSAFE().
1003
        if (U16_IS_TRAIL(*--p)) {
1004
            --p;
1005
        }
1006
    }
1007
1008
    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1009
        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1010
        // Very similar to U16_NEXT_UNSAFE().
1011
        CP32 c = static_cast<CP32>(*p);
1012
        ++p;
1013
        if (!U16_IS_LEAD(c)) {
1014
            if constexpr (isMultiPass) {
1015
                return {c, 1, p0, p};
1016
            } else {
1017
                return {c, 1};
1018
            }
1019
        } else {
1020
            uint16_t c2 = *p;
1021
            ++p;
1022
            c = U16_GET_SUPPLEMENTARY(c, c2);
1023
            if constexpr (isMultiPass) {
1024
                return {c, 2, p0, p};
1025
            } else {
1026
                return {c, 2};
1027
            }
1028
        }
1029
    }
1030
1031
    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1032
        // Very similar to U16_PREV_UNSAFE().
1033
        UnitIter p0 = p;
1034
        CP32 c = static_cast<CP32>(*--p);
1035
        if (!U16_IS_TRAIL(c)) {
1036
            return {c, 1, p, p0};
1037
        } else {
1038
            uint16_t c2 = *--p;
1039
            c = U16_GET_SUPPLEMENTARY(c2, c);
1040
            return {c, 2, p, p0};
1041
        }
1042
    }
1043
};
1044
1045
// UTF-32: trivial
1046
template<typename CP32, typename UnitIter>
1047
class UnsafeUTFImpl<
1048
        CP32,
1049
        UnitIter,
1050
        std::enable_if_t<sizeof(typename prv::iter_value_t<UnitIter>) == 4>> {
1051
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1052
public:
1053
    U_FORCE_INLINE static void inc(UnitIter &p) {
1054
        ++p;
1055
    }
1056
1057
    U_FORCE_INLINE static void dec(UnitIter &p) {
1058
        --p;
1059
    }
1060
1061
    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> readAndInc(UnitIter &p0, UnitIter &p) {
1062
        constexpr bool isMultiPass = prv::forward_iterator<UnitIter>;
1063
        CP32 c = *p;
1064
        ++p;
1065
        if constexpr (isMultiPass) {
1066
            return {c, 1, p0, p};
1067
        } else {
1068
            return {c, 1};
1069
        }
1070
    }
1071
1072
    U_FORCE_INLINE static UnsafeCodeUnits<CP32, UnitIter> decAndRead(UnitIter &p) {
1073
        UnitIter p0 = p;
1074
        CP32 c = *--p;
1075
        return {c, 1, p, p0};
1076
    }
1077
};
1078
1079
#endif
1080
1081
// Validating iterators ---------------------------------------------------- ***
1082
1083
/**
1084
 * Validating iterator over the code points in a Unicode string.
1085
 *
1086
 * The UnitIter can be
1087
 * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer).
1088
 * The UTFIterator will have the corresponding iterator_category.
1089
 *
1090
 * Call utfIterator() to have the compiler deduce the UnitIter and LimitIter types.
1091
 *
1092
 * For reverse iteration, either use this iterator directly as in <code>*--iter</code>
1093
 * or wrap it using std::make_reverse_iterator(iter).
1094
 *
1095
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
1096
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
1097
 * @tparam behavior How to handle ill-formed Unicode strings
1098
 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
1099
 *     UTF-8: char or char8_t or uint8_t;
1100
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
1101
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
1102
 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
1103
 * @draft ICU 78
1104
 * @see utfIterator
1105
 */
1106
template<typename CP32, UTFIllFormedBehavior behavior,
1107
         typename UnitIter, typename LimitIter = UnitIter, typename = void>
1108
class UTFIterator {
1109
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1110
    using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1111
1112
    // Proxy type for operator->() (required by LegacyInputIterator)
1113
    // so that we don't promise always returning CodeUnits.
1114
    class Proxy {
1115
    public:
1116
297
        explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1117
        CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1118
297
        CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1119
    private:
1120
        CodeUnits<CP32, UnitIter> units_;
1121
    };
1122
1123
public:
1124
    /** C++ iterator boilerplate @internal */
1125
    using value_type = CodeUnits<CP32, UnitIter>;
1126
    /** C++ iterator boilerplate @internal */
1127
    using reference = value_type;
1128
    /** C++ iterator boilerplate @internal */
1129
    using pointer = Proxy;
1130
    /** C++ iterator boilerplate @internal */
1131
    using difference_type = prv::iter_difference_t<UnitIter>;
1132
    /** C++ iterator boilerplate @internal */
1133
    using iterator_category = std::conditional_t<
1134
        prv::bidirectional_iterator<UnitIter>,
1135
        std::bidirectional_iterator_tag,
1136
        std::forward_iterator_tag>;
1137
1138
    /**
1139
     * Constructor with start <= p < limit.
1140
     * All of these iterators/pointers should be at code point boundaries.
1141
     * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
1142
     *
1143
     * When using a code unit sentinel (UnitIter≠LimitIter),
1144
     * then that sentinel also works as a sentinel for this code point iterator.
1145
     *
1146
     * @param start Start of the range
1147
     * @param p Initial position inside the range
1148
     * @param limit Limit (exclusive end) of the range
1149
     * @draft ICU 78
1150
     */
1151
    U_FORCE_INLINE UTFIterator(UnitIter start, UnitIter p, LimitIter limit) :
1152
317
            p_(p), start_(start), limit_(limit), units_(0, 0, false, p, p) {}
1153
    /**
1154
     * Constructor with start == p < limit.
1155
     * All of these iterators/pointers should be at code point boundaries.
1156
     *
1157
     * When using a code unit sentinel (UnitIter≠LimitIter),
1158
     * then that sentinel also works as a sentinel for this code point iterator.
1159
     *
1160
     * @param p Start of the range, and the initial position
1161
     * @param limit Limit (exclusive end) of the range
1162
     * @draft ICU 78
1163
     */
1164
    U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) :
1165
201
            p_(p), start_(p), limit_(limit), units_(0, 0, false, p, p) {}
1166
    /**
1167
     * Constructs an iterator start or limit sentinel.
1168
     * The iterator/pointer should be at a code point boundary.
1169
     * Requires UnitIter to be copyable.
1170
     *
1171
     * When using a code unit sentinel (UnitIter≠LimitIter),
1172
     * then that sentinel also works as a sentinel for this code point iterator.
1173
     *
1174
     * @param p Range start or limit
1175
     * @draft ICU 78
1176
     */
1177
    U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(p), start_(p), limit_(p), units_(0, 0, false, p, p) {}
1178
    /**
1179
     * Default constructor. Makes a non-functional iterator.
1180
     *
1181
     * @draft ICU 78
1182
     */
1183
    U_FORCE_INLINE UTFIterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1184
1185
    /** Move constructor. @draft ICU 78 */
1186
    U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1187
    /** Move assignment operator. @draft ICU 78 */
1188
    U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1189
1190
    /** Copy constructor. @draft ICU 78 */
1191
    U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1192
    /** Copy assignment operator. @draft ICU 78 */
1193
    U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1194
1195
    /**
1196
     * @param other Another iterator
1197
     * @return true if this iterator is at the same position as the other one
1198
     * @draft ICU 78
1199
     */
1200
317
    U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1201
317
        return base() == other.base();
1202
317
    }
1203
    /**
1204
     * @param other Another iterator
1205
     * @return true if this iterator is not at the same position as the other one
1206
     * @draft ICU 78
1207
     */
1208
116
    U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1209
1210
    // Asymmetric equality & nonequality with a sentinel type.
1211
1212
    /**
1213
     * @param iter A UTFIterator
1214
     * @param s A unit iterator sentinel
1215
     * @return true if the iterator’s position is equal to the sentinel
1216
     * @draft ICU 78
1217
     */
1218
    template<typename Sentinel> U_FORCE_INLINE friend
1219
    std::enable_if_t<
1220
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1221
        bool>
1222
    operator==(const UTFIterator &iter, const Sentinel &s) {
1223
        return iter.base() == s;
1224
    }
1225
1226
#if U_CPLUSPLUS_VERSION < 20
1227
    // C++17: Need to define all four combinations of == / != vs. parameter order.
1228
    // Once we require C++20, we could remove all but the first == because
1229
    // the compiler would generate the rest.
1230
1231
    /**
1232
     * @param s A unit iterator sentinel
1233
     * @param iter A UTFIterator
1234
     * @return true if the iterator’s position is equal to the sentinel
1235
     * @internal
1236
     */
1237
    template<typename Sentinel> U_FORCE_INLINE friend
1238
    std::enable_if_t<
1239
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1240
        bool>
1241
    operator==(const Sentinel &s, const UTFIterator &iter) {
1242
        return iter.base() == s;
1243
    }
1244
    /**
1245
     * @param iter A UTFIterator
1246
     * @param s A unit iterator sentinel
1247
     * @return true if the iterator’s position is not equal to the sentinel
1248
     * @internal
1249
     */
1250
    template<typename Sentinel> U_FORCE_INLINE friend
1251
    std::enable_if_t<
1252
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1253
        bool>
1254
    operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1255
    /**
1256
     * @param s A unit iterator sentinel
1257
     * @param iter A UTFIterator
1258
     * @return true if the iterator’s position is not equal to the sentinel
1259
     * @internal
1260
     */
1261
    template<typename Sentinel> U_FORCE_INLINE friend
1262
    std::enable_if_t<
1263
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1264
        bool>
1265
    operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1266
#endif  // C++17
1267
1268
    /**
1269
     * Returns the current position as a code unit iterator.
1270
     * Similar to iter->begin() but also works at the exclusive end().
1271
     *
1272
     * @return current position as a code unit iterator
1273
     * @draft ICU 79
1274
     */
1275
634
    U_FORCE_INLINE UnitIter base() const {
1276
        // Return the logical position.
1277
634
        return state_ <= 0 ? p_ : units_.begin();
1278
634
    }
1279
1280
    /**
1281
     * Decodes the code unit sequence at the current position.
1282
     *
1283
     * @return CodeUnits with the decoded code point etc.
1284
     * @draft ICU 78
1285
     */
1286
    U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1287
        if (state_ == 0) {
1288
            UnitIter p0 = p_;
1289
            units_ = Impl::readAndInc(p0, p_, limit_);
1290
            state_ = 1;
1291
        }
1292
        return units_;
1293
    }
1294
1295
    /**
1296
     * Decodes the code unit sequence at the current position.
1297
     * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc.
1298
     *
1299
     * @return CodeUnits with the decoded code point etc., wrapped into
1300
     *     an opaque proxy object so that <code>iter->codePoint()</code> etc. works.
1301
     * @draft ICU 78
1302
     */
1303
297
    U_FORCE_INLINE Proxy operator->() const {
1304
297
        if (state_ == 0) {
1305
181
            UnitIter p0 = p_;
1306
181
            units_ = Impl::readAndInc(p0, p_, limit_);
1307
181
            state_ = 1;
1308
181
        }
1309
297
        return Proxy(units_);
1310
297
    }
1311
1312
    /**
1313
     * Pre-increment operator.
1314
     *
1315
     * @return this iterator
1316
     * @draft ICU 78
1317
     */
1318
116
    U_FORCE_INLINE UTFIterator &operator++() {  // pre-increment
1319
116
        if (state_ > 0) {
1320
            // operator*() called readAndInc() so p_ is already ahead.
1321
116
            state_ = 0;
1322
116
        } else if (state_ == 0) {
1323
0
            Impl::inc(p_, limit_);
1324
0
        } else /* state_ < 0 */ {
1325
            // operator--() called decAndRead() so we know how far to skip.
1326
0
            p_ = units_.end();
1327
0
            state_ = 0;
1328
0
        }
1329
116
        return *this;
1330
116
    }
1331
1332
    /**
1333
     * Post-increment operator.
1334
     *
1335
     * @return a copy of this iterator from before the increment.
1336
     *     If UnitIter is a single-pass input_iterator, then this function
1337
     *     returns an opaque proxy object so that <code>*iter++</code> still works.
1338
     * @draft ICU 78
1339
     */
1340
    U_FORCE_INLINE UTFIterator operator++(int) {  // post-increment
1341
        if (state_ > 0) {
1342
            // operator*() called readAndInc() so p_ is already ahead.
1343
            UTFIterator result(*this);
1344
            state_ = 0;
1345
            return result;
1346
        } else if (state_ == 0) {
1347
            UnitIter p0 = p_;
1348
            units_ = Impl::readAndInc(p0, p_, limit_);
1349
            UTFIterator result(*this);
1350
            result.state_ = 1;
1351
            // keep this->state_ == 0
1352
            return result;
1353
        } else /* state_ < 0 */ {
1354
            UTFIterator result(*this);
1355
            // operator--() called decAndRead() so we know how far to skip.
1356
            p_ = units_.end();
1357
            state_ = 0;
1358
            return result;
1359
        }
1360
    }
1361
1362
    /**
1363
     * Pre-decrement operator.
1364
     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
1365
     *
1366
     * @return this iterator
1367
     * @draft ICU 78
1368
     */
1369
    template<typename Iter = UnitIter>
1370
    U_FORCE_INLINE
1371
    std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator &>
1372
    operator--() {  // pre-decrement
1373
        if (state_ > 0) {
1374
            // operator*() called readAndInc() so p_ is ahead of the logical position.
1375
            p_ = units_.begin();
1376
        }
1377
        units_ = Impl::decAndRead(start_, p_);
1378
        state_ = -1;
1379
        return *this;
1380
    }
1381
1382
    /**
1383
     * Post-decrement operator.
1384
     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
1385
     *
1386
     * @return a copy of this iterator from before the decrement.
1387
     * @draft ICU 78
1388
     */
1389
    template<typename Iter = UnitIter>
1390
    U_FORCE_INLINE
1391
    std::enable_if_t<prv::bidirectional_iterator<Iter>, UTFIterator>
1392
    operator--(int) {  // post-decrement
1393
        UTFIterator result(*this);
1394
        operator--();
1395
        return result;
1396
    }
1397
1398
private:
1399
    friend class std::reverse_iterator<UTFIterator<CP32, behavior, UnitIter>>;
1400
1401
    // operator*() etc. are logically const.
1402
    mutable UnitIter p_;
1403
    // In a validating iterator, we need start_ & limit_ so that when we read a code point
1404
    // (forward or backward) we can test if there are enough code units.
1405
    UnitIter start_;
1406
    LimitIter limit_;
1407
    // Keep state so that we call readAndInc() only once for both operator*() and ++
1408
    // to make it easy for the compiler to optimize.
1409
    mutable CodeUnits<CP32, UnitIter> units_;
1410
    // >0: units_ = readAndInc(), p_ = units limit
1411
    //     which means that p_ is ahead of its logical position
1412
    //  0: initial state
1413
    // <0: units_ = decAndRead(), p_ = units start
1414
    mutable int8_t state_ = 0;
1415
};
1416
1417
#ifndef U_IN_DOXYGEN
1418
// Partial template specialization for single-pass input iterator.
1419
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter, typename LimitIter>
1420
class UTFIterator<
1421
        CP32, behavior,
1422
        UnitIter, LimitIter,
1423
        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
1424
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1425
    using Impl = UTFImpl<CP32, behavior, UnitIter, LimitIter>;
1426
1427
    // Proxy type for post-increment return value, to make *iter++ work.
1428
    // Also for operator->() (required by LegacyInputIterator)
1429
    // so that we don't promise always returning CodeUnits.
1430
    class Proxy {
1431
    public:
1432
        explicit Proxy(CodeUnits<CP32, UnitIter> &units) : units_(units) {}
1433
        CodeUnits<CP32, UnitIter> &operator*() { return units_; }
1434
        CodeUnits<CP32, UnitIter> *operator->() { return &units_; }
1435
    private:
1436
        CodeUnits<CP32, UnitIter> units_;
1437
    };
1438
1439
public:
1440
    using value_type = CodeUnits<CP32, UnitIter>;
1441
    using reference = value_type;
1442
    using pointer = Proxy;
1443
    using difference_type = prv::iter_difference_t<UnitIter>;
1444
    using iterator_category = std::input_iterator_tag;
1445
1446
    U_FORCE_INLINE UTFIterator(UnitIter p, LimitIter limit) : p_(std::move(p)), limit_(std::move(limit)) {}
1447
1448
    // Constructs an iterator start or limit sentinel.
1449
    // Requires p to be copyable.
1450
    U_FORCE_INLINE explicit UTFIterator(UnitIter p) : p_(std::move(p)), limit_(p_) {}
1451
1452
    U_FORCE_INLINE UTFIterator(UTFIterator &&src) noexcept = default;
1453
    U_FORCE_INLINE UTFIterator &operator=(UTFIterator &&src) noexcept = default;
1454
1455
    U_FORCE_INLINE UTFIterator(const UTFIterator &other) = default;
1456
    U_FORCE_INLINE UTFIterator &operator=(const UTFIterator &other) = default;
1457
1458
    U_FORCE_INLINE bool operator==(const UTFIterator &other) const {
1459
        return p_ == other.p_ && ahead_ == other.ahead_;
1460
        // Strictly speaking, we should check if the logical position is the same.
1461
        // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
1462
    }
1463
    U_FORCE_INLINE bool operator!=(const UTFIterator &other) const { return !operator==(other); }
1464
1465
    template<typename Sentinel> U_FORCE_INLINE friend
1466
    std::enable_if_t<
1467
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1468
        bool>
1469
    operator==(const UTFIterator &iter, const Sentinel &s) {
1470
        return !iter.ahead_ && iter.p_ == s;
1471
    }
1472
1473
#if U_CPLUSPLUS_VERSION < 20
1474
    template<typename Sentinel> U_FORCE_INLINE friend
1475
    std::enable_if_t<
1476
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1477
        bool>
1478
    operator==(const Sentinel &s, const UTFIterator &iter) {
1479
        return !iter.ahead_ && iter.p_ == s;
1480
    }
1481
1482
    template<typename Sentinel> U_FORCE_INLINE friend
1483
    std::enable_if_t<
1484
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1485
        bool>
1486
    operator!=(const UTFIterator &iter, const Sentinel &s) { return !(iter == s); }
1487
1488
    template<typename Sentinel> U_FORCE_INLINE friend
1489
    std::enable_if_t<
1490
        !std::is_same_v<Sentinel, UTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
1491
        bool>
1492
    operator!=(const Sentinel &s, const UTFIterator &iter) { return !(iter == s); }
1493
#endif  // C++17
1494
1495
    U_FORCE_INLINE CodeUnits<CP32, UnitIter> operator*() const {
1496
        if (!ahead_) {
1497
            units_ = Impl::readAndInc(p_, p_, limit_);
1498
            ahead_ = true;
1499
        }
1500
        return units_;
1501
    }
1502
1503
    U_FORCE_INLINE Proxy operator->() const {
1504
        if (!ahead_) {
1505
            units_ = Impl::readAndInc(p_, p_, limit_);
1506
            ahead_ = true;
1507
        }
1508
        return Proxy(units_);
1509
    }
1510
1511
    U_FORCE_INLINE UTFIterator &operator++() {  // pre-increment
1512
        if (ahead_) {
1513
            // operator*() called readAndInc() so p_ is already ahead.
1514
            ahead_ = false;
1515
        } else {
1516
            Impl::inc(p_, limit_);
1517
        }
1518
        return *this;
1519
    }
1520
1521
    U_FORCE_INLINE Proxy operator++(int) {  // post-increment
1522
        if (ahead_) {
1523
            // operator*() called readAndInc() so p_ is already ahead.
1524
            ahead_ = false;
1525
        } else {
1526
            units_ = Impl::readAndInc(p_, p_, limit_);
1527
            // keep this->ahead_ == false
1528
        }
1529
        return Proxy(units_);
1530
    }
1531
1532
private:
1533
    // operator*() etc. are logically const.
1534
    mutable UnitIter p_;
1535
    // In a validating iterator, we need limit_ so that when we read a code point
1536
    // we can test if there are enough code units.
1537
    LimitIter limit_;
1538
    // Keep state so that we call readAndInc() only once for both operator*() and ++
1539
    // so that we can use a single-pass input iterator for UnitIter.
1540
    mutable CodeUnits<CP32, UnitIter> units_ = {0, 0, false};
1541
    // true: units_ = readAndInc(), p_ = units limit
1542
    //     which means that p_ is ahead of its logical position
1543
    // false: initial state
1544
    mutable bool ahead_ = false;
1545
};
1546
#endif  // U_IN_DOXYGEN
1547
1548
}  // namespace U_HEADER_ONLY_NAMESPACE
1549
1550
#ifndef U_IN_DOXYGEN
1551
// Bespoke specialization of reverse_iterator.
1552
// The default implementation implements reverse operator*() and ++ in a way
1553
// that does most of the same work twice for reading variable-length sequences.
1554
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1555
class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>> {
1556
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1557
    using Impl = U_HEADER_ONLY_NAMESPACE::UTFImpl<CP32, behavior, UnitIter>;
1558
    using CodeUnits_ = U_HEADER_ONLY_NAMESPACE::CodeUnits<CP32, UnitIter>;
1559
1560
    // Proxy type for operator->() (required by LegacyInputIterator)
1561
    // so that we don't promise always returning CodeUnits.
1562
    class Proxy {
1563
    public:
1564
        explicit Proxy(CodeUnits_ units) : units_(units) {}
1565
        CodeUnits_ &operator*() { return units_; }
1566
        CodeUnits_ *operator->() { return &units_; }
1567
    private:
1568
        CodeUnits_ units_;
1569
    };
1570
1571
public:
1572
    using value_type = CodeUnits_;
1573
    using reference = value_type;
1574
    using pointer = Proxy;
1575
    using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
1576
    using iterator_category = std::bidirectional_iterator_tag;
1577
1578
    U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> iter) :
1579
            p_(iter.base()), start_(iter.start_), limit_(iter.limit_),
1580
            units_(0, 0, false, p_, p_) {}
1581
    U_FORCE_INLINE reverse_iterator() : p_{}, start_{}, limit_{}, units_(0, 0, false, p_, p_) {}
1582
1583
    U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
1584
    U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
1585
1586
    U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
1587
    U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
1588
1589
    U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
1590
        return getLogicalPosition() == other.getLogicalPosition();
1591
    }
1592
    U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
1593
1594
    U_FORCE_INLINE U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter> base() const {
1595
        return U_HEADER_ONLY_NAMESPACE::UTFIterator<CP32, behavior, UnitIter>(
1596
            start_, getLogicalPosition(), limit_);
1597
    }
1598
1599
    U_FORCE_INLINE CodeUnits_ operator*() const {
1600
        if (state_ == 0) {
1601
            units_ = Impl::decAndRead(start_, p_);
1602
            state_ = -1;
1603
        }
1604
        return units_;
1605
    }
1606
1607
    U_FORCE_INLINE Proxy operator->() const {
1608
        if (state_ == 0) {
1609
            units_ = Impl::decAndRead(start_, p_);
1610
            state_ = -1;
1611
        }
1612
        return Proxy(units_);
1613
    }
1614
1615
    U_FORCE_INLINE reverse_iterator &operator++() {  // pre-increment
1616
        if (state_ < 0) {
1617
            // operator*() called decAndRead() so p_ is already behind.
1618
            state_ = 0;
1619
        } else if (state_ == 0) {
1620
            Impl::dec(start_, p_);
1621
        } else /* state_ > 0 */ {
1622
            // operator--() called readAndInc() so we know how far to skip.
1623
            p_ = units_.begin();
1624
            state_ = 0;
1625
        }
1626
        return *this;
1627
    }
1628
1629
    U_FORCE_INLINE reverse_iterator operator++(int) {  // post-increment
1630
        if (state_ < 0) {
1631
            // operator*() called decAndRead() so p_ is already behind.
1632
            reverse_iterator result(*this);
1633
            state_ = 0;
1634
            return result;
1635
        } else if (state_ == 0) {
1636
            units_ = Impl::decAndRead(start_, p_);
1637
            reverse_iterator result(*this);
1638
            result.state_ = -1;
1639
            // keep this->state_ == 0
1640
            return result;
1641
        } else /* state_ > 0 */ {
1642
            reverse_iterator result(*this);
1643
            // operator--() called readAndInc() so we know how far to skip.
1644
            p_ = units_.begin();
1645
            state_ = 0;
1646
            return result;
1647
        }
1648
    }
1649
1650
    U_FORCE_INLINE reverse_iterator &operator--() {  // pre-decrement
1651
        if (state_ < 0) {
1652
            // operator*() called decAndRead() so p_ is behind the logical position.
1653
            p_ = units_.end();
1654
        }
1655
        UnitIter p0 = p_;
1656
        units_ = Impl::readAndInc(p0, p_, limit_);
1657
        state_ = 1;
1658
        return *this;
1659
    }
1660
1661
    U_FORCE_INLINE reverse_iterator operator--(int) {  // post-decrement
1662
        reverse_iterator result(*this);
1663
        operator--();
1664
        return result;
1665
    }
1666
1667
private:
1668
    U_FORCE_INLINE UnitIter getLogicalPosition() const {
1669
        return state_ >= 0 ? p_ : units_.end();
1670
    }
1671
1672
    // operator*() etc. are logically const.
1673
    mutable UnitIter p_;
1674
    // In a validating iterator, we need start_ & limit_ so that when we read a code point
1675
    // (forward or backward) we can test if there are enough code units.
1676
    UnitIter start_;
1677
    UnitIter limit_;
1678
    // Keep state so that we call decAndRead() only once for both operator*() and ++
1679
    // to make it easy for the compiler to optimize.
1680
    mutable CodeUnits_ units_;
1681
    // >0: units_ = readAndInc(), p_ = units limit
1682
    //  0: initial state
1683
    // <0: units_ = decAndRead(), p_ = units start
1684
    //     which means that p_ is behind its logical position
1685
    mutable int8_t state_ = 0;
1686
};
1687
#endif  // U_IN_DOXYGEN
1688
1689
namespace U_HEADER_ONLY_NAMESPACE {
1690
1691
/**
1692
 * UTFIterator factory function for start <= p < limit.
1693
 * Deduces the UnitIter and LimitIter template parameters from the inputs.
1694
 * Only enabled if UnitIter is a (multi-pass) forward_iterator or better.
1695
 *
1696
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
1697
 * @tparam behavior How to handle ill-formed Unicode strings
1698
 * @tparam UnitIter Can usually be omitted/deduced:
1699
 *     An iterator (often a pointer) that returns a code unit type:
1700
 *     UTF-8: char or char8_t or uint8_t;
1701
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
1702
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
1703
 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
1704
 * @param start start code unit iterator
1705
 * @param p current-position code unit iterator
1706
 * @param limit limit (exclusive-end) code unit iterator.
1707
 *     When using a code unit sentinel (UnitIter≠LimitIter),
1708
 *     then that sentinel also works as a sentinel for the code point iterator.
1709
 * @return a UTFIterator&lt;CP32, behavior, UnitIter&gt;
1710
 *     for the given code unit iterators or character pointers
1711
 * @draft ICU 78
1712
 */
1713
template<typename CP32, UTFIllFormedBehavior behavior,
1714
         typename UnitIter, typename LimitIter = UnitIter>
1715
317
auto utfIterator(UnitIter start, UnitIter p, LimitIter limit) {
1716
317
    return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1717
317
        std::move(start), std::move(p), std::move(limit));
1718
317
}
1719
1720
/**
1721
 * UTFIterator factory function for start = p < limit.
1722
 * Deduces the UnitIter and LimitIter template parameters from the inputs.
1723
 *
1724
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
1725
 * @tparam behavior How to handle ill-formed Unicode strings
1726
 * @tparam UnitIter Can usually be omitted/deduced:
1727
 *     An iterator (often a pointer) that returns a code unit type:
1728
 *     UTF-8: char or char8_t or uint8_t;
1729
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
1730
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
1731
 * @tparam LimitIter Either the same as UnitIter, or an iterator sentinel type.
1732
 * @param p start and current-position code unit iterator
1733
 * @param limit limit (exclusive-end) code unit iterator.
1734
 *     When using a code unit sentinel (UnitIter≠LimitIter),
1735
 *     then that sentinel also works as a sentinel for the code point iterator.
1736
 * @return a UTFIterator&lt;CP32, behavior, UnitIter&gt;
1737
 *     for the given code unit iterators or character pointers
1738
 * @draft ICU 78
1739
 */
1740
template<typename CP32, UTFIllFormedBehavior behavior,
1741
         typename UnitIter, typename LimitIter = UnitIter>
1742
201
auto utfIterator(UnitIter p, LimitIter limit) {
1743
201
    return UTFIterator<CP32, behavior, UnitIter, LimitIter>(
1744
201
        std::move(p), std::move(limit));
1745
201
}
1746
1747
// Note: We should only enable the following factory function for a copyable UnitIter.
1748
// In C++17, we would have to partially specialize with enable_if_t testing for forward_iterator,
1749
// but a function template partial specialization is not allowed.
1750
// In C++20, we might be able to require the std::copyable concept.
1751
1752
/**
1753
 * UTFIterator factory function for a start or limit sentinel.
1754
 * Deduces the UnitIter template parameter from the input.
1755
 * Requires UnitIter to be copyable.
1756
 *
1757
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
1758
 * @tparam behavior How to handle ill-formed Unicode strings
1759
 * @tparam UnitIter Can usually be omitted/deduced:
1760
 *     An iterator (often a pointer) that returns a code unit type:
1761
 *     UTF-8: char or char8_t or uint8_t;
1762
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
1763
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
1764
 * @param p code unit iterator.
1765
 *     When using a code unit sentinel,
1766
 *     then that sentinel also works as a sentinel for the code point iterator.
1767
 * @return a UTFIterator&lt;CP32, behavior, UnitIter&gt;
1768
 *     for the given code unit iterator or character pointer
1769
 * @draft ICU 78
1770
 */
1771
template<typename CP32, UTFIllFormedBehavior behavior, typename UnitIter>
1772
auto utfIterator(UnitIter p) {
1773
    return UTFIterator<CP32, behavior, UnitIter>(std::move(p));
1774
}
1775
1776
/**
1777
 * A C++ "range" for validating iteration over all of the code points of a code unit range.
1778
 *
1779
 * Call utfStringCodePoints() to have the compiler deduce the Range type.
1780
 *
1781
 * UTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range
1782
 * so is UTFStringCodePoints<CP32, behavior, Range>.
1783
 * Note that when given a range r that is an lvalue and is not a view,  utfStringCodePoints(r) uses a
1784
 * ref_view of r as the Range type, which is a borrowed range.
1785
 * In practice, this means that given a container variable r, the iterators of utfStringCodePoints(r) can
1786
 * be used as long as iterators on r are valid, without having to keep utfStringCodePoints(r) around.
1787
 * For instance:
1788
 * \code
1789
 *     std::u8string s = u8"𒇧𒇧";
1790
 *     // it outlives utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s).
1791
 *     auto it = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s).begin();
1792
 *     ++it;
1793
 *     char32_t second_code_point = it->codePoint();  // OK.
1794
 * \endcode
1795
 * 
1796
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
1797
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
1798
 * @tparam behavior How to handle ill-formed Unicode strings
1799
 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
1800
 * @draft ICU 78
1801
 * @see utfStringCodePoints
1802
 */
1803
template<typename CP32, UTFIllFormedBehavior behavior, typename Range>
1804
#if defined(__cpp_lib_ranges)
1805
    requires std::ranges::range<Range>
1806
#endif
1807
class UTFStringCodePoints {
1808
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
1809
public:
1810
    /**
1811
     * Constructs an empty C++ "range" object.
1812
     * @draft ICU 78
1813
     */
1814
    UTFStringCodePoints() = default;
1815
1816
    /**
1817
     * Constructs a C++ "range" object over the code points in the string.
1818
     * @param unitRange input range
1819
     * @draft ICU 78
1820
     */
1821
    template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
1822
201
    explicit UTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
1823
    /**
1824
     * Constructs a C++ "range" object over the code points in the string,
1825
     * keeping a reference to the code unit range.  This overload is used by
1826
     * utfStringCodePoints in C++17; in C+20, a ref_view is used instead (via
1827
     * views::all).
1828
     * @param unitRange input range
1829
     * @draft ICU 78
1830
     */
1831
    template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
1832
    explicit UTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
1833
1834
    /** Copy constructor. @draft ICU 78 */
1835
    UTFStringCodePoints(const UTFStringCodePoints &other) = default;
1836
1837
    /** Copy assignment operator. @draft ICU 78 */
1838
    UTFStringCodePoints &operator=(const UTFStringCodePoints &other) = default;
1839
1840
    /**
1841
     * @return the range start iterator
1842
     * @draft ICU 78
1843
     */
1844
    auto begin() {
1845
        return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1846
    }
1847
1848
    /**
1849
     * @return the range start iterator
1850
     * @draft ICU 78
1851
     */
1852
    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1853
201
    auto begin() const {
1854
201
        return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end());
1855
201
    }
1856
1857
    /**
1858
     * @return the range limit (exclusive end) iterator
1859
     * @draft ICU 78
1860
     */
1861
    auto end() {
1862
        using UnitIter = decltype(unitRange.begin());
1863
        using LimitIter = decltype(unitRange.end());
1864
        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1865
            // Return the code unit sentinel.
1866
            return unitRange.end();
1867
        } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1868
            return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1869
        } else {
1870
            // The input iterator specialization has no three-argument constructor.
1871
            return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1872
        }
1873
    }
1874
1875
    /**
1876
     * @return the range limit (exclusive end) iterator
1877
     * @draft ICU 78
1878
     */
1879
    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
1880
317
    auto end() const {
1881
317
        using UnitIter = decltype(unitRange.begin());
1882
317
        using LimitIter = decltype(unitRange.end());
1883
        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
1884
            // Return the code unit sentinel.
1885
            return unitRange.end();
1886
317
        } else if constexpr (prv::bidirectional_iterator<UnitIter>) {
1887
317
            return utfIterator<CP32, behavior>(unitRange.begin(), unitRange.end(), unitRange.end());
1888
        } else {
1889
            // The input iterator specialization has no three-argument constructor.
1890
            return utfIterator<CP32, behavior>(unitRange.end(), unitRange.end());
1891
        }
1892
317
    }
1893
1894
    /**
1895
     * @return std::reverse_iterator(end())
1896
     * @draft ICU 78
1897
     */
1898
    auto rbegin() const {
1899
        return std::make_reverse_iterator(end());
1900
    }
1901
1902
    /**
1903
     * @return std::reverse_iterator(begin())
1904
     * @draft ICU 78
1905
     */
1906
    auto rend() const {
1907
        return std::make_reverse_iterator(begin());
1908
    }
1909
1910
    /**
1911
     * Returns the CodeUnits for the first character/code point.
1912
     * Requires that the range is not empty.
1913
     *
1914
     * @return the CodeUnits for the first character/code point.
1915
     * @draft ICU 79
1916
     */
1917
    auto front() const {
1918
        return *begin();
1919
    }
1920
1921
    /**
1922
     * Returns the CodeUnits for the last character/code point.
1923
     * Requires that the range is not empty.
1924
     *
1925
     * @return the CodeUnits for the last character/code point.
1926
     * @draft ICU 79
1927
     */
1928
    auto back() const {
1929
        return *(--end());
1930
    }
1931
1932
private:
1933
    Range unitRange;
1934
};
1935
1936
/** @internal */
1937
template<typename CP32, UTFIllFormedBehavior behavior>
1938
struct UTFStringCodePointsAdaptor
1939
#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 &&                                         \
1940
    __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
1941
    : std::ranges::range_adaptor_closure<UTFStringCodePointsAdaptor<CP32, behavior>>
1942
#endif
1943
{
1944
    /** @internal */
1945
    template<typename Range>
1946
201
    auto operator()(Range &&unitRange) const {
1947
#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10  // We need https://wg21.link/P2415R2.
1948
        return UTFStringCodePoints<CP32, behavior, std::ranges::views::all_t<Range>>(
1949
            std::forward<Range>(unitRange));
1950
#else
1951
201
        if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
1952
            // Take basic_string_view by copy, not by reference.  In C++20 this is handled by
1953
            // all_t<Range>, which is Range if Range is a view.
1954
201
            return UTFStringCodePoints<CP32, behavior, std::decay_t<Range>>(
1955
201
                std::forward<Range>(unitRange));
1956
        } else {
1957
            return UTFStringCodePoints<CP32, behavior, Range>(std::forward<Range>(unitRange));
1958
        }
1959
201
#endif
1960
201
    }
1961
};
1962
1963
/**
1964
 * Range adaptor function object returning a UTFStringCodePoints object that represents a "range" of code
1965
 * points in a code unit range, which validates while decoding.
1966
 * Deduces the Range template parameter from the input, taking into account the value category: the
1967
 * code units will be referenced if possible, and moved if necessary.
1968
 *
1969
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t;
1970
 *              should be signed if UTF_BEHAVIOR_NEGATIVE
1971
 * @tparam behavior How to handle ill-formed Unicode strings
1972
 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
1973
 * @param unitRange input range
1974
 * @return a UTFStringCodePoints&lt;CP32, behavior, Range&gt; for the given unitRange
1975
 * @draft ICU 78
1976
 */
1977
template<typename CP32, UTFIllFormedBehavior behavior>
1978
constexpr UTFStringCodePointsAdaptor<CP32, behavior> utfStringCodePoints;
1979
1980
// Non-validating iterators ------------------------------------------------ ***
1981
1982
/**
1983
 * Non-validating iterator over the code points in a Unicode string.
1984
 * The string must be well-formed.
1985
 *
1986
 * The UnitIter can be
1987
 * an input_iterator, a forward_iterator, or a bidirectional_iterator (including a pointer).
1988
 * The UnsafeUTFIterator will have the corresponding iterator_category.
1989
 *
1990
 * Call unsafeUTFIterator() to have the compiler deduce the UnitIter type.
1991
 *
1992
 * For reverse iteration, either use this iterator directly as in <code>*--iter</code>
1993
 * or wrap it using std::make_reverse_iterator(iter).
1994
 *
1995
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
1996
 * @tparam UnitIter An iterator (often a pointer) that returns a code unit type:
1997
 *     UTF-8: char or char8_t or uint8_t;
1998
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
1999
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
2000
 * @draft ICU 78
2001
 * @see unsafeUTFIterator
2002
 */
2003
template<typename CP32, typename UnitIter, typename = void>
2004
class UnsafeUTFIterator {
2005
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2006
    using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2007
2008
    // Proxy type for operator->() (required by LegacyInputIterator)
2009
    // so that we don't promise always returning UnsafeCodeUnits.
2010
    class Proxy {
2011
    public:
2012
        explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2013
        UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2014
        UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2015
    private:
2016
        UnsafeCodeUnits<CP32, UnitIter> units_;
2017
    };
2018
2019
public:
2020
    /** C++ iterator boilerplate @internal */
2021
    using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2022
    /** C++ iterator boilerplate @internal */
2023
    using reference = value_type;
2024
    /** C++ iterator boilerplate @internal */
2025
    using pointer = Proxy;
2026
    /** C++ iterator boilerplate @internal */
2027
    using difference_type = prv::iter_difference_t<UnitIter>;
2028
    /** C++ iterator boilerplate @internal */
2029
    using iterator_category = std::conditional_t<
2030
        prv::bidirectional_iterator<UnitIter>,
2031
        std::bidirectional_iterator_tag,
2032
        std::forward_iterator_tag>;
2033
2034
    /**
2035
     * Constructor; the iterator/pointer should be at a code point boundary.
2036
     *
2037
     * When using a code unit sentinel,
2038
     * then that sentinel also works as a sentinel for this code point iterator.
2039
     *
2040
     * @param p Initial position inside the range, or a range sentinel
2041
     * @draft ICU 78
2042
     */
2043
    U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(p), units_(0, 0, p, p) {}
2044
    /**
2045
     * Default constructor. Makes a non-functional iterator.
2046
     *
2047
     * @draft ICU 78
2048
     */
2049
    U_FORCE_INLINE UnsafeUTFIterator() : p_{}, units_(0, 0, p_, p_) {}
2050
2051
    /** Move constructor. @draft ICU 78 */
2052
    U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2053
    /** Move assignment operator. @draft ICU 78 */
2054
    U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
2055
2056
    /** Copy constructor. @draft ICU 78 */
2057
    U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2058
    /** Copy assignment operator. @draft ICU 78 */
2059
    U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
2060
2061
    /**
2062
     * @param other Another iterator
2063
     * @return true if this iterator is at the same position as the other one
2064
     * @draft ICU 78
2065
     */
2066
    U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2067
        return base() == other.base();
2068
    }
2069
    /**
2070
     * @param other Another iterator
2071
     * @return true if this iterator is not at the same position as the other one
2072
     * @draft ICU 78
2073
     */
2074
    U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2075
2076
    /**
2077
     * @param iter An UnsafeUTFIterator
2078
     * @param s A unit iterator sentinel
2079
     * @return true if the iterator’s position is equal to the sentinel
2080
     * @draft ICU 78
2081
     */
2082
    template<typename Sentinel> U_FORCE_INLINE friend
2083
    std::enable_if_t<
2084
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2085
        bool>
2086
    operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2087
        return iter.base() == s;
2088
    }
2089
2090
#if U_CPLUSPLUS_VERSION < 20
2091
    /**
2092
     * @param s A unit iterator sentinel
2093
     * @param iter An UnsafeUTFIterator
2094
     * @return true if the iterator’s position is equal to the sentinel
2095
     * @internal
2096
     */
2097
    template<typename Sentinel> U_FORCE_INLINE friend
2098
    std::enable_if_t<
2099
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2100
        bool>
2101
    operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2102
        return iter.base() == s;
2103
    }
2104
    /**
2105
     * @param iter An UnsafeUTFIterator
2106
     * @param s A unit iterator sentinel
2107
     * @return true if the iterator’s position is not equal to the sentinel
2108
     * @internal
2109
     */
2110
    template<typename Sentinel> U_FORCE_INLINE friend
2111
    std::enable_if_t<
2112
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2113
        bool>
2114
    operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2115
    /**
2116
     * @param s A unit iterator sentinel
2117
     * @param iter An UnsafeUTFIterator
2118
     * @return true if the iterator’s position is not equal to the sentinel
2119
     * @internal
2120
     */
2121
    template<typename Sentinel> U_FORCE_INLINE friend
2122
    std::enable_if_t<
2123
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2124
        bool>
2125
    operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2126
#endif  // C++17
2127
2128
    /**
2129
     * Returns the current position as a code unit iterator.
2130
     * Similar to iter->begin() but also works at the exclusive end().
2131
     *
2132
     * @return current position as a code unit iterator
2133
     * @draft ICU 79
2134
     */
2135
    U_FORCE_INLINE UnitIter base() const {
2136
        // Return the logical position.
2137
        return state_ <= 0 ? p_ : units_.begin();
2138
    }
2139
2140
    /**
2141
     * Decodes the code unit sequence at the current position.
2142
     *
2143
     * @return CodeUnits with the decoded code point etc.
2144
     * @draft ICU 78
2145
     */
2146
    U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2147
        if (state_ == 0) {
2148
            UnitIter p0 = p_;
2149
            units_ = Impl::readAndInc(p0, p_);
2150
            state_ = 1;
2151
        }
2152
        return units_;
2153
    }
2154
2155
    /**
2156
     * Decodes the code unit sequence at the current position.
2157
     * Used like <code>iter->codePoint()</code> or <code>iter->stringView()</code> etc.
2158
     *
2159
     * @return CodeUnits with the decoded code point etc., wrapped into
2160
     *     an opaque proxy object so that <code>iter->codePoint()</code> etc. works.
2161
     * @draft ICU 78
2162
     */
2163
    U_FORCE_INLINE Proxy operator->() const {
2164
        if (state_ == 0) {
2165
            UnitIter p0 = p_;
2166
            units_ = Impl::readAndInc(p0, p_);
2167
            state_ = 1;
2168
        }
2169
        return Proxy(units_);
2170
    }
2171
2172
    /**
2173
     * Pre-increment operator.
2174
     *
2175
     * @return this iterator
2176
     * @draft ICU 78
2177
     */
2178
    U_FORCE_INLINE UnsafeUTFIterator &operator++() {  // pre-increment
2179
        if (state_ > 0) {
2180
            // operator*() called readAndInc() so p_ is already ahead.
2181
            state_ = 0;
2182
        } else if (state_ == 0) {
2183
            Impl::inc(p_);
2184
        } else /* state_ < 0 */ {
2185
            // operator--() called decAndRead() so we know how far to skip.
2186
            p_ = units_.end();
2187
            state_ = 0;
2188
        }
2189
        return *this;
2190
    }
2191
2192
    /**
2193
     * Post-increment operator.
2194
     *
2195
     * @return a copy of this iterator from before the increment.
2196
     *     If UnitIter is a single-pass input_iterator, then this function
2197
     *     returns an opaque proxy object so that <code>*iter++</code> still works.
2198
     * @draft ICU 78
2199
     */
2200
    U_FORCE_INLINE UnsafeUTFIterator operator++(int) {  // post-increment
2201
        if (state_ > 0) {
2202
            // operator*() called readAndInc() so p_ is already ahead.
2203
            UnsafeUTFIterator result(*this);
2204
            state_ = 0;
2205
            return result;
2206
        } else if (state_ == 0) {
2207
            UnitIter p0 = p_;
2208
            units_ = Impl::readAndInc(p0, p_);
2209
            UnsafeUTFIterator result(*this);
2210
            result.state_ = 1;
2211
            // keep this->state_ == 0
2212
            return result;
2213
        } else /* state_ < 0 */ {
2214
            UnsafeUTFIterator result(*this);
2215
            // operator--() called decAndRead() so we know how far to skip.
2216
            p_ = units_.end();
2217
            state_ = 0;
2218
            return result;
2219
        }
2220
    }
2221
2222
    /**
2223
     * Pre-decrement operator.
2224
     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
2225
     *
2226
     * @return this iterator
2227
     * @draft ICU 78
2228
     */
2229
    template<typename Iter = UnitIter>
2230
    U_FORCE_INLINE
2231
    std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator &>
2232
    operator--() {  // pre-decrement
2233
        if (state_ > 0) {
2234
            // operator*() called readAndInc() so p_ is ahead of the logical position.
2235
            p_ = units_.begin();
2236
        }
2237
        units_ = Impl::decAndRead(p_);
2238
        state_ = -1;
2239
        return *this;
2240
    }
2241
2242
    /**
2243
     * Post-decrement operator.
2244
     * Only enabled if UnitIter is a bidirectional_iterator (including a pointer).
2245
     *
2246
     * @return a copy of this iterator from before the decrement.
2247
     * @draft ICU 78
2248
     */
2249
    template<typename Iter = UnitIter>
2250
    U_FORCE_INLINE
2251
    std::enable_if_t<prv::bidirectional_iterator<Iter>, UnsafeUTFIterator>
2252
    operator--(int) {  // post-decrement
2253
        UnsafeUTFIterator result(*this);
2254
        operator--();
2255
        return result;
2256
    }
2257
2258
private:
2259
    friend class std::reverse_iterator<UnsafeUTFIterator<CP32, UnitIter>>;
2260
2261
    // operator*() etc. are logically const.
2262
    mutable UnitIter p_;
2263
    // Keep state so that we call readAndInc() only once for both operator*() and ++
2264
    // to make it easy for the compiler to optimize.
2265
    mutable UnsafeCodeUnits<CP32, UnitIter> units_;
2266
    // >0: units_ = readAndInc(), p_ = units limit
2267
    //     which means that p_ is ahead of its logical position
2268
    //  0: initial state
2269
    // <0: units_ = decAndRead(), p_ = units start
2270
    mutable int8_t state_ = 0;
2271
};
2272
2273
#ifndef U_IN_DOXYGEN
2274
// Partial template specialization for single-pass input iterator.
2275
template<typename CP32, typename UnitIter>
2276
class UnsafeUTFIterator<
2277
        CP32,
2278
        UnitIter,
2279
        std::enable_if_t<!prv::forward_iterator<UnitIter>>> {
2280
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2281
    using Impl = UnsafeUTFImpl<CP32, UnitIter>;
2282
2283
    // Proxy type for post-increment return value, to make *iter++ work.
2284
    // Also for operator->() (required by LegacyInputIterator)
2285
    // so that we don't promise always returning UnsafeCodeUnits.
2286
    class Proxy {
2287
    public:
2288
        explicit Proxy(UnsafeCodeUnits<CP32, UnitIter> &units) : units_(units) {}
2289
        UnsafeCodeUnits<CP32, UnitIter> &operator*() { return units_; }
2290
        UnsafeCodeUnits<CP32, UnitIter> *operator->() { return &units_; }
2291
    private:
2292
        UnsafeCodeUnits<CP32, UnitIter> units_;
2293
    };
2294
2295
public:
2296
    using value_type = UnsafeCodeUnits<CP32, UnitIter>;
2297
    using reference = value_type;
2298
    using pointer = Proxy;
2299
    using difference_type = prv::iter_difference_t<UnitIter>;
2300
    using iterator_category = std::input_iterator_tag;
2301
2302
    U_FORCE_INLINE explicit UnsafeUTFIterator(UnitIter p) : p_(std::move(p)) {}
2303
2304
    U_FORCE_INLINE UnsafeUTFIterator(UnsafeUTFIterator &&src) noexcept = default;
2305
    U_FORCE_INLINE UnsafeUTFIterator &operator=(UnsafeUTFIterator &&src) noexcept = default;
2306
2307
    U_FORCE_INLINE UnsafeUTFIterator(const UnsafeUTFIterator &other) = default;
2308
    U_FORCE_INLINE UnsafeUTFIterator &operator=(const UnsafeUTFIterator &other) = default;
2309
2310
    U_FORCE_INLINE bool operator==(const UnsafeUTFIterator &other) const {
2311
        return p_ == other.p_ && ahead_ == other.ahead_;
2312
        // Strictly speaking, we should check if the logical position is the same.
2313
        // However, we cannot advance, or do arithmetic with, a single-pass UnitIter.
2314
    }
2315
    U_FORCE_INLINE bool operator!=(const UnsafeUTFIterator &other) const { return !operator==(other); }
2316
2317
    template<typename Sentinel> U_FORCE_INLINE friend
2318
    std::enable_if_t<
2319
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2320
        bool>
2321
    operator==(const UnsafeUTFIterator &iter, const Sentinel &s) {
2322
        return !iter.ahead_ && iter.p_ == s;
2323
    }
2324
2325
#if U_CPLUSPLUS_VERSION < 20
2326
    template<typename Sentinel> U_FORCE_INLINE friend
2327
    std::enable_if_t<
2328
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2329
        bool>
2330
    operator==(const Sentinel &s, const UnsafeUTFIterator &iter) {
2331
        return !iter.ahead_ && iter.p_ == s;
2332
    }
2333
2334
    template<typename Sentinel> U_FORCE_INLINE friend
2335
    std::enable_if_t<
2336
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2337
        bool>
2338
    operator!=(const UnsafeUTFIterator &iter, const Sentinel &s) { return !(iter == s); }
2339
2340
    template<typename Sentinel> U_FORCE_INLINE friend
2341
    std::enable_if_t<
2342
        !std::is_same_v<Sentinel, UnsafeUTFIterator> && !std::is_same_v<Sentinel, UnitIter>,
2343
        bool>
2344
    operator!=(const Sentinel &s, const UnsafeUTFIterator &iter) { return !(iter == s); }
2345
#endif  // C++17
2346
2347
    U_FORCE_INLINE UnsafeCodeUnits<CP32, UnitIter> operator*() const {
2348
        if (!ahead_) {
2349
            units_ = Impl::readAndInc(p_, p_);
2350
            ahead_ = true;
2351
        }
2352
        return units_;
2353
    }
2354
2355
    U_FORCE_INLINE Proxy operator->() const {
2356
        if (!ahead_) {
2357
            units_ = Impl::readAndInc(p_, p_);
2358
            ahead_ = true;
2359
        }
2360
        return Proxy(units_);
2361
    }
2362
2363
    U_FORCE_INLINE UnsafeUTFIterator &operator++() {  // pre-increment
2364
        if (ahead_) {
2365
            // operator*() called readAndInc() so p_ is already ahead.
2366
            ahead_ = false;
2367
        } else {
2368
            Impl::inc(p_);
2369
        }
2370
        return *this;
2371
    }
2372
2373
    U_FORCE_INLINE Proxy operator++(int) {  // post-increment
2374
        if (ahead_) {
2375
            // operator*() called readAndInc() so p_ is already ahead.
2376
            ahead_ = false;
2377
        } else {
2378
            units_ = Impl::readAndInc(p_, p_);
2379
            // keep this->ahead_ == false
2380
        }
2381
        return Proxy(units_);
2382
    }
2383
2384
private:
2385
    // operator*() etc. are logically const.
2386
    mutable UnitIter p_;
2387
    // Keep state so that we call readAndInc() only once for both operator*() and ++
2388
    // so that we can use a single-pass input iterator for UnitIter.
2389
    mutable UnsafeCodeUnits<CP32, UnitIter> units_ = {0, 0};
2390
    // true: units_ = readAndInc(), p_ = units limit
2391
    //     which means that p_ is ahead of its logical position
2392
    // false: initial state
2393
    mutable bool ahead_ = false;
2394
};
2395
#endif  // U_IN_DOXYGEN
2396
2397
}  // namespace U_HEADER_ONLY_NAMESPACE
2398
2399
#ifndef U_IN_DOXYGEN
2400
// Bespoke specialization of reverse_iterator.
2401
// The default implementation implements reverse operator*() and ++ in a way
2402
// that does most of the same work twice for reading variable-length sequences.
2403
template<typename CP32, typename UnitIter>
2404
class std::reverse_iterator<U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>> {
2405
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2406
    using Impl = U_HEADER_ONLY_NAMESPACE::UnsafeUTFImpl<CP32, UnitIter>;
2407
    using UnsafeCodeUnits_ = U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits<CP32, UnitIter>;
2408
2409
    // Proxy type for operator->() (required by LegacyInputIterator)
2410
    // so that we don't promise always returning UnsafeCodeUnits.
2411
    class Proxy {
2412
    public:
2413
        explicit Proxy(UnsafeCodeUnits_ units) : units_(units) {}
2414
        UnsafeCodeUnits_ &operator*() { return units_; }
2415
        UnsafeCodeUnits_ *operator->() { return &units_; }
2416
    private:
2417
        UnsafeCodeUnits_ units_;
2418
    };
2419
2420
public:
2421
    using value_type = UnsafeCodeUnits_;
2422
    using reference = value_type;
2423
    using pointer = Proxy;
2424
    using difference_type = U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t<UnitIter>;
2425
    using iterator_category = std::bidirectional_iterator_tag;
2426
2427
    U_FORCE_INLINE explicit reverse_iterator(U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> iter) :
2428
            p_(iter.base()), units_(0, 0, p_, p_) {}
2429
    U_FORCE_INLINE reverse_iterator() : p_{}, units_(0, 0, p_, p_) {}
2430
2431
    U_FORCE_INLINE reverse_iterator(reverse_iterator &&src) noexcept = default;
2432
    U_FORCE_INLINE reverse_iterator &operator=(reverse_iterator &&src) noexcept = default;
2433
2434
    U_FORCE_INLINE reverse_iterator(const reverse_iterator &other) = default;
2435
    U_FORCE_INLINE reverse_iterator &operator=(const reverse_iterator &other) = default;
2436
2437
    U_FORCE_INLINE bool operator==(const reverse_iterator &other) const {
2438
        return getLogicalPosition() == other.getLogicalPosition();
2439
    }
2440
    U_FORCE_INLINE bool operator!=(const reverse_iterator &other) const { return !operator==(other); }
2441
2442
    U_FORCE_INLINE U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter> base() const {
2443
        return U_HEADER_ONLY_NAMESPACE::UnsafeUTFIterator<CP32, UnitIter>(
2444
            getLogicalPosition());
2445
    }
2446
2447
    U_FORCE_INLINE UnsafeCodeUnits_ operator*() const {
2448
        if (state_ == 0) {
2449
            units_ = Impl::decAndRead(p_);
2450
            state_ = -1;
2451
        }
2452
        return units_;
2453
    }
2454
2455
    U_FORCE_INLINE Proxy operator->() const {
2456
        if (state_ == 0) {
2457
            units_ = Impl::decAndRead(p_);
2458
            state_ = -1;
2459
        }
2460
        return Proxy(units_);
2461
    }
2462
2463
    U_FORCE_INLINE reverse_iterator &operator++() {  // pre-increment
2464
        if (state_ < 0) {
2465
            // operator*() called decAndRead() so p_ is already behind.
2466
            state_ = 0;
2467
        } else if (state_ == 0) {
2468
            Impl::dec(p_);
2469
        } else /* state_ > 0 */ {
2470
            // operator--() called readAndInc() so we know how far to skip.
2471
            p_ = units_.begin();
2472
            state_ = 0;
2473
        }
2474
        return *this;
2475
    }
2476
2477
    U_FORCE_INLINE reverse_iterator operator++(int) {  // post-increment
2478
        if (state_ < 0) {
2479
            // operator*() called decAndRead() so p_ is already behind.
2480
            reverse_iterator result(*this);
2481
            state_ = 0;
2482
            return result;
2483
        } else if (state_ == 0) {
2484
            units_ = Impl::decAndRead(p_);
2485
            reverse_iterator result(*this);
2486
            result.state_ = -1;
2487
            // keep this->state_ == 0
2488
            return result;
2489
        } else /* state_ > 0 */ {
2490
            reverse_iterator result(*this);
2491
            // operator--() called readAndInc() so we know how far to skip.
2492
            p_ = units_.begin();
2493
            state_ = 0;
2494
            return result;
2495
        }
2496
    }
2497
2498
    U_FORCE_INLINE reverse_iterator &operator--() {  // pre-decrement
2499
        if (state_ < 0) {
2500
            // operator*() called decAndRead() so p_ is behind the logical position.
2501
            p_ = units_.end();
2502
        }
2503
        UnitIter p0 = p_;
2504
        units_ = Impl::readAndInc(p0, p_);
2505
        state_ = 1;
2506
        return *this;
2507
    }
2508
2509
    U_FORCE_INLINE reverse_iterator operator--(int) {  // post-decrement
2510
        reverse_iterator result(*this);
2511
        operator--();
2512
        return result;
2513
    }
2514
2515
private:
2516
    U_FORCE_INLINE UnitIter getLogicalPosition() const {
2517
        return state_ >= 0 ? p_ : units_.end();
2518
    }
2519
2520
    // operator*() etc. are logically const.
2521
    mutable UnitIter p_;
2522
    // Keep state so that we call decAndRead() only once for both operator*() and ++
2523
    // to make it easy for the compiler to optimize.
2524
    mutable UnsafeCodeUnits_ units_;
2525
    // >0: units_ = readAndInc(), p_ = units limit
2526
    //  0: initial state
2527
    // <0: units_ = decAndRead(), p_ = units start
2528
    //     which means that p_ is behind its logical position
2529
    mutable int8_t state_ = 0;
2530
};
2531
#endif  // U_IN_DOXYGEN
2532
2533
namespace U_HEADER_ONLY_NAMESPACE {
2534
2535
/**
2536
 * UnsafeUTFIterator factory function.
2537
 * Deduces the UnitIter template parameter from the input.
2538
 *
2539
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
2540
 * @tparam UnitIter Can usually be omitted/deduced:
2541
 *     An iterator (often a pointer) that returns a code unit type:
2542
 *     UTF-8: char or char8_t or uint8_t;
2543
 *     UTF-16: char16_t or uint16_t or (on Windows) wchar_t;
2544
 *     UTF-32: char32_t or UChar32=int32_t or (on Linux) wchar_t
2545
 * @param iter code unit iterator
2546
 * @return an UnsafeUTFIterator&lt;CP32, UnitIter&gt;
2547
 *     for the given code unit iterator or character pointer
2548
 * @draft ICU 78
2549
 */
2550
template<typename CP32, typename UnitIter>
2551
auto unsafeUTFIterator(UnitIter iter) {
2552
    return UnsafeUTFIterator<CP32, UnitIter>(std::move(iter));
2553
}
2554
2555
/**
2556
 * A C++ "range" for non-validating iteration over all of the code points of a code unit range.
2557
 * The string must be well-formed.
2558
 *
2559
 * Call unsafeUTFStringCodePoints() to have the compiler deduce the Range type.
2560
 *
2561
 * UnsafeUTFStringCodePoints is conditionally borrowed; that is, if Range is a borrowed range
2562
 * so is UnsafeUTFStringCodePoints<CP32, behavior, Range>.
2563
 * Note that when given a range r that is an lvalue and is not a view,  unsafeUTFStringCodePoints(r) uses
2564
 * a ref_view of r as the Range type, which is a borrowed range.
2565
 * In practice, this means that given a container variable r, the iterators of
2566
 * unsafeUTFStringCodePoints(r) can be used as long as iterators on r are valid, without having to keep
2567
 * unsafeUTFStringCodePoints(r) around.
2568
 * For instance:
2569
 * \code
2570
 *     std::u8string s = "𒇧𒇧";
2571
 *     // it outlives unsafeUTFStringCodePoints<char32_t>(s).
2572
 *     auto it = unsafeUTFStringCodePoints<char32_t>(s).begin();
2573
 *     ++it;
2574
 *     char32_t second_code_point = it->codePoint();  // OK.
2575
 * \endcode
2576
 *
2577
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
2578
 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
2579
 * @draft ICU 78
2580
 * @see unsafeUTFStringCodePoints
2581
 */
2582
template<typename CP32, typename Range>
2583
#if defined(__cpp_lib_ranges)
2584
    requires std::ranges::range<Range>
2585
#endif
2586
class UnsafeUTFStringCodePoints {
2587
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
2588
public:
2589
    /**
2590
     * Constructs an empty C++ "range" object.
2591
     * @draft ICU 78
2592
     */
2593
    UnsafeUTFStringCodePoints() = default;
2594
2595
    /**
2596
     * Constructs a C++ "range" object over the code points in the string.
2597
     * @param unitRange input range
2598
     * @draft ICU 78
2599
     */
2600
    template<typename R = Range, typename = std::enable_if_t<!std::is_reference_v<R>>>
2601
    explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(std::move(unitRange)) {}
2602
    /**
2603
     * Constructs a C++ "range" object over the code points in the string,
2604
     * keeping a reference to the code unit range.  This overload is used by
2605
     * utfStringCodePoints in C++17; in C++20, a ref_view is used instead (via
2606
     * views::all).
2607
     * @param unitRange input range
2608
     * @draft ICU 78
2609
     */
2610
    template<typename R = Range, typename = std::enable_if_t<std::is_reference_v<R>>, typename = void>
2611
    explicit UnsafeUTFStringCodePoints(Range unitRange) : unitRange(unitRange) {}
2612
2613
    /** Copy constructor. @draft ICU 78 */
2614
    UnsafeUTFStringCodePoints(const UnsafeUTFStringCodePoints &other) = default;
2615
2616
    /** Copy assignment operator. @draft ICU 78 */
2617
    UnsafeUTFStringCodePoints &operator=(const UnsafeUTFStringCodePoints &other) = default;
2618
2619
    /**
2620
     * @return the range start iterator
2621
     * @draft ICU 78
2622
     */
2623
    auto begin() {
2624
        return unsafeUTFIterator<CP32>(unitRange.begin());
2625
    }
2626
2627
    /**
2628
     * @return the range start iterator
2629
     * @draft ICU 78
2630
     */
2631
    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2632
    auto begin() const {
2633
        return unsafeUTFIterator<CP32>(unitRange.begin());
2634
    }
2635
2636
    /**
2637
     * @return the range limit (exclusive end) iterator
2638
     * @draft ICU 78
2639
     */
2640
    auto end() {
2641
        using UnitIter = decltype(unitRange.begin());
2642
        using LimitIter = decltype(unitRange.end());
2643
        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2644
            // Return the code unit sentinel.
2645
            return unitRange.end();
2646
        } else {
2647
            return unsafeUTFIterator<CP32>(unitRange.end());
2648
        }
2649
    }
2650
2651
    /**
2652
     * @return the range limit (exclusive end) iterator
2653
     * @draft ICU 78
2654
     */
2655
    template<typename R = Range, typename = std::enable_if_t<prv::range<const R>>>
2656
    auto end() const {
2657
        using UnitIter = decltype(unitRange.begin());
2658
        using LimitIter = decltype(unitRange.end());
2659
        if constexpr (!std::is_same_v<UnitIter, LimitIter>) {
2660
            // Return the code unit sentinel.
2661
            return unitRange.end();
2662
        } else {
2663
            return unsafeUTFIterator<CP32>(unitRange.end());
2664
        }
2665
    }
2666
2667
    /**
2668
     * @return std::reverse_iterator(end())
2669
     * @draft ICU 78
2670
     */
2671
    auto rbegin() const {
2672
        return std::make_reverse_iterator(end());
2673
    }
2674
2675
    /**
2676
     * @return std::reverse_iterator(begin())
2677
     * @draft ICU 78
2678
     */
2679
    auto rend() const {
2680
        return std::make_reverse_iterator(begin());
2681
    }
2682
2683
    /**
2684
     * Returns the CodeUnits for the first character/code point.
2685
     * Requires that the range is not empty.
2686
     *
2687
     * @return the CodeUnits for the first character/code point.
2688
     * @draft ICU 79
2689
     */
2690
    auto front() const {
2691
        return *begin();
2692
    }
2693
2694
    /**
2695
     * Returns the CodeUnits for the last character/code point.
2696
     * Requires that the range is not empty.
2697
     *
2698
     * @return the CodeUnits for the last character/code point.
2699
     * @draft ICU 79
2700
     */
2701
    auto back() const {
2702
        return *(--end());
2703
    }
2704
2705
private:
2706
    Range unitRange;
2707
};
2708
2709
/** @internal */
2710
template<typename CP32>
2711
struct UnsafeUTFStringCodePointsAdaptor
2712
#if U_CPLUSPLUS_VERSION >= 23 && __cpp_lib_ranges >= 2022'02 &&                                         \
2713
    __cpp_lib_bind_back >= 2022'02 // http://wg21.link/P2387R3.
2714
    : std::ranges::range_adaptor_closure<UnsafeUTFStringCodePointsAdaptor<CP32>>
2715
#endif
2716
{
2717
    /** @internal */
2718
    template<typename Range>
2719
    auto operator()(Range &&unitRange) const {
2720
#if defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 2021'10  // We need https://wg21.link/P2415R2.
2721
        return UnsafeUTFStringCodePoints<CP32, std::ranges::views::all_t<Range>>(std::forward<Range>(unitRange));
2722
#else
2723
        if constexpr (prv::is_basic_string_view_v<std::decay_t<Range>>) {
2724
            // Take basic_string_view by copy, not by reference.  In C++20 this is handled by
2725
            // all_t<Range>, which is Range if Range is a view.
2726
            return UnsafeUTFStringCodePoints<CP32, std::decay_t<Range>>(std::forward<Range>(unitRange));
2727
        } else {
2728
            return UnsafeUTFStringCodePoints<CP32, Range>(std::forward<Range>(unitRange));
2729
        }
2730
#endif
2731
    }
2732
};
2733
2734
2735
/**
2736
 * Range adaptor function object returning an UnsafeUTFStringCodePoints object that represents a
2737
 * "range" of code points in a code unit range. The string must be well-formed.
2738
 * Deduces the Range template parameter from the input, taking into account the value category: the
2739
 * code units will be referenced if possible, and moved if necessary.
2740
 *
2741
 * @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
2742
 * @tparam Range A C++ "range" of Unicode UTF-8/16/32 code units
2743
 * @param unitRange input range
2744
 * @return an UnsafeUTFStringCodePoints&lt;CP32, Range&gt; for the given unitRange
2745
 * @draft ICU 78
2746
 */
2747
template<typename CP32>
2748
constexpr UnsafeUTFStringCodePointsAdaptor<CP32> unsafeUTFStringCodePoints;
2749
2750
}  // namespace U_HEADER_ONLY_NAMESPACE
2751
2752
2753
#if defined(__cpp_lib_ranges)
2754
template <typename CP32, UTFIllFormedBehavior behavior, typename Range>
2755
constexpr bool std::ranges::enable_borrowed_range<
2756
    U_HEADER_ONLY_NAMESPACE::UTFStringCodePoints<CP32, behavior, Range>> =
2757
    std::ranges::enable_borrowed_range<Range>;
2758
2759
template <typename CP32, typename Range>
2760
constexpr bool std::ranges::enable_borrowed_range<
2761
    U_HEADER_ONLY_NAMESPACE::UnsafeUTFStringCodePoints<CP32, Range>> =
2762
    std::ranges::enable_borrowed_range<Range>;
2763
#endif
2764
2765
#endif  // U_HIDE_DRAFT_API
2766
#endif  // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
2767
#endif  // __UTFITERATOR_H__