Coverage Report

Created: 2025-06-24 06:54

/src/icu/icu4c/source/i18n/utf8collationiterator.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2012-2016, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* utf8collationiterator.h
9
*
10
* created on: 2012nov12 (from utf16collationiterator.h & uitercollationiterator.h)
11
* created by: Markus W. Scherer
12
*/
13
14
#ifndef __UTF8COLLATIONITERATOR_H__
15
#define __UTF8COLLATIONITERATOR_H__
16
17
#include "unicode/utypes.h"
18
19
#if !UCONFIG_NO_COLLATION
20
21
#include "cmemory.h"
22
#include "collation.h"
23
#include "collationdata.h"
24
#include "collationiterator.h"
25
#include "normalizer2impl.h"
26
27
U_NAMESPACE_BEGIN
28
29
/**
30
 * UTF-8 collation element and character iterator.
31
 * Handles normalized UTF-8 text inline, with length or NUL-terminated.
32
 * Unnormalized text is handled by a subclass.
33
 */
34
class U_I18N_API UTF8CollationIterator : public CollationIterator {
35
public:
36
    UTF8CollationIterator(const CollationData *d, UBool numeric,
37
                          const uint8_t *s, int32_t p, int32_t len)
38
0
            : CollationIterator(d, numeric),
39
0
              u8(s), pos(p), length(len) {}
40
41
    virtual ~UTF8CollationIterator();
42
43
    virtual void resetToOffset(int32_t newOffset) override;
44
45
    virtual int32_t getOffset() const override;
46
47
    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
48
49
    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
50
51
protected:
52
    /**
53
     * For byte sequences that are illegal in UTF-8, an error value may be returned
54
     * together with a bogus code point. The caller will ignore that code point.
55
     *
56
     * Special values may be returned for surrogate code points, which are also illegal in UTF-8,
57
     * but the caller will treat them like U+FFFD because forbidSurrogateCodePoints() returns true.
58
     *
59
     * Valid lead surrogates are returned from inside a normalized text segment,
60
     * where handleGetTrailSurrogate() will return the matching trail surrogate.
61
     */
62
    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
63
64
    virtual UBool foundNULTerminator() override;
65
66
    virtual UBool forbidSurrogateCodePoints() const override;
67
68
    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
69
70
    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
71
72
    const uint8_t *u8;
73
    int32_t pos;
74
    int32_t length;  // <0 for NUL-terminated strings
75
};
76
77
/**
78
 * Incrementally checks the input text for FCD and normalizes where necessary.
79
 */
80
class U_I18N_API FCDUTF8CollationIterator : public UTF8CollationIterator {
81
public:
82
    FCDUTF8CollationIterator(const CollationData *data, UBool numeric,
83
                             const uint8_t *s, int32_t p, int32_t len)
84
0
            : UTF8CollationIterator(data, numeric, s, p, len),
85
0
              state(CHECK_FWD), start(p),
86
0
              nfcImpl(data->nfcImpl) {}
87
88
    virtual ~FCDUTF8CollationIterator();
89
90
    virtual void resetToOffset(int32_t newOffset) override;
91
92
    virtual int32_t getOffset() const override;
93
94
    virtual UChar32 nextCodePoint(UErrorCode &errorCode) override;
95
96
    virtual UChar32 previousCodePoint(UErrorCode &errorCode) override;
97
98
protected:
99
    virtual uint32_t handleNextCE32(UChar32 &c, UErrorCode &errorCode) override;
100
101
    virtual char16_t handleGetTrailSurrogate() override;
102
103
    virtual UBool foundNULTerminator() override;
104
105
    virtual void forwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
106
107
    virtual void backwardNumCodePoints(int32_t num, UErrorCode &errorCode) override;
108
109
private:
110
    UBool nextHasLccc() const;
111
    UBool previousHasTccc() const;
112
113
    /**
114
     * Switches to forward checking if possible.
115
     */
116
    void switchToForward();
117
118
    /**
119
     * Extends the FCD text segment forward or normalizes around pos.
120
     * @return true if success
121
     */
122
    UBool nextSegment(UErrorCode &errorCode);
123
124
    /**
125
     * Switches to backward checking.
126
     */
127
    void switchToBackward();
128
129
    /**
130
     * Extends the FCD text segment backward or normalizes around pos.
131
     * @return true if success
132
     */
133
    UBool previousSegment(UErrorCode &errorCode);
134
135
    UBool normalize(const UnicodeString &s, UErrorCode &errorCode);
136
137
    enum State {
138
        /**
139
         * The input text [start..pos[ passes the FCD check.
140
         * Moving forward checks incrementally.
141
         * limit is undefined.
142
         */
143
        CHECK_FWD,
144
        /**
145
         * The input text [pos..limit[ passes the FCD check.
146
         * Moving backward checks incrementally.
147
         * start is undefined.
148
         */
149
        CHECK_BWD,
150
        /**
151
         * The input text [start..limit[ passes the FCD check.
152
         * pos tracks the current text index.
153
         */
154
        IN_FCD_SEGMENT,
155
        /**
156
         * The input text [start..limit[ failed the FCD check and was normalized.
157
         * pos tracks the current index in the normalized string.
158
         */
159
        IN_NORMALIZED
160
    };
161
162
    State state;
163
164
    int32_t start;
165
    int32_t limit;
166
167
    const Normalizer2Impl &nfcImpl;
168
    UnicodeString normalized;
169
};
170
171
U_NAMESPACE_END
172
173
#endif  // !UCONFIG_NO_COLLATION
174
#endif  // __UTF8COLLATIONITERATOR_H__