Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/i18n/collationruleparser.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
*******************************************************************************
5
* Copyright (C) 2013-2014, International Business Machines
6
* Corporation and others.  All Rights Reserved.
7
*******************************************************************************
8
* collationruleparser.h
9
*
10
* created on: 2013apr10
11
* created by: Markus W. Scherer
12
*/
13
14
#ifndef __COLLATIONRULEPARSER_H__
15
#define __COLLATIONRULEPARSER_H__
16
17
#include "unicode/utypes.h"
18
19
#if !UCONFIG_NO_COLLATION
20
21
#include "unicode/ucol.h"
22
#include "unicode/uniset.h"
23
#include "unicode/unistr.h"
24
25
struct UParseError;
26
27
U_NAMESPACE_BEGIN
28
29
struct CollationData;
30
struct CollationTailoring;
31
32
class Locale;
33
class Normalizer2;
34
35
struct CollationSettings;
36
37
/**
 * Parser interface for collation tailoring rule strings.
 *
 * Usage, as far as this header shows: construct with the base CollationData,
 * attach a Sink via setSink() (required before parsing) and optionally an
 * Importer via setImporter(), then call the public parse().
 * The Sink interface is what receives resets and relations.
 * Apart from the small inline accessors, only declarations are visible here;
 * the member function bodies are defined elsewhere.
 */
class U_I18N_API CollationRuleParser : public UMemory {
38
public:
39
    /**
     * Special reset positions.
     * The enumerators have implicit values starting at 0 in declaration order;
     * a Position is added to POS_BASE to form the second character of the
     * contraction that encodes it (see POS_LEAD and POS_BASE).
     */
40
    enum Position {
41
        FIRST_TERTIARY_IGNORABLE,
42
        LAST_TERTIARY_IGNORABLE,
43
        FIRST_SECONDARY_IGNORABLE,
44
        LAST_SECONDARY_IGNORABLE,
45
        FIRST_PRIMARY_IGNORABLE,
46
        LAST_PRIMARY_IGNORABLE,
47
        FIRST_VARIABLE,
48
        LAST_VARIABLE,
49
        FIRST_REGULAR,
50
        LAST_REGULAR,
51
        FIRST_IMPLICIT,
52
        LAST_IMPLICIT,
53
        FIRST_TRAILING,
54
        LAST_TRAILING
55
    };
56
57
    /**
58
     * First character of contractions that encode special reset positions.
59
     * U+FFFE cannot be tailored via rule syntax.
60
     *
61
     * The second contraction character is POS_BASE + Position.
62
     */
63
    static const UChar POS_LEAD = 0xfffe;
64
    /**
65
     * Base for the second character of contractions that encode special reset positions.
66
     * Braille characters U+28xx are printable and normalization-inert.
67
     * @see POS_LEAD
68
     */
69
    static const UChar POS_BASE = 0x2800;
70
71
    /**
     * Callback interface for the items found in a rule string.
     * addReset() and addRelation() are pure virtual and must be implemented;
     * suppressContractions() and optimize() are not pure virtual, so a subclass
     * only needs to override them if it wants different behavior.
     * Every callback takes errorReason and errorCode by reference.
     * NOTE(review): presumably these let the Sink report a failure back to the
     * parser -- confirm in the implementation.
     */
    class U_I18N_API Sink : public UObject {
72
    public:
73
        virtual ~Sink();
74
        /**
75
         * Adds a reset.
76
         * strength=UCOL_IDENTICAL for &str.
77
         * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
78
         */
79
        virtual void addReset(int32_t strength, const UnicodeString &str,
80
                              const char *&errorReason, UErrorCode &errorCode) = 0;
81
        /**
82
         * Adds a relation with strength and prefix | str / extension.
83
         */
84
        virtual void addRelation(int32_t strength, const UnicodeString &prefix,
85
                                 const UnicodeString &str, const UnicodeString &extension,
86
                                 const char *&errorReason, UErrorCode &errorCode) = 0;
87
88
        /**
         * Receives a set of characters for contraction suppression.
         * Not pure virtual; the default implementation is outside this header.
         * NOTE(review): presumably invoked for a suppressContractions rule
         * setting, and the default presumably does nothing -- confirm.
         */
        virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
89
                                          UErrorCode &errorCode);
90
91
        /**
         * Receives a set of characters to optimize for.
         * Not pure virtual; the default implementation is outside this header.
         * NOTE(review): presumably invoked for an optimize rule setting, and
         * the default presumably does nothing -- confirm.
         */
        virtual void optimize(const UnicodeSet &set, const char *&errorReason,
92
                              UErrorCode &errorCode);
93
    };
94
95
    /**
     * Callback interface that supplies the rules of another collation.
     * Without an Importer, [import locale] syntax is not supported
     * (see the constructor comment below).
     */
    class U_I18N_API Importer : public UObject {
96
    public:
97
        virtual ~Importer();
        /**
         * Fetches the rules for the given locale ID and collation type.
         * NOTE(review): `rules` is a non-const reference and so presumably the
         * output; failures are presumably reported via errorReason/errorCode
         * -- confirm against an implementation.
         */
98
        virtual void getRules(
99
                const char *localeID, const char *collationType,
100
                UnicodeString &rules,
101
                const char *&errorReason, UErrorCode &errorCode) = 0;
102
    };
103
104
    /**
105
     * Constructor.
106
     * The Sink must be set before parsing.
107
     * The Importer can be set, otherwise [import locale] syntax is not supported.
108
     */
109
    CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
110
    ~CollationRuleParser();
111
112
    /**
113
     * Sets the pointer to a Sink object.
114
     * The pointer is aliased: Pointer copy without cloning or taking ownership.
115
     */
116
0
    void setSink(Sink *sinkAlias) {
117
0
        sink = sinkAlias;
118
0
    }
119
120
    /**
121
     * Sets the pointer to an Importer object.
122
     * The pointer is aliased: Pointer copy without cloning or taking ownership.
123
     */
124
0
    void setImporter(Importer *importerAlias) {
125
0
        importer = importerAlias;
126
0
    }
127
128
    /**
     * Parses a rule string. A Sink must have been set first (see constructor).
     * NOTE(review): outSettings and outParseError look like outputs (settings
     * modified by the rules; location details of a syntax error). Whether
     * outParseError may be null is not visible here -- confirm.
     */
    void parse(const UnicodeString &ruleString,
129
               CollationSettings &outSettings,
130
               UParseError *outParseError,
131
               UErrorCode &errorCode);
132
133
0
    /** Returns the stored errorReason pointer as is (no copy is made). */
    const char *getErrorReason() const { return errorReason; }
134
135
    /**
136
     * Gets a script or reorder code from its string representation.
137
     * @return the script/reorder code, or
138
     * -1 if not recognized
139
     */
140
    static int32_t getReorderCode(const char *word);
141
142
private:
143
    /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
144
    static const int32_t STRENGTH_MASK = 0xf;
145
    /**
     * Bit flag just above the strength bits.
     * NOTE(review): presumably set in the int32_t result of
     * parseRelationOperator() for starred operators -- confirm.
     */
    static const int32_t STARRED_FLAG = 0x10;
146
    /**
     * NOTE(review): presumably the shift for an index offset packed into the
     * same int32_t as the strength and STARRED_FLAG -- confirm.
     */
    static const int32_t OFFSET_SHIFT = 8;
147
148
    /**
     * Private overload that takes only the rule string and the error code.
     * NOTE(review): presumably the worker behind the public parse() -- confirm.
     */
    void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
149
    void parseRuleChain(UErrorCode &errorCode);
150
    int32_t parseResetAndPosition(UErrorCode &errorCode);
151
    int32_t parseRelationOperator(UErrorCode &errorCode);
152
    void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
153
    void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
154
    int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
155
    int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
156
157
    /**
158
     * Sets str to a contraction of U+FFFE and (U+2800 + Position).
159
     * @return rule index after the special reset position
160
     */
161
    int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
162
    void parseSetting(UErrorCode &errorCode);
163
    void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
164
    static UColAttributeValue getOnOffValue(const UnicodeString &s);
165
166
    int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
167
    int32_t readWords(int32_t i, UnicodeString &raw) const;
168
    int32_t skipComment(int32_t i) const;
169
170
    void setParseError(const char *reason, UErrorCode &errorCode);
171
    void setErrorContext();
172
173
    /**
174
     * ASCII [:P:] and [:S:]:
175
     * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
176
     */
177
    static UBool isSyntaxChar(UChar32 c);
178
    int32_t skipWhiteSpace(int32_t i) const;
179
180
    /** Reference members: they must be bound when the parser is constructed. */
    const Normalizer2 &nfd, &nfc;
181
182
    /** NOTE(review): presumably the rule string currently being parsed -- confirm. */
    const UnicodeString *rules;
183
    /** Const pointer to const data: cannot be reseated after construction. */
    const CollationData *const baseData;
184
    CollationSettings *settings;
185
    UParseError *parseError;
186
    /** Returned as is by getErrorReason(). */
    const char *errorReason;
187
188
    /** Aliased, not owned; assigned by setSink(). */
    Sink *sink;
189
    /** Aliased, not owned; assigned by setImporter(). */
    Importer *importer;
190
191
    /** NOTE(review): presumably the current index into the rule string -- confirm. */
    int32_t ruleIndex;
192
};
193
194
U_NAMESPACE_END
195
196
#endif  // !UCONFIG_NO_COLLATION
197
#endif  // __COLLATIONRULEPARSER_H__