/src/icu/source/i18n/collationruleparser.h
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ******************************************************************************* |
5 | | * Copyright (C) 2013-2014, International Business Machines |
6 | | * Corporation and others. All Rights Reserved. |
7 | | ******************************************************************************* |
8 | | * collationruleparser.h |
9 | | * |
10 | | * created on: 2013apr10 |
11 | | * created by: Markus W. Scherer |
12 | | */ |
13 | | |
14 | | #ifndef __COLLATIONRULEPARSER_H__ |
15 | | #define __COLLATIONRULEPARSER_H__ |
16 | | |
17 | | #include "unicode/utypes.h" |
18 | | |
19 | | #if !UCONFIG_NO_COLLATION |
20 | | |
21 | | #include "unicode/ucol.h" |
22 | | #include "unicode/uniset.h" |
23 | | #include "unicode/unistr.h" |
24 | | |
25 | | struct UParseError; |
26 | | |
27 | | U_NAMESPACE_BEGIN |
28 | | |
29 | | struct CollationData; |
30 | | struct CollationTailoring; |
31 | | |
32 | | class Locale; |
33 | | class Normalizer2; |
34 | | |
35 | | struct CollationSettings; |
36 | | /** Parses collation rule strings. Parsed items go to a caller-supplied Sink; an optional Importer serves [import locale] syntax. */ |
37 | | class U_I18N_API CollationRuleParser : public UMemory { |
38 | | public: |
39 | | /** Special reset positions. The enumerator value is added to POS_BASE to form the second character of the encoding contraction. */ |
40 | | enum Position { |
41 | | FIRST_TERTIARY_IGNORABLE, |
42 | | LAST_TERTIARY_IGNORABLE, |
43 | | FIRST_SECONDARY_IGNORABLE, |
44 | | LAST_SECONDARY_IGNORABLE, |
45 | | FIRST_PRIMARY_IGNORABLE, |
46 | | LAST_PRIMARY_IGNORABLE, |
47 | | FIRST_VARIABLE, |
48 | | LAST_VARIABLE, |
49 | | FIRST_REGULAR, |
50 | | LAST_REGULAR, |
51 | | FIRST_IMPLICIT, |
52 | | LAST_IMPLICIT, |
53 | | FIRST_TRAILING, |
54 | | LAST_TRAILING |
55 | | }; |
56 | | |
57 | | /** |
58 | | * First character of contractions that encode special reset positions. |
59 | | * U+FFFE cannot be tailored via rule syntax. |
60 | | * |
61 | | * The second contraction character is POS_BASE + Position. |
62 | | */ |
63 | | static const UChar POS_LEAD = 0xfffe; |
64 | | /** |
65 | | * Base for the second character of contractions that encode special reset positions. |
66 | | * Braille characters U+28xx are printable and normalization-inert. |
67 | | * @see POS_LEAD |
68 | | */ |
69 | | static const UChar POS_BASE = 0x2800; |
70 | | /** Receiver interface for parsed rules. addReset() and addRelation() are pure virtual and must be implemented by subclasses. */ |
71 | | class U_I18N_API Sink : public UObject { |
72 | | public: |
73 | | virtual ~Sink(); |
74 | | /** |
75 | | * Adds a reset. |
76 | | * strength=UCOL_IDENTICAL for &str. |
77 | | * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3. |
78 | | */ |
79 | | virtual void addReset(int32_t strength, const UnicodeString &str, |
80 | | const char *&errorReason, UErrorCode &errorCode) = 0; |
81 | | /** |
82 | | * Adds a relation with strength and prefix | str / extension. |
83 | | */ |
84 | | virtual void addRelation(int32_t strength, const UnicodeString &prefix, |
85 | | const UnicodeString &str, const UnicodeString &extension, |
86 | | const char *&errorReason, UErrorCode &errorCode) = 0; |
87 | | /** Receives a set for contraction suppression. Not pure virtual, so overriding is optional. NOTE(review): exact trigger syntax and default behavior live in the .cpp -- confirm there. */ |
88 | | virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason, |
89 | | UErrorCode &errorCode); |
90 | | /** Receives a set to optimize for. Not pure virtual, so overriding is optional. NOTE(review): default behavior is defined in the .cpp -- confirm there. */ |
91 | | virtual void optimize(const UnicodeSet &set, const char *&errorReason, |
92 | | UErrorCode &errorCode); |
93 | | }; |
94 | | /** Interface for fetching the rules of another collation, identified by locale ID and collation type; the result is written to the non-const rules reference. */ |
95 | | class U_I18N_API Importer : public UObject { |
96 | | public: |
97 | | virtual ~Importer(); |
98 | | virtual void getRules( |
99 | | const char *localeID, const char *collationType, |
100 | | UnicodeString &rules, |
101 | | const char *&errorReason, UErrorCode &errorCode) = 0; |
102 | | }; |
103 | | |
104 | | /** |
105 | | * Constructor. |
106 | | * The Sink must be set before parsing. |
107 | | * The Importer can be set, otherwise [import locale] syntax is not supported. |
108 | | */ |
109 | | CollationRuleParser(const CollationData *base, UErrorCode &errorCode); |
110 | | ~CollationRuleParser(); |
111 | | |
112 | | /** |
113 | | * Sets the pointer to a Sink object. |
114 | | * The pointer is aliased: Pointer copy without cloning or taking ownership. |
115 | | */ |
116 | 0 | void setSink(Sink *sinkAlias) { |
117 | 0 | sink = sinkAlias; |
118 | 0 | } |
119 | | |
120 | | /** |
121 | | * Sets the pointer to an Importer object. |
122 | | * The pointer is aliased: Pointer copy without cloning or taking ownership. |
123 | | */ |
124 | 0 | void setImporter(Importer *importerAlias) { |
125 | 0 | importer = importerAlias; |
126 | 0 | } |
127 | | /** Parses ruleString. outSettings and outParseError are named as outputs; failure is signaled through errorCode. NOTE(review): whether outParseError may be NULL is not visible here -- confirm. */ |
128 | | void parse(const UnicodeString &ruleString, |
129 | | CollationSettings &outSettings, |
130 | | UParseError *outParseError, |
131 | | UErrorCode &errorCode); |
132 | | /** Returns the stored errorReason pointer as-is; the string is not copied. */ |
133 | 0 | const char *getErrorReason() const { return errorReason; } |
134 | | |
135 | | /** |
136 | | * Gets a script or reorder code from its string representation. |
137 | | * @return the script/reorder code, or |
138 | | * -1 if not recognized |
139 | | */ |
140 | | static int32_t getReorderCode(const char *word); |
141 | | |
142 | | private: |
143 | | /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15. NOTE(review): the three constants below look like one packed int32_t layout (strength in the low 4 bits, a flag at bit 4, an offset from bit 8) -- confirm against parseRelationOperator(). */ |
144 | | static const int32_t STRENGTH_MASK = 0xf; |
145 | | static const int32_t STARRED_FLAG = 0x10; |
146 | | static const int32_t OFFSET_SHIFT = 8; |
147 | | /** Private parsing helpers. NOTE(review): the int32_t i parameters and return values appear to be indexes into the rule string (compare "@return rule index" on parseSpecialPosition()) -- confirm. */ |
148 | | void parse(const UnicodeString &ruleString, UErrorCode &errorCode); |
149 | | void parseRuleChain(UErrorCode &errorCode); |
150 | | int32_t parseResetAndPosition(UErrorCode &errorCode); |
151 | | int32_t parseRelationOperator(UErrorCode &errorCode); |
152 | | void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode); |
153 | | void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode); |
154 | | int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); |
155 | | int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode); |
156 | | |
157 | | /** |
158 | | * Sets str to a contraction of U+FFFE and (U+2800 + Position). |
159 | | * @return rule index after the special reset position |
160 | | */ |
161 | | int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode); |
162 | | void parseSetting(UErrorCode &errorCode); |
163 | | void parseReordering(const UnicodeString &raw, UErrorCode &errorCode); |
164 | | static UColAttributeValue getOnOffValue(const UnicodeString &s); |
165 | | |
166 | | int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode); |
167 | | int32_t readWords(int32_t i, UnicodeString &raw) const; |
168 | | int32_t skipComment(int32_t i) const; |
169 | | |
170 | | void setParseError(const char *reason, UErrorCode &errorCode); |
171 | | void setErrorContext(); |
172 | | |
173 | | /** |
174 | | * ASCII [:P:] and [:S:]: |
175 | | * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E] |
176 | | */ |
177 | | static UBool isSyntaxChar(UChar32 c); |
178 | | int32_t skipWhiteSpace(int32_t i) const; |
179 | | /** Reference members: both must be bound in the constructor's initializer list and cannot be reseated afterwards. */ |
180 | | const Normalizer2 &nfd, &nfc; |
181 | | |
182 | | const UnicodeString *rules; |
183 | | const CollationData *const baseData; |
184 | | CollationSettings *settings; |
185 | | UParseError *parseError; |
186 | | const char *errorReason; |
187 | | /** Aliased, not owned: assigned by setSink() and setImporter() without cloning or taking ownership. */ |
188 | | Sink *sink; |
189 | | Importer *importer; |
190 | | /** NOTE(review): presumably the current parse position within *rules -- confirm in the .cpp. */ |
191 | | int32_t ruleIndex; |
192 | | }; |
193 | | |
194 | | U_NAMESPACE_END |
195 | | |
196 | | #endif // !UCONFIG_NO_COLLATION |
197 | | #endif // __COLLATIONRULEPARSER_H__ |