/src/poco/Foundation/include/Poco/Unicode.h
Line | Count | Source |
1 | | // |
2 | | // Unicode.h |
3 | | // |
4 | | // Library: Foundation |
5 | | // Package: Text |
6 | | // Module: Unicode |
7 | | // |
8 | | // Definition of the Unicode class. |
9 | | // |
10 | | // Copyright (c) 2007, Applied Informatics Software Engineering GmbH. |
11 | | // and Contributors. |
12 | | // |
13 | | // SPDX-License-Identifier: BSL-1.0 |
14 | | // |
15 | | |
16 | | |
17 | | #ifndef Foundation_Unicode_INCLUDED |
18 | | #define Foundation_Unicode_INCLUDED |
19 | | |
20 | | |
21 | | #include "Poco/Foundation.h" |
22 | | |
23 | | |
24 | | namespace Poco { |
25 | | |
26 | | |
27 | | class Foundation_API Unicode |
28 | | /// This class contains enumerations and static |
29 | | /// utility functions for dealing with Unicode characters |
30 | | /// and their properties. |
31 | | /// |
32 | | /// For more information on Unicode, see <http://www.unicode.org>. |
33 | | /// |
34 | | /// The implementation is based on the Unicode support |
35 | | /// functions in PCRE. |
36 | | { |
37 | | public: |
38 | | // Implementation note: the following definitions must be kept |
39 | | // in sync with those from pcre2_ucp.h (PCRE). |
40 | | enum CharacterCategory |
41 | | /// Unicode character categories. |
42 | | { |
43 | | UCP_OTHER, |
44 | | UCP_LETTER, |
45 | | UCP_MARK, |
46 | | UCP_NUMBER, |
47 | | UCP_PUNCTUATION, |
48 | | UCP_SYMBOL, |
49 | | UCP_SEPARATOR |
50 | | }; |
51 | | |
52 | | enum CharacterType |
53 | | /// Unicode character types. |
54 | | { |
55 | | UCP_CONTROL, |
56 | | UCP_FORMAT, |
57 | | UCP_UNASSIGNED, |
58 | | UCP_PRIVATE_USE, |
59 | | UCP_SURROGATE, |
60 | | UCP_LOWER_CASE_LETTER, |
61 | | UCP_MODIFIER_LETTER, |
62 | | UCP_OTHER_LETTER, |
63 | | UCP_TITLE_CASE_LETTER, |
64 | | UCP_UPPER_CASE_LETTER, |
65 | | UCP_SPACING_MARK, |
66 | | UCP_ENCLOSING_MARK, |
67 | | UCP_NON_SPACING_MARK, |
68 | | UCP_DECIMAL_NUMBER, |
69 | | UCP_LETTER_NUMBER, |
70 | | UCP_OTHER_NUMBER, |
71 | | UCP_CONNECTOR_PUNCTUATION, |
72 | | UCP_DASH_PUNCTUATION, |
73 | | UCP_CLOSE_PUNCTUATION, |
74 | | UCP_FINAL_PUNCTUATION, |
75 | | UCP_INITIAL_PUNCTUATION, |
76 | | UCP_OTHER_PUNCTUATION, |
77 | | UCP_OPEN_PUNCTUATION, |
78 | | UCP_CURRENCY_SYMBOL, |
79 | | UCP_MODIFIER_SYMBOL, |
80 | | UCP_MATHEMATICAL_SYMBOL, |
81 | | UCP_OTHER_SYMBOL, |
82 | | UCP_LINE_SEPARATOR, |
83 | | UCP_PARAGRAPH_SEPARATOR, |
84 | | UCP_SPACE_SEPARATOR |
85 | | }; |
86 | | |
87 | | enum Script |
88 | | /// Unicode 7.0 script identifiers. |
89 | | { |
90 | | UCP_ARABIC, |
91 | | UCP_ARMENIAN, |
92 | | UCP_BENGALI, |
93 | | UCP_BOPOMOFO, |
94 | | UCP_BRAILLE, |
95 | | UCP_BUGINESE, |
96 | | UCP_BUHID, |
97 | | UCP_CANADIAN_ABORIGINAL, |
98 | | UCP_CHEROKEE, |
99 | | UCP_COMMON, |
100 | | UCP_COPTIC, |
101 | | UCP_CYPRIOT, |
102 | | UCP_CYRILLIC, |
103 | | UCP_DESERET, |
104 | | UCP_DEVANAGARI, |
105 | | UCP_ETHIOPIC, |
106 | | UCP_GEORGIAN, |
107 | | UCP_GLAGOLITIC, |
108 | | UCP_GOTHIC, |
109 | | UCP_GREEK, |
110 | | UCP_GUJARATI, |
111 | | UCP_GURMUKHI, |
112 | | UCP_HAN, |
113 | | UCP_HANGUL, |
114 | | UCP_HANUNOO, |
115 | | UCP_HEBREW, |
116 | | UCP_HIRAGANA, |
117 | | UCP_INHERITED, |
118 | | UCP_KANNADA, |
119 | | UCP_KATAKANA, |
120 | | UCP_KHAROSHTHI, |
121 | | UCP_KHMER, |
122 | | UCP_LAO, |
123 | | UCP_LATIN, |
124 | | UCP_LIMBU, |
125 | | UCP_LINEAR_B, |
126 | | UCP_MALAYALAM, |
127 | | UCP_MONGOLIAN, |
128 | | UCP_MYANMAR, |
129 | | UCP_NEW_TAI_LUE, |
130 | | UCP_OGHAM, |
131 | | UCP_OLD_ITALIC, |
132 | | UCP_OLD_PERSIAN, |
133 | | UCP_ORIYA, |
134 | | UCP_OSMANYA, |
135 | | UCP_RUNIC, |
136 | | UCP_SHAVIAN, |
137 | | UCP_SINHALA, |
138 | | UCP_SYLOTI_NAGRI, |
139 | | UCP_SYRIAC, |
140 | | UCP_TAGALOG, |
141 | | UCP_TAGBANWA, |
142 | | UCP_TAI_LE, |
143 | | UCP_TAMIL, |
144 | | UCP_TELUGU, |
145 | | UCP_THAANA, |
146 | | UCP_THAI, |
147 | | UCP_TIBETAN, |
148 | | UCP_TIFINAGH, |
149 | | UCP_UGARITIC, |
150 | | UCP_YI, |
151 | | // Unicode 5.0 |
152 | | UCP_BALINESE, |
153 | | UCP_CUNEIFORM, |
154 | | UCP_NKO, |
155 | | UCP_PHAGS_PA, |
156 | | UCP_PHOENICIAN, |
157 | | // Unicode 5.1 |
158 | | UCP_CARIAN, |
159 | | UCP_CHAM, |
160 | | UCP_KAYAH_LI, |
161 | | UCP_LEPCHA, |
162 | | UCP_LYCIAN, |
163 | | UCP_LYDIAN, |
164 | | UCP_OL_CHIKI, |
165 | | UCP_REJANG, |
166 | | UCP_SAURASHTRA, |
167 | | UCP_SUNDANESE, |
168 | | UCP_VAI, |
169 | | // Unicode 5.2 |
170 | | UCP_AVESTAN, |
171 | | UCP_BAMUM, |
172 | | UCP_EGYPTIAN_HIEROGLYPHS, |
173 | | UCP_IMPERIAL_ARAMAIC, |
174 | | UCP_INSCRIPTIONAL_PAHLAVI, |
175 | | UCP_INSCRIPTIONAL_PARTHIAN, |
176 | | UCP_JAVANESE, |
177 | | UCP_KAITHI, |
178 | | UCP_LISU, |
179 | | UCP_MEETEI_MAYEK, |
180 | | UCP_OLD_SOUTH_ARABIAN, |
181 | | UCP_OLD_TURKIC, |
182 | | UCP_SAMARITAN, |
183 | | UCP_TAI_THAM, |
184 | | UCP_TAI_VIET, |
185 | | // Unicode 6.0 |
186 | | UCP_BATAK, |
187 | | UCP_BRAHMI, |
188 | | UCP_MANDAIC, |
189 | | // Unicode 6.1 |
190 | | UCP_CHAKMA, |
191 | | UCP_MEROITIC_CURSIVE, |
192 | | UCP_MEROITIC_HIEROGLYPHS, |
193 | | UCP_MIAO, |
194 | | UCP_SHARADA, |
195 | | UCP_SORA_SOMPENG, |
196 | | UCP_TAKRI, |
197 | | // Unicode 7.0 |
198 | | UCP_BASSA_VAH, |
199 | | UCP_CAUCASIAN_ALBANIAN, |
200 | | UCP_DUPLOYAN, |
201 | | UCP_ELBASAN, |
202 | | UCP_GRANTHA, |
203 | | UCP_KHOJKI, |
204 | | UCP_KHUDAWADI, |
205 | | UCP_LINEAR_A, |
206 | | UCP_MAHAJANI, |
207 | | UCP_MANICHAEAN, |
208 | | UCP_MENDE_KIKAKUI, |
209 | | UCP_MODI, |
210 | | UCP_MRO, |
211 | | UCP_NABATAEAN, |
212 | | UCP_OLD_NORTH_ARABIAN, |
213 | | UCP_OLD_PERMIC, |
214 | | UCP_PAHAWH_HMONG, |
215 | | UCP_PALMYRENE, |
216 | | UCP_PSALTER_PAHLAVI, |
217 | | UCP_PAU_CIN_HAU, |
218 | | UCP_SIDDHAM, |
219 | | UCP_TIRHUTA, |
220 | | UCP_WARANG_CITI |
221 | | }; |
222 | | |
223 | | enum |
224 | | { |
225 | | UCP_MAX_CODEPOINT = 0x10FFFF |
226 | | }; |
227 | | |
228 | | struct CharacterProperties |
229 | | /// This structure holds the character properties |
230 | | /// of an Unicode character. |
231 | | { |
232 | | CharacterCategory category; |
233 | | CharacterType type; |
234 | | Script script; |
235 | | }; |
236 | | |
237 | | static void properties(int ch, CharacterProperties& props); |
238 | | /// Return the Unicode character properties for the |
239 | | /// character with the given Unicode value. |
240 | | |
241 | | static bool isSpace(int ch); |
242 | | /// Returns true iff the given character is a separator. |
243 | | |
244 | | static bool isDigit(int ch); |
245 | | /// Returns true iff the given character is a numeric character. |
246 | | |
247 | | static bool isPunct(int ch); |
248 | | /// Returns true iff the given character is a punctuation character. |
249 | | |
250 | | static bool isAlpha(int ch); |
251 | | /// Returns true iff the given character is a letter. |
252 | | |
253 | | static bool isLower(int ch); |
254 | | /// Returns true iff the given character is a lowercase |
255 | | /// character. |
256 | | |
257 | | static bool isUpper(int ch); |
258 | | /// Returns true iff the given character is an uppercase |
259 | | /// character. |
260 | | |
261 | | static int toLower(int ch); |
262 | | /// If the given character is an uppercase character, |
263 | | /// return its lowercase counterpart, otherwise return |
264 | | /// the character. |
265 | | |
266 | | static int toUpper(int ch); |
267 | | /// If the given character is a lowercase character, |
268 | | /// return its uppercase counterpart, otherwise return |
269 | | /// the character. |
270 | | }; |
271 | | |
272 | | |
273 | | // |
274 | | // inlines |
275 | | // |
276 | | inline bool Unicode::isSpace(int ch) |
277 | 0 | { |
278 | 0 | CharacterProperties props; |
279 | 0 | properties(ch, props); |
280 | 0 | return props.category == UCP_SEPARATOR; |
281 | 0 | } |
282 | | |
283 | | |
284 | | inline bool Unicode::isDigit(int ch) |
285 | 0 | { |
286 | 0 | CharacterProperties props; |
287 | 0 | properties(ch, props); |
288 | 0 | return props.category == UCP_NUMBER; |
289 | 0 | } |
290 | | |
291 | | |
292 | | inline bool Unicode::isPunct(int ch) |
293 | 0 | { |
294 | 0 | CharacterProperties props; |
295 | 0 | properties(ch, props); |
296 | 0 | return props.category == UCP_PUNCTUATION; |
297 | 0 | } |
298 | | |
299 | | |
300 | | inline bool Unicode::isAlpha(int ch) |
301 | 0 | { |
302 | 0 | CharacterProperties props; |
303 | 0 | properties(ch, props); |
304 | 0 | return props.category == UCP_LETTER; |
305 | 0 | } |
306 | | |
307 | | |
308 | | inline bool Unicode::isLower(int ch) |
309 | 23.6M | { |
310 | 23.6M | CharacterProperties props; |
311 | 23.6M | properties(ch, props); |
312 | 23.6M | return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER; |
313 | 23.6M | } |
314 | | |
315 | | |
316 | | inline bool Unicode::isUpper(int ch) |
317 | 120M | { |
318 | 120M | CharacterProperties props; |
319 | 120M | properties(ch, props); |
320 | 120M | return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER; |
321 | 120M | } |
322 | | |
323 | | |
324 | | } // namespace Poco |
325 | | |
326 | | |
327 | | #endif // Foundation_Unicode_INCLUDED |