Coverage Report

Created: 2025-11-06 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/poco/Foundation/include/Poco/Unicode.h
Line
Count
Source
1
//
2
// Unicode.h
3
//
4
// Library: Foundation
5
// Package: Text
6
// Module:  Unicode
7
//
8
// Definition of the Unicode class.
9
//
10
// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
11
// and Contributors.
12
//
13
// SPDX-License-Identifier: BSL-1.0
14
//
15
16
17
#ifndef Foundation_Unicode_INCLUDED
18
#define Foundation_Unicode_INCLUDED
19
20
21
#include "Poco/Foundation.h"
22
23
24
namespace Poco {
25
26
27
class Foundation_API Unicode
28
  /// This class contains enumerations and static
29
  /// utility functions for dealing with Unicode characters
30
  /// and their properties.
31
  ///
32
  /// For more information on Unicode, see <http://www.unicode.org>.
33
  ///
34
  /// The implementation is based on the Unicode support
35
  /// functions in PCRE.
36
{
37
public:
38
  // Implementation note: the following definitions must be kept
39
  // in sync with those from pcre2_ucp.h (PCRE).
40
  enum CharacterCategory
41
    /// Unicode character categories (the coarse classification;
    /// see CharacterType for the finer-grained one).
42
  {
43
    UCP_OTHER,
44
    UCP_LETTER,
45
    UCP_MARK,
46
    UCP_NUMBER,
47
    UCP_PUNCTUATION,
48
    UCP_SYMBOL,
49
    UCP_SEPARATOR
50
  };
51
52
  enum CharacterType
53
    /// Unicode character types (finer-grained than CharacterCategory).
54
  {
55
    UCP_CONTROL,
56
    UCP_FORMAT,
57
    UCP_UNASSIGNED,
58
    UCP_PRIVATE_USE,
59
    UCP_SURROGATE,
60
    UCP_LOWER_CASE_LETTER,
61
    UCP_MODIFIER_LETTER,
62
    UCP_OTHER_LETTER,
63
    UCP_TITLE_CASE_LETTER,
64
    UCP_UPPER_CASE_LETTER,
65
    UCP_SPACING_MARK,
66
    UCP_ENCLOSING_MARK,
67
    UCP_NON_SPACING_MARK,
68
    UCP_DECIMAL_NUMBER,
69
    UCP_LETTER_NUMBER,
70
    UCP_OTHER_NUMBER,
71
    UCP_CONNECTOR_PUNCTUATION,
72
    UCP_DASH_PUNCTUATION,
73
    UCP_CLOSE_PUNCTUATION,
74
    UCP_FINAL_PUNCTUATION,
75
    UCP_INITIAL_PUNCTUATION,
76
    UCP_OTHER_PUNCTUATION,
77
    UCP_OPEN_PUNCTUATION,
78
    UCP_CURRENCY_SYMBOL,
79
    UCP_MODIFIER_SYMBOL,
80
    UCP_MATHEMATICAL_SYMBOL,
81
    UCP_OTHER_SYMBOL,
82
    UCP_LINE_SEPARATOR,
83
    UCP_PARAGRAPH_SEPARATOR,
84
    UCP_SPACE_SEPARATOR
85
  };
86
87
  enum Script
88
    /// Unicode 7.0 script identifiers. The version comments inside
    /// the list label the groups of enumerators that follow them.
89
  {
90
    UCP_ARABIC,
91
    UCP_ARMENIAN,
92
    UCP_BENGALI,
93
    UCP_BOPOMOFO,
94
    UCP_BRAILLE,
95
    UCP_BUGINESE,
96
    UCP_BUHID,
97
    UCP_CANADIAN_ABORIGINAL,
98
    UCP_CHEROKEE,
99
    UCP_COMMON,
100
    UCP_COPTIC,
101
    UCP_CYPRIOT,
102
    UCP_CYRILLIC,
103
    UCP_DESERET,
104
    UCP_DEVANAGARI,
105
    UCP_ETHIOPIC,
106
    UCP_GEORGIAN,
107
    UCP_GLAGOLITIC,
108
    UCP_GOTHIC,
109
    UCP_GREEK,
110
    UCP_GUJARATI,
111
    UCP_GURMUKHI,
112
    UCP_HAN,
113
    UCP_HANGUL,
114
    UCP_HANUNOO,
115
    UCP_HEBREW,
116
    UCP_HIRAGANA,
117
    UCP_INHERITED,
118
    UCP_KANNADA,
119
    UCP_KATAKANA,
120
    UCP_KHAROSHTHI,
121
    UCP_KHMER,
122
    UCP_LAO,
123
    UCP_LATIN,
124
    UCP_LIMBU,
125
    UCP_LINEAR_B,
126
    UCP_MALAYALAM,
127
    UCP_MONGOLIAN,
128
    UCP_MYANMAR,
129
    UCP_NEW_TAI_LUE,
130
    UCP_OGHAM,
131
    UCP_OLD_ITALIC,
132
    UCP_OLD_PERSIAN,
133
    UCP_ORIYA,
134
    UCP_OSMANYA,
135
    UCP_RUNIC,
136
    UCP_SHAVIAN,
137
    UCP_SINHALA,
138
    UCP_SYLOTI_NAGRI,
139
    UCP_SYRIAC,
140
    UCP_TAGALOG,
141
    UCP_TAGBANWA,
142
    UCP_TAI_LE,
143
    UCP_TAMIL,
144
    UCP_TELUGU,
145
    UCP_THAANA,
146
    UCP_THAI,
147
    UCP_TIBETAN,
148
    UCP_TIFINAGH,
149
    UCP_UGARITIC,
150
    UCP_YI,
151
    // Unicode 5.0
152
    UCP_BALINESE,
153
    UCP_CUNEIFORM,
154
    UCP_NKO,
155
    UCP_PHAGS_PA,
156
    UCP_PHOENICIAN,
157
    // Unicode 5.1
158
    UCP_CARIAN,
159
    UCP_CHAM,
160
    UCP_KAYAH_LI,
161
    UCP_LEPCHA,
162
    UCP_LYCIAN,
163
    UCP_LYDIAN,
164
    UCP_OL_CHIKI,
165
    UCP_REJANG,
166
    UCP_SAURASHTRA,
167
    UCP_SUNDANESE,
168
    UCP_VAI,
169
    // Unicode 5.2
170
    UCP_AVESTAN,
171
    UCP_BAMUM,
172
    UCP_EGYPTIAN_HIEROGLYPHS,
173
    UCP_IMPERIAL_ARAMAIC,
174
    UCP_INSCRIPTIONAL_PAHLAVI,
175
    UCP_INSCRIPTIONAL_PARTHIAN,
176
    UCP_JAVANESE,
177
    UCP_KAITHI,
178
    UCP_LISU,
179
    UCP_MEETEI_MAYEK,
180
    UCP_OLD_SOUTH_ARABIAN,
181
    UCP_OLD_TURKIC,
182
    UCP_SAMARITAN,
183
    UCP_TAI_THAM,
184
    UCP_TAI_VIET,
185
    // Unicode 6.0
186
    UCP_BATAK,
187
    UCP_BRAHMI,
188
    UCP_MANDAIC,
189
    // Unicode 6.1
190
    UCP_CHAKMA,
191
    UCP_MEROITIC_CURSIVE,
192
    UCP_MEROITIC_HIEROGLYPHS,
193
    UCP_MIAO,
194
    UCP_SHARADA,
195
    UCP_SORA_SOMPENG,
196
    UCP_TAKRI,
197
    // Unicode 7.0
198
    UCP_BASSA_VAH,
199
    UCP_CAUCASIAN_ALBANIAN,
200
    UCP_DUPLOYAN,
201
    UCP_ELBASAN,
202
    UCP_GRANTHA,
203
    UCP_KHOJKI,
204
    UCP_KHUDAWADI,
205
    UCP_LINEAR_A,
206
    UCP_MAHAJANI,
207
    UCP_MANICHAEAN,
208
    UCP_MENDE_KIKAKUI,
209
    UCP_MODI,
210
    UCP_MRO,
211
    UCP_NABATAEAN,
212
    UCP_OLD_NORTH_ARABIAN,
213
    UCP_OLD_PERMIC,
214
    UCP_PAHAWH_HMONG,
215
    UCP_PALMYRENE,
216
    UCP_PSALTER_PAHLAVI,
217
    UCP_PAU_CIN_HAU,
218
    UCP_SIDDHAM,
219
    UCP_TIRHUTA,
220
    UCP_WARANG_CITI
221
  };
222
223
  enum
224
  {
225
    UCP_MAX_CODEPOINT = 0x10FFFF // highest code point value (U+10FFFF)
226
  };
227
228
  struct CharacterProperties
229
    /// This structure holds the character properties
230
    /// of a Unicode character.
231
  {
232
    CharacterCategory category;
233
    CharacterType     type;
234
    Script            script;
235
  };
236
237
  static void properties(int ch, CharacterProperties& props);
238
    /// Stores the Unicode character properties for the
239
    /// character with the given Unicode value in props.
240
241
  static bool isSpace(int ch);
242
    /// Returns true iff the given character is a separator
    /// (its category is UCP_SEPARATOR).
243
244
  static bool isDigit(int ch);
245
    /// Returns true iff the given character is a numeric character
    /// (its category is UCP_NUMBER).
246
247
  static bool isPunct(int ch);
248
    /// Returns true iff the given character is a punctuation character
    /// (its category is UCP_PUNCTUATION).
249
250
  static bool isAlpha(int ch);
251
    /// Returns true iff the given character is a letter
    /// (its category is UCP_LETTER).
252
253
  static bool isLower(int ch);
254
    /// Returns true iff the given character is a lowercase
255
    /// letter (category UCP_LETTER, type UCP_LOWER_CASE_LETTER).
256
257
  static bool isUpper(int ch);
258
    /// Returns true iff the given character is an uppercase
259
    /// letter (category UCP_LETTER, type UCP_UPPER_CASE_LETTER).
260
261
  static int toLower(int ch);
262
    /// If the given character is an uppercase character,
263
    /// return its lowercase counterpart, otherwise return
264
    /// the character.
265
266
  static int toUpper(int ch);
267
    /// If the given character is a lowercase character,
268
    /// return its uppercase counterpart, otherwise return
269
    /// the character.
270
};
271
272
273
//
274
// inlines
275
//
276
inline bool Unicode::isSpace(int ch)
277
0
{
278
0
  CharacterProperties props;
279
0
  properties(ch, props);
280
0
  return props.category == UCP_SEPARATOR;
281
0
}
282
283
284
inline bool Unicode::isDigit(int ch)
285
0
{
286
0
  CharacterProperties props;
287
0
  properties(ch, props);
288
0
  return props.category == UCP_NUMBER;
289
0
}
290
291
292
inline bool Unicode::isPunct(int ch)
293
0
{
294
0
  CharacterProperties props;
295
0
  properties(ch, props);
296
0
  return props.category == UCP_PUNCTUATION;
297
0
}
298
299
300
inline bool Unicode::isAlpha(int ch)
301
0
{
302
0
  CharacterProperties props;
303
0
  properties(ch, props);
304
0
  return props.category == UCP_LETTER;
305
0
}
306
307
308
inline bool Unicode::isLower(int ch)
309
23.6M
{
310
23.6M
  CharacterProperties props;
311
23.6M
  properties(ch, props);
312
23.6M
  return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
313
23.6M
}
314
315
316
inline bool Unicode::isUpper(int ch)
317
120M
{
318
120M
  CharacterProperties props;
319
120M
  properties(ch, props);
320
120M
  return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
321
120M
}
322
323
324
} // namespace Poco
325
326
327
#endif // Foundation_Unicode_INCLUDED