/src/poco/Foundation/include/Poco/Unicode.h

Source
//
// Unicode.h
//
// Library: Foundation
// Package: Text
// Module:  Unicode
//
// Definition of the Unicode class.
//
// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
// and Contributors.
//
// SPDX-License-Identifier: BSL-1.0
//


#ifndef Foundation_Unicode_INCLUDED
#define Foundation_Unicode_INCLUDED


#include "Poco/Foundation.h"


namespace Poco {


class Foundation_API Unicode
  /// This class contains enumerations and static
  /// utility functions for dealing with Unicode characters
  /// and their properties.
  ///
  /// All member functions are static; characters are passed
  /// as int code point values.
  ///
  /// For more information on Unicode, see <http://www.unicode.org>.
  ///
  /// The implementation is based on the Unicode support
  /// functions in PCRE.
{
public:
  // Implementation note: the following definitions must be kept
  // in sync with those from pcre2_ucp.h (PCRE).
  // None of the enumerators below has an explicit value, so their
  // numeric values follow from their position; do not reorder or
  // insert entries without checking the PCRE header.
  enum CharacterCategory
    /// Unicode character categories.
    ///
    /// This is the coarse classification that the isXXX()
    /// predicates of this class test against.
  {
    UCP_OTHER,
    UCP_LETTER,
    UCP_MARK,
    UCP_NUMBER,
    UCP_PUNCTUATION,
    UCP_SYMBOL,
    UCP_SEPARATOR
  };

  enum CharacterType
    /// Unicode character types.
    ///
    /// A finer-grained classification than CharacterCategory;
    /// isLower() and isUpper() additionally test against it.
  {
    UCP_CONTROL,
    UCP_FORMAT,
    UCP_UNASSIGNED,
    UCP_PRIVATE_USE,
    UCP_SURROGATE,
    UCP_LOWER_CASE_LETTER,
    UCP_MODIFIER_LETTER,
    UCP_OTHER_LETTER,
    UCP_TITLE_CASE_LETTER,
    UCP_UPPER_CASE_LETTER,
    UCP_SPACING_MARK,
    UCP_ENCLOSING_MARK,
    UCP_NON_SPACING_MARK,
    UCP_DECIMAL_NUMBER,
    UCP_LETTER_NUMBER,
    UCP_OTHER_NUMBER,
    UCP_CONNECTOR_PUNCTUATION,
    UCP_DASH_PUNCTUATION,
    UCP_CLOSE_PUNCTUATION,
    UCP_FINAL_PUNCTUATION,
    UCP_INITIAL_PUNCTUATION,
    UCP_OTHER_PUNCTUATION,
    UCP_OPEN_PUNCTUATION,
    UCP_CURRENCY_SYMBOL,
    UCP_MODIFIER_SYMBOL,
    UCP_MATHEMATICAL_SYMBOL,
    UCP_OTHER_SYMBOL,
    UCP_LINE_SEPARATOR,
    UCP_PARAGRAPH_SEPARATOR,
    UCP_SPACE_SEPARATOR
  };

  enum Script
    /// Unicode 7.0 script identifiers.
    ///
    /// The comments inside the list mark the Unicode version
    /// in which the following group of scripts was added.
  {
    UCP_ARABIC,
    UCP_ARMENIAN,
    UCP_BENGALI,
    UCP_BOPOMOFO,
    UCP_BRAILLE,
    UCP_BUGINESE,
    UCP_BUHID,
    UCP_CANADIAN_ABORIGINAL,
    UCP_CHEROKEE,
    UCP_COMMON,
    UCP_COPTIC,
    UCP_CYPRIOT,
    UCP_CYRILLIC,
    UCP_DESERET,
    UCP_DEVANAGARI,
    UCP_ETHIOPIC,
    UCP_GEORGIAN,
    UCP_GLAGOLITIC,
    UCP_GOTHIC,
    UCP_GREEK,
    UCP_GUJARATI,
    UCP_GURMUKHI,
    UCP_HAN,
    UCP_HANGUL,
    UCP_HANUNOO,
    UCP_HEBREW,
    UCP_HIRAGANA,
    UCP_INHERITED,
    UCP_KANNADA,
    UCP_KATAKANA,
    UCP_KHAROSHTHI,
    UCP_KHMER,
    UCP_LAO,
    UCP_LATIN,
    UCP_LIMBU,
    UCP_LINEAR_B,
    UCP_MALAYALAM,
    UCP_MONGOLIAN,
    UCP_MYANMAR,
    UCP_NEW_TAI_LUE,
    UCP_OGHAM,
    UCP_OLD_ITALIC,
    UCP_OLD_PERSIAN,
    UCP_ORIYA,
    UCP_OSMANYA,
    UCP_RUNIC,
    UCP_SHAVIAN,
    UCP_SINHALA,
    UCP_SYLOTI_NAGRI,
    UCP_SYRIAC,
    UCP_TAGALOG,
    UCP_TAGBANWA,
    UCP_TAI_LE,
    UCP_TAMIL,
    UCP_TELUGU,
    UCP_THAANA,
    UCP_THAI,
    UCP_TIBETAN,
    UCP_TIFINAGH,
    UCP_UGARITIC,
    UCP_YI,
    // Unicode 5.0
    UCP_BALINESE,
    UCP_CUNEIFORM,
    UCP_NKO,
    UCP_PHAGS_PA,
    UCP_PHOENICIAN,
    // Unicode 5.1
    UCP_CARIAN,
    UCP_CHAM,
    UCP_KAYAH_LI,
    UCP_LEPCHA,
    UCP_LYCIAN,
    UCP_LYDIAN,
    UCP_OL_CHIKI,
    UCP_REJANG,
    UCP_SAURASHTRA,
    UCP_SUNDANESE,
    UCP_VAI,
    // Unicode 5.2
    UCP_AVESTAN,
    UCP_BAMUM,
    UCP_EGYPTIAN_HIEROGLYPHS,
    UCP_IMPERIAL_ARAMAIC,
    UCP_INSCRIPTIONAL_PAHLAVI,
    UCP_INSCRIPTIONAL_PARTHIAN,
    UCP_JAVANESE,
    UCP_KAITHI,
    UCP_LISU,
    UCP_MEETEI_MAYEK,
    UCP_OLD_SOUTH_ARABIAN,
    UCP_OLD_TURKIC,
    UCP_SAMARITAN,
    UCP_TAI_THAM,
    UCP_TAI_VIET,
    // Unicode 6.0
    UCP_BATAK,
    UCP_BRAHMI,
    UCP_MANDAIC,
    // Unicode 6.1
    UCP_CHAKMA,
    UCP_MEROITIC_CURSIVE,
    UCP_MEROITIC_HIEROGLYPHS,
    UCP_MIAO,
    UCP_SHARADA,
    UCP_SORA_SOMPENG,
    UCP_TAKRI,
    // Unicode 7.0
    UCP_BASSA_VAH,
    UCP_CAUCASIAN_ALBANIAN,
    UCP_DUPLOYAN,
    UCP_ELBASAN,
    UCP_GRANTHA,
    UCP_KHOJKI,
    UCP_KHUDAWADI,
    UCP_LINEAR_A,
    UCP_MAHAJANI,
    UCP_MANICHAEAN,
    UCP_MENDE_KIKAKUI,
    UCP_MODI,
    UCP_MRO,
    UCP_NABATAEAN,
    UCP_OLD_NORTH_ARABIAN,
    UCP_OLD_PERMIC,
    UCP_PAHAWH_HMONG,
    UCP_PALMYRENE,
    UCP_PSALTER_PAHLAVI,
    UCP_PAU_CIN_HAU,
    UCP_SIDDHAM,
    UCP_TIRHUTA,
    UCP_WARANG_CITI
  };

  enum
    /// Limits of the Unicode code space.
  {
    UCP_MAX_CODEPOINT = 0x10FFFF
      /// The largest Unicode code point (U+10FFFF).
  };

  struct CharacterProperties
    /// This structure holds the character properties
    /// of a Unicode character, as filled in by properties().
  {
    CharacterCategory category;
      /// Coarse category of the character.
    CharacterType     type;
      /// Detailed type of the character.
    Script            script;
      /// Script the character is assigned to.
  };

  static void properties(int ch, CharacterProperties& props);
    /// Stores the Unicode character properties for the
    /// character with the given Unicode value in props.
    ///
    /// NOTE(review): the handling of values outside
    /// [0, UCP_MAX_CODEPOINT] is not visible in this header;
    /// check the implementation before relying on it.

  static bool isSpace(int ch);
    /// Returns true iff the given character is a separator,
    /// i.e. its category is UCP_SEPARATOR.

  static bool isDigit(int ch);
    /// Returns true iff the given character is a numeric character,
    /// i.e. its category is UCP_NUMBER. Only the category is
    /// checked, not whether the type is UCP_DECIMAL_NUMBER.

  static bool isPunct(int ch);
    /// Returns true iff the given character is a punctuation character,
    /// i.e. its category is UCP_PUNCTUATION.

  static bool isAlpha(int ch);
    /// Returns true iff the given character is a letter,
    /// i.e. its category is UCP_LETTER.

  static bool isLower(int ch);
    /// Returns true iff the given character is a lowercase
    /// character, i.e. its category is UCP_LETTER and its
    /// type is UCP_LOWER_CASE_LETTER.

  static bool isUpper(int ch);
    /// Returns true iff the given character is an uppercase
    /// character, i.e. its category is UCP_LETTER and its
    /// type is UCP_UPPER_CASE_LETTER.

  static int toLower(int ch);
    /// If the given character is an uppercase character,
    /// return its lowercase counterpart, otherwise return
    /// the character.

  static int toUpper(int ch);
    /// If the given character is a lowercase character,
    /// return its uppercase counterpart, otherwise return
    /// the character.
};


//
// inlines
//
inline bool Unicode::isSpace(int ch)
{
  CharacterProperties props;
  properties(ch, props);
  return props.category == UCP_SEPARATOR;
}


inline bool Unicode::isDigit(int ch)
{
  CharacterProperties props;
  properties(ch, props);
  return props.category == UCP_NUMBER;
}


inline bool Unicode::isPunct(int ch)
{
  CharacterProperties props;
  properties(ch, props);
  return props.category == UCP_PUNCTUATION;
}


inline bool Unicode::isAlpha(int ch)
{
  CharacterProperties props;
  properties(ch, props);
  return props.category == UCP_LETTER;
}


inline bool Unicode::isLower(int ch)
{
  CharacterProperties props;
  properties(ch, props);
  return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
}


inline bool Unicode::isUpper(int ch)
{
  CharacterProperties props;
  properties(ch, props);
  return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
}


} // namespace Poco


#endif // Foundation_Unicode_INCLUDED

Coverage Report

Created: 2025-11-06 06:54

Line	Count	Source
1		//
2		// Unicode.h
3		//
4		// Library: Foundation
5		// Package: Text
6		// Module: Unicode
7		//
8		// Definition of the Unicode class.
9		//
10		// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
11		// and Contributors.
12		//
13		// SPDX-License-Identifier: BSL-1.0
14		//
15
16
17		#ifndef Foundation_Unicode_INCLUDED
18		#define Foundation_Unicode_INCLUDED
19
20
21		#include "Poco/Foundation.h"
22
23
24		namespace Poco {
25
26
27		class Foundation_API Unicode
28		/// This class contains enumerations and static
29		/// utility functions for dealing with Unicode characters
30		/// and their properties.
31		///
32		/// For more information on Unicode, see <http://www.unicode.org>.
33		///
34		/// The implementation is based on the Unicode support
35		/// functions in PCRE.
36		{
37		public:
38		// Implementation note: the following definitions must be kept
39		// in sync with those from pcre2_ucp.h (PCRE).
40		enum CharacterCategory
41		/// Unicode character categories.
42		{
43		UCP_OTHER,
44		UCP_LETTER,
45		UCP_MARK,
46		UCP_NUMBER,
47		UCP_PUNCTUATION,
48		UCP_SYMBOL,
49		UCP_SEPARATOR
50		};
51
52		enum CharacterType
53		/// Unicode character types.
54		{
55		UCP_CONTROL,
56		UCP_FORMAT,
57		UCP_UNASSIGNED,
58		UCP_PRIVATE_USE,
59		UCP_SURROGATE,
60		UCP_LOWER_CASE_LETTER,
61		UCP_MODIFIER_LETTER,
62		UCP_OTHER_LETTER,
63		UCP_TITLE_CASE_LETTER,
64		UCP_UPPER_CASE_LETTER,
65		UCP_SPACING_MARK,
66		UCP_ENCLOSING_MARK,
67		UCP_NON_SPACING_MARK,
68		UCP_DECIMAL_NUMBER,
69		UCP_LETTER_NUMBER,
70		UCP_OTHER_NUMBER,
71		UCP_CONNECTOR_PUNCTUATION,
72		UCP_DASH_PUNCTUATION,
73		UCP_CLOSE_PUNCTUATION,
74		UCP_FINAL_PUNCTUATION,
75		UCP_INITIAL_PUNCTUATION,
76		UCP_OTHER_PUNCTUATION,
77		UCP_OPEN_PUNCTUATION,
78		UCP_CURRENCY_SYMBOL,
79		UCP_MODIFIER_SYMBOL,
80		UCP_MATHEMATICAL_SYMBOL,
81		UCP_OTHER_SYMBOL,
82		UCP_LINE_SEPARATOR,
83		UCP_PARAGRAPH_SEPARATOR,
84		UCP_SPACE_SEPARATOR
85		};
86
87		enum Script
88		/// Unicode 7.0 script identifiers.
89		{
90		UCP_ARABIC,
91		UCP_ARMENIAN,
92		UCP_BENGALI,
93		UCP_BOPOMOFO,
94		UCP_BRAILLE,
95		UCP_BUGINESE,
96		UCP_BUHID,
97		UCP_CANADIAN_ABORIGINAL,
98		UCP_CHEROKEE,
99		UCP_COMMON,
100		UCP_COPTIC,
101		UCP_CYPRIOT,
102		UCP_CYRILLIC,
103		UCP_DESERET,
104		UCP_DEVANAGARI,
105		UCP_ETHIOPIC,
106		UCP_GEORGIAN,
107		UCP_GLAGOLITIC,
108		UCP_GOTHIC,
109		UCP_GREEK,
110		UCP_GUJARATI,
111		UCP_GURMUKHI,
112		UCP_HAN,
113		UCP_HANGUL,
114		UCP_HANUNOO,
115		UCP_HEBREW,
116		UCP_HIRAGANA,
117		UCP_INHERITED,
118		UCP_KANNADA,
119		UCP_KATAKANA,
120		UCP_KHAROSHTHI,
121		UCP_KHMER,
122		UCP_LAO,
123		UCP_LATIN,
124		UCP_LIMBU,
125		UCP_LINEAR_B,
126		UCP_MALAYALAM,
127		UCP_MONGOLIAN,
128		UCP_MYANMAR,
129		UCP_NEW_TAI_LUE,
130		UCP_OGHAM,
131		UCP_OLD_ITALIC,
132		UCP_OLD_PERSIAN,
133		UCP_ORIYA,
134		UCP_OSMANYA,
135		UCP_RUNIC,
136		UCP_SHAVIAN,
137		UCP_SINHALA,
138		UCP_SYLOTI_NAGRI,
139		UCP_SYRIAC,
140		UCP_TAGALOG,
141		UCP_TAGBANWA,
142		UCP_TAI_LE,
143		UCP_TAMIL,
144		UCP_TELUGU,
145		UCP_THAANA,
146		UCP_THAI,
147		UCP_TIBETAN,
148		UCP_TIFINAGH,
149		UCP_UGARITIC,
150		UCP_YI,
151		// Unicode 5.0
152		UCP_BALINESE,
153		UCP_CUNEIFORM,
154		UCP_NKO,
155		UCP_PHAGS_PA,
156		UCP_PHOENICIAN,
157		// Unicode 5.1
158		UCP_CARIAN,
159		UCP_CHAM,
160		UCP_KAYAH_LI,
161		UCP_LEPCHA,
162		UCP_LYCIAN,
163		UCP_LYDIAN,
164		UCP_OL_CHIKI,
165		UCP_REJANG,
166		UCP_SAURASHTRA,
167		UCP_SUNDANESE,
168		UCP_VAI,
169		// Unicode 5.2
170		UCP_AVESTAN,
171		UCP_BAMUM,
172		UCP_EGYPTIAN_HIEROGLYPHS,
173		UCP_IMPERIAL_ARAMAIC,
174		UCP_INSCRIPTIONAL_PAHLAVI,
175		UCP_INSCRIPTIONAL_PARTHIAN,
176		UCP_JAVANESE,
177		UCP_KAITHI,
178		UCP_LISU,
179		UCP_MEETEI_MAYEK,
180		UCP_OLD_SOUTH_ARABIAN,
181		UCP_OLD_TURKIC,
182		UCP_SAMARITAN,
183		UCP_TAI_THAM,
184		UCP_TAI_VIET,
185		// Unicode 6.0
186		UCP_BATAK,
187		UCP_BRAHMI,
188		UCP_MANDAIC,
189		// Unicode 6.1
190		UCP_CHAKMA,
191		UCP_MEROITIC_CURSIVE,
192		UCP_MEROITIC_HIEROGLYPHS,
193		UCP_MIAO,
194		UCP_SHARADA,
195		UCP_SORA_SOMPENG,
196		UCP_TAKRI,
197		// Unicode 7.0
198		UCP_BASSA_VAH,
199		UCP_CAUCASIAN_ALBANIAN,
200		UCP_DUPLOYAN,
201		UCP_ELBASAN,
202		UCP_GRANTHA,
203		UCP_KHOJKI,
204		UCP_KHUDAWADI,
205		UCP_LINEAR_A,
206		UCP_MAHAJANI,
207		UCP_MANICHAEAN,
208		UCP_MENDE_KIKAKUI,
209		UCP_MODI,
210		UCP_MRO,
211		UCP_NABATAEAN,
212		UCP_OLD_NORTH_ARABIAN,
213		UCP_OLD_PERMIC,
214		UCP_PAHAWH_HMONG,
215		UCP_PALMYRENE,
216		UCP_PSALTER_PAHLAVI,
217		UCP_PAU_CIN_HAU,
218		UCP_SIDDHAM,
219		UCP_TIRHUTA,
220		UCP_WARANG_CITI
221		};
222
223		enum
224		{
225		UCP_MAX_CODEPOINT = 0x10FFFF
226		};
227
228		struct CharacterProperties
229		/// This structure holds the character properties
230		/// of an Unicode character.
231		{
232		CharacterCategory category;
233		CharacterType type;
234		Script script;
235		};
236
237		static void properties(int ch, CharacterProperties& props);
238		/// Return the Unicode character properties for the
239		/// character with the given Unicode value.
240
241		static bool isSpace(int ch);
242		/// Returns true iff the given character is a separator.
243
244		static bool isDigit(int ch);
245		/// Returns true iff the given character is a numeric character.
246
247		static bool isPunct(int ch);
248		/// Returns true iff the given character is a punctuation character.
249
250		static bool isAlpha(int ch);
251		/// Returns true iff the given character is a letter.
252
253		static bool isLower(int ch);
254		/// Returns true iff the given character is a lowercase
255		/// character.
256
257		static bool isUpper(int ch);
258		/// Returns true iff the given character is an uppercase
259		/// character.
260
261		static int toLower(int ch);
262		/// If the given character is an uppercase character,
263		/// return its lowercase counterpart, otherwise return
264		/// the character.
265
266		static int toUpper(int ch);
267		/// If the given character is a lowercase character,
268		/// return its uppercase counterpart, otherwise return
269		/// the character.
270		};
271
272
273		//
274		// inlines
275		//
276		inline bool Unicode::isSpace(int ch)
277	0	{
278	0	CharacterProperties props;
279	0	properties(ch, props);
280	0	return props.category == UCP_SEPARATOR;
281	0	}
282
283
284		inline bool Unicode::isDigit(int ch)
285	0	{
286	0	CharacterProperties props;
287	0	properties(ch, props);
288	0	return props.category == UCP_NUMBER;
289	0	}
290
291
292		inline bool Unicode::isPunct(int ch)
293	0	{
294	0	CharacterProperties props;
295	0	properties(ch, props);
296	0	return props.category == UCP_PUNCTUATION;
297	0	}
298
299
300		inline bool Unicode::isAlpha(int ch)
301	0	{
302	0	CharacterProperties props;
303	0	properties(ch, props);
304	0	return props.category == UCP_LETTER;
305	0	}
306
307
308		inline bool Unicode::isLower(int ch)
309	23.6M	{
310	23.6M	CharacterProperties props;
311	23.6M	properties(ch, props);
312	23.6M	return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
313	23.6M	}
314
315
316		inline bool Unicode::isUpper(int ch)
317	120M	{
318	120M	CharacterProperties props;
319	120M	properties(ch, props);
320	120M	return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
321	120M	}
322
323
324		} // namespace Poco
325
326
327		#endif // Foundation_Unicode_INCLUDED