/src/glib/glib/guniprop.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* guniprop.c - Unicode character properties. |
2 | | * |
3 | | * Copyright (C) 1999 Tom Tromey |
4 | | * Copyright (C) 2000 Red Hat, Inc. |
5 | | * |
6 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
7 | | * |
8 | | * This library is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * This library is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General Public |
19 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | #include "config.h" |
23 | | |
24 | | #include <stdlib.h> |
25 | | #include <stddef.h> |
26 | | #include <string.h> |
27 | | #include <locale.h> |
28 | | |
29 | | #include "gmem.h" |
30 | | #include "gstring.h" |
31 | | #include "gtestutils.h" |
32 | | #include "gtypes.h" |
33 | | #include "gunicode.h" |
34 | | #include "gunichartables.h" |
35 | | #include "gmirroringtable.h" |
36 | | #include "gscripttable.h" |
37 | | #include "gunicodeprivate.h" |
38 | | #ifdef G_OS_WIN32 |
39 | | #include "gwin32.h" |
40 | | #endif |
41 | | |
42 | 0 | #define G_UNICHAR_FULLWIDTH_A 0xff21 |
43 | 0 | #define G_UNICHAR_FULLWIDTH_I 0xff29 |
44 | 0 | #define G_UNICHAR_FULLWIDTH_J 0xff2a |
45 | 0 | #define G_UNICHAR_FULLWIDTH_F 0xff26 |
46 | 0 | #define G_UNICHAR_FULLWIDTH_a 0xff41 |
47 | 0 | #define G_UNICHAR_FULLWIDTH_f 0xff46 |
48 | | |
49 | 0 | #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \ |
50 | 0 | ? attr_table_part1[Page] \ |
51 | 0 | : attr_table_part2[(Page) - 0xe00]) |
52 | | |
53 | | #define ATTTABLE(Page, Char) \ |
54 | 0 | ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char])) |
55 | | |
56 | | #define TTYPE_PART1(Page, Char) \ |
57 | 0 | ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
58 | 0 | ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
59 | 0 | : (type_data[type_table_part1[Page]][Char])) |
60 | | |
61 | | #define TTYPE_PART2(Page, Char) \ |
62 | 0 | ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
63 | 0 | ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
64 | 0 | : (type_data[type_table_part2[Page]][Char])) |
65 | | |
66 | | #define TYPE(Char) \ |
67 | 0 | (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ |
68 | 0 | ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \ |
69 | 0 | : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ |
70 | 0 | ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ |
71 | 0 | : G_UNICODE_UNASSIGNED)) |
72 | | |
73 | | |
74 | 0 | #define IS(Type, Class) (((guint)1 << (Type)) & (Class)) |
75 | | #define OR(Type, Rest) (((guint)1 << (Type)) | (Rest)) |
76 | | |
77 | | |
78 | | |
79 | 0 | #define ISALPHA(Type) IS ((Type), \ |
80 | 0 | OR (G_UNICODE_LOWERCASE_LETTER, \ |
81 | 0 | OR (G_UNICODE_UPPERCASE_LETTER, \ |
82 | 0 | OR (G_UNICODE_TITLECASE_LETTER, \ |
83 | 0 | OR (G_UNICODE_MODIFIER_LETTER, \ |
84 | 0 | OR (G_UNICODE_OTHER_LETTER, 0)))))) |
85 | | |
86 | 0 | #define ISALDIGIT(Type) IS ((Type), \ |
87 | 0 | OR (G_UNICODE_DECIMAL_NUMBER, \ |
88 | 0 | OR (G_UNICODE_LETTER_NUMBER, \ |
89 | 0 | OR (G_UNICODE_OTHER_NUMBER, \ |
90 | 0 | OR (G_UNICODE_LOWERCASE_LETTER, \ |
91 | 0 | OR (G_UNICODE_UPPERCASE_LETTER, \ |
92 | 0 | OR (G_UNICODE_TITLECASE_LETTER, \ |
93 | 0 | OR (G_UNICODE_MODIFIER_LETTER, \ |
94 | 0 | OR (G_UNICODE_OTHER_LETTER, 0))))))))) |
95 | | |
96 | 0 | #define ISMARK(Type) IS ((Type), \ |
97 | 0 | OR (G_UNICODE_NON_SPACING_MARK, \ |
98 | 0 | OR (G_UNICODE_SPACING_MARK, \ |
99 | 0 | OR (G_UNICODE_ENCLOSING_MARK, 0)))) |
100 | | |
101 | | #define ISZEROWIDTHTYPE(Type) IS ((Type), \ |
102 | | OR (G_UNICODE_NON_SPACING_MARK, \ |
103 | | OR (G_UNICODE_ENCLOSING_MARK, \ |
104 | | OR (G_UNICODE_FORMAT, 0)))) |
105 | | |
106 | | /** |
107 | | * g_unichar_isalnum: |
108 | | * @c: a Unicode character |
109 | | * |
110 | | * Determines whether a character is alphanumeric. |
111 | | * Given some UTF-8 text, obtain a character value |
112 | | * with g_utf8_get_char(). |
113 | | * |
114 | | * Returns: %TRUE if @c is an alphanumeric character |
115 | | **/ |
116 | | gboolean |
117 | | g_unichar_isalnum (gunichar c) |
118 | 0 | { |
119 | 0 | return ISALDIGIT (TYPE (c)) ? TRUE : FALSE; |
120 | 0 | } |
121 | | |
122 | | /** |
123 | | * g_unichar_isalpha: |
124 | | * @c: a Unicode character |
125 | | * |
126 | | * Determines whether a character is alphabetic (i.e. a letter). |
127 | | * Given some UTF-8 text, obtain a character value with |
128 | | * g_utf8_get_char(). |
129 | | * |
130 | | * Returns: %TRUE if @c is an alphabetic character |
131 | | **/ |
132 | | gboolean |
133 | | g_unichar_isalpha (gunichar c) |
134 | 0 | { |
135 | 0 | return ISALPHA (TYPE (c)) ? TRUE : FALSE; |
136 | 0 | } |
137 | | |
138 | | |
139 | | /** |
140 | | * g_unichar_iscntrl: |
141 | | * @c: a Unicode character |
142 | | * |
143 | | * Determines whether a character is a control character. |
144 | | * Given some UTF-8 text, obtain a character value with |
145 | | * g_utf8_get_char(). |
146 | | * |
147 | | * Returns: %TRUE if @c is a control character |
148 | | **/ |
149 | | gboolean |
150 | | g_unichar_iscntrl (gunichar c) |
151 | 0 | { |
152 | 0 | return TYPE (c) == G_UNICODE_CONTROL; |
153 | 0 | } |
154 | | |
155 | | /** |
156 | | * g_unichar_isdigit: |
157 | | * @c: a Unicode character |
158 | | * |
159 | | * Determines whether a character is numeric (i.e. a digit). This |
160 | | * covers ASCII 0-9 and also digits in other languages/scripts. Given |
161 | | * some UTF-8 text, obtain a character value with g_utf8_get_char(). |
162 | | * |
163 | | * Returns: %TRUE if @c is a digit |
164 | | **/ |
165 | | gboolean |
166 | | g_unichar_isdigit (gunichar c) |
167 | 0 | { |
168 | 0 | return TYPE (c) == G_UNICODE_DECIMAL_NUMBER; |
169 | 0 | } |
170 | | |
171 | | |
172 | | /** |
173 | | * g_unichar_isgraph: |
174 | | * @c: a Unicode character |
175 | | * |
176 | | * Determines whether a character is printable and not a space |
177 | | * (returns %FALSE for control characters, format characters, and |
178 | | * spaces). g_unichar_isprint() is similar, but returns %TRUE for |
179 | | * spaces. Given some UTF-8 text, obtain a character value with |
180 | | * g_utf8_get_char(). |
181 | | * |
182 | | * Returns: %TRUE if @c is printable unless it's a space |
183 | | **/ |
184 | | gboolean |
185 | | g_unichar_isgraph (gunichar c) |
186 | 0 | { |
187 | 0 | return !IS (TYPE(c), |
188 | 0 | OR (G_UNICODE_CONTROL, |
189 | 0 | OR (G_UNICODE_FORMAT, |
190 | 0 | OR (G_UNICODE_UNASSIGNED, |
191 | 0 | OR (G_UNICODE_SURROGATE, |
192 | 0 | OR (G_UNICODE_SPACE_SEPARATOR, |
193 | 0 | 0)))))); |
194 | 0 | } |
195 | | |
196 | | /** |
197 | | * g_unichar_islower: |
198 | | * @c: a Unicode character |
199 | | * |
200 | | * Determines whether a character is a lowercase letter. |
201 | | * Given some UTF-8 text, obtain a character value with |
202 | | * g_utf8_get_char(). |
203 | | * |
204 | | * Returns: %TRUE if @c is a lowercase letter |
205 | | **/ |
206 | | gboolean |
207 | | g_unichar_islower (gunichar c) |
208 | 0 | { |
209 | 0 | return TYPE (c) == G_UNICODE_LOWERCASE_LETTER; |
210 | 0 | } |
211 | | |
212 | | |
213 | | /** |
214 | | * g_unichar_isprint: |
215 | | * @c: a Unicode character |
216 | | * |
217 | | * Determines whether a character is printable. |
218 | | * Unlike g_unichar_isgraph(), returns %TRUE for spaces. |
219 | | * Given some UTF-8 text, obtain a character value with |
220 | | * g_utf8_get_char(). |
221 | | * |
222 | | * Returns: %TRUE if @c is printable |
223 | | **/ |
224 | | gboolean |
225 | | g_unichar_isprint (gunichar c) |
226 | 0 | { |
227 | 0 | return !IS (TYPE(c), |
228 | 0 | OR (G_UNICODE_CONTROL, |
229 | 0 | OR (G_UNICODE_FORMAT, |
230 | 0 | OR (G_UNICODE_UNASSIGNED, |
231 | 0 | OR (G_UNICODE_SURROGATE, |
232 | 0 | 0))))); |
233 | 0 | } |
234 | | |
235 | | /** |
236 | | * g_unichar_ispunct: |
237 | | * @c: a Unicode character |
238 | | * |
239 | | * Determines whether a character is punctuation or a symbol. |
240 | | * Given some UTF-8 text, obtain a character value with |
241 | | * g_utf8_get_char(). |
242 | | * |
243 | | * Returns: %TRUE if @c is a punctuation or symbol character |
244 | | **/ |
245 | | gboolean |
246 | | g_unichar_ispunct (gunichar c) |
247 | 0 | { |
248 | 0 | return IS (TYPE(c), |
249 | 0 | OR (G_UNICODE_CONNECT_PUNCTUATION, |
250 | 0 | OR (G_UNICODE_DASH_PUNCTUATION, |
251 | 0 | OR (G_UNICODE_CLOSE_PUNCTUATION, |
252 | 0 | OR (G_UNICODE_FINAL_PUNCTUATION, |
253 | 0 | OR (G_UNICODE_INITIAL_PUNCTUATION, |
254 | 0 | OR (G_UNICODE_OTHER_PUNCTUATION, |
255 | 0 | OR (G_UNICODE_OPEN_PUNCTUATION, |
256 | 0 | OR (G_UNICODE_CURRENCY_SYMBOL, |
257 | 0 | OR (G_UNICODE_MODIFIER_SYMBOL, |
258 | 0 | OR (G_UNICODE_MATH_SYMBOL, |
259 | 0 | OR (G_UNICODE_OTHER_SYMBOL, |
260 | 0 | 0)))))))))))) ? TRUE : FALSE; |
261 | 0 | } |
262 | | |
263 | | /** |
264 | | * g_unichar_isspace: |
265 | | * @c: a Unicode character |
266 | | * |
267 | | * Determines whether a character is a space, tab, or line separator |
268 | | * (newline, carriage return, etc.). Given some UTF-8 text, obtain a |
269 | | * character value with g_utf8_get_char(). |
270 | | * |
271 | | * (Note: don't use this to do word breaking; you have to use |
272 | | * Pango or equivalent to get word breaking right, the algorithm |
273 | | * is fairly complex.) |
274 | | * |
275 | | * Returns: %TRUE if @c is a space character |
276 | | **/ |
277 | | gboolean |
278 | | g_unichar_isspace (gunichar c) |
279 | 0 | { |
280 | 0 | switch (c) |
281 | 0 | { |
282 | | /* special-case these since Unicode thinks they are not spaces */ |
283 | 0 | case '\t': |
284 | 0 | case '\n': |
285 | 0 | case '\r': |
286 | 0 | case '\f': |
287 | 0 | return TRUE; |
288 | 0 | break; |
289 | | |
290 | 0 | default: |
291 | 0 | { |
292 | 0 | return IS (TYPE(c), |
293 | 0 | OR (G_UNICODE_SPACE_SEPARATOR, |
294 | 0 | OR (G_UNICODE_LINE_SEPARATOR, |
295 | 0 | OR (G_UNICODE_PARAGRAPH_SEPARATOR, |
296 | 0 | 0)))) ? TRUE : FALSE; |
297 | 0 | } |
298 | 0 | break; |
299 | 0 | } |
300 | 0 | } |
301 | | |
302 | | /** |
303 | | * g_unichar_ismark: |
304 | | * @c: a Unicode character |
305 | | * |
306 | | * Determines whether a character is a mark (non-spacing mark, |
307 | | * combining mark, or enclosing mark in Unicode speak). |
308 | | * Given some UTF-8 text, obtain a character value |
309 | | * with g_utf8_get_char(). |
310 | | * |
311 | | * Note: in most cases where isalpha characters are allowed, |
312 | | * ismark characters should be allowed too, as they are essential |
313 | | * for writing most European languages as well as many non-Latin |
314 | | * scripts. |
315 | | * |
316 | | * Returns: %TRUE if @c is a mark character |
317 | | * |
318 | | * Since: 2.14 |
319 | | **/ |
320 | | gboolean |
321 | | g_unichar_ismark (gunichar c) |
322 | 0 | { |
323 | 0 | return ISMARK (TYPE (c)); |
324 | 0 | } |
325 | | |
326 | | /** |
327 | | * g_unichar_isupper: |
328 | | * @c: a Unicode character |
329 | | * |
330 | | * Determines if a character is uppercase. |
331 | | * |
332 | | * Returns: %TRUE if @c is an uppercase character |
333 | | **/ |
334 | | gboolean |
335 | | g_unichar_isupper (gunichar c) |
336 | 0 | { |
337 | 0 | return TYPE (c) == G_UNICODE_UPPERCASE_LETTER; |
338 | 0 | } |
339 | | |
340 | | /** |
341 | | * g_unichar_istitle: |
342 | | * @c: a Unicode character |
343 | | * |
344 | | * Determines if a character is titlecase. Some characters in |
345 | | * Unicode which are composites, such as the DZ digraph |
346 | | * have three case variants instead of just two. The titlecase |
347 | | * form is used at the beginning of a word where only the |
348 | | * first letter is capitalized. The titlecase form of the DZ |
349 | | * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z. |
350 | | * |
351 | | * Returns: %TRUE if the character is titlecase |
352 | | **/ |
353 | | gboolean |
354 | | g_unichar_istitle (gunichar c) |
355 | 0 | { |
356 | 0 | unsigned int i; |
357 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
358 | 0 | if (title_table[i][0] == c) |
359 | 0 | return TRUE; |
360 | 0 | return FALSE; |
361 | 0 | } |
362 | | |
363 | | /** |
364 | | * g_unichar_isxdigit: |
365 | | * @c: a Unicode character. |
366 | | * |
367 | | * Determines if a character is a hexadecimal digit. |
368 | | * |
369 | | * Returns: %TRUE if the character is a hexadecimal digit |
370 | | **/ |
371 | | gboolean |
372 | | g_unichar_isxdigit (gunichar c) |
373 | 0 | { |
374 | 0 | return ((c >= 'a' && c <= 'f') || |
375 | 0 | (c >= 'A' && c <= 'F') || |
376 | 0 | (c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f) || |
377 | 0 | (c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F) || |
378 | 0 | (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)); |
379 | 0 | } |
380 | | |
381 | | /** |
382 | | * g_unichar_isdefined: |
383 | | * @c: a Unicode character |
384 | | * |
385 | | * Determines if a given character is assigned in the Unicode |
386 | | * standard. |
387 | | * |
388 | | * Returns: %TRUE if the character has an assigned value |
389 | | **/ |
390 | | gboolean |
391 | | g_unichar_isdefined (gunichar c) |
392 | 0 | { |
393 | 0 | return !IS (TYPE(c), |
394 | 0 | OR (G_UNICODE_UNASSIGNED, |
395 | 0 | OR (G_UNICODE_SURROGATE, |
396 | 0 | 0))); |
397 | 0 | } |
398 | | |
399 | | /** |
400 | | * g_unichar_iszerowidth: |
401 | | * @c: a Unicode character |
402 | | * |
403 | | * Determines if a given character typically takes zero width when rendered. |
404 | | * The return value is %TRUE for all non-spacing and enclosing marks |
405 | | * (e.g., combining accents), format characters, zero-width |
406 | | * space, but not U+00AD SOFT HYPHEN. |
407 | | * |
408 | | * A typical use of this function is with one of g_unichar_iswide() or |
409 | | * g_unichar_iswide_cjk() to determine the number of cells a string occupies |
410 | | * when displayed on a grid display (terminals). However, note that not all |
411 | | * terminals support zero-width rendering of zero-width marks. |
412 | | * |
413 | | * Returns: %TRUE if the character has zero width |
414 | | * |
415 | | * Since: 2.14 |
416 | | **/ |
417 | | gboolean |
418 | | g_unichar_iszerowidth (gunichar c) |
419 | 0 | { |
420 | 0 | if (G_UNLIKELY (c == 0x00AD)) |
421 | 0 | return FALSE; |
422 | | |
423 | 0 | if (G_UNLIKELY (ISZEROWIDTHTYPE (TYPE (c)))) |
424 | 0 | return TRUE; |
425 | | |
426 | | /* A few additional codepoints are zero-width: |
427 | | * - Part of the Hangul Jamo block covering medial/vowels/jungseong and |
428 | | * final/trailing_consonants/jongseong Jamo |
429 | | * - Jungseong and jongseong for Old Korean |
430 | | * - Zero-width space (U+200B) |
431 | | */ |
432 | 0 | if (G_UNLIKELY ((c >= 0x1160 && c < 0x1200) || |
433 | 0 | (c >= 0xD7B0 && c < 0xD800) || |
434 | 0 | c == 0x200B)) |
435 | 0 | return TRUE; |
436 | | |
437 | 0 | return FALSE; |
438 | 0 | } |
439 | | |
440 | | static int |
441 | | interval_compare (const void *key, const void *elt) |
442 | 0 | { |
443 | 0 | gunichar c = GPOINTER_TO_UINT (key); |
444 | 0 | struct Interval *interval = (struct Interval *)elt; |
445 | |
|
446 | 0 | if (c < interval->start) |
447 | 0 | return -1; |
448 | 0 | if (c > interval->end) |
449 | 0 | return +1; |
450 | | |
451 | 0 | return 0; |
452 | 0 | } |
453 | | |
454 | 0 | #define G_WIDTH_TABLE_MIDPOINT (G_N_ELEMENTS (g_unicode_width_table_wide) / 2) |
455 | | |
456 | | static inline gboolean |
457 | | g_unichar_iswide_bsearch (gunichar ch) |
458 | 0 | { |
459 | 0 | int lower = 0; |
460 | 0 | int upper = G_N_ELEMENTS (g_unicode_width_table_wide) - 1; |
461 | 0 | static int saved_mid = G_WIDTH_TABLE_MIDPOINT; |
462 | 0 | int mid = saved_mid; |
463 | |
|
464 | 0 | do |
465 | 0 | { |
466 | 0 | if (ch < g_unicode_width_table_wide[mid].start) |
467 | 0 | upper = mid - 1; |
468 | 0 | else if (ch > g_unicode_width_table_wide[mid].end) |
469 | 0 | lower = mid + 1; |
470 | 0 | else |
471 | 0 | return TRUE; |
472 | | |
473 | 0 | mid = (lower + upper) / 2; |
474 | 0 | } |
475 | 0 | while (lower <= upper); |
476 | | |
477 | 0 | return FALSE; |
478 | 0 | } |
479 | | |
480 | | static const struct Interval default_wide_blocks[] = { |
481 | | { 0x3400, 0x4dbf }, |
482 | | { 0x4e00, 0x9fff }, |
483 | | { 0xf900, 0xfaff }, |
484 | | { 0x20000, 0x2fffd }, |
485 | | { 0x30000, 0x3fffd } |
486 | | }; |
487 | | |
488 | | /** |
489 | | * g_unichar_iswide: |
490 | | * @c: a Unicode character |
491 | | * |
492 | | * Determines if a character is typically rendered in a double-width |
493 | | * cell. |
494 | | * |
495 | | * Returns: %TRUE if the character is wide |
496 | | **/ |
497 | | gboolean |
498 | | g_unichar_iswide (gunichar c) |
499 | 0 | { |
500 | 0 | if (c < g_unicode_width_table_wide[0].start) |
501 | 0 | return FALSE; |
502 | 0 | else if (g_unichar_iswide_bsearch (c)) |
503 | 0 | return TRUE; |
504 | 0 | else if (g_unichar_type (c) == G_UNICODE_UNASSIGNED && |
505 | 0 | bsearch (GUINT_TO_POINTER (c), |
506 | 0 | default_wide_blocks, |
507 | 0 | G_N_ELEMENTS (default_wide_blocks), |
508 | 0 | sizeof default_wide_blocks[0], |
509 | 0 | interval_compare)) |
510 | 0 | return TRUE; |
511 | | |
512 | 0 | return FALSE; |
513 | 0 | } |
514 | | |
515 | | |
516 | | /** |
517 | | * g_unichar_iswide_cjk: |
518 | | * @c: a Unicode character |
519 | | * |
520 | | * Determines if a character is typically rendered in a double-width |
521 | | * cell under legacy East Asian locales. If a character is wide according to |
522 | | * g_unichar_iswide(), then it is also reported wide with this function, but |
523 | | * the converse is not necessarily true. See the |
524 | | * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) |
525 | | * for details. |
526 | | * |
527 | | * If a character passes the g_unichar_iswide() test then it will also pass |
528 | | * this test, but not the other way around. Note that some characters may |
529 | | * pass both this test and g_unichar_iszerowidth(). |
530 | | * |
531 | | * Returns: %TRUE if the character is wide in legacy East Asian locales |
532 | | * |
533 | | * Since: 2.12 |
534 | | */ |
535 | | gboolean |
536 | | g_unichar_iswide_cjk (gunichar c) |
537 | 0 | { |
538 | 0 | if (g_unichar_iswide (c)) |
539 | 0 | return TRUE; |
540 | | |
541 | | /* bsearch() is declared attribute(nonnull(1)) so we can't validly search |
542 | | * for a NULL key */ |
543 | 0 | if (c == 0) |
544 | 0 | return FALSE; |
545 | | |
546 | 0 | if (bsearch (GUINT_TO_POINTER (c), |
547 | 0 | g_unicode_width_table_ambiguous, |
548 | 0 | G_N_ELEMENTS (g_unicode_width_table_ambiguous), |
549 | 0 | sizeof g_unicode_width_table_ambiguous[0], |
550 | 0 | interval_compare)) |
551 | 0 | return TRUE; |
552 | | |
553 | 0 | return FALSE; |
554 | 0 | } |
555 | | |
556 | | |
557 | | /** |
558 | | * g_unichar_toupper: |
559 | | * @c: a Unicode character |
560 | | * |
561 | | * Converts a character to uppercase. |
562 | | * |
563 | | * Returns: the result of converting @c to uppercase. |
564 | | * If @c is not a lowercase or titlecase character, |
565 | | * or has no uppercase equivalent, @c is returned unchanged. |
566 | | **/ |
567 | | gunichar |
568 | | g_unichar_toupper (gunichar c) |
569 | 0 | { |
570 | 0 | int t = TYPE (c); |
571 | 0 | if (t == G_UNICODE_LOWERCASE_LETTER) |
572 | 0 | { |
573 | 0 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
574 | 0 | if (val >= 0x1000000) |
575 | 0 | { |
576 | 0 | const gchar *p = special_case_table + (val - 0x1000000); |
577 | 0 | val = g_utf8_get_char (p); |
578 | 0 | } |
579 | | /* Some lowercase letters, e.g., U+00AA, FEMININE ORDINAL INDICATOR, |
580 | | * do not have an uppercase equivalent, in which case val will be |
581 | | * zero. |
582 | | */ |
583 | 0 | return val ? val : c; |
584 | 0 | } |
585 | 0 | else if (t == G_UNICODE_TITLECASE_LETTER) |
586 | 0 | { |
587 | 0 | unsigned int i; |
588 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
589 | 0 | { |
590 | 0 | if (title_table[i][0] == c) |
591 | 0 | return title_table[i][1] ? title_table[i][1] : c; |
592 | 0 | } |
593 | 0 | } |
594 | 0 | return c; |
595 | 0 | } |
596 | | |
597 | | /** |
598 | | * g_unichar_tolower: |
599 | | * @c: a Unicode character. |
600 | | * |
601 | | * Converts a character to lower case. |
602 | | * |
603 | | * Returns: the result of converting @c to lower case. |
604 | | * If @c is not an uppercase or titlecase character, |
605 | | * or has no lowercase equivalent, @c is returned unchanged. |
606 | | **/ |
607 | | gunichar |
608 | | g_unichar_tolower (gunichar c) |
609 | 0 | { |
610 | 0 | int t = TYPE (c); |
611 | 0 | if (t == G_UNICODE_UPPERCASE_LETTER) |
612 | 0 | { |
613 | 0 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
614 | 0 | if (val >= 0x1000000) |
615 | 0 | { |
616 | 0 | const gchar *p = special_case_table + (val - 0x1000000); |
617 | 0 | return g_utf8_get_char (p); |
618 | 0 | } |
619 | 0 | else |
620 | 0 | { |
621 | | /* Not all uppercase letters are guaranteed to have a lowercase |
622 | | * equivalent. If this is the case, val will be zero. */ |
623 | 0 | return val ? val : c; |
624 | 0 | } |
625 | 0 | } |
626 | 0 | else if (t == G_UNICODE_TITLECASE_LETTER) |
627 | 0 | { |
628 | 0 | unsigned int i; |
629 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
630 | 0 | { |
631 | 0 | if (title_table[i][0] == c) |
632 | 0 | return title_table[i][2]; |
633 | 0 | } |
634 | 0 | } |
635 | 0 | return c; |
636 | 0 | } |
637 | | |
638 | | /** |
639 | | * g_unichar_totitle: |
640 | | * @c: a Unicode character |
641 | | * |
642 | | * Converts a character to titlecase. |
643 | | * |
644 | | * Returns: the result of converting @c to titlecase. |
645 | | * If @c is not an uppercase or lowercase character, |
646 | | * @c is returned unchanged. |
647 | | **/ |
648 | | gunichar |
649 | | g_unichar_totitle (gunichar c) |
650 | 0 | { |
651 | 0 | unsigned int i; |
652 | | |
653 | | /* We handle U+0000 explicitly because some elements in |
654 | | * title_table[i][1] may be null. */ |
655 | 0 | if (c == 0) |
656 | 0 | return c; |
657 | | |
658 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
659 | 0 | { |
660 | 0 | if (title_table[i][0] == c || title_table[i][1] == c |
661 | 0 | || title_table[i][2] == c) |
662 | 0 | return title_table[i][0]; |
663 | 0 | } |
664 | | |
665 | 0 | if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER) |
666 | 0 | return g_unichar_toupper (c); |
667 | | |
668 | 0 | return c; |
669 | 0 | } |
670 | | |
671 | | /** |
672 | | * g_unichar_digit_value: |
673 | | * @c: a Unicode character |
674 | | * |
675 | | * Determines the numeric value of a character as a decimal |
676 | | * digit. |
677 | | * |
678 | | * Returns: If @c is a decimal digit (according to |
679 | | * g_unichar_isdigit()), its numeric value. Otherwise, -1. |
680 | | **/ |
681 | | int |
682 | | g_unichar_digit_value (gunichar c) |
683 | 0 | { |
684 | 0 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
685 | 0 | return ATTTABLE (c >> 8, c & 0xff); |
686 | 0 | return -1; |
687 | 0 | } |
688 | | |
689 | | /** |
690 | | * g_unichar_xdigit_value: |
691 | | * @c: a Unicode character |
692 | | * |
693 | | * Determines the numeric value of a character as a hexadecimal |
694 | | * digit. |
695 | | * |
696 | | * Returns: If @c is a hex digit (according to |
697 | | * g_unichar_isxdigit()), its numeric value. Otherwise, -1. |
698 | | **/ |
699 | | int |
700 | | g_unichar_xdigit_value (gunichar c) |
701 | 0 | { |
702 | 0 | if (c >= 'A' && c <= 'F') |
703 | 0 | return c - 'A' + 10; |
704 | 0 | if (c >= 'a' && c <= 'f') |
705 | 0 | return c - 'a' + 10; |
706 | 0 | if (c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F) |
707 | 0 | return c - G_UNICHAR_FULLWIDTH_A + 10; |
708 | 0 | if (c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f) |
709 | 0 | return c - G_UNICHAR_FULLWIDTH_a + 10; |
710 | 0 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
711 | 0 | return ATTTABLE (c >> 8, c & 0xff); |
712 | 0 | return -1; |
713 | 0 | } |
714 | | |
715 | | /** |
716 | | * g_unichar_type: |
717 | | * @c: a Unicode character |
718 | | * |
719 | | * Classifies a Unicode character by type. |
720 | | * |
721 | | * Returns: the type of the character. |
722 | | **/ |
723 | | GUnicodeType |
724 | | g_unichar_type (gunichar c) |
725 | 0 | { |
726 | 0 | return TYPE (c); |
727 | 0 | } |
728 | | |
729 | | /* |
730 | | * Case mapping functions |
731 | | */ |
732 | | |
733 | | typedef enum { |
734 | | LOCALE_NORMAL, |
735 | | LOCALE_TURKIC, |
736 | | LOCALE_LITHUANIAN |
737 | | } LocaleType; |
738 | | |
739 | | static LocaleType |
740 | | get_locale_type (void) |
741 | 0 | { |
742 | | #ifdef G_OS_WIN32 |
743 | | char *tem = g_win32_getlocale (); |
744 | | char locale[2]; |
745 | | |
746 | | locale[0] = tem[0]; |
747 | | locale[1] = tem[1]; |
748 | | g_free (tem); |
749 | | #else |
750 | 0 | const char *locale = setlocale (LC_CTYPE, NULL); |
751 | |
|
752 | 0 | if (locale == NULL) |
753 | 0 | return LOCALE_NORMAL; |
754 | 0 | #endif |
755 | | |
756 | 0 | switch (locale[0]) |
757 | 0 | { |
758 | 0 | case 'a': |
759 | 0 | if (locale[1] == 'z') |
760 | 0 | return LOCALE_TURKIC; |
761 | 0 | break; |
762 | 0 | case 'l': |
763 | 0 | if (locale[1] == 't') |
764 | 0 | return LOCALE_LITHUANIAN; |
765 | 0 | break; |
766 | 0 | case 't': |
767 | 0 | if (locale[1] == 'r') |
768 | 0 | return LOCALE_TURKIC; |
769 | 0 | break; |
770 | 0 | } |
771 | | |
772 | 0 | return LOCALE_NORMAL; |
773 | 0 | } |
774 | | |
775 | | static gint |
776 | | output_marks (const char **p_inout, |
777 | | char *out_buffer, |
778 | | gboolean remove_dot) |
779 | 0 | { |
780 | 0 | const char *p = *p_inout; |
781 | 0 | gint len = 0; |
782 | | |
783 | 0 | while (*p) |
784 | 0 | { |
785 | 0 | gunichar c = g_utf8_get_char (p); |
786 | | |
787 | 0 | if (ISMARK (TYPE (c))) |
788 | 0 | { |
789 | 0 | if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */) |
790 | 0 | len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL); |
791 | 0 | p = g_utf8_next_char (p); |
792 | 0 | } |
793 | 0 | else |
794 | 0 | break; |
795 | 0 | } |
796 | |
|
797 | 0 | *p_inout = p; |
798 | 0 | return len; |
799 | 0 | } |
800 | | |
801 | | static gint |
802 | | output_special_case (gchar *out_buffer, |
803 | | int offset, |
804 | | int type, |
805 | | int which) |
806 | 0 | { |
807 | 0 | const gchar *p = special_case_table + offset; |
808 | 0 | size_t len; |
809 | |
|
810 | 0 | if (type != G_UNICODE_TITLECASE_LETTER) |
811 | 0 | p = g_utf8_next_char (p); |
812 | |
|
813 | 0 | if (which == 1) |
814 | 0 | p += strlen (p) + 1; |
815 | |
|
816 | 0 | len = strlen (p); |
817 | 0 | if (out_buffer) |
818 | 0 | memcpy (out_buffer, p, len); |
819 | |
|
820 | 0 | return len; |
821 | 0 | } |
822 | | |
823 | | static gsize |
824 | | real_toupper (const gchar *str, |
825 | | gssize max_len, |
826 | | gchar *out_buffer, |
827 | | LocaleType locale_type) |
828 | 0 | { |
829 | 0 | const gchar *p = str; |
830 | 0 | const char *last = NULL; |
831 | 0 | gsize len = 0; |
832 | 0 | gboolean last_was_i = FALSE; |
833 | |
|
834 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
835 | 0 | { |
836 | 0 | gunichar c = g_utf8_get_char (p); |
837 | 0 | int t = TYPE (c); |
838 | 0 | gunichar val; |
839 | |
|
840 | 0 | last = p; |
841 | 0 | p = g_utf8_next_char (p); |
842 | |
|
843 | 0 | if (locale_type == LOCALE_LITHUANIAN) |
844 | 0 | { |
845 | 0 | if (c == 'i') |
846 | 0 | last_was_i = TRUE; |
847 | 0 | else |
848 | 0 | { |
849 | 0 | if (last_was_i) |
850 | 0 | { |
851 | | /* Nasty, need to remove any dot above. Though |
852 | | * I think only E WITH DOT ABOVE occurs in practice |
853 | | * which could simplify this considerably. |
854 | | */ |
855 | 0 | gsize decomp_len, i; |
856 | 0 | gunichar decomp[G_UNICHAR_MAX_DECOMPOSITION_LENGTH]; |
857 | |
|
858 | 0 | decomp_len = g_unichar_fully_decompose (c, FALSE, decomp, G_N_ELEMENTS (decomp)); |
859 | 0 | for (i=0; i < decomp_len; i++) |
860 | 0 | { |
861 | 0 | if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) |
862 | 0 | len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL); |
863 | 0 | } |
864 | | |
865 | 0 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE); |
866 | |
|
867 | 0 | continue; |
868 | 0 | } |
869 | | |
870 | 0 | if (!ISMARK (t)) |
871 | 0 | last_was_i = FALSE; |
872 | 0 | } |
873 | 0 | } |
874 | | |
875 | 0 | if (locale_type == LOCALE_TURKIC && c == 'i') |
876 | 0 | { |
877 | | /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
878 | 0 | len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL); |
879 | 0 | } |
880 | 0 | else if (c == 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */ |
881 | 0 | { |
882 | | /* Nasty, need to move it after other combining marks .. this would go away if |
883 | | * we normalized first. |
884 | | */ |
885 | 0 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); |
886 | | |
887 | | /* And output as GREEK CAPITAL LETTER IOTA */ |
888 | 0 | len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); |
889 | 0 | } |
890 | 0 | else if (IS (t, |
891 | 0 | OR (G_UNICODE_LOWERCASE_LETTER, |
892 | 0 | OR (G_UNICODE_TITLECASE_LETTER, |
893 | 0 | 0)))) |
894 | 0 | { |
895 | 0 | val = ATTTABLE (c >> 8, c & 0xff); |
896 | |
|
897 | 0 | if (val >= 0x1000000) |
898 | 0 | { |
899 | 0 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, |
900 | 0 | t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1); |
901 | 0 | } |
902 | 0 | else |
903 | 0 | { |
904 | 0 | if (t == G_UNICODE_TITLECASE_LETTER) |
905 | 0 | { |
906 | 0 | unsigned int i; |
907 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
908 | 0 | { |
909 | 0 | if (title_table[i][0] == c) |
910 | 0 | { |
911 | 0 | val = title_table[i][1]; |
912 | 0 | break; |
913 | 0 | } |
914 | 0 | } |
915 | 0 | } |
916 | | |
917 | | /* Some lowercase letters, e.g., U+00AA, FEMININE ORDINAL INDICATOR, |
918 | | * do not have an uppercase equivalent, in which case val will be |
919 | | * zero. */ |
920 | 0 | len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); |
921 | 0 | } |
922 | 0 | } |
923 | 0 | else |
924 | 0 | { |
925 | 0 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
926 | |
|
927 | 0 | if (out_buffer) |
928 | 0 | memcpy (out_buffer + len, last, char_len); |
929 | |
|
930 | 0 | len += char_len; |
931 | 0 | } |
932 | |
|
933 | 0 | } |
934 | |
|
935 | 0 | return len; |
936 | 0 | } |
937 | | |
938 | | /** |
939 | | * g_utf8_strup: |
940 | | * @str: a UTF-8 encoded string |
941 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
942 | | * |
943 | | * Converts all Unicode characters in the string that have a case |
944 | | * to uppercase. The exact manner that this is done depends |
945 | | * on the current locale, and may result in the number of |
946 | | * characters in the string increasing. (For instance, the |
947 | | * German ess-zet will be changed to SS.) |
948 | | * |
949 | | * Returns: a newly allocated string, with all characters |
950 | | * converted to uppercase. |
951 | | **/ |
952 | | gchar * |
953 | | g_utf8_strup (const gchar *str, |
954 | | gssize len) |
955 | 0 | { |
956 | 0 | gsize result_len; |
957 | 0 | LocaleType locale_type; |
958 | 0 | gchar *result; |
959 | |
|
960 | 0 | g_return_val_if_fail (str != NULL, NULL); |
961 | | |
962 | 0 | locale_type = get_locale_type (); |
963 | | |
964 | | /* |
965 | | * We use a two pass approach to keep memory management simple |
966 | | */ |
967 | 0 | result_len = real_toupper (str, len, NULL, locale_type); |
968 | 0 | result = g_malloc (result_len + 1); |
969 | 0 | real_toupper (str, len, result, locale_type); |
970 | 0 | result[result_len] = '\0'; |
971 | |
|
972 | 0 | return result; |
973 | 0 | } |
974 | | |
975 | | /* traverses the string checking for characters with combining class == 230 |
976 | | * until a base character is found */ |
977 | | static gboolean |
978 | | has_more_above (const gchar *str) |
979 | 0 | { |
980 | 0 | const gchar *p = str; |
981 | 0 | gint combining_class; |
982 | |
|
983 | 0 | while (*p) |
984 | 0 | { |
985 | 0 | combining_class = g_unichar_combining_class (g_utf8_get_char (p)); |
986 | 0 | if (combining_class == 230) |
987 | 0 | return TRUE; |
988 | 0 | else if (combining_class == 0) |
989 | 0 | break; |
990 | | |
991 | 0 | p = g_utf8_next_char (p); |
992 | 0 | } |
993 | | |
994 | 0 | return FALSE; |
995 | 0 | } |
996 | | |
997 | | static gsize |
998 | | real_tolower (const gchar *str, |
999 | | gssize max_len, |
1000 | | gchar *out_buffer, |
1001 | | LocaleType locale_type) |
1002 | 0 | { |
1003 | 0 | const gchar *p = str; |
1004 | 0 | const char *last = NULL; |
1005 | 0 | gsize len = 0; |
1006 | |
|
1007 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
1008 | 0 | { |
1009 | 0 | gunichar c = g_utf8_get_char (p); |
1010 | 0 | int t = TYPE (c); |
1011 | 0 | gunichar val; |
1012 | |
|
1013 | 0 | last = p; |
1014 | 0 | p = g_utf8_next_char (p); |
1015 | |
|
1016 | 0 | if (locale_type == LOCALE_TURKIC && (c == 'I' || c == 0x130 || |
1017 | 0 | c == G_UNICHAR_FULLWIDTH_I)) |
1018 | 0 | { |
1019 | 0 | gboolean combining_dot = (c == 'I' || c == G_UNICHAR_FULLWIDTH_I) && |
1020 | 0 | g_utf8_get_char (p) == 0x0307; |
1021 | 0 | if (combining_dot || c == 0x130) |
1022 | 0 | { |
1023 | | /* I + COMBINING DOT ABOVE => i (U+0069) |
1024 | | * LATIN CAPITAL LETTER I WITH DOT ABOVE => i (U+0069) */ |
1025 | 0 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
1026 | 0 | if (combining_dot) |
1027 | 0 | p = g_utf8_next_char (p); |
1028 | 0 | } |
1029 | 0 | else |
1030 | 0 | { |
1031 | | /* I => LATIN SMALL LETTER DOTLESS I */ |
1032 | 0 | len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); |
1033 | 0 | } |
1034 | 0 | } |
1035 | | /* Introduce an explicit dot above when lowercasing capital I's and J's |
1036 | | * whenever there are more accents above. [SpecialCasing.txt] */ |
1037 | 0 | else if (locale_type == LOCALE_LITHUANIAN && |
1038 | 0 | (c == 0x00cc || c == 0x00cd || c == 0x0128)) |
1039 | 0 | { |
1040 | 0 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
1041 | 0 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
1042 | |
|
1043 | 0 | switch (c) |
1044 | 0 | { |
1045 | 0 | case 0x00cc: |
1046 | 0 | len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); |
1047 | 0 | break; |
1048 | 0 | case 0x00cd: |
1049 | 0 | len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); |
1050 | 0 | break; |
1051 | 0 | case 0x0128: |
1052 | 0 | len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); |
1053 | 0 | break; |
1054 | 0 | } |
1055 | 0 | } |
1056 | 0 | else if (locale_type == LOCALE_LITHUANIAN && |
1057 | 0 | (c == 'I' || c == G_UNICHAR_FULLWIDTH_I || |
1058 | 0 | c == 'J' || c == G_UNICHAR_FULLWIDTH_J || c == 0x012e) && |
1059 | 0 | has_more_above (p)) |
1060 | 0 | { |
1061 | 0 | len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); |
1062 | 0 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
1063 | 0 | } |
1064 | 0 | else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ |
1065 | 0 | { |
1066 | 0 | if ((max_len < 0 || p < str + max_len) && *p) |
1067 | 0 | { |
1068 | 0 | gunichar next_c = g_utf8_get_char (p); |
1069 | 0 | int next_type = TYPE(next_c); |
1070 | | |
1071 | | /* SIGMA mapps differently depending on whether it is |
1072 | | * final or not. The following simplified test would |
1073 | | * fail in the case of combining marks following the |
1074 | | * sigma, but I don't think that occurs in real text. |
1075 | | * The test here matches that in ICU. |
1076 | | */ |
1077 | 0 | if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */ |
1078 | 0 | val = 0x3c3; /* GREEK SMALL SIGMA */ |
1079 | 0 | else |
1080 | 0 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
1081 | 0 | } |
1082 | 0 | else |
1083 | 0 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
1084 | |
|
1085 | 0 | len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
1086 | 0 | } |
1087 | 0 | else if (IS (t, |
1088 | 0 | OR (G_UNICODE_UPPERCASE_LETTER, |
1089 | 0 | OR (G_UNICODE_TITLECASE_LETTER, |
1090 | 0 | 0)))) |
1091 | 0 | { |
1092 | 0 | val = ATTTABLE (c >> 8, c & 0xff); |
1093 | |
|
1094 | 0 | if (val >= 0x1000000) |
1095 | 0 | { |
1096 | 0 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0); |
1097 | 0 | } |
1098 | 0 | else |
1099 | 0 | { |
1100 | 0 | if (t == G_UNICODE_TITLECASE_LETTER) |
1101 | 0 | { |
1102 | 0 | unsigned int i; |
1103 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
1104 | 0 | { |
1105 | 0 | if (title_table[i][0] == c) |
1106 | 0 | { |
1107 | 0 | val = title_table[i][2]; |
1108 | 0 | break; |
1109 | 0 | } |
1110 | 0 | } |
1111 | 0 | } |
1112 | | |
1113 | | /* Not all uppercase letters are guaranteed to have a lowercase |
1114 | | * equivalent. If this is the case, val will be zero. */ |
1115 | 0 | len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); |
1116 | 0 | } |
1117 | 0 | } |
1118 | 0 | else |
1119 | 0 | { |
1120 | 0 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
1121 | |
|
1122 | 0 | if (out_buffer) |
1123 | 0 | memcpy (out_buffer + len, last, char_len); |
1124 | |
|
1125 | 0 | len += char_len; |
1126 | 0 | } |
1127 | |
|
1128 | 0 | } |
1129 | | |
1130 | 0 | return len; |
1131 | 0 | } |
1132 | | |
1133 | | /** |
1134 | | * g_utf8_strdown: |
1135 | | * @str: a UTF-8 encoded string |
1136 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
1137 | | * |
1138 | | * Converts all Unicode characters in the string that have a case |
1139 | | * to lowercase. The exact manner that this is done depends |
1140 | | * on the current locale, and may result in the number of |
1141 | | * characters in the string changing. |
1142 | | * |
1143 | | * Returns: a newly allocated string, with all characters |
1144 | | * converted to lowercase. |
1145 | | **/ |
1146 | | gchar * |
1147 | | g_utf8_strdown (const gchar *str, |
1148 | | gssize len) |
1149 | 0 | { |
1150 | 0 | gsize result_len; |
1151 | 0 | LocaleType locale_type; |
1152 | 0 | gchar *result; |
1153 | |
|
1154 | 0 | g_return_val_if_fail (str != NULL, NULL); |
1155 | | |
1156 | 0 | locale_type = get_locale_type (); |
1157 | | |
1158 | | /* |
1159 | | * We use a two pass approach to keep memory management simple |
1160 | | */ |
1161 | 0 | result_len = real_tolower (str, len, NULL, locale_type); |
1162 | 0 | result = g_malloc (result_len + 1); |
1163 | 0 | real_tolower (str, len, result, locale_type); |
1164 | 0 | result[result_len] = '\0'; |
1165 | |
|
1166 | 0 | return result; |
1167 | 0 | } |
1168 | | |
1169 | | /** |
1170 | | * g_utf8_casefold: |
1171 | | * @str: a UTF-8 encoded string |
1172 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
1173 | | * |
1174 | | * Converts a string into a form that is independent of case. The |
1175 | | * result will not correspond to any particular case, but can be |
1176 | | * compared for equality or ordered with the results of calling |
1177 | | * g_utf8_casefold() on other strings. |
1178 | | * |
1179 | | * Note that calling g_utf8_casefold() followed by g_utf8_collate() is |
1180 | | * only an approximation to the correct linguistic case insensitive |
1181 | | * ordering, though it is a fairly good one. Getting this exactly |
1182 | | * right would require a more sophisticated collation function that |
1183 | | * takes case sensitivity into account. GLib does not currently |
1184 | | * provide such a function. |
1185 | | * |
1186 | | * Returns: a newly allocated string, that is a |
1187 | | * case independent form of @str. |
1188 | | **/ |
1189 | | gchar * |
1190 | | g_utf8_casefold (const gchar *str, |
1191 | | gssize len) |
1192 | 0 | { |
1193 | 0 | GString *result; |
1194 | 0 | const char *p; |
1195 | |
|
1196 | 0 | g_return_val_if_fail (str != NULL, NULL); |
1197 | | |
1198 | 0 | result = g_string_new (NULL); |
1199 | 0 | p = str; |
1200 | 0 | while ((len < 0 || p < str + len) && *p) |
1201 | 0 | { |
1202 | 0 | gunichar ch = g_utf8_get_char (p); |
1203 | |
|
1204 | 0 | int start = 0; |
1205 | 0 | int end = G_N_ELEMENTS (casefold_table); |
1206 | |
|
1207 | 0 | if (ch >= casefold_table[start].ch && |
1208 | 0 | ch <= casefold_table[end - 1].ch) |
1209 | 0 | { |
1210 | 0 | while (TRUE) |
1211 | 0 | { |
1212 | 0 | int half = (start + end) / 2; |
1213 | 0 | if (ch == casefold_table[half].ch) |
1214 | 0 | { |
1215 | 0 | g_string_append (result, casefold_table[half].data); |
1216 | 0 | goto next; |
1217 | 0 | } |
1218 | 0 | else if (half == start) |
1219 | 0 | break; |
1220 | 0 | else if (ch > casefold_table[half].ch) |
1221 | 0 | start = half; |
1222 | 0 | else |
1223 | 0 | end = half; |
1224 | 0 | } |
1225 | 0 | } |
1226 | | |
1227 | 0 | g_string_append_unichar (result, g_unichar_tolower (ch)); |
1228 | | |
1229 | 0 | next: |
1230 | 0 | p = g_utf8_next_char (p); |
1231 | 0 | } |
1232 | | |
1233 | 0 | return g_string_free (result, FALSE); |
1234 | 0 | } |
1235 | | |
1236 | | /** |
1237 | | * g_unichar_get_mirror_char: |
1238 | | * @ch: a Unicode character |
1239 | | * @mirrored_ch: (out): location to store the mirrored character |
1240 | | * |
1241 | | * In Unicode, some characters are "mirrored". This means that their |
1242 | | * images are mirrored horizontally in text that is laid out from right |
1243 | | * to left. For instance, "(" would become its mirror image, ")", in |
1244 | | * right-to-left text. |
1245 | | * |
1246 | | * If @ch has the Unicode mirrored property and there is another unicode |
1247 | | * character that typically has a glyph that is the mirror image of @ch's |
1248 | | * glyph and @mirrored_ch is set, it puts that character in the address |
1249 | | * pointed to by @mirrored_ch. Otherwise the original character is put. |
1250 | | * |
1251 | | * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise |
1252 | | * |
1253 | | * Since: 2.4 |
1254 | | **/ |
1255 | | gboolean |
1256 | | g_unichar_get_mirror_char (gunichar ch, |
1257 | | gunichar *mirrored_ch) |
1258 | 0 | { |
1259 | 0 | gboolean found; |
1260 | 0 | gunichar mirrored; |
1261 | |
|
1262 | 0 | mirrored = GLIB_GET_MIRRORING(ch); |
1263 | |
|
1264 | 0 | found = ch != mirrored; |
1265 | 0 | if (mirrored_ch) |
1266 | 0 | *mirrored_ch = mirrored; |
1267 | |
|
1268 | 0 | return found; |
1269 | |
|
1270 | 0 | } |
1271 | | |
1272 | 0 | #define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2) |
1273 | | |
1274 | | static inline GUnicodeScript |
1275 | | g_unichar_get_script_bsearch (gunichar ch) |
1276 | 0 | { |
1277 | 0 | int lower = 0; |
1278 | 0 | int upper = G_N_ELEMENTS (g_script_table) - 1; |
1279 | 0 | static int saved_mid = G_SCRIPT_TABLE_MIDPOINT; |
1280 | 0 | int mid = saved_mid; |
1281 | | |
1282 | |
|
1283 | 0 | do |
1284 | 0 | { |
1285 | 0 | if (ch < g_script_table[mid].start) |
1286 | 0 | upper = mid - 1; |
1287 | 0 | else if (ch >= g_script_table[mid].start + g_script_table[mid].chars) |
1288 | 0 | lower = mid + 1; |
1289 | 0 | else |
1290 | 0 | return g_script_table[saved_mid = mid].script; |
1291 | | |
1292 | 0 | mid = (lower + upper) / 2; |
1293 | 0 | } |
1294 | 0 | while (lower <= upper); |
1295 | | |
1296 | 0 | return G_UNICODE_SCRIPT_UNKNOWN; |
1297 | 0 | } |
1298 | | |
1299 | | /** |
1300 | | * g_unichar_get_script: |
1301 | | * @ch: a Unicode character |
1302 | | * |
1303 | | * Looks up the #GUnicodeScript for a particular character (as defined |
1304 | | * by Unicode Standard Annex \#24). No check is made for @ch being a |
1305 | | * valid Unicode character; if you pass in invalid character, the |
1306 | | * result is undefined. |
1307 | | * |
1308 | | * This function is equivalent to pango_script_for_unichar() and the |
1309 | | * two are interchangeable. |
1310 | | * |
1311 | | * Returns: the #GUnicodeScript for the character. |
1312 | | * |
1313 | | * Since: 2.14 |
1314 | | */ |
1315 | | GUnicodeScript |
1316 | | g_unichar_get_script (gunichar ch) |
1317 | 0 | { |
1318 | 0 | if (ch < G_EASY_SCRIPTS_RANGE) |
1319 | 0 | return g_script_easy_table[ch]; |
1320 | 0 | else |
1321 | 0 | return g_unichar_get_script_bsearch (ch); |
1322 | 0 | } |
1323 | | |
1324 | | |
1325 | | /* http://unicode.org/iso15924/ */ |
1326 | | static const guint32 iso15924_tags[] = |
1327 | | { |
1328 | | #define PACK(a,b,c,d) ((guint32)((((guint8)(a))<<24)|(((guint8)(b))<<16)|(((guint8)(c))<<8)|((guint8)(d)))) |
1329 | | |
1330 | | PACK ('Z','y','y','y'), /* G_UNICODE_SCRIPT_COMMON */ |
1331 | | PACK ('Z','i','n','h'), /* G_UNICODE_SCRIPT_INHERITED */ |
1332 | | PACK ('A','r','a','b'), /* G_UNICODE_SCRIPT_ARABIC */ |
1333 | | PACK ('A','r','m','n'), /* G_UNICODE_SCRIPT_ARMENIAN */ |
1334 | | PACK ('B','e','n','g'), /* G_UNICODE_SCRIPT_BENGALI */ |
1335 | | PACK ('B','o','p','o'), /* G_UNICODE_SCRIPT_BOPOMOFO */ |
1336 | | PACK ('C','h','e','r'), /* G_UNICODE_SCRIPT_CHEROKEE */ |
1337 | | PACK ('C','o','p','t'), /* G_UNICODE_SCRIPT_COPTIC */ |
1338 | | PACK ('C','y','r','l'), /* G_UNICODE_SCRIPT_CYRILLIC */ |
1339 | | PACK ('D','s','r','t'), /* G_UNICODE_SCRIPT_DESERET */ |
1340 | | PACK ('D','e','v','a'), /* G_UNICODE_SCRIPT_DEVANAGARI */ |
1341 | | PACK ('E','t','h','i'), /* G_UNICODE_SCRIPT_ETHIOPIC */ |
1342 | | PACK ('G','e','o','r'), /* G_UNICODE_SCRIPT_GEORGIAN */ |
1343 | | PACK ('G','o','t','h'), /* G_UNICODE_SCRIPT_GOTHIC */ |
1344 | | PACK ('G','r','e','k'), /* G_UNICODE_SCRIPT_GREEK */ |
1345 | | PACK ('G','u','j','r'), /* G_UNICODE_SCRIPT_GUJARATI */ |
1346 | | PACK ('G','u','r','u'), /* G_UNICODE_SCRIPT_GURMUKHI */ |
1347 | | PACK ('H','a','n','i'), /* G_UNICODE_SCRIPT_HAN */ |
1348 | | PACK ('H','a','n','g'), /* G_UNICODE_SCRIPT_HANGUL */ |
1349 | | PACK ('H','e','b','r'), /* G_UNICODE_SCRIPT_HEBREW */ |
1350 | | PACK ('H','i','r','a'), /* G_UNICODE_SCRIPT_HIRAGANA */ |
1351 | | PACK ('K','n','d','a'), /* G_UNICODE_SCRIPT_KANNADA */ |
1352 | | PACK ('K','a','n','a'), /* G_UNICODE_SCRIPT_KATAKANA */ |
1353 | | PACK ('K','h','m','r'), /* G_UNICODE_SCRIPT_KHMER */ |
1354 | | PACK ('L','a','o','o'), /* G_UNICODE_SCRIPT_LAO */ |
1355 | | PACK ('L','a','t','n'), /* G_UNICODE_SCRIPT_LATIN */ |
1356 | | PACK ('M','l','y','m'), /* G_UNICODE_SCRIPT_MALAYALAM */ |
1357 | | PACK ('M','o','n','g'), /* G_UNICODE_SCRIPT_MONGOLIAN */ |
1358 | | PACK ('M','y','m','r'), /* G_UNICODE_SCRIPT_MYANMAR */ |
1359 | | PACK ('O','g','a','m'), /* G_UNICODE_SCRIPT_OGHAM */ |
1360 | | PACK ('I','t','a','l'), /* G_UNICODE_SCRIPT_OLD_ITALIC */ |
1361 | | PACK ('O','r','y','a'), /* G_UNICODE_SCRIPT_ORIYA */ |
1362 | | PACK ('R','u','n','r'), /* G_UNICODE_SCRIPT_RUNIC */ |
1363 | | PACK ('S','i','n','h'), /* G_UNICODE_SCRIPT_SINHALA */ |
1364 | | PACK ('S','y','r','c'), /* G_UNICODE_SCRIPT_SYRIAC */ |
1365 | | PACK ('T','a','m','l'), /* G_UNICODE_SCRIPT_TAMIL */ |
1366 | | PACK ('T','e','l','u'), /* G_UNICODE_SCRIPT_TELUGU */ |
1367 | | PACK ('T','h','a','a'), /* G_UNICODE_SCRIPT_THAANA */ |
1368 | | PACK ('T','h','a','i'), /* G_UNICODE_SCRIPT_THAI */ |
1369 | | PACK ('T','i','b','t'), /* G_UNICODE_SCRIPT_TIBETAN */ |
1370 | | PACK ('C','a','n','s'), /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */ |
1371 | | PACK ('Y','i','i','i'), /* G_UNICODE_SCRIPT_YI */ |
1372 | | PACK ('T','g','l','g'), /* G_UNICODE_SCRIPT_TAGALOG */ |
1373 | | PACK ('H','a','n','o'), /* G_UNICODE_SCRIPT_HANUNOO */ |
1374 | | PACK ('B','u','h','d'), /* G_UNICODE_SCRIPT_BUHID */ |
1375 | | PACK ('T','a','g','b'), /* G_UNICODE_SCRIPT_TAGBANWA */ |
1376 | | |
1377 | | /* Unicode-4.0 additions */ |
1378 | | PACK ('B','r','a','i'), /* G_UNICODE_SCRIPT_BRAILLE */ |
1379 | | PACK ('C','p','r','t'), /* G_UNICODE_SCRIPT_CYPRIOT */ |
1380 | | PACK ('L','i','m','b'), /* G_UNICODE_SCRIPT_LIMBU */ |
1381 | | PACK ('O','s','m','a'), /* G_UNICODE_SCRIPT_OSMANYA */ |
1382 | | PACK ('S','h','a','w'), /* G_UNICODE_SCRIPT_SHAVIAN */ |
1383 | | PACK ('L','i','n','b'), /* G_UNICODE_SCRIPT_LINEAR_B */ |
1384 | | PACK ('T','a','l','e'), /* G_UNICODE_SCRIPT_TAI_LE */ |
1385 | | PACK ('U','g','a','r'), /* G_UNICODE_SCRIPT_UGARITIC */ |
1386 | | |
1387 | | /* Unicode-4.1 additions */ |
1388 | | PACK ('T','a','l','u'), /* G_UNICODE_SCRIPT_NEW_TAI_LUE */ |
1389 | | PACK ('B','u','g','i'), /* G_UNICODE_SCRIPT_BUGINESE */ |
1390 | | PACK ('G','l','a','g'), /* G_UNICODE_SCRIPT_GLAGOLITIC */ |
1391 | | PACK ('T','f','n','g'), /* G_UNICODE_SCRIPT_TIFINAGH */ |
1392 | | PACK ('S','y','l','o'), /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */ |
1393 | | PACK ('X','p','e','o'), /* G_UNICODE_SCRIPT_OLD_PERSIAN */ |
1394 | | PACK ('K','h','a','r'), /* G_UNICODE_SCRIPT_KHAROSHTHI */ |
1395 | | |
1396 | | /* Unicode-5.0 additions */ |
1397 | | PACK ('Z','z','z','z'), /* G_UNICODE_SCRIPT_UNKNOWN */ |
1398 | | PACK ('B','a','l','i'), /* G_UNICODE_SCRIPT_BALINESE */ |
1399 | | PACK ('X','s','u','x'), /* G_UNICODE_SCRIPT_CUNEIFORM */ |
1400 | | PACK ('P','h','n','x'), /* G_UNICODE_SCRIPT_PHOENICIAN */ |
1401 | | PACK ('P','h','a','g'), /* G_UNICODE_SCRIPT_PHAGS_PA */ |
1402 | | PACK ('N','k','o','o'), /* G_UNICODE_SCRIPT_NKO */ |
1403 | | |
1404 | | /* Unicode-5.1 additions */ |
1405 | | PACK ('K','a','l','i'), /* G_UNICODE_SCRIPT_KAYAH_LI */ |
1406 | | PACK ('L','e','p','c'), /* G_UNICODE_SCRIPT_LEPCHA */ |
1407 | | PACK ('R','j','n','g'), /* G_UNICODE_SCRIPT_REJANG */ |
1408 | | PACK ('S','u','n','d'), /* G_UNICODE_SCRIPT_SUNDANESE */ |
1409 | | PACK ('S','a','u','r'), /* G_UNICODE_SCRIPT_SAURASHTRA */ |
1410 | | PACK ('C','h','a','m'), /* G_UNICODE_SCRIPT_CHAM */ |
1411 | | PACK ('O','l','c','k'), /* G_UNICODE_SCRIPT_OL_CHIKI */ |
1412 | | PACK ('V','a','i','i'), /* G_UNICODE_SCRIPT_VAI */ |
1413 | | PACK ('C','a','r','i'), /* G_UNICODE_SCRIPT_CARIAN */ |
1414 | | PACK ('L','y','c','i'), /* G_UNICODE_SCRIPT_LYCIAN */ |
1415 | | PACK ('L','y','d','i'), /* G_UNICODE_SCRIPT_LYDIAN */ |
1416 | | |
1417 | | /* Unicode-5.2 additions */ |
1418 | | PACK ('A','v','s','t'), /* G_UNICODE_SCRIPT_AVESTAN */ |
1419 | | PACK ('B','a','m','u'), /* G_UNICODE_SCRIPT_BAMUM */ |
1420 | | PACK ('E','g','y','p'), /* G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS */ |
1421 | | PACK ('A','r','m','i'), /* G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC */ |
1422 | | PACK ('P','h','l','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI */ |
1423 | | PACK ('P','r','t','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN */ |
1424 | | PACK ('J','a','v','a'), /* G_UNICODE_SCRIPT_JAVANESE */ |
1425 | | PACK ('K','t','h','i'), /* G_UNICODE_SCRIPT_KAITHI */ |
1426 | | PACK ('L','i','s','u'), /* G_UNICODE_SCRIPT_LISU */ |
1427 | | PACK ('M','t','e','i'), /* G_UNICODE_SCRIPT_MEETEI_MAYEK */ |
1428 | | PACK ('S','a','r','b'), /* G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN */ |
1429 | | PACK ('O','r','k','h'), /* G_UNICODE_SCRIPT_OLD_TURKIC */ |
1430 | | PACK ('S','a','m','r'), /* G_UNICODE_SCRIPT_SAMARITAN */ |
1431 | | PACK ('L','a','n','a'), /* G_UNICODE_SCRIPT_TAI_THAM */ |
1432 | | PACK ('T','a','v','t'), /* G_UNICODE_SCRIPT_TAI_VIET */ |
1433 | | |
1434 | | /* Unicode-6.0 additions */ |
1435 | | PACK ('B','a','t','k'), /* G_UNICODE_SCRIPT_BATAK */ |
1436 | | PACK ('B','r','a','h'), /* G_UNICODE_SCRIPT_BRAHMI */ |
1437 | | PACK ('M','a','n','d'), /* G_UNICODE_SCRIPT_MANDAIC */ |
1438 | | |
1439 | | /* Unicode-6.1 additions */ |
1440 | | PACK ('C','a','k','m'), /* G_UNICODE_SCRIPT_CHAKMA */ |
1441 | | PACK ('M','e','r','c'), /* G_UNICODE_SCRIPT_MEROITIC_CURSIVE */ |
1442 | | PACK ('M','e','r','o'), /* G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS */ |
1443 | | PACK ('P','l','r','d'), /* G_UNICODE_SCRIPT_MIAO */ |
1444 | | PACK ('S','h','r','d'), /* G_UNICODE_SCRIPT_SHARADA */ |
1445 | | PACK ('S','o','r','a'), /* G_UNICODE_SCRIPT_SORA_SOMPENG */ |
1446 | | PACK ('T','a','k','r'), /* G_UNICODE_SCRIPT_TAKRI */ |
1447 | | |
1448 | | /* Unicode 7.0 additions */ |
1449 | | PACK ('B','a','s','s'), /* G_UNICODE_SCRIPT_BASSA_VAH */ |
1450 | | PACK ('A','g','h','b'), /* G_UNICODE_SCRIPT_CAUCASIAN_ALBANIAN */ |
1451 | | PACK ('D','u','p','l'), /* G_UNICODE_SCRIPT_DUPLOYAN */ |
1452 | | PACK ('E','l','b','a'), /* G_UNICODE_SCRIPT_ELBASAN */ |
1453 | | PACK ('G','r','a','n'), /* G_UNICODE_SCRIPT_GRANTHA */ |
1454 | | PACK ('K','h','o','j'), /* G_UNICODE_SCRIPT_KHOJKI*/ |
1455 | | PACK ('S','i','n','d'), /* G_UNICODE_SCRIPT_KHUDAWADI */ |
1456 | | PACK ('L','i','n','a'), /* G_UNICODE_SCRIPT_LINEAR_A */ |
1457 | | PACK ('M','a','h','j'), /* G_UNICODE_SCRIPT_MAHAJANI */ |
1458 | | PACK ('M','a','n','i'), /* G_UNICODE_SCRIPT_MANICHAEAN */ |
1459 | | PACK ('M','e','n','d'), /* G_UNICODE_SCRIPT_MENDE_KIKAKUI */ |
1460 | | PACK ('M','o','d','i'), /* G_UNICODE_SCRIPT_MODI */ |
1461 | | PACK ('M','r','o','o'), /* G_UNICODE_SCRIPT_MRO */ |
1462 | | PACK ('N','b','a','t'), /* G_UNICODE_SCRIPT_NABATAEAN */ |
1463 | | PACK ('N','a','r','b'), /* G_UNICODE_SCRIPT_OLD_NORTH_ARABIAN */ |
1464 | | PACK ('P','e','r','m'), /* G_UNICODE_SCRIPT_OLD_PERMIC */ |
1465 | | PACK ('H','m','n','g'), /* G_UNICODE_SCRIPT_PAHAWH_HMONG */ |
1466 | | PACK ('P','a','l','m'), /* G_UNICODE_SCRIPT_PALMYRENE */ |
1467 | | PACK ('P','a','u','c'), /* G_UNICODE_SCRIPT_PAU_CIN_HAU */ |
1468 | | PACK ('P','h','l','p'), /* G_UNICODE_SCRIPT_PSALTER_PAHLAVI */ |
1469 | | PACK ('S','i','d','d'), /* G_UNICODE_SCRIPT_SIDDHAM */ |
1470 | | PACK ('T','i','r','h'), /* G_UNICODE_SCRIPT_TIRHUTA */ |
1471 | | PACK ('W','a','r','a'), /* G_UNICODE_SCRIPT_WARANG_CITI */ |
1472 | | |
1473 | | /* Unicode 8.0 additions */ |
1474 | | PACK ('A','h','o','m'), /* G_UNICODE_SCRIPT_AHOM */ |
1475 | | PACK ('H','l','u','w'), /* G_UNICODE_SCRIPT_ANATOLIAN_HIEROGLYPHS */ |
1476 | | PACK ('H','a','t','r'), /* G_UNICODE_SCRIPT_HATRAN */ |
1477 | | PACK ('M','u','l','t'), /* G_UNICODE_SCRIPT_MULTANI */ |
1478 | | PACK ('H','u','n','g'), /* G_UNICODE_SCRIPT_OLD_HUNGARIAN */ |
1479 | | PACK ('S','g','n','w'), /* G_UNICODE_SCRIPT_SIGNWRITING */ |
1480 | | |
1481 | | /* Unicode 9.0 additions */ |
1482 | | PACK ('A','d','l','m'), /* G_UNICODE_SCRIPT_ADLAM */ |
1483 | | PACK ('B','h','k','s'), /* G_UNICODE_SCRIPT_BHAIKSUKI */ |
1484 | | PACK ('M','a','r','c'), /* G_UNICODE_SCRIPT_MARCHEN */ |
1485 | | PACK ('N','e','w','a'), /* G_UNICODE_SCRIPT_NEWA */ |
1486 | | PACK ('O','s','g','e'), /* G_UNICODE_SCRIPT_OSAGE */ |
1487 | | PACK ('T','a','n','g'), /* G_UNICODE_SCRIPT_TANGUT */ |
1488 | | |
1489 | | /* Unicode 10.0 additions */ |
1490 | | PACK ('G','o','n','m'), /* G_UNICODE_SCRIPT_MASARAM_GONDI */ |
1491 | | PACK ('N','s','h','u'), /* G_UNICODE_SCRIPT_NUSHU */ |
1492 | | PACK ('S','o','y','o'), /* G_UNICODE_SCRIPT_SOYOMBO */ |
1493 | | PACK ('Z','a','n','b'), /* G_UNICODE_SCRIPT_ZANABAZAR_SQUARE */ |
1494 | | |
1495 | | /* Unicode 11.0 additions */ |
1496 | | PACK ('D','o','g','r'), /* G_UNICODE_SCRIPT_DOGRA */ |
1497 | | PACK ('G','o','n','g'), /* G_UNICODE_SCRIPT_GUNJALA_GONDI */ |
1498 | | PACK ('R','o','h','g'), /* G_UNICODE_SCRIPT_HANIFI_ROHINGYA */ |
1499 | | PACK ('M','a','k','a'), /* G_UNICODE_SCRIPT_MAKASAR */ |
1500 | | PACK ('M','e','d','f'), /* G_UNICODE_SCRIPT_MEDEFAIDRIN */ |
1501 | | PACK ('S','o','g','o'), /* G_UNICODE_SCRIPT_OLD_SOGDIAN */ |
1502 | | PACK ('S','o','g','d'), /* G_UNICODE_SCRIPT_SOGDIAN */ |
1503 | | |
1504 | | /* Unicode 12.0 additions */ |
1505 | | PACK ('E','l','y','m'), /* G_UNICODE_SCRIPT_ELYMAIC */ |
1506 | | PACK ('N','a','n','d'), /* G_UNICODE_SCRIPT_NANDINAGARI */ |
1507 | | PACK ('H','m','n','p'), /* G_UNICODE_SCRIPT_NYIAKENG_PUACHUE_HMONG */ |
1508 | | PACK ('W','c','h','o'), /* G_UNICODE_SCRIPT_WANCHO */ |
1509 | | |
1510 | | /* Unicode 13.0 additions */ |
1511 | | PACK ('C', 'h', 'r', 's'), /* G_UNICODE_SCRIPT_CHORASMIAN */ |
1512 | | PACK ('D', 'i', 'a', 'k'), /* G_UNICODE_SCRIPT_DIVES_AKURU */ |
1513 | | PACK ('K', 'i', 't', 's'), /* G_UNICODE_SCRIPT_KHITAN_SMALL_SCRIPT */ |
1514 | | PACK ('Y', 'e', 'z', 'i'), /* G_UNICODE_SCRIPT_YEZIDI */ |
1515 | | |
1516 | | /* Unicode 14.0 additions */ |
1517 | | PACK ('C', 'p', 'm', 'n'), /* G_UNICODE_SCRIPT_CYPRO_MINOAN */ |
1518 | | PACK ('O', 'u', 'g', 'r'), /* G_UNICODE_SCRIPT_OLD_UYHUR */ |
1519 | | PACK ('T', 'n', 's', 'a'), /* G_UNICODE_SCRIPT_TANGSA */ |
1520 | | PACK ('T', 'o', 't', 'o'), /* G_UNICODE_SCRIPT_TOTO */ |
1521 | | PACK ('V', 'i', 't', 'h'), /* G_UNICODE_SCRIPT_VITHKUQI */ |
1522 | | |
1523 | | /* not really a Unicode script, but part of ISO 15924 */ |
1524 | | PACK ('Z', 'm', 't', 'h'), /* G_UNICODE_SCRIPT_MATH */ |
1525 | | |
1526 | | /* Unicode 15.0 additions */ |
1527 | | PACK ('K', 'a', 'w', 'i'), /* G_UNICODE_SCRIPT_KAWI */ |
1528 | | PACK ('N', 'a', 'g', 'm'), /* G_UNICODE_SCRIPT_NAG_MUNDARI */ |
1529 | | |
1530 | | /* Unicode 16.0 additions */ |
1531 | | PACK ('T', 'o', 'd', 'r'), /* G_UNICODE_SCRIPT_TODHRI */ |
1532 | | PACK ('G', 'a', 'r', 'a'), /* G_UNICODE_SCRIPT_GARAY */ |
1533 | | PACK ('T', 'u', 't', 'g'), /* G_UNICODE_SCRIPT_TULU_TIGALARI */ |
1534 | | PACK ('S', 'u', 'n', 'u'), /* G_UNICODE_SCRIPT_SUNUWAR */ |
1535 | | PACK ('G', 'u', 'k', 'h'), /* G_UNICODE_SCRIPT_GURUNG_KHEMA */ |
1536 | | PACK ('K', 'r', 'a', 'i'), /* G_UNICODE_SCRIPT_KIRAT_RAI */ |
1537 | | PACK ('O', 'n', 'a', 'o'), /* G_UNICODE_SCRIPT_OL_ONAL */ |
1538 | | |
1539 | | #undef PACK |
1540 | | }; |
1541 | | |
1542 | | /** |
1543 | | * g_unicode_script_to_iso15924: |
1544 | | * @script: a Unicode script |
1545 | | * |
1546 | | * Looks up the ISO 15924 code for @script. ISO 15924 assigns four-letter |
1547 | | * codes to scripts. For example, the code for Arabic is 'Arab'. The |
1548 | | * four letter codes are encoded as a @guint32 by this function in a |
1549 | | * big-endian fashion. That is, the code returned for Arabic is |
1550 | | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
1551 | | * |
1552 | | * See |
1553 | | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
1554 | | * for details. |
1555 | | * |
1556 | | * Returns: the ISO 15924 code for @script, encoded as an integer, |
1557 | | * of zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or |
1558 | | * ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not understood. |
1559 | | * |
1560 | | * Since: 2.30 |
1561 | | */ |
1562 | | guint32 |
1563 | | g_unicode_script_to_iso15924 (GUnicodeScript script) |
1564 | 0 | { |
1565 | 0 | if (G_UNLIKELY (script == G_UNICODE_SCRIPT_INVALID_CODE)) |
1566 | 0 | return 0; |
1567 | | |
1568 | 0 | if (G_UNLIKELY (script < 0 || script >= (int) G_N_ELEMENTS (iso15924_tags))) |
1569 | 0 | return 0x5A7A7A7A; |
1570 | | |
1571 | 0 | return iso15924_tags[script]; |
1572 | 0 | } |
1573 | | |
1574 | | /** |
1575 | | * g_unicode_script_from_iso15924: |
1576 | | * @iso15924: a Unicode script |
1577 | | * |
1578 | | * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter |
1579 | | * codes to scripts. For example, the code for Arabic is 'Arab'. |
1580 | | * This function accepts four letter codes encoded as a @guint32 in a |
1581 | | * big-endian fashion. That is, the code expected for Arabic is |
1582 | | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
1583 | | * |
1584 | | * See |
1585 | | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
1586 | | * for details. |
1587 | | * |
1588 | | * Returns: the Unicode script for @iso15924, or |
1589 | | * of %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and |
1590 | | * %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown. |
1591 | | * |
1592 | | * Since: 2.30 |
1593 | | */ |
1594 | | GUnicodeScript |
1595 | | g_unicode_script_from_iso15924 (guint32 iso15924) |
1596 | 0 | { |
1597 | 0 | unsigned int i; |
1598 | |
|
1599 | 0 | if (!iso15924) |
1600 | 0 | return G_UNICODE_SCRIPT_INVALID_CODE; |
1601 | | |
1602 | 0 | for (i = 0; i < G_N_ELEMENTS (iso15924_tags); i++) |
1603 | 0 | if (iso15924_tags[i] == iso15924) |
1604 | 0 | return (GUnicodeScript) i; |
1605 | | |
1606 | 0 | return G_UNICODE_SCRIPT_UNKNOWN; |
1607 | 0 | } |