/src/glib/glib/guniprop.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* guniprop.c - Unicode character properties. |
2 | | * |
3 | | * Copyright (C) 1999 Tom Tromey |
4 | | * Copyright (C) 2000 Red Hat, Inc. |
5 | | * |
6 | | * This library is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * This library is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public |
17 | | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | | */ |
19 | | |
20 | | #include "config.h" |
21 | | |
22 | | #include <stdlib.h> |
23 | | #include <stddef.h> |
24 | | #include <string.h> |
25 | | #include <locale.h> |
26 | | |
27 | | #include "gmem.h" |
28 | | #include "gstring.h" |
29 | | #include "gtestutils.h" |
30 | | #include "gtypes.h" |
31 | | #include "gunicode.h" |
32 | | #include "gunichartables.h" |
33 | | #include "gmirroringtable.h" |
34 | | #include "gscripttable.h" |
35 | | #include "gunicodeprivate.h" |
36 | | #ifdef G_OS_WIN32 |
37 | | #include "gwin32.h" |
38 | | #endif |
39 | | |
40 | 0 | #define G_UNICHAR_FULLWIDTH_A 0xff21 |
41 | 0 | #define G_UNICHAR_FULLWIDTH_I 0xff29 |
42 | 0 | #define G_UNICHAR_FULLWIDTH_J 0xff2a |
43 | 0 | #define G_UNICHAR_FULLWIDTH_F 0xff26 |
44 | 0 | #define G_UNICHAR_FULLWIDTH_a 0xff41 |
45 | 0 | #define G_UNICHAR_FULLWIDTH_f 0xff46 |
46 | | |
47 | 0 | #define ATTR_TABLE(Page) (((Page) <= G_UNICODE_LAST_PAGE_PART1) \ |
48 | 0 | ? attr_table_part1[Page] \ |
49 | 0 | : attr_table_part2[(Page) - 0xe00]) |
50 | | |
51 | | #define ATTTABLE(Page, Char) \ |
52 | 0 | ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) ? 0 : (attr_data[ATTR_TABLE(Page)][Char])) |
53 | | |
54 | | #define TTYPE_PART1(Page, Char) \ |
55 | 0 | ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
56 | 0 | ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
57 | 0 | : (type_data[type_table_part1[Page]][Char])) |
58 | | |
59 | | #define TTYPE_PART2(Page, Char) \ |
60 | 0 | ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
61 | 0 | ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
62 | 0 | : (type_data[type_table_part2[Page]][Char])) |
63 | | |
64 | | #define TYPE(Char) \ |
65 | 0 | (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ |
66 | 0 | ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \ |
67 | 0 | : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ |
68 | 0 | ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ |
69 | 0 | : G_UNICODE_UNASSIGNED)) |
70 | | |
71 | | |
72 | 0 | #define IS(Type, Class) (((guint)1 << (Type)) & (Class)) |
73 | | #define OR(Type, Rest) (((guint)1 << (Type)) | (Rest)) |
74 | | |
75 | | |
76 | | |
77 | 0 | #define ISALPHA(Type) IS ((Type), \ |
78 | 0 | OR (G_UNICODE_LOWERCASE_LETTER, \ |
79 | 0 | OR (G_UNICODE_UPPERCASE_LETTER, \ |
80 | 0 | OR (G_UNICODE_TITLECASE_LETTER, \ |
81 | 0 | OR (G_UNICODE_MODIFIER_LETTER, \ |
82 | 0 | OR (G_UNICODE_OTHER_LETTER, 0)))))) |
83 | | |
84 | 0 | #define ISALDIGIT(Type) IS ((Type), \ |
85 | 0 | OR (G_UNICODE_DECIMAL_NUMBER, \ |
86 | 0 | OR (G_UNICODE_LETTER_NUMBER, \ |
87 | 0 | OR (G_UNICODE_OTHER_NUMBER, \ |
88 | 0 | OR (G_UNICODE_LOWERCASE_LETTER, \ |
89 | 0 | OR (G_UNICODE_UPPERCASE_LETTER, \ |
90 | 0 | OR (G_UNICODE_TITLECASE_LETTER, \ |
91 | 0 | OR (G_UNICODE_MODIFIER_LETTER, \ |
92 | 0 | OR (G_UNICODE_OTHER_LETTER, 0))))))))) |
93 | | |
94 | 0 | #define ISMARK(Type) IS ((Type), \ |
95 | 0 | OR (G_UNICODE_NON_SPACING_MARK, \ |
96 | 0 | OR (G_UNICODE_SPACING_MARK, \ |
97 | 0 | OR (G_UNICODE_ENCLOSING_MARK, 0)))) |
98 | | |
99 | | #define ISZEROWIDTHTYPE(Type) IS ((Type), \ |
100 | | OR (G_UNICODE_NON_SPACING_MARK, \ |
101 | | OR (G_UNICODE_ENCLOSING_MARK, \ |
102 | | OR (G_UNICODE_FORMAT, 0)))) |
103 | | |
104 | | /** |
105 | | * g_unichar_isalnum: |
106 | | * @c: a Unicode character |
107 | | * |
108 | | * Determines whether a character is alphanumeric. |
109 | | * Given some UTF-8 text, obtain a character value |
110 | | * with g_utf8_get_char(). |
111 | | * |
112 | | * Returns: %TRUE if @c is an alphanumeric character |
113 | | **/ |
114 | | gboolean |
115 | | g_unichar_isalnum (gunichar c) |
116 | 0 | { |
117 | 0 | return ISALDIGIT (TYPE (c)) ? TRUE : FALSE; |
118 | 0 | } |
119 | | |
120 | | /** |
121 | | * g_unichar_isalpha: |
122 | | * @c: a Unicode character |
123 | | * |
124 | | * Determines whether a character is alphabetic (i.e. a letter). |
125 | | * Given some UTF-8 text, obtain a character value with |
126 | | * g_utf8_get_char(). |
127 | | * |
128 | | * Returns: %TRUE if @c is an alphabetic character |
129 | | **/ |
130 | | gboolean |
131 | | g_unichar_isalpha (gunichar c) |
132 | 0 | { |
133 | 0 | return ISALPHA (TYPE (c)) ? TRUE : FALSE; |
134 | 0 | } |
135 | | |
136 | | |
137 | | /** |
138 | | * g_unichar_iscntrl: |
139 | | * @c: a Unicode character |
140 | | * |
141 | | * Determines whether a character is a control character. |
142 | | * Given some UTF-8 text, obtain a character value with |
143 | | * g_utf8_get_char(). |
144 | | * |
145 | | * Returns: %TRUE if @c is a control character |
146 | | **/ |
147 | | gboolean |
148 | | g_unichar_iscntrl (gunichar c) |
149 | 0 | { |
150 | 0 | return TYPE (c) == G_UNICODE_CONTROL; |
151 | 0 | } |
152 | | |
153 | | /** |
154 | | * g_unichar_isdigit: |
155 | | * @c: a Unicode character |
156 | | * |
157 | | * Determines whether a character is numeric (i.e. a digit). This |
158 | | * covers ASCII 0-9 and also digits in other languages/scripts. Given |
159 | | * some UTF-8 text, obtain a character value with g_utf8_get_char(). |
160 | | * |
161 | | * Returns: %TRUE if @c is a digit |
162 | | **/ |
163 | | gboolean |
164 | | g_unichar_isdigit (gunichar c) |
165 | 0 | { |
166 | 0 | return TYPE (c) == G_UNICODE_DECIMAL_NUMBER; |
167 | 0 | } |
168 | | |
169 | | |
170 | | /** |
171 | | * g_unichar_isgraph: |
172 | | * @c: a Unicode character |
173 | | * |
174 | | * Determines whether a character is printable and not a space |
175 | | * (returns %FALSE for control characters, format characters, and |
176 | | * spaces). g_unichar_isprint() is similar, but returns %TRUE for |
177 | | * spaces. Given some UTF-8 text, obtain a character value with |
178 | | * g_utf8_get_char(). |
179 | | * |
180 | | * Returns: %TRUE if @c is printable unless it's a space |
181 | | **/ |
182 | | gboolean |
183 | | g_unichar_isgraph (gunichar c) |
184 | 0 | { |
185 | 0 | return !IS (TYPE(c), |
186 | 0 | OR (G_UNICODE_CONTROL, |
187 | 0 | OR (G_UNICODE_FORMAT, |
188 | 0 | OR (G_UNICODE_UNASSIGNED, |
189 | 0 | OR (G_UNICODE_SURROGATE, |
190 | 0 | OR (G_UNICODE_SPACE_SEPARATOR, |
191 | 0 | 0)))))); |
192 | 0 | } |
193 | | |
194 | | /** |
195 | | * g_unichar_islower: |
196 | | * @c: a Unicode character |
197 | | * |
198 | | * Determines whether a character is a lowercase letter. |
199 | | * Given some UTF-8 text, obtain a character value with |
200 | | * g_utf8_get_char(). |
201 | | * |
202 | | * Returns: %TRUE if @c is a lowercase letter |
203 | | **/ |
204 | | gboolean |
205 | | g_unichar_islower (gunichar c) |
206 | 0 | { |
207 | 0 | return TYPE (c) == G_UNICODE_LOWERCASE_LETTER; |
208 | 0 | } |
209 | | |
210 | | |
211 | | /** |
212 | | * g_unichar_isprint: |
213 | | * @c: a Unicode character |
214 | | * |
215 | | * Determines whether a character is printable. |
216 | | * Unlike g_unichar_isgraph(), returns %TRUE for spaces. |
217 | | * Given some UTF-8 text, obtain a character value with |
218 | | * g_utf8_get_char(). |
219 | | * |
220 | | * Returns: %TRUE if @c is printable |
221 | | **/ |
222 | | gboolean |
223 | | g_unichar_isprint (gunichar c) |
224 | 0 | { |
225 | 0 | return !IS (TYPE(c), |
226 | 0 | OR (G_UNICODE_CONTROL, |
227 | 0 | OR (G_UNICODE_FORMAT, |
228 | 0 | OR (G_UNICODE_UNASSIGNED, |
229 | 0 | OR (G_UNICODE_SURROGATE, |
230 | 0 | 0))))); |
231 | 0 | } |
232 | | |
233 | | /** |
234 | | * g_unichar_ispunct: |
235 | | * @c: a Unicode character |
236 | | * |
237 | | * Determines whether a character is punctuation or a symbol. |
238 | | * Given some UTF-8 text, obtain a character value with |
239 | | * g_utf8_get_char(). |
240 | | * |
241 | | * Returns: %TRUE if @c is a punctuation or symbol character |
242 | | **/ |
243 | | gboolean |
244 | | g_unichar_ispunct (gunichar c) |
245 | 0 | { |
246 | 0 | return IS (TYPE(c), |
247 | 0 | OR (G_UNICODE_CONNECT_PUNCTUATION, |
248 | 0 | OR (G_UNICODE_DASH_PUNCTUATION, |
249 | 0 | OR (G_UNICODE_CLOSE_PUNCTUATION, |
250 | 0 | OR (G_UNICODE_FINAL_PUNCTUATION, |
251 | 0 | OR (G_UNICODE_INITIAL_PUNCTUATION, |
252 | 0 | OR (G_UNICODE_OTHER_PUNCTUATION, |
253 | 0 | OR (G_UNICODE_OPEN_PUNCTUATION, |
254 | 0 | OR (G_UNICODE_CURRENCY_SYMBOL, |
255 | 0 | OR (G_UNICODE_MODIFIER_SYMBOL, |
256 | 0 | OR (G_UNICODE_MATH_SYMBOL, |
257 | 0 | OR (G_UNICODE_OTHER_SYMBOL, |
258 | 0 | 0)))))))))))) ? TRUE : FALSE; |
259 | 0 | } |
260 | | |
261 | | /** |
262 | | * g_unichar_isspace: |
263 | | * @c: a Unicode character |
264 | | * |
265 | | * Determines whether a character is a space, tab, or line separator |
266 | | * (newline, carriage return, etc.). Given some UTF-8 text, obtain a |
267 | | * character value with g_utf8_get_char(). |
268 | | * |
269 | | * (Note: don't use this to do word breaking; you have to use |
270 | | * Pango or equivalent to get word breaking right, the algorithm |
271 | | * is fairly complex.) |
272 | | * |
273 | | * Returns: %TRUE if @c is a space character |
274 | | **/ |
275 | | gboolean |
276 | | g_unichar_isspace (gunichar c) |
277 | 0 | { |
278 | 0 | switch (c) |
279 | 0 | { |
280 | | /* special-case these since Unicode thinks they are not spaces */ |
281 | 0 | case '\t': |
282 | 0 | case '\n': |
283 | 0 | case '\r': |
284 | 0 | case '\f': |
285 | 0 | return TRUE; |
286 | 0 | break; |
287 | | |
288 | 0 | default: |
289 | 0 | { |
290 | 0 | return IS (TYPE(c), |
291 | 0 | OR (G_UNICODE_SPACE_SEPARATOR, |
292 | 0 | OR (G_UNICODE_LINE_SEPARATOR, |
293 | 0 | OR (G_UNICODE_PARAGRAPH_SEPARATOR, |
294 | 0 | 0)))) ? TRUE : FALSE; |
295 | 0 | } |
296 | 0 | break; |
297 | 0 | } |
298 | 0 | } |
299 | | |
300 | | /** |
301 | | * g_unichar_ismark: |
302 | | * @c: a Unicode character |
303 | | * |
304 | | * Determines whether a character is a mark (non-spacing mark, |
305 | | * combining mark, or enclosing mark in Unicode speak). |
306 | | * Given some UTF-8 text, obtain a character value |
307 | | * with g_utf8_get_char(). |
308 | | * |
309 | | * Note: in most cases where isalpha characters are allowed, |
310 | | * ismark characters should be allowed too, as they are essential |
311 | | * for writing most European languages as well as many non-Latin |
312 | | * scripts. |
313 | | * |
314 | | * Returns: %TRUE if @c is a mark character |
315 | | * |
316 | | * Since: 2.14 |
317 | | **/ |
318 | | gboolean |
319 | | g_unichar_ismark (gunichar c) |
320 | 0 | { |
321 | 0 | return ISMARK (TYPE (c)); |
322 | 0 | } |
323 | | |
324 | | /** |
325 | | * g_unichar_isupper: |
326 | | * @c: a Unicode character |
327 | | * |
328 | | * Determines if a character is uppercase. |
329 | | * |
330 | | * Returns: %TRUE if @c is an uppercase character |
331 | | **/ |
332 | | gboolean |
333 | | g_unichar_isupper (gunichar c) |
334 | 0 | { |
335 | 0 | return TYPE (c) == G_UNICODE_UPPERCASE_LETTER; |
336 | 0 | } |
337 | | |
338 | | /** |
339 | | * g_unichar_istitle: |
340 | | * @c: a Unicode character |
341 | | * |
342 | | * Determines if a character is titlecase. Some characters in |
343 | | * Unicode which are composites, such as the DZ digraph |
344 | | * have three case variants instead of just two. The titlecase |
345 | | * form is used at the beginning of a word where only the |
346 | | * first letter is capitalized. The titlecase form of the DZ |
347 | | * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z. |
348 | | * |
349 | | * Returns: %TRUE if the character is titlecase |
350 | | **/ |
351 | | gboolean |
352 | | g_unichar_istitle (gunichar c) |
353 | 0 | { |
354 | 0 | unsigned int i; |
355 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
356 | 0 | if (title_table[i][0] == c) |
357 | 0 | return TRUE; |
358 | 0 | return FALSE; |
359 | 0 | } |
360 | | |
361 | | /** |
362 | | * g_unichar_isxdigit: |
363 | | * @c: a Unicode character. |
364 | | * |
365 | | * Determines if a character is a hexadecimal digit. |
366 | | * |
367 | | * Returns: %TRUE if the character is a hexadecimal digit |
368 | | **/ |
369 | | gboolean |
370 | | g_unichar_isxdigit (gunichar c) |
371 | 0 | { |
372 | 0 | return ((c >= 'a' && c <= 'f') || |
373 | 0 | (c >= 'A' && c <= 'F') || |
374 | 0 | (c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f) || |
375 | 0 | (c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F) || |
376 | 0 | (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)); |
377 | 0 | } |
378 | | |
379 | | /** |
380 | | * g_unichar_isdefined: |
381 | | * @c: a Unicode character |
382 | | * |
383 | | * Determines if a given character is assigned in the Unicode |
384 | | * standard. |
385 | | * |
386 | | * Returns: %TRUE if the character has an assigned value |
387 | | **/ |
388 | | gboolean |
389 | | g_unichar_isdefined (gunichar c) |
390 | 0 | { |
391 | 0 | return !IS (TYPE(c), |
392 | 0 | OR (G_UNICODE_UNASSIGNED, |
393 | 0 | OR (G_UNICODE_SURROGATE, |
394 | 0 | 0))); |
395 | 0 | } |
396 | | |
397 | | /** |
398 | | * g_unichar_iszerowidth: |
399 | | * @c: a Unicode character |
400 | | * |
401 | | * Determines if a given character typically takes zero width when rendered. |
402 | | * The return value is %TRUE for all non-spacing and enclosing marks |
403 | | * (e.g., combining accents), format characters, zero-width |
404 | | * space, but not U+00AD SOFT HYPHEN. |
405 | | * |
406 | | * A typical use of this function is with one of g_unichar_iswide() or |
407 | | * g_unichar_iswide_cjk() to determine the number of cells a string occupies |
408 | | * when displayed on a grid display (terminals). However, note that not all |
409 | | * terminals support zero-width rendering of zero-width marks. |
410 | | * |
411 | | * Returns: %TRUE if the character has zero width |
412 | | * |
413 | | * Since: 2.14 |
414 | | **/ |
415 | | gboolean |
416 | | g_unichar_iszerowidth (gunichar c) |
417 | 0 | { |
418 | 0 | if (G_UNLIKELY (c == 0x00AD)) |
419 | 0 | return FALSE; |
420 | | |
421 | 0 | if (G_UNLIKELY (ISZEROWIDTHTYPE (TYPE (c)))) |
422 | 0 | return TRUE; |
423 | | |
424 | 0 | if (G_UNLIKELY ((c >= 0x1160 && c < 0x1200) || |
425 | 0 | c == 0x200B)) |
426 | 0 | return TRUE; |
427 | | |
428 | 0 | return FALSE; |
429 | 0 | } |
430 | | |
431 | | static int |
432 | | interval_compare (const void *key, const void *elt) |
433 | 0 | { |
434 | 0 | gunichar c = GPOINTER_TO_UINT (key); |
435 | 0 | struct Interval *interval = (struct Interval *)elt; |
436 | |
|
437 | 0 | if (c < interval->start) |
438 | 0 | return -1; |
439 | 0 | if (c > interval->end) |
440 | 0 | return +1; |
441 | | |
442 | 0 | return 0; |
443 | 0 | } |
444 | | |
445 | 0 | #define G_WIDTH_TABLE_MIDPOINT (G_N_ELEMENTS (g_unicode_width_table_wide) / 2) |
446 | | |
447 | | static inline gboolean |
448 | | g_unichar_iswide_bsearch (gunichar ch) |
449 | 0 | { |
450 | 0 | int lower = 0; |
451 | 0 | int upper = G_N_ELEMENTS (g_unicode_width_table_wide) - 1; |
452 | 0 | static int saved_mid = G_WIDTH_TABLE_MIDPOINT; |
453 | 0 | int mid = saved_mid; |
454 | |
|
455 | 0 | do |
456 | 0 | { |
457 | 0 | if (ch < g_unicode_width_table_wide[mid].start) |
458 | 0 | upper = mid - 1; |
459 | 0 | else if (ch > g_unicode_width_table_wide[mid].end) |
460 | 0 | lower = mid + 1; |
461 | 0 | else |
462 | 0 | return TRUE; |
463 | | |
464 | 0 | mid = (lower + upper) / 2; |
465 | 0 | } |
466 | 0 | while (lower <= upper); |
467 | | |
468 | 0 | return FALSE; |
469 | 0 | } |
470 | | |
471 | | /** |
472 | | * g_unichar_iswide: |
473 | | * @c: a Unicode character |
474 | | * |
475 | | * Determines if a character is typically rendered in a double-width |
476 | | * cell. |
477 | | * |
478 | | * Returns: %TRUE if the character is wide |
479 | | **/ |
480 | | gboolean |
481 | | g_unichar_iswide (gunichar c) |
482 | 0 | { |
483 | 0 | if (c < g_unicode_width_table_wide[0].start) |
484 | 0 | return FALSE; |
485 | 0 | else |
486 | 0 | return g_unichar_iswide_bsearch (c); |
487 | 0 | } |
488 | | |
489 | | |
490 | | /** |
491 | | * g_unichar_iswide_cjk: |
492 | | * @c: a Unicode character |
493 | | * |
494 | | * Determines if a character is typically rendered in a double-width |
495 | | * cell under legacy East Asian locales. If a character is wide according to |
496 | | * g_unichar_iswide(), then it is also reported wide with this function, but |
497 | | * the converse is not necessarily true. See the |
498 | | * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) |
499 | | * for details. |
500 | | * |
501 | | * If a character passes the g_unichar_iswide() test then it will also pass |
502 | | * this test, but not the other way around. Note that some characters may |
503 | | * pass both this test and g_unichar_iszerowidth(). |
504 | | * |
505 | | * Returns: %TRUE if the character is wide in legacy East Asian locales |
506 | | * |
507 | | * Since: 2.12 |
508 | | */ |
509 | | gboolean |
510 | | g_unichar_iswide_cjk (gunichar c) |
511 | 0 | { |
512 | 0 | if (g_unichar_iswide (c)) |
513 | 0 | return TRUE; |
514 | | |
515 | | /* bsearch() is declared attribute(nonnull(1)) so we can't validly search |
516 | | * for a NULL key */ |
517 | 0 | if (c == 0) |
518 | 0 | return FALSE; |
519 | | |
520 | 0 | if (bsearch (GUINT_TO_POINTER (c), |
521 | 0 | g_unicode_width_table_ambiguous, |
522 | 0 | G_N_ELEMENTS (g_unicode_width_table_ambiguous), |
523 | 0 | sizeof g_unicode_width_table_ambiguous[0], |
524 | 0 | interval_compare)) |
525 | 0 | return TRUE; |
526 | | |
527 | 0 | return FALSE; |
528 | 0 | } |
529 | | |
530 | | |
531 | | /** |
532 | | * g_unichar_toupper: |
533 | | * @c: a Unicode character |
534 | | * |
535 | | * Converts a character to uppercase. |
536 | | * |
537 | | * Returns: the result of converting @c to uppercase. |
538 | | * If @c is not a lowercase or titlecase character, |
539 | | * or has no upper case equivalent @c is returned unchanged. |
540 | | **/ |
541 | | gunichar |
542 | | g_unichar_toupper (gunichar c) |
543 | 0 | { |
544 | 0 | int t = TYPE (c); |
545 | 0 | if (t == G_UNICODE_LOWERCASE_LETTER) |
546 | 0 | { |
547 | 0 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
548 | 0 | if (val >= 0x1000000) |
549 | 0 | { |
550 | 0 | const gchar *p = special_case_table + val - 0x1000000; |
551 | 0 | val = g_utf8_get_char (p); |
552 | 0 | } |
553 | | /* Some lowercase letters, e.g., U+00AA, FEMININE ORDINAL INDICATOR, |
554 | | * do not have an uppercase equivalent, in which case val will be |
555 | | * zero. |
556 | | */ |
557 | 0 | return val ? val : c; |
558 | 0 | } |
559 | 0 | else if (t == G_UNICODE_TITLECASE_LETTER) |
560 | 0 | { |
561 | 0 | unsigned int i; |
562 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
563 | 0 | { |
564 | 0 | if (title_table[i][0] == c) |
565 | 0 | return title_table[i][1] ? title_table[i][1] : c; |
566 | 0 | } |
567 | 0 | } |
568 | 0 | return c; |
569 | 0 | } |
570 | | |
571 | | /** |
572 | | * g_unichar_tolower: |
573 | | * @c: a Unicode character. |
574 | | * |
575 | | * Converts a character to lower case. |
576 | | * |
577 | | * Returns: the result of converting @c to lower case. |
578 | | * If @c is not an uppercase or titlecase character, |
579 | | * or has no lowercase equivalent @c is returned unchanged. |
580 | | **/ |
581 | | gunichar |
582 | | g_unichar_tolower (gunichar c) |
583 | 0 | { |
584 | 0 | int t = TYPE (c); |
585 | 0 | if (t == G_UNICODE_UPPERCASE_LETTER) |
586 | 0 | { |
587 | 0 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
588 | 0 | if (val >= 0x1000000) |
589 | 0 | { |
590 | 0 | const gchar *p = special_case_table + val - 0x1000000; |
591 | 0 | return g_utf8_get_char (p); |
592 | 0 | } |
593 | 0 | else |
594 | 0 | { |
595 | | /* Not all uppercase letters are guaranteed to have a lowercase |
596 | | * equivalent. If this is the case, val will be zero. */ |
597 | 0 | return val ? val : c; |
598 | 0 | } |
599 | 0 | } |
600 | 0 | else if (t == G_UNICODE_TITLECASE_LETTER) |
601 | 0 | { |
602 | 0 | unsigned int i; |
603 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
604 | 0 | { |
605 | 0 | if (title_table[i][0] == c) |
606 | 0 | return title_table[i][2]; |
607 | 0 | } |
608 | 0 | } |
609 | 0 | return c; |
610 | 0 | } |
611 | | |
612 | | /** |
613 | | * g_unichar_totitle: |
614 | | * @c: a Unicode character |
615 | | * |
616 | | * Converts a character to titlecase. |
617 | | * |
618 | | * Returns: the result of converting @c to titlecase. |
619 | | * If @c is not an uppercase or lowercase character, |
620 | | * @c is returned unchanged. |
621 | | **/ |
622 | | gunichar |
623 | | g_unichar_totitle (gunichar c) |
624 | 0 | { |
625 | 0 | unsigned int i; |
626 | | |
627 | | /* We handle U+0000 explicitly because some elements in |
628 | | * title_table[i][1] may be null. */ |
629 | 0 | if (c == 0) |
630 | 0 | return c; |
631 | | |
632 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
633 | 0 | { |
634 | 0 | if (title_table[i][0] == c || title_table[i][1] == c |
635 | 0 | || title_table[i][2] == c) |
636 | 0 | return title_table[i][0]; |
637 | 0 | } |
638 | | |
639 | 0 | if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER) |
640 | 0 | return g_unichar_toupper (c); |
641 | | |
642 | 0 | return c; |
643 | 0 | } |
644 | | |
645 | | /** |
646 | | * g_unichar_digit_value: |
647 | | * @c: a Unicode character |
648 | | * |
649 | | * Determines the numeric value of a character as a decimal |
650 | | * digit. |
651 | | * |
652 | | * Returns: If @c is a decimal digit (according to |
653 | | * g_unichar_isdigit()), its numeric value. Otherwise, -1. |
654 | | **/ |
655 | | int |
656 | | g_unichar_digit_value (gunichar c) |
657 | 0 | { |
658 | 0 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
659 | 0 | return ATTTABLE (c >> 8, c & 0xff); |
660 | 0 | return -1; |
661 | 0 | } |
662 | | |
663 | | /** |
664 | | * g_unichar_xdigit_value: |
665 | | * @c: a Unicode character |
666 | | * |
667 | | * Determines the numeric value of a character as a hexadecimal |
668 | | * digit. |
669 | | * |
670 | | * Returns: If @c is a hex digit (according to |
671 | | * g_unichar_isxdigit()), its numeric value. Otherwise, -1. |
672 | | **/ |
673 | | int |
674 | | g_unichar_xdigit_value (gunichar c) |
675 | 0 | { |
676 | 0 | if (c >= 'A' && c <= 'F') |
677 | 0 | return c - 'A' + 10; |
678 | 0 | if (c >= 'a' && c <= 'f') |
679 | 0 | return c - 'a' + 10; |
680 | 0 | if (c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F) |
681 | 0 | return c - G_UNICHAR_FULLWIDTH_A + 10; |
682 | 0 | if (c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f) |
683 | 0 | return c - G_UNICHAR_FULLWIDTH_a + 10; |
684 | 0 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
685 | 0 | return ATTTABLE (c >> 8, c & 0xff); |
686 | 0 | return -1; |
687 | 0 | } |
688 | | |
689 | | /** |
690 | | * g_unichar_type: |
691 | | * @c: a Unicode character |
692 | | * |
693 | | * Classifies a Unicode character by type. |
694 | | * |
695 | | * Returns: the type of the character. |
696 | | **/ |
697 | | GUnicodeType |
698 | | g_unichar_type (gunichar c) |
699 | 0 | { |
700 | 0 | return TYPE (c); |
701 | 0 | } |
702 | | |
703 | | /* |
704 | | * Case mapping functions |
705 | | */ |
706 | | |
707 | | typedef enum { |
708 | | LOCALE_NORMAL, |
709 | | LOCALE_TURKIC, |
710 | | LOCALE_LITHUANIAN |
711 | | } LocaleType; |
712 | | |
713 | | static LocaleType |
714 | | get_locale_type (void) |
715 | 0 | { |
716 | | #ifdef G_OS_WIN32 |
717 | | char *tem = g_win32_getlocale (); |
718 | | char locale[2]; |
719 | | |
720 | | locale[0] = tem[0]; |
721 | | locale[1] = tem[1]; |
722 | | g_free (tem); |
723 | | #else |
724 | 0 | const char *locale = setlocale (LC_CTYPE, NULL); |
725 | |
|
726 | 0 | if (locale == NULL) |
727 | 0 | return LOCALE_NORMAL; |
728 | 0 | #endif |
729 | | |
730 | 0 | switch (locale[0]) |
731 | 0 | { |
732 | 0 | case 'a': |
733 | 0 | if (locale[1] == 'z') |
734 | 0 | return LOCALE_TURKIC; |
735 | 0 | break; |
736 | 0 | case 'l': |
737 | 0 | if (locale[1] == 't') |
738 | 0 | return LOCALE_LITHUANIAN; |
739 | 0 | break; |
740 | 0 | case 't': |
741 | 0 | if (locale[1] == 'r') |
742 | 0 | return LOCALE_TURKIC; |
743 | 0 | break; |
744 | 0 | } |
745 | | |
746 | 0 | return LOCALE_NORMAL; |
747 | 0 | } |
748 | | |
749 | | static gint |
750 | | output_marks (const char **p_inout, |
751 | | char *out_buffer, |
752 | | gboolean remove_dot) |
753 | 0 | { |
754 | 0 | const char *p = *p_inout; |
755 | 0 | gint len = 0; |
756 | | |
757 | 0 | while (*p) |
758 | 0 | { |
759 | 0 | gunichar c = g_utf8_get_char (p); |
760 | | |
761 | 0 | if (ISMARK (TYPE (c))) |
762 | 0 | { |
763 | 0 | if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */) |
764 | 0 | len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL); |
765 | 0 | p = g_utf8_next_char (p); |
766 | 0 | } |
767 | 0 | else |
768 | 0 | break; |
769 | 0 | } |
770 | |
|
771 | 0 | *p_inout = p; |
772 | 0 | return len; |
773 | 0 | } |
774 | | |
775 | | static gint |
776 | | output_special_case (gchar *out_buffer, |
777 | | int offset, |
778 | | int type, |
779 | | int which) |
780 | 0 | { |
781 | 0 | const gchar *p = special_case_table + offset; |
782 | 0 | gint len; |
783 | |
|
784 | 0 | if (type != G_UNICODE_TITLECASE_LETTER) |
785 | 0 | p = g_utf8_next_char (p); |
786 | |
|
787 | 0 | if (which == 1) |
788 | 0 | p += strlen (p) + 1; |
789 | |
|
790 | 0 | len = strlen (p); |
791 | 0 | if (out_buffer) |
792 | 0 | memcpy (out_buffer, p, len); |
793 | |
|
794 | 0 | return len; |
795 | 0 | } |
796 | | |
797 | | static gsize |
798 | | real_toupper (const gchar *str, |
799 | | gssize max_len, |
800 | | gchar *out_buffer, |
801 | | LocaleType locale_type) |
802 | 0 | { |
803 | 0 | const gchar *p = str; |
804 | 0 | const char *last = NULL; |
805 | 0 | gsize len = 0; |
806 | 0 | gboolean last_was_i = FALSE; |
807 | |
|
808 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
809 | 0 | { |
810 | 0 | gunichar c = g_utf8_get_char (p); |
811 | 0 | int t = TYPE (c); |
812 | 0 | gunichar val; |
813 | |
|
814 | 0 | last = p; |
815 | 0 | p = g_utf8_next_char (p); |
816 | |
|
817 | 0 | if (locale_type == LOCALE_LITHUANIAN) |
818 | 0 | { |
819 | 0 | if (c == 'i') |
820 | 0 | last_was_i = TRUE; |
821 | 0 | else |
822 | 0 | { |
823 | 0 | if (last_was_i) |
824 | 0 | { |
825 | | /* Nasty, need to remove any dot above. Though |
826 | | * I think only E WITH DOT ABOVE occurs in practice |
827 | | * which could simplify this considerably. |
828 | | */ |
829 | 0 | gsize decomp_len, i; |
830 | 0 | gunichar decomp[G_UNICHAR_MAX_DECOMPOSITION_LENGTH]; |
831 | |
|
832 | 0 | decomp_len = g_unichar_fully_decompose (c, FALSE, decomp, G_N_ELEMENTS (decomp)); |
833 | 0 | for (i=0; i < decomp_len; i++) |
834 | 0 | { |
835 | 0 | if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) |
836 | 0 | len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL); |
837 | 0 | } |
838 | | |
839 | 0 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE); |
840 | |
|
841 | 0 | continue; |
842 | 0 | } |
843 | | |
844 | 0 | if (!ISMARK (t)) |
845 | 0 | last_was_i = FALSE; |
846 | 0 | } |
847 | 0 | } |
848 | | |
849 | 0 | if (locale_type == LOCALE_TURKIC && c == 'i') |
850 | 0 | { |
851 | | /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
852 | 0 | len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL); |
853 | 0 | } |
854 | 0 | else if (c == 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */ |
855 | 0 | { |
856 | | /* Nasty, need to move it after other combining marks .. this would go away if |
857 | | * we normalized first. |
858 | | */ |
859 | 0 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); |
860 | | |
861 | | /* And output as GREEK CAPITAL LETTER IOTA */ |
862 | 0 | len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); |
863 | 0 | } |
864 | 0 | else if (IS (t, |
865 | 0 | OR (G_UNICODE_LOWERCASE_LETTER, |
866 | 0 | OR (G_UNICODE_TITLECASE_LETTER, |
867 | 0 | 0)))) |
868 | 0 | { |
869 | 0 | val = ATTTABLE (c >> 8, c & 0xff); |
870 | |
|
871 | 0 | if (val >= 0x1000000) |
872 | 0 | { |
873 | 0 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, |
874 | 0 | t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1); |
875 | 0 | } |
876 | 0 | else |
877 | 0 | { |
878 | 0 | if (t == G_UNICODE_TITLECASE_LETTER) |
879 | 0 | { |
880 | 0 | unsigned int i; |
881 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
882 | 0 | { |
883 | 0 | if (title_table[i][0] == c) |
884 | 0 | { |
885 | 0 | val = title_table[i][1]; |
886 | 0 | break; |
887 | 0 | } |
888 | 0 | } |
889 | 0 | } |
890 | | |
891 | | /* Some lowercase letters, e.g., U+00AA, FEMININE ORDINAL INDICATOR, |
892 | | * do not have an uppercase equivalent, in which case val will be |
893 | | * zero. */ |
894 | 0 | len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); |
895 | 0 | } |
896 | 0 | } |
897 | 0 | else |
898 | 0 | { |
899 | 0 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
900 | |
|
901 | 0 | if (out_buffer) |
902 | 0 | memcpy (out_buffer + len, last, char_len); |
903 | |
|
904 | 0 | len += char_len; |
905 | 0 | } |
906 | |
|
907 | 0 | } |
908 | |
|
909 | 0 | return len; |
910 | 0 | } |
911 | | |
912 | | /** |
913 | | * g_utf8_strup: |
914 | | * @str: a UTF-8 encoded string |
915 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
916 | | * |
917 | | * Converts all Unicode characters in the string that have a case |
918 | | * to uppercase. The exact manner that this is done depends |
919 | | * on the current locale, and may result in the number of |
920 | | * characters in the string increasing. (For instance, the |
921 | | * German ess-zet will be changed to SS.) |
922 | | * |
923 | | * Returns: a newly allocated string, with all characters |
924 | | * converted to uppercase. |
925 | | **/ |
926 | | gchar * |
927 | | g_utf8_strup (const gchar *str, |
928 | | gssize len) |
929 | 0 | { |
930 | 0 | gsize result_len; |
931 | 0 | LocaleType locale_type; |
932 | 0 | gchar *result; |
933 | |
|
934 | 0 | g_return_val_if_fail (str != NULL, NULL); |
935 | | |
936 | 0 | locale_type = get_locale_type (); |
937 | | |
938 | | /* |
939 | | * We use a two pass approach to keep memory management simple |
940 | | */ |
941 | 0 | result_len = real_toupper (str, len, NULL, locale_type); |
942 | 0 | result = g_malloc (result_len + 1); |
943 | 0 | real_toupper (str, len, result, locale_type); |
944 | 0 | result[result_len] = '\0'; |
945 | |
|
946 | 0 | return result; |
947 | 0 | } |
948 | | |
949 | | /* traverses the string checking for characters with combining class == 230 |
950 | | * until a base character is found */ |
951 | | static gboolean |
952 | | has_more_above (const gchar *str) |
953 | 0 | { |
954 | 0 | const gchar *p = str; |
955 | 0 | gint combining_class; |
956 | |
|
957 | 0 | while (*p) |
958 | 0 | { |
959 | 0 | combining_class = g_unichar_combining_class (g_utf8_get_char (p)); |
960 | 0 | if (combining_class == 230) |
961 | 0 | return TRUE; |
962 | 0 | else if (combining_class == 0) |
963 | 0 | break; |
964 | | |
965 | 0 | p = g_utf8_next_char (p); |
966 | 0 | } |
967 | | |
968 | 0 | return FALSE; |
969 | 0 | } |
970 | | |
971 | | static gsize |
972 | | real_tolower (const gchar *str, |
973 | | gssize max_len, |
974 | | gchar *out_buffer, |
975 | | LocaleType locale_type) |
976 | 0 | { |
977 | 0 | const gchar *p = str; |
978 | 0 | const char *last = NULL; |
979 | 0 | gsize len = 0; |
980 | |
|
981 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
982 | 0 | { |
983 | 0 | gunichar c = g_utf8_get_char (p); |
984 | 0 | int t = TYPE (c); |
985 | 0 | gunichar val; |
986 | |
|
987 | 0 | last = p; |
988 | 0 | p = g_utf8_next_char (p); |
989 | |
|
990 | 0 | if (locale_type == LOCALE_TURKIC && (c == 'I' || c == 0x130 || |
991 | 0 | c == G_UNICHAR_FULLWIDTH_I)) |
992 | 0 | { |
993 | 0 | gboolean combining_dot = (c == 'I' || c == G_UNICHAR_FULLWIDTH_I) && |
994 | 0 | g_utf8_get_char (p) == 0x0307; |
995 | 0 | if (combining_dot || c == 0x130) |
996 | 0 | { |
997 | | /* I + COMBINING DOT ABOVE => i (U+0069) |
998 | | * LATIN CAPITAL LETTER I WITH DOT ABOVE => i (U+0069) */ |
999 | 0 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
1000 | 0 | if (combining_dot) |
1001 | 0 | p = g_utf8_next_char (p); |
1002 | 0 | } |
1003 | 0 | else |
1004 | 0 | { |
1005 | | /* I => LATIN SMALL LETTER DOTLESS I */ |
1006 | 0 | len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); |
1007 | 0 | } |
1008 | 0 | } |
1009 | | /* Introduce an explicit dot above when lowercasing capital I's and J's |
1010 | | * whenever there are more accents above. [SpecialCasing.txt] */ |
1011 | 0 | else if (locale_type == LOCALE_LITHUANIAN && |
1012 | 0 | (c == 0x00cc || c == 0x00cd || c == 0x0128)) |
1013 | 0 | { |
1014 | 0 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
1015 | 0 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
1016 | |
|
1017 | 0 | switch (c) |
1018 | 0 | { |
1019 | 0 | case 0x00cc: |
1020 | 0 | len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); |
1021 | 0 | break; |
1022 | 0 | case 0x00cd: |
1023 | 0 | len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); |
1024 | 0 | break; |
1025 | 0 | case 0x0128: |
1026 | 0 | len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); |
1027 | 0 | break; |
1028 | 0 | } |
1029 | 0 | } |
1030 | 0 | else if (locale_type == LOCALE_LITHUANIAN && |
1031 | 0 | (c == 'I' || c == G_UNICHAR_FULLWIDTH_I || |
1032 | 0 | c == 'J' || c == G_UNICHAR_FULLWIDTH_J || c == 0x012e) && |
1033 | 0 | has_more_above (p)) |
1034 | 0 | { |
1035 | 0 | len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); |
1036 | 0 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
1037 | 0 | } |
1038 | 0 | else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ |
1039 | 0 | { |
1040 | 0 | if ((max_len < 0 || p < str + max_len) && *p) |
1041 | 0 | { |
1042 | 0 | gunichar next_c = g_utf8_get_char (p); |
1043 | 0 | int next_type = TYPE(next_c); |
1044 | | |
1045 | | /* SIGMA mapps differently depending on whether it is |
1046 | | * final or not. The following simplified test would |
1047 | | * fail in the case of combining marks following the |
1048 | | * sigma, but I don't think that occurs in real text. |
1049 | | * The test here matches that in ICU. |
1050 | | */ |
1051 | 0 | if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */ |
1052 | 0 | val = 0x3c3; /* GREEK SMALL SIGMA */ |
1053 | 0 | else |
1054 | 0 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
1055 | 0 | } |
1056 | 0 | else |
1057 | 0 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
1058 | |
|
1059 | 0 | len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
1060 | 0 | } |
1061 | 0 | else if (IS (t, |
1062 | 0 | OR (G_UNICODE_UPPERCASE_LETTER, |
1063 | 0 | OR (G_UNICODE_TITLECASE_LETTER, |
1064 | 0 | 0)))) |
1065 | 0 | { |
1066 | 0 | val = ATTTABLE (c >> 8, c & 0xff); |
1067 | |
|
1068 | 0 | if (val >= 0x1000000) |
1069 | 0 | { |
1070 | 0 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0); |
1071 | 0 | } |
1072 | 0 | else |
1073 | 0 | { |
1074 | 0 | if (t == G_UNICODE_TITLECASE_LETTER) |
1075 | 0 | { |
1076 | 0 | unsigned int i; |
1077 | 0 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
1078 | 0 | { |
1079 | 0 | if (title_table[i][0] == c) |
1080 | 0 | { |
1081 | 0 | val = title_table[i][2]; |
1082 | 0 | break; |
1083 | 0 | } |
1084 | 0 | } |
1085 | 0 | } |
1086 | | |
1087 | | /* Not all uppercase letters are guaranteed to have a lowercase |
1088 | | * equivalent. If this is the case, val will be zero. */ |
1089 | 0 | len += g_unichar_to_utf8 (val ? val : c, out_buffer ? out_buffer + len : NULL); |
1090 | 0 | } |
1091 | 0 | } |
1092 | 0 | else |
1093 | 0 | { |
1094 | 0 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
1095 | |
|
1096 | 0 | if (out_buffer) |
1097 | 0 | memcpy (out_buffer + len, last, char_len); |
1098 | |
|
1099 | 0 | len += char_len; |
1100 | 0 | } |
1101 | |
|
1102 | 0 | } |
1103 | | |
1104 | 0 | return len; |
1105 | 0 | } |
1106 | | |
1107 | | /** |
1108 | | * g_utf8_strdown: |
1109 | | * @str: a UTF-8 encoded string |
1110 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
1111 | | * |
1112 | | * Converts all Unicode characters in the string that have a case |
1113 | | * to lowercase. The exact manner that this is done depends |
1114 | | * on the current locale, and may result in the number of |
1115 | | * characters in the string changing. |
1116 | | * |
1117 | | * Returns: a newly allocated string, with all characters |
1118 | | * converted to lowercase. |
1119 | | **/ |
1120 | | gchar * |
1121 | | g_utf8_strdown (const gchar *str, |
1122 | | gssize len) |
1123 | 0 | { |
1124 | 0 | gsize result_len; |
1125 | 0 | LocaleType locale_type; |
1126 | 0 | gchar *result; |
1127 | |
|
1128 | 0 | g_return_val_if_fail (str != NULL, NULL); |
1129 | | |
1130 | 0 | locale_type = get_locale_type (); |
1131 | | |
1132 | | /* |
1133 | | * We use a two pass approach to keep memory management simple |
1134 | | */ |
1135 | 0 | result_len = real_tolower (str, len, NULL, locale_type); |
1136 | 0 | result = g_malloc (result_len + 1); |
1137 | 0 | real_tolower (str, len, result, locale_type); |
1138 | 0 | result[result_len] = '\0'; |
1139 | |
|
1140 | 0 | return result; |
1141 | 0 | } |
1142 | | |
1143 | | /** |
1144 | | * g_utf8_casefold: |
1145 | | * @str: a UTF-8 encoded string |
1146 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
1147 | | * |
1148 | | * Converts a string into a form that is independent of case. The |
1149 | | * result will not correspond to any particular case, but can be |
1150 | | * compared for equality or ordered with the results of calling |
1151 | | * g_utf8_casefold() on other strings. |
1152 | | * |
1153 | | * Note that calling g_utf8_casefold() followed by g_utf8_collate() is |
1154 | | * only an approximation to the correct linguistic case insensitive |
1155 | | * ordering, though it is a fairly good one. Getting this exactly |
1156 | | * right would require a more sophisticated collation function that |
1157 | | * takes case sensitivity into account. GLib does not currently |
1158 | | * provide such a function. |
1159 | | * |
1160 | | * Returns: a newly allocated string, that is a |
1161 | | * case independent form of @str. |
1162 | | **/ |
1163 | | gchar * |
1164 | | g_utf8_casefold (const gchar *str, |
1165 | | gssize len) |
1166 | 0 | { |
1167 | 0 | GString *result; |
1168 | 0 | const char *p; |
1169 | |
|
1170 | 0 | g_return_val_if_fail (str != NULL, NULL); |
1171 | | |
1172 | 0 | result = g_string_new (NULL); |
1173 | 0 | p = str; |
1174 | 0 | while ((len < 0 || p < str + len) && *p) |
1175 | 0 | { |
1176 | 0 | gunichar ch = g_utf8_get_char (p); |
1177 | |
|
1178 | 0 | int start = 0; |
1179 | 0 | int end = G_N_ELEMENTS (casefold_table); |
1180 | |
|
1181 | 0 | if (ch >= casefold_table[start].ch && |
1182 | 0 | ch <= casefold_table[end - 1].ch) |
1183 | 0 | { |
1184 | 0 | while (TRUE) |
1185 | 0 | { |
1186 | 0 | int half = (start + end) / 2; |
1187 | 0 | if (ch == casefold_table[half].ch) |
1188 | 0 | { |
1189 | 0 | g_string_append (result, casefold_table[half].data); |
1190 | 0 | goto next; |
1191 | 0 | } |
1192 | 0 | else if (half == start) |
1193 | 0 | break; |
1194 | 0 | else if (ch > casefold_table[half].ch) |
1195 | 0 | start = half; |
1196 | 0 | else |
1197 | 0 | end = half; |
1198 | 0 | } |
1199 | 0 | } |
1200 | | |
1201 | 0 | g_string_append_unichar (result, g_unichar_tolower (ch)); |
1202 | | |
1203 | 0 | next: |
1204 | 0 | p = g_utf8_next_char (p); |
1205 | 0 | } |
1206 | | |
1207 | 0 | return g_string_free (result, FALSE); |
1208 | 0 | } |
1209 | | |
1210 | | /** |
1211 | | * g_unichar_get_mirror_char: |
1212 | | * @ch: a Unicode character |
1213 | | * @mirrored_ch: location to store the mirrored character |
1214 | | * |
1215 | | * In Unicode, some characters are "mirrored". This means that their |
1216 | | * images are mirrored horizontally in text that is laid out from right |
1217 | | * to left. For instance, "(" would become its mirror image, ")", in |
1218 | | * right-to-left text. |
1219 | | * |
1220 | | * If @ch has the Unicode mirrored property and there is another unicode |
1221 | | * character that typically has a glyph that is the mirror image of @ch's |
1222 | | * glyph and @mirrored_ch is set, it puts that character in the address |
1223 | | * pointed to by @mirrored_ch. Otherwise the original character is put. |
1224 | | * |
1225 | | * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise |
1226 | | * |
1227 | | * Since: 2.4 |
1228 | | **/ |
1229 | | gboolean |
1230 | | g_unichar_get_mirror_char (gunichar ch, |
1231 | | gunichar *mirrored_ch) |
1232 | 0 | { |
1233 | 0 | gboolean found; |
1234 | 0 | gunichar mirrored; |
1235 | |
|
1236 | 0 | mirrored = GLIB_GET_MIRRORING(ch); |
1237 | |
|
1238 | 0 | found = ch != mirrored; |
1239 | 0 | if (mirrored_ch) |
1240 | 0 | *mirrored_ch = mirrored; |
1241 | |
|
1242 | 0 | return found; |
1243 | |
|
1244 | 0 | } |
1245 | | |
1246 | 0 | #define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2) |
1247 | | |
1248 | | static inline GUnicodeScript |
1249 | | g_unichar_get_script_bsearch (gunichar ch) |
1250 | 0 | { |
1251 | 0 | int lower = 0; |
1252 | 0 | int upper = G_N_ELEMENTS (g_script_table) - 1; |
1253 | 0 | static int saved_mid = G_SCRIPT_TABLE_MIDPOINT; |
1254 | 0 | int mid = saved_mid; |
1255 | | |
1256 | |
|
1257 | 0 | do |
1258 | 0 | { |
1259 | 0 | if (ch < g_script_table[mid].start) |
1260 | 0 | upper = mid - 1; |
1261 | 0 | else if (ch >= g_script_table[mid].start + g_script_table[mid].chars) |
1262 | 0 | lower = mid + 1; |
1263 | 0 | else |
1264 | 0 | return g_script_table[saved_mid = mid].script; |
1265 | | |
1266 | 0 | mid = (lower + upper) / 2; |
1267 | 0 | } |
1268 | 0 | while (lower <= upper); |
1269 | | |
1270 | 0 | return G_UNICODE_SCRIPT_UNKNOWN; |
1271 | 0 | } |
1272 | | |
1273 | | /** |
1274 | | * g_unichar_get_script: |
1275 | | * @ch: a Unicode character |
1276 | | * |
1277 | | * Looks up the #GUnicodeScript for a particular character (as defined |
1278 | | * by Unicode Standard Annex \#24). No check is made for @ch being a |
1279 | | * valid Unicode character; if you pass in invalid character, the |
1280 | | * result is undefined. |
1281 | | * |
1282 | | * This function is equivalent to pango_script_for_unichar() and the |
1283 | | * two are interchangeable. |
1284 | | * |
1285 | | * Returns: the #GUnicodeScript for the character. |
1286 | | * |
1287 | | * Since: 2.14 |
1288 | | */ |
1289 | | GUnicodeScript |
1290 | | g_unichar_get_script (gunichar ch) |
1291 | 0 | { |
1292 | 0 | if (ch < G_EASY_SCRIPTS_RANGE) |
1293 | 0 | return g_script_easy_table[ch]; |
1294 | 0 | else |
1295 | 0 | return g_unichar_get_script_bsearch (ch); |
1296 | 0 | } |
1297 | | |
1298 | | |
/* http://unicode.org/iso15924/
 *
 * ISO 15924 four-letter script tags, each packed into a guint32 with the
 * first letter in the most significant byte.
 *
 * The array is indexed by GUnicodeScript value (see the per-entry
 * comments), so entries must stay in the same order as the enumeration
 * and new scripts must be appended in enumeration order.
 */
static const guint32 iso15924_tags[] =
{
#define PACK(a,b,c,d) ((guint32)((((guint8)(a))<<24)|(((guint8)(b))<<16)|(((guint8)(c))<<8)|((guint8)(d))))

    PACK ('Z','y','y','y'), /* G_UNICODE_SCRIPT_COMMON */
    PACK ('Z','i','n','h'), /* G_UNICODE_SCRIPT_INHERITED */
    PACK ('A','r','a','b'), /* G_UNICODE_SCRIPT_ARABIC */
    PACK ('A','r','m','n'), /* G_UNICODE_SCRIPT_ARMENIAN */
    PACK ('B','e','n','g'), /* G_UNICODE_SCRIPT_BENGALI */
    PACK ('B','o','p','o'), /* G_UNICODE_SCRIPT_BOPOMOFO */
    PACK ('C','h','e','r'), /* G_UNICODE_SCRIPT_CHEROKEE */
    PACK ('C','o','p','t'), /* G_UNICODE_SCRIPT_COPTIC */
    PACK ('C','y','r','l'), /* G_UNICODE_SCRIPT_CYRILLIC */
    PACK ('D','s','r','t'), /* G_UNICODE_SCRIPT_DESERET */
    PACK ('D','e','v','a'), /* G_UNICODE_SCRIPT_DEVANAGARI */
    PACK ('E','t','h','i'), /* G_UNICODE_SCRIPT_ETHIOPIC */
    PACK ('G','e','o','r'), /* G_UNICODE_SCRIPT_GEORGIAN */
    PACK ('G','o','t','h'), /* G_UNICODE_SCRIPT_GOTHIC */
    PACK ('G','r','e','k'), /* G_UNICODE_SCRIPT_GREEK */
    PACK ('G','u','j','r'), /* G_UNICODE_SCRIPT_GUJARATI */
    PACK ('G','u','r','u'), /* G_UNICODE_SCRIPT_GURMUKHI */
    PACK ('H','a','n','i'), /* G_UNICODE_SCRIPT_HAN */
    PACK ('H','a','n','g'), /* G_UNICODE_SCRIPT_HANGUL */
    PACK ('H','e','b','r'), /* G_UNICODE_SCRIPT_HEBREW */
    PACK ('H','i','r','a'), /* G_UNICODE_SCRIPT_HIRAGANA */
    PACK ('K','n','d','a'), /* G_UNICODE_SCRIPT_KANNADA */
    PACK ('K','a','n','a'), /* G_UNICODE_SCRIPT_KATAKANA */
    PACK ('K','h','m','r'), /* G_UNICODE_SCRIPT_KHMER */
    PACK ('L','a','o','o'), /* G_UNICODE_SCRIPT_LAO */
    PACK ('L','a','t','n'), /* G_UNICODE_SCRIPT_LATIN */
    PACK ('M','l','y','m'), /* G_UNICODE_SCRIPT_MALAYALAM */
    PACK ('M','o','n','g'), /* G_UNICODE_SCRIPT_MONGOLIAN */
    PACK ('M','y','m','r'), /* G_UNICODE_SCRIPT_MYANMAR */
    PACK ('O','g','a','m'), /* G_UNICODE_SCRIPT_OGHAM */
    PACK ('I','t','a','l'), /* G_UNICODE_SCRIPT_OLD_ITALIC */
    PACK ('O','r','y','a'), /* G_UNICODE_SCRIPT_ORIYA */
    PACK ('R','u','n','r'), /* G_UNICODE_SCRIPT_RUNIC */
    PACK ('S','i','n','h'), /* G_UNICODE_SCRIPT_SINHALA */
    PACK ('S','y','r','c'), /* G_UNICODE_SCRIPT_SYRIAC */
    PACK ('T','a','m','l'), /* G_UNICODE_SCRIPT_TAMIL */
    PACK ('T','e','l','u'), /* G_UNICODE_SCRIPT_TELUGU */
    PACK ('T','h','a','a'), /* G_UNICODE_SCRIPT_THAANA */
    PACK ('T','h','a','i'), /* G_UNICODE_SCRIPT_THAI */
    PACK ('T','i','b','t'), /* G_UNICODE_SCRIPT_TIBETAN */
    PACK ('C','a','n','s'), /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
    PACK ('Y','i','i','i'), /* G_UNICODE_SCRIPT_YI */
    PACK ('T','g','l','g'), /* G_UNICODE_SCRIPT_TAGALOG */
    PACK ('H','a','n','o'), /* G_UNICODE_SCRIPT_HANUNOO */
    PACK ('B','u','h','d'), /* G_UNICODE_SCRIPT_BUHID */
    PACK ('T','a','g','b'), /* G_UNICODE_SCRIPT_TAGBANWA */

  /* Unicode-4.0 additions */
    PACK ('B','r','a','i'), /* G_UNICODE_SCRIPT_BRAILLE */
    PACK ('C','p','r','t'), /* G_UNICODE_SCRIPT_CYPRIOT */
    PACK ('L','i','m','b'), /* G_UNICODE_SCRIPT_LIMBU */
    PACK ('O','s','m','a'), /* G_UNICODE_SCRIPT_OSMANYA */
    PACK ('S','h','a','w'), /* G_UNICODE_SCRIPT_SHAVIAN */
    PACK ('L','i','n','b'), /* G_UNICODE_SCRIPT_LINEAR_B */
    PACK ('T','a','l','e'), /* G_UNICODE_SCRIPT_TAI_LE */
    PACK ('U','g','a','r'), /* G_UNICODE_SCRIPT_UGARITIC */

  /* Unicode-4.1 additions */
    PACK ('T','a','l','u'), /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
    PACK ('B','u','g','i'), /* G_UNICODE_SCRIPT_BUGINESE */
    PACK ('G','l','a','g'), /* G_UNICODE_SCRIPT_GLAGOLITIC */
    PACK ('T','f','n','g'), /* G_UNICODE_SCRIPT_TIFINAGH */
    PACK ('S','y','l','o'), /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
    PACK ('X','p','e','o'), /* G_UNICODE_SCRIPT_OLD_PERSIAN */
    PACK ('K','h','a','r'), /* G_UNICODE_SCRIPT_KHAROSHTHI */

  /* Unicode-5.0 additions */
    PACK ('Z','z','z','z'), /* G_UNICODE_SCRIPT_UNKNOWN */
    PACK ('B','a','l','i'), /* G_UNICODE_SCRIPT_BALINESE */
    PACK ('X','s','u','x'), /* G_UNICODE_SCRIPT_CUNEIFORM */
    PACK ('P','h','n','x'), /* G_UNICODE_SCRIPT_PHOENICIAN */
    PACK ('P','h','a','g'), /* G_UNICODE_SCRIPT_PHAGS_PA */
    PACK ('N','k','o','o'), /* G_UNICODE_SCRIPT_NKO */

  /* Unicode-5.1 additions */
    PACK ('K','a','l','i'), /* G_UNICODE_SCRIPT_KAYAH_LI */
    PACK ('L','e','p','c'), /* G_UNICODE_SCRIPT_LEPCHA */
    PACK ('R','j','n','g'), /* G_UNICODE_SCRIPT_REJANG */
    PACK ('S','u','n','d'), /* G_UNICODE_SCRIPT_SUNDANESE */
    PACK ('S','a','u','r'), /* G_UNICODE_SCRIPT_SAURASHTRA */
    PACK ('C','h','a','m'), /* G_UNICODE_SCRIPT_CHAM */
    PACK ('O','l','c','k'), /* G_UNICODE_SCRIPT_OL_CHIKI */
    PACK ('V','a','i','i'), /* G_UNICODE_SCRIPT_VAI */
    PACK ('C','a','r','i'), /* G_UNICODE_SCRIPT_CARIAN */
    PACK ('L','y','c','i'), /* G_UNICODE_SCRIPT_LYCIAN */
    PACK ('L','y','d','i'), /* G_UNICODE_SCRIPT_LYDIAN */

  /* Unicode-5.2 additions */
    PACK ('A','v','s','t'), /* G_UNICODE_SCRIPT_AVESTAN */
    PACK ('B','a','m','u'), /* G_UNICODE_SCRIPT_BAMUM */
    PACK ('E','g','y','p'), /* G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS */
    PACK ('A','r','m','i'), /* G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC */
    PACK ('P','h','l','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI */
    PACK ('P','r','t','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN */
    PACK ('J','a','v','a'), /* G_UNICODE_SCRIPT_JAVANESE */
    PACK ('K','t','h','i'), /* G_UNICODE_SCRIPT_KAITHI */
    PACK ('L','i','s','u'), /* G_UNICODE_SCRIPT_LISU */
    PACK ('M','t','e','i'), /* G_UNICODE_SCRIPT_MEETEI_MAYEK */
    PACK ('S','a','r','b'), /* G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN */
    PACK ('O','r','k','h'), /* G_UNICODE_SCRIPT_OLD_TURKIC */
    PACK ('S','a','m','r'), /* G_UNICODE_SCRIPT_SAMARITAN */
    PACK ('L','a','n','a'), /* G_UNICODE_SCRIPT_TAI_THAM */
    PACK ('T','a','v','t'), /* G_UNICODE_SCRIPT_TAI_VIET */

  /* Unicode-6.0 additions */
    PACK ('B','a','t','k'), /* G_UNICODE_SCRIPT_BATAK */
    PACK ('B','r','a','h'), /* G_UNICODE_SCRIPT_BRAHMI */
    PACK ('M','a','n','d'), /* G_UNICODE_SCRIPT_MANDAIC */

  /* Unicode-6.1 additions */
    PACK ('C','a','k','m'), /* G_UNICODE_SCRIPT_CHAKMA */
    PACK ('M','e','r','c'), /* G_UNICODE_SCRIPT_MEROITIC_CURSIVE */
    PACK ('M','e','r','o'), /* G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS */
    PACK ('P','l','r','d'), /* G_UNICODE_SCRIPT_MIAO */
    PACK ('S','h','r','d'), /* G_UNICODE_SCRIPT_SHARADA */
    PACK ('S','o','r','a'), /* G_UNICODE_SCRIPT_SORA_SOMPENG */
    PACK ('T','a','k','r'), /* G_UNICODE_SCRIPT_TAKRI */

  /* Unicode 7.0 additions */
    PACK ('B','a','s','s'), /* G_UNICODE_SCRIPT_BASSA_VAH */
    PACK ('A','g','h','b'), /* G_UNICODE_SCRIPT_CAUCASIAN_ALBANIAN */
    PACK ('D','u','p','l'), /* G_UNICODE_SCRIPT_DUPLOYAN */
    PACK ('E','l','b','a'), /* G_UNICODE_SCRIPT_ELBASAN */
    PACK ('G','r','a','n'), /* G_UNICODE_SCRIPT_GRANTHA */
    PACK ('K','h','o','j'), /* G_UNICODE_SCRIPT_KHOJKI */
    PACK ('S','i','n','d'), /* G_UNICODE_SCRIPT_KHUDAWADI */
    PACK ('L','i','n','a'), /* G_UNICODE_SCRIPT_LINEAR_A */
    PACK ('M','a','h','j'), /* G_UNICODE_SCRIPT_MAHAJANI */
    PACK ('M','a','n','i'), /* G_UNICODE_SCRIPT_MANICHAEAN */
    PACK ('M','e','n','d'), /* G_UNICODE_SCRIPT_MENDE_KIKAKUI */
    PACK ('M','o','d','i'), /* G_UNICODE_SCRIPT_MODI */
    PACK ('M','r','o','o'), /* G_UNICODE_SCRIPT_MRO */
    PACK ('N','b','a','t'), /* G_UNICODE_SCRIPT_NABATAEAN */
    PACK ('N','a','r','b'), /* G_UNICODE_SCRIPT_OLD_NORTH_ARABIAN */
    PACK ('P','e','r','m'), /* G_UNICODE_SCRIPT_OLD_PERMIC */
    PACK ('H','m','n','g'), /* G_UNICODE_SCRIPT_PAHAWH_HMONG */
    PACK ('P','a','l','m'), /* G_UNICODE_SCRIPT_PALMYRENE */
    PACK ('P','a','u','c'), /* G_UNICODE_SCRIPT_PAU_CIN_HAU */
    PACK ('P','h','l','p'), /* G_UNICODE_SCRIPT_PSALTER_PAHLAVI */
    PACK ('S','i','d','d'), /* G_UNICODE_SCRIPT_SIDDHAM */
    PACK ('T','i','r','h'), /* G_UNICODE_SCRIPT_TIRHUTA */
    PACK ('W','a','r','a'), /* G_UNICODE_SCRIPT_WARANG_CITI */

  /* Unicode 8.0 additions */
    PACK ('A','h','o','m'), /* G_UNICODE_SCRIPT_AHOM */
    PACK ('H','l','u','w'), /* G_UNICODE_SCRIPT_ANATOLIAN_HIEROGLYPHS */
    PACK ('H','a','t','r'), /* G_UNICODE_SCRIPT_HATRAN */
    PACK ('M','u','l','t'), /* G_UNICODE_SCRIPT_MULTANI */
    PACK ('H','u','n','g'), /* G_UNICODE_SCRIPT_OLD_HUNGARIAN */
    PACK ('S','g','n','w'), /* G_UNICODE_SCRIPT_SIGNWRITING */

  /* Unicode 9.0 additions */
    PACK ('A','d','l','m'), /* G_UNICODE_SCRIPT_ADLAM */
    PACK ('B','h','k','s'), /* G_UNICODE_SCRIPT_BHAIKSUKI */
    PACK ('M','a','r','c'), /* G_UNICODE_SCRIPT_MARCHEN */
    PACK ('N','e','w','a'), /* G_UNICODE_SCRIPT_NEWA */
    PACK ('O','s','g','e'), /* G_UNICODE_SCRIPT_OSAGE */
    PACK ('T','a','n','g'), /* G_UNICODE_SCRIPT_TANGUT */

  /* Unicode 10.0 additions */
    PACK ('G','o','n','m'), /* G_UNICODE_SCRIPT_MASARAM_GONDI */
    PACK ('N','s','h','u'), /* G_UNICODE_SCRIPT_NUSHU */
    PACK ('S','o','y','o'), /* G_UNICODE_SCRIPT_SOYOMBO */
    PACK ('Z','a','n','b'), /* G_UNICODE_SCRIPT_ZANABAZAR_SQUARE */

  /* Unicode 11.0 additions */
    PACK ('D','o','g','r'), /* G_UNICODE_SCRIPT_DOGRA */
    PACK ('G','o','n','g'), /* G_UNICODE_SCRIPT_GUNJALA_GONDI */
    PACK ('R','o','h','g'), /* G_UNICODE_SCRIPT_HANIFI_ROHINGYA */
    PACK ('M','a','k','a'), /* G_UNICODE_SCRIPT_MAKASAR */
    PACK ('M','e','d','f'), /* G_UNICODE_SCRIPT_MEDEFAIDRIN */
    PACK ('S','o','g','o'), /* G_UNICODE_SCRIPT_OLD_SOGDIAN */
    PACK ('S','o','g','d'), /* G_UNICODE_SCRIPT_SOGDIAN */

  /* Unicode 12.0 additions */
    PACK ('E','l','y','m'), /* G_UNICODE_SCRIPT_ELYMAIC */
    PACK ('N','a','n','d'), /* G_UNICODE_SCRIPT_NANDINAGARI */
    PACK ('H','m','n','p'), /* G_UNICODE_SCRIPT_NYIAKENG_PUACHUE_HMONG */
    PACK ('W','c','h','o'), /* G_UNICODE_SCRIPT_WANCHO */

  /* Unicode 13.0 additions */
    PACK ('C', 'h', 'r', 's'), /* G_UNICODE_SCRIPT_CHORASMIAN */
    PACK ('D', 'i', 'a', 'k'), /* G_UNICODE_SCRIPT_DIVES_AKURU */
    PACK ('K', 'i', 't', 's'), /* G_UNICODE_SCRIPT_KHITAN_SMALL_SCRIPT */
    PACK ('Y', 'e', 'z', 'i'), /* G_UNICODE_SCRIPT_YEZIDI */
#undef PACK
};
1491 | | |
1492 | | /** |
1493 | | * g_unicode_script_to_iso15924: |
1494 | | * @script: a Unicode script |
1495 | | * |
1496 | | * Looks up the ISO 15924 code for @script. ISO 15924 assigns four-letter |
1497 | | * codes to scripts. For example, the code for Arabic is 'Arab'. The |
1498 | | * four letter codes are encoded as a @guint32 by this function in a |
1499 | | * big-endian fashion. That is, the code returned for Arabic is |
1500 | | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
1501 | | * |
1502 | | * See |
1503 | | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
1504 | | * for details. |
1505 | | * |
1506 | | * Returns: the ISO 15924 code for @script, encoded as an integer, |
1507 | | * of zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or |
1508 | | * ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not understood. |
1509 | | * |
1510 | | * Since: 2.30 |
1511 | | */ |
1512 | | guint32 |
1513 | | g_unicode_script_to_iso15924 (GUnicodeScript script) |
1514 | 0 | { |
1515 | 0 | if (G_UNLIKELY (script == G_UNICODE_SCRIPT_INVALID_CODE)) |
1516 | 0 | return 0; |
1517 | | |
1518 | 0 | if (G_UNLIKELY (script < 0 || script >= (int) G_N_ELEMENTS (iso15924_tags))) |
1519 | 0 | return 0x5A7A7A7A; |
1520 | | |
1521 | 0 | return iso15924_tags[script]; |
1522 | 0 | } |
1523 | | |
1524 | | /** |
1525 | | * g_unicode_script_from_iso15924: |
1526 | | * @iso15924: a Unicode script |
1527 | | * |
1528 | | * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter |
1529 | | * codes to scripts. For example, the code for Arabic is 'Arab'. |
1530 | | * This function accepts four letter codes encoded as a @guint32 in a |
1531 | | * big-endian fashion. That is, the code expected for Arabic is |
1532 | | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
1533 | | * |
1534 | | * See |
1535 | | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
1536 | | * for details. |
1537 | | * |
1538 | | * Returns: the Unicode script for @iso15924, or |
1539 | | * of %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and |
1540 | | * %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown. |
1541 | | * |
1542 | | * Since: 2.30 |
1543 | | */ |
1544 | | GUnicodeScript |
1545 | | g_unicode_script_from_iso15924 (guint32 iso15924) |
1546 | 0 | { |
1547 | 0 | unsigned int i; |
1548 | |
|
1549 | 0 | if (!iso15924) |
1550 | 0 | return G_UNICODE_SCRIPT_INVALID_CODE; |
1551 | | |
1552 | 0 | for (i = 0; i < G_N_ELEMENTS (iso15924_tags); i++) |
1553 | 0 | if (iso15924_tags[i] == iso15924) |
1554 | 0 | return (GUnicodeScript) i; |
1555 | | |
1556 | 0 | return G_UNICODE_SCRIPT_UNKNOWN; |
1557 | 0 | } |