/src/tinysparql/subprojects/glib-2.80.3/glib/gunidecomp.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* decomp.c - Character decomposition. |
2 | | * |
3 | | * Copyright (C) 1999, 2000 Tom Tromey |
4 | | * Copyright 2000 Red Hat, Inc. |
5 | | * |
6 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
7 | | * |
8 | | * This library is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * This library is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General Public License |
19 | | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | #include "config.h" |
23 | | |
24 | | #include <stdlib.h> |
25 | | |
26 | | #include "gunicode.h" |
27 | | #include "gunidecomp.h" |
28 | | #include "gmem.h" |
29 | | #include "gunicomp.h" |
30 | | #include "gunicodeprivate.h" |
31 | | |
32 | | |
/* Combining-class lookup for a character that lives in the first table part.
 *
 * combining_class_table_part1[Page] is either
 *  - a value >= G_UNICODE_MAX_TABLE_INDEX, in which case the class is that
 *    value minus G_UNICODE_MAX_TABLE_INDEX and Char is not consulted
 *    (presumably pages whose characters all share one class), or
 *  - a row index into cclass_data, with Char selecting the column.
 *
 * Page is evaluated up to three times.
 */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same lookup as CC_PART1, but through combining_class_table_part2. */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of an arbitrary character.
 *
 *  - Char <= G_UNICODE_LAST_CHAR_PART1: part 1, page = Char >> 8.
 *  - 0xe0000 <= Char <= G_UNICODE_LAST_CHAR: part 2, page counted from
 *    0xe0000.
 *  - anything else: 0.
 *
 * In every case the low eight bits of Char select the entry within the page.
 * Char is evaluated several times, so it must not have side effects.
 */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
49 | | |
50 | | /** |
51 | | * g_unichar_combining_class: |
52 | | * @uc: a Unicode character |
53 | | * |
54 | | * Determines the canonical combining class of a Unicode character. |
55 | | * |
56 | | * Returns: the combining class of the character |
57 | | * |
58 | | * Since: 2.14 |
59 | | **/ |
60 | | gint |
61 | | g_unichar_combining_class (gunichar uc) |
62 | 0 | { |
63 | 0 | return COMBINING_CLASS (uc); |
64 | 0 | } |
65 | | |
/* Constants for Hangul syllable [de]composition.
 *
 * SBase is the first precomposed syllable; LBase, VBase and TBase are the
 * values added to the leading, vowel and trailing indices. A trailing index
 * of 0 means "no trailing part", so TBase itself is never produced.
 */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = (VCount * TCount),  /* syllables per leading index */
  SCount = (LCount * NCount)   /* total number of precomposed syllables */
};
76 | | |
77 | | /** |
78 | | * g_unicode_canonical_ordering: |
79 | | * @string: (array length=len) (element-type gunichar): a UCS-4 encoded string. |
80 | | * @len: the maximum length of @string to use. |
81 | | * |
82 | | * Computes the canonical ordering of a string in-place. |
83 | | * This rearranges decomposed characters in the string |
84 | | * according to their combining classes. See the Unicode |
85 | | * manual for more information. |
86 | | **/ |
87 | | void |
88 | | g_unicode_canonical_ordering (gunichar *string, |
89 | | gsize len) |
90 | 0 | { |
91 | 0 | gsize i; |
92 | 0 | int swap = 1; |
93 | |
|
94 | 0 | while (swap) |
95 | 0 | { |
96 | 0 | int last; |
97 | 0 | swap = 0; |
98 | 0 | last = COMBINING_CLASS (string[0]); |
99 | 0 | for (i = 0; i < len - 1; ++i) |
100 | 0 | { |
101 | 0 | int next = COMBINING_CLASS (string[i + 1]); |
102 | 0 | if (next != 0 && last > next) |
103 | 0 | { |
104 | 0 | gsize j; |
105 | | /* Percolate item leftward through string. */ |
106 | 0 | for (j = i + 1; j > 0; --j) |
107 | 0 | { |
108 | 0 | gunichar t; |
109 | 0 | if (COMBINING_CLASS (string[j - 1]) <= next) |
110 | 0 | break; |
111 | 0 | t = string[j]; |
112 | 0 | string[j] = string[j - 1]; |
113 | 0 | string[j - 1] = t; |
114 | 0 | swap = 1; |
115 | 0 | } |
116 | | /* We're re-entering the loop looking at the old |
117 | | character again. */ |
118 | 0 | next = last; |
119 | 0 | } |
120 | 0 | last = next; |
121 | 0 | } |
122 | 0 | } |
123 | 0 | } |
124 | | |
125 | | /* http://www.unicode.org/unicode/reports/tr15/#Hangul |
126 | | * r should be null or have sufficient space. Calling with r == NULL will |
127 | | * only calculate the result_len; however, a buffer with space for three |
128 | | * characters will always be big enough. */ |
129 | | static void |
130 | | decompose_hangul (gunichar s, |
131 | | gunichar *r, |
132 | | gsize *result_len) |
133 | 0 | { |
134 | 0 | gint SIndex = s - SBase; |
135 | 0 | gint TIndex = SIndex % TCount; |
136 | |
|
137 | 0 | if (r) |
138 | 0 | { |
139 | 0 | r[0] = LBase + SIndex / NCount; |
140 | 0 | r[1] = VBase + (SIndex % NCount) / TCount; |
141 | 0 | } |
142 | |
|
143 | 0 | if (TIndex) |
144 | 0 | { |
145 | 0 | if (r) |
146 | 0 | r[2] = TBase + TIndex; |
147 | 0 | *result_len = 3; |
148 | 0 | } |
149 | 0 | else |
150 | 0 | *result_len = 2; |
151 | 0 | } |
152 | | |
153 | | /* returns a pointer to a null-terminated UTF-8 string */ |
154 | | static const gchar * |
155 | | find_decomposition (gunichar ch, |
156 | | gboolean compat) |
157 | 0 | { |
158 | 0 | int start = 0; |
159 | 0 | int end = G_N_ELEMENTS (decomp_table); |
160 | | |
161 | 0 | if (ch >= decomp_table[start].ch && |
162 | 0 | ch <= decomp_table[end - 1].ch) |
163 | 0 | { |
164 | 0 | while (TRUE) |
165 | 0 | { |
166 | 0 | int half = (start + end) / 2; |
167 | 0 | if (ch == decomp_table[half].ch) |
168 | 0 | { |
169 | 0 | int offset; |
170 | |
|
171 | 0 | if (compat) |
172 | 0 | { |
173 | 0 | offset = decomp_table[half].compat_offset; |
174 | 0 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
175 | 0 | offset = decomp_table[half].canon_offset; |
176 | 0 | } |
177 | 0 | else |
178 | 0 | { |
179 | 0 | offset = decomp_table[half].canon_offset; |
180 | 0 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
181 | 0 | return NULL; |
182 | 0 | } |
183 | | |
184 | 0 | return &(decomp_expansion_string[offset]); |
185 | 0 | } |
186 | 0 | else if (half == start) |
187 | 0 | break; |
188 | 0 | else if (ch > decomp_table[half].ch) |
189 | 0 | start = half; |
190 | 0 | else |
191 | 0 | end = half; |
192 | 0 | } |
193 | 0 | } |
194 | | |
195 | 0 | return NULL; |
196 | 0 | } |
197 | | |
198 | | /** |
199 | | * g_unicode_canonical_decomposition: |
200 | | * @ch: a Unicode character. |
201 | | * @result_len: location to store the length of the return value. |
202 | | * |
203 | | * Computes the canonical decomposition of a Unicode character. |
204 | | * |
205 | | * Returns: a newly allocated string of Unicode characters. |
206 | | * @result_len is set to the resulting length of the string. |
207 | | * |
208 | | * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose() |
209 | | * instead. |
210 | | **/ |
211 | | gunichar * |
212 | | g_unicode_canonical_decomposition (gunichar ch, |
213 | | gsize *result_len) |
214 | 0 | { |
215 | 0 | const gchar *decomp; |
216 | 0 | const gchar *p; |
217 | 0 | gunichar *r; |
218 | | |
219 | | /* Hangul syllable */ |
220 | 0 | if (ch >= SBase && ch < SBase + SCount) |
221 | 0 | { |
222 | 0 | decompose_hangul (ch, NULL, result_len); |
223 | 0 | r = g_malloc (*result_len * sizeof (gunichar)); |
224 | 0 | decompose_hangul (ch, r, result_len); |
225 | 0 | } |
226 | 0 | else if ((decomp = find_decomposition (ch, FALSE)) != NULL) |
227 | 0 | { |
228 | | /* Found it. */ |
229 | 0 | int i; |
230 | | |
231 | 0 | *result_len = g_utf8_strlen (decomp, -1); |
232 | 0 | r = g_malloc (*result_len * sizeof (gunichar)); |
233 | | |
234 | 0 | for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++) |
235 | 0 | r[i] = g_utf8_get_char (p); |
236 | 0 | } |
237 | 0 | else |
238 | 0 | { |
239 | | /* Not in our table. */ |
240 | 0 | r = g_malloc (sizeof (gunichar)); |
241 | 0 | *r = ch; |
242 | 0 | *result_len = 1; |
243 | 0 | } |
244 | |
|
245 | 0 | return r; |
246 | 0 | } |
247 | | |
248 | | /* L,V => LV and LV,T => LVT */ |
249 | | static gboolean |
250 | | combine_hangul (gunichar a, |
251 | | gunichar b, |
252 | | gunichar *result) |
253 | 0 | { |
254 | 0 | gint LIndex = a - LBase; |
255 | 0 | gint SIndex = a - SBase; |
256 | |
|
257 | 0 | gint VIndex = b - VBase; |
258 | 0 | gint TIndex = b - TBase; |
259 | |
|
260 | 0 | if (0 <= LIndex && LIndex < LCount |
261 | 0 | && 0 <= VIndex && VIndex < VCount) |
262 | 0 | { |
263 | 0 | *result = SBase + (LIndex * VCount + VIndex) * TCount; |
264 | 0 | return TRUE; |
265 | 0 | } |
266 | 0 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 |
267 | 0 | && 0 < TIndex && TIndex < TCount) |
268 | 0 | { |
269 | 0 | *result = a + TIndex; |
270 | 0 | return TRUE; |
271 | 0 | } |
272 | | |
273 | 0 | return FALSE; |
274 | 0 | } |
275 | | |
/* Composition index for a character on page Page.
 *
 * compose_table[Page] is either a value >= G_UNICODE_MAX_TABLE_INDEX, in
 * which case the index is that value minus G_UNICODE_MAX_TABLE_INDEX for
 * the whole page, or a row number in compose_data with Char selecting the
 * column. Page is evaluated up to three times.
 */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char, or 0 when its page lies beyond
 * COMPOSE_TABLE_LAST.
 *
 * Char is parenthesised at every use so that an argument containing an
 * operator of lower precedence than >> (for example `x | y`) is range-checked
 * as a whole. Char is evaluated more than once, so it must not have side
 * effects.
 */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
283 | | |
284 | | static gboolean |
285 | | combine (gunichar a, |
286 | | gunichar b, |
287 | | gunichar *result) |
288 | 0 | { |
289 | 0 | gushort index_a, index_b; |
290 | |
|
291 | 0 | if (combine_hangul (a, b, result)) |
292 | 0 | return TRUE; |
293 | | |
294 | 0 | index_a = COMPOSE_INDEX(a); |
295 | |
|
296 | 0 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
297 | 0 | { |
298 | 0 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
299 | 0 | { |
300 | 0 | *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
301 | 0 | return TRUE; |
302 | 0 | } |
303 | 0 | else |
304 | 0 | return FALSE; |
305 | 0 | } |
306 | | |
307 | 0 | index_b = COMPOSE_INDEX(b); |
308 | |
|
309 | 0 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
310 | 0 | { |
311 | 0 | if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
312 | 0 | { |
313 | 0 | *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
314 | 0 | return TRUE; |
315 | 0 | } |
316 | 0 | else |
317 | 0 | return FALSE; |
318 | 0 | } |
319 | | |
320 | 0 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && |
321 | 0 | index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START) |
322 | 0 | { |
323 | 0 | gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; |
324 | |
|
325 | 0 | if (res) |
326 | 0 | { |
327 | 0 | *result = res; |
328 | 0 | return TRUE; |
329 | 0 | } |
330 | 0 | } |
331 | | |
332 | 0 | return FALSE; |
333 | 0 | } |
334 | | |
/* Normalizes the UTF-8 string str and returns the result as a newly
 * allocated (g_new), zero-terminated array of gunichar.
 *
 * str:     input text
 * max_len: number of bytes of str to look at, or negative to read up to the
 *          terminating NUL
 * mode:    NFKC/NFKD select compatibility decomposition, NFC/NFKC select the
 *          final composition step
 *
 * The work is done in three stages: a counting pass that also validates the
 * input and sizes the buffer, a filling pass that decomposes and canonically
 * orders, and an optional in-place composition pass.
 *
 * Returns NULL (before anything is allocated) when a multibyte character is
 * cut short by the NUL terminator or by max_len, or when g_utf8_get_char()
 * reports (gunichar) -1.
 */
gunichar *
_g_utf8_normalize_wc (const gchar    *str,
                      gssize          max_len,
                      GNormalizeMode  mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
                        mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC ||
                         mode == G_NORMALIZE_NFKC);

  /* Pass 1: validate the input and count how many characters the fully
   * decomposed string will contain. */
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
      const char *next, *between;
      gunichar wc;

      next = g_utf8_next_char (p);
      /* Avoid reading truncated multibyte characters
         which run past the end of the buffer */
      if (max_len < 0)
        {
          /* Does the character contain a NUL terminator? */
          for (between = &p[1]; between < next; between++)
            {
              if (G_UNLIKELY (!*between))
                return NULL;
            }
        }
      else
        {
          if (G_UNLIKELY (next > str + max_len))
            return NULL;
        }
      wc = g_utf8_get_char (p);

      if (G_UNLIKELY (wc == (gunichar) -1))
        {
          return NULL;
        }
      else if (wc >= SBase && wc < SBase + SCount)
        {
          /* Hangul syllable: ask only for the decomposed length. */
          gsize result_len;
          decompose_hangul (wc, NULL, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            n_wc += g_utf8_strlen (decomp, -1);
          else
            n_wc++;
        }

      p = next;
    }

  /* One extra element for the terminating zero. */
  wc_buffer = g_new (gunichar, n_wc + 1);

  /* Pass 2: walk the input again, this time writing the decomposed
   * characters. The checks of pass 1 are not repeated here.
   * last_start is the index where the run still awaiting canonical
   * ordering begins. */
  last_start = 0;
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
      const gchar *decomp;
      int cc;
      gsize old_n_wc = n_wc;  /* where this character's expansion starts */

      if (wc >= SBase && wc < SBase + SCount)
        {
          gsize result_len;
          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            {
              const char *pd;
              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
                wc_buffer[n_wc++] = g_utf8_get_char (pd);
            }
          else
            wc_buffer[n_wc++] = wc;
        }

      if (n_wc > 0)
        {
          cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

          /* The expansion begins with a class-0 character: order the run
           * collected so far and start a new run at this character. */
          if (cc == 0)
            {
              g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
              last_start = old_n_wc;
            }
        }

      p = g_utf8_next_char (p);
    }

  /* Order whatever run is still pending at the end of the input. */
  if (n_wc > 0)
    {
      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
      /* This value is not read again; the composition block below resets
       * last_start before using it. */
      last_start = n_wc;
      (void) last_start;
    }

  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */

  if (do_compose && n_wc > 0)
    {
      gsize i, j;
      int last_cc = 0;
      /* From here on last_start is the index of the character that later
       * characters are combined into (the most recent class-0 character). */
      last_start = 0;

      for (i = 0; i < n_wc; i++)
        {
          int cc = COMBINING_CLASS (wc_buffer[i]);

          /* Composition is attempted only when the preceding character has
           * class 0 or a class lower than the current one. */
          if (i > 0 &&
              (last_cc == 0 || last_cc < cc) &&
              combine (wc_buffer[last_start], wc_buffer[i],
                       &wc_buffer[last_start]))
            {
              /* wc_buffer[i] was absorbed: close the gap by shifting the
               * tail one slot to the left. */
              for (j = i + 1; j < n_wc; j++)
                wc_buffer[j-1] = wc_buffer[j];
              n_wc--;
              /* Step back so that the loop's i++ lands on the same slot,
               * which now holds the next character. */
              i--;

              if (i == last_start)
                last_cc = 0;
              else
                last_cc = COMBINING_CLASS (wc_buffer[i-1]);

              continue;
            }

          if (cc == 0)
            last_start = i;

          last_cc = cc;
        }
    }

  /* Terminate again: composition may have shortened the string. */
  wc_buffer[n_wc] = 0;

  return wc_buffer;
}
495 | | |
496 | | /** |
497 | | * g_utf8_normalize: |
498 | | * @str: a UTF-8 encoded string. |
499 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
500 | | * @mode: the type of normalization to perform. |
501 | | * |
502 | | * Converts a string into canonical form, standardizing |
503 | | * such issues as whether a character with an accent |
504 | | * is represented as a base character and combining |
505 | | * accent or as a single precomposed character. The |
506 | | * string has to be valid UTF-8, otherwise %NULL is |
507 | | * returned. You should generally call g_utf8_normalize() |
508 | | * before comparing two Unicode strings. |
509 | | * |
510 | | * The normalization mode %G_NORMALIZE_DEFAULT only |
511 | | * standardizes differences that do not affect the |
512 | | * text content, such as the above-mentioned accent |
513 | | * representation. %G_NORMALIZE_ALL also standardizes |
514 | | * the "compatibility" characters in Unicode, such |
515 | | * as SUPERSCRIPT THREE to the standard forms |
516 | | * (in this case DIGIT THREE). Formatting information |
517 | | * may be lost but for most text operations such |
518 | | * characters should be considered the same. |
519 | | * |
520 | | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
521 | | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
522 | | * but returned a result with composed forms rather |
523 | | * than a maximally decomposed form. This is often |
524 | | * useful if you intend to convert the string to |
525 | | * a legacy encoding or pass it to a system with |
526 | | * less capable Unicode handling. |
527 | | * |
528 | | * Returns: (nullable): a newly allocated string, that |
529 | | * is the normalized form of @str, or %NULL if @str |
530 | | * is not valid UTF-8. |
531 | | **/ |
532 | | gchar * |
533 | | g_utf8_normalize (const gchar *str, |
534 | | gssize len, |
535 | | GNormalizeMode mode) |
536 | 0 | { |
537 | 0 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); |
538 | 0 | gchar *result = NULL; |
539 | |
|
540 | 0 | if (G_LIKELY (result_wc != NULL)) |
541 | 0 | { |
542 | 0 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); |
543 | 0 | g_free (result_wc); |
544 | 0 | } |
545 | |
|
546 | 0 | return result; |
547 | 0 | } |
548 | | |
549 | | static gboolean |
550 | | decompose_hangul_step (gunichar ch, |
551 | | gunichar *a, |
552 | | gunichar *b) |
553 | 0 | { |
554 | 0 | gint SIndex, TIndex; |
555 | |
|
556 | 0 | if (ch < SBase || ch >= SBase + SCount) |
557 | 0 | return FALSE; /* not a hangul syllable */ |
558 | | |
559 | 0 | SIndex = ch - SBase; |
560 | 0 | TIndex = SIndex % TCount; |
561 | |
|
562 | 0 | if (TIndex) |
563 | 0 | { |
564 | | /* split LVT -> LV,T */ |
565 | 0 | *a = ch - TIndex; |
566 | 0 | *b = TBase + TIndex; |
567 | 0 | } |
568 | 0 | else |
569 | 0 | { |
570 | | /* split LV -> L,V */ |
571 | 0 | *a = LBase + SIndex / NCount; |
572 | 0 | *b = VBase + (SIndex % NCount) / TCount; |
573 | 0 | } |
574 | |
|
575 | 0 | return TRUE; |
576 | 0 | } |
577 | | |
578 | | /** |
579 | | * g_unichar_decompose: |
580 | | * @ch: a Unicode character |
581 | | * @a: (out) (not optional): return location for the first component of @ch |
582 | | * @b: (out) (not optional): return location for the second component of @ch |
583 | | * |
584 | | * Performs a single decomposition step of the |
585 | | * Unicode canonical decomposition algorithm. |
586 | | * |
587 | | * This function does not include compatibility |
588 | | * decompositions. It does, however, include algorithmic |
589 | | * Hangul Jamo decomposition, as well as 'singleton' |
590 | | * decompositions which replace a character by a single |
591 | | * other character. In the case of singletons *@b will |
592 | | * be set to zero. |
593 | | * |
594 | | * If @ch is not decomposable, *@a is set to @ch and *@b |
595 | | * is set to zero. |
596 | | * |
597 | | * Note that the way Unicode decomposition pairs are |
598 | | * defined, it is guaranteed that @b would not decompose |
599 | | * further, but @a may itself decompose. To get the full |
600 | | * canonical decomposition for @ch, one would need to |
601 | | * recursively call this function on @a. Or use |
602 | | * g_unichar_fully_decompose(). |
603 | | * |
604 | | * See |
605 | | * [UAX#15](http://unicode.org/reports/tr15/) |
606 | | * for details. |
607 | | * |
608 | | * Returns: %TRUE if the character could be decomposed |
609 | | * |
610 | | * Since: 2.30 |
611 | | */ |
612 | | gboolean |
613 | | g_unichar_decompose (gunichar ch, |
614 | | gunichar *a, |
615 | | gunichar *b) |
616 | 0 | { |
617 | 0 | gint start = 0; |
618 | 0 | gint end = G_N_ELEMENTS (decomp_step_table); |
619 | |
|
620 | 0 | if (decompose_hangul_step (ch, a, b)) |
621 | 0 | return TRUE; |
622 | | |
623 | | /* TODO use bsearch() */ |
624 | 0 | if (ch >= decomp_step_table[start].ch && |
625 | 0 | ch <= decomp_step_table[end - 1].ch) |
626 | 0 | { |
627 | 0 | while (TRUE) |
628 | 0 | { |
629 | 0 | gint half = (start + end) / 2; |
630 | 0 | const decomposition_step *p = &(decomp_step_table[half]); |
631 | 0 | if (ch == p->ch) |
632 | 0 | { |
633 | 0 | *a = p->a; |
634 | 0 | *b = p->b; |
635 | 0 | return TRUE; |
636 | 0 | } |
637 | 0 | else if (half == start) |
638 | 0 | break; |
639 | 0 | else if (ch > p->ch) |
640 | 0 | start = half; |
641 | 0 | else |
642 | 0 | end = half; |
643 | 0 | } |
644 | 0 | } |
645 | | |
646 | 0 | *a = ch; |
647 | 0 | *b = 0; |
648 | |
|
649 | 0 | return FALSE; |
650 | 0 | } |
651 | | |
652 | | /** |
653 | | * g_unichar_compose: |
654 | | * @a: a Unicode character |
655 | | * @b: a Unicode character |
656 | | * @ch: (out) (not optional): return location for the composed character |
657 | | * |
658 | | * Performs a single composition step of the |
659 | | * Unicode canonical composition algorithm. |
660 | | * |
661 | | * This function includes algorithmic Hangul Jamo composition, |
662 | | * but it is not exactly the inverse of g_unichar_decompose(). |
663 | | * No composition can have either of @a or @b equal to zero. |
664 | | * To be precise, this function composes if and only if |
665 | | * there exists a Primary Composite P which is canonically |
666 | | * equivalent to the sequence <@a,@b>. See the Unicode |
667 | | * Standard for the definition of Primary Composite. |
668 | | * |
669 | | * If @a and @b do not compose a new character, @ch is set to zero. |
670 | | * |
671 | | * See |
672 | | * [UAX#15](http://unicode.org/reports/tr15/) |
673 | | * for details. |
674 | | * |
675 | | * Returns: %TRUE if the characters could be composed |
676 | | * |
677 | | * Since: 2.30 |
678 | | */ |
679 | | gboolean |
680 | | g_unichar_compose (gunichar a, |
681 | | gunichar b, |
682 | | gunichar *ch) |
683 | 0 | { |
684 | 0 | if (combine (a, b, ch)) |
685 | 0 | return TRUE; |
686 | | |
687 | 0 | *ch = 0; |
688 | 0 | return FALSE; |
689 | 0 | } |
690 | | |
691 | | /** |
692 | | * g_unichar_fully_decompose: |
693 | | * @ch: a Unicode character. |
694 | | * @compat: whether perform canonical or compatibility decomposition |
695 | | * @result: (optional) (out caller-allocates): location to store decomposed result, or %NULL |
696 | | * @result_len: length of @result |
697 | | * |
698 | | * Computes the canonical or compatibility decomposition of a |
699 | | * Unicode character. For compatibility decomposition, |
700 | | * pass %TRUE for @compat; for canonical decomposition |
701 | | * pass %FALSE for @compat. |
702 | | * |
703 | | * The decomposed sequence is placed in @result. Only up to |
704 | | * @result_len characters are written into @result. The length |
705 | | * of the full decomposition (irrespective of @result_len) is |
706 | | * returned by the function. For canonical decomposition, |
707 | | * currently all decompositions are of length at most 4, but |
708 | | * this may change in the future (very unlikely though). |
709 | | * At any rate, Unicode does guarantee that a buffer of length |
710 | | * 18 is always enough for both compatibility and canonical |
711 | | * decompositions, so that is the size recommended. This is provided |
712 | | * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH. |
713 | | * |
714 | | * See |
715 | | * [UAX#15](http://unicode.org/reports/tr15/) |
716 | | * for details. |
717 | | * |
718 | | * Returns: the length of the full decomposition. |
719 | | * |
720 | | * Since: 2.30 |
721 | | **/ |
722 | | gsize |
723 | | g_unichar_fully_decompose (gunichar ch, |
724 | | gboolean compat, |
725 | | gunichar *result, |
726 | | gsize result_len) |
727 | 0 | { |
728 | 0 | const gchar *decomp; |
729 | 0 | const gchar *p; |
730 | | |
731 | | /* Hangul syllable */ |
732 | 0 | if (ch >= SBase && ch < SBase + SCount) |
733 | 0 | { |
734 | 0 | gsize len, i; |
735 | 0 | gunichar buffer[3]; |
736 | 0 | decompose_hangul (ch, result ? buffer : NULL, &len); |
737 | 0 | if (result) |
738 | 0 | for (i = 0; i < len && i < result_len; i++) |
739 | 0 | result[i] = buffer[i]; |
740 | 0 | return len; |
741 | 0 | } |
742 | 0 | else if ((decomp = find_decomposition (ch, compat)) != NULL) |
743 | 0 | { |
744 | | /* Found it. */ |
745 | 0 | gsize len, i; |
746 | |
|
747 | 0 | len = g_utf8_strlen (decomp, -1); |
748 | |
|
749 | 0 | for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++) |
750 | 0 | result[i] = g_utf8_get_char (p); |
751 | |
|
752 | 0 | return len; |
753 | 0 | } |
754 | | |
755 | | /* Does not decompose */ |
756 | 0 | if (result && result_len >= 1) |
757 | 0 | *result = ch; |
758 | 0 | return 1; |
759 | 0 | } |