/src/glib/glib/gunidecomp.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* decomp.c - Character decomposition. |
2 | | * |
3 | | * Copyright (C) 1999, 2000 Tom Tromey |
4 | | * Copyright 2000 Red Hat, Inc. |
5 | | * |
6 | | * This library is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU Lesser General Public |
8 | | * License as published by the Free Software Foundation; either |
9 | | * version 2.1 of the License, or (at your option) any later version. |
10 | | * |
11 | | * This library is distributed in the hope that it will be useful, |
12 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | | * Lesser General Public License for more details. |
15 | | * |
16 | | * You should have received a copy of the GNU Lesser General Public License |
17 | | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | | */ |
19 | | |
20 | | /** |
21 | | * SECTION:unicode |
22 | | * @Title: Unicode Manipulation |
23 | | * @Short_description: functions operating on Unicode characters and |
24 | | * UTF-8 strings |
25 | | * @See_also: g_locale_to_utf8(), g_locale_from_utf8() |
26 | | * |
27 | | * This section describes a number of functions for dealing with |
28 | | * Unicode characters and strings. There are analogues of the |
29 | | * traditional `ctype.h` character classification and case conversion |
30 | | * functions, UTF-8 analogues of some string utility functions, |
31 | | * functions to perform normalization, case conversion and collation |
32 | | * on UTF-8 strings and finally functions to convert between the UTF-8, |
33 | | * UTF-16 and UCS-4 encodings of Unicode. |
34 | | * |
35 | | * The implementations of the Unicode functions in GLib are based |
36 | | * on the Unicode Character Data tables, which are available from |
37 | | * [www.unicode.org](http://www.unicode.org/). |
38 | | * |
39 | | * * Unicode 4.0 was added in GLib 2.8 |
40 | | * * Unicode 4.1 was added in GLib 2.10 |
41 | | * * Unicode 5.0 was added in GLib 2.12 |
42 | | * * Unicode 5.1 was added in GLib 2.16.3 |
43 | | * * Unicode 6.0 was added in GLib 2.30 |
44 | | * * Unicode 6.1 was added in GLib 2.32 |
45 | | * * Unicode 6.2 was added in GLib 2.36 |
46 | | * * Unicode 6.3 was added in GLib 2.40 |
47 | | * * Unicode 7.0 was added in GLib 2.42 |
48 | | * * Unicode 8.0 was added in GLib 2.48 |
49 | | * * Unicode 9.0 was added in GLib 2.50.1 |
50 | | * * Unicode 10.0 was added in GLib 2.54 |
51 | | * * Unicode 11.0 was added in GLib 2.58 |
52 | | * * Unicode 12.0 was added in GLib 2.62 |
53 | | * * Unicode 12.1 was added in GLib 2.62 |
54 | | * * Unicode 13.0 was added in GLib 2.66 |
55 | | */ |
56 | | |
57 | | #include "config.h" |
58 | | |
59 | | #include <stdlib.h> |
60 | | |
61 | | #include "gunicode.h" |
62 | | #include "gunidecomp.h" |
63 | | #include "gmem.h" |
64 | | #include "gunicomp.h" |
65 | | #include "gunicodeprivate.h" |
66 | | |
67 | | |
/* Canonical combining class lookup.
 *
 * Each table is indexed by page number.  A page entry that is
 * >= G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole page
 * (the entry minus G_UNICODE_MAX_TABLE_INDEX); any smaller entry selects a
 * row of cclass_data[], which is then indexed by the low byte of the
 * character.
 */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part1[Page]][Char]) \
   : (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))

#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part2[Page]][Char]) \
   : (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Part 1 covers everything up to G_UNICODE_LAST_CHAR_PART1, part 2 covers
 * 0xe0000 .. G_UNICODE_LAST_CHAR (pages counted from 0xe0000).  Anything
 * outside both ranges has class 0.  Note that Char is evaluated several
 * times, so it must not have side effects.
 */
#define COMBINING_CLASS(Char) \
  (((Char) > G_UNICODE_LAST_CHAR_PART1) \
   ? (((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR) \
      ? 0 \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)) \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
84 | | |
85 | | /** |
86 | | * g_unichar_combining_class: |
87 | | * @uc: a Unicode character |
88 | | * |
89 | | * Determines the canonical combining class of a Unicode character. |
90 | | * |
91 | | * Returns: the combining class of the character |
92 | | * |
93 | | * Since: 2.14 |
94 | | **/ |
95 | | gint |
96 | | g_unichar_combining_class (gunichar uc) |
97 | 0 | { |
98 | 0 | return COMBINING_CLASS (uc); |
99 | 0 | } |
100 | | |
/* Constants for algorithmic Hangul syllable [de]composition.
 *
 * A precomposed syllable S decomposes as
 *   L = LBase + SIndex / NCount
 *   V = VBase + (SIndex % NCount) / TCount
 *   T = TBase + SIndex % TCount      (only when that remainder is non-zero)
 * with SIndex = S - SBase.
 */
#define SBase 0xAC00                 /* first precomposed syllable */
#define LBase 0x1100                 /* base of the leading parts */
#define VBase 0x1161                 /* base of the vowel parts */
#define TBase 0x11A7                 /* base of the trailing parts; index 0 means "none" */
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)     /* syllables per leading part */
#define SCount (LCount * NCount)     /* total number of precomposed syllables */
111 | | |
112 | | /** |
113 | | * g_unicode_canonical_ordering: |
114 | | * @string: a UCS-4 encoded string. |
115 | | * @len: the maximum length of @string to use. |
116 | | * |
117 | | * Computes the canonical ordering of a string in-place. |
118 | | * This rearranges decomposed characters in the string |
119 | | * according to their combining classes. See the Unicode |
120 | | * manual for more information. |
121 | | **/ |
122 | | void |
123 | | g_unicode_canonical_ordering (gunichar *string, |
124 | | gsize len) |
125 | 0 | { |
126 | 0 | gsize i; |
127 | 0 | int swap = 1; |
128 | |
|
129 | 0 | while (swap) |
130 | 0 | { |
131 | 0 | int last; |
132 | 0 | swap = 0; |
133 | 0 | last = COMBINING_CLASS (string[0]); |
134 | 0 | for (i = 0; i < len - 1; ++i) |
135 | 0 | { |
136 | 0 | int next = COMBINING_CLASS (string[i + 1]); |
137 | 0 | if (next != 0 && last > next) |
138 | 0 | { |
139 | 0 | gsize j; |
140 | | /* Percolate item leftward through string. */ |
141 | 0 | for (j = i + 1; j > 0; --j) |
142 | 0 | { |
143 | 0 | gunichar t; |
144 | 0 | if (COMBINING_CLASS (string[j - 1]) <= next) |
145 | 0 | break; |
146 | 0 | t = string[j]; |
147 | 0 | string[j] = string[j - 1]; |
148 | 0 | string[j - 1] = t; |
149 | 0 | swap = 1; |
150 | 0 | } |
151 | | /* We're re-entering the loop looking at the old |
152 | | character again. */ |
153 | 0 | next = last; |
154 | 0 | } |
155 | 0 | last = next; |
156 | 0 | } |
157 | 0 | } |
158 | 0 | } |
159 | | |
160 | | /* http://www.unicode.org/unicode/reports/tr15/#Hangul |
161 | | * r should be null or have sufficient space. Calling with r == NULL will |
162 | | * only calculate the result_len; however, a buffer with space for three |
163 | | * characters will always be big enough. */ |
164 | | static void |
165 | | decompose_hangul (gunichar s, |
166 | | gunichar *r, |
167 | | gsize *result_len) |
168 | 0 | { |
169 | 0 | gint SIndex = s - SBase; |
170 | 0 | gint TIndex = SIndex % TCount; |
171 | |
|
172 | 0 | if (r) |
173 | 0 | { |
174 | 0 | r[0] = LBase + SIndex / NCount; |
175 | 0 | r[1] = VBase + (SIndex % NCount) / TCount; |
176 | 0 | } |
177 | |
|
178 | 0 | if (TIndex) |
179 | 0 | { |
180 | 0 | if (r) |
181 | 0 | r[2] = TBase + TIndex; |
182 | 0 | *result_len = 3; |
183 | 0 | } |
184 | 0 | else |
185 | 0 | *result_len = 2; |
186 | 0 | } |
187 | | |
188 | | /* returns a pointer to a null-terminated UTF-8 string */ |
189 | | static const gchar * |
190 | | find_decomposition (gunichar ch, |
191 | | gboolean compat) |
192 | 0 | { |
193 | 0 | int start = 0; |
194 | 0 | int end = G_N_ELEMENTS (decomp_table); |
195 | | |
196 | 0 | if (ch >= decomp_table[start].ch && |
197 | 0 | ch <= decomp_table[end - 1].ch) |
198 | 0 | { |
199 | 0 | while (TRUE) |
200 | 0 | { |
201 | 0 | int half = (start + end) / 2; |
202 | 0 | if (ch == decomp_table[half].ch) |
203 | 0 | { |
204 | 0 | int offset; |
205 | |
|
206 | 0 | if (compat) |
207 | 0 | { |
208 | 0 | offset = decomp_table[half].compat_offset; |
209 | 0 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
210 | 0 | offset = decomp_table[half].canon_offset; |
211 | 0 | } |
212 | 0 | else |
213 | 0 | { |
214 | 0 | offset = decomp_table[half].canon_offset; |
215 | 0 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
216 | 0 | return NULL; |
217 | 0 | } |
218 | | |
219 | 0 | return &(decomp_expansion_string[offset]); |
220 | 0 | } |
221 | 0 | else if (half == start) |
222 | 0 | break; |
223 | 0 | else if (ch > decomp_table[half].ch) |
224 | 0 | start = half; |
225 | 0 | else |
226 | 0 | end = half; |
227 | 0 | } |
228 | 0 | } |
229 | | |
230 | 0 | return NULL; |
231 | 0 | } |
232 | | |
233 | | /** |
234 | | * g_unicode_canonical_decomposition: |
235 | | * @ch: a Unicode character. |
236 | | * @result_len: location to store the length of the return value. |
237 | | * |
238 | | * Computes the canonical decomposition of a Unicode character. |
239 | | * |
240 | | * Returns: a newly allocated string of Unicode characters. |
241 | | * @result_len is set to the resulting length of the string. |
242 | | * |
243 | | * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose() |
244 | | * instead. |
245 | | **/ |
246 | | gunichar * |
247 | | g_unicode_canonical_decomposition (gunichar ch, |
248 | | gsize *result_len) |
249 | 0 | { |
250 | 0 | const gchar *decomp; |
251 | 0 | const gchar *p; |
252 | 0 | gunichar *r; |
253 | | |
254 | | /* Hangul syllable */ |
255 | 0 | if (ch >= SBase && ch < SBase + SCount) |
256 | 0 | { |
257 | 0 | decompose_hangul (ch, NULL, result_len); |
258 | 0 | r = g_malloc (*result_len * sizeof (gunichar)); |
259 | 0 | decompose_hangul (ch, r, result_len); |
260 | 0 | } |
261 | 0 | else if ((decomp = find_decomposition (ch, FALSE)) != NULL) |
262 | 0 | { |
263 | | /* Found it. */ |
264 | 0 | int i; |
265 | | |
266 | 0 | *result_len = g_utf8_strlen (decomp, -1); |
267 | 0 | r = g_malloc (*result_len * sizeof (gunichar)); |
268 | | |
269 | 0 | for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++) |
270 | 0 | r[i] = g_utf8_get_char (p); |
271 | 0 | } |
272 | 0 | else |
273 | 0 | { |
274 | | /* Not in our table. */ |
275 | 0 | r = g_malloc (sizeof (gunichar)); |
276 | 0 | *r = ch; |
277 | 0 | *result_len = 1; |
278 | 0 | } |
279 | |
|
280 | 0 | return r; |
281 | 0 | } |
282 | | |
283 | | /* L,V => LV and LV,T => LVT */ |
284 | | static gboolean |
285 | | combine_hangul (gunichar a, |
286 | | gunichar b, |
287 | | gunichar *result) |
288 | 0 | { |
289 | 0 | gint LIndex = a - LBase; |
290 | 0 | gint SIndex = a - SBase; |
291 | |
|
292 | 0 | gint VIndex = b - VBase; |
293 | 0 | gint TIndex = b - TBase; |
294 | |
|
295 | 0 | if (0 <= LIndex && LIndex < LCount |
296 | 0 | && 0 <= VIndex && VIndex < VCount) |
297 | 0 | { |
298 | 0 | *result = SBase + (LIndex * VCount + VIndex) * TCount; |
299 | 0 | return TRUE; |
300 | 0 | } |
301 | 0 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 |
302 | 0 | && 0 < TIndex && TIndex < TCount) |
303 | 0 | { |
304 | 0 | *result = a + TIndex; |
305 | 0 | return TRUE; |
306 | 0 | } |
307 | | |
308 | 0 | return FALSE; |
309 | 0 | } |
310 | | |
/* Composition index lookup.
 *
 * compose_table is indexed by page (character >> 8).  An entry that is
 * >= G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the whole page
 * (the entry minus G_UNICODE_MAX_TABLE_INDEX); any smaller entry selects a
 * row of compose_data[], indexed by the low byte of the character.
 */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Characters on pages beyond COMPOSE_TABLE_LAST have index 0.  Every use
 * of Char is parenthesized so that expression arguments expand correctly;
 * Char is still evaluated more than once and must be free of side effects.
 */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
318 | | |
319 | | static gboolean |
320 | | combine (gunichar a, |
321 | | gunichar b, |
322 | | gunichar *result) |
323 | 0 | { |
324 | 0 | gushort index_a, index_b; |
325 | |
|
326 | 0 | if (combine_hangul (a, b, result)) |
327 | 0 | return TRUE; |
328 | | |
329 | 0 | index_a = COMPOSE_INDEX(a); |
330 | |
|
331 | 0 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
332 | 0 | { |
333 | 0 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
334 | 0 | { |
335 | 0 | *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
336 | 0 | return TRUE; |
337 | 0 | } |
338 | 0 | else |
339 | 0 | return FALSE; |
340 | 0 | } |
341 | | |
342 | 0 | index_b = COMPOSE_INDEX(b); |
343 | |
|
344 | 0 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
345 | 0 | { |
346 | 0 | if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
347 | 0 | { |
348 | 0 | *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
349 | 0 | return TRUE; |
350 | 0 | } |
351 | 0 | else |
352 | 0 | return FALSE; |
353 | 0 | } |
354 | | |
355 | 0 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && |
356 | 0 | index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START) |
357 | 0 | { |
358 | 0 | gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; |
359 | |
|
360 | 0 | if (res) |
361 | 0 | { |
362 | 0 | *result = res; |
363 | 0 | return TRUE; |
364 | 0 | } |
365 | 0 | } |
366 | | |
367 | 0 | return FALSE; |
368 | 0 | } |
369 | | |
370 | | gunichar * |
371 | | _g_utf8_normalize_wc (const gchar *str, |
372 | | gssize max_len, |
373 | | GNormalizeMode mode) |
374 | 0 | { |
375 | 0 | gsize n_wc; |
376 | 0 | gunichar *wc_buffer; |
377 | 0 | const char *p; |
378 | 0 | gsize last_start; |
379 | 0 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || |
380 | 0 | mode == G_NORMALIZE_NFKD); |
381 | 0 | gboolean do_compose = (mode == G_NORMALIZE_NFC || |
382 | 0 | mode == G_NORMALIZE_NFKC); |
383 | |
|
384 | 0 | n_wc = 0; |
385 | 0 | p = str; |
386 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
387 | 0 | { |
388 | 0 | const gchar *decomp; |
389 | 0 | gunichar wc = g_utf8_get_char (p); |
390 | |
|
391 | 0 | if (wc >= SBase && wc < SBase + SCount) |
392 | 0 | { |
393 | 0 | gsize result_len; |
394 | 0 | decompose_hangul (wc, NULL, &result_len); |
395 | 0 | n_wc += result_len; |
396 | 0 | } |
397 | 0 | else |
398 | 0 | { |
399 | 0 | decomp = find_decomposition (wc, do_compat); |
400 | |
|
401 | 0 | if (decomp) |
402 | 0 | n_wc += g_utf8_strlen (decomp, -1); |
403 | 0 | else |
404 | 0 | n_wc++; |
405 | 0 | } |
406 | |
|
407 | 0 | p = g_utf8_next_char (p); |
408 | 0 | } |
409 | |
|
410 | 0 | wc_buffer = g_new (gunichar, n_wc + 1); |
411 | |
|
412 | 0 | last_start = 0; |
413 | 0 | n_wc = 0; |
414 | 0 | p = str; |
415 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
416 | 0 | { |
417 | 0 | gunichar wc = g_utf8_get_char (p); |
418 | 0 | const gchar *decomp; |
419 | 0 | int cc; |
420 | 0 | gsize old_n_wc = n_wc; |
421 | | |
422 | 0 | if (wc >= SBase && wc < SBase + SCount) |
423 | 0 | { |
424 | 0 | gsize result_len; |
425 | 0 | decompose_hangul (wc, wc_buffer + n_wc, &result_len); |
426 | 0 | n_wc += result_len; |
427 | 0 | } |
428 | 0 | else |
429 | 0 | { |
430 | 0 | decomp = find_decomposition (wc, do_compat); |
431 | | |
432 | 0 | if (decomp) |
433 | 0 | { |
434 | 0 | const char *pd; |
435 | 0 | for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) |
436 | 0 | wc_buffer[n_wc++] = g_utf8_get_char (pd); |
437 | 0 | } |
438 | 0 | else |
439 | 0 | wc_buffer[n_wc++] = wc; |
440 | 0 | } |
441 | |
|
442 | 0 | if (n_wc > 0) |
443 | 0 | { |
444 | 0 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); |
445 | |
|
446 | 0 | if (cc == 0) |
447 | 0 | { |
448 | 0 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
449 | 0 | last_start = old_n_wc; |
450 | 0 | } |
451 | 0 | } |
452 | | |
453 | 0 | p = g_utf8_next_char (p); |
454 | 0 | } |
455 | |
|
456 | 0 | if (n_wc > 0) |
457 | 0 | { |
458 | 0 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
459 | 0 | last_start = n_wc; |
460 | 0 | (void) last_start; |
461 | 0 | } |
462 | | |
463 | 0 | wc_buffer[n_wc] = 0; |
464 | | |
465 | | /* All decomposed and reordered */ |
466 | |
|
467 | 0 | if (do_compose && n_wc > 0) |
468 | 0 | { |
469 | 0 | gsize i, j; |
470 | 0 | int last_cc = 0; |
471 | 0 | last_start = 0; |
472 | | |
473 | 0 | for (i = 0; i < n_wc; i++) |
474 | 0 | { |
475 | 0 | int cc = COMBINING_CLASS (wc_buffer[i]); |
476 | |
|
477 | 0 | if (i > 0 && |
478 | 0 | (last_cc == 0 || last_cc < cc) && |
479 | 0 | combine (wc_buffer[last_start], wc_buffer[i], |
480 | 0 | &wc_buffer[last_start])) |
481 | 0 | { |
482 | 0 | for (j = i + 1; j < n_wc; j++) |
483 | 0 | wc_buffer[j-1] = wc_buffer[j]; |
484 | 0 | n_wc--; |
485 | 0 | i--; |
486 | | |
487 | 0 | if (i == last_start) |
488 | 0 | last_cc = 0; |
489 | 0 | else |
490 | 0 | last_cc = COMBINING_CLASS (wc_buffer[i-1]); |
491 | | |
492 | 0 | continue; |
493 | 0 | } |
494 | | |
495 | 0 | if (cc == 0) |
496 | 0 | last_start = i; |
497 | |
|
498 | 0 | last_cc = cc; |
499 | 0 | } |
500 | 0 | } |
501 | |
|
502 | 0 | wc_buffer[n_wc] = 0; |
503 | |
|
504 | 0 | return wc_buffer; |
505 | 0 | } |
506 | | |
507 | | /** |
508 | | * g_utf8_normalize: |
509 | | * @str: a UTF-8 encoded string. |
510 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
511 | | * @mode: the type of normalization to perform. |
512 | | * |
513 | | * Converts a string into canonical form, standardizing |
514 | | * such issues as whether a character with an accent |
515 | | * is represented as a base character and combining |
516 | | * accent or as a single precomposed character. The |
517 | | * string has to be valid UTF-8, otherwise %NULL is |
518 | | * returned. You should generally call g_utf8_normalize() |
519 | | * before comparing two Unicode strings. |
520 | | * |
521 | | * The normalization mode %G_NORMALIZE_DEFAULT only |
522 | | * standardizes differences that do not affect the |
523 | | * text content, such as the above-mentioned accent |
524 | | * representation. %G_NORMALIZE_ALL also standardizes |
525 | | * the "compatibility" characters in Unicode, such |
526 | | * as SUPERSCRIPT THREE to the standard forms |
527 | | * (in this case DIGIT THREE). Formatting information |
528 | | * may be lost but for most text operations such |
529 | | * characters should be considered the same. |
530 | | * |
531 | | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
532 | | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
533 | | * but returned a result with composed forms rather |
534 | | * than a maximally decomposed form. This is often |
535 | | * useful if you intend to convert the string to |
536 | | * a legacy encoding or pass it to a system with |
537 | | * less capable Unicode handling. |
538 | | * |
539 | | * Returns: (nullable): a newly allocated string, that |
540 | | * is the normalized form of @str, or %NULL if @str |
541 | | * is not valid UTF-8. |
542 | | **/ |
543 | | gchar * |
544 | | g_utf8_normalize (const gchar *str, |
545 | | gssize len, |
546 | | GNormalizeMode mode) |
547 | 0 | { |
548 | 0 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); |
549 | 0 | gchar *result; |
550 | |
|
551 | 0 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); |
552 | 0 | g_free (result_wc); |
553 | |
|
554 | 0 | return result; |
555 | 0 | } |
556 | | |
557 | | static gboolean |
558 | | decompose_hangul_step (gunichar ch, |
559 | | gunichar *a, |
560 | | gunichar *b) |
561 | 0 | { |
562 | 0 | gint SIndex, TIndex; |
563 | |
|
564 | 0 | if (ch < SBase || ch >= SBase + SCount) |
565 | 0 | return FALSE; /* not a hangul syllable */ |
566 | | |
567 | 0 | SIndex = ch - SBase; |
568 | 0 | TIndex = SIndex % TCount; |
569 | |
|
570 | 0 | if (TIndex) |
571 | 0 | { |
572 | | /* split LVT -> LV,T */ |
573 | 0 | *a = ch - TIndex; |
574 | 0 | *b = TBase + TIndex; |
575 | 0 | } |
576 | 0 | else |
577 | 0 | { |
578 | | /* split LV -> L,V */ |
579 | 0 | *a = LBase + SIndex / NCount; |
580 | 0 | *b = VBase + (SIndex % NCount) / TCount; |
581 | 0 | } |
582 | |
|
583 | 0 | return TRUE; |
584 | 0 | } |
585 | | |
586 | | /** |
587 | | * g_unichar_decompose: |
588 | | * @ch: a Unicode character |
589 | | * @a: (out) (not optional): return location for the first component of @ch |
590 | | * @b: (out) (not optional): return location for the second component of @ch |
591 | | * |
592 | | * Performs a single decomposition step of the |
593 | | * Unicode canonical decomposition algorithm. |
594 | | * |
595 | | * This function does not include compatibility |
596 | | * decompositions. It does, however, include algorithmic |
597 | | * Hangul Jamo decomposition, as well as 'singleton' |
598 | | * decompositions which replace a character by a single |
599 | | * other character. In the case of singletons *@b will |
600 | | * be set to zero. |
601 | | * |
602 | | * If @ch is not decomposable, *@a is set to @ch and *@b |
603 | | * is set to zero. |
604 | | * |
605 | | * Note that the way Unicode decomposition pairs are |
606 | | * defined, it is guaranteed that @b would not decompose |
607 | | * further, but @a may itself decompose. To get the full |
608 | | * canonical decomposition for @ch, one would need to |
609 | | * recursively call this function on @a. Or use |
610 | | * g_unichar_fully_decompose(). |
611 | | * |
612 | | * See |
613 | | * [UAX#15](http://unicode.org/reports/tr15/) |
614 | | * for details. |
615 | | * |
616 | | * Returns: %TRUE if the character could be decomposed |
617 | | * |
618 | | * Since: 2.30 |
619 | | */ |
620 | | gboolean |
621 | | g_unichar_decompose (gunichar ch, |
622 | | gunichar *a, |
623 | | gunichar *b) |
624 | 0 | { |
625 | 0 | gint start = 0; |
626 | 0 | gint end = G_N_ELEMENTS (decomp_step_table); |
627 | |
|
628 | 0 | if (decompose_hangul_step (ch, a, b)) |
629 | 0 | return TRUE; |
630 | | |
631 | | /* TODO use bsearch() */ |
632 | 0 | if (ch >= decomp_step_table[start].ch && |
633 | 0 | ch <= decomp_step_table[end - 1].ch) |
634 | 0 | { |
635 | 0 | while (TRUE) |
636 | 0 | { |
637 | 0 | gint half = (start + end) / 2; |
638 | 0 | const decomposition_step *p = &(decomp_step_table[half]); |
639 | 0 | if (ch == p->ch) |
640 | 0 | { |
641 | 0 | *a = p->a; |
642 | 0 | *b = p->b; |
643 | 0 | return TRUE; |
644 | 0 | } |
645 | 0 | else if (half == start) |
646 | 0 | break; |
647 | 0 | else if (ch > p->ch) |
648 | 0 | start = half; |
649 | 0 | else |
650 | 0 | end = half; |
651 | 0 | } |
652 | 0 | } |
653 | | |
654 | 0 | *a = ch; |
655 | 0 | *b = 0; |
656 | |
|
657 | 0 | return FALSE; |
658 | 0 | } |
659 | | |
660 | | /** |
661 | | * g_unichar_compose: |
662 | | * @a: a Unicode character |
663 | | * @b: a Unicode character |
664 | | * @ch: (out) (not optional): return location for the composed character |
665 | | * |
666 | | * Performs a single composition step of the |
667 | | * Unicode canonical composition algorithm. |
668 | | * |
669 | | * This function includes algorithmic Hangul Jamo composition, |
670 | | * but it is not exactly the inverse of g_unichar_decompose(). |
671 | | * No composition can have either of @a or @b equal to zero. |
672 | | * To be precise, this function composes if and only if |
673 | | * there exists a Primary Composite P which is canonically |
674 | | * equivalent to the sequence <@a,@b>. See the Unicode |
675 | | * Standard for the definition of Primary Composite. |
676 | | * |
677 | | * If @a and @b do not compose a new character, @ch is set to zero. |
678 | | * |
679 | | * See |
680 | | * [UAX#15](http://unicode.org/reports/tr15/) |
681 | | * for details. |
682 | | * |
683 | | * Returns: %TRUE if the characters could be composed |
684 | | * |
685 | | * Since: 2.30 |
686 | | */ |
687 | | gboolean |
688 | | g_unichar_compose (gunichar a, |
689 | | gunichar b, |
690 | | gunichar *ch) |
691 | 0 | { |
692 | 0 | if (combine (a, b, ch)) |
693 | 0 | return TRUE; |
694 | | |
695 | 0 | *ch = 0; |
696 | 0 | return FALSE; |
697 | 0 | } |
698 | | |
699 | | /** |
700 | | * g_unichar_fully_decompose: |
701 | | * @ch: a Unicode character. |
702 | | * @compat: whether perform canonical or compatibility decomposition |
703 | | * @result: (optional) (out caller-allocates): location to store decomposed result, or %NULL |
704 | | * @result_len: length of @result |
705 | | * |
706 | | * Computes the canonical or compatibility decomposition of a |
707 | | * Unicode character. For compatibility decomposition, |
708 | | * pass %TRUE for @compat; for canonical decomposition |
709 | | * pass %FALSE for @compat. |
710 | | * |
711 | | * The decomposed sequence is placed in @result. Only up to |
712 | | * @result_len characters are written into @result. The length |
713 | | * of the full decomposition (irrespective of @result_len) is |
714 | | * returned by the function. For canonical decomposition, |
715 | | * currently all decompositions are of length at most 4, but |
716 | | * this may change in the future (very unlikely though). |
717 | | * At any rate, Unicode does guarantee that a buffer of length |
718 | | * 18 is always enough for both compatibility and canonical |
719 | | * decompositions, so that is the size recommended. This is provided |
720 | | * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH. |
721 | | * |
722 | | * See |
723 | | * [UAX#15](http://unicode.org/reports/tr15/) |
724 | | * for details. |
725 | | * |
726 | | * Returns: the length of the full decomposition. |
727 | | * |
728 | | * Since: 2.30 |
729 | | **/ |
730 | | gsize |
731 | | g_unichar_fully_decompose (gunichar ch, |
732 | | gboolean compat, |
733 | | gunichar *result, |
734 | | gsize result_len) |
735 | 0 | { |
736 | 0 | const gchar *decomp; |
737 | 0 | const gchar *p; |
738 | | |
739 | | /* Hangul syllable */ |
740 | 0 | if (ch >= SBase && ch < SBase + SCount) |
741 | 0 | { |
742 | 0 | gsize len, i; |
743 | 0 | gunichar buffer[3]; |
744 | 0 | decompose_hangul (ch, result ? buffer : NULL, &len); |
745 | 0 | if (result) |
746 | 0 | for (i = 0; i < len && i < result_len; i++) |
747 | 0 | result[i] = buffer[i]; |
748 | 0 | return len; |
749 | 0 | } |
750 | 0 | else if ((decomp = find_decomposition (ch, compat)) != NULL) |
751 | 0 | { |
752 | | /* Found it. */ |
753 | 0 | gsize len, i; |
754 | |
|
755 | 0 | len = g_utf8_strlen (decomp, -1); |
756 | |
|
757 | 0 | for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++) |
758 | 0 | result[i] = g_utf8_get_char (p); |
759 | |
|
760 | 0 | return len; |
761 | 0 | } |
762 | | |
763 | | /* Does not decompose */ |
764 | 0 | if (result && result_len >= 1) |
765 | 0 | *result = ch; |
766 | 0 | return 1; |
767 | 0 | } |