/src/irssi/subprojects/glib-2.74.3/glib/gunidecomp.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* decomp.c - Character decomposition. |
2 | | * |
3 | | * Copyright (C) 1999, 2000 Tom Tromey |
4 | | * Copyright 2000 Red Hat, Inc. |
5 | | * |
6 | | * SPDX-License-Identifier: LGPL-2.1-or-later |
7 | | * |
8 | | * This library is free software; you can redistribute it and/or |
9 | | * modify it under the terms of the GNU Lesser General Public |
10 | | * License as published by the Free Software Foundation; either |
11 | | * version 2.1 of the License, or (at your option) any later version. |
12 | | * |
13 | | * This library is distributed in the hope that it will be useful, |
14 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
16 | | * Lesser General Public License for more details. |
17 | | * |
18 | | * You should have received a copy of the GNU Lesser General Public License |
19 | | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
20 | | */ |
21 | | |
22 | | /** |
23 | | * SECTION:unicode |
24 | | * @Title: Unicode Manipulation |
25 | | * @Short_description: functions operating on Unicode characters and |
26 | | * UTF-8 strings |
27 | | * @See_also: g_locale_to_utf8(), g_locale_from_utf8() |
28 | | * |
29 | | * This section describes a number of functions for dealing with |
30 | | * Unicode characters and strings. There are analogues of the |
31 | | * traditional `ctype.h` character classification and case conversion |
32 | | * functions, UTF-8 analogues of some string utility functions, |
33 | | * functions to perform normalization, case conversion and collation |
34 | | * on UTF-8 strings and finally functions to convert between the UTF-8, |
35 | | * UTF-16 and UCS-4 encodings of Unicode. |
36 | | * |
37 | | * The implementations of the Unicode functions in GLib are based |
38 | | * on the Unicode Character Data tables, which are available from |
39 | | * [www.unicode.org](http://www.unicode.org/). |
40 | | * |
41 | | * * Unicode 4.0 was added in GLib 2.8 |
42 | | * * Unicode 4.1 was added in GLib 2.10 |
43 | | * * Unicode 5.0 was added in GLib 2.12 |
44 | | * * Unicode 5.1 was added in GLib 2.16.3 |
45 | | * * Unicode 6.0 was added in GLib 2.30 |
46 | | * * Unicode 6.1 was added in GLib 2.32 |
47 | | * * Unicode 6.2 was added in GLib 2.36 |
48 | | * * Unicode 6.3 was added in GLib 2.40 |
49 | | * * Unicode 7.0 was added in GLib 2.42 |
50 | | * * Unicode 8.0 was added in GLib 2.48 |
51 | | * * Unicode 9.0 was added in GLib 2.50.1 |
52 | | * * Unicode 10.0 was added in GLib 2.54 |
53 | | * * Unicode 11.0 was added in GLib 2.58 |
54 | | * * Unicode 12.0 was added in GLib 2.62 |
55 | | * * Unicode 12.1 was added in GLib 2.62 |
56 | | * * Unicode 13.0 was added in GLib 2.66 |
57 | | */ |
58 | | |
59 | | #include "config.h" |
60 | | |
61 | | #include <stdlib.h> |
62 | | |
63 | | #include "gunicode.h" |
64 | | #include "gunidecomp.h" |
65 | | #include "gmem.h" |
66 | | #include "gunicomp.h" |
67 | | #include "gunicodeprivate.h" |
68 | | |
69 | | |
/* Canonical combining class lookup.
 *
 * combining_class_table_part1 and combining_class_table_part2 hold one entry
 * per 256-character page.  An entry that is >= G_UNICODE_MAX_TABLE_INDEX
 * stands for a single class shared by the whole page (the class is the entry
 * minus G_UNICODE_MAX_TABLE_INDEX); any smaller entry selects a row of
 * cclass_data, which is then indexed with the low byte of the character.
 *
 * Part 1 covers characters up to G_UNICODE_LAST_CHAR_PART1, part 2 covers
 * 0xe0000 .. G_UNICODE_LAST_CHAR; everything else has class 0.
 *
 * Note: these macros may evaluate their arguments more than once.
 */
#define CC_PART1(PageNo, LowByte) \
  ((combining_class_table_part1[PageNo] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part1[PageNo]][LowByte]) \
   : (combining_class_table_part1[PageNo] - G_UNICODE_MAX_TABLE_INDEX))

#define CC_PART2(PageNo, LowByte) \
  ((combining_class_table_part2[PageNo] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part2[PageNo]][LowByte]) \
   : (combining_class_table_part2[PageNo] - G_UNICODE_MAX_TABLE_INDEX))

#define COMBINING_CLASS(Ch) \
  (((Ch) > G_UNICODE_LAST_CHAR_PART1) \
   ? (((Ch) >= 0xe0000 && (Ch) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Ch) - 0xe0000) >> 8, (Ch) & 0xff) \
      : 0) \
   : CC_PART1 ((Ch) >> 8, (Ch) & 0xff))
86 | | |
87 | | /** |
88 | | * g_unichar_combining_class: |
89 | | * @uc: a Unicode character |
90 | | * |
91 | | * Determines the canonical combining class of a Unicode character. |
92 | | * |
93 | | * Returns: the combining class of the character |
94 | | * |
95 | | * Since: 2.14 |
96 | | **/ |
97 | | gint |
98 | | g_unichar_combining_class (gunichar uc) |
99 | 0 | { |
100 | 0 | return COMBINING_CLASS (uc); |
101 | 0 | } |
102 | | |
/* constants for hangul syllable [de]composition */
#define SBase 0xAC00              /* first precomposed Hangul syllable */
#define LBase 0x1100              /* base added to the leading (L) index */
#define VBase 0x1161              /* base added to the vowel (V) index */
#define TBase 0x11A7              /* base added to the trailing (T) index; index 0 means "no T" */
#define LCount 19                 /* number of L values */
#define VCount 21                 /* number of V values */
#define TCount 28                 /* number of T values, including the "no T" slot */
#define NCount (VCount * TCount)  /* syllables per L value */
#define SCount (LCount * NCount)  /* total number of precomposed syllables */
113 | | |
114 | | /** |
115 | | * g_unicode_canonical_ordering: |
116 | | * @string: a UCS-4 encoded string. |
117 | | * @len: the maximum length of @string to use. |
118 | | * |
119 | | * Computes the canonical ordering of a string in-place. |
120 | | * This rearranges decomposed characters in the string |
121 | | * according to their combining classes. See the Unicode |
122 | | * manual for more information. |
123 | | **/ |
124 | | void |
125 | | g_unicode_canonical_ordering (gunichar *string, |
126 | | gsize len) |
127 | 0 | { |
128 | 0 | gsize i; |
129 | 0 | int swap = 1; |
130 | |
|
131 | 0 | while (swap) |
132 | 0 | { |
133 | 0 | int last; |
134 | 0 | swap = 0; |
135 | 0 | last = COMBINING_CLASS (string[0]); |
136 | 0 | for (i = 0; i < len - 1; ++i) |
137 | 0 | { |
138 | 0 | int next = COMBINING_CLASS (string[i + 1]); |
139 | 0 | if (next != 0 && last > next) |
140 | 0 | { |
141 | 0 | gsize j; |
142 | | /* Percolate item leftward through string. */ |
143 | 0 | for (j = i + 1; j > 0; --j) |
144 | 0 | { |
145 | 0 | gunichar t; |
146 | 0 | if (COMBINING_CLASS (string[j - 1]) <= next) |
147 | 0 | break; |
148 | 0 | t = string[j]; |
149 | 0 | string[j] = string[j - 1]; |
150 | 0 | string[j - 1] = t; |
151 | 0 | swap = 1; |
152 | 0 | } |
153 | | /* We're re-entering the loop looking at the old |
154 | | character again. */ |
155 | 0 | next = last; |
156 | 0 | } |
157 | 0 | last = next; |
158 | 0 | } |
159 | 0 | } |
160 | 0 | } |
161 | | |
162 | | /* http://www.unicode.org/unicode/reports/tr15/#Hangul |
163 | | * r should be null or have sufficient space. Calling with r == NULL will |
164 | | * only calculate the result_len; however, a buffer with space for three |
165 | | * characters will always be big enough. */ |
166 | | static void |
167 | | decompose_hangul (gunichar s, |
168 | | gunichar *r, |
169 | | gsize *result_len) |
170 | 0 | { |
171 | 0 | gint SIndex = s - SBase; |
172 | 0 | gint TIndex = SIndex % TCount; |
173 | |
|
174 | 0 | if (r) |
175 | 0 | { |
176 | 0 | r[0] = LBase + SIndex / NCount; |
177 | 0 | r[1] = VBase + (SIndex % NCount) / TCount; |
178 | 0 | } |
179 | |
|
180 | 0 | if (TIndex) |
181 | 0 | { |
182 | 0 | if (r) |
183 | 0 | r[2] = TBase + TIndex; |
184 | 0 | *result_len = 3; |
185 | 0 | } |
186 | 0 | else |
187 | 0 | *result_len = 2; |
188 | 0 | } |
189 | | |
190 | | /* returns a pointer to a null-terminated UTF-8 string */ |
191 | | static const gchar * |
192 | | find_decomposition (gunichar ch, |
193 | | gboolean compat) |
194 | 0 | { |
195 | 0 | int start = 0; |
196 | 0 | int end = G_N_ELEMENTS (decomp_table); |
197 | | |
198 | 0 | if (ch >= decomp_table[start].ch && |
199 | 0 | ch <= decomp_table[end - 1].ch) |
200 | 0 | { |
201 | 0 | while (TRUE) |
202 | 0 | { |
203 | 0 | int half = (start + end) / 2; |
204 | 0 | if (ch == decomp_table[half].ch) |
205 | 0 | { |
206 | 0 | int offset; |
207 | |
|
208 | 0 | if (compat) |
209 | 0 | { |
210 | 0 | offset = decomp_table[half].compat_offset; |
211 | 0 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
212 | 0 | offset = decomp_table[half].canon_offset; |
213 | 0 | } |
214 | 0 | else |
215 | 0 | { |
216 | 0 | offset = decomp_table[half].canon_offset; |
217 | 0 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
218 | 0 | return NULL; |
219 | 0 | } |
220 | | |
221 | 0 | return &(decomp_expansion_string[offset]); |
222 | 0 | } |
223 | 0 | else if (half == start) |
224 | 0 | break; |
225 | 0 | else if (ch > decomp_table[half].ch) |
226 | 0 | start = half; |
227 | 0 | else |
228 | 0 | end = half; |
229 | 0 | } |
230 | 0 | } |
231 | | |
232 | 0 | return NULL; |
233 | 0 | } |
234 | | |
235 | | /** |
236 | | * g_unicode_canonical_decomposition: |
237 | | * @ch: a Unicode character. |
238 | | * @result_len: location to store the length of the return value. |
239 | | * |
240 | | * Computes the canonical decomposition of a Unicode character. |
241 | | * |
242 | | * Returns: a newly allocated string of Unicode characters. |
243 | | * @result_len is set to the resulting length of the string. |
244 | | * |
245 | | * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose() |
246 | | * instead. |
247 | | **/ |
248 | | gunichar * |
249 | | g_unicode_canonical_decomposition (gunichar ch, |
250 | | gsize *result_len) |
251 | 0 | { |
252 | 0 | const gchar *decomp; |
253 | 0 | const gchar *p; |
254 | 0 | gunichar *r; |
255 | | |
256 | | /* Hangul syllable */ |
257 | 0 | if (ch >= SBase && ch < SBase + SCount) |
258 | 0 | { |
259 | 0 | decompose_hangul (ch, NULL, result_len); |
260 | 0 | r = g_malloc (*result_len * sizeof (gunichar)); |
261 | 0 | decompose_hangul (ch, r, result_len); |
262 | 0 | } |
263 | 0 | else if ((decomp = find_decomposition (ch, FALSE)) != NULL) |
264 | 0 | { |
265 | | /* Found it. */ |
266 | 0 | int i; |
267 | | |
268 | 0 | *result_len = g_utf8_strlen (decomp, -1); |
269 | 0 | r = g_malloc (*result_len * sizeof (gunichar)); |
270 | | |
271 | 0 | for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++) |
272 | 0 | r[i] = g_utf8_get_char (p); |
273 | 0 | } |
274 | 0 | else |
275 | 0 | { |
276 | | /* Not in our table. */ |
277 | 0 | r = g_malloc (sizeof (gunichar)); |
278 | 0 | *r = ch; |
279 | 0 | *result_len = 1; |
280 | 0 | } |
281 | |
|
282 | 0 | return r; |
283 | 0 | } |
284 | | |
285 | | /* L,V => LV and LV,T => LVT */ |
286 | | static gboolean |
287 | | combine_hangul (gunichar a, |
288 | | gunichar b, |
289 | | gunichar *result) |
290 | 0 | { |
291 | 0 | gint LIndex = a - LBase; |
292 | 0 | gint SIndex = a - SBase; |
293 | |
|
294 | 0 | gint VIndex = b - VBase; |
295 | 0 | gint TIndex = b - TBase; |
296 | |
|
297 | 0 | if (0 <= LIndex && LIndex < LCount |
298 | 0 | && 0 <= VIndex && VIndex < VCount) |
299 | 0 | { |
300 | 0 | *result = SBase + (LIndex * VCount + VIndex) * TCount; |
301 | 0 | return TRUE; |
302 | 0 | } |
303 | 0 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 |
304 | 0 | && 0 < TIndex && TIndex < TCount) |
305 | 0 | { |
306 | 0 | *result = a + TIndex; |
307 | 0 | return TRUE; |
308 | 0 | } |
309 | | |
310 | 0 | return FALSE; |
311 | 0 | } |
312 | | |
/* Composition index lookup.
 *
 * compose_table holds one entry per 256-character page.  An entry that is
 * >= G_UNICODE_MAX_TABLE_INDEX stands for a single index shared by the whole
 * page (the entry minus G_UNICODE_MAX_TABLE_INDEX); any smaller entry selects
 * a row of compose_data, indexed with the low byte of the character.
 *
 * COMPOSE_INDEX yields 0 for characters whose page lies beyond
 * COMPOSE_TABLE_LAST.  Every use of its argument is parenthesized so that
 * an expression argument is grouped correctly.
 *
 * Note: these macros may evaluate their arguments more than once.
 */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
320 | | |
321 | | static gboolean |
322 | | combine (gunichar a, |
323 | | gunichar b, |
324 | | gunichar *result) |
325 | 0 | { |
326 | 0 | gushort index_a, index_b; |
327 | |
|
328 | 0 | if (combine_hangul (a, b, result)) |
329 | 0 | return TRUE; |
330 | | |
331 | 0 | index_a = COMPOSE_INDEX(a); |
332 | |
|
333 | 0 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
334 | 0 | { |
335 | 0 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
336 | 0 | { |
337 | 0 | *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
338 | 0 | return TRUE; |
339 | 0 | } |
340 | 0 | else |
341 | 0 | return FALSE; |
342 | 0 | } |
343 | | |
344 | 0 | index_b = COMPOSE_INDEX(b); |
345 | |
|
346 | 0 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
347 | 0 | { |
348 | 0 | if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
349 | 0 | { |
350 | 0 | *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
351 | 0 | return TRUE; |
352 | 0 | } |
353 | 0 | else |
354 | 0 | return FALSE; |
355 | 0 | } |
356 | | |
357 | 0 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && |
358 | 0 | index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START) |
359 | 0 | { |
360 | 0 | gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; |
361 | |
|
362 | 0 | if (res) |
363 | 0 | { |
364 | 0 | *result = res; |
365 | 0 | return TRUE; |
366 | 0 | } |
367 | 0 | } |
368 | | |
369 | 0 | return FALSE; |
370 | 0 | } |
371 | | |
372 | | gunichar * |
373 | | _g_utf8_normalize_wc (const gchar *str, |
374 | | gssize max_len, |
375 | | GNormalizeMode mode) |
376 | 0 | { |
377 | 0 | gsize n_wc; |
378 | 0 | gunichar *wc_buffer; |
379 | 0 | const char *p; |
380 | 0 | gsize last_start; |
381 | 0 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || |
382 | 0 | mode == G_NORMALIZE_NFKD); |
383 | 0 | gboolean do_compose = (mode == G_NORMALIZE_NFC || |
384 | 0 | mode == G_NORMALIZE_NFKC); |
385 | |
|
386 | 0 | n_wc = 0; |
387 | 0 | p = str; |
388 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
389 | 0 | { |
390 | 0 | const gchar *decomp; |
391 | 0 | gunichar wc = g_utf8_get_char (p); |
392 | |
|
393 | 0 | if (wc >= SBase && wc < SBase + SCount) |
394 | 0 | { |
395 | 0 | gsize result_len; |
396 | 0 | decompose_hangul (wc, NULL, &result_len); |
397 | 0 | n_wc += result_len; |
398 | 0 | } |
399 | 0 | else |
400 | 0 | { |
401 | 0 | decomp = find_decomposition (wc, do_compat); |
402 | |
|
403 | 0 | if (decomp) |
404 | 0 | n_wc += g_utf8_strlen (decomp, -1); |
405 | 0 | else |
406 | 0 | n_wc++; |
407 | 0 | } |
408 | |
|
409 | 0 | p = g_utf8_next_char (p); |
410 | 0 | } |
411 | |
|
412 | 0 | wc_buffer = g_new (gunichar, n_wc + 1); |
413 | |
|
414 | 0 | last_start = 0; |
415 | 0 | n_wc = 0; |
416 | 0 | p = str; |
417 | 0 | while ((max_len < 0 || p < str + max_len) && *p) |
418 | 0 | { |
419 | 0 | gunichar wc = g_utf8_get_char (p); |
420 | 0 | const gchar *decomp; |
421 | 0 | int cc; |
422 | 0 | gsize old_n_wc = n_wc; |
423 | | |
424 | 0 | if (wc >= SBase && wc < SBase + SCount) |
425 | 0 | { |
426 | 0 | gsize result_len; |
427 | 0 | decompose_hangul (wc, wc_buffer + n_wc, &result_len); |
428 | 0 | n_wc += result_len; |
429 | 0 | } |
430 | 0 | else |
431 | 0 | { |
432 | 0 | decomp = find_decomposition (wc, do_compat); |
433 | | |
434 | 0 | if (decomp) |
435 | 0 | { |
436 | 0 | const char *pd; |
437 | 0 | for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) |
438 | 0 | wc_buffer[n_wc++] = g_utf8_get_char (pd); |
439 | 0 | } |
440 | 0 | else |
441 | 0 | wc_buffer[n_wc++] = wc; |
442 | 0 | } |
443 | |
|
444 | 0 | if (n_wc > 0) |
445 | 0 | { |
446 | 0 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); |
447 | |
|
448 | 0 | if (cc == 0) |
449 | 0 | { |
450 | 0 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
451 | 0 | last_start = old_n_wc; |
452 | 0 | } |
453 | 0 | } |
454 | | |
455 | 0 | p = g_utf8_next_char (p); |
456 | 0 | } |
457 | |
|
458 | 0 | if (n_wc > 0) |
459 | 0 | { |
460 | 0 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
461 | 0 | last_start = n_wc; |
462 | 0 | (void) last_start; |
463 | 0 | } |
464 | | |
465 | 0 | wc_buffer[n_wc] = 0; |
466 | | |
467 | | /* All decomposed and reordered */ |
468 | |
|
469 | 0 | if (do_compose && n_wc > 0) |
470 | 0 | { |
471 | 0 | gsize i, j; |
472 | 0 | int last_cc = 0; |
473 | 0 | last_start = 0; |
474 | | |
475 | 0 | for (i = 0; i < n_wc; i++) |
476 | 0 | { |
477 | 0 | int cc = COMBINING_CLASS (wc_buffer[i]); |
478 | |
|
479 | 0 | if (i > 0 && |
480 | 0 | (last_cc == 0 || last_cc < cc) && |
481 | 0 | combine (wc_buffer[last_start], wc_buffer[i], |
482 | 0 | &wc_buffer[last_start])) |
483 | 0 | { |
484 | 0 | for (j = i + 1; j < n_wc; j++) |
485 | 0 | wc_buffer[j-1] = wc_buffer[j]; |
486 | 0 | n_wc--; |
487 | 0 | i--; |
488 | | |
489 | 0 | if (i == last_start) |
490 | 0 | last_cc = 0; |
491 | 0 | else |
492 | 0 | last_cc = COMBINING_CLASS (wc_buffer[i-1]); |
493 | | |
494 | 0 | continue; |
495 | 0 | } |
496 | | |
497 | 0 | if (cc == 0) |
498 | 0 | last_start = i; |
499 | |
|
500 | 0 | last_cc = cc; |
501 | 0 | } |
502 | 0 | } |
503 | |
|
504 | 0 | wc_buffer[n_wc] = 0; |
505 | |
|
506 | 0 | return wc_buffer; |
507 | 0 | } |
508 | | |
509 | | /** |
510 | | * g_utf8_normalize: |
511 | | * @str: a UTF-8 encoded string. |
512 | | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
513 | | * @mode: the type of normalization to perform. |
514 | | * |
515 | | * Converts a string into canonical form, standardizing |
516 | | * such issues as whether a character with an accent |
517 | | * is represented as a base character and combining |
518 | | * accent or as a single precomposed character. The |
519 | | * string has to be valid UTF-8, otherwise %NULL is |
520 | | * returned. You should generally call g_utf8_normalize() |
521 | | * before comparing two Unicode strings. |
522 | | * |
523 | | * The normalization mode %G_NORMALIZE_DEFAULT only |
524 | | * standardizes differences that do not affect the |
525 | | * text content, such as the above-mentioned accent |
526 | | * representation. %G_NORMALIZE_ALL also standardizes |
527 | | * the "compatibility" characters in Unicode, such |
528 | | * as SUPERSCRIPT THREE to the standard forms |
529 | | * (in this case DIGIT THREE). Formatting information |
530 | | * may be lost but for most text operations such |
531 | | * characters should be considered the same. |
532 | | * |
533 | | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
534 | | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
535 | | * but returned a result with composed forms rather |
536 | | * than a maximally decomposed form. This is often |
537 | | * useful if you intend to convert the string to |
538 | | * a legacy encoding or pass it to a system with |
539 | | * less capable Unicode handling. |
540 | | * |
541 | | * Returns: (nullable): a newly allocated string, that |
542 | | * is the normalized form of @str, or %NULL if @str |
543 | | * is not valid UTF-8. |
544 | | **/ |
545 | | gchar * |
546 | | g_utf8_normalize (const gchar *str, |
547 | | gssize len, |
548 | | GNormalizeMode mode) |
549 | 0 | { |
550 | 0 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); |
551 | 0 | gchar *result; |
552 | |
|
553 | 0 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); |
554 | 0 | g_free (result_wc); |
555 | |
|
556 | 0 | return result; |
557 | 0 | } |
558 | | |
559 | | static gboolean |
560 | | decompose_hangul_step (gunichar ch, |
561 | | gunichar *a, |
562 | | gunichar *b) |
563 | 0 | { |
564 | 0 | gint SIndex, TIndex; |
565 | |
|
566 | 0 | if (ch < SBase || ch >= SBase + SCount) |
567 | 0 | return FALSE; /* not a hangul syllable */ |
568 | | |
569 | 0 | SIndex = ch - SBase; |
570 | 0 | TIndex = SIndex % TCount; |
571 | |
|
572 | 0 | if (TIndex) |
573 | 0 | { |
574 | | /* split LVT -> LV,T */ |
575 | 0 | *a = ch - TIndex; |
576 | 0 | *b = TBase + TIndex; |
577 | 0 | } |
578 | 0 | else |
579 | 0 | { |
580 | | /* split LV -> L,V */ |
581 | 0 | *a = LBase + SIndex / NCount; |
582 | 0 | *b = VBase + (SIndex % NCount) / TCount; |
583 | 0 | } |
584 | |
|
585 | 0 | return TRUE; |
586 | 0 | } |
587 | | |
588 | | /** |
589 | | * g_unichar_decompose: |
590 | | * @ch: a Unicode character |
591 | | * @a: (out) (not optional): return location for the first component of @ch |
592 | | * @b: (out) (not optional): return location for the second component of @ch |
593 | | * |
594 | | * Performs a single decomposition step of the |
595 | | * Unicode canonical decomposition algorithm. |
596 | | * |
597 | | * This function does not include compatibility |
598 | | * decompositions. It does, however, include algorithmic |
599 | | * Hangul Jamo decomposition, as well as 'singleton' |
600 | | * decompositions which replace a character by a single |
601 | | * other character. In the case of singletons *@b will |
602 | | * be set to zero. |
603 | | * |
604 | | * If @ch is not decomposable, *@a is set to @ch and *@b |
605 | | * is set to zero. |
606 | | * |
607 | | * Note that the way Unicode decomposition pairs are |
608 | | * defined, it is guaranteed that @b would not decompose |
609 | | * further, but @a may itself decompose. To get the full |
610 | | * canonical decomposition for @ch, one would need to |
611 | | * recursively call this function on @a. Or use |
612 | | * g_unichar_fully_decompose(). |
613 | | * |
614 | | * See |
615 | | * [UAX#15](http://unicode.org/reports/tr15/) |
616 | | * for details. |
617 | | * |
618 | | * Returns: %TRUE if the character could be decomposed |
619 | | * |
620 | | * Since: 2.30 |
621 | | */ |
622 | | gboolean |
623 | | g_unichar_decompose (gunichar ch, |
624 | | gunichar *a, |
625 | | gunichar *b) |
626 | 0 | { |
627 | 0 | gint start = 0; |
628 | 0 | gint end = G_N_ELEMENTS (decomp_step_table); |
629 | |
|
630 | 0 | if (decompose_hangul_step (ch, a, b)) |
631 | 0 | return TRUE; |
632 | | |
633 | | /* TODO use bsearch() */ |
634 | 0 | if (ch >= decomp_step_table[start].ch && |
635 | 0 | ch <= decomp_step_table[end - 1].ch) |
636 | 0 | { |
637 | 0 | while (TRUE) |
638 | 0 | { |
639 | 0 | gint half = (start + end) / 2; |
640 | 0 | const decomposition_step *p = &(decomp_step_table[half]); |
641 | 0 | if (ch == p->ch) |
642 | 0 | { |
643 | 0 | *a = p->a; |
644 | 0 | *b = p->b; |
645 | 0 | return TRUE; |
646 | 0 | } |
647 | 0 | else if (half == start) |
648 | 0 | break; |
649 | 0 | else if (ch > p->ch) |
650 | 0 | start = half; |
651 | 0 | else |
652 | 0 | end = half; |
653 | 0 | } |
654 | 0 | } |
655 | | |
656 | 0 | *a = ch; |
657 | 0 | *b = 0; |
658 | |
|
659 | 0 | return FALSE; |
660 | 0 | } |
661 | | |
662 | | /** |
663 | | * g_unichar_compose: |
664 | | * @a: a Unicode character |
665 | | * @b: a Unicode character |
666 | | * @ch: (out) (not optional): return location for the composed character |
667 | | * |
668 | | * Performs a single composition step of the |
669 | | * Unicode canonical composition algorithm. |
670 | | * |
671 | | * This function includes algorithmic Hangul Jamo composition, |
672 | | * but it is not exactly the inverse of g_unichar_decompose(). |
673 | | * No composition can have either of @a or @b equal to zero. |
674 | | * To be precise, this function composes if and only if |
675 | | * there exists a Primary Composite P which is canonically |
676 | | * equivalent to the sequence <@a,@b>. See the Unicode |
677 | | * Standard for the definition of Primary Composite. |
678 | | * |
679 | | * If @a and @b do not compose a new character, @ch is set to zero. |
680 | | * |
681 | | * See |
682 | | * [UAX#15](http://unicode.org/reports/tr15/) |
683 | | * for details. |
684 | | * |
685 | | * Returns: %TRUE if the characters could be composed |
686 | | * |
687 | | * Since: 2.30 |
688 | | */ |
689 | | gboolean |
690 | | g_unichar_compose (gunichar a, |
691 | | gunichar b, |
692 | | gunichar *ch) |
693 | 0 | { |
694 | 0 | if (combine (a, b, ch)) |
695 | 0 | return TRUE; |
696 | | |
697 | 0 | *ch = 0; |
698 | 0 | return FALSE; |
699 | 0 | } |
700 | | |
701 | | /** |
702 | | * g_unichar_fully_decompose: |
703 | | * @ch: a Unicode character. |
704 | | * @compat: whether perform canonical or compatibility decomposition |
705 | | * @result: (optional) (out caller-allocates): location to store decomposed result, or %NULL |
706 | | * @result_len: length of @result |
707 | | * |
708 | | * Computes the canonical or compatibility decomposition of a |
709 | | * Unicode character. For compatibility decomposition, |
710 | | * pass %TRUE for @compat; for canonical decomposition |
711 | | * pass %FALSE for @compat. |
712 | | * |
713 | | * The decomposed sequence is placed in @result. Only up to |
714 | | * @result_len characters are written into @result. The length |
715 | | * of the full decomposition (irrespective of @result_len) is |
716 | | * returned by the function. For canonical decomposition, |
717 | | * currently all decompositions are of length at most 4, but |
718 | | * this may change in the future (very unlikely though). |
719 | | * At any rate, Unicode does guarantee that a buffer of length |
720 | | * 18 is always enough for both compatibility and canonical |
721 | | * decompositions, so that is the size recommended. This is provided |
722 | | * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH. |
723 | | * |
724 | | * See |
725 | | * [UAX#15](http://unicode.org/reports/tr15/) |
726 | | * for details. |
727 | | * |
728 | | * Returns: the length of the full decomposition. |
729 | | * |
730 | | * Since: 2.30 |
731 | | **/ |
732 | | gsize |
733 | | g_unichar_fully_decompose (gunichar ch, |
734 | | gboolean compat, |
735 | | gunichar *result, |
736 | | gsize result_len) |
737 | 0 | { |
738 | 0 | const gchar *decomp; |
739 | 0 | const gchar *p; |
740 | | |
741 | | /* Hangul syllable */ |
742 | 0 | if (ch >= SBase && ch < SBase + SCount) |
743 | 0 | { |
744 | 0 | gsize len, i; |
745 | 0 | gunichar buffer[3]; |
746 | 0 | decompose_hangul (ch, result ? buffer : NULL, &len); |
747 | 0 | if (result) |
748 | 0 | for (i = 0; i < len && i < result_len; i++) |
749 | 0 | result[i] = buffer[i]; |
750 | 0 | return len; |
751 | 0 | } |
752 | 0 | else if ((decomp = find_decomposition (ch, compat)) != NULL) |
753 | 0 | { |
754 | | /* Found it. */ |
755 | 0 | gsize len, i; |
756 | |
|
757 | 0 | len = g_utf8_strlen (decomp, -1); |
758 | |
|
759 | 0 | for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++) |
760 | 0 | result[i] = g_utf8_get_char (p); |
761 | |
|
762 | 0 | return len; |
763 | 0 | } |
764 | | |
765 | | /* Does not decompose */ |
766 | 0 | if (result && result_len >= 1) |
767 | 0 | *result = ch; |
768 | 0 | return 1; |
769 | 0 | } |