/src/libunistring/lib/uninorm/canonical-decomposition.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Canonical decomposition of Unicode characters. |
2 | | Copyright (C) 2009-2024 Free Software Foundation, Inc. |
3 | | Written by Bruno Haible <bruno@clisp.org>, 2009. |
4 | | |
5 | | This file is free software: you can redistribute it and/or modify |
6 | | it under the terms of the GNU Lesser General Public License as |
7 | | published by the Free Software Foundation; either version 2.1 of the |
8 | | License, or (at your option) any later version. |
9 | | |
10 | | This file is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | GNU Lesser General Public License for more details. |
14 | | |
15 | | You should have received a copy of the GNU Lesser General Public License |
16 | | along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
17 | | |
18 | | #include <config.h> |
19 | | |
20 | | /* Specification. */ |
21 | | #include "uninorm.h" |
22 | | |
23 | | #include <stdlib.h> |
24 | | |
25 | | #include "uninorm/decomposition-table.h" |
26 | | |
27 | | int |
28 | | uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition) |
29 | 2.27M | { |
30 | 2.27M | if (uc >= 0xAC00 && uc < 0xD7A4) |
31 | 6.54k | { |
32 | | /* Hangul syllable. See Unicode standard, chapter 3, section |
33 | | "Hangul Syllable Decomposition", See also the clarification at |
34 | | <https://www.unicode.org/versions/Unicode5.1.0/>, section |
35 | | "Clarification of Hangul Jamo Handling". */ |
36 | 6.54k | unsigned int t; |
37 | | |
38 | 6.54k | uc -= 0xAC00; |
39 | 6.54k | t = uc % 28; |
40 | | |
41 | 6.54k | if (t == 0) |
42 | 3.44k | { |
43 | 3.44k | unsigned int v, l; |
44 | | |
45 | 3.44k | uc = uc / 28; |
46 | 3.44k | v = uc % 21; |
47 | 3.44k | l = uc / 21; |
48 | | |
49 | 3.44k | decomposition[0] = 0x1100 + l; |
50 | 3.44k | decomposition[1] = 0x1161 + v; |
51 | 3.44k | return 2; |
52 | 3.44k | } |
53 | 3.10k | else |
54 | 3.10k | { |
55 | 3.10k | #if 1 /* Return the pairwise decomposition, not the full decomposition. */ |
56 | 3.10k | decomposition[0] = 0xAC00 + uc - t; /* = 0xAC00 + (l * 21 + v) * 28; */ |
57 | 3.10k | decomposition[1] = 0x11A7 + t; |
58 | 3.10k | return 2; |
59 | | #else |
60 | | unsigned int v, l; |
61 | | |
62 | | uc = uc / 28; |
63 | | v = uc % 21; |
64 | | l = uc / 21; |
65 | | |
66 | | decomposition[0] = 0x1100 + l; |
67 | | decomposition[1] = 0x1161 + v; |
68 | | decomposition[2] = 0x11A7 + t; |
69 | | return 3; |
70 | | #endif |
71 | 3.10k | } |
72 | 6.54k | } |
73 | 2.26M | else if (uc < 0x110000) |
74 | 2.26M | { |
75 | 2.26M | unsigned short entry = decomp_index (uc); |
76 | | /* An entry of (unsigned short)(-1) denotes an absent entry. |
77 | | Otherwise, bit 15 of the entry tells whether the decomposition |
78 | | is a canonical one. */ |
79 | 2.26M | if (entry < 0x8000) |
80 | 120k | { |
81 | 120k | const unsigned char *p; |
82 | 120k | unsigned int element; |
83 | 120k | unsigned int length; |
84 | | |
85 | 120k | p = &gl_uninorm_decomp_chars_table[3 * entry]; |
86 | 120k | element = (p[0] << 16) | (p[1] << 8) | p[2]; |
87 | | /* The first element has 5 bits for the decomposition type. */ |
88 | 120k | if (((element >> 18) & 0x1f) != UC_DECOMP_CANONICAL) |
89 | 0 | abort (); |
90 | 120k | length = 1; |
91 | 120k | for (;;) |
92 | 239k | { |
93 | | /* Every element has an 18 bits wide Unicode code point. */ |
94 | 239k | *decomposition = element & 0x3ffff; |
95 | | /* Bit 23 tells whether there are more elements, */ |
96 | 239k | if ((element & (1 << 23)) == 0) |
97 | 120k | break; |
98 | 119k | p += 3; |
99 | 119k | element = (p[0] << 16) | (p[1] << 8) | p[2]; |
100 | 119k | decomposition++; |
101 | 119k | length++; |
102 | 119k | } |
103 | 120k | return length; |
104 | 120k | } |
105 | 2.26M | } |
106 | 2.14M | return -1; |
107 | 2.27M | } |