/src/libunistring/lib/uninorm/canonical-decomposition.c
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* Canonical decomposition of Unicode characters.  | 
2  |  |    Copyright (C) 2009-2023 Free Software Foundation, Inc.  | 
3  |  |    Written by Bruno Haible <bruno@clisp.org>, 2009.  | 
4  |  |  | 
5  |  |    This file is free software: you can redistribute it and/or modify  | 
6  |  |    it under the terms of the GNU Lesser General Public License as  | 
7  |  |    published by the Free Software Foundation; either version 2.1 of the  | 
8  |  |    License, or (at your option) any later version.  | 
9  |  |  | 
10  |  |    This file is distributed in the hope that it will be useful,  | 
11  |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
12  |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  | 
13  |  |    GNU Lesser General Public License for more details.  | 
14  |  |  | 
15  |  |    You should have received a copy of the GNU Lesser General Public License  | 
16  |  |    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */  | 
17  |  |  | 
18  |  | #include <config.h>  | 
19  |  |  | 
20  |  | /* Specification.  */  | 
21  |  | #include "uninorm.h"  | 
22  |  |  | 
23  |  | #include <stdlib.h>  | 
24  |  |  | 
25  |  | #include "uninorm/decomposition-table.h"  | 
26  |  |  | 
27  |  | int  | 
28  |  | uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition)  | 
29  | 21.0M  | { | 
30  | 21.0M  |   if (uc >= 0xAC00 && uc < 0xD7A4)  | 
31  | 10.2k  |     { | 
32  |  |       /* Hangul syllable.  See Unicode standard, chapter 3, section  | 
33  |  |          "Hangul Syllable Decomposition",  See also the clarification at  | 
34  |  |          <https://www.unicode.org/versions/Unicode5.1.0/>, section  | 
35  |  |          "Clarification of Hangul Jamo Handling".  */  | 
36  | 10.2k  |       unsigned int t;  | 
37  |  |  | 
38  | 10.2k  |       uc -= 0xAC00;  | 
39  | 10.2k  |       t = uc % 28;  | 
40  |  |  | 
41  | 10.2k  |       if (t == 0)  | 
42  | 6.23k  |         { | 
43  | 6.23k  |           unsigned int v, l;  | 
44  |  |  | 
45  | 6.23k  |           uc = uc / 28;  | 
46  | 6.23k  |           v = uc % 21;  | 
47  | 6.23k  |           l = uc / 21;  | 
48  |  |  | 
49  | 6.23k  |           decomposition[0] = 0x1100 + l;  | 
50  | 6.23k  |           decomposition[1] = 0x1161 + v;  | 
51  | 6.23k  |           return 2;  | 
52  | 6.23k  |         }  | 
53  | 3.98k  |       else  | 
54  | 3.98k  |         { | 
55  | 3.98k  | #if 1 /* Return the pairwise decomposition, not the full decomposition.  */  | 
56  | 3.98k  |           decomposition[0] = 0xAC00 + uc - t; /* = 0xAC00 + (l * 21 + v) * 28; */  | 
57  | 3.98k  |           decomposition[1] = 0x11A7 + t;  | 
58  | 3.98k  |           return 2;  | 
59  |  | #else  | 
60  |  |           unsigned int v, l;  | 
61  |  |  | 
62  |  |           uc = uc / 28;  | 
63  |  |           v = uc % 21;  | 
64  |  |           l = uc / 21;  | 
65  |  |  | 
66  |  |           decomposition[0] = 0x1100 + l;  | 
67  |  |           decomposition[1] = 0x1161 + v;  | 
68  |  |           decomposition[2] = 0x11A7 + t;  | 
69  |  |           return 3;  | 
70  |  | #endif  | 
71  | 3.98k  |         }  | 
72  | 10.2k  |     }  | 
73  | 20.9M  |   else if (uc < 0x110000)  | 
74  | 20.9M  |     { | 
75  | 20.9M  |       unsigned short entry = decomp_index (uc);  | 
76  |  |       /* An entry of (unsigned short)(-1) denotes an absent entry.  | 
77  |  |          Otherwise, bit 15 of the entry tells whether the decomposition  | 
78  |  |          is a canonical one.  */  | 
79  | 20.9M  |       if (entry < 0x8000)  | 
80  | 21.8k  |         { | 
81  | 21.8k  |           const unsigned char *p;  | 
82  | 21.8k  |           unsigned int element;  | 
83  | 21.8k  |           unsigned int length;  | 
84  |  |  | 
85  | 21.8k  |           p = &gl_uninorm_decomp_chars_table[3 * entry];  | 
86  | 21.8k  |           element = (p[0] << 16) | (p[1] << 8) | p[2];  | 
87  |  |           /* The first element has 5 bits for the decomposition type.  */  | 
88  | 21.8k  |           if (((element >> 18) & 0x1f) != UC_DECOMP_CANONICAL)  | 
89  | 0  |             abort ();  | 
90  | 21.8k  |           length = 1;  | 
91  | 21.8k  |           for (;;)  | 
92  | 43.5k  |             { | 
93  |  |               /* Every element has an 18 bits wide Unicode code point.  */  | 
94  | 43.5k  |               *decomposition = element & 0x3ffff;  | 
95  |  |               /* Bit 23 tells whether there are more elements,  */  | 
96  | 43.5k  |               if ((element & (1 << 23)) == 0)  | 
97  | 21.8k  |                 break;  | 
98  | 21.6k  |               p += 3;  | 
99  | 21.6k  |               element = (p[0] << 16) | (p[1] << 8) | p[2];  | 
100  | 21.6k  |               decomposition++;  | 
101  | 21.6k  |               length++;  | 
102  | 21.6k  |             }  | 
103  | 21.8k  |           return length;  | 
104  | 21.8k  |         }  | 
105  | 20.9M  |     }  | 
106  | 20.9M  |   return -1;  | 
107  | 21.0M  | }  |