/src/gnutls/lib/unistring/uninorm/u-normalize-internal.h
Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | /* Decomposition and composition of Unicode strings.  | 
2  |  |    Copyright (C) 2009-2025 Free Software Foundation, Inc.  | 
3  |  |    Written by Bruno Haible <bruno@clisp.org>, 2009.  | 
4  |  |  | 
5  |  |    This file is free software: you can redistribute it and/or modify  | 
6  |  |    it under the terms of the GNU Lesser General Public License as  | 
7  |  |    published by the Free Software Foundation; either version 2.1 of the  | 
8  |  |    License, or (at your option) any later version.  | 
9  |  |  | 
10  |  |    This file is distributed in the hope that it will be useful,  | 
11  |  |    but WITHOUT ANY WARRANTY; without even the implied warranty of  | 
12  |  |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  | 
13  |  |    GNU Lesser General Public License for more details.  | 
14  |  |  | 
15  |  |    You should have received a copy of the GNU Lesser General Public License  | 
16  |  |    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */  | 
17  |  |  | 
18  |  | UNIT *  | 
19  |  | FUNC (uninorm_t nf, const UNIT *s, size_t n,  | 
20  |  |       UNIT *resultbuf, size_t *lengthp)  | 
21  | 0  | { | 
22  | 0  |   int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;  | 
23  | 0  |   ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;  | 
24  |  |  | 
25  |  |   /* The result being accumulated.  */  | 
26  | 0  |   UNIT *result;  | 
27  | 0  |   size_t length;  | 
28  | 0  |   size_t allocated;  | 
29  |  |   /* The buffer for sorting.  */  | 
30  | 0  |   #define SORTBUF_PREALLOCATED 64  | 
31  | 0  |   struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];  | 
32  | 0  |   struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */  | 
33  | 0  |   size_t sortbuf_allocated;  | 
34  | 0  |   size_t sortbuf_count;  | 
35  |  |  | 
36  |  |   /* Initialize the accumulator.  */  | 
37  | 0  |   if (resultbuf == NULL)  | 
38  | 0  |     { | 
39  | 0  |       result = NULL;  | 
40  | 0  |       allocated = 0;  | 
41  | 0  |     }  | 
42  | 0  |   else  | 
43  | 0  |     { | 
44  | 0  |       result = resultbuf;  | 
45  | 0  |       allocated = *lengthp;  | 
46  | 0  |     }  | 
47  | 0  |   length = 0;  | 
48  |  |  | 
49  |  |   /* Initialize the buffer for sorting.  */  | 
50  | 0  |   sortbuf = sortbuf_preallocated;  | 
51  | 0  |   sortbuf_allocated = SORTBUF_PREALLOCATED;  | 
52  | 0  |   sortbuf_count = 0;  | 
53  |  | 
  | 
54  | 0  |   { | 
55  | 0  |     const UNIT *s_end = s + n;  | 
56  |  | 
  | 
57  | 0  |     for (;;)  | 
58  | 0  |       { | 
59  | 0  |         int count;  | 
60  | 0  |         ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];  | 
61  | 0  |         int decomposed_count;  | 
62  | 0  |         int i;  | 
63  |  | 
  | 
64  | 0  |         if (s < s_end)  | 
65  | 0  |           { | 
66  |  |             /* Fetch the next character.  */  | 
67  | 0  |             count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);  | 
68  | 0  |             decomposed_count = 1;  | 
69  |  |  | 
70  |  |             /* Decompose it, recursively.  | 
71  |  |                It would be possible to precompute the recursive decomposition  | 
72  |  |                and store it in a table.  But this would significantly increase  | 
73  |  |                the size of the decomposition tables, because for example for  | 
74  |  |                U+1FC1 the recursive canonical decomposition and the recursive  | 
75  |  |                compatibility decomposition are different.  */  | 
76  | 0  |             { | 
77  | 0  |               int curr;  | 
78  |  | 
  | 
79  | 0  |               for (curr = 0; curr < decomposed_count; )  | 
80  | 0  |                 { | 
81  |  |                   /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.  | 
82  |  |                      all elements are atomic.  */  | 
83  | 0  |                   ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];  | 
84  | 0  |                   int curr_decomposed_count;  | 
85  |  | 
  | 
86  | 0  |                   curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed);  | 
87  | 0  |                   if (curr_decomposed_count >= 0)  | 
88  | 0  |                     { | 
89  |  |                       /* Move curr_decomposed[0..curr_decomposed_count-1] over  | 
90  |  |                          decomposed[curr], making room.  It's not worth using  | 
91  |  |                          memcpy() here, since the counts are so small.  */  | 
92  | 0  |                       int shift = curr_decomposed_count - 1;  | 
93  |  | 
  | 
94  | 0  |                       if (shift < 0)  | 
95  | 0  |                         abort ();  | 
96  | 0  |                       if (shift > 0)  | 
97  | 0  |                         { | 
98  | 0  |                           int j;  | 
99  |  | 
  | 
100  | 0  |                           decomposed_count += shift;  | 
101  | 0  |                           if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)  | 
102  | 0  |                             abort ();  | 
103  | 0  |                           for (j = decomposed_count - 1 - shift; j > curr; j--)  | 
104  | 0  |                             decomposed[j + shift] = decomposed[j];  | 
105  | 0  |                         }  | 
106  | 0  |                       for (; shift >= 0; shift--)  | 
107  | 0  |                         decomposed[curr + shift] = curr_decomposed[shift];  | 
108  | 0  |                     }  | 
109  | 0  |                   else  | 
110  | 0  |                     { | 
111  |  |                       /* decomposed[curr] is atomic.  */  | 
112  | 0  |                       curr++;  | 
113  | 0  |                     }  | 
114  | 0  |                 }  | 
115  | 0  |             }  | 
116  | 0  |           }  | 
117  | 0  |         else  | 
118  | 0  |           { | 
119  | 0  |             count = 0;  | 
120  | 0  |             decomposed_count = 0;  | 
121  | 0  |           }  | 
122  |  |  | 
123  | 0  |         i = 0;  | 
124  | 0  |         for (;;)  | 
125  | 0  |           { | 
126  | 0  |             ucs4_t uc;  | 
127  | 0  |             int ccc;  | 
128  |  | 
  | 
129  | 0  |             if (s < s_end)  | 
130  | 0  |               { | 
131  |  |                 /* Fetch the next character from the decomposition.  */  | 
132  | 0  |                 if (i == decomposed_count)  | 
133  | 0  |                   break;  | 
134  | 0  |                 uc = decomposed[i];  | 
135  | 0  |                 ccc = uc_combining_class (uc);  | 
136  | 0  |               }  | 
137  | 0  |             else  | 
138  | 0  |               { | 
139  |  |                 /* End of string reached.  */  | 
140  | 0  |                 uc = 0;  | 
141  | 0  |                 ccc = 0;  | 
142  | 0  |               }  | 
143  |  |  | 
144  | 0  |             if (ccc == 0)  | 
145  | 0  |               { | 
146  | 0  |                 size_t j;  | 
147  |  |  | 
148  |  |                 /* Apply the canonical ordering algorithm to the accumulated  | 
149  |  |                    sequence of characters.  */  | 
150  | 0  |                 if (sortbuf_count > 1)  | 
151  | 0  |                   gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count,  | 
152  | 0  |                                                            sortbuf + sortbuf_count);  | 
153  |  | 
  | 
154  | 0  |                 if (composer != NULL)  | 
155  | 0  |                   { | 
156  |  |                     /* Attempt to combine decomposed characters, as specified  | 
157  |  |                        in the Unicode Standard Annex #15 "Unicode Normalization  | 
158  |  |                        Forms".  We need to check  | 
159  |  |                          1. whether the first accumulated character is a  | 
160  |  |                             "starter" (i.e. has ccc = 0).  This is usually the  | 
161  |  |                             case.  But when the string starts with a  | 
162  |  |                             non-starter, the sortbuf also starts with a  | 
163  |  |                             non-starter.  Btw, this check could also be  | 
164  |  |                             omitted, because the composition table has only  | 
165  |  |                             entries (code1, code2) for which code1 is a  | 
166  |  |                             starter; if the first accumulated character is not  | 
167  |  |                             a starter, no lookup will succeed.  | 
168  |  |                          2. If the sortbuf has more than one character, check  | 
169  |  |                             for each of these characters that are not "blocked"  | 
170  |  |                             from the starter (i.e. have a ccc that is higher  | 
171  |  |                             than the ccc of the previous character) whether it  | 
172  |  |                             can be combined with the first character.  | 
173  |  |                          3. If only one character is left in sortbuf, check  | 
174  |  |                             whether it can be combined with the next character  | 
175  |  |                             (also a starter).  */  | 
176  | 0  |                     if (sortbuf_count > 0 && sortbuf[0].ccc == 0)  | 
177  | 0  |                       { | 
178  | 0  |                         for (j = 1; j < sortbuf_count; )  | 
179  | 0  |                           { | 
180  | 0  |                             if (sortbuf[j].ccc > sortbuf[j - 1].ccc)  | 
181  | 0  |                               { | 
182  | 0  |                                 ucs4_t combined =  | 
183  | 0  |                                   composer (sortbuf[0].code, sortbuf[j].code);  | 
184  | 0  |                                 if (combined)  | 
185  | 0  |                                   { | 
186  | 0  |                                     size_t k;  | 
187  |  | 
  | 
188  | 0  |                                     sortbuf[0].code = combined;  | 
189  |  |                                     /* sortbuf[0].ccc = 0, still valid.  */  | 
190  | 0  |                                     for (k = j + 1; k < sortbuf_count; k++)  | 
191  | 0  |                                       sortbuf[k - 1] = sortbuf[k];  | 
192  | 0  |                                     sortbuf_count--;  | 
193  | 0  |                                     continue;  | 
194  | 0  |                                   }  | 
195  | 0  |                               }  | 
196  | 0  |                             j++;  | 
197  | 0  |                           }  | 
198  | 0  |                         if (s < s_end && sortbuf_count == 1)  | 
199  | 0  |                           { | 
200  | 0  |                             ucs4_t combined =  | 
201  | 0  |                               composer (sortbuf[0].code, uc);  | 
202  | 0  |                             if (combined)  | 
203  | 0  |                               { | 
204  | 0  |                                 uc = combined;  | 
205  | 0  |                                 ccc = 0;  | 
206  |  |                                 /* uc could be further combined with subsequent  | 
207  |  |                                    characters.  So don't put it into sortbuf[0] in  | 
208  |  |                                    this round, only in the next round.  */  | 
209  | 0  |                                 sortbuf_count = 0;  | 
210  | 0  |                               }  | 
211  | 0  |                           }  | 
212  | 0  |                       }  | 
213  | 0  |                   }  | 
214  |  | 
  | 
215  | 0  |                 for (j = 0; j < sortbuf_count; j++)  | 
216  | 0  |                   { | 
217  | 0  |                     ucs4_t muc = sortbuf[j].code;  | 
218  |  |  | 
219  |  |                     /* Append muc to the result accumulator.  */  | 
220  | 0  |                     if (length < allocated)  | 
221  | 0  |                       { | 
222  | 0  |                         int ret =  | 
223  | 0  |                           U_UCTOMB (result + length, muc, allocated - length);  | 
224  | 0  |                         if (ret == -1)  | 
225  | 0  |                           { | 
226  | 0  |                             errno = EINVAL;  | 
227  | 0  |                             goto fail;  | 
228  | 0  |                           }  | 
229  | 0  |                         if (ret >= 0)  | 
230  | 0  |                           { | 
231  | 0  |                             length += ret;  | 
232  | 0  |                             goto done_appending;  | 
233  | 0  |                           }  | 
234  | 0  |                       }  | 
235  | 0  |                     { | 
236  | 0  |                       size_t old_allocated = allocated;  | 
237  | 0  |                       size_t new_allocated = 2 * old_allocated;  | 
238  | 0  |                       if (new_allocated < 64)  | 
239  | 0  |                         new_allocated = 64;  | 
240  | 0  |                       if (new_allocated < old_allocated) /* integer overflow? */  | 
241  | 0  |                         abort ();  | 
242  | 0  |                       { | 
243  | 0  |                         UNIT *larger_result;  | 
244  | 0  |                         if (result == NULL)  | 
245  | 0  |                           { | 
246  | 0  |                             larger_result =  | 
247  | 0  |                               (UNIT *) malloc (new_allocated * sizeof (UNIT));  | 
248  | 0  |                             if (larger_result == NULL)  | 
249  | 0  |                               { | 
250  | 0  |                                 errno = ENOMEM;  | 
251  | 0  |                                 goto fail;  | 
252  | 0  |                               }  | 
253  | 0  |                           }  | 
254  | 0  |                         else if (result == resultbuf)  | 
255  | 0  |                           { | 
256  | 0  |                             larger_result =  | 
257  | 0  |                               (UNIT *) malloc (new_allocated * sizeof (UNIT));  | 
258  | 0  |                             if (larger_result == NULL)  | 
259  | 0  |                               { | 
260  | 0  |                                 errno = ENOMEM;  | 
261  | 0  |                                 goto fail;  | 
262  | 0  |                               }  | 
263  | 0  |                             U_CPY (larger_result, resultbuf, length);  | 
264  | 0  |                           }  | 
265  | 0  |                         else  | 
266  | 0  |                           { | 
267  | 0  |                             larger_result =  | 
268  | 0  |                               (UNIT *) realloc (result, new_allocated * sizeof (UNIT));  | 
269  | 0  |                             if (larger_result == NULL)  | 
270  | 0  |                               { | 
271  | 0  |                                 errno = ENOMEM;  | 
272  | 0  |                                 goto fail;  | 
273  | 0  |                               }  | 
274  | 0  |                           }  | 
275  | 0  |                         result = larger_result;  | 
276  | 0  |                         allocated = new_allocated;  | 
277  | 0  |                         { | 
278  | 0  |                           int ret =  | 
279  | 0  |                             U_UCTOMB (result + length, muc, allocated - length);  | 
280  | 0  |                           if (ret == -1)  | 
281  | 0  |                             { | 
282  | 0  |                               errno = EINVAL;  | 
283  | 0  |                               goto fail;  | 
284  | 0  |                             }  | 
285  | 0  |                           if (ret < 0)  | 
286  | 0  |                             abort ();  | 
287  | 0  |                           length += ret;  | 
288  | 0  |                           goto done_appending;  | 
289  | 0  |                         }  | 
290  | 0  |                       }  | 
291  | 0  |                     }  | 
292  | 0  |                    done_appending: ;  | 
293  | 0  |                   }  | 
294  |  |  | 
295  |  |                 /* sortbuf is now empty.  */  | 
296  | 0  |                 sortbuf_count = 0;  | 
297  | 0  |               }  | 
298  |  |  | 
299  | 0  |             if (!(s < s_end))  | 
300  |  |               /* End of string reached.  */  | 
301  | 0  |               break;  | 
302  |  |  | 
303  |  |             /* Append (uc, ccc) to sortbuf.  */  | 
304  | 0  |             if (sortbuf_count == sortbuf_allocated)  | 
305  | 0  |               { | 
306  | 0  |                 struct ucs4_with_ccc *new_sortbuf;  | 
307  |  | 
  | 
308  | 0  |                 sortbuf_allocated = 2 * sortbuf_allocated;  | 
309  | 0  |                 if (sortbuf_allocated < sortbuf_count) /* integer overflow? */  | 
310  | 0  |                   abort ();  | 
311  | 0  |                 new_sortbuf =  | 
312  | 0  |                   (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc));  | 
313  | 0  |                 if (new_sortbuf == NULL)  | 
314  | 0  |                   { | 
315  | 0  |                     errno = ENOMEM;  | 
316  | 0  |                     goto fail;  | 
317  | 0  |                   }  | 
318  | 0  |                 memcpy (new_sortbuf, sortbuf,  | 
319  | 0  |                         sortbuf_count * sizeof (struct ucs4_with_ccc));  | 
320  | 0  |                 if (sortbuf != sortbuf_preallocated)  | 
321  | 0  |                   free (sortbuf);  | 
322  | 0  |                 sortbuf = new_sortbuf;  | 
323  | 0  |               }  | 
324  | 0  |             sortbuf[sortbuf_count].code = uc;  | 
325  | 0  |             sortbuf[sortbuf_count].ccc = ccc;  | 
326  | 0  |             sortbuf_count++;  | 
327  |  | 
  | 
328  | 0  |             i++;  | 
329  | 0  |           }  | 
330  |  |  | 
331  | 0  |         if (!(s < s_end))  | 
332  |  |           /* End of string reached.  */  | 
333  | 0  |           break;  | 
334  |  |  | 
335  | 0  |         s += count;  | 
336  | 0  |       }  | 
337  | 0  |   }  | 
338  |  |  | 
339  | 0  |   if (length == 0)  | 
340  | 0  |     { | 
341  | 0  |       if (result == NULL)  | 
342  | 0  |         { | 
343  |  |           /* Return a non-NULL value.  NULL means error.  */  | 
344  | 0  |           result = (UNIT *) malloc (1);  | 
345  | 0  |           if (result == NULL)  | 
346  | 0  |             { | 
347  | 0  |               errno = ENOMEM;  | 
348  | 0  |               goto fail;  | 
349  | 0  |             }  | 
350  | 0  |         }  | 
351  | 0  |     }  | 
352  | 0  |   else if (result != resultbuf && length < allocated)  | 
353  | 0  |     { | 
354  |  |       /* Shrink the allocated memory if possible.  */  | 
355  | 0  |       UNIT *memory;  | 
356  |  | 
  | 
357  | 0  |       memory = (UNIT *) realloc (result, length * sizeof (UNIT));  | 
358  | 0  |       if (memory != NULL)  | 
359  | 0  |         result = memory;  | 
360  | 0  |     }  | 
361  |  |  | 
362  | 0  |   if (sortbuf_count > 0)  | 
363  | 0  |     abort ();  | 
364  | 0  |   if (sortbuf != sortbuf_preallocated)  | 
365  | 0  |     free (sortbuf);  | 
366  |  | 
  | 
367  | 0  |   *lengthp = length;  | 
368  | 0  |   return result;  | 
369  |  |  | 
370  | 0  |  fail:  | 
371  | 0  |   { | 
372  | 0  |     int saved_errno = errno;  | 
373  | 0  |     if (sortbuf != sortbuf_preallocated)  | 
374  | 0  |       free (sortbuf);  | 
375  | 0  |     if (result != resultbuf)  | 
376  | 0  |       free (result);  | 
377  | 0  |     errno = saved_errno;  | 
378  | 0  |   }  | 
379  | 0  |   return NULL;  | 
380  | 0  | } Unexecuted instantiation: u16_normalize Unexecuted instantiation: u32_normalize  |