/src/e2fsprogs/lib/ext2fs/nls_utf8.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2014 SGI. |
3 | | * Copyright (c) 2018 Collabora Ltd. |
4 | | * All rights reserved. |
5 | | * |
6 | | * This program is free software; you can redistribute it and/or |
7 | | * modify it under the terms of the GNU General Public License as |
8 | | * published by the Free Software Foundation. |
9 | | * |
10 | | * This program is distributed in the hope that it would be useful, |
11 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | | * GNU General Public License for more details. |
14 | | * |
15 | | */ |
16 | | |
17 | | /* |
18 | | * This code is adapted from the Linux Kernel. We have a |
19 | | * userspace version here such that the hashes will match that |
20 | | * implementation. |
21 | | */ |
22 | | |
23 | | #include "config.h" |
24 | | #include <stdint.h> |
25 | | #include <unistd.h> |
26 | | #include <string.h> |
27 | | #include <limits.h> |
28 | | #include <errno.h> |
29 | | |
30 | | #include "ext2_fs.h" |
31 | | #include "ext2fs.h" |
32 | | #include "ext2fsP.h" |
33 | | |
34 | | /* Encoding a unicode version number as a single unsigned int. */ |
35 | | #define UNICODE_MAJ_SHIFT (16) |
36 | | #define UNICODE_MIN_SHIFT (8) |
37 | | |
38 | | #define UNICODE_AGE(MAJ, MIN, REV) \ |
39 | | (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ |
40 | | ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ |
41 | | ((unsigned int)(REV))) |
42 | | |
43 | | /* Needed in struct utf8cursor below. */ |
44 | | #define UTF8HANGULLEAF (12) |
45 | | |
46 | | /* |
47 | | * Cursor structure used by the normalizer. |
48 | | */ |
49 | | struct utf8cursor { |
50 | | const struct utf8data *data; |
51 | | const char *s; |
52 | | const char *p; |
53 | | const char *ss; |
54 | | const char *sp; |
55 | | unsigned int len; |
56 | | unsigned int slen; |
57 | | short int ccc; |
58 | | short int nccc; |
59 | | unsigned char hangul[UTF8HANGULLEAF]; |
60 | | }; |
61 | | |
62 | | /* |
63 | | * Initialize a utf8cursor to normalize a string. |
64 | | * Returns 0 on success. |
65 | | * Returns -1 on failure. |
66 | | */ |
67 | | // extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, |
68 | | // const char *s); |
69 | | // extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, |
70 | | // const char *s, size_t len); |
71 | | |
72 | | /* |
73 | | * Get the next byte in the normalization. |
74 | | * Returns a value > 0 && < 256 on success. |
75 | | * Returns 0 when the end of the normalization is reached. |
76 | | * Returns -1 if the string being normalized is not valid UTF-8. |
77 | | */ |
78 | | // extern int utf8byte(struct utf8cursor *u8c); |
79 | | |
80 | | |
81 | | struct utf8data { |
82 | | unsigned int maxage; |
83 | | unsigned int offset; |
84 | | }; |
85 | | |
86 | | #define __INCLUDED_FROM_UTF8NORM_C__ |
87 | | #include "utf8data.h" |
88 | | #undef __INCLUDED_FROM_UTF8NORM_C__ |
89 | | |
90 | | #define ARRAY_SIZE(array) \ |
91 | 0 | (sizeof(array) / sizeof(array[0])) |
92 | | |
93 | | #if 0 |
94 | | /* Highest unicode version supported by the data tables. */ |
95 | | static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev) |
96 | | { |
97 | | int i = ARRAY_SIZE(utf8agetab) - 1; |
98 | | unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); |
99 | | |
100 | | while (i >= 0 && utf8agetab[i] != 0) { |
101 | | if (sb_utf8version == utf8agetab[i]) |
102 | | return 1; |
103 | | i--; |
104 | | } |
105 | | return 0; |
106 | | } |
107 | | #endif |
108 | | |
109 | | #if 0 |
110 | | static int utf8version_latest(void) |
111 | | { |
112 | | return utf8vers; |
113 | | } |
114 | | #endif |
115 | | |
116 | | /* |
117 | | * UTF-8 valid ranges. |
118 | | * |
119 | | * The UTF-8 encoding spreads the bits of a 32bit word over several |
120 | | * bytes. This table gives the ranges that can be held and how they'd |
121 | | * be represented. |
122 | | * |
123 | | * 0x00000000 0x0000007F: 0xxxxxxx |
124 | | * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx |
125 | | * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
126 | | * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
127 | | * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
128 | | * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
129 | | * |
130 | | * There is an additional requirement on UTF-8, in that only the |
131 | | * shortest representation of a 32bit value is to be used. A decoder |
132 | | * must not decode sequences that do not satisfy this requirement. |
133 | | * Thus the allowed ranges have a lower bound. |
134 | | * |
135 | | * 0x00000000 0x0000007F: 0xxxxxxx |
136 | | * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx |
137 | | * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx |
138 | | * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
139 | | * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
140 | | * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
141 | | * |
142 | | * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, |
143 | | * 17 planes of 65536 values. This limits the sequences actually seen |
144 | | * even more, to just the following. |
145 | | * |
146 | | * 0 - 0x7F: 0 - 0x7F |
147 | | * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF |
148 | | * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF |
149 | | * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF |
150 | | * |
151 | | * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. |
152 | | * |
153 | | * Note that the longest sequence seen with valid usage is 4 bytes, |
154 | | * the same a single UTF-32 character. This makes the UTF-8 |
155 | | * representation of Unicode strictly smaller than UTF-32. |
156 | | * |
157 | | * The shortest sequence requirement was introduced by: |
158 | | * Corrigendum #1: UTF-8 Shortest Form |
159 | | * It can be found here: |
160 | | * http://www.unicode.org/versions/corrigendum1.html |
161 | | * |
162 | | */ |
163 | | |
164 | | /* |
165 | | * Return the number of bytes used by the current UTF-8 sequence. |
166 | | * Assumes the input points to the first byte of a valid UTF-8 |
167 | | * sequence. |
168 | | */ |
169 | | static inline int utf8clen(const char *s) |
170 | 0 | { |
171 | 0 | unsigned char c = *s; |
172 | |
|
173 | 0 | return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); |
174 | 0 | } |
175 | | |
176 | | /* |
177 | | * Decode a 3-byte UTF-8 sequence. |
178 | | */ |
179 | | static unsigned int |
180 | | utf8decode3(const char *str) |
181 | 0 | { |
182 | 0 | unsigned int uc; |
183 | |
|
184 | 0 | uc = *str++ & 0x0F; |
185 | 0 | uc <<= 6; |
186 | 0 | uc |= *str++ & 0x3F; |
187 | 0 | uc <<= 6; |
188 | 0 | uc |= *str++ & 0x3F; |
189 | |
|
190 | 0 | return uc; |
191 | 0 | } |
192 | | |
193 | | /* |
194 | | * Encode a 3-byte UTF-8 sequence. |
195 | | */ |
196 | | static int |
197 | | utf8encode3(char *str, unsigned int val) |
198 | 0 | { |
199 | 0 | str[2] = (val & 0x3F) | 0x80; |
200 | 0 | val >>= 6; |
201 | 0 | str[1] = (val & 0x3F) | 0x80; |
202 | 0 | val >>= 6; |
203 | 0 | str[0] = val | 0xE0; |
204 | |
|
205 | 0 | return 3; |
206 | 0 | } |
207 | | |
208 | | /* |
209 | | * utf8trie_t |
210 | | * |
211 | | * A compact binary tree, used to decode UTF-8 characters. |
212 | | * |
213 | | * Internal nodes are one byte for the node itself, and up to three |
214 | | * bytes for an offset into the tree. The first byte contains the |
215 | | * following information: |
216 | | * NEXTBYTE - flag - advance to next byte if set |
217 | | * BITNUM - 3 bit field - the bit number to tested |
218 | | * OFFLEN - 2 bit field - number of bytes in the offset |
219 | | * if offlen == 0 (non-branching node) |
220 | | * RIGHTPATH - 1 bit field - set if the following node is for the |
221 | | * right-hand path (tested bit is set) |
222 | | * TRIENODE - 1 bit field - set if the following node is an internal |
223 | | * node, otherwise it is a leaf node |
224 | | * if offlen != 0 (branching node) |
225 | | * LEFTNODE - 1 bit field - set if the left-hand node is internal |
226 | | * RIGHTNODE - 1 bit field - set if the right-hand node is internal |
227 | | * |
228 | | * Due to the way utf8 works, there cannot be branching nodes with |
229 | | * NEXTBYTE set, and moreover those nodes always have a righthand |
230 | | * descendant. |
231 | | */ |
232 | | typedef const unsigned char utf8trie_t; |
233 | 0 | #define BITNUM 0x07 |
234 | 0 | #define NEXTBYTE 0x08 |
235 | 0 | #define OFFLEN 0x30 |
236 | 0 | #define OFFLEN_SHIFT 4 |
237 | 0 | #define RIGHTPATH 0x40 |
238 | 0 | #define TRIENODE 0x80 |
239 | 0 | #define RIGHTNODE 0x40 |
240 | 0 | #define LEFTNODE 0x80 |
241 | | |
242 | | /* |
243 | | * utf8leaf_t |
244 | | * |
245 | | * The leaves of the trie are embedded in the trie, and so the same |
246 | | * underlying datatype: unsigned char. |
247 | | * |
248 | | * leaf[0]: The unicode version, stored as a generation number that is |
249 | | * an index into utf8agetab[]. With this we can filter code |
250 | | * points based on the unicode version in which they were |
251 | | * defined. The CCC of a non-defined code point is 0. |
252 | | * leaf[1]: Canonical Combining Class. During normalization, we need |
253 | | * to do a stable sort into ascending order of all characters |
254 | | * with a non-zero CCC that occur between two characters with |
255 | | * a CCC of 0, or at the begin or end of a string. |
256 | | * The unicode standard guarantees that all CCC values are |
257 | | * between 0 and 254 inclusive, which leaves 255 available as |
258 | | * a special value. |
259 | | * Code points with CCC 0 are known as stoppers. |
260 | | * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the |
261 | | * start of a NUL-terminated string that is the decomposition |
262 | | * of the character. |
263 | | * The CCC of a decomposable character is the same as the CCC |
264 | | * of the first character of its decomposition. |
265 | | * Some characters decompose as the empty string: these are |
266 | | * characters with the Default_Ignorable_Code_Point property. |
267 | | * These do affect normalization, as they all have CCC 0. |
268 | | * |
269 | | * The decompositions in the trie have been fully expanded, with the |
270 | | * exception of Hangul syllables, which are decomposed algorithmically. |
271 | | * |
272 | | * Casefolding, if applicable, is also done using decompositions. |
273 | | * |
274 | | * The trie is constructed in such a way that leaves exist for all |
275 | | * UTF-8 sequences that match the criteria from the "UTF-8 valid |
276 | | * ranges" comment above, and only for those sequences. Therefore a |
277 | | * lookup in the trie can be used to validate the UTF-8 input. |
278 | | */ |
279 | | typedef const unsigned char utf8leaf_t; |
280 | | |
281 | 0 | #define LEAF_GEN(LEAF) ((LEAF)[0]) |
282 | 0 | #define LEAF_CCC(LEAF) ((LEAF)[1]) |
283 | 0 | #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) |
284 | | |
285 | 0 | #define MINCCC (0) |
286 | 0 | #define MAXCCC (254) |
287 | 0 | #define STOPPER (0) |
288 | 0 | #define DECOMPOSE (255) |
289 | | |
290 | | /* Marker for hangul syllable decomposition. */ |
291 | 0 | #define HANGUL ((char)(255)) |
292 | | /* Size of the synthesized leaf used for Hangul syllable decomposition. */ |
293 | | #define UTF8HANGULLEAF (12) |
294 | | |
295 | | /* |
296 | | * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) |
297 | | * |
298 | | * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; |
299 | | * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; |
300 | | * |
301 | | * SBase = 0xAC00 |
302 | | * LBase = 0x1100 |
303 | | * VBase = 0x1161 |
304 | | * TBase = 0x11A7 |
305 | | * LCount = 19 |
306 | | * VCount = 21 |
307 | | * TCount = 28 |
308 | | * NCount = 588 (VCount * TCount) |
309 | | * SCount = 11172 (LCount * NCount) |
310 | | * |
311 | | * Decomposition: |
312 | | * SIndex = s - SBase |
313 | | * |
314 | | * LV (Canonical/Full) |
315 | | * LIndex = SIndex / NCount |
316 | | * VIndex = (Sindex % NCount) / TCount |
317 | | * LPart = LBase + LIndex |
318 | | * VPart = VBase + VIndex |
319 | | * |
320 | | * LVT (Canonical) |
321 | | * LVIndex = (SIndex / TCount) * TCount |
322 | | * TIndex = (Sindex % TCount) |
323 | | * LVPart = SBase + LVIndex |
324 | | * TPart = TBase + TIndex |
325 | | * |
326 | | * LVT (Full) |
327 | | * LIndex = SIndex / NCount |
328 | | * VIndex = (Sindex % NCount) / TCount |
329 | | * TIndex = (Sindex % TCount) |
330 | | * LPart = LBase + LIndex |
331 | | * VPart = VBase + VIndex |
332 | | * if (TIndex == 0) { |
333 | | * d = <LPart, VPart> |
334 | | * } else { |
335 | | * TPart = TBase + TIndex |
336 | | * d = <LPart, TPart, VPart> |
337 | | * } |
338 | | */ |
339 | | |
340 | | /* Constants */ |
341 | 0 | #define SB (0xAC00) |
342 | 0 | #define LB (0x1100) |
343 | 0 | #define VB (0x1161) |
344 | 0 | #define TB (0x11A7) |
345 | | #define LC (19) |
346 | 0 | #define VC (21) |
347 | 0 | #define TC (28) |
348 | 0 | #define NC (VC * TC) |
349 | | #define SC (LC * NC) |
350 | | |
351 | | /* Algorithmic decomposition of hangul syllable. */ |
352 | | static utf8leaf_t * |
353 | | utf8hangul(const char *str, unsigned char *hangul) |
354 | 0 | { |
355 | 0 | unsigned int si; |
356 | 0 | unsigned int li; |
357 | 0 | unsigned int vi; |
358 | 0 | unsigned int ti; |
359 | 0 | unsigned char *h; |
360 | | |
361 | | /* Calculate the SI, LI, VI, and TI values. */ |
362 | 0 | si = utf8decode3(str) - SB; |
363 | 0 | li = si / NC; |
364 | 0 | vi = (si % NC) / TC; |
365 | 0 | ti = si % TC; |
366 | | |
367 | | /* Fill in base of leaf. */ |
368 | 0 | h = hangul; |
369 | 0 | LEAF_GEN(h) = 2; |
370 | 0 | LEAF_CCC(h) = DECOMPOSE; |
371 | 0 | h += 2; |
372 | | |
373 | | /* Add LPart, a 3-byte UTF-8 sequence. */ |
374 | 0 | h += utf8encode3((char *)h, li + LB); |
375 | | |
376 | | /* Add VPart, a 3-byte UTF-8 sequence. */ |
377 | 0 | h += utf8encode3((char *)h, vi + VB); |
378 | | |
379 | | /* Add TPart if required, also a 3-byte UTF-8 sequence. */ |
380 | 0 | if (ti) |
381 | 0 | h += utf8encode3((char *)h, ti + TB); |
382 | | |
383 | | /* Terminate string. */ |
384 | 0 | h[0] = '\0'; |
385 | |
|
386 | 0 | return hangul; |
387 | 0 | } |
388 | | |
389 | | /* |
390 | | * Use trie to scan s, touching at most len bytes. |
391 | | * Returns the leaf if one exists, NULL otherwise. |
392 | | * |
393 | | * A non-NULL return guarantees that the UTF-8 sequence starting at s |
394 | | * is well-formed and corresponds to a known unicode code point. The |
395 | | * shorthand for this will be "is valid UTF-8 unicode". |
396 | | */ |
397 | | static utf8leaf_t *utf8nlookup(const struct utf8data *data, |
398 | | unsigned char *hangul, const char *s, size_t len) |
399 | 0 | { |
400 | 0 | utf8trie_t *trie; |
401 | 0 | int offlen; |
402 | 0 | int offset; |
403 | 0 | int mask; |
404 | 0 | int node; |
405 | |
|
406 | 0 | if (!data) |
407 | 0 | return NULL; |
408 | 0 | if (len == 0) |
409 | 0 | return NULL; |
410 | | |
411 | 0 | trie = utf8data + data->offset; |
412 | 0 | node = 1; |
413 | 0 | while (node) { |
414 | 0 | offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; |
415 | 0 | if (*trie & NEXTBYTE) { |
416 | 0 | if (--len == 0) |
417 | 0 | return NULL; |
418 | 0 | s++; |
419 | 0 | } |
420 | 0 | mask = 1 << (*trie & BITNUM); |
421 | 0 | if (*s & mask) { |
422 | | /* Right leg */ |
423 | 0 | if (offlen) { |
424 | | /* Right node at offset of trie */ |
425 | 0 | node = (*trie & RIGHTNODE); |
426 | 0 | offset = trie[offlen]; |
427 | 0 | while (--offlen) { |
428 | 0 | offset <<= 8; |
429 | 0 | offset |= trie[offlen]; |
430 | 0 | } |
431 | 0 | trie += offset; |
432 | 0 | } else if (*trie & RIGHTPATH) { |
433 | | /* Right node after this node */ |
434 | 0 | node = (*trie & TRIENODE); |
435 | 0 | trie++; |
436 | 0 | } else { |
437 | | /* No right node. */ |
438 | 0 | return NULL; |
439 | 0 | } |
440 | 0 | } else { |
441 | | /* Left leg */ |
442 | 0 | if (offlen) { |
443 | | /* Left node after this node. */ |
444 | 0 | node = (*trie & LEFTNODE); |
445 | 0 | trie += offlen + 1; |
446 | 0 | } else if (*trie & RIGHTPATH) { |
447 | | /* No left node. */ |
448 | 0 | return NULL; |
449 | 0 | } else { |
450 | | /* Left node after this node */ |
451 | 0 | node = (*trie & TRIENODE); |
452 | 0 | trie++; |
453 | 0 | } |
454 | 0 | } |
455 | 0 | } |
456 | | /* |
457 | | * Hangul decomposition is done algorithmically. These are the |
458 | | * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is |
459 | | * always 3 bytes long, so s has been advanced twice, and the |
460 | | * start of the sequence is at s-2. |
461 | | */ |
462 | 0 | if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) |
463 | 0 | trie = utf8hangul(s - 2, hangul); |
464 | 0 | return trie; |
465 | 0 | } |
466 | | |
467 | | /* |
468 | | * Use trie to scan s. |
469 | | * Returns the leaf if one exists, NULL otherwise. |
470 | | * |
471 | | * Forwards to utf8nlookup(). |
472 | | */ |
473 | | static utf8leaf_t *utf8lookup(const struct utf8data *data, |
474 | | unsigned char *hangul, const char *s) |
475 | 0 | { |
476 | 0 | return utf8nlookup(data, hangul, s, (size_t)-1); |
477 | 0 | } |
478 | | |
479 | | #if 0 |
480 | | /* |
481 | | * Maximum age of any character in s. |
482 | | * Return -1 if s is not valid UTF-8 unicode. |
483 | | * Return 0 if only non-assigned code points are used. |
484 | | */ |
485 | | static int utf8agemax(const struct utf8data *data, const char *s) |
486 | | { |
487 | | utf8leaf_t *leaf; |
488 | | int age = 0; |
489 | | int leaf_age; |
490 | | unsigned char hangul[UTF8HANGULLEAF]; |
491 | | |
492 | | if (!data) |
493 | | return -1; |
494 | | |
495 | | while (*s) { |
496 | | leaf = utf8lookup(data, hangul, s); |
497 | | if (!leaf) |
498 | | return -1; |
499 | | |
500 | | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
501 | | if (leaf_age <= data->maxage && leaf_age > age) |
502 | | age = leaf_age; |
503 | | s += utf8clen(s); |
504 | | } |
505 | | return age; |
506 | | } |
507 | | #endif |
508 | | |
509 | | #if 0 |
510 | | /* |
511 | | * Minimum age of any character in s. |
512 | | * Return -1 if s is not valid UTF-8 unicode. |
513 | | * Return 0 if non-assigned code points are used. |
514 | | */ |
515 | | static int utf8agemin(const struct utf8data *data, const char *s) |
516 | | { |
517 | | utf8leaf_t *leaf; |
518 | | int age; |
519 | | int leaf_age; |
520 | | unsigned char hangul[UTF8HANGULLEAF]; |
521 | | |
522 | | if (!data) |
523 | | return -1; |
524 | | age = data->maxage; |
525 | | while (*s) { |
526 | | leaf = utf8lookup(data, hangul, s); |
527 | | if (!leaf) |
528 | | return -1; |
529 | | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
530 | | if (leaf_age <= data->maxage && leaf_age < age) |
531 | | age = leaf_age; |
532 | | s += utf8clen(s); |
533 | | } |
534 | | return age; |
535 | | } |
536 | | #endif |
537 | | |
538 | | #if 0 |
539 | | /* |
540 | | * Maximum age of any character in s, touch at most len bytes. |
541 | | * Return -1 if s is not valid UTF-8 unicode. |
542 | | */ |
543 | | static int utf8nagemax(const struct utf8data *data, const char *s, size_t len) |
544 | | { |
545 | | utf8leaf_t *leaf; |
546 | | int age = 0; |
547 | | int leaf_age; |
548 | | unsigned char hangul[UTF8HANGULLEAF]; |
549 | | |
550 | | if (!data) |
551 | | return -1; |
552 | | |
553 | | while (len && *s) { |
554 | | leaf = utf8nlookup(data, hangul, s, len); |
555 | | if (!leaf) |
556 | | return -1; |
557 | | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
558 | | if (leaf_age <= data->maxage && leaf_age > age) |
559 | | age = leaf_age; |
560 | | len -= utf8clen(s); |
561 | | s += utf8clen(s); |
562 | | } |
563 | | return age; |
564 | | } |
565 | | #endif |
566 | | |
567 | | #if 0 |
568 | | /* |
569 | | * Maximum age of any character in s, touch at most len bytes. |
570 | | * Return -1 if s is not valid UTF-8 unicode. |
571 | | */ |
572 | | static int utf8nagemin(const struct utf8data *data, const char *s, size_t len) |
573 | | { |
574 | | utf8leaf_t *leaf; |
575 | | int leaf_age; |
576 | | int age; |
577 | | unsigned char hangul[UTF8HANGULLEAF]; |
578 | | |
579 | | if (!data) |
580 | | return -1; |
581 | | age = data->maxage; |
582 | | while (len && *s) { |
583 | | leaf = utf8nlookup(data, hangul, s, len); |
584 | | if (!leaf) |
585 | | return -1; |
586 | | leaf_age = utf8agetab[LEAF_GEN(leaf)]; |
587 | | if (leaf_age <= data->maxage && leaf_age < age) |
588 | | age = leaf_age; |
589 | | len -= utf8clen(s); |
590 | | s += utf8clen(s); |
591 | | } |
592 | | return age; |
593 | | } |
594 | | #endif |
595 | | |
596 | | #if 0 |
597 | | /* |
598 | | * Length of the normalization of s. |
599 | | * Return -1 if s is not valid UTF-8 unicode. |
600 | | * |
601 | | * A string of Default_Ignorable_Code_Point has length 0. |
602 | | */ |
603 | | static ssize_t utf8len(const struct utf8data *data, const char *s) |
604 | | { |
605 | | utf8leaf_t *leaf; |
606 | | size_t ret = 0; |
607 | | unsigned char hangul[UTF8HANGULLEAF]; |
608 | | |
609 | | if (!data) |
610 | | return -1; |
611 | | while (*s) { |
612 | | leaf = utf8lookup(data, hangul, s); |
613 | | if (!leaf) |
614 | | return -1; |
615 | | if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) |
616 | | ret += utf8clen(s); |
617 | | else if (LEAF_CCC(leaf) == DECOMPOSE) |
618 | | ret += strlen(LEAF_STR(leaf)); |
619 | | else |
620 | | ret += utf8clen(s); |
621 | | s += utf8clen(s); |
622 | | } |
623 | | return ret; |
624 | | } |
625 | | #endif |
626 | | |
627 | | #if 0 |
628 | | /* |
629 | | * Length of the normalization of s, touch at most len bytes. |
630 | | * Return -1 if s is not valid UTF-8 unicode. |
631 | | */ |
632 | | static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) |
633 | | { |
634 | | utf8leaf_t *leaf; |
635 | | size_t ret = 0; |
636 | | unsigned char hangul[UTF8HANGULLEAF]; |
637 | | |
638 | | if (!data) |
639 | | return -1; |
640 | | while (len && *s) { |
641 | | leaf = utf8nlookup(data, hangul, s, len); |
642 | | if (!leaf) |
643 | | return -1; |
644 | | if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) |
645 | | ret += utf8clen(s); |
646 | | else if (LEAF_CCC(leaf) == DECOMPOSE) |
647 | | ret += strlen(LEAF_STR(leaf)); |
648 | | else |
649 | | ret += utf8clen(s); |
650 | | len -= utf8clen(s); |
651 | | s += utf8clen(s); |
652 | | } |
653 | | return ret; |
654 | | } |
655 | | #endif |
656 | | |
657 | | /* |
658 | | * Set up an utf8cursor for use by utf8byte(). |
659 | | * |
660 | | * u8c : pointer to cursor. |
661 | | * data : const struct utf8data to use for normalization. |
662 | | * s : string. |
663 | | * len : length of s. |
664 | | * |
665 | | * Returns -1 on error, 0 on success. |
666 | | */ |
667 | | static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, |
668 | | const char *s, size_t len) |
669 | 0 | { |
670 | 0 | if (!data) |
671 | 0 | return -1; |
672 | 0 | if (!s) |
673 | 0 | return -1; |
674 | 0 | u8c->data = data; |
675 | 0 | u8c->s = s; |
676 | 0 | u8c->p = NULL; |
677 | 0 | u8c->ss = NULL; |
678 | 0 | u8c->sp = NULL; |
679 | 0 | u8c->len = len; |
680 | 0 | u8c->slen = 0; |
681 | 0 | u8c->ccc = STOPPER; |
682 | 0 | u8c->nccc = STOPPER; |
683 | | /* Check we didn't clobber the maximum length. */ |
684 | 0 | if (u8c->len != len) |
685 | 0 | return -1; |
686 | | /* The first byte of s may not be an utf8 continuation. */ |
687 | 0 | if (len > 0 && (*s & 0xC0) == 0x80) |
688 | 0 | return -1; |
689 | 0 | return 0; |
690 | 0 | } |
691 | | |
692 | | #if 0 |
693 | | /* |
694 | | * Set up an utf8cursor for use by utf8byte(). |
695 | | * |
696 | | * u8c : pointer to cursor. |
697 | | * data : const struct utf8data to use for normalization. |
698 | | * s : NUL-terminated string. |
699 | | * |
700 | | * Returns -1 on error, 0 on success. |
701 | | */ |
702 | | static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, |
703 | | const char *s) |
704 | | { |
705 | | return utf8ncursor(u8c, data, s, (unsigned int)-1); |
706 | | } |
707 | | #endif |
708 | | |
709 | | /* |
710 | | * Get one byte from the normalized form of the string described by u8c. |
711 | | * |
712 | | * Returns the byte cast to an unsigned char on success, and -1 on failure. |
713 | | * |
714 | | * The cursor keeps track of the location in the string in u8c->s. |
715 | | * When a character is decomposed, the current location is stored in |
716 | | * u8c->p, and u8c->s is set to the start of the decomposition. Note |
717 | | * that bytes from a decomposition do not count against u8c->len. |
718 | | * |
719 | | * Characters are emitted if they match the current CCC in u8c->ccc. |
720 | | * Hitting end-of-string while u8c->ccc == STOPPER means we're done, |
721 | | * and the function returns 0 in that case. |
722 | | * |
723 | | * Sorting by CCC is done by repeatedly scanning the string. The |
724 | | * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at |
725 | | * the start of the scan. The first pass finds the lowest CCC to be |
726 | | * emitted and stores it in u8c->nccc, the second pass emits the |
727 | | * characters with this CCC and finds the next lowest CCC. This limits |
728 | | * the number of passes to 1 + the number of different CCCs in the |
729 | | * sequence being scanned. |
730 | | * |
731 | | * Therefore: |
732 | | * u8c->p != NULL -> a decomposition is being scanned. |
733 | | * u8c->ss != NULL -> this is a repeating scan. |
734 | | * u8c->ccc == -1 -> this is the first scan of a repeating scan. |
735 | | */ |
736 | | static int utf8byte(struct utf8cursor *u8c) |
737 | 0 | { |
738 | 0 | utf8leaf_t *leaf; |
739 | 0 | int ccc; |
740 | |
|
741 | 0 | for (;;) { |
742 | | /* Check for the end of a decomposed character. */ |
743 | 0 | if (u8c->p && *u8c->s == '\0') { |
744 | 0 | u8c->s = u8c->p; |
745 | 0 | u8c->p = NULL; |
746 | 0 | } |
747 | | |
748 | | /* Check for end-of-string. */ |
749 | 0 | if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { |
750 | | /* There is no next byte. */ |
751 | 0 | if (u8c->ccc == STOPPER) |
752 | 0 | return 0; |
753 | | /* End-of-string during a scan counts as a stopper. */ |
754 | 0 | ccc = STOPPER; |
755 | 0 | goto ccc_mismatch; |
756 | 0 | } else if ((*u8c->s & 0xC0) == 0x80) { |
757 | | /* This is a continuation of the current character. */ |
758 | 0 | if (!u8c->p) |
759 | 0 | u8c->len--; |
760 | 0 | return (unsigned char)*u8c->s++; |
761 | 0 | } |
762 | | |
763 | | /* Look up the data for the current character. */ |
764 | 0 | if (u8c->p) { |
765 | 0 | leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); |
766 | 0 | } else { |
767 | 0 | leaf = utf8nlookup(u8c->data, u8c->hangul, |
768 | 0 | u8c->s, u8c->len); |
769 | 0 | } |
770 | | |
771 | | /* No leaf found implies that the input is a binary blob. */ |
772 | 0 | if (!leaf) |
773 | 0 | return -1; |
774 | | |
775 | 0 | ccc = LEAF_CCC(leaf); |
776 | | /* Characters that are too new have CCC 0. */ |
777 | 0 | if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { |
778 | 0 | ccc = STOPPER; |
779 | 0 | } else if (ccc == DECOMPOSE) { |
780 | 0 | u8c->len -= utf8clen(u8c->s); |
781 | 0 | u8c->p = u8c->s + utf8clen(u8c->s); |
782 | 0 | u8c->s = LEAF_STR(leaf); |
783 | | /* Empty decomposition implies CCC 0. */ |
784 | 0 | if (*u8c->s == '\0') { |
785 | 0 | if (u8c->ccc == STOPPER) |
786 | 0 | continue; |
787 | 0 | ccc = STOPPER; |
788 | 0 | goto ccc_mismatch; |
789 | 0 | } |
790 | | |
791 | 0 | leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); |
792 | 0 | if (!leaf) |
793 | 0 | return -1; |
794 | 0 | ccc = LEAF_CCC(leaf); |
795 | 0 | } |
796 | | |
797 | | /* |
798 | | * If this is not a stopper, then see if it updates |
799 | | * the next canonical class to be emitted. |
800 | | */ |
801 | 0 | if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) |
802 | 0 | u8c->nccc = ccc; |
803 | | |
804 | | /* |
805 | | * Return the current byte if this is the current |
806 | | * combining class. |
807 | | */ |
808 | 0 | if (ccc == u8c->ccc) { |
809 | 0 | if (!u8c->p) |
810 | 0 | u8c->len--; |
811 | 0 | return (unsigned char)*u8c->s++; |
812 | 0 | } |
813 | | |
814 | | /* Current combining class mismatch. */ |
815 | 0 | ccc_mismatch: |
816 | 0 | if (u8c->nccc == STOPPER) { |
817 | | /* |
818 | | * Scan forward for the first canonical class |
819 | | * to be emitted. Save the position from |
820 | | * which to restart. |
821 | | */ |
822 | 0 | u8c->ccc = MINCCC - 1; |
823 | 0 | u8c->nccc = ccc; |
824 | 0 | u8c->sp = u8c->p; |
825 | 0 | u8c->ss = u8c->s; |
826 | 0 | u8c->slen = u8c->len; |
827 | 0 | if (!u8c->p) |
828 | 0 | u8c->len -= utf8clen(u8c->s); |
829 | 0 | u8c->s += utf8clen(u8c->s); |
830 | 0 | } else if (ccc != STOPPER) { |
831 | | /* Not a stopper, and not the ccc we're emitting. */ |
832 | 0 | if (!u8c->p) |
833 | 0 | u8c->len -= utf8clen(u8c->s); |
834 | 0 | u8c->s += utf8clen(u8c->s); |
835 | 0 | } else if (u8c->nccc != MAXCCC + 1) { |
836 | | /* At a stopper, restart for next ccc. */ |
837 | 0 | u8c->ccc = u8c->nccc; |
838 | 0 | u8c->nccc = MAXCCC + 1; |
839 | 0 | u8c->s = u8c->ss; |
840 | 0 | u8c->p = u8c->sp; |
841 | 0 | u8c->len = u8c->slen; |
842 | 0 | } else { |
843 | | /* All done, proceed from here. */ |
844 | 0 | u8c->ccc = STOPPER; |
845 | 0 | u8c->nccc = STOPPER; |
846 | 0 | u8c->sp = NULL; |
847 | 0 | u8c->ss = NULL; |
848 | 0 | u8c->slen = 0; |
849 | 0 | } |
850 | 0 | } |
851 | 0 | } |
852 | | |
853 | | #if 0 |
854 | | /* |
855 | | * Look for the correct const struct utf8data for a unicode version. |
856 | | * Returns NULL if the version requested is too new. |
857 | | * |
858 | | * Two normalization forms are supported: nfdi and nfdicf. |
859 | | * |
860 | | * nfdi: |
861 | | * - Apply unicode normalization form NFD. |
862 | | * - Remove any Default_Ignorable_Code_Point. |
863 | | * |
864 | | * nfdicf: |
865 | | * - Apply unicode normalization form NFD. |
866 | | * - Remove any Default_Ignorable_Code_Point. |
867 | | * - Apply a full casefold (C + F). |
868 | | */ |
869 | | static const struct utf8data *utf8nfdi(unsigned int maxage) |
870 | | { |
871 | | int i = ARRAY_SIZE(utf8nfdidata) - 1; |
872 | | |
873 | | while (maxage < utf8nfdidata[i].maxage) |
874 | | i--; |
875 | | if (maxage > utf8nfdidata[i].maxage) |
876 | | return NULL; |
877 | | return &utf8nfdidata[i]; |
878 | | } |
879 | | #endif |
880 | | |
881 | | static const struct utf8data *utf8nfdicf(unsigned int maxage) |
882 | 0 | { |
883 | 0 | int i = ARRAY_SIZE(utf8nfdicfdata) - 1; |
884 | |
|
885 | 0 | while (maxage < utf8nfdicfdata[i].maxage) |
886 | 0 | i--; |
887 | 0 | if (maxage > utf8nfdicfdata[i].maxage) |
888 | 0 | return NULL; |
889 | 0 | return &utf8nfdicfdata[i]; |
890 | 0 | } |
891 | | |
892 | | static int utf8_casefold(const struct ext2fs_nls_table *table, |
893 | | const unsigned char *str, size_t len, |
894 | | unsigned char *dest, size_t dlen) |
895 | 0 | { |
896 | 0 | const struct utf8data *data = utf8nfdicf(table->version); |
897 | 0 | struct utf8cursor cur; |
898 | 0 | size_t nlen = 0; |
899 | |
|
900 | 0 | if (utf8ncursor(&cur, data, (const char *) str, len) < 0) |
901 | 0 | goto invalid_seq; |
902 | | |
903 | 0 | for (nlen = 0; nlen < dlen; nlen++) { |
904 | 0 | int c = utf8byte(&cur); |
905 | |
|
906 | 0 | dest[nlen] = c; |
907 | 0 | if (!c) |
908 | 0 | return nlen; |
909 | 0 | if (c == -1) |
910 | 0 | break; |
911 | 0 | } |
912 | | |
913 | 0 | return -ENAMETOOLONG; |
914 | | |
915 | 0 | invalid_seq: |
916 | 0 | if (dlen < len) |
917 | 0 | return -ENAMETOOLONG; |
918 | | |
919 | | /* Signal invalid sequence */ |
920 | 0 | return -EINVAL; |
921 | 0 | } |
922 | | |
923 | | static int utf8_validate(const struct ext2fs_nls_table *table, |
924 | | char *s, size_t len, char **pos) |
925 | 0 | { |
926 | 0 | const struct utf8data *data = utf8nfdicf(table->version); |
927 | 0 | utf8leaf_t *leaf; |
928 | 0 | unsigned char hangul[UTF8HANGULLEAF]; |
929 | |
|
930 | 0 | if (!data) |
931 | 0 | return -1; |
932 | 0 | while (len && *s) { |
933 | 0 | leaf = utf8nlookup(data, hangul, s, len); |
934 | 0 | if (!leaf) { |
935 | 0 | *pos = s; |
936 | 0 | return 1; |
937 | 0 | } |
938 | 0 | len -= utf8clen(s); |
939 | 0 | s += utf8clen(s); |
940 | 0 | } |
941 | 0 | return 0; |
942 | 0 | } |
943 | | |
944 | | static int utf8_casefold_cmp(const struct ext2fs_nls_table *table, |
945 | | const unsigned char *str1, size_t len1, |
946 | | const unsigned char *str2, size_t len2) |
947 | 0 | { |
948 | 0 | const struct utf8data *data = utf8nfdicf(table->version); |
949 | 0 | int c1, c2; |
950 | 0 | struct utf8cursor cur1, cur2; |
951 | |
|
952 | 0 | if (utf8ncursor(&cur1, data, (const char *) str1, len1) < 0) |
953 | 0 | return -1; |
954 | 0 | if (utf8ncursor(&cur2, data, (const char *) str2, len2) < 0) |
955 | 0 | return -1; |
956 | | |
957 | 0 | do { |
958 | 0 | c1 = utf8byte(&cur1); |
959 | 0 | c2 = utf8byte(&cur2); |
960 | |
|
961 | 0 | if (c1 < 0 || c2 < 0) |
962 | 0 | return -1; |
963 | 0 | if (c1 != c2) |
964 | 0 | return c1 - c2; |
965 | 0 | } while (c1); |
966 | | |
967 | 0 | return 0; |
968 | 0 | } |
969 | | |
970 | | static const struct ext2fs_nls_ops utf8_ops = { |
971 | | .casefold = utf8_casefold, |
972 | | .validate = utf8_validate, |
973 | | .casefold_cmp = utf8_casefold_cmp, |
974 | | }; |
975 | | |
976 | | static const struct ext2fs_nls_table nls_utf8 = { |
977 | | .ops = &utf8_ops, |
978 | | .version = UNICODE_AGE(12, 1, 0), |
979 | | }; |
980 | | |
981 | | const struct ext2fs_nls_table *ext2fs_load_nls_table(int encoding) |
982 | 21 | { |
983 | 21 | if (encoding == EXT4_ENC_UTF8_12_1) |
984 | 1 | return &nls_utf8; |
985 | | |
986 | 20 | return NULL; |
987 | 21 | } |
988 | | |
989 | | int ext2fs_check_encoded_name(const struct ext2fs_nls_table *table, |
990 | | char *name, size_t len, char **pos) |
991 | 0 | { |
992 | 0 | return table->ops->validate(table, name, len, pos); |
993 | 0 | } |
994 | | |
995 | | int ext2fs_casefold_cmp(const struct ext2fs_nls_table *table, |
996 | | const unsigned char *str1, size_t len1, |
997 | | const unsigned char *str2, size_t len2) |
998 | 0 | { |
999 | 0 | return table->ops->casefold_cmp(table, str1, len1, str2, len2); |
1000 | 0 | } |