Coverage Report

Created: 2026-03-12 06:19

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/e2fsprogs/lib/ext2fs/nls_utf8.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2014 SGI.
3
 * Copyright (c) 2018 Collabora Ltd.
4
 * All rights reserved.
5
 *
6
 * This program is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU General Public License as
8
 * published by the Free Software Foundation.
9
 *
10
 * This program is distributed in the hope that it would be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 * GNU General Public License for more details.
14
 *
15
 */
16
17
/*
18
 * This code is adapted from the Linux Kernel.  We have a
19
 * userspace version here such that the hashes will match that
20
 * implementation.
21
 */
22
23
#include "config.h"
24
#include <stdint.h>
25
#include <unistd.h>
26
#include <string.h>
27
#include <limits.h>
28
#include <errno.h>
29
30
#include "ext2_fs.h"
31
#include "ext2fs.h"
32
#include "ext2fsP.h"
33
34
/* Encoding a unicode version number as a single unsigned int. */
35
#define UNICODE_MAJ_SHIFT   (16)
36
#define UNICODE_MIN_SHIFT   (8)
37
38
#define UNICODE_AGE(MAJ, MIN, REV)      \
39
  (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
40
   ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
41
   ((unsigned int)(REV)))
42
43
/* Needed in struct utf8cursor below. */
44
#define UTF8HANGULLEAF  (12)
45
46
/*
47
 * Cursor structure used by the normalizer.
48
 */
49
struct utf8cursor {
50
  const struct utf8data *data;
51
  const char  *s;
52
  const char  *p;
53
  const char  *ss;
54
  const char  *sp;
55
  unsigned int  len;
56
  unsigned int  slen;
57
  short int ccc;
58
  short int nccc;
59
  unsigned char hangul[UTF8HANGULLEAF];
60
};
61
62
/*
63
 * Initialize a utf8cursor to normalize a string.
64
 * Returns 0 on success.
65
 * Returns -1 on failure.
66
 */
67
// extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
68
//          const char *s);
69
// extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
70
//           const char *s, size_t len);
71
72
/*
73
 * Get the next byte in the normalization.
74
 * Returns a value > 0 && < 256 on success.
75
 * Returns 0 when the end of the normalization is reached.
76
 * Returns -1 if the string being normalized is not valid UTF-8.
77
 */
78
// extern int utf8byte(struct utf8cursor *u8c);
79
80
81
struct utf8data {
82
  unsigned int maxage;
83
  unsigned int offset;
84
};
85
86
#define __INCLUDED_FROM_UTF8NORM_C__
87
#include "utf8data.h"
88
#undef __INCLUDED_FROM_UTF8NORM_C__
89
90
#define ARRAY_SIZE(array)     \
91
0
        (sizeof(array) / sizeof(array[0]))
92
93
#if 0
94
/* Highest unicode version supported by the data tables. */
95
static int utf8version_is_supported(uint8_t maj, uint8_t min, uint8_t rev)
96
{
97
  int i = ARRAY_SIZE(utf8agetab) - 1;
98
  unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev);
99
100
  while (i >= 0 && utf8agetab[i] != 0) {
101
    if (sb_utf8version == utf8agetab[i])
102
      return 1;
103
    i--;
104
  }
105
  return 0;
106
}
107
#endif
108
109
#if 0
110
static int utf8version_latest(void)
111
{
112
  return utf8vers;
113
}
114
#endif
115
116
/*
117
 * UTF-8 valid ranges.
118
 *
119
 * The UTF-8 encoding spreads the bits of a 32bit word over several
120
 * bytes. This table gives the ranges that can be held and how they'd
121
 * be represented.
122
 *
123
 * 0x00000000 0x0000007F: 0xxxxxxx
124
 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
125
 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
126
 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
127
 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
128
 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
129
 *
130
 * There is an additional requirement on UTF-8, in that only the
131
 * shortest representation of a 32bit value is to be used.  A decoder
132
 * must not decode sequences that do not satisfy this requirement.
133
 * Thus the allowed ranges have a lower bound.
134
 *
135
 * 0x00000000 0x0000007F: 0xxxxxxx
136
 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
137
 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
138
 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
139
 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
140
 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
141
 *
142
 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
143
 * 17 planes of 65536 values.  This limits the sequences actually seen
144
 * even more, to just the following.
145
 *
146
 *          0 -     0x7F: 0                   - 0x7F
147
 *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
148
 *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
149
 *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
150
 *
151
 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
152
 *
153
 * Note that the longest sequence seen with valid usage is 4 bytes,
154
 * the same a single UTF-32 character.  This makes the UTF-8
155
 * representation of Unicode strictly smaller than UTF-32.
156
 *
157
 * The shortest sequence requirement was introduced by:
158
 *    Corrigendum #1: UTF-8 Shortest Form
159
 * It can be found here:
160
 *    http://www.unicode.org/versions/corrigendum1.html
161
 *
162
 */
163
164
/*
165
 * Return the number of bytes used by the current UTF-8 sequence.
166
 * Assumes the input points to the first byte of a valid UTF-8
167
 * sequence.
168
 */
169
static inline int utf8clen(const char *s)
170
0
{
171
0
  unsigned char c = *s;
172
173
0
  return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
174
0
}
175
176
/*
177
 * Decode a 3-byte UTF-8 sequence.
178
 */
179
static unsigned int
180
utf8decode3(const char *str)
181
0
{
182
0
  unsigned int    uc;
183
184
0
  uc = *str++ & 0x0F;
185
0
  uc <<= 6;
186
0
  uc |= *str++ & 0x3F;
187
0
  uc <<= 6;
188
0
  uc |= *str++ & 0x3F;
189
190
0
  return uc;
191
0
}
192
193
/*
194
 * Encode a 3-byte UTF-8 sequence.
195
 */
196
static int
197
utf8encode3(char *str, unsigned int val)
198
0
{
199
0
  str[2] = (val & 0x3F) | 0x80;
200
0
  val >>= 6;
201
0
  str[1] = (val & 0x3F) | 0x80;
202
0
  val >>= 6;
203
0
  str[0] = val | 0xE0;
204
205
0
  return 3;
206
0
}
207
208
/*
209
 * utf8trie_t
210
 *
211
 * A compact binary tree, used to decode UTF-8 characters.
212
 *
213
 * Internal nodes are one byte for the node itself, and up to three
214
 * bytes for an offset into the tree.  The first byte contains the
215
 * following information:
216
 *  NEXTBYTE  - flag        - advance to next byte if set
217
 *  BITNUM    - 3 bit field - the bit number to tested
218
 *  OFFLEN    - 2 bit field - number of bytes in the offset
219
 * if offlen == 0 (non-branching node)
220
 *  RIGHTPATH - 1 bit field - set if the following node is for the
221
 *                            right-hand path (tested bit is set)
222
 *  TRIENODE  - 1 bit field - set if the following node is an internal
223
 *                            node, otherwise it is a leaf node
224
 * if offlen != 0 (branching node)
225
 *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
226
 *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
227
 *
228
 * Due to the way utf8 works, there cannot be branching nodes with
229
 * NEXTBYTE set, and moreover those nodes always have a righthand
230
 * descendant.
231
 */
232
typedef const unsigned char utf8trie_t;
233
0
#define BITNUM    0x07
234
0
#define NEXTBYTE  0x08
235
0
#define OFFLEN    0x30
236
0
#define OFFLEN_SHIFT  4
237
0
#define RIGHTPATH 0x40
238
0
#define TRIENODE  0x80
239
0
#define RIGHTNODE 0x40
240
0
#define LEFTNODE  0x80
241
242
/*
243
 * utf8leaf_t
244
 *
245
 * The leaves of the trie are embedded in the trie, and so the same
246
 * underlying datatype: unsigned char.
247
 *
248
 * leaf[0]: The unicode version, stored as a generation number that is
249
 *          an index into utf8agetab[].  With this we can filter code
250
 *          points based on the unicode version in which they were
251
 *          defined.  The CCC of a non-defined code point is 0.
252
 * leaf[1]: Canonical Combining Class. During normalization, we need
253
 *          to do a stable sort into ascending order of all characters
254
 *          with a non-zero CCC that occur between two characters with
255
 *          a CCC of 0, or at the begin or end of a string.
256
 *          The unicode standard guarantees that all CCC values are
257
 *          between 0 and 254 inclusive, which leaves 255 available as
258
 *          a special value.
259
 *          Code points with CCC 0 are known as stoppers.
260
 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
261
 *          start of a NUL-terminated string that is the decomposition
262
 *          of the character.
263
 *          The CCC of a decomposable character is the same as the CCC
264
 *          of the first character of its decomposition.
265
 *          Some characters decompose as the empty string: these are
266
 *          characters with the Default_Ignorable_Code_Point property.
267
 *          These do affect normalization, as they all have CCC 0.
268
 *
269
 * The decompositions in the trie have been fully expanded, with the
270
 * exception of Hangul syllables, which are decomposed algorithmically.
271
 *
272
 * Casefolding, if applicable, is also done using decompositions.
273
 *
274
 * The trie is constructed in such a way that leaves exist for all
275
 * UTF-8 sequences that match the criteria from the "UTF-8 valid
276
 * ranges" comment above, and only for those sequences.  Therefore a
277
 * lookup in the trie can be used to validate the UTF-8 input.
278
 */
279
typedef const unsigned char utf8leaf_t;
280
281
0
#define LEAF_GEN(LEAF)  ((LEAF)[0])
282
0
#define LEAF_CCC(LEAF)  ((LEAF)[1])
283
0
#define LEAF_STR(LEAF)  ((const char *)((LEAF) + 2))
284
285
0
#define MINCCC    (0)
286
0
#define MAXCCC    (254)
287
0
#define STOPPER   (0)
288
0
#define DECOMPOSE (255)
289
290
/* Marker for hangul syllable decomposition. */
291
0
#define HANGUL    ((char)(255))
292
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
293
#define UTF8HANGULLEAF  (12)
294
295
/*
296
 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
297
 *
298
 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
299
 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
300
 *
301
 * SBase = 0xAC00
302
 * LBase = 0x1100
303
 * VBase = 0x1161
304
 * TBase = 0x11A7
305
 * LCount = 19
306
 * VCount = 21
307
 * TCount = 28
308
 * NCount = 588 (VCount * TCount)
309
 * SCount = 11172 (LCount * NCount)
310
 *
311
 * Decomposition:
312
 *   SIndex = s - SBase
313
 *
314
 * LV (Canonical/Full)
315
 *   LIndex = SIndex / NCount
316
 *   VIndex = (Sindex % NCount) / TCount
317
 *   LPart = LBase + LIndex
318
 *   VPart = VBase + VIndex
319
 *
320
 * LVT (Canonical)
321
 *   LVIndex = (SIndex / TCount) * TCount
322
 *   TIndex = (Sindex % TCount)
323
 *   LVPart = SBase + LVIndex
324
 *   TPart = TBase + TIndex
325
 *
326
 * LVT (Full)
327
 *   LIndex = SIndex / NCount
328
 *   VIndex = (Sindex % NCount) / TCount
329
 *   TIndex = (Sindex % TCount)
330
 *   LPart = LBase + LIndex
331
 *   VPart = VBase + VIndex
332
 *   if (TIndex == 0) {
333
 *          d = <LPart, VPart>
334
 *   } else {
335
 *          TPart = TBase + TIndex
336
 *          d = <LPart, TPart, VPart>
337
 *   }
338
 */
339
340
/* Constants */
341
0
#define SB  (0xAC00)
342
0
#define LB  (0x1100)
343
0
#define VB  (0x1161)
344
0
#define TB  (0x11A7)
345
#define LC  (19)
346
0
#define VC  (21)
347
0
#define TC  (28)
348
0
#define NC  (VC * TC)
349
#define SC  (LC * NC)
350
351
/* Algorithmic decomposition of hangul syllable. */
352
static utf8leaf_t *
353
utf8hangul(const char *str, unsigned char *hangul)
354
0
{
355
0
  unsigned int  si;
356
0
  unsigned int  li;
357
0
  unsigned int  vi;
358
0
  unsigned int  ti;
359
0
  unsigned char *h;
360
361
  /* Calculate the SI, LI, VI, and TI values. */
362
0
  si = utf8decode3(str) - SB;
363
0
  li = si / NC;
364
0
  vi = (si % NC) / TC;
365
0
  ti = si % TC;
366
367
  /* Fill in base of leaf. */
368
0
  h = hangul;
369
0
  LEAF_GEN(h) = 2;
370
0
  LEAF_CCC(h) = DECOMPOSE;
371
0
  h += 2;
372
373
  /* Add LPart, a 3-byte UTF-8 sequence. */
374
0
  h += utf8encode3((char *)h, li + LB);
375
376
  /* Add VPart, a 3-byte UTF-8 sequence. */
377
0
  h += utf8encode3((char *)h, vi + VB);
378
379
  /* Add TPart if required, also a 3-byte UTF-8 sequence. */
380
0
  if (ti)
381
0
    h += utf8encode3((char *)h, ti + TB);
382
383
  /* Terminate string. */
384
0
  h[0] = '\0';
385
386
0
  return hangul;
387
0
}
388
389
/*
390
 * Use trie to scan s, touching at most len bytes.
391
 * Returns the leaf if one exists, NULL otherwise.
392
 *
393
 * A non-NULL return guarantees that the UTF-8 sequence starting at s
394
 * is well-formed and corresponds to a known unicode code point.  The
395
 * shorthand for this will be "is valid UTF-8 unicode".
396
 */
397
static utf8leaf_t *utf8nlookup(const struct utf8data *data,
398
             unsigned char *hangul, const char *s, size_t len)
399
0
{
400
0
  utf8trie_t  *trie;
401
0
  int   offlen;
402
0
  int   offset;
403
0
  int   mask;
404
0
  int   node;
405
406
0
  if (!data)
407
0
    return NULL;
408
0
  if (len == 0)
409
0
    return NULL;
410
411
0
  trie = utf8data + data->offset;
412
0
  node = 1;
413
0
  while (node) {
414
0
    offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
415
0
    if (*trie & NEXTBYTE) {
416
0
      if (--len == 0)
417
0
        return NULL;
418
0
      s++;
419
0
    }
420
0
    mask = 1 << (*trie & BITNUM);
421
0
    if (*s & mask) {
422
      /* Right leg */
423
0
      if (offlen) {
424
        /* Right node at offset of trie */
425
0
        node = (*trie & RIGHTNODE);
426
0
        offset = trie[offlen];
427
0
        while (--offlen) {
428
0
          offset <<= 8;
429
0
          offset |= trie[offlen];
430
0
        }
431
0
        trie += offset;
432
0
      } else if (*trie & RIGHTPATH) {
433
        /* Right node after this node */
434
0
        node = (*trie & TRIENODE);
435
0
        trie++;
436
0
      } else {
437
        /* No right node. */
438
0
        return NULL;
439
0
      }
440
0
    } else {
441
      /* Left leg */
442
0
      if (offlen) {
443
        /* Left node after this node. */
444
0
        node = (*trie & LEFTNODE);
445
0
        trie += offlen + 1;
446
0
      } else if (*trie & RIGHTPATH) {
447
        /* No left node. */
448
0
        return NULL;
449
0
      } else {
450
        /* Left node after this node */
451
0
        node = (*trie & TRIENODE);
452
0
        trie++;
453
0
      }
454
0
    }
455
0
  }
456
  /*
457
   * Hangul decomposition is done algorithmically. These are the
458
   * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
459
   * always 3 bytes long, so s has been advanced twice, and the
460
   * start of the sequence is at s-2.
461
   */
462
0
  if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
463
0
    trie = utf8hangul(s - 2, hangul);
464
0
  return trie;
465
0
}
466
467
/*
468
 * Use trie to scan s.
469
 * Returns the leaf if one exists, NULL otherwise.
470
 *
471
 * Forwards to utf8nlookup().
472
 */
473
static utf8leaf_t *utf8lookup(const struct utf8data *data,
474
            unsigned char *hangul, const char *s)
475
0
{
476
0
  return utf8nlookup(data, hangul, s, (size_t)-1);
477
0
}
478
479
#if 0
480
/*
481
 * Maximum age of any character in s.
482
 * Return -1 if s is not valid UTF-8 unicode.
483
 * Return 0 if only non-assigned code points are used.
484
 */
485
static int utf8agemax(const struct utf8data *data, const char *s)
486
{
487
  utf8leaf_t  *leaf;
488
  int   age = 0;
489
  int   leaf_age;
490
  unsigned char hangul[UTF8HANGULLEAF];
491
492
  if (!data)
493
    return -1;
494
495
  while (*s) {
496
    leaf = utf8lookup(data, hangul, s);
497
    if (!leaf)
498
      return -1;
499
500
    leaf_age = utf8agetab[LEAF_GEN(leaf)];
501
    if (leaf_age <= data->maxage && leaf_age > age)
502
      age = leaf_age;
503
    s += utf8clen(s);
504
  }
505
  return age;
506
}
507
#endif
508
509
#if 0
510
/*
511
 * Minimum age of any character in s.
512
 * Return -1 if s is not valid UTF-8 unicode.
513
 * Return 0 if non-assigned code points are used.
514
 */
515
static int utf8agemin(const struct utf8data *data, const char *s)
516
{
517
  utf8leaf_t  *leaf;
518
  int   age;
519
  int   leaf_age;
520
  unsigned char hangul[UTF8HANGULLEAF];
521
522
  if (!data)
523
    return -1;
524
  age = data->maxage;
525
  while (*s) {
526
    leaf = utf8lookup(data, hangul, s);
527
    if (!leaf)
528
      return -1;
529
    leaf_age = utf8agetab[LEAF_GEN(leaf)];
530
    if (leaf_age <= data->maxage && leaf_age < age)
531
      age = leaf_age;
532
    s += utf8clen(s);
533
  }
534
  return age;
535
}
536
#endif
537
538
#if 0
539
/*
540
 * Maximum age of any character in s, touch at most len bytes.
541
 * Return -1 if s is not valid UTF-8 unicode.
542
 */
543
static int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
544
{
545
  utf8leaf_t  *leaf;
546
  int   age = 0;
547
  int   leaf_age;
548
  unsigned char hangul[UTF8HANGULLEAF];
549
550
  if (!data)
551
    return -1;
552
553
  while (len && *s) {
554
    leaf = utf8nlookup(data, hangul, s, len);
555
    if (!leaf)
556
      return -1;
557
    leaf_age = utf8agetab[LEAF_GEN(leaf)];
558
    if (leaf_age <= data->maxage && leaf_age > age)
559
      age = leaf_age;
560
    len -= utf8clen(s);
561
    s += utf8clen(s);
562
  }
563
  return age;
564
}
565
#endif
566
567
#if 0
568
/*
569
 * Maximum age of any character in s, touch at most len bytes.
570
 * Return -1 if s is not valid UTF-8 unicode.
571
 */
572
static int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
573
{
574
  utf8leaf_t  *leaf;
575
  int   leaf_age;
576
  int   age;
577
  unsigned char hangul[UTF8HANGULLEAF];
578
579
  if (!data)
580
    return -1;
581
  age = data->maxage;
582
  while (len && *s) {
583
    leaf = utf8nlookup(data, hangul, s, len);
584
    if (!leaf)
585
      return -1;
586
    leaf_age = utf8agetab[LEAF_GEN(leaf)];
587
    if (leaf_age <= data->maxage && leaf_age < age)
588
      age = leaf_age;
589
    len -= utf8clen(s);
590
    s += utf8clen(s);
591
  }
592
  return age;
593
}
594
#endif
595
596
#if 0
597
/*
598
 * Length of the normalization of s.
599
 * Return -1 if s is not valid UTF-8 unicode.
600
 *
601
 * A string of Default_Ignorable_Code_Point has length 0.
602
 */
603
static ssize_t utf8len(const struct utf8data *data, const char *s)
604
{
605
  utf8leaf_t  *leaf;
606
  size_t    ret = 0;
607
  unsigned char hangul[UTF8HANGULLEAF];
608
609
  if (!data)
610
    return -1;
611
  while (*s) {
612
    leaf = utf8lookup(data, hangul, s);
613
    if (!leaf)
614
      return -1;
615
    if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
616
      ret += utf8clen(s);
617
    else if (LEAF_CCC(leaf) == DECOMPOSE)
618
      ret += strlen(LEAF_STR(leaf));
619
    else
620
      ret += utf8clen(s);
621
    s += utf8clen(s);
622
  }
623
  return ret;
624
}
625
#endif
626
627
#if 0
628
/*
629
 * Length of the normalization of s, touch at most len bytes.
630
 * Return -1 if s is not valid UTF-8 unicode.
631
 */
632
static ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
633
{
634
  utf8leaf_t  *leaf;
635
  size_t    ret = 0;
636
  unsigned char hangul[UTF8HANGULLEAF];
637
638
  if (!data)
639
    return -1;
640
  while (len && *s) {
641
    leaf = utf8nlookup(data, hangul, s, len);
642
    if (!leaf)
643
      return -1;
644
    if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
645
      ret += utf8clen(s);
646
    else if (LEAF_CCC(leaf) == DECOMPOSE)
647
      ret += strlen(LEAF_STR(leaf));
648
    else
649
      ret += utf8clen(s);
650
    len -= utf8clen(s);
651
    s += utf8clen(s);
652
  }
653
  return ret;
654
}
655
#endif
656
657
/*
658
 * Set up an utf8cursor for use by utf8byte().
659
 *
660
 *   u8c    : pointer to cursor.
661
 *   data   : const struct utf8data to use for normalization.
662
 *   s      : string.
663
 *   len    : length of s.
664
 *
665
 * Returns -1 on error, 0 on success.
666
 */
667
static int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
668
    const char *s, size_t len)
669
0
{
670
0
  if (!data)
671
0
    return -1;
672
0
  if (!s)
673
0
    return -1;
674
0
  u8c->data = data;
675
0
  u8c->s = s;
676
0
  u8c->p = NULL;
677
0
  u8c->ss = NULL;
678
0
  u8c->sp = NULL;
679
0
  u8c->len = len;
680
0
  u8c->slen = 0;
681
0
  u8c->ccc = STOPPER;
682
0
  u8c->nccc = STOPPER;
683
  /* Check we didn't clobber the maximum length. */
684
0
  if (u8c->len != len)
685
0
    return -1;
686
  /* The first byte of s may not be an utf8 continuation. */
687
0
  if (len > 0 && (*s & 0xC0) == 0x80)
688
0
    return -1;
689
0
  return 0;
690
0
}
691
692
#if 0
693
/*
694
 * Set up an utf8cursor for use by utf8byte().
695
 *
696
 *   u8c    : pointer to cursor.
697
 *   data   : const struct utf8data to use for normalization.
698
 *   s      : NUL-terminated string.
699
 *
700
 * Returns -1 on error, 0 on success.
701
 */
702
static int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
703
         const char *s)
704
{
705
  return utf8ncursor(u8c, data, s, (unsigned int)-1);
706
}
707
#endif
708
709
/*
710
 * Get one byte from the normalized form of the string described by u8c.
711
 *
712
 * Returns the byte cast to an unsigned char on success, and -1 on failure.
713
 *
714
 * The cursor keeps track of the location in the string in u8c->s.
715
 * When a character is decomposed, the current location is stored in
716
 * u8c->p, and u8c->s is set to the start of the decomposition. Note
717
 * that bytes from a decomposition do not count against u8c->len.
718
 *
719
 * Characters are emitted if they match the current CCC in u8c->ccc.
720
 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
721
 * and the function returns 0 in that case.
722
 *
723
 * Sorting by CCC is done by repeatedly scanning the string.  The
724
 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
725
 * the start of the scan.  The first pass finds the lowest CCC to be
726
 * emitted and stores it in u8c->nccc, the second pass emits the
727
 * characters with this CCC and finds the next lowest CCC. This limits
728
 * the number of passes to 1 + the number of different CCCs in the
729
 * sequence being scanned.
730
 *
731
 * Therefore:
732
 *  u8c->p  != NULL -> a decomposition is being scanned.
733
 *  u8c->ss != NULL -> this is a repeating scan.
734
 *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
735
 */
736
static int utf8byte(struct utf8cursor *u8c)
737
0
{
738
0
  utf8leaf_t *leaf;
739
0
  int ccc;
740
741
0
  for (;;) {
742
    /* Check for the end of a decomposed character. */
743
0
    if (u8c->p && *u8c->s == '\0') {
744
0
      u8c->s = u8c->p;
745
0
      u8c->p = NULL;
746
0
    }
747
748
    /* Check for end-of-string. */
749
0
    if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
750
      /* There is no next byte. */
751
0
      if (u8c->ccc == STOPPER)
752
0
        return 0;
753
      /* End-of-string during a scan counts as a stopper. */
754
0
      ccc = STOPPER;
755
0
      goto ccc_mismatch;
756
0
    } else if ((*u8c->s & 0xC0) == 0x80) {
757
      /* This is a continuation of the current character. */
758
0
      if (!u8c->p)
759
0
        u8c->len--;
760
0
      return (unsigned char)*u8c->s++;
761
0
    }
762
763
    /* Look up the data for the current character. */
764
0
    if (u8c->p) {
765
0
      leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
766
0
    } else {
767
0
      leaf = utf8nlookup(u8c->data, u8c->hangul,
768
0
             u8c->s, u8c->len);
769
0
    }
770
771
    /* No leaf found implies that the input is a binary blob. */
772
0
    if (!leaf)
773
0
      return -1;
774
775
0
    ccc = LEAF_CCC(leaf);
776
    /* Characters that are too new have CCC 0. */
777
0
    if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
778
0
      ccc = STOPPER;
779
0
    } else if (ccc == DECOMPOSE) {
780
0
      u8c->len -= utf8clen(u8c->s);
781
0
      u8c->p = u8c->s + utf8clen(u8c->s);
782
0
      u8c->s = LEAF_STR(leaf);
783
      /* Empty decomposition implies CCC 0. */
784
0
      if (*u8c->s == '\0') {
785
0
        if (u8c->ccc == STOPPER)
786
0
          continue;
787
0
        ccc = STOPPER;
788
0
        goto ccc_mismatch;
789
0
      }
790
791
0
      leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
792
0
      if (!leaf)
793
0
        return -1;
794
0
      ccc = LEAF_CCC(leaf);
795
0
    }
796
797
    /*
798
     * If this is not a stopper, then see if it updates
799
     * the next canonical class to be emitted.
800
     */
801
0
    if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
802
0
      u8c->nccc = ccc;
803
804
    /*
805
     * Return the current byte if this is the current
806
     * combining class.
807
     */
808
0
    if (ccc == u8c->ccc) {
809
0
      if (!u8c->p)
810
0
        u8c->len--;
811
0
      return (unsigned char)*u8c->s++;
812
0
    }
813
814
    /* Current combining class mismatch. */
815
0
ccc_mismatch:
816
0
    if (u8c->nccc == STOPPER) {
817
      /*
818
       * Scan forward for the first canonical class
819
       * to be emitted.  Save the position from
820
       * which to restart.
821
       */
822
0
      u8c->ccc = MINCCC - 1;
823
0
      u8c->nccc = ccc;
824
0
      u8c->sp = u8c->p;
825
0
      u8c->ss = u8c->s;
826
0
      u8c->slen = u8c->len;
827
0
      if (!u8c->p)
828
0
        u8c->len -= utf8clen(u8c->s);
829
0
      u8c->s += utf8clen(u8c->s);
830
0
    } else if (ccc != STOPPER) {
831
      /* Not a stopper, and not the ccc we're emitting. */
832
0
      if (!u8c->p)
833
0
        u8c->len -= utf8clen(u8c->s);
834
0
      u8c->s += utf8clen(u8c->s);
835
0
    } else if (u8c->nccc != MAXCCC + 1) {
836
      /* At a stopper, restart for next ccc. */
837
0
      u8c->ccc = u8c->nccc;
838
0
      u8c->nccc = MAXCCC + 1;
839
0
      u8c->s = u8c->ss;
840
0
      u8c->p = u8c->sp;
841
0
      u8c->len = u8c->slen;
842
0
    } else {
843
      /* All done, proceed from here. */
844
0
      u8c->ccc = STOPPER;
845
0
      u8c->nccc = STOPPER;
846
0
      u8c->sp = NULL;
847
0
      u8c->ss = NULL;
848
0
      u8c->slen = 0;
849
0
    }
850
0
  }
851
0
}
852
853
#if 0
854
/*
855
 * Look for the correct const struct utf8data for a unicode version.
856
 * Returns NULL if the version requested is too new.
857
 *
858
 * Two normalization forms are supported: nfdi and nfdicf.
859
 *
860
 * nfdi:
861
 *  - Apply unicode normalization form NFD.
862
 *  - Remove any Default_Ignorable_Code_Point.
863
 *
864
 * nfdicf:
865
 *  - Apply unicode normalization form NFD.
866
 *  - Remove any Default_Ignorable_Code_Point.
867
 *  - Apply a full casefold (C + F).
868
 */
869
static const struct utf8data *utf8nfdi(unsigned int maxage)
870
{
871
  int i = ARRAY_SIZE(utf8nfdidata) - 1;
872
873
  while (maxage < utf8nfdidata[i].maxage)
874
    i--;
875
  if (maxage > utf8nfdidata[i].maxage)
876
    return NULL;
877
  return &utf8nfdidata[i];
878
}
879
#endif
880
881
static const struct utf8data *utf8nfdicf(unsigned int maxage)
882
0
{
883
0
  int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
884
885
0
  while (maxage < utf8nfdicfdata[i].maxage)
886
0
    i--;
887
0
  if (maxage > utf8nfdicfdata[i].maxage)
888
0
    return NULL;
889
0
  return &utf8nfdicfdata[i];
890
0
}
891
892
static int utf8_casefold(const struct ext2fs_nls_table *table,
893
        const unsigned char *str, size_t len,
894
        unsigned char *dest, size_t dlen)
895
0
{
896
0
  const struct utf8data *data = utf8nfdicf(table->version);
897
0
  struct utf8cursor cur;
898
0
  size_t nlen = 0;
899
900
0
  if (utf8ncursor(&cur, data, (const char *) str, len) < 0)
901
0
    goto invalid_seq;
902
903
0
  for (nlen = 0; nlen < dlen; nlen++) {
904
0
    int c = utf8byte(&cur);
905
906
0
    dest[nlen] = c;
907
0
    if (!c)
908
0
      return nlen;
909
0
    if (c == -1)
910
0
      break;
911
0
  }
912
913
0
  return -ENAMETOOLONG;
914
915
0
invalid_seq:
916
0
  if (dlen < len)
917
0
    return -ENAMETOOLONG;
918
919
  /* Signal invalid sequence */
920
0
  return -EINVAL;
921
0
}
922
923
static int utf8_validate(const struct ext2fs_nls_table *table,
924
       char *s, size_t len, char **pos)
925
0
{
926
0
  const struct utf8data *data = utf8nfdicf(table->version);
927
0
  utf8leaf_t  *leaf;
928
0
  unsigned char hangul[UTF8HANGULLEAF];
929
930
0
  if (!data)
931
0
    return -1;
932
0
  while (len && *s) {
933
0
    leaf = utf8nlookup(data, hangul, s, len);
934
0
    if (!leaf) {
935
0
      *pos = s;
936
0
      return 1;
937
0
    }
938
0
    len -= utf8clen(s);
939
0
    s += utf8clen(s);
940
0
  }
941
0
  return 0;
942
0
}
943
944
static int utf8_casefold_cmp(const struct ext2fs_nls_table *table,
945
           const unsigned char *str1, size_t len1,
946
           const unsigned char *str2, size_t len2)
947
0
{
948
0
  const struct utf8data *data = utf8nfdicf(table->version);
949
0
  int c1, c2;
950
0
  struct utf8cursor cur1, cur2;
951
952
0
  if (utf8ncursor(&cur1, data, (const char *) str1, len1) < 0)
953
0
    return -1;
954
0
  if (utf8ncursor(&cur2, data, (const char *) str2, len2) < 0)
955
0
    return -1;
956
957
0
  do {
958
0
    c1 = utf8byte(&cur1);
959
0
    c2 = utf8byte(&cur2);
960
961
0
    if (c1 < 0 || c2 < 0)
962
0
      return -1;
963
0
    if (c1 != c2)
964
0
      return c1 - c2;
965
0
  } while (c1);
966
967
0
  return 0;
968
0
}
969
970
static const struct ext2fs_nls_ops utf8_ops = {
971
  .casefold = utf8_casefold,
972
  .validate = utf8_validate,
973
  .casefold_cmp = utf8_casefold_cmp,
974
};
975
976
static const struct ext2fs_nls_table nls_utf8 = {
977
  .ops = &utf8_ops,
978
  .version = UNICODE_AGE(12, 1, 0),
979
};
980
981
const struct ext2fs_nls_table *ext2fs_load_nls_table(int encoding)
982
21
{
983
21
  if (encoding == EXT4_ENC_UTF8_12_1)
984
1
    return &nls_utf8;
985
986
20
  return NULL;
987
21
}
988
989
int ext2fs_check_encoded_name(const struct ext2fs_nls_table *table,
990
            char *name, size_t len, char **pos)
991
0
{
992
0
  return table->ops->validate(table, name, len, pos);
993
0
}
994
995
int ext2fs_casefold_cmp(const struct ext2fs_nls_table *table,
996
      const unsigned char *str1, size_t len1,
997
      const unsigned char *str2, size_t len2)
998
0
{
999
0
  return table->ops->casefold_cmp(table, str1, len1, str2, len2);
1000
0
}