Coverage Report

Created: 2025-08-28 06:48

/src/hermes/external/llvh/lib/Support/ConvertUTF.cpp
Line
Count
Source (jump to first uncovered line)
1
/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
2
 *
3
 *                     The LLVM Compiler Infrastructure
4
 *
5
 * This file is distributed under the University of Illinois Open Source
6
 * License. See LICENSE.TXT for details.
7
 *
8
 *===------------------------------------------------------------------------=*/
9
/*
10
 * Copyright 2001-2004 Unicode, Inc.
11
 *
12
 * Disclaimer
13
 *
14
 * This source code is provided as is by Unicode, Inc. No claims are
15
 * made as to fitness for any particular purpose. No warranties of any
16
 * kind are expressed or implied. The recipient agrees to determine
17
 * applicability of information provided. If this file has been
18
 * purchased on magnetic or optical media from Unicode, Inc., the
19
 * sole remedy for any claim will be exchange of defective media
20
 * within 90 days of receipt.
21
 *
22
 * Limitations on Rights to Redistribute This Code
23
 *
24
 * Unicode, Inc. hereby grants the right to freely use the information
25
 * supplied in this file in the creation of products supporting the
26
 * Unicode Standard, and to make copies of this file in any form
27
 * for internal or external distribution as long as this notice
28
 * remains attached.
29
 */
30
31
/* ---------------------------------------------------------------------
32
33
    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
34
    Author: Mark E. Davis, 1994.
35
    Rev History: Rick McGowan, fixes & updates May 2001.
36
    Sept 2001: fixed const & error conditions per
37
        mods suggested by S. Parent & A. Lillich.
38
    June 2002: Tim Dodd added detection and handling of incomplete
39
        source sequences, enhanced error detection, added casts
40
        to eliminate compiler warnings.
41
    July 2003: slight mods to back out aggressive FFFE detection.
42
    Jan 2004: updated switches in from-UTF8 conversions.
43
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
44
45
    See the header file "ConvertUTF.h" for complete documentation.
46
47
------------------------------------------------------------------------ */
48
49
#include "llvh/Support/ConvertUTF.h"
50
#ifdef CVTUTF_DEBUG
51
#include <stdio.h>
52
#endif
53
#include <assert.h>
54
55
/*
56
 * This code extensively uses fall-through switches.
57
 * Keep the compiler from warning about that.
58
 */
59
/*
 * ConvertUTF_DISABLE_WARNINGS / ConvertUTF_RESTORE_WARNINGS bracket the code
 * that relies on intentional switch fall-through.  On clang (when it knows
 * the warning) and on GCC newer than 6 they push/pop the diagnostic state and
 * silence -Wimplicit-fallthrough; everywhere else they expand to nothing.
 */
#if defined(__clang__) && defined(__has_warning)
#if __has_warning("-Wimplicit-fallthrough")
#define ConvertUTF_DISABLE_WARNINGS                                            \
    _Pragma("clang diagnostic push")                                           \
    _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#define ConvertUTF_RESTORE_WARNINGS                                            \
    _Pragma("clang diagnostic pop")
#endif
#elif defined(__GNUC__) && __GNUC__ > 6
#define ConvertUTF_DISABLE_WARNINGS                                            \
    _Pragma("GCC diagnostic push")                                             \
    _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#define ConvertUTF_RESTORE_WARNINGS                                            \
    _Pragma("GCC diagnostic pop")
#endif

/* Fallbacks: make both macros usable even when no pragma is available. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
#define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
#define ConvertUTF_RESTORE_WARNINGS
#endif
80
81
ConvertUTF_DISABLE_WARNINGS
82
83
namespace llvh {
84
85
static const int halfShift  = 10; /* used for shifting by 10 bits */
86
87
static const UTF32 halfBase = 0x0010000UL;
88
static const UTF32 halfMask = 0x3FFUL;
89
90
0
#define UNI_SUR_HIGH_START  (UTF32)0xD800
91
0
#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
92
0
#define UNI_SUR_LOW_START   (UTF32)0xDC00
93
0
#define UNI_SUR_LOW_END     (UTF32)0xDFFF
94
95
/* --------------------------------------------------------------------- */
96
97
/*
98
 * Index into the table below with the first byte of a UTF-8 sequence to
99
 * get the number of trailing bytes that are supposed to follow it.
100
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes. The table is
101
 * left as-is for anyone who may want to do such conversion, which was
102
 * allowed in earlier algorithms.
103
 */
104
/* Indexed by the first byte of a sequence; yields the trailing-byte count. */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10-0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20-0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30-0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40-0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50-0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60-0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70-0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80-0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90-0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0-0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0-0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0-0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0-0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0-0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0-0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
114
115
/*
116
 * Magic values subtracted from a buffer value during UTF8 conversion.
117
 * This table contains as many values as there might be trailing bytes
118
 * in a UTF-8 sequence.
119
 */
120
/* Indexed by the number of trailing bytes in the sequence being decoded. */
static const UTF32 offsetsFromUTF8[6] = {
    0x00000000UL, /* 0 trailing bytes */
    0x00003080UL, /* 1 trailing byte  */
    0x000E2080UL, /* 2 trailing bytes */
    0x03C82080UL, /* 3 trailing bytes */
    0xFA082080UL, /* 4 trailing bytes */
    0x82082080UL  /* 5 trailing bytes */
};
122
123
/*
124
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
125
 * into the first byte, depending on how many bytes follow.  There are
126
 * as many entries in this table as there are UTF-8 sequence types.
127
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
128
 * for *legal* UTF-8 will be 4 or fewer bytes total.
129
 */
130
/* The encoders below index this table with the total sequence length. */
static const UTF8 firstByteMark[7] = {
    0x00, /* [0] */
    0x00, /* [1] */
    0xC0, /* [2] */
    0xE0, /* [3] */
    0xF0, /* [4] */
    0xF8, /* [5] */
    0xFC  /* [6] */
};
131
132
/* --------------------------------------------------------------------- */
133
134
/* The interface converts a whole buffer to avoid function-call overhead.
135
 * Constants have been gathered. Loops & conditionals have been removed as
136
 * much as possible for efficiency, in favor of drop-through switches.
137
 * (See "Note A" at the bottom of the file for equivalent code.)
138
 * If your compiler supports it, the "isLegalUTF8" call can be turned
139
 * into an inline function.
140
 */
141
142
143
/* --------------------------------------------------------------------- */
144
145
/*
 * Converts UTF-32 code points from [*sourceStart, sourceEnd) into UTF-16 code
 * units written to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns:
 *   conversionOK     - the whole source range was converted;
 *   targetExhausted  - the target has no room for the next code point; the
 *                      source pointer is left on that code point;
 *   sourceIllegal    - strictConversion only: a surrogate value or a value
 *                      above UNI_MAX_LEGAL_UTF32 was found; the source pointer
 *                      is left on the offending value.
 * With any other flag value, illegal input is replaced by
 * UNI_REPLACEMENT_CHAR and conversion continues.
 */
ConversionResult ConvertUTF32toUTF16 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF16* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        if (target >= targetEnd) {
            result = targetExhausted; break;
        }
        ch = *source++;
        if (ch <= UNI_MAX_BMP) { /* Target is a single UTF-16 code unit */
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = (UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop on the illegal value, exactly like the surrogate case
                 * above, instead of silently skipping it and carrying on.
                 */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            } else {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        } else {
            /*
             * ch is above UNI_MAX_BMP and at most UNI_MAX_LEGAL_UTF32, so it
             * is written as a surrogate pair and needs two code units.
             */
            if (target + 1 >= targetEnd) {
                --source; /* Back up source pointer! */
                result = targetExhausted; break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
191
192
/* --------------------------------------------------------------------- */
193
194
/*
 * Converts UTF-16 code units from [*sourceStart, sourceEnd) into UTF-32 code
 * points written to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns:
 *   conversionOK     - the whole source range was converted;
 *   sourceExhausted  - the source ends right after a high surrogate; the
 *                      source pointer is left on that surrogate;
 *   targetExhausted  - no room for the next code point; the source pointer is
 *                      left at the start of that code point;
 *   sourceIllegal    - strictConversion only: an unpaired surrogate was found;
 *                      the source pointer is left on it.
 * With any other flag value an unpaired surrogate is copied through as-is.
 */
ConversionResult ConvertUTF16toUTF32 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF32* target = *targetStart;
    /*
     * Both are declared outside the loop because the CVTUTF_DEBUG diagnostic
     * below prints them; initialize them so that print never reads an
     * indeterminate value.
     */
    UTF32 ch = 0, ch2 = 0;
    while (source < sourceEnd) {
        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
        ch = *source++;
        ch2 = 0; /* no second code unit examined yet for this iteration */
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* A low surrogate with no preceding high surrogate is illegal */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        if (target >= targetEnd) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted; break;
        }
        *target++ = ch;
    }
    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
if (result == sourceIllegal) {
    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    fflush(stderr);
}
#endif
    return result;
}
248
/*
 * Converts UTF-16 code units from [*sourceStart, sourceEnd) into UTF-8 bytes
 * written to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns:
 *   conversionOK     - the whole source range was converted;
 *   sourceExhausted  - the source ends right after a high surrogate; the
 *                      source pointer is left on that surrogate;
 *   targetExhausted  - not enough room for the next encoded sequence; the
 *                      source pointer is left at the start of that code point;
 *   sourceIllegal    - strictConversion only: an unpaired surrogate was found;
 *                      the source pointer is left on it.
 */
ConversionResult ConvertUTF16toUTF8 (
        const UTF16** sourceStart, const UTF16* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF8* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
        ch = *source++;
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                UTF32 ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* A low surrogate with no preceding high surrogate is illegal */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        /* Figure out how many bytes the result will require */
        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
        } else {                            bytesToWrite = 3;
                                            ch = UNI_REPLACEMENT_CHAR;
        }

        /*
         * Compare against the remaining room instead of advancing target
         * first: forming a pointer beyond one-past-the-end of the buffer is
         * undefined behavior even if it is never dereferenced.
         */
        if (targetEnd - target < bytesToWrite) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted; break;
        }
        /* Jump to the end of the sequence; the bytes are emitted last-to-first. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
315
316
/* --------------------------------------------------------------------- */
317
318
/*
 * Converts UTF-32 code points from [*sourceStart, sourceEnd) into UTF-8 bytes
 * written to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Returns:
 *   conversionOK     - the whole source range was converted;
 *   targetExhausted  - not enough room for the next encoded sequence; the
 *                      source pointer is left on that code point;
 *   sourceIllegal    - strictConversion only: a surrogate value was found and
 *                      the source pointer is left on it; or, in any mode, a
 *                      value above UNI_MAX_LEGAL_UTF32 was replaced by
 *                      UNI_REPLACEMENT_CHAR (conversion continues in that
 *                      case).
 */
ConversionResult ConvertUTF32toUTF8 (
        const UTF32** sourceStart, const UTF32* sourceEnd,
        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF8* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        unsigned short bytesToWrite = 0;
        const UTF32 byteMask = 0xBF;
        const UTF32 byteMark = 0x80;
        ch = *source++;
        if (flags == strictConversion ) {
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        /*
         * Figure out how many bytes the result will require. Turn any
         * illegally large UTF32 things (> Plane 17) into replacement chars.
         */
        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
        } else {                            bytesToWrite = 3;
                                            ch = UNI_REPLACEMENT_CHAR;
                                            result = sourceIllegal;
        }

        /*
         * Compare against the remaining room instead of advancing target
         * first: forming a pointer beyond one-past-the-end of the buffer is
         * undefined behavior even if it is never dereferenced.
         */
        if (targetEnd - target < bytesToWrite) {
            --source; /* Back up source pointer! */
            result = targetExhausted; break;
        }
        /* Jump to the end of the sequence; the bytes are emitted last-to-first. */
        target += bytesToWrite;
        switch (bytesToWrite) { /* note: everything falls through. */
            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
        }
        target += bytesToWrite;
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
368
369
/* --------------------------------------------------------------------- */
370
371
/*
372
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
373
 * This must be called with the length pre-determined by the first byte.
374
 * If not calling this from ConvertUTF8to*, then the length can be set by:
375
 *  length = trailingBytesForUTF8[*source]+1;
376
 * and the sequence is illegal right away if there aren't that many bytes
377
 * available.
378
 * If presented with a length > 4, this returns false.  The Unicode
379
 * definition of UTF-8 goes up to 4-byte sequences.
380
 */
381
382
0
/*
 * Returns true when the `length` bytes starting at `source` form one legal
 * UTF-8 sequence.  Any length outside 1..4 is rejected without reading the
 * buffer.
 */
static Boolean isLegalUTF8(const UTF8 *source, int length) {
    UTF8 lead;
    int idx;

    if (length < 1 || length > 4)
        return false;

    /* Every byte after the first must be in 0x80..0xBF (checked last-to-first). */
    for (idx = length - 1; idx >= 1; --idx) {
        const UTF8 trail = source[idx];
        if (trail < 0x80 || trail > 0xBF)
            return false;
    }

    lead = *source;

    /* Some lead bytes narrow the range allowed for the second byte. */
    if (length >= 2) {
        const UTF8 second = source[1];
        if (lead == 0xE0) {
            if (second < 0xA0) return false;
        } else if (lead == 0xED) {
            if (second > 0x9F) return false;
        } else if (lead == 0xF0) {
            if (second < 0x90) return false;
        } else if (lead == 0xF4) {
            if (second > 0x8F) return false;
        }
    }

    /* 0x80..0xC1 and anything above 0xF4 can never start a sequence. */
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    if (lead > 0xF4)
        return false;
    return true;
}
406
407
/* --------------------------------------------------------------------- */
408
409
/*
410
 * Exported function to return whether a UTF-8 sequence is legal or not.
411
 * Within this file it is only used by an assertion; it is mainly exported.
412
 */
413
0
/*
 * Returns true when a complete, legal UTF-8 sequence starts at `source` and
 * fits inside [source, sourceEnd).  An empty range is never legal.
 */
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
    int length;
    /*
     * Nothing to inspect: bail out before touching *source, which would be a
     * read past the end of the caller's buffer.  The result is the same one
     * the length comparison below would give.
     */
    if (source >= sourceEnd) {
        return false;
    }
    length = trailingBytesForUTF8[*source]+1;
    if (length > sourceEnd - source) {
        return false;
    }
    return isLegalUTF8(source, length);
}
420
421
/* --------------------------------------------------------------------- */
422
423
static unsigned
424
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
425
0
                                          const UTF8 *sourceEnd) {
426
0
  UTF8 b1, b2, b3;
427
428
0
  assert(!isLegalUTF8Sequence(source, sourceEnd));
429
430
  /*
431
   * Unicode 6.3.0, D93b:
432
   *
433
   *   Maximal subpart of an ill-formed subsequence: The longest code unit
434
   *   subsequence starting at an unconvertible offset that is either:
435
   *   a. the initial subsequence of a well-formed code unit sequence, or
436
   *   b. a subsequence of length one.
437
   */
438
439
0
  if (source == sourceEnd)
440
0
    return 0;
441
442
  /*
443
   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
444
   * Byte Sequences.
445
   */
446
447
0
  b1 = *source;
448
0
  ++source;
449
0
  if (b1 >= 0xC2 && b1 <= 0xDF) {
450
    /*
451
     * First byte is valid, but we know that this code unit sequence is
452
     * invalid, so the maximal subpart has to end after the first byte.
453
     */
454
0
    return 1;
455
0
  }
456
457
0
  if (source == sourceEnd)
458
0
    return 1;
459
460
0
  b2 = *source;
461
0
  ++source;
462
463
0
  if (b1 == 0xE0) {
464
0
    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
465
0
  }
466
0
  if (b1 >= 0xE1 && b1 <= 0xEC) {
467
0
    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
468
0
  }
469
0
  if (b1 == 0xED) {
470
0
    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
471
0
  }
472
0
  if (b1 >= 0xEE && b1 <= 0xEF) {
473
0
    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
474
0
  }
475
0
  if (b1 == 0xF0) {
476
0
    if (b2 >= 0x90 && b2 <= 0xBF) {
477
0
      if (source == sourceEnd)
478
0
        return 2;
479
480
0
      b3 = *source;
481
0
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
482
0
    }
483
0
    return 1;
484
0
  }
485
0
  if (b1 >= 0xF1 && b1 <= 0xF3) {
486
0
    if (b2 >= 0x80 && b2 <= 0xBF) {
487
0
      if (source == sourceEnd)
488
0
        return 2;
489
490
0
      b3 = *source;
491
0
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
492
0
    }
493
0
    return 1;
494
0
  }
495
0
  if (b1 == 0xF4) {
496
0
    if (b2 >= 0x80 && b2 <= 0x8F) {
497
0
      if (source == sourceEnd)
498
0
        return 2;
499
500
0
      b3 = *source;
501
0
      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
502
0
    }
503
0
    return 1;
504
0
  }
505
506
0
  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
507
  /*
508
   * There are no valid sequences that start with these bytes.  Maximal subpart
509
   * is defined to have length 1 in these cases.
510
   */
511
0
  return 1;
512
0
}
513
514
/* --------------------------------------------------------------------- */
515
516
/*
517
 * Exported function to return the total number of bytes in a codepoint
518
 * represented in UTF-8, given the value of the first byte.
519
 */
520
0
/* Total sequence length implied by a lead byte: its trailing-byte count plus one. */
unsigned getNumBytesForUTF8(UTF8 first) {
  const int trailing = trailingBytesForUTF8[first];
  return trailing + 1;
}
523
524
/* --------------------------------------------------------------------- */
525
526
/*
527
 * Exported function to return whether a UTF-8 string is legal or not.
528
 * This is not used here; it's just exported.
529
 */
530
0
/*
 * Validates [*source, sourceEnd) one sequence at a time.  Returns true with
 * *source == sourceEnd when everything is legal; otherwise returns false with
 * *source left at the start of the first truncated or illegal sequence.
 */
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
    const UTF8 *cursor = *source;
    Boolean legal = true;

    while (cursor != sourceEnd) {
        const int seqLength = trailingBytesForUTF8[*cursor] + 1;
        if (seqLength > sourceEnd - cursor) {
            legal = false; /* sequence runs past the end of the input */
            break;
        }
        if (!isLegalUTF8(cursor, seqLength)) {
            legal = false;
            break;
        }
        cursor += seqLength;
    }

    *source = cursor;
    return legal;
}
539
540
/* --------------------------------------------------------------------- */
541
542
/*
 * Converts UTF-8 bytes from [*sourceStart, sourceEnd) into UTF-16 code units
 * written to [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past everything that
 * was consumed and produced.  Whenever conversion stops early, the source
 * pointer is left at the first byte of the sequence that could not be
 * handled.  Returns conversionOK, sourceExhausted (truncated trailing
 * sequence), sourceIllegal (malformed sequence, or - under strictConversion -
 * a surrogate or too-large value) or targetExhausted.
 */
ConversionResult ConvertUTF8toUTF16 (
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult status = conversionOK;
    const UTF8* in = *sourceStart;
    UTF16* out = *targetStart;

    while (in < sourceEnd) {
        const UTF8* seqStart = in; /* where to rewind to on failure */
        unsigned short trailing = trailingBytesForUTF8[*in];
        unsigned short pending;
        UTF32 codePoint = 0;

        if (trailing >= sourceEnd - in) {
            status = sourceExhausted;
            break;
        }
        /* Do this check whether lenient or strict */
        if (!isLegalUTF8(in, trailing+1)) {
            status = sourceIllegal;
            break;
        }

        /* Accumulate the raw bytes, six bits apart, then strip the markers. */
        for (pending = trailing; pending > 0; --pending) {
            codePoint += *in++;
            codePoint <<= 6;
        }
        codePoint += *in++;
        codePoint -= offsetsFromUTF8[trailing];

        if (out >= targetEnd) {
            in = seqStart; /* Back up source pointer! */
            status = targetExhausted;
            break;
        }

        if (codePoint <= UNI_MAX_BMP) {
            /* Fits in a single UTF-16 code unit. */
            const Boolean isSurrogate =
                codePoint >= UNI_SUR_HIGH_START && codePoint <= UNI_SUR_LOW_END;
            if (!isSurrogate) {
                *out++ = (UTF16)codePoint; /* normal case */
            } else if (flags == strictConversion) {
                in = seqStart; /* return to the illegal value itself */
                status = sourceIllegal;
                break;
            } else {
                *out++ = UNI_REPLACEMENT_CHAR;
            }
        } else if (codePoint > UNI_MAX_UTF16) {
            /* Not representable in UTF-16 at all. */
            if (flags == strictConversion) {
                status = sourceIllegal;
                in = seqStart; /* return to the start */
                break; /* Bail out; shouldn't continue */
            }
            *out++ = UNI_REPLACEMENT_CHAR;
        } else {
            /* Needs a surrogate pair, i.e. two code units of room. */
            if (out + 1 >= targetEnd) {
                in = seqStart; /* Back up source pointer! */
                status = targetExhausted;
                break;
            }
            codePoint -= halfBase;
            *out++ = (UTF16)((codePoint >> halfShift) + UNI_SUR_HIGH_START);
            *out++ = (UTF16)((codePoint & halfMask) + UNI_SUR_LOW_START);
        }
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
612
613
/* --------------------------------------------------------------------- */
614
615
/*
 * Shared implementation of ConvertUTF8toUTF32 and ConvertUTF8toUTF32Partial.
 *
 * Converts UTF-8 bytes from [*sourceStart, sourceEnd) into UTF-32 code points
 * written to [*targetStart, targetEnd) and advances both start pointers past
 * what was consumed and produced.
 *
 * A sequence cut off by sourceEnd stops the conversion with sourceExhausted
 * when flags == strictConversion or InputIsPartial is true; otherwise its
 * maximal subpart is replaced by UNI_REPLACEMENT_CHAR and the final result is
 * sourceIllegal.  A malformed sequence stops the conversion with
 * sourceIllegal under strictConversion and is replaced the same way
 * otherwise.  targetExhausted is returned as soon as an output slot is needed
 * but none is left.
 */
static ConversionResult ConvertUTF8toUTF32Impl(
        const UTF8** sourceStart, const UTF8* sourceEnd,
        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
        Boolean InputIsPartial) {
    ConversionResult result = conversionOK;
    const UTF8* source = *sourceStart;
    UTF32* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch = 0;
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
        if (extraBytesToRead >= sourceEnd - source) {
            if (flags == strictConversion || InputIsPartial) {
                result = sourceExhausted;
                break;
            } else {
                /*
                 * A replacement character is about to be stored, so make sure
                 * there is room for it first; without this check the store
                 * below could land one element past targetEnd.
                 */
                if (target >= targetEnd) {
                    result = targetExhausted; break;
                }
                result = sourceIllegal;

                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        if (target >= targetEnd) {
            result = targetExhausted; break;
        }

        /* Do this check whether lenient or strict */
        if (!isLegalUTF8(source, extraBytesToRead+1)) {
            result = sourceIllegal;
            if (flags == strictConversion) {
                /* Abort conversion. */
                break;
            } else {
                /*
                 * Replace the maximal subpart of ill-formed sequence with
                 * replacement character.
                 */
                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
                                                                    sourceEnd);
                *target++ = UNI_REPLACEMENT_CHAR;
                continue;
            }
        }
        /*
         * The cases all fall through: each one adds a byte and shifts by six
         * bits, so the raw bytes (markers included) end up six bits apart;
         * the markers are removed by the offsetsFromUTF8 subtraction below.
         */
        switch (extraBytesToRead) {
            case 5: ch += *source++; ch <<= 6;
            case 4: ch += *source++; ch <<= 6;
            case 3: ch += *source++; ch <<= 6;
            case 2: ch += *source++; ch <<= 6;
            case 1: ch += *source++; ch <<= 6;
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= UNI_MAX_LEGAL_UTF32) {
            /*
             * UTF-16 surrogate values are illegal in UTF-32, and anything
             * over Plane 17 (> 0x10FFFF) is illegal.
             */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = ch;
            }
        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
            result = sourceIllegal;
            *target++ = UNI_REPLACEMENT_CHAR;
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
701
702
/*
 * UTF-8 to UTF-32 conversion for input that may be cut off mid-sequence:
 * forwards to ConvertUTF8toUTF32Impl with InputIsPartial set to true.
 */
ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
                                           const UTF8 *sourceEnd,
                                           UTF32 **targetStart,
                                           UTF32 *targetEnd,
                                           ConversionFlags flags) {
  const Boolean inputIsPartial = true;
  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd,
                                targetStart, targetEnd,
                                flags, inputIsPartial);
}
710
711
/*
 * UTF-8 to UTF-32 conversion for complete input: forwards to
 * ConvertUTF8toUTF32Impl with InputIsPartial set to false.
 */
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
                                    const UTF8 *sourceEnd, UTF32 **targetStart,
                                    UTF32 *targetEnd, ConversionFlags flags) {
  const Boolean inputIsPartial = false;
  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd,
                                targetStart, targetEnd,
                                flags, inputIsPartial);
}
717
718
/* ---------------------------------------------------------------------
719
720
    Note A.
721
    The fall-through switches in UTF-8 reading code save a
722
    temp variable, some decrements & conditionals.  The switches
723
    are equivalent to the following loop:
724
        {
725
            int tmpBytesToRead = extraBytesToRead+1;
726
            do {
727
                ch += *source++;
728
                --tmpBytesToRead;
729
                if (tmpBytesToRead) ch <<= 6;
730
            } while (tmpBytesToRead > 0);
731
        }
732
    In UTF-8 writing code, the switches on "bytesToWrite" are
733
    similarly unrolled loops.
734
735
   --------------------------------------------------------------------- */
736
737
} // namespace llvh
738
739
ConvertUTF_RESTORE_WARNINGS