Coverage Report

Created: 2025-10-10 06:29

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/sleuthkit/tsk/base/tsk_unicode.c
Line
Count
Source
1
/*
2
 * Copyright 2001-2004 Unicode, Inc.
3
 *
4
 * Disclaimer
5
 *
6
 * This source code is provided as is by Unicode, Inc. No claims are
7
 * made as to fitness for any particular purpose. No warranties of any
8
 * kind are expressed or implied. The recipient agrees to determine
9
 * applicability of information provided. If this file has been
10
 * purchased on magnetic or optical media from Unicode, Inc., the
11
 * sole remedy for any claim will be exchange of defective media
12
 * within 90 days of receipt.
13
 *
14
 * Limitations on Rights to Redistribute This Code
15
 *
16
 * Unicode, Inc. hereby grants the right to freely use the information
17
 * supplied in this file in the creation of products supporting the
18
 * Unicode Standard, and to make copies of this file in any form
19
 * for internal or external distribution as long as this notice
20
 * remains attached.
21
 */
22
23
/* ---------------------------------------------------------------------
24
25
    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26
    Author: Mark E. Davis, 1994.
27
    Rev History: Rick McGowan, fixes & updates May 2001.
28
    Sept 2001: fixed const & error conditions per
29
  mods suggested by S. Parent & A. Lillich.
30
    June 2002: Tim Dodd added detection and handling of incomplete
31
  source sequences, enhanced error detection, added casts
32
  to eliminate compiler warnings.
33
    July 2003: slight mods to back out aggressive FFFE detection.
34
    Jan 2004: updated switches in from-UTF8 conversions.
35
    Oct 2004: updated to use TSK_UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37
    See the header file "ConvertUTF.h" for complete documentation.
38
39
------------------------------------------------------------------------ */
40
41
/** \file tsk_unicode.c
42
 * A local copy of the Unicode conversion routines from unicode.org.
43
 */
44
45
#include "tsk_base_i.h"
46
#include <wchar.h>
47
48
/* Some fundamental constants */

/* Holds one Unicode code point; the C standard only promises >= 32 bits. */
typedef unsigned long UTF32;

/*
 * Every macro body is fully parenthesized so that the cast cannot bind to
 * neighbouring tokens at the point of use (e.g. `sizeof MACRO`).
 */
#define TSK_UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define TSK_UNI_MAX_BMP ((UTF32)0x0000FFFF)
#define TSK_UNI_MAX_UTF16 ((UTF32)0x0010FFFF)
#define TSK_UNI_MAX_UTF32 ((UTF32)0x7FFFFFFF)
#define TSK_UNI_MAX_LEGAL_UTF32 ((UTF32)0x0010FFFF)

/* Surrogate arithmetic: 10 payload bits per half, offset by 0x10000. */
static const int halfShift = 10;        /* used for shifting by 10 bits */

static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;

/* Inclusive bounds of the UTF-16 high and low surrogate ranges. */
#define UNI_SUR_HIGH_START  ((UTF32)0xD800)
#define UNI_SUR_HIGH_END    ((UTF32)0xDBFF)
#define UNI_SUR_LOW_START   ((UTF32)0xDC00)
#define UNI_SUR_LOW_END     ((UTF32)0xDFFF)

/* Only supply the boolean spellings when nothing else has defined them. */
#ifndef false
#define false    0
#endif
#ifndef true
#define true      1
#endif
68
69
/* --------------------------------------------------------------------- */
70
71
72
/* --------------------------------------------------------------------- */
73
74
/*
 * Number of trailing bytes that are supposed to follow a UTF-8 sequence,
 * looked up by the value of its first byte.
 * Lead bytes that announce 4 or 5 trailing bytes are not *legal* UTF-8;
 * the entries are kept for anyone who wants to handle the longer forms
 * that earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00-0x0F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10-0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20-0x2F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80-0x8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90-0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0-0xAF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xCF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xD0-0xDF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xE0-0xEF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xF0-0xFF */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
99
100
/*
101
 * Magic values subtracted from a buffer value during UTF8 conversion.
102
 * This table contains as many values as there might be trailing bytes
103
 * in a UTF-8 sequence.
104
 */
105
static const UTF32 offsetsFromUTF8[6] =
106
    { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
107
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
108
};
109
110
111
/*
112
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
113
 * into the first byte, depending on how many bytes follow.  There are
114
 * as many entries in this table as there are UTF-8 sequence types.
115
 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
116
 * for *legal* UTF-8 will be 4 or fewer bytes total.
117
 */
118
static const UTF8 firstByteMark[7] =
119
    { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
120
121
122
/* --------------------------------------------------------------------- */
123
124
/* The interface converts a whole buffer to avoid function-call overhead.
125
 * Constants have been gathered. Loops & conditionals have been removed as
126
 * much as possible for efficiency, in favor of drop-through switches.
127
 * (See "Note A" at the bottom of the file for equivalent code.)
128
 * If your compiler supports it, the "isLegalUTF8" call can be turned
129
 * into an inline function.
130
 */
131
132
/* --------------------------------------------------------------------- */
133
134
135
/**
136
 * \ingroup baselib
137
 * Convert a UTF-16 string to UTF-8.
138
 * @param endian Endian ordering flag of UTF-16 text
139
 * @param sourceStart Pointer to pointer to start of UTF-16 string.  Will be updated to last char processed.
140
 * @param sourceEnd Pointer to one entry past end of UTF-16 string
141
 * @param targetStart Pointer to pointer to place where UTF-8 string should be written.  Will be updated to next place to write to.
142
 * @param targetEnd Pointer to end of UTF-8 buffer
143
 * @param flags Flags used during conversion
144
 * @returns error code
145
 */
146
TSKConversionResult
147
tsk_UTF16toUTF8(TSK_ENDIAN_ENUM endian, const UTF16 ** sourceStart,
148
    const UTF16 * sourceEnd, UTF8 ** targetStart,
149
    UTF8 * targetEnd, TSKConversionFlags flags)
150
3.66M
{
151
3.66M
    TSKConversionResult result = TSKconversionOK;
152
3.66M
    const UTF16 *source = *sourceStart;
153
3.66M
    UTF8 *target = *targetStart;
154
155
130M
    while (source < sourceEnd) {
156
126M
        UTF32 ch;
157
126M
        unsigned short bytesToWrite = 0;
158
126M
        const UTF32 byteMask = 0xBF;
159
126M
        const UTF32 byteMark = 0x80;
160
126M
        const UTF16 *oldSource = source;        /* In case we have to back up because of target overflow. */
161
162
        // Need at least 2 bytes
163
126M
        ch = tsk_getu16(endian, (uint8_t *) source);
164
126M
        source++;
165
166
        /* If we have a surrogate pair, convert to UTF32 first. */
167
126M
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
168
            /* If the 16 bits following the high surrogate are in the source buffer... */
169
2.10M
            if (source < sourceEnd) {
170
                // Need at least 2 bytes
171
2.03M
                UTF32 ch2 = tsk_getu16(endian, (uint8_t *) source);
172
2.03M
                ++source;
173
174
                /* If it's a low surrogate, convert to UTF32. */
175
2.03M
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
176
365k
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
177
365k
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
178
365k
                }
179
1.67M
                else if (flags == TSKstrictConversion) {        /* it's an unpaired high surrogate */
180
88
                    result = TSKsourceIllegal;
181
88
                    break;
182
88
                }
183
                // replace with another character
184
1.67M
                else {
185
1.67M
                    ch = '^';
186
1.67M
                }
187
2.03M
            }
188
72.3k
            else {              /* We don't have the 16 bits following the high surrogate. */
189
72.3k
                --source;       /* return to the high surrogate */
190
72.3k
                result = TSKsourceExhausted;
191
72.3k
                break;
192
72.3k
            }
193
2.10M
        }
194
        /* UTF-16 surrogate values are illegal in UTF-32 */
195
124M
        else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
196
1.27M
            if (flags == TSKstrictConversion) {
197
57
                --source;       /* return to the illegal value itself */
198
57
                result = TSKsourceIllegal;
199
57
                break;
200
57
            }
201
            // replace with another character
202
1.27M
            else {
203
1.27M
                ch = '^';
204
1.27M
            }
205
1.27M
        }
206
207
        /* Figure out how many bytes the result will require */
208
126M
        if (ch < (UTF32) 0x80) {
209
64.2M
            bytesToWrite = 1;
210
64.2M
        }
211
62.2M
        else if (ch < (UTF32) 0x800) {
212
9.00M
            bytesToWrite = 2;
213
9.00M
        }
214
53.2M
        else if (ch < (UTF32) 0x10000) {
215
52.8M
            bytesToWrite = 3;
216
52.8M
        }
217
365k
        else if (ch < (UTF32) 0x110000) {
218
365k
            bytesToWrite = 4;
219
365k
        }
220
0
        else {
221
0
            bytesToWrite = 3;
222
0
            ch = TSK_UNI_REPLACEMENT_CHAR;
223
0
        }
224
225
126M
        target += bytesToWrite;
226
126M
        if (target > targetEnd) {
227
26.0k
            source = oldSource; /* Back up source pointer! */
228
26.0k
            target -= bytesToWrite;
229
26.0k
            result = TSKtargetExhausted;
230
26.0k
            break;
231
26.0k
        }
232
126M
        switch (bytesToWrite) { /* note: everything falls through. */
233
365k
        case 4:
234
365k
            *--target = (UTF8) ((ch | byteMark) & byteMask);
235
365k
            ch >>= 6;
236
365k
            FALLTHROUGH;
237
53.2M
        case 3:
238
53.2M
            *--target = (UTF8) ((ch | byteMark) & byteMask);
239
53.2M
            ch >>= 6;
240
53.2M
            FALLTHROUGH;
241
62.2M
        case 2:
242
62.2M
            *--target = (UTF8) ((ch | byteMark) & byteMask);
243
62.2M
            ch >>= 6;
244
62.2M
            FALLTHROUGH;
245
126M
        case 1:
246
126M
            *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
247
126M
        }
248
126M
        target += bytesToWrite;
249
126M
    }
250
3.66M
    *sourceStart = source;
251
3.66M
    *targetStart = target;
252
3.66M
    return result;
253
3.66M
}
254
255
256
/**
257
* \ingroup baselib
258
* Convert a UTF-16 string in local endian ordering to UTF-8.
259
* @param sourceStart Pointer to pointer to start of UTF-16 string.  Will be updated to last char processed.
260
* @param sourceEnd Pointer to one entry past end of UTF-16 string
261
* @param targetStart Pointer to pointer to place where UTF-8 string should be written.  Will be updated to next place to write to.
262
* @param targetEnd Pointer to end of UTF-8 buffer
263
* @param flags Flags used during conversion
264
* @returns error code
265
*/
266
TSKConversionResult
267
tsk_UTF16toUTF8_lclorder(const UTF16 ** sourceStart,
268
    const UTF16 * sourceEnd, UTF8 ** targetStart,
269
    UTF8 * targetEnd, TSKConversionFlags flags)
270
0
{
271
0
    TSKConversionResult result = TSKconversionOK;
272
0
    const UTF16 *source = *sourceStart;
273
0
    UTF8 *target = *targetStart;
274
0
    while (source < sourceEnd) {
275
0
        UTF32 ch;
276
0
        unsigned short bytesToWrite = 0;
277
0
        const UTF32 byteMask = 0xBF;
278
0
        const UTF32 byteMark = 0x80;
279
0
        const UTF16 *oldSource = source;        /* In case we have to back up because of target overflow. */
280
0
        ch = *source++;
281
282
        /* If we have a surrogate pair, convert to UTF32 first. */
283
0
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
284
            /* If the 16 bits following the high surrogate are in the source buffer... */
285
0
            if (source < sourceEnd) {
286
0
                UTF32 ch2 = *source;
287
0
                source++;
288
                /* If it's a low surrogate, convert to UTF32. */
289
0
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
290
0
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
291
0
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
292
0
                }
293
0
                else if (flags == TSKstrictConversion) {        /* it's an unpaired high surrogate */
294
0
                    result = TSKsourceIllegal;
295
0
                    break;
296
0
                }
297
                // replace with another character
298
0
                else {
299
0
                    ch = '^';
300
0
                }
301
0
            }
302
0
            else {              /* We don't have the 16 bits following the high surrogate. */
303
0
                --source;       /* return to the high surrogate */
304
0
                result = TSKsourceExhausted;
305
0
                break;
306
0
            }
307
0
        }
308
        /* UTF-16 surrogate values are illegal in UTF-32 */
309
0
        else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
310
0
            if (flags == TSKstrictConversion) {
311
0
                --source;       /* return to the illegal value itself */
312
0
                result = TSKsourceIllegal;
313
0
                break;
314
0
            }
315
            // replace with another character
316
0
            else {
317
0
                ch = '^';
318
0
            }
319
0
        }
320
321
        /* Figure out how many bytes the result will require */
322
0
        if (ch < (UTF32) 0x80) {
323
0
            bytesToWrite = 1;
324
0
        }
325
0
        else if (ch < (UTF32) 0x800) {
326
0
            bytesToWrite = 2;
327
0
        }
328
0
        else if (ch < (UTF32) 0x10000) {
329
0
            bytesToWrite = 3;
330
0
        }
331
0
        else if (ch < (UTF32) 0x110000) {
332
0
            bytesToWrite = 4;
333
0
        }
334
0
        else {
335
0
            bytesToWrite = 3;
336
0
            ch = TSK_UNI_REPLACEMENT_CHAR;
337
0
        }
338
339
0
        target += bytesToWrite;
340
0
        if (target > targetEnd) {
341
0
            source = oldSource; /* Back up source pointer! */
342
0
            target -= bytesToWrite;
343
0
            result = TSKtargetExhausted;
344
0
            break;
345
0
        }
346
0
        switch (bytesToWrite) { /* note: everything falls through. */
347
0
        case 4:
348
0
            *--target = (UTF8) ((ch | byteMark) & byteMask);
349
0
            ch >>= 6;
350
0
            FALLTHROUGH;
351
0
        case 3:
352
0
            *--target = (UTF8) ((ch | byteMark) & byteMask);
353
0
            ch >>= 6;
354
0
            FALLTHROUGH;
355
0
        case 2:
356
0
            *--target = (UTF8) ((ch | byteMark) & byteMask);
357
0
            ch >>= 6;
358
0
            FALLTHROUGH;
359
0
        case 1:
360
0
            *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
361
0
        }
362
0
        target += bytesToWrite;
363
0
    }
364
0
    *sourceStart = source;
365
0
    *targetStart = target;
366
0
    return result;
367
0
}
368
369
TSKConversionResult
370
tsk_UTF16WtoUTF8_lclorder(const wchar_t ** sourceStart,
371
    const wchar_t * sourceEnd, UTF8 ** targetStart,
372
    UTF8 * targetEnd, TSKConversionFlags flags)
373
0
{
374
0
    TSKConversionResult result = TSKconversionOK;
375
0
    const wchar_t *source = *sourceStart;
376
0
    UTF8 *target = *targetStart;
377
0
    while (source < sourceEnd) {
378
0
        UTF32 ch;
379
0
        unsigned short bytesToWrite = 0;
380
0
        const UTF32 byteMask = 0xBF;
381
0
        const UTF32 byteMark = 0x80;
382
0
        const wchar_t *oldSource = source;        /* In case we have to back up because of target overflow. */
383
0
        ch = *source++;
384
385
        /* If we have a surrogate pair, convert to UTF32 first. */
386
0
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
387
            /* If the 16 bits following the high surrogate are in the source buffer... */
388
0
            if (source < sourceEnd) {
389
0
                UTF32 ch2 = *source;
390
0
                source++;
391
                /* If it's a low surrogate, convert to UTF32. */
392
0
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
393
0
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
394
0
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
395
0
                }
396
0
                else if (flags == TSKstrictConversion) {        /* it's an unpaired high surrogate */
397
0
                    result = TSKsourceIllegal;
398
0
                    break;
399
0
                }
400
                // replace with another character
401
0
                else {
402
0
                    ch = '^';
403
0
                }
404
0
            }
405
0
            else {              /* We don't have the 16 bits following the high surrogate. */
406
0
                --source;       /* return to the high surrogate */
407
0
                result = TSKsourceExhausted;
408
0
                break;
409
0
            }
410
0
        }
411
        /* UTF-16 surrogate values are illegal in UTF-32 */
412
0
        else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
413
0
            if (flags == TSKstrictConversion) {
414
0
                --source;       /* return to the illegal value itself */
415
0
                result = TSKsourceIllegal;
416
0
                break;
417
0
            }
418
            // replace with another character
419
0
            else {
420
0
                ch = '^';
421
0
            }
422
0
        }
423
424
        /* Figure out how many bytes the result will require */
425
0
        if (ch < (UTF32) 0x80) {
426
0
            bytesToWrite = 1;
427
0
        }
428
0
        else if (ch < (UTF32) 0x800) {
429
0
            bytesToWrite = 2;
430
0
        }
431
0
        else if (ch < (UTF32) 0x10000) {
432
0
            bytesToWrite = 3;
433
0
        }
434
0
        else if (ch < (UTF32) 0x110000) {
435
0
            bytesToWrite = 4;
436
0
        }
437
0
        else {
438
0
            bytesToWrite = 3;
439
0
            ch = TSK_UNI_REPLACEMENT_CHAR;
440
0
        }
441
442
0
        target += bytesToWrite;
443
0
        if (target > targetEnd) {
444
0
            source = oldSource; /* Back up source pointer! */
445
0
            target -= bytesToWrite;
446
0
            result = TSKtargetExhausted;
447
0
            break;
448
0
        }
449
0
        switch (bytesToWrite) { /* note: everything falls through. */
450
0
        case 4:
451
0
            *--target = (UTF8) ((ch | byteMark) & byteMask);
452
0
            ch >>= 6;
453
0
            FALLTHROUGH;
454
0
        case 3:
455
0
            *--target = (UTF8) ((ch | byteMark) & byteMask);
456
0
            ch >>= 6;
457
0
            FALLTHROUGH;
458
0
        case 2:
459
0
            *--target = (UTF8) ((ch | byteMark) & byteMask);
460
0
            ch >>= 6;
461
0
            FALLTHROUGH;
462
0
        case 1:
463
0
            *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
464
0
        }
465
0
        target += bytesToWrite;
466
0
    }
467
0
    *sourceStart = source;
468
0
    *targetStart = target;
469
0
    return result;
470
0
}
471
472
/* --------------------------------------------------------------------- */
473
474
/*
475
 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
476
 * This must be called with the length pre-determined by the first byte.
477
 * If not calling this from ConvertUTF8to*, then the length can be set by:
478
 *  length = trailingBytesForUTF8[*source]+1;
479
 * and the sequence is illegal right away if there aren't that many bytes
480
 * available.
481
 * If presented with a length > 4, this returns false.  The Unicode
482
 * definition of UTF-8 goes up to 4-byte sequences.
483
 */
484
485
static Boolean
486
isLegalUTF8(const UTF8 * source, int length)
487
0
{
488
0
    UTF8 a;
489
0
    const UTF8 *srcptr = source + length;
490
0
    switch (length) {
491
0
    default:
492
0
        return false;
493
        /* Everything else falls through when "true"... */
494
0
    case 4:
495
0
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
496
0
            return false;
497
0
        FALLTHROUGH;
498
0
    case 3:
499
0
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
500
0
            return false;
501
0
        FALLTHROUGH;
502
0
    case 2:
503
0
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
504
0
            return false;
505
506
0
        switch (*source) {
507
            /* no fall-through in this inner switch */
508
0
        case 0xE0:
509
0
            if (a < 0xA0)
510
0
                return false;
511
0
            break;
512
0
        case 0xED:
513
0
            if (a > 0x9F)
514
0
                return false;
515
0
            break;
516
0
        case 0xF0:
517
0
            if (a < 0x90)
518
0
                return false;
519
0
            break;
520
0
        case 0xF4:
521
0
            if (a > 0x8F)
522
0
                return false;
523
0
            break;
524
0
        default:
525
0
            if (a < 0x80)
526
0
                return false;
527
0
        }
528
0
        FALLTHROUGH;
529
530
0
    case 1:
531
0
        if (*source >= 0x80 && *source < 0xC2)
532
0
            return false;
533
0
    }
534
0
    if (*source > 0xF4)
535
0
        return false;
536
0
    return true;
537
0
}
538
539
/* --------------------------------------------------------------------- */
540
541
/*
542
 * Exported function to return whether a UTF-8 sequence is legal or not.
543
 * This is not used here; it's just exported.
544
 */
545
Boolean
546
tsk_isLegalUTF8Sequence(const UTF8 * source, const UTF8 * sourceEnd)
547
0
{
548
0
    int length = trailingBytesForUTF8[*source] + 1;
549
0
    if (source + length > sourceEnd) {
550
0
        return false;
551
0
    }
552
0
    return isLegalUTF8(source, length);
553
0
}
554
555
/**
556
 * Cleans up the passed in string to replace invalid
557
 * UTF-8 values with the passed in character.
558
 * @param source String to be cleaned up
559
 * @param replacement Character to insert into source as needed.
560
 */
561
void
562
tsk_cleanupUTF8(char *source, const char replacement)
563
0
{
564
0
    size_t total_len = strlen(source);
565
0
    size_t cur_idx = 0;
566
567
0
    while (cur_idx < total_len) {
568
0
        int length = trailingBytesForUTF8[(UTF8) source[cur_idx]] + 1;
569
0
        if (cur_idx + length > total_len) {
570
0
            while (cur_idx < total_len) {
571
0
                source[cur_idx] = replacement;
572
0
                cur_idx++;
573
0
            }
574
0
            break;
575
0
        }
576
0
        if (isLegalUTF8((UTF8 *) & source[cur_idx], length) == false) {
577
0
            int i;
578
0
            for (i = 0; i < length; i++) {
579
0
                source[cur_idx + i] = replacement;
580
0
            }
581
0
        }
582
0
        cur_idx += length;
583
0
    }
584
0
}
585
586
587
/**
588
 * Cleans up the passed in string to replace invalid
589
 * UTF-16 values with the passed in character.
590
 * @param endian Ordering that data is stored in
591
 * @param source String to be cleaned up
592
 * @param source_len Number of wchar_t characters in source
593
 * @param replacement Character to insert into source as needed.
594
 */
595
void
596
0
tsk_cleanupUTF16(TSK_ENDIAN_ENUM endian, wchar_t *source, size_t source_len, const wchar_t replacement) {
597
598
0
    size_t cur_idx = 0;
599
0
    while (cur_idx < source_len) {
600
0
        UTF32 ch = tsk_getu16(endian, (uint8_t *) &source[cur_idx]);
601
602
        /* If we have a surrogate pair, check out the high part. */
603
0
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
604
            /* If the 16 bits following the high surrogate are in the source buffer... */
605
0
            if (cur_idx + 1 < source_len) {
606
0
                UTF32 ch2 = tsk_getu16(endian, (uint8_t *) &source[cur_idx+1]);
607
608
                /* If it's a low surrogate, we're good. */
609
0
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
610
                    // all good, use both
611
0
                    cur_idx++;
612
0
                }
613
0
                else {
614
0
                    source[cur_idx] = replacement;
615
0
                }
616
0
            }
617
0
            else {   /* We don't have the 16 bits following the high surrogate. */
618
0
                source[cur_idx] = replacement;
619
0
            }
620
0
        }
621
        /* UTF-16 surrogate values are illegal in UTF-32 */
622
0
        else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
623
0
            source[cur_idx] = replacement;
624
0
        }
625
0
        cur_idx++;
626
0
    }
627
0
}
628
629
630
/* --------------------------------------------------------------------- */
631
632
633
634
/**
635
* \ingroup baselib
636
* Convert a UTF-8 string to UTF-16 (in local endian ordering).
637
* @param sourceStart Pointer to pointer to start of UTF-8 string.  Will be updated to last char processed.
638
* @param sourceEnd Pointer to one entry past end of UTF-8 string
639
* @param targetStart Pointer to pointer to place where UTF-16 string should be written.  Will be updated to next place to write to.
640
* @param targetEnd Pointer to end of UTF-16 buffer
641
* @param flags Flags used during conversion
642
* @returns error code
643
*/
644
TSKConversionResult
645
tsk_UTF8toUTF16(const UTF8 ** sourceStart,
646
    const UTF8 * sourceEnd, UTF16 ** targetStart,
647
    UTF16 * targetEnd, TSKConversionFlags flags)
648
0
{
649
0
    TSKConversionResult result = TSKconversionOK;
650
0
    const UTF8 *source = *sourceStart;
651
0
    UTF16 *target = *targetStart;
652
0
    while (source < sourceEnd) {
653
0
        UTF32 ch = 0;
654
0
        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
655
0
        if (source + extraBytesToRead >= sourceEnd) {
656
0
            result = TSKsourceExhausted;
657
0
            break;
658
0
        }
659
        /* Do this check whether lenient or strict */
660
0
        if (!isLegalUTF8(source, extraBytesToRead + 1)) {
661
0
            result = TSKsourceIllegal;
662
0
            break;
663
0
        }
664
        /*
665
         * The cases all fall through. See "Note A" below.
666
         */
667
0
        switch (extraBytesToRead) {
668
0
        case 5:
669
0
            ch += *source++;
670
0
            ch <<= 6;           /* remember, illegal UTF-8 */
671
0
            FALLTHROUGH;
672
0
        case 4:
673
0
            ch += *source++;
674
0
            ch <<= 6;           /* remember, illegal UTF-8 */
675
0
            FALLTHROUGH;
676
0
        case 3:
677
0
            ch += *source++;
678
0
            ch <<= 6;
679
0
            FALLTHROUGH;
680
0
        case 2:
681
0
            ch += *source++;
682
0
            ch <<= 6;
683
0
            FALLTHROUGH;
684
0
        case 1:
685
0
            ch += *source++;
686
0
            ch <<= 6;
687
0
            FALLTHROUGH;
688
0
        case 0:
689
0
            ch += *source++;
690
0
        }
691
0
        ch -= offsetsFromUTF8[extraBytesToRead];
692
693
0
        if (target >= targetEnd) {
694
0
            source -= (extraBytesToRead + 1);   /* Back up source pointer! */
695
0
            result = TSKtargetExhausted;
696
0
            break;
697
0
        }
698
0
        if (ch <= TSK_UNI_MAX_BMP) {    /* Target is a character <= 0xFFFF */
699
            /* UTF-16 surrogate values are illegal in UTF-32 */
700
0
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
701
0
                if (flags == TSKstrictConversion) {
702
0
                    source -= (extraBytesToRead + 1);   /* return to the illegal value itself */
703
0
                    result = TSKsourceIllegal;
704
0
                    break;
705
0
                }
706
0
                else {
707
0
                    *target++ = TSK_UNI_REPLACEMENT_CHAR;
708
0
                }
709
0
            }
710
0
            else {
711
0
                *target++ = (UTF16) ch; /* normal case */
712
0
            }
713
0
        }
714
0
        else if (ch > TSK_UNI_MAX_UTF16) {
715
0
            if (flags == TSKstrictConversion) {
716
0
                result = TSKsourceIllegal;
717
0
                source -= (extraBytesToRead + 1);       /* return to the start */
718
0
                break;          /* Bail out; shouldn't continue */
719
0
            }
720
0
            else {
721
0
                *target++ = TSK_UNI_REPLACEMENT_CHAR;
722
0
            }
723
0
        }
724
0
        else {
725
            /* target is a character in range 0xFFFF - 0x10FFFF. */
726
0
            if (target + 1 >= targetEnd) {
727
0
                source -= (extraBytesToRead + 1);       /* Back up source pointer! */
728
0
                result = TSKtargetExhausted;
729
0
                break;
730
0
            }
731
0
            ch -= halfBase;
732
0
            *target++ = (UTF16) ((ch >> halfShift) + UNI_SUR_HIGH_START);
733
0
            *target++ = (UTF16) ((ch & halfMask) + UNI_SUR_LOW_START);
734
0
        }
735
0
    }
736
0
    *sourceStart = source;
737
0
    *targetStart = target;
738
0
    return result;
739
0
}
740