Coverage Report

Created: 2025-12-07 06:36

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/utf_impl.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
******************************************************************************
5
*
6
*   Copyright (C) 1999-2012, International Business Machines
7
*   Corporation and others.  All Rights Reserved.
8
*
9
******************************************************************************
10
*   file name:  utf_impl.cpp
11
*   encoding:   UTF-8
12
*   tab size:   8 (not used)
13
*   indentation:4
14
*
15
*   created on: 1999sep13
16
*   created by: Markus W. Scherer
17
*
18
*   This file provides implementation functions for macros in the utfXX.h headers
19
*   that would otherwise be too long as macros.
20
*/
21
22
/* set import/export definitions */
23
#ifndef U_UTF8_IMPL
24
#   define U_UTF8_IMPL
25
#endif
26
27
#include "unicode/utypes.h"
28
#include "unicode/utf.h"
29
#include "unicode/utf8.h"
30
#include "uassert.h"
31
32
/*
33
 * Table of the number of utf8 trail bytes, indexed by the lead byte.
34
 * Used by the deprecated macro UTF8_COUNT_TRAIL_BYTES, defined in utf_old.h
35
 *
36
 * The current macro, U8_COUNT_TRAIL_BYTES, does _not_ use this table.
37
 *
38
 * Note that this table cannot be removed, even if UTF8_COUNT_TRAIL_BYTES were
39
 * changed to no longer use it. References to the table from expansions of UTF8_COUNT_TRAIL_BYTES
40
 * may exist in old client code that must continue to run with newer icu library versions.
41
 *
42
 * This table could be replaced on many machines by
43
 * a few lines of assembler code using an
44
 * "index of first 0-bit from msb" instruction and
45
 * one or two more integer instructions.
46
 *
47
 * For example, on an i386, do something like
48
 * - MOV AL, leadByte
49
 * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
50
 * - MOV AH, 0
51
 * - BSR BX, AX     (16-bit)
52
 * - MOV AX, 6      (result)
53
 * - JZ finish      (ZF==1 if leadByte==0xff)
54
 * - SUB AX, BX (result)
55
 * -finish:
56
 * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
57
 */
58
U_CAPI const uint8_t
59
utf8_countTrailBytes[256]={
60
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
61
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
62
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
63
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
64
65
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
66
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
67
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
68
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
69
70
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
71
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
72
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
73
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
74
75
    // illegal C0 & C1
76
    // 2-byte lead bytes C2..DF
77
    0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
78
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
79
80
    // 3-byte lead bytes E0..EF
81
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
82
    // 4-byte lead bytes F0..F4
83
    // illegal F5..FF
84
    3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
85
};
86
87
static const UChar32
88
utf8_errorValue[6]={
89
    // Same values as UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE,
90
    // but without relying on the obsolete unicode/utf_old.h.
91
    0x15, 0x9f, 0xffff,
92
    0x10ffff
93
};
94
95
static UChar32
96
9.61M
errorValue(int32_t count, int8_t strict) {
97
9.61M
    if(strict>=0) {
98
0
        return utf8_errorValue[count];
99
9.61M
    } else if(strict==-3) {
100
0
        return 0xfffd;
101
9.61M
    } else {
102
9.61M
        return U_SENTINEL;
103
9.61M
    }
104
9.61M
}
105
106
/*
107
 * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros
108
 * and their obsolete sibling UTF8_NEXT_CHAR_SAFE().
109
 *
110
 * U8_NEXT() supports NUL-terminated strings indicated via length<0.
111
 *
112
 * The "strict" parameter controls the error behavior:
113
 * <0  "Safe" behavior of U8_NEXT():
114
 *     -1: All illegal byte sequences yield U_SENTINEL=-1.
115
 *     -2: Same as -1, except for lenient treatment of surrogate code points as legal.
116
 *         Some implementations use this for roundtripping of
117
 *         Unicode 16-bit strings that are not well-formed UTF-16, that is, they
118
 *         contain unpaired surrogates.
119
 *     -3: All illegal byte sequences yield U+FFFD.
120
 *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., false):
121
 *     All illegal byte sequences yield a positive code point such that this
122
 *     result code point would be encoded with the same number of bytes as
123
 *     the illegal sequence.
124
 * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., true):
125
 *     Same as the obsolete "safe" behavior, but non-characters are also treated
126
 *     like illegal sequences.
127
 */
128
U_CAPI UChar32 U_EXPORT2
129
9.59M
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, int8_t strict) {
130
    // *pi is one after byte c.
131
9.59M
    int32_t i=*pi;
132
    // length can be negative for NUL-terminated strings: Read and validate one byte at a time.
133
9.59M
    if(i==length || c>0xf4) {
134
        // end of string, or not a lead byte
135
5.85M
    } else if(c>=0xf0) {
136
        // Test for 4-byte sequences first because
137
        // U8_NEXT() handles shorter valid sequences inline.
138
186k
        uint8_t t1=s[i], t2, t3;
139
186k
        c&=7;
140
186k
        if(U8_IS_VALID_LEAD4_AND_T1(c, t1) &&
141
66.9k
                ++i!=length && (t2=s[i]-0x80)<=0x3f &&
142
42.8k
                ++i!=length && (t3=s[i]-0x80)<=0x3f) {
143
15.4k
            ++i;
144
15.4k
            c=(c<<18)|((t1&0x3f)<<12)|(t2<<6)|t3;
145
            // strict: forbid non-characters like U+fffe
146
15.4k
            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
147
15.4k
                *pi=i;
148
15.4k
                return c;
149
15.4k
            }
150
15.4k
        }
151
5.67M
    } else if(c>=0xe0) {
152
494k
        c&=0xf;
153
494k
        if(strict!=-2) {
154
494k
            uint8_t t1=s[i], t2;
155
494k
            if(U8_IS_VALID_LEAD3_AND_T1(c, t1) &&
156
65.6k
                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
157
0
                ++i;
158
0
                c=(c<<12)|((t1&0x3f)<<6)|t2;
159
                // strict: forbid non-characters like U+fffe
160
0
                if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
161
0
                    *pi=i;
162
0
                    return c;
163
0
                }
164
0
            }
165
494k
        } else {
166
            // strict=-2 -> lenient: allow surrogates
167
0
            uint8_t t1=s[i]-0x80, t2;
168
0
            if(t1<=0x3f && (c>0 || t1>=0x20) &&
169
0
                    ++i!=length && (t2=s[i]-0x80)<=0x3f) {
170
0
                *pi=i+1;
171
0
                return (c<<12)|(t1<<6)|t2;
172
0
            }
173
0
        }
174
5.17M
    } else if(c>=0xc2) {
175
886k
        uint8_t t1=s[i]-0x80;
176
886k
        if(t1<=0x3f) {
177
0
            *pi=i+1;
178
0
            return ((c-0xc0)<<6)|t1;
179
0
        }
180
886k
    }  // else 0x80<=c<0xc2 is not a lead byte
181
182
    /* error handling */
183
9.58M
    c=errorValue(i-*pi, strict);
184
9.58M
    *pi=i;
185
9.58M
    return c;
186
9.59M
}
187
188
U_CAPI int32_t U_EXPORT2
189
0
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
190
0
    if((uint32_t)(c)<=0x7ff) {
191
0
        if((i)+1<(length)) {
192
0
            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
193
0
            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
194
0
            return i;
195
0
        }
196
0
    } else if((uint32_t)(c)<=0xffff) {
197
        /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
198
0
        if((i)+2<(length) && !U_IS_SURROGATE(c)) {
199
0
            (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
200
0
            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
201
0
            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
202
0
            return i;
203
0
        }
204
0
    } else if((uint32_t)(c)<=0x10ffff) {
205
0
        if((i)+3<(length)) {
206
0
            (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0);
207
0
            (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80);
208
0
            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
209
0
            (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
210
0
            return i;
211
0
        }
212
0
    }
213
    /* c>0x10ffff or not enough space, write an error value */
214
0
    if(pIsError!=nullptr) {
215
0
        *pIsError=true;
216
0
    } else {
217
0
        length-=i;
218
0
        if(length>0) {
219
0
            int32_t offset;
220
0
            if(length>3) {
221
0
                length=3;
222
0
            }
223
0
            s+=i;
224
0
            offset=0;
225
0
            c=utf8_errorValue[length-1];
226
0
            U8_APPEND_UNSAFE(s, offset, c);
227
0
            i=i+offset;
228
0
        }
229
0
    }
230
0
    return i;
231
0
}
232
233
U_CAPI UChar32 U_EXPORT2
234
88.2k
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, int8_t strict) {
235
    // *pi is the index of byte c.
236
88.2k
    int32_t i=*pi;
237
88.2k
    if(U8_IS_TRAIL(c) && i>start) {
238
84.5k
        uint8_t b1=s[--i];
239
84.5k
        if(U8_IS_LEAD(b1)) {
240
33.3k
            if(b1<0xe0) {
241
30.6k
                *pi=i;
242
30.6k
                return ((b1-0xc0)<<6)|(c&0x3f);
243
30.6k
            } else if(b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c)) {
244
                // Truncated 3- or 4-byte sequence.
245
2.08k
                *pi=i;
246
2.08k
                return errorValue(1, strict);
247
2.08k
            }
248
51.2k
        } else if(U8_IS_TRAIL(b1) && i>start) {
249
            // Extract the value bits from the last trail byte.
250
48.2k
            c&=0x3f;
251
48.2k
            uint8_t b2=s[--i];
252
48.2k
            if(0xe0<=b2 && b2<=0xf4) {
253
16.3k
                if(b2<0xf0) {
254
10.7k
                    b2&=0xf;
255
10.7k
                    if(strict!=-2) {
256
10.7k
                        if(U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
257
10.2k
                            *pi=i;
258
10.2k
                            c=(b2<<12)|((b1&0x3f)<<6)|c;
259
10.2k
                            if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
260
10.2k
                                return c;
261
10.2k
                            } else {
262
                                // strict: forbid non-characters like U+fffe
263
0
                                return errorValue(2, strict);
264
0
                            }
265
10.2k
                        }
266
10.7k
                    } else {
267
                        // strict=-2 -> lenient: allow surrogates
268
0
                        b1-=0x80;
269
0
                        if((b2>0 || b1>=0x20)) {
270
0
                            *pi=i;
271
0
                            return (b2<<12)|(b1<<6)|c;
272
0
                        }
273
0
                    }
274
10.7k
                } else if(U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
275
                    // Truncated 4-byte sequence.
276
5.42k
                    *pi=i;
277
5.42k
                    return errorValue(2, strict);
278
5.42k
                }
279
31.9k
            } else if(U8_IS_TRAIL(b2) && i>start) {
280
28.8k
                uint8_t b3=s[--i];
281
28.8k
                if(0xf0<=b3 && b3<=0xf4) {
282
11.3k
                    b3&=7;
283
11.3k
                    if(U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
284
9.86k
                        *pi=i;
285
9.86k
                        c=(b3<<18)|((b2&0x3f)<<12)|((b1&0x3f)<<6)|c;
286
9.86k
                        if(strict<=0 || !U_IS_UNICODE_NONCHAR(c)) {
287
9.86k
                            return c;
288
9.86k
                        } else {
289
                            // strict: forbid non-characters like U+fffe
290
0
                            return errorValue(3, strict);
291
0
                        }
292
9.86k
                    }
293
11.3k
                }
294
28.8k
            }
295
48.2k
        }
296
84.5k
    }
297
29.9k
    return errorValue(0, strict);
298
88.2k
}
299
300
U_CAPI int32_t U_EXPORT2
301
196k
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) {
302
    // Same as utf8_prevCharSafeBody(..., strict=-1) minus assembling code points.
303
196k
    int32_t orig_i=i;
304
196k
    uint8_t c=s[i];
305
196k
    if(U8_IS_TRAIL(c) && i>start) {
306
196k
        uint8_t b1=s[--i];
307
196k
        if(U8_IS_LEAD(b1)) {
308
2.19k
            if(b1<0xe0 ||
309
2.19k
                    (b1<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b1, c) : U8_IS_VALID_LEAD4_AND_T1(b1, c))) {
310
0
                return i;
311
0
            }
312
194k
        } else if(U8_IS_TRAIL(b1) && i>start) {
313
128k
            uint8_t b2=s[--i];
314
128k
            if(0xe0<=b2 && b2<=0xf4) {
315
1.29k
                if(b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(b2, b1) : U8_IS_VALID_LEAD4_AND_T1(b2, b1)) {
316
0
                    return i;
317
0
                }
318
127k
            } else if(U8_IS_TRAIL(b2) && i>start) {
319
100k
                uint8_t b3=s[--i];
320
100k
                if(0xf0<=b3 && b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(b3, b2)) {
321
0
                    return i;
322
0
                }
323
100k
            }
324
128k
        }
325
196k
    }
326
196k
    return orig_i;
327
196k
}