Coverage Report

Created: 2025-11-24 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/unit/src/nxt_utf8.c
Line
Count
Source
1
2
/*
3
 * Copyright (C) Igor Sysoev
4
 * Copyright (C) NGINX, Inc.
5
 */
6
7
#include <nxt_main.h>
8
9
/*
10
 * The nxt_unicode_lowcase.h file is auto-generated from
11
 * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
12
 *
13
 *   ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
14
 *
15
 * This file should be copied to a system-specific nxt_unicode_SYSTEM_lowcase.h
16
 * file and utf8_file_name_test should be built with this file.
17
 * Then a correct system-specific file should be generated:
18
 *
19
 *   ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl
20
 *
21
 * Only common and simple case foldings are supported.  Full case folding
22
 * is not supported.  Combined characters are also not supported.
23
 */
24
25
#if (NXT_MACOSX)
26
#include <nxt_unicode_macosx_lowcase.h>
27
28
#else
29
#include <nxt_unicode_lowcase.h>
30
#endif
31
32
33
u_char *
34
nxt_utf8_encode(u_char *p, uint32_t u)
35
0
{
36
0
    if (u < 0x80) {
37
0
        *p++ = (u_char) (u & 0xFF);
38
0
        return p;
39
0
    }
40
41
0
    if (u < 0x0800) {
42
0
        *p++ = (u_char) (( u >> 6)          | 0xC0);
43
0
        *p++ = (u_char) (( u        & 0x3F) | 0x80);
44
0
        return p;
45
0
    }
46
47
0
    if (u < 0x10000) {
48
0
        *p++ = (u_char) ( (u >> 12)         | 0xE0);
49
0
        *p++ = (u_char) (((u >>  6) & 0x3F) | 0x80);
50
0
        *p++ = (u_char) (( u        & 0x3F) | 0x80);
51
0
        return p;
52
0
    }
53
54
0
    if (u < 0x110000) {
55
0
        *p++ = (u_char) ( (u >> 18)         | 0xF0);
56
0
        *p++ = (u_char) (((u >> 12) & 0x3F) | 0x80);
57
0
        *p++ = (u_char) (((u >>  6) & 0x3F) | 0x80);
58
0
        *p++ = (u_char) (( u        & 0x3F) | 0x80);
59
0
        return p;
60
0
    }
61
62
0
    return NULL;
63
0
}
64
65
66
/*
67
 * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid
68
 * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for an invalid or overlong
69
 * UTF-8 sequence.
70
 */
71
72
uint32_t
73
nxt_utf8_decode(const u_char **start, const u_char *end)
74
1.24k
{
75
1.24k
    uint32_t  u;
76
77
1.24k
    u = (uint32_t) **start;
78
79
1.24k
    if (u < 0x80) {
80
1.14k
        (*start)++;
81
1.14k
        return u;
82
1.14k
    }
83
84
101
    return nxt_utf8_decode2(start, end);
85
1.24k
}
86
87
88
/*
89
 * nxt_utf8_decode2() decodes only UTF-8 sequences of two or more bytes
90
 * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for
91
 * invalid or overlong UTF-8 sequence.
92
 */
93
94
uint32_t
95
nxt_utf8_decode2(const u_char **start, const u_char *end)
96
273
{
97
273
    u_char        c;
98
273
    size_t        n;
99
273
    uint32_t      u, overlong;
100
273
    const u_char  *p;
101
102
273
    p = *start;
103
273
    u = (uint32_t) *p;
104
105
273
    if (u >= 0xE0) {
106
107
157
        if (u >= 0xF0) {
108
109
120
            if (nxt_slow_path(u > 0xF4)) {
110
                /*
111
                 * The maximum valid Unicode character is 0x10FFFF
112
                 * which is encoded as 0xF4 0x8F 0xBF 0xBF.
113
                 */
114
14
                return 0xFFFFFFFF;
115
14
            }
116
117
106
            u &= 0x07;
118
106
            overlong = 0x00FFFF;
119
106
            n = 3;
120
121
106
        } else {
122
37
            u &= 0x0F;
123
37
            overlong = 0x07FF;
124
37
            n = 2;
125
37
        }
126
127
157
    } else if (u >= 0xC2) {
128
129
        /* 0x80 is encoded as 0xC2 0x80. */
130
131
91
        u &= 0x1F;
132
91
        overlong = 0x007F;
133
91
        n = 1;
134
135
91
    } else {
136
        /* u < 0xC2: not a valid lead byte of a multi-byte sequence. */
137
25
        return 0xFFFFFFFF;
138
25
    }
139
140
234
    p++;
141
142
234
    if (nxt_fast_path(p + n <= end)) {
143
144
471
        do {
145
471
            c = *p++;
146
            /*
147
             * The byte must be in the 0x80 - 0xBF range.
148
             * Values below 0x80 wrap around to >= 0x80 when 0x80 is subtracted.
149
             */
150
471
            c = c - 0x80;
151
152
471
            if (nxt_slow_path(c > 0x3F)) {
153
21
                return 0xFFFFFFFF;
154
21
            }
155
156
450
            u = (u << 6) | c;
157
450
            n--;
158
159
450
        } while (n != 0);
160
161
208
        if (overlong < u && u < 0x110000) {
162
166
            *start = p;
163
166
            return u;
164
166
        }
165
208
    }
166
167
47
    return 0xFFFFFFFF;
168
234
}
169
170
171
/*
172
 * nxt_utf8_casecmp() tests only up to the minimum of the given lengths, but
173
 * requires lengths of both strings because otherwise nxt_utf8_decode2()
174
 * may fail due to an incomplete sequence.
175
 */
176
177
nxt_int_t
178
nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
179
    size_t len2)
180
1.24k
{
181
1.24k
    int32_t       n;
182
1.24k
    uint32_t      u1, u2;
183
1.24k
    const u_char  *end1, *end2;
184
185
1.24k
    end1 = start1 + len1;
186
1.24k
    end2 = start2 + len2;
187
188
1.34k
    while (start1 < end1 && start2 < end2) {
189
190
1.34k
        u1 = nxt_utf8_lowcase(&start1, end1);
191
192
1.34k
        u2 = nxt_utf8_lowcase(&start2, end2);
193
194
1.34k
        if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) {
195
58
            return NXT_UTF8_SORT_INVALID;
196
58
        }
197
198
1.28k
        n = u1 - u2;
199
200
1.28k
        if (n != 0) {
201
1.18k
            return (nxt_int_t) n;
202
1.18k
        }
203
1.28k
    }
204
205
3
    return 0;
206
1.24k
}
207
208
209
uint32_t
210
nxt_utf8_lowcase(const u_char **start, const u_char *end)
211
2.68k
{
212
2.68k
    uint32_t        u;
213
2.68k
    const uint32_t  *block;
214
215
2.68k
    u = (uint32_t) **start;
216
217
2.68k
    if (nxt_fast_path(u < 0x80)) {
218
2.51k
        (*start)++;
219
220
2.51k
        return nxt_unicode_block_000[u];
221
2.51k
    }
222
223
172
    u = nxt_utf8_decode2(start, end);
224
225
172
    if (u <= NXT_UNICODE_MAX_LOWCASE) {
226
86
        block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE];
227
228
86
        if (block != NULL) {
229
71
            return block[u % NXT_UNICODE_BLOCK_SIZE];
230
71
        }
231
86
    }
232
233
101
    return u;
234
172
}
235
236
237
ssize_t
238
nxt_utf8_length(const u_char *p, size_t len)
239
0
{
240
0
    ssize_t       length;
241
0
    const u_char  *end;
242
243
0
    length = 0;
244
245
0
    end = p + len;
246
247
0
    while (p < end) {
248
0
        if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
249
0
            return -1;
250
0
        }
251
252
0
        length++;
253
0
    }
254
255
0
    return length;
256
0
}
257
258
259
nxt_bool_t
260
nxt_utf8_is_valid(const u_char *p, size_t len)
261
0
{
262
0
    const u_char  *end;
263
264
0
    end = p + len;
265
266
0
    while (p < end) {
267
0
        if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
268
0
            return 0;
269
0
        }
270
0
    }
271
272
0
    return 1;
273
0
}