/src/unit/src/nxt_utf8.c

Source (jump to first uncovered line)

/*
 * Copyright (C) Igor Sysoev
 * Copyright (C) NGINX, Inc.
 */

#include <nxt_main.h>

/*
 * The nxt_unicode_lowcase.h file is the auto-generated file from
 * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
 *
 *   ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
 *
 * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h
 * file and utf8_file_name_test should be built with this file.
 * Then a correct system specific file should be generated:
 *
 *   ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl
 *
 * Only common and simple case foldings are supported.  Full case folding
 * is not supported.  Combined characters are also not supported.
 */

#if (NXT_MACOSX)
#include <nxt_unicode_macosx_lowcase.h>

#else
#include <nxt_unicode_lowcase.h>
#endif


/*
 * nxt_utf8_encode() stores the UTF-8 encoding of the code point "u"
 * at "p" and returns the pointer just past the last byte written.
 * One to four bytes are written; the buffer size is not checked here,
 * so the caller has to provide enough room.  NULL is returned and
 * nothing is written if "u" is greater than 0x10FFFF.
 *
 * Values in the 0xD800 - 0xDFFF range are encoded as ordinary
 * three-byte sequences.
 */

u_char *
nxt_utf8_encode(u_char *p, uint32_t u)
{
    if (u >= 0x110000) {
        /* Beyond the last Unicode code point. */
        return NULL;
    }

    if (u < 0x80) {
        /* 0xxxxxxx */
        p[0] = (u_char) u;

        return p + 1;
    }

    if (u < 0x0800) {
        /* 110xxxxx 10xxxxxx */
        p[0] = (u_char) (0xC0 | (u >> 6));
        p[1] = (u_char) (0x80 | (u & 0x3F));

        return p + 2;
    }

    if (u < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        p[0] = (u_char) (0xE0 | (u >> 12));
        p[1] = (u_char) (0x80 | ((u >> 6) & 0x3F));
        p[2] = (u_char) (0x80 | (u & 0x3F));

        return p + 3;
    }

    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    p[0] = (u_char) (0xF0 | (u >> 18));
    p[1] = (u_char) (0x80 | ((u >> 12) & 0x3F));
    p[2] = (u_char) (0x80 | ((u >> 6) & 0x3F));
    p[3] = (u_char) (0x80 | (u & 0x3F));

    return p + 4;
}


/*
 * nxt_utf8_decode() decodes one UTF-8 sequence at *start and returns
 * a character 0x00 - 0x10FFFF, or 0xFFFFFFFF for an invalid or overlong
 * sequence.  A single-byte (ASCII) character is handled here and *start
 * is advanced by one; everything else is passed to nxt_utf8_decode2().
 *
 * The first byte is read without looking at "end", so the caller must
 * ensure that *start < end.
 */

uint32_t
nxt_utf8_decode(const u_char **start, const u_char *end)
{
    const u_char  *p;

    p = *start;

    if (*p >= 0x80) {
        /* A multi-byte sequence or an invalid byte. */
        return nxt_utf8_decode2(start, end);
    }

    *start = p + 1;

    return (uint32_t) *p;
}


/*
 * nxt_utf8_decode2() decodes only UTF-8 sequences of two or more bytes
 * starting at *start and returns the decoded character 0x80 - 0x10FFFF.
 * On success *start is advanced past the sequence.
 *
 * 0xFFFFFFFF is returned and *start is left unchanged for an invalid
 * lead byte, an invalid continuation byte, a sequence truncated by
 * "end", an overlong sequence, or a value above 0x10FFFF.
 *
 * The lead byte is read unconditionally, so the caller must ensure
 * that *start < end.  Code points in the surrogate range
 * 0xD800 - 0xDFFF are not rejected here.
 */

uint32_t
nxt_utf8_decode2(const u_char **start, const u_char *end)
{
    u_char        c;
    size_t        n;
    uint32_t      u, overlong;
    const u_char  *p;

    p = *start;
    u = (uint32_t) *p;

    /*
     * Classify the lead byte: "n" is the number of continuation bytes
     * that must follow and "overlong" is the largest value that could
     * have been encoded in a shorter sequence.
     */

    if (u >= 0xE0) {

        if (u >= 0xF0) {

            if (nxt_slow_path(u > 0xF4)) {
                /*
                 * The maximum valid Unicode character is 0x10FFFF
                 * which is encoded as 0xF4 0x8F 0xBF 0xBF.
                 */
                return 0xFFFFFFFF;
            }

            u &= 0x07;
            overlong = 0x00FFFF;
            n = 3;

        } else {
            u &= 0x0F;
            overlong = 0x07FF;
            n = 2;
        }

    } else if (u >= 0xC2) {

        /* 0x80 is encoded as 0xC2 0x80. */

        u &= 0x1F;
        overlong = 0x007F;
        n = 1;

    } else {
        /*
         * u < 0xC2: a stray continuation byte (0x80 - 0xBF) or
         * a lead byte (0xC0, 0xC1) that can start only overlong
         * two-byte sequences.
         */
        return 0xFFFFFFFF;
    }

    p++;

    /*
     * The remaining length is compared with "n" instead of testing
     * "p + n <= end": for a truncated sequence "p + n" would point
     * beyond one past the end of the buffer, and forming such
     * a pointer is undefined behaviour.
     */

    if (nxt_fast_path(p < end && (size_t) (end - p) >= n)) {

        do {
            /*
             * The byte must be in the 0x80 - 0xBF range.  After the
             * subtraction is truncated to u_char, bytes below 0x80
             * wrap around to values >= 0x80, so the single test
             * below rejects everything outside of the range.
             */
            c = (u_char) (*p++ - 0x80);

            if (nxt_slow_path(c > 0x3F)) {
                return 0xFFFFFFFF;
            }

            u = (u << 6) | c;
            n--;

        } while (n != 0);

        if (overlong < u && u < 0x110000) {
            *start = p;
            return u;
        }
    }

    return 0xFFFFFFFF;
}


/*
 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
 * requires lengths of both strings because otherwise nxt_utf8_decode2()
 * may fail due to incomplete sequence.
 */

nxt_int_t
nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
    size_t len2)
{
    int32_t       n;
    uint32_t      u1, u2;
    const u_char  *end1, *end2;

    end1 = start1 + len1;
    end2 = start2 + len2;

    while (start1 < end1 && start2 < end2) {

        u1 = nxt_utf8_lowcase(&start1, end1);

        u2 = nxt_utf8_lowcase(&start2, end2);

        if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) {
            return NXT_UTF8_SORT_INVALID;
        }

        n = u1 - u2;

        if (n != 0) {
            return (nxt_int_t) n;
        }
    }

    return 0;
}


/*
 * nxt_utf8_lowcase() decodes one character at *start, advances *start
 * past it and returns the character mapped through the lowcase tables
 * included at the top of this file.
 *
 * A single-byte character is looked up directly in
 * nxt_unicode_block_000[].  Other characters are decoded with
 * nxt_utf8_decode2() and looked up in nxt_unicode_blocks[]; a character
 * above NXT_UNICODE_MAX_LOWCASE or in a block without a table is
 * returned as is.  This includes the 0xFFFFFFFF result of a failed
 * decoding, provided it is above NXT_UNICODE_MAX_LOWCASE (presumably
 * always the case; the constant is defined in the included header).
 *
 * The first byte is read without looking at "end", so the caller must
 * ensure that *start < end.
 */

uint32_t
nxt_utf8_lowcase(const u_char **start, const u_char *end)
{
    uint32_t        cp;
    const uint32_t  *table;

    cp = (uint32_t) **start;

    if (nxt_fast_path(cp < 0x80)) {
        *start += 1;

        return nxt_unicode_block_000[cp];
    }

    cp = nxt_utf8_decode2(start, end);

    if (cp > NXT_UNICODE_MAX_LOWCASE) {
        return cp;
    }

    table = nxt_unicode_blocks[cp / NXT_UNICODE_BLOCK_SIZE];

    return (table != NULL) ? table[cp % NXT_UNICODE_BLOCK_SIZE] : cp;
}


/*
 * nxt_utf8_length() returns the number of UTF-8 characters in the
 * "len" bytes at "p", or -1 as soon as nxt_utf8_decode() reports
 * an invalid sequence.  An empty string has the length 0.
 */

ssize_t
nxt_utf8_length(const u_char *p, size_t len)
{
    ssize_t       count;
    const u_char  *last;

    last = p + len;

    /* nxt_utf8_decode() advances "p" on every successful iteration. */

    for (count = 0; p < last; count++) {

        if (nxt_slow_path(nxt_utf8_decode(&p, last) == 0xFFFFFFFF)) {
            return -1;
        }
    }

    return count;
}


/*
 * nxt_utf8_is_valid() returns 1 if every sequence in the "len" bytes
 * at "p" is accepted by nxt_utf8_decode(), and 0 on the first sequence
 * that is not.  An empty string is valid.
 */

nxt_bool_t
nxt_utf8_is_valid(const u_char *p, size_t len)
{
    uint32_t      u;
    const u_char  *last;

    for (last = p + len; p < last; /* void */) {

        /* "p" is advanced by the decoder. */
        u = nxt_utf8_decode(&p, last);

        if (nxt_slow_path(u == 0xFFFFFFFF)) {
            return 0;
        }
    }

    return 1;
}

Coverage Report

Created: 2025-07-12 06:16

Line	Count	Source (jump to first uncovered line)
1
2		/*
3		* Copyright (C) Igor Sysoev
4		* Copyright (C) NGINX, Inc.
5		*/
6
7		#include <nxt_main.h>
8
9		/*
10		* The nxt_unicode_lowcase.h file is the auto-generated file from
11		* the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
12		*
13		* ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
14		*
15		* This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h
16		* file and utf8_file_name_test should be built with this file.
17		* Then a correct system specific file should be generated:
18		*
19		* ./build/utf8_file_name_test \| ./lib/src/nxt_unicode_lowcase.pl
20		*
21		* Only common and simple case foldings are supported. Full case foldings
22		* is not supported. Combined characters are also not supported.
23		*/
24
25		#if (NXT_MACOSX)
26		#include <nxt_unicode_macosx_lowcase.h>
27
28		#else
29		#include <nxt_unicode_lowcase.h>
30		#endif
31
32
33		u_char *
34		nxt_utf8_encode(u_char *p, uint32_t u)
35	0	{
36	0	if (u < 0x80) {
37	0	*p++ = (u_char) (u & 0xFF);
38	0	return p;
39	0	}
40
41	0	if (u < 0x0800) {
42	0	*p++ = (u_char) (( u >> 6) \| 0xC0);
43	0	*p++ = (u_char) (( u & 0x3F) \| 0x80);
44	0	return p;
45	0	}
46
47	0	if (u < 0x10000) {
48	0	*p++ = (u_char) ( (u >> 12) \| 0xE0);
49	0	*p++ = (u_char) (((u >> 6) & 0x3F) \| 0x80);
50	0	*p++ = (u_char) (( u & 0x3F) \| 0x80);
51	0	return p;
52	0	}
53
54	0	if (u < 0x110000) {
55	0	*p++ = (u_char) ( (u >> 18) \| 0xF0);
56	0	*p++ = (u_char) (((u >> 12) & 0x3F) \| 0x80);
57	0	*p++ = (u_char) (((u >> 6) & 0x3F) \| 0x80);
58	0	*p++ = (u_char) (( u & 0x3F) \| 0x80);
59	0	return p;
60	0	}
61
62	0	return NULL;
63	0	}
64
65
66		/*
67		* nxt_utf8_decode() decodes UTF-8 sequences and returns a valid
68		* character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong
69		* UTF-8 sequence.
70		*/
71
72		uint32_t
73		nxt_utf8_decode(const u_char **start, const u_char *end)
74	0	{
75	0	uint32_t u;
76
77	0	u = (uint32_t) **start;
78
79	0	if (u < 0x80) {
80	0	(*start)++;
81	0	return u;
82	0	}
83
84	0	return nxt_utf8_decode2(start, end);
85	0	}
86
87
88		/*
89		* nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only
90		* and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for
91		* invalid or overlong UTF-8 sequence.
92		*/
93
94		uint32_t
95		nxt_utf8_decode2(const u_char **start, const u_char *end)
96	0	{
97	0	u_char c;
98	0	size_t n;
99	0	uint32_t u, overlong;
100	0	const u_char *p;
101
102	0	p = *start;
103	0	u = (uint32_t) *p;
104
105	0	if (u >= 0xE0) {
106
107	0	if (u >= 0xF0) {
108
109	0	if (nxt_slow_path(u > 0xF4)) {
110		/*
111		* The maximum valid Unicode character is 0x10FFFF
112		* which is encoded as 0xF4 0x8F 0xBF 0xBF.
113		*/
114	0	return 0xFFFFFFFF;
115	0	}
116
117	0	u &= 0x07;
118	0	overlong = 0x00FFFF;
119	0	n = 3;
120
121	0	} else {
122	0	u &= 0x0F;
123	0	overlong = 0x07FF;
124	0	n = 2;
125	0	}
126
127	0	} else if (u >= 0xC2) {
128
129		/* 0x80 is encoded as 0xC2 0x80. */
130
131	0	u &= 0x1F;
132	0	overlong = 0x007F;
133	0	n = 1;
134
135	0	} else {
136		/* u <= 0xC2 */
137	0	return 0xFFFFFFFF;
138	0	}
139
140	0	p++;
141
142	0	if (nxt_fast_path(p + n <= end)) {
143
144	0	do {
145	0	c = *p++;
146		/*
147		* The byte must in the 0x80 - 0xBF range.
148		* Values below 0x80 become >= 0x80.
149		*/
150	0	c = c - 0x80;
151
152	0	if (nxt_slow_path(c > 0x3F)) {
153	0	return 0xFFFFFFFF;
154	0	}
155
156	0	u = (u << 6) \| c;
157	0	n--;
158
159	0	} while (n != 0);
160
161	0	if (overlong < u && u < 0x110000) {
162	0	*start = p;
163	0	return u;
164	0	}
165	0	}
166
167	0	return 0xFFFFFFFF;
168	0	}
169
170
171		/*
172		* nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
173		* requires lengths of both strings because otherwise nxt_utf8_decode2()
174		* may fail due to incomplete sequence.
175		*/
176
177		nxt_int_t
178		nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
179		size_t len2)
180	0	{
181	0	int32_t n;
182	0	uint32_t u1, u2;
183	0	const u_char *end1, *end2;
184
185	0	end1 = start1 + len1;
186	0	end2 = start2 + len2;
187
188	0	while (start1 < end1 && start2 < end2) {
189
190	0	u1 = nxt_utf8_lowcase(&start1, end1);
191
192	0	u2 = nxt_utf8_lowcase(&start2, end2);
193
194	0	if (nxt_slow_path((u1 \| u2) == 0xFFFFFFFF)) {
195	0	return NXT_UTF8_SORT_INVALID;
196	0	}
197
198	0	n = u1 - u2;
199
200	0	if (n != 0) {
201	0	return (nxt_int_t) n;
202	0	}
203	0	}
204
205	0	return 0;
206	0	}
207
208
209		uint32_t
210		nxt_utf8_lowcase(const u_char **start, const u_char *end)
211	0	{
212	0	uint32_t u;
213	0	const uint32_t *block;
214
215	0	u = (uint32_t) **start;
216
217	0	if (nxt_fast_path(u < 0x80)) {
218	0	(*start)++;
219
220	0	return nxt_unicode_block_000[u];
221	0	}
222
223	0	u = nxt_utf8_decode2(start, end);
224
225	0	if (u <= NXT_UNICODE_MAX_LOWCASE) {
226	0	block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE];
227
228	0	if (block != NULL) {
229	0	return block[u % NXT_UNICODE_BLOCK_SIZE];
230	0	}
231	0	}
232
233	0	return u;
234	0	}
235
236
237		ssize_t
238		nxt_utf8_length(const u_char *p, size_t len)
239	0	{
240	0	ssize_t length;
241	0	const u_char *end;
242
243	0	length = 0;
244
245	0	end = p + len;
246
247	0	while (p < end) {
248	0	if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
249	0	return -1;
250	0	}
251
252	0	length++;
253	0	}
254
255	0	return length;
256	0	}
257
258
259		nxt_bool_t
260		nxt_utf8_is_valid(const u_char *p, size_t len)
261	0	{
262	0	const u_char *end;
263
264	0	end = p + len;
265
266	0	while (p < end) {
267	0	if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
268	0	return 0;
269	0	}
270	0	}
271
272	0	return 1;
273	0	}