/src/unit/src/nxt_utf8.c

Source

/*
 * Copyright (C) Igor Sysoev
 * Copyright (C) NGINX, Inc.
 */

#include <nxt_main.h>

/*
 * The nxt_unicode_lowcase.h file is the auto-generated file from
 * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
 *
 *   ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
 *
 * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h
 * file and utf8_file_name_test should be built with this file.
 * Then a correct system specific file should be generated:
 *
 *   ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl
 *
 * Only common and simple case foldings are supported.  Full case folding
 * is not supported.  Combined characters are also not supported.
 */

#if (NXT_MACOSX)
#include <nxt_unicode_macosx_lowcase.h>

#else
#include <nxt_unicode_lowcase.h>
#endif


/*
 * nxt_utf8_encode() stores the UTF-8 encoding of the code point "u"
 * at "p" and returns the pointer just past the last stored byte.
 * From one to four bytes are written, so the caller has to provide
 * room for up to four bytes.  Values above 0x10FFFF are not encoded:
 * nothing is written and NULL is returned.
 *
 * No special treatment is given to the 0xD800 - 0xDFFF range: such
 * values are encoded as ordinary three-byte sequences.
 */

u_char *
nxt_utf8_encode(u_char *p, uint32_t u)
{
    unsigned  shift;

    if (u >= 0x110000) {
        return NULL;
    }

    if (u < 0x80) {
        /* ASCII is stored as is. */
        *p = (u_char) u;
        return p + 1;
    }

    /*
     * Store the lead byte and remember the bit position
     * of the first continuation byte payload.
     */

    if (u < 0x0800) {
        /* 110xxxxx 10xxxxxx */
        *p++ = (u_char) (0xC0 | (u >> 6));
        shift = 0;

    } else if (u < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        *p++ = (u_char) (0xE0 | (u >> 12));
        shift = 6;

    } else {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        *p++ = (u_char) (0xF0 | (u >> 18));
        shift = 12;
    }

    /* Each continuation byte carries six payload bits, high bits first. */

    for ( ;; ) {
        *p++ = (u_char) (0x80 | ((u >> shift) & 0x3F));

        if (shift == 0) {
            break;
        }

        shift -= 6;
    }

    return p;
}


/*
 * nxt_utf8_decode() decodes one UTF-8 sequence starting at *start and
 * returns a valid character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid
 * or overlong UTF-8 sequence.  A single-byte (ASCII) character is
 * handled here, everything else is delegated to nxt_utf8_decode2().
 *
 * For an ASCII character *start is advanced by one byte.  The first
 * byte is read before "end" is looked at, so the caller must ensure
 * that *start points to a readable byte.
 */

uint32_t
nxt_utf8_decode(const u_char **start, const u_char *end)
{
    const u_char  *p;

    p = *start;

    if (*p >= 0x80) {
        /* A lead or a stray continuation byte of a multi-byte sequence. */
        return nxt_utf8_decode2(start, end);
    }

    *start = p + 1;

    return (uint32_t) *p;
}


/*
 * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only
 * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for
 * an invalid, overlong, or truncated UTF-8 sequence.
 *
 * On success *start is advanced past the decoded sequence; on failure
 * *start is left unchanged.  The lead byte at *start is read before
 * "end" is consulted, so the caller must ensure that *start < end.
 *
 * The decoded value is checked only against the overlong bound and
 * 0x10FFFF; the 0xD800 - 0xDFFF range is not rejected here.
 */

uint32_t
nxt_utf8_decode2(const u_char **start, const u_char *end)
{
    u_char        c;
    size_t        n;
    uint32_t      u, overlong;
    const u_char  *p;

    p = *start;
    u = (uint32_t) *p;

    /*
     * Classify the lead byte: "n" is the number of continuation bytes
     * to follow, "overlong" is the largest value representable by
     * a shorter sequence, and "u" keeps the payload bits of the lead byte.
     */

    if (u >= 0xE0) {

        if (u >= 0xF0) {

            if (nxt_slow_path(u > 0xF4)) {
                /*
                 * The maximum valid Unicode character is 0x10FFFF
                 * which is encoded as 0xF4 0x8F 0xBF 0xBF.
                 */
                return 0xFFFFFFFF;
            }

            u &= 0x07;
            overlong = 0x00FFFF;
            n = 3;

        } else {
            u &= 0x0F;
            overlong = 0x07FF;
            n = 2;
        }

    } else if (u >= 0xC2) {

        /* 0x80 is encoded as 0xC2 0x80. */

        u &= 0x1F;
        overlong = 0x007F;
        n = 1;

    } else {
        /*
         * u < 0xC2: an ASCII byte which is not handled by this function,
         * a stray continuation byte 0x80 - 0xBF, or a lead byte 0xC0/0xC1
         * which can start only an overlong two-byte sequence.
         */
        return 0xFFFFFFFF;
    }

    p++;

    /*
     * The number of remaining bytes is compared with "n" instead of
     * testing "p + n <= end": for a truncated sequence "p + n" would
     * point more than one byte past the end of the buffer, and forming
     * such a pointer is undefined behaviour.
     */

    if (nxt_slow_path(p > end || (size_t) (end - p) < n)) {
        return 0xFFFFFFFF;
    }

    do {
        c = *p++;
        /*
         * The byte must be in the 0x80 - 0xBF range.  After the
         * subtraction is stored back in u_char the valid bytes map
         * to 0x00 - 0x3F, whereas bytes below 0x80 wrap around and
         * become >= 0x80 and bytes above 0xBF become >= 0x40, so
         * a single comparison rejects both.
         */
        c = c - 0x80;

        if (nxt_slow_path(c > 0x3F)) {
            return 0xFFFFFFFF;
        }

        u = (u << 6) | c;
        n--;

    } while (n != 0);

    /* Reject overlong encodings and values above 0x10FFFF. */

    if (overlong < u && u < 0x110000) {
        *start = p;
        return u;
    }

    return 0xFFFFFFFF;
}


/*
 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
 * requires lengths of both strings because otherwise nxt_utf8_decode2()
 * may fail due to incomplete sequence.
 */

nxt_int_t
nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
    size_t len2)
{
    int32_t       n;
    uint32_t      u1, u2;
    const u_char  *end1, *end2;

    end1 = start1 + len1;
    end2 = start2 + len2;

    while (start1 < end1 && start2 < end2) {

        u1 = nxt_utf8_lowcase(&start1, end1);

        u2 = nxt_utf8_lowcase(&start2, end2);

        if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) {
            return NXT_UTF8_SORT_INVALID;
        }

        n = u1 - u2;

        if (n != 0) {
            return (nxt_int_t) n;
        }
    }

    return 0;
}


/*
 * nxt_utf8_lowcase() decodes one character starting at *start and
 * returns its entry from the case folding tables of the included
 * lowcase header.  ASCII characters are looked up directly in
 * nxt_unicode_block_000; other characters are decoded with
 * nxt_utf8_decode2() and looked up in nxt_unicode_blocks.
 *
 * A value above NXT_UNICODE_MAX_LOWCASE or a value whose block is
 * absent from the table is returned unchanged.  This is presumably
 * also how the 0xFFFFFFFF error value of nxt_utf8_decode2() reaches
 * the caller, assuming NXT_UNICODE_MAX_LOWCASE is below it.
 */

uint32_t
nxt_utf8_lowcase(const u_char **start, const u_char *end)
{
    uint32_t        c;
    const uint32_t  *table;

    c = (uint32_t) **start;

    if (nxt_fast_path(c < 0x80)) {
        *start += 1;

        return nxt_unicode_block_000[c];
    }

    c = nxt_utf8_decode2(start, end);

    if (c > NXT_UNICODE_MAX_LOWCASE) {
        return c;
    }

    table = nxt_unicode_blocks[c / NXT_UNICODE_BLOCK_SIZE];

    return (table != NULL) ? table[c % NXT_UNICODE_BLOCK_SIZE] : c;
}


/*
 * nxt_utf8_length() returns the number of characters decoded by
 * nxt_utf8_decode() in the "len" bytes starting at "p", or -1 as
 * soon as the decoder reports 0xFFFFFFFF.  An empty input yields 0.
 */

ssize_t
nxt_utf8_length(const u_char *p, size_t len)
{
    ssize_t       count;
    const u_char  *end;

    end = p + len;

    /* The decoder advances "p"; one iteration is one character. */

    for (count = 0; p < end; count++) {

        if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
            return -1;
        }
    }

    return count;
}


/*
 * nxt_utf8_is_valid() returns 1 if every sequence in the "len" bytes
 * starting at "p" is accepted by nxt_utf8_decode(), and 0 as soon as
 * the decoder reports 0xFFFFFFFF.  An empty input is valid.
 */

nxt_bool_t
nxt_utf8_is_valid(const u_char *p, size_t len)
{
    uint32_t      u;
    const u_char  *end;

    end = p + len;

    for ( ;; ) {

        if (p >= end) {
            /* The whole input has been consumed without an error. */
            return 1;
        }

        u = nxt_utf8_decode(&p, end);

        if (nxt_slow_path(u == 0xFFFFFFFF)) {
            return 0;
        }
    }
}

Coverage Report

Created: 2025-11-24 06:33

Line	Count	Source
1
2		/*
3		* Copyright (C) Igor Sysoev
4		* Copyright (C) NGINX, Inc.
5		*/
6
7		#include <nxt_main.h>
8
9		/*
10		* The nxt_unicode_lowcase.h file is the auto-generated file from
11		* the CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
12		*
13		* ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
14		*
15		* This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h
16		* file and utf8_file_name_test should be built with this file.
17		* Then a correct system specific file should be generated:
18		*
19		* ./build/utf8_file_name_test \| ./lib/src/nxt_unicode_lowcase.pl
20		*
21		* Only common and simple case foldings are supported. Full case foldings
22		* is not supported. Combined characters are also not supported.
23		*/
24
25		#if (NXT_MACOSX)
26		#include <nxt_unicode_macosx_lowcase.h>
27
28		#else
29		#include <nxt_unicode_lowcase.h>
30		#endif
31
32
33		u_char *
34		nxt_utf8_encode(u_char *p, uint32_t u)
35	0	{
36	0	if (u < 0x80) {
37	0	*p++ = (u_char) (u & 0xFF);
38	0	return p;
39	0	}
40
41	0	if (u < 0x0800) {
42	0	*p++ = (u_char) (( u >> 6) \| 0xC0);
43	0	*p++ = (u_char) (( u & 0x3F) \| 0x80);
44	0	return p;
45	0	}
46
47	0	if (u < 0x10000) {
48	0	*p++ = (u_char) ( (u >> 12) \| 0xE0);
49	0	*p++ = (u_char) (((u >> 6) & 0x3F) \| 0x80);
50	0	*p++ = (u_char) (( u & 0x3F) \| 0x80);
51	0	return p;
52	0	}
53
54	0	if (u < 0x110000) {
55	0	*p++ = (u_char) ( (u >> 18) \| 0xF0);
56	0	*p++ = (u_char) (((u >> 12) & 0x3F) \| 0x80);
57	0	*p++ = (u_char) (((u >> 6) & 0x3F) \| 0x80);
58	0	*p++ = (u_char) (( u & 0x3F) \| 0x80);
59	0	return p;
60	0	}
61
62	0	return NULL;
63	0	}
64
65
66		/*
67		* nxt_utf8_decode() decodes UTF-8 sequences and returns a valid
68		* character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong
69		* UTF-8 sequence.
70		*/
71
72		uint32_t
73		nxt_utf8_decode(const u_char *start, const u_char end)
74	1.24k	{
75	1.24k	uint32_t u;
76
77	1.24k	u = (uint32_t) **start;
78
79	1.24k	if (u < 0x80) {
80	1.14k	(*start)++;
81	1.14k	return u;
82	1.14k	}
83
84	101	return nxt_utf8_decode2(start, end);
85	1.24k	}
86
87
88		/*
89		* nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only
90		* and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for
91		* invalid or overlong UTF-8 sequence.
92		*/
93
94		uint32_t
95		nxt_utf8_decode2(const u_char *start, const u_char end)
96	273	{
97	273	u_char c;
98	273	size_t n;
99	273	uint32_t u, overlong;
100	273	const u_char *p;
101
102	273	p = *start;
103	273	u = (uint32_t) *p;
104
105	273	if (u >= 0xE0) {
106
107	157	if (u >= 0xF0) {
108
109	120	if (nxt_slow_path(u > 0xF4)) {
110		/*
111		* The maximum valid Unicode character is 0x10FFFF
112		* which is encoded as 0xF4 0x8F 0xBF 0xBF.
113		*/
114	14	return 0xFFFFFFFF;
115	14	}
116
117	106	u &= 0x07;
118	106	overlong = 0x00FFFF;
119	106	n = 3;
120
121	106	} else {
122	37	u &= 0x0F;
123	37	overlong = 0x07FF;
124	37	n = 2;
125	37	}
126
127	157	} else if (u >= 0xC2) {
128
129		/* 0x80 is encoded as 0xC2 0x80. */
130
131	91	u &= 0x1F;
132	91	overlong = 0x007F;
133	91	n = 1;
134
135	91	} else {
136		/* u <= 0xC2 */
137	25	return 0xFFFFFFFF;
138	25	}
139
140	234	p++;
141
142	234	if (nxt_fast_path(p + n <= end)) {
143
144	471	do {
145	471	c = *p++;
146		/*
147		* The byte must in the 0x80 - 0xBF range.
148		* Values below 0x80 become >= 0x80.
149		*/
150	471	c = c - 0x80;
151
152	471	if (nxt_slow_path(c > 0x3F)) {
153	21	return 0xFFFFFFFF;
154	21	}
155
156	450	u = (u << 6) \| c;
157	450	n--;
158
159	450	} while (n != 0);
160
161	208	if (overlong < u && u < 0x110000) {
162	166	*start = p;
163	166	return u;
164	166	}
165	208	}
166
167	47	return 0xFFFFFFFF;
168	234	}
169
170
171		/*
172		* nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
173		* requires lengths of both strings because otherwise nxt_utf8_decode2()
174		* may fail due to incomplete sequence.
175		*/
176
177		nxt_int_t
178		nxt_utf8_casecmp(const u_char start1, const u_char start2, size_t len1,
179		size_t len2)
180	1.24k	{
181	1.24k	int32_t n;
182	1.24k	uint32_t u1, u2;
183	1.24k	const u_char end1, end2;
184
185	1.24k	end1 = start1 + len1;
186	1.24k	end2 = start2 + len2;
187
188	1.34k	while (start1 < end1 && start2 < end2) {
189
190	1.34k	u1 = nxt_utf8_lowcase(&start1, end1);
191
192	1.34k	u2 = nxt_utf8_lowcase(&start2, end2);
193
194	1.34k	if (nxt_slow_path((u1 \| u2) == 0xFFFFFFFF)) {
195	58	return NXT_UTF8_SORT_INVALID;
196	58	}
197
198	1.28k	n = u1 - u2;
199
200	1.28k	if (n != 0) {
201	1.18k	return (nxt_int_t) n;
202	1.18k	}
203	1.28k	}
204
205	3	return 0;
206	1.24k	}
207
208
209		uint32_t
210		nxt_utf8_lowcase(const u_char *start, const u_char end)
211	2.68k	{
212	2.68k	uint32_t u;
213	2.68k	const uint32_t *block;
214
215	2.68k	u = (uint32_t) **start;
216
217	2.68k	if (nxt_fast_path(u < 0x80)) {
218	2.51k	(*start)++;
219
220	2.51k	return nxt_unicode_block_000[u];
221	2.51k	}
222
223	172	u = nxt_utf8_decode2(start, end);
224
225	172	if (u <= NXT_UNICODE_MAX_LOWCASE) {
226	86	block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE];
227
228	86	if (block != NULL) {
229	71	return block[u % NXT_UNICODE_BLOCK_SIZE];
230	71	}
231	86	}
232
233	101	return u;
234	172	}
235
236
237		ssize_t
238		nxt_utf8_length(const u_char *p, size_t len)
239	0	{
240	0	ssize_t length;
241	0	const u_char *end;
242
243	0	length = 0;
244
245	0	end = p + len;
246
247	0	while (p < end) {
248	0	if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
249	0	return -1;
250	0	}
251
252	0	length++;
253	0	}
254
255	0	return length;
256	0	}
257
258
259		nxt_bool_t
260		nxt_utf8_is_valid(const u_char *p, size_t len)
261	0	{
262	0	const u_char *end;
263
264	0	end = p + len;
265
266	0	while (p < end) {
267	0	if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) {
268	0	return 0;
269	0	}
270	0	}
271
272	0	return 1;
273	0	}