Line | Count | Source |
1 | | |
2 | | /* |
3 | | * Copyright (C) Igor Sysoev |
4 | | * Copyright (C) NGINX, Inc. |
5 | | */ |
6 | | |
7 | | #include <nxt_main.h> |
8 | | |
9 | | /* |
10 | | * The nxt_unicode_lowcase.h file is the auto-generated file from |
11 | | * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.: |
12 | | * |
13 | | * ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt |
14 | | * |
15 | | * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h |
16 | | * file and utf8_file_name_test should be built with this file. |
17 | | * Then a correct system specific file should be generated: |
18 | | * |
19 | | * ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl |
20 | | * |
21 | | * Only common and simple case foldings are supported. Full case foldings |
22 | | * are not supported. Combined characters are also not supported. |
23 | | */ |
24 | | |
25 | | #if (NXT_MACOSX) |
26 | | #include <nxt_unicode_macosx_lowcase.h> |
27 | | |
28 | | #else |
29 | | #include <nxt_unicode_lowcase.h> |
30 | | #endif |
31 | | |
32 | | |
33 | | u_char * |
34 | | nxt_utf8_encode(u_char *p, uint32_t u) |
35 | 0 | { |
36 | 0 | if (u < 0x80) { |
37 | 0 | *p++ = (u_char) (u & 0xFF); |
38 | 0 | return p; |
39 | 0 | } |
40 | | |
41 | 0 | if (u < 0x0800) { |
42 | 0 | *p++ = (u_char) (( u >> 6) | 0xC0); |
43 | 0 | *p++ = (u_char) (( u & 0x3F) | 0x80); |
44 | 0 | return p; |
45 | 0 | } |
46 | | |
47 | 0 | if (u < 0x10000) { |
48 | 0 | *p++ = (u_char) ( (u >> 12) | 0xE0); |
49 | 0 | *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80); |
50 | 0 | *p++ = (u_char) (( u & 0x3F) | 0x80); |
51 | 0 | return p; |
52 | 0 | } |
53 | | |
54 | 0 | if (u < 0x110000) { |
55 | 0 | *p++ = (u_char) ( (u >> 18) | 0xF0); |
56 | 0 | *p++ = (u_char) (((u >> 12) & 0x3F) | 0x80); |
57 | 0 | *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80); |
58 | 0 | *p++ = (u_char) (( u & 0x3F) | 0x80); |
59 | 0 | return p; |
60 | 0 | } |
61 | | |
62 | 0 | return NULL; |
63 | 0 | } |
64 | | |
65 | | |
66 | | /* |
67 | | * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid |
68 | | * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong |
69 | | * UTF-8 sequence. |
70 | | */ |
71 | | |
72 | | uint32_t |
73 | | nxt_utf8_decode(const u_char **start, const u_char *end) |
74 | 1.24k | { |
75 | 1.24k | uint32_t u; |
76 | | |
77 | 1.24k | u = (uint32_t) **start; |
78 | | |
79 | 1.24k | if (u < 0x80) { |
80 | 1.14k | (*start)++; |
81 | 1.14k | return u; |
82 | 1.14k | } |
83 | | |
84 | 101 | return nxt_utf8_decode2(start, end); |
85 | 1.24k | } |
86 | | |
87 | | |
88 | | /* |
89 | | * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only |
90 | | * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for |
91 | | * invalid or overlong UTF-8 sequence. |
92 | | */ |
93 | | |
94 | | uint32_t |
95 | | nxt_utf8_decode2(const u_char **start, const u_char *end) |
96 | 273 | { |
97 | 273 | u_char c; |
98 | 273 | size_t n; |
99 | 273 | uint32_t u, overlong; |
100 | 273 | const u_char *p; |
101 | | |
102 | 273 | p = *start; |
103 | 273 | u = (uint32_t) *p; |
104 | | |
105 | 273 | if (u >= 0xE0) { |
106 | | |
107 | 157 | if (u >= 0xF0) { |
108 | | |
109 | 120 | if (nxt_slow_path(u > 0xF4)) { |
110 | | /* |
111 | | * The maximum valid Unicode character is 0x10FFFF |
112 | | * which is encoded as 0xF4 0x8F 0xBF 0xBF. |
113 | | */ |
114 | 14 | return 0xFFFFFFFF; |
115 | 14 | } |
116 | | |
117 | 106 | u &= 0x07; |
118 | 106 | overlong = 0x00FFFF; |
119 | 106 | n = 3; |
120 | | |
121 | 106 | } else { |
122 | 37 | u &= 0x0F; |
123 | 37 | overlong = 0x07FF; |
124 | 37 | n = 2; |
125 | 37 | } |
126 | | |
127 | 157 | } else if (u >= 0xC2) { |
128 | | |
129 | | /* 0x80 is encoded as 0xC2 0x80. */ |
130 | | |
131 | 91 | u &= 0x1F; |
132 | 91 | overlong = 0x007F; |
133 | 91 | n = 1; |
134 | | |
135 | 91 | } else { |
136 | | /* u <= 0xC2 */ |
137 | 25 | return 0xFFFFFFFF; |
138 | 25 | } |
139 | | |
140 | 234 | p++; |
141 | | |
142 | 234 | if (nxt_fast_path(p + n <= end)) { |
143 | | |
144 | 471 | do { |
145 | 471 | c = *p++; |
146 | | /* |
147 | | * The byte must in the 0x80 - 0xBF range. |
148 | | * Values below 0x80 become >= 0x80. |
149 | | */ |
150 | 471 | c = c - 0x80; |
151 | | |
152 | 471 | if (nxt_slow_path(c > 0x3F)) { |
153 | 21 | return 0xFFFFFFFF; |
154 | 21 | } |
155 | | |
156 | 450 | u = (u << 6) | c; |
157 | 450 | n--; |
158 | | |
159 | 450 | } while (n != 0); |
160 | | |
161 | 208 | if (overlong < u && u < 0x110000) { |
162 | 166 | *start = p; |
163 | 166 | return u; |
164 | 166 | } |
165 | 208 | } |
166 | | |
167 | 47 | return 0xFFFFFFFF; |
168 | 234 | } |
169 | | |
170 | | |
171 | | /* |
172 | | * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but |
173 | | * requires lengths of both strings because otherwise nxt_utf8_decode2() |
174 | | * may fail due to incomplete sequence. |
175 | | */ |
176 | | |
177 | | nxt_int_t |
178 | | nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, |
179 | | size_t len2) |
180 | 1.24k | { |
181 | 1.24k | int32_t n; |
182 | 1.24k | uint32_t u1, u2; |
183 | 1.24k | const u_char *end1, *end2; |
184 | | |
185 | 1.24k | end1 = start1 + len1; |
186 | 1.24k | end2 = start2 + len2; |
187 | | |
188 | 1.34k | while (start1 < end1 && start2 < end2) { |
189 | | |
190 | 1.34k | u1 = nxt_utf8_lowcase(&start1, end1); |
191 | | |
192 | 1.34k | u2 = nxt_utf8_lowcase(&start2, end2); |
193 | | |
194 | 1.34k | if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) { |
195 | 58 | return NXT_UTF8_SORT_INVALID; |
196 | 58 | } |
197 | | |
198 | 1.28k | n = u1 - u2; |
199 | | |
200 | 1.28k | if (n != 0) { |
201 | 1.18k | return (nxt_int_t) n; |
202 | 1.18k | } |
203 | 1.28k | } |
204 | | |
205 | 3 | return 0; |
206 | 1.24k | } |
207 | | |
208 | | |
209 | | uint32_t |
210 | | nxt_utf8_lowcase(const u_char **start, const u_char *end) |
211 | 2.68k | { |
212 | 2.68k | uint32_t u; |
213 | 2.68k | const uint32_t *block; |
214 | | |
215 | 2.68k | u = (uint32_t) **start; |
216 | | |
217 | 2.68k | if (nxt_fast_path(u < 0x80)) { |
218 | 2.51k | (*start)++; |
219 | | |
220 | 2.51k | return nxt_unicode_block_000[u]; |
221 | 2.51k | } |
222 | | |
223 | 172 | u = nxt_utf8_decode2(start, end); |
224 | | |
225 | 172 | if (u <= NXT_UNICODE_MAX_LOWCASE) { |
226 | 86 | block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE]; |
227 | | |
228 | 86 | if (block != NULL) { |
229 | 71 | return block[u % NXT_UNICODE_BLOCK_SIZE]; |
230 | 71 | } |
231 | 86 | } |
232 | | |
233 | 101 | return u; |
234 | 172 | } |
235 | | |
236 | | |
237 | | ssize_t |
238 | | nxt_utf8_length(const u_char *p, size_t len) |
239 | 0 | { |
240 | 0 | ssize_t length; |
241 | 0 | const u_char *end; |
242 | |
|
243 | 0 | length = 0; |
244 | |
|
245 | 0 | end = p + len; |
246 | |
|
247 | 0 | while (p < end) { |
248 | 0 | if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) { |
249 | 0 | return -1; |
250 | 0 | } |
251 | | |
252 | 0 | length++; |
253 | 0 | } |
254 | | |
255 | 0 | return length; |
256 | 0 | } |
257 | | |
258 | | |
259 | | nxt_bool_t |
260 | | nxt_utf8_is_valid(const u_char *p, size_t len) |
261 | 0 | { |
262 | 0 | const u_char *end; |
263 | |
|
264 | 0 | end = p + len; |
265 | |
|
266 | 0 | while (p < end) { |
267 | 0 | if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) { |
268 | 0 | return 0; |
269 | 0 | } |
270 | 0 | } |
271 | | |
272 | 0 | return 1; |
273 | 0 | } |