Line | Count | Source (jump to first uncovered line) |
1 | | |
2 | | /* |
3 | | * Copyright (C) Igor Sysoev |
4 | | * Copyright (C) NGINX, Inc. |
5 | | */ |
6 | | |
7 | | #include <nxt_main.h> |
8 | | |
9 | | /* |
10 | | * The nxt_unicode_lowcase.h file is the auto-generated file from |
11 | | * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.: |
12 | | * |
13 | | * ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt |
14 | | * |
15 | | * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h |
16 | | * file and utf8_file_name_test should be built with this file. |
17 | | * Then a correct system specific file should be generated: |
18 | | * |
19 | | * ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl |
20 | | * |
21 | | * Only common and simple case foldings are supported. Full case folding |
22 | | * is not supported. Combined characters are also not supported. |
23 | | */ |
24 | | |
25 | | #if (NXT_MACOSX) |
26 | | #include <nxt_unicode_macosx_lowcase.h> |
27 | | |
28 | | #else |
29 | | #include <nxt_unicode_lowcase.h> |
30 | | #endif |
31 | | |
32 | | |
33 | | u_char * |
34 | | nxt_utf8_encode(u_char *p, uint32_t u) |
35 | 0 | { |
36 | 0 | if (u < 0x80) { |
37 | 0 | *p++ = (u_char) (u & 0xFF); |
38 | 0 | return p; |
39 | 0 | } |
40 | | |
41 | 0 | if (u < 0x0800) { |
42 | 0 | *p++ = (u_char) (( u >> 6) | 0xC0); |
43 | 0 | *p++ = (u_char) (( u & 0x3F) | 0x80); |
44 | 0 | return p; |
45 | 0 | } |
46 | | |
47 | 0 | if (u < 0x10000) { |
48 | 0 | *p++ = (u_char) ( (u >> 12) | 0xE0); |
49 | 0 | *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80); |
50 | 0 | *p++ = (u_char) (( u & 0x3F) | 0x80); |
51 | 0 | return p; |
52 | 0 | } |
53 | | |
54 | 0 | if (u < 0x110000) { |
55 | 0 | *p++ = (u_char) ( (u >> 18) | 0xF0); |
56 | 0 | *p++ = (u_char) (((u >> 12) & 0x3F) | 0x80); |
57 | 0 | *p++ = (u_char) (((u >> 6) & 0x3F) | 0x80); |
58 | 0 | *p++ = (u_char) (( u & 0x3F) | 0x80); |
59 | 0 | return p; |
60 | 0 | } |
61 | | |
62 | 0 | return NULL; |
63 | 0 | } |
64 | | |
65 | | |
66 | | /* |
67 | | * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid |
68 | | * character 0x00 - 0x10FFFF, or 0xFFFFFFFF for invalid or overlong |
69 | | * UTF-8 sequence. |
70 | | */ |
71 | | |
72 | | uint32_t |
73 | | nxt_utf8_decode(const u_char **start, const u_char *end) |
74 | 0 | { |
75 | 0 | uint32_t u; |
76 | |
|
77 | 0 | u = (uint32_t) **start; |
78 | |
|
79 | 0 | if (u < 0x80) { |
80 | 0 | (*start)++; |
81 | 0 | return u; |
82 | 0 | } |
83 | | |
84 | 0 | return nxt_utf8_decode2(start, end); |
85 | 0 | } |
86 | | |
87 | | |
88 | | /* |
89 | | * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only |
90 | | * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for |
91 | | * invalid or overlong UTF-8 sequence. |
92 | | */ |
93 | | |
94 | | uint32_t |
95 | | nxt_utf8_decode2(const u_char **start, const u_char *end) |
96 | 0 | { |
97 | 0 | u_char c; |
98 | 0 | size_t n; |
99 | 0 | uint32_t u, overlong; |
100 | 0 | const u_char *p; |
101 | |
|
102 | 0 | p = *start; |
103 | 0 | u = (uint32_t) *p; |
104 | |
|
105 | 0 | if (u >= 0xE0) { |
106 | |
|
107 | 0 | if (u >= 0xF0) { |
108 | |
|
109 | 0 | if (nxt_slow_path(u > 0xF4)) { |
110 | | /* |
111 | | * The maximum valid Unicode character is 0x10FFFF |
112 | | * which is encoded as 0xF4 0x8F 0xBF 0xBF. |
113 | | */ |
114 | 0 | return 0xFFFFFFFF; |
115 | 0 | } |
116 | | |
117 | 0 | u &= 0x07; |
118 | 0 | overlong = 0x00FFFF; |
119 | 0 | n = 3; |
120 | |
|
121 | 0 | } else { |
122 | 0 | u &= 0x0F; |
123 | 0 | overlong = 0x07FF; |
124 | 0 | n = 2; |
125 | 0 | } |
126 | |
|
127 | 0 | } else if (u >= 0xC2) { |
128 | | |
129 | | /* 0x80 is encoded as 0xC2 0x80. */ |
130 | |
|
131 | 0 | u &= 0x1F; |
132 | 0 | overlong = 0x007F; |
133 | 0 | n = 1; |
134 | |
|
135 | 0 | } else { |
136 | | /* u <= 0xC2 */ |
137 | 0 | return 0xFFFFFFFF; |
138 | 0 | } |
139 | | |
140 | 0 | p++; |
141 | |
|
142 | 0 | if (nxt_fast_path(p + n <= end)) { |
143 | |
|
144 | 0 | do { |
145 | 0 | c = *p++; |
146 | | /* |
147 | | * The byte must in the 0x80 - 0xBF range. |
148 | | * Values below 0x80 become >= 0x80. |
149 | | */ |
150 | 0 | c = c - 0x80; |
151 | |
|
152 | 0 | if (nxt_slow_path(c > 0x3F)) { |
153 | 0 | return 0xFFFFFFFF; |
154 | 0 | } |
155 | | |
156 | 0 | u = (u << 6) | c; |
157 | 0 | n--; |
158 | |
|
159 | 0 | } while (n != 0); |
160 | | |
161 | 0 | if (overlong < u && u < 0x110000) { |
162 | 0 | *start = p; |
163 | 0 | return u; |
164 | 0 | } |
165 | 0 | } |
166 | | |
167 | 0 | return 0xFFFFFFFF; |
168 | 0 | } |
169 | | |
170 | | |
171 | | /* |
172 | | * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but |
173 | | * requires lengths of both strings because otherwise nxt_utf8_decode2() |
174 | | * may fail due to incomplete sequence. |
175 | | */ |
176 | | |
177 | | nxt_int_t |
178 | | nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, |
179 | | size_t len2) |
180 | 0 | { |
181 | 0 | int32_t n; |
182 | 0 | uint32_t u1, u2; |
183 | 0 | const u_char *end1, *end2; |
184 | |
|
185 | 0 | end1 = start1 + len1; |
186 | 0 | end2 = start2 + len2; |
187 | |
|
188 | 0 | while (start1 < end1 && start2 < end2) { |
189 | |
|
190 | 0 | u1 = nxt_utf8_lowcase(&start1, end1); |
191 | |
|
192 | 0 | u2 = nxt_utf8_lowcase(&start2, end2); |
193 | |
|
194 | 0 | if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) { |
195 | 0 | return NXT_UTF8_SORT_INVALID; |
196 | 0 | } |
197 | | |
198 | 0 | n = u1 - u2; |
199 | |
|
200 | 0 | if (n != 0) { |
201 | 0 | return (nxt_int_t) n; |
202 | 0 | } |
203 | 0 | } |
204 | | |
205 | 0 | return 0; |
206 | 0 | } |
207 | | |
208 | | |
209 | | uint32_t |
210 | | nxt_utf8_lowcase(const u_char **start, const u_char *end) |
211 | 0 | { |
212 | 0 | uint32_t u; |
213 | 0 | const uint32_t *block; |
214 | |
|
215 | 0 | u = (uint32_t) **start; |
216 | |
|
217 | 0 | if (nxt_fast_path(u < 0x80)) { |
218 | 0 | (*start)++; |
219 | |
|
220 | 0 | return nxt_unicode_block_000[u]; |
221 | 0 | } |
222 | | |
223 | 0 | u = nxt_utf8_decode2(start, end); |
224 | |
|
225 | 0 | if (u <= NXT_UNICODE_MAX_LOWCASE) { |
226 | 0 | block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE]; |
227 | |
|
228 | 0 | if (block != NULL) { |
229 | 0 | return block[u % NXT_UNICODE_BLOCK_SIZE]; |
230 | 0 | } |
231 | 0 | } |
232 | | |
233 | 0 | return u; |
234 | 0 | } |
235 | | |
236 | | |
237 | | ssize_t |
238 | | nxt_utf8_length(const u_char *p, size_t len) |
239 | 0 | { |
240 | 0 | ssize_t length; |
241 | 0 | const u_char *end; |
242 | |
|
243 | 0 | length = 0; |
244 | |
|
245 | 0 | end = p + len; |
246 | |
|
247 | 0 | while (p < end) { |
248 | 0 | if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) { |
249 | 0 | return -1; |
250 | 0 | } |
251 | | |
252 | 0 | length++; |
253 | 0 | } |
254 | | |
255 | 0 | return length; |
256 | 0 | } |
257 | | |
258 | | |
259 | | nxt_bool_t |
260 | | nxt_utf8_is_valid(const u_char *p, size_t len) |
261 | 0 | { |
262 | 0 | const u_char *end; |
263 | |
|
264 | 0 | end = p + len; |
265 | |
|
266 | 0 | while (p < end) { |
267 | 0 | if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xFFFFFFFF)) { |
268 | 0 | return 0; |
269 | 0 | } |
270 | 0 | } |
271 | | |
272 | 0 | return 1; |
273 | 0 | } |