/src/lighttpd1.4/src/burl.c
Line | Count | Source |
1 | | /* |
2 | | * burl - buffer URL normalization |
3 | | * |
4 | | * Copyright(c) 2018 Glenn Strauss gstrauss()gluelogic.com All rights reserved |
5 | | * License: BSD 3-clause (same as lighttpd) |
6 | | */ |
7 | | #include "first.h" |
8 | | #include "burl.h" |
9 | | |
10 | | #include <string.h> |
11 | | |
12 | | #include "buffer.h" |
13 | | #include "base64.h" |
14 | | |
/* uppercase hexadecimal digits, indexed by nibble value 0..15 */
static const char hex_chars_uc[] = "0123456789ABCDEF";

/* Per-byte lookup table (index: unsigned byte value 0x00..0xFF).
 * An entry is 1 when the byte is NOT in the set of characters that may be
 * left as-is in the URL, and 0 when the byte is in that set:
 *   ! $ & ' ( ) * + , - . / 0-9 : ; = ? @ A-Z _ a-z ~
 * Note that '%' and '#' are flagged 1 here; the normalization routines
 * in this file test for those two bytes explicitly after consulting
 * this table. */
static const char encoded_chars_http_uri_reqd[] = {
  /*      x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
  /* 0x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* control chars */
  /* 1x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* control chars */
  /* 2x */ 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* space " # % */
  /* 3x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* < > */
  /* 4x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 5x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, /* [ \ ] ^ */
  /* 6x */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* ` */
  /* 7x */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, /* { | } DEL */
  /* 8x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* high-bit bytes */
  /* 9x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Ax */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Bx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Cx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Dx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Ex */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* Fx */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
39 | | |
40 | | |
/* li_cton(c,n): hex char to nibble
 * Evaluates to nonzero when c is a hexadecimal digit (0-9, A-F, a-f), in
 * which case n has been assigned its value (0..15).  Evaluates to 0
 * otherwise, and n is then left holding an unspecified intermediate value.
 * c (char) and n (nibble) MUST be unsigned integer types: the range checks
 * depend on unsigned wraparound of the subtractions.
 * Both arguments are evaluated more than once. */
#define li_cton(c,n) \
  (((n) = (c) - '0') <= 9 \
   || (((n) = ((c)&0xdf) - 'A') <= 5 && ((n) += 10, 1)))

/* b (byte) MUST be unsigned integer type
 * https://en.wikipedia.org/wiki/UTF-8
 * detect invalid UTF-8 byte and byte in overlong encoding of 7-bit ASCII
 * (but does not detect other invalid/overlong multibyte encoding sequences)
 * (thin alias; the actual check is light_utf8_invalid_byte()) */
#define li_utf8_invalid_byte(b) light_utf8_invalid_byte(b)
50 | | |
51 | | |
/* Return 1 if c is alphanumeric (per light_isalnum()) or one of the four
 * marks '-' '.' '_' '~'; otherwise return 0.
 * (This is the "unreserved" set: - . 0-9 A-Z _ a-z ~) */
static int burl_is_unreserved (const int c)
{
    switch (c) {
      case '-':
      case '.':
      case '_':
      case '~':
        return 1;
      default:
        return light_isalnum(c) ? 1 : 0;
    }
}
56 | | |
57 | | |
58 | | static int burl_normalize_basic_unreserved_fix (buffer *b, buffer *t, int i, int qs) |
59 | 590 | { |
60 | 590 | int j = i; |
61 | 590 | const int used = (int)buffer_clen(b); |
62 | 590 | const unsigned char * const s = (unsigned char *)b->ptr; |
63 | 590 | unsigned char * const p = |
64 | 590 | (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1); |
65 | 590 | unsigned int n1, n2; |
66 | 590 | memcpy(p, s, (size_t)i); |
67 | 22.2M | for (; i < used; ++i, ++j) { |
68 | 22.2M | if (!encoded_chars_http_uri_reqd[s[i]]) { |
69 | 1.82M | p[j] = s[i]; |
70 | 1.82M | if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = j; |
71 | 1.82M | } |
72 | 20.4M | else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) { |
73 | 7.32k | const unsigned int x = (n1 << 4) | n2; |
74 | 7.32k | if (burl_is_unreserved(x)) { |
75 | 2.33k | p[j] = x; |
76 | 2.33k | } |
77 | 4.99k | else { |
78 | 4.99k | p[j] = '%'; |
79 | 4.99k | p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/ |
80 | 4.99k | p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/ |
81 | 4.99k | if (li_utf8_invalid_byte(x)) qs = -2; |
82 | 4.99k | } |
83 | 7.32k | i+=2; |
84 | 7.32k | } |
85 | 20.4M | else if (s[i] == '#') break; /* ignore fragment */ |
86 | 20.4M | else { |
87 | 20.4M | p[j] = '%'; |
88 | 20.4M | p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF]; |
89 | 20.4M | p[++j] = hex_chars_uc[s[i] & 0xF]; |
90 | 20.4M | if (li_utf8_invalid_byte(s[i])) qs = -2; |
91 | 20.4M | } |
92 | 22.2M | } |
93 | 590 | buffer_copy_string_len(b, (char *)p, (size_t)j); |
94 | 590 | return qs; |
95 | 590 | } |
96 | | |
97 | | |
98 | | static int burl_normalize_basic_unreserved (buffer *b, buffer *t) |
99 | 755 | { |
100 | 755 | const unsigned char * const s = (unsigned char *)b->ptr; |
101 | 755 | const int used = (int)buffer_clen(b); |
102 | 755 | unsigned int n1, n2, x; |
103 | 755 | int qs = -1; |
104 | | |
105 | 742k | for (int i = 0; i < used; ++i) { |
106 | 742k | if (!encoded_chars_http_uri_reqd[s[i]]) { |
107 | 739k | if (__builtin_expect( (s[i] == '?'), 0) && -1 == qs) qs = i; |
108 | 739k | } |
109 | 2.84k | else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2) |
110 | 2.30k | && !burl_is_unreserved((x = (n1 << 4) | n2))) { |
111 | 2.25k | if (li_utf8_invalid_byte(x)) qs = -2; |
112 | 2.25k | if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */ |
113 | 2.25k | if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */ |
114 | 2.25k | i+=2; |
115 | 2.25k | } |
116 | 591 | else if (s[i] == '#') { /* ignore fragment */ |
117 | 1 | buffer_truncate(b, (size_t)i); |
118 | 1 | break; |
119 | 1 | } |
120 | 590 | else { |
121 | 590 | qs = burl_normalize_basic_unreserved_fix(b, t, i, qs); |
122 | 590 | break; |
123 | 590 | } |
124 | 742k | } |
125 | | |
126 | 755 | return qs; |
127 | 755 | } |
128 | | |
129 | | |
130 | | static int burl_normalize_basic_required_fix (buffer *b, buffer *t, int i, int qs) |
131 | 678 | { |
132 | 678 | int j = i; |
133 | 678 | const int used = (int)buffer_clen(b); |
134 | 678 | const unsigned char * const s = (unsigned char *)b->ptr; |
135 | 678 | unsigned char * const p = |
136 | 678 | (unsigned char *)buffer_string_prepare_copy(t,i+(used-i)*3+1); |
137 | 678 | unsigned int n1, n2; |
138 | 678 | int invalid_utf8 = 0; |
139 | 678 | memcpy(p, s, (size_t)i); |
140 | 23.9M | for (; i < used; ++i, ++j) { |
141 | 23.9M | if (!encoded_chars_http_uri_reqd[s[i]]) { |
142 | 1.63M | p[j] = s[i]; |
143 | 1.63M | if (__builtin_expect( (s[i] == '?'), 0)) qs = j; |
144 | 1.63M | } |
145 | 22.3M | else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2)) { |
146 | 24.4k | const unsigned int x = (n1 << 4) | n2; |
147 | 24.4k | if (!encoded_chars_http_uri_reqd[x] |
148 | 15.8k | && (qs < 0 |
149 | 15.8k | ? (x != '/' && x != '?') |
150 | 15.8k | : (x != '&' && x != '=' && x != ';' && x != '+'))) { |
151 | 9.09k | p[j] = x; |
152 | 9.09k | } |
153 | 15.3k | else { |
154 | 15.3k | p[j] = '%'; |
155 | 15.3k | p[++j] = hex_chars_uc[n1]; /*(s[i+1] & 0xdf)*/ |
156 | 15.3k | p[++j] = hex_chars_uc[n2]; /*(s[i+2] & 0xdf)*/ |
157 | 15.3k | invalid_utf8 |= li_utf8_invalid_byte(x); |
158 | 15.3k | } |
159 | 24.4k | i+=2; |
160 | 24.4k | } |
161 | 22.3M | else if (s[i] == '#') break; /* ignore fragment */ |
162 | 22.3M | else { |
163 | 22.3M | p[j] = '%'; |
164 | 22.3M | p[++j] = hex_chars_uc[(s[i] >> 4) & 0xF]; |
165 | 22.3M | p[++j] = hex_chars_uc[s[i] & 0xF]; |
166 | 22.3M | invalid_utf8 |= li_utf8_invalid_byte(s[i]); |
167 | 22.3M | } |
168 | 23.9M | } |
169 | 678 | buffer_copy_string_len(b, (char *)p, (size_t)j); |
170 | 678 | return !invalid_utf8 ? qs : -2; |
171 | 678 | } |
172 | | |
173 | | |
174 | | static int burl_normalize_basic_required (buffer *b, buffer *t) |
175 | 884 | { |
176 | 884 | const unsigned char * const s = (unsigned char *)b->ptr; |
177 | 884 | const int used = (int)buffer_clen(b); |
178 | 884 | unsigned int n1, n2, x; |
179 | 884 | int qs = -1; |
180 | 884 | int invalid_utf8 = 0; |
181 | | |
182 | 733k | for (int i = 0; i < used; ++i) { |
183 | 732k | if (!encoded_chars_http_uri_reqd[s[i]]) { |
184 | 723k | if (s[i] == '?') qs = i; |
185 | 723k | } |
186 | 9.79k | else if (s[i]=='%' && li_cton(s[i+1], n1) && li_cton(s[i+2], n2) |
187 | 9.18k | && (encoded_chars_http_uri_reqd[(x = (n1 << 4) | n2)] |
188 | 4.02k | || (qs < 0 |
189 | 4.02k | ? (x == '/' || x == '?') |
190 | 9.11k | : (x == '&' || x == '=' || x == ';' || x == '+')))) { |
191 | 9.11k | invalid_utf8 |= li_utf8_invalid_byte(x); |
192 | 9.11k | if (s[i+1] >= 'a') b->ptr[i+1] &= 0xdf; /* uppercase hex */ |
193 | 9.11k | if (s[i+2] >= 'a') b->ptr[i+2] &= 0xdf; /* uppercase hex */ |
194 | 9.11k | i+=2; |
195 | 9.11k | } |
196 | 680 | else if (s[i] == '#') { /* ignore fragment */ |
197 | 2 | buffer_truncate(b, (size_t)i); |
198 | 2 | break; |
199 | 2 | } |
200 | 678 | else { |
201 | 678 | qs = burl_normalize_basic_required_fix(b, t, i, qs); |
202 | 678 | break; |
203 | 678 | } |
204 | 732k | } |
205 | | |
206 | 884 | return !invalid_utf8 ? qs : -2; |
207 | 884 | } |
208 | | |
209 | | |
210 | | static int burl_contains_ctrls (const buffer *b) |
211 | 345 | { |
212 | 345 | const char * const s = b->ptr; |
213 | 345 | const int used = (int)buffer_clen(b); |
214 | 12.3M | for (int i = 0; i < used; ++i) { |
215 | 12.3M | if (s[i] == '%' && (s[i+1] < '2' || (s[i+1] == '7' && s[i+2] == 'F'))) |
216 | 83 | return 1; |
217 | 12.3M | } |
218 | 262 | return 0; |
219 | 345 | } |
220 | | |
221 | | |
222 | | static void burl_normalize_qs20_to_plus_fix (buffer *b, int i) |
223 | 101 | { |
224 | 101 | char * const s = b->ptr; |
225 | 101 | const int used = (int)buffer_clen(b); |
226 | 101 | int j = i; |
227 | 5.65M | for (; i < used; ++i, ++j) { |
228 | 5.65M | s[j] = s[i]; |
229 | 5.65M | if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') { |
230 | 1.06k | s[j] = '+'; |
231 | 1.06k | i+=2; |
232 | 1.06k | } |
233 | 5.65M | } |
234 | 101 | buffer_truncate(b, j); |
235 | 101 | } |
236 | | |
237 | | |
238 | | static void burl_normalize_qs20_to_plus (buffer *b, int qs) |
239 | 408 | { |
240 | 408 | const char * const s = b->ptr; |
241 | 408 | const int used = qs < 0 ? 0 : (int)buffer_clen(b); |
242 | 408 | int i; |
243 | 408 | if (qs < 0) return; |
244 | 11.1M | for (i = qs+1; i < used; ++i) { |
245 | 11.1M | if (s[i] == '%' && s[i+1] == '2' && s[i+2] == '0') break; |
246 | 11.1M | } |
247 | 408 | if (i != used) burl_normalize_qs20_to_plus_fix(b, i); |
248 | 408 | } |
249 | | |
250 | | |
251 | | static int burl_normalize_2F_to_slash_fix (buffer *b, int qs, int i) |
252 | 196 | { |
253 | 196 | char * const s = b->ptr; |
254 | 196 | const int blen = (int)buffer_clen(b); |
255 | 196 | const int used = qs < 0 ? blen : qs; |
256 | 196 | int j = i; |
257 | 23.7M | for (; i < used; ++i, ++j) { |
258 | 23.7M | s[j] = s[i]; |
259 | 23.7M | if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') { |
260 | 2.38k | s[j] = '/'; |
261 | 2.38k | i+=2; |
262 | 2.38k | } |
263 | 23.7M | } |
264 | 196 | if (qs >= 0) { |
265 | 69 | const int qslen = blen - qs; |
266 | 69 | memmove(s+j, s+qs, (size_t)qslen); |
267 | 69 | qs = j; |
268 | 69 | j += qslen; |
269 | 69 | } |
270 | 196 | buffer_truncate(b, j); |
271 | 196 | return qs; |
272 | 196 | } |
273 | | |
274 | | |
275 | | static int burl_normalize_2F_to_slash (buffer *b, int qs, int flags) |
276 | 1.22k | { |
277 | | /*("%2F" must already have been uppercased during normalization)*/ |
278 | 1.22k | const char * const s = b->ptr; |
279 | 1.22k | const int used = qs < 0 ? (int)buffer_clen(b) : qs; |
280 | 50.0M | for (int i = 0; i < used; ++i) { |
281 | 50.0M | if (s[i] == '%' && s[i+1] == '2' && s[i+2] == 'F') { |
282 | 201 | return (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE) |
283 | 201 | ? burl_normalize_2F_to_slash_fix(b, qs, i) |
284 | 201 | : -2; /*(flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)*/ |
285 | 201 | } |
286 | 50.0M | } |
287 | 1.02k | return qs; |
288 | 1.22k | } |
289 | | |
290 | | |
291 | | static int burl_normalize_path (buffer *b, buffer *t, int qs, int flags) |
292 | 1.16k | { |
293 | 1.16k | const unsigned char * const s = (unsigned char *)b->ptr; |
294 | 1.16k | const int used = (int)buffer_clen(b); |
295 | 1.16k | int path_simplify = 0; |
296 | 20.5k | for (int i = 0, len = qs < 0 ? used : qs; i < len; ++i) { |
297 | 19.7k | if (s[i] == '.' && (s[i+1] != '.' || ++i) |
298 | 12.4k | && (s[i+1] == '/' || s[i+1] == '?' || s[i+1] == '\0')) { |
299 | 233 | path_simplify = 1; |
300 | 233 | break; |
301 | 233 | } |
302 | 46.7M | while (i < len && s[i] != '/') ++i; |
303 | 19.5k | if (s[i] == '/' && s[i+1] == '/') { /*(s[len] != '/')*/ |
304 | 178 | path_simplify = 1; |
305 | 178 | break; |
306 | 178 | } |
307 | 19.5k | } |
308 | | |
309 | 1.16k | if (path_simplify) { |
310 | 411 | if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT) return -2; |
311 | 388 | if (qs >= 0) { |
312 | 149 | buffer_copy_string_len(t, b->ptr+qs, used - qs); |
313 | 149 | buffer_truncate(b, qs); |
314 | 149 | } |
315 | | |
316 | 388 | buffer_path_simplify(b); |
317 | | |
318 | 388 | if (qs >= 0) { |
319 | 149 | qs = (int)buffer_clen(b); |
320 | 149 | buffer_append_string_len(b, BUF_PTR_LEN(t)); |
321 | 149 | } |
322 | 388 | } |
323 | | |
324 | 1.14k | return qs; |
325 | 1.16k | } |
326 | | |
327 | | |
328 | | __attribute_cold__ |
329 | | __attribute_noinline__ |
330 | | __attribute_pure__ |
331 | 437 | static int burl_scan_qmark (const buffer * const b) { |
332 | 437 | const char * const qmark = strchr(b->ptr, '?'); |
333 | 437 | return qmark ? (int)(qmark - b->ptr) : -1; |
334 | 437 | } |
335 | | |
336 | | |
337 | | int burl_normalize (buffer *b, buffer *t, int flags) |
338 | 1.63k | { |
339 | 1.63k | int qs; |
340 | | |
341 | | #if defined(_WIN32) || defined(__CYGWIN__) |
342 | | /* Windows and Cygwin treat '\\' as '/' if '\\' is present in path; |
343 | | * convert to '/' for consistency before percent-encoding |
344 | | * normalization which will convert '\\' to "%5C" in the URL. |
345 | | * (Clients still should not be sending '\\' unencoded in requests.) */ |
346 | | if (flags & HTTP_PARSEOPT_URL_NORMALIZE_PATH_BACKSLASH_TRANS) { |
347 | | for (char *p = b->ptr; *p != '?' && *p != '\0'; ++p) { |
348 | | if (*p == '\\') *p = '/'; |
349 | | } |
350 | | } |
351 | | #endif |
352 | | |
353 | 1.63k | qs = (flags & HTTP_PARSEOPT_URL_NORMALIZE_REQUIRED) |
354 | 1.63k | ? burl_normalize_basic_required(b, t) |
355 | 1.63k | : burl_normalize_basic_unreserved(b, t); |
356 | 1.63k | if (-2 == qs) { |
357 | 532 | if (flags & HTTP_PARSEOPT_URL_NORMALIZE_INVALID_UTF8_REJECT) return -2; |
358 | 437 | qs = burl_scan_qmark(b); |
359 | 437 | } |
360 | | |
361 | 1.54k | if (flags & HTTP_PARSEOPT_URL_NORMALIZE_CTRLS_REJECT) { |
362 | 345 | if (burl_contains_ctrls(b)) return -2; |
363 | 345 | } |
364 | | |
365 | 1.46k | if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_DECODE |
366 | 1.46k | |HTTP_PARSEOPT_URL_NORMALIZE_PATH_2F_REJECT)) { |
367 | 1.22k | qs = burl_normalize_2F_to_slash(b, qs, flags); |
368 | 1.22k | if (-2 == qs) return -2; |
369 | 1.22k | } |
370 | | |
371 | 1.45k | if (flags & (HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REMOVE |
372 | 1.45k | |HTTP_PARSEOPT_URL_NORMALIZE_PATH_DOTSEG_REJECT)) { |
373 | 1.16k | qs = burl_normalize_path(b, t, qs, flags); |
374 | 1.16k | if (-2 == qs) return -2; |
375 | 1.16k | } |
376 | | |
377 | 1.43k | if (flags & HTTP_PARSEOPT_URL_NORMALIZE_QUERY_20_PLUS) { |
378 | 875 | if (qs >= 0) burl_normalize_qs20_to_plus(b, qs); |
379 | 875 | } |
380 | | |
381 | 1.43k | return qs; |
382 | 1.45k | } |
383 | | |
384 | | |
385 | | static void burl_append_encode_nde (buffer * const b, const char * const str, const size_t len) |
386 | 0 | { |
387 | | /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~ |
388 | | * unless already percent-encoded (does not double-encode) */ |
389 | | /* Note: not checking for invalid UTF-8 */ |
390 | 0 | char * const p = buffer_string_prepare_append(b, len*3); |
391 | 0 | unsigned int n1, n2; |
392 | 0 | int j = 0; |
393 | 0 | for (unsigned int i = 0; i < len; ++i, ++j) { |
394 | 0 | if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) { |
395 | 0 | const unsigned int x = (n1 << 4) | n2; |
396 | 0 | if (burl_is_unreserved((int)x)) { |
397 | 0 | p[j] = (char)x; |
398 | 0 | } |
399 | 0 | else { /* leave UTF-8, control chars, and required chars encoded */ |
400 | 0 | p[j] = '%'; |
401 | 0 | p[++j] = str[i+1]; |
402 | 0 | p[++j] = str[i+2]; |
403 | 0 | } |
404 | 0 | i+=2; |
405 | 0 | } |
406 | 0 | else if (burl_is_unreserved(str[i])) { |
407 | 0 | p[j] = str[i]; |
408 | 0 | } |
409 | 0 | else { |
410 | 0 | p[j] = '%'; |
411 | 0 | p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF]; |
412 | 0 | p[++j] = hex_chars_uc[str[i] & 0xF]; |
413 | 0 | } |
414 | 0 | } |
415 | 0 | buffer_commit(b, j); |
416 | 0 | } |
417 | | |
418 | | |
419 | | static void burl_append_encode_psnde (buffer * const b, const char * const str, const size_t len) |
420 | 0 | { |
421 | | /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~ plus / |
422 | | * unless already percent-encoded (does not double-encode) */ |
423 | | /* Note: not checking for invalid UTF-8 */ |
424 | 0 | char * const p = buffer_string_prepare_append(b, len*3); |
425 | 0 | unsigned int n1, n2; |
426 | 0 | int j = 0; |
427 | 0 | for (unsigned int i = 0; i < len; ++i, ++j) { |
428 | 0 | if (str[i]=='%' && li_cton(str[i+1], n1) && li_cton(str[i+2], n2)) { |
429 | 0 | const unsigned int x = (n1 << 4) | n2; |
430 | 0 | if (burl_is_unreserved((int)x)) { |
431 | 0 | p[j] = (char)x; |
432 | 0 | } |
433 | 0 | else { /* leave UTF-8, control chars, and required chars encoded */ |
434 | 0 | p[j] = '%'; |
435 | 0 | p[++j] = str[i+1]; |
436 | 0 | p[++j] = str[i+2]; |
437 | 0 | } |
438 | 0 | i+=2; |
439 | 0 | } |
440 | 0 | else if (burl_is_unreserved(str[i]) || str[i] == '/') { |
441 | 0 | p[j] = str[i]; |
442 | 0 | } |
443 | 0 | else { |
444 | 0 | p[j] = '%'; |
445 | 0 | p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF]; |
446 | 0 | p[++j] = hex_chars_uc[str[i] & 0xF]; |
447 | 0 | } |
448 | 0 | } |
449 | 0 | buffer_commit(b, j); |
450 | 0 | } |
451 | | |
452 | | |
453 | | static void burl_append_encode_all (buffer * const b, const char * const str, const size_t len) |
454 | 0 | { |
455 | | /* percent-encodes everything except unreserved - . 0-9 A-Z _ a-z ~ |
456 | | * Note: double-encodes any existing '%') */ |
457 | | /* Note: not checking for invalid UTF-8 */ |
458 | 0 | char * const p = buffer_string_prepare_append(b, len*3); |
459 | 0 | int j = 0; |
460 | 0 | for (unsigned int i = 0; i < len; ++i, ++j) { |
461 | 0 | if (burl_is_unreserved(str[i])) { |
462 | 0 | p[j] = str[i]; |
463 | 0 | } |
464 | 0 | else { |
465 | 0 | p[j] = '%'; |
466 | 0 | p[++j] = hex_chars_uc[(str[i] >> 4) & 0xF]; |
467 | 0 | p[++j] = hex_chars_uc[str[i] & 0xF]; |
468 | 0 | } |
469 | 0 | } |
470 | 0 | buffer_commit(b, j); |
471 | 0 | } |
472 | | |
473 | | |
474 | | static void burl_offset_tolower (buffer * const b, const size_t off) |
475 | 0 | { |
476 | | /*(skips over all percent-encodings, including encoding of alpha chars)*/ |
477 | 0 | for (char *p = b->ptr+off; p[0]; ++p) { |
478 | 0 | if (light_isupper(p[0])) p[0] |= 0x20; |
479 | 0 | else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2])) |
480 | 0 | p+=2; |
481 | 0 | } |
482 | 0 | } |
483 | | |
484 | | |
485 | | static void burl_offset_toupper (buffer * const b, const size_t off) |
486 | 0 | { |
487 | | /*(skips over all percent-encodings, including encoding of alpha chars)*/ |
488 | 0 | for (char *p = b->ptr+off; p[0]; ++p) { |
489 | 0 | if (light_islower(p[0])) p[0] &= 0xdf; |
490 | 0 | else if (p[0]=='%' && light_isxdigit(p[1]) && light_isxdigit(p[2])) |
491 | 0 | p+=2; |
492 | 0 | } |
493 | 0 | } |
494 | | |
495 | | |
496 | | void burl_append (buffer * const b, const char * const str, const size_t len, const int flags) |
497 | 0 | { |
498 | 0 | size_t off = 0; |
499 | |
|
500 | 0 | if (0 == len) return; |
501 | | |
502 | 0 | if (0 == flags) { |
503 | 0 | buffer_append_string_len(b, str, len); |
504 | 0 | return; |
505 | 0 | } |
506 | | |
507 | 0 | if (flags & (BURL_TOUPPER|BURL_TOLOWER)) off = buffer_clen(b); |
508 | |
|
509 | 0 | if (flags & BURL_ENCODE_NONE) { |
510 | 0 | buffer_append_string_len(b, str, len); |
511 | 0 | } |
512 | 0 | else if (flags & BURL_ENCODE_ALL) { |
513 | 0 | burl_append_encode_all(b, str, len); |
514 | 0 | } |
515 | 0 | else if (flags & BURL_ENCODE_NDE) { |
516 | 0 | burl_append_encode_nde(b, str, len); |
517 | 0 | } |
518 | 0 | else if (flags & BURL_ENCODE_PSNDE) { |
519 | 0 | burl_append_encode_psnde(b, str, len); |
520 | 0 | } |
521 | 0 | else if (flags & BURL_ENCODE_B64U) { |
522 | 0 | const unsigned char *s = (const unsigned char *)str; |
523 | 0 | buffer_append_base64_encode_no_padding(b, s, len, BASE64_URL); |
524 | 0 | } |
525 | 0 | else if (flags & BURL_DECODE_B64U) { |
526 | 0 | buffer_append_base64_decode(b, str, len, BASE64_URL); |
527 | 0 | } |
528 | | |
529 | | /* note: not normalizing str, which could come from arbitrary header, |
530 | | * so it is possible that alpha chars are percent-encoded upper/lowercase */ |
531 | 0 | if (flags & (BURL_TOLOWER|BURL_TOUPPER)) { |
532 | 0 | (flags & BURL_TOLOWER) |
533 | 0 | ? burl_offset_tolower(b, off) /*(flags & BURL_TOLOWER)*/ |
534 | 0 | : burl_offset_toupper(b, off); /*(flags & BURL_TOUPPER)*/ |
535 | 0 | } |
536 | 0 | } |