Line | Count | Source (jump to first uncovered line) |
1 | | /* HTTP support. |
2 | | Copyright (C) 1996-2012, 2014-2015, 2018-2024 Free Software |
3 | | Foundation, Inc. |
4 | | |
5 | | This file is part of GNU Wget. |
6 | | |
7 | | GNU Wget is free software; you can redistribute it and/or modify |
8 | | it under the terms of the GNU General Public License as published by |
9 | | the Free Software Foundation; either version 3 of the License, or |
10 | | (at your option) any later version. |
11 | | |
12 | | GNU Wget is distributed in the hope that it will be useful, |
13 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | GNU General Public License for more details. |
16 | | |
17 | | You should have received a copy of the GNU General Public License |
18 | | along with Wget. If not, see <http://www.gnu.org/licenses/>. |
19 | | |
20 | | Additional permission under GNU GPL version 3 section 7 |
21 | | |
22 | | If you modify this program, or any covered work, by linking or |
23 | | combining it with the OpenSSL project's OpenSSL library (or a |
24 | | modified version of that library), containing parts covered by the |
25 | | terms of the OpenSSL or SSLeay licenses, the Free Software Foundation |
26 | | grants you additional permission to convey the resulting work. |
27 | | Corresponding Source for a non-source form of such a combination |
28 | | shall include the source code for the parts of OpenSSL used as well |
29 | | as that of the covered work. */ |
30 | | |
31 | | #include "wget.h" |
32 | | |
33 | | #include <stdio.h> |
34 | | #include <stdlib.h> |
35 | | #include <string.h> |
36 | | #include <unistd.h> |
37 | | #include <assert.h> |
38 | | #include <errno.h> |
39 | | #include <time.h> |
40 | | #include <locale.h> |
41 | | #include <fcntl.h> |
42 | | |
43 | | #include "hash.h" |
44 | | #include "http.h" |
45 | | #include "hsts.h" |
46 | | #include "utils.h" |
47 | | #include "url.h" |
48 | | #include "host.h" |
49 | | #include "retr.h" |
50 | | #include "connect.h" |
51 | | #include "netrc.h" |
52 | | #ifdef HAVE_SSL |
53 | | # include "ssl.h" |
54 | | #endif |
55 | | #ifdef ENABLE_NTLM |
56 | | # include "http-ntlm.h" |
57 | | #endif |
58 | | #include "cookies.h" |
59 | | #include "md5.h" |
60 | | #include "convert.h" |
61 | | #include "spider.h" |
62 | | #include "warc.h" |
63 | | #include "c-strcase.h" |
64 | | #include "version.h" |
65 | | #include "xstrndup.h" |
66 | | #ifdef HAVE_METALINK |
67 | | # include "metalink.h" |
68 | | #endif |
69 | | #ifdef ENABLE_XATTR |
70 | | #include "xattr.h" |
71 | | #endif |
72 | | |
73 | | #ifdef TESTING |
74 | | #include "../tests/unit-tests.h" |
75 | | #endif |
76 | | |
77 | | #ifdef __VMS |
78 | | # include "vms.h" |
79 | | #endif /* def __VMS */ |
80 | | |
81 | | |
/* Forward decls.  These helpers are defined further down in this
   file but are referenced before their definitions.  */
struct http_stat;
static char *create_authorization_line (const char *, const char *,
                                        const char *, const char *,
                                        const char *, bool *, uerr_t *);
static char *basic_authentication_encode (const char *, const char *);
static bool known_authentication_scheme_p (const char *, const char *);
static void ensure_extension (struct http_stat *, const char *, int *);
static void load_cookies (void);

/* File-scope cookie state.  NOTE(review): presumably maintained by
   load_cookies, whose definition is outside this view -- confirm.  */
static bool cookies_loaded_p;
static struct cookie_jar *wget_cookie_jar;
94 | | |
/* MIME type strings referred to by name elsewhere in this file.  */
#define TEXTHTML_S "text/html"
#define TEXTXHTML_S "application/xhtml+xml"
#define TEXTCSS_S "text/css"
98 | | |
/* Some status code validation macros.  Each takes a numeric status
   code X and evaluates to nonzero when X belongs to the class:
   H_10X - 100..199, H_20X - 200..299, H_PARTIAL - exactly 206,
   H_REDIRECTED - one of 301, 302, 303, 307 or 308.  The argument may
   be evaluated more than once, so it must have no side effects.  */
#define H_10X(x)        (((x) >= 100) && ((x) < 200))
#define H_20X(x)        (((x) >= 200) && ((x) < 300))
#define H_PARTIAL(x)    ((x) == HTTP_STATUS_PARTIAL_CONTENTS)
#define H_REDIRECTED(x) ((x) == HTTP_STATUS_MOVED_PERMANENTLY          \
                         || (x) == HTTP_STATUS_MOVED_TEMPORARILY       \
                         || (x) == HTTP_STATUS_SEE_OTHER               \
                         || (x) == HTTP_STATUS_TEMPORARY_REDIRECT      \
                         || (x) == HTTP_STATUS_PERMANENT_REDIRECT)
108 | | |
/* HTTP status codes, provided for reference.  Most come from
   HTTP/1.0 (RFC1945); the ones added by later protocol revisions are
   marked individually below.  */
/* Successful 2xx.  */
#define HTTP_STATUS_OK                    200
#define HTTP_STATUS_CREATED               201
#define HTTP_STATUS_ACCEPTED              202
#define HTTP_STATUS_NO_CONTENT            204
#define HTTP_STATUS_PARTIAL_CONTENTS      206

/* Redirection 3xx.  */
#define HTTP_STATUS_MULTIPLE_CHOICES      300
#define HTTP_STATUS_MOVED_PERMANENTLY     301
#define HTTP_STATUS_MOVED_TEMPORARILY     302
#define HTTP_STATUS_SEE_OTHER             303 /* from HTTP/1.1 */
#define HTTP_STATUS_NOT_MODIFIED          304
#define HTTP_STATUS_TEMPORARY_REDIRECT    307 /* from HTTP/1.1 */
#define HTTP_STATUS_PERMANENT_REDIRECT    308 /* from HTTP/1.1 */

/* Client error 4xx.  */
#define HTTP_STATUS_BAD_REQUEST           400
#define HTTP_STATUS_UNAUTHORIZED          401
#define HTTP_STATUS_FORBIDDEN             403
#define HTTP_STATUS_NOT_FOUND             404
#define HTTP_STATUS_RANGE_NOT_SATISFIABLE 416

/* Server errors 5xx.  */
#define HTTP_STATUS_INTERNAL              500
#define HTTP_STATUS_NOT_IMPLEMENTED       501
#define HTTP_STATUS_BAD_GATEWAY           502
#define HTTP_STATUS_UNAVAILABLE           503
#define HTTP_STATUS_GATEWAY_TIMEOUT       504
139 | | |
/* Release policy of a request header: which of its NAME and VALUE
   strings release_header frees (neither, NAME only, VALUE only, or
   both).  */
enum rp {
  rel_none, rel_name, rel_value, rel_both
};
143 | | |
/* An HTTP request under construction: the pieces of the request line
   plus a growable array of headers.  */
struct request {
  const char *method;           /* stored by reference; request_free
                                   does not free it */
  char *arg;                    /* freed by request_free */

  struct request_header {
    char *name, *value;
    enum rp release_policy;     /* which of NAME/VALUE release_header
                                   frees */
  } *headers;
  int hcount, hcapacity;        /* headers in use / slots allocated */
};
154 | | |
155 | | |
156 | | /* Create a new, empty request. Set the request's method and its |
157 | | arguments. METHOD should be a literal string (or it should outlive |
158 | | the request) because it will not be freed. ARG will be freed by |
159 | | request_free. */ |
160 | | |
161 | | static struct request * |
162 | | request_new (const char *method, char *arg) |
163 | 0 | { |
164 | 0 | struct request *req = xnew0 (struct request); |
165 | 0 | req->hcapacity = 8; |
166 | 0 | req->headers = xnew_array (struct request_header, req->hcapacity); |
167 | 0 | req->method = method; |
168 | 0 | req->arg = arg; |
169 | 0 | return req; |
170 | 0 | } |
171 | | |
172 | | /* Return the method string passed with the last call to |
173 | | request_set_method. */ |
174 | | |
175 | | static const char * |
176 | | request_method (const struct request *req) |
177 | 0 | { |
178 | 0 | return req->method; |
179 | 0 | } |
180 | | |
181 | | /* Free one header according to the release policy specified with |
182 | | request_set_header. */ |
183 | | |
184 | | static void |
185 | | release_header (struct request_header *hdr) |
186 | 0 | { |
187 | 0 | switch (hdr->release_policy) |
188 | 0 | { |
189 | 0 | case rel_none: |
190 | 0 | break; |
191 | 0 | case rel_name: |
192 | 0 | xfree (hdr->name); |
193 | 0 | break; |
194 | 0 | case rel_value: |
195 | 0 | xfree (hdr->value); |
196 | 0 | break; |
197 | 0 | case rel_both: |
198 | 0 | xfree (hdr->name); |
199 | 0 | xfree (hdr->value); |
200 | 0 | break; |
201 | 0 | } |
202 | 0 | } |
203 | | |
204 | | /* Set the request named NAME to VALUE. Specifically, this means that |
205 | | a "NAME: VALUE\r\n" header line will be used in the request. If a |
206 | | header with the same name previously existed in the request, its |
207 | | value will be replaced by this one. A NULL value means do nothing. |
208 | | |
209 | | RELEASE_POLICY determines whether NAME and VALUE should be released |
210 | | (freed) with request_free. Allowed values are: |
211 | | |
212 | | - rel_none - don't free NAME or VALUE |
213 | | - rel_name - free NAME when done |
214 | | - rel_value - free VALUE when done |
215 | | - rel_both - free both NAME and VALUE when done |
216 | | |
217 | | Setting release policy is useful when arguments come from different |
218 | | sources. For example: |
219 | | |
220 | | // Don't free literal strings! |
221 | | request_set_header (req, "Pragma", "no-cache", rel_none); |
222 | | |
223 | | // Don't free a global variable, we'll need it later. |
224 | | request_set_header (req, "Referer", opt.referer, rel_none); |
225 | | |
226 | | // Value freshly allocated, free it when done. |
227 | | request_set_header (req, "Range", |
228 | | aprintf ("bytes=%s-", number_to_static_string (hs->restval)), |
229 | | rel_value); |
230 | | */ |
231 | | |
232 | | static void |
233 | | request_set_header (struct request *req, const char *name, const char *value, |
234 | | enum rp release_policy) |
235 | 0 | { |
236 | 0 | struct request_header *hdr; |
237 | 0 | int i; |
238 | |
|
239 | 0 | if (!value) |
240 | 0 | { |
241 | | /* A NULL value is a no-op; if freeing the name is requested, |
242 | | free it now to avoid leaks. */ |
243 | 0 | if (release_policy == rel_name || release_policy == rel_both) |
244 | 0 | xfree (name); |
245 | 0 | return; |
246 | 0 | } |
247 | | |
248 | 0 | for (i = 0; i < req->hcount; i++) |
249 | 0 | { |
250 | 0 | hdr = &req->headers[i]; |
251 | 0 | if (0 == c_strcasecmp (name, hdr->name)) |
252 | 0 | { |
253 | | /* Replace existing header. */ |
254 | 0 | release_header (hdr); |
255 | 0 | hdr->name = (void *)name; |
256 | 0 | hdr->value = (void *)value; |
257 | 0 | hdr->release_policy = release_policy; |
258 | 0 | return; |
259 | 0 | } |
260 | 0 | } |
261 | | |
262 | | /* Install new header. */ |
263 | | |
264 | 0 | if (req->hcount >= req->hcapacity) |
265 | 0 | { |
266 | 0 | req->hcapacity <<= 1; |
267 | 0 | req->headers = xrealloc (req->headers, req->hcapacity * sizeof (*hdr)); |
268 | 0 | } |
269 | 0 | hdr = &req->headers[req->hcount++]; |
270 | 0 | hdr->name = (void *)name; |
271 | 0 | hdr->value = (void *)value; |
272 | 0 | hdr->release_policy = release_policy; |
273 | 0 | } |
274 | | |
275 | | /* Like request_set_header, but sets the whole header line, as |
276 | | provided by the user using the `--header' option. For example, |
277 | | request_set_user_header (req, "Foo: bar") works just like |
278 | | request_set_header (req, "Foo", "bar"). */ |
279 | | |
280 | | static void |
281 | | request_set_user_header (struct request *req, const char *header) |
282 | 0 | { |
283 | 0 | const char *name, *p; |
284 | |
|
285 | 0 | if (!(p = strchr (header, ':'))) |
286 | 0 | return; |
287 | | |
288 | 0 | name = xstrndup(header, p - header); |
289 | |
|
290 | 0 | ++p; |
291 | 0 | while (c_isspace (*p)) |
292 | 0 | ++p; |
293 | |
|
294 | 0 | request_set_header (req, name, p, rel_name); |
295 | 0 | } |
296 | | |
297 | | /* Remove the header with specified name from REQ. Returns true if |
298 | | the header was actually removed, false otherwise. */ |
299 | | |
300 | | static bool |
301 | | request_remove_header (struct request *req, const char *name) |
302 | 0 | { |
303 | 0 | int i; |
304 | 0 | for (i = 0; i < req->hcount; i++) |
305 | 0 | { |
306 | 0 | struct request_header *hdr = &req->headers[i]; |
307 | 0 | if (0 == c_strcasecmp (name, hdr->name)) |
308 | 0 | { |
309 | 0 | release_header (hdr); |
310 | | /* Move the remaining headers by one. */ |
311 | 0 | if (i < req->hcount - 1) |
312 | 0 | memmove (hdr, hdr + 1, (req->hcount - i - 1) * sizeof (*hdr)); |
313 | 0 | --req->hcount; |
314 | 0 | return true; |
315 | 0 | } |
316 | 0 | } |
317 | 0 | return false; |
318 | 0 | } |
319 | | |
320 | 0 | #define APPEND(p, str) do { \ |
321 | 0 | int A_len = strlen (str); \ |
322 | 0 | memcpy (p, str, A_len); \ |
323 | 0 | p += A_len; \ |
324 | 0 | } while (0) |
325 | | |
326 | | /* Construct the request and write it to FD using fd_write. |
327 | | If warc_tmp is set to a file pointer, the request string will |
328 | | also be written to that file. */ |
329 | | |
330 | | static int |
331 | | request_send (const struct request *req, int fd, FILE *warc_tmp) |
332 | 0 | { |
333 | 0 | char *request_string, *p; |
334 | 0 | int i, size, write_error; |
335 | | |
336 | | /* Count the request size. */ |
337 | 0 | size = 0; |
338 | | |
339 | | /* METHOD " " ARG " " "HTTP/1.0" "\r\n" */ |
340 | 0 | size += strlen (req->method) + 1 + strlen (req->arg) + 1 + 8 + 2; |
341 | |
|
342 | 0 | for (i = 0; i < req->hcount; i++) |
343 | 0 | { |
344 | 0 | struct request_header *hdr = &req->headers[i]; |
345 | | /* NAME ": " VALUE "\r\n" */ |
346 | 0 | size += strlen (hdr->name) + 2 + strlen (hdr->value) + 2; |
347 | 0 | } |
348 | | |
349 | | /* "\r\n\0" */ |
350 | 0 | size += 3; |
351 | |
|
352 | 0 | p = request_string = xmalloc (size); |
353 | | |
354 | | /* Generate the request. */ |
355 | |
|
356 | 0 | APPEND (p, req->method); *p++ = ' '; |
357 | 0 | APPEND (p, req->arg); *p++ = ' '; |
358 | 0 | memcpy (p, "HTTP/1.1\r\n", 10); p += 10; |
359 | |
|
360 | 0 | for (i = 0; i < req->hcount; i++) |
361 | 0 | { |
362 | 0 | struct request_header *hdr = &req->headers[i]; |
363 | 0 | APPEND (p, hdr->name); |
364 | 0 | *p++ = ':', *p++ = ' '; |
365 | 0 | APPEND (p, hdr->value); |
366 | 0 | *p++ = '\r', *p++ = '\n'; |
367 | 0 | } |
368 | |
|
369 | 0 | *p++ = '\r', *p++ = '\n', *p++ = '\0'; |
370 | 0 | assert (p - request_string == size); |
371 | |
|
372 | 0 | #undef APPEND |
373 | |
|
374 | 0 | DEBUGP (("\n---request begin---\n%s---request end---\n", request_string)); |
375 | | |
376 | | /* Send the request to the server. */ |
377 | |
|
378 | 0 | write_error = fd_write (fd, request_string, size - 1, -1); |
379 | 0 | if (write_error < 0) |
380 | 0 | logprintf (LOG_VERBOSE, _("Failed writing HTTP request: %s.\n"), |
381 | 0 | fd_errstr (fd)); |
382 | 0 | else if (warc_tmp != NULL) |
383 | 0 | { |
384 | | /* Write a copy of the data to the WARC record. */ |
385 | 0 | int warc_tmp_written = fwrite (request_string, 1, size - 1, warc_tmp); |
386 | 0 | if (warc_tmp_written != size - 1) |
387 | 0 | write_error = -2; |
388 | 0 | } |
389 | 0 | xfree (request_string); |
390 | 0 | return write_error; |
391 | 0 | } |
392 | | |
393 | | /* Release the resources used by REQ. |
394 | | It is safe to call it with a valid pointer to a NULL pointer. |
395 | | It is not safe to call it with an invalid or NULL pointer. */ |
396 | | |
397 | | static void |
398 | | request_free (struct request **req_ref) |
399 | 0 | { |
400 | 0 | int i; |
401 | 0 | struct request *req = *req_ref; |
402 | |
|
403 | 0 | if (!req) |
404 | 0 | return; |
405 | | |
406 | 0 | xfree (req->arg); |
407 | 0 | for (i = 0; i < req->hcount; i++) |
408 | 0 | release_header (&req->headers[i]); |
409 | 0 | xfree (req->headers); |
410 | 0 | xfree (req); |
411 | 0 | *req_ref = NULL; |
412 | 0 | } |
413 | | |
414 | | static struct hash_table *basic_authed_hosts; |
415 | | |
416 | | /* Find out if this host has issued a Basic challenge yet; if so, give |
417 | | * it the username, password. A temporary measure until we can get |
418 | | * proper authentication in place. */ |
419 | | |
420 | | static bool |
421 | | maybe_send_basic_creds (const char *hostname, const char *user, |
422 | | const char *passwd, struct request *req) |
423 | 0 | { |
424 | 0 | bool do_challenge = false; |
425 | |
|
426 | 0 | if (opt.auth_without_challenge) |
427 | 0 | { |
428 | 0 | DEBUGP (("Auth-without-challenge set, sending Basic credentials.\n")); |
429 | 0 | do_challenge = true; |
430 | 0 | } |
431 | 0 | else if (basic_authed_hosts |
432 | 0 | && hash_table_contains (basic_authed_hosts, hostname)) |
433 | 0 | { |
434 | 0 | DEBUGP (("Found %s in basic_authed_hosts.\n", quote (hostname))); |
435 | 0 | do_challenge = true; |
436 | 0 | } |
437 | 0 | else |
438 | 0 | { |
439 | 0 | DEBUGP (("Host %s has not issued a general basic challenge.\n", |
440 | 0 | quote (hostname))); |
441 | 0 | } |
442 | 0 | if (do_challenge) |
443 | 0 | { |
444 | 0 | request_set_header (req, "Authorization", |
445 | 0 | basic_authentication_encode (user, passwd), |
446 | 0 | rel_value); |
447 | 0 | } |
448 | 0 | return do_challenge; |
449 | 0 | } |
450 | | |
451 | | static void |
452 | | register_basic_auth_host (const char *hostname) |
453 | 0 | { |
454 | 0 | if (!basic_authed_hosts) |
455 | 0 | { |
456 | 0 | basic_authed_hosts = make_nocase_string_hash_table (1); |
457 | 0 | } |
458 | 0 | if (!hash_table_contains (basic_authed_hosts, hostname)) |
459 | 0 | { |
460 | 0 | hash_table_put (basic_authed_hosts, xstrdup (hostname), NULL); |
461 | 0 | DEBUGP (("Inserted %s into basic_authed_hosts\n", quote (hostname))); |
462 | 0 | } |
463 | 0 | } |
464 | | |
465 | | /* Send the contents of FILE_NAME to SOCK. Make sure that exactly |
466 | | PROMISED_SIZE bytes are sent over the wire -- if the file is |
467 | | longer, read only that much; if the file is shorter, report an error. |
468 | | If warc_tmp is set to a file pointer, the post data will |
469 | | also be written to that file. */ |
470 | | |
471 | | static int |
472 | | body_file_send (int sock, const char *file_name, wgint promised_size, FILE *warc_tmp) |
473 | 0 | { |
474 | 0 | static char chunk[8192]; |
475 | 0 | wgint written = 0; |
476 | 0 | int write_error; |
477 | 0 | FILE *fp; |
478 | |
|
479 | 0 | DEBUGP (("[writing BODY file %s ... ", file_name)); |
480 | |
|
481 | 0 | fp = fopen (file_name, "rb"); |
482 | 0 | if (!fp) |
483 | 0 | return -1; |
484 | 0 | while (!feof (fp) && written < promised_size) |
485 | 0 | { |
486 | 0 | int towrite; |
487 | 0 | int length = fread (chunk, 1, sizeof (chunk), fp); |
488 | 0 | if (length == 0) |
489 | 0 | break; |
490 | 0 | towrite = MIN (promised_size - written, length); |
491 | 0 | write_error = fd_write (sock, chunk, towrite, -1); |
492 | 0 | if (write_error < 0) |
493 | 0 | { |
494 | 0 | fclose (fp); |
495 | 0 | return -1; |
496 | 0 | } |
497 | 0 | if (warc_tmp != NULL) |
498 | 0 | { |
499 | | /* Write a copy of the data to the WARC record. */ |
500 | 0 | int warc_tmp_written = fwrite (chunk, 1, towrite, warc_tmp); |
501 | 0 | if (warc_tmp_written != towrite) |
502 | 0 | { |
503 | 0 | fclose (fp); |
504 | 0 | return -2; |
505 | 0 | } |
506 | 0 | } |
507 | 0 | written += towrite; |
508 | 0 | } |
509 | 0 | fclose (fp); |
510 | | |
511 | | /* If we've written less than was promised, report a (probably |
512 | | nonsensical) error rather than break the promise. */ |
513 | 0 | if (written < promised_size) |
514 | 0 | { |
515 | 0 | errno = EINVAL; |
516 | 0 | return -1; |
517 | 0 | } |
518 | | |
519 | 0 | assert (written == promised_size); |
520 | 0 | DEBUGP (("done]\n")); |
521 | 0 | return 0; |
522 | 0 | } |
523 | | |
/* Look for the empty line that ends a response head in the data
   [START, PEEKED + PEEKLEN).  Return the position right after that
   line, or NULL if there is none yet.  Meant as the terminator
   callback for fd_read_hunk: the bytes between START and PEEKED have
   already been read and cannot be "unread", whereas the PEEKLEN bytes
   from PEEKED on have only been peeked at.

   On the very first peek (START == PEEKED), data that does not begin
   with "HTTP" is taken for a HTTP/0.9 response and START is returned,
   so that nothing is consumed as head.  */

static const char *
response_head_terminator (const char *start, const char *peeked, int peeklen)
{
  const char *limit = peeked + peeklen;
  const char *scan;

  if (start == peeked)
    {
      /* Compare only as much of "HTTP" as has been peeked.  */
      int probe = peeklen < 4 ? peeklen : 4;
      if (memcmp (start, "HTTP", probe) != 0)
        return start;
    }

  /* The terminator is "\n[\r]\n".  Back up two chars from the peeked
     data, because part of it (e.g. "\n\r") may have arrived with the
     previous batch.  */
  scan = (peeked - start >= 2) ? peeked - 2 : start;

  /* Every position with at least three bytes available.  */
  while (scan < limit - 2)
    {
      if (scan[0] == '\n')
        {
          if (scan[1] == '\n')
            return scan + 2;
          if (scan[1] == '\r' && scan[2] == '\n')
            return scan + 3;
        }
      scan++;
    }

  /* SCAN is now two bytes before LIMIT, where only "\n\n" fits.  */
  if (peeklen >= 2 && scan[0] == '\n' && scan[1] == '\n')
    return scan + 2;

  return NULL;
}
563 | | |
564 | | /* The maximum size of a single HTTP response we care to read. Rather |
565 | | than being a limit of the reader implementation, this limit |
566 | | prevents Wget from slurping all available memory upon encountering |
567 | | malicious or buggy server output, thus protecting the user. Define |
568 | | it to 0 to remove the limit. */ |
569 | | |
570 | 0 | #define HTTP_RESPONSE_MAX_SIZE 65536 |
571 | | |
572 | | /* Read the HTTP request head from FD and return it. The error |
573 | | conditions are the same as with fd_read_hunk. |
574 | | |
575 | | To support HTTP/0.9 responses, this function tries to make sure |
576 | | that the data begins with "HTTP". If this is not the case, no data |
577 | | is read and an empty request is returned, so that the remaining |
578 | | data can be treated as body. */ |
579 | | |
580 | | static char * |
581 | | read_http_response_head (int fd) |
582 | 0 | { |
583 | 0 | return fd_read_hunk (fd, response_head_terminator, 512, |
584 | 0 | HTTP_RESPONSE_MAX_SIZE); |
585 | 0 | } |
586 | | |
/* A parsed HTTP response head, as produced by resp_new.  */
struct response {
  /* The response data.  Points at the text given to resp_new; it is
     not a copy.  */
  const char *data;

  /* The array of pointers that indicate where each header starts,
     terminated by a NULL entry.  It is left NULL for a headerless
     (HTTP/0.9) response.  For example, given this HTTP response:

       HTTP/1.0 200 Ok
       Description: some
        text
       Etag: x

     The headers are located like this:

       "HTTP/1.0 200 Ok\r\nDescription: some\r\n text\r\nEtag: x\r\n\r\n"
        ^                  ^                             ^          ^
        headers[0]         headers[1]                    headers[2] headers[3]

     I.e. headers[0] points to the beginning of the response (the
     status line), headers[1] points to the end of the status line
     and the beginning of the first header, etc.  */

  const char **headers;
};
611 | | |
612 | | /* Create a new response object from the text of the HTTP response, |
613 | | available in HEAD. That text is automatically split into |
614 | | constituent header lines for fast retrieval using |
615 | | resp_header_*. */ |
616 | | |
617 | | static struct response * |
618 | | resp_new (char *head) |
619 | 0 | { |
620 | 0 | char *hdr; |
621 | 0 | int count, size; |
622 | |
|
623 | 0 | struct response *resp = xnew0 (struct response); |
624 | 0 | resp->data = head; |
625 | |
|
626 | 0 | if (*head == '\0') |
627 | 0 | { |
628 | | /* Empty head means that we're dealing with a headerless |
629 | | (HTTP/0.9) response. In that case, don't set HEADERS at |
630 | | all. */ |
631 | 0 | return resp; |
632 | 0 | } |
633 | | |
634 | | /* Split HEAD into header lines, so that resp_header_* functions |
635 | | don't need to do this over and over again. */ |
636 | | |
637 | 0 | size = count = 0; |
638 | 0 | hdr = head; |
639 | 0 | while (1) |
640 | 0 | { |
641 | 0 | DO_REALLOC (resp->headers, size, count + 1, const char *); |
642 | 0 | resp->headers[count++] = hdr; |
643 | | |
644 | | /* Break upon encountering an empty line. */ |
645 | 0 | if (!hdr[0] || (hdr[0] == '\r' && hdr[1] == '\n') || hdr[0] == '\n') |
646 | 0 | break; |
647 | | |
648 | | /* Find the end of HDR, including continuations. */ |
649 | 0 | for (;;) |
650 | 0 | { |
651 | 0 | char *end = strchr (hdr, '\n'); |
652 | |
|
653 | 0 | if (!end) |
654 | 0 | { |
655 | 0 | hdr += strlen (hdr); |
656 | 0 | break; |
657 | 0 | } |
658 | | |
659 | 0 | hdr = end + 1; |
660 | |
|
661 | 0 | if (*hdr != ' ' && *hdr != '\t') |
662 | 0 | break; |
663 | | |
664 | | // continuation, transform \r and \n into spaces |
665 | 0 | *end = ' '; |
666 | 0 | if (end > head && end[-1] == '\r') |
667 | 0 | end[-1] = ' '; |
668 | 0 | } |
669 | 0 | } |
670 | 0 | DO_REALLOC (resp->headers, size, count + 1, const char *); |
671 | 0 | resp->headers[count] = NULL; |
672 | |
|
673 | 0 | return resp; |
674 | 0 | } |
675 | | |
676 | | /* Locate the header named NAME in the request data, starting with |
677 | | position START. This allows the code to loop through the request |
678 | | data, filtering for all requests of a given name. Returns the |
679 | | found position, or -1 for failure. The code that uses this |
680 | | function typically looks like this: |
681 | | |
682 | | for (pos = 0; (pos = resp_header_locate (...)) != -1; pos++) |
683 | | ... do something with header ... |
684 | | |
685 | | If you only care about one header, use resp_header_get instead of |
686 | | this function. */ |
687 | | |
688 | | static int |
689 | | resp_header_locate (const struct response *resp, const char *name, int start, |
690 | | const char **begptr, const char **endptr) |
691 | 0 | { |
692 | 0 | int i; |
693 | 0 | const char **headers = resp->headers; |
694 | 0 | int name_len; |
695 | |
|
696 | 0 | if (!headers || !headers[1]) |
697 | 0 | return -1; |
698 | | |
699 | 0 | name_len = strlen (name); |
700 | 0 | if (start > 0) |
701 | 0 | i = start; |
702 | 0 | else |
703 | 0 | i = 1; |
704 | |
|
705 | 0 | for (; headers[i + 1]; i++) |
706 | 0 | { |
707 | 0 | const char *b = headers[i]; |
708 | 0 | const char *e = headers[i + 1]; |
709 | 0 | if (e - b > name_len |
710 | 0 | && b[name_len] == ':' |
711 | 0 | && 0 == c_strncasecmp (b, name, name_len)) |
712 | 0 | { |
713 | 0 | b += name_len + 1; |
714 | 0 | while (b < e && c_isspace (*b)) |
715 | 0 | ++b; |
716 | 0 | while (b < e && c_isspace (e[-1])) |
717 | 0 | --e; |
718 | 0 | *begptr = b; |
719 | 0 | *endptr = e; |
720 | 0 | return i; |
721 | 0 | } |
722 | 0 | } |
723 | 0 | return -1; |
724 | 0 | } |
725 | | |
/* Find the first header named NAME in the response RESP.  If there
   is one, set *BEGPTR to where its value starts and *ENDPTR to where
   it ends, and return true.  Otherwise return false.

   This function is used as a building block for resp_header_copy
   and resp_header_strdup.  */

static bool
resp_header_get (const struct response *resp, const char *name,
                 const char **begptr, const char **endptr)
{
  return resp_header_locate (resp, name, 0, begptr, endptr) != -1;
}
740 | | |
/* Copy the value of the response header named NAME into BUF, which
   has room for BUFSIZE bytes including the terminating 0.  A value
   that does not fit is truncated.  Returns true if the header exists,
   false otherwise.  Use resp_header_strdup instead when the value
   should not be limited in size.

   With a BUFSIZE of 0 nothing is copied and BUF is not touched, but
   the return value still tells whether the header is present.  */

static bool
resp_header_copy (const struct response *resp, const char *name,
                  char *buf, int bufsize)
{
  const char *val, *stop;

  if (!resp_header_get (resp, name, &val, &stop))
    return false;

  if (bufsize != 0)
    {
      /* Leave one byte for the terminator.  */
      const int len = MIN (stop - val, bufsize - 1);
      memcpy (buf, val, len);
      buf[len] = '\0';
    }

  return true;
}
764 | | |
/* Return a copy of the value of the header named NAME in RESP, made
   with strdupdelim.  Return NULL if RESP has no such header.  */

static char *
resp_header_strdup (const struct response *resp, const char *name)
{
  const char *val, *stop;

  return resp_header_get (resp, name, &val, &stop)
         ? strdupdelim (val, stop)
         : NULL;
}
776 | | |
777 | | /* Parse the HTTP status line, which is of format: |
778 | | |
779 | | HTTP-Version SP Status-Code SP Reason-Phrase |
780 | | |
781 | | The function returns the status-code, or -1 if the status line |
782 | | appears malformed. The pointer to "reason-phrase" message is |
783 | | returned in *MESSAGE. */ |
784 | | |
785 | | static int |
786 | | resp_status (const struct response *resp, char **message) |
787 | 0 | { |
788 | 0 | int status; |
789 | 0 | const char *p, *end; |
790 | |
|
791 | 0 | if (!resp->headers) |
792 | 0 | { |
793 | | /* For a HTTP/0.9 response, assume status 200. */ |
794 | 0 | if (message) |
795 | 0 | *message = xstrdup (_("No headers, assuming HTTP/0.9")); |
796 | 0 | return 200; |
797 | 0 | } |
798 | | |
799 | 0 | p = resp->headers[0]; |
800 | 0 | end = resp->headers[1]; |
801 | |
|
802 | 0 | if (!end) |
803 | 0 | return -1; |
804 | | |
805 | | /* "HTTP" */ |
806 | 0 | if (end - p < 4 || 0 != strncmp (p, "HTTP", 4)) |
807 | 0 | return -1; |
808 | 0 | p += 4; |
809 | | |
810 | | /* Match the HTTP version. This is optional because Gnutella |
811 | | servers have been reported to not specify HTTP version. */ |
812 | 0 | if (p < end && *p == '/') |
813 | 0 | { |
814 | 0 | ++p; |
815 | 0 | while (p < end && c_isdigit (*p)) |
816 | 0 | ++p; |
817 | 0 | if (p < end && *p == '.') |
818 | 0 | ++p; |
819 | 0 | while (p < end && c_isdigit (*p)) |
820 | 0 | ++p; |
821 | 0 | } |
822 | |
|
823 | 0 | while (p < end && c_isspace (*p)) |
824 | 0 | ++p; |
825 | 0 | if (end - p < 3 || !c_isdigit (p[0]) || !c_isdigit (p[1]) || !c_isdigit (p[2])) |
826 | 0 | return -1; |
827 | | |
828 | 0 | status = 100 * (p[0] - '0') + 10 * (p[1] - '0') + (p[2] - '0'); |
829 | 0 | p += 3; |
830 | |
|
831 | 0 | if (message) |
832 | 0 | { |
833 | 0 | while (p < end && c_isspace (*p)) |
834 | 0 | ++p; |
835 | 0 | while (p < end && c_isspace (end[-1])) |
836 | 0 | --end; |
837 | 0 | *message = strdupdelim (p, end); |
838 | 0 | } |
839 | |
|
840 | 0 | return status; |
841 | 0 | } |
842 | | |
843 | | /* Release the resources used by RESP. |
844 | | It is safe to call it with a valid pointer to a NULL pointer. |
845 | | It is not safe to call it with a invalid or NULL pointer. */ |
846 | | |
847 | | static void |
848 | | resp_free (struct response **resp_ref) |
849 | 0 | { |
850 | 0 | struct response *resp = *resp_ref; |
851 | |
|
852 | 0 | if (!resp) |
853 | 0 | return; |
854 | | |
855 | 0 | xfree (resp->headers); |
856 | 0 | xfree (resp); |
857 | |
|
858 | 0 | *resp_ref = NULL; |
859 | 0 | } |
860 | | |
861 | | /* Print a single line of response, the characters [b, e). We tried |
862 | | getting away with |
863 | | logprintf (LOG_VERBOSE, "%s%.*s\n", prefix, (int) (e - b), b); |
864 | | but that failed to escape the non-printable characters and, in fact, |
865 | | caused crashes in UTF-8 locales. */ |
866 | | |
867 | | static void |
868 | | print_response_line (const char *prefix, const char *b, const char *e) |
869 | 0 | { |
870 | 0 | char buf[1024], *copy; |
871 | 0 | size_t len = e - b; |
872 | |
|
873 | 0 | if (len < sizeof (buf)) |
874 | 0 | copy = buf; |
875 | 0 | else |
876 | 0 | copy = xmalloc(len + 1); |
877 | |
|
878 | 0 | memcpy(copy, b, len); |
879 | 0 | copy[len] = 0; |
880 | |
|
881 | 0 | logprintf (LOG_ALWAYS, "%s%s\n", prefix, |
882 | 0 | quotearg_style (escape_quoting_style, copy)); |
883 | |
|
884 | 0 | if (copy != buf) |
885 | 0 | xfree (copy); |
886 | 0 | } |
887 | | |
888 | | /* Print the server response, line by line, omitting the trailing CRLF |
889 | | from individual header lines, and prefixed with PREFIX. */ |
890 | | |
891 | | static void |
892 | | print_server_response (const struct response *resp, const char *prefix) |
893 | 0 | { |
894 | 0 | int i; |
895 | 0 | if (!resp->headers) |
896 | 0 | return; |
897 | 0 | for (i = 0; resp->headers[i + 1]; i++) |
898 | 0 | { |
899 | 0 | const char *b = resp->headers[i]; |
900 | 0 | const char *e = resp->headers[i + 1]; |
901 | | /* Skip CRLF */ |
902 | 0 | if (b < e && e[-1] == '\n') |
903 | 0 | --e; |
904 | 0 | if (b < e && e[-1] == '\r') |
905 | 0 | --e; |
906 | 0 | print_response_line (prefix, b, e); |
907 | 0 | } |
908 | 0 | } |
909 | | |
910 | | /* Parse the `Content-Range' header and extract the information it |
911 | | contains. Returns true if successful, false otherwise. */ |
912 | | static bool |
913 | | parse_content_range (const char *hdr, wgint *first_byte_ptr, |
914 | | wgint *last_byte_ptr, wgint *entity_length_ptr) |
915 | 0 | { |
916 | 0 | wgint num; |
917 | | |
918 | | /* Ancient versions of Netscape proxy server, presumably predating |
919 | | rfc2068, sent out `Content-Range' without the "bytes" |
920 | | specifier. */ |
921 | 0 | if (0 == strncasecmp (hdr, "bytes", 5)) |
922 | 0 | { |
923 | 0 | hdr += 5; |
924 | | /* "JavaWebServer/1.1.1" sends "bytes: x-y/z", contrary to the |
925 | | HTTP spec. */ |
926 | 0 | if (*hdr == ':') |
927 | 0 | ++hdr; |
928 | 0 | while (c_isspace (*hdr)) |
929 | 0 | ++hdr; |
930 | 0 | if (!*hdr) |
931 | 0 | return false; |
932 | 0 | } |
933 | 0 | if (!c_isdigit (*hdr)) |
934 | 0 | return false; |
935 | 0 | for (num = 0; c_isdigit (*hdr); hdr++) |
936 | 0 | num = 10 * num + (*hdr - '0'); |
937 | 0 | if (*hdr != '-' || !c_isdigit (*(hdr + 1))) |
938 | 0 | return false; |
939 | 0 | *first_byte_ptr = num; |
940 | 0 | ++hdr; |
941 | 0 | for (num = 0; c_isdigit (*hdr); hdr++) |
942 | 0 | num = 10 * num + (*hdr - '0'); |
943 | 0 | if (*hdr != '/') |
944 | 0 | return false; |
945 | 0 | *last_byte_ptr = num; |
946 | 0 | if (!(c_isdigit (*(hdr + 1)) || *(hdr + 1) == '*')) |
947 | 0 | return false; |
948 | 0 | if (*last_byte_ptr < *first_byte_ptr) |
949 | 0 | return false; |
950 | 0 | ++hdr; |
951 | 0 | if (*hdr == '*') |
952 | 0 | num = -1; |
953 | 0 | else |
954 | 0 | for (num = 0; c_isdigit (*hdr); hdr++) |
955 | 0 | num = 10 * num + (*hdr - '0'); |
956 | 0 | *entity_length_ptr = num; |
957 | 0 | if ((*entity_length_ptr <= *last_byte_ptr) && *entity_length_ptr != -1) |
958 | 0 | return false; |
959 | 0 | return true; |
960 | 0 | } |
961 | | |
962 | | /* Read the body of the response, but don't store it anywhere and don't |
963 | | display a progress gauge. This is useful for reading the bodies of |
964 | | administrative responses to which we will soon issue another |
965 | | request. The response is not useful to the user, but reading it |
966 | | allows us to continue using the same connection to the server. |
967 | | |
968 | | If reading fails, false is returned, true otherwise. In debug |
969 | | mode, the body is displayed for debugging purposes. */ |
970 | | |
971 | | static bool |
972 | | skip_short_body (int fd, wgint contlen, bool chunked) |
973 | 0 | { |
974 | 0 | enum { |
975 | 0 | SKIP_SIZE = 512, /* size of the download buffer */ |
976 | 0 | SKIP_THRESHOLD = 4096 /* the largest size we read */ |
977 | 0 | }; |
978 | 0 | wgint remaining_chunk_size = 0; |
979 | 0 | char dlbuf[SKIP_SIZE + 1]; |
980 | 0 | dlbuf[SKIP_SIZE] = '\0'; /* so DEBUGP can safely print it */ |
981 | | |
982 | | /* If the body is too large, it makes more sense to simply close the |
983 | | connection than to try to read the body. */ |
984 | 0 | if (contlen > SKIP_THRESHOLD) |
985 | 0 | return false; |
986 | | |
987 | 0 | while (contlen > 0 || chunked) |
988 | 0 | { |
989 | 0 | int ret; |
990 | 0 | if (chunked) |
991 | 0 | { |
992 | 0 | if (remaining_chunk_size == 0) |
993 | 0 | { |
994 | 0 | char *line = fd_read_line (fd); |
995 | 0 | char *endl; |
996 | 0 | if (line == NULL) |
997 | 0 | break; |
998 | | |
999 | 0 | remaining_chunk_size = strtol (line, &endl, 16); |
1000 | 0 | xfree (line); |
1001 | |
|
1002 | 0 | if (remaining_chunk_size < 0) |
1003 | 0 | return false; |
1004 | | |
1005 | 0 | if (remaining_chunk_size == 0) |
1006 | 0 | { |
1007 | 0 | line = fd_read_line (fd); |
1008 | 0 | xfree (line); |
1009 | 0 | break; |
1010 | 0 | } |
1011 | 0 | } |
1012 | | |
1013 | 0 | contlen = MIN (remaining_chunk_size, SKIP_SIZE); |
1014 | 0 | } |
1015 | | |
1016 | 0 | DEBUGP (("Skipping %s bytes of body: [", number_to_static_string (contlen))); |
1017 | |
|
1018 | 0 | ret = fd_read (fd, dlbuf, MIN (contlen, SKIP_SIZE), -1); |
1019 | 0 | if (ret <= 0) |
1020 | 0 | { |
1021 | | /* Don't normally report the error since this is an |
1022 | | optimization that should be invisible to the user. */ |
1023 | 0 | DEBUGP (("] aborting (%s).\n", |
1024 | 0 | ret < 0 ? fd_errstr (fd) : "EOF received")); |
1025 | 0 | return false; |
1026 | 0 | } |
1027 | 0 | contlen -= ret; |
1028 | |
|
1029 | 0 | if (chunked) |
1030 | 0 | { |
1031 | 0 | remaining_chunk_size -= ret; |
1032 | 0 | if (remaining_chunk_size == 0) |
1033 | 0 | { |
1034 | 0 | char *line = fd_read_line (fd); |
1035 | 0 | if (line == NULL) |
1036 | 0 | return false; |
1037 | 0 | else |
1038 | 0 | xfree (line); |
1039 | 0 | } |
1040 | 0 | } |
1041 | | |
1042 | | /* Safe even if %.*s bogusly expects terminating \0 because |
1043 | | we've zero-terminated dlbuf above. */ |
1044 | 0 | DEBUGP (("%.*s", ret, dlbuf)); |
1045 | 0 | } |
1046 | | |
1047 | 0 | DEBUGP (("] done.\n")); |
1048 | 0 | return true; |
1049 | 0 | } |
1050 | | |
1051 | 31.9k | #define NOT_RFC2231 0 |
1052 | 969 | #define RFC2231_NOENCODING 1 |
1053 | 21.0k | #define RFC2231_ENCODING 2 |
1054 | | |
1055 | | /* extract_param extracts the parameter name into NAME. |
1056 | | However, if the parameter name is in RFC2231 format then |
1057 | | this function adjusts NAME by stripping off the trailing |
1058 | | characters that are not part of the name but are present to |
1059 | | indicate the presence of encoding information in the value |
1060 | | or a fragment of a long parameter value |
1061 | | */ |
1062 | | static int |
1063 | | modify_param_name (param_token *name) |
1064 | 18.7k | { |
1065 | 18.7k | const char *delim1 = memchr (name->b, '*', name->e - name->b); |
1066 | 18.7k | const char *delim2 = memrchr (name->b, '*', name->e - name->b); |
1067 | | |
1068 | 18.7k | int result; |
1069 | | |
1070 | 18.7k | if (delim1 == NULL) |
1071 | 13.2k | { |
1072 | 13.2k | result = NOT_RFC2231; |
1073 | 13.2k | } |
1074 | 5.51k | else if (delim1 == delim2) |
1075 | 4.54k | { |
1076 | 4.54k | if ((name->e - 1) == delim1) |
1077 | 3.57k | { |
1078 | 3.57k | result = RFC2231_ENCODING; |
1079 | 3.57k | } |
1080 | 969 | else |
1081 | 969 | { |
1082 | 969 | result = RFC2231_NOENCODING; |
1083 | 969 | } |
1084 | 4.54k | name->e = delim1; |
1085 | 4.54k | } |
1086 | 969 | else |
1087 | 969 | { |
1088 | 969 | name->e = delim1; |
1089 | 969 | result = RFC2231_ENCODING; |
1090 | 969 | } |
1091 | 18.7k | return result; |
1092 | 18.7k | } |
1093 | | |
1094 | | /* extract_param extracts the parameter value into VALUE. |
1095 | | Like modify_param_name this function modifies VALUE by |
1096 | | stripping off the encoding information from the actual value |
1097 | | */ |
1098 | | static void |
1099 | | modify_param_value (param_token *value, int encoding_type ) |
1100 | 5.51k | { |
1101 | 5.51k | if (encoding_type == RFC2231_ENCODING) |
1102 | 4.54k | { |
1103 | 4.54k | const char *delim = memrchr (value->b, '\'', value->e - value->b); |
1104 | 4.54k | if (delim != NULL) |
1105 | 969 | { |
1106 | 969 | value->b = (delim+1); |
1107 | 969 | } |
1108 | 4.54k | } |
1109 | 5.51k | } |
1110 | | |
1111 | | /* Extract a parameter from the string (typically an HTTP header) at |
1112 | | **SOURCE and advance SOURCE to the next parameter. Return false |
1113 | | when there are no more parameters to extract. The name of the |
1114 | | parameter is returned in NAME, and the value in VALUE. If the |
1115 | | parameter has no value, the token's value is zeroed out. |
1116 | | |
1117 | | For example, if *SOURCE points to the string "attachment; |
1118 | | filename=\"foo bar\"", the first call to this function will return |
1119 | | the token named "attachment" and no value, and the second call will |
1120 | | return the token named "filename" and value "foo bar". The third |
1121 | | call will return false, indicating no more valid tokens. |
1122 | | |
1123 | | is_url_encoded is an out parameter. If not NULL, a boolean value will be |
1124 | | stored into it, letting the caller know whether or not the extracted value is |
1125 | | URL-encoded. The caller can then decode it with url_unescape(), which however |
1126 | | performs decoding in-place. URL-encoding is used by RFC 2231 to support |
1127 | | non-US-ASCII characters in HTTP header values. */ |
1128 | | |
1129 | | bool |
1130 | | extract_param (const char **source, param_token *name, param_token *value, |
1131 | | char separator, bool *is_url_encoded) |
1132 | 33.9k | { |
1133 | 33.9k | const char *p = *source; |
1134 | 33.9k | int param_type; |
1135 | 33.9k | if (is_url_encoded) |
1136 | 0 | *is_url_encoded = false; /* initializing the out parameter */ |
1137 | | |
1138 | 39.7k | while (c_isspace (*p)) ++p; |
1139 | 33.9k | if (!*p) |
1140 | 2.44k | { |
1141 | 2.44k | *source = p; |
1142 | 2.44k | return false; /* no error; nothing more to extract */ |
1143 | 2.44k | } |
1144 | | |
1145 | | /* Extract name. */ |
1146 | 31.4k | name->b = p; |
1147 | 277k | while (*p && !c_isspace (*p) && *p != '=' && *p != separator) ++p; |
1148 | 31.4k | name->e = p; |
1149 | 31.4k | if (name->b == name->e) |
1150 | 84 | return false; /* empty name: error */ |
1151 | 33.8k | while (c_isspace (*p)) ++p; |
1152 | 31.3k | if (*p == separator || !*p) /* no value */ |
1153 | 12.3k | { |
1154 | 12.3k | xzero (*value); |
1155 | 12.3k | if (*p == separator) ++p; |
1156 | 12.3k | *source = p; |
1157 | 12.3k | return true; |
1158 | 12.3k | } |
1159 | 18.9k | if (*p != '=') |
1160 | 186 | return false; /* error */ |
1161 | | |
1162 | | /* *p is '=', extract value */ |
1163 | 18.7k | ++p; |
1164 | 19.7k | while (c_isspace (*p)) ++p; |
1165 | 18.7k | if (*p == '"') /* quoted */ |
1166 | 1.65k | { |
1167 | 1.65k | value->b = ++p; |
1168 | 4.43k | while (*p && *p != '"') ++p; |
1169 | 1.65k | if (!*p) |
1170 | 66 | return false; |
1171 | 1.59k | value->e = p++; |
1172 | | /* Currently at closing quote; find the end of param. */ |
1173 | 2.56k | while (c_isspace (*p)) ++p; |
1174 | 2.63k | while (*p && *p != separator) ++p; |
1175 | 1.59k | if (*p == separator) |
1176 | 1.45k | ++p; |
1177 | 138 | else if (*p) |
1178 | | /* garbage after closed quote, e.g. foo="bar"baz */ |
1179 | 0 | return false; |
1180 | 1.59k | } |
1181 | 17.1k | else /* unquoted */ |
1182 | 17.1k | { |
1183 | 17.1k | value->b = p; |
1184 | 84.5k | while (*p && *p != separator) ++p; |
1185 | 17.1k | value->e = p; |
1186 | 19.8k | while (value->e != value->b && c_isspace (value->e[-1])) |
1187 | 2.72k | --value->e; |
1188 | 17.1k | if (*p == separator) ++p; |
1189 | 17.1k | } |
1190 | 18.7k | *source = p; |
1191 | | |
1192 | 18.7k | param_type = modify_param_name (name); |
1193 | 18.7k | if (param_type != NOT_RFC2231) |
1194 | 5.51k | { |
1195 | 5.51k | if (param_type == RFC2231_ENCODING && is_url_encoded) |
1196 | 0 | *is_url_encoded = true; |
1197 | 5.51k | modify_param_value (value, param_type); |
1198 | 5.51k | } |
1199 | 18.7k | return true; |
1200 | 18.7k | } |
1201 | | |
1202 | | #undef NOT_RFC2231 |
1203 | | #undef RFC2231_NOENCODING |
1204 | | #undef RFC2231_ENCODING |
1205 | | |
1206 | | /* Appends the string represented by VALUE to FILENAME */ |
1207 | | |
1208 | | static void |
1209 | | append_value_to_filename (char **filename, param_token const * const value, |
1210 | | bool is_url_encoded) |
1211 | 0 | { |
1212 | 0 | int original_length = strlen (*filename); |
1213 | 0 | int new_length = strlen (*filename) + (value->e - value->b); |
1214 | 0 | *filename = xrealloc (*filename, new_length+1); |
1215 | 0 | memcpy (*filename + original_length, value->b, (value->e - value->b)); |
1216 | 0 | (*filename)[new_length] = '\0'; |
1217 | 0 | if (is_url_encoded) |
1218 | 0 | url_unescape (*filename + original_length); |
1219 | 0 | } |
1220 | | |
1221 | | /* Parse the contents of the `Content-Disposition' header, extracting |
1222 | | the information useful to Wget. Content-Disposition is a header |
1223 | | borrowed from MIME; when used in HTTP, it typically serves for |
1224 | | specifying the desired file name of the resource. For example: |
1225 | | |
1226 | | Content-Disposition: attachment; filename="flora.jpg" |
1227 | | |
1228 | | Wget will skip the tokens it doesn't care about, such as |
1229 | | "attachment" in the previous example; it will also skip other |
1230 | | unrecognized params. If the header is syntactically correct and |
1231 | | contains a file name, a copy of the file name is stored in |
1232 | | *filename and true is returned. Otherwise, the function returns |
1233 | | false. |
1234 | | |
1235 | | The file name is stripped of directory components and must not be |
1236 | | empty. |
1237 | | |
1238 | | Historically, this function returned filename prefixed with opt.dir_prefix, |
1239 | | now that logic is handled by the caller, new code should pay attention, |
1240 | | changed by crq, Sep 2010. |
1241 | | |
1242 | | */ |
1243 | | static bool |
1244 | | parse_content_disposition (const char *hdr, char **filename) |
1245 | 0 | { |
1246 | 0 | param_token name, value; |
1247 | 0 | bool is_url_encoded = false; |
1248 | |
|
1249 | 0 | char *encodedFilename = NULL; |
1250 | 0 | char *unencodedFilename = NULL; |
1251 | 0 | for ( ; extract_param (&hdr, &name, &value, ';', &is_url_encoded); |
1252 | 0 | is_url_encoded = false) |
1253 | 0 | { |
1254 | 0 | int isFilename = BOUNDED_EQUAL_NO_CASE (name.b, name.e, "filename"); |
1255 | 0 | if ( isFilename && value.b != NULL) |
1256 | 0 | { |
1257 | | /* Make the file name begin at the last slash or backslash. */ |
1258 | 0 | bool isEncodedFilename; |
1259 | 0 | char **outFilename; |
1260 | 0 | const char *last_slash = memrchr (value.b, '/', value.e - value.b); |
1261 | 0 | const char *last_bs = memrchr (value.b, '\\', value.e - value.b); |
1262 | 0 | if (last_slash && last_bs) |
1263 | 0 | value.b = 1 + MAX (last_slash, last_bs); |
1264 | 0 | else if (last_slash || last_bs) |
1265 | 0 | value.b = 1 + (last_slash ? last_slash : last_bs); |
1266 | 0 | if (value.b == value.e) |
1267 | 0 | continue; |
1268 | | |
1269 | | /* Check if the name is "filename*" as specified in RFC 6266. |
1270 | | * Since "filename" could be broken up as "filename*N" (RFC 2231), |
1271 | | * a check is needed to make sure this is not the case */ |
1272 | 0 | isEncodedFilename = *name.e == '*' && !c_isdigit (*(name.e + 1)); |
1273 | 0 | outFilename = isEncodedFilename ? &encodedFilename |
1274 | 0 | : &unencodedFilename; |
1275 | 0 | if (*outFilename) |
1276 | 0 | append_value_to_filename (outFilename, &value, is_url_encoded); |
1277 | 0 | else |
1278 | 0 | { |
1279 | 0 | *outFilename = strdupdelim (value.b, value.e); |
1280 | 0 | if (is_url_encoded) |
1281 | 0 | url_unescape (*outFilename); |
1282 | 0 | } |
1283 | 0 | } |
1284 | 0 | } |
1285 | 0 | if (encodedFilename) |
1286 | 0 | { |
1287 | 0 | xfree (unencodedFilename); |
1288 | 0 | *filename = encodedFilename; |
1289 | 0 | } |
1290 | 0 | else |
1291 | 0 | { |
1292 | 0 | xfree (encodedFilename); |
1293 | 0 | *filename = unencodedFilename; |
1294 | 0 | } |
1295 | 0 | if (*filename) |
1296 | 0 | return true; |
1297 | 0 | else |
1298 | 0 | return false; |
1299 | 0 | } |
1300 | | |
1301 | | #ifdef HAVE_HSTS |
1302 | | static bool |
1303 | | parse_strict_transport_security (const char *header, int64_t *max_age, bool *include_subdomains) |
1304 | 0 | { |
1305 | 0 | param_token name, value; |
1306 | 0 | const char *c_max_age = NULL; |
1307 | 0 | bool is = false; /* includeSubDomains */ |
1308 | 0 | bool is_url_encoded = false; |
1309 | 0 | bool success = false; |
1310 | |
|
1311 | 0 | if (header) |
1312 | 0 | { |
1313 | | /* Process the STS header. Keys should be matched case-insensitively. */ |
1314 | 0 | for (; extract_param (&header, &name, &value, ';', &is_url_encoded); is_url_encoded = false) |
1315 | 0 | { |
1316 | 0 | if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "max-age")) |
1317 | 0 | { |
1318 | 0 | xfree (c_max_age); |
1319 | 0 | c_max_age = strdupdelim (value.b, value.e); |
1320 | 0 | } |
1321 | 0 | else if (BOUNDED_EQUAL_NO_CASE (name.b, name.e, "includeSubDomains")) |
1322 | 0 | is = true; |
1323 | 0 | } |
1324 | | |
1325 | | /* pass the parsed values over */ |
1326 | 0 | if (c_max_age) |
1327 | 0 | { |
1328 | | /* If the string value goes out of a long long's bounds, strtoll() will return LLONG_MIN or LLONG_MAX. |
1329 | | * In theory, the HSTS engine should be able to handle it. |
1330 | | * Also, time_t is normally defined as a long, so this should not break. |
1331 | | */ |
1332 | 0 | if (max_age) |
1333 | 0 | *max_age = (int64_t) strtoll (c_max_age, NULL, 10); |
1334 | 0 | if (include_subdomains) |
1335 | 0 | *include_subdomains = is; |
1336 | |
|
1337 | 0 | DEBUGP (("Parsed Strict-Transport-Security max-age = %s, includeSubDomains = %s\n", |
1338 | 0 | c_max_age, (is ? "true" : "false"))); |
1339 | |
|
1340 | 0 | xfree (c_max_age); |
1341 | 0 | success = true; |
1342 | 0 | } |
1343 | 0 | else |
1344 | 0 | { |
1345 | | /* something weird happened */ |
1346 | 0 | logprintf (LOG_VERBOSE, "Could not parse Strict-Transport-Security header\n"); |
1347 | 0 | success = false; |
1348 | 0 | } |
1349 | 0 | } |
1350 | |
|
1351 | 0 | return success; |
1352 | 0 | } |
1353 | | #endif |
1354 | | |
1355 | | /* Persistent connections. Currently, we cache the most recently used |
1356 | | connection as persistent, provided that the HTTP server agrees to |
1357 | | make it such. The persistence data is stored in the variables |
1358 | | below. Ideally, it should be possible to cache an arbitrary fixed |
1359 | | number of these connections. */ |
1360 | | |
1361 | | /* Whether a persistent connection is active. */ |
1362 | | static bool pconn_active; |
1363 | | |
1364 | | static struct { |
1365 | | /* The socket of the connection. */ |
1366 | | int socket; |
1367 | | |
1368 | | /* Host and port of the currently active persistent connection. */ |
1369 | | char *host; |
1370 | | int port; |
1371 | | |
1372 | | /* Whether a ssl handshake has occurred on this connection. */ |
1373 | | bool ssl; |
1374 | | |
1375 | | /* Whether the connection was authorized. This is only done by |
1376 | | NTLM, which authorizes *connections* rather than individual |
1377 | | requests. (That practice is peculiar for HTTP, but it is a |
1378 | | useful optimization.) */ |
1379 | | bool authorized; |
1380 | | |
1381 | | #ifdef ENABLE_NTLM |
1382 | | /* NTLM data of the current connection. */ |
1383 | | struct ntlmdata ntlm; |
1384 | | #endif |
1385 | | } pconn; |
1386 | | |
1387 | | /* Mark the persistent connection as invalid and free the resources it |
1388 | | uses. This is used by the CLOSE_* macros after they forcefully |
1389 | | close a registered persistent connection. */ |
1390 | | |
1391 | | static void |
1392 | | invalidate_persistent (void) |
1393 | 0 | { |
1394 | 0 | DEBUGP (("Disabling further reuse of socket %d.\n", pconn.socket)); |
1395 | 0 | pconn_active = false; |
1396 | 0 | fd_close (pconn.socket); |
1397 | 0 | xfree (pconn.host); |
1398 | 0 | xzero (pconn); |
1399 | 0 | } |
1400 | | |
1401 | | /* Register FD, which should be a TCP/IP connection to HOST:PORT, as |
1402 | | persistent. This will enable someone to use the same connection |
1403 | | later. In the context of HTTP, this must be called only AFTER the |
1404 | | response has been received and the server has promised that the |
1405 | | connection will remain alive. |
1406 | | |
1407 | | If a previous connection was persistent, it is closed. */ |
1408 | | |
1409 | | static void |
1410 | | register_persistent (const char *host, int port, int fd, bool ssl) |
1411 | 0 | { |
1412 | 0 | if (pconn_active) |
1413 | 0 | { |
1414 | 0 | if (pconn.socket == fd) |
1415 | 0 | { |
1416 | | /* The connection FD is already registered. */ |
1417 | 0 | return; |
1418 | 0 | } |
1419 | 0 | else |
1420 | 0 | { |
1421 | | /* The old persistent connection is still active; close it |
1422 | | first. This situation arises whenever a persistent |
1423 | | connection exists, but we then connect to a different |
1424 | | host, and try to register a persistent connection to that |
1425 | | one. */ |
1426 | 0 | invalidate_persistent (); |
1427 | 0 | } |
1428 | 0 | } |
1429 | | |
1430 | 0 | pconn_active = true; |
1431 | 0 | pconn.socket = fd; |
1432 | 0 | pconn.host = xstrdup (host); |
1433 | 0 | pconn.port = port; |
1434 | 0 | pconn.ssl = ssl; |
1435 | 0 | pconn.authorized = false; |
1436 | |
|
1437 | 0 | DEBUGP (("Registered socket %d for persistent reuse.\n", fd)); |
1438 | 0 | } |
1439 | | |
1440 | | /* Return true if a persistent connection is available for connecting |
1441 | | to HOST:PORT. */ |
1442 | | |
1443 | | static bool |
1444 | | persistent_available_p (const char *host, int port, bool ssl, |
1445 | | bool *host_lookup_failed) |
1446 | 0 | { |
1447 | | /* First, check whether a persistent connection is active at all. */ |
1448 | 0 | if (!pconn_active) |
1449 | 0 | return false; |
1450 | | |
1451 | | /* If we want SSL and the last connection wasn't or vice versa, |
1452 | | don't use it. Checking for host and port is not enough because |
1453 | | HTTP and HTTPS can apparently coexist on the same port. */ |
1454 | 0 | if (ssl != pconn.ssl) |
1455 | 0 | return false; |
1456 | | |
1457 | | /* If we're not connecting to the same port, we're not interested. */ |
1458 | 0 | if (port != pconn.port) |
1459 | 0 | return false; |
1460 | | |
1461 | | /* If the host is the same, we're in business. If not, there is |
1462 | | still hope -- read below. */ |
1463 | 0 | if (0 != strcasecmp (host, pconn.host)) |
1464 | 0 | { |
1465 | | /* Check if pconn.socket is talking to HOST under another name. |
1466 | | This happens often when both sites are virtual hosts |
1467 | | distinguished only by name and served by the same network |
1468 | | interface, and hence the same web server (possibly set up by |
1469 | | the ISP and serving many different web sites). This |
1470 | | admittedly unconventional optimization does not contradict |
1471 | | HTTP and works well with popular server software. */ |
1472 | |
|
1473 | 0 | bool found; |
1474 | 0 | ip_address ip; |
1475 | 0 | struct address_list *al; |
1476 | |
|
1477 | 0 | if (ssl) |
1478 | | /* Don't try to talk to two different SSL sites over the same |
1479 | | secure connection! (Besides, it's not clear that |
1480 | | name-based virtual hosting is even possible with SSL.) */ |
1481 | 0 | return false; |
1482 | | |
1483 | | /* If pconn.socket's peer is one of the IP addresses HOST |
1484 | | resolves to, pconn.socket is for all intents and purposes |
1485 | | already talking to HOST. */ |
1486 | | |
1487 | 0 | if (!socket_ip_address (pconn.socket, &ip, ENDPOINT_PEER)) |
1488 | 0 | { |
1489 | | /* Can't get the peer's address -- something must be very |
1490 | | wrong with the connection. */ |
1491 | 0 | invalidate_persistent (); |
1492 | 0 | return false; |
1493 | 0 | } |
1494 | 0 | al = lookup_host (host, 0); |
1495 | 0 | if (!al) |
1496 | 0 | { |
1497 | 0 | *host_lookup_failed = true; |
1498 | 0 | return false; |
1499 | 0 | } |
1500 | | |
1501 | 0 | found = address_list_contains (al, &ip); |
1502 | 0 | address_list_release (al); |
1503 | |
|
1504 | 0 | if (!found) |
1505 | 0 | return false; |
1506 | | |
1507 | | /* The persistent connection's peer address was found among the |
1508 | | addresses HOST resolved to; therefore, pconn.socket is in fact |
1509 | | already talking to HOST -- no need to reconnect. */ |
1510 | 0 | } |
1511 | | |
1512 | | /* Finally, check whether the connection is still open. This is |
1513 | | important because most servers implement liberal (short) timeout |
1514 | | on persistent connections. Wget can of course always reconnect |
1515 | | if the connection doesn't work out, but it's nicer to know in |
1516 | | advance. This test is a logical followup of the first test, but |
1517 | | is "expensive" and therefore placed at the end of the list. |
1518 | | |
1519 | | (Current implementation of test_socket_open has a nice side |
1520 | | effect that it treats sockets with pending data as "closed". |
1521 | | This is exactly what we want: if a broken server sends message |
1522 | | body in response to HEAD, or if it sends more than content-length |
1523 | | data, we won't reuse the corrupted connection.) */ |
1524 | | |
1525 | 0 | if (!test_socket_open (pconn.socket)) |
1526 | 0 | { |
1527 | | /* Oops, the socket is no longer open. Now that we know that, |
1528 | | let's invalidate the persistent connection before returning |
1529 | | 0. */ |
1530 | 0 | invalidate_persistent (); |
1531 | 0 | return false; |
1532 | 0 | } |
1533 | | |
1534 | 0 | return true; |
1535 | 0 | } |
1536 | | |
1537 | | /* The idea behind these two CLOSE macros is to distinguish between |
1538 | | two cases: one when the job we've been doing is finished, and we |
1539 | | want to close the connection and leave, and two when something is |
1540 | | seriously wrong and we're closing the connection as part of |
1541 | | cleanup. |
1542 | | |
1543 | | In case of keep_alive, CLOSE_FINISH should leave the connection |
1544 | | open, while CLOSE_INVALIDATE should still close it. |
1545 | | |
1546 | | Note that the semantics of the flag `keep_alive' is "this |
1547 | | connection *will* be reused (the server has promised not to close |
1548 | | the connection once we're done)", while the semantics of |
1549 | | `pconn_active && (fd) == pconn.socket' is "we're *now* using an |
1550 | | active, registered connection". */ |
1551 | | |
1552 | 0 | #define CLOSE_FINISH(fd) do { \ |
1553 | 0 | if (!keep_alive) \ |
1554 | 0 | { \ |
1555 | 0 | if (pconn_active && (fd) == pconn.socket) \ |
1556 | 0 | invalidate_persistent (); \ |
1557 | 0 | else \ |
1558 | 0 | fd_close (fd); \ |
1559 | 0 | fd = -1; \ |
1560 | 0 | } \ |
1561 | 0 | } while (0) |
1562 | | |
1563 | 0 | #define CLOSE_INVALIDATE(fd) do { \ |
1564 | 0 | if (pconn_active && (fd) == pconn.socket) \ |
1565 | 0 | invalidate_persistent (); \ |
1566 | 0 | else \ |
1567 | 0 | fd_close (fd); \ |
1568 | 0 | fd = -1; \ |
1569 | 0 | } while (0) |
1570 | | |
1571 | | typedef enum |
1572 | | { |
1573 | | ENC_INVALID = -1, /* invalid encoding */ |
1574 | | ENC_NONE = 0, /* no special encoding */ |
1575 | | ENC_GZIP, /* gzip compression */ |
1576 | | ENC_DEFLATE, /* deflate compression */ |
1577 | | ENC_COMPRESS, /* compress compression */ |
1578 | | ENC_BROTLI /* brotli compression */ |
1579 | | } encoding_t; |
1580 | | |
1581 | | struct http_stat |
1582 | | { |
1583 | | wgint len; /* received length */ |
1584 | | wgint contlen; /* expected length */ |
1585 | | wgint restval; /* the restart value */ |
1586 | | int res; /* the result of last read */ |
1587 | | char *rderrmsg; /* error message from read error */ |
1588 | | char *newloc; /* new location (redirection) */ |
1589 | | char *remote_time; /* remote time-stamp string */ |
1590 | | char *error; /* textual HTTP error */ |
1591 | | int statcode; /* status code */ |
1592 | | char *message; /* status message */ |
1593 | | wgint rd_size; /* amount of data read from socket */ |
1594 | | double dltime; /* time it took to download the data */ |
1595 | | const char *referer; /* value of the referer header. */ |
1596 | | char *local_file; /* local file name. */ |
1597 | | bool existence_checked; /* true if we already checked for a file's |
1598 | | existence after having begun to download |
1599 | | (needed in gethttp for when connection is |
1600 | | interrupted/restarted). */ |
1601 | | bool timestamp_checked; /* true if pre-download time-stamping checks |
1602 | | * have already been performed */ |
1603 | | char *orig_file_name; /* name of file to compare for time-stamping |
1604 | | * (might be != local_file if -K is set) */ |
1605 | | wgint orig_file_size; /* size of file to compare for time-stamping */ |
1606 | | time_t orig_file_tstamp; /* time-stamp of file to compare for |
1607 | | * time-stamping */ |
1608 | | #ifdef HAVE_METALINK |
1609 | | metalink_t *metalink; |
1610 | | #endif |
1611 | | |
1612 | | encoding_t local_encoding; /* the encoding of the local file */ |
1613 | | encoding_t remote_encoding; /* the encoding of the remote file */ |
1614 | | |
1615 | | bool temporary; /* downloading a temporary file */ |
1616 | | }; |
1617 | | |
1618 | | static void |
1619 | | free_hstat (struct http_stat *hs) |
1620 | 0 | { |
1621 | 0 | xfree (hs->newloc); |
1622 | 0 | xfree (hs->remote_time); |
1623 | 0 | xfree (hs->error); |
1624 | 0 | xfree (hs->rderrmsg); |
1625 | 0 | xfree (hs->local_file); |
1626 | 0 | xfree (hs->orig_file_name); |
1627 | 0 | xfree (hs->message); |
1628 | | #ifdef HAVE_METALINK |
1629 | | metalink_delete (hs->metalink); |
1630 | | hs->metalink = NULL; |
1631 | | #endif |
1632 | 0 | } |
1633 | | |
1634 | | static void |
1635 | | get_file_flags (const char *filename, int *dt) |
1636 | 0 | { |
1637 | 0 | logprintf (LOG_VERBOSE, _("\ |
1638 | 0 | File %s already there; not retrieving.\n\n"), quote (filename)); |
1639 | | /* If the file is there, we suppose it's retrieved OK. */ |
1640 | 0 | *dt |= RETROKF; |
1641 | | |
1642 | | /* #### Bogusness alert. */ |
1643 | | /* If its suffix is "html" or "htm" or similar, assume text/html. */ |
1644 | 0 | if (has_html_suffix_p (filename)) |
1645 | 0 | *dt |= TEXTHTML; |
1646 | 0 | } |
1647 | | |
1648 | | /* Download the response body from the socket and write it to |
1649 | | an output file. The headers have already been read from the |
1650 | | socket. If WARC is enabled, the response body will also be |
1651 | | written to a WARC response record. |
1652 | | |
1653 | | hs, contlen, contrange, chunked_transfer_encoding and url are |
1654 | | parameters from the gethttp method. fp is a pointer to the |
1655 | | output file. |
1656 | | |
1657 | | url, warc_timestamp_str, warc_request_uuid, warc_ip, type |
1658 | | and statcode will be saved in the headers of the WARC record. |
1659 | | The head parameter contains the HTTP headers of the response. |
1660 | | |
1661 | | If fp is NULL and WARC is enabled, the response body will be |
1662 | | written only to the WARC file. If WARC is disabled and fp |
1663 | | is a file pointer, the data will be written to the file. |
1664 | | If fp is a file pointer and WARC is enabled, the body will |
1665 | | be written to both destinations. |
1666 | | |
1667 | | Returns the error code. */ |
1668 | | static int |
1669 | | read_response_body (struct http_stat *hs, int sock, FILE *fp, wgint contlen, |
1670 | | wgint contrange, bool chunked_transfer_encoding, |
1671 | | char *url, char *warc_timestamp_str, char *warc_request_uuid, |
1672 | | ip_address *warc_ip, char *type, int statcode, char *head) |
1673 | 0 | { |
1674 | 0 | int warc_payload_offset = 0; |
1675 | 0 | FILE *warc_tmp = NULL; |
1676 | 0 | int warcerr = 0; |
1677 | 0 | int flags = 0; |
1678 | |
|
1679 | 0 | if (opt.warc_filename != NULL) |
1680 | 0 | { |
1681 | | /* Open a temporary file where we can write the response before we |
1682 | | add it to the WARC record. */ |
1683 | 0 | warc_tmp = warc_tempfile (); |
1684 | 0 | if (warc_tmp == NULL) |
1685 | 0 | warcerr = WARC_TMP_FOPENERR; |
1686 | |
|
1687 | 0 | if (warcerr == 0) |
1688 | 0 | { |
1689 | | /* We should keep the response headers for the WARC record. */ |
1690 | 0 | int head_len = strlen (head); |
1691 | 0 | int warc_tmp_written = fwrite (head, 1, head_len, warc_tmp); |
1692 | 0 | if (warc_tmp_written != head_len) |
1693 | 0 | warcerr = WARC_TMP_FWRITEERR; |
1694 | 0 | warc_payload_offset = head_len; |
1695 | 0 | } |
1696 | |
|
1697 | 0 | if (warcerr != 0) |
1698 | 0 | { |
1699 | 0 | if (warc_tmp != NULL) |
1700 | 0 | fclose (warc_tmp); |
1701 | 0 | return warcerr; |
1702 | 0 | } |
1703 | 0 | } |
1704 | | |
1705 | 0 | if (fp != NULL) |
1706 | 0 | { |
1707 | | /* This confuses the timestamping code that checks for file size. |
1708 | | #### The timestamping code should be smarter about file size. */ |
1709 | 0 | if (opt.save_headers && hs->restval == 0) |
1710 | 0 | fwrite (head, 1, strlen (head), fp); |
1711 | 0 | } |
1712 | | |
1713 | | /* Read the response body. */ |
1714 | 0 | if (contlen != -1) |
1715 | | /* If content-length is present, read that much; otherwise, read |
1716 | | until EOF. The HTTP spec doesn't require the server to |
1717 | | actually close the connection when it's done sending data. */ |
1718 | 0 | flags |= rb_read_exactly; |
1719 | 0 | if (fp != NULL && hs->restval > 0 && contrange == 0) |
1720 | | /* If the server ignored our range request, instruct fd_read_body |
1721 | | to skip the first RESTVAL bytes of body. */ |
1722 | 0 | flags |= rb_skip_startpos; |
1723 | 0 | if (chunked_transfer_encoding) |
1724 | 0 | flags |= rb_chunked_transfer_encoding; |
1725 | |
|
1726 | 0 | if (hs->remote_encoding == ENC_GZIP) |
1727 | 0 | flags |= rb_compressed_gzip; |
1728 | |
|
1729 | 0 | hs->len = hs->restval; |
1730 | 0 | hs->rd_size = 0; |
1731 | | /* Download the response body and write it to fp. |
1732 | | If we are working on a WARC file, we simultaneously write the |
1733 | | response body to warc_tmp. */ |
1734 | 0 | hs->res = fd_read_body (hs->local_file, sock, fp, contlen != -1 ? contlen : 0, |
1735 | 0 | hs->restval, &hs->rd_size, &hs->len, &hs->dltime, |
1736 | 0 | flags, warc_tmp); |
1737 | 0 | if (hs->res >= 0) |
1738 | 0 | { |
1739 | 0 | if (warc_tmp != NULL) |
1740 | 0 | { |
1741 | | /* Create a response record and write it to the WARC file. |
1742 | | Note: per the WARC standard, the request and response should share |
1743 | | the same date header. We re-use the timestamp of the request. |
1744 | | The response record should also refer to the uuid of the request. */ |
1745 | 0 | bool r = warc_write_response_record (url, warc_timestamp_str, |
1746 | 0 | warc_request_uuid, warc_ip, |
1747 | 0 | warc_tmp, warc_payload_offset, |
1748 | 0 | type, statcode, hs->newloc); |
1749 | | |
1750 | | /* warc_write_response_record has closed warc_tmp. */ |
1751 | |
|
1752 | 0 | if (! r) |
1753 | 0 | return WARC_ERR; |
1754 | 0 | } |
1755 | | |
1756 | 0 | return RETRFINISHED; |
1757 | 0 | } |
1758 | | |
1759 | 0 | if (warc_tmp != NULL) |
1760 | 0 | fclose (warc_tmp); |
1761 | |
|
1762 | 0 | if (hs->res == -2) |
1763 | 0 | { |
1764 | | /* Error while writing to fd. */ |
1765 | 0 | return FWRITEERR; |
1766 | 0 | } |
1767 | 0 | else if (hs->res == -3) |
1768 | 0 | { |
1769 | | /* Error while writing to warc_tmp. */ |
1770 | 0 | return WARC_TMP_FWRITEERR; |
1771 | 0 | } |
1772 | 0 | else |
1773 | 0 | { |
1774 | | /* A read error! */ |
1775 | 0 | xfree (hs->rderrmsg); |
1776 | 0 | hs->rderrmsg = xstrdup (fd_errstr (sock)); |
1777 | 0 | return RETRFINISHED; |
1778 | 0 | } |
1779 | 0 | } |
1780 | | |
/* Evaluate to nonzero when LINE starts, ignoring case, with
   STRING_CONSTANT and the match is followed by whitespace or by the
   terminating NUL.  STRING_CONSTANT must be a string literal, since its
   length is taken with sizeof.  LINE is parenthesized so that arbitrary
   pointer expressions may be passed; note it is evaluated several times.  */
#define BEGINS_WITH(line, string_constant)                                  \
  (!c_strncasecmp ((line), string_constant, sizeof (string_constant) - 1)   \
   && (c_isspace ((line)[sizeof (string_constant) - 1])                     \
       || !(line)[sizeof (string_constant) - 1]))
1785 | | |
/* Add a User-Agent header to REQ.  With no user-supplied agent the
   default "Wget/<version>" string is used; a user-supplied empty string
   suppresses the header altogether.  */
#define SET_USER_AGENT(req) do {                                        \
    if (opt.useragent != NULL)                                          \
      {                                                                 \
        if (opt.useragent[0] != '\0')                                   \
          request_set_header (req, "User-Agent", opt.useragent,         \
                              rel_none);                                \
      }                                                                 \
    else                                                                \
      request_set_header (req, "User-Agent",                            \
                          aprintf ("Wget/%s", version_string),          \
                          rel_value);                                   \
  } while (0)
1795 | | |
1796 | | /* |
1797 | | Convert time_t to one of valid HTTP date formats |
1798 | | ie. rfc1123-date. |
1799 | | |
1800 | | HTTP-date = rfc1123-date | rfc850-date | asctime-date |
1801 | | rfc1123-date = wkday "," SP date1 SP time SP "GMT" |
1802 | | rfc850-date = weekday "," SP date2 SP time SP "GMT" |
1803 | | asctime-date = wkday SP date3 SP time SP 4DIGIT |
1804 | | date1 = 2DIGIT SP month SP 4DIGIT |
1805 | | ; day month year (e.g., 02 Jun 1982) |
1806 | | date2 = 2DIGIT "-" month "-" 2DIGIT |
1807 | | ; day-month-year (e.g., 02-Jun-82) |
1808 | | date3 = month SP ( 2DIGIT | ( SP 1DIGIT )) |
1809 | | ; month day (e.g., Jun 2) |
1810 | | time = 2DIGIT ":" 2DIGIT ":" 2DIGIT |
1811 | | ; 00:00:00 - 23:59:59 |
1812 | | wkday = "Mon" | "Tue" | "Wed" |
1813 | | | "Thu" | "Fri" | "Sat" | "Sun" |
1814 | | weekday = "Monday" | "Tuesday" | "Wednesday" |
1815 | | | "Thursday" | "Friday" | "Saturday" | "Sunday" |
1816 | | month = "Jan" | "Feb" | "Mar" | "Apr" |
1817 | | | "May" | "Jun" | "Jul" | "Aug" |
1818 | | | "Sep" | "Oct" | "Nov" | "Dec" |
1819 | | |
1820 | | source: RFC2616 */ |
1821 | | static uerr_t |
1822 | | time_to_rfc1123 (time_t time, char *buf, size_t bufsize) |
1823 | 0 | { |
1824 | 0 | static const char *wkday[] = { "Sun", "Mon", "Tue", "Wed", |
1825 | 0 | "Thu", "Fri", "Sat" }; |
1826 | 0 | static const char *month[] = { "Jan", "Feb", "Mar", "Apr", |
1827 | 0 | "May", "Jun", "Jul", "Aug", |
1828 | 0 | "Sep", "Oct", "Nov", "Dec" }; |
1829 | |
|
1830 | 0 | struct tm *gtm = gmtime (&time); |
1831 | 0 | if (!gtm) |
1832 | 0 | { |
1833 | 0 | logprintf (LOG_NOTQUIET, |
1834 | 0 | _("gmtime failed. This is probably a bug.\n")); |
1835 | 0 | return TIMECONV_ERR; |
1836 | 0 | } |
1837 | | |
1838 | | /* rfc1123 example: Thu, 01 Jan 1998 22:12:57 GMT */ |
1839 | 0 | snprintf (buf, bufsize, "%s, %02d %s %04d %02d:%02d:%02d GMT", |
1840 | 0 | wkday[gtm->tm_wday], |
1841 | 0 | gtm->tm_mday, month[gtm->tm_mon], |
1842 | 0 | gtm->tm_year + 1900, gtm->tm_hour, |
1843 | 0 | gtm->tm_min, gtm->tm_sec); |
1844 | |
|
1845 | 0 | return RETROK; |
1846 | 0 | } |
1847 | | |
1848 | | static struct request * |
1849 | | initialize_request (const struct url *u, struct http_stat *hs, int *dt, struct url *proxy, |
1850 | | bool inhibit_keep_alive, bool *basic_auth_finished, |
1851 | | wgint *body_data_size, char **user, char **passwd, uerr_t *ret) |
1852 | 0 | { |
1853 | 0 | bool head_only = !!(*dt & HEAD_ONLY); |
1854 | 0 | struct request *req; |
1855 | | |
1856 | | /* Prepare the request to send. */ |
1857 | 0 | { |
1858 | 0 | char *meth_arg; |
1859 | 0 | const char *meth = "GET"; |
1860 | 0 | if (head_only) |
1861 | 0 | meth = "HEAD"; |
1862 | 0 | else if (opt.method) |
1863 | 0 | meth = opt.method; |
1864 | | /* Use the full path, i.e. one that includes the leading slash and |
1865 | | the query string. E.g. if u->path is "foo/bar" and u->query is |
1866 | | "param=value", full_path will be "/foo/bar?param=value". */ |
1867 | 0 | if (proxy |
1868 | 0 | #ifdef HAVE_SSL |
1869 | | /* When using SSL over proxy, CONNECT establishes a direct |
1870 | | connection to the HTTPS server. Therefore use the same |
1871 | | argument as when talking to the server directly. */ |
1872 | 0 | && u->scheme != SCHEME_HTTPS |
1873 | 0 | #endif |
1874 | 0 | ) |
1875 | 0 | meth_arg = xstrdup (u->url); |
1876 | 0 | else |
1877 | 0 | meth_arg = url_full_path (u); |
1878 | 0 | req = request_new (meth, meth_arg); |
1879 | 0 | } |
1880 | | |
1881 | | /* Generate the Host header, HOST:PORT. Take into account that: |
1882 | | |
1883 | | - Broken server-side software often doesn't recognize the PORT |
1884 | | argument, so we must generate "Host: www.server.com" instead of |
1885 | | "Host: www.server.com:80" (and likewise for https port). |
1886 | | |
1887 | | - IPv6 addresses contain ":", so "Host: 3ffe:8100:200:2::2:1234" |
1888 | | becomes ambiguous and needs to be rewritten as "Host: |
1889 | | [3ffe:8100:200:2::2]:1234". */ |
1890 | 0 | { |
1891 | | /* Formats arranged for hfmt[add_port][add_squares]. */ |
1892 | 0 | static const char *hfmt[][2] = { |
1893 | 0 | { "%s", "[%s]" }, { "%s:%d", "[%s]:%d" } |
1894 | 0 | }; |
1895 | 0 | int add_port = u->port != scheme_default_port (u->scheme); |
1896 | 0 | int add_squares = strchr (u->host, ':') != NULL; |
1897 | 0 | request_set_header (req, "Host", |
1898 | 0 | aprintf (hfmt[add_port][add_squares], u->host, u->port), |
1899 | 0 | rel_value); |
1900 | 0 | } |
1901 | |
|
1902 | 0 | request_set_header (req, "Referer", hs->referer, rel_none); |
1903 | 0 | if (*dt & SEND_NOCACHE) |
1904 | 0 | { |
1905 | | /* Cache-Control MUST be obeyed by all HTTP/1.1 caching mechanisms... */ |
1906 | 0 | request_set_header (req, "Cache-Control", "no-cache", rel_none); |
1907 | | |
1908 | | /* ... but some HTTP/1.0 caches doesn't implement Cache-Control. */ |
1909 | 0 | request_set_header (req, "Pragma", "no-cache", rel_none); |
1910 | 0 | } |
1911 | 0 | if (*dt & IF_MODIFIED_SINCE) |
1912 | 0 | { |
1913 | 0 | char strtime[32]; |
1914 | 0 | uerr_t err = time_to_rfc1123 (hs->orig_file_tstamp, strtime, countof (strtime)); |
1915 | |
|
1916 | 0 | if (err != RETROK) |
1917 | 0 | { |
1918 | 0 | logputs (LOG_VERBOSE, _("Cannot convert timestamp to http format. " |
1919 | 0 | "Falling back to time 0 as last modification " |
1920 | 0 | "time.\n")); |
1921 | 0 | strcpy (strtime, "Thu, 01 Jan 1970 00:00:00 GMT"); |
1922 | 0 | } |
1923 | 0 | request_set_header (req, "If-Modified-Since", xstrdup (strtime), rel_value); |
1924 | 0 | } |
1925 | 0 | if (hs->restval) |
1926 | 0 | request_set_header (req, "Range", |
1927 | 0 | aprintf ("bytes=%s-", |
1928 | 0 | number_to_static_string (hs->restval)), |
1929 | 0 | rel_value); |
1930 | 0 | SET_USER_AGENT (req); |
1931 | 0 | request_set_header (req, "Accept", "*/*", rel_none); |
1932 | 0 | #ifdef HAVE_LIBZ |
1933 | 0 | if (opt.compression != compression_none) |
1934 | 0 | request_set_header (req, "Accept-Encoding", "gzip", rel_none); |
1935 | 0 | else |
1936 | 0 | #endif |
1937 | 0 | request_set_header (req, "Accept-Encoding", "identity", rel_none); |
1938 | | |
1939 | | /* Find the username with priority */ |
1940 | 0 | if (u->user) |
1941 | 0 | *user = u->user; |
1942 | 0 | else if (opt.user && (opt.use_askpass || opt.ask_passwd)) |
1943 | 0 | *user = opt.user; |
1944 | 0 | else if (opt.http_user) |
1945 | 0 | *user = opt.http_user; |
1946 | 0 | else if (opt.user) |
1947 | 0 | *user = opt.user; |
1948 | 0 | else |
1949 | 0 | *user = NULL; |
1950 | | |
1951 | | /* Find the password with priority */ |
1952 | 0 | if (u->passwd) |
1953 | 0 | *passwd = u->passwd; |
1954 | 0 | else if (opt.passwd && (opt.use_askpass || opt.ask_passwd)) |
1955 | 0 | *passwd = opt.passwd; |
1956 | 0 | else if (opt.http_passwd) |
1957 | 0 | *passwd = opt.http_passwd; |
1958 | 0 | else if (opt.passwd) |
1959 | 0 | *passwd = opt.passwd; |
1960 | 0 | else |
1961 | 0 | *passwd = NULL; |
1962 | | |
1963 | | /* Check for ~/.netrc if none of the above match */ |
1964 | 0 | if (opt.netrc && (!*user || !*passwd)) |
1965 | 0 | search_netrc (u->host, (const char **) user, (const char **) passwd, 0, NULL); |
1966 | | |
1967 | | /* We only do "site-wide" authentication with "global" user/password |
1968 | | * values unless --auth-no-challenge has been requested; URL user/password |
1969 | | * info overrides. */ |
1970 | 0 | if (*user && *passwd && (!u->user || opt.auth_without_challenge)) |
1971 | 0 | { |
1972 | | /* If this is a host for which we've already received a Basic |
1973 | | * challenge, we'll go ahead and send Basic authentication creds. */ |
1974 | 0 | *basic_auth_finished = maybe_send_basic_creds (u->host, *user, *passwd, req); |
1975 | 0 | } |
1976 | |
|
1977 | 0 | if (inhibit_keep_alive) |
1978 | 0 | request_set_header (req, "Connection", "Close", rel_none); |
1979 | 0 | else |
1980 | 0 | { |
1981 | 0 | request_set_header (req, "Connection", "Keep-Alive", rel_none); |
1982 | 0 | if (proxy) |
1983 | 0 | request_set_header (req, "Proxy-Connection", "Keep-Alive", rel_none); |
1984 | 0 | } |
1985 | |
|
1986 | 0 | if (opt.method) |
1987 | 0 | { |
1988 | |
|
1989 | 0 | if (opt.body_data || opt.body_file) |
1990 | 0 | { |
1991 | 0 | request_set_header (req, "Content-Type", |
1992 | 0 | "application/x-www-form-urlencoded", rel_none); |
1993 | |
|
1994 | 0 | if (opt.body_data) |
1995 | 0 | *body_data_size = strlen (opt.body_data); |
1996 | 0 | else |
1997 | 0 | { |
1998 | 0 | *body_data_size = file_size (opt.body_file); |
1999 | 0 | if (*body_data_size == -1) |
2000 | 0 | { |
2001 | 0 | logprintf (LOG_NOTQUIET, _("BODY data file %s missing: %s\n"), |
2002 | 0 | quote (opt.body_file), strerror (errno)); |
2003 | 0 | request_free (&req); |
2004 | 0 | *ret = FILEBADFILE; |
2005 | 0 | return NULL; |
2006 | 0 | } |
2007 | 0 | } |
2008 | 0 | request_set_header (req, "Content-Length", |
2009 | 0 | xstrdup (number_to_static_string (*body_data_size)), |
2010 | 0 | rel_value); |
2011 | 0 | } |
2012 | 0 | else if (c_strcasecmp (opt.method, "post") == 0 |
2013 | 0 | || c_strcasecmp (opt.method, "put") == 0 |
2014 | 0 | || c_strcasecmp (opt.method, "patch") == 0) |
2015 | 0 | request_set_header (req, "Content-Length", "0", rel_none); |
2016 | 0 | } |
2017 | 0 | return req; |
2018 | 0 | } |
2019 | | |
2020 | | static void |
2021 | | initialize_proxy_configuration (const struct url *u, struct request *req, |
2022 | | struct url *proxy, char **proxyauth) |
2023 | 0 | { |
2024 | 0 | char *proxy_user, *proxy_passwd; |
2025 | | /* For normal username and password, URL components override |
2026 | | command-line/wgetrc parameters. With proxy |
2027 | | authentication, it's the reverse, because proxy URLs are |
2028 | | normally the "permanent" ones, so command-line args |
2029 | | should take precedence. */ |
2030 | 0 | if (opt.proxy_user && opt.proxy_passwd) |
2031 | 0 | { |
2032 | 0 | proxy_user = opt.proxy_user; |
2033 | 0 | proxy_passwd = opt.proxy_passwd; |
2034 | 0 | } |
2035 | 0 | else |
2036 | 0 | { |
2037 | 0 | proxy_user = proxy->user; |
2038 | 0 | proxy_passwd = proxy->passwd; |
2039 | 0 | } |
2040 | | /* #### This does not appear right. Can't the proxy request, |
2041 | | say, `Digest' authentication? */ |
2042 | 0 | if (proxy_user && proxy_passwd) |
2043 | 0 | *proxyauth = basic_authentication_encode (proxy_user, proxy_passwd); |
2044 | | |
2045 | | /* Proxy authorization over SSL is handled below. */ |
2046 | 0 | #ifdef HAVE_SSL |
2047 | 0 | if (u->scheme != SCHEME_HTTPS) |
2048 | 0 | #endif |
2049 | 0 | request_set_header (req, "Proxy-Authorization", *proxyauth, rel_value); |
2050 | 0 | } |
2051 | | |
2052 | | static uerr_t |
2053 | | establish_connection (const struct url *u, const struct url **conn_ref, |
2054 | | struct http_stat *hs, struct url *proxy, |
2055 | | char **proxyauth, |
2056 | | struct request **req_ref, bool *using_ssl, |
2057 | | bool inhibit_keep_alive, |
2058 | | int *sock_ref) |
2059 | 0 | { |
2060 | 0 | bool host_lookup_failed = false; |
2061 | 0 | int sock = *sock_ref; |
2062 | 0 | struct request *req = *req_ref; |
2063 | 0 | const struct url *conn = *conn_ref; |
2064 | 0 | struct response *resp; |
2065 | 0 | int write_error; |
2066 | 0 | int statcode; |
2067 | |
|
2068 | 0 | if (! inhibit_keep_alive) |
2069 | 0 | { |
2070 | | /* Look for a persistent connection to target host, unless a |
2071 | | proxy is used. The exception is when SSL is in use, in which |
2072 | | case the proxy is nothing but a passthrough to the target |
2073 | | host, registered as a connection to the latter. */ |
2074 | 0 | const struct url *relevant = conn; |
2075 | 0 | #ifdef HAVE_SSL |
2076 | 0 | if (u->scheme == SCHEME_HTTPS) |
2077 | 0 | relevant = u; |
2078 | 0 | #endif |
2079 | |
|
2080 | 0 | if (persistent_available_p (relevant->host, relevant->port, |
2081 | 0 | #ifdef HAVE_SSL |
2082 | 0 | relevant->scheme == SCHEME_HTTPS, |
2083 | | #else |
2084 | | 0, |
2085 | | #endif |
2086 | 0 | &host_lookup_failed)) |
2087 | 0 | { |
2088 | 0 | int family = socket_family (pconn.socket, ENDPOINT_PEER); |
2089 | 0 | sock = pconn.socket; |
2090 | 0 | *using_ssl = pconn.ssl; |
2091 | 0 | #if ENABLE_IPV6 |
2092 | 0 | if (family == AF_INET6) |
2093 | 0 | logprintf (LOG_VERBOSE, _("Reusing existing connection to [%s]:%d.\n"), |
2094 | 0 | quotearg_style (escape_quoting_style, pconn.host), |
2095 | 0 | pconn.port); |
2096 | 0 | else |
2097 | 0 | #endif |
2098 | 0 | logprintf (LOG_VERBOSE, _("Reusing existing connection to %s:%d.\n"), |
2099 | 0 | quotearg_style (escape_quoting_style, pconn.host), |
2100 | 0 | pconn.port); |
2101 | 0 | DEBUGP (("Reusing fd %d.\n", sock)); |
2102 | 0 | if (pconn.authorized) |
2103 | | /* If the connection is already authorized, the "Basic" |
2104 | | authorization added by code above is unnecessary and |
2105 | | only hurts us. */ |
2106 | 0 | request_remove_header (req, "Authorization"); |
2107 | 0 | } |
2108 | 0 | else if (host_lookup_failed) |
2109 | 0 | { |
2110 | 0 | logprintf(LOG_NOTQUIET, |
2111 | 0 | _("%s: unable to resolve host address %s\n"), |
2112 | 0 | exec_name, quote (relevant->host)); |
2113 | 0 | return HOSTERR; |
2114 | 0 | } |
2115 | 0 | else if (sock != -1) |
2116 | 0 | { |
2117 | 0 | sock = -1; |
2118 | 0 | } |
2119 | 0 | } |
2120 | | |
2121 | 0 | if (sock < 0) |
2122 | 0 | { |
2123 | 0 | sock = connect_to_host (conn->host, conn->port); |
2124 | 0 | if (sock == E_HOST) |
2125 | 0 | return HOSTERR; |
2126 | 0 | else if (sock < 0) |
2127 | 0 | return (retryable_socket_connect_error (errno) |
2128 | 0 | ? CONERROR : CONIMPOSSIBLE); |
2129 | | |
2130 | 0 | #ifdef HAVE_SSL |
2131 | 0 | if (proxy && u->scheme == SCHEME_HTTPS) |
2132 | 0 | { |
2133 | 0 | char *head; |
2134 | 0 | char *message; |
2135 | | /* When requesting SSL URLs through proxies, use the |
2136 | | CONNECT method to request passthrough. */ |
2137 | 0 | struct request *connreq = request_new ("CONNECT", |
2138 | 0 | aprintf ("%s:%d", u->host, u->port)); |
2139 | 0 | SET_USER_AGENT (connreq); |
2140 | 0 | if (proxyauth) |
2141 | 0 | { |
2142 | 0 | request_set_header (connreq, "Proxy-Authorization", |
2143 | 0 | *proxyauth, rel_value); |
2144 | | /* Now that PROXYAUTH is part of the CONNECT request, |
2145 | | zero it out so we don't send proxy authorization with |
2146 | | the regular request below. */ |
2147 | 0 | *proxyauth = NULL; |
2148 | 0 | } |
2149 | 0 | request_set_header (connreq, "Host", |
2150 | 0 | aprintf ("%s:%d", u->host, u->port), |
2151 | 0 | rel_value); |
2152 | |
|
2153 | 0 | write_error = request_send (connreq, sock, 0); |
2154 | 0 | request_free (&connreq); |
2155 | 0 | if (write_error < 0) |
2156 | 0 | { |
2157 | 0 | CLOSE_INVALIDATE (sock); |
2158 | 0 | return WRITEFAILED; |
2159 | 0 | } |
2160 | | |
2161 | 0 | head = read_http_response_head (sock); |
2162 | 0 | if (!head) |
2163 | 0 | { |
2164 | 0 | logprintf (LOG_VERBOSE, _("Failed reading proxy response: %s\n"), |
2165 | 0 | fd_errstr (sock)); |
2166 | 0 | CLOSE_INVALIDATE (sock); |
2167 | 0 | return HERR; |
2168 | 0 | } |
2169 | 0 | message = NULL; |
2170 | 0 | if (!*head) |
2171 | 0 | { |
2172 | 0 | xfree (head); |
2173 | 0 | goto failed_tunnel; |
2174 | 0 | } |
2175 | 0 | DEBUGP (("proxy responded with: [%s]\n", head)); |
2176 | |
|
2177 | 0 | resp = resp_new (head); |
2178 | 0 | statcode = resp_status (resp, &message); |
2179 | 0 | if (statcode < 0) |
2180 | 0 | { |
2181 | 0 | char *tms = datetime_str (time (NULL)); |
2182 | 0 | logprintf (LOG_VERBOSE, "%d\n", statcode); |
2183 | 0 | logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode, |
2184 | 0 | quotearg_style (escape_quoting_style, |
2185 | 0 | _("Malformed status line"))); |
2186 | 0 | xfree (head); |
2187 | 0 | return HERR; |
2188 | 0 | } |
2189 | 0 | xfree (hs->message); |
2190 | 0 | hs->message = xstrdup (message); |
2191 | 0 | resp_free (&resp); |
2192 | 0 | xfree (head); |
2193 | 0 | if (statcode != 200) |
2194 | 0 | { |
2195 | 0 | failed_tunnel: |
2196 | 0 | logprintf (LOG_NOTQUIET, _("Proxy tunneling failed: %s"), |
2197 | 0 | message ? quotearg_style (escape_quoting_style, message) : "?"); |
2198 | 0 | xfree (message); |
2199 | 0 | return CONSSLERR; |
2200 | 0 | } |
2201 | 0 | xfree (message); |
2202 | | |
2203 | | /* SOCK is now *really* connected to u->host, so update CONN |
2204 | | to reflect this. That way register_persistent will |
2205 | | register SOCK as being connected to u->host:u->port. */ |
2206 | 0 | conn = u; |
2207 | 0 | } |
2208 | | |
2209 | 0 | if (conn->scheme == SCHEME_HTTPS) |
2210 | 0 | { |
2211 | 0 | if (!ssl_connect_wget (sock, u->host, NULL)) |
2212 | 0 | { |
2213 | 0 | CLOSE_INVALIDATE (sock); |
2214 | 0 | return CONSSLERR; |
2215 | 0 | } |
2216 | 0 | else if (!ssl_check_certificate (sock, u->host)) |
2217 | 0 | { |
2218 | 0 | CLOSE_INVALIDATE (sock); |
2219 | 0 | return VERIFCERTERR; |
2220 | 0 | } |
2221 | 0 | *using_ssl = true; |
2222 | 0 | } |
2223 | 0 | #endif /* HAVE_SSL */ |
2224 | 0 | } |
2225 | 0 | *conn_ref = conn; |
2226 | 0 | *req_ref = req; |
2227 | 0 | *sock_ref = sock; |
2228 | 0 | return RETROK; |
2229 | 0 | } |
2230 | | |
2231 | | static uerr_t |
2232 | | set_file_timestamp (struct http_stat *hs) |
2233 | 0 | { |
2234 | 0 | bool local_dot_orig_file_exists = false; |
2235 | 0 | char *local_filename = NULL; |
2236 | 0 | struct stat st; |
2237 | 0 | char buf[1024]; |
2238 | |
|
2239 | 0 | if (opt.backup_converted) |
2240 | | /* If -K is specified, we'll act on the assumption that it was specified |
2241 | | last time these files were downloaded as well, and instead of just |
2242 | | comparing local file X against server file X, we'll compare local |
2243 | | file X.orig (if extant, else X) against server file X. If -K |
2244 | | _wasn't_ specified last time, or the server contains files called |
2245 | | *.orig, -N will be back to not operating correctly with -k. */ |
2246 | 0 | { |
2247 | 0 | size_t filename_len = strlen (hs->local_file); |
2248 | 0 | char *filename_plus_orig_suffix; |
2249 | |
|
2250 | 0 | if (filename_len + sizeof (ORIG_SFX) > sizeof (buf)) |
2251 | 0 | filename_plus_orig_suffix = xmalloc (filename_len + sizeof (ORIG_SFX)); |
2252 | 0 | else |
2253 | 0 | filename_plus_orig_suffix = buf; |
2254 | | |
2255 | | /* Would a single s[n]printf() call be faster? --dan |
2256 | | |
2257 | | Definitely not. sprintf() is horribly slow. It's a |
2258 | | different question whether the difference between the two |
2259 | | affects a program. Usually I'd say "no", but at one |
2260 | | point I profiled Wget, and found that a measurable and |
2261 | | non-negligible amount of time was lost calling sprintf() |
2262 | | in url.c. Replacing sprintf with inline calls to |
2263 | | strcpy() and number_to_string() made a difference. |
2264 | | --hniksic */ |
2265 | 0 | memcpy (filename_plus_orig_suffix, hs->local_file, filename_len); |
2266 | 0 | memcpy (filename_plus_orig_suffix + filename_len, |
2267 | 0 | ORIG_SFX, sizeof (ORIG_SFX)); |
2268 | | |
2269 | | /* Try to stat() the .orig file. */ |
2270 | 0 | if (stat (filename_plus_orig_suffix, &st) == 0) |
2271 | 0 | { |
2272 | 0 | local_dot_orig_file_exists = true; |
2273 | 0 | local_filename = filename_plus_orig_suffix; |
2274 | 0 | } |
2275 | 0 | } |
2276 | |
|
2277 | 0 | if (!local_dot_orig_file_exists) |
2278 | | /* Couldn't stat() <file>.orig, so try to stat() <file>. */ |
2279 | 0 | if (stat (hs->local_file, &st) == 0) |
2280 | 0 | { |
2281 | 0 | if (local_filename != buf) |
2282 | 0 | xfree (local_filename); |
2283 | 0 | local_filename = hs->local_file; |
2284 | 0 | } |
2285 | |
|
2286 | 0 | if (local_filename != NULL) |
2287 | | /* There was a local file, so we'll check later to see if the version |
2288 | | the server has is the same version we already have, allowing us to |
2289 | | skip a download. */ |
2290 | 0 | { |
2291 | 0 | if (local_filename == buf || local_filename == hs->local_file) |
2292 | 0 | hs->orig_file_name = xstrdup (local_filename); // on stack or a copy, make a heap copy |
2293 | 0 | else |
2294 | 0 | hs->orig_file_name = local_filename; // was previously malloc'ed |
2295 | 0 | hs->orig_file_size = st.st_size; |
2296 | 0 | hs->orig_file_tstamp = st.st_mtime; |
2297 | | #ifdef WINDOWS |
2298 | | /* Modification time granularity is 2 seconds for Windows, so |
2299 | | increase local time by 1 second for later comparison. */ |
2300 | | ++hs->orig_file_tstamp; |
2301 | | #endif |
2302 | 0 | hs->timestamp_checked = true; |
2303 | 0 | } |
2304 | |
|
2305 | 0 | return RETROK; |
2306 | 0 | } |
2307 | | |
2308 | | static uerr_t |
2309 | | check_file_output (const struct url *u, struct http_stat *hs, |
2310 | | struct response *resp, char *hdrval, size_t hdrsize) |
2311 | 0 | { |
2312 | | /* Determine the local filename if needed. Notice that if -O is used |
2313 | | * hstat.local_file is set by http_loop to the argument of -O. */ |
2314 | 0 | if (!hs->local_file) |
2315 | 0 | { |
2316 | 0 | char *local_file = NULL; |
2317 | | |
2318 | | /* Honor Content-Disposition whether possible. */ |
2319 | 0 | if (!opt.content_disposition |
2320 | 0 | || !resp_header_copy (resp, "Content-Disposition", |
2321 | 0 | hdrval, hdrsize) |
2322 | 0 | || !parse_content_disposition (hdrval, &local_file)) |
2323 | 0 | { |
2324 | | /* The Content-Disposition header is missing or broken. |
2325 | | * Choose unique file name according to given URL. */ |
2326 | 0 | hs->local_file = url_file_name (u, NULL); |
2327 | 0 | } |
2328 | 0 | else |
2329 | 0 | { |
2330 | 0 | DEBUGP (("Parsed filename from Content-Disposition: %s\n", |
2331 | 0 | local_file)); |
2332 | 0 | hs->local_file = url_file_name (u, local_file); |
2333 | 0 | } |
2334 | |
|
2335 | 0 | xfree (local_file); |
2336 | 0 | } |
2337 | |
|
2338 | 0 | hs->temporary = opt.delete_after || opt.spider || !acceptable (hs->local_file); |
2339 | 0 | if (hs->temporary) |
2340 | 0 | { |
2341 | 0 | char *tmp = aprintf ("%s.tmp", hs->local_file); |
2342 | 0 | xfree (hs->local_file); |
2343 | 0 | hs->local_file = tmp; |
2344 | 0 | } |
2345 | | |
2346 | | /* TODO: perform this check only once. */ |
2347 | 0 | if (!hs->existence_checked && file_exists_p (hs->local_file, NULL)) |
2348 | 0 | { |
2349 | 0 | if (opt.noclobber && !opt.output_document) |
2350 | 0 | { |
2351 | | /* If opt.noclobber is turned on and file already exists, do not |
2352 | | retrieve the file. But if the output_document was given, then this |
2353 | | test was already done and the file didn't exist. Hence the !opt.output_document */ |
2354 | 0 | return RETRUNNEEDED; |
2355 | 0 | } |
2356 | 0 | else if (!ALLOW_CLOBBER) |
2357 | 0 | { |
2358 | 0 | char *unique = unique_name_passthrough (hs->local_file); |
2359 | 0 | if (unique != hs->local_file) |
2360 | 0 | xfree (hs->local_file); |
2361 | 0 | hs->local_file = unique; |
2362 | 0 | } |
2363 | 0 | } |
2364 | 0 | hs->existence_checked = true; |
2365 | | |
2366 | | /* Support timestamping */ |
2367 | 0 | if (opt.timestamping && !hs->timestamp_checked) |
2368 | 0 | { |
2369 | 0 | uerr_t timestamp_err = set_file_timestamp (hs); |
2370 | 0 | if (timestamp_err != RETROK) |
2371 | 0 | return timestamp_err; |
2372 | 0 | } |
2373 | 0 | return RETROK; |
2374 | 0 | } |
2375 | | |
2376 | | static uerr_t |
2377 | | check_auth (const struct url *u, char *user, char *passwd, struct response *resp, |
2378 | | struct request *req, bool *ntlm_seen_ref, bool *retry, |
2379 | | bool *basic_auth_finished_ref, bool *auth_finished_ref) |
2380 | 0 | { |
2381 | 0 | uerr_t auth_err = RETROK; |
2382 | 0 | bool basic_auth_finished = *basic_auth_finished_ref; |
2383 | 0 | bool auth_finished = *auth_finished_ref; |
2384 | 0 | bool ntlm_seen = *ntlm_seen_ref; |
2385 | 0 | char buf[256], *tmp = NULL; |
2386 | |
|
2387 | 0 | *retry = false; |
2388 | |
|
2389 | 0 | if (!auth_finished && (user && passwd)) |
2390 | 0 | { |
2391 | | /* IIS sends multiple copies of WWW-Authenticate, one with |
2392 | | the value "negotiate", and other(s) with data. Loop over |
2393 | | all the occurrences and pick the one we recognize. */ |
2394 | 0 | int wapos; |
2395 | 0 | const char *www_authenticate = NULL; |
2396 | 0 | const char *wabeg, *waend; |
2397 | 0 | const char *digest = NULL, *basic = NULL, *ntlm = NULL; |
2398 | |
|
2399 | 0 | for (wapos = 0; !ntlm |
2400 | 0 | && (wapos = resp_header_locate (resp, "WWW-Authenticate", wapos, |
2401 | 0 | &wabeg, &waend)) != -1; |
2402 | 0 | ++wapos) |
2403 | 0 | { |
2404 | 0 | param_token name, value; |
2405 | 0 | size_t len = waend - wabeg; |
2406 | |
|
2407 | 0 | if (tmp != buf) |
2408 | 0 | xfree (tmp); |
2409 | |
|
2410 | 0 | if (len < sizeof (buf)) |
2411 | 0 | tmp = buf; |
2412 | 0 | else |
2413 | 0 | tmp = xmalloc (len + 1); |
2414 | |
|
2415 | 0 | memcpy (tmp, wabeg, len); |
2416 | 0 | tmp[len] = 0; |
2417 | |
|
2418 | 0 | www_authenticate = tmp; |
2419 | |
|
2420 | 0 | for (;!ntlm;) |
2421 | 0 | { |
2422 | | /* extract the auth-scheme */ |
2423 | 0 | while (c_isspace (*www_authenticate)) www_authenticate++; |
2424 | 0 | name.e = name.b = www_authenticate; |
2425 | 0 | while (*name.e && !c_isspace (*name.e)) name.e++; |
2426 | |
|
2427 | 0 | if (name.b == name.e) |
2428 | 0 | break; |
2429 | | |
2430 | 0 | DEBUGP (("Auth scheme found '%.*s'\n", (int) (name.e - name.b), name.b)); |
2431 | |
|
2432 | 0 | if (known_authentication_scheme_p (name.b, name.e)) |
2433 | 0 | { |
2434 | 0 | if (BEGINS_WITH (name.b, "NTLM")) |
2435 | 0 | { |
2436 | 0 | ntlm = name.b; |
2437 | 0 | break; /* this is the most secure challenge, stop here */ |
2438 | 0 | } |
2439 | 0 | else if (!digest && BEGINS_WITH (name.b, "Digest")) |
2440 | 0 | digest = name.b; |
2441 | 0 | else if (!basic && BEGINS_WITH (name.b, "Basic")) |
2442 | 0 | basic = name.b; |
2443 | 0 | } |
2444 | | |
2445 | | /* now advance over the auth-params */ |
2446 | 0 | www_authenticate = name.e; |
2447 | 0 | DEBUGP (("Auth param list '%s'\n", www_authenticate)); |
2448 | 0 | while (extract_param (&www_authenticate, &name, &value, ',', NULL) && name.b && value.b) |
2449 | 0 | { |
2450 | 0 | DEBUGP (("Auth param %.*s=%.*s\n", |
2451 | 0 | (int) (name.e - name.b), name.b, (int) (value.e - value.b), value.b)); |
2452 | 0 | } |
2453 | 0 | } |
2454 | 0 | } |
2455 | |
|
2456 | 0 | if (!basic && !digest && !ntlm) |
2457 | 0 | { |
2458 | | /* If the authentication header is missing or |
2459 | | unrecognized, there's no sense in retrying. */ |
2460 | 0 | logputs (LOG_NOTQUIET, _("Unknown authentication scheme.\n")); |
2461 | 0 | } |
2462 | 0 | else if (!basic_auth_finished |
2463 | 0 | || !basic) |
2464 | 0 | { |
2465 | 0 | char *pth = url_full_path (u); |
2466 | 0 | const char *value; |
2467 | 0 | uerr_t *auth_stat; |
2468 | 0 | auth_stat = xmalloc (sizeof (uerr_t)); |
2469 | 0 | *auth_stat = RETROK; |
2470 | |
|
2471 | 0 | if (ntlm) |
2472 | 0 | www_authenticate = ntlm; |
2473 | 0 | else if (digest) |
2474 | 0 | www_authenticate = digest; |
2475 | 0 | else |
2476 | 0 | www_authenticate = basic; |
2477 | |
|
2478 | 0 | logprintf (LOG_NOTQUIET, _("Authentication selected: %s\n"), www_authenticate); |
2479 | |
|
2480 | 0 | value = create_authorization_line (www_authenticate, |
2481 | 0 | user, passwd, |
2482 | 0 | request_method (req), |
2483 | 0 | pth, |
2484 | 0 | &auth_finished, |
2485 | 0 | auth_stat); |
2486 | |
|
2487 | 0 | auth_err = *auth_stat; |
2488 | 0 | xfree (auth_stat); |
2489 | 0 | xfree (pth); |
2490 | 0 | if (auth_err == RETROK) |
2491 | 0 | { |
2492 | 0 | request_set_header (req, "Authorization", value, rel_value); |
2493 | |
|
2494 | 0 | if (BEGINS_WITH (www_authenticate, "NTLM")) |
2495 | 0 | ntlm_seen = true; |
2496 | 0 | else if (!u->user && BEGINS_WITH (www_authenticate, "Basic")) |
2497 | 0 | { |
2498 | | /* Need to register this host as using basic auth, |
2499 | | * so we automatically send creds next time. */ |
2500 | 0 | register_basic_auth_host (u->host); |
2501 | 0 | } |
2502 | |
|
2503 | 0 | *retry = true; |
2504 | 0 | goto cleanup; |
2505 | 0 | } |
2506 | 0 | else |
2507 | 0 | { |
2508 | | /* Creating the Authorization header went wrong */ |
2509 | 0 | xfree (value); |
2510 | 0 | } |
2511 | 0 | } |
2512 | 0 | else |
2513 | 0 | { |
2514 | | /* We already did Basic auth, and it failed. Gotta |
2515 | | * give up. */ |
2516 | 0 | } |
2517 | 0 | } |
2518 | | |
2519 | 0 | cleanup: |
2520 | 0 | if (tmp != buf) |
2521 | 0 | xfree (tmp); |
2522 | 0 | *ntlm_seen_ref = ntlm_seen; |
2523 | 0 | *basic_auth_finished_ref = basic_auth_finished; |
2524 | 0 | *auth_finished_ref = auth_finished; |
2525 | 0 | return auth_err; |
2526 | 0 | } |
2527 | | |
2528 | | static uerr_t |
2529 | | open_output_stream (struct http_stat *hs, int count, FILE **fp) |
2530 | 0 | { |
2531 | | /* 2005-06-17 SMS. |
2532 | | For VMS, define common fopen() optional arguments. |
2533 | | */ |
2534 | | #ifdef __VMS |
2535 | | # define FOPEN_OPT_ARGS "fop=sqo", "acc", acc_cb, &open_id |
2536 | | # define FOPEN_BIN_FLAG 3 |
2537 | | #else /* def __VMS */ |
2538 | 0 | # define FOPEN_BIN_FLAG true |
2539 | 0 | #endif /* def __VMS [else] */ |
2540 | | |
2541 | | /* Open the local file. */ |
2542 | 0 | if (!output_stream) |
2543 | 0 | { |
2544 | 0 | mkalldirs (hs->local_file); |
2545 | 0 | if (opt.backups) |
2546 | 0 | rotate_backups (hs->local_file); |
2547 | 0 | if (hs->restval) |
2548 | 0 | { |
2549 | | #ifdef __VMS |
2550 | | int open_id; |
2551 | | |
2552 | | open_id = 21; |
2553 | | *fp = fopen (hs->local_file, "ab", FOPEN_OPT_ARGS); |
2554 | | #else /* def __VMS */ |
2555 | 0 | *fp = fopen (hs->local_file, "ab"); |
2556 | 0 | #endif /* def __VMS [else] */ |
2557 | 0 | } |
2558 | 0 | else if (ALLOW_CLOBBER || count > 0) |
2559 | 0 | { |
2560 | 0 | if (opt.unlink_requested && file_exists_p (hs->local_file, NULL)) |
2561 | 0 | { |
2562 | 0 | if (unlink (hs->local_file) < 0) |
2563 | 0 | { |
2564 | 0 | logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, |
2565 | 0 | strerror (errno)); |
2566 | 0 | return UNLINKERR; |
2567 | 0 | } |
2568 | 0 | } |
2569 | | |
2570 | | #ifdef __VMS |
2571 | | int open_id; |
2572 | | |
2573 | | open_id = 22; |
2574 | | *fp = fopen (hs->local_file, "wb", FOPEN_OPT_ARGS); |
2575 | | #else /* def __VMS */ |
2576 | 0 | if (hs->temporary) |
2577 | 0 | { |
2578 | 0 | *fp = fdopen (open (hs->local_file, O_BINARY | O_CREAT | O_TRUNC | O_WRONLY, S_IRUSR | S_IWUSR), "wb"); |
2579 | 0 | } |
2580 | 0 | else |
2581 | 0 | { |
2582 | 0 | *fp = fopen (hs->local_file, "wb"); |
2583 | 0 | } |
2584 | |
|
2585 | 0 | #endif /* def __VMS [else] */ |
2586 | 0 | } |
2587 | 0 | else |
2588 | 0 | { |
2589 | 0 | *fp = fopen_excl (hs->local_file, FOPEN_BIN_FLAG); |
2590 | 0 | if (!*fp && errno == EEXIST) |
2591 | 0 | { |
2592 | | /* We cannot just invent a new name and use it (which is |
2593 | | what functions like unique_create typically do) |
2594 | | because we told the user we'd use this name. |
2595 | | Instead, return and retry the download. */ |
2596 | 0 | logprintf (LOG_NOTQUIET, |
2597 | 0 | _("%s has sprung into existence.\n"), |
2598 | 0 | hs->local_file); |
2599 | 0 | return FOPEN_EXCL_ERR; |
2600 | 0 | } |
2601 | 0 | } |
2602 | 0 | if (!*fp) |
2603 | 0 | { |
2604 | 0 | logprintf (LOG_NOTQUIET, "%s: %s\n", hs->local_file, strerror (errno)); |
2605 | 0 | return FOPENERR; |
2606 | 0 | } |
2607 | 0 | } |
2608 | 0 | else |
2609 | 0 | *fp = output_stream; |
2610 | | |
2611 | | /* Print fetch message, if opt.verbose. */ |
2612 | 0 | logprintf (LOG_VERBOSE, _("Saving to: %s\n"), |
2613 | 0 | HYPHENP (hs->local_file) ? quote ("STDOUT") : quote (hs->local_file)); |
2614 | |
|
2615 | 0 | return RETROK; |
2616 | 0 | } |
2617 | | |
2618 | | /* Set proper type flags based on type string. */ |
2619 | | static void |
2620 | | set_content_type (int *dt, const char *type) |
2621 | 0 | { |
2622 | | /* If content-type is not given, assume text/html. This is because |
2623 | | of the multitude of broken CGI's that "forget" to generate the |
2624 | | content-type. */ |
2625 | 0 | if (!type || |
2626 | 0 | 0 == c_strcasecmp (type, TEXTHTML_S) || |
2627 | 0 | 0 == c_strcasecmp (type, TEXTXHTML_S)) |
2628 | 0 | *dt |= TEXTHTML; |
2629 | 0 | else |
2630 | 0 | *dt &= ~TEXTHTML; |
2631 | |
|
2632 | 0 | if (type && |
2633 | 0 | 0 == c_strcasecmp (type, TEXTCSS_S)) |
2634 | 0 | *dt |= TEXTCSS; |
2635 | 0 | else |
2636 | 0 | *dt &= ~TEXTCSS; |
2637 | 0 | } |
2638 | | |
2639 | | #ifdef HAVE_METALINK |
2640 | | /* Will return proper metalink_t structure if enough data was found in |
2641 | | http response resp. Otherwise returns NULL. |
2642 | | Two exit points: one for success and one for failure. */ |
2643 | | static metalink_t * |
2644 | | metalink_from_http (const struct response *resp, const struct http_stat *hs, |
2645 | | const struct url *u) |
2646 | | { |
2647 | | metalink_t *metalink = NULL; |
2648 | | metalink_file_t *mfile = xnew0 (metalink_file_t); |
2649 | | const char *val_beg, *val_end; |
2650 | | int res_count = 0, meta_count = 0, hash_count = 0, sig_count = 0, i; |
2651 | | |
2652 | | DEBUGP (("Checking for Metalink in HTTP response\n")); |
2653 | | |
2654 | | /* Initialize metalink file for our simple use case. */ |
2655 | | if (hs->local_file) |
2656 | | mfile->name = xstrdup (hs->local_file); |
2657 | | else |
2658 | | mfile->name = url_file_name (u, NULL); |
2659 | | |
2660 | | /* Begin with 1-element array (for 0-termination). */ |
2661 | | mfile->checksums = xnew0 (metalink_checksum_t *); |
2662 | | mfile->resources = xnew0 (metalink_resource_t *); |
2663 | | mfile->metaurls = xnew0 (metalink_metaurl_t *); |
2664 | | |
2665 | | /* Process the Content-Type header. */ |
2666 | | if (resp_header_locate (resp, "Content-Type", 0, &val_beg, &val_end) != -1) |
2667 | | { |
2668 | | metalink_metaurl_t murl = {0}; |
2669 | | |
2670 | | const char *type_beg, *type_end; |
2671 | | char *typestr = NULL; |
2672 | | char *namestr = NULL; |
2673 | | size_t type_len; |
2674 | | |
2675 | | DEBUGP (("Processing Content-Type header...\n")); |
2676 | | |
2677 | | /* Find beginning of type. */ |
2678 | | type_beg = val_beg; |
2679 | | while (type_beg < val_end && c_isspace (*type_beg)) |
2680 | | type_beg++; |
2681 | | |
2682 | | /* Find end of type. */ |
2683 | | type_end = type_beg + 1; |
2684 | | while (type_end < val_end && |
2685 | | *type_end != ';' && |
2686 | | *type_end != ' ' && |
2687 | | *type_end != '\r' && |
2688 | | *type_end != '\n') |
2689 | | type_end++; |
2690 | | |
2691 | | if (type_beg >= val_end || type_end > val_end) |
2692 | | { |
2693 | | DEBUGP (("Invalid Content-Type header. Ignoring.\n")); |
2694 | | goto skip_content_type; |
2695 | | } |
2696 | | |
2697 | | type_len = type_end - type_beg; |
2698 | | typestr = xstrndup (type_beg, type_len); |
2699 | | |
2700 | | DEBUGP (("Content-Type: %s\n", typestr)); |
2701 | | |
2702 | | if (strcmp (typestr, "application/metalink4+xml")) |
2703 | | { |
2704 | | xfree (typestr); |
2705 | | goto skip_content_type; |
2706 | | } |
2707 | | |
2708 | | /* |
2709 | | Valid ranges for the "pri" attribute are from |
2710 | | 1 to 999999. Mirror servers with a lower value of the "pri" |
2711 | | attribute have a higher priority, while mirrors with an undefined |
2712 | | "pri" attribute are considered to have a value of 999999, which is |
2713 | | the lowest priority. |
2714 | | |
2715 | | rfc6249 section 3.1 |
2716 | | */ |
2717 | | murl.priority = DEFAULT_PRI; |
2718 | | |
2719 | | murl.mediatype = typestr; |
2720 | | typestr = NULL; |
2721 | | |
2722 | | if (opt.content_disposition |
2723 | | && resp_header_locate (resp, "Content-Disposition", 0, &val_beg, &val_end) != -1) |
2724 | | { |
2725 | | find_key_value (val_beg, val_end, "filename", &namestr); |
2726 | | murl.name = namestr; |
2727 | | namestr = NULL; |
2728 | | } |
2729 | | |
2730 | | murl.url = xstrdup (u->url); |
2731 | | |
2732 | | DEBUGP (("URL=%s\n", murl.url)); |
2733 | | DEBUGP (("MEDIATYPE=%s\n", murl.mediatype)); |
2734 | | DEBUGP (("NAME=%s\n", murl.name ? murl.name : "")); |
2735 | | DEBUGP (("PRIORITY=%d\n", murl.priority)); |
2736 | | |
2737 | | /* 1 slot from new resource, 1 slot for null-termination. */ |
2738 | | mfile->metaurls = xrealloc (mfile->metaurls, |
2739 | | sizeof (metalink_metaurl_t *) * (meta_count + 2)); |
2740 | | mfile->metaurls[meta_count] = xnew0 (metalink_metaurl_t); |
2741 | | *mfile->metaurls[meta_count] = murl; |
2742 | | meta_count++; |
2743 | | } |
2744 | | skip_content_type: |
2745 | | |
2746 | | /* Find all Link headers. */ |
2747 | | for (i = 0; |
2748 | | (i = resp_header_locate (resp, "Link", i, &val_beg, &val_end)) != -1; |
2749 | | i++) |
2750 | | { |
2751 | | char *rel = NULL, *reltype = NULL; |
2752 | | char *urlstr = NULL; |
2753 | | const char *url_beg, *url_end, *attrs_beg; |
2754 | | size_t url_len; |
2755 | | |
2756 | | /* Sample Metalink Link headers: |
2757 | | |
2758 | | Link: <http://www2.example.com/dir1/dir2/dir3/dir4/dir5/example.ext>; |
2759 | | rel=duplicate; pri=1; pref; geo=gb; depth=4 |
2760 | | |
2761 | | Link: <http://example.com/example.ext.asc>; rel=describedby; |
2762 | | type="application/pgp-signature" |
2763 | | */ |
2764 | | |
2765 | | /* Find beginning of URL. */ |
2766 | | url_beg = val_beg; |
2767 | | while (url_beg < val_end - 1 && c_isspace (*url_beg)) |
2768 | | url_beg++; |
2769 | | |
2770 | | /* Find end of URL. */ |
2771 | | /* The convention here is that end ptr points to one element after |
2772 | | end of string. In this case, it should be pointing to the '>', which |
2773 | | is one element after end of actual URL. Therefore, it should never point |
2774 | | to val_end, which is one element after entire header value string. */ |
2775 | | url_end = url_beg + 1; |
2776 | | while (url_end < val_end - 1 && *url_end != '>') |
2777 | | url_end++; |
2778 | | |
2779 | | if (url_beg >= val_end || url_end >= val_end || |
2780 | | *url_beg != '<' || *url_end != '>') |
2781 | | { |
2782 | | DEBUGP (("This is not a valid Link header. Ignoring.\n")); |
2783 | | continue; |
2784 | | } |
2785 | | |
2786 | | /* Skip <. */ |
2787 | | url_beg++; |
2788 | | url_len = url_end - url_beg; |
2789 | | |
2790 | | /* URL found. Now handle the attributes. */ |
2791 | | attrs_beg = url_end + 1; |
2792 | | |
2793 | | /* First we need to find out what type of link it is. Currently, we |
2794 | | support rel=duplicate and rel=describedby. */ |
2795 | | if (!find_key_value (attrs_beg, val_end, "rel", &rel)) |
2796 | | { |
2797 | | DEBUGP (("No rel value in Link header, skipping.\n")); |
2798 | | continue; |
2799 | | } |
2800 | | |
2801 | | urlstr = xstrndup (url_beg, url_len); |
2802 | | DEBUGP (("URL=%s\n", urlstr)); |
2803 | | DEBUGP (("rel=%s\n", rel)); |
2804 | | |
2805 | | if (!strcmp (rel, "describedby")) |
2806 | | find_key_value (attrs_beg, val_end, "type", &reltype); |
2807 | | |
2808 | | /* Handle signatures. |
2809 | | Libmetalink only supports one signature per file. Therefore we stop |
2810 | | as soon as we successfully get first supported signature. */ |
2811 | | if (sig_count == 0 && |
2812 | | reltype && !strcmp (reltype, "application/pgp-signature")) |
2813 | | { |
2814 | | /* Download the signature to a temporary file. */ |
2815 | | FILE *_output_stream = output_stream; |
2816 | | bool _output_stream_regular = output_stream_regular; |
2817 | | |
2818 | | output_stream = tmpfile (); |
2819 | | if (output_stream) |
2820 | | { |
2821 | | struct iri *iri = iri_new (); |
2822 | | struct url *url; |
2823 | | int url_err; |
2824 | | |
2825 | | set_uri_encoding (iri, opt.locale, true); |
2826 | | url = url_parse (urlstr, &url_err, iri, false); |
2827 | | |
2828 | | if (!url) |
2829 | | { |
2830 | | logprintf (LOG_NOTQUIET, _("When downloading signature:\n" |
2831 | | "%s: %s.\n"), urlstr, url_error (url_err)); |
2832 | | iri_free (iri); |
2833 | | } |
2834 | | else |
2835 | | { |
2836 | | /* Avoid recursive Metalink from HTTP headers. */ |
2837 | | bool _metalink_http = opt.metalink_over_http; |
2838 | | uerr_t retr_err; |
2839 | | |
2840 | | opt.metalink_over_http = false; |
2841 | | retr_err = retrieve_url (url, urlstr, NULL, NULL, |
2842 | | NULL, NULL, false, iri, false); |
2843 | | opt.metalink_over_http = _metalink_http; |
2844 | | |
2845 | | url_free (url); |
2846 | | iri_free (iri); |
2847 | | |
2848 | | if (retr_err == RETROK) |
2849 | | { |
2850 | | /* Signature is in the temporary file. Read it into |
2851 | | metalink resource structure. */ |
2852 | | metalink_signature_t msig; |
2853 | | size_t siglen; |
2854 | | |
2855 | | fseek (output_stream, 0, SEEK_END); |
2856 | | siglen = ftell (output_stream); |
2857 | | fseek (output_stream, 0, SEEK_SET); |
2858 | | |
2859 | | DEBUGP (("siglen=%lu\n", siglen)); |
2860 | | |
2861 | | msig.signature = xmalloc (siglen + 1); |
2862 | | if (fread (msig.signature, siglen, 1, output_stream) != 1) |
2863 | | { |
2864 | | logputs (LOG_NOTQUIET, |
2865 | | _("Unable to read signature content from " |
2866 | | "temporary file. Skipping.\n")); |
2867 | | xfree (msig.signature); |
2868 | | } |
2869 | | else |
2870 | | { |
2871 | | msig.signature[siglen] = '\0'; /* Just in case. */ |
2872 | | msig.mediatype = xstrdup ("application/pgp-signature"); |
2873 | | |
2874 | | DEBUGP (("Signature (%s):\n%s\n", |
2875 | | msig.mediatype, msig.signature)); |
2876 | | |
2877 | | mfile->signature = xnew (metalink_signature_t); |
2878 | | *mfile->signature = msig; |
2879 | | |
2880 | | sig_count++; |
2881 | | } |
2882 | | } |
2883 | | } |
2884 | | fclose (output_stream); |
2885 | | } |
2886 | | else |
2887 | | { |
2888 | | logputs (LOG_NOTQUIET, _("Could not create temporary file. " |
2889 | | "Skipping signature download.\n")); |
2890 | | } |
2891 | | output_stream_regular = _output_stream_regular; |
2892 | | output_stream = _output_stream; |
2893 | | } /* Iterate over signatures. */ |
2894 | | |
2895 | | /* Handle Metalink resources. */ |
2896 | | else if (!strcmp (rel, "duplicate")) |
2897 | | { |
2898 | | metalink_resource_t mres = {0}; |
2899 | | char *pristr; |
2900 | | |
2901 | | /* |
2902 | | Valid ranges for the "pri" attribute are from |
2903 | | 1 to 999999. Mirror servers with a lower value of the "pri" |
2904 | | attribute have a higher priority, while mirrors with an undefined |
2905 | | "pri" attribute are considered to have a value of 999999, which is |
2906 | | the lowest priority. |
2907 | | |
2908 | | rfc6249 section 3.1 |
2909 | | */ |
2910 | | mres.priority = DEFAULT_PRI; |
2911 | | if (find_key_value (url_end, val_end, "pri", &pristr)) |
2912 | | { |
2913 | | long pri; |
2914 | | char *end_pristr; |
2915 | | /* Do not care for errno since 0 is error in this case. */ |
2916 | | pri = strtol (pristr, &end_pristr, 10); |
2917 | | if (end_pristr != pristr + strlen (pristr) || |
2918 | | !VALID_PRI_RANGE (pri)) |
2919 | | { |
2920 | | /* This is against the specification, so let's inform the user. */ |
2921 | | logprintf (LOG_NOTQUIET, |
2922 | | _("Invalid pri value. Assuming %d.\n"), |
2923 | | DEFAULT_PRI); |
2924 | | } |
2925 | | else |
2926 | | mres.priority = pri; |
2927 | | xfree (pristr); |
2928 | | } |
2929 | | |
2930 | | switch (url_scheme (urlstr)) |
2931 | | { |
2932 | | case SCHEME_HTTP: |
2933 | | mres.type = xstrdup ("http"); |
2934 | | break; |
2935 | | #ifdef HAVE_SSL |
2936 | | case SCHEME_HTTPS: |
2937 | | mres.type = xstrdup ("https"); |
2938 | | break; |
2939 | | case SCHEME_FTPS: |
2940 | | mres.type = xstrdup ("ftps"); |
2941 | | break; |
2942 | | #endif |
2943 | | case SCHEME_FTP: |
2944 | | mres.type = xstrdup ("ftp"); |
2945 | | break; |
2946 | | default: |
2947 | | DEBUGP (("Unsupported url scheme in %s. Skipping resource.\n", urlstr)); |
2948 | | } |
2949 | | |
2950 | | if (mres.type) |
2951 | | { |
2952 | | DEBUGP (("TYPE=%s\n", mres.type)); |
2953 | | |
2954 | | /* At this point we have validated the new resource. */ |
2955 | | |
2956 | | find_key_value (url_end, val_end, "geo", &mres.location); |
2957 | | |
2958 | | mres.url = urlstr; |
2959 | | urlstr = NULL; |
2960 | | |
2961 | | mres.preference = 0; |
2962 | | if (has_key (url_end, val_end, "pref")) |
2963 | | { |
2964 | | DEBUGP (("This resource has preference\n")); |
2965 | | mres.preference = 1; |
2966 | | } |
2967 | | |
2968 | | /* 1 slot from new resource, 1 slot for null-termination. */ |
2969 | | mfile->resources = xrealloc (mfile->resources, |
2970 | | sizeof (metalink_resource_t *) * (res_count + 2)); |
2971 | | mfile->resources[res_count] = xnew0 (metalink_resource_t); |
2972 | | *mfile->resources[res_count] = mres; |
2973 | | res_count++; |
2974 | | } |
2975 | | } /* Handle resource link (rel=duplicate). */ |
2976 | | |
2977 | | /* Handle Metalink/XML resources. */ |
2978 | | else if (reltype && !strcmp (reltype, "application/metalink4+xml")) |
2979 | | { |
2980 | | metalink_metaurl_t murl = {0}; |
2981 | | char *pristr; |
2982 | | |
2983 | | /* |
2984 | | Valid ranges for the "pri" attribute are from |
2985 | | 1 to 999999. Mirror servers with a lower value of the "pri" |
2986 | | attribute have a higher priority, while mirrors with an undefined |
2987 | | "pri" attribute are considered to have a value of 999999, which is |
2988 | | the lowest priority. |
2989 | | |
2990 | | rfc6249 section 3.1 |
2991 | | */ |
2992 | | murl.priority = DEFAULT_PRI; |
2993 | | if (find_key_value (url_end, val_end, "pri", &pristr)) |
2994 | | { |
2995 | | long pri; |
2996 | | char *end_pristr; |
2997 | | /* Do not care for errno since 0 is error in this case. */ |
2998 | | pri = strtol (pristr, &end_pristr, 10); |
2999 | | if (end_pristr != pristr + strlen (pristr) || |
3000 | | !VALID_PRI_RANGE (pri)) |
3001 | | { |
3002 | | /* This is against the specification, so let's inform the user. */ |
3003 | | logprintf (LOG_NOTQUIET, |
3004 | | _("Invalid pri value. Assuming %d.\n"), |
3005 | | DEFAULT_PRI); |
3006 | | } |
3007 | | else |
3008 | | murl.priority = pri; |
3009 | | xfree (pristr); |
3010 | | } |
3011 | | |
3012 | | murl.mediatype = xstrdup (reltype); |
3013 | | |
3014 | | DEBUGP (("MEDIATYPE=%s\n", murl.mediatype)); |
3015 | | |
3016 | | /* At this point we have validated the new resource. */ |
3017 | | |
3018 | | find_key_value (url_end, val_end, "name", &murl.name); |
3019 | | |
3020 | | murl.url = urlstr; |
3021 | | urlstr = NULL; |
3022 | | |
3023 | | /* 1 slot from new resource, 1 slot for null-termination. */ |
3024 | | mfile->metaurls = xrealloc (mfile->metaurls, |
3025 | | sizeof (metalink_metaurl_t *) * (meta_count + 2)); |
3026 | | mfile->metaurls[meta_count] = xnew0 (metalink_metaurl_t); |
3027 | | *mfile->metaurls[meta_count] = murl; |
3028 | | meta_count++; |
3029 | | } /* Handle resource link (rel=describedby). */ |
3030 | | else |
3031 | | DEBUGP (("This link header was not used for Metalink\n")); |
3032 | | |
3033 | | xfree (urlstr); |
3034 | | xfree (reltype); |
3035 | | xfree (rel); |
3036 | | } /* Iterate over link headers. */ |
3037 | | |
3038 | | /* Null-terminate resources array. */ |
3039 | | mfile->resources[res_count] = 0; |
3040 | | mfile->metaurls[meta_count] = 0; |
3041 | | |
3042 | | if (res_count == 0 && meta_count == 0) |
3043 | | { |
3044 | | DEBUGP (("No valid metalink references found.\n")); |
3045 | | goto fail; |
3046 | | } |
3047 | | |
3048 | | /* Find all Digest headers. */ |
3049 | | for (i = 0; |
3050 | | (i = resp_header_locate (resp, "Digest", i, &val_beg, &val_end)) != -1; |
3051 | | i++) |
3052 | | { |
3053 | | const char *dig_pos; |
3054 | | char *dig_type, *dig_hash; |
3055 | | |
3056 | | /* Each Digest header can include multiple hashes. Example: |
3057 | | Digest: SHA=thvDyvhfIqlvFe+A9MYgxAfm1q5=,unixsum=30637 |
3058 | | Digest: md5=HUXZLQLMuI/KZ5KDcJPcOA== |
3059 | | */ |
3060 | | for (dig_pos = val_beg; |
3061 | | (dig_pos = find_key_values (dig_pos, val_end, &dig_type, &dig_hash)); |
3062 | | dig_pos++) |
3063 | | { |
3064 | | /* The hash here is assumed to be base64. We need the hash in hex. |
3065 | | Therefore we convert: base64 -> binary -> hex. */ |
3066 | | const size_t dig_hash_str_len = strlen (dig_hash); |
3067 | | char bin_hash[256]; |
3068 | | ssize_t hash_bin_len; |
3069 | | |
3070 | | // there is no hash with that size |
3071 | | if (dig_hash_str_len >= sizeof (bin_hash)) |
3072 | | { |
3073 | | DEBUGP (("Hash too long, ignored.\n")); |
3074 | | xfree (dig_type); |
3075 | | xfree (dig_hash); |
3076 | | continue; |
3077 | | } |
3078 | | |
3079 | | hash_bin_len = wget_base64_decode (dig_hash, bin_hash, dig_hash_str_len * 3 / 4 + 1); |
3080 | | |
3081 | | /* Detect malformed base64 input. */ |
3082 | | if (hash_bin_len < 0) |
3083 | | { |
3084 | | DEBUGP (("Malformed base64 input, ignored.\n")); |
3085 | | xfree (dig_type); |
3086 | | xfree (dig_hash); |
3087 | | continue; |
3088 | | } |
3089 | | |
3090 | | /* One slot for me, one for zero-termination. */ |
3091 | | mfile->checksums = |
3092 | | xrealloc (mfile->checksums, |
3093 | | sizeof (metalink_checksum_t *) * (hash_count + 2)); |
3094 | | mfile->checksums[hash_count] = xnew (metalink_checksum_t); |
3095 | | mfile->checksums[hash_count]->type = dig_type; |
3096 | | |
3097 | | mfile->checksums[hash_count]->hash = xmalloc ((size_t)hash_bin_len * 2 + 1); |
3098 | | wg_hex_to_string (mfile->checksums[hash_count]->hash, bin_hash, (size_t)hash_bin_len); |
3099 | | |
3100 | | xfree (dig_hash); |
3101 | | |
3102 | | hash_count++; |
3103 | | } |
3104 | | } |
3105 | | |
3106 | | /* Zero-terminate checksums array. */ |
3107 | | mfile->checksums[hash_count] = 0; |
3108 | | |
3109 | | /* |
3110 | | If Instance Digests are not provided by the Metalink servers, the |
3111 | | Link header fields pertaining to this specification MUST be ignored. |
3112 | | |
3113 | | rfc6249 section 6 |
3114 | | */ |
3115 | | if (res_count && hash_count == 0) |
3116 | | { |
3117 | | logputs (LOG_VERBOSE, |
3118 | | _("Could not find acceptable digest for Metalink resources.\n" |
3119 | | "Ignoring them.\n")); |
3120 | | goto fail; |
3121 | | } |
3122 | | |
3123 | | /* Metalink data is OK. Now we just need to sort the resources based |
3124 | | on their priorities, preference, and perhaps location. */ |
3125 | | stable_sort (mfile->resources, res_count, sizeof (metalink_resource_t *), metalink_res_cmp); |
3126 | | stable_sort (mfile->metaurls, meta_count, sizeof (metalink_metaurl_t *), metalink_meta_cmp); |
3127 | | |
3128 | | /* Restore sensible preference values (in case someone cares to look). */ |
3129 | | for (i = 0; i < res_count; ++i) |
3130 | | mfile->resources[i]->preference = 1000000 - mfile->resources[i]->priority; |
3131 | | |
3132 | | metalink = xnew0 (metalink_t); |
3133 | | metalink->files = xmalloc (sizeof (metalink_file_t *) * 2); |
3134 | | metalink->files[0] = mfile; |
3135 | | metalink->files[1] = 0; |
3136 | | metalink->origin = xstrdup (u->url); |
3137 | | metalink->version = METALINK_VERSION_4; |
3138 | | /* Leave other fields set to 0. */ |
3139 | | |
3140 | | return metalink; |
3141 | | |
3142 | | fail: |
3143 | | /* Free all allocated memory. */ |
3144 | | if (metalink) |
3145 | | metalink_delete (metalink); |
3146 | | else |
3147 | | metalink_file_delete (mfile); |
3148 | | return NULL; |
3149 | | } |
3150 | | #endif /* HAVE_METALINK */ |
3151 | | |
3152 | | /* Retrieve a document through HTTP protocol. It recognizes status |
3153 | | code, and correctly handles redirections. It closes the network |
3154 | | socket. If it receives an error from the functions below it, it |
3155 | | will print it if there is enough information to do so (almost |
3156 | | always), returning the error to the caller (i.e. http_loop). |
3157 | | |
3158 | | Various HTTP parameters are stored to hs. |
3159 | | |
3160 | | If PROXY is non-NULL, the connection will be made to the proxy |
3161 | | server, and u->url will be requested. */ |
3162 | | static uerr_t |
3163 | | gethttp (const struct url *u, struct url *original_url, struct http_stat *hs, |
3164 | | int *dt, struct url *proxy, struct iri *iri, int count) |
3165 | 0 | { |
3166 | 0 | struct request *req = NULL; |
3167 | |
|
3168 | 0 | char *type = NULL; |
3169 | 0 | char *user, *passwd; |
3170 | 0 | char *proxyauth; |
3171 | 0 | int statcode; |
3172 | 0 | int write_error; |
3173 | 0 | wgint contlen, contrange; |
3174 | 0 | const struct url *conn; |
3175 | 0 | FILE *fp; |
3176 | 0 | int err; |
3177 | 0 | uerr_t retval; |
3178 | 0 | #ifdef HAVE_HSTS |
3179 | 0 | #ifdef TESTING |
3180 | | /* we don't link against main.o when we're testing */ |
3181 | 0 | hsts_store_t hsts_store = NULL; |
3182 | | #else |
3183 | | extern hsts_store_t hsts_store; |
3184 | | #endif |
3185 | 0 | #endif |
3186 | |
|
3187 | 0 | int sock = -1; |
3188 | | |
3189 | | /* Set to 1 when the authorization has already been sent and should |
3190 | | not be tried again. */ |
3191 | 0 | bool auth_finished = false; |
3192 | | |
3193 | | /* Set to 1 when just globally-set Basic authorization has been sent; |
3194 | | * should prevent further Basic negotiations, but not other |
3195 | | * mechanisms. */ |
3196 | 0 | bool basic_auth_finished = false; |
3197 | | |
3198 | | /* Whether NTLM authentication is used for this request. */ |
3199 | 0 | bool ntlm_seen = false; |
3200 | | |
3201 | | /* Whether our connection to the remote host is through SSL. */ |
3202 | 0 | bool using_ssl = false; |
3203 | | |
3204 | | /* Whether a HEAD request will be issued (as opposed to GET or |
3205 | | POST). */ |
3206 | 0 | bool head_only = !!(*dt & HEAD_ONLY); |
3207 | | |
3208 | | /* Whether conditional get request will be issued. */ |
3209 | 0 | bool cond_get = !!(*dt & IF_MODIFIED_SINCE); |
3210 | |
|
3211 | | #ifdef HAVE_METALINK |
3212 | | /* Are we looking for metalink info in HTTP headers? */ |
3213 | | bool metalink = !!(*dt & METALINK_METADATA); |
3214 | | #endif |
3215 | |
|
3216 | 0 | char *head = NULL; |
3217 | 0 | struct response *resp = NULL; |
3218 | 0 | char hdrval[512]; |
3219 | 0 | char *message = NULL; |
3220 | | |
3221 | | /* Declare WARC variables. */ |
3222 | 0 | bool warc_enabled = (opt.warc_filename != NULL); |
3223 | 0 | FILE *warc_tmp = NULL; |
3224 | 0 | char warc_timestamp_str [21]; |
3225 | 0 | char warc_request_uuid [48]; |
3226 | 0 | ip_address warc_ip_buf, *warc_ip = NULL; |
3227 | 0 | off_t warc_payload_offset = -1; |
3228 | | |
3229 | | /* Whether this connection will be kept alive after the HTTP request |
3230 | | is done. */ |
3231 | 0 | bool keep_alive; |
3232 | | |
3233 | | /* Is the server using the chunked transfer encoding? */ |
3234 | 0 | bool chunked_transfer_encoding = false; |
3235 | | |
3236 | | /* Whether keep-alive should be inhibited. */ |
3237 | 0 | bool inhibit_keep_alive = |
3238 | 0 | !opt.http_keep_alive || opt.ignore_length; |
3239 | | |
3240 | | /* Headers sent when using POST. */ |
3241 | 0 | wgint body_data_size = 0; |
3242 | |
|
3243 | 0 | #ifdef HAVE_SSL |
3244 | 0 | if (u->scheme == SCHEME_HTTPS) |
3245 | 0 | { |
3246 | | /* Initialize the SSL context. After this has once been done, |
3247 | | it becomes a no-op. */ |
3248 | 0 | if (!ssl_init ()) |
3249 | 0 | { |
3250 | 0 | scheme_disable (SCHEME_HTTPS); |
3251 | 0 | logprintf (LOG_NOTQUIET, |
3252 | 0 | _("Disabling SSL due to encountered errors.\n")); |
3253 | 0 | retval = SSLINITFAILED; |
3254 | 0 | goto cleanup; |
3255 | 0 | } |
3256 | 0 | } |
3257 | 0 | #endif /* HAVE_SSL */ |
3258 | | |
3259 | | /* Initialize certain elements of struct http_stat. |
3260 | | * Since this function is called in a loop, we have to xfree certain |
3261 | | * members. */ |
3262 | 0 | hs->len = 0; |
3263 | 0 | hs->contlen = -1; |
3264 | 0 | hs->res = -1; |
3265 | 0 | xfree (hs->rderrmsg); |
3266 | 0 | xfree (hs->newloc); |
3267 | 0 | xfree (hs->remote_time); |
3268 | 0 | xfree (hs->error); |
3269 | 0 | xfree (hs->message); |
3270 | 0 | hs->local_encoding = ENC_NONE; |
3271 | 0 | hs->remote_encoding = ENC_NONE; |
3272 | |
|
3273 | 0 | conn = u; |
3274 | |
|
3275 | 0 | { |
3276 | 0 | uerr_t ret; |
3277 | 0 | req = initialize_request (u, hs, dt, proxy, inhibit_keep_alive, |
3278 | 0 | &basic_auth_finished, &body_data_size, |
3279 | 0 | &user, &passwd, &ret); |
3280 | 0 | if (req == NULL) |
3281 | 0 | { |
3282 | 0 | retval = ret; |
3283 | 0 | goto cleanup; |
3284 | 0 | } |
3285 | 0 | } |
3286 | 0 | retry_with_auth: |
3287 | | /* We need to come back here when the initial attempt to retrieve |
3288 | | without authorization header fails. (Expected to happen at least |
3289 | | for the Digest authorization scheme.) */ |
3290 | |
|
3291 | 0 | if (opt.cookies) |
3292 | 0 | request_set_header (req, "Cookie", |
3293 | 0 | cookie_header (wget_cookie_jar, |
3294 | 0 | u->host, u->port, u->path, |
3295 | 0 | #ifdef HAVE_SSL |
3296 | 0 | u->scheme == SCHEME_HTTPS |
3297 | | #else |
3298 | | 0 |
3299 | | #endif |
3300 | 0 | ), |
3301 | 0 | rel_value); |
3302 | | |
3303 | | /* Add the user headers. */ |
3304 | 0 | if (opt.user_headers) |
3305 | 0 | { |
3306 | 0 | int i; |
3307 | 0 | for (i = 0; opt.user_headers[i]; i++) |
3308 | 0 | request_set_user_header (req, opt.user_headers[i]); |
3309 | 0 | } |
3310 | |
|
3311 | 0 | proxyauth = NULL; |
3312 | 0 | if (proxy) |
3313 | 0 | { |
3314 | 0 | conn = proxy; |
3315 | 0 | initialize_proxy_configuration (u, req, proxy, &proxyauth); |
3316 | 0 | } |
3317 | 0 | keep_alive = true; |
3318 | | |
3319 | | /* Establish the connection. */ |
3320 | 0 | if (inhibit_keep_alive) |
3321 | 0 | keep_alive = false; |
3322 | |
|
3323 | 0 | { |
3324 | 0 | uerr_t conn_err = establish_connection (u, &conn, hs, proxy, &proxyauth, &req, |
3325 | 0 | &using_ssl, inhibit_keep_alive, &sock); |
3326 | 0 | if (conn_err != RETROK) |
3327 | 0 | { |
3328 | 0 | retval = conn_err; |
3329 | 0 | goto cleanup; |
3330 | 0 | } |
3331 | 0 | } |
3332 | | |
3333 | | /* Open the temporary file where we will write the request. */ |
3334 | 0 | if (warc_enabled) |
3335 | 0 | { |
3336 | 0 | warc_tmp = warc_tempfile (); |
3337 | 0 | if (warc_tmp == NULL) |
3338 | 0 | { |
3339 | 0 | CLOSE_INVALIDATE (sock); |
3340 | 0 | retval = WARC_TMP_FOPENERR; |
3341 | 0 | goto cleanup; |
3342 | 0 | } |
3343 | | |
3344 | 0 | if (! proxy) |
3345 | 0 | { |
3346 | 0 | warc_ip = &warc_ip_buf; |
3347 | 0 | socket_ip_address (sock, warc_ip, ENDPOINT_PEER); |
3348 | 0 | } |
3349 | 0 | } |
3350 | | |
3351 | | /* Send the request to server. */ |
3352 | 0 | write_error = request_send (req, sock, warc_tmp); |
3353 | |
|
3354 | 0 | if (write_error >= 0) |
3355 | 0 | { |
3356 | 0 | if (opt.body_data) |
3357 | 0 | { |
3358 | 0 | DEBUGP (("[BODY data: %s]\n", opt.body_data)); |
3359 | 0 | write_error = fd_write (sock, opt.body_data, body_data_size, -1); |
3360 | 0 | if (write_error >= 0 && warc_tmp != NULL) |
3361 | 0 | { |
3362 | 0 | int warc_tmp_written; |
3363 | | |
3364 | | /* Remember end of headers / start of payload. */ |
3365 | 0 | warc_payload_offset = ftello (warc_tmp); |
3366 | | |
3367 | | /* Write a copy of the data to the WARC record. */ |
3368 | 0 | warc_tmp_written = fwrite (opt.body_data, 1, body_data_size, warc_tmp); |
3369 | 0 | if (warc_tmp_written != body_data_size) |
3370 | 0 | write_error = -2; |
3371 | 0 | } |
3372 | 0 | } |
3373 | 0 | else if (opt.body_file && body_data_size != 0) |
3374 | 0 | { |
3375 | 0 | if (warc_tmp != NULL) |
3376 | | /* Remember end of headers / start of payload */ |
3377 | 0 | warc_payload_offset = ftello (warc_tmp); |
3378 | |
|
3379 | 0 | write_error = body_file_send (sock, opt.body_file, body_data_size, warc_tmp); |
3380 | 0 | } |
3381 | 0 | } |
3382 | |
|
3383 | 0 | if (write_error < 0) |
3384 | 0 | { |
3385 | 0 | CLOSE_INVALIDATE (sock); |
3386 | |
|
3387 | 0 | if (warc_tmp != NULL) |
3388 | 0 | fclose (warc_tmp); |
3389 | |
|
3390 | 0 | if (write_error == -2) |
3391 | 0 | retval = WARC_TMP_FWRITEERR; |
3392 | 0 | else |
3393 | 0 | retval = WRITEFAILED; |
3394 | 0 | goto cleanup; |
3395 | 0 | } |
3396 | 0 | logprintf (LOG_VERBOSE, _("%s request sent, awaiting response... "), |
3397 | 0 | proxy ? "Proxy" : "HTTP"); |
3398 | 0 | contlen = -1; |
3399 | 0 | contrange = 0; |
3400 | 0 | *dt &= ~RETROKF; |
3401 | | |
3402 | |
|
3403 | 0 | if (warc_enabled) |
3404 | 0 | { |
3405 | 0 | bool warc_result; |
3406 | | |
3407 | | /* Generate a timestamp and uuid for this request. */ |
3408 | 0 | warc_timestamp (warc_timestamp_str, sizeof (warc_timestamp_str)); |
3409 | 0 | warc_uuid_str (warc_request_uuid, sizeof (warc_request_uuid)); |
3410 | | |
3411 | | /* Create a request record and store it in the WARC file. */ |
3412 | 0 | warc_result = warc_write_request_record (u->url, warc_timestamp_str, |
3413 | 0 | warc_request_uuid, warc_ip, |
3414 | 0 | warc_tmp, warc_payload_offset); |
3415 | 0 | if (! warc_result) |
3416 | 0 | { |
3417 | 0 | CLOSE_INVALIDATE (sock); |
3418 | 0 | retval = WARC_ERR; |
3419 | 0 | goto cleanup; |
3420 | 0 | } |
3421 | | |
3422 | | /* warc_write_request_record has also closed warc_tmp. */ |
3423 | 0 | } |
3424 | | |
3425 | | /* Repeat while we receive a 10x response code. */ |
3426 | 0 | { |
3427 | 0 | bool _repeat; |
3428 | |
|
3429 | 0 | do |
3430 | 0 | { |
3431 | 0 | head = read_http_response_head (sock); |
3432 | 0 | if (!head) |
3433 | 0 | { |
3434 | 0 | if (errno == 0) |
3435 | 0 | { |
3436 | 0 | logputs (LOG_NOTQUIET, _("No data received.\n")); |
3437 | 0 | CLOSE_INVALIDATE (sock); |
3438 | 0 | retval = HEOF; |
3439 | 0 | } |
3440 | 0 | else |
3441 | 0 | { |
3442 | 0 | logprintf (LOG_NOTQUIET, _("Read error (%s) in headers.\n"), |
3443 | 0 | fd_errstr (sock)); |
3444 | 0 | CLOSE_INVALIDATE (sock); |
3445 | 0 | retval = HERR; |
3446 | 0 | } |
3447 | 0 | goto cleanup; |
3448 | 0 | } |
3449 | 0 | DEBUGP (("\n---response begin---\n%s---response end---\n", head)); |
3450 | |
|
3451 | 0 | resp = resp_new (head); |
3452 | | |
3453 | | /* Check for status line. */ |
3454 | 0 | xfree (message); |
3455 | 0 | statcode = resp_status (resp, &message); |
3456 | 0 | if (statcode < 0) |
3457 | 0 | { |
3458 | 0 | char *tms = datetime_str (time (NULL)); |
3459 | 0 | logprintf (LOG_VERBOSE, "%d\n", statcode); |
3460 | 0 | logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), tms, statcode, |
3461 | 0 | quotearg_style (escape_quoting_style, |
3462 | 0 | _("Malformed status line"))); |
3463 | 0 | CLOSE_INVALIDATE (sock); |
3464 | 0 | retval = HERR; |
3465 | 0 | goto cleanup; |
3466 | 0 | } |
3467 | | |
3468 | 0 | if (H_10X (statcode)) |
3469 | 0 | { |
3470 | 0 | xfree (head); |
3471 | 0 | resp_free (&resp); |
3472 | 0 | _repeat = true; |
3473 | 0 | DEBUGP (("Ignoring response\n")); |
3474 | 0 | } |
3475 | 0 | else |
3476 | 0 | { |
3477 | 0 | _repeat = false; |
3478 | 0 | } |
3479 | 0 | } |
3480 | 0 | while (_repeat); |
3481 | 0 | } |
3482 | | |
3483 | 0 | xfree (hs->message); |
3484 | 0 | hs->message = xstrdup (message); |
3485 | 0 | if (!opt.server_response) |
3486 | 0 | logprintf (LOG_VERBOSE, "%2d %s\n", statcode, |
3487 | 0 | message ? quotearg_style (escape_quoting_style, message) : ""); |
3488 | 0 | else |
3489 | 0 | { |
3490 | 0 | logprintf (LOG_VERBOSE, "\n"); |
3491 | 0 | print_server_response (resp, " "); |
3492 | 0 | } |
3493 | |
|
3494 | 0 | if (!opt.ignore_length |
3495 | 0 | && resp_header_copy (resp, "Content-Length", hdrval, sizeof (hdrval))) |
3496 | 0 | { |
3497 | 0 | wgint parsed; |
3498 | 0 | errno = 0; |
3499 | 0 | parsed = str_to_wgint (hdrval, NULL, 10); |
3500 | 0 | if (parsed == WGINT_MAX && errno == ERANGE) |
3501 | 0 | { |
3502 | | /* Out of range. |
3503 | | #### If Content-Length is out of range, it most likely |
3504 | | means that the file is larger than 2G and that we're |
3505 | | compiled without LFS. In that case we should probably |
3506 | | refuse to even attempt to download the file. */ |
3507 | 0 | contlen = -1; |
3508 | 0 | } |
3509 | 0 | else if (parsed < 0) |
3510 | 0 | { |
3511 | | /* Negative Content-Length; nonsensical, so we can't |
3512 | | assume any information about the content to receive. */ |
3513 | 0 | contlen = -1; |
3514 | 0 | } |
3515 | 0 | else |
3516 | 0 | contlen = parsed; |
3517 | 0 | } |
3518 | | |
3519 | | /* Check for keep-alive related responses. */ |
3520 | 0 | if (!inhibit_keep_alive) |
3521 | 0 | { |
3522 | 0 | if (resp_header_copy (resp, "Connection", hdrval, sizeof (hdrval))) |
3523 | 0 | { |
3524 | 0 | if (0 == c_strcasecmp (hdrval, "Close")) |
3525 | 0 | keep_alive = false; |
3526 | 0 | } |
3527 | 0 | } |
3528 | |
|
3529 | 0 | chunked_transfer_encoding = false; |
3530 | 0 | if (resp_header_copy (resp, "Transfer-Encoding", hdrval, sizeof (hdrval)) |
3531 | 0 | && 0 == c_strcasecmp (hdrval, "chunked")) |
3532 | 0 | chunked_transfer_encoding = true; |
3533 | | |
3534 | | /* Handle (possibly multiple instances of) the Set-Cookie header. */ |
3535 | 0 | if (opt.cookies) |
3536 | 0 | { |
3537 | 0 | int scpos; |
3538 | 0 | const char *scbeg, *scend; |
3539 | | /* The jar should have been created by now. */ |
3540 | 0 | assert (wget_cookie_jar != NULL); |
3541 | 0 | for (scpos = 0; |
3542 | 0 | (scpos = resp_header_locate (resp, "Set-Cookie", scpos, |
3543 | 0 | &scbeg, &scend)) != -1; |
3544 | 0 | ++scpos) |
3545 | 0 | { |
3546 | 0 | char buf[1024], *set_cookie; |
3547 | 0 | size_t len = scend - scbeg; |
3548 | |
|
3549 | 0 | if (len < sizeof (buf)) |
3550 | 0 | set_cookie = buf; |
3551 | 0 | else |
3552 | 0 | set_cookie = xmalloc (len + 1); |
3553 | |
|
3554 | 0 | memcpy (set_cookie, scbeg, len); |
3555 | 0 | set_cookie[len] = 0; |
3556 | |
|
3557 | 0 | cookie_handle_set_cookie (wget_cookie_jar, u->host, u->port, |
3558 | 0 | u->path, set_cookie); |
3559 | |
|
3560 | 0 | if (set_cookie != buf) |
3561 | 0 | xfree (set_cookie); |
3562 | 0 | } |
3563 | 0 | } |
3564 | |
|
3565 | 0 | if (keep_alive) |
3566 | | /* The server has promised that it will not close the connection |
3567 | | when we're done. This means that we can register it. */ |
3568 | 0 | register_persistent (conn->host, conn->port, sock, using_ssl); |
3569 | |
|
3570 | | #ifdef HAVE_METALINK |
3571 | | /* We need to check for the Metalink data in the very first response |
3572 | | we get from the server (before redirections, authorization, etc.). */ |
3573 | | if (metalink) |
3574 | | { |
3575 | | hs->metalink = metalink_from_http (resp, hs, u); |
3576 | | /* Bugfix: hs->local_file is NULL (opt.content_disposition). */ |
3577 | | if (!hs->local_file && hs->metalink && hs->metalink->origin) |
3578 | | hs->local_file = xstrdup (hs->metalink->origin); |
3579 | | xfree (hs->message); |
3580 | | retval = RETR_WITH_METALINK; |
3581 | | CLOSE_FINISH (sock); |
3582 | | goto cleanup; |
3583 | | } |
3584 | | #endif |
3585 | |
|
3586 | 0 | if (statcode == HTTP_STATUS_UNAUTHORIZED) |
3587 | 0 | { |
3588 | | /* Authorization is required. */ |
3589 | 0 | uerr_t auth_err = RETROK; |
3590 | 0 | bool retry; |
3591 | | /* Normally we are not interested in the response body. |
3592 | | But if we are writing a WARC file we are: we like to keep everything. */ |
3593 | 0 | if (warc_enabled) |
3594 | 0 | { |
3595 | 0 | int _err; |
3596 | 0 | type = resp_header_strdup (resp, "Content-Type"); |
3597 | 0 | _err = read_response_body (hs, sock, NULL, contlen, 0, |
3598 | 0 | chunked_transfer_encoding, |
3599 | 0 | u->url, warc_timestamp_str, |
3600 | 0 | warc_request_uuid, warc_ip, type, |
3601 | 0 | statcode, head); |
3602 | 0 | xfree (type); |
3603 | |
|
3604 | 0 | if (_err != RETRFINISHED || hs->res < 0) |
3605 | 0 | { |
3606 | 0 | CLOSE_INVALIDATE (sock); |
3607 | 0 | retval = _err; |
3608 | 0 | goto cleanup; |
3609 | 0 | } |
3610 | 0 | else |
3611 | 0 | CLOSE_FINISH (sock); |
3612 | 0 | } |
3613 | 0 | else |
3614 | 0 | { |
3615 | | /* Since WARC is disabled, we are not interested in the response body. */ |
3616 | 0 | if (keep_alive && !head_only |
3617 | 0 | && skip_short_body (sock, contlen, chunked_transfer_encoding)) |
3618 | 0 | CLOSE_FINISH (sock); |
3619 | 0 | else |
3620 | 0 | CLOSE_INVALIDATE (sock); |
3621 | 0 | } |
3622 | | |
3623 | 0 | pconn.authorized = false; |
3624 | |
|
3625 | 0 | { |
3626 | 0 | auth_err = check_auth (u, user, passwd, resp, req, |
3627 | 0 | &ntlm_seen, &retry, |
3628 | 0 | &basic_auth_finished, |
3629 | 0 | &auth_finished); |
3630 | 0 | if (auth_err == RETROK && retry) |
3631 | 0 | { |
3632 | 0 | resp_free (&resp); |
3633 | 0 | xfree (message); |
3634 | 0 | xfree (head); |
3635 | 0 | goto retry_with_auth; |
3636 | 0 | } |
3637 | 0 | } |
3638 | 0 | if (auth_err == RETROK) |
3639 | 0 | retval = AUTHFAILED; |
3640 | 0 | else |
3641 | 0 | retval = auth_err; |
3642 | 0 | goto cleanup; |
3643 | 0 | } |
3644 | 0 | else /* statcode != HTTP_STATUS_UNAUTHORIZED */ |
3645 | 0 | { |
3646 | | /* Kludge: if NTLM is used, mark the TCP connection as authorized. */ |
3647 | 0 | if (ntlm_seen) |
3648 | 0 | pconn.authorized = true; |
3649 | 0 | } |
3650 | | |
3651 | 0 | { |
3652 | 0 | uerr_t ret = check_file_output (u, hs, resp, hdrval, sizeof hdrval); |
3653 | 0 | if (ret != RETROK) |
3654 | 0 | { |
3655 | 0 | retval = ret; |
3656 | 0 | goto cleanup; |
3657 | 0 | } |
3658 | 0 | } |
3659 | | |
3660 | 0 | hs->statcode = statcode; |
3661 | 0 | xfree (hs->error); |
3662 | 0 | if (statcode == -1) |
3663 | 0 | hs->error = xstrdup (_("Malformed status line")); |
3664 | 0 | else if (!message || !*message) |
3665 | 0 | hs->error = xstrdup (_("(no description)")); |
3666 | 0 | else |
3667 | 0 | hs->error = xstrdup (message); |
3668 | |
|
3669 | 0 | #ifdef HAVE_HSTS |
3670 | 0 | if (opt.hsts && hsts_store) |
3671 | 0 | { |
3672 | 0 | int64_t max_age; |
3673 | 0 | const char *hsts_params = resp_header_strdup (resp, "Strict-Transport-Security"); |
3674 | 0 | bool include_subdomains; |
3675 | |
|
3676 | 0 | if (parse_strict_transport_security (hsts_params, &max_age, &include_subdomains)) |
3677 | 0 | { |
3678 | | /* process strict transport security */ |
3679 | 0 | if (hsts_store_entry (hsts_store, u->scheme, u->host, u->port, max_age, include_subdomains)) |
3680 | 0 | DEBUGP(("Added new HSTS host: %s:%" PRIu32 " (max-age: %" PRId64 ", includeSubdomains: %s)\n", |
3681 | 0 | u->host, |
3682 | 0 | (uint32_t) u->port, |
3683 | 0 | max_age, |
3684 | 0 | (include_subdomains ? "true" : "false"))); |
3685 | 0 | else |
3686 | 0 | DEBUGP(("Updated HSTS host: %s:%" PRIu32 " (max-age: %" PRId64 ", includeSubdomains: %s)\n", |
3687 | 0 | u->host, |
3688 | 0 | (uint32_t) u->port, |
3689 | 0 | max_age, |
3690 | 0 | (include_subdomains ? "true" : "false"))); |
3691 | 0 | } |
3692 | 0 | xfree (hsts_params); |
3693 | 0 | } |
3694 | 0 | #endif |
3695 | |
|
3696 | 0 | type = resp_header_strdup (resp, "Content-Type"); |
3697 | 0 | if (type) |
3698 | 0 | { |
3699 | 0 | char *tmp = strchr (type, ';'); |
3700 | 0 | if (tmp) |
3701 | 0 | { |
3702 | 0 | #ifdef ENABLE_IRI |
3703 | | /* sXXXav: only needed if IRI support is enabled */ |
3704 | 0 | char *tmp2 = tmp + 1; |
3705 | 0 | #endif |
3706 | |
|
3707 | 0 | while (tmp > type && c_isspace (tmp[-1])) |
3708 | 0 | --tmp; |
3709 | 0 | *tmp = '\0'; |
3710 | |
|
3711 | 0 | #ifdef ENABLE_IRI |
3712 | | /* Try to get remote encoding if needed */ |
3713 | 0 | if (opt.enable_iri && !opt.encoding_remote) |
3714 | 0 | { |
3715 | 0 | tmp = parse_charset (tmp2); |
3716 | 0 | if (tmp) |
3717 | 0 | set_content_encoding (iri, tmp); |
3718 | 0 | xfree (tmp); |
3719 | 0 | } |
3720 | 0 | #endif |
3721 | 0 | } |
3722 | 0 | } |
3723 | 0 | xfree (hs->newloc); |
3724 | 0 | hs->newloc = resp_header_strdup (resp, "Location"); |
3725 | 0 | xfree (hs->remote_time); |
3726 | 0 | hs->remote_time = resp_header_strdup (resp, "Last-Modified"); |
3727 | 0 | if (!hs->remote_time) // now look for the Wayback Machine's timestamp |
3728 | 0 | hs->remote_time = resp_header_strdup (resp, "X-Archive-Orig-last-modified"); |
3729 | |
|
3730 | 0 | if (resp_header_copy (resp, "Content-Range", hdrval, sizeof (hdrval))) |
3731 | 0 | { |
3732 | 0 | wgint first_byte_pos, last_byte_pos, entity_length; |
3733 | 0 | if (parse_content_range (hdrval, &first_byte_pos, &last_byte_pos, |
3734 | 0 | &entity_length)) |
3735 | 0 | { |
3736 | 0 | contrange = first_byte_pos; |
3737 | 0 | contlen = last_byte_pos - first_byte_pos + 1; |
3738 | 0 | } |
3739 | 0 | } |
3740 | |
|
3741 | 0 | if (resp_header_copy (resp, "Content-Encoding", hdrval, sizeof (hdrval))) |
3742 | 0 | { |
3743 | 0 | hs->local_encoding = ENC_INVALID; |
3744 | |
|
3745 | 0 | switch (hdrval[0]) |
3746 | 0 | { |
3747 | 0 | case 'b': case 'B': |
3748 | 0 | if (0 == c_strcasecmp(hdrval, "br")) |
3749 | 0 | hs->local_encoding = ENC_BROTLI; |
3750 | 0 | break; |
3751 | 0 | case 'c': case 'C': |
3752 | 0 | if (0 == c_strcasecmp(hdrval, "compress")) |
3753 | 0 | hs->local_encoding = ENC_COMPRESS; |
3754 | 0 | break; |
3755 | 0 | case 'd': case 'D': |
3756 | 0 | if (0 == c_strcasecmp(hdrval, "deflate")) |
3757 | 0 | hs->local_encoding = ENC_DEFLATE; |
3758 | 0 | break; |
3759 | 0 | case 'g': case 'G': |
3760 | 0 | if (0 == c_strcasecmp(hdrval, "gzip")) |
3761 | 0 | hs->local_encoding = ENC_GZIP; |
3762 | 0 | break; |
3763 | 0 | case 'i': case 'I': |
3764 | 0 | if (0 == c_strcasecmp(hdrval, "identity")) |
3765 | 0 | hs->local_encoding = ENC_NONE; |
3766 | 0 | break; |
3767 | 0 | case 'x': case 'X': |
3768 | 0 | if (0 == c_strcasecmp(hdrval, "x-compress")) |
3769 | 0 | hs->local_encoding = ENC_COMPRESS; |
3770 | 0 | else if (0 == c_strcasecmp(hdrval, "x-gzip")) |
3771 | 0 | hs->local_encoding = ENC_GZIP; |
3772 | 0 | break; |
3773 | 0 | case '\0': |
3774 | 0 | hs->local_encoding = ENC_NONE; |
3775 | 0 | } |
3776 | | |
3777 | 0 | if (hs->local_encoding == ENC_INVALID) |
3778 | 0 | { |
3779 | 0 | DEBUGP (("Unrecognized Content-Encoding: %s\n", hdrval)); |
3780 | 0 | hs->local_encoding = ENC_NONE; |
3781 | 0 | } |
3782 | 0 | #ifdef HAVE_LIBZ |
3783 | 0 | else if (hs->local_encoding == ENC_GZIP |
3784 | 0 | && opt.compression != compression_none) |
3785 | 0 | { |
3786 | 0 | const char *p; |
3787 | | |
3788 | | /* Make sure the Content-Type is not gzip before decompressing */ |
3789 | 0 | if (type) |
3790 | 0 | { |
3791 | 0 | p = strchr (type, '/'); |
3792 | 0 | if (p == NULL) |
3793 | 0 | { |
3794 | 0 | hs->remote_encoding = ENC_GZIP; |
3795 | 0 | hs->local_encoding = ENC_NONE; |
3796 | 0 | } |
3797 | 0 | else |
3798 | 0 | { |
3799 | 0 | p++; |
3800 | 0 | if (c_tolower(p[0]) == 'x' && p[1] == '-') |
3801 | 0 | p += 2; |
3802 | 0 | if (0 != c_strcasecmp (p, "gzip")) |
3803 | 0 | { |
3804 | 0 | hs->remote_encoding = ENC_GZIP; |
3805 | 0 | hs->local_encoding = ENC_NONE; |
3806 | 0 | } |
3807 | 0 | } |
3808 | 0 | } |
3809 | 0 | else |
3810 | 0 | { |
3811 | 0 | hs->remote_encoding = ENC_GZIP; |
3812 | 0 | hs->local_encoding = ENC_NONE; |
3813 | 0 | } |
3814 | | |
3815 | | /* don't uncompress if a file ends with '.gz' or '.tgz' */ |
3816 | 0 | if (hs->remote_encoding == ENC_GZIP |
3817 | 0 | && (p = strrchr(u->file, '.')) |
3818 | 0 | && (c_strcasecmp(p, ".gz") == 0 || c_strcasecmp(p, ".tgz") == 0)) |
3819 | 0 | { |
3820 | 0 | DEBUGP (("Enabling broken server workaround. Will not decompress this GZip file.\n")); |
3821 | 0 | hs->remote_encoding = ENC_NONE; |
3822 | 0 | } |
3823 | 0 | } |
3824 | 0 | #endif |
3825 | 0 | } |
3826 | | |
3827 | | /* 20x responses are counted among successful by default. */ |
3828 | 0 | if (H_20X (statcode)) |
3829 | 0 | *dt |= RETROKF; |
3830 | |
|
3831 | 0 | if (statcode == HTTP_STATUS_NO_CONTENT) |
3832 | 0 | { |
3833 | | /* 204 response has no body (RFC 2616, 4.3) */ |
3834 | | |
3835 | | /* In case the caller cares to look... */ |
3836 | 0 | hs->len = 0; |
3837 | 0 | hs->res = 0; |
3838 | 0 | hs->restval = 0; |
3839 | |
|
3840 | 0 | CLOSE_FINISH (sock); |
3841 | |
|
3842 | 0 | retval = RETRFINISHED; |
3843 | 0 | goto cleanup; |
3844 | 0 | } |
3845 | | |
3846 | | /* Return if redirected. */ |
3847 | 0 | if (H_REDIRECTED (statcode) || statcode == HTTP_STATUS_MULTIPLE_CHOICES) |
3848 | 0 | { |
3849 | | /* RFC2068 says that in case of the 300 (multiple choices) |
3850 | | response, the server can output a preferred URL through |
3851 | | `Location' header; otherwise, the request should be treated |
3852 | | like GET. So, if the location is set, it will be a |
3853 | | redirection; otherwise, just proceed normally. */ |
3854 | 0 | if (statcode == HTTP_STATUS_MULTIPLE_CHOICES && !hs->newloc) |
3855 | 0 | *dt |= RETROKF; |
3856 | 0 | else |
3857 | 0 | { |
3858 | 0 | logprintf (LOG_VERBOSE, |
3859 | 0 | _("Location: %s%s\n"), |
3860 | 0 | hs->newloc ? escnonprint_uri (hs->newloc) : _("unspecified"), |
3861 | 0 | hs->newloc ? _(" [following]") : ""); |
3862 | | |
3863 | | /* In case the caller cares to look... */ |
3864 | 0 | hs->len = 0; |
3865 | 0 | hs->res = 0; |
3866 | 0 | hs->restval = 0; |
3867 | | |
3868 | | /* Normally we are not interested in the response body of a redirect. |
3869 | | But if we are writing a WARC file we are: we like to keep everything. */ |
3870 | 0 | if (warc_enabled) |
3871 | 0 | { |
3872 | 0 | int _err = read_response_body (hs, sock, NULL, contlen, 0, |
3873 | 0 | chunked_transfer_encoding, |
3874 | 0 | u->url, warc_timestamp_str, |
3875 | 0 | warc_request_uuid, warc_ip, type, |
3876 | 0 | statcode, head); |
3877 | |
|
3878 | 0 | if (_err != RETRFINISHED || hs->res < 0) |
3879 | 0 | { |
3880 | 0 | CLOSE_INVALIDATE (sock); |
3881 | 0 | retval = _err; |
3882 | 0 | goto cleanup; |
3883 | 0 | } |
3884 | 0 | else |
3885 | 0 | CLOSE_FINISH (sock); |
3886 | 0 | } |
3887 | 0 | else |
3888 | 0 | { |
3889 | | /* Since WARC is disabled, we are not interested in the response body. */ |
3890 | 0 | if (keep_alive && !head_only |
3891 | 0 | && skip_short_body (sock, contlen, chunked_transfer_encoding)) |
3892 | 0 | CLOSE_FINISH (sock); |
3893 | 0 | else |
3894 | 0 | CLOSE_INVALIDATE (sock); |
3895 | 0 | } |
3896 | | |
3897 | | /* From RFC2616: The status codes 303 and 307 have |
3898 | | been added for servers that wish to make unambiguously |
3899 | | clear which kind of reaction is expected of the client. |
3900 | | |
3901 | | A 307 should be redirected using the same method, |
3902 | | in other words, a POST should be preserved and not |
3903 | | converted to a GET in that case. |
3904 | | |
3905 | | With strict adherence to RFC2616, POST requests are not |
3906 | | converted to a GET request on 301 Permanent Redirect |
3907 | | or 302 Temporary Redirect. |
3908 | | |
3909 | | A switch may be provided later based on the HTTPbis draft |
3910 | | that allows clients to convert POST requests to GET |
3911 | | requests on 301 and 302 response codes. */ |
3912 | 0 | switch (statcode) |
3913 | 0 | { |
3914 | 0 | case HTTP_STATUS_TEMPORARY_REDIRECT: |
3915 | 0 | case HTTP_STATUS_PERMANENT_REDIRECT: |
3916 | 0 | retval = NEWLOCATION_KEEP_POST; |
3917 | 0 | goto cleanup; |
3918 | 0 | case HTTP_STATUS_MOVED_PERMANENTLY: |
3919 | 0 | if (opt.method && c_strcasecmp (opt.method, "post") != 0) |
3920 | 0 | { |
3921 | 0 | retval = NEWLOCATION_KEEP_POST; |
3922 | 0 | goto cleanup; |
3923 | 0 | } |
3924 | 0 | break; |
3925 | 0 | case HTTP_STATUS_MOVED_TEMPORARILY: |
3926 | 0 | if (opt.method && c_strcasecmp (opt.method, "post") != 0) |
3927 | 0 | { |
3928 | 0 | retval = NEWLOCATION_KEEP_POST; |
3929 | 0 | goto cleanup; |
3930 | 0 | } |
3931 | 0 | break; |
3932 | 0 | } |
3933 | 0 | retval = NEWLOCATION; |
3934 | 0 | goto cleanup; |
3935 | 0 | } |
3936 | 0 | } |
3937 | | |
3938 | 0 | if (cond_get) |
3939 | 0 | { |
3940 | 0 | if (statcode == HTTP_STATUS_NOT_MODIFIED) |
3941 | 0 | { |
3942 | 0 | logprintf (LOG_VERBOSE, |
3943 | 0 | _ ("File %s not modified on server. Omitting download.\n\n"), |
3944 | 0 | quote (hs->local_file)); |
3945 | 0 | *dt |= RETROKF; |
3946 | 0 | CLOSE_FINISH (sock); |
3947 | 0 | retval = RETRUNNEEDED; |
3948 | 0 | goto cleanup; |
3949 | 0 | } |
3950 | 0 | } |
3951 | | |
3952 | 0 | set_content_type (dt, type); |
3953 | |
|
3954 | 0 | if (opt.adjust_extension) |
3955 | 0 | { |
3956 | 0 | const char *encoding_ext = NULL; |
3957 | 0 | switch (hs->local_encoding) |
3958 | 0 | { |
3959 | 0 | case ENC_INVALID: |
3960 | 0 | case ENC_NONE: |
3961 | 0 | break; |
3962 | 0 | case ENC_BROTLI: |
3963 | 0 | encoding_ext = ".br"; |
3964 | 0 | break; |
3965 | 0 | case ENC_COMPRESS: |
3966 | 0 | encoding_ext = ".Z"; |
3967 | 0 | break; |
3968 | 0 | case ENC_DEFLATE: |
3969 | 0 | encoding_ext = ".zlib"; |
3970 | 0 | break; |
3971 | 0 | case ENC_GZIP: |
3972 | 0 | encoding_ext = ".gz"; |
3973 | 0 | break; |
3974 | 0 | default: |
3975 | 0 | DEBUGP (("No extension found for encoding %d\n", |
3976 | 0 | hs->local_encoding)); |
3977 | 0 | } |
3978 | 0 | if (encoding_ext != NULL) |
3979 | 0 | { |
3980 | 0 | char *file_ext = strrchr (hs->local_file, '.'); |
3981 | | /* strip Content-Encoding extension (it will be re-added later) */ |
3982 | 0 | if (file_ext != NULL && 0 == strcasecmp (file_ext, encoding_ext)) |
3983 | 0 | *file_ext = '\0'; |
3984 | 0 | } |
3985 | 0 | if (*dt & TEXTHTML) |
3986 | | /* -E / --adjust-extension / adjust_extension = on was specified, |
3987 | | and this is a text/html file. If some case-insensitive |
3988 | | variation on ".htm[l]" isn't already the file's suffix, |
3989 | | tack on ".html". */ |
3990 | 0 | { |
3991 | 0 | ensure_extension (hs, ".html", dt); |
3992 | 0 | } |
3993 | 0 | else if (*dt & TEXTCSS) |
3994 | 0 | { |
3995 | 0 | ensure_extension (hs, ".css", dt); |
3996 | 0 | } |
3997 | 0 | if (encoding_ext != NULL) |
3998 | 0 | { |
3999 | 0 | ensure_extension (hs, encoding_ext, dt); |
4000 | 0 | } |
4001 | 0 | } |
4002 | | |
4003 | 0 | if (cond_get) |
4004 | 0 | { |
4005 | | /* Handle the case when server ignores If-Modified-Since header. */ |
4006 | 0 | if (statcode == HTTP_STATUS_OK && hs->remote_time) |
4007 | 0 | { |
4008 | 0 | time_t tmr = http_atotm (hs->remote_time); |
4009 | | |
4010 | | /* Check if the local file is up-to-date based on Last-Modified header |
4011 | | and content length. */ |
4012 | 0 | if (tmr != (time_t) - 1 && tmr <= hs->orig_file_tstamp |
4013 | 0 | && (contlen == -1 || contlen == hs->orig_file_size)) |
4014 | 0 | { |
4015 | 0 | logprintf (LOG_VERBOSE, |
4016 | 0 | _("Server ignored If-Modified-Since header for file %s.\n" |
4017 | 0 | "You might want to add --no-if-modified-since option." |
4018 | 0 | "\n\n"), |
4019 | 0 | quote (hs->local_file)); |
4020 | 0 | *dt |= RETROKF; |
4021 | 0 | CLOSE_INVALIDATE (sock); |
4022 | 0 | retval = RETRUNNEEDED; |
4023 | 0 | goto cleanup; |
4024 | 0 | } |
4025 | 0 | } |
4026 | 0 | } |
4027 | | |
4028 | 0 | if (statcode == HTTP_STATUS_RANGE_NOT_SATISFIABLE |
4029 | 0 | || (!opt.timestamping && hs->restval > 0 && statcode == HTTP_STATUS_OK |
4030 | 0 | && contrange == 0 && contlen >= 0 && hs->restval >= contlen)) |
4031 | 0 | { |
4032 | | /* If `-c' is in use and the file has been fully downloaded (or |
4033 | | the remote file has shrunk), Wget effectively requests bytes |
4034 | | after the end of file and the server response with 416 |
4035 | | (or 200 with a <= Content-Length. */ |
4036 | 0 | logputs (LOG_VERBOSE, _("\ |
4037 | 0 | \n The file is already fully retrieved; nothing to do.\n\n")); |
4038 | | /* In case the caller inspects. */ |
4039 | 0 | hs->len = contlen; |
4040 | 0 | hs->res = 0; |
4041 | | /* Mark as successfully retrieved. */ |
4042 | 0 | *dt |= RETROKF; |
4043 | | |
4044 | | /* Try to maintain the keep-alive connection. It is often cheaper to |
4045 | | * consume some bytes which have already been sent than to negotiate |
4046 | | * a new connection. However, if the body is too large, or we don't |
4047 | | * care about keep-alive, then simply terminate the connection */ |
4048 | 0 | if (keep_alive && |
4049 | 0 | skip_short_body (sock, contlen, chunked_transfer_encoding)) |
4050 | 0 | CLOSE_FINISH (sock); |
4051 | 0 | else |
4052 | 0 | CLOSE_INVALIDATE (sock); |
4053 | 0 | retval = RETRUNNEEDED; |
4054 | 0 | goto cleanup; |
4055 | 0 | } |
4056 | 0 | if ((contrange != 0 && contrange != hs->restval) |
4057 | 0 | || (H_PARTIAL (statcode) && !contrange && hs->restval)) |
4058 | 0 | { |
4059 | | /* The Range request was somehow misunderstood by the server. |
4060 | | Bail out. */ |
4061 | 0 | CLOSE_INVALIDATE (sock); |
4062 | 0 | retval = RANGEERR; |
4063 | 0 | goto cleanup; |
4064 | 0 | } |
4065 | 0 | if (contlen == -1) |
4066 | 0 | hs->contlen = -1; |
4067 | | /* If the response is gzipped, the uncompressed size is unknown. */ |
4068 | 0 | else if (hs->remote_encoding == ENC_GZIP) |
4069 | 0 | hs->contlen = -1; |
4070 | 0 | else |
4071 | 0 | hs->contlen = contlen + contrange; |
4072 | |
|
4073 | 0 | if (opt.verbose) |
4074 | 0 | { |
4075 | 0 | if (*dt & RETROKF) |
4076 | 0 | { |
4077 | | /* No need to print this output if the body won't be |
4078 | | downloaded at all, or if the original server response is |
4079 | | printed. */ |
4080 | 0 | logputs (LOG_VERBOSE, _("Length: ")); |
4081 | 0 | if (contlen != -1) |
4082 | 0 | { |
4083 | 0 | logputs (LOG_VERBOSE, number_to_static_string (contlen + contrange)); |
4084 | 0 | if (contlen + contrange >= 1024) |
4085 | 0 | logprintf (LOG_VERBOSE, " (%s)", |
4086 | 0 | human_readable (contlen + contrange, 10, 1)); |
4087 | 0 | if (contrange) |
4088 | 0 | { |
4089 | 0 | if (contlen >= 1024) |
4090 | 0 | logprintf (LOG_VERBOSE, _(", %s (%s) remaining"), |
4091 | 0 | number_to_static_string (contlen), |
4092 | 0 | human_readable (contlen, 10, 1)); |
4093 | 0 | else |
4094 | 0 | logprintf (LOG_VERBOSE, _(", %s remaining"), |
4095 | 0 | number_to_static_string (contlen)); |
4096 | 0 | } |
4097 | 0 | } |
4098 | 0 | else |
4099 | 0 | logputs (LOG_VERBOSE, |
4100 | 0 | opt.ignore_length ? _("ignored") : _("unspecified")); |
4101 | 0 | if (type) |
4102 | 0 | logprintf (LOG_VERBOSE, " [%s]\n", quotearg_style (escape_quoting_style, type)); |
4103 | 0 | else |
4104 | 0 | logputs (LOG_VERBOSE, "\n"); |
4105 | 0 | } |
4106 | 0 | } |
4107 | | |
4108 | | /* Return if we have no intention of further downloading. */ |
4109 | 0 | if ((!(*dt & RETROKF) && !opt.content_on_error) || head_only || (opt.spider && !opt.recursive)) |
4110 | 0 | { |
4111 | | /* In case the caller cares to look... */ |
4112 | 0 | hs->len = 0; |
4113 | 0 | hs->res = 0; |
4114 | 0 | hs->restval = 0; |
4115 | | |
4116 | | /* Normally we are not interested in the response body of a error responses. |
4117 | | But if we are writing a WARC file we are: we like to keep everything. */ |
4118 | 0 | if (warc_enabled) |
4119 | 0 | { |
4120 | 0 | int _err = read_response_body (hs, sock, NULL, contlen, 0, |
4121 | 0 | chunked_transfer_encoding, |
4122 | 0 | u->url, warc_timestamp_str, |
4123 | 0 | warc_request_uuid, warc_ip, type, |
4124 | 0 | statcode, head); |
4125 | |
|
4126 | 0 | if (_err != RETRFINISHED || hs->res < 0) |
4127 | 0 | { |
4128 | 0 | CLOSE_INVALIDATE (sock); |
4129 | 0 | retval = _err; |
4130 | 0 | goto cleanup; |
4131 | 0 | } |
4132 | | |
4133 | 0 | CLOSE_FINISH (sock); |
4134 | 0 | } |
4135 | 0 | else |
4136 | 0 | { |
4137 | | /* Since WARC is disabled, we are not interested in the response body. */ |
4138 | 0 | if (head_only) |
4139 | | /* Pre-1.10 Wget used CLOSE_INVALIDATE here. Now we trust the |
4140 | | servers not to send body in response to a HEAD request, and |
4141 | | those that do will likely be caught by test_socket_open. |
4142 | | If not, they can be worked around using |
4143 | | `--no-http-keep-alive'. */ |
4144 | 0 | CLOSE_FINISH (sock); |
4145 | 0 | else if (opt.spider && !opt.recursive) |
4146 | | /* we just want to see if the page exists - no downloading required */ |
4147 | 0 | CLOSE_INVALIDATE (sock); |
4148 | 0 | else if (keep_alive |
4149 | 0 | && skip_short_body (sock, contlen, chunked_transfer_encoding)) |
4150 | | /* Successfully skipped the body; also keep using the socket. */ |
4151 | 0 | CLOSE_FINISH (sock); |
4152 | 0 | else |
4153 | 0 | CLOSE_INVALIDATE (sock); |
4154 | 0 | } |
4155 | | |
4156 | 0 | if (statcode == HTTP_STATUS_GATEWAY_TIMEOUT) |
4157 | 0 | retval = GATEWAYTIMEOUT; |
4158 | 0 | else |
4159 | 0 | retval = RETRFINISHED; |
4160 | |
|
4161 | 0 | goto cleanup; |
4162 | 0 | } |
4163 | | |
4164 | 0 | err = open_output_stream (hs, count, &fp); |
4165 | 0 | if (err != RETROK) |
4166 | 0 | { |
4167 | | /* Make sure that errno doesn't get clobbered. |
4168 | | * This is the case for OpenSSL's SSL_shutdown(). */ |
4169 | 0 | int tmp_errno = errno; |
4170 | 0 | CLOSE_INVALIDATE (sock); |
4171 | 0 | errno = tmp_errno; |
4172 | 0 | retval = err; |
4173 | 0 | goto cleanup; |
4174 | 0 | } |
4175 | | |
4176 | 0 | #ifdef ENABLE_XATTR |
4177 | 0 | if (opt.enable_xattr) |
4178 | 0 | { |
4179 | 0 | if (original_url != u) |
4180 | 0 | set_file_metadata (u, original_url, fp); |
4181 | 0 | else |
4182 | 0 | set_file_metadata (u, NULL, fp); |
4183 | 0 | } |
4184 | 0 | #endif |
4185 | |
|
4186 | 0 | err = read_response_body (hs, sock, fp, contlen, contrange, |
4187 | 0 | chunked_transfer_encoding, |
4188 | 0 | u->url, warc_timestamp_str, |
4189 | 0 | warc_request_uuid, warc_ip, type, |
4190 | 0 | statcode, head); |
4191 | |
|
4192 | 0 | if (hs->res >= 0) |
4193 | 0 | CLOSE_FINISH (sock); |
4194 | 0 | else |
4195 | 0 | CLOSE_INVALIDATE (sock); |
4196 | |
|
4197 | 0 | if (!output_stream) |
4198 | 0 | fclose (fp); |
4199 | |
|
4200 | 0 | retval = err; |
4201 | |
|
4202 | 0 | cleanup: |
4203 | 0 | xfree (head); |
4204 | 0 | xfree (type); |
4205 | 0 | xfree (message); |
4206 | 0 | resp_free (&resp); |
4207 | 0 | request_free (&req); |
4208 | |
|
4209 | 0 | return retval; |
4210 | 0 | } |
4211 | | |
4212 | | /* Check whether the supplied HTTP status code is among those |
4213 | | listed for the --retry-on-http-error option. */ |
4214 | | static bool |
4215 | | check_retry_on_http_error (const int statcode) |
4216 | 0 | { |
4217 | 0 | const char *tok = opt.retry_on_http_error; |
4218 | 0 | while (tok && *tok) |
4219 | 0 | { |
4220 | 0 | if (atoi (tok) == statcode) |
4221 | 0 | return true; |
4222 | 0 | if ((tok = strchr (tok, ','))) |
4223 | 0 | ++tok; |
4224 | 0 | } |
4225 | 0 | return false; |
4226 | 0 | } |
4227 | | |
4228 | | /* The genuine HTTP loop! This is the part where the retrieval is |
4229 | | retried, and retried, and retried, and... */ |
4230 | | uerr_t |
4231 | | http_loop (const struct url *u, struct url *original_url, char **newloc, |
4232 | | char **local_file, const char *referer, int *dt, struct url *proxy, |
4233 | | struct iri *iri) |
4234 | 0 | { |
4235 | 0 | int count; |
4236 | 0 | bool got_head = false; /* used for time-stamping and filename detection */ |
4237 | 0 | bool time_came_from_head = false; |
4238 | 0 | bool got_name = false; |
4239 | 0 | char *tms; |
4240 | 0 | const char *tmrate; |
4241 | 0 | uerr_t err, ret = TRYLIMEXC; |
4242 | 0 | time_t tmr = -1; /* remote time-stamp */ |
4243 | 0 | struct http_stat hstat; /* HTTP status */ |
4244 | 0 | struct stat st; |
4245 | 0 | bool send_head_first = true; |
4246 | 0 | bool force_full_retrieve = false; |
4247 | | |
4248 | | |
4249 | | /* If we are writing to a WARC file: always retrieve the whole file. */ |
4250 | 0 | if (opt.warc_filename != NULL) |
4251 | 0 | force_full_retrieve = true; |
4252 | | |
4253 | | |
4254 | | /* Assert that no value for *LOCAL_FILE was passed. */ |
4255 | 0 | assert (local_file == NULL || *local_file == NULL); |
4256 | | |
4257 | | /* Set LOCAL_FILE parameter. */ |
4258 | 0 | if (local_file && opt.output_document) |
4259 | 0 | *local_file = HYPHENP (opt.output_document) ? NULL : xstrdup (opt.output_document); |
4260 | | |
4261 | | /* Reset NEWLOC parameter. */ |
4262 | 0 | *newloc = NULL; |
4263 | | |
4264 | | /* This used to be done in main, but it's a better idea to do it |
4265 | | here so that we don't go through the hoops if we're just using |
4266 | | FTP or whatever. */ |
4267 | 0 | if (opt.cookies) |
4268 | 0 | load_cookies (); |
4269 | | |
4270 | | /* Warn on (likely bogus) wildcard usage in HTTP. */ |
4271 | 0 | if (opt.ftp_glob && has_wildcards_p (u->path)) |
4272 | 0 | logputs (LOG_VERBOSE, _("Warning: wildcards not supported in HTTP.\n")); |
4273 | | |
4274 | | /* Setup hstat struct. */ |
4275 | 0 | xzero (hstat); |
4276 | 0 | hstat.referer = referer; |
4277 | |
|
4278 | 0 | if (opt.output_document) |
4279 | 0 | { |
4280 | 0 | hstat.local_file = xstrdup (opt.output_document); |
4281 | 0 | got_name = true; |
4282 | 0 | } |
4283 | 0 | else if (!opt.content_disposition) |
4284 | 0 | { |
4285 | 0 | hstat.local_file = |
4286 | 0 | url_file_name (opt.trustservernames ? u : original_url, NULL); |
4287 | 0 | got_name = true; |
4288 | 0 | } |
4289 | |
|
4290 | 0 | if (got_name && file_exists_p (hstat.local_file, NULL) && opt.noclobber && !opt.output_document) |
4291 | 0 | { |
4292 | | /* If opt.noclobber is turned on and file already exists, do not |
4293 | | retrieve the file. But if the output_document was given, then this |
4294 | | test was already done and the file didn't exist. Hence the !opt.output_document */ |
4295 | 0 | get_file_flags (hstat.local_file, dt); |
4296 | 0 | ret = RETROK; |
4297 | 0 | goto exit; |
4298 | 0 | } |
4299 | | |
4300 | | /* Reset the counter. */ |
4301 | 0 | count = 0; |
4302 | | |
4303 | | /* Reset the document type. */ |
4304 | 0 | *dt = 0; |
4305 | | |
4306 | | /* Skip preliminary HEAD request if we're not in spider mode. */ |
4307 | 0 | if (!opt.spider) |
4308 | 0 | send_head_first = false; |
4309 | | |
4310 | | /* Send preliminary HEAD request if --content-disposition and -c are used |
4311 | | together. */ |
4312 | 0 | if (opt.content_disposition && opt.always_rest) |
4313 | 0 | send_head_first = true; |
4314 | |
|
4315 | | #ifdef HAVE_METALINK |
4316 | | if (opt.metalink_over_http) |
4317 | | { |
4318 | | *dt |= METALINK_METADATA; |
4319 | | send_head_first = true; |
4320 | | } |
4321 | | #endif |
4322 | |
|
4323 | 0 | if (opt.timestamping) |
4324 | 0 | { |
4325 | | /* Use conditional get request if requested |
4326 | | * and if timestamp is known at this moment. */ |
4327 | 0 | if (opt.if_modified_since && !send_head_first && got_name && file_exists_p (hstat.local_file, NULL)) |
4328 | 0 | { |
4329 | 0 | *dt |= IF_MODIFIED_SINCE; |
4330 | 0 | { |
4331 | 0 | uerr_t timestamp_err = set_file_timestamp (&hstat); |
4332 | 0 | if (timestamp_err != RETROK) |
4333 | 0 | return timestamp_err; |
4334 | 0 | } |
4335 | 0 | } |
4336 | | /* Send preliminary HEAD request if -N is given and we have existing |
4337 | | * destination file or content disposition is enabled. */ |
4338 | 0 | else if (opt.content_disposition || file_exists_p (hstat.local_file, NULL)) |
4339 | 0 | send_head_first = true; |
4340 | 0 | } |
4341 | | |
4342 | | /* THE loop */ |
4343 | 0 | do |
4344 | 0 | { |
4345 | | /* Increment the pass counter. */ |
4346 | 0 | ++count; |
4347 | 0 | sleep_between_retrievals (count); |
4348 | | |
4349 | | /* Get the current time string. */ |
4350 | 0 | tms = datetime_str (time (NULL)); |
4351 | |
|
4352 | 0 | if (opt.spider && !got_head) |
4353 | 0 | logprintf (LOG_VERBOSE, |
4354 | 0 | _("Spider mode enabled. Check if remote file exists.\n")); |
4355 | | |
4356 | | /* Print fetch message, if opt.verbose. */ |
4357 | 0 | if (opt.verbose) |
4358 | 0 | { |
4359 | 0 | char *hurl = url_string (u, URL_AUTH_HIDE_PASSWD); |
4360 | |
|
4361 | 0 | if (count > 1) |
4362 | 0 | { |
4363 | 0 | char tmp[256]; |
4364 | 0 | sprintf (tmp, _("(try:%2d)"), count); |
4365 | 0 | logprintf (LOG_NOTQUIET, "--%s-- %s %s\n", |
4366 | 0 | tms, tmp, hurl); |
4367 | 0 | } |
4368 | 0 | else |
4369 | 0 | { |
4370 | 0 | logprintf (LOG_NOTQUIET, "--%s-- %s\n", |
4371 | 0 | tms, hurl); |
4372 | 0 | } |
4373 | |
|
4374 | | #ifdef WINDOWS |
4375 | | ws_changetitle (hurl); |
4376 | | #endif |
4377 | 0 | xfree (hurl); |
4378 | 0 | } |
4379 | | |
4380 | | /* Default document type is empty. However, if spider mode is |
4381 | | on or time-stamping is employed, HEAD_ONLY commands is |
4382 | | encoded within *dt. */ |
4383 | 0 | if (send_head_first && !got_head) |
4384 | 0 | *dt |= HEAD_ONLY; |
4385 | 0 | else |
4386 | 0 | *dt &= ~HEAD_ONLY; |
4387 | | |
4388 | | /* Decide whether or not to restart. */ |
4389 | 0 | if (force_full_retrieve) |
4390 | 0 | hstat.restval = hstat.len; |
4391 | 0 | else if (opt.start_pos >= 0) |
4392 | 0 | hstat.restval = opt.start_pos; |
4393 | 0 | else if (opt.always_rest |
4394 | 0 | && got_name |
4395 | 0 | && stat (hstat.local_file, &st) == 0 |
4396 | 0 | && S_ISREG (st.st_mode)) |
4397 | | /* When -c is used, continue from on-disk size. (Can't use |
4398 | | hstat.len even if count>1 because we don't want a failed |
4399 | | first attempt to clobber existing data.) */ |
4400 | 0 | hstat.restval = st.st_size; |
4401 | 0 | else if (count > 1) |
4402 | 0 | { |
4403 | | /* otherwise, continue where the previous try left off */ |
4404 | 0 | if (hstat.len < hstat.restval) |
4405 | 0 | hstat.restval -= hstat.len; |
4406 | 0 | else |
4407 | 0 | hstat.restval = hstat.len; |
4408 | 0 | } |
4409 | 0 | else |
4410 | 0 | hstat.restval = 0; |
4411 | | |
4412 | | /* Decide whether to send the no-cache directive. We send it in |
4413 | | two cases: |
4414 | | a) we're using a proxy, and we're past our first retrieval. |
4415 | | Some proxies are notorious for caching incomplete data, so |
4416 | | we require a fresh get. |
4417 | | b) caching is explicitly inhibited. */ |
4418 | 0 | if ((proxy && count > 1) /* a */ |
4419 | 0 | || !opt.allow_cache) /* b */ |
4420 | 0 | *dt |= SEND_NOCACHE; |
4421 | 0 | else |
4422 | 0 | *dt &= ~SEND_NOCACHE; |
4423 | | |
4424 | | /* Try fetching the document, or at least its head. */ |
4425 | 0 | err = gethttp (u, original_url, &hstat, dt, proxy, iri, count); |
4426 | | |
4427 | | /* Time? */ |
4428 | 0 | tms = datetime_str (time (NULL)); |
4429 | | |
4430 | | /* Get the new location (with or without the redirection). */ |
4431 | 0 | if (hstat.newloc) |
4432 | 0 | *newloc = xstrdup (hstat.newloc); |
4433 | |
|
4434 | 0 | switch (err) |
4435 | 0 | { |
4436 | 0 | case HERR: case HEOF: case CONSOCKERR: |
4437 | 0 | case CONERROR: case READERR: case WRITEFAILED: |
4438 | 0 | case RANGEERR: case FOPEN_EXCL_ERR: case GATEWAYTIMEOUT: |
4439 | | /* Non-fatal errors continue executing the loop, which will |
4440 | | bring them to "while" statement at the end, to judge |
4441 | | whether the number of tries was exceeded. */ |
4442 | 0 | printwhat (count, opt.ntry); |
4443 | 0 | continue; |
4444 | 0 | case FWRITEERR: case FOPENERR: |
4445 | | /* Another fatal error. */ |
4446 | 0 | logputs (LOG_VERBOSE, "\n"); |
4447 | 0 | logprintf (LOG_NOTQUIET, _("Cannot write to %s (%s).\n"), |
4448 | 0 | quote (hstat.local_file), strerror (errno)); |
4449 | 0 | ret = err; |
4450 | 0 | goto exit; |
4451 | 0 | case HOSTERR: |
4452 | | /* Fatal unless option set otherwise. */ |
4453 | 0 | if ( opt.retry_on_host_error ) |
4454 | 0 | { |
4455 | 0 | printwhat (count, opt.ntry); |
4456 | 0 | continue; |
4457 | 0 | } |
4458 | 0 | ret = err; |
4459 | 0 | goto exit; |
4460 | 0 | case CONIMPOSSIBLE: case PROXERR: case SSLINITFAILED: |
4461 | 0 | case CONTNOTSUPPORTED: case VERIFCERTERR: case FILEBADFILE: |
4462 | 0 | case UNKNOWNATTR: |
4463 | | /* Fatal errors just return from the function. */ |
4464 | 0 | ret = err; |
4465 | 0 | goto exit; |
4466 | 0 | case ATTRMISSING: |
4467 | | /* A missing attribute in a Header is a fatal Protocol error. */ |
4468 | 0 | logputs (LOG_VERBOSE, "\n"); |
4469 | 0 | logprintf (LOG_NOTQUIET, _("Required attribute missing from Header received.\n")); |
4470 | 0 | ret = err; |
4471 | 0 | goto exit; |
4472 | 0 | case AUTHFAILED: |
4473 | 0 | logputs (LOG_VERBOSE, "\n"); |
4474 | 0 | logprintf (LOG_NOTQUIET, _("Username/Password Authentication Failed.\n")); |
4475 | 0 | ret = err; |
4476 | 0 | goto exit; |
4477 | 0 | case WARC_ERR: |
4478 | | /* A fatal WARC error. */ |
4479 | 0 | logputs (LOG_VERBOSE, "\n"); |
4480 | 0 | logprintf (LOG_NOTQUIET, _("Cannot write to WARC file.\n")); |
4481 | 0 | ret = err; |
4482 | 0 | goto exit; |
4483 | 0 | case WARC_TMP_FOPENERR: case WARC_TMP_FWRITEERR: |
4484 | | /* A fatal WARC error. */ |
4485 | 0 | logputs (LOG_VERBOSE, "\n"); |
4486 | 0 | logprintf (LOG_NOTQUIET, _("Cannot write to temporary WARC file.\n")); |
4487 | 0 | ret = err; |
4488 | 0 | goto exit; |
4489 | 0 | case CONSSLERR: |
4490 | | /* Another fatal error. */ |
4491 | 0 | logprintf (LOG_NOTQUIET, _("Unable to establish SSL connection.\n")); |
4492 | 0 | ret = err; |
4493 | 0 | goto exit; |
4494 | 0 | case UNLINKERR: |
4495 | | /* Another fatal error. */ |
4496 | 0 | logputs (LOG_VERBOSE, "\n"); |
4497 | 0 | logprintf (LOG_NOTQUIET, _("Cannot unlink %s (%s).\n"), |
4498 | 0 | quote (hstat.local_file), strerror (errno)); |
4499 | 0 | ret = err; |
4500 | 0 | goto exit; |
4501 | 0 | case NEWLOCATION: |
4502 | 0 | case NEWLOCATION_KEEP_POST: |
4503 | | /* Return the new location to the caller. */ |
4504 | 0 | if (!*newloc) |
4505 | 0 | { |
4506 | 0 | logprintf (LOG_NOTQUIET, |
4507 | 0 | _("ERROR: Redirection (%d) without location.\n"), |
4508 | 0 | hstat.statcode); |
4509 | 0 | ret = WRONGCODE; |
4510 | 0 | } |
4511 | 0 | else |
4512 | 0 | { |
4513 | 0 | ret = err; |
4514 | 0 | } |
4515 | 0 | goto exit; |
4516 | 0 | case RETRUNNEEDED: |
4517 | | /* The file was already fully retrieved. */ |
4518 | 0 | ret = RETROK; |
4519 | 0 | goto exit; |
4520 | 0 | case RETRFINISHED: |
4521 | | /* Deal with you later. */ |
4522 | 0 | break; |
4523 | | #ifdef HAVE_METALINK |
4524 | | case RETR_WITH_METALINK: |
4525 | | { |
4526 | | if (hstat.metalink == NULL) |
4527 | | { |
4528 | | logputs (LOG_NOTQUIET, |
4529 | | _("Could not find Metalink data in HTTP response. " |
4530 | | "Downloading file using HTTP GET.\n")); |
4531 | | *dt &= ~METALINK_METADATA; |
4532 | | *dt &= ~HEAD_ONLY; |
4533 | | got_head = true; |
4534 | | continue; |
4535 | | } |
4536 | | |
4537 | | logputs (LOG_VERBOSE, |
4538 | | _("Metalink headers found. " |
4539 | | "Switching to Metalink mode.\n")); |
4540 | | |
4541 | | ret = retrieve_from_metalink (hstat.metalink); |
4542 | | goto exit; |
4543 | | } |
4544 | | break; |
4545 | | #endif |
4546 | 0 | default: |
4547 | | /* All possibilities should have been exhausted. */ |
4548 | 0 | abort (); |
4549 | 0 | } |
4550 | | |
4551 | 0 | if (!(*dt & RETROKF)) |
4552 | 0 | { |
4553 | 0 | char *hurl = NULL; |
4554 | 0 | if (!opt.verbose) |
4555 | 0 | { |
4556 | | /* #### Ugly ugly ugly! */ |
4557 | 0 | hurl = url_string (u, URL_AUTH_HIDE_PASSWD); |
4558 | 0 | logprintf (LOG_NONVERBOSE, "%s:\n", hurl); |
4559 | 0 | } |
4560 | | |
4561 | | /* Fall back to GET if HEAD fails with a 500 or 501 error code. */ |
4562 | 0 | if (*dt & HEAD_ONLY |
4563 | 0 | && (hstat.statcode == 500 || hstat.statcode == 501)) |
4564 | 0 | { |
4565 | 0 | got_head = true; |
4566 | 0 | xfree (hurl); |
4567 | 0 | continue; |
4568 | 0 | } |
4569 | | /* Maybe we should always keep track of broken links, not just in |
4570 | | * spider mode. |
4571 | | * Don't log error if it was UTF-8 encoded because we will try |
4572 | | * once unencoded. */ |
4573 | 0 | else if (opt.spider && !iri->utf8_encode) |
4574 | 0 | { |
4575 | | /* #### Again: ugly ugly ugly! */ |
4576 | 0 | if (!hurl) |
4577 | 0 | hurl = url_string (u, URL_AUTH_HIDE_PASSWD); |
4578 | 0 | nonexisting_url (hurl); |
4579 | 0 | logprintf (LOG_NOTQUIET, _("\ |
4580 | 0 | Remote file does not exist -- broken link!!!\n")); |
4581 | 0 | } |
4582 | 0 | else if (check_retry_on_http_error (hstat.statcode)) |
4583 | 0 | { |
4584 | 0 | printwhat (count, opt.ntry); |
4585 | 0 | xfree (hurl); |
4586 | 0 | continue; |
4587 | 0 | } |
4588 | 0 | else |
4589 | 0 | { |
4590 | 0 | logprintf (LOG_NOTQUIET, _("%s ERROR %d: %s.\n"), |
4591 | 0 | tms, hstat.statcode, |
4592 | 0 | quotearg_style (escape_quoting_style, hstat.error)); |
4593 | 0 | } |
4594 | 0 | logputs (LOG_VERBOSE, "\n"); |
4595 | 0 | ret = WRONGCODE; |
4596 | 0 | xfree (hurl); |
4597 | 0 | goto exit; |
4598 | 0 | } |
4599 | | |
4600 | | /* Did we get the time-stamp? */ |
4601 | 0 | if (!got_head || (opt.spider && !opt.recursive)) |
4602 | 0 | { |
4603 | 0 | got_head = true; /* no more time-stamping */ |
4604 | |
|
4605 | 0 | if (opt.timestamping && !hstat.remote_time) |
4606 | 0 | { |
4607 | 0 | logputs (LOG_NOTQUIET, _("\ |
4608 | 0 | Last-modified header missing -- time-stamps turned off.\n")); |
4609 | 0 | } |
4610 | 0 | else if (hstat.remote_time) |
4611 | 0 | { |
4612 | | /* Convert the date-string into struct tm. */ |
4613 | 0 | tmr = http_atotm (hstat.remote_time); |
4614 | 0 | if (tmr == (time_t) (-1)) |
4615 | 0 | logputs (LOG_VERBOSE, _("\ |
4616 | 0 | Last-modified header invalid -- time-stamp ignored.\n")); |
4617 | 0 | if (*dt & HEAD_ONLY) |
4618 | 0 | time_came_from_head = true; |
4619 | 0 | } |
4620 | |
|
4621 | 0 | if (send_head_first) |
4622 | 0 | { |
4623 | | /* The time-stamping section. */ |
4624 | 0 | if (opt.timestamping) |
4625 | 0 | { |
4626 | 0 | if (hstat.orig_file_name) /* Perform the following |
4627 | | checks only if the file |
4628 | | we're supposed to |
4629 | | download already exists. */ |
4630 | 0 | { |
4631 | 0 | if (hstat.remote_time && |
4632 | 0 | tmr != (time_t) (-1)) |
4633 | 0 | { |
4634 | | /* Now time-stamping can be used validly. |
4635 | | Time-stamping means that if the sizes of |
4636 | | the local and remote file match, and local |
4637 | | file is newer than the remote file, it will |
4638 | | not be retrieved. Otherwise, the normal |
4639 | | download procedure is resumed. */ |
4640 | 0 | if (hstat.orig_file_tstamp >= tmr) |
4641 | 0 | { |
4642 | 0 | if (hstat.contlen == -1 |
4643 | 0 | || hstat.orig_file_size == hstat.contlen) |
4644 | 0 | { |
4645 | 0 | logprintf (LOG_VERBOSE, _("\ |
4646 | 0 | Server file no newer than local file %s -- not retrieving.\n\n"), |
4647 | 0 | quote (hstat.orig_file_name)); |
4648 | 0 | ret = RETROK; |
4649 | 0 | goto exit; |
4650 | 0 | } |
4651 | 0 | else |
4652 | 0 | { |
4653 | 0 | logprintf (LOG_VERBOSE, _("\ |
4654 | 0 | The sizes do not match (local %s) -- retrieving.\n"), |
4655 | 0 | number_to_static_string (hstat.orig_file_size)); |
4656 | 0 | } |
4657 | 0 | } |
4658 | 0 | else |
4659 | 0 | { |
4660 | 0 | force_full_retrieve = true; |
4661 | 0 | logputs (LOG_VERBOSE, |
4662 | 0 | _("Remote file is newer, retrieving.\n")); |
4663 | 0 | } |
4664 | | |
4665 | 0 | logputs (LOG_VERBOSE, "\n"); |
4666 | 0 | } |
4667 | 0 | } |
4668 | | |
4669 | | /* free_hstat (&hstat); */ |
4670 | 0 | hstat.timestamp_checked = true; |
4671 | 0 | } |
4672 | | |
4673 | 0 | if (opt.spider) |
4674 | 0 | { |
4675 | 0 | bool finished = true; |
4676 | 0 | if (opt.recursive) |
4677 | 0 | { |
4678 | 0 | if ((*dt & TEXTHTML) || (*dt & TEXTCSS)) |
4679 | 0 | { |
4680 | 0 | logputs (LOG_VERBOSE, _("\ |
4681 | 0 | Remote file exists and could contain links to other resources -- retrieving.\n\n")); |
4682 | 0 | finished = false; |
4683 | 0 | } |
4684 | 0 | else |
4685 | 0 | { |
4686 | 0 | logprintf (LOG_VERBOSE, _("\ |
4687 | 0 | Remote file exists but does not contain any link -- not retrieving.\n\n")); |
4688 | 0 | ret = RETROK; /* RETRUNNEEDED is not for caller. */ |
4689 | 0 | } |
4690 | 0 | } |
4691 | 0 | else |
4692 | 0 | { |
4693 | 0 | if ((*dt & TEXTHTML) || (*dt & TEXTCSS)) |
4694 | 0 | { |
4695 | 0 | logprintf (LOG_VERBOSE, _("\ |
4696 | 0 | Remote file exists and could contain further links,\n\ |
4697 | 0 | but recursion is disabled -- not retrieving.\n\n")); |
4698 | 0 | } |
4699 | 0 | else |
4700 | 0 | { |
4701 | 0 | logprintf (LOG_VERBOSE, _("\ |
4702 | 0 | Remote file exists.\n\n")); |
4703 | 0 | } |
4704 | 0 | ret = RETROK; /* RETRUNNEEDED is not for caller. */ |
4705 | 0 | } |
4706 | |
|
4707 | 0 | if (finished) |
4708 | 0 | { |
4709 | 0 | logprintf (LOG_NONVERBOSE, |
4710 | 0 | _("%s URL: %s %2d %s\n"), |
4711 | 0 | tms, u->url, hstat.statcode, |
4712 | 0 | hstat.message ? quotearg_style (escape_quoting_style, hstat.message) : ""); |
4713 | 0 | goto exit; |
4714 | 0 | } |
4715 | 0 | } |
4716 | | |
4717 | 0 | got_name = true; |
4718 | 0 | *dt &= ~HEAD_ONLY; |
4719 | 0 | count = 0; /* the retrieve count for HEAD is reset */ |
4720 | 0 | continue; |
4721 | 0 | } /* send_head_first */ |
4722 | 0 | } /* !got_head */ |
4723 | | |
4724 | 0 | if (opt.useservertimestamps |
4725 | 0 | && (tmr != (time_t) (-1)) |
4726 | 0 | && ((hstat.len == hstat.contlen) || |
4727 | 0 | ((hstat.res == 0) && (hstat.contlen == -1)))) |
4728 | 0 | { |
4729 | 0 | const char *fl = NULL; |
4730 | 0 | set_local_file (&fl, hstat.local_file); |
4731 | 0 | if (fl) |
4732 | 0 | { |
4733 | 0 | time_t newtmr = -1; |
4734 | | /* Reparse time header, in case it's changed. */ |
4735 | 0 | if (time_came_from_head |
4736 | 0 | && hstat.remote_time && hstat.remote_time[0]) |
4737 | 0 | { |
4738 | 0 | newtmr = http_atotm (hstat.remote_time); |
4739 | 0 | if (newtmr != (time_t)-1) |
4740 | 0 | tmr = newtmr; |
4741 | 0 | } |
4742 | 0 | touch (fl, tmr); |
4743 | 0 | } |
4744 | 0 | } |
4745 | | /* End of time-stamping section. */ |
4746 | |
|
4747 | 0 | tmrate = retr_rate (hstat.rd_size, hstat.dltime); |
4748 | 0 | total_download_time += hstat.dltime; |
4749 | |
|
4750 | 0 | if (hstat.len == hstat.contlen) |
4751 | 0 | { |
4752 | 0 | if (*dt & RETROKF || opt.content_on_error) |
4753 | 0 | { |
4754 | 0 | bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document)); |
4755 | |
|
4756 | 0 | logprintf (LOG_VERBOSE, |
4757 | 0 | write_to_stdout |
4758 | 0 | ? _("%s (%s) - written to stdout %s[%s/%s]\n\n") |
4759 | 0 | : _("%s (%s) - %s saved [%s/%s]\n\n"), |
4760 | 0 | tms, tmrate, |
4761 | 0 | write_to_stdout ? "" : quote (hstat.local_file), |
4762 | 0 | number_to_static_string (hstat.len), |
4763 | 0 | number_to_static_string (hstat.contlen)); |
4764 | 0 | logprintf (LOG_NONVERBOSE, |
4765 | 0 | "%s URL:%s [%s/%s] -> \"%s\" [%d]\n", |
4766 | 0 | tms, u->url, |
4767 | 0 | number_to_static_string (hstat.len), |
4768 | 0 | number_to_static_string (hstat.contlen), |
4769 | 0 | hstat.local_file, count); |
4770 | 0 | } |
4771 | 0 | ++numurls; |
4772 | 0 | total_downloaded_bytes += hstat.rd_size; |
4773 | | |
4774 | | /* Remember that we downloaded the file for later ".orig" code. */ |
4775 | 0 | if (*dt & ADDED_HTML_EXTENSION) |
4776 | 0 | downloaded_file (FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file); |
4777 | 0 | else |
4778 | 0 | downloaded_file (FILE_DOWNLOADED_NORMALLY, hstat.local_file); |
4779 | |
|
4780 | 0 | ret = RETROK; |
4781 | 0 | goto exit; |
4782 | 0 | } |
4783 | 0 | else if (hstat.res == 0) /* No read error */ |
4784 | 0 | { |
4785 | 0 | if (hstat.contlen == -1) /* We don't know how much we were supposed |
4786 | | to get, so assume we succeeded. */ |
4787 | 0 | { |
4788 | 0 | if (*dt & RETROKF || opt.content_on_error) |
4789 | 0 | { |
4790 | 0 | bool write_to_stdout = (opt.output_document && HYPHENP (opt.output_document)); |
4791 | |
|
4792 | 0 | logprintf (LOG_VERBOSE, |
4793 | 0 | write_to_stdout |
4794 | 0 | ? _("%s (%s) - written to stdout %s[%s]\n\n") |
4795 | 0 | : _("%s (%s) - %s saved [%s]\n\n"), |
4796 | 0 | tms, tmrate, |
4797 | 0 | write_to_stdout ? "" : quote (hstat.local_file), |
4798 | 0 | number_to_static_string (hstat.len)); |
4799 | 0 | if (!(opt.verbose || opt.quiet)) |
4800 | 0 | { |
4801 | 0 | char *url = url_string (u, URL_AUTH_HIDE_PASSWD); |
4802 | 0 | logprintf (LOG_NONVERBOSE, |
4803 | 0 | "%s URL:%s [%s] -> \"%s\" [%d]\n", |
4804 | 0 | tms, url, number_to_static_string (hstat.len), |
4805 | 0 | hstat.local_file, count); |
4806 | 0 | xfree (url); |
4807 | 0 | } |
4808 | 0 | } |
4809 | 0 | ++numurls; |
4810 | 0 | total_downloaded_bytes += hstat.rd_size; |
4811 | | |
4812 | | /* Remember that we downloaded the file for later ".orig" code. */ |
4813 | 0 | if (*dt & ADDED_HTML_EXTENSION) |
4814 | 0 | downloaded_file (FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED, hstat.local_file); |
4815 | 0 | else |
4816 | 0 | downloaded_file (FILE_DOWNLOADED_NORMALLY, hstat.local_file); |
4817 | |
|
4818 | 0 | ret = RETROK; |
4819 | 0 | goto exit; |
4820 | 0 | } |
4821 | 0 | else if (hstat.len < hstat.contlen) /* meaning we lost the |
4822 | | connection too soon */ |
4823 | 0 | { |
4824 | 0 | logprintf (LOG_VERBOSE, |
4825 | 0 | _("%s (%s) - Connection closed at byte %s. "), |
4826 | 0 | tms, tmrate, number_to_static_string (hstat.len)); |
4827 | 0 | printwhat (count, opt.ntry); |
4828 | 0 | continue; |
4829 | 0 | } |
4830 | 0 | else if (hstat.len != hstat.restval) |
4831 | | /* Getting here would mean reading more data than |
4832 | | requested with content-length, which we never do. */ |
4833 | 0 | abort (); |
4834 | 0 | else |
4835 | 0 | { |
4836 | | /* Getting here probably means that the content-length was |
4837 | | * _less_ than the original, local size. We should probably |
4838 | | * truncate or re-read, or something. FIXME */ |
4839 | 0 | ret = RETROK; |
4840 | 0 | goto exit; |
4841 | 0 | } |
4842 | 0 | } |
4843 | 0 | else /* from now on hstat.res can only be -1 */ |
4844 | 0 | { |
4845 | 0 | if (hstat.contlen == -1) |
4846 | 0 | { |
4847 | 0 | logprintf (LOG_VERBOSE, |
4848 | 0 | _("%s (%s) - Read error at byte %s (%s)."), |
4849 | 0 | tms, tmrate, number_to_static_string (hstat.len), |
4850 | 0 | hstat.rderrmsg); |
4851 | 0 | printwhat (count, opt.ntry); |
4852 | 0 | continue; |
4853 | 0 | } |
4854 | 0 | else /* hstat.res == -1 and contlen is given */ |
4855 | 0 | { |
4856 | 0 | logprintf (LOG_VERBOSE, |
4857 | 0 | _("%s (%s) - Read error at byte %s/%s (%s). "), |
4858 | 0 | tms, tmrate, |
4859 | 0 | number_to_static_string (hstat.len), |
4860 | 0 | number_to_static_string (hstat.contlen), |
4861 | 0 | hstat.rderrmsg); |
4862 | 0 | printwhat (count, opt.ntry); |
4863 | 0 | continue; |
4864 | 0 | } |
4865 | 0 | } |
4866 | | /* not reached */ |
4867 | 0 | } |
4868 | 0 | while (!opt.ntry || (count < opt.ntry)); |
4869 | | |
4870 | 0 | exit: |
4871 | 0 | if ((ret == RETROK || opt.content_on_error) && local_file) |
4872 | 0 | { |
4873 | 0 | xfree (*local_file); |
4874 | | /* Bugfix: Prevent SIGSEGV when hstat.local_file was left NULL |
4875 | | (i.e. due to opt.content_disposition). */ |
4876 | 0 | if (hstat.local_file) |
4877 | 0 | { |
4878 | 0 | *local_file = hstat.local_file; |
4879 | 0 | hstat.local_file = NULL; |
4880 | 0 | } |
4881 | 0 | } |
4882 | 0 | free_hstat (&hstat); |
4883 | |
|
4884 | 0 | return ret; |
4885 | 0 | } |
4886 | | |
4887 | | /* Check whether the result of strptime() indicates success. |
4888 | | strptime() returns the pointer to how far it got to in the string. |
4889 | | The processing has been successful if the string is at `GMT' or |
4890 | | `+X', or at the end of the string. |
4891 | | |
4892 | | In extended regexp parlance, the function returns 1 if P matches |
4893 | | "^ *(GMT|[+-][0-9]|$)", 0 otherwise. P being NULL (which strptime |
4894 | | can return) is considered a failure and 0 is returned. */ |
4895 | | static bool |
4896 | | check_end (const char *p) |
4897 | 11.8k | { |
4898 | 11.8k | if (!p) |
4899 | 9.28k | return false; |
4900 | 4.50k | while (c_isspace (*p)) |
4901 | 1.93k | ++p; |
4902 | 2.57k | if (!*p |
4903 | 2.57k | || (p[0] == 'G' && p[1] == 'M' && p[2] == 'T') |
4904 | 2.57k | || ((p[0] == '+' || p[0] == '-') && c_isdigit (p[1]))) |
4905 | 1.46k | return true; |
4906 | 1.10k | else |
4907 | 1.10k | return false; |
4908 | 2.57k | } |
4909 | | |
4910 | | /* Convert the textual specification of time in TIME_STRING to the |
4911 | | number of seconds since the Epoch. |
4912 | | |
4913 | | TIME_STRING can be in any of the three formats RFC2616 allows the |
4914 | | HTTP servers to emit -- RFC1123-date, RFC850-date or asctime-date, |
4915 | | as well as the time format used in the Set-Cookie header. |
4916 | | Timezones are ignored, and should be GMT. |
4917 | | |
4918 | | Return the computed time_t representation, or -1 if the conversion |
4919 | | fails. |
4920 | | |
4921 | | This function uses strptime with various string formats for parsing |
4922 | | TIME_STRING. This results in a parser that is not as lenient in |
4923 | | interpreting TIME_STRING as I would like it to be. Being based on |
4924 | | strptime, it always allows shortened months, one-digit days, etc., |
4925 | | but due to the multitude of formats in which time can be |
4926 | | represented, an ideal HTTP time parser would be even more |
4927 | | forgiving. It should completely ignore things like week days and |
4928 | | concentrate only on the various forms of representing years, |
4929 | | months, days, hours, minutes, and seconds. For example, it would |
4930 | | be nice if it accepted ISO 8601 out of the box. |
4931 | | |
4932 | | I've investigated free and PD code for this purpose, but none was |
4933 | | usable. getdate was big and unwieldy, and had potential copyright |
4934 | | issues, or so I was informed. Dr. Marcus Hennecke's atotm(), |
4935 | | distributed with phttpd, is excellent, but we cannot use it because |
4936 | | it is not assigned to the FSF. So I stuck it with strptime. */ |
4937 | | |
time_t
http_atotm (const char *time_string)
{
  /* NOTE: Solaris strptime man page claims that %n and %t match white
     space, but that's not universally available.  Instead, a plain
     ` ' in the format is used to mean "skip all WS".  */

  /* Layouts are tried in this order; the first one that check_end()
     accepts determines the result.  */
  static const char *time_formats[] = {
    "%a, %d %b %Y %T",          /* rfc1123: Thu, 29 Jan 1998 22:12:57 */
    "%A, %d-%b-%y %T",          /* rfc850:  Thursday, 29-Jan-98 22:12:57 */
    "%a %b %d %T %Y",           /* asctime: Thu Jan 29 22:12:57 1998 */
    "%a, %d-%b-%Y %T"           /* cookies: Thu, 29-Jan-1998 22:12:57
                                   (used in Set-Cookie, defined in the
                                   Netscape cookie specification.) */
  };
  char locale_copy[256];
  const char *current_locale;
  time_t result = (time_t) -1;
  size_t idx = 0;

  /* Remember the active LC_TIME locale in a private buffer before it is
     switched below.  An empty buffer means "nothing to restore"; that
     is the outcome both when the query fails and when the name
     (including its terminator) does not fit with room to spare.  */
  locale_copy[0] = '\0';
  current_locale = setlocale (LC_TIME, NULL);
  if (current_locale)
    {
      size_t needed = strlen (current_locale) + 1;
      if (needed < sizeof locale_copy)
        memcpy (locale_copy, current_locale, needed);
    }

  /* Solaris strptime fails to recognize English month names in
     non-English locales, so parse under the C locale.  */
  setlocale (LC_TIME, "C");

  while (idx < countof (time_formats))
    {
      struct tm parsed;

      /* Some versions of strptime use the existing contents of struct
         tm to recalculate the date according to format.  Start from
         all-zeroes so stack garbage cannot influence the parse.  */
      xzero (parsed);

      if (check_end (strptime (time_string, time_formats[idx], &parsed)))
        {
          result = timegm (&parsed);
          break;
        }

      idx++;
    }

  /* Put the caller's locale back, if one was saved.  */
  if (locale_copy[0] != '\0')
    setlocale (LC_TIME, locale_copy);

  return result;
}
4997 | | |
4998 | | /* Authorization support: We support three authorization schemes: |
4999 | | |
5000 | | * `Basic' scheme, consisting of base64-ing USER:PASSWORD string; |
5001 | | |
5002 | | * `Digest' scheme, added by Junio Hamano <junio@twinsun.com>, |
5003 | | consisting of answering to the server's challenge with the proper |
5004 | | MD5 digests. |
5005 | | |
5006 | | * `NTLM' ("NT Lan Manager") scheme, based on code written by Daniel |
5007 | | Stenberg for libcurl. Like digest, NTLM is based on a |
5008 | | challenge-response mechanism, but unlike digest, it is non-standard |
5009 | | (authenticates TCP connections rather than requests), undocumented |
5010 | | and Microsoft-specific. */ |
5011 | | |
5012 | | /* Create the authentication header contents for the `Basic' scheme. |
5013 | | This is done by encoding the string "USER:PASS" to base64 and |
5014 | | prepending the string "Basic " in front of it. */ |
5015 | | |
static char *
basic_authentication_encode (const char *user, const char *passwd)
{
  /* Stack scratch space used whenever the data is short enough; longer
     inputs fall back to xmalloc.  */
  char plain_stack[256], encoded_stack[256];
  char *plain, *encoded, *header;
  size_t user_len = strlen (user);
  size_t passwd_len = strlen (passwd);
  /* Length of "USER:PASS", not counting the terminating NUL.  */
  size_t plain_len = user_len + 1 + passwd_len;

  plain = plain_stack;
  if (plain_len >= sizeof (plain_stack))
    plain = xmalloc (plain_len + 1);

  encoded = encoded_stack;
  if (BASE64_LENGTH (plain_len) >= sizeof (encoded_stack))
    encoded = xmalloc (BASE64_LENGTH (plain_len) + 1);

  /* Assemble "USER:PASS" followed by a NUL.  */
  memcpy (plain, user, user_len);
  plain[user_len] = ':';
  memcpy (plain + user_len + 1, passwd, passwd_len);
  plain[plain_len] = '\0';

  wget_base64_encode (plain, plain_len, encoded);

  /* The returned header value is built by concat_strings.  */
  header = concat_strings ("Basic ", encoded, (char *) 0);

  /* Release only the buffers that did not come from the stack.  */
  if (encoded != encoded_stack)
    xfree (encoded);

  if (plain != plain_stack)
    xfree (plain);

  return header;
}
5046 | | |
/* Advance the pointer lvalue X past every leading character for which
   c_isspace is true.  X is evaluated more than once, so it must not
   have side effects.  */
#define SKIP_WS(x) do {                         \
    for (; c_isspace (*(x)); ++(x))             \
      ;                                         \
  } while (0)
5051 | | |
5052 | | #ifdef ENABLE_DIGEST |
5053 | | /* Dump the hexadecimal representation of HASH to BUF. HASH should be |
5054 | | an array of 16 bytes containing the hash keys, and BUF should be a |
5055 | | buffer of 33 writable characters (32 for hex digits plus one for |
5056 | | zero termination). */ |
5057 | | static void |
5058 | | dump_hash (char *buf, const unsigned char *hash) |
5059 | 0 | { |
5060 | 0 | int i; |
5061 | |
|
5062 | 0 | for (i = 0; i < MD5_DIGEST_SIZE; i++, hash++) |
5063 | 0 | { |
5064 | 0 | *buf++ = XNUM_TO_digit (*hash >> 4); |
5065 | 0 | *buf++ = XNUM_TO_digit (*hash & 0xf); |
5066 | 0 | } |
5067 | 0 | *buf = '\0'; |
5068 | 0 | } |
5069 | | |
5070 | | /* Take the line apart to find the challenge, and compose a digest |
5071 | | authorization header. See RFC2069 section 2.1.2. */ |
5072 | | static char * |
5073 | | digest_authentication_encode (const char *au, const char *user, |
5074 | | const char *passwd, const char *method, |
5075 | | const char *path, uerr_t *auth_err) |
5076 | 0 | { |
5077 | 0 | static char *realm, *opaque, *nonce, *qop, *algorithm; |
5078 | 0 | static struct { |
5079 | 0 | const char *name; |
5080 | 0 | char **variable; |
5081 | 0 | } options[] = { |
5082 | 0 | { "realm", &realm }, |
5083 | 0 | { "opaque", &opaque }, |
5084 | 0 | { "nonce", &nonce }, |
5085 | 0 | { "qop", &qop }, |
5086 | 0 | { "algorithm", &algorithm } |
5087 | 0 | }; |
5088 | 0 | char cnonce[16] = ""; |
5089 | 0 | char *res = NULL; |
5090 | 0 | int res_len; |
5091 | 0 | size_t res_size; |
5092 | 0 | param_token name, value; |
5093 | | |
5094 | |
|
5095 | 0 | realm = opaque = nonce = algorithm = qop = NULL; |
5096 | |
|
5097 | 0 | au += 6; /* skip over `Digest' */ |
5098 | 0 | while (extract_param (&au, &name, &value, ',', NULL)) |
5099 | 0 | { |
5100 | 0 | size_t i; |
5101 | 0 | size_t namelen = name.e - name.b; |
5102 | 0 | for (i = 0; i < countof (options); i++) |
5103 | 0 | if (namelen == strlen (options[i].name) |
5104 | 0 | && 0 == strncmp (name.b, options[i].name, |
5105 | 0 | namelen)) |
5106 | 0 | { |
5107 | 0 | *options[i].variable = strdupdelim (value.b, value.e); |
5108 | 0 | break; |
5109 | 0 | } |
5110 | 0 | } |
5111 | |
|
5112 | 0 | if (qop && strcmp (qop, "auth")) |
5113 | 0 | { |
5114 | 0 | logprintf (LOG_NOTQUIET, _("Unsupported quality of protection '%s'.\n"), qop); |
5115 | 0 | xfree (qop); /* force freeing mem and continue */ |
5116 | 0 | } |
5117 | 0 | else if (algorithm && strcmp (algorithm,"MD5") && strcmp (algorithm,"MD5-sess")) |
5118 | 0 | { |
5119 | 0 | logprintf (LOG_NOTQUIET, _("Unsupported algorithm '%s'.\n"), algorithm); |
5120 | 0 | xfree (algorithm); /* force freeing mem and continue */ |
5121 | 0 | } |
5122 | |
|
5123 | 0 | if (!realm || !nonce || !user || !passwd || !path || !method) |
5124 | 0 | { |
5125 | 0 | *auth_err = ATTRMISSING; |
5126 | 0 | goto cleanup; |
5127 | 0 | } |
5128 | | |
5129 | | /* Calculate the digest value. */ |
5130 | 0 | { |
5131 | 0 | struct md5_ctx ctx; |
5132 | 0 | unsigned char hash[MD5_DIGEST_SIZE]; |
5133 | 0 | char a1buf[MD5_DIGEST_SIZE * 2 + 1], a2buf[MD5_DIGEST_SIZE * 2 + 1]; |
5134 | 0 | char response_digest[MD5_DIGEST_SIZE * 2 + 1]; |
5135 | | |
5136 | | /* A1BUF = H(user ":" realm ":" password) */ |
5137 | 0 | md5_init_ctx (&ctx); |
5138 | 0 | md5_process_bytes ((unsigned char *)user, strlen (user), &ctx); |
5139 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5140 | 0 | md5_process_bytes ((unsigned char *)realm, strlen (realm), &ctx); |
5141 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5142 | 0 | md5_process_bytes ((unsigned char *)passwd, strlen (passwd), &ctx); |
5143 | 0 | md5_finish_ctx (&ctx, hash); |
5144 | |
|
5145 | 0 | dump_hash (a1buf, hash); |
5146 | |
|
5147 | 0 | if (algorithm && !strcmp (algorithm, "MD5-sess")) |
5148 | 0 | { |
5149 | | /* A1BUF = H( H(user ":" realm ":" password) ":" nonce ":" cnonce ) */ |
5150 | 0 | snprintf (cnonce, sizeof (cnonce), "%08x", |
5151 | 0 | (unsigned) random_number (INT_MAX)); |
5152 | |
|
5153 | 0 | md5_init_ctx (&ctx); |
5154 | | /* md5_process_bytes (hash, MD5_DIGEST_SIZE, &ctx); */ |
5155 | 0 | md5_process_bytes (a1buf, MD5_DIGEST_SIZE * 2, &ctx); |
5156 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5157 | 0 | md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); |
5158 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5159 | 0 | md5_process_bytes ((unsigned char *)cnonce, strlen (cnonce), &ctx); |
5160 | 0 | md5_finish_ctx (&ctx, hash); |
5161 | |
|
5162 | 0 | dump_hash (a1buf, hash); |
5163 | 0 | } |
5164 | | |
5165 | | /* A2BUF = H(method ":" path) */ |
5166 | 0 | md5_init_ctx (&ctx); |
5167 | 0 | md5_process_bytes ((unsigned char *)method, strlen (method), &ctx); |
5168 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5169 | 0 | md5_process_bytes ((unsigned char *)path, strlen (path), &ctx); |
5170 | 0 | md5_finish_ctx (&ctx, hash); |
5171 | 0 | dump_hash (a2buf, hash); |
5172 | |
|
5173 | 0 | if (qop && !strcmp (qop, "auth")) |
5174 | 0 | { |
5175 | | /* RFC 2617 Digest Access Authentication */ |
5176 | | /* generate random hex string */ |
5177 | 0 | if (!*cnonce) |
5178 | 0 | snprintf (cnonce, sizeof (cnonce), "%08x", |
5179 | 0 | (unsigned) random_number (INT_MAX)); |
5180 | | |
5181 | | /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" noncecount ":" clientnonce ":" qop ": " A2BUF) */ |
5182 | 0 | md5_init_ctx (&ctx); |
5183 | 0 | md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); |
5184 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5185 | 0 | md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); |
5186 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5187 | 0 | md5_process_bytes ((unsigned char *)"00000001", 8, &ctx); /* TODO: keep track of server nonce values */ |
5188 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5189 | 0 | md5_process_bytes ((unsigned char *)cnonce, strlen (cnonce), &ctx); |
5190 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5191 | 0 | md5_process_bytes ((unsigned char *)qop, strlen (qop), &ctx); |
5192 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5193 | 0 | md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); |
5194 | 0 | md5_finish_ctx (&ctx, hash); |
5195 | 0 | } |
5196 | 0 | else |
5197 | 0 | { |
5198 | | /* RFC 2069 Digest Access Authentication */ |
5199 | | /* RESPONSE_DIGEST = H(A1BUF ":" nonce ":" A2BUF) */ |
5200 | 0 | md5_init_ctx (&ctx); |
5201 | 0 | md5_process_bytes ((unsigned char *)a1buf, MD5_DIGEST_SIZE * 2, &ctx); |
5202 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5203 | 0 | md5_process_bytes ((unsigned char *)nonce, strlen (nonce), &ctx); |
5204 | 0 | md5_process_bytes ((unsigned char *)":", 1, &ctx); |
5205 | 0 | md5_process_bytes ((unsigned char *)a2buf, MD5_DIGEST_SIZE * 2, &ctx); |
5206 | 0 | md5_finish_ctx (&ctx, hash); |
5207 | 0 | } |
5208 | |
|
5209 | 0 | dump_hash (response_digest, hash); |
5210 | |
|
5211 | 0 | res_size = strlen (user) |
5212 | 0 | + strlen (realm) |
5213 | 0 | + strlen (nonce) |
5214 | 0 | + strlen (path) |
5215 | | + 2 * MD5_DIGEST_SIZE /*strlen (response_digest)*/ |
5216 | 0 | + (opaque ? strlen (opaque) : 0) |
5217 | 0 | + (algorithm ? strlen (algorithm) : 0) |
5218 | 0 | + (qop ? 128: 0) |
5219 | 0 | + strlen (cnonce) |
5220 | 0 | + 128; |
5221 | |
|
5222 | 0 | res = xmalloc (res_size); |
5223 | |
|
5224 | 0 | if (qop && !strcmp (qop, "auth")) |
5225 | 0 | { |
5226 | 0 | res_len = snprintf (res, res_size, "Digest "\ |
5227 | 0 | "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\""\ |
5228 | 0 | ", qop=auth, nc=00000001, cnonce=\"%s\"", |
5229 | 0 | user, realm, nonce, path, response_digest, cnonce); |
5230 | |
|
5231 | 0 | } |
5232 | 0 | else |
5233 | 0 | { |
5234 | 0 | res_len = snprintf (res, res_size, "Digest "\ |
5235 | 0 | "username=\"%s\", realm=\"%s\", nonce=\"%s\", uri=\"%s\", response=\"%s\"", |
5236 | 0 | user, realm, nonce, path, response_digest); |
5237 | 0 | } |
5238 | |
|
5239 | 0 | if (opaque) |
5240 | 0 | { |
5241 | 0 | res_len += snprintf (res + res_len, res_size - res_len, ", opaque=\"%s\"", opaque); |
5242 | 0 | } |
5243 | |
|
5244 | 0 | if (algorithm) |
5245 | 0 | { |
5246 | 0 | snprintf (res + res_len, res_size - res_len, ", algorithm=\"%s\"", algorithm); |
5247 | 0 | } |
5248 | 0 | } |
5249 | |
|
5250 | 0 | cleanup: |
5251 | 0 | xfree (realm); |
5252 | 0 | xfree (opaque); |
5253 | 0 | xfree (nonce); |
5254 | 0 | xfree (qop); |
5255 | 0 | xfree (algorithm); |
5256 | |
|
5257 | 0 | return res; |
5258 | 0 | } |
5259 | | #endif /* ENABLE_DIGEST */ |
5260 | | |
/* Length of a string literal, not counting its terminator: sizeof
   on a literal includes the trailing \0, hence the "- 1".  Only
   meaningful for literals and char arrays, not for pointers.  */
#define STRSIZE(literal) (sizeof (literal) - 1)

/* Whether the chars in [b, e) begin with the string literal given as
   first argument, and that literal is followed either by the end of
   the range or by a whitespace character.  The comparison is
   case-insensitive.

   Every use of B and E is parenthesized so that expression arguments
   (for instance `p + 1') bind as the caller intended; both are
   evaluated more than once, so they must be free of side effects.  */
#define STARTS(literal, b, e)                                   \
  (((e) > (b))                                                  \
   && ((size_t) ((e) - (b))) >= STRSIZE (literal)               \
   && 0 == c_strncasecmp ((b), (literal), STRSIZE (literal))    \
   && ((size_t) ((e) - (b)) == STRSIZE (literal)                \
       || c_isspace ((b)[STRSIZE (literal)])))
5274 | | |
/* Return true when the header value in [HDRBEG, HDREND) starts with
   the name of an authentication scheme this build can handle.  Basic
   is always accepted; Digest and NTLM only when the corresponding
   ENABLE_* macro is defined.  */
static bool
known_authentication_scheme_p (const char *hdrbeg, const char *hdrend)
{
  if (STARTS ("Basic", hdrbeg, hdrend))
    return true;
#ifdef ENABLE_DIGEST
  if (STARTS ("Digest", hdrbeg, hdrend))
    return true;
#endif
#ifdef ENABLE_NTLM
  if (STARTS ("NTLM", hdrbeg, hdrend))
    return true;
#endif
  return false;
}
5287 | | |
5288 | | #undef STARTS |
5289 | | |
5290 | | /* Create the HTTP authorization request header. When the |
5291 | | `WWW-Authenticate' response header is seen, according to the |
5292 | | authorization scheme specified in that header (`Basic' and `Digest' |
5293 | | are supported by the current implementation), produce an |
5294 | | appropriate HTTP authorization request header. */ |
5295 | | static char * |
5296 | | create_authorization_line (const char *au, const char *user, |
5297 | | const char *passwd, const char *method, |
5298 | | const char *path, bool *finished, uerr_t *auth_err) |
5299 | 0 | { |
5300 | | /* We are called only with known schemes, so we can dispatch on the |
5301 | | first letter. */ |
5302 | 0 | switch (c_toupper (*au)) |
5303 | 0 | { |
5304 | 0 | case 'B': /* Basic */ |
5305 | 0 | *finished = true; |
5306 | 0 | return basic_authentication_encode (user, passwd); |
5307 | 0 | #ifdef ENABLE_DIGEST |
5308 | 0 | case 'D': /* Digest */ |
5309 | 0 | *finished = true; |
5310 | 0 | return digest_authentication_encode (au, user, passwd, method, path, auth_err); |
5311 | 0 | #endif |
5312 | 0 | #ifdef ENABLE_NTLM |
5313 | 0 | case 'N': /* NTLM */ |
5314 | 0 | if (!ntlm_input (&pconn.ntlm, au)) |
5315 | 0 | { |
5316 | 0 | *finished = true; |
5317 | 0 | return NULL; |
5318 | 0 | } |
5319 | 0 | return ntlm_output (&pconn.ntlm, user, passwd, finished); |
5320 | 0 | #endif |
5321 | 0 | default: |
5322 | | /* We shouldn't get here -- this function should be only called |
5323 | | with values approved by known_authentication_scheme_p. */ |
5324 | 0 | abort (); |
5325 | 0 | } |
5326 | 0 | } |
5327 | | |
5328 | | static void |
5329 | | load_cookies (void) |
5330 | 0 | { |
5331 | 0 | if (!wget_cookie_jar) |
5332 | 0 | wget_cookie_jar = cookie_jar_new (); |
5333 | 0 | if (opt.cookies_input && !cookies_loaded_p) |
5334 | 0 | { |
5335 | 0 | cookie_jar_load (wget_cookie_jar, opt.cookies_input); |
5336 | 0 | cookies_loaded_p = true; |
5337 | 0 | } |
5338 | 0 | } |
5339 | | |
5340 | | void |
5341 | | save_cookies (void) |
5342 | 0 | { |
5343 | 0 | if (wget_cookie_jar) |
5344 | 0 | cookie_jar_save (wget_cookie_jar, opt.cookies_output); |
5345 | 0 | } |
5346 | | |
#if defined DEBUG_MALLOC || defined TESTING
/* Tear down the HTTP module's global state: drop the persistent
   connection if one is active, delete the cookie jar, and free the
   table of Basic-authenticated hosts together with its keys.  The
   globals are reset to NULL so a later call finds nothing to do.
   Only built for DEBUG_MALLOC or TESTING.  */
void
http_cleanup (void)
{
  if (pconn_active)
    invalidate_persistent ();

  if (wget_cookie_jar != NULL)
    {
      cookie_jar_delete (wget_cookie_jar);
      wget_cookie_jar = NULL;
    }

  if (basic_authed_hosts != NULL)
    {
      hash_table_iterator it;

      /* Free every key first, then the table itself.  */
      hash_table_iterate (basic_authed_hosts, &it);
      while (hash_table_iter_next (&it))
        xfree (it.key);

      hash_table_destroy (basic_authed_hosts);
      basic_authed_hosts = NULL;
    }
}
#endif
5372 | | |
5373 | | void |
5374 | | ensure_extension (struct http_stat *hs, const char *ext, int *dt) |
5375 | 0 | { |
5376 | 0 | char *last_period_in_local_filename = strrchr (hs->local_file, '.'); |
5377 | 0 | char shortext[8]; |
5378 | 0 | int len; |
5379 | 0 | shortext[0] = '\0'; |
5380 | 0 | len = strlen (ext); |
5381 | 0 | if (len == 5) |
5382 | 0 | { |
5383 | 0 | memcpy (shortext, ext, len - 1); |
5384 | 0 | shortext[len - 1] = '\0'; |
5385 | 0 | } |
5386 | |
|
5387 | 0 | if (last_period_in_local_filename == NULL |
5388 | 0 | || !(0 == strcasecmp (last_period_in_local_filename, shortext) |
5389 | 0 | || 0 == strcasecmp (last_period_in_local_filename, ext))) |
5390 | 0 | { |
5391 | 0 | int local_filename_len = strlen (hs->local_file); |
5392 | | /* Resize the local file, allowing for ".html" preceded by |
5393 | | optional ".NUMBER". */ |
5394 | 0 | hs->local_file = xrealloc (hs->local_file, |
5395 | 0 | local_filename_len + 24 + len); |
5396 | 0 | strcpy (hs->local_file + local_filename_len, ext); |
5397 | | /* If clobbering is not allowed and the file, as named, |
5398 | | exists, tack on ".NUMBER.html" instead. */ |
5399 | 0 | if (!ALLOW_CLOBBER && file_exists_p (hs->local_file, NULL)) |
5400 | 0 | { |
5401 | 0 | int ext_num = 1; |
5402 | 0 | do |
5403 | 0 | sprintf (hs->local_file + local_filename_len, |
5404 | 0 | ".%d%s", ext_num++, ext); |
5405 | 0 | while (file_exists_p (hs->local_file, NULL)); |
5406 | 0 | } |
5407 | 0 | *dt |= ADDED_HTML_EXTENSION; |
5408 | 0 | } |
5409 | 0 | } |
5410 | | |
5411 | | #ifdef TESTING |
5412 | | |
5413 | | const char * |
5414 | | test_parse_range_header (void) |
5415 | 0 | { |
5416 | 0 | unsigned i; |
5417 | 0 | static const struct { |
5418 | 0 | const char * rangehdr; |
5419 | 0 | const wgint firstbyte; |
5420 | 0 | const wgint lastbyte; |
5421 | 0 | const wgint length; |
5422 | 0 | const bool shouldPass; |
5423 | 0 | } test_array[] = { |
5424 | 0 | { "bytes 0-1000/1000", 0, 1000, 1000, false }, |
5425 | 0 | { "bytes 0-999/1000", 0, 999, 1000, true }, |
5426 | 0 | { "bytes 100-99/1000", 100, 99, 1000, false }, |
5427 | 0 | { "bytes 100-100/1000", 100, 100, 1000, true }, |
5428 | 0 | { "bytes 0-1000/100000000", 0, 1000, 100000000, true }, |
5429 | 0 | { "bytes 1-999/1000", 1, 999, 1000, true }, |
5430 | 0 | { "bytes 42-1233/1234", 42, 1233, 1234, true }, |
5431 | 0 | { "bytes 42-1233/*", 42, 1233, -1, true }, |
5432 | 0 | { "bytes 0-2147483648/2147483649", 0, 2147483648U, 2147483649U, true }, |
5433 | 0 | { "bytes 2147483648-4294967296/4294967297", 2147483648U, 4294967296ULL, 4294967297ULL, true }, |
5434 | 0 | }; |
5435 | |
|
5436 | 0 | wgint firstbyteptr[sizeof(wgint)]; |
5437 | 0 | wgint lastbyteptr[sizeof(wgint)]; |
5438 | 0 | wgint lengthptr[sizeof(wgint)]; |
5439 | 0 | bool result; |
5440 | 0 | for (i = 0; i < countof (test_array); i++) |
5441 | 0 | { |
5442 | 0 | result = parse_content_range (test_array[i].rangehdr, firstbyteptr, lastbyteptr, lengthptr); |
5443 | | #if 0 |
5444 | | printf ("%ld %ld\n", test_array[i].firstbyte, *firstbyteptr); |
5445 | | printf ("%ld %ld\n", test_array[i].lastbyte, *lastbyteptr); |
5446 | | printf ("%ld %ld\n", test_array[i].length, *lengthptr); |
5447 | | printf ("\n"); |
5448 | | #endif |
5449 | 0 | mu_assert ("test_parse_range_header: False Negative", result == test_array[i].shouldPass); |
5450 | 0 | mu_assert ("test_parse_range_header: Bad parse", test_array[i].firstbyte == *firstbyteptr && |
5451 | 0 | test_array[i].lastbyte == *lastbyteptr && |
5452 | 0 | test_array[i].length == *lengthptr); |
5453 | 0 | } |
5454 | | |
5455 | 0 | return NULL; |
5456 | 0 | } |
5457 | | |
/* Unit test for parse_content_disposition.  Each table entry gives a
   Content-Disposition header value, the file name expected from it
   (NULL when none is expected) and the expected return value.
   Returns NULL when every entry behaves as listed; failures are
   reported through mu_assert.  */
const char *
test_parse_content_disposition (void)
{
  unsigned i;
  static const struct {
    const char *hdrval;
    const char *filename;
    bool result;
  } test_array[] = {
    { "filename=\"file.ext\"", "file.ext", true },
    { "attachment; filename=\"file.ext\"", "file.ext", true },
    { "attachment; filename=\"file.ext\"; dummy", "file.ext", true },
    { "attachment", NULL, false },
    { "attachment; filename*=UTF-8'en-US'hello.txt", "hello.txt", true },
    { "attachment; filename*0=\"hello\"; filename*1=\"world.txt\"",
      "helloworld.txt", true },
    { "attachment; filename=\"A.ext\"; filename*=\"B.ext\"", "B.ext", true },
    { "attachment; filename*=\"A.ext\"; filename*0=\"B\"; filename*1=\"B.ext\"",
      "A.ext", true },
    { "filename**0=\"A\"; filename**1=\"A.ext\"; filename*0=\"B\";\
filename*1=\"B\"", "AA.ext", true },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      /* Start from NULL so the xfree below is well defined even if
         the parser leaves the output argument untouched.  */
      char *filename = NULL;
      bool res;
      bool ok;

      res = parse_content_disposition (test_array[i].hdrval, &filename);

      /* The file name is only compared when the parser reported
         success, which is also the only case where an expected name
         is listed.  */
      ok = res == test_array[i].result
           && (res == false
               || 0 == strcmp (test_array[i].filename, filename));

      /* Release the name before asserting, so it is not leaked should
         mu_assert leave the function on failure.  */
      xfree (filename);

      mu_assert ("test_parse_content_disposition: wrong result", ok);
    }

  return NULL;
}
5497 | | |
5498 | | #endif /* TESTING */ |
5499 | | |
5500 | | /* |
5501 | | * vim: et sts=2 sw=2 cino+={s |
5502 | | */ |